diff --git "a/checkpoint-12000/trainer_state.json" "b/checkpoint-12000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-12000/trainer_state.json" @@ -0,0 +1,84208 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.33110641277242986, + "eval_steps": 500, + "global_step": 12000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0, + "eval_runtime": 27.428, + "eval_samples_per_second": 1.167, + "eval_steps_per_second": 0.146, + "step": 0 + }, + { + "epoch": 2.7592201064369157e-05, + "grad_norm": 3.6326730251312256, + "learning_rate": 0.001, + "loss": 0.4698, + "step": 1 + }, + { + "epoch": 5.5184402128738314e-05, + "grad_norm": 0.062148336321115494, + "learning_rate": 0.001, + "loss": 0.4517, + "step": 2 + }, + { + "epoch": 8.277660319310746e-05, + "grad_norm": 0.0332246758043766, + "learning_rate": 0.001, + "loss": 0.4677, + "step": 3 + }, + { + "epoch": 0.00011036880425747663, + "grad_norm": 0.10165657848119736, + "learning_rate": 0.001, + "loss": 0.4603, + "step": 4 + }, + { + "epoch": 0.00013796100532184578, + "grad_norm": 0.04767799377441406, + "learning_rate": 0.001, + "loss": 0.4623, + "step": 5 + }, + { + "epoch": 0.00016555320638621493, + "grad_norm": 0.023448018357157707, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 6 + }, + { + "epoch": 0.0001931454074505841, + "grad_norm": 0.013601069338619709, + "learning_rate": 0.001, + "loss": 0.4374, + "step": 7 + }, + { + "epoch": 0.00022073760851495325, + "grad_norm": 0.009370611980557442, + "learning_rate": 0.001, + "loss": 0.403, + "step": 8 + }, + { + "epoch": 0.0002483298095793224, + "grad_norm": 0.008329728618264198, + "learning_rate": 0.001, + "loss": 0.4436, + "step": 9 + }, + { + "epoch": 0.00027592201064369155, + "grad_norm": 0.004966976586729288, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 10 + }, + { + "epoch": 0.0003035142117080607, + "grad_norm": 0.004952501505613327, + "learning_rate": 0.001, + "loss": 0.4144, + "step": 11 + }, + { + "epoch": 0.00033110641277242985, + "grad_norm": 0.008474660106003284, + "learning_rate": 0.001, + "loss": 0.4487, + "step": 12 + }, + { + "epoch": 0.000358698613836799, + "grad_norm": 0.004617814905941486, + "learning_rate": 0.001, + "loss": 0.3752, + "step": 13 + }, + { + "epoch": 0.0003862908149011682, + "grad_norm": 0.0045386566780507565, + "learning_rate": 0.001, + "loss": 0.4456, + "step": 14 + }, + { + "epoch": 0.00041388301596553736, + "grad_norm": 0.0031611656304448843, + "learning_rate": 0.001, + "loss": 0.3454, + "step": 15 + }, + { + "epoch": 0.0004414752170299065, + "grad_norm": 0.0026714960113167763, + "learning_rate": 0.001, + "loss": 0.4236, + "step": 16 + }, + { + "epoch": 0.00046906741809427566, + "grad_norm": 0.0028185201808810234, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 17 + }, + { + "epoch": 0.0004966596191586448, + "grad_norm": 0.002944110194221139, + "learning_rate": 0.001, + "loss": 0.4142, + "step": 18 + }, + { + "epoch": 0.000524251820223014, + "grad_norm": 0.004909531679004431, + "learning_rate": 0.001, + "loss": 0.4356, + "step": 19 + }, + { + "epoch": 0.0005518440212873831, + "grad_norm": 0.0034190011210739613, + "learning_rate": 0.001, + "loss": 0.4146, + "step": 20 + }, + { + "epoch": 0.0005794362223517523, + "grad_norm": 0.003598715178668499, + "learning_rate": 0.001, + "loss": 0.4102, + "step": 21 + }, + { + "epoch": 0.0006070284234161214, + "grad_norm": 0.004229442682117224, + "learning_rate": 0.001, + "loss": 0.461, + "step": 22 + }, + { + "epoch": 0.0006346206244804906, + "grad_norm": 0.0024963070172816515, + "learning_rate": 0.001, + "loss": 0.3674, + "step": 23 + }, + { + "epoch": 0.0006622128255448597, + "grad_norm": 0.002500817645341158, + "learning_rate": 0.001, + "loss": 0.4194, + "step": 24 + }, + { + "epoch": 0.0006898050266092289, + "grad_norm": 0.002520238049328327, + "learning_rate": 0.001, + "loss": 0.4037, + "step": 25 + }, + { + "epoch": 0.000717397227673598, + "grad_norm": 0.002308758907020092, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 26 + }, + { + "epoch": 0.0007449894287379673, + "grad_norm": 0.0020286778453737497, + "learning_rate": 0.001, + "loss": 0.3859, + "step": 27 + }, + { + "epoch": 0.0007725816298023364, + "grad_norm": 0.0013328269124031067, + "learning_rate": 0.001, + "loss": 0.4132, + "step": 28 + }, + { + "epoch": 0.0008001738308667056, + "grad_norm": 0.003233084687963128, + "learning_rate": 0.001, + "loss": 0.4118, + "step": 29 + }, + { + "epoch": 0.0008277660319310747, + "grad_norm": 0.0017853466561064124, + "learning_rate": 0.001, + "loss": 0.4184, + "step": 30 + }, + { + "epoch": 0.0008553582329954439, + "grad_norm": 0.0021520040463656187, + "learning_rate": 0.001, + "loss": 0.4215, + "step": 31 + }, + { + "epoch": 0.000882950434059813, + "grad_norm": 0.002464903285726905, + "learning_rate": 0.001, + "loss": 0.4404, + "step": 32 + }, + { + "epoch": 0.0009105426351241822, + "grad_norm": 0.0017728271195665002, + "learning_rate": 0.001, + "loss": 0.406, + "step": 33 + }, + { + "epoch": 0.0009381348361885513, + "grad_norm": 0.001393413869664073, + "learning_rate": 0.001, + "loss": 0.3873, + "step": 34 + }, + { + "epoch": 0.0009657270372529205, + "grad_norm": 0.001682270085439086, + "learning_rate": 0.001, + "loss": 0.4231, + "step": 35 + }, + { + "epoch": 0.0009933192383172896, + "grad_norm": 0.0018528420478105545, + "learning_rate": 0.001, + "loss": 0.3882, + "step": 36 + }, + { + "epoch": 0.0010209114393816589, + "grad_norm": 0.0014731371775269508, + "learning_rate": 0.001, + "loss": 0.4001, + "step": 37 + }, + { + "epoch": 0.001048503640446028, + "grad_norm": 0.0012595922453328967, + "learning_rate": 0.001, + "loss": 0.4372, + "step": 38 + }, + { + "epoch": 0.0010760958415103972, + "grad_norm": 0.0011791549623012543, + "learning_rate": 0.001, + "loss": 0.4218, + "step": 39 + }, + { + "epoch": 0.0011036880425747662, + "grad_norm": 0.001127090072259307, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 40 + }, + { + "epoch": 0.0011312802436391355, + "grad_norm": 0.0015457386616617441, + "learning_rate": 0.001, + "loss": 0.4273, + "step": 41 + }, + { + "epoch": 0.0011588724447035045, + "grad_norm": 0.0013682112330570817, + "learning_rate": 0.001, + "loss": 0.4156, + "step": 42 + }, + { + "epoch": 0.0011864646457678738, + "grad_norm": 0.0012363678542897105, + "learning_rate": 0.001, + "loss": 0.3948, + "step": 43 + }, + { + "epoch": 0.0012140568468322428, + "grad_norm": 0.0017921420512720942, + "learning_rate": 0.001, + "loss": 0.4117, + "step": 44 + }, + { + "epoch": 0.001241649047896612, + "grad_norm": 0.001728672650642693, + "learning_rate": 0.001, + "loss": 0.3857, + "step": 45 + }, + { + "epoch": 0.0012692412489609811, + "grad_norm": 0.0038352590054273605, + "learning_rate": 0.001, + "loss": 0.4116, + "step": 46 + }, + { + "epoch": 0.0012968334500253504, + "grad_norm": 0.001102472422644496, + "learning_rate": 0.001, + "loss": 0.4265, + "step": 47 + }, + { + "epoch": 0.0013244256510897194, + "grad_norm": 0.0060236481949687, + "learning_rate": 0.001, + "loss": 0.4381, + "step": 48 + }, + { + "epoch": 0.0013520178521540887, + "grad_norm": 0.0013644751161336899, + "learning_rate": 0.001, + "loss": 0.4319, + "step": 49 + }, + { + "epoch": 0.0013796100532184577, + "grad_norm": 0.0019051303388550878, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 50 + }, + { + "epoch": 0.001407202254282827, + "grad_norm": 0.0021409685723483562, + "learning_rate": 0.001, + "loss": 0.3841, + "step": 51 + }, + { + "epoch": 0.001434794455347196, + "grad_norm": 0.0015203810762614012, + "learning_rate": 0.001, + "loss": 0.3805, + "step": 52 + }, + { + "epoch": 0.0014623866564115653, + "grad_norm": 0.0018806306179612875, + "learning_rate": 0.001, + "loss": 0.4277, + "step": 53 + }, + { + "epoch": 0.0014899788574759345, + "grad_norm": 0.0024247588589787483, + "learning_rate": 0.001, + "loss": 0.4347, + "step": 54 + }, + { + "epoch": 0.0015175710585403036, + "grad_norm": 0.000940295634791255, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 55 + }, + { + "epoch": 0.0015451632596046728, + "grad_norm": 0.0012640036875382066, + "learning_rate": 0.001, + "loss": 0.4115, + "step": 56 + }, + { + "epoch": 0.0015727554606690419, + "grad_norm": 0.00419093482196331, + "learning_rate": 0.001, + "loss": 0.3778, + "step": 57 + }, + { + "epoch": 0.0016003476617334111, + "grad_norm": 0.002213024301454425, + "learning_rate": 0.001, + "loss": 0.4179, + "step": 58 + }, + { + "epoch": 0.0016279398627977802, + "grad_norm": 0.004215582739561796, + "learning_rate": 0.001, + "loss": 0.4603, + "step": 59 + }, + { + "epoch": 0.0016555320638621494, + "grad_norm": 0.0016123323002830148, + "learning_rate": 0.001, + "loss": 0.4395, + "step": 60 + }, + { + "epoch": 0.0016831242649265185, + "grad_norm": 0.0014739107573404908, + "learning_rate": 0.001, + "loss": 0.3739, + "step": 61 + }, + { + "epoch": 0.0017107164659908877, + "grad_norm": 0.0014588399790227413, + "learning_rate": 0.001, + "loss": 0.4476, + "step": 62 + }, + { + "epoch": 0.0017383086670552568, + "grad_norm": 0.0037299874238669872, + "learning_rate": 0.001, + "loss": 0.4109, + "step": 63 + }, + { + "epoch": 0.001765900868119626, + "grad_norm": 0.0027305828407406807, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 64 + }, + { + "epoch": 0.001793493069183995, + "grad_norm": 0.0027416169177740812, + "learning_rate": 0.001, + "loss": 0.4318, + "step": 65 + }, + { + "epoch": 0.0018210852702483643, + "grad_norm": 0.0017876614583656192, + "learning_rate": 0.001, + "loss": 0.3905, + "step": 66 + }, + { + "epoch": 0.0018486774713127334, + "grad_norm": 0.0019625681452453136, + "learning_rate": 0.001, + "loss": 0.443, + "step": 67 + }, + { + "epoch": 0.0018762696723771026, + "grad_norm": 0.0032296774443238974, + "learning_rate": 0.001, + "loss": 0.4341, + "step": 68 + }, + { + "epoch": 0.0019038618734414717, + "grad_norm": 0.0022506825625896454, + "learning_rate": 0.001, + "loss": 0.3885, + "step": 69 + }, + { + "epoch": 0.001931454074505841, + "grad_norm": 0.002406027168035507, + "learning_rate": 0.001, + "loss": 0.4453, + "step": 70 + }, + { + "epoch": 0.00195904627557021, + "grad_norm": 0.0014062536647543311, + "learning_rate": 0.001, + "loss": 0.3747, + "step": 71 + }, + { + "epoch": 0.0019866384766345792, + "grad_norm": 0.0020411391742527485, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 72 + }, + { + "epoch": 0.0020142306776989483, + "grad_norm": 0.0023991591297090054, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 73 + }, + { + "epoch": 0.0020418228787633178, + "grad_norm": 0.002641907660290599, + "learning_rate": 0.001, + "loss": 0.4308, + "step": 74 + }, + { + "epoch": 0.002069415079827687, + "grad_norm": 0.001838643685914576, + "learning_rate": 0.001, + "loss": 0.384, + "step": 75 + }, + { + "epoch": 0.002097007280892056, + "grad_norm": 0.0014234330737963319, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 76 + }, + { + "epoch": 0.002124599481956425, + "grad_norm": 0.0020837297197431326, + "learning_rate": 0.001, + "loss": 0.4013, + "step": 77 + }, + { + "epoch": 0.0021521916830207944, + "grad_norm": 0.002444372745230794, + "learning_rate": 0.001, + "loss": 0.4, + "step": 78 + }, + { + "epoch": 0.0021797838840851634, + "grad_norm": 0.001574559137225151, + "learning_rate": 0.001, + "loss": 0.447, + "step": 79 + }, + { + "epoch": 0.0022073760851495324, + "grad_norm": 0.002465339843183756, + "learning_rate": 0.001, + "loss": 0.3916, + "step": 80 + }, + { + "epoch": 0.0022349682862139015, + "grad_norm": 0.0014318680623546243, + "learning_rate": 0.001, + "loss": 0.3763, + "step": 81 + }, + { + "epoch": 0.002262560487278271, + "grad_norm": 0.0018497896380722523, + "learning_rate": 0.001, + "loss": 0.3837, + "step": 82 + }, + { + "epoch": 0.00229015268834264, + "grad_norm": 0.0023651847150176764, + "learning_rate": 0.001, + "loss": 0.4341, + "step": 83 + }, + { + "epoch": 0.002317744889407009, + "grad_norm": 0.0019378148717805743, + "learning_rate": 0.001, + "loss": 0.408, + "step": 84 + }, + { + "epoch": 0.002345337090471378, + "grad_norm": 0.0015967771178111434, + "learning_rate": 0.001, + "loss": 0.3456, + "step": 85 + }, + { + "epoch": 0.0023729292915357476, + "grad_norm": 0.0013114806497469544, + "learning_rate": 0.001, + "loss": 0.3987, + "step": 86 + }, + { + "epoch": 0.0024005214926001166, + "grad_norm": 0.0018527540378272533, + "learning_rate": 0.001, + "loss": 0.4442, + "step": 87 + }, + { + "epoch": 0.0024281136936644856, + "grad_norm": 0.0018121429020538926, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 88 + }, + { + "epoch": 0.0024557058947288547, + "grad_norm": 0.0017777991015464067, + "learning_rate": 0.001, + "loss": 0.4088, + "step": 89 + }, + { + "epoch": 0.002483298095793224, + "grad_norm": 0.0014042413095012307, + "learning_rate": 0.001, + "loss": 0.3839, + "step": 90 + }, + { + "epoch": 0.002510890296857593, + "grad_norm": 0.0023180118296295404, + "learning_rate": 0.001, + "loss": 0.3691, + "step": 91 + }, + { + "epoch": 0.0025384824979219622, + "grad_norm": 0.0018260233337059617, + "learning_rate": 0.001, + "loss": 0.3866, + "step": 92 + }, + { + "epoch": 0.0025660746989863317, + "grad_norm": 0.0020317393355071545, + "learning_rate": 0.001, + "loss": 0.43, + "step": 93 + }, + { + "epoch": 0.0025936669000507008, + "grad_norm": 0.0014620552537962794, + "learning_rate": 0.001, + "loss": 0.3873, + "step": 94 + }, + { + "epoch": 0.00262125910111507, + "grad_norm": 0.0030886537861078978, + "learning_rate": 0.001, + "loss": 0.4325, + "step": 95 + }, + { + "epoch": 0.002648851302179439, + "grad_norm": 0.001787678455002606, + "learning_rate": 0.001, + "loss": 0.4261, + "step": 96 + }, + { + "epoch": 0.0026764435032438083, + "grad_norm": 0.0020450972951948643, + "learning_rate": 0.001, + "loss": 0.4242, + "step": 97 + }, + { + "epoch": 0.0027040357043081773, + "grad_norm": 0.0015356248477473855, + "learning_rate": 0.001, + "loss": 0.4263, + "step": 98 + }, + { + "epoch": 0.0027316279053725464, + "grad_norm": 0.0019487686222419143, + "learning_rate": 0.001, + "loss": 0.4089, + "step": 99 + }, + { + "epoch": 0.0027592201064369154, + "grad_norm": 0.0016876134322956204, + "learning_rate": 0.001, + "loss": 0.4376, + "step": 100 + }, + { + "epoch": 0.002786812307501285, + "grad_norm": 0.0015564693603664637, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 101 + }, + { + "epoch": 0.002814404508565654, + "grad_norm": 0.0011805463582277298, + "learning_rate": 0.001, + "loss": 0.4055, + "step": 102 + }, + { + "epoch": 0.002841996709630023, + "grad_norm": 0.0027237439062446356, + "learning_rate": 0.001, + "loss": 0.4386, + "step": 103 + }, + { + "epoch": 0.002869588910694392, + "grad_norm": 0.001259063370525837, + "learning_rate": 0.001, + "loss": 0.3692, + "step": 104 + }, + { + "epoch": 0.0028971811117587615, + "grad_norm": 0.0012861357536166906, + "learning_rate": 0.001, + "loss": 0.437, + "step": 105 + }, + { + "epoch": 0.0029247733128231305, + "grad_norm": 0.0017483624396845698, + "learning_rate": 0.001, + "loss": 0.3922, + "step": 106 + }, + { + "epoch": 0.0029523655138874996, + "grad_norm": 0.0022901813499629498, + "learning_rate": 0.001, + "loss": 0.4296, + "step": 107 + }, + { + "epoch": 0.002979957714951869, + "grad_norm": 0.00259222649037838, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 108 + }, + { + "epoch": 0.003007549916016238, + "grad_norm": 0.0057897912338376045, + "learning_rate": 0.001, + "loss": 0.375, + "step": 109 + }, + { + "epoch": 0.003035142117080607, + "grad_norm": 0.002251250436529517, + "learning_rate": 0.001, + "loss": 0.3959, + "step": 110 + }, + { + "epoch": 0.003062734318144976, + "grad_norm": 0.0033183628693223, + "learning_rate": 0.001, + "loss": 0.4267, + "step": 111 + }, + { + "epoch": 0.0030903265192093457, + "grad_norm": 0.001699024927802384, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 112 + }, + { + "epoch": 0.0031179187202737147, + "grad_norm": 0.002592903096228838, + "learning_rate": 0.001, + "loss": 0.3814, + "step": 113 + }, + { + "epoch": 0.0031455109213380837, + "grad_norm": 0.001526323496364057, + "learning_rate": 0.001, + "loss": 0.4176, + "step": 114 + }, + { + "epoch": 0.003173103122402453, + "grad_norm": 0.0022348894271999598, + "learning_rate": 0.001, + "loss": 0.4369, + "step": 115 + }, + { + "epoch": 0.0032006953234668223, + "grad_norm": 0.0024093682877719402, + "learning_rate": 0.001, + "loss": 0.3827, + "step": 116 + }, + { + "epoch": 0.0032282875245311913, + "grad_norm": 0.0054725524969398975, + "learning_rate": 0.001, + "loss": 0.4098, + "step": 117 + }, + { + "epoch": 0.0032558797255955603, + "grad_norm": 0.0026599527336657047, + "learning_rate": 0.001, + "loss": 0.3803, + "step": 118 + }, + { + "epoch": 0.0032834719266599294, + "grad_norm": 0.002410522662103176, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 119 + }, + { + "epoch": 0.003311064127724299, + "grad_norm": 0.0026565720327198505, + "learning_rate": 0.001, + "loss": 0.4274, + "step": 120 + }, + { + "epoch": 0.003338656328788668, + "grad_norm": 0.0017594440141692758, + "learning_rate": 0.001, + "loss": 0.3748, + "step": 121 + }, + { + "epoch": 0.003366248529853037, + "grad_norm": 0.001644355827011168, + "learning_rate": 0.001, + "loss": 0.4052, + "step": 122 + }, + { + "epoch": 0.003393840730917406, + "grad_norm": 0.001211437862366438, + "learning_rate": 0.001, + "loss": 0.3873, + "step": 123 + }, + { + "epoch": 0.0034214329319817755, + "grad_norm": 0.0012707557762041688, + "learning_rate": 0.001, + "loss": 0.4175, + "step": 124 + }, + { + "epoch": 0.0034490251330461445, + "grad_norm": 0.00231559993699193, + "learning_rate": 0.001, + "loss": 0.3718, + "step": 125 + }, + { + "epoch": 0.0034766173341105135, + "grad_norm": 0.003967110998928547, + "learning_rate": 0.001, + "loss": 0.4437, + "step": 126 + }, + { + "epoch": 0.003504209535174883, + "grad_norm": 0.0014507032465189695, + "learning_rate": 0.001, + "loss": 0.4008, + "step": 127 + }, + { + "epoch": 0.003531801736239252, + "grad_norm": 0.0023966797161847353, + "learning_rate": 0.001, + "loss": 0.3724, + "step": 128 + }, + { + "epoch": 0.003559393937303621, + "grad_norm": 0.0019388011423870921, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 129 + }, + { + "epoch": 0.00358698613836799, + "grad_norm": 0.0012276334455236793, + "learning_rate": 0.001, + "loss": 0.4624, + "step": 130 + }, + { + "epoch": 0.0036145783394323596, + "grad_norm": 0.0016817412106320262, + "learning_rate": 0.001, + "loss": 0.3641, + "step": 131 + }, + { + "epoch": 0.0036421705404967287, + "grad_norm": 0.0014624105533584952, + "learning_rate": 0.001, + "loss": 0.415, + "step": 132 + }, + { + "epoch": 0.0036697627415610977, + "grad_norm": 0.0023928394075483084, + "learning_rate": 0.001, + "loss": 0.4156, + "step": 133 + }, + { + "epoch": 0.0036973549426254667, + "grad_norm": 0.00194596650544554, + "learning_rate": 0.001, + "loss": 0.4234, + "step": 134 + }, + { + "epoch": 0.0037249471436898362, + "grad_norm": 0.0017044798005372286, + "learning_rate": 0.001, + "loss": 0.3717, + "step": 135 + }, + { + "epoch": 0.0037525393447542053, + "grad_norm": 0.0017636306583881378, + "learning_rate": 0.001, + "loss": 0.4256, + "step": 136 + }, + { + "epoch": 0.0037801315458185743, + "grad_norm": 0.0042044175788760185, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 137 + }, + { + "epoch": 0.0038077237468829433, + "grad_norm": 0.0023780064657330513, + "learning_rate": 0.001, + "loss": 0.4101, + "step": 138 + }, + { + "epoch": 0.003835315947947313, + "grad_norm": 0.0018606351222842932, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 139 + }, + { + "epoch": 0.003862908149011682, + "grad_norm": 0.0020658932626247406, + "learning_rate": 0.001, + "loss": 0.4095, + "step": 140 + }, + { + "epoch": 0.003890500350076051, + "grad_norm": 0.0021609310060739517, + "learning_rate": 0.001, + "loss": 0.3906, + "step": 141 + }, + { + "epoch": 0.00391809255114042, + "grad_norm": 0.0029214313253760338, + "learning_rate": 0.001, + "loss": 0.4448, + "step": 142 + }, + { + "epoch": 0.003945684752204789, + "grad_norm": 0.0013527723494917154, + "learning_rate": 0.001, + "loss": 0.4184, + "step": 143 + }, + { + "epoch": 0.0039732769532691585, + "grad_norm": 0.0014684811467304826, + "learning_rate": 0.001, + "loss": 0.4359, + "step": 144 + }, + { + "epoch": 0.0040008691543335275, + "grad_norm": 0.002133656293153763, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 145 + }, + { + "epoch": 0.0040284613553978965, + "grad_norm": 0.0017957837553694844, + "learning_rate": 0.001, + "loss": 0.4116, + "step": 146 + }, + { + "epoch": 0.004056053556462266, + "grad_norm": 0.001586731756106019, + "learning_rate": 0.001, + "loss": 0.3759, + "step": 147 + }, + { + "epoch": 0.0040836457575266355, + "grad_norm": 0.0015585459768772125, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 148 + }, + { + "epoch": 0.0041112379585910045, + "grad_norm": 0.001772695453837514, + "learning_rate": 0.001, + "loss": 0.4009, + "step": 149 + }, + { + "epoch": 0.004138830159655374, + "grad_norm": 0.0032015417236834764, + "learning_rate": 0.001, + "loss": 0.4229, + "step": 150 + }, + { + "epoch": 0.004166422360719743, + "grad_norm": 0.0014684676425531507, + "learning_rate": 0.001, + "loss": 0.3902, + "step": 151 + }, + { + "epoch": 0.004194014561784112, + "grad_norm": 0.0014085081638768315, + "learning_rate": 0.001, + "loss": 0.4123, + "step": 152 + }, + { + "epoch": 0.004221606762848481, + "grad_norm": 0.00355419609695673, + "learning_rate": 0.001, + "loss": 0.3749, + "step": 153 + }, + { + "epoch": 0.00424919896391285, + "grad_norm": 0.0021562143228948116, + "learning_rate": 0.001, + "loss": 0.4111, + "step": 154 + }, + { + "epoch": 0.004276791164977219, + "grad_norm": 0.001616101711988449, + "learning_rate": 0.001, + "loss": 0.4119, + "step": 155 + }, + { + "epoch": 0.004304383366041589, + "grad_norm": 0.0020637568086385727, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 156 + }, + { + "epoch": 0.004331975567105958, + "grad_norm": 0.0013927265536040068, + "learning_rate": 0.001, + "loss": 0.3819, + "step": 157 + }, + { + "epoch": 0.004359567768170327, + "grad_norm": 0.001993082230910659, + "learning_rate": 0.001, + "loss": 0.4093, + "step": 158 + }, + { + "epoch": 0.004387159969234696, + "grad_norm": 0.001766142901033163, + "learning_rate": 0.001, + "loss": 0.4097, + "step": 159 + }, + { + "epoch": 0.004414752170299065, + "grad_norm": 0.0017266202485188842, + "learning_rate": 0.001, + "loss": 0.381, + "step": 160 + }, + { + "epoch": 0.004442344371363434, + "grad_norm": 0.0017482074908912182, + "learning_rate": 0.001, + "loss": 0.4035, + "step": 161 + }, + { + "epoch": 0.004469936572427803, + "grad_norm": 0.004118494223803282, + "learning_rate": 0.001, + "loss": 0.4169, + "step": 162 + }, + { + "epoch": 0.004497528773492173, + "grad_norm": 0.00243256869725883, + "learning_rate": 0.001, + "loss": 0.3896, + "step": 163 + }, + { + "epoch": 0.004525120974556542, + "grad_norm": 0.001603225595317781, + "learning_rate": 0.001, + "loss": 0.409, + "step": 164 + }, + { + "epoch": 0.004552713175620911, + "grad_norm": 0.0016920053167268634, + "learning_rate": 0.001, + "loss": 0.3818, + "step": 165 + }, + { + "epoch": 0.00458030537668528, + "grad_norm": 0.0016063055954873562, + "learning_rate": 0.001, + "loss": 0.4264, + "step": 166 + }, + { + "epoch": 0.004607897577749649, + "grad_norm": 0.0018608705140650272, + "learning_rate": 0.001, + "loss": 0.419, + "step": 167 + }, + { + "epoch": 0.004635489778814018, + "grad_norm": 0.001778002129867673, + "learning_rate": 0.001, + "loss": 0.4311, + "step": 168 + }, + { + "epoch": 0.004663081979878387, + "grad_norm": 0.0041995905339717865, + "learning_rate": 0.001, + "loss": 0.3816, + "step": 169 + }, + { + "epoch": 0.004690674180942756, + "grad_norm": 0.0013721170835196972, + "learning_rate": 0.001, + "loss": 0.4096, + "step": 170 + }, + { + "epoch": 0.004718266382007126, + "grad_norm": 0.0018527969950810075, + "learning_rate": 0.001, + "loss": 0.4166, + "step": 171 + }, + { + "epoch": 0.004745858583071495, + "grad_norm": 0.001698823063634336, + "learning_rate": 0.001, + "loss": 0.4139, + "step": 172 + }, + { + "epoch": 0.004773450784135864, + "grad_norm": 0.0015160103794187307, + "learning_rate": 0.001, + "loss": 0.3993, + "step": 173 + }, + { + "epoch": 0.004801042985200233, + "grad_norm": 0.0016305141616612673, + "learning_rate": 0.001, + "loss": 0.4338, + "step": 174 + }, + { + "epoch": 0.004828635186264602, + "grad_norm": 0.0016023332718759775, + "learning_rate": 0.001, + "loss": 0.3868, + "step": 175 + }, + { + "epoch": 0.004856227387328971, + "grad_norm": 0.0021146016661077738, + "learning_rate": 0.001, + "loss": 0.3993, + "step": 176 + }, + { + "epoch": 0.00488381958839334, + "grad_norm": 0.00299852411262691, + "learning_rate": 0.001, + "loss": 0.4033, + "step": 177 + }, + { + "epoch": 0.004911411789457709, + "grad_norm": 0.002649805974215269, + "learning_rate": 0.001, + "loss": 0.4111, + "step": 178 + }, + { + "epoch": 0.004939003990522079, + "grad_norm": 0.002029626164585352, + "learning_rate": 0.001, + "loss": 0.3914, + "step": 179 + }, + { + "epoch": 0.004966596191586448, + "grad_norm": 0.002307730261236429, + "learning_rate": 0.001, + "loss": 0.4142, + "step": 180 + }, + { + "epoch": 0.004994188392650817, + "grad_norm": 0.0019044012296944857, + "learning_rate": 0.001, + "loss": 0.4298, + "step": 181 + }, + { + "epoch": 0.005021780593715186, + "grad_norm": 0.0012402728898450732, + "learning_rate": 0.001, + "loss": 0.4187, + "step": 182 + }, + { + "epoch": 0.005049372794779555, + "grad_norm": 0.0015751667087897658, + "learning_rate": 0.001, + "loss": 0.375, + "step": 183 + }, + { + "epoch": 0.0050769649958439245, + "grad_norm": 0.0018701856024563313, + "learning_rate": 0.001, + "loss": 0.4079, + "step": 184 + }, + { + "epoch": 0.0051045571969082935, + "grad_norm": 0.002105995314195752, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 185 + }, + { + "epoch": 0.005132149397972663, + "grad_norm": 0.0030122329480946064, + "learning_rate": 0.001, + "loss": 0.4169, + "step": 186 + }, + { + "epoch": 0.0051597415990370325, + "grad_norm": 0.0018410159973427653, + "learning_rate": 0.001, + "loss": 0.3982, + "step": 187 + }, + { + "epoch": 0.0051873338001014015, + "grad_norm": 0.0017063120612874627, + "learning_rate": 0.001, + "loss": 0.4323, + "step": 188 + }, + { + "epoch": 0.0052149260011657705, + "grad_norm": 0.001945548108778894, + "learning_rate": 0.001, + "loss": 0.3792, + "step": 189 + }, + { + "epoch": 0.00524251820223014, + "grad_norm": 0.002262406051158905, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 190 + }, + { + "epoch": 0.005270110403294509, + "grad_norm": 0.003793769981712103, + "learning_rate": 0.001, + "loss": 0.42, + "step": 191 + }, + { + "epoch": 0.005297702604358878, + "grad_norm": 0.0024173606652766466, + "learning_rate": 0.001, + "loss": 0.4175, + "step": 192 + }, + { + "epoch": 0.005325294805423247, + "grad_norm": 0.0017464763950556517, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 193 + }, + { + "epoch": 0.005352887006487617, + "grad_norm": 0.0034284228459000587, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 194 + }, + { + "epoch": 0.005380479207551986, + "grad_norm": 0.001637522829696536, + "learning_rate": 0.001, + "loss": 0.404, + "step": 195 + }, + { + "epoch": 0.005408071408616355, + "grad_norm": 0.0019539205823093653, + "learning_rate": 0.001, + "loss": 0.3976, + "step": 196 + }, + { + "epoch": 0.005435663609680724, + "grad_norm": 0.0023392250295728445, + "learning_rate": 0.001, + "loss": 0.4224, + "step": 197 + }, + { + "epoch": 0.005463255810745093, + "grad_norm": 0.005975689273327589, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 198 + }, + { + "epoch": 0.005490848011809462, + "grad_norm": 0.005825843196362257, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 199 + }, + { + "epoch": 0.005518440212873831, + "grad_norm": 0.0023729391396045685, + "learning_rate": 0.001, + "loss": 0.4088, + "step": 200 + }, + { + "epoch": 0.005546032413938201, + "grad_norm": 0.0020657021086663008, + "learning_rate": 0.001, + "loss": 0.4162, + "step": 201 + }, + { + "epoch": 0.00557362461500257, + "grad_norm": 0.0022743509616702795, + "learning_rate": 0.001, + "loss": 0.4189, + "step": 202 + }, + { + "epoch": 0.005601216816066939, + "grad_norm": 0.002227703807875514, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 203 + }, + { + "epoch": 0.005628809017131308, + "grad_norm": 0.0023818998597562313, + "learning_rate": 0.001, + "loss": 0.454, + "step": 204 + }, + { + "epoch": 0.005656401218195677, + "grad_norm": 0.002208840800449252, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 205 + }, + { + "epoch": 0.005683993419260046, + "grad_norm": 0.0025773572269827127, + "learning_rate": 0.001, + "loss": 0.4447, + "step": 206 + }, + { + "epoch": 0.005711585620324415, + "grad_norm": 0.002260175533592701, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 207 + }, + { + "epoch": 0.005739177821388784, + "grad_norm": 0.0023025628179311752, + "learning_rate": 0.001, + "loss": 0.4072, + "step": 208 + }, + { + "epoch": 0.005766770022453154, + "grad_norm": 0.0032320781610906124, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 209 + }, + { + "epoch": 0.005794362223517523, + "grad_norm": 0.0018922177841886878, + "learning_rate": 0.001, + "loss": 0.4094, + "step": 210 + }, + { + "epoch": 0.005821954424581892, + "grad_norm": 0.0019504919182509184, + "learning_rate": 0.001, + "loss": 0.3948, + "step": 211 + }, + { + "epoch": 0.005849546625646261, + "grad_norm": 0.002194691449403763, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 212 + }, + { + "epoch": 0.00587713882671063, + "grad_norm": 0.013725458644330502, + "learning_rate": 0.001, + "loss": 0.3922, + "step": 213 + }, + { + "epoch": 0.005904731027774999, + "grad_norm": 0.0019010631367564201, + "learning_rate": 0.001, + "loss": 0.4347, + "step": 214 + }, + { + "epoch": 0.005932323228839368, + "grad_norm": 0.005194473545998335, + "learning_rate": 0.001, + "loss": 0.4057, + "step": 215 + }, + { + "epoch": 0.005959915429903738, + "grad_norm": 0.002996876835823059, + "learning_rate": 0.001, + "loss": 0.3763, + "step": 216 + }, + { + "epoch": 0.005987507630968107, + "grad_norm": 0.0020920925308018923, + "learning_rate": 0.001, + "loss": 0.4245, + "step": 217 + }, + { + "epoch": 0.006015099832032476, + "grad_norm": 0.0023856102488934994, + "learning_rate": 0.001, + "loss": 0.401, + "step": 218 + }, + { + "epoch": 0.006042692033096845, + "grad_norm": 0.0022957061883062124, + "learning_rate": 0.001, + "loss": 0.4424, + "step": 219 + }, + { + "epoch": 0.006070284234161214, + "grad_norm": 0.00439048558473587, + "learning_rate": 0.001, + "loss": 0.3984, + "step": 220 + }, + { + "epoch": 0.006097876435225583, + "grad_norm": 0.0023554619401693344, + "learning_rate": 0.001, + "loss": 0.3743, + "step": 221 + }, + { + "epoch": 0.006125468636289952, + "grad_norm": 0.002596774371340871, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 222 + }, + { + "epoch": 0.006153060837354321, + "grad_norm": 0.002902804408222437, + "learning_rate": 0.001, + "loss": 0.3741, + "step": 223 + }, + { + "epoch": 0.006180653038418691, + "grad_norm": 0.002238060114905238, + "learning_rate": 0.001, + "loss": 0.4194, + "step": 224 + }, + { + "epoch": 0.00620824523948306, + "grad_norm": 0.0024735773913562298, + "learning_rate": 0.001, + "loss": 0.4283, + "step": 225 + }, + { + "epoch": 0.006235837440547429, + "grad_norm": 0.0043862126767635345, + "learning_rate": 0.001, + "loss": 0.4169, + "step": 226 + }, + { + "epoch": 0.0062634296416117985, + "grad_norm": 0.002900010673329234, + "learning_rate": 0.001, + "loss": 0.4133, + "step": 227 + }, + { + "epoch": 0.0062910218426761675, + "grad_norm": 0.0025010716635733843, + "learning_rate": 0.001, + "loss": 0.4121, + "step": 228 + }, + { + "epoch": 0.0063186140437405365, + "grad_norm": 0.004444291349500418, + "learning_rate": 0.001, + "loss": 0.3828, + "step": 229 + }, + { + "epoch": 0.006346206244804906, + "grad_norm": 0.0021149932872503996, + "learning_rate": 0.001, + "loss": 0.4356, + "step": 230 + }, + { + "epoch": 0.006373798445869275, + "grad_norm": 0.001915531582199037, + "learning_rate": 0.001, + "loss": 0.4471, + "step": 231 + }, + { + "epoch": 0.0064013906469336445, + "grad_norm": 0.001954560400918126, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 232 + }, + { + "epoch": 0.006428982847998014, + "grad_norm": 0.002395547227934003, + "learning_rate": 0.001, + "loss": 0.3945, + "step": 233 + }, + { + "epoch": 0.006456575049062383, + "grad_norm": 0.002621949650347233, + "learning_rate": 0.001, + "loss": 0.4052, + "step": 234 + }, + { + "epoch": 0.006484167250126752, + "grad_norm": 0.0037250854074954987, + "learning_rate": 0.001, + "loss": 0.4227, + "step": 235 + }, + { + "epoch": 0.006511759451191121, + "grad_norm": 0.002147798193618655, + "learning_rate": 0.001, + "loss": 0.3914, + "step": 236 + }, + { + "epoch": 0.00653935165225549, + "grad_norm": 0.002425707643851638, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 237 + }, + { + "epoch": 0.006566943853319859, + "grad_norm": 0.002085171639919281, + "learning_rate": 0.001, + "loss": 0.4239, + "step": 238 + }, + { + "epoch": 0.006594536054384229, + "grad_norm": 0.0023823371157050133, + "learning_rate": 0.001, + "loss": 0.403, + "step": 239 + }, + { + "epoch": 0.006622128255448598, + "grad_norm": 0.0020582638680934906, + "learning_rate": 0.001, + "loss": 0.402, + "step": 240 + }, + { + "epoch": 0.006649720456512967, + "grad_norm": 0.002760798903182149, + "learning_rate": 0.001, + "loss": 0.3785, + "step": 241 + }, + { + "epoch": 0.006677312657577336, + "grad_norm": 0.0030749242287129164, + "learning_rate": 0.001, + "loss": 0.4379, + "step": 242 + }, + { + "epoch": 0.006704904858641705, + "grad_norm": 0.0023086306173354387, + "learning_rate": 0.001, + "loss": 0.4163, + "step": 243 + }, + { + "epoch": 0.006732497059706074, + "grad_norm": 0.0025757497642189264, + "learning_rate": 0.001, + "loss": 0.4264, + "step": 244 + }, + { + "epoch": 0.006760089260770443, + "grad_norm": 0.002158039715141058, + "learning_rate": 0.001, + "loss": 0.3881, + "step": 245 + }, + { + "epoch": 0.006787681461834812, + "grad_norm": 0.0018623712239786983, + "learning_rate": 0.001, + "loss": 0.3682, + "step": 246 + }, + { + "epoch": 0.006815273662899182, + "grad_norm": 0.0022342281881719828, + "learning_rate": 0.001, + "loss": 0.4181, + "step": 247 + }, + { + "epoch": 0.006842865863963551, + "grad_norm": 0.0026764923240989447, + "learning_rate": 0.001, + "loss": 0.392, + "step": 248 + }, + { + "epoch": 0.00687045806502792, + "grad_norm": 0.0038766246289014816, + "learning_rate": 0.001, + "loss": 0.4145, + "step": 249 + }, + { + "epoch": 0.006898050266092289, + "grad_norm": 0.002432230394333601, + "learning_rate": 0.001, + "loss": 0.4382, + "step": 250 + }, + { + "epoch": 0.006925642467156658, + "grad_norm": 0.003260681638494134, + "learning_rate": 0.001, + "loss": 0.4376, + "step": 251 + }, + { + "epoch": 0.006953234668221027, + "grad_norm": 0.0023729840759187937, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 252 + }, + { + "epoch": 0.006980826869285396, + "grad_norm": 0.0032021389342844486, + "learning_rate": 0.001, + "loss": 0.4234, + "step": 253 + }, + { + "epoch": 0.007008419070349766, + "grad_norm": 0.003267744556069374, + "learning_rate": 0.001, + "loss": 0.412, + "step": 254 + }, + { + "epoch": 0.007036011271414135, + "grad_norm": 0.0024608762469142675, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 255 + }, + { + "epoch": 0.007063603472478504, + "grad_norm": 0.002270517172291875, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 256 + }, + { + "epoch": 0.007091195673542873, + "grad_norm": 0.002375125652179122, + "learning_rate": 0.001, + "loss": 0.4074, + "step": 257 + }, + { + "epoch": 0.007118787874607242, + "grad_norm": 0.0021832643542438745, + "learning_rate": 0.001, + "loss": 0.4059, + "step": 258 + }, + { + "epoch": 0.007146380075671611, + "grad_norm": 0.0018151308177039027, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 259 + }, + { + "epoch": 0.00717397227673598, + "grad_norm": 0.0018910926301032305, + "learning_rate": 0.001, + "loss": 0.4327, + "step": 260 + }, + { + "epoch": 0.007201564477800349, + "grad_norm": 0.0024862713180482388, + "learning_rate": 0.001, + "loss": 0.4328, + "step": 261 + }, + { + "epoch": 0.007229156678864719, + "grad_norm": 0.0027056324761360884, + "learning_rate": 0.001, + "loss": 0.4027, + "step": 262 + }, + { + "epoch": 0.007256748879929088, + "grad_norm": 0.0018339842790737748, + "learning_rate": 0.001, + "loss": 0.3907, + "step": 263 + }, + { + "epoch": 0.007284341080993457, + "grad_norm": 0.002902393927797675, + "learning_rate": 0.001, + "loss": 0.4415, + "step": 264 + }, + { + "epoch": 0.007311933282057826, + "grad_norm": 0.0022214376367628574, + "learning_rate": 0.001, + "loss": 0.4046, + "step": 265 + }, + { + "epoch": 0.007339525483122195, + "grad_norm": 0.002073576208204031, + "learning_rate": 0.001, + "loss": 0.4365, + "step": 266 + }, + { + "epoch": 0.0073671176841865645, + "grad_norm": 0.002394103677943349, + "learning_rate": 0.001, + "loss": 0.4265, + "step": 267 + }, + { + "epoch": 0.0073947098852509335, + "grad_norm": 0.002571861259639263, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 268 + }, + { + "epoch": 0.007422302086315303, + "grad_norm": 0.0016914806328713894, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 269 + }, + { + "epoch": 0.0074498942873796725, + "grad_norm": 0.003131776349619031, + "learning_rate": 0.001, + "loss": 0.3807, + "step": 270 + }, + { + "epoch": 0.0074774864884440415, + "grad_norm": 0.0046376376412808895, + "learning_rate": 0.001, + "loss": 0.4189, + "step": 271 + }, + { + "epoch": 0.0075050786895084105, + "grad_norm": 0.0030513983219861984, + "learning_rate": 0.001, + "loss": 0.3983, + "step": 272 + }, + { + "epoch": 0.00753267089057278, + "grad_norm": 0.0018306419951841235, + "learning_rate": 0.001, + "loss": 0.4008, + "step": 273 + }, + { + "epoch": 0.007560263091637149, + "grad_norm": 0.002655989723280072, + "learning_rate": 0.001, + "loss": 0.3853, + "step": 274 + }, + { + "epoch": 0.007587855292701518, + "grad_norm": 0.002580752596259117, + "learning_rate": 0.001, + "loss": 0.4259, + "step": 275 + }, + { + "epoch": 0.007615447493765887, + "grad_norm": 0.002781835151836276, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 276 + }, + { + "epoch": 0.007643039694830257, + "grad_norm": 0.0023255913984030485, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 277 + }, + { + "epoch": 0.007670631895894626, + "grad_norm": 0.0026517838705331087, + "learning_rate": 0.001, + "loss": 0.3877, + "step": 278 + }, + { + "epoch": 0.007698224096958995, + "grad_norm": 0.009500747546553612, + "learning_rate": 0.001, + "loss": 0.4261, + "step": 279 + }, + { + "epoch": 0.007725816298023364, + "grad_norm": 0.002862557303160429, + "learning_rate": 0.001, + "loss": 0.3913, + "step": 280 + }, + { + "epoch": 0.007753408499087733, + "grad_norm": 0.0030510432552546263, + "learning_rate": 0.001, + "loss": 0.4157, + "step": 281 + }, + { + "epoch": 0.007781000700152102, + "grad_norm": 0.00323986797593534, + "learning_rate": 0.001, + "loss": 0.4314, + "step": 282 + }, + { + "epoch": 0.007808592901216471, + "grad_norm": 0.0030146867502480745, + "learning_rate": 0.001, + "loss": 0.4006, + "step": 283 + }, + { + "epoch": 0.00783618510228084, + "grad_norm": 0.002789125544950366, + "learning_rate": 0.001, + "loss": 0.4179, + "step": 284 + }, + { + "epoch": 0.007863777303345209, + "grad_norm": 0.0027734541799873114, + "learning_rate": 0.001, + "loss": 0.3941, + "step": 285 + }, + { + "epoch": 0.007891369504409579, + "grad_norm": 0.0021806685253977776, + "learning_rate": 0.001, + "loss": 0.3948, + "step": 286 + }, + { + "epoch": 0.007918961705473947, + "grad_norm": 0.003913281951099634, + "learning_rate": 0.001, + "loss": 0.4296, + "step": 287 + }, + { + "epoch": 0.007946553906538317, + "grad_norm": 0.0021719373762607574, + "learning_rate": 0.001, + "loss": 0.3839, + "step": 288 + }, + { + "epoch": 0.007974146107602687, + "grad_norm": 0.003087391145527363, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 289 + }, + { + "epoch": 0.008001738308667055, + "grad_norm": 0.0025539237540215254, + "learning_rate": 0.001, + "loss": 0.4234, + "step": 290 + }, + { + "epoch": 0.008029330509731425, + "grad_norm": 0.002219514222815633, + "learning_rate": 0.001, + "loss": 0.4234, + "step": 291 + }, + { + "epoch": 0.008056922710795793, + "grad_norm": 0.0028160493820905685, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 292 + }, + { + "epoch": 0.008084514911860163, + "grad_norm": 0.003081710310652852, + "learning_rate": 0.001, + "loss": 0.4197, + "step": 293 + }, + { + "epoch": 0.008112107112924531, + "grad_norm": 0.0024016399402171373, + "learning_rate": 0.001, + "loss": 0.3715, + "step": 294 + }, + { + "epoch": 0.008139699313988901, + "grad_norm": 0.0020687165670096874, + "learning_rate": 0.001, + "loss": 0.4132, + "step": 295 + }, + { + "epoch": 0.008167291515053271, + "grad_norm": 0.002624873537570238, + "learning_rate": 0.001, + "loss": 0.4201, + "step": 296 + }, + { + "epoch": 0.00819488371611764, + "grad_norm": 0.0027543094474822283, + "learning_rate": 0.001, + "loss": 0.3984, + "step": 297 + }, + { + "epoch": 0.008222475917182009, + "grad_norm": 0.005131872370839119, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 298 + }, + { + "epoch": 0.008250068118246377, + "grad_norm": 0.0030383355915546417, + "learning_rate": 0.001, + "loss": 0.421, + "step": 299 + }, + { + "epoch": 0.008277660319310747, + "grad_norm": 0.00250818463973701, + "learning_rate": 0.001, + "loss": 0.4311, + "step": 300 + }, + { + "epoch": 0.008305252520375115, + "grad_norm": 0.006147111766040325, + "learning_rate": 0.001, + "loss": 0.386, + "step": 301 + }, + { + "epoch": 0.008332844721439485, + "grad_norm": 0.003149918746203184, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 302 + }, + { + "epoch": 0.008360436922503853, + "grad_norm": 0.002058635698631406, + "learning_rate": 0.001, + "loss": 0.4586, + "step": 303 + }, + { + "epoch": 0.008388029123568223, + "grad_norm": 0.002103931736201048, + "learning_rate": 0.001, + "loss": 0.3906, + "step": 304 + }, + { + "epoch": 0.008415621324632593, + "grad_norm": 0.0036677306052297354, + "learning_rate": 0.001, + "loss": 0.4056, + "step": 305 + }, + { + "epoch": 0.008443213525696961, + "grad_norm": 0.002338196150958538, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 306 + }, + { + "epoch": 0.008470805726761331, + "grad_norm": 0.002481523435562849, + "learning_rate": 0.001, + "loss": 0.4227, + "step": 307 + }, + { + "epoch": 0.0084983979278257, + "grad_norm": 0.0023166737519204617, + "learning_rate": 0.001, + "loss": 0.3723, + "step": 308 + }, + { + "epoch": 0.00852599012889007, + "grad_norm": 0.00202556187286973, + "learning_rate": 0.001, + "loss": 0.4171, + "step": 309 + }, + { + "epoch": 0.008553582329954438, + "grad_norm": 0.0018110661767423153, + "learning_rate": 0.001, + "loss": 0.4258, + "step": 310 + }, + { + "epoch": 0.008581174531018807, + "grad_norm": 0.003058563219383359, + "learning_rate": 0.001, + "loss": 0.4147, + "step": 311 + }, + { + "epoch": 0.008608766732083177, + "grad_norm": 0.004681632854044437, + "learning_rate": 0.001, + "loss": 0.4428, + "step": 312 + }, + { + "epoch": 0.008636358933147546, + "grad_norm": 0.0023308703675866127, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 313 + }, + { + "epoch": 0.008663951134211915, + "grad_norm": 0.0029331271070986986, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 314 + }, + { + "epoch": 0.008691543335276284, + "grad_norm": 0.0021851372439414263, + "learning_rate": 0.001, + "loss": 0.4097, + "step": 315 + }, + { + "epoch": 0.008719135536340654, + "grad_norm": 0.0027176933363080025, + "learning_rate": 0.001, + "loss": 0.3937, + "step": 316 + }, + { + "epoch": 0.008746727737405022, + "grad_norm": 0.0023573623038828373, + "learning_rate": 0.001, + "loss": 0.3926, + "step": 317 + }, + { + "epoch": 0.008774319938469392, + "grad_norm": 0.0023507485166192055, + "learning_rate": 0.001, + "loss": 0.429, + "step": 318 + }, + { + "epoch": 0.008801912139533762, + "grad_norm": 0.0026601781137287617, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 319 + }, + { + "epoch": 0.00882950434059813, + "grad_norm": 0.002715731505304575, + "learning_rate": 0.001, + "loss": 0.4148, + "step": 320 + }, + { + "epoch": 0.0088570965416625, + "grad_norm": 0.004710033070296049, + "learning_rate": 0.001, + "loss": 0.3585, + "step": 321 + }, + { + "epoch": 0.008884688742726868, + "grad_norm": 0.0027322748210281134, + "learning_rate": 0.001, + "loss": 0.4053, + "step": 322 + }, + { + "epoch": 0.008912280943791238, + "grad_norm": 0.0024419347755610943, + "learning_rate": 0.001, + "loss": 0.4141, + "step": 323 + }, + { + "epoch": 0.008939873144855606, + "grad_norm": 0.004387510009109974, + "learning_rate": 0.001, + "loss": 0.3747, + "step": 324 + }, + { + "epoch": 0.008967465345919976, + "grad_norm": 0.0033073730301111937, + "learning_rate": 0.001, + "loss": 0.3981, + "step": 325 + }, + { + "epoch": 0.008995057546984346, + "grad_norm": 0.002799189416691661, + "learning_rate": 0.001, + "loss": 0.3687, + "step": 326 + }, + { + "epoch": 0.009022649748048714, + "grad_norm": 0.0031710139010101557, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 327 + }, + { + "epoch": 0.009050241949113084, + "grad_norm": 0.0027909427881240845, + "learning_rate": 0.001, + "loss": 0.3804, + "step": 328 + }, + { + "epoch": 0.009077834150177452, + "grad_norm": 0.0035845122765749693, + "learning_rate": 0.001, + "loss": 0.4203, + "step": 329 + }, + { + "epoch": 0.009105426351241822, + "grad_norm": 0.0034582631196826696, + "learning_rate": 0.001, + "loss": 0.404, + "step": 330 + }, + { + "epoch": 0.00913301855230619, + "grad_norm": 0.005869538523256779, + "learning_rate": 0.001, + "loss": 0.3392, + "step": 331 + }, + { + "epoch": 0.00916061075337056, + "grad_norm": 0.0037916686851531267, + "learning_rate": 0.001, + "loss": 0.4509, + "step": 332 + }, + { + "epoch": 0.009188202954434928, + "grad_norm": 0.004066781606525183, + "learning_rate": 0.001, + "loss": 0.4105, + "step": 333 + }, + { + "epoch": 0.009215795155499298, + "grad_norm": 0.002667912980541587, + "learning_rate": 0.001, + "loss": 0.4371, + "step": 334 + }, + { + "epoch": 0.009243387356563668, + "grad_norm": 0.0032026427797973156, + "learning_rate": 0.001, + "loss": 0.3814, + "step": 335 + }, + { + "epoch": 0.009270979557628036, + "grad_norm": 0.002195654669776559, + "learning_rate": 0.001, + "loss": 0.374, + "step": 336 + }, + { + "epoch": 0.009298571758692406, + "grad_norm": 0.002170421415939927, + "learning_rate": 0.001, + "loss": 0.416, + "step": 337 + }, + { + "epoch": 0.009326163959756774, + "grad_norm": 0.002630366710945964, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 338 + }, + { + "epoch": 0.009353756160821144, + "grad_norm": 0.0026457132771611214, + "learning_rate": 0.001, + "loss": 0.436, + "step": 339 + }, + { + "epoch": 0.009381348361885512, + "grad_norm": 0.002360823331400752, + "learning_rate": 0.001, + "loss": 0.4335, + "step": 340 + }, + { + "epoch": 0.009408940562949882, + "grad_norm": 0.0026195445097982883, + "learning_rate": 0.001, + "loss": 0.4233, + "step": 341 + }, + { + "epoch": 0.009436532764014252, + "grad_norm": 0.0018392925849184394, + "learning_rate": 0.001, + "loss": 0.4215, + "step": 342 + }, + { + "epoch": 0.00946412496507862, + "grad_norm": 0.0029209102503955364, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 343 + }, + { + "epoch": 0.00949171716614299, + "grad_norm": 0.00245369179174304, + "learning_rate": 0.001, + "loss": 0.4349, + "step": 344 + }, + { + "epoch": 0.009519309367207358, + "grad_norm": 0.0021382428240031004, + "learning_rate": 0.001, + "loss": 0.4234, + "step": 345 + }, + { + "epoch": 0.009546901568271728, + "grad_norm": 0.0018214443698525429, + "learning_rate": 0.001, + "loss": 0.3922, + "step": 346 + }, + { + "epoch": 0.009574493769336096, + "grad_norm": 0.0025145094841718674, + "learning_rate": 0.001, + "loss": 0.3583, + "step": 347 + }, + { + "epoch": 0.009602085970400466, + "grad_norm": 0.002913502510637045, + "learning_rate": 0.001, + "loss": 0.4189, + "step": 348 + }, + { + "epoch": 0.009629678171464836, + "grad_norm": 0.002811063313856721, + "learning_rate": 0.001, + "loss": 0.3864, + "step": 349 + }, + { + "epoch": 0.009657270372529204, + "grad_norm": 0.0031305616721510887, + "learning_rate": 0.001, + "loss": 0.3724, + "step": 350 + }, + { + "epoch": 0.009684862573593574, + "grad_norm": 0.002753301290795207, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 351 + }, + { + "epoch": 0.009712454774657943, + "grad_norm": 0.002502492628991604, + "learning_rate": 0.001, + "loss": 0.4056, + "step": 352 + }, + { + "epoch": 0.009740046975722312, + "grad_norm": 0.0029816054739058018, + "learning_rate": 0.001, + "loss": 0.4103, + "step": 353 + }, + { + "epoch": 0.00976763917678668, + "grad_norm": 0.00210220436565578, + "learning_rate": 0.001, + "loss": 0.42, + "step": 354 + }, + { + "epoch": 0.00979523137785105, + "grad_norm": 0.002407314023002982, + "learning_rate": 0.001, + "loss": 0.3993, + "step": 355 + }, + { + "epoch": 0.009822823578915419, + "grad_norm": 0.0025853144470602274, + "learning_rate": 0.001, + "loss": 0.3886, + "step": 356 + }, + { + "epoch": 0.009850415779979789, + "grad_norm": 0.0020144209265708923, + "learning_rate": 0.001, + "loss": 0.4465, + "step": 357 + }, + { + "epoch": 0.009878007981044159, + "grad_norm": 0.0026473626494407654, + "learning_rate": 0.001, + "loss": 0.384, + "step": 358 + }, + { + "epoch": 0.009905600182108527, + "grad_norm": 0.005975659471005201, + "learning_rate": 0.001, + "loss": 0.4328, + "step": 359 + }, + { + "epoch": 0.009933192383172897, + "grad_norm": 0.0024808787275105715, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 360 + }, + { + "epoch": 0.009960784584237265, + "grad_norm": 0.011580473743379116, + "learning_rate": 0.001, + "loss": 0.4419, + "step": 361 + }, + { + "epoch": 0.009988376785301635, + "grad_norm": 0.0025010586250573397, + "learning_rate": 0.001, + "loss": 0.3943, + "step": 362 + }, + { + "epoch": 0.010015968986366003, + "grad_norm": 0.0029951941687613726, + "learning_rate": 0.001, + "loss": 0.3898, + "step": 363 + }, + { + "epoch": 0.010043561187430373, + "grad_norm": 0.0038613236974924803, + "learning_rate": 0.001, + "loss": 0.4186, + "step": 364 + }, + { + "epoch": 0.010071153388494743, + "grad_norm": 0.00325029413215816, + "learning_rate": 0.001, + "loss": 0.392, + "step": 365 + }, + { + "epoch": 0.01009874558955911, + "grad_norm": 0.002298851264640689, + "learning_rate": 0.001, + "loss": 0.4364, + "step": 366 + }, + { + "epoch": 0.01012633779062348, + "grad_norm": 0.003252133959904313, + "learning_rate": 0.001, + "loss": 0.3851, + "step": 367 + }, + { + "epoch": 0.010153929991687849, + "grad_norm": 0.0026516057550907135, + "learning_rate": 0.001, + "loss": 0.4368, + "step": 368 + }, + { + "epoch": 0.010181522192752219, + "grad_norm": 0.0033587999641895294, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 369 + }, + { + "epoch": 0.010209114393816587, + "grad_norm": 0.0030985455960035324, + "learning_rate": 0.001, + "loss": 0.4134, + "step": 370 + }, + { + "epoch": 0.010236706594880957, + "grad_norm": 0.0028507187962532043, + "learning_rate": 0.001, + "loss": 0.4237, + "step": 371 + }, + { + "epoch": 0.010264298795945327, + "grad_norm": 0.0024174090940505266, + "learning_rate": 0.001, + "loss": 0.4607, + "step": 372 + }, + { + "epoch": 0.010291890997009695, + "grad_norm": 0.003416180843487382, + "learning_rate": 0.001, + "loss": 0.3812, + "step": 373 + }, + { + "epoch": 0.010319483198074065, + "grad_norm": 0.0031029239762574434, + "learning_rate": 0.001, + "loss": 0.402, + "step": 374 + }, + { + "epoch": 0.010347075399138433, + "grad_norm": 0.0023850388824939728, + "learning_rate": 0.001, + "loss": 0.3854, + "step": 375 + }, + { + "epoch": 0.010374667600202803, + "grad_norm": 0.0026809065602719784, + "learning_rate": 0.001, + "loss": 0.4062, + "step": 376 + }, + { + "epoch": 0.010402259801267171, + "grad_norm": 0.004038350190967321, + "learning_rate": 0.001, + "loss": 0.4223, + "step": 377 + }, + { + "epoch": 0.010429852002331541, + "grad_norm": 0.0030194921419024467, + "learning_rate": 0.001, + "loss": 0.4525, + "step": 378 + }, + { + "epoch": 0.010457444203395911, + "grad_norm": 0.0023160662967711687, + "learning_rate": 0.001, + "loss": 0.4256, + "step": 379 + }, + { + "epoch": 0.01048503640446028, + "grad_norm": 0.0031149794813245535, + "learning_rate": 0.001, + "loss": 0.4334, + "step": 380 + }, + { + "epoch": 0.010512628605524649, + "grad_norm": 0.0025531111750751734, + "learning_rate": 0.001, + "loss": 0.4142, + "step": 381 + }, + { + "epoch": 0.010540220806589017, + "grad_norm": 0.003499183803796768, + "learning_rate": 0.001, + "loss": 0.4105, + "step": 382 + }, + { + "epoch": 0.010567813007653387, + "grad_norm": 0.004215524531900883, + "learning_rate": 0.001, + "loss": 0.3859, + "step": 383 + }, + { + "epoch": 0.010595405208717755, + "grad_norm": 0.0024398749228566885, + "learning_rate": 0.001, + "loss": 0.4182, + "step": 384 + }, + { + "epoch": 0.010622997409782125, + "grad_norm": 0.003436741651967168, + "learning_rate": 0.001, + "loss": 0.4264, + "step": 385 + }, + { + "epoch": 0.010650589610846493, + "grad_norm": 0.0023827480617910624, + "learning_rate": 0.001, + "loss": 0.4117, + "step": 386 + }, + { + "epoch": 0.010678181811910863, + "grad_norm": 0.0032302262261509895, + "learning_rate": 0.001, + "loss": 0.399, + "step": 387 + }, + { + "epoch": 0.010705774012975233, + "grad_norm": 0.004642815329134464, + "learning_rate": 0.001, + "loss": 0.341, + "step": 388 + }, + { + "epoch": 0.010733366214039601, + "grad_norm": 0.002242898801341653, + "learning_rate": 0.001, + "loss": 0.4084, + "step": 389 + }, + { + "epoch": 0.010760958415103971, + "grad_norm": 0.002073424868285656, + "learning_rate": 0.001, + "loss": 0.4344, + "step": 390 + }, + { + "epoch": 0.01078855061616834, + "grad_norm": 0.0029485910199582577, + "learning_rate": 0.001, + "loss": 0.3803, + "step": 391 + }, + { + "epoch": 0.01081614281723271, + "grad_norm": 0.003694765968248248, + "learning_rate": 0.001, + "loss": 0.4351, + "step": 392 + }, + { + "epoch": 0.010843735018297078, + "grad_norm": 0.004921768791973591, + "learning_rate": 0.001, + "loss": 0.4089, + "step": 393 + }, + { + "epoch": 0.010871327219361447, + "grad_norm": 0.0033451106864959, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 394 + }, + { + "epoch": 0.010898919420425817, + "grad_norm": 0.003096395405009389, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 395 + }, + { + "epoch": 0.010926511621490186, + "grad_norm": 0.002606458030641079, + "learning_rate": 0.001, + "loss": 0.3887, + "step": 396 + }, + { + "epoch": 0.010954103822554555, + "grad_norm": 0.002552368212491274, + "learning_rate": 0.001, + "loss": 0.436, + "step": 397 + }, + { + "epoch": 0.010981696023618924, + "grad_norm": 0.002896772464737296, + "learning_rate": 0.001, + "loss": 0.3605, + "step": 398 + }, + { + "epoch": 0.011009288224683294, + "grad_norm": 0.004260985646396875, + "learning_rate": 0.001, + "loss": 0.3817, + "step": 399 + }, + { + "epoch": 0.011036880425747662, + "grad_norm": 0.0023302636109292507, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 400 + }, + { + "epoch": 0.011064472626812032, + "grad_norm": 0.0028871933463960886, + "learning_rate": 0.001, + "loss": 0.3944, + "step": 401 + }, + { + "epoch": 0.011092064827876402, + "grad_norm": 0.003007282270118594, + "learning_rate": 0.001, + "loss": 0.4197, + "step": 402 + }, + { + "epoch": 0.01111965702894077, + "grad_norm": 0.0023298682644963264, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 403 + }, + { + "epoch": 0.01114724923000514, + "grad_norm": 0.0032247831113636494, + "learning_rate": 0.001, + "loss": 0.404, + "step": 404 + }, + { + "epoch": 0.011174841431069508, + "grad_norm": 0.002755606546998024, + "learning_rate": 0.001, + "loss": 0.3675, + "step": 405 + }, + { + "epoch": 0.011202433632133878, + "grad_norm": 0.0030489088967442513, + "learning_rate": 0.001, + "loss": 0.3863, + "step": 406 + }, + { + "epoch": 0.011230025833198246, + "grad_norm": 0.0037001632153987885, + "learning_rate": 0.001, + "loss": 0.395, + "step": 407 + }, + { + "epoch": 0.011257618034262616, + "grad_norm": 0.0033608553931117058, + "learning_rate": 0.001, + "loss": 0.3818, + "step": 408 + }, + { + "epoch": 0.011285210235326984, + "grad_norm": 0.00226064445450902, + "learning_rate": 0.001, + "loss": 0.4162, + "step": 409 + }, + { + "epoch": 0.011312802436391354, + "grad_norm": 0.0037739353720098734, + "learning_rate": 0.001, + "loss": 0.3952, + "step": 410 + }, + { + "epoch": 0.011340394637455724, + "grad_norm": 0.0032893354073166847, + "learning_rate": 0.001, + "loss": 0.3916, + "step": 411 + }, + { + "epoch": 0.011367986838520092, + "grad_norm": 0.0036774203181266785, + "learning_rate": 0.001, + "loss": 0.4207, + "step": 412 + }, + { + "epoch": 0.011395579039584462, + "grad_norm": 0.0031027563381940126, + "learning_rate": 0.001, + "loss": 0.4202, + "step": 413 + }, + { + "epoch": 0.01142317124064883, + "grad_norm": 0.006735049653798342, + "learning_rate": 0.001, + "loss": 0.4151, + "step": 414 + }, + { + "epoch": 0.0114507634417132, + "grad_norm": 0.0036674696020781994, + "learning_rate": 0.001, + "loss": 0.3887, + "step": 415 + }, + { + "epoch": 0.011478355642777568, + "grad_norm": 0.0060126762837171555, + "learning_rate": 0.001, + "loss": 0.4291, + "step": 416 + }, + { + "epoch": 0.011505947843841938, + "grad_norm": 0.005611831322312355, + "learning_rate": 0.001, + "loss": 0.3769, + "step": 417 + }, + { + "epoch": 0.011533540044906308, + "grad_norm": 0.004347233567386866, + "learning_rate": 0.001, + "loss": 0.4006, + "step": 418 + }, + { + "epoch": 0.011561132245970676, + "grad_norm": 0.0035771261900663376, + "learning_rate": 0.001, + "loss": 0.4122, + "step": 419 + }, + { + "epoch": 0.011588724447035046, + "grad_norm": 0.002888308372348547, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 420 + }, + { + "epoch": 0.011616316648099414, + "grad_norm": 0.006516223307698965, + "learning_rate": 0.001, + "loss": 0.395, + "step": 421 + }, + { + "epoch": 0.011643908849163784, + "grad_norm": 0.004090446978807449, + "learning_rate": 0.001, + "loss": 0.3573, + "step": 422 + }, + { + "epoch": 0.011671501050228152, + "grad_norm": 0.002728004939854145, + "learning_rate": 0.001, + "loss": 0.4364, + "step": 423 + }, + { + "epoch": 0.011699093251292522, + "grad_norm": 0.002871421165764332, + "learning_rate": 0.001, + "loss": 0.3978, + "step": 424 + }, + { + "epoch": 0.011726685452356892, + "grad_norm": 0.0021404619328677654, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 425 + }, + { + "epoch": 0.01175427765342126, + "grad_norm": 0.0027238999027758837, + "learning_rate": 0.001, + "loss": 0.4189, + "step": 426 + }, + { + "epoch": 0.01178186985448563, + "grad_norm": 0.0025113308802247047, + "learning_rate": 0.001, + "loss": 0.4096, + "step": 427 + }, + { + "epoch": 0.011809462055549998, + "grad_norm": 0.0029517943039536476, + "learning_rate": 0.001, + "loss": 0.415, + "step": 428 + }, + { + "epoch": 0.011837054256614368, + "grad_norm": 0.0023259243462234735, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 429 + }, + { + "epoch": 0.011864646457678736, + "grad_norm": 0.002625027671456337, + "learning_rate": 0.001, + "loss": 0.3656, + "step": 430 + }, + { + "epoch": 0.011892238658743106, + "grad_norm": 0.0032775115687400103, + "learning_rate": 0.001, + "loss": 0.4151, + "step": 431 + }, + { + "epoch": 0.011919830859807476, + "grad_norm": 0.002229101490229368, + "learning_rate": 0.001, + "loss": 0.4495, + "step": 432 + }, + { + "epoch": 0.011947423060871844, + "grad_norm": 0.0024553320836275816, + "learning_rate": 0.001, + "loss": 0.4261, + "step": 433 + }, + { + "epoch": 0.011975015261936214, + "grad_norm": 0.0025780866853892803, + "learning_rate": 0.001, + "loss": 0.4094, + "step": 434 + }, + { + "epoch": 0.012002607463000583, + "grad_norm": 0.003502671839669347, + "learning_rate": 0.001, + "loss": 0.3563, + "step": 435 + }, + { + "epoch": 0.012030199664064952, + "grad_norm": 0.004440659191459417, + "learning_rate": 0.001, + "loss": 0.3973, + "step": 436 + }, + { + "epoch": 0.01205779186512932, + "grad_norm": 0.0051866755820810795, + "learning_rate": 0.001, + "loss": 0.4232, + "step": 437 + }, + { + "epoch": 0.01208538406619369, + "grad_norm": 0.002548003103584051, + "learning_rate": 0.001, + "loss": 0.4387, + "step": 438 + }, + { + "epoch": 0.012112976267258059, + "grad_norm": 0.0030665360391139984, + "learning_rate": 0.001, + "loss": 0.4336, + "step": 439 + }, + { + "epoch": 0.012140568468322429, + "grad_norm": 0.0032852613367140293, + "learning_rate": 0.001, + "loss": 0.4213, + "step": 440 + }, + { + "epoch": 0.012168160669386799, + "grad_norm": 0.0030793712940067053, + "learning_rate": 0.001, + "loss": 0.4191, + "step": 441 + }, + { + "epoch": 0.012195752870451167, + "grad_norm": 0.0028317072428762913, + "learning_rate": 0.001, + "loss": 0.4219, + "step": 442 + }, + { + "epoch": 0.012223345071515537, + "grad_norm": 0.0029394179582595825, + "learning_rate": 0.001, + "loss": 0.373, + "step": 443 + }, + { + "epoch": 0.012250937272579905, + "grad_norm": 0.0032694858964532614, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 444 + }, + { + "epoch": 0.012278529473644275, + "grad_norm": 0.002599178347736597, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 445 + }, + { + "epoch": 0.012306121674708643, + "grad_norm": 0.0053001102060079575, + "learning_rate": 0.001, + "loss": 0.393, + "step": 446 + }, + { + "epoch": 0.012333713875773013, + "grad_norm": 0.0027281581424176693, + "learning_rate": 0.001, + "loss": 0.4207, + "step": 447 + }, + { + "epoch": 0.012361306076837383, + "grad_norm": 0.0040309545584023, + "learning_rate": 0.001, + "loss": 0.3925, + "step": 448 + }, + { + "epoch": 0.01238889827790175, + "grad_norm": 0.0024766335263848305, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 449 + }, + { + "epoch": 0.01241649047896612, + "grad_norm": 0.0033247419632971287, + "learning_rate": 0.001, + "loss": 0.4369, + "step": 450 + }, + { + "epoch": 0.012444082680030489, + "grad_norm": 0.0030413721688091755, + "learning_rate": 0.001, + "loss": 0.4193, + "step": 451 + }, + { + "epoch": 0.012471674881094859, + "grad_norm": 0.0068315728567540646, + "learning_rate": 0.001, + "loss": 0.4207, + "step": 452 + }, + { + "epoch": 0.012499267082159227, + "grad_norm": 0.0032153630163520575, + "learning_rate": 0.001, + "loss": 0.4242, + "step": 453 + }, + { + "epoch": 0.012526859283223597, + "grad_norm": 0.003297319170087576, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 454 + }, + { + "epoch": 0.012554451484287967, + "grad_norm": 0.003063608892261982, + "learning_rate": 0.001, + "loss": 0.44, + "step": 455 + }, + { + "epoch": 0.012582043685352335, + "grad_norm": 0.0030015911906957626, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 456 + }, + { + "epoch": 0.012609635886416705, + "grad_norm": 0.0032717676367610693, + "learning_rate": 0.001, + "loss": 0.3561, + "step": 457 + }, + { + "epoch": 0.012637228087481073, + "grad_norm": 0.0031338001135736704, + "learning_rate": 0.001, + "loss": 0.393, + "step": 458 + }, + { + "epoch": 0.012664820288545443, + "grad_norm": 0.0055600800551474094, + "learning_rate": 0.001, + "loss": 0.4241, + "step": 459 + }, + { + "epoch": 0.012692412489609811, + "grad_norm": 0.009609074331820011, + "learning_rate": 0.001, + "loss": 0.3647, + "step": 460 + }, + { + "epoch": 0.012720004690674181, + "grad_norm": 0.0031838284339755774, + "learning_rate": 0.001, + "loss": 0.4308, + "step": 461 + }, + { + "epoch": 0.01274759689173855, + "grad_norm": 0.002876470098271966, + "learning_rate": 0.001, + "loss": 0.3771, + "step": 462 + }, + { + "epoch": 0.01277518909280292, + "grad_norm": 0.0027010440826416016, + "learning_rate": 0.001, + "loss": 0.4392, + "step": 463 + }, + { + "epoch": 0.012802781293867289, + "grad_norm": 0.0024778309743851423, + "learning_rate": 0.001, + "loss": 0.446, + "step": 464 + }, + { + "epoch": 0.012830373494931657, + "grad_norm": 0.0021515442058444023, + "learning_rate": 0.001, + "loss": 0.4114, + "step": 465 + }, + { + "epoch": 0.012857965695996027, + "grad_norm": 0.0023079351522028446, + "learning_rate": 0.001, + "loss": 0.392, + "step": 466 + }, + { + "epoch": 0.012885557897060395, + "grad_norm": 0.002527826000005007, + "learning_rate": 0.001, + "loss": 0.4094, + "step": 467 + }, + { + "epoch": 0.012913150098124765, + "grad_norm": 0.003819882869720459, + "learning_rate": 0.001, + "loss": 0.3828, + "step": 468 + }, + { + "epoch": 0.012940742299189133, + "grad_norm": 0.002980564022436738, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 469 + }, + { + "epoch": 0.012968334500253503, + "grad_norm": 0.004125640727579594, + "learning_rate": 0.001, + "loss": 0.4114, + "step": 470 + }, + { + "epoch": 0.012995926701317873, + "grad_norm": 0.0025943296495825052, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 471 + }, + { + "epoch": 0.013023518902382241, + "grad_norm": 0.002629917813464999, + "learning_rate": 0.001, + "loss": 0.366, + "step": 472 + }, + { + "epoch": 0.013051111103446611, + "grad_norm": 0.0021362872794270515, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 473 + }, + { + "epoch": 0.01307870330451098, + "grad_norm": 0.002968277782201767, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 474 + }, + { + "epoch": 0.01310629550557535, + "grad_norm": 0.002851092955097556, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 475 + }, + { + "epoch": 0.013133887706639718, + "grad_norm": 0.0023420127108693123, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 476 + }, + { + "epoch": 0.013161479907704087, + "grad_norm": 0.003359900787472725, + "learning_rate": 0.001, + "loss": 0.4202, + "step": 477 + }, + { + "epoch": 0.013189072108768457, + "grad_norm": 0.0038361374754458666, + "learning_rate": 0.001, + "loss": 0.392, + "step": 478 + }, + { + "epoch": 0.013216664309832826, + "grad_norm": 0.003093348117545247, + "learning_rate": 0.001, + "loss": 0.4009, + "step": 479 + }, + { + "epoch": 0.013244256510897195, + "grad_norm": 0.003098709974437952, + "learning_rate": 0.001, + "loss": 0.3942, + "step": 480 + }, + { + "epoch": 0.013271848711961564, + "grad_norm": 0.002880271291360259, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 481 + }, + { + "epoch": 0.013299440913025934, + "grad_norm": 0.005491374060511589, + "learning_rate": 0.001, + "loss": 0.3652, + "step": 482 + }, + { + "epoch": 0.013327033114090302, + "grad_norm": 0.0029838404152542353, + "learning_rate": 0.001, + "loss": 0.4022, + "step": 483 + }, + { + "epoch": 0.013354625315154672, + "grad_norm": 0.00296381744556129, + "learning_rate": 0.001, + "loss": 0.4335, + "step": 484 + }, + { + "epoch": 0.013382217516219042, + "grad_norm": 0.0037699625827372074, + "learning_rate": 0.001, + "loss": 0.3784, + "step": 485 + }, + { + "epoch": 0.01340980971728341, + "grad_norm": 0.0022699085529893637, + "learning_rate": 0.001, + "loss": 0.3953, + "step": 486 + }, + { + "epoch": 0.01343740191834778, + "grad_norm": 0.003976910375058651, + "learning_rate": 0.001, + "loss": 0.4096, + "step": 487 + }, + { + "epoch": 0.013464994119412148, + "grad_norm": 0.003803370287641883, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 488 + }, + { + "epoch": 0.013492586320476518, + "grad_norm": 0.0036107334308326244, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 489 + }, + { + "epoch": 0.013520178521540886, + "grad_norm": 0.0034353930968791246, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 490 + }, + { + "epoch": 0.013547770722605256, + "grad_norm": 0.006415998097509146, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 491 + }, + { + "epoch": 0.013575362923669624, + "grad_norm": 0.0031678853556513786, + "learning_rate": 0.001, + "loss": 0.4182, + "step": 492 + }, + { + "epoch": 0.013602955124733994, + "grad_norm": 0.003964356612414122, + "learning_rate": 0.001, + "loss": 0.4311, + "step": 493 + }, + { + "epoch": 0.013630547325798364, + "grad_norm": 0.004771376959979534, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 494 + }, + { + "epoch": 0.013658139526862732, + "grad_norm": 0.0035825970117002726, + "learning_rate": 0.001, + "loss": 0.3672, + "step": 495 + }, + { + "epoch": 0.013685731727927102, + "grad_norm": 0.0036343352403491735, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 496 + }, + { + "epoch": 0.01371332392899147, + "grad_norm": 0.002659760881215334, + "learning_rate": 0.001, + "loss": 0.4029, + "step": 497 + }, + { + "epoch": 0.01374091613005584, + "grad_norm": 0.003316852729767561, + "learning_rate": 0.001, + "loss": 0.4527, + "step": 498 + }, + { + "epoch": 0.013768508331120208, + "grad_norm": 0.0034902123734354973, + "learning_rate": 0.001, + "loss": 0.4, + "step": 499 + }, + { + "epoch": 0.013796100532184578, + "grad_norm": 0.002378157339990139, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 500 + }, + { + "epoch": 0.013796100532184578, + "eval_runtime": 25.0496, + "eval_samples_per_second": 1.277, + "eval_steps_per_second": 0.16, + "step": 500 + }, + { + "epoch": 0.013823692733248948, + "grad_norm": 0.005439688451588154, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 501 + }, + { + "epoch": 0.013851284934313316, + "grad_norm": 0.0026094792410731316, + "learning_rate": 0.001, + "loss": 0.3904, + "step": 502 + }, + { + "epoch": 0.013878877135377686, + "grad_norm": 0.002377827186137438, + "learning_rate": 0.001, + "loss": 0.4233, + "step": 503 + }, + { + "epoch": 0.013906469336442054, + "grad_norm": 0.004557560198009014, + "learning_rate": 0.001, + "loss": 0.4086, + "step": 504 + }, + { + "epoch": 0.013934061537506424, + "grad_norm": 0.0030535697005689144, + "learning_rate": 0.001, + "loss": 0.388, + "step": 505 + }, + { + "epoch": 0.013961653738570792, + "grad_norm": 0.0032748053781688213, + "learning_rate": 0.001, + "loss": 0.386, + "step": 506 + }, + { + "epoch": 0.013989245939635162, + "grad_norm": 0.003947910387068987, + "learning_rate": 0.001, + "loss": 0.4099, + "step": 507 + }, + { + "epoch": 0.014016838140699532, + "grad_norm": 0.0033470303751528263, + "learning_rate": 0.001, + "loss": 0.4139, + "step": 508 + }, + { + "epoch": 0.0140444303417639, + "grad_norm": 0.011195505037903786, + "learning_rate": 0.001, + "loss": 0.3807, + "step": 509 + }, + { + "epoch": 0.01407202254282827, + "grad_norm": 0.0028728062752634287, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 510 + }, + { + "epoch": 0.014099614743892638, + "grad_norm": 0.0035205583553761244, + "learning_rate": 0.001, + "loss": 0.3804, + "step": 511 + }, + { + "epoch": 0.014127206944957008, + "grad_norm": 0.003692334285005927, + "learning_rate": 0.001, + "loss": 0.3984, + "step": 512 + }, + { + "epoch": 0.014154799146021376, + "grad_norm": 0.004000712651759386, + "learning_rate": 0.001, + "loss": 0.4239, + "step": 513 + }, + { + "epoch": 0.014182391347085746, + "grad_norm": 0.0022262849379330873, + "learning_rate": 0.001, + "loss": 0.3848, + "step": 514 + }, + { + "epoch": 0.014209983548150115, + "grad_norm": 0.005194266326725483, + "learning_rate": 0.001, + "loss": 0.3673, + "step": 515 + }, + { + "epoch": 0.014237575749214484, + "grad_norm": 0.0022841247264295816, + "learning_rate": 0.001, + "loss": 0.4089, + "step": 516 + }, + { + "epoch": 0.014265167950278854, + "grad_norm": 0.003351402934640646, + "learning_rate": 0.001, + "loss": 0.4278, + "step": 517 + }, + { + "epoch": 0.014292760151343223, + "grad_norm": 0.0031243113335222006, + "learning_rate": 0.001, + "loss": 0.3712, + "step": 518 + }, + { + "epoch": 0.014320352352407592, + "grad_norm": 0.003822315251454711, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 519 + }, + { + "epoch": 0.01434794455347196, + "grad_norm": 0.0023426790721714497, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 520 + }, + { + "epoch": 0.01437553675453633, + "grad_norm": 0.0027026657480746508, + "learning_rate": 0.001, + "loss": 0.373, + "step": 521 + }, + { + "epoch": 0.014403128955600699, + "grad_norm": 0.0023081700783222914, + "learning_rate": 0.001, + "loss": 0.4221, + "step": 522 + }, + { + "epoch": 0.014430721156665069, + "grad_norm": 0.003079401096329093, + "learning_rate": 0.001, + "loss": 0.4128, + "step": 523 + }, + { + "epoch": 0.014458313357729439, + "grad_norm": 0.0027522866148501635, + "learning_rate": 0.001, + "loss": 0.397, + "step": 524 + }, + { + "epoch": 0.014485905558793807, + "grad_norm": 0.002710319822654128, + "learning_rate": 0.001, + "loss": 0.4254, + "step": 525 + }, + { + "epoch": 0.014513497759858177, + "grad_norm": 0.002838405082002282, + "learning_rate": 0.001, + "loss": 0.4054, + "step": 526 + }, + { + "epoch": 0.014541089960922545, + "grad_norm": 0.0028795620892196894, + "learning_rate": 0.001, + "loss": 0.4186, + "step": 527 + }, + { + "epoch": 0.014568682161986915, + "grad_norm": 0.002826494164764881, + "learning_rate": 0.001, + "loss": 0.4381, + "step": 528 + }, + { + "epoch": 0.014596274363051283, + "grad_norm": 0.002902933629229665, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 529 + }, + { + "epoch": 0.014623866564115653, + "grad_norm": 0.0034344778396189213, + "learning_rate": 0.001, + "loss": 0.4149, + "step": 530 + }, + { + "epoch": 0.014651458765180023, + "grad_norm": 0.002652675611898303, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 531 + }, + { + "epoch": 0.01467905096624439, + "grad_norm": 0.00368179427459836, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 532 + }, + { + "epoch": 0.01470664316730876, + "grad_norm": 0.0028401080053299665, + "learning_rate": 0.001, + "loss": 0.4247, + "step": 533 + }, + { + "epoch": 0.014734235368373129, + "grad_norm": 0.0047442615032196045, + "learning_rate": 0.001, + "loss": 0.4046, + "step": 534 + }, + { + "epoch": 0.014761827569437499, + "grad_norm": 0.0031780744902789593, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 535 + }, + { + "epoch": 0.014789419770501867, + "grad_norm": 0.004496684763580561, + "learning_rate": 0.001, + "loss": 0.3824, + "step": 536 + }, + { + "epoch": 0.014817011971566237, + "grad_norm": 0.0026886628475040197, + "learning_rate": 0.001, + "loss": 0.4136, + "step": 537 + }, + { + "epoch": 0.014844604172630607, + "grad_norm": 0.004109974484890699, + "learning_rate": 0.001, + "loss": 0.4547, + "step": 538 + }, + { + "epoch": 0.014872196373694975, + "grad_norm": 0.004985136911273003, + "learning_rate": 0.001, + "loss": 0.4171, + "step": 539 + }, + { + "epoch": 0.014899788574759345, + "grad_norm": 0.00309336488135159, + "learning_rate": 0.001, + "loss": 0.4374, + "step": 540 + }, + { + "epoch": 0.014927380775823713, + "grad_norm": 0.0031439203303307295, + "learning_rate": 0.001, + "loss": 0.4415, + "step": 541 + }, + { + "epoch": 0.014954972976888083, + "grad_norm": 0.0027853951323777437, + "learning_rate": 0.001, + "loss": 0.4088, + "step": 542 + }, + { + "epoch": 0.014982565177952451, + "grad_norm": 0.0026657688431441784, + "learning_rate": 0.001, + "loss": 0.4178, + "step": 543 + }, + { + "epoch": 0.015010157379016821, + "grad_norm": 0.01247773040086031, + "learning_rate": 0.001, + "loss": 0.3682, + "step": 544 + }, + { + "epoch": 0.01503774958008119, + "grad_norm": 0.002995160175487399, + "learning_rate": 0.001, + "loss": 0.4147, + "step": 545 + }, + { + "epoch": 0.01506534178114556, + "grad_norm": 0.002868333365768194, + "learning_rate": 0.001, + "loss": 0.435, + "step": 546 + }, + { + "epoch": 0.015092933982209929, + "grad_norm": 0.004190264735370874, + "learning_rate": 0.001, + "loss": 0.3718, + "step": 547 + }, + { + "epoch": 0.015120526183274297, + "grad_norm": 0.003989442717283964, + "learning_rate": 0.001, + "loss": 0.4276, + "step": 548 + }, + { + "epoch": 0.015148118384338667, + "grad_norm": 0.0031920846085995436, + "learning_rate": 0.001, + "loss": 0.428, + "step": 549 + }, + { + "epoch": 0.015175710585403035, + "grad_norm": 0.0037179081700742245, + "learning_rate": 0.001, + "loss": 0.3972, + "step": 550 + }, + { + "epoch": 0.015203302786467405, + "grad_norm": 0.003381606424227357, + "learning_rate": 0.001, + "loss": 0.3809, + "step": 551 + }, + { + "epoch": 0.015230894987531773, + "grad_norm": 0.0029906374402344227, + "learning_rate": 0.001, + "loss": 0.4199, + "step": 552 + }, + { + "epoch": 0.015258487188596143, + "grad_norm": 0.0036886725574731827, + "learning_rate": 0.001, + "loss": 0.4128, + "step": 553 + }, + { + "epoch": 0.015286079389660513, + "grad_norm": 0.0037765917368233204, + "learning_rate": 0.001, + "loss": 0.4126, + "step": 554 + }, + { + "epoch": 0.015313671590724881, + "grad_norm": 0.00531379971653223, + "learning_rate": 0.001, + "loss": 0.4103, + "step": 555 + }, + { + "epoch": 0.015341263791789251, + "grad_norm": 0.003965814132243395, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 556 + }, + { + "epoch": 0.01536885599285362, + "grad_norm": 0.0031218293588608503, + "learning_rate": 0.001, + "loss": 0.4285, + "step": 557 + }, + { + "epoch": 0.01539644819391799, + "grad_norm": 0.0027389719616621733, + "learning_rate": 0.001, + "loss": 0.356, + "step": 558 + }, + { + "epoch": 0.015424040394982358, + "grad_norm": 0.003185281064361334, + "learning_rate": 0.001, + "loss": 0.4471, + "step": 559 + }, + { + "epoch": 0.015451632596046727, + "grad_norm": 0.003925340250134468, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 560 + }, + { + "epoch": 0.015479224797111097, + "grad_norm": 0.0033257382456213236, + "learning_rate": 0.001, + "loss": 0.3707, + "step": 561 + }, + { + "epoch": 0.015506816998175466, + "grad_norm": 0.0033314244356006384, + "learning_rate": 0.001, + "loss": 0.405, + "step": 562 + }, + { + "epoch": 0.015534409199239835, + "grad_norm": 0.0034566351678222418, + "learning_rate": 0.001, + "loss": 0.396, + "step": 563 + }, + { + "epoch": 0.015562001400304204, + "grad_norm": 0.0038912983145564795, + "learning_rate": 0.001, + "loss": 0.3906, + "step": 564 + }, + { + "epoch": 0.015589593601368574, + "grad_norm": 0.004165092017501593, + "learning_rate": 0.001, + "loss": 0.3926, + "step": 565 + }, + { + "epoch": 0.015617185802432942, + "grad_norm": 0.004660595208406448, + "learning_rate": 0.001, + "loss": 0.3888, + "step": 566 + }, + { + "epoch": 0.01564477800349731, + "grad_norm": 0.0033354307524859905, + "learning_rate": 0.001, + "loss": 0.372, + "step": 567 + }, + { + "epoch": 0.01567237020456168, + "grad_norm": 0.002619788981974125, + "learning_rate": 0.001, + "loss": 0.3913, + "step": 568 + }, + { + "epoch": 0.01569996240562605, + "grad_norm": 0.004775674548000097, + "learning_rate": 0.001, + "loss": 0.3792, + "step": 569 + }, + { + "epoch": 0.015727554606690418, + "grad_norm": 0.0030274325981736183, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 570 + }, + { + "epoch": 0.015755146807754788, + "grad_norm": 0.0031505031511187553, + "learning_rate": 0.001, + "loss": 0.4031, + "step": 571 + }, + { + "epoch": 0.015782739008819158, + "grad_norm": 0.005169673822820187, + "learning_rate": 0.001, + "loss": 0.3862, + "step": 572 + }, + { + "epoch": 0.015810331209883528, + "grad_norm": 0.002902763895690441, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 573 + }, + { + "epoch": 0.015837923410947894, + "grad_norm": 0.0037871890235692263, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 574 + }, + { + "epoch": 0.015865515612012264, + "grad_norm": 0.004014831036329269, + "learning_rate": 0.001, + "loss": 0.4278, + "step": 575 + }, + { + "epoch": 0.015893107813076634, + "grad_norm": 0.004346000496298075, + "learning_rate": 0.001, + "loss": 0.3972, + "step": 576 + }, + { + "epoch": 0.015920700014141004, + "grad_norm": 0.0033372112084180117, + "learning_rate": 0.001, + "loss": 0.4178, + "step": 577 + }, + { + "epoch": 0.015948292215205374, + "grad_norm": 0.0031993126031011343, + "learning_rate": 0.001, + "loss": 0.3842, + "step": 578 + }, + { + "epoch": 0.01597588441626974, + "grad_norm": 0.0026338782627135515, + "learning_rate": 0.001, + "loss": 0.422, + "step": 579 + }, + { + "epoch": 0.01600347661733411, + "grad_norm": 0.003543695667758584, + "learning_rate": 0.001, + "loss": 0.385, + "step": 580 + }, + { + "epoch": 0.01603106881839848, + "grad_norm": 0.003536064876243472, + "learning_rate": 0.001, + "loss": 0.3892, + "step": 581 + }, + { + "epoch": 0.01605866101946285, + "grad_norm": 0.0031016895081847906, + "learning_rate": 0.001, + "loss": 0.4282, + "step": 582 + }, + { + "epoch": 0.016086253220527216, + "grad_norm": 0.004082076251506805, + "learning_rate": 0.001, + "loss": 0.3919, + "step": 583 + }, + { + "epoch": 0.016113845421591586, + "grad_norm": 0.003274913877248764, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 584 + }, + { + "epoch": 0.016141437622655956, + "grad_norm": 0.003890890395268798, + "learning_rate": 0.001, + "loss": 0.4164, + "step": 585 + }, + { + "epoch": 0.016169029823720326, + "grad_norm": 0.0035965372808277607, + "learning_rate": 0.001, + "loss": 0.3856, + "step": 586 + }, + { + "epoch": 0.016196622024784696, + "grad_norm": 0.02042904868721962, + "learning_rate": 0.001, + "loss": 0.3851, + "step": 587 + }, + { + "epoch": 0.016224214225849062, + "grad_norm": 0.00438309321179986, + "learning_rate": 0.001, + "loss": 0.363, + "step": 588 + }, + { + "epoch": 0.016251806426913432, + "grad_norm": 0.004080008715391159, + "learning_rate": 0.001, + "loss": 0.4066, + "step": 589 + }, + { + "epoch": 0.016279398627977802, + "grad_norm": 0.004915320780128241, + "learning_rate": 0.001, + "loss": 0.3786, + "step": 590 + }, + { + "epoch": 0.016306990829042172, + "grad_norm": 0.0037770781200379133, + "learning_rate": 0.001, + "loss": 0.451, + "step": 591 + }, + { + "epoch": 0.016334583030106542, + "grad_norm": 0.0039979214780032635, + "learning_rate": 0.001, + "loss": 0.3812, + "step": 592 + }, + { + "epoch": 0.01636217523117091, + "grad_norm": 0.0035175576340407133, + "learning_rate": 0.001, + "loss": 0.3929, + "step": 593 + }, + { + "epoch": 0.01638976743223528, + "grad_norm": 0.003597306553274393, + "learning_rate": 0.001, + "loss": 0.3986, + "step": 594 + }, + { + "epoch": 0.016417359633299648, + "grad_norm": 0.002302913460880518, + "learning_rate": 0.001, + "loss": 0.423, + "step": 595 + }, + { + "epoch": 0.016444951834364018, + "grad_norm": 0.0026150753255933523, + "learning_rate": 0.001, + "loss": 0.3875, + "step": 596 + }, + { + "epoch": 0.016472544035428385, + "grad_norm": 0.002459397306665778, + "learning_rate": 0.001, + "loss": 0.4173, + "step": 597 + }, + { + "epoch": 0.016500136236492755, + "grad_norm": 0.00271104765124619, + "learning_rate": 0.001, + "loss": 0.417, + "step": 598 + }, + { + "epoch": 0.016527728437557124, + "grad_norm": 0.003122537862509489, + "learning_rate": 0.001, + "loss": 0.383, + "step": 599 + }, + { + "epoch": 0.016555320638621494, + "grad_norm": 0.0028217623475939035, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 600 + }, + { + "epoch": 0.016582912839685864, + "grad_norm": 0.0029432664159685373, + "learning_rate": 0.001, + "loss": 0.4022, + "step": 601 + }, + { + "epoch": 0.01661050504075023, + "grad_norm": 0.0030998094007372856, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 602 + }, + { + "epoch": 0.0166380972418146, + "grad_norm": 0.005838985554873943, + "learning_rate": 0.001, + "loss": 0.3998, + "step": 603 + }, + { + "epoch": 0.01666568944287897, + "grad_norm": 0.003850255161523819, + "learning_rate": 0.001, + "loss": 0.3664, + "step": 604 + }, + { + "epoch": 0.01669328164394334, + "grad_norm": 0.0026384994853287935, + "learning_rate": 0.001, + "loss": 0.4035, + "step": 605 + }, + { + "epoch": 0.016720873845007707, + "grad_norm": 0.003034224035218358, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 606 + }, + { + "epoch": 0.016748466046072077, + "grad_norm": 0.003741856198757887, + "learning_rate": 0.001, + "loss": 0.3892, + "step": 607 + }, + { + "epoch": 0.016776058247136447, + "grad_norm": 0.0029668582137674093, + "learning_rate": 0.001, + "loss": 0.4492, + "step": 608 + }, + { + "epoch": 0.016803650448200817, + "grad_norm": 0.006931662559509277, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 609 + }, + { + "epoch": 0.016831242649265186, + "grad_norm": 0.005429959390312433, + "learning_rate": 0.001, + "loss": 0.3945, + "step": 610 + }, + { + "epoch": 0.016858834850329553, + "grad_norm": 0.008053823374211788, + "learning_rate": 0.001, + "loss": 0.3765, + "step": 611 + }, + { + "epoch": 0.016886427051393923, + "grad_norm": 0.0056150988675653934, + "learning_rate": 0.001, + "loss": 0.3773, + "step": 612 + }, + { + "epoch": 0.016914019252458293, + "grad_norm": 0.005078485235571861, + "learning_rate": 0.001, + "loss": 0.3796, + "step": 613 + }, + { + "epoch": 0.016941611453522663, + "grad_norm": 0.0043738181702792645, + "learning_rate": 0.001, + "loss": 0.4368, + "step": 614 + }, + { + "epoch": 0.016969203654587033, + "grad_norm": 0.004265911877155304, + "learning_rate": 0.001, + "loss": 0.4181, + "step": 615 + }, + { + "epoch": 0.0169967958556514, + "grad_norm": 0.004496406763792038, + "learning_rate": 0.001, + "loss": 0.3882, + "step": 616 + }, + { + "epoch": 0.01702438805671577, + "grad_norm": 0.004225629381835461, + "learning_rate": 0.001, + "loss": 0.4094, + "step": 617 + }, + { + "epoch": 0.01705198025778014, + "grad_norm": 0.004265735857188702, + "learning_rate": 0.001, + "loss": 0.3624, + "step": 618 + }, + { + "epoch": 0.01707957245884451, + "grad_norm": 0.0045761000365018845, + "learning_rate": 0.001, + "loss": 0.4298, + "step": 619 + }, + { + "epoch": 0.017107164659908875, + "grad_norm": 0.0046028876677155495, + "learning_rate": 0.001, + "loss": 0.4366, + "step": 620 + }, + { + "epoch": 0.017134756860973245, + "grad_norm": 0.003382893279194832, + "learning_rate": 0.001, + "loss": 0.385, + "step": 621 + }, + { + "epoch": 0.017162349062037615, + "grad_norm": 0.003314296016469598, + "learning_rate": 0.001, + "loss": 0.3811, + "step": 622 + }, + { + "epoch": 0.017189941263101985, + "grad_norm": 0.0035887316334992647, + "learning_rate": 0.001, + "loss": 0.4035, + "step": 623 + }, + { + "epoch": 0.017217533464166355, + "grad_norm": 0.003954799845814705, + "learning_rate": 0.001, + "loss": 0.4236, + "step": 624 + }, + { + "epoch": 0.01724512566523072, + "grad_norm": 0.003979894332587719, + "learning_rate": 0.001, + "loss": 0.3853, + "step": 625 + }, + { + "epoch": 0.01727271786629509, + "grad_norm": 0.0029897873755544424, + "learning_rate": 0.001, + "loss": 0.3907, + "step": 626 + }, + { + "epoch": 0.01730031006735946, + "grad_norm": 0.0038952503819018602, + "learning_rate": 0.001, + "loss": 0.4244, + "step": 627 + }, + { + "epoch": 0.01732790226842383, + "grad_norm": 0.004283791407942772, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 628 + }, + { + "epoch": 0.0173554944694882, + "grad_norm": 0.004268042277544737, + "learning_rate": 0.001, + "loss": 0.4143, + "step": 629 + }, + { + "epoch": 0.017383086670552567, + "grad_norm": 0.0034829305950552225, + "learning_rate": 0.001, + "loss": 0.4026, + "step": 630 + }, + { + "epoch": 0.017410678871616937, + "grad_norm": 0.004891110584139824, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 631 + }, + { + "epoch": 0.017438271072681307, + "grad_norm": 0.009508724324405193, + "learning_rate": 0.001, + "loss": 0.4033, + "step": 632 + }, + { + "epoch": 0.017465863273745677, + "grad_norm": 0.003979192115366459, + "learning_rate": 0.001, + "loss": 0.3876, + "step": 633 + }, + { + "epoch": 0.017493455474810043, + "grad_norm": 0.006190054584294558, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 634 + }, + { + "epoch": 0.017521047675874413, + "grad_norm": 0.003986046649515629, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 635 + }, + { + "epoch": 0.017548639876938783, + "grad_norm": 0.002939463360235095, + "learning_rate": 0.001, + "loss": 0.4167, + "step": 636 + }, + { + "epoch": 0.017576232078003153, + "grad_norm": 0.005479493178427219, + "learning_rate": 0.001, + "loss": 0.3996, + "step": 637 + }, + { + "epoch": 0.017603824279067523, + "grad_norm": 0.00527595542371273, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 638 + }, + { + "epoch": 0.01763141648013189, + "grad_norm": 0.004213378299027681, + "learning_rate": 0.001, + "loss": 0.3912, + "step": 639 + }, + { + "epoch": 0.01765900868119626, + "grad_norm": 0.004745858255773783, + "learning_rate": 0.001, + "loss": 0.4167, + "step": 640 + }, + { + "epoch": 0.01768660088226063, + "grad_norm": 0.0038344142958521843, + "learning_rate": 0.001, + "loss": 0.3954, + "step": 641 + }, + { + "epoch": 0.017714193083325, + "grad_norm": 0.004831280559301376, + "learning_rate": 0.001, + "loss": 0.3904, + "step": 642 + }, + { + "epoch": 0.017741785284389366, + "grad_norm": 0.0035769357345998287, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 643 + }, + { + "epoch": 0.017769377485453736, + "grad_norm": 0.00572391739115119, + "learning_rate": 0.001, + "loss": 0.4168, + "step": 644 + }, + { + "epoch": 0.017796969686518106, + "grad_norm": 0.005403329152613878, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 645 + }, + { + "epoch": 0.017824561887582475, + "grad_norm": 0.003341693663969636, + "learning_rate": 0.001, + "loss": 0.3905, + "step": 646 + }, + { + "epoch": 0.017852154088646845, + "grad_norm": 0.00487833097577095, + "learning_rate": 0.001, + "loss": 0.3853, + "step": 647 + }, + { + "epoch": 0.017879746289711212, + "grad_norm": 0.004141534212976694, + "learning_rate": 0.001, + "loss": 0.3956, + "step": 648 + }, + { + "epoch": 0.01790733849077558, + "grad_norm": 0.004081532824784517, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 649 + }, + { + "epoch": 0.01793493069183995, + "grad_norm": 0.004535014741122723, + "learning_rate": 0.001, + "loss": 0.4088, + "step": 650 + }, + { + "epoch": 0.01796252289290432, + "grad_norm": 0.003384925192221999, + "learning_rate": 0.001, + "loss": 0.3795, + "step": 651 + }, + { + "epoch": 0.01799011509396869, + "grad_norm": 0.003582439851015806, + "learning_rate": 0.001, + "loss": 0.3926, + "step": 652 + }, + { + "epoch": 0.018017707295033058, + "grad_norm": 0.0033751516602933407, + "learning_rate": 0.001, + "loss": 0.3909, + "step": 653 + }, + { + "epoch": 0.018045299496097428, + "grad_norm": 0.004208603408187628, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 654 + }, + { + "epoch": 0.018072891697161798, + "grad_norm": 0.005743230227380991, + "learning_rate": 0.001, + "loss": 0.3685, + "step": 655 + }, + { + "epoch": 0.018100483898226168, + "grad_norm": 0.00536683714017272, + "learning_rate": 0.001, + "loss": 0.3488, + "step": 656 + }, + { + "epoch": 0.018128076099290534, + "grad_norm": 0.003961362410336733, + "learning_rate": 0.001, + "loss": 0.4241, + "step": 657 + }, + { + "epoch": 0.018155668300354904, + "grad_norm": 0.004491451662033796, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 658 + }, + { + "epoch": 0.018183260501419274, + "grad_norm": 0.005195736885070801, + "learning_rate": 0.001, + "loss": 0.406, + "step": 659 + }, + { + "epoch": 0.018210852702483644, + "grad_norm": 0.003914379980415106, + "learning_rate": 0.001, + "loss": 0.3541, + "step": 660 + }, + { + "epoch": 0.018238444903548014, + "grad_norm": 0.00485312519595027, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 661 + }, + { + "epoch": 0.01826603710461238, + "grad_norm": 0.00337631581351161, + "learning_rate": 0.001, + "loss": 0.4469, + "step": 662 + }, + { + "epoch": 0.01829362930567675, + "grad_norm": 0.00616971543058753, + "learning_rate": 0.001, + "loss": 0.3783, + "step": 663 + }, + { + "epoch": 0.01832122150674112, + "grad_norm": 0.0032724293414503336, + "learning_rate": 0.001, + "loss": 0.3717, + "step": 664 + }, + { + "epoch": 0.01834881370780549, + "grad_norm": 0.003196743782609701, + "learning_rate": 0.001, + "loss": 0.39, + "step": 665 + }, + { + "epoch": 0.018376405908869856, + "grad_norm": 0.00492246774956584, + "learning_rate": 0.001, + "loss": 0.4178, + "step": 666 + }, + { + "epoch": 0.018403998109934226, + "grad_norm": 0.0029899398796260357, + "learning_rate": 0.001, + "loss": 0.4323, + "step": 667 + }, + { + "epoch": 0.018431590310998596, + "grad_norm": 0.00527157774195075, + "learning_rate": 0.001, + "loss": 0.4103, + "step": 668 + }, + { + "epoch": 0.018459182512062966, + "grad_norm": 0.002288073068484664, + "learning_rate": 0.001, + "loss": 0.407, + "step": 669 + }, + { + "epoch": 0.018486774713127336, + "grad_norm": 0.0031164512038230896, + "learning_rate": 0.001, + "loss": 0.4287, + "step": 670 + }, + { + "epoch": 0.018514366914191702, + "grad_norm": 0.0031951169949024916, + "learning_rate": 0.001, + "loss": 0.3753, + "step": 671 + }, + { + "epoch": 0.018541959115256072, + "grad_norm": 0.0035726516507565975, + "learning_rate": 0.001, + "loss": 0.4032, + "step": 672 + }, + { + "epoch": 0.018569551316320442, + "grad_norm": 0.0036449262406677008, + "learning_rate": 0.001, + "loss": 0.4154, + "step": 673 + }, + { + "epoch": 0.018597143517384812, + "grad_norm": 0.003955410327762365, + "learning_rate": 0.001, + "loss": 0.3787, + "step": 674 + }, + { + "epoch": 0.018624735718449182, + "grad_norm": 0.0030794877093285322, + "learning_rate": 0.001, + "loss": 0.3982, + "step": 675 + }, + { + "epoch": 0.01865232791951355, + "grad_norm": 0.00874530989676714, + "learning_rate": 0.001, + "loss": 0.3859, + "step": 676 + }, + { + "epoch": 0.01867992012057792, + "grad_norm": 0.0042561995796859264, + "learning_rate": 0.001, + "loss": 0.4258, + "step": 677 + }, + { + "epoch": 0.018707512321642288, + "grad_norm": 0.00404794467613101, + "learning_rate": 0.001, + "loss": 0.4028, + "step": 678 + }, + { + "epoch": 0.018735104522706658, + "grad_norm": 0.004145196173340082, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 679 + }, + { + "epoch": 0.018762696723771025, + "grad_norm": 0.00520210200920701, + "learning_rate": 0.001, + "loss": 0.375, + "step": 680 + }, + { + "epoch": 0.018790288924835395, + "grad_norm": 0.004079956095665693, + "learning_rate": 0.001, + "loss": 0.377, + "step": 681 + }, + { + "epoch": 0.018817881125899764, + "grad_norm": 0.0040541659109294415, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 682 + }, + { + "epoch": 0.018845473326964134, + "grad_norm": 0.004334924276918173, + "learning_rate": 0.001, + "loss": 0.414, + "step": 683 + }, + { + "epoch": 0.018873065528028504, + "grad_norm": 0.0037169347051531076, + "learning_rate": 0.001, + "loss": 0.4317, + "step": 684 + }, + { + "epoch": 0.01890065772909287, + "grad_norm": 0.005052134394645691, + "learning_rate": 0.001, + "loss": 0.3915, + "step": 685 + }, + { + "epoch": 0.01892824993015724, + "grad_norm": 0.003371543250977993, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 686 + }, + { + "epoch": 0.01895584213122161, + "grad_norm": 0.002939490368589759, + "learning_rate": 0.001, + "loss": 0.3933, + "step": 687 + }, + { + "epoch": 0.01898343433228598, + "grad_norm": 0.003637490328401327, + "learning_rate": 0.001, + "loss": 0.3654, + "step": 688 + }, + { + "epoch": 0.019011026533350347, + "grad_norm": 0.003198450431227684, + "learning_rate": 0.001, + "loss": 0.4128, + "step": 689 + }, + { + "epoch": 0.019038618734414717, + "grad_norm": 0.004216643515974283, + "learning_rate": 0.001, + "loss": 0.4033, + "step": 690 + }, + { + "epoch": 0.019066210935479087, + "grad_norm": 0.004692459478974342, + "learning_rate": 0.001, + "loss": 0.3592, + "step": 691 + }, + { + "epoch": 0.019093803136543457, + "grad_norm": 0.0043529337272048, + "learning_rate": 0.001, + "loss": 0.4099, + "step": 692 + }, + { + "epoch": 0.019121395337607826, + "grad_norm": 0.004598288331180811, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 693 + }, + { + "epoch": 0.019148987538672193, + "grad_norm": 0.0035506533458828926, + "learning_rate": 0.001, + "loss": 0.3788, + "step": 694 + }, + { + "epoch": 0.019176579739736563, + "grad_norm": 0.0035839497577399015, + "learning_rate": 0.001, + "loss": 0.4209, + "step": 695 + }, + { + "epoch": 0.019204171940800933, + "grad_norm": 0.005143923219293356, + "learning_rate": 0.001, + "loss": 0.4011, + "step": 696 + }, + { + "epoch": 0.019231764141865303, + "grad_norm": 0.003638223512098193, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 697 + }, + { + "epoch": 0.019259356342929673, + "grad_norm": 0.003875188762322068, + "learning_rate": 0.001, + "loss": 0.3868, + "step": 698 + }, + { + "epoch": 0.01928694854399404, + "grad_norm": 0.004542836919426918, + "learning_rate": 0.001, + "loss": 0.4026, + "step": 699 + }, + { + "epoch": 0.01931454074505841, + "grad_norm": 0.00382284470833838, + "learning_rate": 0.001, + "loss": 0.4094, + "step": 700 + }, + { + "epoch": 0.01934213294612278, + "grad_norm": 0.004041844513267279, + "learning_rate": 0.001, + "loss": 0.4014, + "step": 701 + }, + { + "epoch": 0.01936972514718715, + "grad_norm": 0.0031541790813207626, + "learning_rate": 0.001, + "loss": 0.4363, + "step": 702 + }, + { + "epoch": 0.019397317348251515, + "grad_norm": 0.00377883343026042, + "learning_rate": 0.001, + "loss": 0.3953, + "step": 703 + }, + { + "epoch": 0.019424909549315885, + "grad_norm": 0.0043754116632044315, + "learning_rate": 0.001, + "loss": 0.4191, + "step": 704 + }, + { + "epoch": 0.019452501750380255, + "grad_norm": 0.0037639569491147995, + "learning_rate": 0.001, + "loss": 0.399, + "step": 705 + }, + { + "epoch": 0.019480093951444625, + "grad_norm": 0.003134689759463072, + "learning_rate": 0.001, + "loss": 0.3875, + "step": 706 + }, + { + "epoch": 0.019507686152508995, + "grad_norm": 0.0035348469391465187, + "learning_rate": 0.001, + "loss": 0.3759, + "step": 707 + }, + { + "epoch": 0.01953527835357336, + "grad_norm": 0.0043303752318024635, + "learning_rate": 0.001, + "loss": 0.4274, + "step": 708 + }, + { + "epoch": 0.01956287055463773, + "grad_norm": 0.004089695867151022, + "learning_rate": 0.001, + "loss": 0.4217, + "step": 709 + }, + { + "epoch": 0.0195904627557021, + "grad_norm": 0.004323967732489109, + "learning_rate": 0.001, + "loss": 0.4269, + "step": 710 + }, + { + "epoch": 0.01961805495676647, + "grad_norm": 0.003221880178898573, + "learning_rate": 0.001, + "loss": 0.4217, + "step": 711 + }, + { + "epoch": 0.019645647157830837, + "grad_norm": 0.004038861952722073, + "learning_rate": 0.001, + "loss": 0.3987, + "step": 712 + }, + { + "epoch": 0.019673239358895207, + "grad_norm": 0.00399352191016078, + "learning_rate": 0.001, + "loss": 0.4151, + "step": 713 + }, + { + "epoch": 0.019700831559959577, + "grad_norm": 0.003673392115160823, + "learning_rate": 0.001, + "loss": 0.408, + "step": 714 + }, + { + "epoch": 0.019728423761023947, + "grad_norm": 0.004011491779237986, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 715 + }, + { + "epoch": 0.019756015962088317, + "grad_norm": 0.005186786409467459, + "learning_rate": 0.001, + "loss": 0.4198, + "step": 716 + }, + { + "epoch": 0.019783608163152683, + "grad_norm": 0.005963923409581184, + "learning_rate": 0.001, + "loss": 0.3882, + "step": 717 + }, + { + "epoch": 0.019811200364217053, + "grad_norm": 0.004075867123901844, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 718 + }, + { + "epoch": 0.019838792565281423, + "grad_norm": 0.0034671409521251917, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 719 + }, + { + "epoch": 0.019866384766345793, + "grad_norm": 0.004385409411042929, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 720 + }, + { + "epoch": 0.019893976967410163, + "grad_norm": 0.0036052153445780277, + "learning_rate": 0.001, + "loss": 0.4123, + "step": 721 + }, + { + "epoch": 0.01992156916847453, + "grad_norm": 0.0038659870624542236, + "learning_rate": 0.001, + "loss": 0.415, + "step": 722 + }, + { + "epoch": 0.0199491613695389, + "grad_norm": 0.006984284613281488, + "learning_rate": 0.001, + "loss": 0.3863, + "step": 723 + }, + { + "epoch": 0.01997675357060327, + "grad_norm": 0.00536124873906374, + "learning_rate": 0.001, + "loss": 0.4265, + "step": 724 + }, + { + "epoch": 0.02000434577166764, + "grad_norm": 0.005374076310545206, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 725 + }, + { + "epoch": 0.020031937972732006, + "grad_norm": 0.004354424774646759, + "learning_rate": 0.001, + "loss": 0.4103, + "step": 726 + }, + { + "epoch": 0.020059530173796376, + "grad_norm": 0.004792310297489166, + "learning_rate": 0.001, + "loss": 0.4311, + "step": 727 + }, + { + "epoch": 0.020087122374860746, + "grad_norm": 0.004100026097148657, + "learning_rate": 0.001, + "loss": 0.384, + "step": 728 + }, + { + "epoch": 0.020114714575925115, + "grad_norm": 0.00429933238774538, + "learning_rate": 0.001, + "loss": 0.4231, + "step": 729 + }, + { + "epoch": 0.020142306776989485, + "grad_norm": 0.003623842727392912, + "learning_rate": 0.001, + "loss": 0.3872, + "step": 730 + }, + { + "epoch": 0.020169898978053852, + "grad_norm": 0.003992657642811537, + "learning_rate": 0.001, + "loss": 0.4407, + "step": 731 + }, + { + "epoch": 0.02019749117911822, + "grad_norm": 0.004881622269749641, + "learning_rate": 0.001, + "loss": 0.3723, + "step": 732 + }, + { + "epoch": 0.02022508338018259, + "grad_norm": 0.004593881778419018, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 733 + }, + { + "epoch": 0.02025267558124696, + "grad_norm": 0.00592020945623517, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 734 + }, + { + "epoch": 0.02028026778231133, + "grad_norm": 0.004086300730705261, + "learning_rate": 0.001, + "loss": 0.365, + "step": 735 + }, + { + "epoch": 0.020307859983375698, + "grad_norm": 0.008120741695165634, + "learning_rate": 0.001, + "loss": 0.4389, + "step": 736 + }, + { + "epoch": 0.020335452184440068, + "grad_norm": 0.003741480875760317, + "learning_rate": 0.001, + "loss": 0.4148, + "step": 737 + }, + { + "epoch": 0.020363044385504438, + "grad_norm": 0.00764830969274044, + "learning_rate": 0.001, + "loss": 0.3817, + "step": 738 + }, + { + "epoch": 0.020390636586568808, + "grad_norm": 0.0035037719644606113, + "learning_rate": 0.001, + "loss": 0.4163, + "step": 739 + }, + { + "epoch": 0.020418228787633174, + "grad_norm": 0.00437674205750227, + "learning_rate": 0.001, + "loss": 0.4059, + "step": 740 + }, + { + "epoch": 0.020445820988697544, + "grad_norm": 0.004506263881921768, + "learning_rate": 0.001, + "loss": 0.4221, + "step": 741 + }, + { + "epoch": 0.020473413189761914, + "grad_norm": 0.004545763600617647, + "learning_rate": 0.001, + "loss": 0.3621, + "step": 742 + }, + { + "epoch": 0.020501005390826284, + "grad_norm": 0.0050647263415157795, + "learning_rate": 0.001, + "loss": 0.3866, + "step": 743 + }, + { + "epoch": 0.020528597591890654, + "grad_norm": 0.006431067828088999, + "learning_rate": 0.001, + "loss": 0.3693, + "step": 744 + }, + { + "epoch": 0.02055618979295502, + "grad_norm": 0.00419000955298543, + "learning_rate": 0.001, + "loss": 0.4093, + "step": 745 + }, + { + "epoch": 0.02058378199401939, + "grad_norm": 0.005267234519124031, + "learning_rate": 0.001, + "loss": 0.3822, + "step": 746 + }, + { + "epoch": 0.02061137419508376, + "grad_norm": 0.0034249969758093357, + "learning_rate": 0.001, + "loss": 0.4225, + "step": 747 + }, + { + "epoch": 0.02063896639614813, + "grad_norm": 0.00436822697520256, + "learning_rate": 0.001, + "loss": 0.3655, + "step": 748 + }, + { + "epoch": 0.020666558597212496, + "grad_norm": 0.002724075224250555, + "learning_rate": 0.001, + "loss": 0.4055, + "step": 749 + }, + { + "epoch": 0.020694150798276866, + "grad_norm": 0.0030281811486929655, + "learning_rate": 0.001, + "loss": 0.3682, + "step": 750 + }, + { + "epoch": 0.020721742999341236, + "grad_norm": 0.004672218579798937, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 751 + }, + { + "epoch": 0.020749335200405606, + "grad_norm": 0.004708775784820318, + "learning_rate": 0.001, + "loss": 0.3751, + "step": 752 + }, + { + "epoch": 0.020776927401469976, + "grad_norm": 0.009867183864116669, + "learning_rate": 0.001, + "loss": 0.3943, + "step": 753 + }, + { + "epoch": 0.020804519602534342, + "grad_norm": 0.003236403688788414, + "learning_rate": 0.001, + "loss": 0.3786, + "step": 754 + }, + { + "epoch": 0.020832111803598712, + "grad_norm": 0.003183891996741295, + "learning_rate": 0.001, + "loss": 0.3921, + "step": 755 + }, + { + "epoch": 0.020859704004663082, + "grad_norm": 0.0026451335288584232, + "learning_rate": 0.001, + "loss": 0.3793, + "step": 756 + }, + { + "epoch": 0.020887296205727452, + "grad_norm": 0.0030694296583533287, + "learning_rate": 0.001, + "loss": 0.4227, + "step": 757 + }, + { + "epoch": 0.020914888406791822, + "grad_norm": 0.004503907170146704, + "learning_rate": 0.001, + "loss": 0.365, + "step": 758 + }, + { + "epoch": 0.02094248060785619, + "grad_norm": 0.0033429849427193403, + "learning_rate": 0.001, + "loss": 0.4116, + "step": 759 + }, + { + "epoch": 0.02097007280892056, + "grad_norm": 0.005442556459456682, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 760 + }, + { + "epoch": 0.020997665009984928, + "grad_norm": 0.0036932167131453753, + "learning_rate": 0.001, + "loss": 0.4113, + "step": 761 + }, + { + "epoch": 0.021025257211049298, + "grad_norm": 0.004506903700530529, + "learning_rate": 0.001, + "loss": 0.4098, + "step": 762 + }, + { + "epoch": 0.021052849412113665, + "grad_norm": 0.003803919767960906, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 763 + }, + { + "epoch": 0.021080441613178034, + "grad_norm": 0.005300893913954496, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 764 + }, + { + "epoch": 0.021108033814242404, + "grad_norm": 0.005235253367573023, + "learning_rate": 0.001, + "loss": 0.4339, + "step": 765 + }, + { + "epoch": 0.021135626015306774, + "grad_norm": 0.00439833290874958, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 766 + }, + { + "epoch": 0.021163218216371144, + "grad_norm": 0.003949869889765978, + "learning_rate": 0.001, + "loss": 0.3549, + "step": 767 + }, + { + "epoch": 0.02119081041743551, + "grad_norm": 0.0055135395377874374, + "learning_rate": 0.001, + "loss": 0.3876, + "step": 768 + }, + { + "epoch": 0.02121840261849988, + "grad_norm": 0.004492396488785744, + "learning_rate": 0.001, + "loss": 0.3639, + "step": 769 + }, + { + "epoch": 0.02124599481956425, + "grad_norm": 0.0040711937472224236, + "learning_rate": 0.001, + "loss": 0.3678, + "step": 770 + }, + { + "epoch": 0.02127358702062862, + "grad_norm": 0.0038428730331361294, + "learning_rate": 0.001, + "loss": 0.437, + "step": 771 + }, + { + "epoch": 0.021301179221692987, + "grad_norm": 0.003672838443890214, + "learning_rate": 0.001, + "loss": 0.397, + "step": 772 + }, + { + "epoch": 0.021328771422757357, + "grad_norm": 0.003955157473683357, + "learning_rate": 0.001, + "loss": 0.4375, + "step": 773 + }, + { + "epoch": 0.021356363623821727, + "grad_norm": 0.005334306508302689, + "learning_rate": 0.001, + "loss": 0.4313, + "step": 774 + }, + { + "epoch": 0.021383955824886097, + "grad_norm": 0.004772811662405729, + "learning_rate": 0.001, + "loss": 0.4244, + "step": 775 + }, + { + "epoch": 0.021411548025950466, + "grad_norm": 0.005606191698461771, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 776 + }, + { + "epoch": 0.021439140227014833, + "grad_norm": 0.004244519397616386, + "learning_rate": 0.001, + "loss": 0.3942, + "step": 777 + }, + { + "epoch": 0.021466732428079203, + "grad_norm": 0.0028992686420679092, + "learning_rate": 0.001, + "loss": 0.4309, + "step": 778 + }, + { + "epoch": 0.021494324629143573, + "grad_norm": 0.0031759096309542656, + "learning_rate": 0.001, + "loss": 0.437, + "step": 779 + }, + { + "epoch": 0.021521916830207943, + "grad_norm": 0.007329195272177458, + "learning_rate": 0.001, + "loss": 0.3619, + "step": 780 + }, + { + "epoch": 0.021549509031272313, + "grad_norm": 0.006688808090984821, + "learning_rate": 0.001, + "loss": 0.4491, + "step": 781 + }, + { + "epoch": 0.02157710123233668, + "grad_norm": 0.00896657258272171, + "learning_rate": 0.001, + "loss": 0.4072, + "step": 782 + }, + { + "epoch": 0.02160469343340105, + "grad_norm": 0.005213500466197729, + "learning_rate": 0.001, + "loss": 0.4435, + "step": 783 + }, + { + "epoch": 0.02163228563446542, + "grad_norm": 0.0036038621328771114, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 784 + }, + { + "epoch": 0.02165987783552979, + "grad_norm": 0.003994919825345278, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 785 + }, + { + "epoch": 0.021687470036594155, + "grad_norm": 0.004139062017202377, + "learning_rate": 0.001, + "loss": 0.395, + "step": 786 + }, + { + "epoch": 0.021715062237658525, + "grad_norm": 0.008915436454117298, + "learning_rate": 0.001, + "loss": 0.421, + "step": 787 + }, + { + "epoch": 0.021742654438722895, + "grad_norm": 0.003705317387357354, + "learning_rate": 0.001, + "loss": 0.3576, + "step": 788 + }, + { + "epoch": 0.021770246639787265, + "grad_norm": 0.004881418775767088, + "learning_rate": 0.001, + "loss": 0.4031, + "step": 789 + }, + { + "epoch": 0.021797838840851635, + "grad_norm": 0.00573571864515543, + "learning_rate": 0.001, + "loss": 0.4116, + "step": 790 + }, + { + "epoch": 0.021825431041916, + "grad_norm": 0.002695683157071471, + "learning_rate": 0.001, + "loss": 0.4335, + "step": 791 + }, + { + "epoch": 0.02185302324298037, + "grad_norm": 0.003691880265250802, + "learning_rate": 0.001, + "loss": 0.4108, + "step": 792 + }, + { + "epoch": 0.02188061544404474, + "grad_norm": 0.003528386354446411, + "learning_rate": 0.001, + "loss": 0.4062, + "step": 793 + }, + { + "epoch": 0.02190820764510911, + "grad_norm": 0.0033985383342951536, + "learning_rate": 0.001, + "loss": 0.4327, + "step": 794 + }, + { + "epoch": 0.021935799846173477, + "grad_norm": 0.003107238095253706, + "learning_rate": 0.001, + "loss": 0.3966, + "step": 795 + }, + { + "epoch": 0.021963392047237847, + "grad_norm": 0.0034598596394062042, + "learning_rate": 0.001, + "loss": 0.3998, + "step": 796 + }, + { + "epoch": 0.021990984248302217, + "grad_norm": 0.004219945520162582, + "learning_rate": 0.001, + "loss": 0.4331, + "step": 797 + }, + { + "epoch": 0.022018576449366587, + "grad_norm": 0.003744914662092924, + "learning_rate": 0.001, + "loss": 0.4155, + "step": 798 + }, + { + "epoch": 0.022046168650430957, + "grad_norm": 0.004566433373838663, + "learning_rate": 0.001, + "loss": 0.4304, + "step": 799 + }, + { + "epoch": 0.022073760851495323, + "grad_norm": 0.003758675418794155, + "learning_rate": 0.001, + "loss": 0.3851, + "step": 800 + }, + { + "epoch": 0.022101353052559693, + "grad_norm": 0.0030292419251054525, + "learning_rate": 0.001, + "loss": 0.4417, + "step": 801 + }, + { + "epoch": 0.022128945253624063, + "grad_norm": 0.004074092488735914, + "learning_rate": 0.001, + "loss": 0.4053, + "step": 802 + }, + { + "epoch": 0.022156537454688433, + "grad_norm": 0.00471070921048522, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 803 + }, + { + "epoch": 0.022184129655752803, + "grad_norm": 0.007867252454161644, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 804 + }, + { + "epoch": 0.02221172185681717, + "grad_norm": 0.003963668830692768, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 805 + }, + { + "epoch": 0.02223931405788154, + "grad_norm": 0.005508980248123407, + "learning_rate": 0.001, + "loss": 0.395, + "step": 806 + }, + { + "epoch": 0.02226690625894591, + "grad_norm": 0.003475068835541606, + "learning_rate": 0.001, + "loss": 0.434, + "step": 807 + }, + { + "epoch": 0.02229449846001028, + "grad_norm": 0.0032727334182709455, + "learning_rate": 0.001, + "loss": 0.3785, + "step": 808 + }, + { + "epoch": 0.022322090661074646, + "grad_norm": 0.0029324067290872335, + "learning_rate": 0.001, + "loss": 0.4136, + "step": 809 + }, + { + "epoch": 0.022349682862139016, + "grad_norm": 0.005346233956515789, + "learning_rate": 0.001, + "loss": 0.3863, + "step": 810 + }, + { + "epoch": 0.022377275063203386, + "grad_norm": 0.004121492151170969, + "learning_rate": 0.001, + "loss": 0.3875, + "step": 811 + }, + { + "epoch": 0.022404867264267755, + "grad_norm": 0.004381257575005293, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 812 + }, + { + "epoch": 0.022432459465332125, + "grad_norm": 0.005230156239122152, + "learning_rate": 0.001, + "loss": 0.4351, + "step": 813 + }, + { + "epoch": 0.022460051666396492, + "grad_norm": 0.004103715531527996, + "learning_rate": 0.001, + "loss": 0.3795, + "step": 814 + }, + { + "epoch": 0.02248764386746086, + "grad_norm": 0.007542972918599844, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 815 + }, + { + "epoch": 0.02251523606852523, + "grad_norm": 0.003434807062149048, + "learning_rate": 0.001, + "loss": 0.4029, + "step": 816 + }, + { + "epoch": 0.0225428282695896, + "grad_norm": 0.003715425031259656, + "learning_rate": 0.001, + "loss": 0.3636, + "step": 817 + }, + { + "epoch": 0.022570420470653968, + "grad_norm": 0.005767806898802519, + "learning_rate": 0.001, + "loss": 0.433, + "step": 818 + }, + { + "epoch": 0.022598012671718338, + "grad_norm": 0.007371674291789532, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 819 + }, + { + "epoch": 0.022625604872782708, + "grad_norm": 0.0067810118198394775, + "learning_rate": 0.001, + "loss": 0.4248, + "step": 820 + }, + { + "epoch": 0.022653197073847078, + "grad_norm": 0.004905116744339466, + "learning_rate": 0.001, + "loss": 0.3493, + "step": 821 + }, + { + "epoch": 0.022680789274911448, + "grad_norm": 0.0027144188061356544, + "learning_rate": 0.001, + "loss": 0.424, + "step": 822 + }, + { + "epoch": 0.022708381475975814, + "grad_norm": 0.013933762907981873, + "learning_rate": 0.001, + "loss": 0.4002, + "step": 823 + }, + { + "epoch": 0.022735973677040184, + "grad_norm": 0.002978452481329441, + "learning_rate": 0.001, + "loss": 0.4361, + "step": 824 + }, + { + "epoch": 0.022763565878104554, + "grad_norm": 0.0060105458833277225, + "learning_rate": 0.001, + "loss": 0.3998, + "step": 825 + }, + { + "epoch": 0.022791158079168924, + "grad_norm": 0.004003115464001894, + "learning_rate": 0.001, + "loss": 0.4344, + "step": 826 + }, + { + "epoch": 0.022818750280233294, + "grad_norm": 0.008088911883533001, + "learning_rate": 0.001, + "loss": 0.3831, + "step": 827 + }, + { + "epoch": 0.02284634248129766, + "grad_norm": 0.005061788484454155, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 828 + }, + { + "epoch": 0.02287393468236203, + "grad_norm": 0.00426626717671752, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 829 + }, + { + "epoch": 0.0229015268834264, + "grad_norm": 0.0029426752589643, + "learning_rate": 0.001, + "loss": 0.3841, + "step": 830 + }, + { + "epoch": 0.02292911908449077, + "grad_norm": 0.005651662591844797, + "learning_rate": 0.001, + "loss": 0.4277, + "step": 831 + }, + { + "epoch": 0.022956711285555136, + "grad_norm": 0.0026404529344290495, + "learning_rate": 0.001, + "loss": 0.4194, + "step": 832 + }, + { + "epoch": 0.022984303486619506, + "grad_norm": 0.0029586381278932095, + "learning_rate": 0.001, + "loss": 0.3983, + "step": 833 + }, + { + "epoch": 0.023011895687683876, + "grad_norm": 0.0026806823443621397, + "learning_rate": 0.001, + "loss": 0.3844, + "step": 834 + }, + { + "epoch": 0.023039487888748246, + "grad_norm": 0.004135414958000183, + "learning_rate": 0.001, + "loss": 0.4138, + "step": 835 + }, + { + "epoch": 0.023067080089812616, + "grad_norm": 0.0023586770985275507, + "learning_rate": 0.001, + "loss": 0.4143, + "step": 836 + }, + { + "epoch": 0.023094672290876982, + "grad_norm": 0.004498126916587353, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 837 + }, + { + "epoch": 0.023122264491941352, + "grad_norm": 0.003083957824856043, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 838 + }, + { + "epoch": 0.023149856693005722, + "grad_norm": 0.002510181860998273, + "learning_rate": 0.001, + "loss": 0.3618, + "step": 839 + }, + { + "epoch": 0.023177448894070092, + "grad_norm": 0.002793237566947937, + "learning_rate": 0.001, + "loss": 0.3774, + "step": 840 + }, + { + "epoch": 0.023205041095134462, + "grad_norm": 0.002880933927372098, + "learning_rate": 0.001, + "loss": 0.4165, + "step": 841 + }, + { + "epoch": 0.02323263329619883, + "grad_norm": 0.004375714808702469, + "learning_rate": 0.001, + "loss": 0.4275, + "step": 842 + }, + { + "epoch": 0.0232602254972632, + "grad_norm": 0.0033162750769406557, + "learning_rate": 0.001, + "loss": 0.4052, + "step": 843 + }, + { + "epoch": 0.023287817698327568, + "grad_norm": 0.004250579979270697, + "learning_rate": 0.001, + "loss": 0.4024, + "step": 844 + }, + { + "epoch": 0.023315409899391938, + "grad_norm": 0.0036853316705673933, + "learning_rate": 0.001, + "loss": 0.3553, + "step": 845 + }, + { + "epoch": 0.023343002100456305, + "grad_norm": 0.005420148838311434, + "learning_rate": 0.001, + "loss": 0.3904, + "step": 846 + }, + { + "epoch": 0.023370594301520674, + "grad_norm": 0.002736524445936084, + "learning_rate": 0.001, + "loss": 0.3655, + "step": 847 + }, + { + "epoch": 0.023398186502585044, + "grad_norm": 0.0048865810967981815, + "learning_rate": 0.001, + "loss": 0.3967, + "step": 848 + }, + { + "epoch": 0.023425778703649414, + "grad_norm": 0.0033940684515982866, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 849 + }, + { + "epoch": 0.023453370904713784, + "grad_norm": 0.00402069790288806, + "learning_rate": 0.001, + "loss": 0.3791, + "step": 850 + }, + { + "epoch": 0.02348096310577815, + "grad_norm": 0.004255734849721193, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 851 + }, + { + "epoch": 0.02350855530684252, + "grad_norm": 0.0029521863907575607, + "learning_rate": 0.001, + "loss": 0.3948, + "step": 852 + }, + { + "epoch": 0.02353614750790689, + "grad_norm": 0.007674784865230322, + "learning_rate": 0.001, + "loss": 0.399, + "step": 853 + }, + { + "epoch": 0.02356373970897126, + "grad_norm": 0.003159287618473172, + "learning_rate": 0.001, + "loss": 0.3804, + "step": 854 + }, + { + "epoch": 0.023591331910035627, + "grad_norm": 0.00475959200412035, + "learning_rate": 0.001, + "loss": 0.4162, + "step": 855 + }, + { + "epoch": 0.023618924111099997, + "grad_norm": 0.003215026343241334, + "learning_rate": 0.001, + "loss": 0.4014, + "step": 856 + }, + { + "epoch": 0.023646516312164367, + "grad_norm": 0.0038892014417797327, + "learning_rate": 0.001, + "loss": 0.4367, + "step": 857 + }, + { + "epoch": 0.023674108513228737, + "grad_norm": 0.003636479377746582, + "learning_rate": 0.001, + "loss": 0.408, + "step": 858 + }, + { + "epoch": 0.023701700714293106, + "grad_norm": 0.0037935448344796896, + "learning_rate": 0.001, + "loss": 0.381, + "step": 859 + }, + { + "epoch": 0.023729292915357473, + "grad_norm": 0.004558536224067211, + "learning_rate": 0.001, + "loss": 0.3901, + "step": 860 + }, + { + "epoch": 0.023756885116421843, + "grad_norm": 0.003610727610066533, + "learning_rate": 0.001, + "loss": 0.3836, + "step": 861 + }, + { + "epoch": 0.023784477317486213, + "grad_norm": 0.0035163855645805597, + "learning_rate": 0.001, + "loss": 0.4254, + "step": 862 + }, + { + "epoch": 0.023812069518550583, + "grad_norm": 0.003148287069052458, + "learning_rate": 0.001, + "loss": 0.3916, + "step": 863 + }, + { + "epoch": 0.023839661719614953, + "grad_norm": 0.0030976871494203806, + "learning_rate": 0.001, + "loss": 0.3976, + "step": 864 + }, + { + "epoch": 0.02386725392067932, + "grad_norm": 0.003255886258557439, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 865 + }, + { + "epoch": 0.02389484612174369, + "grad_norm": 0.003951660823076963, + "learning_rate": 0.001, + "loss": 0.4325, + "step": 866 + }, + { + "epoch": 0.02392243832280806, + "grad_norm": 0.0038296151906251907, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 867 + }, + { + "epoch": 0.02395003052387243, + "grad_norm": 0.003523425431922078, + "learning_rate": 0.001, + "loss": 0.3773, + "step": 868 + }, + { + "epoch": 0.023977622724936795, + "grad_norm": 0.003752518445253372, + "learning_rate": 0.001, + "loss": 0.4031, + "step": 869 + }, + { + "epoch": 0.024005214926001165, + "grad_norm": 0.005763133056461811, + "learning_rate": 0.001, + "loss": 0.4345, + "step": 870 + }, + { + "epoch": 0.024032807127065535, + "grad_norm": 0.009727811440825462, + "learning_rate": 0.001, + "loss": 0.3966, + "step": 871 + }, + { + "epoch": 0.024060399328129905, + "grad_norm": 0.006550495512783527, + "learning_rate": 0.001, + "loss": 0.434, + "step": 872 + }, + { + "epoch": 0.024087991529194275, + "grad_norm": 0.003153527621179819, + "learning_rate": 0.001, + "loss": 0.416, + "step": 873 + }, + { + "epoch": 0.02411558373025864, + "grad_norm": 0.004990814719349146, + "learning_rate": 0.001, + "loss": 0.4099, + "step": 874 + }, + { + "epoch": 0.02414317593132301, + "grad_norm": 0.0033743572421371937, + "learning_rate": 0.001, + "loss": 0.4336, + "step": 875 + }, + { + "epoch": 0.02417076813238738, + "grad_norm": 0.004232621286064386, + "learning_rate": 0.001, + "loss": 0.407, + "step": 876 + }, + { + "epoch": 0.02419836033345175, + "grad_norm": 0.0045753102749586105, + "learning_rate": 0.001, + "loss": 0.3933, + "step": 877 + }, + { + "epoch": 0.024225952534516117, + "grad_norm": 0.005658434238284826, + "learning_rate": 0.001, + "loss": 0.4274, + "step": 878 + }, + { + "epoch": 0.024253544735580487, + "grad_norm": 0.003705628216266632, + "learning_rate": 0.001, + "loss": 0.4084, + "step": 879 + }, + { + "epoch": 0.024281136936644857, + "grad_norm": 0.004948351066559553, + "learning_rate": 0.001, + "loss": 0.4735, + "step": 880 + }, + { + "epoch": 0.024308729137709227, + "grad_norm": 0.003841443918645382, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 881 + }, + { + "epoch": 0.024336321338773597, + "grad_norm": 0.006151879671961069, + "learning_rate": 0.001, + "loss": 0.3591, + "step": 882 + }, + { + "epoch": 0.024363913539837963, + "grad_norm": 0.004619830287992954, + "learning_rate": 0.001, + "loss": 0.3906, + "step": 883 + }, + { + "epoch": 0.024391505740902333, + "grad_norm": 0.004188140854239464, + "learning_rate": 0.001, + "loss": 0.3842, + "step": 884 + }, + { + "epoch": 0.024419097941966703, + "grad_norm": 0.004255149979144335, + "learning_rate": 0.001, + "loss": 0.3664, + "step": 885 + }, + { + "epoch": 0.024446690143031073, + "grad_norm": 0.00479822838678956, + "learning_rate": 0.001, + "loss": 0.3781, + "step": 886 + }, + { + "epoch": 0.024474282344095443, + "grad_norm": 0.004328257869929075, + "learning_rate": 0.001, + "loss": 0.4, + "step": 887 + }, + { + "epoch": 0.02450187454515981, + "grad_norm": 0.0029929610900580883, + "learning_rate": 0.001, + "loss": 0.4315, + "step": 888 + }, + { + "epoch": 0.02452946674622418, + "grad_norm": 0.003600528696551919, + "learning_rate": 0.001, + "loss": 0.393, + "step": 889 + }, + { + "epoch": 0.02455705894728855, + "grad_norm": 0.0042820703238248825, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 890 + }, + { + "epoch": 0.02458465114835292, + "grad_norm": 0.004282411653548479, + "learning_rate": 0.001, + "loss": 0.4112, + "step": 891 + }, + { + "epoch": 0.024612243349417286, + "grad_norm": 0.0038429568521678448, + "learning_rate": 0.001, + "loss": 0.416, + "step": 892 + }, + { + "epoch": 0.024639835550481656, + "grad_norm": 0.0035555907525122166, + "learning_rate": 0.001, + "loss": 0.4325, + "step": 893 + }, + { + "epoch": 0.024667427751546026, + "grad_norm": 0.0035739324521273375, + "learning_rate": 0.001, + "loss": 0.4009, + "step": 894 + }, + { + "epoch": 0.024695019952610395, + "grad_norm": 0.004270533565431833, + "learning_rate": 0.001, + "loss": 0.4254, + "step": 895 + }, + { + "epoch": 0.024722612153674765, + "grad_norm": 0.003811136120930314, + "learning_rate": 0.001, + "loss": 0.3913, + "step": 896 + }, + { + "epoch": 0.024750204354739132, + "grad_norm": 0.004315483383834362, + "learning_rate": 0.001, + "loss": 0.3805, + "step": 897 + }, + { + "epoch": 0.0247777965558035, + "grad_norm": 0.003949417732656002, + "learning_rate": 0.001, + "loss": 0.403, + "step": 898 + }, + { + "epoch": 0.02480538875686787, + "grad_norm": 0.004281886387616396, + "learning_rate": 0.001, + "loss": 0.4046, + "step": 899 + }, + { + "epoch": 0.02483298095793224, + "grad_norm": 0.004681427031755447, + "learning_rate": 0.001, + "loss": 0.3914, + "step": 900 + }, + { + "epoch": 0.024860573158996608, + "grad_norm": 0.008973667398095131, + "learning_rate": 0.001, + "loss": 0.4357, + "step": 901 + }, + { + "epoch": 0.024888165360060978, + "grad_norm": 0.004130321089178324, + "learning_rate": 0.001, + "loss": 0.4063, + "step": 902 + }, + { + "epoch": 0.024915757561125348, + "grad_norm": 0.005490110721439123, + "learning_rate": 0.001, + "loss": 0.3905, + "step": 903 + }, + { + "epoch": 0.024943349762189718, + "grad_norm": 0.007906914688646793, + "learning_rate": 0.001, + "loss": 0.3685, + "step": 904 + }, + { + "epoch": 0.024970941963254088, + "grad_norm": 0.022035721689462662, + "learning_rate": 0.001, + "loss": 0.3882, + "step": 905 + }, + { + "epoch": 0.024998534164318454, + "grad_norm": 0.0064789773896336555, + "learning_rate": 0.001, + "loss": 0.3644, + "step": 906 + }, + { + "epoch": 0.025026126365382824, + "grad_norm": 0.003105413168668747, + "learning_rate": 0.001, + "loss": 0.3742, + "step": 907 + }, + { + "epoch": 0.025053718566447194, + "grad_norm": 0.0031007244251668453, + "learning_rate": 0.001, + "loss": 0.3836, + "step": 908 + }, + { + "epoch": 0.025081310767511564, + "grad_norm": 0.003298420924693346, + "learning_rate": 0.001, + "loss": 0.3963, + "step": 909 + }, + { + "epoch": 0.025108902968575934, + "grad_norm": 0.0028220806270837784, + "learning_rate": 0.001, + "loss": 0.428, + "step": 910 + }, + { + "epoch": 0.0251364951696403, + "grad_norm": 0.0031455964781343937, + "learning_rate": 0.001, + "loss": 0.3877, + "step": 911 + }, + { + "epoch": 0.02516408737070467, + "grad_norm": 0.0031188016291707754, + "learning_rate": 0.001, + "loss": 0.403, + "step": 912 + }, + { + "epoch": 0.02519167957176904, + "grad_norm": 0.0035306380596011877, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 913 + }, + { + "epoch": 0.02521927177283341, + "grad_norm": 0.010045260190963745, + "learning_rate": 0.001, + "loss": 0.3853, + "step": 914 + }, + { + "epoch": 0.025246863973897776, + "grad_norm": 0.002441234653815627, + "learning_rate": 0.001, + "loss": 0.3652, + "step": 915 + }, + { + "epoch": 0.025274456174962146, + "grad_norm": 0.0037013038527220488, + "learning_rate": 0.001, + "loss": 0.3761, + "step": 916 + }, + { + "epoch": 0.025302048376026516, + "grad_norm": 0.004198992624878883, + "learning_rate": 0.001, + "loss": 0.4195, + "step": 917 + }, + { + "epoch": 0.025329640577090886, + "grad_norm": 0.0041293492540717125, + "learning_rate": 0.001, + "loss": 0.3822, + "step": 918 + }, + { + "epoch": 0.025357232778155256, + "grad_norm": 0.0031841343734413385, + "learning_rate": 0.001, + "loss": 0.4145, + "step": 919 + }, + { + "epoch": 0.025384824979219622, + "grad_norm": 0.003239045385271311, + "learning_rate": 0.001, + "loss": 0.3771, + "step": 920 + }, + { + "epoch": 0.025412417180283992, + "grad_norm": 0.004458332899957895, + "learning_rate": 0.001, + "loss": 0.4111, + "step": 921 + }, + { + "epoch": 0.025440009381348362, + "grad_norm": 0.00542887207120657, + "learning_rate": 0.001, + "loss": 0.4112, + "step": 922 + }, + { + "epoch": 0.025467601582412732, + "grad_norm": 0.0039135608822107315, + "learning_rate": 0.001, + "loss": 0.4327, + "step": 923 + }, + { + "epoch": 0.0254951937834771, + "grad_norm": 0.003864714875817299, + "learning_rate": 0.001, + "loss": 0.399, + "step": 924 + }, + { + "epoch": 0.02552278598454147, + "grad_norm": 0.003554902272298932, + "learning_rate": 0.001, + "loss": 0.3696, + "step": 925 + }, + { + "epoch": 0.02555037818560584, + "grad_norm": 0.003268434898927808, + "learning_rate": 0.001, + "loss": 0.437, + "step": 926 + }, + { + "epoch": 0.025577970386670208, + "grad_norm": 0.003844626247882843, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 927 + }, + { + "epoch": 0.025605562587734578, + "grad_norm": 0.0027172230184078217, + "learning_rate": 0.001, + "loss": 0.4315, + "step": 928 + }, + { + "epoch": 0.025633154788798945, + "grad_norm": 0.0031102465000003576, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 929 + }, + { + "epoch": 0.025660746989863314, + "grad_norm": 0.00434950040653348, + "learning_rate": 0.001, + "loss": 0.4048, + "step": 930 + }, + { + "epoch": 0.025688339190927684, + "grad_norm": 0.0034932976122945547, + "learning_rate": 0.001, + "loss": 0.4578, + "step": 931 + }, + { + "epoch": 0.025715931391992054, + "grad_norm": 0.00359813729301095, + "learning_rate": 0.001, + "loss": 0.4268, + "step": 932 + }, + { + "epoch": 0.025743523593056424, + "grad_norm": 0.004753883462399244, + "learning_rate": 0.001, + "loss": 0.3659, + "step": 933 + }, + { + "epoch": 0.02577111579412079, + "grad_norm": 0.0034384452737867832, + "learning_rate": 0.001, + "loss": 0.3892, + "step": 934 + }, + { + "epoch": 0.02579870799518516, + "grad_norm": 0.0031536207534372807, + "learning_rate": 0.001, + "loss": 0.4061, + "step": 935 + }, + { + "epoch": 0.02582630019624953, + "grad_norm": 0.0027636385057121515, + "learning_rate": 0.001, + "loss": 0.4256, + "step": 936 + }, + { + "epoch": 0.0258538923973139, + "grad_norm": 0.002858042251318693, + "learning_rate": 0.001, + "loss": 0.4199, + "step": 937 + }, + { + "epoch": 0.025881484598378267, + "grad_norm": 0.0022810858208686113, + "learning_rate": 0.001, + "loss": 0.3972, + "step": 938 + }, + { + "epoch": 0.025909076799442637, + "grad_norm": 0.004132518544793129, + "learning_rate": 0.001, + "loss": 0.3967, + "step": 939 + }, + { + "epoch": 0.025936669000507007, + "grad_norm": 0.00525138434022665, + "learning_rate": 0.001, + "loss": 0.4237, + "step": 940 + }, + { + "epoch": 0.025964261201571377, + "grad_norm": 0.003772285534068942, + "learning_rate": 0.001, + "loss": 0.4164, + "step": 941 + }, + { + "epoch": 0.025991853402635746, + "grad_norm": 0.0035187045577913523, + "learning_rate": 0.001, + "loss": 0.3605, + "step": 942 + }, + { + "epoch": 0.026019445603700113, + "grad_norm": 0.00876245740801096, + "learning_rate": 0.001, + "loss": 0.3699, + "step": 943 + }, + { + "epoch": 0.026047037804764483, + "grad_norm": 0.002588030882179737, + "learning_rate": 0.001, + "loss": 0.404, + "step": 944 + }, + { + "epoch": 0.026074630005828853, + "grad_norm": 0.003442909335717559, + "learning_rate": 0.001, + "loss": 0.3946, + "step": 945 + }, + { + "epoch": 0.026102222206893223, + "grad_norm": 0.003080985974520445, + "learning_rate": 0.001, + "loss": 0.3941, + "step": 946 + }, + { + "epoch": 0.02612981440795759, + "grad_norm": 0.0026055227499455214, + "learning_rate": 0.001, + "loss": 0.4081, + "step": 947 + }, + { + "epoch": 0.02615740660902196, + "grad_norm": 0.00331856869161129, + "learning_rate": 0.001, + "loss": 0.4073, + "step": 948 + }, + { + "epoch": 0.02618499881008633, + "grad_norm": 0.003704390488564968, + "learning_rate": 0.001, + "loss": 0.409, + "step": 949 + }, + { + "epoch": 0.0262125910111507, + "grad_norm": 0.0028117795009166002, + "learning_rate": 0.001, + "loss": 0.4347, + "step": 950 + }, + { + "epoch": 0.02624018321221507, + "grad_norm": 0.005540241952985525, + "learning_rate": 0.001, + "loss": 0.394, + "step": 951 + }, + { + "epoch": 0.026267775413279435, + "grad_norm": 0.0033867203164845705, + "learning_rate": 0.001, + "loss": 0.3855, + "step": 952 + }, + { + "epoch": 0.026295367614343805, + "grad_norm": 0.005016230046749115, + "learning_rate": 0.001, + "loss": 0.4041, + "step": 953 + }, + { + "epoch": 0.026322959815408175, + "grad_norm": 0.003560574259608984, + "learning_rate": 0.001, + "loss": 0.4006, + "step": 954 + }, + { + "epoch": 0.026350552016472545, + "grad_norm": 0.006354731973260641, + "learning_rate": 0.001, + "loss": 0.3982, + "step": 955 + }, + { + "epoch": 0.026378144217536915, + "grad_norm": 0.0036503588780760765, + "learning_rate": 0.001, + "loss": 0.4549, + "step": 956 + }, + { + "epoch": 0.02640573641860128, + "grad_norm": 0.0061042881570756435, + "learning_rate": 0.001, + "loss": 0.419, + "step": 957 + }, + { + "epoch": 0.02643332861966565, + "grad_norm": 0.0033794385381042957, + "learning_rate": 0.001, + "loss": 0.3847, + "step": 958 + }, + { + "epoch": 0.02646092082073002, + "grad_norm": 0.00359528511762619, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 959 + }, + { + "epoch": 0.02648851302179439, + "grad_norm": 0.0028623088728636503, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 960 + }, + { + "epoch": 0.026516105222858757, + "grad_norm": 0.0029379641637206078, + "learning_rate": 0.001, + "loss": 0.3741, + "step": 961 + }, + { + "epoch": 0.026543697423923127, + "grad_norm": 0.0034645479172468185, + "learning_rate": 0.001, + "loss": 0.3783, + "step": 962 + }, + { + "epoch": 0.026571289624987497, + "grad_norm": 0.0062490347772836685, + "learning_rate": 0.001, + "loss": 0.3959, + "step": 963 + }, + { + "epoch": 0.026598881826051867, + "grad_norm": 0.010679258033633232, + "learning_rate": 0.001, + "loss": 0.362, + "step": 964 + }, + { + "epoch": 0.026626474027116237, + "grad_norm": 0.003056140150874853, + "learning_rate": 0.001, + "loss": 0.3784, + "step": 965 + }, + { + "epoch": 0.026654066228180603, + "grad_norm": 0.0043676625937223434, + "learning_rate": 0.001, + "loss": 0.4242, + "step": 966 + }, + { + "epoch": 0.026681658429244973, + "grad_norm": 0.004542070906609297, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 967 + }, + { + "epoch": 0.026709250630309343, + "grad_norm": 0.003998064436018467, + "learning_rate": 0.001, + "loss": 0.4086, + "step": 968 + }, + { + "epoch": 0.026736842831373713, + "grad_norm": 0.00380288646556437, + "learning_rate": 0.001, + "loss": 0.4274, + "step": 969 + }, + { + "epoch": 0.026764435032438083, + "grad_norm": 0.005197952967137098, + "learning_rate": 0.001, + "loss": 0.37, + "step": 970 + }, + { + "epoch": 0.02679202723350245, + "grad_norm": 0.0045735882595181465, + "learning_rate": 0.001, + "loss": 0.3623, + "step": 971 + }, + { + "epoch": 0.02681961943456682, + "grad_norm": 0.013644758611917496, + "learning_rate": 0.001, + "loss": 0.3709, + "step": 972 + }, + { + "epoch": 0.02684721163563119, + "grad_norm": 0.0032162275165319443, + "learning_rate": 0.001, + "loss": 0.4419, + "step": 973 + }, + { + "epoch": 0.02687480383669556, + "grad_norm": 0.003487576497718692, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 974 + }, + { + "epoch": 0.026902396037759926, + "grad_norm": 0.0028352616354823112, + "learning_rate": 0.001, + "loss": 0.4059, + "step": 975 + }, + { + "epoch": 0.026929988238824296, + "grad_norm": 0.0032060358207672834, + "learning_rate": 0.001, + "loss": 0.4277, + "step": 976 + }, + { + "epoch": 0.026957580439888666, + "grad_norm": 0.004065185319632292, + "learning_rate": 0.001, + "loss": 0.4403, + "step": 977 + }, + { + "epoch": 0.026985172640953035, + "grad_norm": 0.0032863083761185408, + "learning_rate": 0.001, + "loss": 0.4257, + "step": 978 + }, + { + "epoch": 0.027012764842017405, + "grad_norm": 0.0028903197962790728, + "learning_rate": 0.001, + "loss": 0.4371, + "step": 979 + }, + { + "epoch": 0.027040357043081772, + "grad_norm": 0.0034798365086317062, + "learning_rate": 0.001, + "loss": 0.4378, + "step": 980 + }, + { + "epoch": 0.02706794924414614, + "grad_norm": 0.0034869504161179066, + "learning_rate": 0.001, + "loss": 0.451, + "step": 981 + }, + { + "epoch": 0.02709554144521051, + "grad_norm": 0.003647441975772381, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 982 + }, + { + "epoch": 0.02712313364627488, + "grad_norm": 0.003412257879972458, + "learning_rate": 0.001, + "loss": 0.427, + "step": 983 + }, + { + "epoch": 0.027150725847339248, + "grad_norm": 0.0046013942919671535, + "learning_rate": 0.001, + "loss": 0.4154, + "step": 984 + }, + { + "epoch": 0.027178318048403618, + "grad_norm": 0.00263599562458694, + "learning_rate": 0.001, + "loss": 0.438, + "step": 985 + }, + { + "epoch": 0.027205910249467988, + "grad_norm": 0.00280582788400352, + "learning_rate": 0.001, + "loss": 0.3824, + "step": 986 + }, + { + "epoch": 0.027233502450532358, + "grad_norm": 0.004769660532474518, + "learning_rate": 0.001, + "loss": 0.3607, + "step": 987 + }, + { + "epoch": 0.027261094651596728, + "grad_norm": 0.004607087001204491, + "learning_rate": 0.001, + "loss": 0.4218, + "step": 988 + }, + { + "epoch": 0.027288686852661094, + "grad_norm": 0.003465674351900816, + "learning_rate": 0.001, + "loss": 0.4187, + "step": 989 + }, + { + "epoch": 0.027316279053725464, + "grad_norm": 0.003429507138207555, + "learning_rate": 0.001, + "loss": 0.3691, + "step": 990 + }, + { + "epoch": 0.027343871254789834, + "grad_norm": 0.004496248438954353, + "learning_rate": 0.001, + "loss": 0.4095, + "step": 991 + }, + { + "epoch": 0.027371463455854204, + "grad_norm": 0.0036759956274181604, + "learning_rate": 0.001, + "loss": 0.4263, + "step": 992 + }, + { + "epoch": 0.027399055656918574, + "grad_norm": 0.003747584531083703, + "learning_rate": 0.001, + "loss": 0.43, + "step": 993 + }, + { + "epoch": 0.02742664785798294, + "grad_norm": 0.0032122142147272825, + "learning_rate": 0.001, + "loss": 0.4167, + "step": 994 + }, + { + "epoch": 0.02745424005904731, + "grad_norm": 0.0035362818744033575, + "learning_rate": 0.001, + "loss": 0.4097, + "step": 995 + }, + { + "epoch": 0.02748183226011168, + "grad_norm": 0.002750742482021451, + "learning_rate": 0.001, + "loss": 0.4027, + "step": 996 + }, + { + "epoch": 0.02750942446117605, + "grad_norm": 0.004634341225028038, + "learning_rate": 0.001, + "loss": 0.3845, + "step": 997 + }, + { + "epoch": 0.027537016662240416, + "grad_norm": 0.0034089027903974056, + "learning_rate": 0.001, + "loss": 0.4119, + "step": 998 + }, + { + "epoch": 0.027564608863304786, + "grad_norm": 0.008999533019959927, + "learning_rate": 0.001, + "loss": 0.3786, + "step": 999 + }, + { + "epoch": 0.027592201064369156, + "grad_norm": 0.007176951505243778, + "learning_rate": 0.001, + "loss": 0.3667, + "step": 1000 + }, + { + "epoch": 0.027592201064369156, + "eval_runtime": 23.9716, + "eval_samples_per_second": 1.335, + "eval_steps_per_second": 0.167, + "step": 1000 + }, + { + "epoch": 0.027619793265433526, + "grad_norm": 0.0036988353822380304, + "learning_rate": 0.001, + "loss": 0.3749, + "step": 1001 + }, + { + "epoch": 0.027647385466497896, + "grad_norm": 0.0030345297418534756, + "learning_rate": 0.001, + "loss": 0.4101, + "step": 1002 + }, + { + "epoch": 0.027674977667562262, + "grad_norm": 0.003801414743065834, + "learning_rate": 0.001, + "loss": 0.405, + "step": 1003 + }, + { + "epoch": 0.027702569868626632, + "grad_norm": 0.002585576381534338, + "learning_rate": 0.001, + "loss": 0.4069, + "step": 1004 + }, + { + "epoch": 0.027730162069691002, + "grad_norm": 0.002826697425916791, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 1005 + }, + { + "epoch": 0.027757754270755372, + "grad_norm": 0.0023764553479850292, + "learning_rate": 0.001, + "loss": 0.4434, + "step": 1006 + }, + { + "epoch": 0.02778534647181974, + "grad_norm": 0.002361831720918417, + "learning_rate": 0.001, + "loss": 0.4271, + "step": 1007 + }, + { + "epoch": 0.02781293867288411, + "grad_norm": 0.0033377348445355892, + "learning_rate": 0.001, + "loss": 0.4132, + "step": 1008 + }, + { + "epoch": 0.02784053087394848, + "grad_norm": 0.004634097684174776, + "learning_rate": 0.001, + "loss": 0.421, + "step": 1009 + }, + { + "epoch": 0.027868123075012848, + "grad_norm": 0.0033739793580025434, + "learning_rate": 0.001, + "loss": 0.398, + "step": 1010 + }, + { + "epoch": 0.027895715276077218, + "grad_norm": 0.003304282436147332, + "learning_rate": 0.001, + "loss": 0.4162, + "step": 1011 + }, + { + "epoch": 0.027923307477141585, + "grad_norm": 0.007290132809430361, + "learning_rate": 0.001, + "loss": 0.3914, + "step": 1012 + }, + { + "epoch": 0.027950899678205954, + "grad_norm": 0.008183951489627361, + "learning_rate": 0.001, + "loss": 0.39, + "step": 1013 + }, + { + "epoch": 0.027978491879270324, + "grad_norm": 0.012678315863013268, + "learning_rate": 0.001, + "loss": 0.3956, + "step": 1014 + }, + { + "epoch": 0.028006084080334694, + "grad_norm": 0.00686604343354702, + "learning_rate": 0.001, + "loss": 0.4036, + "step": 1015 + }, + { + "epoch": 0.028033676281399064, + "grad_norm": 0.004761406686156988, + "learning_rate": 0.001, + "loss": 0.4203, + "step": 1016 + }, + { + "epoch": 0.02806126848246343, + "grad_norm": 0.004743502475321293, + "learning_rate": 0.001, + "loss": 0.3745, + "step": 1017 + }, + { + "epoch": 0.0280888606835278, + "grad_norm": 0.003992531448602676, + "learning_rate": 0.001, + "loss": 0.3979, + "step": 1018 + }, + { + "epoch": 0.02811645288459217, + "grad_norm": 0.0027658091858029366, + "learning_rate": 0.001, + "loss": 0.4219, + "step": 1019 + }, + { + "epoch": 0.02814404508565654, + "grad_norm": 0.002945561660453677, + "learning_rate": 0.001, + "loss": 0.3572, + "step": 1020 + }, + { + "epoch": 0.028171637286720907, + "grad_norm": 0.004641372710466385, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 1021 + }, + { + "epoch": 0.028199229487785277, + "grad_norm": 0.0027934517711400986, + "learning_rate": 0.001, + "loss": 0.4321, + "step": 1022 + }, + { + "epoch": 0.028226821688849647, + "grad_norm": 0.0028974004089832306, + "learning_rate": 0.001, + "loss": 0.417, + "step": 1023 + }, + { + "epoch": 0.028254413889914017, + "grad_norm": 0.0028739396948367357, + "learning_rate": 0.001, + "loss": 0.3921, + "step": 1024 + }, + { + "epoch": 0.028282006090978386, + "grad_norm": 0.0029768026433885098, + "learning_rate": 0.001, + "loss": 0.4339, + "step": 1025 + }, + { + "epoch": 0.028309598292042753, + "grad_norm": 0.0032322900369763374, + "learning_rate": 0.001, + "loss": 0.4045, + "step": 1026 + }, + { + "epoch": 0.028337190493107123, + "grad_norm": 0.0037773947697132826, + "learning_rate": 0.001, + "loss": 0.3926, + "step": 1027 + }, + { + "epoch": 0.028364782694171493, + "grad_norm": 0.0037146552931517363, + "learning_rate": 0.001, + "loss": 0.3743, + "step": 1028 + }, + { + "epoch": 0.028392374895235863, + "grad_norm": 0.003289048792794347, + "learning_rate": 0.001, + "loss": 0.4035, + "step": 1029 + }, + { + "epoch": 0.02841996709630023, + "grad_norm": 0.003675120184198022, + "learning_rate": 0.001, + "loss": 0.3811, + "step": 1030 + }, + { + "epoch": 0.0284475592973646, + "grad_norm": 0.002748252125456929, + "learning_rate": 0.001, + "loss": 0.457, + "step": 1031 + }, + { + "epoch": 0.02847515149842897, + "grad_norm": 0.0026871277950704098, + "learning_rate": 0.001, + "loss": 0.4112, + "step": 1032 + }, + { + "epoch": 0.02850274369949334, + "grad_norm": 0.0031060713808983564, + "learning_rate": 0.001, + "loss": 0.4268, + "step": 1033 + }, + { + "epoch": 0.02853033590055771, + "grad_norm": 0.0029715097043663263, + "learning_rate": 0.001, + "loss": 0.3883, + "step": 1034 + }, + { + "epoch": 0.028557928101622075, + "grad_norm": 0.004726918879896402, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 1035 + }, + { + "epoch": 0.028585520302686445, + "grad_norm": 0.0036529232747852802, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 1036 + }, + { + "epoch": 0.028613112503750815, + "grad_norm": 0.003458012593910098, + "learning_rate": 0.001, + "loss": 0.3919, + "step": 1037 + }, + { + "epoch": 0.028640704704815185, + "grad_norm": 0.0026079469826072454, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 1038 + }, + { + "epoch": 0.028668296905879555, + "grad_norm": 0.004490693099796772, + "learning_rate": 0.001, + "loss": 0.3769, + "step": 1039 + }, + { + "epoch": 0.02869588910694392, + "grad_norm": 0.0030477193649858236, + "learning_rate": 0.001, + "loss": 0.3832, + "step": 1040 + }, + { + "epoch": 0.02872348130800829, + "grad_norm": 0.0033396866638213396, + "learning_rate": 0.001, + "loss": 0.4095, + "step": 1041 + }, + { + "epoch": 0.02875107350907266, + "grad_norm": 0.0030391959007829428, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 1042 + }, + { + "epoch": 0.02877866571013703, + "grad_norm": 0.0035509227309376, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 1043 + }, + { + "epoch": 0.028806257911201397, + "grad_norm": 0.0039217835292220116, + "learning_rate": 0.001, + "loss": 0.3737, + "step": 1044 + }, + { + "epoch": 0.028833850112265767, + "grad_norm": 0.007786846719682217, + "learning_rate": 0.001, + "loss": 0.3881, + "step": 1045 + }, + { + "epoch": 0.028861442313330137, + "grad_norm": 0.0030108222272247076, + "learning_rate": 0.001, + "loss": 0.4365, + "step": 1046 + }, + { + "epoch": 0.028889034514394507, + "grad_norm": 0.0058325594291090965, + "learning_rate": 0.001, + "loss": 0.4182, + "step": 1047 + }, + { + "epoch": 0.028916626715458877, + "grad_norm": 0.0032810927368700504, + "learning_rate": 0.001, + "loss": 0.38, + "step": 1048 + }, + { + "epoch": 0.028944218916523243, + "grad_norm": 0.0035062895622104406, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 1049 + }, + { + "epoch": 0.028971811117587613, + "grad_norm": 0.003582380712032318, + "learning_rate": 0.001, + "loss": 0.4174, + "step": 1050 + }, + { + "epoch": 0.028999403318651983, + "grad_norm": 0.003514527576044202, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 1051 + }, + { + "epoch": 0.029026995519716353, + "grad_norm": 0.003413026686757803, + "learning_rate": 0.001, + "loss": 0.3962, + "step": 1052 + }, + { + "epoch": 0.02905458772078072, + "grad_norm": 0.0038671360816806555, + "learning_rate": 0.001, + "loss": 0.4344, + "step": 1053 + }, + { + "epoch": 0.02908217992184509, + "grad_norm": 0.004694768693298101, + "learning_rate": 0.001, + "loss": 0.3934, + "step": 1054 + }, + { + "epoch": 0.02910977212290946, + "grad_norm": 0.004935418255627155, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 1055 + }, + { + "epoch": 0.02913736432397383, + "grad_norm": 0.0034861708991229534, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 1056 + }, + { + "epoch": 0.0291649565250382, + "grad_norm": 0.00353567604906857, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 1057 + }, + { + "epoch": 0.029192548726102566, + "grad_norm": 0.0043948497623205185, + "learning_rate": 0.001, + "loss": 0.4041, + "step": 1058 + }, + { + "epoch": 0.029220140927166936, + "grad_norm": 0.00446993438526988, + "learning_rate": 0.001, + "loss": 0.4099, + "step": 1059 + }, + { + "epoch": 0.029247733128231305, + "grad_norm": 0.003901879768818617, + "learning_rate": 0.001, + "loss": 0.4197, + "step": 1060 + }, + { + "epoch": 0.029275325329295675, + "grad_norm": 0.004764191340655088, + "learning_rate": 0.001, + "loss": 0.3927, + "step": 1061 + }, + { + "epoch": 0.029302917530360045, + "grad_norm": 0.0031485301442444324, + "learning_rate": 0.001, + "loss": 0.4176, + "step": 1062 + }, + { + "epoch": 0.029330509731424412, + "grad_norm": 0.004180070944130421, + "learning_rate": 0.001, + "loss": 0.3713, + "step": 1063 + }, + { + "epoch": 0.02935810193248878, + "grad_norm": 0.005659409333020449, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 1064 + }, + { + "epoch": 0.02938569413355315, + "grad_norm": 0.0038438751362264156, + "learning_rate": 0.001, + "loss": 0.4278, + "step": 1065 + }, + { + "epoch": 0.02941328633461752, + "grad_norm": 0.00400773249566555, + "learning_rate": 0.001, + "loss": 0.4241, + "step": 1066 + }, + { + "epoch": 0.029440878535681888, + "grad_norm": 0.005502818617969751, + "learning_rate": 0.001, + "loss": 0.4205, + "step": 1067 + }, + { + "epoch": 0.029468470736746258, + "grad_norm": 0.003952248953282833, + "learning_rate": 0.001, + "loss": 0.4407, + "step": 1068 + }, + { + "epoch": 0.029496062937810628, + "grad_norm": 0.011100285686552525, + "learning_rate": 0.001, + "loss": 0.3699, + "step": 1069 + }, + { + "epoch": 0.029523655138874998, + "grad_norm": 0.00686876242980361, + "learning_rate": 0.001, + "loss": 0.3705, + "step": 1070 + }, + { + "epoch": 0.029551247339939368, + "grad_norm": 0.007713994476944208, + "learning_rate": 0.001, + "loss": 0.3842, + "step": 1071 + }, + { + "epoch": 0.029578839541003734, + "grad_norm": 0.009867096319794655, + "learning_rate": 0.001, + "loss": 0.3383, + "step": 1072 + }, + { + "epoch": 0.029606431742068104, + "grad_norm": 0.004925290122628212, + "learning_rate": 0.001, + "loss": 0.3811, + "step": 1073 + }, + { + "epoch": 0.029634023943132474, + "grad_norm": 0.004095774609595537, + "learning_rate": 0.001, + "loss": 0.3824, + "step": 1074 + }, + { + "epoch": 0.029661616144196844, + "grad_norm": 0.0056510199792683125, + "learning_rate": 0.001, + "loss": 0.386, + "step": 1075 + }, + { + "epoch": 0.029689208345261214, + "grad_norm": 0.003869240405037999, + "learning_rate": 0.001, + "loss": 0.4113, + "step": 1076 + }, + { + "epoch": 0.02971680054632558, + "grad_norm": 0.0034619278740137815, + "learning_rate": 0.001, + "loss": 0.392, + "step": 1077 + }, + { + "epoch": 0.02974439274738995, + "grad_norm": 0.003166050184518099, + "learning_rate": 0.001, + "loss": 0.3854, + "step": 1078 + }, + { + "epoch": 0.02977198494845432, + "grad_norm": 0.004651426337659359, + "learning_rate": 0.001, + "loss": 0.4369, + "step": 1079 + }, + { + "epoch": 0.02979957714951869, + "grad_norm": 0.00306075275875628, + "learning_rate": 0.001, + "loss": 0.3843, + "step": 1080 + }, + { + "epoch": 0.029827169350583056, + "grad_norm": 0.0034789969213306904, + "learning_rate": 0.001, + "loss": 0.4118, + "step": 1081 + }, + { + "epoch": 0.029854761551647426, + "grad_norm": 0.004120729863643646, + "learning_rate": 0.001, + "loss": 0.3697, + "step": 1082 + }, + { + "epoch": 0.029882353752711796, + "grad_norm": 0.005171756725758314, + "learning_rate": 0.001, + "loss": 0.4336, + "step": 1083 + }, + { + "epoch": 0.029909945953776166, + "grad_norm": 0.00312454323284328, + "learning_rate": 0.001, + "loss": 0.4105, + "step": 1084 + }, + { + "epoch": 0.029937538154840536, + "grad_norm": 0.003719372907653451, + "learning_rate": 0.001, + "loss": 0.3919, + "step": 1085 + }, + { + "epoch": 0.029965130355904902, + "grad_norm": 0.003698839107528329, + "learning_rate": 0.001, + "loss": 0.3745, + "step": 1086 + }, + { + "epoch": 0.029992722556969272, + "grad_norm": 0.0035393829457461834, + "learning_rate": 0.001, + "loss": 0.3922, + "step": 1087 + }, + { + "epoch": 0.030020314758033642, + "grad_norm": 0.004070476163178682, + "learning_rate": 0.001, + "loss": 0.3524, + "step": 1088 + }, + { + "epoch": 0.030047906959098012, + "grad_norm": 0.004007712937891483, + "learning_rate": 0.001, + "loss": 0.4154, + "step": 1089 + }, + { + "epoch": 0.03007549916016238, + "grad_norm": 0.0035324685741215944, + "learning_rate": 0.001, + "loss": 0.3845, + "step": 1090 + }, + { + "epoch": 0.03010309136122675, + "grad_norm": 0.007462177891284227, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 1091 + }, + { + "epoch": 0.03013068356229112, + "grad_norm": 0.003764525754377246, + "learning_rate": 0.001, + "loss": 0.4065, + "step": 1092 + }, + { + "epoch": 0.030158275763355488, + "grad_norm": 0.0034473277628421783, + "learning_rate": 0.001, + "loss": 0.4106, + "step": 1093 + }, + { + "epoch": 0.030185867964419858, + "grad_norm": 0.0025921028573065996, + "learning_rate": 0.001, + "loss": 0.4125, + "step": 1094 + }, + { + "epoch": 0.030213460165484225, + "grad_norm": 0.004729332402348518, + "learning_rate": 0.001, + "loss": 0.3841, + "step": 1095 + }, + { + "epoch": 0.030241052366548594, + "grad_norm": 0.0037650500889867544, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 1096 + }, + { + "epoch": 0.030268644567612964, + "grad_norm": 0.0038791224360466003, + "learning_rate": 0.001, + "loss": 0.4233, + "step": 1097 + }, + { + "epoch": 0.030296236768677334, + "grad_norm": 0.004454473964869976, + "learning_rate": 0.001, + "loss": 0.3714, + "step": 1098 + }, + { + "epoch": 0.030323828969741704, + "grad_norm": 0.0029884730465710163, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 1099 + }, + { + "epoch": 0.03035142117080607, + "grad_norm": 0.0049886396154761314, + "learning_rate": 0.001, + "loss": 0.3922, + "step": 1100 + }, + { + "epoch": 0.03037901337187044, + "grad_norm": 0.0025764929596334696, + "learning_rate": 0.001, + "loss": 0.4182, + "step": 1101 + }, + { + "epoch": 0.03040660557293481, + "grad_norm": 0.0029421220533549786, + "learning_rate": 0.001, + "loss": 0.4079, + "step": 1102 + }, + { + "epoch": 0.03043419777399918, + "grad_norm": 0.014296631328761578, + "learning_rate": 0.001, + "loss": 0.3668, + "step": 1103 + }, + { + "epoch": 0.030461789975063547, + "grad_norm": 0.0035509790759533644, + "learning_rate": 0.001, + "loss": 0.3593, + "step": 1104 + }, + { + "epoch": 0.030489382176127917, + "grad_norm": 0.0033370009623467922, + "learning_rate": 0.001, + "loss": 0.4534, + "step": 1105 + }, + { + "epoch": 0.030516974377192287, + "grad_norm": 0.004134595859795809, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 1106 + }, + { + "epoch": 0.030544566578256657, + "grad_norm": 0.005164284259080887, + "learning_rate": 0.001, + "loss": 0.3831, + "step": 1107 + }, + { + "epoch": 0.030572158779321026, + "grad_norm": 0.004688777029514313, + "learning_rate": 0.001, + "loss": 0.3772, + "step": 1108 + }, + { + "epoch": 0.030599750980385393, + "grad_norm": 0.0034766956232488155, + "learning_rate": 0.001, + "loss": 0.4172, + "step": 1109 + }, + { + "epoch": 0.030627343181449763, + "grad_norm": 0.0031806135084480047, + "learning_rate": 0.001, + "loss": 0.3803, + "step": 1110 + }, + { + "epoch": 0.030654935382514133, + "grad_norm": 0.0031535644084215164, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 1111 + }, + { + "epoch": 0.030682527583578503, + "grad_norm": 0.004275255836546421, + "learning_rate": 0.001, + "loss": 0.397, + "step": 1112 + }, + { + "epoch": 0.03071011978464287, + "grad_norm": 0.002956011099740863, + "learning_rate": 0.001, + "loss": 0.415, + "step": 1113 + }, + { + "epoch": 0.03073771198570724, + "grad_norm": 0.002727919491007924, + "learning_rate": 0.001, + "loss": 0.4248, + "step": 1114 + }, + { + "epoch": 0.03076530418677161, + "grad_norm": 0.0037477565929293633, + "learning_rate": 0.001, + "loss": 0.3827, + "step": 1115 + }, + { + "epoch": 0.03079289638783598, + "grad_norm": 0.002299915300682187, + "learning_rate": 0.001, + "loss": 0.4314, + "step": 1116 + }, + { + "epoch": 0.03082048858890035, + "grad_norm": 0.0031318028923124075, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 1117 + }, + { + "epoch": 0.030848080789964715, + "grad_norm": 0.004523344803601503, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 1118 + }, + { + "epoch": 0.030875672991029085, + "grad_norm": 0.0020371493883430958, + "learning_rate": 0.001, + "loss": 0.4182, + "step": 1119 + }, + { + "epoch": 0.030903265192093455, + "grad_norm": 0.0022572767920792103, + "learning_rate": 0.001, + "loss": 0.4066, + "step": 1120 + }, + { + "epoch": 0.030930857393157825, + "grad_norm": 0.0032437213230878115, + "learning_rate": 0.001, + "loss": 0.3895, + "step": 1121 + }, + { + "epoch": 0.030958449594222195, + "grad_norm": 0.004357442259788513, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 1122 + }, + { + "epoch": 0.03098604179528656, + "grad_norm": 0.0027816223446279764, + "learning_rate": 0.001, + "loss": 0.3732, + "step": 1123 + }, + { + "epoch": 0.03101363399635093, + "grad_norm": 0.002367631997913122, + "learning_rate": 0.001, + "loss": 0.4242, + "step": 1124 + }, + { + "epoch": 0.0310412261974153, + "grad_norm": 0.007127678487449884, + "learning_rate": 0.001, + "loss": 0.4038, + "step": 1125 + }, + { + "epoch": 0.03106881839847967, + "grad_norm": 0.003470206633210182, + "learning_rate": 0.001, + "loss": 0.4224, + "step": 1126 + }, + { + "epoch": 0.031096410599544037, + "grad_norm": 0.003238279139623046, + "learning_rate": 0.001, + "loss": 0.4325, + "step": 1127 + }, + { + "epoch": 0.031124002800608407, + "grad_norm": 0.0037913359701633453, + "learning_rate": 0.001, + "loss": 0.4244, + "step": 1128 + }, + { + "epoch": 0.031151595001672777, + "grad_norm": 0.006213251501321793, + "learning_rate": 0.001, + "loss": 0.4206, + "step": 1129 + }, + { + "epoch": 0.031179187202737147, + "grad_norm": 0.004787992220371962, + "learning_rate": 0.001, + "loss": 0.3691, + "step": 1130 + }, + { + "epoch": 0.031206779403801517, + "grad_norm": 0.0029551470652222633, + "learning_rate": 0.001, + "loss": 0.3784, + "step": 1131 + }, + { + "epoch": 0.031234371604865883, + "grad_norm": 0.0033430701587349176, + "learning_rate": 0.001, + "loss": 0.4101, + "step": 1132 + }, + { + "epoch": 0.03126196380593026, + "grad_norm": 0.0027807389851659536, + "learning_rate": 0.001, + "loss": 0.397, + "step": 1133 + }, + { + "epoch": 0.03128955600699462, + "grad_norm": 0.003867300460115075, + "learning_rate": 0.001, + "loss": 0.4063, + "step": 1134 + }, + { + "epoch": 0.03131714820805899, + "grad_norm": 0.0034717791713774204, + "learning_rate": 0.001, + "loss": 0.3979, + "step": 1135 + }, + { + "epoch": 0.03134474040912336, + "grad_norm": 0.0047624786384403706, + "learning_rate": 0.001, + "loss": 0.415, + "step": 1136 + }, + { + "epoch": 0.03137233261018773, + "grad_norm": 0.0029204641468822956, + "learning_rate": 0.001, + "loss": 0.4426, + "step": 1137 + }, + { + "epoch": 0.0313999248112521, + "grad_norm": 0.0029500045347958803, + "learning_rate": 0.001, + "loss": 0.4429, + "step": 1138 + }, + { + "epoch": 0.03142751701231647, + "grad_norm": 0.0030622980557382107, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 1139 + }, + { + "epoch": 0.031455109213380836, + "grad_norm": 0.006081267725676298, + "learning_rate": 0.001, + "loss": 0.4088, + "step": 1140 + }, + { + "epoch": 0.03148270141444521, + "grad_norm": 0.0026580330450087786, + "learning_rate": 0.001, + "loss": 0.4361, + "step": 1141 + }, + { + "epoch": 0.031510293615509576, + "grad_norm": 0.0036745185498148203, + "learning_rate": 0.001, + "loss": 0.3995, + "step": 1142 + }, + { + "epoch": 0.03153788581657394, + "grad_norm": 0.0035352655686438084, + "learning_rate": 0.001, + "loss": 0.3758, + "step": 1143 + }, + { + "epoch": 0.031565478017638315, + "grad_norm": 0.005509037058800459, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 1144 + }, + { + "epoch": 0.03159307021870268, + "grad_norm": 0.0026996792294085026, + "learning_rate": 0.001, + "loss": 0.4276, + "step": 1145 + }, + { + "epoch": 0.031620662419767055, + "grad_norm": 0.0030703977681696415, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 1146 + }, + { + "epoch": 0.03164825462083142, + "grad_norm": 0.004798520356416702, + "learning_rate": 0.001, + "loss": 0.3623, + "step": 1147 + }, + { + "epoch": 0.03167584682189579, + "grad_norm": 0.0030252067372202873, + "learning_rate": 0.001, + "loss": 0.4195, + "step": 1148 + }, + { + "epoch": 0.03170343902296016, + "grad_norm": 0.0031654182821512222, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 1149 + }, + { + "epoch": 0.03173103122402453, + "grad_norm": 0.005452923942357302, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 1150 + }, + { + "epoch": 0.0317586234250889, + "grad_norm": 0.004767664708197117, + "learning_rate": 0.001, + "loss": 0.3967, + "step": 1151 + }, + { + "epoch": 0.03178621562615327, + "grad_norm": 0.0034988594707101583, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 1152 + }, + { + "epoch": 0.031813807827217634, + "grad_norm": 0.0034391777589917183, + "learning_rate": 0.001, + "loss": 0.407, + "step": 1153 + }, + { + "epoch": 0.03184140002828201, + "grad_norm": 0.003413598518818617, + "learning_rate": 0.001, + "loss": 0.4151, + "step": 1154 + }, + { + "epoch": 0.031868992229346374, + "grad_norm": 0.00447838706895709, + "learning_rate": 0.001, + "loss": 0.4024, + "step": 1155 + }, + { + "epoch": 0.03189658443041075, + "grad_norm": 0.002987177576869726, + "learning_rate": 0.001, + "loss": 0.385, + "step": 1156 + }, + { + "epoch": 0.031924176631475114, + "grad_norm": 0.002857605693861842, + "learning_rate": 0.001, + "loss": 0.3813, + "step": 1157 + }, + { + "epoch": 0.03195176883253948, + "grad_norm": 0.0050466121174395084, + "learning_rate": 0.001, + "loss": 0.4231, + "step": 1158 + }, + { + "epoch": 0.031979361033603854, + "grad_norm": 0.003727944800630212, + "learning_rate": 0.001, + "loss": 0.4096, + "step": 1159 + }, + { + "epoch": 0.03200695323466822, + "grad_norm": 0.003613363951444626, + "learning_rate": 0.001, + "loss": 0.408, + "step": 1160 + }, + { + "epoch": 0.03203454543573259, + "grad_norm": 0.003519849618896842, + "learning_rate": 0.001, + "loss": 0.4054, + "step": 1161 + }, + { + "epoch": 0.03206213763679696, + "grad_norm": 0.005940241273492575, + "learning_rate": 0.001, + "loss": 0.423, + "step": 1162 + }, + { + "epoch": 0.032089729837861326, + "grad_norm": 0.0036697194445878267, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 1163 + }, + { + "epoch": 0.0321173220389257, + "grad_norm": 0.0032535314094275236, + "learning_rate": 0.001, + "loss": 0.3744, + "step": 1164 + }, + { + "epoch": 0.032144914239990066, + "grad_norm": 0.0032929456792771816, + "learning_rate": 0.001, + "loss": 0.435, + "step": 1165 + }, + { + "epoch": 0.03217250644105443, + "grad_norm": 0.0031699042301625013, + "learning_rate": 0.001, + "loss": 0.3564, + "step": 1166 + }, + { + "epoch": 0.032200098642118806, + "grad_norm": 0.004080107901245356, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 1167 + }, + { + "epoch": 0.03222769084318317, + "grad_norm": 0.0031074616126716137, + "learning_rate": 0.001, + "loss": 0.4407, + "step": 1168 + }, + { + "epoch": 0.032255283044247546, + "grad_norm": 0.004778381437063217, + "learning_rate": 0.001, + "loss": 0.4701, + "step": 1169 + }, + { + "epoch": 0.03228287524531191, + "grad_norm": 0.004103434272110462, + "learning_rate": 0.001, + "loss": 0.427, + "step": 1170 + }, + { + "epoch": 0.03231046744637628, + "grad_norm": 0.0030324216932058334, + "learning_rate": 0.001, + "loss": 0.4087, + "step": 1171 + }, + { + "epoch": 0.03233805964744065, + "grad_norm": 0.0036339950747787952, + "learning_rate": 0.001, + "loss": 0.3929, + "step": 1172 + }, + { + "epoch": 0.03236565184850502, + "grad_norm": 0.008613920770585537, + "learning_rate": 0.001, + "loss": 0.4442, + "step": 1173 + }, + { + "epoch": 0.03239324404956939, + "grad_norm": 0.014312573708593845, + "learning_rate": 0.001, + "loss": 0.383, + "step": 1174 + }, + { + "epoch": 0.03242083625063376, + "grad_norm": 0.00535425404086709, + "learning_rate": 0.001, + "loss": 0.398, + "step": 1175 + }, + { + "epoch": 0.032448428451698125, + "grad_norm": 0.007150169927626848, + "learning_rate": 0.001, + "loss": 0.3847, + "step": 1176 + }, + { + "epoch": 0.0324760206527625, + "grad_norm": 0.0030312181916087866, + "learning_rate": 0.001, + "loss": 0.4539, + "step": 1177 + }, + { + "epoch": 0.032503612853826865, + "grad_norm": 0.003885834477841854, + "learning_rate": 0.001, + "loss": 0.3887, + "step": 1178 + }, + { + "epoch": 0.03253120505489124, + "grad_norm": 0.0037573217414319515, + "learning_rate": 0.001, + "loss": 0.3754, + "step": 1179 + }, + { + "epoch": 0.032558797255955604, + "grad_norm": 0.015517042018473148, + "learning_rate": 0.001, + "loss": 0.3809, + "step": 1180 + }, + { + "epoch": 0.03258638945701997, + "grad_norm": 0.004123013466596603, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 1181 + }, + { + "epoch": 0.032613981658084344, + "grad_norm": 0.00358961196616292, + "learning_rate": 0.001, + "loss": 0.4251, + "step": 1182 + }, + { + "epoch": 0.03264157385914871, + "grad_norm": 0.002588262315839529, + "learning_rate": 0.001, + "loss": 0.4649, + "step": 1183 + }, + { + "epoch": 0.032669166060213084, + "grad_norm": 0.0033034030348062515, + "learning_rate": 0.001, + "loss": 0.4165, + "step": 1184 + }, + { + "epoch": 0.03269675826127745, + "grad_norm": 0.0026907999999821186, + "learning_rate": 0.001, + "loss": 0.435, + "step": 1185 + }, + { + "epoch": 0.03272435046234182, + "grad_norm": 0.0029784864746034145, + "learning_rate": 0.001, + "loss": 0.4088, + "step": 1186 + }, + { + "epoch": 0.03275194266340619, + "grad_norm": 0.004101334605365992, + "learning_rate": 0.001, + "loss": 0.4322, + "step": 1187 + }, + { + "epoch": 0.03277953486447056, + "grad_norm": 0.008756603114306927, + "learning_rate": 0.001, + "loss": 0.3735, + "step": 1188 + }, + { + "epoch": 0.03280712706553492, + "grad_norm": 0.005191429052501917, + "learning_rate": 0.001, + "loss": 0.4222, + "step": 1189 + }, + { + "epoch": 0.032834719266599297, + "grad_norm": 0.00770029379054904, + "learning_rate": 0.001, + "loss": 0.3631, + "step": 1190 + }, + { + "epoch": 0.03286231146766366, + "grad_norm": 0.005141628440469503, + "learning_rate": 0.001, + "loss": 0.4402, + "step": 1191 + }, + { + "epoch": 0.032889903668728036, + "grad_norm": 0.004422733094543219, + "learning_rate": 0.001, + "loss": 0.3201, + "step": 1192 + }, + { + "epoch": 0.0329174958697924, + "grad_norm": 0.0060617136768996716, + "learning_rate": 0.001, + "loss": 0.4299, + "step": 1193 + }, + { + "epoch": 0.03294508807085677, + "grad_norm": 0.007132702972739935, + "learning_rate": 0.001, + "loss": 0.4178, + "step": 1194 + }, + { + "epoch": 0.03297268027192114, + "grad_norm": 0.006239313166588545, + "learning_rate": 0.001, + "loss": 0.4001, + "step": 1195 + }, + { + "epoch": 0.03300027247298551, + "grad_norm": 0.00840328261256218, + "learning_rate": 0.001, + "loss": 0.4236, + "step": 1196 + }, + { + "epoch": 0.03302786467404988, + "grad_norm": 0.004313162062317133, + "learning_rate": 0.001, + "loss": 0.3821, + "step": 1197 + }, + { + "epoch": 0.03305545687511425, + "grad_norm": 0.004403871949762106, + "learning_rate": 0.001, + "loss": 0.4372, + "step": 1198 + }, + { + "epoch": 0.033083049076178615, + "grad_norm": 0.005073420237749815, + "learning_rate": 0.001, + "loss": 0.3998, + "step": 1199 + }, + { + "epoch": 0.03311064127724299, + "grad_norm": 0.00479225255548954, + "learning_rate": 0.001, + "loss": 0.398, + "step": 1200 + }, + { + "epoch": 0.033138233478307355, + "grad_norm": 0.003564857877790928, + "learning_rate": 0.001, + "loss": 0.4359, + "step": 1201 + }, + { + "epoch": 0.03316582567937173, + "grad_norm": 0.004907668102532625, + "learning_rate": 0.001, + "loss": 0.3734, + "step": 1202 + }, + { + "epoch": 0.033193417880436095, + "grad_norm": 0.003451892174780369, + "learning_rate": 0.001, + "loss": 0.3853, + "step": 1203 + }, + { + "epoch": 0.03322101008150046, + "grad_norm": 0.0029975506477057934, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 1204 + }, + { + "epoch": 0.033248602282564835, + "grad_norm": 0.005138040985912085, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 1205 + }, + { + "epoch": 0.0332761944836292, + "grad_norm": 0.004696134477853775, + "learning_rate": 0.001, + "loss": 0.3863, + "step": 1206 + }, + { + "epoch": 0.033303786684693575, + "grad_norm": 0.005408620461821556, + "learning_rate": 0.001, + "loss": 0.3861, + "step": 1207 + }, + { + "epoch": 0.03333137888575794, + "grad_norm": 0.004990086425095797, + "learning_rate": 0.001, + "loss": 0.408, + "step": 1208 + }, + { + "epoch": 0.03335897108682231, + "grad_norm": 0.003546757623553276, + "learning_rate": 0.001, + "loss": 0.4335, + "step": 1209 + }, + { + "epoch": 0.03338656328788668, + "grad_norm": 0.003440044354647398, + "learning_rate": 0.001, + "loss": 0.4024, + "step": 1210 + }, + { + "epoch": 0.03341415548895105, + "grad_norm": 0.005536787211894989, + "learning_rate": 0.001, + "loss": 0.3986, + "step": 1211 + }, + { + "epoch": 0.033441747690015414, + "grad_norm": 0.005742164328694344, + "learning_rate": 0.001, + "loss": 0.386, + "step": 1212 + }, + { + "epoch": 0.03346933989107979, + "grad_norm": 0.009792082943022251, + "learning_rate": 0.001, + "loss": 0.3656, + "step": 1213 + }, + { + "epoch": 0.033496932092144154, + "grad_norm": 0.004209910985082388, + "learning_rate": 0.001, + "loss": 0.3601, + "step": 1214 + }, + { + "epoch": 0.03352452429320853, + "grad_norm": 0.0035643631126731634, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 1215 + }, + { + "epoch": 0.03355211649427289, + "grad_norm": 0.0032242017332464457, + "learning_rate": 0.001, + "loss": 0.4123, + "step": 1216 + }, + { + "epoch": 0.03357970869533726, + "grad_norm": 0.004148339852690697, + "learning_rate": 0.001, + "loss": 0.4498, + "step": 1217 + }, + { + "epoch": 0.03360730089640163, + "grad_norm": 0.004815380088984966, + "learning_rate": 0.001, + "loss": 0.4287, + "step": 1218 + }, + { + "epoch": 0.033634893097466, + "grad_norm": 0.0036340258084237576, + "learning_rate": 0.001, + "loss": 0.4204, + "step": 1219 + }, + { + "epoch": 0.03366248529853037, + "grad_norm": 0.0031413130927830935, + "learning_rate": 0.001, + "loss": 0.4011, + "step": 1220 + }, + { + "epoch": 0.03369007749959474, + "grad_norm": 0.005375964101403952, + "learning_rate": 0.001, + "loss": 0.3973, + "step": 1221 + }, + { + "epoch": 0.033717669700659106, + "grad_norm": 0.0039574578404426575, + "learning_rate": 0.001, + "loss": 0.3736, + "step": 1222 + }, + { + "epoch": 0.03374526190172348, + "grad_norm": 0.0040151095017790794, + "learning_rate": 0.001, + "loss": 0.4118, + "step": 1223 + }, + { + "epoch": 0.033772854102787846, + "grad_norm": 0.005083186086267233, + "learning_rate": 0.001, + "loss": 0.411, + "step": 1224 + }, + { + "epoch": 0.03380044630385222, + "grad_norm": 0.00605596462264657, + "learning_rate": 0.001, + "loss": 0.3944, + "step": 1225 + }, + { + "epoch": 0.033828038504916585, + "grad_norm": 0.0039448305033147335, + "learning_rate": 0.001, + "loss": 0.4024, + "step": 1226 + }, + { + "epoch": 0.03385563070598095, + "grad_norm": 0.012280398979783058, + "learning_rate": 0.001, + "loss": 0.377, + "step": 1227 + }, + { + "epoch": 0.033883222907045325, + "grad_norm": 0.049376230686903, + "learning_rate": 0.001, + "loss": 0.3919, + "step": 1228 + }, + { + "epoch": 0.03391081510810969, + "grad_norm": 0.0036013920325785875, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 1229 + }, + { + "epoch": 0.033938407309174065, + "grad_norm": 0.0094405896961689, + "learning_rate": 0.001, + "loss": 0.3937, + "step": 1230 + }, + { + "epoch": 0.03396599951023843, + "grad_norm": 0.007406734395772219, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 1231 + }, + { + "epoch": 0.0339935917113028, + "grad_norm": 0.004268816206604242, + "learning_rate": 0.001, + "loss": 0.3914, + "step": 1232 + }, + { + "epoch": 0.03402118391236717, + "grad_norm": 0.003923129290342331, + "learning_rate": 0.001, + "loss": 0.3959, + "step": 1233 + }, + { + "epoch": 0.03404877611343154, + "grad_norm": 0.003082707989960909, + "learning_rate": 0.001, + "loss": 0.3886, + "step": 1234 + }, + { + "epoch": 0.03407636831449591, + "grad_norm": 0.004590165335685015, + "learning_rate": 0.001, + "loss": 0.3789, + "step": 1235 + }, + { + "epoch": 0.03410396051556028, + "grad_norm": 0.003626961726695299, + "learning_rate": 0.001, + "loss": 0.4374, + "step": 1236 + }, + { + "epoch": 0.034131552716624644, + "grad_norm": 0.003703797934576869, + "learning_rate": 0.001, + "loss": 0.382, + "step": 1237 + }, + { + "epoch": 0.03415914491768902, + "grad_norm": 0.005130970384925604, + "learning_rate": 0.001, + "loss": 0.3956, + "step": 1238 + }, + { + "epoch": 0.034186737118753384, + "grad_norm": 0.0035557232331484556, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 1239 + }, + { + "epoch": 0.03421432931981775, + "grad_norm": 0.0043634334579110146, + "learning_rate": 0.001, + "loss": 0.4072, + "step": 1240 + }, + { + "epoch": 0.034241921520882124, + "grad_norm": 0.006564748473465443, + "learning_rate": 0.001, + "loss": 0.3998, + "step": 1241 + }, + { + "epoch": 0.03426951372194649, + "grad_norm": 0.0034478590823709965, + "learning_rate": 0.001, + "loss": 0.3809, + "step": 1242 + }, + { + "epoch": 0.034297105923010864, + "grad_norm": 0.003941735252737999, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 1243 + }, + { + "epoch": 0.03432469812407523, + "grad_norm": 0.004107107874006033, + "learning_rate": 0.001, + "loss": 0.3597, + "step": 1244 + }, + { + "epoch": 0.034352290325139596, + "grad_norm": 0.0032025808468461037, + "learning_rate": 0.001, + "loss": 0.3771, + "step": 1245 + }, + { + "epoch": 0.03437988252620397, + "grad_norm": 0.0033102971501648426, + "learning_rate": 0.001, + "loss": 0.4277, + "step": 1246 + }, + { + "epoch": 0.034407474727268336, + "grad_norm": 0.00529972231015563, + "learning_rate": 0.001, + "loss": 0.4127, + "step": 1247 + }, + { + "epoch": 0.03443506692833271, + "grad_norm": 0.004502330906689167, + "learning_rate": 0.001, + "loss": 0.4212, + "step": 1248 + }, + { + "epoch": 0.034462659129397076, + "grad_norm": 0.0027463252190500498, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 1249 + }, + { + "epoch": 0.03449025133046144, + "grad_norm": 0.0033640682231634855, + "learning_rate": 0.001, + "loss": 0.388, + "step": 1250 + }, + { + "epoch": 0.034517843531525816, + "grad_norm": 0.007285924628376961, + "learning_rate": 0.001, + "loss": 0.413, + "step": 1251 + }, + { + "epoch": 0.03454543573259018, + "grad_norm": 0.004217895213514566, + "learning_rate": 0.001, + "loss": 0.4238, + "step": 1252 + }, + { + "epoch": 0.034573027933654556, + "grad_norm": 0.0027172528207302094, + "learning_rate": 0.001, + "loss": 0.3733, + "step": 1253 + }, + { + "epoch": 0.03460062013471892, + "grad_norm": 0.0033437691163271666, + "learning_rate": 0.001, + "loss": 0.412, + "step": 1254 + }, + { + "epoch": 0.03462821233578329, + "grad_norm": 0.003804217092692852, + "learning_rate": 0.001, + "loss": 0.4009, + "step": 1255 + }, + { + "epoch": 0.03465580453684766, + "grad_norm": 0.0027781634125858545, + "learning_rate": 0.001, + "loss": 0.456, + "step": 1256 + }, + { + "epoch": 0.03468339673791203, + "grad_norm": 0.003424674505367875, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 1257 + }, + { + "epoch": 0.0347109889389764, + "grad_norm": 0.003052354324609041, + "learning_rate": 0.001, + "loss": 0.4331, + "step": 1258 + }, + { + "epoch": 0.03473858114004077, + "grad_norm": 0.004982203710824251, + "learning_rate": 0.001, + "loss": 0.3557, + "step": 1259 + }, + { + "epoch": 0.034766173341105135, + "grad_norm": 0.003158049425110221, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 1260 + }, + { + "epoch": 0.03479376554216951, + "grad_norm": 0.0035426870454102755, + "learning_rate": 0.001, + "loss": 0.3496, + "step": 1261 + }, + { + "epoch": 0.034821357743233874, + "grad_norm": 0.0026156532112509012, + "learning_rate": 0.001, + "loss": 0.4396, + "step": 1262 + }, + { + "epoch": 0.03484894994429824, + "grad_norm": 0.0027896466199308634, + "learning_rate": 0.001, + "loss": 0.4073, + "step": 1263 + }, + { + "epoch": 0.034876542145362614, + "grad_norm": 0.002534053521230817, + "learning_rate": 0.001, + "loss": 0.379, + "step": 1264 + }, + { + "epoch": 0.03490413434642698, + "grad_norm": 0.005910890176892281, + "learning_rate": 0.001, + "loss": 0.3777, + "step": 1265 + }, + { + "epoch": 0.034931726547491354, + "grad_norm": 0.004372291266918182, + "learning_rate": 0.001, + "loss": 0.4066, + "step": 1266 + }, + { + "epoch": 0.03495931874855572, + "grad_norm": 0.0036974658723920584, + "learning_rate": 0.001, + "loss": 0.4099, + "step": 1267 + }, + { + "epoch": 0.03498691094962009, + "grad_norm": 0.04189533367753029, + "learning_rate": 0.001, + "loss": 0.4203, + "step": 1268 + }, + { + "epoch": 0.03501450315068446, + "grad_norm": 0.0037905664648860693, + "learning_rate": 0.001, + "loss": 0.4287, + "step": 1269 + }, + { + "epoch": 0.03504209535174883, + "grad_norm": 0.004496126435697079, + "learning_rate": 0.001, + "loss": 0.3499, + "step": 1270 + }, + { + "epoch": 0.0350696875528132, + "grad_norm": 0.002956201322376728, + "learning_rate": 0.001, + "loss": 0.4148, + "step": 1271 + }, + { + "epoch": 0.03509727975387757, + "grad_norm": 0.004545163828879595, + "learning_rate": 0.001, + "loss": 0.399, + "step": 1272 + }, + { + "epoch": 0.03512487195494193, + "grad_norm": 0.005399242043495178, + "learning_rate": 0.001, + "loss": 0.409, + "step": 1273 + }, + { + "epoch": 0.035152464156006306, + "grad_norm": 0.003836257616057992, + "learning_rate": 0.001, + "loss": 0.3758, + "step": 1274 + }, + { + "epoch": 0.03518005635707067, + "grad_norm": 0.0035388548858463764, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 1275 + }, + { + "epoch": 0.035207648558135046, + "grad_norm": 0.009006824344396591, + "learning_rate": 0.001, + "loss": 0.4311, + "step": 1276 + }, + { + "epoch": 0.03523524075919941, + "grad_norm": 0.006458511110395193, + "learning_rate": 0.001, + "loss": 0.4337, + "step": 1277 + }, + { + "epoch": 0.03526283296026378, + "grad_norm": 0.0033469260670244694, + "learning_rate": 0.001, + "loss": 0.4188, + "step": 1278 + }, + { + "epoch": 0.03529042516132815, + "grad_norm": 0.003662231145426631, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 1279 + }, + { + "epoch": 0.03531801736239252, + "grad_norm": 0.004838781896978617, + "learning_rate": 0.001, + "loss": 0.4223, + "step": 1280 + }, + { + "epoch": 0.03534560956345689, + "grad_norm": 0.003931723535060883, + "learning_rate": 0.001, + "loss": 0.3892, + "step": 1281 + }, + { + "epoch": 0.03537320176452126, + "grad_norm": 0.0037028163205832243, + "learning_rate": 0.001, + "loss": 0.4293, + "step": 1282 + }, + { + "epoch": 0.035400793965585625, + "grad_norm": 0.00402813870459795, + "learning_rate": 0.001, + "loss": 0.4242, + "step": 1283 + }, + { + "epoch": 0.03542838616665, + "grad_norm": 0.0033616770524531603, + "learning_rate": 0.001, + "loss": 0.3881, + "step": 1284 + }, + { + "epoch": 0.035455978367714365, + "grad_norm": 0.002920625265687704, + "learning_rate": 0.001, + "loss": 0.4269, + "step": 1285 + }, + { + "epoch": 0.03548357056877873, + "grad_norm": 0.007799847517162561, + "learning_rate": 0.001, + "loss": 0.3389, + "step": 1286 + }, + { + "epoch": 0.035511162769843105, + "grad_norm": 0.0034114914014935493, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 1287 + }, + { + "epoch": 0.03553875497090747, + "grad_norm": 0.0037257294170558453, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 1288 + }, + { + "epoch": 0.035566347171971845, + "grad_norm": 0.0022869498934596777, + "learning_rate": 0.001, + "loss": 0.4179, + "step": 1289 + }, + { + "epoch": 0.03559393937303621, + "grad_norm": 0.006633399520069361, + "learning_rate": 0.001, + "loss": 0.3713, + "step": 1290 + }, + { + "epoch": 0.03562153157410058, + "grad_norm": 0.004205191507935524, + "learning_rate": 0.001, + "loss": 0.4066, + "step": 1291 + }, + { + "epoch": 0.03564912377516495, + "grad_norm": 0.0037389290519058704, + "learning_rate": 0.001, + "loss": 0.3952, + "step": 1292 + }, + { + "epoch": 0.03567671597622932, + "grad_norm": 0.00401865690946579, + "learning_rate": 0.001, + "loss": 0.411, + "step": 1293 + }, + { + "epoch": 0.03570430817729369, + "grad_norm": 0.0036637093871831894, + "learning_rate": 0.001, + "loss": 0.3678, + "step": 1294 + }, + { + "epoch": 0.03573190037835806, + "grad_norm": 0.002707039937376976, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 1295 + }, + { + "epoch": 0.035759492579422424, + "grad_norm": 0.004088058602064848, + "learning_rate": 0.001, + "loss": 0.388, + "step": 1296 + }, + { + "epoch": 0.0357870847804868, + "grad_norm": 0.0029987411107867956, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 1297 + }, + { + "epoch": 0.03581467698155116, + "grad_norm": 0.0037499302998185158, + "learning_rate": 0.001, + "loss": 0.4239, + "step": 1298 + }, + { + "epoch": 0.03584226918261554, + "grad_norm": 0.00426569813862443, + "learning_rate": 0.001, + "loss": 0.3822, + "step": 1299 + }, + { + "epoch": 0.0358698613836799, + "grad_norm": 0.0036391124594956636, + "learning_rate": 0.001, + "loss": 0.3956, + "step": 1300 + }, + { + "epoch": 0.03589745358474427, + "grad_norm": 0.003542504971846938, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 1301 + }, + { + "epoch": 0.03592504578580864, + "grad_norm": 0.003770799608901143, + "learning_rate": 0.001, + "loss": 0.3766, + "step": 1302 + }, + { + "epoch": 0.03595263798687301, + "grad_norm": 0.006019794847816229, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 1303 + }, + { + "epoch": 0.03598023018793738, + "grad_norm": 0.0027127231005579233, + "learning_rate": 0.001, + "loss": 0.4451, + "step": 1304 + }, + { + "epoch": 0.03600782238900175, + "grad_norm": 0.004511113744229078, + "learning_rate": 0.001, + "loss": 0.3737, + "step": 1305 + }, + { + "epoch": 0.036035414590066116, + "grad_norm": 0.0061105103231966496, + "learning_rate": 0.001, + "loss": 0.4041, + "step": 1306 + }, + { + "epoch": 0.03606300679113049, + "grad_norm": 0.003959209658205509, + "learning_rate": 0.001, + "loss": 0.3976, + "step": 1307 + }, + { + "epoch": 0.036090598992194856, + "grad_norm": 0.006086795590817928, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 1308 + }, + { + "epoch": 0.03611819119325922, + "grad_norm": 0.003370011458173394, + "learning_rate": 0.001, + "loss": 0.447, + "step": 1309 + }, + { + "epoch": 0.036145783394323595, + "grad_norm": 0.004544582683593035, + "learning_rate": 0.001, + "loss": 0.3875, + "step": 1310 + }, + { + "epoch": 0.03617337559538796, + "grad_norm": 0.004398183431476355, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 1311 + }, + { + "epoch": 0.036200967796452335, + "grad_norm": 0.0034012263640761375, + "learning_rate": 0.001, + "loss": 0.3787, + "step": 1312 + }, + { + "epoch": 0.0362285599975167, + "grad_norm": 0.004038006532937288, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 1313 + }, + { + "epoch": 0.03625615219858107, + "grad_norm": 0.004931545816361904, + "learning_rate": 0.001, + "loss": 0.3872, + "step": 1314 + }, + { + "epoch": 0.03628374439964544, + "grad_norm": 0.004279931075870991, + "learning_rate": 0.001, + "loss": 0.4528, + "step": 1315 + }, + { + "epoch": 0.03631133660070981, + "grad_norm": 0.0031895472202450037, + "learning_rate": 0.001, + "loss": 0.4364, + "step": 1316 + }, + { + "epoch": 0.03633892880177418, + "grad_norm": 0.005345645360648632, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 1317 + }, + { + "epoch": 0.03636652100283855, + "grad_norm": 0.004938568454235792, + "learning_rate": 0.001, + "loss": 0.3869, + "step": 1318 + }, + { + "epoch": 0.036394113203902914, + "grad_norm": 0.004648009780794382, + "learning_rate": 0.001, + "loss": 0.4285, + "step": 1319 + }, + { + "epoch": 0.03642170540496729, + "grad_norm": 0.006550830788910389, + "learning_rate": 0.001, + "loss": 0.4224, + "step": 1320 + }, + { + "epoch": 0.036449297606031654, + "grad_norm": 0.002846076153218746, + "learning_rate": 0.001, + "loss": 0.4178, + "step": 1321 + }, + { + "epoch": 0.03647688980709603, + "grad_norm": 0.004674985073506832, + "learning_rate": 0.001, + "loss": 0.4307, + "step": 1322 + }, + { + "epoch": 0.036504482008160394, + "grad_norm": 0.003952328581362963, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 1323 + }, + { + "epoch": 0.03653207420922476, + "grad_norm": 0.0029479297809302807, + "learning_rate": 0.001, + "loss": 0.405, + "step": 1324 + }, + { + "epoch": 0.036559666410289134, + "grad_norm": 0.0032731324899941683, + "learning_rate": 0.001, + "loss": 0.3883, + "step": 1325 + }, + { + "epoch": 0.0365872586113535, + "grad_norm": 0.003791957162320614, + "learning_rate": 0.001, + "loss": 0.4212, + "step": 1326 + }, + { + "epoch": 0.03661485081241787, + "grad_norm": 0.009052555076777935, + "learning_rate": 0.001, + "loss": 0.4102, + "step": 1327 + }, + { + "epoch": 0.03664244301348224, + "grad_norm": 0.005174124613404274, + "learning_rate": 0.001, + "loss": 0.4333, + "step": 1328 + }, + { + "epoch": 0.036670035214546606, + "grad_norm": 0.008013852871954441, + "learning_rate": 0.001, + "loss": 0.4314, + "step": 1329 + }, + { + "epoch": 0.03669762741561098, + "grad_norm": 0.006423450540751219, + "learning_rate": 0.001, + "loss": 0.402, + "step": 1330 + }, + { + "epoch": 0.036725219616675346, + "grad_norm": 0.006040682550519705, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 1331 + }, + { + "epoch": 0.03675281181773971, + "grad_norm": 0.0055701639503240585, + "learning_rate": 0.001, + "loss": 0.3934, + "step": 1332 + }, + { + "epoch": 0.036780404018804086, + "grad_norm": 0.01270906999707222, + "learning_rate": 0.001, + "loss": 0.3981, + "step": 1333 + }, + { + "epoch": 0.03680799621986845, + "grad_norm": 0.003626336809247732, + "learning_rate": 0.001, + "loss": 0.3984, + "step": 1334 + }, + { + "epoch": 0.036835588420932826, + "grad_norm": 0.004632920026779175, + "learning_rate": 0.001, + "loss": 0.3836, + "step": 1335 + }, + { + "epoch": 0.03686318062199719, + "grad_norm": 0.0060633327811956406, + "learning_rate": 0.001, + "loss": 0.424, + "step": 1336 + }, + { + "epoch": 0.03689077282306156, + "grad_norm": 0.004871148616075516, + "learning_rate": 0.001, + "loss": 0.4247, + "step": 1337 + }, + { + "epoch": 0.03691836502412593, + "grad_norm": 0.0031172886956483126, + "learning_rate": 0.001, + "loss": 0.4316, + "step": 1338 + }, + { + "epoch": 0.0369459572251903, + "grad_norm": 0.003916094545274973, + "learning_rate": 0.001, + "loss": 0.3908, + "step": 1339 + }, + { + "epoch": 0.03697354942625467, + "grad_norm": 0.0040051937103271484, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 1340 + }, + { + "epoch": 0.03700114162731904, + "grad_norm": 0.006250888109207153, + "learning_rate": 0.001, + "loss": 0.402, + "step": 1341 + }, + { + "epoch": 0.037028733828383405, + "grad_norm": 0.002888569375500083, + "learning_rate": 0.001, + "loss": 0.405, + "step": 1342 + }, + { + "epoch": 0.03705632602944778, + "grad_norm": 0.004510107915848494, + "learning_rate": 0.001, + "loss": 0.368, + "step": 1343 + }, + { + "epoch": 0.037083918230512145, + "grad_norm": 0.007241697516292334, + "learning_rate": 0.001, + "loss": 0.3993, + "step": 1344 + }, + { + "epoch": 0.03711151043157652, + "grad_norm": 0.0032798638567328453, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 1345 + }, + { + "epoch": 0.037139102632640884, + "grad_norm": 0.003440143307670951, + "learning_rate": 0.001, + "loss": 0.4474, + "step": 1346 + }, + { + "epoch": 0.03716669483370525, + "grad_norm": 0.0044672004878520966, + "learning_rate": 0.001, + "loss": 0.4135, + "step": 1347 + }, + { + "epoch": 0.037194287034769624, + "grad_norm": 0.0031135319732129574, + "learning_rate": 0.001, + "loss": 0.389, + "step": 1348 + }, + { + "epoch": 0.03722187923583399, + "grad_norm": 0.004170152824372053, + "learning_rate": 0.001, + "loss": 0.4193, + "step": 1349 + }, + { + "epoch": 0.037249471436898364, + "grad_norm": 0.0036481074057519436, + "learning_rate": 0.001, + "loss": 0.4454, + "step": 1350 + }, + { + "epoch": 0.03727706363796273, + "grad_norm": 0.003243829123675823, + "learning_rate": 0.001, + "loss": 0.4246, + "step": 1351 + }, + { + "epoch": 0.0373046558390271, + "grad_norm": 0.0034886065404862165, + "learning_rate": 0.001, + "loss": 0.4065, + "step": 1352 + }, + { + "epoch": 0.03733224804009147, + "grad_norm": 0.004647396504878998, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 1353 + }, + { + "epoch": 0.03735984024115584, + "grad_norm": 0.004046002868562937, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 1354 + }, + { + "epoch": 0.0373874324422202, + "grad_norm": 0.004573929589241743, + "learning_rate": 0.001, + "loss": 0.3851, + "step": 1355 + }, + { + "epoch": 0.037415024643284576, + "grad_norm": 0.006424955558031797, + "learning_rate": 0.001, + "loss": 0.3623, + "step": 1356 + }, + { + "epoch": 0.03744261684434894, + "grad_norm": 0.0033393288031220436, + "learning_rate": 0.001, + "loss": 0.4311, + "step": 1357 + }, + { + "epoch": 0.037470209045413316, + "grad_norm": 0.0031134155578911304, + "learning_rate": 0.001, + "loss": 0.3876, + "step": 1358 + }, + { + "epoch": 0.03749780124647768, + "grad_norm": 0.00366019899956882, + "learning_rate": 0.001, + "loss": 0.4282, + "step": 1359 + }, + { + "epoch": 0.03752539344754205, + "grad_norm": 0.003400568151846528, + "learning_rate": 0.001, + "loss": 0.4119, + "step": 1360 + }, + { + "epoch": 0.03755298564860642, + "grad_norm": 0.002846767893061042, + "learning_rate": 0.001, + "loss": 0.4286, + "step": 1361 + }, + { + "epoch": 0.03758057784967079, + "grad_norm": 0.0031303889118134975, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 1362 + }, + { + "epoch": 0.03760817005073516, + "grad_norm": 0.0043816519901156425, + "learning_rate": 0.001, + "loss": 0.4158, + "step": 1363 + }, + { + "epoch": 0.03763576225179953, + "grad_norm": 0.004520198330283165, + "learning_rate": 0.001, + "loss": 0.349, + "step": 1364 + }, + { + "epoch": 0.037663354452863895, + "grad_norm": 0.005284131038933992, + "learning_rate": 0.001, + "loss": 0.3745, + "step": 1365 + }, + { + "epoch": 0.03769094665392827, + "grad_norm": 0.0037800793070346117, + "learning_rate": 0.001, + "loss": 0.3821, + "step": 1366 + }, + { + "epoch": 0.037718538854992635, + "grad_norm": 0.004985132720321417, + "learning_rate": 0.001, + "loss": 0.4065, + "step": 1367 + }, + { + "epoch": 0.03774613105605701, + "grad_norm": 0.0036822801921516657, + "learning_rate": 0.001, + "loss": 0.3809, + "step": 1368 + }, + { + "epoch": 0.037773723257121375, + "grad_norm": 0.0032694186083972454, + "learning_rate": 0.001, + "loss": 0.4116, + "step": 1369 + }, + { + "epoch": 0.03780131545818574, + "grad_norm": 0.003384978510439396, + "learning_rate": 0.001, + "loss": 0.3715, + "step": 1370 + }, + { + "epoch": 0.037828907659250115, + "grad_norm": 0.0035624897573143244, + "learning_rate": 0.001, + "loss": 0.3843, + "step": 1371 + }, + { + "epoch": 0.03785649986031448, + "grad_norm": 0.004096219781786203, + "learning_rate": 0.001, + "loss": 0.3674, + "step": 1372 + }, + { + "epoch": 0.037884092061378855, + "grad_norm": 0.004491012543439865, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 1373 + }, + { + "epoch": 0.03791168426244322, + "grad_norm": 0.0034480481408536434, + "learning_rate": 0.001, + "loss": 0.422, + "step": 1374 + }, + { + "epoch": 0.03793927646350759, + "grad_norm": 0.006217781454324722, + "learning_rate": 0.001, + "loss": 0.3801, + "step": 1375 + }, + { + "epoch": 0.03796686866457196, + "grad_norm": 0.004664869979023933, + "learning_rate": 0.001, + "loss": 0.423, + "step": 1376 + }, + { + "epoch": 0.03799446086563633, + "grad_norm": 0.008887716569006443, + "learning_rate": 0.001, + "loss": 0.425, + "step": 1377 + }, + { + "epoch": 0.038022053066700694, + "grad_norm": 0.003177997190505266, + "learning_rate": 0.001, + "loss": 0.4169, + "step": 1378 + }, + { + "epoch": 0.03804964526776507, + "grad_norm": 0.0035175783559679985, + "learning_rate": 0.001, + "loss": 0.4148, + "step": 1379 + }, + { + "epoch": 0.038077237468829433, + "grad_norm": 0.0047409930266439915, + "learning_rate": 0.001, + "loss": 0.4128, + "step": 1380 + }, + { + "epoch": 0.03810482966989381, + "grad_norm": 0.007139190100133419, + "learning_rate": 0.001, + "loss": 0.3857, + "step": 1381 + }, + { + "epoch": 0.03813242187095817, + "grad_norm": 0.0083334194496274, + "learning_rate": 0.001, + "loss": 0.3634, + "step": 1382 + }, + { + "epoch": 0.03816001407202254, + "grad_norm": 0.0037119935732334852, + "learning_rate": 0.001, + "loss": 0.4357, + "step": 1383 + }, + { + "epoch": 0.03818760627308691, + "grad_norm": 0.00669982610270381, + "learning_rate": 0.001, + "loss": 0.386, + "step": 1384 + }, + { + "epoch": 0.03821519847415128, + "grad_norm": 0.00357433152385056, + "learning_rate": 0.001, + "loss": 0.4024, + "step": 1385 + }, + { + "epoch": 0.03824279067521565, + "grad_norm": 0.004873959813266993, + "learning_rate": 0.001, + "loss": 0.4139, + "step": 1386 + }, + { + "epoch": 0.03827038287628002, + "grad_norm": 0.011126353405416012, + "learning_rate": 0.001, + "loss": 0.3646, + "step": 1387 + }, + { + "epoch": 0.038297975077344386, + "grad_norm": 0.0038117689546197653, + "learning_rate": 0.001, + "loss": 0.4382, + "step": 1388 + }, + { + "epoch": 0.03832556727840876, + "grad_norm": 0.011077326722443104, + "learning_rate": 0.001, + "loss": 0.3894, + "step": 1389 + }, + { + "epoch": 0.038353159479473126, + "grad_norm": 0.005909190978854895, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 1390 + }, + { + "epoch": 0.0383807516805375, + "grad_norm": 0.0056834593415260315, + "learning_rate": 0.001, + "loss": 0.4396, + "step": 1391 + }, + { + "epoch": 0.038408343881601865, + "grad_norm": 0.0038277830462902784, + "learning_rate": 0.001, + "loss": 0.3663, + "step": 1392 + }, + { + "epoch": 0.03843593608266623, + "grad_norm": 0.005587390623986721, + "learning_rate": 0.001, + "loss": 0.4298, + "step": 1393 + }, + { + "epoch": 0.038463528283730605, + "grad_norm": 0.004197390284389257, + "learning_rate": 0.001, + "loss": 0.4228, + "step": 1394 + }, + { + "epoch": 0.03849112048479497, + "grad_norm": 0.0047178626991808414, + "learning_rate": 0.001, + "loss": 0.4275, + "step": 1395 + }, + { + "epoch": 0.038518712685859345, + "grad_norm": 0.0031104707159101963, + "learning_rate": 0.001, + "loss": 0.4309, + "step": 1396 + }, + { + "epoch": 0.03854630488692371, + "grad_norm": 0.005666974000632763, + "learning_rate": 0.001, + "loss": 0.378, + "step": 1397 + }, + { + "epoch": 0.03857389708798808, + "grad_norm": 0.006076369900256395, + "learning_rate": 0.001, + "loss": 0.388, + "step": 1398 + }, + { + "epoch": 0.03860148928905245, + "grad_norm": 0.004276493098586798, + "learning_rate": 0.001, + "loss": 0.3863, + "step": 1399 + }, + { + "epoch": 0.03862908149011682, + "grad_norm": 0.00408798037096858, + "learning_rate": 0.001, + "loss": 0.3718, + "step": 1400 + }, + { + "epoch": 0.038656673691181184, + "grad_norm": 0.003366732969880104, + "learning_rate": 0.001, + "loss": 0.3514, + "step": 1401 + }, + { + "epoch": 0.03868426589224556, + "grad_norm": 0.003257429925724864, + "learning_rate": 0.001, + "loss": 0.414, + "step": 1402 + }, + { + "epoch": 0.038711858093309924, + "grad_norm": 0.0036397224757820368, + "learning_rate": 0.001, + "loss": 0.4286, + "step": 1403 + }, + { + "epoch": 0.0387394502943743, + "grad_norm": 0.00421003857627511, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 1404 + }, + { + "epoch": 0.038767042495438664, + "grad_norm": 0.004263239912688732, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 1405 + }, + { + "epoch": 0.03879463469650303, + "grad_norm": 0.0025550604332238436, + "learning_rate": 0.001, + "loss": 0.403, + "step": 1406 + }, + { + "epoch": 0.038822226897567404, + "grad_norm": 0.003278963966295123, + "learning_rate": 0.001, + "loss": 0.401, + "step": 1407 + }, + { + "epoch": 0.03884981909863177, + "grad_norm": 0.002250393619760871, + "learning_rate": 0.001, + "loss": 0.4521, + "step": 1408 + }, + { + "epoch": 0.038877411299696144, + "grad_norm": 0.002963767386972904, + "learning_rate": 0.001, + "loss": 0.4566, + "step": 1409 + }, + { + "epoch": 0.03890500350076051, + "grad_norm": 0.006573919206857681, + "learning_rate": 0.001, + "loss": 0.3568, + "step": 1410 + }, + { + "epoch": 0.038932595701824876, + "grad_norm": 0.005289596039801836, + "learning_rate": 0.001, + "loss": 0.3729, + "step": 1411 + }, + { + "epoch": 0.03896018790288925, + "grad_norm": 0.0031945211812853813, + "learning_rate": 0.001, + "loss": 0.4238, + "step": 1412 + }, + { + "epoch": 0.038987780103953616, + "grad_norm": 0.002856782404705882, + "learning_rate": 0.001, + "loss": 0.4219, + "step": 1413 + }, + { + "epoch": 0.03901537230501799, + "grad_norm": 0.0037729961331933737, + "learning_rate": 0.001, + "loss": 0.4379, + "step": 1414 + }, + { + "epoch": 0.039042964506082356, + "grad_norm": 0.00266630039550364, + "learning_rate": 0.001, + "loss": 0.4188, + "step": 1415 + }, + { + "epoch": 0.03907055670714672, + "grad_norm": 0.0042518191039562225, + "learning_rate": 0.001, + "loss": 0.3827, + "step": 1416 + }, + { + "epoch": 0.039098148908211096, + "grad_norm": 0.0030879939440637827, + "learning_rate": 0.001, + "loss": 0.3825, + "step": 1417 + }, + { + "epoch": 0.03912574110927546, + "grad_norm": 0.0036084537860006094, + "learning_rate": 0.001, + "loss": 0.4009, + "step": 1418 + }, + { + "epoch": 0.039153333310339836, + "grad_norm": 0.003634381340816617, + "learning_rate": 0.001, + "loss": 0.4245, + "step": 1419 + }, + { + "epoch": 0.0391809255114042, + "grad_norm": 0.0030327397398650646, + "learning_rate": 0.001, + "loss": 0.4399, + "step": 1420 + }, + { + "epoch": 0.03920851771246857, + "grad_norm": 0.008019665256142616, + "learning_rate": 0.001, + "loss": 0.4233, + "step": 1421 + }, + { + "epoch": 0.03923610991353294, + "grad_norm": 0.004183803219348192, + "learning_rate": 0.001, + "loss": 0.4155, + "step": 1422 + }, + { + "epoch": 0.03926370211459731, + "grad_norm": 0.005366318393498659, + "learning_rate": 0.001, + "loss": 0.4228, + "step": 1423 + }, + { + "epoch": 0.039291294315661675, + "grad_norm": 0.003394525730982423, + "learning_rate": 0.001, + "loss": 0.4053, + "step": 1424 + }, + { + "epoch": 0.03931888651672605, + "grad_norm": 0.003373740240931511, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 1425 + }, + { + "epoch": 0.039346478717790415, + "grad_norm": 0.002769649960100651, + "learning_rate": 0.001, + "loss": 0.4116, + "step": 1426 + }, + { + "epoch": 0.03937407091885479, + "grad_norm": 0.005319521296769381, + "learning_rate": 0.001, + "loss": 0.3995, + "step": 1427 + }, + { + "epoch": 0.039401663119919154, + "grad_norm": 0.0031118986662477255, + "learning_rate": 0.001, + "loss": 0.4008, + "step": 1428 + }, + { + "epoch": 0.03942925532098352, + "grad_norm": 0.0032665557228028774, + "learning_rate": 0.001, + "loss": 0.4214, + "step": 1429 + }, + { + "epoch": 0.039456847522047894, + "grad_norm": 0.0047190007753670216, + "learning_rate": 0.001, + "loss": 0.3242, + "step": 1430 + }, + { + "epoch": 0.03948443972311226, + "grad_norm": 0.0038909264840185642, + "learning_rate": 0.001, + "loss": 0.4022, + "step": 1431 + }, + { + "epoch": 0.039512031924176634, + "grad_norm": 0.004970925394445658, + "learning_rate": 0.001, + "loss": 0.4257, + "step": 1432 + }, + { + "epoch": 0.039539624125241, + "grad_norm": 0.004649787209928036, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 1433 + }, + { + "epoch": 0.03956721632630537, + "grad_norm": 0.0030645502265542746, + "learning_rate": 0.001, + "loss": 0.4151, + "step": 1434 + }, + { + "epoch": 0.03959480852736974, + "grad_norm": 0.005270305555313826, + "learning_rate": 0.001, + "loss": 0.4114, + "step": 1435 + }, + { + "epoch": 0.03962240072843411, + "grad_norm": 0.004368067253381014, + "learning_rate": 0.001, + "loss": 0.3898, + "step": 1436 + }, + { + "epoch": 0.03964999292949848, + "grad_norm": 0.0032691999804228544, + "learning_rate": 0.001, + "loss": 0.4158, + "step": 1437 + }, + { + "epoch": 0.03967758513056285, + "grad_norm": 0.003509074915200472, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 1438 + }, + { + "epoch": 0.03970517733162721, + "grad_norm": 0.00402647303417325, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 1439 + }, + { + "epoch": 0.039732769532691586, + "grad_norm": 0.003934496082365513, + "learning_rate": 0.001, + "loss": 0.4351, + "step": 1440 + }, + { + "epoch": 0.03976036173375595, + "grad_norm": 0.0035782591439783573, + "learning_rate": 0.001, + "loss": 0.4143, + "step": 1441 + }, + { + "epoch": 0.039787953934820326, + "grad_norm": 0.0036837344523519278, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 1442 + }, + { + "epoch": 0.03981554613588469, + "grad_norm": 0.003719213418662548, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 1443 + }, + { + "epoch": 0.03984313833694906, + "grad_norm": 0.008284253068268299, + "learning_rate": 0.001, + "loss": 0.3995, + "step": 1444 + }, + { + "epoch": 0.03987073053801343, + "grad_norm": 0.0037160192150622606, + "learning_rate": 0.001, + "loss": 0.4298, + "step": 1445 + }, + { + "epoch": 0.0398983227390778, + "grad_norm": 0.003967109136283398, + "learning_rate": 0.001, + "loss": 0.4195, + "step": 1446 + }, + { + "epoch": 0.039925914940142165, + "grad_norm": 0.003119664965197444, + "learning_rate": 0.001, + "loss": 0.3624, + "step": 1447 + }, + { + "epoch": 0.03995350714120654, + "grad_norm": 0.0027750935405492783, + "learning_rate": 0.001, + "loss": 0.403, + "step": 1448 + }, + { + "epoch": 0.039981099342270905, + "grad_norm": 0.008331545628607273, + "learning_rate": 0.001, + "loss": 0.4148, + "step": 1449 + }, + { + "epoch": 0.04000869154333528, + "grad_norm": 0.004883471876382828, + "learning_rate": 0.001, + "loss": 0.3759, + "step": 1450 + }, + { + "epoch": 0.040036283744399645, + "grad_norm": 0.0037747775204479694, + "learning_rate": 0.001, + "loss": 0.4103, + "step": 1451 + }, + { + "epoch": 0.04006387594546401, + "grad_norm": 0.0035403715446591377, + "learning_rate": 0.001, + "loss": 0.3694, + "step": 1452 + }, + { + "epoch": 0.040091468146528385, + "grad_norm": 0.012222831137478352, + "learning_rate": 0.001, + "loss": 0.3925, + "step": 1453 + }, + { + "epoch": 0.04011906034759275, + "grad_norm": 0.029686246067285538, + "learning_rate": 0.001, + "loss": 0.3688, + "step": 1454 + }, + { + "epoch": 0.040146652548657125, + "grad_norm": 0.007045884151011705, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 1455 + }, + { + "epoch": 0.04017424474972149, + "grad_norm": 0.0033973727840930223, + "learning_rate": 0.001, + "loss": 0.3764, + "step": 1456 + }, + { + "epoch": 0.04020183695078586, + "grad_norm": 0.004133992828428745, + "learning_rate": 0.001, + "loss": 0.3862, + "step": 1457 + }, + { + "epoch": 0.04022942915185023, + "grad_norm": 0.003264515893533826, + "learning_rate": 0.001, + "loss": 0.384, + "step": 1458 + }, + { + "epoch": 0.0402570213529146, + "grad_norm": 0.00320844491943717, + "learning_rate": 0.001, + "loss": 0.4153, + "step": 1459 + }, + { + "epoch": 0.04028461355397897, + "grad_norm": 0.0038754413835704327, + "learning_rate": 0.001, + "loss": 0.3661, + "step": 1460 + }, + { + "epoch": 0.04031220575504334, + "grad_norm": 0.009661386720836163, + "learning_rate": 0.001, + "loss": 0.4309, + "step": 1461 + }, + { + "epoch": 0.040339797956107704, + "grad_norm": 0.010238132439553738, + "learning_rate": 0.001, + "loss": 0.3922, + "step": 1462 + }, + { + "epoch": 0.04036739015717208, + "grad_norm": 0.04491569846868515, + "learning_rate": 0.001, + "loss": 0.4166, + "step": 1463 + }, + { + "epoch": 0.04039498235823644, + "grad_norm": 0.0031146227847784758, + "learning_rate": 0.001, + "loss": 0.4181, + "step": 1464 + }, + { + "epoch": 0.04042257455930082, + "grad_norm": 0.0035386565141379833, + "learning_rate": 0.001, + "loss": 0.3915, + "step": 1465 + }, + { + "epoch": 0.04045016676036518, + "grad_norm": 0.0033056430984288454, + "learning_rate": 0.001, + "loss": 0.3864, + "step": 1466 + }, + { + "epoch": 0.04047775896142955, + "grad_norm": 0.0025265736039727926, + "learning_rate": 0.001, + "loss": 0.3693, + "step": 1467 + }, + { + "epoch": 0.04050535116249392, + "grad_norm": 0.004877384752035141, + "learning_rate": 0.001, + "loss": 0.3686, + "step": 1468 + }, + { + "epoch": 0.04053294336355829, + "grad_norm": 0.006324070505797863, + "learning_rate": 0.001, + "loss": 0.3858, + "step": 1469 + }, + { + "epoch": 0.04056053556462266, + "grad_norm": 0.004497391637414694, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 1470 + }, + { + "epoch": 0.04058812776568703, + "grad_norm": 0.003843271406367421, + "learning_rate": 0.001, + "loss": 0.4008, + "step": 1471 + }, + { + "epoch": 0.040615719966751396, + "grad_norm": 0.003053538501262665, + "learning_rate": 0.001, + "loss": 0.4392, + "step": 1472 + }, + { + "epoch": 0.04064331216781577, + "grad_norm": 0.0038446690887212753, + "learning_rate": 0.001, + "loss": 0.3751, + "step": 1473 + }, + { + "epoch": 0.040670904368880136, + "grad_norm": 0.003199394093826413, + "learning_rate": 0.001, + "loss": 0.3877, + "step": 1474 + }, + { + "epoch": 0.0406984965699445, + "grad_norm": 0.003342741634696722, + "learning_rate": 0.001, + "loss": 0.4147, + "step": 1475 + }, + { + "epoch": 0.040726088771008875, + "grad_norm": 0.004331924952566624, + "learning_rate": 0.001, + "loss": 0.3963, + "step": 1476 + }, + { + "epoch": 0.04075368097207324, + "grad_norm": 0.00260713673196733, + "learning_rate": 0.001, + "loss": 0.4192, + "step": 1477 + }, + { + "epoch": 0.040781273173137615, + "grad_norm": 0.003122882917523384, + "learning_rate": 0.001, + "loss": 0.3598, + "step": 1478 + }, + { + "epoch": 0.04080886537420198, + "grad_norm": 0.0028676025103777647, + "learning_rate": 0.001, + "loss": 0.4095, + "step": 1479 + }, + { + "epoch": 0.04083645757526635, + "grad_norm": 0.0038910373114049435, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 1480 + }, + { + "epoch": 0.04086404977633072, + "grad_norm": 0.003291686065495014, + "learning_rate": 0.001, + "loss": 0.4073, + "step": 1481 + }, + { + "epoch": 0.04089164197739509, + "grad_norm": 0.006192247848957777, + "learning_rate": 0.001, + "loss": 0.4321, + "step": 1482 + }, + { + "epoch": 0.04091923417845946, + "grad_norm": 0.002540907124057412, + "learning_rate": 0.001, + "loss": 0.4048, + "step": 1483 + }, + { + "epoch": 0.04094682637952383, + "grad_norm": 0.003824718063697219, + "learning_rate": 0.001, + "loss": 0.3796, + "step": 1484 + }, + { + "epoch": 0.040974418580588194, + "grad_norm": 0.0036246173549443483, + "learning_rate": 0.001, + "loss": 0.3983, + "step": 1485 + }, + { + "epoch": 0.04100201078165257, + "grad_norm": 0.030295656993985176, + "learning_rate": 0.001, + "loss": 0.3849, + "step": 1486 + }, + { + "epoch": 0.041029602982716934, + "grad_norm": 0.0050461613573133945, + "learning_rate": 0.001, + "loss": 0.412, + "step": 1487 + }, + { + "epoch": 0.04105719518378131, + "grad_norm": 0.0023631087969988585, + "learning_rate": 0.001, + "loss": 0.4146, + "step": 1488 + }, + { + "epoch": 0.041084787384845674, + "grad_norm": 0.002529110526666045, + "learning_rate": 0.001, + "loss": 0.4547, + "step": 1489 + }, + { + "epoch": 0.04111237958591004, + "grad_norm": 0.00316584762185812, + "learning_rate": 0.001, + "loss": 0.3747, + "step": 1490 + }, + { + "epoch": 0.041139971786974414, + "grad_norm": 0.002411734312772751, + "learning_rate": 0.001, + "loss": 0.4104, + "step": 1491 + }, + { + "epoch": 0.04116756398803878, + "grad_norm": 0.0029997879173606634, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 1492 + }, + { + "epoch": 0.04119515618910315, + "grad_norm": 0.003948535770177841, + "learning_rate": 0.001, + "loss": 0.3833, + "step": 1493 + }, + { + "epoch": 0.04122274839016752, + "grad_norm": 0.002781339455395937, + "learning_rate": 0.001, + "loss": 0.4324, + "step": 1494 + }, + { + "epoch": 0.041250340591231886, + "grad_norm": 0.015317432582378387, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 1495 + }, + { + "epoch": 0.04127793279229626, + "grad_norm": 0.0075756567530334, + "learning_rate": 0.001, + "loss": 0.4164, + "step": 1496 + }, + { + "epoch": 0.041305524993360626, + "grad_norm": 0.002881971187889576, + "learning_rate": 0.001, + "loss": 0.4122, + "step": 1497 + }, + { + "epoch": 0.04133311719442499, + "grad_norm": 0.004069055896252394, + "learning_rate": 0.001, + "loss": 0.4491, + "step": 1498 + }, + { + "epoch": 0.041360709395489366, + "grad_norm": 0.002320400904864073, + "learning_rate": 0.001, + "loss": 0.4208, + "step": 1499 + }, + { + "epoch": 0.04138830159655373, + "grad_norm": 0.003089721780270338, + "learning_rate": 0.001, + "loss": 0.4066, + "step": 1500 + }, + { + "epoch": 0.04138830159655373, + "eval_runtime": 23.6686, + "eval_samples_per_second": 1.352, + "eval_steps_per_second": 0.169, + "step": 1500 + }, + { + "epoch": 0.041415893797618106, + "grad_norm": 0.004915047902613878, + "learning_rate": 0.001, + "loss": 0.4313, + "step": 1501 + }, + { + "epoch": 0.04144348599868247, + "grad_norm": 0.00290488894097507, + "learning_rate": 0.001, + "loss": 0.3915, + "step": 1502 + }, + { + "epoch": 0.04147107819974684, + "grad_norm": 0.0034425961785018444, + "learning_rate": 0.001, + "loss": 0.3732, + "step": 1503 + }, + { + "epoch": 0.04149867040081121, + "grad_norm": 0.003592686727643013, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 1504 + }, + { + "epoch": 0.04152626260187558, + "grad_norm": 0.005649790167808533, + "learning_rate": 0.001, + "loss": 0.3954, + "step": 1505 + }, + { + "epoch": 0.04155385480293995, + "grad_norm": 0.002451283158734441, + "learning_rate": 0.001, + "loss": 0.4477, + "step": 1506 + }, + { + "epoch": 0.04158144700400432, + "grad_norm": 0.0028861695900559425, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 1507 + }, + { + "epoch": 0.041609039205068685, + "grad_norm": 0.0033806059509515762, + "learning_rate": 0.001, + "loss": 0.398, + "step": 1508 + }, + { + "epoch": 0.04163663140613306, + "grad_norm": 0.003824063576757908, + "learning_rate": 0.001, + "loss": 0.3906, + "step": 1509 + }, + { + "epoch": 0.041664223607197425, + "grad_norm": 0.00630558468401432, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 1510 + }, + { + "epoch": 0.0416918158082618, + "grad_norm": 0.0037113004364073277, + "learning_rate": 0.001, + "loss": 0.3717, + "step": 1511 + }, + { + "epoch": 0.041719408009326164, + "grad_norm": 0.0054063801653683186, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 1512 + }, + { + "epoch": 0.04174700021039053, + "grad_norm": 0.003154453821480274, + "learning_rate": 0.001, + "loss": 0.3825, + "step": 1513 + }, + { + "epoch": 0.041774592411454904, + "grad_norm": 0.0029439502395689487, + "learning_rate": 0.001, + "loss": 0.3875, + "step": 1514 + }, + { + "epoch": 0.04180218461251927, + "grad_norm": 0.003378200577571988, + "learning_rate": 0.001, + "loss": 0.4199, + "step": 1515 + }, + { + "epoch": 0.041829776813583644, + "grad_norm": 0.003158586798235774, + "learning_rate": 0.001, + "loss": 0.4098, + "step": 1516 + }, + { + "epoch": 0.04185736901464801, + "grad_norm": 0.004686887841671705, + "learning_rate": 0.001, + "loss": 0.39, + "step": 1517 + }, + { + "epoch": 0.04188496121571238, + "grad_norm": 0.004565032664686441, + "learning_rate": 0.001, + "loss": 0.3755, + "step": 1518 + }, + { + "epoch": 0.04191255341677675, + "grad_norm": 0.005517443176358938, + "learning_rate": 0.001, + "loss": 0.4254, + "step": 1519 + }, + { + "epoch": 0.04194014561784112, + "grad_norm": 0.002761922078207135, + "learning_rate": 0.001, + "loss": 0.3857, + "step": 1520 + }, + { + "epoch": 0.04196773781890548, + "grad_norm": 0.0039441585540771484, + "learning_rate": 0.001, + "loss": 0.386, + "step": 1521 + }, + { + "epoch": 0.041995330019969856, + "grad_norm": 0.00710391066968441, + "learning_rate": 0.001, + "loss": 0.4264, + "step": 1522 + }, + { + "epoch": 0.04202292222103422, + "grad_norm": 0.025746062397956848, + "learning_rate": 0.001, + "loss": 0.4271, + "step": 1523 + }, + { + "epoch": 0.042050514422098596, + "grad_norm": 0.004072318784892559, + "learning_rate": 0.001, + "loss": 0.3753, + "step": 1524 + }, + { + "epoch": 0.04207810662316296, + "grad_norm": 0.0024748845025897026, + "learning_rate": 0.001, + "loss": 0.4254, + "step": 1525 + }, + { + "epoch": 0.04210569882422733, + "grad_norm": 0.007111032959073782, + "learning_rate": 0.001, + "loss": 0.3759, + "step": 1526 + }, + { + "epoch": 0.0421332910252917, + "grad_norm": 0.005953185725957155, + "learning_rate": 0.001, + "loss": 0.3995, + "step": 1527 + }, + { + "epoch": 0.04216088322635607, + "grad_norm": 0.004936009179800749, + "learning_rate": 0.001, + "loss": 0.3839, + "step": 1528 + }, + { + "epoch": 0.04218847542742044, + "grad_norm": 0.004421617835760117, + "learning_rate": 0.001, + "loss": 0.3926, + "step": 1529 + }, + { + "epoch": 0.04221606762848481, + "grad_norm": 0.007696077227592468, + "learning_rate": 0.001, + "loss": 0.4035, + "step": 1530 + }, + { + "epoch": 0.042243659829549175, + "grad_norm": 0.0060005756095051765, + "learning_rate": 0.001, + "loss": 0.4186, + "step": 1531 + }, + { + "epoch": 0.04227125203061355, + "grad_norm": 0.006462580990046263, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 1532 + }, + { + "epoch": 0.042298844231677915, + "grad_norm": 0.023720385506749153, + "learning_rate": 0.001, + "loss": 0.3589, + "step": 1533 + }, + { + "epoch": 0.04232643643274229, + "grad_norm": 0.003752040909603238, + "learning_rate": 0.001, + "loss": 0.4127, + "step": 1534 + }, + { + "epoch": 0.042354028633806655, + "grad_norm": 0.005650446284562349, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 1535 + }, + { + "epoch": 0.04238162083487102, + "grad_norm": 0.003995851147919893, + "learning_rate": 0.001, + "loss": 0.4094, + "step": 1536 + }, + { + "epoch": 0.042409213035935395, + "grad_norm": 0.003202822059392929, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 1537 + }, + { + "epoch": 0.04243680523699976, + "grad_norm": 0.004148907493799925, + "learning_rate": 0.001, + "loss": 0.4121, + "step": 1538 + }, + { + "epoch": 0.042464397438064135, + "grad_norm": 0.004003866575658321, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 1539 + }, + { + "epoch": 0.0424919896391285, + "grad_norm": 0.003789936425164342, + "learning_rate": 0.001, + "loss": 0.4593, + "step": 1540 + }, + { + "epoch": 0.04251958184019287, + "grad_norm": 0.004240360110998154, + "learning_rate": 0.001, + "loss": 0.353, + "step": 1541 + }, + { + "epoch": 0.04254717404125724, + "grad_norm": 0.002904722234234214, + "learning_rate": 0.001, + "loss": 0.4451, + "step": 1542 + }, + { + "epoch": 0.04257476624232161, + "grad_norm": 0.004250886384397745, + "learning_rate": 0.001, + "loss": 0.4395, + "step": 1543 + }, + { + "epoch": 0.042602358443385974, + "grad_norm": 0.0044527859427034855, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 1544 + }, + { + "epoch": 0.04262995064445035, + "grad_norm": 0.006279831752181053, + "learning_rate": 0.001, + "loss": 0.354, + "step": 1545 + }, + { + "epoch": 0.042657542845514713, + "grad_norm": 0.004428897984325886, + "learning_rate": 0.001, + "loss": 0.407, + "step": 1546 + }, + { + "epoch": 0.04268513504657909, + "grad_norm": 0.00569180166348815, + "learning_rate": 0.001, + "loss": 0.4014, + "step": 1547 + }, + { + "epoch": 0.04271272724764345, + "grad_norm": 0.011190955527126789, + "learning_rate": 0.001, + "loss": 0.3869, + "step": 1548 + }, + { + "epoch": 0.04274031944870782, + "grad_norm": 0.015735691413283348, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 1549 + }, + { + "epoch": 0.04276791164977219, + "grad_norm": 0.0033663571812212467, + "learning_rate": 0.001, + "loss": 0.4163, + "step": 1550 + }, + { + "epoch": 0.04279550385083656, + "grad_norm": 0.005885041318833828, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 1551 + }, + { + "epoch": 0.04282309605190093, + "grad_norm": 0.022580578923225403, + "learning_rate": 0.001, + "loss": 0.3921, + "step": 1552 + }, + { + "epoch": 0.0428506882529653, + "grad_norm": 0.004381990525871515, + "learning_rate": 0.001, + "loss": 0.4167, + "step": 1553 + }, + { + "epoch": 0.042878280454029666, + "grad_norm": 0.0038387307431548834, + "learning_rate": 0.001, + "loss": 0.38, + "step": 1554 + }, + { + "epoch": 0.04290587265509404, + "grad_norm": 0.027915263548493385, + "learning_rate": 0.001, + "loss": 0.4126, + "step": 1555 + }, + { + "epoch": 0.042933464856158406, + "grad_norm": 0.006606489885598421, + "learning_rate": 0.001, + "loss": 0.3775, + "step": 1556 + }, + { + "epoch": 0.04296105705722278, + "grad_norm": 0.013772227801382542, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 1557 + }, + { + "epoch": 0.042988649258287145, + "grad_norm": 0.01971166953444481, + "learning_rate": 0.001, + "loss": 0.3872, + "step": 1558 + }, + { + "epoch": 0.04301624145935151, + "grad_norm": 0.0028942166827619076, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 1559 + }, + { + "epoch": 0.043043833660415885, + "grad_norm": 0.004030006006360054, + "learning_rate": 0.001, + "loss": 0.3701, + "step": 1560 + }, + { + "epoch": 0.04307142586148025, + "grad_norm": 0.0030979826115071774, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 1561 + }, + { + "epoch": 0.043099018062544625, + "grad_norm": 0.012965509667992592, + "learning_rate": 0.001, + "loss": 0.3686, + "step": 1562 + }, + { + "epoch": 0.04312661026360899, + "grad_norm": 0.0034757067915052176, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 1563 + }, + { + "epoch": 0.04315420246467336, + "grad_norm": 0.003752148011699319, + "learning_rate": 0.001, + "loss": 0.4315, + "step": 1564 + }, + { + "epoch": 0.04318179466573773, + "grad_norm": 0.002841662149876356, + "learning_rate": 0.001, + "loss": 0.3934, + "step": 1565 + }, + { + "epoch": 0.0432093868668021, + "grad_norm": 0.0031714646611362696, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 1566 + }, + { + "epoch": 0.043236979067866464, + "grad_norm": 0.004814228042960167, + "learning_rate": 0.001, + "loss": 0.3909, + "step": 1567 + }, + { + "epoch": 0.04326457126893084, + "grad_norm": 0.004090086091309786, + "learning_rate": 0.001, + "loss": 0.4061, + "step": 1568 + }, + { + "epoch": 0.043292163469995204, + "grad_norm": 0.0035240172874182463, + "learning_rate": 0.001, + "loss": 0.4002, + "step": 1569 + }, + { + "epoch": 0.04331975567105958, + "grad_norm": 0.0025755963288247585, + "learning_rate": 0.001, + "loss": 0.4287, + "step": 1570 + }, + { + "epoch": 0.043347347872123944, + "grad_norm": 0.010493564419448376, + "learning_rate": 0.001, + "loss": 0.4117, + "step": 1571 + }, + { + "epoch": 0.04337494007318831, + "grad_norm": 0.003166765673086047, + "learning_rate": 0.001, + "loss": 0.4402, + "step": 1572 + }, + { + "epoch": 0.043402532274252684, + "grad_norm": 0.004130365792661905, + "learning_rate": 0.001, + "loss": 0.395, + "step": 1573 + }, + { + "epoch": 0.04343012447531705, + "grad_norm": 0.003527469700202346, + "learning_rate": 0.001, + "loss": 0.4327, + "step": 1574 + }, + { + "epoch": 0.043457716676381423, + "grad_norm": 0.0034512521233409643, + "learning_rate": 0.001, + "loss": 0.3567, + "step": 1575 + }, + { + "epoch": 0.04348530887744579, + "grad_norm": 0.002996640047058463, + "learning_rate": 0.001, + "loss": 0.3649, + "step": 1576 + }, + { + "epoch": 0.043512901078510156, + "grad_norm": 0.0035705615300685167, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 1577 + }, + { + "epoch": 0.04354049327957453, + "grad_norm": 0.004510779399424791, + "learning_rate": 0.001, + "loss": 0.3945, + "step": 1578 + }, + { + "epoch": 0.043568085480638896, + "grad_norm": 0.005176417529582977, + "learning_rate": 0.001, + "loss": 0.3704, + "step": 1579 + }, + { + "epoch": 0.04359567768170327, + "grad_norm": 0.0046493723057210445, + "learning_rate": 0.001, + "loss": 0.4271, + "step": 1580 + }, + { + "epoch": 0.043623269882767636, + "grad_norm": 0.004336629528552294, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 1581 + }, + { + "epoch": 0.043650862083832, + "grad_norm": 0.0031178700737655163, + "learning_rate": 0.001, + "loss": 0.4275, + "step": 1582 + }, + { + "epoch": 0.043678454284896376, + "grad_norm": 0.006402260158210993, + "learning_rate": 0.001, + "loss": 0.4198, + "step": 1583 + }, + { + "epoch": 0.04370604648596074, + "grad_norm": 0.004152487497776747, + "learning_rate": 0.001, + "loss": 0.3969, + "step": 1584 + }, + { + "epoch": 0.043733638687025116, + "grad_norm": 0.00424406910315156, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 1585 + }, + { + "epoch": 0.04376123088808948, + "grad_norm": 0.005350259132683277, + "learning_rate": 0.001, + "loss": 0.3781, + "step": 1586 + }, + { + "epoch": 0.04378882308915385, + "grad_norm": 0.0027786684222519398, + "learning_rate": 0.001, + "loss": 0.4478, + "step": 1587 + }, + { + "epoch": 0.04381641529021822, + "grad_norm": 0.004228509031236172, + "learning_rate": 0.001, + "loss": 0.4055, + "step": 1588 + }, + { + "epoch": 0.04384400749128259, + "grad_norm": 0.0037349634803831577, + "learning_rate": 0.001, + "loss": 0.3892, + "step": 1589 + }, + { + "epoch": 0.043871599692346955, + "grad_norm": 0.0034225585404783487, + "learning_rate": 0.001, + "loss": 0.4, + "step": 1590 + }, + { + "epoch": 0.04389919189341133, + "grad_norm": 0.00584405055269599, + "learning_rate": 0.001, + "loss": 0.386, + "step": 1591 + }, + { + "epoch": 0.043926784094475695, + "grad_norm": 0.004452804569154978, + "learning_rate": 0.001, + "loss": 0.4242, + "step": 1592 + }, + { + "epoch": 0.04395437629554007, + "grad_norm": 0.0026068915612995625, + "learning_rate": 0.001, + "loss": 0.3954, + "step": 1593 + }, + { + "epoch": 0.043981968496604434, + "grad_norm": 0.003229719353839755, + "learning_rate": 0.001, + "loss": 0.4059, + "step": 1594 + }, + { + "epoch": 0.0440095606976688, + "grad_norm": 0.005484900437295437, + "learning_rate": 0.001, + "loss": 0.3981, + "step": 1595 + }, + { + "epoch": 0.044037152898733174, + "grad_norm": 0.007316559553146362, + "learning_rate": 0.001, + "loss": 0.3879, + "step": 1596 + }, + { + "epoch": 0.04406474509979754, + "grad_norm": 0.009250715374946594, + "learning_rate": 0.001, + "loss": 0.3797, + "step": 1597 + }, + { + "epoch": 0.044092337300861914, + "grad_norm": 0.004528039135038853, + "learning_rate": 0.001, + "loss": 0.371, + "step": 1598 + }, + { + "epoch": 0.04411992950192628, + "grad_norm": 0.005715006496757269, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 1599 + }, + { + "epoch": 0.04414752170299065, + "grad_norm": 0.0036250154953449965, + "learning_rate": 0.001, + "loss": 0.4285, + "step": 1600 + }, + { + "epoch": 0.04417511390405502, + "grad_norm": 0.007519236300140619, + "learning_rate": 0.001, + "loss": 0.4164, + "step": 1601 + }, + { + "epoch": 0.04420270610511939, + "grad_norm": 0.005943160969763994, + "learning_rate": 0.001, + "loss": 0.3953, + "step": 1602 + }, + { + "epoch": 0.04423029830618376, + "grad_norm": 0.00410908367484808, + "learning_rate": 0.001, + "loss": 0.3608, + "step": 1603 + }, + { + "epoch": 0.04425789050724813, + "grad_norm": 0.004322184715420008, + "learning_rate": 0.001, + "loss": 0.4226, + "step": 1604 + }, + { + "epoch": 0.04428548270831249, + "grad_norm": 0.0035136695951223373, + "learning_rate": 0.001, + "loss": 0.38, + "step": 1605 + }, + { + "epoch": 0.044313074909376866, + "grad_norm": 0.020684808492660522, + "learning_rate": 0.001, + "loss": 0.3983, + "step": 1606 + }, + { + "epoch": 0.04434066711044123, + "grad_norm": 0.00836837850511074, + "learning_rate": 0.001, + "loss": 0.4281, + "step": 1607 + }, + { + "epoch": 0.044368259311505606, + "grad_norm": 0.004143499303609133, + "learning_rate": 0.001, + "loss": 0.4234, + "step": 1608 + }, + { + "epoch": 0.04439585151256997, + "grad_norm": 0.005604143720120192, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 1609 + }, + { + "epoch": 0.04442344371363434, + "grad_norm": 0.0037680943496525288, + "learning_rate": 0.001, + "loss": 0.4055, + "step": 1610 + }, + { + "epoch": 0.04445103591469871, + "grad_norm": 0.004539195913821459, + "learning_rate": 0.001, + "loss": 0.4032, + "step": 1611 + }, + { + "epoch": 0.04447862811576308, + "grad_norm": 0.003528768662363291, + "learning_rate": 0.001, + "loss": 0.3904, + "step": 1612 + }, + { + "epoch": 0.044506220316827445, + "grad_norm": 0.002794221742078662, + "learning_rate": 0.001, + "loss": 0.4185, + "step": 1613 + }, + { + "epoch": 0.04453381251789182, + "grad_norm": 0.003042758908122778, + "learning_rate": 0.001, + "loss": 0.4114, + "step": 1614 + }, + { + "epoch": 0.044561404718956185, + "grad_norm": 0.0024623468052595854, + "learning_rate": 0.001, + "loss": 0.4099, + "step": 1615 + }, + { + "epoch": 0.04458899692002056, + "grad_norm": 0.0035985438153147697, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 1616 + }, + { + "epoch": 0.044616589121084925, + "grad_norm": 0.007022172212600708, + "learning_rate": 0.001, + "loss": 0.3593, + "step": 1617 + }, + { + "epoch": 0.04464418132214929, + "grad_norm": 0.003542872378602624, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 1618 + }, + { + "epoch": 0.044671773523213665, + "grad_norm": 0.0038910373114049435, + "learning_rate": 0.001, + "loss": 0.423, + "step": 1619 + }, + { + "epoch": 0.04469936572427803, + "grad_norm": 0.0027147457003593445, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 1620 + }, + { + "epoch": 0.044726957925342405, + "grad_norm": 0.002940715989097953, + "learning_rate": 0.001, + "loss": 0.4256, + "step": 1621 + }, + { + "epoch": 0.04475455012640677, + "grad_norm": 0.0030967697966843843, + "learning_rate": 0.001, + "loss": 0.3798, + "step": 1622 + }, + { + "epoch": 0.04478214232747114, + "grad_norm": 0.002238066168501973, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 1623 + }, + { + "epoch": 0.04480973452853551, + "grad_norm": 0.003412696998566389, + "learning_rate": 0.001, + "loss": 0.413, + "step": 1624 + }, + { + "epoch": 0.04483732672959988, + "grad_norm": 0.0026292402762919664, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 1625 + }, + { + "epoch": 0.04486491893066425, + "grad_norm": 0.004190067294985056, + "learning_rate": 0.001, + "loss": 0.3692, + "step": 1626 + }, + { + "epoch": 0.04489251113172862, + "grad_norm": 0.0036978810094296932, + "learning_rate": 0.001, + "loss": 0.3922, + "step": 1627 + }, + { + "epoch": 0.044920103332792984, + "grad_norm": 0.004438107367604971, + "learning_rate": 0.001, + "loss": 0.3819, + "step": 1628 + }, + { + "epoch": 0.04494769553385736, + "grad_norm": 0.004876590799540281, + "learning_rate": 0.001, + "loss": 0.3844, + "step": 1629 + }, + { + "epoch": 0.04497528773492172, + "grad_norm": 0.004272471182048321, + "learning_rate": 0.001, + "loss": 0.3869, + "step": 1630 + }, + { + "epoch": 0.0450028799359861, + "grad_norm": 0.006823899690061808, + "learning_rate": 0.001, + "loss": 0.4183, + "step": 1631 + }, + { + "epoch": 0.04503047213705046, + "grad_norm": 0.008064229972660542, + "learning_rate": 0.001, + "loss": 0.417, + "step": 1632 + }, + { + "epoch": 0.04505806433811483, + "grad_norm": 0.004609786439687014, + "learning_rate": 0.001, + "loss": 0.4324, + "step": 1633 + }, + { + "epoch": 0.0450856565391792, + "grad_norm": 0.0027909104246646166, + "learning_rate": 0.001, + "loss": 0.4091, + "step": 1634 + }, + { + "epoch": 0.04511324874024357, + "grad_norm": 0.0036747294943779707, + "learning_rate": 0.001, + "loss": 0.3533, + "step": 1635 + }, + { + "epoch": 0.045140840941307936, + "grad_norm": 0.0037599606439471245, + "learning_rate": 0.001, + "loss": 0.4282, + "step": 1636 + }, + { + "epoch": 0.04516843314237231, + "grad_norm": 0.0029045911505818367, + "learning_rate": 0.001, + "loss": 0.3837, + "step": 1637 + }, + { + "epoch": 0.045196025343436676, + "grad_norm": 0.0038696962874382734, + "learning_rate": 0.001, + "loss": 0.3883, + "step": 1638 + }, + { + "epoch": 0.04522361754450105, + "grad_norm": 0.004320462234318256, + "learning_rate": 0.001, + "loss": 0.3783, + "step": 1639 + }, + { + "epoch": 0.045251209745565416, + "grad_norm": 0.002876073122024536, + "learning_rate": 0.001, + "loss": 0.406, + "step": 1640 + }, + { + "epoch": 0.04527880194662978, + "grad_norm": 0.002509112237021327, + "learning_rate": 0.001, + "loss": 0.4383, + "step": 1641 + }, + { + "epoch": 0.045306394147694155, + "grad_norm": 0.012423038482666016, + "learning_rate": 0.001, + "loss": 0.3616, + "step": 1642 + }, + { + "epoch": 0.04533398634875852, + "grad_norm": 0.00442110188305378, + "learning_rate": 0.001, + "loss": 0.3814, + "step": 1643 + }, + { + "epoch": 0.045361578549822895, + "grad_norm": 0.003638372290879488, + "learning_rate": 0.001, + "loss": 0.4611, + "step": 1644 + }, + { + "epoch": 0.04538917075088726, + "grad_norm": 0.0037818588316440582, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 1645 + }, + { + "epoch": 0.04541676295195163, + "grad_norm": 0.0035038308706134558, + "learning_rate": 0.001, + "loss": 0.3853, + "step": 1646 + }, + { + "epoch": 0.045444355153016, + "grad_norm": 0.003616967238485813, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 1647 + }, + { + "epoch": 0.04547194735408037, + "grad_norm": 0.002239943016320467, + "learning_rate": 0.001, + "loss": 0.3769, + "step": 1648 + }, + { + "epoch": 0.04549953955514474, + "grad_norm": 0.0036853745114058256, + "learning_rate": 0.001, + "loss": 0.3835, + "step": 1649 + }, + { + "epoch": 0.04552713175620911, + "grad_norm": 0.00262830825522542, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 1650 + }, + { + "epoch": 0.045554723957273474, + "grad_norm": 0.010400556027889252, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 1651 + }, + { + "epoch": 0.04558231615833785, + "grad_norm": 0.0026481510140001774, + "learning_rate": 0.001, + "loss": 0.3712, + "step": 1652 + }, + { + "epoch": 0.045609908359402214, + "grad_norm": 0.0029154308140277863, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 1653 + }, + { + "epoch": 0.04563750056046659, + "grad_norm": 0.0028925796505063772, + "learning_rate": 0.001, + "loss": 0.3907, + "step": 1654 + }, + { + "epoch": 0.045665092761530954, + "grad_norm": 0.0028384907636791468, + "learning_rate": 0.001, + "loss": 0.411, + "step": 1655 + }, + { + "epoch": 0.04569268496259532, + "grad_norm": 0.0039413925260305405, + "learning_rate": 0.001, + "loss": 0.3847, + "step": 1656 + }, + { + "epoch": 0.045720277163659694, + "grad_norm": 0.004098753910511732, + "learning_rate": 0.001, + "loss": 0.4258, + "step": 1657 + }, + { + "epoch": 0.04574786936472406, + "grad_norm": 0.0032568899914622307, + "learning_rate": 0.001, + "loss": 0.3791, + "step": 1658 + }, + { + "epoch": 0.045775461565788426, + "grad_norm": 0.0038113740738481283, + "learning_rate": 0.001, + "loss": 0.4379, + "step": 1659 + }, + { + "epoch": 0.0458030537668528, + "grad_norm": 0.004003043286502361, + "learning_rate": 0.001, + "loss": 0.3718, + "step": 1660 + }, + { + "epoch": 0.045830645967917166, + "grad_norm": 0.003037869231775403, + "learning_rate": 0.001, + "loss": 0.3917, + "step": 1661 + }, + { + "epoch": 0.04585823816898154, + "grad_norm": 0.004264459945261478, + "learning_rate": 0.001, + "loss": 0.4109, + "step": 1662 + }, + { + "epoch": 0.045885830370045906, + "grad_norm": 0.0026220069266855717, + "learning_rate": 0.001, + "loss": 0.4086, + "step": 1663 + }, + { + "epoch": 0.04591342257111027, + "grad_norm": 0.003170250216498971, + "learning_rate": 0.001, + "loss": 0.3909, + "step": 1664 + }, + { + "epoch": 0.045941014772174646, + "grad_norm": 0.004167741164565086, + "learning_rate": 0.001, + "loss": 0.4141, + "step": 1665 + }, + { + "epoch": 0.04596860697323901, + "grad_norm": 0.0035946646239608526, + "learning_rate": 0.001, + "loss": 0.4111, + "step": 1666 + }, + { + "epoch": 0.045996199174303386, + "grad_norm": 0.003959175664931536, + "learning_rate": 0.001, + "loss": 0.407, + "step": 1667 + }, + { + "epoch": 0.04602379137536775, + "grad_norm": 0.004025304224342108, + "learning_rate": 0.001, + "loss": 0.4136, + "step": 1668 + }, + { + "epoch": 0.04605138357643212, + "grad_norm": 0.004997665528208017, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 1669 + }, + { + "epoch": 0.04607897577749649, + "grad_norm": 0.003882192773744464, + "learning_rate": 0.001, + "loss": 0.397, + "step": 1670 + }, + { + "epoch": 0.04610656797856086, + "grad_norm": 0.006935080513358116, + "learning_rate": 0.001, + "loss": 0.412, + "step": 1671 + }, + { + "epoch": 0.04613416017962523, + "grad_norm": 0.0038688175845891237, + "learning_rate": 0.001, + "loss": 0.3767, + "step": 1672 + }, + { + "epoch": 0.0461617523806896, + "grad_norm": 0.0037809666246175766, + "learning_rate": 0.001, + "loss": 0.4086, + "step": 1673 + }, + { + "epoch": 0.046189344581753965, + "grad_norm": 0.009138336405158043, + "learning_rate": 0.001, + "loss": 0.4027, + "step": 1674 + }, + { + "epoch": 0.04621693678281834, + "grad_norm": 0.004577755928039551, + "learning_rate": 0.001, + "loss": 0.3929, + "step": 1675 + }, + { + "epoch": 0.046244528983882704, + "grad_norm": 0.0038164344150573015, + "learning_rate": 0.001, + "loss": 0.441, + "step": 1676 + }, + { + "epoch": 0.04627212118494708, + "grad_norm": 0.004704809281975031, + "learning_rate": 0.001, + "loss": 0.3689, + "step": 1677 + }, + { + "epoch": 0.046299713386011444, + "grad_norm": 0.008598784916102886, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 1678 + }, + { + "epoch": 0.04632730558707581, + "grad_norm": 0.004788943100720644, + "learning_rate": 0.001, + "loss": 0.4133, + "step": 1679 + }, + { + "epoch": 0.046354897788140184, + "grad_norm": 0.0038863064255565405, + "learning_rate": 0.001, + "loss": 0.3827, + "step": 1680 + }, + { + "epoch": 0.04638248998920455, + "grad_norm": 0.0029795279260724783, + "learning_rate": 0.001, + "loss": 0.4245, + "step": 1681 + }, + { + "epoch": 0.046410082190268924, + "grad_norm": 0.003151806304231286, + "learning_rate": 0.001, + "loss": 0.4215, + "step": 1682 + }, + { + "epoch": 0.04643767439133329, + "grad_norm": 0.006868270225822926, + "learning_rate": 0.001, + "loss": 0.4285, + "step": 1683 + }, + { + "epoch": 0.04646526659239766, + "grad_norm": 0.00662572355940938, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 1684 + }, + { + "epoch": 0.04649285879346203, + "grad_norm": 0.01428454089909792, + "learning_rate": 0.001, + "loss": 0.3854, + "step": 1685 + }, + { + "epoch": 0.0465204509945264, + "grad_norm": 0.006754969246685505, + "learning_rate": 0.001, + "loss": 0.3699, + "step": 1686 + }, + { + "epoch": 0.04654804319559076, + "grad_norm": 0.00340940966270864, + "learning_rate": 0.001, + "loss": 0.3674, + "step": 1687 + }, + { + "epoch": 0.046575635396655136, + "grad_norm": 0.003416246036067605, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 1688 + }, + { + "epoch": 0.0466032275977195, + "grad_norm": 0.0039048672188073397, + "learning_rate": 0.001, + "loss": 0.3848, + "step": 1689 + }, + { + "epoch": 0.046630819798783876, + "grad_norm": 0.003685768460854888, + "learning_rate": 0.001, + "loss": 0.4198, + "step": 1690 + }, + { + "epoch": 0.04665841199984824, + "grad_norm": 0.006730150897055864, + "learning_rate": 0.001, + "loss": 0.3419, + "step": 1691 + }, + { + "epoch": 0.04668600420091261, + "grad_norm": 0.004073324613273144, + "learning_rate": 0.001, + "loss": 0.4148, + "step": 1692 + }, + { + "epoch": 0.04671359640197698, + "grad_norm": 0.0040067946538329124, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 1693 + }, + { + "epoch": 0.04674118860304135, + "grad_norm": 0.006488442420959473, + "learning_rate": 0.001, + "loss": 0.4193, + "step": 1694 + }, + { + "epoch": 0.04676878080410572, + "grad_norm": 0.006833325605839491, + "learning_rate": 0.001, + "loss": 0.3841, + "step": 1695 + }, + { + "epoch": 0.04679637300517009, + "grad_norm": 0.0035828256513923407, + "learning_rate": 0.001, + "loss": 0.3844, + "step": 1696 + }, + { + "epoch": 0.046823965206234455, + "grad_norm": 0.0032494564075022936, + "learning_rate": 0.001, + "loss": 0.3772, + "step": 1697 + }, + { + "epoch": 0.04685155740729883, + "grad_norm": 0.0033526027109473944, + "learning_rate": 0.001, + "loss": 0.4183, + "step": 1698 + }, + { + "epoch": 0.046879149608363195, + "grad_norm": 0.004358900245279074, + "learning_rate": 0.001, + "loss": 0.4057, + "step": 1699 + }, + { + "epoch": 0.04690674180942757, + "grad_norm": 0.0045670876279473305, + "learning_rate": 0.001, + "loss": 0.3853, + "step": 1700 + }, + { + "epoch": 0.046934334010491935, + "grad_norm": 0.004190321080386639, + "learning_rate": 0.001, + "loss": 0.4115, + "step": 1701 + }, + { + "epoch": 0.0469619262115563, + "grad_norm": 0.0030443707946687937, + "learning_rate": 0.001, + "loss": 0.4293, + "step": 1702 + }, + { + "epoch": 0.046989518412620675, + "grad_norm": 0.003604897065088153, + "learning_rate": 0.001, + "loss": 0.4429, + "step": 1703 + }, + { + "epoch": 0.04701711061368504, + "grad_norm": 0.003690918907523155, + "learning_rate": 0.001, + "loss": 0.3883, + "step": 1704 + }, + { + "epoch": 0.047044702814749415, + "grad_norm": 0.0034342606086283922, + "learning_rate": 0.001, + "loss": 0.4184, + "step": 1705 + }, + { + "epoch": 0.04707229501581378, + "grad_norm": 0.003786922199651599, + "learning_rate": 0.001, + "loss": 0.4286, + "step": 1706 + }, + { + "epoch": 0.04709988721687815, + "grad_norm": 0.0021744174882769585, + "learning_rate": 0.001, + "loss": 0.3615, + "step": 1707 + }, + { + "epoch": 0.04712747941794252, + "grad_norm": 0.004297124687582254, + "learning_rate": 0.001, + "loss": 0.3889, + "step": 1708 + }, + { + "epoch": 0.04715507161900689, + "grad_norm": 0.003097895532846451, + "learning_rate": 0.001, + "loss": 0.4263, + "step": 1709 + }, + { + "epoch": 0.047182663820071254, + "grad_norm": 0.0027625923976302147, + "learning_rate": 0.001, + "loss": 0.4183, + "step": 1710 + }, + { + "epoch": 0.04721025602113563, + "grad_norm": 0.024495547637343407, + "learning_rate": 0.001, + "loss": 0.3727, + "step": 1711 + }, + { + "epoch": 0.04723784822219999, + "grad_norm": 0.006063805893063545, + "learning_rate": 0.001, + "loss": 0.4558, + "step": 1712 + }, + { + "epoch": 0.04726544042326437, + "grad_norm": 0.002210379345342517, + "learning_rate": 0.001, + "loss": 0.4529, + "step": 1713 + }, + { + "epoch": 0.04729303262432873, + "grad_norm": 0.002300212625414133, + "learning_rate": 0.001, + "loss": 0.42, + "step": 1714 + }, + { + "epoch": 0.0473206248253931, + "grad_norm": 0.002808920806273818, + "learning_rate": 0.001, + "loss": 0.4337, + "step": 1715 + }, + { + "epoch": 0.04734821702645747, + "grad_norm": 0.0031099985353648663, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 1716 + }, + { + "epoch": 0.04737580922752184, + "grad_norm": 0.003029707819223404, + "learning_rate": 0.001, + "loss": 0.3879, + "step": 1717 + }, + { + "epoch": 0.04740340142858621, + "grad_norm": 0.005905452184379101, + "learning_rate": 0.001, + "loss": 0.4024, + "step": 1718 + }, + { + "epoch": 0.04743099362965058, + "grad_norm": 0.002499626949429512, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 1719 + }, + { + "epoch": 0.047458585830714946, + "grad_norm": 0.0029100440442562103, + "learning_rate": 0.001, + "loss": 0.3769, + "step": 1720 + }, + { + "epoch": 0.04748617803177932, + "grad_norm": 0.0029877678025513887, + "learning_rate": 0.001, + "loss": 0.3793, + "step": 1721 + }, + { + "epoch": 0.047513770232843686, + "grad_norm": 0.0023172239307314157, + "learning_rate": 0.001, + "loss": 0.4134, + "step": 1722 + }, + { + "epoch": 0.04754136243390806, + "grad_norm": 0.0031008669175207615, + "learning_rate": 0.001, + "loss": 0.3727, + "step": 1723 + }, + { + "epoch": 0.047568954634972425, + "grad_norm": 0.003588200779631734, + "learning_rate": 0.001, + "loss": 0.4225, + "step": 1724 + }, + { + "epoch": 0.04759654683603679, + "grad_norm": 0.0027301576919853687, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 1725 + }, + { + "epoch": 0.047624139037101165, + "grad_norm": 0.003348779398947954, + "learning_rate": 0.001, + "loss": 0.3554, + "step": 1726 + }, + { + "epoch": 0.04765173123816553, + "grad_norm": 0.003580378834158182, + "learning_rate": 0.001, + "loss": 0.3819, + "step": 1727 + }, + { + "epoch": 0.047679323439229905, + "grad_norm": 0.004255024716258049, + "learning_rate": 0.001, + "loss": 0.4169, + "step": 1728 + }, + { + "epoch": 0.04770691564029427, + "grad_norm": 0.0033867263700813055, + "learning_rate": 0.001, + "loss": 0.3654, + "step": 1729 + }, + { + "epoch": 0.04773450784135864, + "grad_norm": 0.005044759716838598, + "learning_rate": 0.001, + "loss": 0.416, + "step": 1730 + }, + { + "epoch": 0.04776210004242301, + "grad_norm": 0.0034105442464351654, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 1731 + }, + { + "epoch": 0.04778969224348738, + "grad_norm": 0.004883192479610443, + "learning_rate": 0.001, + "loss": 0.4679, + "step": 1732 + }, + { + "epoch": 0.047817284444551744, + "grad_norm": 0.003439564723521471, + "learning_rate": 0.001, + "loss": 0.3782, + "step": 1733 + }, + { + "epoch": 0.04784487664561612, + "grad_norm": 0.006362561602145433, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 1734 + }, + { + "epoch": 0.047872468846680484, + "grad_norm": 0.004292085766792297, + "learning_rate": 0.001, + "loss": 0.4047, + "step": 1735 + }, + { + "epoch": 0.04790006104774486, + "grad_norm": 0.00374743458814919, + "learning_rate": 0.001, + "loss": 0.3785, + "step": 1736 + }, + { + "epoch": 0.047927653248809224, + "grad_norm": 0.004549616016447544, + "learning_rate": 0.001, + "loss": 0.3976, + "step": 1737 + }, + { + "epoch": 0.04795524544987359, + "grad_norm": 0.008672794327139854, + "learning_rate": 0.001, + "loss": 0.3664, + "step": 1738 + }, + { + "epoch": 0.047982837650937964, + "grad_norm": 0.007166683673858643, + "learning_rate": 0.001, + "loss": 0.416, + "step": 1739 + }, + { + "epoch": 0.04801042985200233, + "grad_norm": 0.0030777885112911463, + "learning_rate": 0.001, + "loss": 0.4281, + "step": 1740 + }, + { + "epoch": 0.048038022053066703, + "grad_norm": 0.0049812328070402145, + "learning_rate": 0.001, + "loss": 0.424, + "step": 1741 + }, + { + "epoch": 0.04806561425413107, + "grad_norm": 0.005072145257145166, + "learning_rate": 0.001, + "loss": 0.3912, + "step": 1742 + }, + { + "epoch": 0.048093206455195436, + "grad_norm": 0.004582617431879044, + "learning_rate": 0.001, + "loss": 0.3764, + "step": 1743 + }, + { + "epoch": 0.04812079865625981, + "grad_norm": 0.004273936152458191, + "learning_rate": 0.001, + "loss": 0.3881, + "step": 1744 + }, + { + "epoch": 0.048148390857324176, + "grad_norm": 0.003426861949265003, + "learning_rate": 0.001, + "loss": 0.3663, + "step": 1745 + }, + { + "epoch": 0.04817598305838855, + "grad_norm": 0.0036460154224187136, + "learning_rate": 0.001, + "loss": 0.3976, + "step": 1746 + }, + { + "epoch": 0.048203575259452916, + "grad_norm": 0.003482312895357609, + "learning_rate": 0.001, + "loss": 0.4189, + "step": 1747 + }, + { + "epoch": 0.04823116746051728, + "grad_norm": 0.0033532571978867054, + "learning_rate": 0.001, + "loss": 0.4093, + "step": 1748 + }, + { + "epoch": 0.048258759661581656, + "grad_norm": 0.0038984876591712236, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 1749 + }, + { + "epoch": 0.04828635186264602, + "grad_norm": 0.005925590638071299, + "learning_rate": 0.001, + "loss": 0.3672, + "step": 1750 + }, + { + "epoch": 0.048313944063710396, + "grad_norm": 0.005235752090811729, + "learning_rate": 0.001, + "loss": 0.3777, + "step": 1751 + }, + { + "epoch": 0.04834153626477476, + "grad_norm": 0.00452176108956337, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 1752 + }, + { + "epoch": 0.04836912846583913, + "grad_norm": 0.004116971977055073, + "learning_rate": 0.001, + "loss": 0.3719, + "step": 1753 + }, + { + "epoch": 0.0483967206669035, + "grad_norm": 0.0037535110022872686, + "learning_rate": 0.001, + "loss": 0.3643, + "step": 1754 + }, + { + "epoch": 0.04842431286796787, + "grad_norm": 0.0049346694722771645, + "learning_rate": 0.001, + "loss": 0.3676, + "step": 1755 + }, + { + "epoch": 0.048451905069032235, + "grad_norm": 0.006009018048644066, + "learning_rate": 0.001, + "loss": 0.3844, + "step": 1756 + }, + { + "epoch": 0.04847949727009661, + "grad_norm": 0.003077111905440688, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 1757 + }, + { + "epoch": 0.048507089471160975, + "grad_norm": 0.004866272211074829, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 1758 + }, + { + "epoch": 0.04853468167222535, + "grad_norm": 0.004328357521444559, + "learning_rate": 0.001, + "loss": 0.4211, + "step": 1759 + }, + { + "epoch": 0.048562273873289714, + "grad_norm": 0.003560771932825446, + "learning_rate": 0.001, + "loss": 0.4317, + "step": 1760 + }, + { + "epoch": 0.04858986607435408, + "grad_norm": 0.00306741357780993, + "learning_rate": 0.001, + "loss": 0.4185, + "step": 1761 + }, + { + "epoch": 0.048617458275418454, + "grad_norm": 0.010002429597079754, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 1762 + }, + { + "epoch": 0.04864505047648282, + "grad_norm": 0.006219691131263971, + "learning_rate": 0.001, + "loss": 0.3574, + "step": 1763 + }, + { + "epoch": 0.048672642677547194, + "grad_norm": 0.0033367029391229153, + "learning_rate": 0.001, + "loss": 0.4001, + "step": 1764 + }, + { + "epoch": 0.04870023487861156, + "grad_norm": 0.002981371246278286, + "learning_rate": 0.001, + "loss": 0.3876, + "step": 1765 + }, + { + "epoch": 0.04872782707967593, + "grad_norm": 0.004406462889164686, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 1766 + }, + { + "epoch": 0.0487554192807403, + "grad_norm": 0.0028726779855787754, + "learning_rate": 0.001, + "loss": 0.3537, + "step": 1767 + }, + { + "epoch": 0.04878301148180467, + "grad_norm": 0.0029508452862501144, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 1768 + }, + { + "epoch": 0.04881060368286904, + "grad_norm": 0.004091752227395773, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 1769 + }, + { + "epoch": 0.04883819588393341, + "grad_norm": 0.0027895302046090364, + "learning_rate": 0.001, + "loss": 0.3969, + "step": 1770 + }, + { + "epoch": 0.04886578808499777, + "grad_norm": 0.003976823296397924, + "learning_rate": 0.001, + "loss": 0.4086, + "step": 1771 + }, + { + "epoch": 0.048893380286062146, + "grad_norm": 0.0030748520512133837, + "learning_rate": 0.001, + "loss": 0.4191, + "step": 1772 + }, + { + "epoch": 0.04892097248712651, + "grad_norm": 0.005313929636031389, + "learning_rate": 0.001, + "loss": 0.4141, + "step": 1773 + }, + { + "epoch": 0.048948564688190886, + "grad_norm": 0.003955441992729902, + "learning_rate": 0.001, + "loss": 0.4196, + "step": 1774 + }, + { + "epoch": 0.04897615688925525, + "grad_norm": 0.007145262788981199, + "learning_rate": 0.001, + "loss": 0.4179, + "step": 1775 + }, + { + "epoch": 0.04900374909031962, + "grad_norm": 0.003774230368435383, + "learning_rate": 0.001, + "loss": 0.3864, + "step": 1776 + }, + { + "epoch": 0.04903134129138399, + "grad_norm": 0.002694410039111972, + "learning_rate": 0.001, + "loss": 0.389, + "step": 1777 + }, + { + "epoch": 0.04905893349244836, + "grad_norm": 0.003494556527584791, + "learning_rate": 0.001, + "loss": 0.3966, + "step": 1778 + }, + { + "epoch": 0.049086525693512725, + "grad_norm": 0.0033225903753191233, + "learning_rate": 0.001, + "loss": 0.3659, + "step": 1779 + }, + { + "epoch": 0.0491141178945771, + "grad_norm": 0.0031309863552451134, + "learning_rate": 0.001, + "loss": 0.3728, + "step": 1780 + }, + { + "epoch": 0.049141710095641465, + "grad_norm": 0.0028971245046705008, + "learning_rate": 0.001, + "loss": 0.3719, + "step": 1781 + }, + { + "epoch": 0.04916930229670584, + "grad_norm": 0.006014004349708557, + "learning_rate": 0.001, + "loss": 0.3521, + "step": 1782 + }, + { + "epoch": 0.049196894497770205, + "grad_norm": 0.002988451160490513, + "learning_rate": 0.001, + "loss": 0.4184, + "step": 1783 + }, + { + "epoch": 0.04922448669883457, + "grad_norm": 0.0036536797415465117, + "learning_rate": 0.001, + "loss": 0.4109, + "step": 1784 + }, + { + "epoch": 0.049252078899898945, + "grad_norm": 0.00857572816312313, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 1785 + }, + { + "epoch": 0.04927967110096331, + "grad_norm": 0.003724129404872656, + "learning_rate": 0.001, + "loss": 0.4247, + "step": 1786 + }, + { + "epoch": 0.049307263302027685, + "grad_norm": 0.004292929545044899, + "learning_rate": 0.001, + "loss": 0.4128, + "step": 1787 + }, + { + "epoch": 0.04933485550309205, + "grad_norm": 0.004144448321312666, + "learning_rate": 0.001, + "loss": 0.3601, + "step": 1788 + }, + { + "epoch": 0.04936244770415642, + "grad_norm": 0.002937472425401211, + "learning_rate": 0.001, + "loss": 0.3888, + "step": 1789 + }, + { + "epoch": 0.04939003990522079, + "grad_norm": 0.0032268317881971598, + "learning_rate": 0.001, + "loss": 0.3306, + "step": 1790 + }, + { + "epoch": 0.04941763210628516, + "grad_norm": 0.0034790022764354944, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 1791 + }, + { + "epoch": 0.04944522430734953, + "grad_norm": 0.004379100166261196, + "learning_rate": 0.001, + "loss": 0.387, + "step": 1792 + }, + { + "epoch": 0.0494728165084139, + "grad_norm": 0.00435485178604722, + "learning_rate": 0.001, + "loss": 0.361, + "step": 1793 + }, + { + "epoch": 0.049500408709478264, + "grad_norm": 0.0028884608764201403, + "learning_rate": 0.001, + "loss": 0.3888, + "step": 1794 + }, + { + "epoch": 0.04952800091054264, + "grad_norm": 0.0035733478143811226, + "learning_rate": 0.001, + "loss": 0.3798, + "step": 1795 + }, + { + "epoch": 0.049555593111607, + "grad_norm": 0.003197557758539915, + "learning_rate": 0.001, + "loss": 0.4283, + "step": 1796 + }, + { + "epoch": 0.04958318531267138, + "grad_norm": 0.012362437322735786, + "learning_rate": 0.001, + "loss": 0.4401, + "step": 1797 + }, + { + "epoch": 0.04961077751373574, + "grad_norm": 0.005861248355358839, + "learning_rate": 0.001, + "loss": 0.4287, + "step": 1798 + }, + { + "epoch": 0.04963836971480011, + "grad_norm": 0.0055047376081347466, + "learning_rate": 0.001, + "loss": 0.3827, + "step": 1799 + }, + { + "epoch": 0.04966596191586448, + "grad_norm": 0.005381823051720858, + "learning_rate": 0.001, + "loss": 0.4094, + "step": 1800 + }, + { + "epoch": 0.04969355411692885, + "grad_norm": 0.003702189540490508, + "learning_rate": 0.001, + "loss": 0.416, + "step": 1801 + }, + { + "epoch": 0.049721146317993216, + "grad_norm": 0.005075744818896055, + "learning_rate": 0.001, + "loss": 0.3941, + "step": 1802 + }, + { + "epoch": 0.04974873851905759, + "grad_norm": 0.0033848159946501255, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 1803 + }, + { + "epoch": 0.049776330720121956, + "grad_norm": 0.007472567725926638, + "learning_rate": 0.001, + "loss": 0.3677, + "step": 1804 + }, + { + "epoch": 0.04980392292118633, + "grad_norm": 0.005941023584455252, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 1805 + }, + { + "epoch": 0.049831515122250696, + "grad_norm": 0.00963501911610365, + "learning_rate": 0.001, + "loss": 0.3943, + "step": 1806 + }, + { + "epoch": 0.04985910732331506, + "grad_norm": 0.017620790749788284, + "learning_rate": 0.001, + "loss": 0.4225, + "step": 1807 + }, + { + "epoch": 0.049886699524379435, + "grad_norm": 0.0027190614491701126, + "learning_rate": 0.001, + "loss": 0.3788, + "step": 1808 + }, + { + "epoch": 0.0499142917254438, + "grad_norm": 0.003102682065218687, + "learning_rate": 0.001, + "loss": 0.4197, + "step": 1809 + }, + { + "epoch": 0.049941883926508175, + "grad_norm": 0.0033104156609624624, + "learning_rate": 0.001, + "loss": 0.3862, + "step": 1810 + }, + { + "epoch": 0.04996947612757254, + "grad_norm": 0.0038089246954768896, + "learning_rate": 0.001, + "loss": 0.4085, + "step": 1811 + }, + { + "epoch": 0.04999706832863691, + "grad_norm": 0.005627461709082127, + "learning_rate": 0.001, + "loss": 0.3882, + "step": 1812 + }, + { + "epoch": 0.05002466052970128, + "grad_norm": 0.0027909004129469395, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 1813 + }, + { + "epoch": 0.05005225273076565, + "grad_norm": 0.0031870375387370586, + "learning_rate": 0.001, + "loss": 0.4156, + "step": 1814 + }, + { + "epoch": 0.05007984493183002, + "grad_norm": 0.0030249350238591433, + "learning_rate": 0.001, + "loss": 0.3822, + "step": 1815 + }, + { + "epoch": 0.05010743713289439, + "grad_norm": 0.0051695844158530235, + "learning_rate": 0.001, + "loss": 0.3823, + "step": 1816 + }, + { + "epoch": 0.050135029333958754, + "grad_norm": 0.004010764416307211, + "learning_rate": 0.001, + "loss": 0.3718, + "step": 1817 + }, + { + "epoch": 0.05016262153502313, + "grad_norm": 0.004209370352327824, + "learning_rate": 0.001, + "loss": 0.3638, + "step": 1818 + }, + { + "epoch": 0.050190213736087494, + "grad_norm": 0.004149060230702162, + "learning_rate": 0.001, + "loss": 0.3765, + "step": 1819 + }, + { + "epoch": 0.05021780593715187, + "grad_norm": 0.02054491639137268, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 1820 + }, + { + "epoch": 0.050245398138216234, + "grad_norm": 0.017421457916498184, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 1821 + }, + { + "epoch": 0.0502729903392806, + "grad_norm": 0.002541644498705864, + "learning_rate": 0.001, + "loss": 0.4215, + "step": 1822 + }, + { + "epoch": 0.050300582540344974, + "grad_norm": 0.0031819387804716825, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 1823 + }, + { + "epoch": 0.05032817474140934, + "grad_norm": 0.004389981739223003, + "learning_rate": 0.001, + "loss": 0.414, + "step": 1824 + }, + { + "epoch": 0.050355766942473706, + "grad_norm": 0.0037767868489027023, + "learning_rate": 0.001, + "loss": 0.4166, + "step": 1825 + }, + { + "epoch": 0.05038335914353808, + "grad_norm": 0.0033333280589431524, + "learning_rate": 0.001, + "loss": 0.4304, + "step": 1826 + }, + { + "epoch": 0.050410951344602446, + "grad_norm": 0.0033245261292904615, + "learning_rate": 0.001, + "loss": 0.4512, + "step": 1827 + }, + { + "epoch": 0.05043854354566682, + "grad_norm": 0.005083107389509678, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 1828 + }, + { + "epoch": 0.050466135746731186, + "grad_norm": 0.002904941327869892, + "learning_rate": 0.001, + "loss": 0.4123, + "step": 1829 + }, + { + "epoch": 0.05049372794779555, + "grad_norm": 0.0927748754620552, + "learning_rate": 0.001, + "loss": 0.381, + "step": 1830 + }, + { + "epoch": 0.050521320148859926, + "grad_norm": 0.0028325989842414856, + "learning_rate": 0.001, + "loss": 0.4329, + "step": 1831 + }, + { + "epoch": 0.05054891234992429, + "grad_norm": 0.005994097795337439, + "learning_rate": 0.001, + "loss": 0.3843, + "step": 1832 + }, + { + "epoch": 0.050576504550988666, + "grad_norm": 0.0032757038716226816, + "learning_rate": 0.001, + "loss": 0.4053, + "step": 1833 + }, + { + "epoch": 0.05060409675205303, + "grad_norm": 0.0055892630480229855, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 1834 + }, + { + "epoch": 0.0506316889531174, + "grad_norm": 0.007300201803445816, + "learning_rate": 0.001, + "loss": 0.4304, + "step": 1835 + }, + { + "epoch": 0.05065928115418177, + "grad_norm": 0.004705760162323713, + "learning_rate": 0.001, + "loss": 0.4032, + "step": 1836 + }, + { + "epoch": 0.05068687335524614, + "grad_norm": 0.0038876638282090425, + "learning_rate": 0.001, + "loss": 0.4482, + "step": 1837 + }, + { + "epoch": 0.05071446555631051, + "grad_norm": 0.0049439528957009315, + "learning_rate": 0.001, + "loss": 0.3771, + "step": 1838 + }, + { + "epoch": 0.05074205775737488, + "grad_norm": 0.00367721077054739, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 1839 + }, + { + "epoch": 0.050769649958439245, + "grad_norm": 0.0021761716343462467, + "learning_rate": 0.001, + "loss": 0.3918, + "step": 1840 + }, + { + "epoch": 0.05079724215950362, + "grad_norm": 0.003433308796957135, + "learning_rate": 0.001, + "loss": 0.3675, + "step": 1841 + }, + { + "epoch": 0.050824834360567984, + "grad_norm": 0.0041929068975150585, + "learning_rate": 0.001, + "loss": 0.4287, + "step": 1842 + }, + { + "epoch": 0.05085242656163236, + "grad_norm": 0.003800287377089262, + "learning_rate": 0.001, + "loss": 0.3831, + "step": 1843 + }, + { + "epoch": 0.050880018762696724, + "grad_norm": 0.00222361390478909, + "learning_rate": 0.001, + "loss": 0.3766, + "step": 1844 + }, + { + "epoch": 0.05090761096376109, + "grad_norm": 0.010053042322397232, + "learning_rate": 0.001, + "loss": 0.3897, + "step": 1845 + }, + { + "epoch": 0.050935203164825464, + "grad_norm": 0.002249652985483408, + "learning_rate": 0.001, + "loss": 0.4388, + "step": 1846 + }, + { + "epoch": 0.05096279536588983, + "grad_norm": 0.0034179852809756994, + "learning_rate": 0.001, + "loss": 0.4085, + "step": 1847 + }, + { + "epoch": 0.0509903875669542, + "grad_norm": 0.0027194637805223465, + "learning_rate": 0.001, + "loss": 0.3943, + "step": 1848 + }, + { + "epoch": 0.05101797976801857, + "grad_norm": 0.00189922412391752, + "learning_rate": 0.001, + "loss": 0.45, + "step": 1849 + }, + { + "epoch": 0.05104557196908294, + "grad_norm": 0.0029718675650656223, + "learning_rate": 0.001, + "loss": 0.408, + "step": 1850 + }, + { + "epoch": 0.05107316417014731, + "grad_norm": 0.002118749311193824, + "learning_rate": 0.001, + "loss": 0.4314, + "step": 1851 + }, + { + "epoch": 0.05110075637121168, + "grad_norm": 0.001921525807119906, + "learning_rate": 0.001, + "loss": 0.401, + "step": 1852 + }, + { + "epoch": 0.05112834857227604, + "grad_norm": 0.002200416987761855, + "learning_rate": 0.001, + "loss": 0.4341, + "step": 1853 + }, + { + "epoch": 0.051155940773340416, + "grad_norm": 0.004263362381607294, + "learning_rate": 0.001, + "loss": 0.3937, + "step": 1854 + }, + { + "epoch": 0.05118353297440478, + "grad_norm": 0.002614164724946022, + "learning_rate": 0.001, + "loss": 0.4168, + "step": 1855 + }, + { + "epoch": 0.051211125175469156, + "grad_norm": 0.002143037738278508, + "learning_rate": 0.001, + "loss": 0.4199, + "step": 1856 + }, + { + "epoch": 0.05123871737653352, + "grad_norm": 0.0025298171676695347, + "learning_rate": 0.001, + "loss": 0.4177, + "step": 1857 + }, + { + "epoch": 0.05126630957759789, + "grad_norm": 0.0026806145906448364, + "learning_rate": 0.001, + "loss": 0.3976, + "step": 1858 + }, + { + "epoch": 0.05129390177866226, + "grad_norm": 0.0022738908883184195, + "learning_rate": 0.001, + "loss": 0.4439, + "step": 1859 + }, + { + "epoch": 0.05132149397972663, + "grad_norm": 0.0023423023521900177, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 1860 + }, + { + "epoch": 0.051349086180791, + "grad_norm": 0.0033382964320480824, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 1861 + }, + { + "epoch": 0.05137667838185537, + "grad_norm": 0.002349057700484991, + "learning_rate": 0.001, + "loss": 0.4096, + "step": 1862 + }, + { + "epoch": 0.051404270582919735, + "grad_norm": 0.0029646153561770916, + "learning_rate": 0.001, + "loss": 0.3665, + "step": 1863 + }, + { + "epoch": 0.05143186278398411, + "grad_norm": 0.003112231148406863, + "learning_rate": 0.001, + "loss": 0.3745, + "step": 1864 + }, + { + "epoch": 0.051459454985048475, + "grad_norm": 0.006748152896761894, + "learning_rate": 0.001, + "loss": 0.3885, + "step": 1865 + }, + { + "epoch": 0.05148704718611285, + "grad_norm": 0.003962017595767975, + "learning_rate": 0.001, + "loss": 0.3669, + "step": 1866 + }, + { + "epoch": 0.051514639387177215, + "grad_norm": 0.0032168615143746138, + "learning_rate": 0.001, + "loss": 0.4077, + "step": 1867 + }, + { + "epoch": 0.05154223158824158, + "grad_norm": 0.00549092423170805, + "learning_rate": 0.001, + "loss": 0.3736, + "step": 1868 + }, + { + "epoch": 0.051569823789305955, + "grad_norm": 0.005641186144202948, + "learning_rate": 0.001, + "loss": 0.373, + "step": 1869 + }, + { + "epoch": 0.05159741599037032, + "grad_norm": 0.0029503426048904657, + "learning_rate": 0.001, + "loss": 0.3835, + "step": 1870 + }, + { + "epoch": 0.05162500819143469, + "grad_norm": 0.002859857166185975, + "learning_rate": 0.001, + "loss": 0.4275, + "step": 1871 + }, + { + "epoch": 0.05165260039249906, + "grad_norm": 0.0032987252343446016, + "learning_rate": 0.001, + "loss": 0.377, + "step": 1872 + }, + { + "epoch": 0.05168019259356343, + "grad_norm": 0.002959759905934334, + "learning_rate": 0.001, + "loss": 0.361, + "step": 1873 + }, + { + "epoch": 0.0517077847946278, + "grad_norm": 0.00282686366699636, + "learning_rate": 0.001, + "loss": 0.3661, + "step": 1874 + }, + { + "epoch": 0.05173537699569217, + "grad_norm": 0.003676011925563216, + "learning_rate": 0.001, + "loss": 0.418, + "step": 1875 + }, + { + "epoch": 0.051762969196756534, + "grad_norm": 0.0026158930268138647, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 1876 + }, + { + "epoch": 0.05179056139782091, + "grad_norm": 0.0024638089817017317, + "learning_rate": 0.001, + "loss": 0.4249, + "step": 1877 + }, + { + "epoch": 0.05181815359888527, + "grad_norm": 0.003338116453960538, + "learning_rate": 0.001, + "loss": 0.392, + "step": 1878 + }, + { + "epoch": 0.05184574579994965, + "grad_norm": 0.004489647690206766, + "learning_rate": 0.001, + "loss": 0.3799, + "step": 1879 + }, + { + "epoch": 0.05187333800101401, + "grad_norm": 0.002798578003421426, + "learning_rate": 0.001, + "loss": 0.3785, + "step": 1880 + }, + { + "epoch": 0.05190093020207838, + "grad_norm": 0.0029343098867684603, + "learning_rate": 0.001, + "loss": 0.3848, + "step": 1881 + }, + { + "epoch": 0.05192852240314275, + "grad_norm": 0.003431658959016204, + "learning_rate": 0.001, + "loss": 0.4085, + "step": 1882 + }, + { + "epoch": 0.05195611460420712, + "grad_norm": 0.002329075476154685, + "learning_rate": 0.001, + "loss": 0.404, + "step": 1883 + }, + { + "epoch": 0.05198370680527149, + "grad_norm": 0.002889421069994569, + "learning_rate": 0.001, + "loss": 0.394, + "step": 1884 + }, + { + "epoch": 0.05201129900633586, + "grad_norm": 0.003102013608440757, + "learning_rate": 0.001, + "loss": 0.4429, + "step": 1885 + }, + { + "epoch": 0.052038891207400226, + "grad_norm": 0.0030128401704132557, + "learning_rate": 0.001, + "loss": 0.402, + "step": 1886 + }, + { + "epoch": 0.0520664834084646, + "grad_norm": 0.002882935106754303, + "learning_rate": 0.001, + "loss": 0.4126, + "step": 1887 + }, + { + "epoch": 0.052094075609528966, + "grad_norm": 0.0033189402893185616, + "learning_rate": 0.001, + "loss": 0.382, + "step": 1888 + }, + { + "epoch": 0.05212166781059334, + "grad_norm": 0.003128107637166977, + "learning_rate": 0.001, + "loss": 0.3822, + "step": 1889 + }, + { + "epoch": 0.052149260011657705, + "grad_norm": 0.0028161504305899143, + "learning_rate": 0.001, + "loss": 0.4362, + "step": 1890 + }, + { + "epoch": 0.05217685221272207, + "grad_norm": 0.00556217972189188, + "learning_rate": 0.001, + "loss": 0.426, + "step": 1891 + }, + { + "epoch": 0.052204444413786445, + "grad_norm": 0.006903808563947678, + "learning_rate": 0.001, + "loss": 0.4056, + "step": 1892 + }, + { + "epoch": 0.05223203661485081, + "grad_norm": 0.008768963627517223, + "learning_rate": 0.001, + "loss": 0.407, + "step": 1893 + }, + { + "epoch": 0.05225962881591518, + "grad_norm": 0.0038300056476145983, + "learning_rate": 0.001, + "loss": 0.4084, + "step": 1894 + }, + { + "epoch": 0.05228722101697955, + "grad_norm": 0.005089603364467621, + "learning_rate": 0.001, + "loss": 0.387, + "step": 1895 + }, + { + "epoch": 0.05231481321804392, + "grad_norm": 0.004371874965727329, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 1896 + }, + { + "epoch": 0.05234240541910829, + "grad_norm": 0.0037384675815701485, + "learning_rate": 0.001, + "loss": 0.4168, + "step": 1897 + }, + { + "epoch": 0.05236999762017266, + "grad_norm": 0.012704477645456791, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 1898 + }, + { + "epoch": 0.052397589821237024, + "grad_norm": 0.004448601044714451, + "learning_rate": 0.001, + "loss": 0.3781, + "step": 1899 + }, + { + "epoch": 0.0524251820223014, + "grad_norm": 0.004198121372610331, + "learning_rate": 0.001, + "loss": 0.3724, + "step": 1900 + }, + { + "epoch": 0.052452774223365764, + "grad_norm": 0.008598609827458858, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 1901 + }, + { + "epoch": 0.05248036642443014, + "grad_norm": 0.005187192931771278, + "learning_rate": 0.001, + "loss": 0.388, + "step": 1902 + }, + { + "epoch": 0.052507958625494504, + "grad_norm": 0.0033852029591798782, + "learning_rate": 0.001, + "loss": 0.418, + "step": 1903 + }, + { + "epoch": 0.05253555082655887, + "grad_norm": 0.004068314563483, + "learning_rate": 0.001, + "loss": 0.4299, + "step": 1904 + }, + { + "epoch": 0.052563143027623244, + "grad_norm": 0.004143074620515108, + "learning_rate": 0.001, + "loss": 0.4146, + "step": 1905 + }, + { + "epoch": 0.05259073522868761, + "grad_norm": 0.004285483155399561, + "learning_rate": 0.001, + "loss": 0.3817, + "step": 1906 + }, + { + "epoch": 0.05261832742975198, + "grad_norm": 0.004647474270313978, + "learning_rate": 0.001, + "loss": 0.3652, + "step": 1907 + }, + { + "epoch": 0.05264591963081635, + "grad_norm": 0.002305620349943638, + "learning_rate": 0.001, + "loss": 0.3944, + "step": 1908 + }, + { + "epoch": 0.052673511831880716, + "grad_norm": 0.004161974415183067, + "learning_rate": 0.001, + "loss": 0.3945, + "step": 1909 + }, + { + "epoch": 0.05270110403294509, + "grad_norm": 0.004659401252865791, + "learning_rate": 0.001, + "loss": 0.4125, + "step": 1910 + }, + { + "epoch": 0.052728696234009456, + "grad_norm": 0.003204295178875327, + "learning_rate": 0.001, + "loss": 0.3579, + "step": 1911 + }, + { + "epoch": 0.05275628843507383, + "grad_norm": 0.002777168760076165, + "learning_rate": 0.001, + "loss": 0.3996, + "step": 1912 + }, + { + "epoch": 0.052783880636138196, + "grad_norm": 0.003622923046350479, + "learning_rate": 0.001, + "loss": 0.4119, + "step": 1913 + }, + { + "epoch": 0.05281147283720256, + "grad_norm": 0.003526994027197361, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 1914 + }, + { + "epoch": 0.052839065038266936, + "grad_norm": 0.0032004155218601227, + "learning_rate": 0.001, + "loss": 0.3905, + "step": 1915 + }, + { + "epoch": 0.0528666572393313, + "grad_norm": 0.003097831504419446, + "learning_rate": 0.001, + "loss": 0.3859, + "step": 1916 + }, + { + "epoch": 0.052894249440395676, + "grad_norm": 0.0046899812296032906, + "learning_rate": 0.001, + "loss": 0.3689, + "step": 1917 + }, + { + "epoch": 0.05292184164146004, + "grad_norm": 0.00434432877227664, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 1918 + }, + { + "epoch": 0.05294943384252441, + "grad_norm": 0.0034439517185091972, + "learning_rate": 0.001, + "loss": 0.3907, + "step": 1919 + }, + { + "epoch": 0.05297702604358878, + "grad_norm": 0.0031842549797147512, + "learning_rate": 0.001, + "loss": 0.3825, + "step": 1920 + }, + { + "epoch": 0.05300461824465315, + "grad_norm": 0.002607315080240369, + "learning_rate": 0.001, + "loss": 0.4513, + "step": 1921 + }, + { + "epoch": 0.053032210445717515, + "grad_norm": 0.003620088566094637, + "learning_rate": 0.001, + "loss": 0.3873, + "step": 1922 + }, + { + "epoch": 0.05305980264678189, + "grad_norm": 0.0034968543332070112, + "learning_rate": 0.001, + "loss": 0.3656, + "step": 1923 + }, + { + "epoch": 0.053087394847846255, + "grad_norm": 0.0023364322260022163, + "learning_rate": 0.001, + "loss": 0.4037, + "step": 1924 + }, + { + "epoch": 0.05311498704891063, + "grad_norm": 0.002855852944776416, + "learning_rate": 0.001, + "loss": 0.4145, + "step": 1925 + }, + { + "epoch": 0.053142579249974994, + "grad_norm": 0.0027531140949577093, + "learning_rate": 0.001, + "loss": 0.3575, + "step": 1926 + }, + { + "epoch": 0.05317017145103936, + "grad_norm": 0.0032164284493774176, + "learning_rate": 0.001, + "loss": 0.3979, + "step": 1927 + }, + { + "epoch": 0.053197763652103734, + "grad_norm": 0.002508921315893531, + "learning_rate": 0.001, + "loss": 0.4085, + "step": 1928 + }, + { + "epoch": 0.0532253558531681, + "grad_norm": 0.0026788045652210712, + "learning_rate": 0.001, + "loss": 0.3872, + "step": 1929 + }, + { + "epoch": 0.053252948054232474, + "grad_norm": 0.0051200552843511105, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 1930 + }, + { + "epoch": 0.05328054025529684, + "grad_norm": 0.004671086091548204, + "learning_rate": 0.001, + "loss": 0.4054, + "step": 1931 + }, + { + "epoch": 0.05330813245636121, + "grad_norm": 0.0028699622489511967, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 1932 + }, + { + "epoch": 0.05333572465742558, + "grad_norm": 0.003617132781073451, + "learning_rate": 0.001, + "loss": 0.3793, + "step": 1933 + }, + { + "epoch": 0.05336331685848995, + "grad_norm": 0.0036757884081453085, + "learning_rate": 0.001, + "loss": 0.3995, + "step": 1934 + }, + { + "epoch": 0.05339090905955432, + "grad_norm": 0.002974196569994092, + "learning_rate": 0.001, + "loss": 0.4085, + "step": 1935 + }, + { + "epoch": 0.05341850126061869, + "grad_norm": 0.0034452476538717747, + "learning_rate": 0.001, + "loss": 0.4123, + "step": 1936 + }, + { + "epoch": 0.05344609346168305, + "grad_norm": 0.010058878920972347, + "learning_rate": 0.001, + "loss": 0.4216, + "step": 1937 + }, + { + "epoch": 0.053473685662747426, + "grad_norm": 0.007609996944665909, + "learning_rate": 0.001, + "loss": 0.3969, + "step": 1938 + }, + { + "epoch": 0.05350127786381179, + "grad_norm": 0.006538870744407177, + "learning_rate": 0.001, + "loss": 0.4252, + "step": 1939 + }, + { + "epoch": 0.053528870064876166, + "grad_norm": 0.003294197143986821, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 1940 + }, + { + "epoch": 0.05355646226594053, + "grad_norm": 0.0027565581258386374, + "learning_rate": 0.001, + "loss": 0.4105, + "step": 1941 + }, + { + "epoch": 0.0535840544670049, + "grad_norm": 0.0027171720284968615, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 1942 + }, + { + "epoch": 0.05361164666806927, + "grad_norm": 0.00246808142401278, + "learning_rate": 0.001, + "loss": 0.3959, + "step": 1943 + }, + { + "epoch": 0.05363923886913364, + "grad_norm": 0.0030862221028655767, + "learning_rate": 0.001, + "loss": 0.3797, + "step": 1944 + }, + { + "epoch": 0.053666831070198005, + "grad_norm": 0.002278526546433568, + "learning_rate": 0.001, + "loss": 0.4429, + "step": 1945 + }, + { + "epoch": 0.05369442327126238, + "grad_norm": 0.002600095234811306, + "learning_rate": 0.001, + "loss": 0.4105, + "step": 1946 + }, + { + "epoch": 0.053722015472326745, + "grad_norm": 0.0028381014708429575, + "learning_rate": 0.001, + "loss": 0.4161, + "step": 1947 + }, + { + "epoch": 0.05374960767339112, + "grad_norm": 0.0026186273898929358, + "learning_rate": 0.001, + "loss": 0.3953, + "step": 1948 + }, + { + "epoch": 0.053777199874455485, + "grad_norm": 0.0026584621518850327, + "learning_rate": 0.001, + "loss": 0.445, + "step": 1949 + }, + { + "epoch": 0.05380479207551985, + "grad_norm": 0.002734116045758128, + "learning_rate": 0.001, + "loss": 0.396, + "step": 1950 + }, + { + "epoch": 0.053832384276584225, + "grad_norm": 0.0053114392794668674, + "learning_rate": 0.001, + "loss": 0.4241, + "step": 1951 + }, + { + "epoch": 0.05385997647764859, + "grad_norm": 0.005844905972480774, + "learning_rate": 0.001, + "loss": 0.3789, + "step": 1952 + }, + { + "epoch": 0.053887568678712965, + "grad_norm": 0.002963895909488201, + "learning_rate": 0.001, + "loss": 0.406, + "step": 1953 + }, + { + "epoch": 0.05391516087977733, + "grad_norm": 0.004491107538342476, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 1954 + }, + { + "epoch": 0.0539427530808417, + "grad_norm": 0.003358663059771061, + "learning_rate": 0.001, + "loss": 0.4336, + "step": 1955 + }, + { + "epoch": 0.05397034528190607, + "grad_norm": 0.003177732229232788, + "learning_rate": 0.001, + "loss": 0.4212, + "step": 1956 + }, + { + "epoch": 0.05399793748297044, + "grad_norm": 0.0027129724621772766, + "learning_rate": 0.001, + "loss": 0.4155, + "step": 1957 + }, + { + "epoch": 0.05402552968403481, + "grad_norm": 0.00669928640127182, + "learning_rate": 0.001, + "loss": 0.3979, + "step": 1958 + }, + { + "epoch": 0.05405312188509918, + "grad_norm": 0.0033415532670915127, + "learning_rate": 0.001, + "loss": 0.4139, + "step": 1959 + }, + { + "epoch": 0.054080714086163544, + "grad_norm": 0.0033846443984657526, + "learning_rate": 0.001, + "loss": 0.4469, + "step": 1960 + }, + { + "epoch": 0.05410830628722792, + "grad_norm": 0.0034599697683006525, + "learning_rate": 0.001, + "loss": 0.3879, + "step": 1961 + }, + { + "epoch": 0.05413589848829228, + "grad_norm": 0.004474925808608532, + "learning_rate": 0.001, + "loss": 0.367, + "step": 1962 + }, + { + "epoch": 0.05416349068935666, + "grad_norm": 0.005082536954432726, + "learning_rate": 0.001, + "loss": 0.3704, + "step": 1963 + }, + { + "epoch": 0.05419108289042102, + "grad_norm": 0.008428883738815784, + "learning_rate": 0.001, + "loss": 0.3963, + "step": 1964 + }, + { + "epoch": 0.05421867509148539, + "grad_norm": 0.0027383146807551384, + "learning_rate": 0.001, + "loss": 0.4093, + "step": 1965 + }, + { + "epoch": 0.05424626729254976, + "grad_norm": 0.0036172361578792334, + "learning_rate": 0.001, + "loss": 0.424, + "step": 1966 + }, + { + "epoch": 0.05427385949361413, + "grad_norm": 0.0034358236007392406, + "learning_rate": 0.001, + "loss": 0.41, + "step": 1967 + }, + { + "epoch": 0.054301451694678496, + "grad_norm": 0.0043413047678768635, + "learning_rate": 0.001, + "loss": 0.3677, + "step": 1968 + }, + { + "epoch": 0.05432904389574287, + "grad_norm": 0.0025661587715148926, + "learning_rate": 0.001, + "loss": 0.435, + "step": 1969 + }, + { + "epoch": 0.054356636096807236, + "grad_norm": 0.005917475093156099, + "learning_rate": 0.001, + "loss": 0.4118, + "step": 1970 + }, + { + "epoch": 0.05438422829787161, + "grad_norm": 0.003292621113359928, + "learning_rate": 0.001, + "loss": 0.455, + "step": 1971 + }, + { + "epoch": 0.054411820498935975, + "grad_norm": 0.0028464931529015303, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 1972 + }, + { + "epoch": 0.05443941270000034, + "grad_norm": 0.003331197891384363, + "learning_rate": 0.001, + "loss": 0.3802, + "step": 1973 + }, + { + "epoch": 0.054467004901064715, + "grad_norm": 0.005236343014985323, + "learning_rate": 0.001, + "loss": 0.3887, + "step": 1974 + }, + { + "epoch": 0.05449459710212908, + "grad_norm": 0.003114610444754362, + "learning_rate": 0.001, + "loss": 0.4028, + "step": 1975 + }, + { + "epoch": 0.054522189303193455, + "grad_norm": 0.0027003728318959475, + "learning_rate": 0.001, + "loss": 0.4072, + "step": 1976 + }, + { + "epoch": 0.05454978150425782, + "grad_norm": 0.00410441542044282, + "learning_rate": 0.001, + "loss": 0.4108, + "step": 1977 + }, + { + "epoch": 0.05457737370532219, + "grad_norm": 0.006601103115826845, + "learning_rate": 0.001, + "loss": 0.3811, + "step": 1978 + }, + { + "epoch": 0.05460496590638656, + "grad_norm": 0.003532871138304472, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 1979 + }, + { + "epoch": 0.05463255810745093, + "grad_norm": 0.0024505204055458307, + "learning_rate": 0.001, + "loss": 0.4183, + "step": 1980 + }, + { + "epoch": 0.0546601503085153, + "grad_norm": 0.0038412862922996283, + "learning_rate": 0.001, + "loss": 0.3656, + "step": 1981 + }, + { + "epoch": 0.05468774250957967, + "grad_norm": 0.00442805141210556, + "learning_rate": 0.001, + "loss": 0.3929, + "step": 1982 + }, + { + "epoch": 0.054715334710644034, + "grad_norm": 0.004178628791123629, + "learning_rate": 0.001, + "loss": 0.4056, + "step": 1983 + }, + { + "epoch": 0.05474292691170841, + "grad_norm": 0.0023596303071826696, + "learning_rate": 0.001, + "loss": 0.3791, + "step": 1984 + }, + { + "epoch": 0.054770519112772774, + "grad_norm": 0.0026005019899457693, + "learning_rate": 0.001, + "loss": 0.4101, + "step": 1985 + }, + { + "epoch": 0.05479811131383715, + "grad_norm": 0.0043563637882471085, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 1986 + }, + { + "epoch": 0.054825703514901514, + "grad_norm": 0.004494437016546726, + "learning_rate": 0.001, + "loss": 0.3904, + "step": 1987 + }, + { + "epoch": 0.05485329571596588, + "grad_norm": 0.0030924968887120485, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 1988 + }, + { + "epoch": 0.054880887917030254, + "grad_norm": 0.0029607934411615133, + "learning_rate": 0.001, + "loss": 0.402, + "step": 1989 + }, + { + "epoch": 0.05490848011809462, + "grad_norm": 0.004574719350785017, + "learning_rate": 0.001, + "loss": 0.4282, + "step": 1990 + }, + { + "epoch": 0.054936072319158986, + "grad_norm": 0.0027861008420586586, + "learning_rate": 0.001, + "loss": 0.4374, + "step": 1991 + }, + { + "epoch": 0.05496366452022336, + "grad_norm": 0.003089543664827943, + "learning_rate": 0.001, + "loss": 0.452, + "step": 1992 + }, + { + "epoch": 0.054991256721287726, + "grad_norm": 0.0027578859589993954, + "learning_rate": 0.001, + "loss": 0.4185, + "step": 1993 + }, + { + "epoch": 0.0550188489223521, + "grad_norm": 0.003672545775771141, + "learning_rate": 0.001, + "loss": 0.4158, + "step": 1994 + }, + { + "epoch": 0.055046441123416466, + "grad_norm": 0.0028012071270495653, + "learning_rate": 0.001, + "loss": 0.3829, + "step": 1995 + }, + { + "epoch": 0.05507403332448083, + "grad_norm": 0.0035645493771880865, + "learning_rate": 0.001, + "loss": 0.3798, + "step": 1996 + }, + { + "epoch": 0.055101625525545206, + "grad_norm": 0.004443664103746414, + "learning_rate": 0.001, + "loss": 0.4631, + "step": 1997 + }, + { + "epoch": 0.05512921772660957, + "grad_norm": 0.002620038343593478, + "learning_rate": 0.001, + "loss": 0.4562, + "step": 1998 + }, + { + "epoch": 0.055156809927673946, + "grad_norm": 0.004475294146686792, + "learning_rate": 0.001, + "loss": 0.3954, + "step": 1999 + }, + { + "epoch": 0.05518440212873831, + "grad_norm": 0.003935279790312052, + "learning_rate": 0.001, + "loss": 0.4801, + "step": 2000 + }, + { + "epoch": 0.05518440212873831, + "eval_runtime": 24.2254, + "eval_samples_per_second": 1.321, + "eval_steps_per_second": 0.165, + "step": 2000 + }, + { + "epoch": 0.05521199432980268, + "grad_norm": 0.01289259921759367, + "learning_rate": 0.001, + "loss": 0.4222, + "step": 2001 + }, + { + "epoch": 0.05523958653086705, + "grad_norm": 0.0044218553230166435, + "learning_rate": 0.001, + "loss": 0.4219, + "step": 2002 + }, + { + "epoch": 0.05526717873193142, + "grad_norm": 0.0024792973417788744, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 2003 + }, + { + "epoch": 0.05529477093299579, + "grad_norm": 0.0030391570180654526, + "learning_rate": 0.001, + "loss": 0.3952, + "step": 2004 + }, + { + "epoch": 0.05532236313406016, + "grad_norm": 0.0027939057908952236, + "learning_rate": 0.001, + "loss": 0.4062, + "step": 2005 + }, + { + "epoch": 0.055349955335124525, + "grad_norm": 0.003908068872988224, + "learning_rate": 0.001, + "loss": 0.3825, + "step": 2006 + }, + { + "epoch": 0.0553775475361889, + "grad_norm": 0.006802330259233713, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 2007 + }, + { + "epoch": 0.055405139737253264, + "grad_norm": 0.005997187457978725, + "learning_rate": 0.001, + "loss": 0.4176, + "step": 2008 + }, + { + "epoch": 0.05543273193831764, + "grad_norm": 0.004595068749040365, + "learning_rate": 0.001, + "loss": 0.4324, + "step": 2009 + }, + { + "epoch": 0.055460324139382004, + "grad_norm": 0.0029883128590881824, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 2010 + }, + { + "epoch": 0.05548791634044637, + "grad_norm": 0.0025319228880107403, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 2011 + }, + { + "epoch": 0.055515508541510744, + "grad_norm": 0.004491596017032862, + "learning_rate": 0.001, + "loss": 0.3786, + "step": 2012 + }, + { + "epoch": 0.05554310074257511, + "grad_norm": 0.01933477073907852, + "learning_rate": 0.001, + "loss": 0.3898, + "step": 2013 + }, + { + "epoch": 0.05557069294363948, + "grad_norm": 0.0026234816759824753, + "learning_rate": 0.001, + "loss": 0.3984, + "step": 2014 + }, + { + "epoch": 0.05559828514470385, + "grad_norm": 0.0022836877033114433, + "learning_rate": 0.001, + "loss": 0.4291, + "step": 2015 + }, + { + "epoch": 0.05562587734576822, + "grad_norm": 0.003522195853292942, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 2016 + }, + { + "epoch": 0.05565346954683259, + "grad_norm": 0.0039251442067325115, + "learning_rate": 0.001, + "loss": 0.3804, + "step": 2017 + }, + { + "epoch": 0.05568106174789696, + "grad_norm": 0.011906804516911507, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 2018 + }, + { + "epoch": 0.05570865394896132, + "grad_norm": 0.002788014244288206, + "learning_rate": 0.001, + "loss": 0.4111, + "step": 2019 + }, + { + "epoch": 0.055736246150025696, + "grad_norm": 0.006921442225575447, + "learning_rate": 0.001, + "loss": 0.3862, + "step": 2020 + }, + { + "epoch": 0.05576383835109006, + "grad_norm": 0.004114898853003979, + "learning_rate": 0.001, + "loss": 0.4144, + "step": 2021 + }, + { + "epoch": 0.055791430552154436, + "grad_norm": 0.0033005515579134226, + "learning_rate": 0.001, + "loss": 0.4108, + "step": 2022 + }, + { + "epoch": 0.0558190227532188, + "grad_norm": 0.00554333720356226, + "learning_rate": 0.001, + "loss": 0.4142, + "step": 2023 + }, + { + "epoch": 0.05584661495428317, + "grad_norm": 0.009507423266768456, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 2024 + }, + { + "epoch": 0.05587420715534754, + "grad_norm": 0.0032099178060889244, + "learning_rate": 0.001, + "loss": 0.3636, + "step": 2025 + }, + { + "epoch": 0.05590179935641191, + "grad_norm": 0.0028057810850441456, + "learning_rate": 0.001, + "loss": 0.4091, + "step": 2026 + }, + { + "epoch": 0.05592939155747628, + "grad_norm": 0.00344046950340271, + "learning_rate": 0.001, + "loss": 0.4409, + "step": 2027 + }, + { + "epoch": 0.05595698375854065, + "grad_norm": 0.004159116186201572, + "learning_rate": 0.001, + "loss": 0.3919, + "step": 2028 + }, + { + "epoch": 0.055984575959605015, + "grad_norm": 0.017573168501257896, + "learning_rate": 0.001, + "loss": 0.4202, + "step": 2029 + }, + { + "epoch": 0.05601216816066939, + "grad_norm": 0.0059144701808691025, + "learning_rate": 0.001, + "loss": 0.38, + "step": 2030 + }, + { + "epoch": 0.056039760361733755, + "grad_norm": 0.003630187129601836, + "learning_rate": 0.001, + "loss": 0.3843, + "step": 2031 + }, + { + "epoch": 0.05606735256279813, + "grad_norm": 0.003033621236681938, + "learning_rate": 0.001, + "loss": 0.393, + "step": 2032 + }, + { + "epoch": 0.056094944763862495, + "grad_norm": 0.003519507823511958, + "learning_rate": 0.001, + "loss": 0.4114, + "step": 2033 + }, + { + "epoch": 0.05612253696492686, + "grad_norm": 0.004492396954447031, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 2034 + }, + { + "epoch": 0.056150129165991235, + "grad_norm": 0.004509568680077791, + "learning_rate": 0.001, + "loss": 0.4155, + "step": 2035 + }, + { + "epoch": 0.0561777213670556, + "grad_norm": 0.0026023637037724257, + "learning_rate": 0.001, + "loss": 0.4057, + "step": 2036 + }, + { + "epoch": 0.05620531356811997, + "grad_norm": 0.0026178304105997086, + "learning_rate": 0.001, + "loss": 0.3801, + "step": 2037 + }, + { + "epoch": 0.05623290576918434, + "grad_norm": 0.0024824494030326605, + "learning_rate": 0.001, + "loss": 0.3754, + "step": 2038 + }, + { + "epoch": 0.05626049797024871, + "grad_norm": 0.0025145653635263443, + "learning_rate": 0.001, + "loss": 0.4485, + "step": 2039 + }, + { + "epoch": 0.05628809017131308, + "grad_norm": 0.005017207004129887, + "learning_rate": 0.001, + "loss": 0.4104, + "step": 2040 + }, + { + "epoch": 0.05631568237237745, + "grad_norm": 0.005889947526156902, + "learning_rate": 0.001, + "loss": 0.4284, + "step": 2041 + }, + { + "epoch": 0.056343274573441814, + "grad_norm": 0.003475229488685727, + "learning_rate": 0.001, + "loss": 0.3887, + "step": 2042 + }, + { + "epoch": 0.05637086677450619, + "grad_norm": 0.00831044651567936, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 2043 + }, + { + "epoch": 0.05639845897557055, + "grad_norm": 0.003840766381472349, + "learning_rate": 0.001, + "loss": 0.3942, + "step": 2044 + }, + { + "epoch": 0.05642605117663493, + "grad_norm": 0.004076477140188217, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 2045 + }, + { + "epoch": 0.05645364337769929, + "grad_norm": 0.004721554461866617, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 2046 + }, + { + "epoch": 0.05648123557876366, + "grad_norm": 0.003165911417454481, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 2047 + }, + { + "epoch": 0.05650882777982803, + "grad_norm": 0.0031442714389413595, + "learning_rate": 0.001, + "loss": 0.3718, + "step": 2048 + }, + { + "epoch": 0.0565364199808924, + "grad_norm": 0.004942482803016901, + "learning_rate": 0.001, + "loss": 0.3707, + "step": 2049 + }, + { + "epoch": 0.05656401218195677, + "grad_norm": 0.004441413562744856, + "learning_rate": 0.001, + "loss": 0.3872, + "step": 2050 + }, + { + "epoch": 0.05659160438302114, + "grad_norm": 0.005134286358952522, + "learning_rate": 0.001, + "loss": 0.415, + "step": 2051 + }, + { + "epoch": 0.056619196584085506, + "grad_norm": 0.0030015951488167048, + "learning_rate": 0.001, + "loss": 0.4174, + "step": 2052 + }, + { + "epoch": 0.05664678878514988, + "grad_norm": 0.0028276192024350166, + "learning_rate": 0.001, + "loss": 0.3858, + "step": 2053 + }, + { + "epoch": 0.056674380986214246, + "grad_norm": 0.0030078550335019827, + "learning_rate": 0.001, + "loss": 0.3855, + "step": 2054 + }, + { + "epoch": 0.05670197318727862, + "grad_norm": 0.0047853728756308556, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 2055 + }, + { + "epoch": 0.056729565388342985, + "grad_norm": 0.003354682819917798, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 2056 + }, + { + "epoch": 0.05675715758940735, + "grad_norm": 0.002619178267195821, + "learning_rate": 0.001, + "loss": 0.353, + "step": 2057 + }, + { + "epoch": 0.056784749790471725, + "grad_norm": 0.003579444019123912, + "learning_rate": 0.001, + "loss": 0.4116, + "step": 2058 + }, + { + "epoch": 0.05681234199153609, + "grad_norm": 0.002495531225576997, + "learning_rate": 0.001, + "loss": 0.4171, + "step": 2059 + }, + { + "epoch": 0.05683993419260046, + "grad_norm": 0.0025894520804286003, + "learning_rate": 0.001, + "loss": 0.4454, + "step": 2060 + }, + { + "epoch": 0.05686752639366483, + "grad_norm": 0.0035489851143211126, + "learning_rate": 0.001, + "loss": 0.3708, + "step": 2061 + }, + { + "epoch": 0.0568951185947292, + "grad_norm": 0.00259172054938972, + "learning_rate": 0.001, + "loss": 0.4112, + "step": 2062 + }, + { + "epoch": 0.05692271079579357, + "grad_norm": 0.0038454467430710793, + "learning_rate": 0.001, + "loss": 0.434, + "step": 2063 + }, + { + "epoch": 0.05695030299685794, + "grad_norm": 0.002690738532692194, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 2064 + }, + { + "epoch": 0.056977895197922304, + "grad_norm": 0.0031391652300953865, + "learning_rate": 0.001, + "loss": 0.4122, + "step": 2065 + }, + { + "epoch": 0.05700548739898668, + "grad_norm": 0.0025237516965717077, + "learning_rate": 0.001, + "loss": 0.3933, + "step": 2066 + }, + { + "epoch": 0.057033079600051044, + "grad_norm": 0.005846772808581591, + "learning_rate": 0.001, + "loss": 0.4046, + "step": 2067 + }, + { + "epoch": 0.05706067180111542, + "grad_norm": 0.0032868178095668554, + "learning_rate": 0.001, + "loss": 0.4246, + "step": 2068 + }, + { + "epoch": 0.057088264002179784, + "grad_norm": 0.0038759801536798477, + "learning_rate": 0.001, + "loss": 0.3933, + "step": 2069 + }, + { + "epoch": 0.05711585620324415, + "grad_norm": 0.002897894475609064, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 2070 + }, + { + "epoch": 0.057143448404308524, + "grad_norm": 0.003541572019457817, + "learning_rate": 0.001, + "loss": 0.3807, + "step": 2071 + }, + { + "epoch": 0.05717104060537289, + "grad_norm": 0.003856340888887644, + "learning_rate": 0.001, + "loss": 0.4249, + "step": 2072 + }, + { + "epoch": 0.05719863280643726, + "grad_norm": 0.003133943770080805, + "learning_rate": 0.001, + "loss": 0.4246, + "step": 2073 + }, + { + "epoch": 0.05722622500750163, + "grad_norm": 0.0026191668584942818, + "learning_rate": 0.001, + "loss": 0.4348, + "step": 2074 + }, + { + "epoch": 0.057253817208565996, + "grad_norm": 0.003371886443346739, + "learning_rate": 0.001, + "loss": 0.4478, + "step": 2075 + }, + { + "epoch": 0.05728140940963037, + "grad_norm": 0.003778213867917657, + "learning_rate": 0.001, + "loss": 0.4202, + "step": 2076 + }, + { + "epoch": 0.057309001610694736, + "grad_norm": 0.0024114828556776047, + "learning_rate": 0.001, + "loss": 0.397, + "step": 2077 + }, + { + "epoch": 0.05733659381175911, + "grad_norm": 0.0026140043046325445, + "learning_rate": 0.001, + "loss": 0.3753, + "step": 2078 + }, + { + "epoch": 0.057364186012823476, + "grad_norm": 0.006618298124521971, + "learning_rate": 0.001, + "loss": 0.4144, + "step": 2079 + }, + { + "epoch": 0.05739177821388784, + "grad_norm": 0.002615788020193577, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 2080 + }, + { + "epoch": 0.057419370414952216, + "grad_norm": 0.0076182009652256966, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 2081 + }, + { + "epoch": 0.05744696261601658, + "grad_norm": 0.0025556047912687063, + "learning_rate": 0.001, + "loss": 0.3813, + "step": 2082 + }, + { + "epoch": 0.05747455481708095, + "grad_norm": 0.0024526710622012615, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 2083 + }, + { + "epoch": 0.05750214701814532, + "grad_norm": 0.004499271512031555, + "learning_rate": 0.001, + "loss": 0.3638, + "step": 2084 + }, + { + "epoch": 0.05752973921920969, + "grad_norm": 0.0039004157297313213, + "learning_rate": 0.001, + "loss": 0.4047, + "step": 2085 + }, + { + "epoch": 0.05755733142027406, + "grad_norm": 0.005158510524779558, + "learning_rate": 0.001, + "loss": 0.3914, + "step": 2086 + }, + { + "epoch": 0.05758492362133843, + "grad_norm": 0.0028425029013305902, + "learning_rate": 0.001, + "loss": 0.408, + "step": 2087 + }, + { + "epoch": 0.057612515822402795, + "grad_norm": 0.0027261306531727314, + "learning_rate": 0.001, + "loss": 0.4183, + "step": 2088 + }, + { + "epoch": 0.05764010802346717, + "grad_norm": 0.0026766748633235693, + "learning_rate": 0.001, + "loss": 0.3774, + "step": 2089 + }, + { + "epoch": 0.057667700224531535, + "grad_norm": 0.0038580589462071657, + "learning_rate": 0.001, + "loss": 0.4266, + "step": 2090 + }, + { + "epoch": 0.05769529242559591, + "grad_norm": 0.0030884486623108387, + "learning_rate": 0.001, + "loss": 0.3999, + "step": 2091 + }, + { + "epoch": 0.057722884626660274, + "grad_norm": 0.002511868719011545, + "learning_rate": 0.001, + "loss": 0.4202, + "step": 2092 + }, + { + "epoch": 0.05775047682772464, + "grad_norm": 0.003216751618310809, + "learning_rate": 0.001, + "loss": 0.4032, + "step": 2093 + }, + { + "epoch": 0.057778069028789014, + "grad_norm": 0.0028110183775424957, + "learning_rate": 0.001, + "loss": 0.412, + "step": 2094 + }, + { + "epoch": 0.05780566122985338, + "grad_norm": 0.00429938780143857, + "learning_rate": 0.001, + "loss": 0.371, + "step": 2095 + }, + { + "epoch": 0.057833253430917754, + "grad_norm": 0.005798738915473223, + "learning_rate": 0.001, + "loss": 0.4238, + "step": 2096 + }, + { + "epoch": 0.05786084563198212, + "grad_norm": 0.0031060322653502226, + "learning_rate": 0.001, + "loss": 0.4029, + "step": 2097 + }, + { + "epoch": 0.05788843783304649, + "grad_norm": 0.0039985994808375835, + "learning_rate": 0.001, + "loss": 0.3612, + "step": 2098 + }, + { + "epoch": 0.05791603003411086, + "grad_norm": 0.0036842762492597103, + "learning_rate": 0.001, + "loss": 0.3811, + "step": 2099 + }, + { + "epoch": 0.05794362223517523, + "grad_norm": 0.002856861101463437, + "learning_rate": 0.001, + "loss": 0.3664, + "step": 2100 + }, + { + "epoch": 0.0579712144362396, + "grad_norm": 0.00465161819010973, + "learning_rate": 0.001, + "loss": 0.404, + "step": 2101 + }, + { + "epoch": 0.057998806637303967, + "grad_norm": 0.0029720210004597902, + "learning_rate": 0.001, + "loss": 0.4339, + "step": 2102 + }, + { + "epoch": 0.05802639883836833, + "grad_norm": 0.0034581513609737158, + "learning_rate": 0.001, + "loss": 0.3858, + "step": 2103 + }, + { + "epoch": 0.058053991039432706, + "grad_norm": 0.0030470779165625572, + "learning_rate": 0.001, + "loss": 0.3836, + "step": 2104 + }, + { + "epoch": 0.05808158324049707, + "grad_norm": 0.005939992144703865, + "learning_rate": 0.001, + "loss": 0.3763, + "step": 2105 + }, + { + "epoch": 0.05810917544156144, + "grad_norm": 0.005322432145476341, + "learning_rate": 0.001, + "loss": 0.4142, + "step": 2106 + }, + { + "epoch": 0.05813676764262581, + "grad_norm": 0.0031803702004253864, + "learning_rate": 0.001, + "loss": 0.3886, + "step": 2107 + }, + { + "epoch": 0.05816435984369018, + "grad_norm": 0.004405119922012091, + "learning_rate": 0.001, + "loss": 0.4181, + "step": 2108 + }, + { + "epoch": 0.05819195204475455, + "grad_norm": 0.005180948879569769, + "learning_rate": 0.001, + "loss": 0.4202, + "step": 2109 + }, + { + "epoch": 0.05821954424581892, + "grad_norm": 0.003976668696850538, + "learning_rate": 0.001, + "loss": 0.3849, + "step": 2110 + }, + { + "epoch": 0.058247136446883285, + "grad_norm": 0.006682976149022579, + "learning_rate": 0.001, + "loss": 0.3829, + "step": 2111 + }, + { + "epoch": 0.05827472864794766, + "grad_norm": 0.0036576632410287857, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 2112 + }, + { + "epoch": 0.058302320849012025, + "grad_norm": 0.0037393097300082445, + "learning_rate": 0.001, + "loss": 0.4176, + "step": 2113 + }, + { + "epoch": 0.0583299130500764, + "grad_norm": 0.004572103265672922, + "learning_rate": 0.001, + "loss": 0.3864, + "step": 2114 + }, + { + "epoch": 0.058357505251140765, + "grad_norm": 0.004839747212827206, + "learning_rate": 0.001, + "loss": 0.4277, + "step": 2115 + }, + { + "epoch": 0.05838509745220513, + "grad_norm": 0.002434584079310298, + "learning_rate": 0.001, + "loss": 0.4297, + "step": 2116 + }, + { + "epoch": 0.058412689653269505, + "grad_norm": 0.00401110528036952, + "learning_rate": 0.001, + "loss": 0.3835, + "step": 2117 + }, + { + "epoch": 0.05844028185433387, + "grad_norm": 0.015435201115906239, + "learning_rate": 0.001, + "loss": 0.4008, + "step": 2118 + }, + { + "epoch": 0.058467874055398245, + "grad_norm": 0.005054370500147343, + "learning_rate": 0.001, + "loss": 0.3822, + "step": 2119 + }, + { + "epoch": 0.05849546625646261, + "grad_norm": 0.0033668207470327616, + "learning_rate": 0.001, + "loss": 0.3873, + "step": 2120 + }, + { + "epoch": 0.05852305845752698, + "grad_norm": 0.004512968007475138, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 2121 + }, + { + "epoch": 0.05855065065859135, + "grad_norm": 0.003313259920105338, + "learning_rate": 0.001, + "loss": 0.436, + "step": 2122 + }, + { + "epoch": 0.05857824285965572, + "grad_norm": 0.0035786698572337627, + "learning_rate": 0.001, + "loss": 0.3897, + "step": 2123 + }, + { + "epoch": 0.05860583506072009, + "grad_norm": 0.002702909056097269, + "learning_rate": 0.001, + "loss": 0.414, + "step": 2124 + }, + { + "epoch": 0.05863342726178446, + "grad_norm": 0.004725235048681498, + "learning_rate": 0.001, + "loss": 0.3909, + "step": 2125 + }, + { + "epoch": 0.058661019462848824, + "grad_norm": 0.0036240858025848866, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 2126 + }, + { + "epoch": 0.0586886116639132, + "grad_norm": 0.004966442938894033, + "learning_rate": 0.001, + "loss": 0.3921, + "step": 2127 + }, + { + "epoch": 0.05871620386497756, + "grad_norm": 0.0031415580306202173, + "learning_rate": 0.001, + "loss": 0.3542, + "step": 2128 + }, + { + "epoch": 0.05874379606604194, + "grad_norm": 0.003191297873854637, + "learning_rate": 0.001, + "loss": 0.4267, + "step": 2129 + }, + { + "epoch": 0.0587713882671063, + "grad_norm": 0.0039037340320646763, + "learning_rate": 0.001, + "loss": 0.4063, + "step": 2130 + }, + { + "epoch": 0.05879898046817067, + "grad_norm": 0.00381074589677155, + "learning_rate": 0.001, + "loss": 0.3995, + "step": 2131 + }, + { + "epoch": 0.05882657266923504, + "grad_norm": 0.004218887537717819, + "learning_rate": 0.001, + "loss": 0.3944, + "step": 2132 + }, + { + "epoch": 0.05885416487029941, + "grad_norm": 0.004148995969444513, + "learning_rate": 0.001, + "loss": 0.456, + "step": 2133 + }, + { + "epoch": 0.058881757071363776, + "grad_norm": 0.0040593999437987804, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 2134 + }, + { + "epoch": 0.05890934927242815, + "grad_norm": 0.004279328975826502, + "learning_rate": 0.001, + "loss": 0.4111, + "step": 2135 + }, + { + "epoch": 0.058936941473492516, + "grad_norm": 0.00401785783469677, + "learning_rate": 0.001, + "loss": 0.4497, + "step": 2136 + }, + { + "epoch": 0.05896453367455689, + "grad_norm": 0.004279072396457195, + "learning_rate": 0.001, + "loss": 0.4052, + "step": 2137 + }, + { + "epoch": 0.058992125875621255, + "grad_norm": 0.004601733293384314, + "learning_rate": 0.001, + "loss": 0.3811, + "step": 2138 + }, + { + "epoch": 0.05901971807668562, + "grad_norm": 0.010014613159000874, + "learning_rate": 0.001, + "loss": 0.4262, + "step": 2139 + }, + { + "epoch": 0.059047310277749995, + "grad_norm": 0.004289823584258556, + "learning_rate": 0.001, + "loss": 0.3813, + "step": 2140 + }, + { + "epoch": 0.05907490247881436, + "grad_norm": 0.004107081796973944, + "learning_rate": 0.001, + "loss": 0.406, + "step": 2141 + }, + { + "epoch": 0.059102494679878735, + "grad_norm": 0.004256140440702438, + "learning_rate": 0.001, + "loss": 0.3738, + "step": 2142 + }, + { + "epoch": 0.0591300868809431, + "grad_norm": 0.00437437929213047, + "learning_rate": 0.001, + "loss": 0.3625, + "step": 2143 + }, + { + "epoch": 0.05915767908200747, + "grad_norm": 0.004300633445382118, + "learning_rate": 0.001, + "loss": 0.4134, + "step": 2144 + }, + { + "epoch": 0.05918527128307184, + "grad_norm": 0.0044938200153410435, + "learning_rate": 0.001, + "loss": 0.401, + "step": 2145 + }, + { + "epoch": 0.05921286348413621, + "grad_norm": 0.004241921007633209, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 2146 + }, + { + "epoch": 0.05924045568520058, + "grad_norm": 0.005186907015740871, + "learning_rate": 0.001, + "loss": 0.3937, + "step": 2147 + }, + { + "epoch": 0.05926804788626495, + "grad_norm": 0.0033411476761102676, + "learning_rate": 0.001, + "loss": 0.446, + "step": 2148 + }, + { + "epoch": 0.059295640087329314, + "grad_norm": 0.003417744068428874, + "learning_rate": 0.001, + "loss": 0.4201, + "step": 2149 + }, + { + "epoch": 0.05932323228839369, + "grad_norm": 0.0042548892088234425, + "learning_rate": 0.001, + "loss": 0.3831, + "step": 2150 + }, + { + "epoch": 0.059350824489458054, + "grad_norm": 0.0036170384846627712, + "learning_rate": 0.001, + "loss": 0.377, + "step": 2151 + }, + { + "epoch": 0.05937841669052243, + "grad_norm": 0.0034266202710568905, + "learning_rate": 0.001, + "loss": 0.4278, + "step": 2152 + }, + { + "epoch": 0.059406008891586794, + "grad_norm": 0.0034320498816668987, + "learning_rate": 0.001, + "loss": 0.4498, + "step": 2153 + }, + { + "epoch": 0.05943360109265116, + "grad_norm": 0.0033084768801927567, + "learning_rate": 0.001, + "loss": 0.3733, + "step": 2154 + }, + { + "epoch": 0.059461193293715534, + "grad_norm": 0.003638029098510742, + "learning_rate": 0.001, + "loss": 0.4059, + "step": 2155 + }, + { + "epoch": 0.0594887854947799, + "grad_norm": 0.0036058463156223297, + "learning_rate": 0.001, + "loss": 0.4087, + "step": 2156 + }, + { + "epoch": 0.059516377695844266, + "grad_norm": 0.003367355791851878, + "learning_rate": 0.001, + "loss": 0.3577, + "step": 2157 + }, + { + "epoch": 0.05954396989690864, + "grad_norm": 0.003909154795110226, + "learning_rate": 0.001, + "loss": 0.404, + "step": 2158 + }, + { + "epoch": 0.059571562097973006, + "grad_norm": 0.0027421792037785053, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 2159 + }, + { + "epoch": 0.05959915429903738, + "grad_norm": 0.003349900944158435, + "learning_rate": 0.001, + "loss": 0.38, + "step": 2160 + }, + { + "epoch": 0.059626746500101746, + "grad_norm": 0.003867817111313343, + "learning_rate": 0.001, + "loss": 0.4502, + "step": 2161 + }, + { + "epoch": 0.05965433870116611, + "grad_norm": 0.002976678777486086, + "learning_rate": 0.001, + "loss": 0.4679, + "step": 2162 + }, + { + "epoch": 0.059681930902230486, + "grad_norm": 0.004093970637768507, + "learning_rate": 0.001, + "loss": 0.4623, + "step": 2163 + }, + { + "epoch": 0.05970952310329485, + "grad_norm": 0.008607220835983753, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 2164 + }, + { + "epoch": 0.059737115304359226, + "grad_norm": 0.003642444731667638, + "learning_rate": 0.001, + "loss": 0.3617, + "step": 2165 + }, + { + "epoch": 0.05976470750542359, + "grad_norm": 0.004864281043410301, + "learning_rate": 0.001, + "loss": 0.3954, + "step": 2166 + }, + { + "epoch": 0.05979229970648796, + "grad_norm": 0.003302594181150198, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 2167 + }, + { + "epoch": 0.05981989190755233, + "grad_norm": 0.004229418467730284, + "learning_rate": 0.001, + "loss": 0.3913, + "step": 2168 + }, + { + "epoch": 0.0598474841086167, + "grad_norm": 0.026235150173306465, + "learning_rate": 0.001, + "loss": 0.3886, + "step": 2169 + }, + { + "epoch": 0.05987507630968107, + "grad_norm": 0.0035215525422245264, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 2170 + }, + { + "epoch": 0.05990266851074544, + "grad_norm": 0.0038247762713581324, + "learning_rate": 0.001, + "loss": 0.4207, + "step": 2171 + }, + { + "epoch": 0.059930260711809805, + "grad_norm": 0.0033723246306180954, + "learning_rate": 0.001, + "loss": 0.4046, + "step": 2172 + }, + { + "epoch": 0.05995785291287418, + "grad_norm": 0.003272601403295994, + "learning_rate": 0.001, + "loss": 0.4218, + "step": 2173 + }, + { + "epoch": 0.059985445113938544, + "grad_norm": 0.00700679887086153, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 2174 + }, + { + "epoch": 0.06001303731500292, + "grad_norm": 0.004312647040933371, + "learning_rate": 0.001, + "loss": 0.4259, + "step": 2175 + }, + { + "epoch": 0.060040629516067284, + "grad_norm": 0.0028906308580189943, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 2176 + }, + { + "epoch": 0.06006822171713165, + "grad_norm": 0.0023235634434968233, + "learning_rate": 0.001, + "loss": 0.406, + "step": 2177 + }, + { + "epoch": 0.060095813918196024, + "grad_norm": 0.003625315148383379, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 2178 + }, + { + "epoch": 0.06012340611926039, + "grad_norm": 0.0026299748569726944, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 2179 + }, + { + "epoch": 0.06015099832032476, + "grad_norm": 0.003912107087671757, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 2180 + }, + { + "epoch": 0.06017859052138913, + "grad_norm": 0.0030596598517149687, + "learning_rate": 0.001, + "loss": 0.396, + "step": 2181 + }, + { + "epoch": 0.0602061827224535, + "grad_norm": 0.0034961546771228313, + "learning_rate": 0.001, + "loss": 0.3976, + "step": 2182 + }, + { + "epoch": 0.06023377492351787, + "grad_norm": 0.0026868092827498913, + "learning_rate": 0.001, + "loss": 0.341, + "step": 2183 + }, + { + "epoch": 0.06026136712458224, + "grad_norm": 0.0036376873031258583, + "learning_rate": 0.001, + "loss": 0.3954, + "step": 2184 + }, + { + "epoch": 0.0602889593256466, + "grad_norm": 0.0034194989129900932, + "learning_rate": 0.001, + "loss": 0.3782, + "step": 2185 + }, + { + "epoch": 0.060316551526710976, + "grad_norm": 0.00262664002366364, + "learning_rate": 0.001, + "loss": 0.4054, + "step": 2186 + }, + { + "epoch": 0.06034414372777534, + "grad_norm": 0.002487578894942999, + "learning_rate": 0.001, + "loss": 0.4719, + "step": 2187 + }, + { + "epoch": 0.060371735928839716, + "grad_norm": 0.002136924536898732, + "learning_rate": 0.001, + "loss": 0.4038, + "step": 2188 + }, + { + "epoch": 0.06039932812990408, + "grad_norm": 0.0025951487477868795, + "learning_rate": 0.001, + "loss": 0.4247, + "step": 2189 + }, + { + "epoch": 0.06042692033096845, + "grad_norm": 0.007234814576804638, + "learning_rate": 0.001, + "loss": 0.3922, + "step": 2190 + }, + { + "epoch": 0.06045451253203282, + "grad_norm": 0.04125404730439186, + "learning_rate": 0.001, + "loss": 0.4087, + "step": 2191 + }, + { + "epoch": 0.06048210473309719, + "grad_norm": 0.0031280010007321835, + "learning_rate": 0.001, + "loss": 0.4324, + "step": 2192 + }, + { + "epoch": 0.06050969693416156, + "grad_norm": 0.003333252388983965, + "learning_rate": 0.001, + "loss": 0.3976, + "step": 2193 + }, + { + "epoch": 0.06053728913522593, + "grad_norm": 0.002960551530122757, + "learning_rate": 0.001, + "loss": 0.4211, + "step": 2194 + }, + { + "epoch": 0.060564881336290295, + "grad_norm": 0.003678489476442337, + "learning_rate": 0.001, + "loss": 0.4412, + "step": 2195 + }, + { + "epoch": 0.06059247353735467, + "grad_norm": 0.008868735283613205, + "learning_rate": 0.001, + "loss": 0.3728, + "step": 2196 + }, + { + "epoch": 0.060620065738419035, + "grad_norm": 0.009513468481600285, + "learning_rate": 0.001, + "loss": 0.3676, + "step": 2197 + }, + { + "epoch": 0.06064765793948341, + "grad_norm": 0.0050335777923464775, + "learning_rate": 0.001, + "loss": 0.3789, + "step": 2198 + }, + { + "epoch": 0.060675250140547775, + "grad_norm": 0.006050426512956619, + "learning_rate": 0.001, + "loss": 0.4329, + "step": 2199 + }, + { + "epoch": 0.06070284234161214, + "grad_norm": 0.002776517765596509, + "learning_rate": 0.001, + "loss": 0.4271, + "step": 2200 + }, + { + "epoch": 0.060730434542676515, + "grad_norm": 0.003950697835534811, + "learning_rate": 0.001, + "loss": 0.4057, + "step": 2201 + }, + { + "epoch": 0.06075802674374088, + "grad_norm": 0.003235916141420603, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 2202 + }, + { + "epoch": 0.06078561894480525, + "grad_norm": 0.0029406328685581684, + "learning_rate": 0.001, + "loss": 0.3845, + "step": 2203 + }, + { + "epoch": 0.06081321114586962, + "grad_norm": 0.0045188660733401775, + "learning_rate": 0.001, + "loss": 0.4269, + "step": 2204 + }, + { + "epoch": 0.06084080334693399, + "grad_norm": 0.0031814551912248135, + "learning_rate": 0.001, + "loss": 0.391, + "step": 2205 + }, + { + "epoch": 0.06086839554799836, + "grad_norm": 0.0054643419571220875, + "learning_rate": 0.001, + "loss": 0.426, + "step": 2206 + }, + { + "epoch": 0.06089598774906273, + "grad_norm": 0.0061082998290658, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 2207 + }, + { + "epoch": 0.060923579950127094, + "grad_norm": 0.002864258596673608, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 2208 + }, + { + "epoch": 0.06095117215119147, + "grad_norm": 0.014014131389558315, + "learning_rate": 0.001, + "loss": 0.3781, + "step": 2209 + }, + { + "epoch": 0.06097876435225583, + "grad_norm": 0.0033069124910980463, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 2210 + }, + { + "epoch": 0.06100635655332021, + "grad_norm": 0.0032278632279485464, + "learning_rate": 0.001, + "loss": 0.4836, + "step": 2211 + }, + { + "epoch": 0.06103394875438457, + "grad_norm": 0.0036987089551985264, + "learning_rate": 0.001, + "loss": 0.3673, + "step": 2212 + }, + { + "epoch": 0.06106154095544894, + "grad_norm": 0.006201690062880516, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 2213 + }, + { + "epoch": 0.06108913315651331, + "grad_norm": 0.0028827004134655, + "learning_rate": 0.001, + "loss": 0.4225, + "step": 2214 + }, + { + "epoch": 0.06111672535757768, + "grad_norm": 0.0034600994549691677, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 2215 + }, + { + "epoch": 0.06114431755864205, + "grad_norm": 0.09618855267763138, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 2216 + }, + { + "epoch": 0.06117190975970642, + "grad_norm": 0.003716063452884555, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 2217 + }, + { + "epoch": 0.061199501960770786, + "grad_norm": 0.002988336840644479, + "learning_rate": 0.001, + "loss": 0.4147, + "step": 2218 + }, + { + "epoch": 0.06122709416183516, + "grad_norm": 0.0036907619796693325, + "learning_rate": 0.001, + "loss": 0.4122, + "step": 2219 + }, + { + "epoch": 0.061254686362899526, + "grad_norm": 0.002765973797068, + "learning_rate": 0.001, + "loss": 0.4421, + "step": 2220 + }, + { + "epoch": 0.0612822785639639, + "grad_norm": 0.00515265017747879, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 2221 + }, + { + "epoch": 0.061309870765028265, + "grad_norm": 0.0030234006699174643, + "learning_rate": 0.001, + "loss": 0.4428, + "step": 2222 + }, + { + "epoch": 0.06133746296609263, + "grad_norm": 0.007390964776277542, + "learning_rate": 0.001, + "loss": 0.4148, + "step": 2223 + }, + { + "epoch": 0.061365055167157005, + "grad_norm": 0.0025635359343141317, + "learning_rate": 0.001, + "loss": 0.4205, + "step": 2224 + }, + { + "epoch": 0.06139264736822137, + "grad_norm": 0.0057982392609119415, + "learning_rate": 0.001, + "loss": 0.4358, + "step": 2225 + }, + { + "epoch": 0.06142023956928574, + "grad_norm": 0.003062683856114745, + "learning_rate": 0.001, + "loss": 0.3966, + "step": 2226 + }, + { + "epoch": 0.06144783177035011, + "grad_norm": 0.0026797873433679342, + "learning_rate": 0.001, + "loss": 0.3841, + "step": 2227 + }, + { + "epoch": 0.06147542397141448, + "grad_norm": 0.0024085459299385548, + "learning_rate": 0.001, + "loss": 0.4376, + "step": 2228 + }, + { + "epoch": 0.06150301617247885, + "grad_norm": 0.004409399814903736, + "learning_rate": 0.001, + "loss": 0.4038, + "step": 2229 + }, + { + "epoch": 0.06153060837354322, + "grad_norm": 0.003322464181110263, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 2230 + }, + { + "epoch": 0.061558200574607584, + "grad_norm": 0.00409911060705781, + "learning_rate": 0.001, + "loss": 0.4184, + "step": 2231 + }, + { + "epoch": 0.06158579277567196, + "grad_norm": 0.0029169321060180664, + "learning_rate": 0.001, + "loss": 0.3508, + "step": 2232 + }, + { + "epoch": 0.061613384976736324, + "grad_norm": 0.00469607999548316, + "learning_rate": 0.001, + "loss": 0.3692, + "step": 2233 + }, + { + "epoch": 0.0616409771778007, + "grad_norm": 0.0029155181255191565, + "learning_rate": 0.001, + "loss": 0.3887, + "step": 2234 + }, + { + "epoch": 0.061668569378865064, + "grad_norm": 0.003221947466954589, + "learning_rate": 0.001, + "loss": 0.4239, + "step": 2235 + }, + { + "epoch": 0.06169616157992943, + "grad_norm": 0.0033768792636692524, + "learning_rate": 0.001, + "loss": 0.3954, + "step": 2236 + }, + { + "epoch": 0.061723753780993804, + "grad_norm": 0.002673777285963297, + "learning_rate": 0.001, + "loss": 0.3536, + "step": 2237 + }, + { + "epoch": 0.06175134598205817, + "grad_norm": 0.002202989300712943, + "learning_rate": 0.001, + "loss": 0.4157, + "step": 2238 + }, + { + "epoch": 0.06177893818312254, + "grad_norm": 0.00238011684268713, + "learning_rate": 0.001, + "loss": 0.3929, + "step": 2239 + }, + { + "epoch": 0.06180653038418691, + "grad_norm": 0.0027684099040925503, + "learning_rate": 0.001, + "loss": 0.4129, + "step": 2240 + }, + { + "epoch": 0.061834122585251276, + "grad_norm": 0.0032890914008021355, + "learning_rate": 0.001, + "loss": 0.4205, + "step": 2241 + }, + { + "epoch": 0.06186171478631565, + "grad_norm": 0.003589882282540202, + "learning_rate": 0.001, + "loss": 0.4234, + "step": 2242 + }, + { + "epoch": 0.061889306987380016, + "grad_norm": 0.002842566231265664, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 2243 + }, + { + "epoch": 0.06191689918844439, + "grad_norm": 0.0028953177388757467, + "learning_rate": 0.001, + "loss": 0.4144, + "step": 2244 + }, + { + "epoch": 0.061944491389508756, + "grad_norm": 0.003262540325522423, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 2245 + }, + { + "epoch": 0.06197208359057312, + "grad_norm": 0.004046915099024773, + "learning_rate": 0.001, + "loss": 0.3918, + "step": 2246 + }, + { + "epoch": 0.061999675791637496, + "grad_norm": 0.0038636084645986557, + "learning_rate": 0.001, + "loss": 0.4276, + "step": 2247 + }, + { + "epoch": 0.06202726799270186, + "grad_norm": 0.002859740052372217, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 2248 + }, + { + "epoch": 0.06205486019376623, + "grad_norm": 0.002851482480764389, + "learning_rate": 0.001, + "loss": 0.4235, + "step": 2249 + }, + { + "epoch": 0.0620824523948306, + "grad_norm": 0.0031568272970616817, + "learning_rate": 0.001, + "loss": 0.4335, + "step": 2250 + }, + { + "epoch": 0.06211004459589497, + "grad_norm": 0.003563236678019166, + "learning_rate": 0.001, + "loss": 0.3736, + "step": 2251 + }, + { + "epoch": 0.06213763679695934, + "grad_norm": 0.0029816743917763233, + "learning_rate": 0.001, + "loss": 0.4153, + "step": 2252 + }, + { + "epoch": 0.06216522899802371, + "grad_norm": 0.002947515808045864, + "learning_rate": 0.001, + "loss": 0.4301, + "step": 2253 + }, + { + "epoch": 0.062192821199088075, + "grad_norm": 0.003983738832175732, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 2254 + }, + { + "epoch": 0.06222041340015245, + "grad_norm": 0.0053352476097643375, + "learning_rate": 0.001, + "loss": 0.4146, + "step": 2255 + }, + { + "epoch": 0.062248005601216815, + "grad_norm": 0.004124592524021864, + "learning_rate": 0.001, + "loss": 0.3691, + "step": 2256 + }, + { + "epoch": 0.06227559780228119, + "grad_norm": 0.004866636358201504, + "learning_rate": 0.001, + "loss": 0.4341, + "step": 2257 + }, + { + "epoch": 0.062303190003345554, + "grad_norm": 0.005429052747786045, + "learning_rate": 0.001, + "loss": 0.4275, + "step": 2258 + }, + { + "epoch": 0.06233078220440992, + "grad_norm": 0.00399363599717617, + "learning_rate": 0.001, + "loss": 0.4352, + "step": 2259 + }, + { + "epoch": 0.062358374405474294, + "grad_norm": 0.0047408780083060265, + "learning_rate": 0.001, + "loss": 0.3742, + "step": 2260 + }, + { + "epoch": 0.06238596660653866, + "grad_norm": 0.007790642790496349, + "learning_rate": 0.001, + "loss": 0.362, + "step": 2261 + }, + { + "epoch": 0.062413558807603034, + "grad_norm": 0.003895730245858431, + "learning_rate": 0.001, + "loss": 0.3724, + "step": 2262 + }, + { + "epoch": 0.0624411510086674, + "grad_norm": 0.004620977211743593, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 2263 + }, + { + "epoch": 0.06246874320973177, + "grad_norm": 0.002770553808659315, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 2264 + }, + { + "epoch": 0.06249633541079614, + "grad_norm": 0.0061469171196222305, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 2265 + }, + { + "epoch": 0.06252392761186051, + "grad_norm": 0.00576475216075778, + "learning_rate": 0.001, + "loss": 0.391, + "step": 2266 + }, + { + "epoch": 0.06255151981292488, + "grad_norm": 0.004090351052582264, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 2267 + }, + { + "epoch": 0.06257911201398925, + "grad_norm": 0.024996625259518623, + "learning_rate": 0.001, + "loss": 0.3813, + "step": 2268 + }, + { + "epoch": 0.06260670421505361, + "grad_norm": 0.0244828462600708, + "learning_rate": 0.001, + "loss": 0.3979, + "step": 2269 + }, + { + "epoch": 0.06263429641611798, + "grad_norm": 0.0046532778069376945, + "learning_rate": 0.001, + "loss": 0.432, + "step": 2270 + }, + { + "epoch": 0.06266188861718236, + "grad_norm": 0.004564880859106779, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 2271 + }, + { + "epoch": 0.06268948081824673, + "grad_norm": 0.0029092628974467516, + "learning_rate": 0.001, + "loss": 0.4537, + "step": 2272 + }, + { + "epoch": 0.06271707301931109, + "grad_norm": 0.0033110990189015865, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 2273 + }, + { + "epoch": 0.06274466522037546, + "grad_norm": 0.0045304871164262295, + "learning_rate": 0.001, + "loss": 0.3824, + "step": 2274 + }, + { + "epoch": 0.06277225742143983, + "grad_norm": 0.0031519641634076834, + "learning_rate": 0.001, + "loss": 0.392, + "step": 2275 + }, + { + "epoch": 0.0627998496225042, + "grad_norm": 0.003920139744877815, + "learning_rate": 0.001, + "loss": 0.3883, + "step": 2276 + }, + { + "epoch": 0.06282744182356857, + "grad_norm": 0.004257616586983204, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 2277 + }, + { + "epoch": 0.06285503402463294, + "grad_norm": 0.0038890133146196604, + "learning_rate": 0.001, + "loss": 0.4207, + "step": 2278 + }, + { + "epoch": 0.0628826262256973, + "grad_norm": 0.0028080164920538664, + "learning_rate": 0.001, + "loss": 0.4085, + "step": 2279 + }, + { + "epoch": 0.06291021842676167, + "grad_norm": 0.008050143718719482, + "learning_rate": 0.001, + "loss": 0.415, + "step": 2280 + }, + { + "epoch": 0.06293781062782605, + "grad_norm": 0.0025884988717734814, + "learning_rate": 0.001, + "loss": 0.3781, + "step": 2281 + }, + { + "epoch": 0.06296540282889042, + "grad_norm": 0.0030551603995263577, + "learning_rate": 0.001, + "loss": 0.4164, + "step": 2282 + }, + { + "epoch": 0.06299299502995478, + "grad_norm": 0.0027584030758589506, + "learning_rate": 0.001, + "loss": 0.3816, + "step": 2283 + }, + { + "epoch": 0.06302058723101915, + "grad_norm": 0.0033046863973140717, + "learning_rate": 0.001, + "loss": 0.3583, + "step": 2284 + }, + { + "epoch": 0.06304817943208352, + "grad_norm": 0.0036664491053670645, + "learning_rate": 0.001, + "loss": 0.405, + "step": 2285 + }, + { + "epoch": 0.06307577163314788, + "grad_norm": 0.002570797922089696, + "learning_rate": 0.001, + "loss": 0.4117, + "step": 2286 + }, + { + "epoch": 0.06310336383421226, + "grad_norm": 0.003837966127321124, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 2287 + }, + { + "epoch": 0.06313095603527663, + "grad_norm": 0.00263983360491693, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 2288 + }, + { + "epoch": 0.063158548236341, + "grad_norm": 0.0033676191233098507, + "learning_rate": 0.001, + "loss": 0.4224, + "step": 2289 + }, + { + "epoch": 0.06318614043740536, + "grad_norm": 0.0033159011509269476, + "learning_rate": 0.001, + "loss": 0.3714, + "step": 2290 + }, + { + "epoch": 0.06321373263846973, + "grad_norm": 0.002879378153011203, + "learning_rate": 0.001, + "loss": 0.4229, + "step": 2291 + }, + { + "epoch": 0.06324132483953411, + "grad_norm": 0.004017788916826248, + "learning_rate": 0.001, + "loss": 0.3722, + "step": 2292 + }, + { + "epoch": 0.06326891704059848, + "grad_norm": 0.0034288205206394196, + "learning_rate": 0.001, + "loss": 0.3973, + "step": 2293 + }, + { + "epoch": 0.06329650924166284, + "grad_norm": 0.003537122393026948, + "learning_rate": 0.001, + "loss": 0.3685, + "step": 2294 + }, + { + "epoch": 0.06332410144272721, + "grad_norm": 0.002853949787095189, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 2295 + }, + { + "epoch": 0.06335169364379158, + "grad_norm": 0.00246223621070385, + "learning_rate": 0.001, + "loss": 0.4675, + "step": 2296 + }, + { + "epoch": 0.06337928584485596, + "grad_norm": 0.003921873867511749, + "learning_rate": 0.001, + "loss": 0.3894, + "step": 2297 + }, + { + "epoch": 0.06340687804592032, + "grad_norm": 0.0037953928112983704, + "learning_rate": 0.001, + "loss": 0.4206, + "step": 2298 + }, + { + "epoch": 0.06343447024698469, + "grad_norm": 0.003141796449199319, + "learning_rate": 0.001, + "loss": 0.3779, + "step": 2299 + }, + { + "epoch": 0.06346206244804906, + "grad_norm": 0.0069033862091600895, + "learning_rate": 0.001, + "loss": 0.4432, + "step": 2300 + }, + { + "epoch": 0.06348965464911342, + "grad_norm": 0.00247231125831604, + "learning_rate": 0.001, + "loss": 0.3987, + "step": 2301 + }, + { + "epoch": 0.0635172468501778, + "grad_norm": 0.003614683635532856, + "learning_rate": 0.001, + "loss": 0.3708, + "step": 2302 + }, + { + "epoch": 0.06354483905124217, + "grad_norm": 0.0028486682567745447, + "learning_rate": 0.001, + "loss": 0.4465, + "step": 2303 + }, + { + "epoch": 0.06357243125230654, + "grad_norm": 0.0033698193728923798, + "learning_rate": 0.001, + "loss": 0.3782, + "step": 2304 + }, + { + "epoch": 0.0636000234533709, + "grad_norm": 0.0028043955098837614, + "learning_rate": 0.001, + "loss": 0.4198, + "step": 2305 + }, + { + "epoch": 0.06362761565443527, + "grad_norm": 0.0027136304415762424, + "learning_rate": 0.001, + "loss": 0.4328, + "step": 2306 + }, + { + "epoch": 0.06365520785549965, + "grad_norm": 0.00574844004586339, + "learning_rate": 0.001, + "loss": 0.4002, + "step": 2307 + }, + { + "epoch": 0.06368280005656402, + "grad_norm": 0.002718136878684163, + "learning_rate": 0.001, + "loss": 0.443, + "step": 2308 + }, + { + "epoch": 0.06371039225762838, + "grad_norm": 0.0043097492307424545, + "learning_rate": 0.001, + "loss": 0.3894, + "step": 2309 + }, + { + "epoch": 0.06373798445869275, + "grad_norm": 0.006927134469151497, + "learning_rate": 0.001, + "loss": 0.4365, + "step": 2310 + }, + { + "epoch": 0.06376557665975711, + "grad_norm": 0.0031723494175821543, + "learning_rate": 0.001, + "loss": 0.4052, + "step": 2311 + }, + { + "epoch": 0.0637931688608215, + "grad_norm": 0.0034040913451462984, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 2312 + }, + { + "epoch": 0.06382076106188586, + "grad_norm": 0.003764103166759014, + "learning_rate": 0.001, + "loss": 0.3979, + "step": 2313 + }, + { + "epoch": 0.06384835326295023, + "grad_norm": 0.0026512970216572285, + "learning_rate": 0.001, + "loss": 0.4215, + "step": 2314 + }, + { + "epoch": 0.0638759454640146, + "grad_norm": 0.003220104379579425, + "learning_rate": 0.001, + "loss": 0.3675, + "step": 2315 + }, + { + "epoch": 0.06390353766507896, + "grad_norm": 0.0035847953986376524, + "learning_rate": 0.001, + "loss": 0.4028, + "step": 2316 + }, + { + "epoch": 0.06393112986614334, + "grad_norm": 0.004416230600327253, + "learning_rate": 0.001, + "loss": 0.4225, + "step": 2317 + }, + { + "epoch": 0.06395872206720771, + "grad_norm": 0.00270696054212749, + "learning_rate": 0.001, + "loss": 0.3667, + "step": 2318 + }, + { + "epoch": 0.06398631426827207, + "grad_norm": 0.0026564293075352907, + "learning_rate": 0.001, + "loss": 0.3829, + "step": 2319 + }, + { + "epoch": 0.06401390646933644, + "grad_norm": 0.0031192628666758537, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 2320 + }, + { + "epoch": 0.0640414986704008, + "grad_norm": 0.003101060399785638, + "learning_rate": 0.001, + "loss": 0.4423, + "step": 2321 + }, + { + "epoch": 0.06406909087146519, + "grad_norm": 0.00393852312117815, + "learning_rate": 0.001, + "loss": 0.3976, + "step": 2322 + }, + { + "epoch": 0.06409668307252955, + "grad_norm": 0.003799998899921775, + "learning_rate": 0.001, + "loss": 0.4048, + "step": 2323 + }, + { + "epoch": 0.06412427527359392, + "grad_norm": 0.008037365972995758, + "learning_rate": 0.001, + "loss": 0.4383, + "step": 2324 + }, + { + "epoch": 0.06415186747465829, + "grad_norm": 0.004333310294896364, + "learning_rate": 0.001, + "loss": 0.3666, + "step": 2325 + }, + { + "epoch": 0.06417945967572265, + "grad_norm": 0.0026929888408631086, + "learning_rate": 0.001, + "loss": 0.3995, + "step": 2326 + }, + { + "epoch": 0.06420705187678703, + "grad_norm": 0.0040913717821240425, + "learning_rate": 0.001, + "loss": 0.4001, + "step": 2327 + }, + { + "epoch": 0.0642346440778514, + "grad_norm": 0.005933169275522232, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 2328 + }, + { + "epoch": 0.06426223627891577, + "grad_norm": 0.003139302134513855, + "learning_rate": 0.001, + "loss": 0.4112, + "step": 2329 + }, + { + "epoch": 0.06428982847998013, + "grad_norm": 0.003957003820687532, + "learning_rate": 0.001, + "loss": 0.3589, + "step": 2330 + }, + { + "epoch": 0.0643174206810445, + "grad_norm": 0.003573901252821088, + "learning_rate": 0.001, + "loss": 0.3708, + "step": 2331 + }, + { + "epoch": 0.06434501288210887, + "grad_norm": 0.0033237498719245195, + "learning_rate": 0.001, + "loss": 0.4117, + "step": 2332 + }, + { + "epoch": 0.06437260508317325, + "grad_norm": 0.0034847762435674667, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 2333 + }, + { + "epoch": 0.06440019728423761, + "grad_norm": 0.004198945127427578, + "learning_rate": 0.001, + "loss": 0.3993, + "step": 2334 + }, + { + "epoch": 0.06442778948530198, + "grad_norm": 0.003995297942310572, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 2335 + }, + { + "epoch": 0.06445538168636634, + "grad_norm": 0.003225005231797695, + "learning_rate": 0.001, + "loss": 0.3985, + "step": 2336 + }, + { + "epoch": 0.06448297388743071, + "grad_norm": 0.0031134854070842266, + "learning_rate": 0.001, + "loss": 0.4141, + "step": 2337 + }, + { + "epoch": 0.06451056608849509, + "grad_norm": 0.004270988516509533, + "learning_rate": 0.001, + "loss": 0.3993, + "step": 2338 + }, + { + "epoch": 0.06453815828955946, + "grad_norm": 0.004274028819054365, + "learning_rate": 0.001, + "loss": 0.4183, + "step": 2339 + }, + { + "epoch": 0.06456575049062382, + "grad_norm": 0.003787413239479065, + "learning_rate": 0.001, + "loss": 0.3675, + "step": 2340 + }, + { + "epoch": 0.06459334269168819, + "grad_norm": 0.003217950463294983, + "learning_rate": 0.001, + "loss": 0.4109, + "step": 2341 + }, + { + "epoch": 0.06462093489275256, + "grad_norm": 0.0038233373779803514, + "learning_rate": 0.001, + "loss": 0.3671, + "step": 2342 + }, + { + "epoch": 0.06464852709381694, + "grad_norm": 0.0038631544448435307, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 2343 + }, + { + "epoch": 0.0646761192948813, + "grad_norm": 0.002935303607955575, + "learning_rate": 0.001, + "loss": 0.4283, + "step": 2344 + }, + { + "epoch": 0.06470371149594567, + "grad_norm": 0.004563242197036743, + "learning_rate": 0.001, + "loss": 0.3883, + "step": 2345 + }, + { + "epoch": 0.06473130369701004, + "grad_norm": 0.004774071741849184, + "learning_rate": 0.001, + "loss": 0.409, + "step": 2346 + }, + { + "epoch": 0.0647588958980744, + "grad_norm": 0.0027761361561715603, + "learning_rate": 0.001, + "loss": 0.435, + "step": 2347 + }, + { + "epoch": 0.06478648809913878, + "grad_norm": 0.0030395982321351767, + "learning_rate": 0.001, + "loss": 0.4186, + "step": 2348 + }, + { + "epoch": 0.06481408030020315, + "grad_norm": 0.034988418221473694, + "learning_rate": 0.001, + "loss": 0.4053, + "step": 2349 + }, + { + "epoch": 0.06484167250126752, + "grad_norm": 0.003620629198849201, + "learning_rate": 0.001, + "loss": 0.3867, + "step": 2350 + }, + { + "epoch": 0.06486926470233188, + "grad_norm": 0.003206313122063875, + "learning_rate": 0.001, + "loss": 0.4027, + "step": 2351 + }, + { + "epoch": 0.06489685690339625, + "grad_norm": 0.0028597121126949787, + "learning_rate": 0.001, + "loss": 0.3927, + "step": 2352 + }, + { + "epoch": 0.06492444910446063, + "grad_norm": 0.0034794441889971495, + "learning_rate": 0.001, + "loss": 0.3854, + "step": 2353 + }, + { + "epoch": 0.064952041305525, + "grad_norm": 0.0031176162883639336, + "learning_rate": 0.001, + "loss": 0.3651, + "step": 2354 + }, + { + "epoch": 0.06497963350658936, + "grad_norm": 0.0035436085890978575, + "learning_rate": 0.001, + "loss": 0.37, + "step": 2355 + }, + { + "epoch": 0.06500722570765373, + "grad_norm": 0.003136987565085292, + "learning_rate": 0.001, + "loss": 0.4029, + "step": 2356 + }, + { + "epoch": 0.0650348179087181, + "grad_norm": 0.003193167271092534, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 2357 + }, + { + "epoch": 0.06506241010978248, + "grad_norm": 0.0033892260398715734, + "learning_rate": 0.001, + "loss": 0.3589, + "step": 2358 + }, + { + "epoch": 0.06509000231084684, + "grad_norm": 0.0065610939636826515, + "learning_rate": 0.001, + "loss": 0.3956, + "step": 2359 + }, + { + "epoch": 0.06511759451191121, + "grad_norm": 0.005481204017996788, + "learning_rate": 0.001, + "loss": 0.3597, + "step": 2360 + }, + { + "epoch": 0.06514518671297558, + "grad_norm": 0.0103605967015028, + "learning_rate": 0.001, + "loss": 0.3356, + "step": 2361 + }, + { + "epoch": 0.06517277891403994, + "grad_norm": 0.0066803195513784885, + "learning_rate": 0.001, + "loss": 0.426, + "step": 2362 + }, + { + "epoch": 0.06520037111510432, + "grad_norm": 0.004259153269231319, + "learning_rate": 0.001, + "loss": 0.4293, + "step": 2363 + }, + { + "epoch": 0.06522796331616869, + "grad_norm": 0.012415111064910889, + "learning_rate": 0.001, + "loss": 0.3602, + "step": 2364 + }, + { + "epoch": 0.06525555551723305, + "grad_norm": 0.0029489900916814804, + "learning_rate": 0.001, + "loss": 0.4193, + "step": 2365 + }, + { + "epoch": 0.06528314771829742, + "grad_norm": 0.006090483628213406, + "learning_rate": 0.001, + "loss": 0.379, + "step": 2366 + }, + { + "epoch": 0.06531073991936179, + "grad_norm": 0.003908079583197832, + "learning_rate": 0.001, + "loss": 0.3831, + "step": 2367 + }, + { + "epoch": 0.06533833212042617, + "grad_norm": 0.0039390516467392445, + "learning_rate": 0.001, + "loss": 0.4163, + "step": 2368 + }, + { + "epoch": 0.06536592432149053, + "grad_norm": 0.009213199838995934, + "learning_rate": 0.001, + "loss": 0.4277, + "step": 2369 + }, + { + "epoch": 0.0653935165225549, + "grad_norm": 0.0031545236706733704, + "learning_rate": 0.001, + "loss": 0.4037, + "step": 2370 + }, + { + "epoch": 0.06542110872361927, + "grad_norm": 0.003665713593363762, + "learning_rate": 0.001, + "loss": 0.379, + "step": 2371 + }, + { + "epoch": 0.06544870092468363, + "grad_norm": 0.00421819556504488, + "learning_rate": 0.001, + "loss": 0.3466, + "step": 2372 + }, + { + "epoch": 0.06547629312574801, + "grad_norm": 0.0041595143266022205, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 2373 + }, + { + "epoch": 0.06550388532681238, + "grad_norm": 0.00258744228631258, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 2374 + }, + { + "epoch": 0.06553147752787675, + "grad_norm": 0.0025976793840527534, + "learning_rate": 0.001, + "loss": 0.4073, + "step": 2375 + }, + { + "epoch": 0.06555906972894111, + "grad_norm": 0.002732262946665287, + "learning_rate": 0.001, + "loss": 0.4031, + "step": 2376 + }, + { + "epoch": 0.06558666193000548, + "grad_norm": 0.0024393522180616856, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 2377 + }, + { + "epoch": 0.06561425413106985, + "grad_norm": 0.006330225151032209, + "learning_rate": 0.001, + "loss": 0.3867, + "step": 2378 + }, + { + "epoch": 0.06564184633213423, + "grad_norm": 0.0031101806089282036, + "learning_rate": 0.001, + "loss": 0.362, + "step": 2379 + }, + { + "epoch": 0.06566943853319859, + "grad_norm": 0.0026855298783630133, + "learning_rate": 0.001, + "loss": 0.4196, + "step": 2380 + }, + { + "epoch": 0.06569703073426296, + "grad_norm": 0.002515793778002262, + "learning_rate": 0.001, + "loss": 0.4513, + "step": 2381 + }, + { + "epoch": 0.06572462293532733, + "grad_norm": 0.0025803048629313707, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 2382 + }, + { + "epoch": 0.06575221513639169, + "grad_norm": 0.0031242026016116142, + "learning_rate": 0.001, + "loss": 0.3867, + "step": 2383 + }, + { + "epoch": 0.06577980733745607, + "grad_norm": 0.003442540764808655, + "learning_rate": 0.001, + "loss": 0.425, + "step": 2384 + }, + { + "epoch": 0.06580739953852044, + "grad_norm": 0.0035992697812616825, + "learning_rate": 0.001, + "loss": 0.3475, + "step": 2385 + }, + { + "epoch": 0.0658349917395848, + "grad_norm": 0.0026494376361370087, + "learning_rate": 0.001, + "loss": 0.4028, + "step": 2386 + }, + { + "epoch": 0.06586258394064917, + "grad_norm": 0.0026794499717652798, + "learning_rate": 0.001, + "loss": 0.4355, + "step": 2387 + }, + { + "epoch": 0.06589017614171354, + "grad_norm": 0.003907007165253162, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 2388 + }, + { + "epoch": 0.06591776834277792, + "grad_norm": 0.0038369898684322834, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 2389 + }, + { + "epoch": 0.06594536054384229, + "grad_norm": 0.003516831435263157, + "learning_rate": 0.001, + "loss": 0.4073, + "step": 2390 + }, + { + "epoch": 0.06597295274490665, + "grad_norm": 0.0029046509880572557, + "learning_rate": 0.001, + "loss": 0.4192, + "step": 2391 + }, + { + "epoch": 0.06600054494597102, + "grad_norm": 0.0029956242069602013, + "learning_rate": 0.001, + "loss": 0.3925, + "step": 2392 + }, + { + "epoch": 0.06602813714703538, + "grad_norm": 0.0032762791961431503, + "learning_rate": 0.001, + "loss": 0.4313, + "step": 2393 + }, + { + "epoch": 0.06605572934809976, + "grad_norm": 0.0029915180057287216, + "learning_rate": 0.001, + "loss": 0.4053, + "step": 2394 + }, + { + "epoch": 0.06608332154916413, + "grad_norm": 0.004418436903506517, + "learning_rate": 0.001, + "loss": 0.3815, + "step": 2395 + }, + { + "epoch": 0.0661109137502285, + "grad_norm": 0.003055717097595334, + "learning_rate": 0.001, + "loss": 0.3776, + "step": 2396 + }, + { + "epoch": 0.06613850595129286, + "grad_norm": 0.0034056156873703003, + "learning_rate": 0.001, + "loss": 0.3617, + "step": 2397 + }, + { + "epoch": 0.06616609815235723, + "grad_norm": 0.002760351402685046, + "learning_rate": 0.001, + "loss": 0.4072, + "step": 2398 + }, + { + "epoch": 0.06619369035342161, + "grad_norm": 0.0054167998023331165, + "learning_rate": 0.001, + "loss": 0.4098, + "step": 2399 + }, + { + "epoch": 0.06622128255448598, + "grad_norm": 0.002874811412766576, + "learning_rate": 0.001, + "loss": 0.3747, + "step": 2400 + }, + { + "epoch": 0.06624887475555034, + "grad_norm": 0.0038131948094815016, + "learning_rate": 0.001, + "loss": 0.3841, + "step": 2401 + }, + { + "epoch": 0.06627646695661471, + "grad_norm": 0.01067175529897213, + "learning_rate": 0.001, + "loss": 0.4112, + "step": 2402 + }, + { + "epoch": 0.06630405915767908, + "grad_norm": 0.003224137471988797, + "learning_rate": 0.001, + "loss": 0.426, + "step": 2403 + }, + { + "epoch": 0.06633165135874346, + "grad_norm": 0.004995389375835657, + "learning_rate": 0.001, + "loss": 0.4192, + "step": 2404 + }, + { + "epoch": 0.06635924355980782, + "grad_norm": 0.0033968077041208744, + "learning_rate": 0.001, + "loss": 0.3858, + "step": 2405 + }, + { + "epoch": 0.06638683576087219, + "grad_norm": 0.003337099449709058, + "learning_rate": 0.001, + "loss": 0.3995, + "step": 2406 + }, + { + "epoch": 0.06641442796193656, + "grad_norm": 0.0033400380052626133, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 2407 + }, + { + "epoch": 0.06644202016300092, + "grad_norm": 0.0059796725399792194, + "learning_rate": 0.001, + "loss": 0.4014, + "step": 2408 + }, + { + "epoch": 0.0664696123640653, + "grad_norm": 0.0036425914149731398, + "learning_rate": 0.001, + "loss": 0.3791, + "step": 2409 + }, + { + "epoch": 0.06649720456512967, + "grad_norm": 0.003386643249541521, + "learning_rate": 0.001, + "loss": 0.4226, + "step": 2410 + }, + { + "epoch": 0.06652479676619404, + "grad_norm": 0.0025921366177499294, + "learning_rate": 0.001, + "loss": 0.4242, + "step": 2411 + }, + { + "epoch": 0.0665523889672584, + "grad_norm": 0.003707844065502286, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 2412 + }, + { + "epoch": 0.06657998116832277, + "grad_norm": 0.005570289213210344, + "learning_rate": 0.001, + "loss": 0.395, + "step": 2413 + }, + { + "epoch": 0.06660757336938715, + "grad_norm": 0.004504153970628977, + "learning_rate": 0.001, + "loss": 0.4011, + "step": 2414 + }, + { + "epoch": 0.06663516557045152, + "grad_norm": 0.0021061068400740623, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 2415 + }, + { + "epoch": 0.06666275777151588, + "grad_norm": 0.00822344422340393, + "learning_rate": 0.001, + "loss": 0.3713, + "step": 2416 + }, + { + "epoch": 0.06669034997258025, + "grad_norm": 0.003418115433305502, + "learning_rate": 0.001, + "loss": 0.4284, + "step": 2417 + }, + { + "epoch": 0.06671794217364461, + "grad_norm": 0.015280312858521938, + "learning_rate": 0.001, + "loss": 0.3837, + "step": 2418 + }, + { + "epoch": 0.066745534374709, + "grad_norm": 0.006191336549818516, + "learning_rate": 0.001, + "loss": 0.3973, + "step": 2419 + }, + { + "epoch": 0.06677312657577336, + "grad_norm": 0.0025210208259522915, + "learning_rate": 0.001, + "loss": 0.3973, + "step": 2420 + }, + { + "epoch": 0.06680071877683773, + "grad_norm": 0.003629886545240879, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 2421 + }, + { + "epoch": 0.0668283109779021, + "grad_norm": 0.005250046495348215, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 2422 + }, + { + "epoch": 0.06685590317896646, + "grad_norm": 0.0036179288290441036, + "learning_rate": 0.001, + "loss": 0.3925, + "step": 2423 + }, + { + "epoch": 0.06688349538003083, + "grad_norm": 0.005279705859720707, + "learning_rate": 0.001, + "loss": 0.3952, + "step": 2424 + }, + { + "epoch": 0.06691108758109521, + "grad_norm": 0.0030266193207353354, + "learning_rate": 0.001, + "loss": 0.38, + "step": 2425 + }, + { + "epoch": 0.06693867978215957, + "grad_norm": 0.007787918671965599, + "learning_rate": 0.001, + "loss": 0.3782, + "step": 2426 + }, + { + "epoch": 0.06696627198322394, + "grad_norm": 0.00260373717173934, + "learning_rate": 0.001, + "loss": 0.3733, + "step": 2427 + }, + { + "epoch": 0.06699386418428831, + "grad_norm": 0.002870837925001979, + "learning_rate": 0.001, + "loss": 0.4329, + "step": 2428 + }, + { + "epoch": 0.06702145638535267, + "grad_norm": 0.0024147257208824158, + "learning_rate": 0.001, + "loss": 0.4342, + "step": 2429 + }, + { + "epoch": 0.06704904858641705, + "grad_norm": 0.003153110621497035, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 2430 + }, + { + "epoch": 0.06707664078748142, + "grad_norm": 0.0036289046984165907, + "learning_rate": 0.001, + "loss": 0.3967, + "step": 2431 + }, + { + "epoch": 0.06710423298854579, + "grad_norm": 0.003097312757745385, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 2432 + }, + { + "epoch": 0.06713182518961015, + "grad_norm": 0.003282829187810421, + "learning_rate": 0.001, + "loss": 0.3956, + "step": 2433 + }, + { + "epoch": 0.06715941739067452, + "grad_norm": 0.006064482033252716, + "learning_rate": 0.001, + "loss": 0.4164, + "step": 2434 + }, + { + "epoch": 0.0671870095917389, + "grad_norm": 0.007535384502261877, + "learning_rate": 0.001, + "loss": 0.4349, + "step": 2435 + }, + { + "epoch": 0.06721460179280327, + "grad_norm": 0.003580626333132386, + "learning_rate": 0.001, + "loss": 0.3686, + "step": 2436 + }, + { + "epoch": 0.06724219399386763, + "grad_norm": 0.004072824027389288, + "learning_rate": 0.001, + "loss": 0.4207, + "step": 2437 + }, + { + "epoch": 0.067269786194932, + "grad_norm": 0.0034644294064491987, + "learning_rate": 0.001, + "loss": 0.4201, + "step": 2438 + }, + { + "epoch": 0.06729737839599637, + "grad_norm": 0.0031727415043860674, + "learning_rate": 0.001, + "loss": 0.4182, + "step": 2439 + }, + { + "epoch": 0.06732497059706075, + "grad_norm": 0.0025959559716284275, + "learning_rate": 0.001, + "loss": 0.4412, + "step": 2440 + }, + { + "epoch": 0.06735256279812511, + "grad_norm": 0.007867682725191116, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 2441 + }, + { + "epoch": 0.06738015499918948, + "grad_norm": 0.003531220369040966, + "learning_rate": 0.001, + "loss": 0.3763, + "step": 2442 + }, + { + "epoch": 0.06740774720025385, + "grad_norm": 0.0036349524743855, + "learning_rate": 0.001, + "loss": 0.4136, + "step": 2443 + }, + { + "epoch": 0.06743533940131821, + "grad_norm": 0.003158099949359894, + "learning_rate": 0.001, + "loss": 0.429, + "step": 2444 + }, + { + "epoch": 0.06746293160238259, + "grad_norm": 0.002893587574362755, + "learning_rate": 0.001, + "loss": 0.4316, + "step": 2445 + }, + { + "epoch": 0.06749052380344696, + "grad_norm": 0.002769331680610776, + "learning_rate": 0.001, + "loss": 0.3848, + "step": 2446 + }, + { + "epoch": 0.06751811600451132, + "grad_norm": 0.0028869437519460917, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 2447 + }, + { + "epoch": 0.06754570820557569, + "grad_norm": 0.004190088715404272, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 2448 + }, + { + "epoch": 0.06757330040664006, + "grad_norm": 0.00414698664098978, + "learning_rate": 0.001, + "loss": 0.3769, + "step": 2449 + }, + { + "epoch": 0.06760089260770444, + "grad_norm": 0.0034248684532940388, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 2450 + }, + { + "epoch": 0.0676284848087688, + "grad_norm": 0.0032379438634961843, + "learning_rate": 0.001, + "loss": 0.3517, + "step": 2451 + }, + { + "epoch": 0.06765607700983317, + "grad_norm": 0.0037679013330489397, + "learning_rate": 0.001, + "loss": 0.3754, + "step": 2452 + }, + { + "epoch": 0.06768366921089754, + "grad_norm": 0.0025305391754955053, + "learning_rate": 0.001, + "loss": 0.3749, + "step": 2453 + }, + { + "epoch": 0.0677112614119619, + "grad_norm": 0.010560314171016216, + "learning_rate": 0.001, + "loss": 0.3775, + "step": 2454 + }, + { + "epoch": 0.06773885361302628, + "grad_norm": 0.0022722717840224504, + "learning_rate": 0.001, + "loss": 0.4212, + "step": 2455 + }, + { + "epoch": 0.06776644581409065, + "grad_norm": 0.0035940262023359537, + "learning_rate": 0.001, + "loss": 0.3902, + "step": 2456 + }, + { + "epoch": 0.06779403801515502, + "grad_norm": 0.0032793956343084574, + "learning_rate": 0.001, + "loss": 0.381, + "step": 2457 + }, + { + "epoch": 0.06782163021621938, + "grad_norm": 0.0032437799964100122, + "learning_rate": 0.001, + "loss": 0.3787, + "step": 2458 + }, + { + "epoch": 0.06784922241728375, + "grad_norm": 0.004813835956156254, + "learning_rate": 0.001, + "loss": 0.3529, + "step": 2459 + }, + { + "epoch": 0.06787681461834813, + "grad_norm": 0.0028039722237735987, + "learning_rate": 0.001, + "loss": 0.384, + "step": 2460 + }, + { + "epoch": 0.0679044068194125, + "grad_norm": 0.0028152938466519117, + "learning_rate": 0.001, + "loss": 0.396, + "step": 2461 + }, + { + "epoch": 0.06793199902047686, + "grad_norm": 0.004003360401839018, + "learning_rate": 0.001, + "loss": 0.3996, + "step": 2462 + }, + { + "epoch": 0.06795959122154123, + "grad_norm": 0.003837777767330408, + "learning_rate": 0.001, + "loss": 0.3715, + "step": 2463 + }, + { + "epoch": 0.0679871834226056, + "grad_norm": 0.0043929507955908775, + "learning_rate": 0.001, + "loss": 0.3933, + "step": 2464 + }, + { + "epoch": 0.06801477562366998, + "grad_norm": 0.005224680993705988, + "learning_rate": 0.001, + "loss": 0.4112, + "step": 2465 + }, + { + "epoch": 0.06804236782473434, + "grad_norm": 0.0029568690806627274, + "learning_rate": 0.001, + "loss": 0.3845, + "step": 2466 + }, + { + "epoch": 0.06806996002579871, + "grad_norm": 0.0061539956368505955, + "learning_rate": 0.001, + "loss": 0.3828, + "step": 2467 + }, + { + "epoch": 0.06809755222686308, + "grad_norm": 0.003775696037337184, + "learning_rate": 0.001, + "loss": 0.3627, + "step": 2468 + }, + { + "epoch": 0.06812514442792744, + "grad_norm": 0.003055511973798275, + "learning_rate": 0.001, + "loss": 0.4198, + "step": 2469 + }, + { + "epoch": 0.06815273662899182, + "grad_norm": 0.004253302235156298, + "learning_rate": 0.001, + "loss": 0.391, + "step": 2470 + }, + { + "epoch": 0.06818032883005619, + "grad_norm": 0.00398953165858984, + "learning_rate": 0.001, + "loss": 0.3917, + "step": 2471 + }, + { + "epoch": 0.06820792103112056, + "grad_norm": 0.005971815902739763, + "learning_rate": 0.001, + "loss": 0.3704, + "step": 2472 + }, + { + "epoch": 0.06823551323218492, + "grad_norm": 0.0031450032256543636, + "learning_rate": 0.001, + "loss": 0.4128, + "step": 2473 + }, + { + "epoch": 0.06826310543324929, + "grad_norm": 0.013066442683339119, + "learning_rate": 0.001, + "loss": 0.374, + "step": 2474 + }, + { + "epoch": 0.06829069763431365, + "grad_norm": 0.005165675655007362, + "learning_rate": 0.001, + "loss": 0.4194, + "step": 2475 + }, + { + "epoch": 0.06831828983537803, + "grad_norm": 0.002924390835687518, + "learning_rate": 0.001, + "loss": 0.3841, + "step": 2476 + }, + { + "epoch": 0.0683458820364424, + "grad_norm": 0.0032948816660791636, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 2477 + }, + { + "epoch": 0.06837347423750677, + "grad_norm": 0.004480746109038591, + "learning_rate": 0.001, + "loss": 0.3155, + "step": 2478 + }, + { + "epoch": 0.06840106643857113, + "grad_norm": 0.0036851740442216396, + "learning_rate": 0.001, + "loss": 0.3506, + "step": 2479 + }, + { + "epoch": 0.0684286586396355, + "grad_norm": 0.0027917283587157726, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 2480 + }, + { + "epoch": 0.06845625084069988, + "grad_norm": 0.0024517588317394257, + "learning_rate": 0.001, + "loss": 0.4228, + "step": 2481 + }, + { + "epoch": 0.06848384304176425, + "grad_norm": 0.002968950429931283, + "learning_rate": 0.001, + "loss": 0.4209, + "step": 2482 + }, + { + "epoch": 0.06851143524282861, + "grad_norm": 0.0027774290647357702, + "learning_rate": 0.001, + "loss": 0.3914, + "step": 2483 + }, + { + "epoch": 0.06853902744389298, + "grad_norm": 0.002697261283174157, + "learning_rate": 0.001, + "loss": 0.4455, + "step": 2484 + }, + { + "epoch": 0.06856661964495735, + "grad_norm": 0.003583157667890191, + "learning_rate": 0.001, + "loss": 0.397, + "step": 2485 + }, + { + "epoch": 0.06859421184602173, + "grad_norm": 0.002954701893031597, + "learning_rate": 0.001, + "loss": 0.4139, + "step": 2486 + }, + { + "epoch": 0.0686218040470861, + "grad_norm": 0.00595908472314477, + "learning_rate": 0.001, + "loss": 0.4411, + "step": 2487 + }, + { + "epoch": 0.06864939624815046, + "grad_norm": 0.0025385827757418156, + "learning_rate": 0.001, + "loss": 0.3983, + "step": 2488 + }, + { + "epoch": 0.06867698844921483, + "grad_norm": 0.0031357340048998594, + "learning_rate": 0.001, + "loss": 0.3998, + "step": 2489 + }, + { + "epoch": 0.06870458065027919, + "grad_norm": 0.006353291217237711, + "learning_rate": 0.001, + "loss": 0.3985, + "step": 2490 + }, + { + "epoch": 0.06873217285134357, + "grad_norm": 0.0036874916404485703, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 2491 + }, + { + "epoch": 0.06875976505240794, + "grad_norm": 0.0032723871991038322, + "learning_rate": 0.001, + "loss": 0.4342, + "step": 2492 + }, + { + "epoch": 0.0687873572534723, + "grad_norm": 0.004007409326732159, + "learning_rate": 0.001, + "loss": 0.3908, + "step": 2493 + }, + { + "epoch": 0.06881494945453667, + "grad_norm": 0.0029772506095469, + "learning_rate": 0.001, + "loss": 0.3861, + "step": 2494 + }, + { + "epoch": 0.06884254165560104, + "grad_norm": 0.002340099308639765, + "learning_rate": 0.001, + "loss": 0.4351, + "step": 2495 + }, + { + "epoch": 0.06887013385666542, + "grad_norm": 0.0038869476411491632, + "learning_rate": 0.001, + "loss": 0.4013, + "step": 2496 + }, + { + "epoch": 0.06889772605772979, + "grad_norm": 0.004465511068701744, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 2497 + }, + { + "epoch": 0.06892531825879415, + "grad_norm": 0.004165589809417725, + "learning_rate": 0.001, + "loss": 0.3973, + "step": 2498 + }, + { + "epoch": 0.06895291045985852, + "grad_norm": 0.0023313488345593214, + "learning_rate": 0.001, + "loss": 0.4059, + "step": 2499 + }, + { + "epoch": 0.06898050266092288, + "grad_norm": 0.004344920627772808, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 2500 + }, + { + "epoch": 0.06898050266092288, + "eval_runtime": 24.3559, + "eval_samples_per_second": 1.314, + "eval_steps_per_second": 0.164, + "step": 2500 + }, + { + "epoch": 0.06900809486198727, + "grad_norm": 0.0032904972322285175, + "learning_rate": 0.001, + "loss": 0.4215, + "step": 2501 + }, + { + "epoch": 0.06903568706305163, + "grad_norm": 0.0037113146390765905, + "learning_rate": 0.001, + "loss": 0.3731, + "step": 2502 + }, + { + "epoch": 0.069063279264116, + "grad_norm": 0.0028964923694729805, + "learning_rate": 0.001, + "loss": 0.4537, + "step": 2503 + }, + { + "epoch": 0.06909087146518036, + "grad_norm": 0.005055161193013191, + "learning_rate": 0.001, + "loss": 0.3704, + "step": 2504 + }, + { + "epoch": 0.06911846366624473, + "grad_norm": 0.0033460462000221014, + "learning_rate": 0.001, + "loss": 0.3886, + "step": 2505 + }, + { + "epoch": 0.06914605586730911, + "grad_norm": 0.00268213776871562, + "learning_rate": 0.001, + "loss": 0.3913, + "step": 2506 + }, + { + "epoch": 0.06917364806837348, + "grad_norm": 0.0038208633195608854, + "learning_rate": 0.001, + "loss": 0.3798, + "step": 2507 + }, + { + "epoch": 0.06920124026943784, + "grad_norm": 0.0029110198374837637, + "learning_rate": 0.001, + "loss": 0.4377, + "step": 2508 + }, + { + "epoch": 0.06922883247050221, + "grad_norm": 0.004070633556693792, + "learning_rate": 0.001, + "loss": 0.4066, + "step": 2509 + }, + { + "epoch": 0.06925642467156658, + "grad_norm": 0.004861120600253344, + "learning_rate": 0.001, + "loss": 0.4234, + "step": 2510 + }, + { + "epoch": 0.06928401687263096, + "grad_norm": 0.004837087355554104, + "learning_rate": 0.001, + "loss": 0.3942, + "step": 2511 + }, + { + "epoch": 0.06931160907369532, + "grad_norm": 0.005938942078500986, + "learning_rate": 0.001, + "loss": 0.3833, + "step": 2512 + }, + { + "epoch": 0.06933920127475969, + "grad_norm": 0.007488689385354519, + "learning_rate": 0.001, + "loss": 0.4109, + "step": 2513 + }, + { + "epoch": 0.06936679347582406, + "grad_norm": 0.007844404317438602, + "learning_rate": 0.001, + "loss": 0.3895, + "step": 2514 + }, + { + "epoch": 0.06939438567688842, + "grad_norm": 0.006541546434164047, + "learning_rate": 0.001, + "loss": 0.3925, + "step": 2515 + }, + { + "epoch": 0.0694219778779528, + "grad_norm": 0.004513300955295563, + "learning_rate": 0.001, + "loss": 0.3886, + "step": 2516 + }, + { + "epoch": 0.06944957007901717, + "grad_norm": 0.0028703995048999786, + "learning_rate": 0.001, + "loss": 0.4453, + "step": 2517 + }, + { + "epoch": 0.06947716228008154, + "grad_norm": 0.0036693988367915154, + "learning_rate": 0.001, + "loss": 0.4475, + "step": 2518 + }, + { + "epoch": 0.0695047544811459, + "grad_norm": 0.0035753645934164524, + "learning_rate": 0.001, + "loss": 0.3993, + "step": 2519 + }, + { + "epoch": 0.06953234668221027, + "grad_norm": 0.0029638251289725304, + "learning_rate": 0.001, + "loss": 0.3883, + "step": 2520 + }, + { + "epoch": 0.06955993888327464, + "grad_norm": 0.002767800120636821, + "learning_rate": 0.001, + "loss": 0.4263, + "step": 2521 + }, + { + "epoch": 0.06958753108433902, + "grad_norm": 0.00324231362901628, + "learning_rate": 0.001, + "loss": 0.4276, + "step": 2522 + }, + { + "epoch": 0.06961512328540338, + "grad_norm": 0.0024813879281282425, + "learning_rate": 0.001, + "loss": 0.38, + "step": 2523 + }, + { + "epoch": 0.06964271548646775, + "grad_norm": 0.0032533248886466026, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 2524 + }, + { + "epoch": 0.06967030768753212, + "grad_norm": 0.005439402535557747, + "learning_rate": 0.001, + "loss": 0.3758, + "step": 2525 + }, + { + "epoch": 0.06969789988859648, + "grad_norm": 0.002291264943778515, + "learning_rate": 0.001, + "loss": 0.4292, + "step": 2526 + }, + { + "epoch": 0.06972549208966086, + "grad_norm": 0.0028953540604561567, + "learning_rate": 0.001, + "loss": 0.4194, + "step": 2527 + }, + { + "epoch": 0.06975308429072523, + "grad_norm": 0.003083001123741269, + "learning_rate": 0.001, + "loss": 0.4227, + "step": 2528 + }, + { + "epoch": 0.0697806764917896, + "grad_norm": 0.00382791250012815, + "learning_rate": 0.001, + "loss": 0.3844, + "step": 2529 + }, + { + "epoch": 0.06980826869285396, + "grad_norm": 0.004844884853810072, + "learning_rate": 0.001, + "loss": 0.3824, + "step": 2530 + }, + { + "epoch": 0.06983586089391833, + "grad_norm": 0.0025243901181966066, + "learning_rate": 0.001, + "loss": 0.4334, + "step": 2531 + }, + { + "epoch": 0.06986345309498271, + "grad_norm": 0.003347366116940975, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 2532 + }, + { + "epoch": 0.06989104529604707, + "grad_norm": 0.004716082476079464, + "learning_rate": 0.001, + "loss": 0.3485, + "step": 2533 + }, + { + "epoch": 0.06991863749711144, + "grad_norm": 0.004542906302958727, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 2534 + }, + { + "epoch": 0.06994622969817581, + "grad_norm": 0.002997052390128374, + "learning_rate": 0.001, + "loss": 0.4181, + "step": 2535 + }, + { + "epoch": 0.06997382189924017, + "grad_norm": 0.0032483143731951714, + "learning_rate": 0.001, + "loss": 0.3827, + "step": 2536 + }, + { + "epoch": 0.07000141410030455, + "grad_norm": 0.0036252515856176615, + "learning_rate": 0.001, + "loss": 0.4184, + "step": 2537 + }, + { + "epoch": 0.07002900630136892, + "grad_norm": 0.003113416489213705, + "learning_rate": 0.001, + "loss": 0.3858, + "step": 2538 + }, + { + "epoch": 0.07005659850243329, + "grad_norm": 0.003129280637949705, + "learning_rate": 0.001, + "loss": 0.3711, + "step": 2539 + }, + { + "epoch": 0.07008419070349765, + "grad_norm": 0.005747736897319555, + "learning_rate": 0.001, + "loss": 0.4057, + "step": 2540 + }, + { + "epoch": 0.07011178290456202, + "grad_norm": 0.01309170015156269, + "learning_rate": 0.001, + "loss": 0.4313, + "step": 2541 + }, + { + "epoch": 0.0701393751056264, + "grad_norm": 0.0030168737284839153, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 2542 + }, + { + "epoch": 0.07016696730669077, + "grad_norm": 0.0039615570567548275, + "learning_rate": 0.001, + "loss": 0.4214, + "step": 2543 + }, + { + "epoch": 0.07019455950775513, + "grad_norm": 0.034471940249204636, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 2544 + }, + { + "epoch": 0.0702221517088195, + "grad_norm": 0.00423240615054965, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 2545 + }, + { + "epoch": 0.07024974390988387, + "grad_norm": 0.004391579423099756, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 2546 + }, + { + "epoch": 0.07027733611094825, + "grad_norm": 0.004261813126504421, + "learning_rate": 0.001, + "loss": 0.3933, + "step": 2547 + }, + { + "epoch": 0.07030492831201261, + "grad_norm": 0.005609198939055204, + "learning_rate": 0.001, + "loss": 0.3745, + "step": 2548 + }, + { + "epoch": 0.07033252051307698, + "grad_norm": 0.0033081392757594585, + "learning_rate": 0.001, + "loss": 0.3667, + "step": 2549 + }, + { + "epoch": 0.07036011271414135, + "grad_norm": 0.0029132398776710033, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 2550 + }, + { + "epoch": 0.07038770491520571, + "grad_norm": 0.004867136478424072, + "learning_rate": 0.001, + "loss": 0.3799, + "step": 2551 + }, + { + "epoch": 0.07041529711627009, + "grad_norm": 0.004058686550706625, + "learning_rate": 0.001, + "loss": 0.4201, + "step": 2552 + }, + { + "epoch": 0.07044288931733446, + "grad_norm": 0.0027920163702219725, + "learning_rate": 0.001, + "loss": 0.426, + "step": 2553 + }, + { + "epoch": 0.07047048151839883, + "grad_norm": 0.00434753717854619, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 2554 + }, + { + "epoch": 0.07049807371946319, + "grad_norm": 0.0051083932630717754, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 2555 + }, + { + "epoch": 0.07052566592052756, + "grad_norm": 0.0038204751908779144, + "learning_rate": 0.001, + "loss": 0.381, + "step": 2556 + }, + { + "epoch": 0.07055325812159194, + "grad_norm": 0.0038525923155248165, + "learning_rate": 0.001, + "loss": 0.4203, + "step": 2557 + }, + { + "epoch": 0.0705808503226563, + "grad_norm": 0.0032179048284888268, + "learning_rate": 0.001, + "loss": 0.4046, + "step": 2558 + }, + { + "epoch": 0.07060844252372067, + "grad_norm": 0.008977223187685013, + "learning_rate": 0.001, + "loss": 0.4188, + "step": 2559 + }, + { + "epoch": 0.07063603472478504, + "grad_norm": 0.004340124782174826, + "learning_rate": 0.001, + "loss": 0.3442, + "step": 2560 + }, + { + "epoch": 0.0706636269258494, + "grad_norm": 0.0031930410768836737, + "learning_rate": 0.001, + "loss": 0.4036, + "step": 2561 + }, + { + "epoch": 0.07069121912691378, + "grad_norm": 0.0034286684822291136, + "learning_rate": 0.001, + "loss": 0.3815, + "step": 2562 + }, + { + "epoch": 0.07071881132797815, + "grad_norm": 0.003299806034192443, + "learning_rate": 0.001, + "loss": 0.3759, + "step": 2563 + }, + { + "epoch": 0.07074640352904252, + "grad_norm": 0.0048191421665251255, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 2564 + }, + { + "epoch": 0.07077399573010688, + "grad_norm": 0.006031329277902842, + "learning_rate": 0.001, + "loss": 0.4296, + "step": 2565 + }, + { + "epoch": 0.07080158793117125, + "grad_norm": 0.00475485622882843, + "learning_rate": 0.001, + "loss": 0.3825, + "step": 2566 + }, + { + "epoch": 0.07082918013223562, + "grad_norm": 0.0029015771578997374, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 2567 + }, + { + "epoch": 0.0708567723333, + "grad_norm": 0.0026210874784737825, + "learning_rate": 0.001, + "loss": 0.3808, + "step": 2568 + }, + { + "epoch": 0.07088436453436436, + "grad_norm": 0.00292952754534781, + "learning_rate": 0.001, + "loss": 0.4424, + "step": 2569 + }, + { + "epoch": 0.07091195673542873, + "grad_norm": 0.004151395056396723, + "learning_rate": 0.001, + "loss": 0.3488, + "step": 2570 + }, + { + "epoch": 0.0709395489364931, + "grad_norm": 0.003007207065820694, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 2571 + }, + { + "epoch": 0.07096714113755746, + "grad_norm": 0.0025580485817044973, + "learning_rate": 0.001, + "loss": 0.4374, + "step": 2572 + }, + { + "epoch": 0.07099473333862184, + "grad_norm": 0.0034552421420812607, + "learning_rate": 0.001, + "loss": 0.3793, + "step": 2573 + }, + { + "epoch": 0.07102232553968621, + "grad_norm": 0.004341066349297762, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 2574 + }, + { + "epoch": 0.07104991774075058, + "grad_norm": 0.0033516965340822935, + "learning_rate": 0.001, + "loss": 0.3915, + "step": 2575 + }, + { + "epoch": 0.07107750994181494, + "grad_norm": 0.0036367857828736305, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 2576 + }, + { + "epoch": 0.07110510214287931, + "grad_norm": 0.0029854371678084135, + "learning_rate": 0.001, + "loss": 0.4095, + "step": 2577 + }, + { + "epoch": 0.07113269434394369, + "grad_norm": 0.002374214818701148, + "learning_rate": 0.001, + "loss": 0.4303, + "step": 2578 + }, + { + "epoch": 0.07116028654500806, + "grad_norm": 0.002954955445602536, + "learning_rate": 0.001, + "loss": 0.4127, + "step": 2579 + }, + { + "epoch": 0.07118787874607242, + "grad_norm": 0.003998765256255865, + "learning_rate": 0.001, + "loss": 0.4217, + "step": 2580 + }, + { + "epoch": 0.07121547094713679, + "grad_norm": 0.0029591761995106936, + "learning_rate": 0.001, + "loss": 0.4329, + "step": 2581 + }, + { + "epoch": 0.07124306314820116, + "grad_norm": 0.0039256964810192585, + "learning_rate": 0.001, + "loss": 0.4368, + "step": 2582 + }, + { + "epoch": 0.07127065534926554, + "grad_norm": 0.005670437589287758, + "learning_rate": 0.001, + "loss": 0.3942, + "step": 2583 + }, + { + "epoch": 0.0712982475503299, + "grad_norm": 0.0024786926805973053, + "learning_rate": 0.001, + "loss": 0.4157, + "step": 2584 + }, + { + "epoch": 0.07132583975139427, + "grad_norm": 0.0045395889319479465, + "learning_rate": 0.001, + "loss": 0.4074, + "step": 2585 + }, + { + "epoch": 0.07135343195245863, + "grad_norm": 0.002754254499450326, + "learning_rate": 0.001, + "loss": 0.4155, + "step": 2586 + }, + { + "epoch": 0.071381024153523, + "grad_norm": 0.0032517199870198965, + "learning_rate": 0.001, + "loss": 0.3633, + "step": 2587 + }, + { + "epoch": 0.07140861635458738, + "grad_norm": 0.0037906805519014597, + "learning_rate": 0.001, + "loss": 0.4037, + "step": 2588 + }, + { + "epoch": 0.07143620855565175, + "grad_norm": 0.00529517512768507, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 2589 + }, + { + "epoch": 0.07146380075671611, + "grad_norm": 0.00859268568456173, + "learning_rate": 0.001, + "loss": 0.3689, + "step": 2590 + }, + { + "epoch": 0.07149139295778048, + "grad_norm": 0.003779566613957286, + "learning_rate": 0.001, + "loss": 0.3853, + "step": 2591 + }, + { + "epoch": 0.07151898515884485, + "grad_norm": 0.0031110162381082773, + "learning_rate": 0.001, + "loss": 0.4138, + "step": 2592 + }, + { + "epoch": 0.07154657735990923, + "grad_norm": 0.004116731230169535, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 2593 + }, + { + "epoch": 0.0715741695609736, + "grad_norm": 0.00332248630002141, + "learning_rate": 0.001, + "loss": 0.4053, + "step": 2594 + }, + { + "epoch": 0.07160176176203796, + "grad_norm": 0.003030715975910425, + "learning_rate": 0.001, + "loss": 0.4413, + "step": 2595 + }, + { + "epoch": 0.07162935396310233, + "grad_norm": 0.0028603002429008484, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 2596 + }, + { + "epoch": 0.0716569461641667, + "grad_norm": 0.0035293481778353453, + "learning_rate": 0.001, + "loss": 0.4229, + "step": 2597 + }, + { + "epoch": 0.07168453836523107, + "grad_norm": 0.004880653228610754, + "learning_rate": 0.001, + "loss": 0.384, + "step": 2598 + }, + { + "epoch": 0.07171213056629544, + "grad_norm": 0.0037580877542495728, + "learning_rate": 0.001, + "loss": 0.373, + "step": 2599 + }, + { + "epoch": 0.0717397227673598, + "grad_norm": 0.004738042131066322, + "learning_rate": 0.001, + "loss": 0.3783, + "step": 2600 + }, + { + "epoch": 0.07176731496842417, + "grad_norm": 0.0030720685608685017, + "learning_rate": 0.001, + "loss": 0.4232, + "step": 2601 + }, + { + "epoch": 0.07179490716948854, + "grad_norm": 0.003540902165696025, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 2602 + }, + { + "epoch": 0.07182249937055292, + "grad_norm": 0.0028504952788352966, + "learning_rate": 0.001, + "loss": 0.3778, + "step": 2603 + }, + { + "epoch": 0.07185009157161729, + "grad_norm": 0.0041265105828642845, + "learning_rate": 0.001, + "loss": 0.3976, + "step": 2604 + }, + { + "epoch": 0.07187768377268165, + "grad_norm": 0.0034538819454610348, + "learning_rate": 0.001, + "loss": 0.3802, + "step": 2605 + }, + { + "epoch": 0.07190527597374602, + "grad_norm": 0.002804661402478814, + "learning_rate": 0.001, + "loss": 0.388, + "step": 2606 + }, + { + "epoch": 0.07193286817481039, + "grad_norm": 0.006338078528642654, + "learning_rate": 0.001, + "loss": 0.4074, + "step": 2607 + }, + { + "epoch": 0.07196046037587477, + "grad_norm": 0.0034297939855605364, + "learning_rate": 0.001, + "loss": 0.3497, + "step": 2608 + }, + { + "epoch": 0.07198805257693913, + "grad_norm": 0.0029641755390912294, + "learning_rate": 0.001, + "loss": 0.4111, + "step": 2609 + }, + { + "epoch": 0.0720156447780035, + "grad_norm": 0.0038228612393140793, + "learning_rate": 0.001, + "loss": 0.4009, + "step": 2610 + }, + { + "epoch": 0.07204323697906787, + "grad_norm": 0.0033363320399075747, + "learning_rate": 0.001, + "loss": 0.395, + "step": 2611 + }, + { + "epoch": 0.07207082918013223, + "grad_norm": 0.002400481840595603, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 2612 + }, + { + "epoch": 0.0720984213811966, + "grad_norm": 0.00315939006395638, + "learning_rate": 0.001, + "loss": 0.4, + "step": 2613 + }, + { + "epoch": 0.07212601358226098, + "grad_norm": 0.00354985473677516, + "learning_rate": 0.001, + "loss": 0.4134, + "step": 2614 + }, + { + "epoch": 0.07215360578332534, + "grad_norm": 0.0028168221469968557, + "learning_rate": 0.001, + "loss": 0.4117, + "step": 2615 + }, + { + "epoch": 0.07218119798438971, + "grad_norm": 0.0025577114429324865, + "learning_rate": 0.001, + "loss": 0.3769, + "step": 2616 + }, + { + "epoch": 0.07220879018545408, + "grad_norm": 0.003505377098917961, + "learning_rate": 0.001, + "loss": 0.4206, + "step": 2617 + }, + { + "epoch": 0.07223638238651844, + "grad_norm": 0.005422186106443405, + "learning_rate": 0.001, + "loss": 0.3677, + "step": 2618 + }, + { + "epoch": 0.07226397458758282, + "grad_norm": 0.003959451802074909, + "learning_rate": 0.001, + "loss": 0.3826, + "step": 2619 + }, + { + "epoch": 0.07229156678864719, + "grad_norm": 0.002390485256910324, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 2620 + }, + { + "epoch": 0.07231915898971156, + "grad_norm": 0.0019209344172850251, + "learning_rate": 0.001, + "loss": 0.395, + "step": 2621 + }, + { + "epoch": 0.07234675119077592, + "grad_norm": 0.0027938170824199915, + "learning_rate": 0.001, + "loss": 0.4097, + "step": 2622 + }, + { + "epoch": 0.07237434339184029, + "grad_norm": 0.002219612244516611, + "learning_rate": 0.001, + "loss": 0.4308, + "step": 2623 + }, + { + "epoch": 0.07240193559290467, + "grad_norm": 0.002710830420255661, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 2624 + }, + { + "epoch": 0.07242952779396904, + "grad_norm": 0.002441881690174341, + "learning_rate": 0.001, + "loss": 0.3905, + "step": 2625 + }, + { + "epoch": 0.0724571199950334, + "grad_norm": 0.002835171762853861, + "learning_rate": 0.001, + "loss": 0.4196, + "step": 2626 + }, + { + "epoch": 0.07248471219609777, + "grad_norm": 0.005800854880362749, + "learning_rate": 0.001, + "loss": 0.3621, + "step": 2627 + }, + { + "epoch": 0.07251230439716214, + "grad_norm": 0.0058226133696734905, + "learning_rate": 0.001, + "loss": 0.4385, + "step": 2628 + }, + { + "epoch": 0.07253989659822652, + "grad_norm": 0.0031959593761712313, + "learning_rate": 0.001, + "loss": 0.4129, + "step": 2629 + }, + { + "epoch": 0.07256748879929088, + "grad_norm": 0.007225159555673599, + "learning_rate": 0.001, + "loss": 0.3695, + "step": 2630 + }, + { + "epoch": 0.07259508100035525, + "grad_norm": 0.003910002298653126, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 2631 + }, + { + "epoch": 0.07262267320141962, + "grad_norm": 0.0047545540146529675, + "learning_rate": 0.001, + "loss": 0.3985, + "step": 2632 + }, + { + "epoch": 0.07265026540248398, + "grad_norm": 0.0037914151325821877, + "learning_rate": 0.001, + "loss": 0.3954, + "step": 2633 + }, + { + "epoch": 0.07267785760354836, + "grad_norm": 0.007707824464887381, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 2634 + }, + { + "epoch": 0.07270544980461273, + "grad_norm": 0.0031401002779603004, + "learning_rate": 0.001, + "loss": 0.3622, + "step": 2635 + }, + { + "epoch": 0.0727330420056771, + "grad_norm": 0.005065685138106346, + "learning_rate": 0.001, + "loss": 0.3612, + "step": 2636 + }, + { + "epoch": 0.07276063420674146, + "grad_norm": 0.0033927711192518473, + "learning_rate": 0.001, + "loss": 0.4286, + "step": 2637 + }, + { + "epoch": 0.07278822640780583, + "grad_norm": 0.0026958470698446035, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 2638 + }, + { + "epoch": 0.07281581860887021, + "grad_norm": 0.004273698199540377, + "learning_rate": 0.001, + "loss": 0.3942, + "step": 2639 + }, + { + "epoch": 0.07284341080993458, + "grad_norm": 0.0033470946364104748, + "learning_rate": 0.001, + "loss": 0.4117, + "step": 2640 + }, + { + "epoch": 0.07287100301099894, + "grad_norm": 0.0030789680313318968, + "learning_rate": 0.001, + "loss": 0.3801, + "step": 2641 + }, + { + "epoch": 0.07289859521206331, + "grad_norm": 0.003024066798388958, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 2642 + }, + { + "epoch": 0.07292618741312767, + "grad_norm": 0.0035327670630067587, + "learning_rate": 0.001, + "loss": 0.4213, + "step": 2643 + }, + { + "epoch": 0.07295377961419205, + "grad_norm": 0.005246289074420929, + "learning_rate": 0.001, + "loss": 0.4035, + "step": 2644 + }, + { + "epoch": 0.07298137181525642, + "grad_norm": 0.0033100054133683443, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 2645 + }, + { + "epoch": 0.07300896401632079, + "grad_norm": 0.014535458758473396, + "learning_rate": 0.001, + "loss": 0.4149, + "step": 2646 + }, + { + "epoch": 0.07303655621738515, + "grad_norm": 0.006717653013765812, + "learning_rate": 0.001, + "loss": 0.4366, + "step": 2647 + }, + { + "epoch": 0.07306414841844952, + "grad_norm": 0.005544296000152826, + "learning_rate": 0.001, + "loss": 0.3546, + "step": 2648 + }, + { + "epoch": 0.0730917406195139, + "grad_norm": 0.002779677277430892, + "learning_rate": 0.001, + "loss": 0.415, + "step": 2649 + }, + { + "epoch": 0.07311933282057827, + "grad_norm": 0.0030521864537149668, + "learning_rate": 0.001, + "loss": 0.41, + "step": 2650 + }, + { + "epoch": 0.07314692502164263, + "grad_norm": 0.0032854536548256874, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 2651 + }, + { + "epoch": 0.073174517222707, + "grad_norm": 0.002539557870477438, + "learning_rate": 0.001, + "loss": 0.4195, + "step": 2652 + }, + { + "epoch": 0.07320210942377137, + "grad_norm": 0.004703735467046499, + "learning_rate": 0.001, + "loss": 0.3759, + "step": 2653 + }, + { + "epoch": 0.07322970162483575, + "grad_norm": 0.0029251237865537405, + "learning_rate": 0.001, + "loss": 0.4182, + "step": 2654 + }, + { + "epoch": 0.07325729382590011, + "grad_norm": 0.0026435167528688908, + "learning_rate": 0.001, + "loss": 0.3718, + "step": 2655 + }, + { + "epoch": 0.07328488602696448, + "grad_norm": 0.004778844770044088, + "learning_rate": 0.001, + "loss": 0.4264, + "step": 2656 + }, + { + "epoch": 0.07331247822802885, + "grad_norm": 0.003458186285570264, + "learning_rate": 0.001, + "loss": 0.4593, + "step": 2657 + }, + { + "epoch": 0.07334007042909321, + "grad_norm": 0.0034521680790930986, + "learning_rate": 0.001, + "loss": 0.4509, + "step": 2658 + }, + { + "epoch": 0.07336766263015758, + "grad_norm": 0.002783542964607477, + "learning_rate": 0.001, + "loss": 0.3887, + "step": 2659 + }, + { + "epoch": 0.07339525483122196, + "grad_norm": 0.004468636121600866, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 2660 + }, + { + "epoch": 0.07342284703228633, + "grad_norm": 0.0026452334132045507, + "learning_rate": 0.001, + "loss": 0.4105, + "step": 2661 + }, + { + "epoch": 0.07345043923335069, + "grad_norm": 0.003701903624460101, + "learning_rate": 0.001, + "loss": 0.3864, + "step": 2662 + }, + { + "epoch": 0.07347803143441506, + "grad_norm": 0.002505905693396926, + "learning_rate": 0.001, + "loss": 0.4081, + "step": 2663 + }, + { + "epoch": 0.07350562363547943, + "grad_norm": 0.002993806730955839, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 2664 + }, + { + "epoch": 0.0735332158365438, + "grad_norm": 0.002987109124660492, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 2665 + }, + { + "epoch": 0.07356080803760817, + "grad_norm": 0.0028901277109980583, + "learning_rate": 0.001, + "loss": 0.3842, + "step": 2666 + }, + { + "epoch": 0.07358840023867254, + "grad_norm": 0.0037864744663238525, + "learning_rate": 0.001, + "loss": 0.3962, + "step": 2667 + }, + { + "epoch": 0.0736159924397369, + "grad_norm": 0.005365621764212847, + "learning_rate": 0.001, + "loss": 0.4013, + "step": 2668 + }, + { + "epoch": 0.07364358464080127, + "grad_norm": 0.0025341627188026905, + "learning_rate": 0.001, + "loss": 0.4598, + "step": 2669 + }, + { + "epoch": 0.07367117684186565, + "grad_norm": 0.0026153249200433493, + "learning_rate": 0.001, + "loss": 0.4187, + "step": 2670 + }, + { + "epoch": 0.07369876904293002, + "grad_norm": 0.00424772035330534, + "learning_rate": 0.001, + "loss": 0.4292, + "step": 2671 + }, + { + "epoch": 0.07372636124399438, + "grad_norm": 0.005840023048222065, + "learning_rate": 0.001, + "loss": 0.382, + "step": 2672 + }, + { + "epoch": 0.07375395344505875, + "grad_norm": 0.00886671431362629, + "learning_rate": 0.001, + "loss": 0.3735, + "step": 2673 + }, + { + "epoch": 0.07378154564612312, + "grad_norm": 0.004951406270265579, + "learning_rate": 0.001, + "loss": 0.383, + "step": 2674 + }, + { + "epoch": 0.0738091378471875, + "grad_norm": 0.004046993795782328, + "learning_rate": 0.001, + "loss": 0.4047, + "step": 2675 + }, + { + "epoch": 0.07383673004825186, + "grad_norm": 0.003650445956736803, + "learning_rate": 0.001, + "loss": 0.4343, + "step": 2676 + }, + { + "epoch": 0.07386432224931623, + "grad_norm": 0.0027846968732774258, + "learning_rate": 0.001, + "loss": 0.4657, + "step": 2677 + }, + { + "epoch": 0.0738919144503806, + "grad_norm": 0.0027162914630025625, + "learning_rate": 0.001, + "loss": 0.3845, + "step": 2678 + }, + { + "epoch": 0.07391950665144496, + "grad_norm": 0.002881822641938925, + "learning_rate": 0.001, + "loss": 0.435, + "step": 2679 + }, + { + "epoch": 0.07394709885250934, + "grad_norm": 0.006673680152744055, + "learning_rate": 0.001, + "loss": 0.4224, + "step": 2680 + }, + { + "epoch": 0.07397469105357371, + "grad_norm": 0.0028485655784606934, + "learning_rate": 0.001, + "loss": 0.3907, + "step": 2681 + }, + { + "epoch": 0.07400228325463808, + "grad_norm": 0.004903602413833141, + "learning_rate": 0.001, + "loss": 0.3578, + "step": 2682 + }, + { + "epoch": 0.07402987545570244, + "grad_norm": 0.010232421569526196, + "learning_rate": 0.001, + "loss": 0.4232, + "step": 2683 + }, + { + "epoch": 0.07405746765676681, + "grad_norm": 0.01437798049300909, + "learning_rate": 0.001, + "loss": 0.4093, + "step": 2684 + }, + { + "epoch": 0.07408505985783119, + "grad_norm": 0.0036920029670000076, + "learning_rate": 0.001, + "loss": 0.4224, + "step": 2685 + }, + { + "epoch": 0.07411265205889556, + "grad_norm": 0.008351249620318413, + "learning_rate": 0.001, + "loss": 0.4205, + "step": 2686 + }, + { + "epoch": 0.07414024425995992, + "grad_norm": 0.004162284545600414, + "learning_rate": 0.001, + "loss": 0.4054, + "step": 2687 + }, + { + "epoch": 0.07416783646102429, + "grad_norm": 0.0036121357697993517, + "learning_rate": 0.001, + "loss": 0.4144, + "step": 2688 + }, + { + "epoch": 0.07419542866208866, + "grad_norm": 0.0035279730800539255, + "learning_rate": 0.001, + "loss": 0.3711, + "step": 2689 + }, + { + "epoch": 0.07422302086315304, + "grad_norm": 0.0031304580625146627, + "learning_rate": 0.001, + "loss": 0.3772, + "step": 2690 + }, + { + "epoch": 0.0742506130642174, + "grad_norm": 0.006544687785208225, + "learning_rate": 0.001, + "loss": 0.4303, + "step": 2691 + }, + { + "epoch": 0.07427820526528177, + "grad_norm": 0.004466759506613016, + "learning_rate": 0.001, + "loss": 0.3946, + "step": 2692 + }, + { + "epoch": 0.07430579746634614, + "grad_norm": 0.0027272431179881096, + "learning_rate": 0.001, + "loss": 0.4069, + "step": 2693 + }, + { + "epoch": 0.0743333896674105, + "grad_norm": 0.0032364106737077236, + "learning_rate": 0.001, + "loss": 0.3972, + "step": 2694 + }, + { + "epoch": 0.07436098186847488, + "grad_norm": 0.002972652204334736, + "learning_rate": 0.001, + "loss": 0.3713, + "step": 2695 + }, + { + "epoch": 0.07438857406953925, + "grad_norm": 0.0027488903142511845, + "learning_rate": 0.001, + "loss": 0.3686, + "step": 2696 + }, + { + "epoch": 0.07441616627060361, + "grad_norm": 0.003380288602784276, + "learning_rate": 0.001, + "loss": 0.3894, + "step": 2697 + }, + { + "epoch": 0.07444375847166798, + "grad_norm": 0.002856023609638214, + "learning_rate": 0.001, + "loss": 0.3889, + "step": 2698 + }, + { + "epoch": 0.07447135067273235, + "grad_norm": 0.003780797589570284, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 2699 + }, + { + "epoch": 0.07449894287379673, + "grad_norm": 0.006125589832663536, + "learning_rate": 0.001, + "loss": 0.4125, + "step": 2700 + }, + { + "epoch": 0.0745265350748611, + "grad_norm": 0.003866039216518402, + "learning_rate": 0.001, + "loss": 0.3978, + "step": 2701 + }, + { + "epoch": 0.07455412727592546, + "grad_norm": 0.0035539071541279554, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 2702 + }, + { + "epoch": 0.07458171947698983, + "grad_norm": 0.003555738367140293, + "learning_rate": 0.001, + "loss": 0.3996, + "step": 2703 + }, + { + "epoch": 0.0746093116780542, + "grad_norm": 0.0025483653880655766, + "learning_rate": 0.001, + "loss": 0.3836, + "step": 2704 + }, + { + "epoch": 0.07463690387911857, + "grad_norm": 0.005306419916450977, + "learning_rate": 0.001, + "loss": 0.4259, + "step": 2705 + }, + { + "epoch": 0.07466449608018294, + "grad_norm": 0.004178240429610014, + "learning_rate": 0.001, + "loss": 0.4074, + "step": 2706 + }, + { + "epoch": 0.0746920882812473, + "grad_norm": 0.0047726561315357685, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 2707 + }, + { + "epoch": 0.07471968048231167, + "grad_norm": 0.003424114780500531, + "learning_rate": 0.001, + "loss": 0.4242, + "step": 2708 + }, + { + "epoch": 0.07474727268337604, + "grad_norm": 0.004055993165820837, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 2709 + }, + { + "epoch": 0.0747748648844404, + "grad_norm": 0.0035490887239575386, + "learning_rate": 0.001, + "loss": 0.4291, + "step": 2710 + }, + { + "epoch": 0.07480245708550479, + "grad_norm": 0.0075918626971542835, + "learning_rate": 0.001, + "loss": 0.4556, + "step": 2711 + }, + { + "epoch": 0.07483004928656915, + "grad_norm": 0.00878838449716568, + "learning_rate": 0.001, + "loss": 0.4055, + "step": 2712 + }, + { + "epoch": 0.07485764148763352, + "grad_norm": 0.004037888254970312, + "learning_rate": 0.001, + "loss": 0.3879, + "step": 2713 + }, + { + "epoch": 0.07488523368869789, + "grad_norm": 0.003275372786447406, + "learning_rate": 0.001, + "loss": 0.3995, + "step": 2714 + }, + { + "epoch": 0.07491282588976225, + "grad_norm": 0.005921733099967241, + "learning_rate": 0.001, + "loss": 0.3959, + "step": 2715 + }, + { + "epoch": 0.07494041809082663, + "grad_norm": 0.003877841867506504, + "learning_rate": 0.001, + "loss": 0.3889, + "step": 2716 + }, + { + "epoch": 0.074968010291891, + "grad_norm": 0.004174409434199333, + "learning_rate": 0.001, + "loss": 0.385, + "step": 2717 + }, + { + "epoch": 0.07499560249295537, + "grad_norm": 0.004188715014606714, + "learning_rate": 0.001, + "loss": 0.4182, + "step": 2718 + }, + { + "epoch": 0.07502319469401973, + "grad_norm": 0.0031484768260270357, + "learning_rate": 0.001, + "loss": 0.3746, + "step": 2719 + }, + { + "epoch": 0.0750507868950841, + "grad_norm": 0.006613558623939753, + "learning_rate": 0.001, + "loss": 0.4151, + "step": 2720 + }, + { + "epoch": 0.07507837909614848, + "grad_norm": 0.0036274839658290148, + "learning_rate": 0.001, + "loss": 0.3511, + "step": 2721 + }, + { + "epoch": 0.07510597129721285, + "grad_norm": 0.004630135837942362, + "learning_rate": 0.001, + "loss": 0.412, + "step": 2722 + }, + { + "epoch": 0.07513356349827721, + "grad_norm": 0.0061690667644143105, + "learning_rate": 0.001, + "loss": 0.4167, + "step": 2723 + }, + { + "epoch": 0.07516115569934158, + "grad_norm": 0.005458046216517687, + "learning_rate": 0.001, + "loss": 0.3676, + "step": 2724 + }, + { + "epoch": 0.07518874790040594, + "grad_norm": 0.007112190593034029, + "learning_rate": 0.001, + "loss": 0.3842, + "step": 2725 + }, + { + "epoch": 0.07521634010147032, + "grad_norm": 0.0031167047563940287, + "learning_rate": 0.001, + "loss": 0.3922, + "step": 2726 + }, + { + "epoch": 0.07524393230253469, + "grad_norm": 0.003523972351104021, + "learning_rate": 0.001, + "loss": 0.4247, + "step": 2727 + }, + { + "epoch": 0.07527152450359906, + "grad_norm": 0.003653216175734997, + "learning_rate": 0.001, + "loss": 0.4352, + "step": 2728 + }, + { + "epoch": 0.07529911670466342, + "grad_norm": 0.003052507760003209, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 2729 + }, + { + "epoch": 0.07532670890572779, + "grad_norm": 0.002761593321338296, + "learning_rate": 0.001, + "loss": 0.3611, + "step": 2730 + }, + { + "epoch": 0.07535430110679217, + "grad_norm": 0.0036866154987365007, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 2731 + }, + { + "epoch": 0.07538189330785654, + "grad_norm": 0.008540788665413857, + "learning_rate": 0.001, + "loss": 0.4011, + "step": 2732 + }, + { + "epoch": 0.0754094855089209, + "grad_norm": 0.0023057356011122465, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 2733 + }, + { + "epoch": 0.07543707770998527, + "grad_norm": 0.006381066981703043, + "learning_rate": 0.001, + "loss": 0.3804, + "step": 2734 + }, + { + "epoch": 0.07546466991104964, + "grad_norm": 0.0035177739337086678, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 2735 + }, + { + "epoch": 0.07549226211211402, + "grad_norm": 0.0035792491398751736, + "learning_rate": 0.001, + "loss": 0.3758, + "step": 2736 + }, + { + "epoch": 0.07551985431317838, + "grad_norm": 0.0046767196618020535, + "learning_rate": 0.001, + "loss": 0.4361, + "step": 2737 + }, + { + "epoch": 0.07554744651424275, + "grad_norm": 0.0037277627270668745, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 2738 + }, + { + "epoch": 0.07557503871530712, + "grad_norm": 0.003284999867901206, + "learning_rate": 0.001, + "loss": 0.404, + "step": 2739 + }, + { + "epoch": 0.07560263091637148, + "grad_norm": 0.0028962334617972374, + "learning_rate": 0.001, + "loss": 0.3979, + "step": 2740 + }, + { + "epoch": 0.07563022311743586, + "grad_norm": 0.003682551207020879, + "learning_rate": 0.001, + "loss": 0.4198, + "step": 2741 + }, + { + "epoch": 0.07565781531850023, + "grad_norm": 0.0028806542977690697, + "learning_rate": 0.001, + "loss": 0.3745, + "step": 2742 + }, + { + "epoch": 0.0756854075195646, + "grad_norm": 0.003922648727893829, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 2743 + }, + { + "epoch": 0.07571299972062896, + "grad_norm": 0.0026945569552481174, + "learning_rate": 0.001, + "loss": 0.3959, + "step": 2744 + }, + { + "epoch": 0.07574059192169333, + "grad_norm": 0.0057632802054286, + "learning_rate": 0.001, + "loss": 0.4194, + "step": 2745 + }, + { + "epoch": 0.07576818412275771, + "grad_norm": 0.006452036090195179, + "learning_rate": 0.001, + "loss": 0.3966, + "step": 2746 + }, + { + "epoch": 0.07579577632382208, + "grad_norm": 0.0030313171446323395, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 2747 + }, + { + "epoch": 0.07582336852488644, + "grad_norm": 0.003243018174543977, + "learning_rate": 0.001, + "loss": 0.3503, + "step": 2748 + }, + { + "epoch": 0.07585096072595081, + "grad_norm": 0.0030355071648955345, + "learning_rate": 0.001, + "loss": 0.4086, + "step": 2749 + }, + { + "epoch": 0.07587855292701517, + "grad_norm": 0.003429468721151352, + "learning_rate": 0.001, + "loss": 0.3872, + "step": 2750 + }, + { + "epoch": 0.07590614512807956, + "grad_norm": 0.0029639608692377806, + "learning_rate": 0.001, + "loss": 0.4498, + "step": 2751 + }, + { + "epoch": 0.07593373732914392, + "grad_norm": 0.004542426206171513, + "learning_rate": 0.001, + "loss": 0.4143, + "step": 2752 + }, + { + "epoch": 0.07596132953020829, + "grad_norm": 0.002826446434482932, + "learning_rate": 0.001, + "loss": 0.405, + "step": 2753 + }, + { + "epoch": 0.07598892173127265, + "grad_norm": 0.004107923712581396, + "learning_rate": 0.001, + "loss": 0.4035, + "step": 2754 + }, + { + "epoch": 0.07601651393233702, + "grad_norm": 0.0028436153661459684, + "learning_rate": 0.001, + "loss": 0.4484, + "step": 2755 + }, + { + "epoch": 0.07604410613340139, + "grad_norm": 0.004834937863051891, + "learning_rate": 0.001, + "loss": 0.3966, + "step": 2756 + }, + { + "epoch": 0.07607169833446577, + "grad_norm": 0.003390131751075387, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 2757 + }, + { + "epoch": 0.07609929053553013, + "grad_norm": 0.002966254251077771, + "learning_rate": 0.001, + "loss": 0.4246, + "step": 2758 + }, + { + "epoch": 0.0761268827365945, + "grad_norm": 0.00300071039237082, + "learning_rate": 0.001, + "loss": 0.3683, + "step": 2759 + }, + { + "epoch": 0.07615447493765887, + "grad_norm": 0.0030495107639580965, + "learning_rate": 0.001, + "loss": 0.3625, + "step": 2760 + }, + { + "epoch": 0.07618206713872323, + "grad_norm": 0.002420577686280012, + "learning_rate": 0.001, + "loss": 0.4494, + "step": 2761 + }, + { + "epoch": 0.07620965933978761, + "grad_norm": 0.0036301531363278627, + "learning_rate": 0.001, + "loss": 0.3941, + "step": 2762 + }, + { + "epoch": 0.07623725154085198, + "grad_norm": 0.0024638317991048098, + "learning_rate": 0.001, + "loss": 0.3652, + "step": 2763 + }, + { + "epoch": 0.07626484374191635, + "grad_norm": 0.003583789337426424, + "learning_rate": 0.001, + "loss": 0.3946, + "step": 2764 + }, + { + "epoch": 0.07629243594298071, + "grad_norm": 0.003300134791061282, + "learning_rate": 0.001, + "loss": 0.4191, + "step": 2765 + }, + { + "epoch": 0.07632002814404508, + "grad_norm": 0.0033223924692720175, + "learning_rate": 0.001, + "loss": 0.4072, + "step": 2766 + }, + { + "epoch": 0.07634762034510946, + "grad_norm": 0.003998725675046444, + "learning_rate": 0.001, + "loss": 0.4096, + "step": 2767 + }, + { + "epoch": 0.07637521254617383, + "grad_norm": 0.0030690559651702642, + "learning_rate": 0.001, + "loss": 0.3798, + "step": 2768 + }, + { + "epoch": 0.07640280474723819, + "grad_norm": 0.0026303695049136877, + "learning_rate": 0.001, + "loss": 0.4359, + "step": 2769 + }, + { + "epoch": 0.07643039694830256, + "grad_norm": 0.004332841839641333, + "learning_rate": 0.001, + "loss": 0.3492, + "step": 2770 + }, + { + "epoch": 0.07645798914936693, + "grad_norm": 0.0027642296627163887, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 2771 + }, + { + "epoch": 0.0764855813504313, + "grad_norm": 0.0023598058614879847, + "learning_rate": 0.001, + "loss": 0.453, + "step": 2772 + }, + { + "epoch": 0.07651317355149567, + "grad_norm": 0.002782411640509963, + "learning_rate": 0.001, + "loss": 0.401, + "step": 2773 + }, + { + "epoch": 0.07654076575256004, + "grad_norm": 0.0030131624080240726, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 2774 + }, + { + "epoch": 0.0765683579536244, + "grad_norm": 0.003464979352429509, + "learning_rate": 0.001, + "loss": 0.3996, + "step": 2775 + }, + { + "epoch": 0.07659595015468877, + "grad_norm": 0.002998175797984004, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 2776 + }, + { + "epoch": 0.07662354235575315, + "grad_norm": 0.004081446677446365, + "learning_rate": 0.001, + "loss": 0.4126, + "step": 2777 + }, + { + "epoch": 0.07665113455681752, + "grad_norm": 0.02017812430858612, + "learning_rate": 0.001, + "loss": 0.4203, + "step": 2778 + }, + { + "epoch": 0.07667872675788188, + "grad_norm": 0.003272912697866559, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 2779 + }, + { + "epoch": 0.07670631895894625, + "grad_norm": 0.005462125409394503, + "learning_rate": 0.001, + "loss": 0.4047, + "step": 2780 + }, + { + "epoch": 0.07673391116001062, + "grad_norm": 0.003259886521846056, + "learning_rate": 0.001, + "loss": 0.3656, + "step": 2781 + }, + { + "epoch": 0.076761503361075, + "grad_norm": 0.003059667069464922, + "learning_rate": 0.001, + "loss": 0.3919, + "step": 2782 + }, + { + "epoch": 0.07678909556213936, + "grad_norm": 0.00428088940680027, + "learning_rate": 0.001, + "loss": 0.3705, + "step": 2783 + }, + { + "epoch": 0.07681668776320373, + "grad_norm": 0.002609378658235073, + "learning_rate": 0.001, + "loss": 0.395, + "step": 2784 + }, + { + "epoch": 0.0768442799642681, + "grad_norm": 0.005715689156204462, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 2785 + }, + { + "epoch": 0.07687187216533246, + "grad_norm": 0.003460571402683854, + "learning_rate": 0.001, + "loss": 0.4222, + "step": 2786 + }, + { + "epoch": 0.07689946436639684, + "grad_norm": 0.002582078566774726, + "learning_rate": 0.001, + "loss": 0.3751, + "step": 2787 + }, + { + "epoch": 0.07692705656746121, + "grad_norm": 0.003169649513438344, + "learning_rate": 0.001, + "loss": 0.3896, + "step": 2788 + }, + { + "epoch": 0.07695464876852558, + "grad_norm": 0.005018650088459253, + "learning_rate": 0.001, + "loss": 0.4245, + "step": 2789 + }, + { + "epoch": 0.07698224096958994, + "grad_norm": 0.0027702595107257366, + "learning_rate": 0.001, + "loss": 0.3781, + "step": 2790 + }, + { + "epoch": 0.07700983317065431, + "grad_norm": 0.0027409393806010485, + "learning_rate": 0.001, + "loss": 0.4126, + "step": 2791 + }, + { + "epoch": 0.07703742537171869, + "grad_norm": 0.0031372166704386473, + "learning_rate": 0.001, + "loss": 0.438, + "step": 2792 + }, + { + "epoch": 0.07706501757278306, + "grad_norm": 0.0028250846080482006, + "learning_rate": 0.001, + "loss": 0.3607, + "step": 2793 + }, + { + "epoch": 0.07709260977384742, + "grad_norm": 0.003316509071737528, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 2794 + }, + { + "epoch": 0.07712020197491179, + "grad_norm": 0.002770907012745738, + "learning_rate": 0.001, + "loss": 0.3741, + "step": 2795 + }, + { + "epoch": 0.07714779417597616, + "grad_norm": 0.002429973566904664, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 2796 + }, + { + "epoch": 0.07717538637704054, + "grad_norm": 0.0032115073408931494, + "learning_rate": 0.001, + "loss": 0.3559, + "step": 2797 + }, + { + "epoch": 0.0772029785781049, + "grad_norm": 0.006297094281762838, + "learning_rate": 0.001, + "loss": 0.3917, + "step": 2798 + }, + { + "epoch": 0.07723057077916927, + "grad_norm": 0.00496833398938179, + "learning_rate": 0.001, + "loss": 0.3812, + "step": 2799 + }, + { + "epoch": 0.07725816298023364, + "grad_norm": 0.002831167308613658, + "learning_rate": 0.001, + "loss": 0.3944, + "step": 2800 + }, + { + "epoch": 0.077285755181298, + "grad_norm": 0.004788990132510662, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 2801 + }, + { + "epoch": 0.07731334738236237, + "grad_norm": 0.0026512015610933304, + "learning_rate": 0.001, + "loss": 0.3873, + "step": 2802 + }, + { + "epoch": 0.07734093958342675, + "grad_norm": 0.003184010973200202, + "learning_rate": 0.001, + "loss": 0.3668, + "step": 2803 + }, + { + "epoch": 0.07736853178449112, + "grad_norm": 0.003882109420374036, + "learning_rate": 0.001, + "loss": 0.3792, + "step": 2804 + }, + { + "epoch": 0.07739612398555548, + "grad_norm": 0.003275086637586355, + "learning_rate": 0.001, + "loss": 0.4097, + "step": 2805 + }, + { + "epoch": 0.07742371618661985, + "grad_norm": 0.0032583875581622124, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 2806 + }, + { + "epoch": 0.07745130838768421, + "grad_norm": 0.0029904379043728113, + "learning_rate": 0.001, + "loss": 0.3788, + "step": 2807 + }, + { + "epoch": 0.0774789005887486, + "grad_norm": 0.0035862699151039124, + "learning_rate": 0.001, + "loss": 0.3749, + "step": 2808 + }, + { + "epoch": 0.07750649278981296, + "grad_norm": 0.003321669064462185, + "learning_rate": 0.001, + "loss": 0.3679, + "step": 2809 + }, + { + "epoch": 0.07753408499087733, + "grad_norm": 0.002322467975318432, + "learning_rate": 0.001, + "loss": 0.4202, + "step": 2810 + }, + { + "epoch": 0.0775616771919417, + "grad_norm": 0.0043731010518968105, + "learning_rate": 0.001, + "loss": 0.3979, + "step": 2811 + }, + { + "epoch": 0.07758926939300606, + "grad_norm": 0.003245170461013913, + "learning_rate": 0.001, + "loss": 0.412, + "step": 2812 + }, + { + "epoch": 0.07761686159407044, + "grad_norm": 0.0024038052652031183, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 2813 + }, + { + "epoch": 0.07764445379513481, + "grad_norm": 0.0031776116229593754, + "learning_rate": 0.001, + "loss": 0.3959, + "step": 2814 + }, + { + "epoch": 0.07767204599619917, + "grad_norm": 0.0029750748071819544, + "learning_rate": 0.001, + "loss": 0.4302, + "step": 2815 + }, + { + "epoch": 0.07769963819726354, + "grad_norm": 0.0033383795525878668, + "learning_rate": 0.001, + "loss": 0.4222, + "step": 2816 + }, + { + "epoch": 0.0777272303983279, + "grad_norm": 0.00494232214987278, + "learning_rate": 0.001, + "loss": 0.3798, + "step": 2817 + }, + { + "epoch": 0.07775482259939229, + "grad_norm": 0.0031302745919674635, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 2818 + }, + { + "epoch": 0.07778241480045665, + "grad_norm": 0.0038369682151824236, + "learning_rate": 0.001, + "loss": 0.4356, + "step": 2819 + }, + { + "epoch": 0.07781000700152102, + "grad_norm": 0.00343205570243299, + "learning_rate": 0.001, + "loss": 0.4297, + "step": 2820 + }, + { + "epoch": 0.07783759920258539, + "grad_norm": 0.005272636190056801, + "learning_rate": 0.001, + "loss": 0.4319, + "step": 2821 + }, + { + "epoch": 0.07786519140364975, + "grad_norm": 0.006007963325828314, + "learning_rate": 0.001, + "loss": 0.3682, + "step": 2822 + }, + { + "epoch": 0.07789278360471413, + "grad_norm": 0.004388149362057447, + "learning_rate": 0.001, + "loss": 0.3658, + "step": 2823 + }, + { + "epoch": 0.0779203758057785, + "grad_norm": 0.006076582707464695, + "learning_rate": 0.001, + "loss": 0.3637, + "step": 2824 + }, + { + "epoch": 0.07794796800684287, + "grad_norm": 0.0026879182551056147, + "learning_rate": 0.001, + "loss": 0.4033, + "step": 2825 + }, + { + "epoch": 0.07797556020790723, + "grad_norm": 0.003195406636223197, + "learning_rate": 0.001, + "loss": 0.4289, + "step": 2826 + }, + { + "epoch": 0.0780031524089716, + "grad_norm": 0.003109849989414215, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 2827 + }, + { + "epoch": 0.07803074461003598, + "grad_norm": 0.004339446779340506, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 2828 + }, + { + "epoch": 0.07805833681110035, + "grad_norm": 0.003388351993635297, + "learning_rate": 0.001, + "loss": 0.3823, + "step": 2829 + }, + { + "epoch": 0.07808592901216471, + "grad_norm": 0.0037827894557267427, + "learning_rate": 0.001, + "loss": 0.3721, + "step": 2830 + }, + { + "epoch": 0.07811352121322908, + "grad_norm": 0.006149903871119022, + "learning_rate": 0.001, + "loss": 0.3868, + "step": 2831 + }, + { + "epoch": 0.07814111341429344, + "grad_norm": 0.0027368527371436357, + "learning_rate": 0.001, + "loss": 0.4037, + "step": 2832 + }, + { + "epoch": 0.07816870561535783, + "grad_norm": 0.004948263522237539, + "learning_rate": 0.001, + "loss": 0.4146, + "step": 2833 + }, + { + "epoch": 0.07819629781642219, + "grad_norm": 0.017221815884113312, + "learning_rate": 0.001, + "loss": 0.3695, + "step": 2834 + }, + { + "epoch": 0.07822389001748656, + "grad_norm": 0.004491179715842009, + "learning_rate": 0.001, + "loss": 0.4319, + "step": 2835 + }, + { + "epoch": 0.07825148221855092, + "grad_norm": 0.004400896839797497, + "learning_rate": 0.001, + "loss": 0.3943, + "step": 2836 + }, + { + "epoch": 0.07827907441961529, + "grad_norm": 0.0028574098832905293, + "learning_rate": 0.001, + "loss": 0.4322, + "step": 2837 + }, + { + "epoch": 0.07830666662067967, + "grad_norm": 0.005395799409598112, + "learning_rate": 0.001, + "loss": 0.4221, + "step": 2838 + }, + { + "epoch": 0.07833425882174404, + "grad_norm": 0.004169132094830275, + "learning_rate": 0.001, + "loss": 0.3925, + "step": 2839 + }, + { + "epoch": 0.0783618510228084, + "grad_norm": 0.0036933980882167816, + "learning_rate": 0.001, + "loss": 0.4086, + "step": 2840 + }, + { + "epoch": 0.07838944322387277, + "grad_norm": 0.0037820383440703154, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 2841 + }, + { + "epoch": 0.07841703542493714, + "grad_norm": 0.00365295703522861, + "learning_rate": 0.001, + "loss": 0.4157, + "step": 2842 + }, + { + "epoch": 0.07844462762600152, + "grad_norm": 0.0040397047996521, + "learning_rate": 0.001, + "loss": 0.4155, + "step": 2843 + }, + { + "epoch": 0.07847221982706588, + "grad_norm": 0.0027921211440116167, + "learning_rate": 0.001, + "loss": 0.4173, + "step": 2844 + }, + { + "epoch": 0.07849981202813025, + "grad_norm": 0.002542336704209447, + "learning_rate": 0.001, + "loss": 0.4587, + "step": 2845 + }, + { + "epoch": 0.07852740422919462, + "grad_norm": 0.0032813462894409895, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 2846 + }, + { + "epoch": 0.07855499643025898, + "grad_norm": 0.0026641942095011473, + "learning_rate": 0.001, + "loss": 0.3809, + "step": 2847 + }, + { + "epoch": 0.07858258863132335, + "grad_norm": 0.0045136939734220505, + "learning_rate": 0.001, + "loss": 0.4269, + "step": 2848 + }, + { + "epoch": 0.07861018083238773, + "grad_norm": 0.003331273328512907, + "learning_rate": 0.001, + "loss": 0.4074, + "step": 2849 + }, + { + "epoch": 0.0786377730334521, + "grad_norm": 0.0029903652612119913, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 2850 + }, + { + "epoch": 0.07866536523451646, + "grad_norm": 0.003270956454798579, + "learning_rate": 0.001, + "loss": 0.4201, + "step": 2851 + }, + { + "epoch": 0.07869295743558083, + "grad_norm": 0.0025751839857548475, + "learning_rate": 0.001, + "loss": 0.3856, + "step": 2852 + }, + { + "epoch": 0.0787205496366452, + "grad_norm": 0.004237758927047253, + "learning_rate": 0.001, + "loss": 0.4029, + "step": 2853 + }, + { + "epoch": 0.07874814183770958, + "grad_norm": 0.005102077033370733, + "learning_rate": 0.001, + "loss": 0.4498, + "step": 2854 + }, + { + "epoch": 0.07877573403877394, + "grad_norm": 0.003620270872488618, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 2855 + }, + { + "epoch": 0.07880332623983831, + "grad_norm": 0.003328984137624502, + "learning_rate": 0.001, + "loss": 0.3582, + "step": 2856 + }, + { + "epoch": 0.07883091844090268, + "grad_norm": 0.005346687976270914, + "learning_rate": 0.001, + "loss": 0.4406, + "step": 2857 + }, + { + "epoch": 0.07885851064196704, + "grad_norm": 0.002766883932054043, + "learning_rate": 0.001, + "loss": 0.4048, + "step": 2858 + }, + { + "epoch": 0.07888610284303142, + "grad_norm": 0.0039212643168866634, + "learning_rate": 0.001, + "loss": 0.3732, + "step": 2859 + }, + { + "epoch": 0.07891369504409579, + "grad_norm": 0.00360241811722517, + "learning_rate": 0.001, + "loss": 0.4059, + "step": 2860 + }, + { + "epoch": 0.07894128724516015, + "grad_norm": 0.009609011001884937, + "learning_rate": 0.001, + "loss": 0.3921, + "step": 2861 + }, + { + "epoch": 0.07896887944622452, + "grad_norm": 0.004263875540345907, + "learning_rate": 0.001, + "loss": 0.4211, + "step": 2862 + }, + { + "epoch": 0.07899647164728889, + "grad_norm": 0.003934512846171856, + "learning_rate": 0.001, + "loss": 0.3985, + "step": 2863 + }, + { + "epoch": 0.07902406384835327, + "grad_norm": 0.0042722225189208984, + "learning_rate": 0.001, + "loss": 0.3809, + "step": 2864 + }, + { + "epoch": 0.07905165604941763, + "grad_norm": 0.0036077832337468863, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 2865 + }, + { + "epoch": 0.079079248250482, + "grad_norm": 0.0027798449154943228, + "learning_rate": 0.001, + "loss": 0.4138, + "step": 2866 + }, + { + "epoch": 0.07910684045154637, + "grad_norm": 0.0025270867627114058, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 2867 + }, + { + "epoch": 0.07913443265261073, + "grad_norm": 0.0037853806279599667, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 2868 + }, + { + "epoch": 0.07916202485367511, + "grad_norm": 0.004050467163324356, + "learning_rate": 0.001, + "loss": 0.4246, + "step": 2869 + }, + { + "epoch": 0.07918961705473948, + "grad_norm": 0.002820851979777217, + "learning_rate": 0.001, + "loss": 0.3781, + "step": 2870 + }, + { + "epoch": 0.07921720925580385, + "grad_norm": 0.007634916342794895, + "learning_rate": 0.001, + "loss": 0.376, + "step": 2871 + }, + { + "epoch": 0.07924480145686821, + "grad_norm": 0.004927767440676689, + "learning_rate": 0.001, + "loss": 0.3862, + "step": 2872 + }, + { + "epoch": 0.07927239365793258, + "grad_norm": 0.00282577658072114, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 2873 + }, + { + "epoch": 0.07929998585899696, + "grad_norm": 0.0034970357082784176, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 2874 + }, + { + "epoch": 0.07932757806006133, + "grad_norm": 0.00328719150274992, + "learning_rate": 0.001, + "loss": 0.3795, + "step": 2875 + }, + { + "epoch": 0.0793551702611257, + "grad_norm": 0.0030249380506575108, + "learning_rate": 0.001, + "loss": 0.3827, + "step": 2876 + }, + { + "epoch": 0.07938276246219006, + "grad_norm": 0.0026030796580016613, + "learning_rate": 0.001, + "loss": 0.4465, + "step": 2877 + }, + { + "epoch": 0.07941035466325443, + "grad_norm": 0.0026747360825538635, + "learning_rate": 0.001, + "loss": 0.4162, + "step": 2878 + }, + { + "epoch": 0.0794379468643188, + "grad_norm": 0.0031104108784347773, + "learning_rate": 0.001, + "loss": 0.4035, + "step": 2879 + }, + { + "epoch": 0.07946553906538317, + "grad_norm": 0.0029512427281588316, + "learning_rate": 0.001, + "loss": 0.426, + "step": 2880 + }, + { + "epoch": 0.07949313126644754, + "grad_norm": 0.0036311009898781776, + "learning_rate": 0.001, + "loss": 0.385, + "step": 2881 + }, + { + "epoch": 0.0795207234675119, + "grad_norm": 0.0028290299233049154, + "learning_rate": 0.001, + "loss": 0.403, + "step": 2882 + }, + { + "epoch": 0.07954831566857627, + "grad_norm": 0.0037825354374945164, + "learning_rate": 0.001, + "loss": 0.4213, + "step": 2883 + }, + { + "epoch": 0.07957590786964065, + "grad_norm": 0.004042259883135557, + "learning_rate": 0.001, + "loss": 0.4346, + "step": 2884 + }, + { + "epoch": 0.07960350007070502, + "grad_norm": 0.0031280801631510258, + "learning_rate": 0.001, + "loss": 0.3606, + "step": 2885 + }, + { + "epoch": 0.07963109227176939, + "grad_norm": 0.003348682075738907, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 2886 + }, + { + "epoch": 0.07965868447283375, + "grad_norm": 0.003773730481043458, + "learning_rate": 0.001, + "loss": 0.4046, + "step": 2887 + }, + { + "epoch": 0.07968627667389812, + "grad_norm": 0.0028589165303856134, + "learning_rate": 0.001, + "loss": 0.3763, + "step": 2888 + }, + { + "epoch": 0.0797138688749625, + "grad_norm": 0.003061666851863265, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 2889 + }, + { + "epoch": 0.07974146107602686, + "grad_norm": 0.002353568095713854, + "learning_rate": 0.001, + "loss": 0.3691, + "step": 2890 + }, + { + "epoch": 0.07976905327709123, + "grad_norm": 0.0026803589425981045, + "learning_rate": 0.001, + "loss": 0.3797, + "step": 2891 + }, + { + "epoch": 0.0797966454781556, + "grad_norm": 0.0032909938599914312, + "learning_rate": 0.001, + "loss": 0.4074, + "step": 2892 + }, + { + "epoch": 0.07982423767921996, + "grad_norm": 0.005034049041569233, + "learning_rate": 0.001, + "loss": 0.3969, + "step": 2893 + }, + { + "epoch": 0.07985182988028433, + "grad_norm": 0.003058070782572031, + "learning_rate": 0.001, + "loss": 0.3646, + "step": 2894 + }, + { + "epoch": 0.07987942208134871, + "grad_norm": 0.0028637642972171307, + "learning_rate": 0.001, + "loss": 0.3609, + "step": 2895 + }, + { + "epoch": 0.07990701428241308, + "grad_norm": 0.004128556232899427, + "learning_rate": 0.001, + "loss": 0.391, + "step": 2896 + }, + { + "epoch": 0.07993460648347744, + "grad_norm": 0.0060408939607441425, + "learning_rate": 0.001, + "loss": 0.3351, + "step": 2897 + }, + { + "epoch": 0.07996219868454181, + "grad_norm": 0.0026880092918872833, + "learning_rate": 0.001, + "loss": 0.3596, + "step": 2898 + }, + { + "epoch": 0.07998979088560618, + "grad_norm": 0.00707295211032033, + "learning_rate": 0.001, + "loss": 0.3579, + "step": 2899 + }, + { + "epoch": 0.08001738308667056, + "grad_norm": 0.0043478901498019695, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 2900 + }, + { + "epoch": 0.08004497528773492, + "grad_norm": 0.0034000363666564226, + "learning_rate": 0.001, + "loss": 0.387, + "step": 2901 + }, + { + "epoch": 0.08007256748879929, + "grad_norm": 0.004060294013470411, + "learning_rate": 0.001, + "loss": 0.3805, + "step": 2902 + }, + { + "epoch": 0.08010015968986366, + "grad_norm": 0.0032872636802494526, + "learning_rate": 0.001, + "loss": 0.3986, + "step": 2903 + }, + { + "epoch": 0.08012775189092802, + "grad_norm": 0.006337369792163372, + "learning_rate": 0.001, + "loss": 0.3986, + "step": 2904 + }, + { + "epoch": 0.0801553440919924, + "grad_norm": 0.002969271270558238, + "learning_rate": 0.001, + "loss": 0.4192, + "step": 2905 + }, + { + "epoch": 0.08018293629305677, + "grad_norm": 0.002581764478236437, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 2906 + }, + { + "epoch": 0.08021052849412114, + "grad_norm": 0.0030686580576002598, + "learning_rate": 0.001, + "loss": 0.4259, + "step": 2907 + }, + { + "epoch": 0.0802381206951855, + "grad_norm": 0.0032976726070046425, + "learning_rate": 0.001, + "loss": 0.4098, + "step": 2908 + }, + { + "epoch": 0.08026571289624987, + "grad_norm": 0.0037495982833206654, + "learning_rate": 0.001, + "loss": 0.3674, + "step": 2909 + }, + { + "epoch": 0.08029330509731425, + "grad_norm": 0.0030835745856165886, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 2910 + }, + { + "epoch": 0.08032089729837862, + "grad_norm": 0.006115986034274101, + "learning_rate": 0.001, + "loss": 0.4111, + "step": 2911 + }, + { + "epoch": 0.08034848949944298, + "grad_norm": 0.003554831026121974, + "learning_rate": 0.001, + "loss": 0.4173, + "step": 2912 + }, + { + "epoch": 0.08037608170050735, + "grad_norm": 0.0033155917190015316, + "learning_rate": 0.001, + "loss": 0.4169, + "step": 2913 + }, + { + "epoch": 0.08040367390157172, + "grad_norm": 0.0027983803302049637, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 2914 + }, + { + "epoch": 0.0804312661026361, + "grad_norm": 0.0023535455111414194, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 2915 + }, + { + "epoch": 0.08045885830370046, + "grad_norm": 0.002943321131169796, + "learning_rate": 0.001, + "loss": 0.4106, + "step": 2916 + }, + { + "epoch": 0.08048645050476483, + "grad_norm": 0.00304407742805779, + "learning_rate": 0.001, + "loss": 0.372, + "step": 2917 + }, + { + "epoch": 0.0805140427058292, + "grad_norm": 0.0032863402739167213, + "learning_rate": 0.001, + "loss": 0.4026, + "step": 2918 + }, + { + "epoch": 0.08054163490689356, + "grad_norm": 0.0027361640240997076, + "learning_rate": 0.001, + "loss": 0.3709, + "step": 2919 + }, + { + "epoch": 0.08056922710795794, + "grad_norm": 0.0026063849218189716, + "learning_rate": 0.001, + "loss": 0.4095, + "step": 2920 + }, + { + "epoch": 0.08059681930902231, + "grad_norm": 0.0022036924492567778, + "learning_rate": 0.001, + "loss": 0.4303, + "step": 2921 + }, + { + "epoch": 0.08062441151008667, + "grad_norm": 0.0023288466036319733, + "learning_rate": 0.001, + "loss": 0.4181, + "step": 2922 + }, + { + "epoch": 0.08065200371115104, + "grad_norm": 0.0029277384746819735, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 2923 + }, + { + "epoch": 0.08067959591221541, + "grad_norm": 0.0061630988493561745, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 2924 + }, + { + "epoch": 0.08070718811327979, + "grad_norm": 0.0035532654728740454, + "learning_rate": 0.001, + "loss": 0.3883, + "step": 2925 + }, + { + "epoch": 0.08073478031434415, + "grad_norm": 0.0036336968187242746, + "learning_rate": 0.001, + "loss": 0.38, + "step": 2926 + }, + { + "epoch": 0.08076237251540852, + "grad_norm": 0.004034141544252634, + "learning_rate": 0.001, + "loss": 0.4127, + "step": 2927 + }, + { + "epoch": 0.08078996471647289, + "grad_norm": 0.0049674310721457005, + "learning_rate": 0.001, + "loss": 0.3995, + "step": 2928 + }, + { + "epoch": 0.08081755691753725, + "grad_norm": 0.00383196328766644, + "learning_rate": 0.001, + "loss": 0.4193, + "step": 2929 + }, + { + "epoch": 0.08084514911860163, + "grad_norm": 0.002637132303789258, + "learning_rate": 0.001, + "loss": 0.3848, + "step": 2930 + }, + { + "epoch": 0.080872741319666, + "grad_norm": 0.004422449506819248, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 2931 + }, + { + "epoch": 0.08090033352073037, + "grad_norm": 0.005644423421472311, + "learning_rate": 0.001, + "loss": 0.3794, + "step": 2932 + }, + { + "epoch": 0.08092792572179473, + "grad_norm": 0.0029548918828368187, + "learning_rate": 0.001, + "loss": 0.4132, + "step": 2933 + }, + { + "epoch": 0.0809555179228591, + "grad_norm": 0.0033653799910098314, + "learning_rate": 0.001, + "loss": 0.407, + "step": 2934 + }, + { + "epoch": 0.08098311012392348, + "grad_norm": 0.0336541123688221, + "learning_rate": 0.001, + "loss": 0.4278, + "step": 2935 + }, + { + "epoch": 0.08101070232498785, + "grad_norm": 0.005585819482803345, + "learning_rate": 0.001, + "loss": 0.3806, + "step": 2936 + }, + { + "epoch": 0.08103829452605221, + "grad_norm": 0.0023465941194444895, + "learning_rate": 0.001, + "loss": 0.462, + "step": 2937 + }, + { + "epoch": 0.08106588672711658, + "grad_norm": 0.003671723185107112, + "learning_rate": 0.001, + "loss": 0.4217, + "step": 2938 + }, + { + "epoch": 0.08109347892818095, + "grad_norm": 0.002763295080512762, + "learning_rate": 0.001, + "loss": 0.3752, + "step": 2939 + }, + { + "epoch": 0.08112107112924533, + "grad_norm": 0.0028186712879687548, + "learning_rate": 0.001, + "loss": 0.3913, + "step": 2940 + }, + { + "epoch": 0.08114866333030969, + "grad_norm": 0.0023679453879594803, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 2941 + }, + { + "epoch": 0.08117625553137406, + "grad_norm": 0.002650237875059247, + "learning_rate": 0.001, + "loss": 0.4037, + "step": 2942 + }, + { + "epoch": 0.08120384773243843, + "grad_norm": 0.0028579425998032093, + "learning_rate": 0.001, + "loss": 0.421, + "step": 2943 + }, + { + "epoch": 0.08123143993350279, + "grad_norm": 0.003209290560334921, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 2944 + }, + { + "epoch": 0.08125903213456716, + "grad_norm": 0.0023669025395065546, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 2945 + }, + { + "epoch": 0.08128662433563154, + "grad_norm": 0.002830538898706436, + "learning_rate": 0.001, + "loss": 0.3586, + "step": 2946 + }, + { + "epoch": 0.0813142165366959, + "grad_norm": 0.0034998899791389704, + "learning_rate": 0.001, + "loss": 0.4271, + "step": 2947 + }, + { + "epoch": 0.08134180873776027, + "grad_norm": 0.003356503788381815, + "learning_rate": 0.001, + "loss": 0.3916, + "step": 2948 + }, + { + "epoch": 0.08136940093882464, + "grad_norm": 0.0027026510797441006, + "learning_rate": 0.001, + "loss": 0.3844, + "step": 2949 + }, + { + "epoch": 0.081396993139889, + "grad_norm": 0.0025660600513219833, + "learning_rate": 0.001, + "loss": 0.4176, + "step": 2950 + }, + { + "epoch": 0.08142458534095338, + "grad_norm": 0.0029600337147712708, + "learning_rate": 0.001, + "loss": 0.4324, + "step": 2951 + }, + { + "epoch": 0.08145217754201775, + "grad_norm": 0.00301582389511168, + "learning_rate": 0.001, + "loss": 0.373, + "step": 2952 + }, + { + "epoch": 0.08147976974308212, + "grad_norm": 0.0027227008249610662, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 2953 + }, + { + "epoch": 0.08150736194414648, + "grad_norm": 0.004939099308103323, + "learning_rate": 0.001, + "loss": 0.3956, + "step": 2954 + }, + { + "epoch": 0.08153495414521085, + "grad_norm": 0.0038117829244583845, + "learning_rate": 0.001, + "loss": 0.3908, + "step": 2955 + }, + { + "epoch": 0.08156254634627523, + "grad_norm": 0.002671457827091217, + "learning_rate": 0.001, + "loss": 0.4099, + "step": 2956 + }, + { + "epoch": 0.0815901385473396, + "grad_norm": 0.0051042488776147366, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 2957 + }, + { + "epoch": 0.08161773074840396, + "grad_norm": 0.013458254747092724, + "learning_rate": 0.001, + "loss": 0.4205, + "step": 2958 + }, + { + "epoch": 0.08164532294946833, + "grad_norm": 0.004344523418694735, + "learning_rate": 0.001, + "loss": 0.374, + "step": 2959 + }, + { + "epoch": 0.0816729151505327, + "grad_norm": 0.003277859417721629, + "learning_rate": 0.001, + "loss": 0.4002, + "step": 2960 + }, + { + "epoch": 0.08170050735159708, + "grad_norm": 0.007647217717021704, + "learning_rate": 0.001, + "loss": 0.4151, + "step": 2961 + }, + { + "epoch": 0.08172809955266144, + "grad_norm": 0.0031640264205634594, + "learning_rate": 0.001, + "loss": 0.4208, + "step": 2962 + }, + { + "epoch": 0.08175569175372581, + "grad_norm": 0.00352554302662611, + "learning_rate": 0.001, + "loss": 0.4167, + "step": 2963 + }, + { + "epoch": 0.08178328395479018, + "grad_norm": 0.004366365727037191, + "learning_rate": 0.001, + "loss": 0.4115, + "step": 2964 + }, + { + "epoch": 0.08181087615585454, + "grad_norm": 0.0043410733342170715, + "learning_rate": 0.001, + "loss": 0.3822, + "step": 2965 + }, + { + "epoch": 0.08183846835691892, + "grad_norm": 0.0039422293193638325, + "learning_rate": 0.001, + "loss": 0.4293, + "step": 2966 + }, + { + "epoch": 0.08186606055798329, + "grad_norm": 0.0037924828939139843, + "learning_rate": 0.001, + "loss": 0.3922, + "step": 2967 + }, + { + "epoch": 0.08189365275904766, + "grad_norm": 0.007280276622623205, + "learning_rate": 0.001, + "loss": 0.4094, + "step": 2968 + }, + { + "epoch": 0.08192124496011202, + "grad_norm": 0.0051049706526100636, + "learning_rate": 0.001, + "loss": 0.3825, + "step": 2969 + }, + { + "epoch": 0.08194883716117639, + "grad_norm": 0.003454606281593442, + "learning_rate": 0.001, + "loss": 0.4029, + "step": 2970 + }, + { + "epoch": 0.08197642936224077, + "grad_norm": 0.0029868248384445906, + "learning_rate": 0.001, + "loss": 0.3969, + "step": 2971 + }, + { + "epoch": 0.08200402156330514, + "grad_norm": 0.004341921303421259, + "learning_rate": 0.001, + "loss": 0.4353, + "step": 2972 + }, + { + "epoch": 0.0820316137643695, + "grad_norm": 0.003885595127940178, + "learning_rate": 0.001, + "loss": 0.4144, + "step": 2973 + }, + { + "epoch": 0.08205920596543387, + "grad_norm": 0.0032196505926549435, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 2974 + }, + { + "epoch": 0.08208679816649823, + "grad_norm": 0.004508745390921831, + "learning_rate": 0.001, + "loss": 0.4095, + "step": 2975 + }, + { + "epoch": 0.08211439036756261, + "grad_norm": 0.0032329261302948, + "learning_rate": 0.001, + "loss": 0.3922, + "step": 2976 + }, + { + "epoch": 0.08214198256862698, + "grad_norm": 0.004983431659638882, + "learning_rate": 0.001, + "loss": 0.407, + "step": 2977 + }, + { + "epoch": 0.08216957476969135, + "grad_norm": 0.03812706470489502, + "learning_rate": 0.001, + "loss": 0.3978, + "step": 2978 + }, + { + "epoch": 0.08219716697075571, + "grad_norm": 0.002776419511064887, + "learning_rate": 0.001, + "loss": 0.3993, + "step": 2979 + }, + { + "epoch": 0.08222475917182008, + "grad_norm": 0.0030240516643971205, + "learning_rate": 0.001, + "loss": 0.4176, + "step": 2980 + }, + { + "epoch": 0.08225235137288446, + "grad_norm": 0.0031225322745740414, + "learning_rate": 0.001, + "loss": 0.3519, + "step": 2981 + }, + { + "epoch": 0.08227994357394883, + "grad_norm": 0.002751345979049802, + "learning_rate": 0.001, + "loss": 0.398, + "step": 2982 + }, + { + "epoch": 0.0823075357750132, + "grad_norm": 0.003123503876850009, + "learning_rate": 0.001, + "loss": 0.4157, + "step": 2983 + }, + { + "epoch": 0.08233512797607756, + "grad_norm": 0.004268140532076359, + "learning_rate": 0.001, + "loss": 0.4198, + "step": 2984 + }, + { + "epoch": 0.08236272017714193, + "grad_norm": 0.002731149084866047, + "learning_rate": 0.001, + "loss": 0.4103, + "step": 2985 + }, + { + "epoch": 0.0823903123782063, + "grad_norm": 0.002880630549043417, + "learning_rate": 0.001, + "loss": 0.4146, + "step": 2986 + }, + { + "epoch": 0.08241790457927067, + "grad_norm": 0.008699797093868256, + "learning_rate": 0.001, + "loss": 0.4313, + "step": 2987 + }, + { + "epoch": 0.08244549678033504, + "grad_norm": 0.0032247393392026424, + "learning_rate": 0.001, + "loss": 0.4263, + "step": 2988 + }, + { + "epoch": 0.0824730889813994, + "grad_norm": 0.003280255477875471, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 2989 + }, + { + "epoch": 0.08250068118246377, + "grad_norm": 0.0024324634578078985, + "learning_rate": 0.001, + "loss": 0.4188, + "step": 2990 + }, + { + "epoch": 0.08252827338352814, + "grad_norm": 0.004013401456177235, + "learning_rate": 0.001, + "loss": 0.3778, + "step": 2991 + }, + { + "epoch": 0.08255586558459252, + "grad_norm": 0.0036929554771631956, + "learning_rate": 0.001, + "loss": 0.3752, + "step": 2992 + }, + { + "epoch": 0.08258345778565689, + "grad_norm": 0.003950448241084814, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 2993 + }, + { + "epoch": 0.08261104998672125, + "grad_norm": 0.004668880719691515, + "learning_rate": 0.001, + "loss": 0.4134, + "step": 2994 + }, + { + "epoch": 0.08263864218778562, + "grad_norm": 0.002737791510298848, + "learning_rate": 0.001, + "loss": 0.4388, + "step": 2995 + }, + { + "epoch": 0.08266623438884999, + "grad_norm": 0.007926344871520996, + "learning_rate": 0.001, + "loss": 0.3542, + "step": 2996 + }, + { + "epoch": 0.08269382658991437, + "grad_norm": 0.003492021933197975, + "learning_rate": 0.001, + "loss": 0.4196, + "step": 2997 + }, + { + "epoch": 0.08272141879097873, + "grad_norm": 0.0026433023158460855, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 2998 + }, + { + "epoch": 0.0827490109920431, + "grad_norm": 0.0029606178868561983, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 2999 + }, + { + "epoch": 0.08277660319310746, + "grad_norm": 0.00282181310467422, + "learning_rate": 0.001, + "loss": 0.4111, + "step": 3000 + }, + { + "epoch": 0.08277660319310746, + "eval_runtime": 24.5847, + "eval_samples_per_second": 1.302, + "eval_steps_per_second": 0.163, + "step": 3000 + }, + { + "epoch": 0.08280419539417183, + "grad_norm": 0.004229205194860697, + "learning_rate": 0.001, + "loss": 0.3982, + "step": 3001 + }, + { + "epoch": 0.08283178759523621, + "grad_norm": 0.002889542607590556, + "learning_rate": 0.001, + "loss": 0.4226, + "step": 3002 + }, + { + "epoch": 0.08285937979630058, + "grad_norm": 0.0026309038512408733, + "learning_rate": 0.001, + "loss": 0.4207, + "step": 3003 + }, + { + "epoch": 0.08288697199736494, + "grad_norm": 0.002682015299797058, + "learning_rate": 0.001, + "loss": 0.4143, + "step": 3004 + }, + { + "epoch": 0.08291456419842931, + "grad_norm": 0.0026974889915436506, + "learning_rate": 0.001, + "loss": 0.3976, + "step": 3005 + }, + { + "epoch": 0.08294215639949368, + "grad_norm": 0.0023957311641424894, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 3006 + }, + { + "epoch": 0.08296974860055806, + "grad_norm": 0.004567582160234451, + "learning_rate": 0.001, + "loss": 0.4249, + "step": 3007 + }, + { + "epoch": 0.08299734080162242, + "grad_norm": 0.004289150703698397, + "learning_rate": 0.001, + "loss": 0.4116, + "step": 3008 + }, + { + "epoch": 0.08302493300268679, + "grad_norm": 0.004455960355699062, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 3009 + }, + { + "epoch": 0.08305252520375116, + "grad_norm": 0.005996396765112877, + "learning_rate": 0.001, + "loss": 0.3885, + "step": 3010 + }, + { + "epoch": 0.08308011740481552, + "grad_norm": 0.004294134210795164, + "learning_rate": 0.001, + "loss": 0.3705, + "step": 3011 + }, + { + "epoch": 0.0831077096058799, + "grad_norm": 0.0030041569843888283, + "learning_rate": 0.001, + "loss": 0.3946, + "step": 3012 + }, + { + "epoch": 0.08313530180694427, + "grad_norm": 0.0038454660680145025, + "learning_rate": 0.001, + "loss": 0.378, + "step": 3013 + }, + { + "epoch": 0.08316289400800864, + "grad_norm": 0.003388310084119439, + "learning_rate": 0.001, + "loss": 0.4139, + "step": 3014 + }, + { + "epoch": 0.083190486209073, + "grad_norm": 0.003318838309496641, + "learning_rate": 0.001, + "loss": 0.38, + "step": 3015 + }, + { + "epoch": 0.08321807841013737, + "grad_norm": 0.002537375781685114, + "learning_rate": 0.001, + "loss": 0.4243, + "step": 3016 + }, + { + "epoch": 0.08324567061120175, + "grad_norm": 0.00318319583311677, + "learning_rate": 0.001, + "loss": 0.408, + "step": 3017 + }, + { + "epoch": 0.08327326281226612, + "grad_norm": 0.004292814992368221, + "learning_rate": 0.001, + "loss": 0.3454, + "step": 3018 + }, + { + "epoch": 0.08330085501333048, + "grad_norm": 0.0027642296627163887, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 3019 + }, + { + "epoch": 0.08332844721439485, + "grad_norm": 0.0034603141248226166, + "learning_rate": 0.001, + "loss": 0.4081, + "step": 3020 + }, + { + "epoch": 0.08335603941545922, + "grad_norm": 0.006106778047978878, + "learning_rate": 0.001, + "loss": 0.3804, + "step": 3021 + }, + { + "epoch": 0.0833836316165236, + "grad_norm": 0.002718998584896326, + "learning_rate": 0.001, + "loss": 0.4311, + "step": 3022 + }, + { + "epoch": 0.08341122381758796, + "grad_norm": 0.002778289606794715, + "learning_rate": 0.001, + "loss": 0.3634, + "step": 3023 + }, + { + "epoch": 0.08343881601865233, + "grad_norm": 0.002709881402552128, + "learning_rate": 0.001, + "loss": 0.4208, + "step": 3024 + }, + { + "epoch": 0.0834664082197167, + "grad_norm": 0.004608054179698229, + "learning_rate": 0.001, + "loss": 0.3792, + "step": 3025 + }, + { + "epoch": 0.08349400042078106, + "grad_norm": 0.0070383017882704735, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 3026 + }, + { + "epoch": 0.08352159262184544, + "grad_norm": 0.0029459171928465366, + "learning_rate": 0.001, + "loss": 0.3634, + "step": 3027 + }, + { + "epoch": 0.08354918482290981, + "grad_norm": 0.002476966939866543, + "learning_rate": 0.001, + "loss": 0.4168, + "step": 3028 + }, + { + "epoch": 0.08357677702397417, + "grad_norm": 0.0041242605075240135, + "learning_rate": 0.001, + "loss": 0.3643, + "step": 3029 + }, + { + "epoch": 0.08360436922503854, + "grad_norm": 0.0027966529596596956, + "learning_rate": 0.001, + "loss": 0.4453, + "step": 3030 + }, + { + "epoch": 0.08363196142610291, + "grad_norm": 0.0032828738912940025, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 3031 + }, + { + "epoch": 0.08365955362716729, + "grad_norm": 0.002733866684138775, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 3032 + }, + { + "epoch": 0.08368714582823165, + "grad_norm": 0.004932538606226444, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 3033 + }, + { + "epoch": 0.08371473802929602, + "grad_norm": 0.0026060608215630054, + "learning_rate": 0.001, + "loss": 0.3812, + "step": 3034 + }, + { + "epoch": 0.08374233023036039, + "grad_norm": 0.0029096321668475866, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 3035 + }, + { + "epoch": 0.08376992243142475, + "grad_norm": 0.0027539224829524755, + "learning_rate": 0.001, + "loss": 0.4445, + "step": 3036 + }, + { + "epoch": 0.08379751463248912, + "grad_norm": 0.00449875695630908, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 3037 + }, + { + "epoch": 0.0838251068335535, + "grad_norm": 0.0045863245613873005, + "learning_rate": 0.001, + "loss": 0.3814, + "step": 3038 + }, + { + "epoch": 0.08385269903461787, + "grad_norm": 0.004102183040231466, + "learning_rate": 0.001, + "loss": 0.3675, + "step": 3039 + }, + { + "epoch": 0.08388029123568223, + "grad_norm": 0.003232220420613885, + "learning_rate": 0.001, + "loss": 0.3727, + "step": 3040 + }, + { + "epoch": 0.0839078834367466, + "grad_norm": 0.0032380821648985147, + "learning_rate": 0.001, + "loss": 0.4175, + "step": 3041 + }, + { + "epoch": 0.08393547563781097, + "grad_norm": 0.0023660236038267612, + "learning_rate": 0.001, + "loss": 0.3769, + "step": 3042 + }, + { + "epoch": 0.08396306783887535, + "grad_norm": 0.006015659775584936, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 3043 + }, + { + "epoch": 0.08399066003993971, + "grad_norm": 0.0027081884909421206, + "learning_rate": 0.001, + "loss": 0.4108, + "step": 3044 + }, + { + "epoch": 0.08401825224100408, + "grad_norm": 0.002505830954760313, + "learning_rate": 0.001, + "loss": 0.3933, + "step": 3045 + }, + { + "epoch": 0.08404584444206845, + "grad_norm": 0.002315351041033864, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 3046 + }, + { + "epoch": 0.08407343664313281, + "grad_norm": 0.003794366493821144, + "learning_rate": 0.001, + "loss": 0.4072, + "step": 3047 + }, + { + "epoch": 0.08410102884419719, + "grad_norm": 0.006150163244456053, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 3048 + }, + { + "epoch": 0.08412862104526156, + "grad_norm": 0.00300444639287889, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 3049 + }, + { + "epoch": 0.08415621324632593, + "grad_norm": 0.0030877876561135054, + "learning_rate": 0.001, + "loss": 0.4026, + "step": 3050 + }, + { + "epoch": 0.08418380544739029, + "grad_norm": 0.004085875116288662, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 3051 + }, + { + "epoch": 0.08421139764845466, + "grad_norm": 0.0042649428360164165, + "learning_rate": 0.001, + "loss": 0.3727, + "step": 3052 + }, + { + "epoch": 0.08423898984951904, + "grad_norm": 0.002339342376217246, + "learning_rate": 0.001, + "loss": 0.3953, + "step": 3053 + }, + { + "epoch": 0.0842665820505834, + "grad_norm": 0.0029005780816078186, + "learning_rate": 0.001, + "loss": 0.3983, + "step": 3054 + }, + { + "epoch": 0.08429417425164777, + "grad_norm": 0.002606838010251522, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 3055 + }, + { + "epoch": 0.08432176645271214, + "grad_norm": 0.002856641774997115, + "learning_rate": 0.001, + "loss": 0.4205, + "step": 3056 + }, + { + "epoch": 0.0843493586537765, + "grad_norm": 0.021601708605885506, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 3057 + }, + { + "epoch": 0.08437695085484088, + "grad_norm": 0.002950535388663411, + "learning_rate": 0.001, + "loss": 0.4361, + "step": 3058 + }, + { + "epoch": 0.08440454305590525, + "grad_norm": 0.0030242663342505693, + "learning_rate": 0.001, + "loss": 0.3981, + "step": 3059 + }, + { + "epoch": 0.08443213525696962, + "grad_norm": 0.002619365695863962, + "learning_rate": 0.001, + "loss": 0.4011, + "step": 3060 + }, + { + "epoch": 0.08445972745803398, + "grad_norm": 0.0032321936450898647, + "learning_rate": 0.001, + "loss": 0.3978, + "step": 3061 + }, + { + "epoch": 0.08448731965909835, + "grad_norm": 0.0033744669053703547, + "learning_rate": 0.001, + "loss": 0.3652, + "step": 3062 + }, + { + "epoch": 0.08451491186016273, + "grad_norm": 0.0027905574534088373, + "learning_rate": 0.001, + "loss": 0.3748, + "step": 3063 + }, + { + "epoch": 0.0845425040612271, + "grad_norm": 0.0028084227815270424, + "learning_rate": 0.001, + "loss": 0.4155, + "step": 3064 + }, + { + "epoch": 0.08457009626229146, + "grad_norm": 0.0034166318364441395, + "learning_rate": 0.001, + "loss": 0.3733, + "step": 3065 + }, + { + "epoch": 0.08459768846335583, + "grad_norm": 0.0029833640437573195, + "learning_rate": 0.001, + "loss": 0.4231, + "step": 3066 + }, + { + "epoch": 0.0846252806644202, + "grad_norm": 0.002973797731101513, + "learning_rate": 0.001, + "loss": 0.4345, + "step": 3067 + }, + { + "epoch": 0.08465287286548458, + "grad_norm": 0.0027056816034018993, + "learning_rate": 0.001, + "loss": 0.3766, + "step": 3068 + }, + { + "epoch": 0.08468046506654894, + "grad_norm": 0.008972891606390476, + "learning_rate": 0.001, + "loss": 0.4121, + "step": 3069 + }, + { + "epoch": 0.08470805726761331, + "grad_norm": 0.003379418281838298, + "learning_rate": 0.001, + "loss": 0.4089, + "step": 3070 + }, + { + "epoch": 0.08473564946867768, + "grad_norm": 0.004911572206765413, + "learning_rate": 0.001, + "loss": 0.3753, + "step": 3071 + }, + { + "epoch": 0.08476324166974204, + "grad_norm": 0.003271787893027067, + "learning_rate": 0.001, + "loss": 0.4428, + "step": 3072 + }, + { + "epoch": 0.08479083387080642, + "grad_norm": 0.0034387686755508184, + "learning_rate": 0.001, + "loss": 0.3708, + "step": 3073 + }, + { + "epoch": 0.08481842607187079, + "grad_norm": 0.004575818777084351, + "learning_rate": 0.001, + "loss": 0.4035, + "step": 3074 + }, + { + "epoch": 0.08484601827293516, + "grad_norm": 0.040848325937986374, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 3075 + }, + { + "epoch": 0.08487361047399952, + "grad_norm": 0.009725205600261688, + "learning_rate": 0.001, + "loss": 0.3652, + "step": 3076 + }, + { + "epoch": 0.08490120267506389, + "grad_norm": 0.008186236955225468, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 3077 + }, + { + "epoch": 0.08492879487612827, + "grad_norm": 0.0035197827965021133, + "learning_rate": 0.001, + "loss": 0.445, + "step": 3078 + }, + { + "epoch": 0.08495638707719264, + "grad_norm": 0.002530729863792658, + "learning_rate": 0.001, + "loss": 0.3985, + "step": 3079 + }, + { + "epoch": 0.084983979278257, + "grad_norm": 0.002124677412211895, + "learning_rate": 0.001, + "loss": 0.4173, + "step": 3080 + }, + { + "epoch": 0.08501157147932137, + "grad_norm": 0.0033621720504015684, + "learning_rate": 0.001, + "loss": 0.3879, + "step": 3081 + }, + { + "epoch": 0.08503916368038573, + "grad_norm": 0.003055825363844633, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 3082 + }, + { + "epoch": 0.0850667558814501, + "grad_norm": 0.002876532031223178, + "learning_rate": 0.001, + "loss": 0.4144, + "step": 3083 + }, + { + "epoch": 0.08509434808251448, + "grad_norm": 0.0029174459632486105, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 3084 + }, + { + "epoch": 0.08512194028357885, + "grad_norm": 0.002031163312494755, + "learning_rate": 0.001, + "loss": 0.3927, + "step": 3085 + }, + { + "epoch": 0.08514953248464321, + "grad_norm": 0.003064326476305723, + "learning_rate": 0.001, + "loss": 0.3883, + "step": 3086 + }, + { + "epoch": 0.08517712468570758, + "grad_norm": 0.003118072636425495, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 3087 + }, + { + "epoch": 0.08520471688677195, + "grad_norm": 0.0032082500401884317, + "learning_rate": 0.001, + "loss": 0.4125, + "step": 3088 + }, + { + "epoch": 0.08523230908783633, + "grad_norm": 0.005306860897690058, + "learning_rate": 0.001, + "loss": 0.3843, + "step": 3089 + }, + { + "epoch": 0.0852599012889007, + "grad_norm": 0.004819758236408234, + "learning_rate": 0.001, + "loss": 0.4069, + "step": 3090 + }, + { + "epoch": 0.08528749348996506, + "grad_norm": 0.003489856142550707, + "learning_rate": 0.001, + "loss": 0.3821, + "step": 3091 + }, + { + "epoch": 0.08531508569102943, + "grad_norm": 0.002650489332154393, + "learning_rate": 0.001, + "loss": 0.4061, + "step": 3092 + }, + { + "epoch": 0.0853426778920938, + "grad_norm": 0.006152056623250246, + "learning_rate": 0.001, + "loss": 0.3641, + "step": 3093 + }, + { + "epoch": 0.08537027009315817, + "grad_norm": 0.002845000009983778, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 3094 + }, + { + "epoch": 0.08539786229422254, + "grad_norm": 0.012226511724293232, + "learning_rate": 0.001, + "loss": 0.3805, + "step": 3095 + }, + { + "epoch": 0.0854254544952869, + "grad_norm": 0.003854371840134263, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 3096 + }, + { + "epoch": 0.08545304669635127, + "grad_norm": 0.0025693075731396675, + "learning_rate": 0.001, + "loss": 0.413, + "step": 3097 + }, + { + "epoch": 0.08548063889741564, + "grad_norm": 0.006640615873038769, + "learning_rate": 0.001, + "loss": 0.3868, + "step": 3098 + }, + { + "epoch": 0.08550823109848002, + "grad_norm": 0.006392517127096653, + "learning_rate": 0.001, + "loss": 0.396, + "step": 3099 + }, + { + "epoch": 0.08553582329954439, + "grad_norm": 0.004845835268497467, + "learning_rate": 0.001, + "loss": 0.3825, + "step": 3100 + }, + { + "epoch": 0.08556341550060875, + "grad_norm": 0.005125217605382204, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 3101 + }, + { + "epoch": 0.08559100770167312, + "grad_norm": 0.002894837176427245, + "learning_rate": 0.001, + "loss": 0.4292, + "step": 3102 + }, + { + "epoch": 0.08561859990273749, + "grad_norm": 0.0028156936168670654, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 3103 + }, + { + "epoch": 0.08564619210380187, + "grad_norm": 0.0022546479012817144, + "learning_rate": 0.001, + "loss": 0.3995, + "step": 3104 + }, + { + "epoch": 0.08567378430486623, + "grad_norm": 0.0024128523655235767, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 3105 + }, + { + "epoch": 0.0857013765059306, + "grad_norm": 0.0024307817220687866, + "learning_rate": 0.001, + "loss": 0.3972, + "step": 3106 + }, + { + "epoch": 0.08572896870699497, + "grad_norm": 0.0030128727667033672, + "learning_rate": 0.001, + "loss": 0.3963, + "step": 3107 + }, + { + "epoch": 0.08575656090805933, + "grad_norm": 0.00243613263592124, + "learning_rate": 0.001, + "loss": 0.4105, + "step": 3108 + }, + { + "epoch": 0.08578415310912371, + "grad_norm": 0.0021383436396718025, + "learning_rate": 0.001, + "loss": 0.3999, + "step": 3109 + }, + { + "epoch": 0.08581174531018808, + "grad_norm": 0.003478130092844367, + "learning_rate": 0.001, + "loss": 0.4161, + "step": 3110 + }, + { + "epoch": 0.08583933751125244, + "grad_norm": 0.0026314982678741217, + "learning_rate": 0.001, + "loss": 0.4143, + "step": 3111 + }, + { + "epoch": 0.08586692971231681, + "grad_norm": 0.0033548614010214806, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 3112 + }, + { + "epoch": 0.08589452191338118, + "grad_norm": 0.0054088556207716465, + "learning_rate": 0.001, + "loss": 0.3519, + "step": 3113 + }, + { + "epoch": 0.08592211411444556, + "grad_norm": 0.0038994327187538147, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 3114 + }, + { + "epoch": 0.08594970631550992, + "grad_norm": 0.0026842011138796806, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 3115 + }, + { + "epoch": 0.08597729851657429, + "grad_norm": 0.0026092708576470613, + "learning_rate": 0.001, + "loss": 0.431, + "step": 3116 + }, + { + "epoch": 0.08600489071763866, + "grad_norm": 0.004279262386262417, + "learning_rate": 0.001, + "loss": 0.3745, + "step": 3117 + }, + { + "epoch": 0.08603248291870302, + "grad_norm": 0.0036713224835693836, + "learning_rate": 0.001, + "loss": 0.3371, + "step": 3118 + }, + { + "epoch": 0.0860600751197674, + "grad_norm": 0.004069841001182795, + "learning_rate": 0.001, + "loss": 0.3678, + "step": 3119 + }, + { + "epoch": 0.08608766732083177, + "grad_norm": 0.003785800887271762, + "learning_rate": 0.001, + "loss": 0.3646, + "step": 3120 + }, + { + "epoch": 0.08611525952189614, + "grad_norm": 0.003927123267203569, + "learning_rate": 0.001, + "loss": 0.3829, + "step": 3121 + }, + { + "epoch": 0.0861428517229605, + "grad_norm": 0.004813566338270903, + "learning_rate": 0.001, + "loss": 0.4, + "step": 3122 + }, + { + "epoch": 0.08617044392402487, + "grad_norm": 0.002674676012247801, + "learning_rate": 0.001, + "loss": 0.415, + "step": 3123 + }, + { + "epoch": 0.08619803612508925, + "grad_norm": 0.0027380958199501038, + "learning_rate": 0.001, + "loss": 0.4164, + "step": 3124 + }, + { + "epoch": 0.08622562832615362, + "grad_norm": 0.006593447644263506, + "learning_rate": 0.001, + "loss": 0.4127, + "step": 3125 + }, + { + "epoch": 0.08625322052721798, + "grad_norm": 0.005558252800256014, + "learning_rate": 0.001, + "loss": 0.4081, + "step": 3126 + }, + { + "epoch": 0.08628081272828235, + "grad_norm": 0.003584665711969137, + "learning_rate": 0.001, + "loss": 0.3668, + "step": 3127 + }, + { + "epoch": 0.08630840492934672, + "grad_norm": 0.004839813802391291, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 3128 + }, + { + "epoch": 0.08633599713041108, + "grad_norm": 0.0030945283360779285, + "learning_rate": 0.001, + "loss": 0.4139, + "step": 3129 + }, + { + "epoch": 0.08636358933147546, + "grad_norm": 0.005693027749657631, + "learning_rate": 0.001, + "loss": 0.3907, + "step": 3130 + }, + { + "epoch": 0.08639118153253983, + "grad_norm": 0.0034251015167683363, + "learning_rate": 0.001, + "loss": 0.4084, + "step": 3131 + }, + { + "epoch": 0.0864187737336042, + "grad_norm": 0.0030861585400998592, + "learning_rate": 0.001, + "loss": 0.4367, + "step": 3132 + }, + { + "epoch": 0.08644636593466856, + "grad_norm": 0.0027904235757887363, + "learning_rate": 0.001, + "loss": 0.394, + "step": 3133 + }, + { + "epoch": 0.08647395813573293, + "grad_norm": 0.003176966914907098, + "learning_rate": 0.001, + "loss": 0.407, + "step": 3134 + }, + { + "epoch": 0.08650155033679731, + "grad_norm": 0.007988881319761276, + "learning_rate": 0.001, + "loss": 0.3868, + "step": 3135 + }, + { + "epoch": 0.08652914253786168, + "grad_norm": 0.0030314160976558924, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 3136 + }, + { + "epoch": 0.08655673473892604, + "grad_norm": 0.0032279801089316607, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 3137 + }, + { + "epoch": 0.08658432693999041, + "grad_norm": 0.0031009484082460403, + "learning_rate": 0.001, + "loss": 0.3807, + "step": 3138 + }, + { + "epoch": 0.08661191914105477, + "grad_norm": 0.0032069345470517874, + "learning_rate": 0.001, + "loss": 0.3908, + "step": 3139 + }, + { + "epoch": 0.08663951134211915, + "grad_norm": 0.005810491740703583, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 3140 + }, + { + "epoch": 0.08666710354318352, + "grad_norm": 0.003494016360491514, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 3141 + }, + { + "epoch": 0.08669469574424789, + "grad_norm": 0.0038405335508286953, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 3142 + }, + { + "epoch": 0.08672228794531225, + "grad_norm": 0.03096974827349186, + "learning_rate": 0.001, + "loss": 0.4013, + "step": 3143 + }, + { + "epoch": 0.08674988014637662, + "grad_norm": 0.007581043988466263, + "learning_rate": 0.001, + "loss": 0.3751, + "step": 3144 + }, + { + "epoch": 0.086777472347441, + "grad_norm": 0.00279928813688457, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 3145 + }, + { + "epoch": 0.08680506454850537, + "grad_norm": 0.003512721508741379, + "learning_rate": 0.001, + "loss": 0.3581, + "step": 3146 + }, + { + "epoch": 0.08683265674956973, + "grad_norm": 0.0022023352794349194, + "learning_rate": 0.001, + "loss": 0.4173, + "step": 3147 + }, + { + "epoch": 0.0868602489506341, + "grad_norm": 0.0022207568399608135, + "learning_rate": 0.001, + "loss": 0.4223, + "step": 3148 + }, + { + "epoch": 0.08688784115169847, + "grad_norm": 0.003400396555662155, + "learning_rate": 0.001, + "loss": 0.3907, + "step": 3149 + }, + { + "epoch": 0.08691543335276285, + "grad_norm": 0.0031312189530581236, + "learning_rate": 0.001, + "loss": 0.3781, + "step": 3150 + }, + { + "epoch": 0.08694302555382721, + "grad_norm": 0.0022443707566708326, + "learning_rate": 0.001, + "loss": 0.3987, + "step": 3151 + }, + { + "epoch": 0.08697061775489158, + "grad_norm": 0.0031315688975155354, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 3152 + }, + { + "epoch": 0.08699820995595595, + "grad_norm": 0.0029569200705736876, + "learning_rate": 0.001, + "loss": 0.4351, + "step": 3153 + }, + { + "epoch": 0.08702580215702031, + "grad_norm": 0.0045159622095525265, + "learning_rate": 0.001, + "loss": 0.3658, + "step": 3154 + }, + { + "epoch": 0.08705339435808469, + "grad_norm": 0.0024900075513869524, + "learning_rate": 0.001, + "loss": 0.4371, + "step": 3155 + }, + { + "epoch": 0.08708098655914906, + "grad_norm": 0.002761411713436246, + "learning_rate": 0.001, + "loss": 0.3813, + "step": 3156 + }, + { + "epoch": 0.08710857876021343, + "grad_norm": 0.005542066879570484, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 3157 + }, + { + "epoch": 0.08713617096127779, + "grad_norm": 0.0027586561627686024, + "learning_rate": 0.001, + "loss": 0.3794, + "step": 3158 + }, + { + "epoch": 0.08716376316234216, + "grad_norm": 0.0025526464451104403, + "learning_rate": 0.001, + "loss": 0.3836, + "step": 3159 + }, + { + "epoch": 0.08719135536340654, + "grad_norm": 0.0028663338162004948, + "learning_rate": 0.001, + "loss": 0.4001, + "step": 3160 + }, + { + "epoch": 0.0872189475644709, + "grad_norm": 0.0021977245341986418, + "learning_rate": 0.001, + "loss": 0.3703, + "step": 3161 + }, + { + "epoch": 0.08724653976553527, + "grad_norm": 0.003095234278589487, + "learning_rate": 0.001, + "loss": 0.378, + "step": 3162 + }, + { + "epoch": 0.08727413196659964, + "grad_norm": 0.002372263465076685, + "learning_rate": 0.001, + "loss": 0.4313, + "step": 3163 + }, + { + "epoch": 0.087301724167664, + "grad_norm": 0.0028856871649622917, + "learning_rate": 0.001, + "loss": 0.415, + "step": 3164 + }, + { + "epoch": 0.08732931636872839, + "grad_norm": 0.006369113922119141, + "learning_rate": 0.001, + "loss": 0.3955, + "step": 3165 + }, + { + "epoch": 0.08735690856979275, + "grad_norm": 0.00523237232118845, + "learning_rate": 0.001, + "loss": 0.3853, + "step": 3166 + }, + { + "epoch": 0.08738450077085712, + "grad_norm": 0.0037964817602187395, + "learning_rate": 0.001, + "loss": 0.4094, + "step": 3167 + }, + { + "epoch": 0.08741209297192148, + "grad_norm": 0.004853063262999058, + "learning_rate": 0.001, + "loss": 0.3877, + "step": 3168 + }, + { + "epoch": 0.08743968517298585, + "grad_norm": 0.0025882385671138763, + "learning_rate": 0.001, + "loss": 0.3788, + "step": 3169 + }, + { + "epoch": 0.08746727737405023, + "grad_norm": 0.002228371100500226, + "learning_rate": 0.001, + "loss": 0.468, + "step": 3170 + }, + { + "epoch": 0.0874948695751146, + "grad_norm": 0.003534890478476882, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 3171 + }, + { + "epoch": 0.08752246177617896, + "grad_norm": 0.002975932788103819, + "learning_rate": 0.001, + "loss": 0.3763, + "step": 3172 + }, + { + "epoch": 0.08755005397724333, + "grad_norm": 0.0035217402037233114, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 3173 + }, + { + "epoch": 0.0875776461783077, + "grad_norm": 0.002465012250468135, + "learning_rate": 0.001, + "loss": 0.3913, + "step": 3174 + }, + { + "epoch": 0.08760523837937208, + "grad_norm": 0.003945467062294483, + "learning_rate": 0.001, + "loss": 0.391, + "step": 3175 + }, + { + "epoch": 0.08763283058043644, + "grad_norm": 0.003953505773097277, + "learning_rate": 0.001, + "loss": 0.4273, + "step": 3176 + }, + { + "epoch": 0.08766042278150081, + "grad_norm": 0.004156408831477165, + "learning_rate": 0.001, + "loss": 0.403, + "step": 3177 + }, + { + "epoch": 0.08768801498256518, + "grad_norm": 0.0024448104668408632, + "learning_rate": 0.001, + "loss": 0.4369, + "step": 3178 + }, + { + "epoch": 0.08771560718362954, + "grad_norm": 0.0024597158189862967, + "learning_rate": 0.001, + "loss": 0.425, + "step": 3179 + }, + { + "epoch": 0.08774319938469391, + "grad_norm": 0.003507564775645733, + "learning_rate": 0.001, + "loss": 0.364, + "step": 3180 + }, + { + "epoch": 0.08777079158575829, + "grad_norm": 0.002295244485139847, + "learning_rate": 0.001, + "loss": 0.4024, + "step": 3181 + }, + { + "epoch": 0.08779838378682266, + "grad_norm": 0.0022501791827380657, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 3182 + }, + { + "epoch": 0.08782597598788702, + "grad_norm": 0.0027677484322339296, + "learning_rate": 0.001, + "loss": 0.3919, + "step": 3183 + }, + { + "epoch": 0.08785356818895139, + "grad_norm": 0.0029434715397655964, + "learning_rate": 0.001, + "loss": 0.3447, + "step": 3184 + }, + { + "epoch": 0.08788116039001576, + "grad_norm": 0.0024778309743851423, + "learning_rate": 0.001, + "loss": 0.3948, + "step": 3185 + }, + { + "epoch": 0.08790875259108014, + "grad_norm": 0.0023013644386082888, + "learning_rate": 0.001, + "loss": 0.4046, + "step": 3186 + }, + { + "epoch": 0.0879363447921445, + "grad_norm": 0.0019937290344387293, + "learning_rate": 0.001, + "loss": 0.433, + "step": 3187 + }, + { + "epoch": 0.08796393699320887, + "grad_norm": 0.009529836475849152, + "learning_rate": 0.001, + "loss": 0.3898, + "step": 3188 + }, + { + "epoch": 0.08799152919427324, + "grad_norm": 0.0029238457791507244, + "learning_rate": 0.001, + "loss": 0.3719, + "step": 3189 + }, + { + "epoch": 0.0880191213953376, + "grad_norm": 0.0023196239490062, + "learning_rate": 0.001, + "loss": 0.4035, + "step": 3190 + }, + { + "epoch": 0.08804671359640198, + "grad_norm": 0.0024587539955973625, + "learning_rate": 0.001, + "loss": 0.426, + "step": 3191 + }, + { + "epoch": 0.08807430579746635, + "grad_norm": 0.004239838104695082, + "learning_rate": 0.001, + "loss": 0.3774, + "step": 3192 + }, + { + "epoch": 0.08810189799853071, + "grad_norm": 0.003386555938050151, + "learning_rate": 0.001, + "loss": 0.3839, + "step": 3193 + }, + { + "epoch": 0.08812949019959508, + "grad_norm": 0.0025654241908341646, + "learning_rate": 0.001, + "loss": 0.3882, + "step": 3194 + }, + { + "epoch": 0.08815708240065945, + "grad_norm": 0.002837128471583128, + "learning_rate": 0.001, + "loss": 0.4287, + "step": 3195 + }, + { + "epoch": 0.08818467460172383, + "grad_norm": 0.0034069865942001343, + "learning_rate": 0.001, + "loss": 0.3413, + "step": 3196 + }, + { + "epoch": 0.0882122668027882, + "grad_norm": 0.002375151729211211, + "learning_rate": 0.001, + "loss": 0.4089, + "step": 3197 + }, + { + "epoch": 0.08823985900385256, + "grad_norm": 0.003615601221099496, + "learning_rate": 0.001, + "loss": 0.4382, + "step": 3198 + }, + { + "epoch": 0.08826745120491693, + "grad_norm": 0.0048666223883628845, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 3199 + }, + { + "epoch": 0.0882950434059813, + "grad_norm": 0.0025259742978960276, + "learning_rate": 0.001, + "loss": 0.3826, + "step": 3200 + }, + { + "epoch": 0.08832263560704567, + "grad_norm": 0.006654439959675074, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 3201 + }, + { + "epoch": 0.08835022780811004, + "grad_norm": 0.002633353229612112, + "learning_rate": 0.001, + "loss": 0.4206, + "step": 3202 + }, + { + "epoch": 0.08837782000917441, + "grad_norm": 0.0028525053057819605, + "learning_rate": 0.001, + "loss": 0.4091, + "step": 3203 + }, + { + "epoch": 0.08840541221023877, + "grad_norm": 0.0071784635074436665, + "learning_rate": 0.001, + "loss": 0.4055, + "step": 3204 + }, + { + "epoch": 0.08843300441130314, + "grad_norm": 0.0030664210207760334, + "learning_rate": 0.001, + "loss": 0.3847, + "step": 3205 + }, + { + "epoch": 0.08846059661236752, + "grad_norm": 0.0034981323406100273, + "learning_rate": 0.001, + "loss": 0.3855, + "step": 3206 + }, + { + "epoch": 0.08848818881343189, + "grad_norm": 0.006974066607654095, + "learning_rate": 0.001, + "loss": 0.458, + "step": 3207 + }, + { + "epoch": 0.08851578101449625, + "grad_norm": 0.002912199590355158, + "learning_rate": 0.001, + "loss": 0.416, + "step": 3208 + }, + { + "epoch": 0.08854337321556062, + "grad_norm": 0.0036544064059853554, + "learning_rate": 0.001, + "loss": 0.4617, + "step": 3209 + }, + { + "epoch": 0.08857096541662499, + "grad_norm": 0.002682635560631752, + "learning_rate": 0.001, + "loss": 0.3695, + "step": 3210 + }, + { + "epoch": 0.08859855761768937, + "grad_norm": 0.0032552520278841257, + "learning_rate": 0.001, + "loss": 0.4154, + "step": 3211 + }, + { + "epoch": 0.08862614981875373, + "grad_norm": 0.003252149559557438, + "learning_rate": 0.001, + "loss": 0.425, + "step": 3212 + }, + { + "epoch": 0.0886537420198181, + "grad_norm": 0.004062923137098551, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 3213 + }, + { + "epoch": 0.08868133422088247, + "grad_norm": 0.0034245557617396116, + "learning_rate": 0.001, + "loss": 0.3926, + "step": 3214 + }, + { + "epoch": 0.08870892642194683, + "grad_norm": 0.00209795287810266, + "learning_rate": 0.001, + "loss": 0.4181, + "step": 3215 + }, + { + "epoch": 0.08873651862301121, + "grad_norm": 0.002493221778422594, + "learning_rate": 0.001, + "loss": 0.4211, + "step": 3216 + }, + { + "epoch": 0.08876411082407558, + "grad_norm": 0.0032907910645008087, + "learning_rate": 0.001, + "loss": 0.423, + "step": 3217 + }, + { + "epoch": 0.08879170302513995, + "grad_norm": 0.0022613955661654472, + "learning_rate": 0.001, + "loss": 0.406, + "step": 3218 + }, + { + "epoch": 0.08881929522620431, + "grad_norm": 0.0027464418672025204, + "learning_rate": 0.001, + "loss": 0.3534, + "step": 3219 + }, + { + "epoch": 0.08884688742726868, + "grad_norm": 0.0038859571795910597, + "learning_rate": 0.001, + "loss": 0.3634, + "step": 3220 + }, + { + "epoch": 0.08887447962833306, + "grad_norm": 0.002589226933196187, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 3221 + }, + { + "epoch": 0.08890207182939742, + "grad_norm": 0.003947058692574501, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 3222 + }, + { + "epoch": 0.08892966403046179, + "grad_norm": 0.0034198507200926542, + "learning_rate": 0.001, + "loss": 0.3794, + "step": 3223 + }, + { + "epoch": 0.08895725623152616, + "grad_norm": 0.0028758652042597532, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 3224 + }, + { + "epoch": 0.08898484843259052, + "grad_norm": 0.002941399347037077, + "learning_rate": 0.001, + "loss": 0.4374, + "step": 3225 + }, + { + "epoch": 0.08901244063365489, + "grad_norm": 0.0027777596842497587, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 3226 + }, + { + "epoch": 0.08904003283471927, + "grad_norm": 0.00824811402708292, + "learning_rate": 0.001, + "loss": 0.4095, + "step": 3227 + }, + { + "epoch": 0.08906762503578364, + "grad_norm": 0.002942741382867098, + "learning_rate": 0.001, + "loss": 0.4186, + "step": 3228 + }, + { + "epoch": 0.089095217236848, + "grad_norm": 0.0034886416979134083, + "learning_rate": 0.001, + "loss": 0.3866, + "step": 3229 + }, + { + "epoch": 0.08912280943791237, + "grad_norm": 0.0034925418440252542, + "learning_rate": 0.001, + "loss": 0.3918, + "step": 3230 + }, + { + "epoch": 0.08915040163897674, + "grad_norm": 0.0031590645667165518, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 3231 + }, + { + "epoch": 0.08917799384004112, + "grad_norm": 0.00394233874976635, + "learning_rate": 0.001, + "loss": 0.412, + "step": 3232 + }, + { + "epoch": 0.08920558604110548, + "grad_norm": 0.002954542636871338, + "learning_rate": 0.001, + "loss": 0.4271, + "step": 3233 + }, + { + "epoch": 0.08923317824216985, + "grad_norm": 0.0020198922138661146, + "learning_rate": 0.001, + "loss": 0.4055, + "step": 3234 + }, + { + "epoch": 0.08926077044323422, + "grad_norm": 0.0032049540895968676, + "learning_rate": 0.001, + "loss": 0.3885, + "step": 3235 + }, + { + "epoch": 0.08928836264429858, + "grad_norm": 0.0024705370888113976, + "learning_rate": 0.001, + "loss": 0.421, + "step": 3236 + }, + { + "epoch": 0.08931595484536296, + "grad_norm": 0.0023976247757673264, + "learning_rate": 0.001, + "loss": 0.4301, + "step": 3237 + }, + { + "epoch": 0.08934354704642733, + "grad_norm": 0.0025141413789242506, + "learning_rate": 0.001, + "loss": 0.3916, + "step": 3238 + }, + { + "epoch": 0.0893711392474917, + "grad_norm": 0.0034452369436621666, + "learning_rate": 0.001, + "loss": 0.4055, + "step": 3239 + }, + { + "epoch": 0.08939873144855606, + "grad_norm": 0.0036900045815855265, + "learning_rate": 0.001, + "loss": 0.4006, + "step": 3240 + }, + { + "epoch": 0.08942632364962043, + "grad_norm": 0.0030278146732598543, + "learning_rate": 0.001, + "loss": 0.3918, + "step": 3241 + }, + { + "epoch": 0.08945391585068481, + "grad_norm": 0.0027707985136657953, + "learning_rate": 0.001, + "loss": 0.3721, + "step": 3242 + }, + { + "epoch": 0.08948150805174918, + "grad_norm": 0.0024415196385234594, + "learning_rate": 0.001, + "loss": 0.3999, + "step": 3243 + }, + { + "epoch": 0.08950910025281354, + "grad_norm": 0.0054499804973602295, + "learning_rate": 0.001, + "loss": 0.423, + "step": 3244 + }, + { + "epoch": 0.08953669245387791, + "grad_norm": 0.0034079072065651417, + "learning_rate": 0.001, + "loss": 0.3607, + "step": 3245 + }, + { + "epoch": 0.08956428465494227, + "grad_norm": 0.0038858712650835514, + "learning_rate": 0.001, + "loss": 0.3912, + "step": 3246 + }, + { + "epoch": 0.08959187685600666, + "grad_norm": 0.007187449838966131, + "learning_rate": 0.001, + "loss": 0.4164, + "step": 3247 + }, + { + "epoch": 0.08961946905707102, + "grad_norm": 0.0033226367086172104, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 3248 + }, + { + "epoch": 0.08964706125813539, + "grad_norm": 0.0028839895967394114, + "learning_rate": 0.001, + "loss": 0.4179, + "step": 3249 + }, + { + "epoch": 0.08967465345919975, + "grad_norm": 0.0032480424270033836, + "learning_rate": 0.001, + "loss": 0.3933, + "step": 3250 + }, + { + "epoch": 0.08970224566026412, + "grad_norm": 0.004398911260068417, + "learning_rate": 0.001, + "loss": 0.3914, + "step": 3251 + }, + { + "epoch": 0.0897298378613285, + "grad_norm": 0.00363975390791893, + "learning_rate": 0.001, + "loss": 0.4069, + "step": 3252 + }, + { + "epoch": 0.08975743006239287, + "grad_norm": 0.008247625082731247, + "learning_rate": 0.001, + "loss": 0.4085, + "step": 3253 + }, + { + "epoch": 0.08978502226345723, + "grad_norm": 0.003181576495990157, + "learning_rate": 0.001, + "loss": 0.3846, + "step": 3254 + }, + { + "epoch": 0.0898126144645216, + "grad_norm": 0.005014390219002962, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 3255 + }, + { + "epoch": 0.08984020666558597, + "grad_norm": 0.0027002303395420313, + "learning_rate": 0.001, + "loss": 0.3703, + "step": 3256 + }, + { + "epoch": 0.08986779886665035, + "grad_norm": 0.0028025219216942787, + "learning_rate": 0.001, + "loss": 0.4148, + "step": 3257 + }, + { + "epoch": 0.08989539106771471, + "grad_norm": 0.0026610197965055704, + "learning_rate": 0.001, + "loss": 0.3921, + "step": 3258 + }, + { + "epoch": 0.08992298326877908, + "grad_norm": 0.003577177645638585, + "learning_rate": 0.001, + "loss": 0.3467, + "step": 3259 + }, + { + "epoch": 0.08995057546984345, + "grad_norm": 0.0036113469395786524, + "learning_rate": 0.001, + "loss": 0.3812, + "step": 3260 + }, + { + "epoch": 0.08997816767090781, + "grad_norm": 0.0033262385986745358, + "learning_rate": 0.001, + "loss": 0.3851, + "step": 3261 + }, + { + "epoch": 0.0900057598719722, + "grad_norm": 0.0021796945948153734, + "learning_rate": 0.001, + "loss": 0.4053, + "step": 3262 + }, + { + "epoch": 0.09003335207303656, + "grad_norm": 0.002145004691556096, + "learning_rate": 0.001, + "loss": 0.3929, + "step": 3263 + }, + { + "epoch": 0.09006094427410093, + "grad_norm": 0.0034920983016490936, + "learning_rate": 0.001, + "loss": 0.397, + "step": 3264 + }, + { + "epoch": 0.09008853647516529, + "grad_norm": 0.0032197278924286366, + "learning_rate": 0.001, + "loss": 0.3785, + "step": 3265 + }, + { + "epoch": 0.09011612867622966, + "grad_norm": 0.0023128585889935493, + "learning_rate": 0.001, + "loss": 0.4024, + "step": 3266 + }, + { + "epoch": 0.09014372087729404, + "grad_norm": 0.0028581952210515738, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 3267 + }, + { + "epoch": 0.0901713130783584, + "grad_norm": 0.002962828380987048, + "learning_rate": 0.001, + "loss": 0.3842, + "step": 3268 + }, + { + "epoch": 0.09019890527942277, + "grad_norm": 0.004223112482577562, + "learning_rate": 0.001, + "loss": 0.4086, + "step": 3269 + }, + { + "epoch": 0.09022649748048714, + "grad_norm": 0.02053755894303322, + "learning_rate": 0.001, + "loss": 0.3863, + "step": 3270 + }, + { + "epoch": 0.0902540896815515, + "grad_norm": 0.0043358695693314075, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 3271 + }, + { + "epoch": 0.09028168188261587, + "grad_norm": 0.003768104827031493, + "learning_rate": 0.001, + "loss": 0.3758, + "step": 3272 + }, + { + "epoch": 0.09030927408368025, + "grad_norm": 0.0026477002538740635, + "learning_rate": 0.001, + "loss": 0.3851, + "step": 3273 + }, + { + "epoch": 0.09033686628474462, + "grad_norm": 0.003647193778306246, + "learning_rate": 0.001, + "loss": 0.3826, + "step": 3274 + }, + { + "epoch": 0.09036445848580898, + "grad_norm": 0.002453665481880307, + "learning_rate": 0.001, + "loss": 0.407, + "step": 3275 + }, + { + "epoch": 0.09039205068687335, + "grad_norm": 0.002882964676246047, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 3276 + }, + { + "epoch": 0.09041964288793772, + "grad_norm": 0.00521261477842927, + "learning_rate": 0.001, + "loss": 0.4009, + "step": 3277 + }, + { + "epoch": 0.0904472350890021, + "grad_norm": 0.002587896538898349, + "learning_rate": 0.001, + "loss": 0.4291, + "step": 3278 + }, + { + "epoch": 0.09047482729006646, + "grad_norm": 0.002533156191930175, + "learning_rate": 0.001, + "loss": 0.3831, + "step": 3279 + }, + { + "epoch": 0.09050241949113083, + "grad_norm": 0.0027842391282320023, + "learning_rate": 0.001, + "loss": 0.3854, + "step": 3280 + }, + { + "epoch": 0.0905300116921952, + "grad_norm": 0.0023463580291718245, + "learning_rate": 0.001, + "loss": 0.4197, + "step": 3281 + }, + { + "epoch": 0.09055760389325956, + "grad_norm": 0.0023778725881129503, + "learning_rate": 0.001, + "loss": 0.3927, + "step": 3282 + }, + { + "epoch": 0.09058519609432394, + "grad_norm": 0.0035691028460860252, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 3283 + }, + { + "epoch": 0.09061278829538831, + "grad_norm": 0.0037309350445866585, + "learning_rate": 0.001, + "loss": 0.3752, + "step": 3284 + }, + { + "epoch": 0.09064038049645268, + "grad_norm": 0.002625760156661272, + "learning_rate": 0.001, + "loss": 0.4123, + "step": 3285 + }, + { + "epoch": 0.09066797269751704, + "grad_norm": 0.003352556610479951, + "learning_rate": 0.001, + "loss": 0.4268, + "step": 3286 + }, + { + "epoch": 0.09069556489858141, + "grad_norm": 0.0050033205188810825, + "learning_rate": 0.001, + "loss": 0.3944, + "step": 3287 + }, + { + "epoch": 0.09072315709964579, + "grad_norm": 0.002488895785063505, + "learning_rate": 0.001, + "loss": 0.4261, + "step": 3288 + }, + { + "epoch": 0.09075074930071016, + "grad_norm": 0.0031474691350013018, + "learning_rate": 0.001, + "loss": 0.3763, + "step": 3289 + }, + { + "epoch": 0.09077834150177452, + "grad_norm": 0.002337649930268526, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 3290 + }, + { + "epoch": 0.09080593370283889, + "grad_norm": 0.0034202388487756252, + "learning_rate": 0.001, + "loss": 0.4156, + "step": 3291 + }, + { + "epoch": 0.09083352590390326, + "grad_norm": 0.004500823561102152, + "learning_rate": 0.001, + "loss": 0.3663, + "step": 3292 + }, + { + "epoch": 0.09086111810496764, + "grad_norm": 0.0026400513015687466, + "learning_rate": 0.001, + "loss": 0.4163, + "step": 3293 + }, + { + "epoch": 0.090888710306032, + "grad_norm": 0.002835595514625311, + "learning_rate": 0.001, + "loss": 0.388, + "step": 3294 + }, + { + "epoch": 0.09091630250709637, + "grad_norm": 0.0021484890021383762, + "learning_rate": 0.001, + "loss": 0.3625, + "step": 3295 + }, + { + "epoch": 0.09094389470816074, + "grad_norm": 0.002589087001979351, + "learning_rate": 0.001, + "loss": 0.4258, + "step": 3296 + }, + { + "epoch": 0.0909714869092251, + "grad_norm": 0.004091163631528616, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 3297 + }, + { + "epoch": 0.09099907911028948, + "grad_norm": 0.002472213003784418, + "learning_rate": 0.001, + "loss": 0.4512, + "step": 3298 + }, + { + "epoch": 0.09102667131135385, + "grad_norm": 0.0025115078315138817, + "learning_rate": 0.001, + "loss": 0.4038, + "step": 3299 + }, + { + "epoch": 0.09105426351241822, + "grad_norm": 0.002945608925074339, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 3300 + }, + { + "epoch": 0.09108185571348258, + "grad_norm": 0.0024975589476525784, + "learning_rate": 0.001, + "loss": 0.357, + "step": 3301 + }, + { + "epoch": 0.09110944791454695, + "grad_norm": 0.003322755452245474, + "learning_rate": 0.001, + "loss": 0.4099, + "step": 3302 + }, + { + "epoch": 0.09113704011561133, + "grad_norm": 0.003869826439768076, + "learning_rate": 0.001, + "loss": 0.3477, + "step": 3303 + }, + { + "epoch": 0.0911646323166757, + "grad_norm": 0.0036806685384362936, + "learning_rate": 0.001, + "loss": 0.4237, + "step": 3304 + }, + { + "epoch": 0.09119222451774006, + "grad_norm": 0.002534373663365841, + "learning_rate": 0.001, + "loss": 0.4156, + "step": 3305 + }, + { + "epoch": 0.09121981671880443, + "grad_norm": 0.002670099725946784, + "learning_rate": 0.001, + "loss": 0.4474, + "step": 3306 + }, + { + "epoch": 0.0912474089198688, + "grad_norm": 0.0026154613588005304, + "learning_rate": 0.001, + "loss": 0.3922, + "step": 3307 + }, + { + "epoch": 0.09127500112093317, + "grad_norm": 0.002783595584332943, + "learning_rate": 0.001, + "loss": 0.3937, + "step": 3308 + }, + { + "epoch": 0.09130259332199754, + "grad_norm": 0.0024160996545106173, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 3309 + }, + { + "epoch": 0.09133018552306191, + "grad_norm": 0.0040392628870904446, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 3310 + }, + { + "epoch": 0.09135777772412627, + "grad_norm": 0.0036512308288365602, + "learning_rate": 0.001, + "loss": 0.3794, + "step": 3311 + }, + { + "epoch": 0.09138536992519064, + "grad_norm": 0.0026296162977814674, + "learning_rate": 0.001, + "loss": 0.3898, + "step": 3312 + }, + { + "epoch": 0.09141296212625502, + "grad_norm": 0.002523603616282344, + "learning_rate": 0.001, + "loss": 0.4242, + "step": 3313 + }, + { + "epoch": 0.09144055432731939, + "grad_norm": 0.0027449633926153183, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 3314 + }, + { + "epoch": 0.09146814652838375, + "grad_norm": 0.004635502118617296, + "learning_rate": 0.001, + "loss": 0.3953, + "step": 3315 + }, + { + "epoch": 0.09149573872944812, + "grad_norm": 0.0026435446925461292, + "learning_rate": 0.001, + "loss": 0.4115, + "step": 3316 + }, + { + "epoch": 0.09152333093051249, + "grad_norm": 0.012709138914942741, + "learning_rate": 0.001, + "loss": 0.3866, + "step": 3317 + }, + { + "epoch": 0.09155092313157685, + "grad_norm": 0.0032850292045623064, + "learning_rate": 0.001, + "loss": 0.4388, + "step": 3318 + }, + { + "epoch": 0.09157851533264123, + "grad_norm": 0.0038955537602305412, + "learning_rate": 0.001, + "loss": 0.3689, + "step": 3319 + }, + { + "epoch": 0.0916061075337056, + "grad_norm": 0.004457899369299412, + "learning_rate": 0.001, + "loss": 0.4155, + "step": 3320 + }, + { + "epoch": 0.09163369973476997, + "grad_norm": 0.002992943860590458, + "learning_rate": 0.001, + "loss": 0.4278, + "step": 3321 + }, + { + "epoch": 0.09166129193583433, + "grad_norm": 0.004083162639290094, + "learning_rate": 0.001, + "loss": 0.4232, + "step": 3322 + }, + { + "epoch": 0.0916888841368987, + "grad_norm": 0.0032160687260329723, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 3323 + }, + { + "epoch": 0.09171647633796308, + "grad_norm": 0.003969733603298664, + "learning_rate": 0.001, + "loss": 0.4146, + "step": 3324 + }, + { + "epoch": 0.09174406853902745, + "grad_norm": 0.004920699633657932, + "learning_rate": 0.001, + "loss": 0.3879, + "step": 3325 + }, + { + "epoch": 0.09177166074009181, + "grad_norm": 0.005621886812150478, + "learning_rate": 0.001, + "loss": 0.4114, + "step": 3326 + }, + { + "epoch": 0.09179925294115618, + "grad_norm": 0.015430702827870846, + "learning_rate": 0.001, + "loss": 0.389, + "step": 3327 + }, + { + "epoch": 0.09182684514222055, + "grad_norm": 0.004751747474074364, + "learning_rate": 0.001, + "loss": 0.3876, + "step": 3328 + }, + { + "epoch": 0.09185443734328493, + "grad_norm": 0.0023684531915932894, + "learning_rate": 0.001, + "loss": 0.3866, + "step": 3329 + }, + { + "epoch": 0.09188202954434929, + "grad_norm": 0.004853997845202684, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 3330 + }, + { + "epoch": 0.09190962174541366, + "grad_norm": 0.005576598923653364, + "learning_rate": 0.001, + "loss": 0.4054, + "step": 3331 + }, + { + "epoch": 0.09193721394647802, + "grad_norm": 0.0027397077064961195, + "learning_rate": 0.001, + "loss": 0.415, + "step": 3332 + }, + { + "epoch": 0.09196480614754239, + "grad_norm": 0.0026846746914088726, + "learning_rate": 0.001, + "loss": 0.3813, + "step": 3333 + }, + { + "epoch": 0.09199239834860677, + "grad_norm": 0.0023833108134567738, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 3334 + }, + { + "epoch": 0.09201999054967114, + "grad_norm": 0.002954500960186124, + "learning_rate": 0.001, + "loss": 0.4484, + "step": 3335 + }, + { + "epoch": 0.0920475827507355, + "grad_norm": 0.0023365288507193327, + "learning_rate": 0.001, + "loss": 0.391, + "step": 3336 + }, + { + "epoch": 0.09207517495179987, + "grad_norm": 0.0031199888326227665, + "learning_rate": 0.001, + "loss": 0.4011, + "step": 3337 + }, + { + "epoch": 0.09210276715286424, + "grad_norm": 0.00201621581800282, + "learning_rate": 0.001, + "loss": 0.3766, + "step": 3338 + }, + { + "epoch": 0.09213035935392862, + "grad_norm": 0.0036083683371543884, + "learning_rate": 0.001, + "loss": 0.3851, + "step": 3339 + }, + { + "epoch": 0.09215795155499298, + "grad_norm": 0.0037763137370347977, + "learning_rate": 0.001, + "loss": 0.3818, + "step": 3340 + }, + { + "epoch": 0.09218554375605735, + "grad_norm": 0.002573802135884762, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 3341 + }, + { + "epoch": 0.09221313595712172, + "grad_norm": 0.0032212398946285248, + "learning_rate": 0.001, + "loss": 0.3777, + "step": 3342 + }, + { + "epoch": 0.09224072815818608, + "grad_norm": 0.002471911022439599, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 3343 + }, + { + "epoch": 0.09226832035925046, + "grad_norm": 0.0030415072105824947, + "learning_rate": 0.001, + "loss": 0.4206, + "step": 3344 + }, + { + "epoch": 0.09229591256031483, + "grad_norm": 0.0027574030682444572, + "learning_rate": 0.001, + "loss": 0.3627, + "step": 3345 + }, + { + "epoch": 0.0923235047613792, + "grad_norm": 0.0020206805784255266, + "learning_rate": 0.001, + "loss": 0.4212, + "step": 3346 + }, + { + "epoch": 0.09235109696244356, + "grad_norm": 0.002503051655367017, + "learning_rate": 0.001, + "loss": 0.3972, + "step": 3347 + }, + { + "epoch": 0.09237868916350793, + "grad_norm": 0.00392269529402256, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 3348 + }, + { + "epoch": 0.09240628136457231, + "grad_norm": 0.0035916061606258154, + "learning_rate": 0.001, + "loss": 0.4151, + "step": 3349 + }, + { + "epoch": 0.09243387356563668, + "grad_norm": 0.003222295781597495, + "learning_rate": 0.001, + "loss": 0.4087, + "step": 3350 + }, + { + "epoch": 0.09246146576670104, + "grad_norm": 0.003206141758710146, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 3351 + }, + { + "epoch": 0.09248905796776541, + "grad_norm": 0.00480427872389555, + "learning_rate": 0.001, + "loss": 0.398, + "step": 3352 + }, + { + "epoch": 0.09251665016882978, + "grad_norm": 0.0026392394211143255, + "learning_rate": 0.001, + "loss": 0.4142, + "step": 3353 + }, + { + "epoch": 0.09254424236989416, + "grad_norm": 0.003221785882487893, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 3354 + }, + { + "epoch": 0.09257183457095852, + "grad_norm": 0.0030678475741297007, + "learning_rate": 0.001, + "loss": 0.4267, + "step": 3355 + }, + { + "epoch": 0.09259942677202289, + "grad_norm": 0.0028969766572117805, + "learning_rate": 0.001, + "loss": 0.4222, + "step": 3356 + }, + { + "epoch": 0.09262701897308726, + "grad_norm": 0.00338752125389874, + "learning_rate": 0.001, + "loss": 0.4088, + "step": 3357 + }, + { + "epoch": 0.09265461117415162, + "grad_norm": 0.003165165428072214, + "learning_rate": 0.001, + "loss": 0.4127, + "step": 3358 + }, + { + "epoch": 0.092682203375216, + "grad_norm": 0.002541585825383663, + "learning_rate": 0.001, + "loss": 0.4281, + "step": 3359 + }, + { + "epoch": 0.09270979557628037, + "grad_norm": 0.0037303478457033634, + "learning_rate": 0.001, + "loss": 0.4178, + "step": 3360 + }, + { + "epoch": 0.09273738777734473, + "grad_norm": 0.006375085562467575, + "learning_rate": 0.001, + "loss": 0.3953, + "step": 3361 + }, + { + "epoch": 0.0927649799784091, + "grad_norm": 0.023185908794403076, + "learning_rate": 0.001, + "loss": 0.3785, + "step": 3362 + }, + { + "epoch": 0.09279257217947347, + "grad_norm": 0.003924873657524586, + "learning_rate": 0.001, + "loss": 0.39, + "step": 3363 + }, + { + "epoch": 0.09282016438053785, + "grad_norm": 0.0037961790803819895, + "learning_rate": 0.001, + "loss": 0.4434, + "step": 3364 + }, + { + "epoch": 0.09284775658160221, + "grad_norm": 0.005324463825672865, + "learning_rate": 0.001, + "loss": 0.3622, + "step": 3365 + }, + { + "epoch": 0.09287534878266658, + "grad_norm": 0.0031803487800061703, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 3366 + }, + { + "epoch": 0.09290294098373095, + "grad_norm": 0.003402640810236335, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 3367 + }, + { + "epoch": 0.09293053318479531, + "grad_norm": 0.003059846581891179, + "learning_rate": 0.001, + "loss": 0.3749, + "step": 3368 + }, + { + "epoch": 0.09295812538585968, + "grad_norm": 0.002190982224419713, + "learning_rate": 0.001, + "loss": 0.4093, + "step": 3369 + }, + { + "epoch": 0.09298571758692406, + "grad_norm": 0.007336355280131102, + "learning_rate": 0.001, + "loss": 0.4231, + "step": 3370 + }, + { + "epoch": 0.09301330978798843, + "grad_norm": 0.0030043197330087423, + "learning_rate": 0.001, + "loss": 0.4232, + "step": 3371 + }, + { + "epoch": 0.0930409019890528, + "grad_norm": 0.00424328725785017, + "learning_rate": 0.001, + "loss": 0.3894, + "step": 3372 + }, + { + "epoch": 0.09306849419011716, + "grad_norm": 0.0036531679797917604, + "learning_rate": 0.001, + "loss": 0.3866, + "step": 3373 + }, + { + "epoch": 0.09309608639118153, + "grad_norm": 0.0029019531793892384, + "learning_rate": 0.001, + "loss": 0.4291, + "step": 3374 + }, + { + "epoch": 0.0931236785922459, + "grad_norm": 0.002536515239626169, + "learning_rate": 0.001, + "loss": 0.41, + "step": 3375 + }, + { + "epoch": 0.09315127079331027, + "grad_norm": 0.003373499261215329, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 3376 + }, + { + "epoch": 0.09317886299437464, + "grad_norm": 0.0034982135985046625, + "learning_rate": 0.001, + "loss": 0.429, + "step": 3377 + }, + { + "epoch": 0.093206455195439, + "grad_norm": 0.005799585022032261, + "learning_rate": 0.001, + "loss": 0.4309, + "step": 3378 + }, + { + "epoch": 0.09323404739650337, + "grad_norm": 0.002723401878029108, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 3379 + }, + { + "epoch": 0.09326163959756775, + "grad_norm": 0.002761744661256671, + "learning_rate": 0.001, + "loss": 0.4301, + "step": 3380 + }, + { + "epoch": 0.09328923179863212, + "grad_norm": 0.0033628942910581827, + "learning_rate": 0.001, + "loss": 0.3764, + "step": 3381 + }, + { + "epoch": 0.09331682399969649, + "grad_norm": 0.004340017680078745, + "learning_rate": 0.001, + "loss": 0.3864, + "step": 3382 + }, + { + "epoch": 0.09334441620076085, + "grad_norm": 0.00276694493368268, + "learning_rate": 0.001, + "loss": 0.4233, + "step": 3383 + }, + { + "epoch": 0.09337200840182522, + "grad_norm": 0.0040064319036901, + "learning_rate": 0.001, + "loss": 0.4161, + "step": 3384 + }, + { + "epoch": 0.0933996006028896, + "grad_norm": 0.004304267466068268, + "learning_rate": 0.001, + "loss": 0.3773, + "step": 3385 + }, + { + "epoch": 0.09342719280395397, + "grad_norm": 0.0043169171549379826, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 3386 + }, + { + "epoch": 0.09345478500501833, + "grad_norm": 0.0029776948504149914, + "learning_rate": 0.001, + "loss": 0.3993, + "step": 3387 + }, + { + "epoch": 0.0934823772060827, + "grad_norm": 0.0030496844556182623, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 3388 + }, + { + "epoch": 0.09350996940714706, + "grad_norm": 0.002344959881156683, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 3389 + }, + { + "epoch": 0.09353756160821144, + "grad_norm": 0.0035598163958638906, + "learning_rate": 0.001, + "loss": 0.3724, + "step": 3390 + }, + { + "epoch": 0.09356515380927581, + "grad_norm": 0.002589760348200798, + "learning_rate": 0.001, + "loss": 0.3717, + "step": 3391 + }, + { + "epoch": 0.09359274601034018, + "grad_norm": 0.003679790301248431, + "learning_rate": 0.001, + "loss": 0.3774, + "step": 3392 + }, + { + "epoch": 0.09362033821140454, + "grad_norm": 0.0021475160028785467, + "learning_rate": 0.001, + "loss": 0.4379, + "step": 3393 + }, + { + "epoch": 0.09364793041246891, + "grad_norm": 0.0025992344599217176, + "learning_rate": 0.001, + "loss": 0.3772, + "step": 3394 + }, + { + "epoch": 0.09367552261353329, + "grad_norm": 0.003233823226764798, + "learning_rate": 0.001, + "loss": 0.3825, + "step": 3395 + }, + { + "epoch": 0.09370311481459766, + "grad_norm": 0.003669161582365632, + "learning_rate": 0.001, + "loss": 0.4085, + "step": 3396 + }, + { + "epoch": 0.09373070701566202, + "grad_norm": 0.0025238466914743185, + "learning_rate": 0.001, + "loss": 0.4148, + "step": 3397 + }, + { + "epoch": 0.09375829921672639, + "grad_norm": 0.0033630593679845333, + "learning_rate": 0.001, + "loss": 0.3781, + "step": 3398 + }, + { + "epoch": 0.09378589141779076, + "grad_norm": 0.003087608842179179, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 3399 + }, + { + "epoch": 0.09381348361885514, + "grad_norm": 0.003995342645794153, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 3400 + }, + { + "epoch": 0.0938410758199195, + "grad_norm": 0.0032711985986679792, + "learning_rate": 0.001, + "loss": 0.3869, + "step": 3401 + }, + { + "epoch": 0.09386866802098387, + "grad_norm": 0.003242811420932412, + "learning_rate": 0.001, + "loss": 0.4031, + "step": 3402 + }, + { + "epoch": 0.09389626022204824, + "grad_norm": 0.003735556034371257, + "learning_rate": 0.001, + "loss": 0.3986, + "step": 3403 + }, + { + "epoch": 0.0939238524231126, + "grad_norm": 0.005183308385312557, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 3404 + }, + { + "epoch": 0.09395144462417698, + "grad_norm": 0.0034695270005613565, + "learning_rate": 0.001, + "loss": 0.3867, + "step": 3405 + }, + { + "epoch": 0.09397903682524135, + "grad_norm": 0.002555908402428031, + "learning_rate": 0.001, + "loss": 0.3941, + "step": 3406 + }, + { + "epoch": 0.09400662902630572, + "grad_norm": 0.0033101667650043964, + "learning_rate": 0.001, + "loss": 0.4072, + "step": 3407 + }, + { + "epoch": 0.09403422122737008, + "grad_norm": 0.0035219481214880943, + "learning_rate": 0.001, + "loss": 0.4187, + "step": 3408 + }, + { + "epoch": 0.09406181342843445, + "grad_norm": 0.002841345267370343, + "learning_rate": 0.001, + "loss": 0.4292, + "step": 3409 + }, + { + "epoch": 0.09408940562949883, + "grad_norm": 0.0024063391610980034, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 3410 + }, + { + "epoch": 0.0941169978305632, + "grad_norm": 0.004291092045605183, + "learning_rate": 0.001, + "loss": 0.3888, + "step": 3411 + }, + { + "epoch": 0.09414459003162756, + "grad_norm": 0.00434610852971673, + "learning_rate": 0.001, + "loss": 0.4221, + "step": 3412 + }, + { + "epoch": 0.09417218223269193, + "grad_norm": 0.002690440509468317, + "learning_rate": 0.001, + "loss": 0.461, + "step": 3413 + }, + { + "epoch": 0.0941997744337563, + "grad_norm": 0.004084399435669184, + "learning_rate": 0.001, + "loss": 0.3907, + "step": 3414 + }, + { + "epoch": 0.09422736663482066, + "grad_norm": 0.003913892433047295, + "learning_rate": 0.001, + "loss": 0.4183, + "step": 3415 + }, + { + "epoch": 0.09425495883588504, + "grad_norm": 0.002662160899490118, + "learning_rate": 0.001, + "loss": 0.4368, + "step": 3416 + }, + { + "epoch": 0.09428255103694941, + "grad_norm": 0.0030965355690568686, + "learning_rate": 0.001, + "loss": 0.4074, + "step": 3417 + }, + { + "epoch": 0.09431014323801377, + "grad_norm": 0.00348603050224483, + "learning_rate": 0.001, + "loss": 0.4088, + "step": 3418 + }, + { + "epoch": 0.09433773543907814, + "grad_norm": 0.002485208213329315, + "learning_rate": 0.001, + "loss": 0.3941, + "step": 3419 + }, + { + "epoch": 0.09436532764014251, + "grad_norm": 0.0029183162841945887, + "learning_rate": 0.001, + "loss": 0.3522, + "step": 3420 + }, + { + "epoch": 0.09439291984120689, + "grad_norm": 0.0036348486319184303, + "learning_rate": 0.001, + "loss": 0.3738, + "step": 3421 + }, + { + "epoch": 0.09442051204227125, + "grad_norm": 0.03472241014242172, + "learning_rate": 0.001, + "loss": 0.4118, + "step": 3422 + }, + { + "epoch": 0.09444810424333562, + "grad_norm": 0.0032440361101180315, + "learning_rate": 0.001, + "loss": 0.3886, + "step": 3423 + }, + { + "epoch": 0.09447569644439999, + "grad_norm": 0.0023810886777937412, + "learning_rate": 0.001, + "loss": 0.4417, + "step": 3424 + }, + { + "epoch": 0.09450328864546435, + "grad_norm": 0.0022793947719037533, + "learning_rate": 0.001, + "loss": 0.4211, + "step": 3425 + }, + { + "epoch": 0.09453088084652873, + "grad_norm": 0.0028985291719436646, + "learning_rate": 0.001, + "loss": 0.413, + "step": 3426 + }, + { + "epoch": 0.0945584730475931, + "grad_norm": 0.00440243910998106, + "learning_rate": 0.001, + "loss": 0.4098, + "step": 3427 + }, + { + "epoch": 0.09458606524865747, + "grad_norm": 0.003319642972201109, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 3428 + }, + { + "epoch": 0.09461365744972183, + "grad_norm": 0.016772592440247536, + "learning_rate": 0.001, + "loss": 0.4109, + "step": 3429 + }, + { + "epoch": 0.0946412496507862, + "grad_norm": 0.005076760891824961, + "learning_rate": 0.001, + "loss": 0.3902, + "step": 3430 + }, + { + "epoch": 0.09466884185185058, + "grad_norm": 0.005572374444454908, + "learning_rate": 0.001, + "loss": 0.3829, + "step": 3431 + }, + { + "epoch": 0.09469643405291495, + "grad_norm": 0.004039878491312265, + "learning_rate": 0.001, + "loss": 0.3811, + "step": 3432 + }, + { + "epoch": 0.09472402625397931, + "grad_norm": 0.0031457028817385435, + "learning_rate": 0.001, + "loss": 0.4291, + "step": 3433 + }, + { + "epoch": 0.09475161845504368, + "grad_norm": 0.0031064285431057215, + "learning_rate": 0.001, + "loss": 0.3624, + "step": 3434 + }, + { + "epoch": 0.09477921065610805, + "grad_norm": 0.00337353372015059, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 3435 + }, + { + "epoch": 0.09480680285717243, + "grad_norm": 0.0029832145664840937, + "learning_rate": 0.001, + "loss": 0.4154, + "step": 3436 + }, + { + "epoch": 0.09483439505823679, + "grad_norm": 0.0031751911155879498, + "learning_rate": 0.001, + "loss": 0.3812, + "step": 3437 + }, + { + "epoch": 0.09486198725930116, + "grad_norm": 0.056237224489450455, + "learning_rate": 0.001, + "loss": 0.4069, + "step": 3438 + }, + { + "epoch": 0.09488957946036553, + "grad_norm": 0.010076162405312061, + "learning_rate": 0.001, + "loss": 0.3568, + "step": 3439 + }, + { + "epoch": 0.09491717166142989, + "grad_norm": 0.0027662264183163643, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 3440 + }, + { + "epoch": 0.09494476386249427, + "grad_norm": 0.0028696192894130945, + "learning_rate": 0.001, + "loss": 0.3689, + "step": 3441 + }, + { + "epoch": 0.09497235606355864, + "grad_norm": 0.0029843011870980263, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 3442 + }, + { + "epoch": 0.094999948264623, + "grad_norm": 0.0037254204507917166, + "learning_rate": 0.001, + "loss": 0.4055, + "step": 3443 + }, + { + "epoch": 0.09502754046568737, + "grad_norm": 0.0030714944005012512, + "learning_rate": 0.001, + "loss": 0.4093, + "step": 3444 + }, + { + "epoch": 0.09505513266675174, + "grad_norm": 0.0032339338213205338, + "learning_rate": 0.001, + "loss": 0.3934, + "step": 3445 + }, + { + "epoch": 0.09508272486781612, + "grad_norm": 0.002931388793513179, + "learning_rate": 0.001, + "loss": 0.3719, + "step": 3446 + }, + { + "epoch": 0.09511031706888048, + "grad_norm": 0.0032540869433432817, + "learning_rate": 0.001, + "loss": 0.4122, + "step": 3447 + }, + { + "epoch": 0.09513790926994485, + "grad_norm": 0.003339814953505993, + "learning_rate": 0.001, + "loss": 0.3725, + "step": 3448 + }, + { + "epoch": 0.09516550147100922, + "grad_norm": 0.0028484398499131203, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 3449 + }, + { + "epoch": 0.09519309367207358, + "grad_norm": 0.0028640010859817266, + "learning_rate": 0.001, + "loss": 0.3643, + "step": 3450 + }, + { + "epoch": 0.09522068587313796, + "grad_norm": 0.003085035365074873, + "learning_rate": 0.001, + "loss": 0.4297, + "step": 3451 + }, + { + "epoch": 0.09524827807420233, + "grad_norm": 0.0024918443523347378, + "learning_rate": 0.001, + "loss": 0.3792, + "step": 3452 + }, + { + "epoch": 0.0952758702752667, + "grad_norm": 0.0036890755873173475, + "learning_rate": 0.001, + "loss": 0.4053, + "step": 3453 + }, + { + "epoch": 0.09530346247633106, + "grad_norm": 0.002531822072342038, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 3454 + }, + { + "epoch": 0.09533105467739543, + "grad_norm": 0.00243132165633142, + "learning_rate": 0.001, + "loss": 0.4177, + "step": 3455 + }, + { + "epoch": 0.09535864687845981, + "grad_norm": 0.0061992863193154335, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 3456 + }, + { + "epoch": 0.09538623907952418, + "grad_norm": 0.0030754937324672937, + "learning_rate": 0.001, + "loss": 0.3879, + "step": 3457 + }, + { + "epoch": 0.09541383128058854, + "grad_norm": 0.0024158579763025045, + "learning_rate": 0.001, + "loss": 0.393, + "step": 3458 + }, + { + "epoch": 0.09544142348165291, + "grad_norm": 0.002696392824873328, + "learning_rate": 0.001, + "loss": 0.3766, + "step": 3459 + }, + { + "epoch": 0.09546901568271728, + "grad_norm": 0.0024891409557312727, + "learning_rate": 0.001, + "loss": 0.3993, + "step": 3460 + }, + { + "epoch": 0.09549660788378164, + "grad_norm": 0.004480718169361353, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 3461 + }, + { + "epoch": 0.09552420008484602, + "grad_norm": 0.0028512163553386927, + "learning_rate": 0.001, + "loss": 0.3774, + "step": 3462 + }, + { + "epoch": 0.09555179228591039, + "grad_norm": 0.003078205045312643, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 3463 + }, + { + "epoch": 0.09557938448697476, + "grad_norm": 0.004663311876356602, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 3464 + }, + { + "epoch": 0.09560697668803912, + "grad_norm": 0.002411546418443322, + "learning_rate": 0.001, + "loss": 0.3804, + "step": 3465 + }, + { + "epoch": 0.09563456888910349, + "grad_norm": 0.0031327796168625355, + "learning_rate": 0.001, + "loss": 0.4213, + "step": 3466 + }, + { + "epoch": 0.09566216109016787, + "grad_norm": 0.0027790337335318327, + "learning_rate": 0.001, + "loss": 0.4219, + "step": 3467 + }, + { + "epoch": 0.09568975329123224, + "grad_norm": 0.0028543812222778797, + "learning_rate": 0.001, + "loss": 0.4288, + "step": 3468 + }, + { + "epoch": 0.0957173454922966, + "grad_norm": 0.003498122561722994, + "learning_rate": 0.001, + "loss": 0.3993, + "step": 3469 + }, + { + "epoch": 0.09574493769336097, + "grad_norm": 0.00663920259103179, + "learning_rate": 0.001, + "loss": 0.4401, + "step": 3470 + }, + { + "epoch": 0.09577252989442533, + "grad_norm": 0.0032984658610075712, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 3471 + }, + { + "epoch": 0.09580012209548971, + "grad_norm": 0.0030672100838273764, + "learning_rate": 0.001, + "loss": 0.411, + "step": 3472 + }, + { + "epoch": 0.09582771429655408, + "grad_norm": 0.003116041421890259, + "learning_rate": 0.001, + "loss": 0.4194, + "step": 3473 + }, + { + "epoch": 0.09585530649761845, + "grad_norm": 0.0065819453448057175, + "learning_rate": 0.001, + "loss": 0.4178, + "step": 3474 + }, + { + "epoch": 0.09588289869868281, + "grad_norm": 0.015010586008429527, + "learning_rate": 0.001, + "loss": 0.4313, + "step": 3475 + }, + { + "epoch": 0.09591049089974718, + "grad_norm": 0.008877400308847427, + "learning_rate": 0.001, + "loss": 0.4106, + "step": 3476 + }, + { + "epoch": 0.09593808310081156, + "grad_norm": 0.004334705416113138, + "learning_rate": 0.001, + "loss": 0.3934, + "step": 3477 + }, + { + "epoch": 0.09596567530187593, + "grad_norm": 0.0028876853175461292, + "learning_rate": 0.001, + "loss": 0.3759, + "step": 3478 + }, + { + "epoch": 0.0959932675029403, + "grad_norm": 0.0028831001836806536, + "learning_rate": 0.001, + "loss": 0.398, + "step": 3479 + }, + { + "epoch": 0.09602085970400466, + "grad_norm": 0.002438000636175275, + "learning_rate": 0.001, + "loss": 0.3728, + "step": 3480 + }, + { + "epoch": 0.09604845190506903, + "grad_norm": 0.004020646680146456, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 3481 + }, + { + "epoch": 0.09607604410613341, + "grad_norm": 0.0025036863517016172, + "learning_rate": 0.001, + "loss": 0.3881, + "step": 3482 + }, + { + "epoch": 0.09610363630719777, + "grad_norm": 0.0033008300233632326, + "learning_rate": 0.001, + "loss": 0.4334, + "step": 3483 + }, + { + "epoch": 0.09613122850826214, + "grad_norm": 0.002855231985449791, + "learning_rate": 0.001, + "loss": 0.4157, + "step": 3484 + }, + { + "epoch": 0.0961588207093265, + "grad_norm": 0.008589272387325764, + "learning_rate": 0.001, + "loss": 0.436, + "step": 3485 + }, + { + "epoch": 0.09618641291039087, + "grad_norm": 0.0025324004236608744, + "learning_rate": 0.001, + "loss": 0.3986, + "step": 3486 + }, + { + "epoch": 0.09621400511145525, + "grad_norm": 0.003534125629812479, + "learning_rate": 0.001, + "loss": 0.3844, + "step": 3487 + }, + { + "epoch": 0.09624159731251962, + "grad_norm": 0.003250879468396306, + "learning_rate": 0.001, + "loss": 0.4009, + "step": 3488 + }, + { + "epoch": 0.09626918951358399, + "grad_norm": 0.004198684822767973, + "learning_rate": 0.001, + "loss": 0.381, + "step": 3489 + }, + { + "epoch": 0.09629678171464835, + "grad_norm": 0.0034648876171559095, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 3490 + }, + { + "epoch": 0.09632437391571272, + "grad_norm": 0.003584906691685319, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 3491 + }, + { + "epoch": 0.0963519661167771, + "grad_norm": 0.0033312232699245214, + "learning_rate": 0.001, + "loss": 0.405, + "step": 3492 + }, + { + "epoch": 0.09637955831784147, + "grad_norm": 0.0032407655380666256, + "learning_rate": 0.001, + "loss": 0.4217, + "step": 3493 + }, + { + "epoch": 0.09640715051890583, + "grad_norm": 0.00250063999556005, + "learning_rate": 0.001, + "loss": 0.4121, + "step": 3494 + }, + { + "epoch": 0.0964347427199702, + "grad_norm": 0.0022465146612375975, + "learning_rate": 0.001, + "loss": 0.387, + "step": 3495 + }, + { + "epoch": 0.09646233492103456, + "grad_norm": 0.002582980552688241, + "learning_rate": 0.001, + "loss": 0.393, + "step": 3496 + }, + { + "epoch": 0.09648992712209895, + "grad_norm": 0.003146865637972951, + "learning_rate": 0.001, + "loss": 0.375, + "step": 3497 + }, + { + "epoch": 0.09651751932316331, + "grad_norm": 0.004826393909752369, + "learning_rate": 0.001, + "loss": 0.3753, + "step": 3498 + }, + { + "epoch": 0.09654511152422768, + "grad_norm": 0.004438623785972595, + "learning_rate": 0.001, + "loss": 0.4301, + "step": 3499 + }, + { + "epoch": 0.09657270372529204, + "grad_norm": 0.002986249513924122, + "learning_rate": 0.001, + "loss": 0.4145, + "step": 3500 + }, + { + "epoch": 0.09657270372529204, + "eval_runtime": 24.1649, + "eval_samples_per_second": 1.324, + "eval_steps_per_second": 0.166, + "step": 3500 + }, + { + "epoch": 0.09660029592635641, + "grad_norm": 0.002994472160935402, + "learning_rate": 0.001, + "loss": 0.395, + "step": 3501 + }, + { + "epoch": 0.09662788812742079, + "grad_norm": 0.006290449295192957, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 3502 + }, + { + "epoch": 0.09665548032848516, + "grad_norm": 0.0027024163864552975, + "learning_rate": 0.001, + "loss": 0.3844, + "step": 3503 + }, + { + "epoch": 0.09668307252954952, + "grad_norm": 0.0026126017328351736, + "learning_rate": 0.001, + "loss": 0.4056, + "step": 3504 + }, + { + "epoch": 0.09671066473061389, + "grad_norm": 0.002393176080659032, + "learning_rate": 0.001, + "loss": 0.4, + "step": 3505 + }, + { + "epoch": 0.09673825693167826, + "grad_norm": 0.0025012970436364412, + "learning_rate": 0.001, + "loss": 0.4309, + "step": 3506 + }, + { + "epoch": 0.09676584913274262, + "grad_norm": 0.0026587117463350296, + "learning_rate": 0.001, + "loss": 0.394, + "step": 3507 + }, + { + "epoch": 0.096793441333807, + "grad_norm": 0.004255054052919149, + "learning_rate": 0.001, + "loss": 0.3657, + "step": 3508 + }, + { + "epoch": 0.09682103353487137, + "grad_norm": 0.0029915212653577328, + "learning_rate": 0.001, + "loss": 0.4106, + "step": 3509 + }, + { + "epoch": 0.09684862573593574, + "grad_norm": 0.002514585852622986, + "learning_rate": 0.001, + "loss": 0.4136, + "step": 3510 + }, + { + "epoch": 0.0968762179370001, + "grad_norm": 0.0031415291596204042, + "learning_rate": 0.001, + "loss": 0.4172, + "step": 3511 + }, + { + "epoch": 0.09690381013806447, + "grad_norm": 0.004052399192005396, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 3512 + }, + { + "epoch": 0.09693140233912885, + "grad_norm": 0.003938235808163881, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 3513 + }, + { + "epoch": 0.09695899454019322, + "grad_norm": 0.0027284801471978426, + "learning_rate": 0.001, + "loss": 0.4085, + "step": 3514 + }, + { + "epoch": 0.09698658674125758, + "grad_norm": 0.0021578462328761816, + "learning_rate": 0.001, + "loss": 0.4513, + "step": 3515 + }, + { + "epoch": 0.09701417894232195, + "grad_norm": 0.0030869885813444853, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 3516 + }, + { + "epoch": 0.09704177114338632, + "grad_norm": 0.0028977058827877045, + "learning_rate": 0.001, + "loss": 0.4035, + "step": 3517 + }, + { + "epoch": 0.0970693633444507, + "grad_norm": 0.0025139932986348867, + "learning_rate": 0.001, + "loss": 0.4132, + "step": 3518 + }, + { + "epoch": 0.09709695554551506, + "grad_norm": 0.0022785612381994724, + "learning_rate": 0.001, + "loss": 0.4084, + "step": 3519 + }, + { + "epoch": 0.09712454774657943, + "grad_norm": 0.002874610014259815, + "learning_rate": 0.001, + "loss": 0.3645, + "step": 3520 + }, + { + "epoch": 0.0971521399476438, + "grad_norm": 0.004266508389264345, + "learning_rate": 0.001, + "loss": 0.3449, + "step": 3521 + }, + { + "epoch": 0.09717973214870816, + "grad_norm": 0.0025132927112281322, + "learning_rate": 0.001, + "loss": 0.3937, + "step": 3522 + }, + { + "epoch": 0.09720732434977254, + "grad_norm": 0.006651229690760374, + "learning_rate": 0.001, + "loss": 0.3854, + "step": 3523 + }, + { + "epoch": 0.09723491655083691, + "grad_norm": 0.0053511932492256165, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 3524 + }, + { + "epoch": 0.09726250875190127, + "grad_norm": 0.0037280949763953686, + "learning_rate": 0.001, + "loss": 0.4106, + "step": 3525 + }, + { + "epoch": 0.09729010095296564, + "grad_norm": 0.0037587578408420086, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 3526 + }, + { + "epoch": 0.09731769315403001, + "grad_norm": 0.0037075839936733246, + "learning_rate": 0.001, + "loss": 0.3948, + "step": 3527 + }, + { + "epoch": 0.09734528535509439, + "grad_norm": 0.004948961082845926, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 3528 + }, + { + "epoch": 0.09737287755615875, + "grad_norm": 0.006435552146285772, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 3529 + }, + { + "epoch": 0.09740046975722312, + "grad_norm": 0.0021128810476511717, + "learning_rate": 0.001, + "loss": 0.41, + "step": 3530 + }, + { + "epoch": 0.09742806195828749, + "grad_norm": 0.0025616176426410675, + "learning_rate": 0.001, + "loss": 0.4178, + "step": 3531 + }, + { + "epoch": 0.09745565415935185, + "grad_norm": 0.00574469892308116, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 3532 + }, + { + "epoch": 0.09748324636041623, + "grad_norm": 0.0031522875651717186, + "learning_rate": 0.001, + "loss": 0.3611, + "step": 3533 + }, + { + "epoch": 0.0975108385614806, + "grad_norm": 0.0031021784525364637, + "learning_rate": 0.001, + "loss": 0.3709, + "step": 3534 + }, + { + "epoch": 0.09753843076254497, + "grad_norm": 0.005699521396309137, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 3535 + }, + { + "epoch": 0.09756602296360933, + "grad_norm": 0.003169464645907283, + "learning_rate": 0.001, + "loss": 0.3986, + "step": 3536 + }, + { + "epoch": 0.0975936151646737, + "grad_norm": 0.003686879761517048, + "learning_rate": 0.001, + "loss": 0.4229, + "step": 3537 + }, + { + "epoch": 0.09762120736573808, + "grad_norm": 0.0031342299189418554, + "learning_rate": 0.001, + "loss": 0.4062, + "step": 3538 + }, + { + "epoch": 0.09764879956680245, + "grad_norm": 0.0023811724968254566, + "learning_rate": 0.001, + "loss": 0.4048, + "step": 3539 + }, + { + "epoch": 0.09767639176786681, + "grad_norm": 0.003792383009567857, + "learning_rate": 0.001, + "loss": 0.4013, + "step": 3540 + }, + { + "epoch": 0.09770398396893118, + "grad_norm": 0.0024562154430896044, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 3541 + }, + { + "epoch": 0.09773157616999555, + "grad_norm": 0.0034171422012150288, + "learning_rate": 0.001, + "loss": 0.3906, + "step": 3542 + }, + { + "epoch": 0.09775916837105993, + "grad_norm": 0.003149349009618163, + "learning_rate": 0.001, + "loss": 0.3842, + "step": 3543 + }, + { + "epoch": 0.09778676057212429, + "grad_norm": 0.002210955834016204, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 3544 + }, + { + "epoch": 0.09781435277318866, + "grad_norm": 0.003593403846025467, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 3545 + }, + { + "epoch": 0.09784194497425303, + "grad_norm": 0.0022881280165165663, + "learning_rate": 0.001, + "loss": 0.441, + "step": 3546 + }, + { + "epoch": 0.09786953717531739, + "grad_norm": 0.002236244734376669, + "learning_rate": 0.001, + "loss": 0.4306, + "step": 3547 + }, + { + "epoch": 0.09789712937638177, + "grad_norm": 0.004320134408771992, + "learning_rate": 0.001, + "loss": 0.3519, + "step": 3548 + }, + { + "epoch": 0.09792472157744614, + "grad_norm": 0.003616459434852004, + "learning_rate": 0.001, + "loss": 0.3748, + "step": 3549 + }, + { + "epoch": 0.0979523137785105, + "grad_norm": 0.0072233411483466625, + "learning_rate": 0.001, + "loss": 0.3692, + "step": 3550 + }, + { + "epoch": 0.09797990597957487, + "grad_norm": 0.017990432679653168, + "learning_rate": 0.001, + "loss": 0.4225, + "step": 3551 + }, + { + "epoch": 0.09800749818063924, + "grad_norm": 0.0036775225307792425, + "learning_rate": 0.001, + "loss": 0.3654, + "step": 3552 + }, + { + "epoch": 0.0980350903817036, + "grad_norm": 0.0034110434353351593, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 3553 + }, + { + "epoch": 0.09806268258276798, + "grad_norm": 0.005091147031635046, + "learning_rate": 0.001, + "loss": 0.385, + "step": 3554 + }, + { + "epoch": 0.09809027478383235, + "grad_norm": 0.011450120247900486, + "learning_rate": 0.001, + "loss": 0.4183, + "step": 3555 + }, + { + "epoch": 0.09811786698489672, + "grad_norm": 0.004091903567314148, + "learning_rate": 0.001, + "loss": 0.4219, + "step": 3556 + }, + { + "epoch": 0.09814545918596108, + "grad_norm": 0.00835462100803852, + "learning_rate": 0.001, + "loss": 0.4033, + "step": 3557 + }, + { + "epoch": 0.09817305138702545, + "grad_norm": 0.0032325852662324905, + "learning_rate": 0.001, + "loss": 0.4103, + "step": 3558 + }, + { + "epoch": 0.09820064358808983, + "grad_norm": 0.003423454938456416, + "learning_rate": 0.001, + "loss": 0.4135, + "step": 3559 + }, + { + "epoch": 0.0982282357891542, + "grad_norm": 0.0024230678100138903, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 3560 + }, + { + "epoch": 0.09825582799021856, + "grad_norm": 0.003030325984582305, + "learning_rate": 0.001, + "loss": 0.4148, + "step": 3561 + }, + { + "epoch": 0.09828342019128293, + "grad_norm": 0.0031275860965251923, + "learning_rate": 0.001, + "loss": 0.3591, + "step": 3562 + }, + { + "epoch": 0.0983110123923473, + "grad_norm": 0.010123792104423046, + "learning_rate": 0.001, + "loss": 0.3923, + "step": 3563 + }, + { + "epoch": 0.09833860459341168, + "grad_norm": 0.0048133572563529015, + "learning_rate": 0.001, + "loss": 0.3917, + "step": 3564 + }, + { + "epoch": 0.09836619679447604, + "grad_norm": 0.002648024819791317, + "learning_rate": 0.001, + "loss": 0.4199, + "step": 3565 + }, + { + "epoch": 0.09839378899554041, + "grad_norm": 0.006900025065988302, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 3566 + }, + { + "epoch": 0.09842138119660478, + "grad_norm": 0.011950865387916565, + "learning_rate": 0.001, + "loss": 0.4279, + "step": 3567 + }, + { + "epoch": 0.09844897339766914, + "grad_norm": 0.0075233737006783485, + "learning_rate": 0.001, + "loss": 0.3639, + "step": 3568 + }, + { + "epoch": 0.09847656559873352, + "grad_norm": 0.006645913701504469, + "learning_rate": 0.001, + "loss": 0.4328, + "step": 3569 + }, + { + "epoch": 0.09850415779979789, + "grad_norm": 0.005523411091417074, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 3570 + }, + { + "epoch": 0.09853175000086226, + "grad_norm": 0.006219461094588041, + "learning_rate": 0.001, + "loss": 0.3802, + "step": 3571 + }, + { + "epoch": 0.09855934220192662, + "grad_norm": 0.002192398766055703, + "learning_rate": 0.001, + "loss": 0.3657, + "step": 3572 + }, + { + "epoch": 0.09858693440299099, + "grad_norm": 0.002795976819470525, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 3573 + }, + { + "epoch": 0.09861452660405537, + "grad_norm": 0.0030005681328475475, + "learning_rate": 0.001, + "loss": 0.3737, + "step": 3574 + }, + { + "epoch": 0.09864211880511974, + "grad_norm": 0.002711366629227996, + "learning_rate": 0.001, + "loss": 0.4349, + "step": 3575 + }, + { + "epoch": 0.0986697110061841, + "grad_norm": 0.0025707653257995844, + "learning_rate": 0.001, + "loss": 0.4344, + "step": 3576 + }, + { + "epoch": 0.09869730320724847, + "grad_norm": 0.002858672058209777, + "learning_rate": 0.001, + "loss": 0.3998, + "step": 3577 + }, + { + "epoch": 0.09872489540831283, + "grad_norm": 0.003099956549704075, + "learning_rate": 0.001, + "loss": 0.4192, + "step": 3578 + }, + { + "epoch": 0.09875248760937722, + "grad_norm": 0.014343291521072388, + "learning_rate": 0.001, + "loss": 0.3881, + "step": 3579 + }, + { + "epoch": 0.09878007981044158, + "grad_norm": 0.0043120840564370155, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 3580 + }, + { + "epoch": 0.09880767201150595, + "grad_norm": 0.0042751263827085495, + "learning_rate": 0.001, + "loss": 0.3954, + "step": 3581 + }, + { + "epoch": 0.09883526421257031, + "grad_norm": 0.0026994547806680202, + "learning_rate": 0.001, + "loss": 0.3887, + "step": 3582 + }, + { + "epoch": 0.09886285641363468, + "grad_norm": 0.002389197936281562, + "learning_rate": 0.001, + "loss": 0.419, + "step": 3583 + }, + { + "epoch": 0.09889044861469906, + "grad_norm": 0.003209868213161826, + "learning_rate": 0.001, + "loss": 0.4094, + "step": 3584 + }, + { + "epoch": 0.09891804081576343, + "grad_norm": 0.004312742967158556, + "learning_rate": 0.001, + "loss": 0.3506, + "step": 3585 + }, + { + "epoch": 0.0989456330168278, + "grad_norm": 0.003966677933931351, + "learning_rate": 0.001, + "loss": 0.426, + "step": 3586 + }, + { + "epoch": 0.09897322521789216, + "grad_norm": 0.008509870618581772, + "learning_rate": 0.001, + "loss": 0.402, + "step": 3587 + }, + { + "epoch": 0.09900081741895653, + "grad_norm": 0.003062241477891803, + "learning_rate": 0.001, + "loss": 0.3783, + "step": 3588 + }, + { + "epoch": 0.09902840962002091, + "grad_norm": 0.0022860439494252205, + "learning_rate": 0.001, + "loss": 0.426, + "step": 3589 + }, + { + "epoch": 0.09905600182108527, + "grad_norm": 0.004430666100233793, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 3590 + }, + { + "epoch": 0.09908359402214964, + "grad_norm": 0.002458814997226, + "learning_rate": 0.001, + "loss": 0.417, + "step": 3591 + }, + { + "epoch": 0.099111186223214, + "grad_norm": 0.0029090861789882183, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 3592 + }, + { + "epoch": 0.09913877842427837, + "grad_norm": 0.0032796203158795834, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 3593 + }, + { + "epoch": 0.09916637062534275, + "grad_norm": 0.002556765917688608, + "learning_rate": 0.001, + "loss": 0.4206, + "step": 3594 + }, + { + "epoch": 0.09919396282640712, + "grad_norm": 0.0023709458764642477, + "learning_rate": 0.001, + "loss": 0.415, + "step": 3595 + }, + { + "epoch": 0.09922155502747149, + "grad_norm": 0.003395737847313285, + "learning_rate": 0.001, + "loss": 0.3762, + "step": 3596 + }, + { + "epoch": 0.09924914722853585, + "grad_norm": 0.0026961613912135363, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 3597 + }, + { + "epoch": 0.09927673942960022, + "grad_norm": 0.0020530601032078266, + "learning_rate": 0.001, + "loss": 0.4465, + "step": 3598 + }, + { + "epoch": 0.0993043316306646, + "grad_norm": 0.0026466413401067257, + "learning_rate": 0.001, + "loss": 0.4052, + "step": 3599 + }, + { + "epoch": 0.09933192383172897, + "grad_norm": 0.003969122655689716, + "learning_rate": 0.001, + "loss": 0.406, + "step": 3600 + }, + { + "epoch": 0.09935951603279333, + "grad_norm": 0.0035336946602910757, + "learning_rate": 0.001, + "loss": 0.3836, + "step": 3601 + }, + { + "epoch": 0.0993871082338577, + "grad_norm": 0.0030988729558885098, + "learning_rate": 0.001, + "loss": 0.3876, + "step": 3602 + }, + { + "epoch": 0.09941470043492207, + "grad_norm": 0.0029431709554046392, + "learning_rate": 0.001, + "loss": 0.3831, + "step": 3603 + }, + { + "epoch": 0.09944229263598643, + "grad_norm": 0.002947601256892085, + "learning_rate": 0.001, + "loss": 0.3737, + "step": 3604 + }, + { + "epoch": 0.09946988483705081, + "grad_norm": 0.002736450405791402, + "learning_rate": 0.001, + "loss": 0.4254, + "step": 3605 + }, + { + "epoch": 0.09949747703811518, + "grad_norm": 0.0027800817042589188, + "learning_rate": 0.001, + "loss": 0.4183, + "step": 3606 + }, + { + "epoch": 0.09952506923917954, + "grad_norm": 0.003060448681935668, + "learning_rate": 0.001, + "loss": 0.3972, + "step": 3607 + }, + { + "epoch": 0.09955266144024391, + "grad_norm": 0.0032676290720701218, + "learning_rate": 0.001, + "loss": 0.3543, + "step": 3608 + }, + { + "epoch": 0.09958025364130828, + "grad_norm": 0.0039128996431827545, + "learning_rate": 0.001, + "loss": 0.398, + "step": 3609 + }, + { + "epoch": 0.09960784584237266, + "grad_norm": 0.0037549827247858047, + "learning_rate": 0.001, + "loss": 0.3962, + "step": 3610 + }, + { + "epoch": 0.09963543804343702, + "grad_norm": 0.00438270578160882, + "learning_rate": 0.001, + "loss": 0.4057, + "step": 3611 + }, + { + "epoch": 0.09966303024450139, + "grad_norm": 0.003163162851706147, + "learning_rate": 0.001, + "loss": 0.3996, + "step": 3612 + }, + { + "epoch": 0.09969062244556576, + "grad_norm": 0.0033597201108932495, + "learning_rate": 0.001, + "loss": 0.3825, + "step": 3613 + }, + { + "epoch": 0.09971821464663012, + "grad_norm": 0.003219824517145753, + "learning_rate": 0.001, + "loss": 0.4046, + "step": 3614 + }, + { + "epoch": 0.0997458068476945, + "grad_norm": 0.0035223192535340786, + "learning_rate": 0.001, + "loss": 0.3996, + "step": 3615 + }, + { + "epoch": 0.09977339904875887, + "grad_norm": 0.00964295119047165, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 3616 + }, + { + "epoch": 0.09980099124982324, + "grad_norm": 0.002427217550575733, + "learning_rate": 0.001, + "loss": 0.4164, + "step": 3617 + }, + { + "epoch": 0.0998285834508876, + "grad_norm": 0.004480184521526098, + "learning_rate": 0.001, + "loss": 0.4118, + "step": 3618 + }, + { + "epoch": 0.09985617565195197, + "grad_norm": 0.0030322298407554626, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 3619 + }, + { + "epoch": 0.09988376785301635, + "grad_norm": 0.02180991880595684, + "learning_rate": 0.001, + "loss": 0.4125, + "step": 3620 + }, + { + "epoch": 0.09991136005408072, + "grad_norm": 0.0030360252130776644, + "learning_rate": 0.001, + "loss": 0.433, + "step": 3621 + }, + { + "epoch": 0.09993895225514508, + "grad_norm": 0.0025207020808011293, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 3622 + }, + { + "epoch": 0.09996654445620945, + "grad_norm": 0.004299542400985956, + "learning_rate": 0.001, + "loss": 0.4061, + "step": 3623 + }, + { + "epoch": 0.09999413665727382, + "grad_norm": 0.004565478768199682, + "learning_rate": 0.001, + "loss": 0.394, + "step": 3624 + }, + { + "epoch": 0.1000217288583382, + "grad_norm": 0.0028781460132449865, + "learning_rate": 0.001, + "loss": 0.3982, + "step": 3625 + }, + { + "epoch": 0.10004932105940256, + "grad_norm": 0.0026228884235024452, + "learning_rate": 0.001, + "loss": 0.372, + "step": 3626 + }, + { + "epoch": 0.10007691326046693, + "grad_norm": 0.0029596835374832153, + "learning_rate": 0.001, + "loss": 0.4371, + "step": 3627 + }, + { + "epoch": 0.1001045054615313, + "grad_norm": 0.007199972402304411, + "learning_rate": 0.001, + "loss": 0.3777, + "step": 3628 + }, + { + "epoch": 0.10013209766259566, + "grad_norm": 0.003069596830755472, + "learning_rate": 0.001, + "loss": 0.4199, + "step": 3629 + }, + { + "epoch": 0.10015968986366004, + "grad_norm": 0.002500646049156785, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 3630 + }, + { + "epoch": 0.10018728206472441, + "grad_norm": 0.0024019493721425533, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 3631 + }, + { + "epoch": 0.10021487426578878, + "grad_norm": 0.0032364402431994677, + "learning_rate": 0.001, + "loss": 0.4136, + "step": 3632 + }, + { + "epoch": 0.10024246646685314, + "grad_norm": 0.00393798528239131, + "learning_rate": 0.001, + "loss": 0.3833, + "step": 3633 + }, + { + "epoch": 0.10027005866791751, + "grad_norm": 0.0036344374530017376, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 3634 + }, + { + "epoch": 0.10029765086898189, + "grad_norm": 0.003431879449635744, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 3635 + }, + { + "epoch": 0.10032524307004625, + "grad_norm": 0.003278666641563177, + "learning_rate": 0.001, + "loss": 0.4089, + "step": 3636 + }, + { + "epoch": 0.10035283527111062, + "grad_norm": 0.003398419125005603, + "learning_rate": 0.001, + "loss": 0.3683, + "step": 3637 + }, + { + "epoch": 0.10038042747217499, + "grad_norm": 0.002605091081932187, + "learning_rate": 0.001, + "loss": 0.4285, + "step": 3638 + }, + { + "epoch": 0.10040801967323935, + "grad_norm": 0.0038700527511537075, + "learning_rate": 0.001, + "loss": 0.3759, + "step": 3639 + }, + { + "epoch": 0.10043561187430373, + "grad_norm": 0.002678008284419775, + "learning_rate": 0.001, + "loss": 0.4215, + "step": 3640 + }, + { + "epoch": 0.1004632040753681, + "grad_norm": 0.004668472800403833, + "learning_rate": 0.001, + "loss": 0.3779, + "step": 3641 + }, + { + "epoch": 0.10049079627643247, + "grad_norm": 0.003300663083791733, + "learning_rate": 0.001, + "loss": 0.4322, + "step": 3642 + }, + { + "epoch": 0.10051838847749683, + "grad_norm": 0.0024147978983819485, + "learning_rate": 0.001, + "loss": 0.3779, + "step": 3643 + }, + { + "epoch": 0.1005459806785612, + "grad_norm": 0.005644883494824171, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 3644 + }, + { + "epoch": 0.10057357287962558, + "grad_norm": 0.004027125891298056, + "learning_rate": 0.001, + "loss": 0.4173, + "step": 3645 + }, + { + "epoch": 0.10060116508068995, + "grad_norm": 0.003284280654042959, + "learning_rate": 0.001, + "loss": 0.3697, + "step": 3646 + }, + { + "epoch": 0.10062875728175431, + "grad_norm": 0.0025769018102437258, + "learning_rate": 0.001, + "loss": 0.4057, + "step": 3647 + }, + { + "epoch": 0.10065634948281868, + "grad_norm": 0.002732061082497239, + "learning_rate": 0.001, + "loss": 0.4222, + "step": 3648 + }, + { + "epoch": 0.10068394168388305, + "grad_norm": 0.00284641538746655, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 3649 + }, + { + "epoch": 0.10071153388494741, + "grad_norm": 0.0040368628688156605, + "learning_rate": 0.001, + "loss": 0.4206, + "step": 3650 + }, + { + "epoch": 0.1007391260860118, + "grad_norm": 0.0030586514621973038, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 3651 + }, + { + "epoch": 0.10076671828707616, + "grad_norm": 0.002908149268478155, + "learning_rate": 0.001, + "loss": 0.4325, + "step": 3652 + }, + { + "epoch": 0.10079431048814053, + "grad_norm": 0.0038641381543129683, + "learning_rate": 0.001, + "loss": 0.3943, + "step": 3653 + }, + { + "epoch": 0.10082190268920489, + "grad_norm": 0.0032964826095849276, + "learning_rate": 0.001, + "loss": 0.39, + "step": 3654 + }, + { + "epoch": 0.10084949489026926, + "grad_norm": 0.0040243249386549, + "learning_rate": 0.001, + "loss": 0.3653, + "step": 3655 + }, + { + "epoch": 0.10087708709133364, + "grad_norm": 0.004164101555943489, + "learning_rate": 0.001, + "loss": 0.4091, + "step": 3656 + }, + { + "epoch": 0.100904679292398, + "grad_norm": 0.003107170108705759, + "learning_rate": 0.001, + "loss": 0.4099, + "step": 3657 + }, + { + "epoch": 0.10093227149346237, + "grad_norm": 0.0029900551307946444, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 3658 + }, + { + "epoch": 0.10095986369452674, + "grad_norm": 0.0030398843809962273, + "learning_rate": 0.001, + "loss": 0.4166, + "step": 3659 + }, + { + "epoch": 0.1009874558955911, + "grad_norm": 0.0038001432549208403, + "learning_rate": 0.001, + "loss": 0.3986, + "step": 3660 + }, + { + "epoch": 0.10101504809665549, + "grad_norm": 0.0026995730586349964, + "learning_rate": 0.001, + "loss": 0.3667, + "step": 3661 + }, + { + "epoch": 0.10104264029771985, + "grad_norm": 0.0022786613553762436, + "learning_rate": 0.001, + "loss": 0.4441, + "step": 3662 + }, + { + "epoch": 0.10107023249878422, + "grad_norm": 0.002397672738879919, + "learning_rate": 0.001, + "loss": 0.4011, + "step": 3663 + }, + { + "epoch": 0.10109782469984858, + "grad_norm": 0.002657962031662464, + "learning_rate": 0.001, + "loss": 0.3886, + "step": 3664 + }, + { + "epoch": 0.10112541690091295, + "grad_norm": 0.004640195053070784, + "learning_rate": 0.001, + "loss": 0.4264, + "step": 3665 + }, + { + "epoch": 0.10115300910197733, + "grad_norm": 0.0031627060379832983, + "learning_rate": 0.001, + "loss": 0.3886, + "step": 3666 + }, + { + "epoch": 0.1011806013030417, + "grad_norm": 0.003310044063255191, + "learning_rate": 0.001, + "loss": 0.4333, + "step": 3667 + }, + { + "epoch": 0.10120819350410606, + "grad_norm": 0.0024094157852232456, + "learning_rate": 0.001, + "loss": 0.4183, + "step": 3668 + }, + { + "epoch": 0.10123578570517043, + "grad_norm": 0.0029870392754673958, + "learning_rate": 0.001, + "loss": 0.3704, + "step": 3669 + }, + { + "epoch": 0.1012633779062348, + "grad_norm": 0.002329483861103654, + "learning_rate": 0.001, + "loss": 0.4167, + "step": 3670 + }, + { + "epoch": 0.10129097010729918, + "grad_norm": 0.003403107402846217, + "learning_rate": 0.001, + "loss": 0.4032, + "step": 3671 + }, + { + "epoch": 0.10131856230836354, + "grad_norm": 0.0027673887088894844, + "learning_rate": 0.001, + "loss": 0.4072, + "step": 3672 + }, + { + "epoch": 0.10134615450942791, + "grad_norm": 0.0028799972496926785, + "learning_rate": 0.001, + "loss": 0.3715, + "step": 3673 + }, + { + "epoch": 0.10137374671049228, + "grad_norm": 0.003228268353268504, + "learning_rate": 0.001, + "loss": 0.4128, + "step": 3674 + }, + { + "epoch": 0.10140133891155664, + "grad_norm": 0.0028087422251701355, + "learning_rate": 0.001, + "loss": 0.4151, + "step": 3675 + }, + { + "epoch": 0.10142893111262102, + "grad_norm": 0.0026430152356624603, + "learning_rate": 0.001, + "loss": 0.3818, + "step": 3676 + }, + { + "epoch": 0.10145652331368539, + "grad_norm": 0.0036075664684176445, + "learning_rate": 0.001, + "loss": 0.3834, + "step": 3677 + }, + { + "epoch": 0.10148411551474976, + "grad_norm": 0.0028451229445636272, + "learning_rate": 0.001, + "loss": 0.4127, + "step": 3678 + }, + { + "epoch": 0.10151170771581412, + "grad_norm": 0.0037802942097187042, + "learning_rate": 0.001, + "loss": 0.4182, + "step": 3679 + }, + { + "epoch": 0.10153929991687849, + "grad_norm": 0.0029138477984815836, + "learning_rate": 0.001, + "loss": 0.436, + "step": 3680 + }, + { + "epoch": 0.10156689211794287, + "grad_norm": 0.003684982191771269, + "learning_rate": 0.001, + "loss": 0.4201, + "step": 3681 + }, + { + "epoch": 0.10159448431900724, + "grad_norm": 0.005630989093333483, + "learning_rate": 0.001, + "loss": 0.3818, + "step": 3682 + }, + { + "epoch": 0.1016220765200716, + "grad_norm": 0.006454580929130316, + "learning_rate": 0.001, + "loss": 0.4174, + "step": 3683 + }, + { + "epoch": 0.10164966872113597, + "grad_norm": 0.007667763624340296, + "learning_rate": 0.001, + "loss": 0.3887, + "step": 3684 + }, + { + "epoch": 0.10167726092220034, + "grad_norm": 0.004302634857594967, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 3685 + }, + { + "epoch": 0.10170485312326472, + "grad_norm": 0.003696159226819873, + "learning_rate": 0.001, + "loss": 0.4209, + "step": 3686 + }, + { + "epoch": 0.10173244532432908, + "grad_norm": 0.004956797696650028, + "learning_rate": 0.001, + "loss": 0.3985, + "step": 3687 + }, + { + "epoch": 0.10176003752539345, + "grad_norm": 0.0026316859293729067, + "learning_rate": 0.001, + "loss": 0.395, + "step": 3688 + }, + { + "epoch": 0.10178762972645782, + "grad_norm": 0.002009750111028552, + "learning_rate": 0.001, + "loss": 0.4335, + "step": 3689 + }, + { + "epoch": 0.10181522192752218, + "grad_norm": 0.003345980541780591, + "learning_rate": 0.001, + "loss": 0.4194, + "step": 3690 + }, + { + "epoch": 0.10184281412858656, + "grad_norm": 0.002231464022770524, + "learning_rate": 0.001, + "loss": 0.4316, + "step": 3691 + }, + { + "epoch": 0.10187040632965093, + "grad_norm": 0.003336479654535651, + "learning_rate": 0.001, + "loss": 0.3732, + "step": 3692 + }, + { + "epoch": 0.1018979985307153, + "grad_norm": 0.002393354196101427, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 3693 + }, + { + "epoch": 0.10192559073177966, + "grad_norm": 0.0024670169223099947, + "learning_rate": 0.001, + "loss": 0.3915, + "step": 3694 + }, + { + "epoch": 0.10195318293284403, + "grad_norm": 0.0034887620713561773, + "learning_rate": 0.001, + "loss": 0.4231, + "step": 3695 + }, + { + "epoch": 0.1019807751339084, + "grad_norm": 0.003547382541000843, + "learning_rate": 0.001, + "loss": 0.3934, + "step": 3696 + }, + { + "epoch": 0.10200836733497277, + "grad_norm": 0.0034907760564237833, + "learning_rate": 0.001, + "loss": 0.4244, + "step": 3697 + }, + { + "epoch": 0.10203595953603714, + "grad_norm": 0.002545100636780262, + "learning_rate": 0.001, + "loss": 0.3918, + "step": 3698 + }, + { + "epoch": 0.10206355173710151, + "grad_norm": 0.004985075909644365, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 3699 + }, + { + "epoch": 0.10209114393816587, + "grad_norm": 0.004573920741677284, + "learning_rate": 0.001, + "loss": 0.3743, + "step": 3700 + }, + { + "epoch": 0.10211873613923024, + "grad_norm": 0.004074465949088335, + "learning_rate": 0.001, + "loss": 0.4086, + "step": 3701 + }, + { + "epoch": 0.10214632834029462, + "grad_norm": 0.0037853543180972338, + "learning_rate": 0.001, + "loss": 0.4263, + "step": 3702 + }, + { + "epoch": 0.10217392054135899, + "grad_norm": 0.002464262768626213, + "learning_rate": 0.001, + "loss": 0.4116, + "step": 3703 + }, + { + "epoch": 0.10220151274242335, + "grad_norm": 0.0036815868224948645, + "learning_rate": 0.001, + "loss": 0.388, + "step": 3704 + }, + { + "epoch": 0.10222910494348772, + "grad_norm": 0.0033971264492720366, + "learning_rate": 0.001, + "loss": 0.4613, + "step": 3705 + }, + { + "epoch": 0.10225669714455209, + "grad_norm": 0.0029886479023844004, + "learning_rate": 0.001, + "loss": 0.413, + "step": 3706 + }, + { + "epoch": 0.10228428934561647, + "grad_norm": 0.0036080891732126474, + "learning_rate": 0.001, + "loss": 0.3937, + "step": 3707 + }, + { + "epoch": 0.10231188154668083, + "grad_norm": 0.0026688736397773027, + "learning_rate": 0.001, + "loss": 0.432, + "step": 3708 + }, + { + "epoch": 0.1023394737477452, + "grad_norm": 0.003568600630387664, + "learning_rate": 0.001, + "loss": 0.3813, + "step": 3709 + }, + { + "epoch": 0.10236706594880957, + "grad_norm": 0.0037499042227864265, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 3710 + }, + { + "epoch": 0.10239465814987393, + "grad_norm": 0.0027967335190624, + "learning_rate": 0.001, + "loss": 0.4164, + "step": 3711 + }, + { + "epoch": 0.10242225035093831, + "grad_norm": 0.002266339026391506, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 3712 + }, + { + "epoch": 0.10244984255200268, + "grad_norm": 0.002678538439795375, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 3713 + }, + { + "epoch": 0.10247743475306705, + "grad_norm": 0.007051249034702778, + "learning_rate": 0.001, + "loss": 0.3694, + "step": 3714 + }, + { + "epoch": 0.10250502695413141, + "grad_norm": 0.002434907481074333, + "learning_rate": 0.001, + "loss": 0.3636, + "step": 3715 + }, + { + "epoch": 0.10253261915519578, + "grad_norm": 0.002865745685994625, + "learning_rate": 0.001, + "loss": 0.4641, + "step": 3716 + }, + { + "epoch": 0.10256021135626016, + "grad_norm": 0.0022143360693007708, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 3717 + }, + { + "epoch": 0.10258780355732453, + "grad_norm": 0.0022539508063346148, + "learning_rate": 0.001, + "loss": 0.4341, + "step": 3718 + }, + { + "epoch": 0.10261539575838889, + "grad_norm": 0.0032584406435489655, + "learning_rate": 0.001, + "loss": 0.4105, + "step": 3719 + }, + { + "epoch": 0.10264298795945326, + "grad_norm": 0.003400850109755993, + "learning_rate": 0.001, + "loss": 0.3808, + "step": 3720 + }, + { + "epoch": 0.10267058016051762, + "grad_norm": 0.004264235496520996, + "learning_rate": 0.001, + "loss": 0.3823, + "step": 3721 + }, + { + "epoch": 0.102698172361582, + "grad_norm": 0.0028461632318794727, + "learning_rate": 0.001, + "loss": 0.4444, + "step": 3722 + }, + { + "epoch": 0.10272576456264637, + "grad_norm": 0.00392636563628912, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 3723 + }, + { + "epoch": 0.10275335676371074, + "grad_norm": 0.002802118891850114, + "learning_rate": 0.001, + "loss": 0.4593, + "step": 3724 + }, + { + "epoch": 0.1027809489647751, + "grad_norm": 0.0052732862532138824, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 3725 + }, + { + "epoch": 0.10280854116583947, + "grad_norm": 0.009514648467302322, + "learning_rate": 0.001, + "loss": 0.3976, + "step": 3726 + }, + { + "epoch": 0.10283613336690385, + "grad_norm": 0.0024788815062493086, + "learning_rate": 0.001, + "loss": 0.4052, + "step": 3727 + }, + { + "epoch": 0.10286372556796822, + "grad_norm": 0.003208071691915393, + "learning_rate": 0.001, + "loss": 0.4126, + "step": 3728 + }, + { + "epoch": 0.10289131776903258, + "grad_norm": 0.002506793709471822, + "learning_rate": 0.001, + "loss": 0.4013, + "step": 3729 + }, + { + "epoch": 0.10291890997009695, + "grad_norm": 0.005780140403658152, + "learning_rate": 0.001, + "loss": 0.3869, + "step": 3730 + }, + { + "epoch": 0.10294650217116132, + "grad_norm": 0.005190226249396801, + "learning_rate": 0.001, + "loss": 0.4279, + "step": 3731 + }, + { + "epoch": 0.1029740943722257, + "grad_norm": 0.005167331546545029, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 3732 + }, + { + "epoch": 0.10300168657329006, + "grad_norm": 0.003800647798925638, + "learning_rate": 0.001, + "loss": 0.3657, + "step": 3733 + }, + { + "epoch": 0.10302927877435443, + "grad_norm": 0.004843884147703648, + "learning_rate": 0.001, + "loss": 0.4188, + "step": 3734 + }, + { + "epoch": 0.1030568709754188, + "grad_norm": 0.0037740804255008698, + "learning_rate": 0.001, + "loss": 0.3795, + "step": 3735 + }, + { + "epoch": 0.10308446317648316, + "grad_norm": 0.004264209885150194, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 3736 + }, + { + "epoch": 0.10311205537754754, + "grad_norm": 0.007586845196783543, + "learning_rate": 0.001, + "loss": 0.3892, + "step": 3737 + }, + { + "epoch": 0.10313964757861191, + "grad_norm": 0.005896701943129301, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 3738 + }, + { + "epoch": 0.10316723977967628, + "grad_norm": 0.004077561665326357, + "learning_rate": 0.001, + "loss": 0.442, + "step": 3739 + }, + { + "epoch": 0.10319483198074064, + "grad_norm": 0.003110091434791684, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 3740 + }, + { + "epoch": 0.10322242418180501, + "grad_norm": 0.0023520744871348143, + "learning_rate": 0.001, + "loss": 0.428, + "step": 3741 + }, + { + "epoch": 0.10325001638286938, + "grad_norm": 0.006090542767196894, + "learning_rate": 0.001, + "loss": 0.3767, + "step": 3742 + }, + { + "epoch": 0.10327760858393376, + "grad_norm": 0.002889704890549183, + "learning_rate": 0.001, + "loss": 0.4024, + "step": 3743 + }, + { + "epoch": 0.10330520078499812, + "grad_norm": 0.0024959116708487272, + "learning_rate": 0.001, + "loss": 0.3798, + "step": 3744 + }, + { + "epoch": 0.10333279298606249, + "grad_norm": 0.002503210911527276, + "learning_rate": 0.001, + "loss": 0.4306, + "step": 3745 + }, + { + "epoch": 0.10336038518712685, + "grad_norm": 0.0028133681043982506, + "learning_rate": 0.001, + "loss": 0.4008, + "step": 3746 + }, + { + "epoch": 0.10338797738819122, + "grad_norm": 0.0032689927611500025, + "learning_rate": 0.001, + "loss": 0.3972, + "step": 3747 + }, + { + "epoch": 0.1034155695892556, + "grad_norm": 0.002301878994330764, + "learning_rate": 0.001, + "loss": 0.4217, + "step": 3748 + }, + { + "epoch": 0.10344316179031997, + "grad_norm": 0.0024407797027379274, + "learning_rate": 0.001, + "loss": 0.376, + "step": 3749 + }, + { + "epoch": 0.10347075399138433, + "grad_norm": 0.0035565400030463934, + "learning_rate": 0.001, + "loss": 0.3959, + "step": 3750 + }, + { + "epoch": 0.1034983461924487, + "grad_norm": 0.002976832212880254, + "learning_rate": 0.001, + "loss": 0.3942, + "step": 3751 + }, + { + "epoch": 0.10352593839351307, + "grad_norm": 0.0027037430554628372, + "learning_rate": 0.001, + "loss": 0.4186, + "step": 3752 + }, + { + "epoch": 0.10355353059457745, + "grad_norm": 0.004547620192170143, + "learning_rate": 0.001, + "loss": 0.4047, + "step": 3753 + }, + { + "epoch": 0.10358112279564181, + "grad_norm": 0.0025993280578404665, + "learning_rate": 0.001, + "loss": 0.4146, + "step": 3754 + }, + { + "epoch": 0.10360871499670618, + "grad_norm": 0.0020117738749831915, + "learning_rate": 0.001, + "loss": 0.4048, + "step": 3755 + }, + { + "epoch": 0.10363630719777055, + "grad_norm": 0.003054060973227024, + "learning_rate": 0.001, + "loss": 0.4195, + "step": 3756 + }, + { + "epoch": 0.10366389939883491, + "grad_norm": 0.0028975980821996927, + "learning_rate": 0.001, + "loss": 0.3869, + "step": 3757 + }, + { + "epoch": 0.1036914915998993, + "grad_norm": 0.004843092989176512, + "learning_rate": 0.001, + "loss": 0.3785, + "step": 3758 + }, + { + "epoch": 0.10371908380096366, + "grad_norm": 0.003735753009095788, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 3759 + }, + { + "epoch": 0.10374667600202803, + "grad_norm": 0.0024528366047888994, + "learning_rate": 0.001, + "loss": 0.4391, + "step": 3760 + }, + { + "epoch": 0.10377426820309239, + "grad_norm": 0.003306907368823886, + "learning_rate": 0.001, + "loss": 0.3829, + "step": 3761 + }, + { + "epoch": 0.10380186040415676, + "grad_norm": 0.0029531391337513924, + "learning_rate": 0.001, + "loss": 0.3595, + "step": 3762 + }, + { + "epoch": 0.10382945260522114, + "grad_norm": 0.004337473772466183, + "learning_rate": 0.001, + "loss": 0.4374, + "step": 3763 + }, + { + "epoch": 0.1038570448062855, + "grad_norm": 0.0033757942728698254, + "learning_rate": 0.001, + "loss": 0.386, + "step": 3764 + }, + { + "epoch": 0.10388463700734987, + "grad_norm": 0.004451198037713766, + "learning_rate": 0.001, + "loss": 0.4011, + "step": 3765 + }, + { + "epoch": 0.10391222920841424, + "grad_norm": 0.0029722759500145912, + "learning_rate": 0.001, + "loss": 0.3771, + "step": 3766 + }, + { + "epoch": 0.1039398214094786, + "grad_norm": 0.003191061783581972, + "learning_rate": 0.001, + "loss": 0.4029, + "step": 3767 + }, + { + "epoch": 0.10396741361054299, + "grad_norm": 0.0037221303209662437, + "learning_rate": 0.001, + "loss": 0.3659, + "step": 3768 + }, + { + "epoch": 0.10399500581160735, + "grad_norm": 0.0024105177726596594, + "learning_rate": 0.001, + "loss": 0.421, + "step": 3769 + }, + { + "epoch": 0.10402259801267172, + "grad_norm": 0.002493768697604537, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 3770 + }, + { + "epoch": 0.10405019021373609, + "grad_norm": 0.0025503532961010933, + "learning_rate": 0.001, + "loss": 0.3833, + "step": 3771 + }, + { + "epoch": 0.10407778241480045, + "grad_norm": 0.0032149364706128836, + "learning_rate": 0.001, + "loss": 0.3904, + "step": 3772 + }, + { + "epoch": 0.10410537461586483, + "grad_norm": 0.004015072248876095, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 3773 + }, + { + "epoch": 0.1041329668169292, + "grad_norm": 0.008095541037619114, + "learning_rate": 0.001, + "loss": 0.37, + "step": 3774 + }, + { + "epoch": 0.10416055901799356, + "grad_norm": 0.0025133301969617605, + "learning_rate": 0.001, + "loss": 0.4088, + "step": 3775 + }, + { + "epoch": 0.10418815121905793, + "grad_norm": 0.0035531746689230204, + "learning_rate": 0.001, + "loss": 0.379, + "step": 3776 + }, + { + "epoch": 0.1042157434201223, + "grad_norm": 0.003656880697235465, + "learning_rate": 0.001, + "loss": 0.388, + "step": 3777 + }, + { + "epoch": 0.10424333562118668, + "grad_norm": 0.005002745892852545, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 3778 + }, + { + "epoch": 0.10427092782225104, + "grad_norm": 0.0031288950704038143, + "learning_rate": 0.001, + "loss": 0.4222, + "step": 3779 + }, + { + "epoch": 0.10429852002331541, + "grad_norm": 0.0030303194653242826, + "learning_rate": 0.001, + "loss": 0.387, + "step": 3780 + }, + { + "epoch": 0.10432611222437978, + "grad_norm": 0.004291849210858345, + "learning_rate": 0.001, + "loss": 0.3933, + "step": 3781 + }, + { + "epoch": 0.10435370442544414, + "grad_norm": 0.003425056580454111, + "learning_rate": 0.001, + "loss": 0.3847, + "step": 3782 + }, + { + "epoch": 0.10438129662650852, + "grad_norm": 0.0025096056051552296, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 3783 + }, + { + "epoch": 0.10440888882757289, + "grad_norm": 0.002500693080946803, + "learning_rate": 0.001, + "loss": 0.403, + "step": 3784 + }, + { + "epoch": 0.10443648102863726, + "grad_norm": 0.003590058069676161, + "learning_rate": 0.001, + "loss": 0.3998, + "step": 3785 + }, + { + "epoch": 0.10446407322970162, + "grad_norm": 0.004308347124606371, + "learning_rate": 0.001, + "loss": 0.4492, + "step": 3786 + }, + { + "epoch": 0.10449166543076599, + "grad_norm": 0.002307620132341981, + "learning_rate": 0.001, + "loss": 0.4146, + "step": 3787 + }, + { + "epoch": 0.10451925763183036, + "grad_norm": 0.0036616444122046232, + "learning_rate": 0.001, + "loss": 0.3787, + "step": 3788 + }, + { + "epoch": 0.10454684983289474, + "grad_norm": 0.0035604690201580524, + "learning_rate": 0.001, + "loss": 0.3597, + "step": 3789 + }, + { + "epoch": 0.1045744420339591, + "grad_norm": 0.00397746916860342, + "learning_rate": 0.001, + "loss": 0.3954, + "step": 3790 + }, + { + "epoch": 0.10460203423502347, + "grad_norm": 0.009449174627661705, + "learning_rate": 0.001, + "loss": 0.3521, + "step": 3791 + }, + { + "epoch": 0.10462962643608784, + "grad_norm": 0.0032818394247442484, + "learning_rate": 0.001, + "loss": 0.3655, + "step": 3792 + }, + { + "epoch": 0.1046572186371522, + "grad_norm": 0.0032766228541731834, + "learning_rate": 0.001, + "loss": 0.4209, + "step": 3793 + }, + { + "epoch": 0.10468481083821658, + "grad_norm": 0.002585778711363673, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 3794 + }, + { + "epoch": 0.10471240303928095, + "grad_norm": 0.0029392503201961517, + "learning_rate": 0.001, + "loss": 0.4474, + "step": 3795 + }, + { + "epoch": 0.10473999524034532, + "grad_norm": 0.0027161298785358667, + "learning_rate": 0.001, + "loss": 0.4161, + "step": 3796 + }, + { + "epoch": 0.10476758744140968, + "grad_norm": 0.0034348920453339815, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 3797 + }, + { + "epoch": 0.10479517964247405, + "grad_norm": 0.007731628604233265, + "learning_rate": 0.001, + "loss": 0.3592, + "step": 3798 + }, + { + "epoch": 0.10482277184353843, + "grad_norm": 0.0029055611230432987, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 3799 + }, + { + "epoch": 0.1048503640446028, + "grad_norm": 0.0027976164128631353, + "learning_rate": 0.001, + "loss": 0.388, + "step": 3800 + }, + { + "epoch": 0.10487795624566716, + "grad_norm": 0.003006401937454939, + "learning_rate": 0.001, + "loss": 0.3864, + "step": 3801 + }, + { + "epoch": 0.10490554844673153, + "grad_norm": 0.002237136010080576, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 3802 + }, + { + "epoch": 0.1049331406477959, + "grad_norm": 0.003247616346925497, + "learning_rate": 0.001, + "loss": 0.381, + "step": 3803 + }, + { + "epoch": 0.10496073284886027, + "grad_norm": 0.002951403148472309, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 3804 + }, + { + "epoch": 0.10498832504992464, + "grad_norm": 0.002603907370939851, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 3805 + }, + { + "epoch": 0.10501591725098901, + "grad_norm": 0.0022911475971341133, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 3806 + }, + { + "epoch": 0.10504350945205337, + "grad_norm": 0.002795920707285404, + "learning_rate": 0.001, + "loss": 0.3948, + "step": 3807 + }, + { + "epoch": 0.10507110165311774, + "grad_norm": 0.0031260910909622908, + "learning_rate": 0.001, + "loss": 0.386, + "step": 3808 + }, + { + "epoch": 0.10509869385418212, + "grad_norm": 0.003506281180307269, + "learning_rate": 0.001, + "loss": 0.3794, + "step": 3809 + }, + { + "epoch": 0.10512628605524649, + "grad_norm": 0.0027451482601463795, + "learning_rate": 0.001, + "loss": 0.3876, + "step": 3810 + }, + { + "epoch": 0.10515387825631085, + "grad_norm": 0.0057808831334114075, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 3811 + }, + { + "epoch": 0.10518147045737522, + "grad_norm": 0.003006263403221965, + "learning_rate": 0.001, + "loss": 0.3698, + "step": 3812 + }, + { + "epoch": 0.10520906265843959, + "grad_norm": 0.004194669425487518, + "learning_rate": 0.001, + "loss": 0.4142, + "step": 3813 + }, + { + "epoch": 0.10523665485950397, + "grad_norm": 0.004824482370167971, + "learning_rate": 0.001, + "loss": 0.4101, + "step": 3814 + }, + { + "epoch": 0.10526424706056833, + "grad_norm": 0.0029831102583557367, + "learning_rate": 0.001, + "loss": 0.4164, + "step": 3815 + }, + { + "epoch": 0.1052918392616327, + "grad_norm": 0.004361843690276146, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 3816 + }, + { + "epoch": 0.10531943146269707, + "grad_norm": 0.002336485544219613, + "learning_rate": 0.001, + "loss": 0.4153, + "step": 3817 + }, + { + "epoch": 0.10534702366376143, + "grad_norm": 0.0023848165292292833, + "learning_rate": 0.001, + "loss": 0.4001, + "step": 3818 + }, + { + "epoch": 0.10537461586482581, + "grad_norm": 0.0029371667187660933, + "learning_rate": 0.001, + "loss": 0.361, + "step": 3819 + }, + { + "epoch": 0.10540220806589018, + "grad_norm": 0.003010603366419673, + "learning_rate": 0.001, + "loss": 0.3866, + "step": 3820 + }, + { + "epoch": 0.10542980026695455, + "grad_norm": 0.007170096971094608, + "learning_rate": 0.001, + "loss": 0.4616, + "step": 3821 + }, + { + "epoch": 0.10545739246801891, + "grad_norm": 0.002445077523589134, + "learning_rate": 0.001, + "loss": 0.4147, + "step": 3822 + }, + { + "epoch": 0.10548498466908328, + "grad_norm": 0.0034536407329142094, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 3823 + }, + { + "epoch": 0.10551257687014766, + "grad_norm": 0.0021885402966290712, + "learning_rate": 0.001, + "loss": 0.4053, + "step": 3824 + }, + { + "epoch": 0.10554016907121203, + "grad_norm": 0.0027080499567091465, + "learning_rate": 0.001, + "loss": 0.416, + "step": 3825 + }, + { + "epoch": 0.10556776127227639, + "grad_norm": 0.0036047815810889006, + "learning_rate": 0.001, + "loss": 0.3697, + "step": 3826 + }, + { + "epoch": 0.10559535347334076, + "grad_norm": 0.0027917807456105947, + "learning_rate": 0.001, + "loss": 0.3715, + "step": 3827 + }, + { + "epoch": 0.10562294567440512, + "grad_norm": 0.0034859776496887207, + "learning_rate": 0.001, + "loss": 0.3811, + "step": 3828 + }, + { + "epoch": 0.1056505378754695, + "grad_norm": 0.0031901709735393524, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 3829 + }, + { + "epoch": 0.10567813007653387, + "grad_norm": 0.004165272694081068, + "learning_rate": 0.001, + "loss": 0.3996, + "step": 3830 + }, + { + "epoch": 0.10570572227759824, + "grad_norm": 0.0031863113399595022, + "learning_rate": 0.001, + "loss": 0.3956, + "step": 3831 + }, + { + "epoch": 0.1057333144786626, + "grad_norm": 0.0035512226168066263, + "learning_rate": 0.001, + "loss": 0.3881, + "step": 3832 + }, + { + "epoch": 0.10576090667972697, + "grad_norm": 0.0030755288898944855, + "learning_rate": 0.001, + "loss": 0.3944, + "step": 3833 + }, + { + "epoch": 0.10578849888079135, + "grad_norm": 0.0031162879895418882, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 3834 + }, + { + "epoch": 0.10581609108185572, + "grad_norm": 0.0038108036387711763, + "learning_rate": 0.001, + "loss": 0.4228, + "step": 3835 + }, + { + "epoch": 0.10584368328292008, + "grad_norm": 0.0033550935331732035, + "learning_rate": 0.001, + "loss": 0.3907, + "step": 3836 + }, + { + "epoch": 0.10587127548398445, + "grad_norm": 0.0031523280777037144, + "learning_rate": 0.001, + "loss": 0.4011, + "step": 3837 + }, + { + "epoch": 0.10589886768504882, + "grad_norm": 0.0038961879909038544, + "learning_rate": 0.001, + "loss": 0.395, + "step": 3838 + }, + { + "epoch": 0.10592645988611318, + "grad_norm": 0.005496688652783632, + "learning_rate": 0.001, + "loss": 0.3864, + "step": 3839 + }, + { + "epoch": 0.10595405208717756, + "grad_norm": 0.0032198880799114704, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 3840 + }, + { + "epoch": 0.10598164428824193, + "grad_norm": 0.003234037896618247, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 3841 + }, + { + "epoch": 0.1060092364893063, + "grad_norm": 0.002870423486456275, + "learning_rate": 0.001, + "loss": 0.3869, + "step": 3842 + }, + { + "epoch": 0.10603682869037066, + "grad_norm": 0.004519653040915728, + "learning_rate": 0.001, + "loss": 0.4125, + "step": 3843 + }, + { + "epoch": 0.10606442089143503, + "grad_norm": 0.003621830837801099, + "learning_rate": 0.001, + "loss": 0.3895, + "step": 3844 + }, + { + "epoch": 0.10609201309249941, + "grad_norm": 0.0029260909650474787, + "learning_rate": 0.001, + "loss": 0.4112, + "step": 3845 + }, + { + "epoch": 0.10611960529356378, + "grad_norm": 0.0031509913969784975, + "learning_rate": 0.001, + "loss": 0.3623, + "step": 3846 + }, + { + "epoch": 0.10614719749462814, + "grad_norm": 0.006669745780527592, + "learning_rate": 0.001, + "loss": 0.4215, + "step": 3847 + }, + { + "epoch": 0.10617478969569251, + "grad_norm": 0.003406877163797617, + "learning_rate": 0.001, + "loss": 0.4188, + "step": 3848 + }, + { + "epoch": 0.10620238189675688, + "grad_norm": 0.008724176324903965, + "learning_rate": 0.001, + "loss": 0.4219, + "step": 3849 + }, + { + "epoch": 0.10622997409782126, + "grad_norm": 0.0026642687153071165, + "learning_rate": 0.001, + "loss": 0.4441, + "step": 3850 + }, + { + "epoch": 0.10625756629888562, + "grad_norm": 0.003902031574398279, + "learning_rate": 0.001, + "loss": 0.4033, + "step": 3851 + }, + { + "epoch": 0.10628515849994999, + "grad_norm": 0.0034857727587223053, + "learning_rate": 0.001, + "loss": 0.3652, + "step": 3852 + }, + { + "epoch": 0.10631275070101436, + "grad_norm": 0.0022453153505921364, + "learning_rate": 0.001, + "loss": 0.4164, + "step": 3853 + }, + { + "epoch": 0.10634034290207872, + "grad_norm": 0.002694975584745407, + "learning_rate": 0.001, + "loss": 0.4143, + "step": 3854 + }, + { + "epoch": 0.1063679351031431, + "grad_norm": 0.005093062296509743, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 3855 + }, + { + "epoch": 0.10639552730420747, + "grad_norm": 0.004576206207275391, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 3856 + }, + { + "epoch": 0.10642311950527183, + "grad_norm": 0.0031380197033286095, + "learning_rate": 0.001, + "loss": 0.3979, + "step": 3857 + }, + { + "epoch": 0.1064507117063362, + "grad_norm": 0.003493053140118718, + "learning_rate": 0.001, + "loss": 0.3976, + "step": 3858 + }, + { + "epoch": 0.10647830390740057, + "grad_norm": 0.0024905947502702475, + "learning_rate": 0.001, + "loss": 0.4554, + "step": 3859 + }, + { + "epoch": 0.10650589610846495, + "grad_norm": 0.00544704170897603, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 3860 + }, + { + "epoch": 0.10653348830952931, + "grad_norm": 0.0029771511908620596, + "learning_rate": 0.001, + "loss": 0.3882, + "step": 3861 + }, + { + "epoch": 0.10656108051059368, + "grad_norm": 0.0027052282821387053, + "learning_rate": 0.001, + "loss": 0.3955, + "step": 3862 + }, + { + "epoch": 0.10658867271165805, + "grad_norm": 0.003082839772105217, + "learning_rate": 0.001, + "loss": 0.3908, + "step": 3863 + }, + { + "epoch": 0.10661626491272241, + "grad_norm": 0.003736154641956091, + "learning_rate": 0.001, + "loss": 0.3857, + "step": 3864 + }, + { + "epoch": 0.1066438571137868, + "grad_norm": 0.0027385384310036898, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 3865 + }, + { + "epoch": 0.10667144931485116, + "grad_norm": 0.002778218826279044, + "learning_rate": 0.001, + "loss": 0.4041, + "step": 3866 + }, + { + "epoch": 0.10669904151591553, + "grad_norm": 0.01646745204925537, + "learning_rate": 0.001, + "loss": 0.3694, + "step": 3867 + }, + { + "epoch": 0.1067266337169799, + "grad_norm": 0.0036807360593229532, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 3868 + }, + { + "epoch": 0.10675422591804426, + "grad_norm": 0.006385852582752705, + "learning_rate": 0.001, + "loss": 0.3896, + "step": 3869 + }, + { + "epoch": 0.10678181811910864, + "grad_norm": 0.0027478185947984457, + "learning_rate": 0.001, + "loss": 0.4045, + "step": 3870 + }, + { + "epoch": 0.106809410320173, + "grad_norm": 0.00484267994761467, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 3871 + }, + { + "epoch": 0.10683700252123737, + "grad_norm": 0.0029064714908599854, + "learning_rate": 0.001, + "loss": 0.3901, + "step": 3872 + }, + { + "epoch": 0.10686459472230174, + "grad_norm": 0.004293619655072689, + "learning_rate": 0.001, + "loss": 0.4189, + "step": 3873 + }, + { + "epoch": 0.1068921869233661, + "grad_norm": 0.009066743776202202, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 3874 + }, + { + "epoch": 0.10691977912443049, + "grad_norm": 0.004455687943845987, + "learning_rate": 0.001, + "loss": 0.3816, + "step": 3875 + }, + { + "epoch": 0.10694737132549485, + "grad_norm": 0.008621391840279102, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 3876 + }, + { + "epoch": 0.10697496352655922, + "grad_norm": 0.0032543812412768602, + "learning_rate": 0.001, + "loss": 0.394, + "step": 3877 + }, + { + "epoch": 0.10700255572762359, + "grad_norm": 0.003056267276406288, + "learning_rate": 0.001, + "loss": 0.4099, + "step": 3878 + }, + { + "epoch": 0.10703014792868795, + "grad_norm": 0.003457016311585903, + "learning_rate": 0.001, + "loss": 0.3706, + "step": 3879 + }, + { + "epoch": 0.10705774012975233, + "grad_norm": 0.0029117148369550705, + "learning_rate": 0.001, + "loss": 0.3916, + "step": 3880 + }, + { + "epoch": 0.1070853323308167, + "grad_norm": 0.003112402046099305, + "learning_rate": 0.001, + "loss": 0.394, + "step": 3881 + }, + { + "epoch": 0.10711292453188107, + "grad_norm": 0.003459386760368943, + "learning_rate": 0.001, + "loss": 0.382, + "step": 3882 + }, + { + "epoch": 0.10714051673294543, + "grad_norm": 0.007629405707120895, + "learning_rate": 0.001, + "loss": 0.3746, + "step": 3883 + }, + { + "epoch": 0.1071681089340098, + "grad_norm": 0.007017344702035189, + "learning_rate": 0.001, + "loss": 0.3503, + "step": 3884 + }, + { + "epoch": 0.10719570113507416, + "grad_norm": 0.006143512669950724, + "learning_rate": 0.001, + "loss": 0.4128, + "step": 3885 + }, + { + "epoch": 0.10722329333613854, + "grad_norm": 0.007819131016731262, + "learning_rate": 0.001, + "loss": 0.4314, + "step": 3886 + }, + { + "epoch": 0.10725088553720291, + "grad_norm": 0.0033061886206269264, + "learning_rate": 0.001, + "loss": 0.44, + "step": 3887 + }, + { + "epoch": 0.10727847773826728, + "grad_norm": 0.0026390962302684784, + "learning_rate": 0.001, + "loss": 0.3773, + "step": 3888 + }, + { + "epoch": 0.10730606993933164, + "grad_norm": 0.003349416656419635, + "learning_rate": 0.001, + "loss": 0.4298, + "step": 3889 + }, + { + "epoch": 0.10733366214039601, + "grad_norm": 0.004052475094795227, + "learning_rate": 0.001, + "loss": 0.3929, + "step": 3890 + }, + { + "epoch": 0.10736125434146039, + "grad_norm": 0.008710183203220367, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 3891 + }, + { + "epoch": 0.10738884654252476, + "grad_norm": 0.0028958211187273264, + "learning_rate": 0.001, + "loss": 0.4112, + "step": 3892 + }, + { + "epoch": 0.10741643874358912, + "grad_norm": 0.002961238846182823, + "learning_rate": 0.001, + "loss": 0.3816, + "step": 3893 + }, + { + "epoch": 0.10744403094465349, + "grad_norm": 0.0025948896072804928, + "learning_rate": 0.001, + "loss": 0.4133, + "step": 3894 + }, + { + "epoch": 0.10747162314571786, + "grad_norm": 0.0037905005738139153, + "learning_rate": 0.001, + "loss": 0.3927, + "step": 3895 + }, + { + "epoch": 0.10749921534678224, + "grad_norm": 0.003213467774912715, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 3896 + }, + { + "epoch": 0.1075268075478466, + "grad_norm": 0.002972138812765479, + "learning_rate": 0.001, + "loss": 0.4206, + "step": 3897 + }, + { + "epoch": 0.10755439974891097, + "grad_norm": 0.0030399637762457132, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 3898 + }, + { + "epoch": 0.10758199194997534, + "grad_norm": 0.002291029319167137, + "learning_rate": 0.001, + "loss": 0.382, + "step": 3899 + }, + { + "epoch": 0.1076095841510397, + "grad_norm": 0.002236071042716503, + "learning_rate": 0.001, + "loss": 0.4434, + "step": 3900 + }, + { + "epoch": 0.10763717635210408, + "grad_norm": 0.0024943517055362463, + "learning_rate": 0.001, + "loss": 0.3686, + "step": 3901 + }, + { + "epoch": 0.10766476855316845, + "grad_norm": 0.002602664055302739, + "learning_rate": 0.001, + "loss": 0.4001, + "step": 3902 + }, + { + "epoch": 0.10769236075423282, + "grad_norm": 0.0031983822118490934, + "learning_rate": 0.001, + "loss": 0.4126, + "step": 3903 + }, + { + "epoch": 0.10771995295529718, + "grad_norm": 0.0027117652352899313, + "learning_rate": 0.001, + "loss": 0.4063, + "step": 3904 + }, + { + "epoch": 0.10774754515636155, + "grad_norm": 0.002372644143179059, + "learning_rate": 0.001, + "loss": 0.3827, + "step": 3905 + }, + { + "epoch": 0.10777513735742593, + "grad_norm": 0.0038946103304624557, + "learning_rate": 0.001, + "loss": 0.3759, + "step": 3906 + }, + { + "epoch": 0.1078027295584903, + "grad_norm": 0.00335517106577754, + "learning_rate": 0.001, + "loss": 0.4195, + "step": 3907 + }, + { + "epoch": 0.10783032175955466, + "grad_norm": 0.015401921235024929, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 3908 + }, + { + "epoch": 0.10785791396061903, + "grad_norm": 0.003028671722859144, + "learning_rate": 0.001, + "loss": 0.4093, + "step": 3909 + }, + { + "epoch": 0.1078855061616834, + "grad_norm": 0.003672859398648143, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 3910 + }, + { + "epoch": 0.10791309836274778, + "grad_norm": 0.0038496828638017178, + "learning_rate": 0.001, + "loss": 0.429, + "step": 3911 + }, + { + "epoch": 0.10794069056381214, + "grad_norm": 0.0068849422968924046, + "learning_rate": 0.001, + "loss": 0.4226, + "step": 3912 + }, + { + "epoch": 0.10796828276487651, + "grad_norm": 0.0074515510350465775, + "learning_rate": 0.001, + "loss": 0.3765, + "step": 3913 + }, + { + "epoch": 0.10799587496594087, + "grad_norm": 0.0070841130800545216, + "learning_rate": 0.001, + "loss": 0.4285, + "step": 3914 + }, + { + "epoch": 0.10802346716700524, + "grad_norm": 0.0054721771739423275, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 3915 + }, + { + "epoch": 0.10805105936806962, + "grad_norm": 0.004237642977386713, + "learning_rate": 0.001, + "loss": 0.4349, + "step": 3916 + }, + { + "epoch": 0.10807865156913399, + "grad_norm": 0.004252060316503048, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 3917 + }, + { + "epoch": 0.10810624377019835, + "grad_norm": 0.0034951562993228436, + "learning_rate": 0.001, + "loss": 0.3818, + "step": 3918 + }, + { + "epoch": 0.10813383597126272, + "grad_norm": 0.004239407833665609, + "learning_rate": 0.001, + "loss": 0.4368, + "step": 3919 + }, + { + "epoch": 0.10816142817232709, + "grad_norm": 0.0040299855172634125, + "learning_rate": 0.001, + "loss": 0.36, + "step": 3920 + }, + { + "epoch": 0.10818902037339147, + "grad_norm": 0.0028297097887843847, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 3921 + }, + { + "epoch": 0.10821661257445583, + "grad_norm": 0.0039049547631293535, + "learning_rate": 0.001, + "loss": 0.4299, + "step": 3922 + }, + { + "epoch": 0.1082442047755202, + "grad_norm": 0.0031370699871331453, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 3923 + }, + { + "epoch": 0.10827179697658457, + "grad_norm": 0.004705764818936586, + "learning_rate": 0.001, + "loss": 0.3727, + "step": 3924 + }, + { + "epoch": 0.10829938917764893, + "grad_norm": 0.0024270943831652403, + "learning_rate": 0.001, + "loss": 0.3855, + "step": 3925 + }, + { + "epoch": 0.10832698137871331, + "grad_norm": 0.003461951157078147, + "learning_rate": 0.001, + "loss": 0.4069, + "step": 3926 + }, + { + "epoch": 0.10835457357977768, + "grad_norm": 0.008057190105319023, + "learning_rate": 0.001, + "loss": 0.3887, + "step": 3927 + }, + { + "epoch": 0.10838216578084205, + "grad_norm": 0.0031221345998346806, + "learning_rate": 0.001, + "loss": 0.3631, + "step": 3928 + }, + { + "epoch": 0.10840975798190641, + "grad_norm": 0.0029621014837175608, + "learning_rate": 0.001, + "loss": 0.3889, + "step": 3929 + }, + { + "epoch": 0.10843735018297078, + "grad_norm": 0.002558749634772539, + "learning_rate": 0.001, + "loss": 0.3698, + "step": 3930 + }, + { + "epoch": 0.10846494238403515, + "grad_norm": 0.002620436018332839, + "learning_rate": 0.001, + "loss": 0.4195, + "step": 3931 + }, + { + "epoch": 0.10849253458509953, + "grad_norm": 0.0027069852221757174, + "learning_rate": 0.001, + "loss": 0.4048, + "step": 3932 + }, + { + "epoch": 0.10852012678616389, + "grad_norm": 0.003972397185862064, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 3933 + }, + { + "epoch": 0.10854771898722826, + "grad_norm": 0.0021818478126078844, + "learning_rate": 0.001, + "loss": 0.4113, + "step": 3934 + }, + { + "epoch": 0.10857531118829263, + "grad_norm": 0.0026342314667999744, + "learning_rate": 0.001, + "loss": 0.409, + "step": 3935 + }, + { + "epoch": 0.10860290338935699, + "grad_norm": 0.002321448177099228, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 3936 + }, + { + "epoch": 0.10863049559042137, + "grad_norm": 0.002941095968708396, + "learning_rate": 0.001, + "loss": 0.3933, + "step": 3937 + }, + { + "epoch": 0.10865808779148574, + "grad_norm": 0.0027615423314273357, + "learning_rate": 0.001, + "loss": 0.393, + "step": 3938 + }, + { + "epoch": 0.1086856799925501, + "grad_norm": 0.0035208980552852154, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 3939 + }, + { + "epoch": 0.10871327219361447, + "grad_norm": 0.0035143953282386065, + "learning_rate": 0.001, + "loss": 0.3768, + "step": 3940 + }, + { + "epoch": 0.10874086439467884, + "grad_norm": 0.003013983368873596, + "learning_rate": 0.001, + "loss": 0.3959, + "step": 3941 + }, + { + "epoch": 0.10876845659574322, + "grad_norm": 0.003047554986551404, + "learning_rate": 0.001, + "loss": 0.4309, + "step": 3942 + }, + { + "epoch": 0.10879604879680758, + "grad_norm": 0.0038127705920487642, + "learning_rate": 0.001, + "loss": 0.3845, + "step": 3943 + }, + { + "epoch": 0.10882364099787195, + "grad_norm": 0.003960581962019205, + "learning_rate": 0.001, + "loss": 0.3942, + "step": 3944 + }, + { + "epoch": 0.10885123319893632, + "grad_norm": 0.0032766389194875956, + "learning_rate": 0.001, + "loss": 0.4133, + "step": 3945 + }, + { + "epoch": 0.10887882540000068, + "grad_norm": 0.005258220247924328, + "learning_rate": 0.001, + "loss": 0.3556, + "step": 3946 + }, + { + "epoch": 0.10890641760106506, + "grad_norm": 0.0038592154160141945, + "learning_rate": 0.001, + "loss": 0.3538, + "step": 3947 + }, + { + "epoch": 0.10893400980212943, + "grad_norm": 0.004304266069084406, + "learning_rate": 0.001, + "loss": 0.3872, + "step": 3948 + }, + { + "epoch": 0.1089616020031938, + "grad_norm": 0.0032214997336268425, + "learning_rate": 0.001, + "loss": 0.378, + "step": 3949 + }, + { + "epoch": 0.10898919420425816, + "grad_norm": 0.0059113227762281895, + "learning_rate": 0.001, + "loss": 0.3923, + "step": 3950 + }, + { + "epoch": 0.10901678640532253, + "grad_norm": 0.0029449171852320433, + "learning_rate": 0.001, + "loss": 0.393, + "step": 3951 + }, + { + "epoch": 0.10904437860638691, + "grad_norm": 0.0030160732567310333, + "learning_rate": 0.001, + "loss": 0.3834, + "step": 3952 + }, + { + "epoch": 0.10907197080745128, + "grad_norm": 0.012647976167500019, + "learning_rate": 0.001, + "loss": 0.3665, + "step": 3953 + }, + { + "epoch": 0.10909956300851564, + "grad_norm": 0.0026820586062967777, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 3954 + }, + { + "epoch": 0.10912715520958001, + "grad_norm": 0.006842675618827343, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 3955 + }, + { + "epoch": 0.10915474741064438, + "grad_norm": 0.0030738625209778547, + "learning_rate": 0.001, + "loss": 0.3864, + "step": 3956 + }, + { + "epoch": 0.10918233961170876, + "grad_norm": 0.0023346368689090014, + "learning_rate": 0.001, + "loss": 0.3768, + "step": 3957 + }, + { + "epoch": 0.10920993181277312, + "grad_norm": 0.005200542975217104, + "learning_rate": 0.001, + "loss": 0.3889, + "step": 3958 + }, + { + "epoch": 0.10923752401383749, + "grad_norm": 0.0057869781740009785, + "learning_rate": 0.001, + "loss": 0.3675, + "step": 3959 + }, + { + "epoch": 0.10926511621490186, + "grad_norm": 0.0025842657778412104, + "learning_rate": 0.001, + "loss": 0.3964, + "step": 3960 + }, + { + "epoch": 0.10929270841596622, + "grad_norm": 0.002922437386587262, + "learning_rate": 0.001, + "loss": 0.4117, + "step": 3961 + }, + { + "epoch": 0.1093203006170306, + "grad_norm": 0.0038535622879862785, + "learning_rate": 0.001, + "loss": 0.4104, + "step": 3962 + }, + { + "epoch": 0.10934789281809497, + "grad_norm": 0.0034645821433514357, + "learning_rate": 0.001, + "loss": 0.364, + "step": 3963 + }, + { + "epoch": 0.10937548501915934, + "grad_norm": 0.0022498066537082195, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 3964 + }, + { + "epoch": 0.1094030772202237, + "grad_norm": 0.0035552133340388536, + "learning_rate": 0.001, + "loss": 0.4002, + "step": 3965 + }, + { + "epoch": 0.10943066942128807, + "grad_norm": 0.0029964253772050142, + "learning_rate": 0.001, + "loss": 0.4, + "step": 3966 + }, + { + "epoch": 0.10945826162235245, + "grad_norm": 0.002939821919426322, + "learning_rate": 0.001, + "loss": 0.3842, + "step": 3967 + }, + { + "epoch": 0.10948585382341681, + "grad_norm": 0.0021805204451084137, + "learning_rate": 0.001, + "loss": 0.4022, + "step": 3968 + }, + { + "epoch": 0.10951344602448118, + "grad_norm": 0.010238613933324814, + "learning_rate": 0.001, + "loss": 0.3945, + "step": 3969 + }, + { + "epoch": 0.10954103822554555, + "grad_norm": 0.00320064858533442, + "learning_rate": 0.001, + "loss": 0.4245, + "step": 3970 + }, + { + "epoch": 0.10956863042660991, + "grad_norm": 0.0038515296764671803, + "learning_rate": 0.001, + "loss": 0.3847, + "step": 3971 + }, + { + "epoch": 0.1095962226276743, + "grad_norm": 0.0033414731733500957, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 3972 + }, + { + "epoch": 0.10962381482873866, + "grad_norm": 0.002315077930688858, + "learning_rate": 0.001, + "loss": 0.4104, + "step": 3973 + }, + { + "epoch": 0.10965140702980303, + "grad_norm": 0.0025413348339498043, + "learning_rate": 0.001, + "loss": 0.4289, + "step": 3974 + }, + { + "epoch": 0.1096789992308674, + "grad_norm": 0.010324854403734207, + "learning_rate": 0.001, + "loss": 0.3901, + "step": 3975 + }, + { + "epoch": 0.10970659143193176, + "grad_norm": 0.009162775240838528, + "learning_rate": 0.001, + "loss": 0.3952, + "step": 3976 + }, + { + "epoch": 0.10973418363299613, + "grad_norm": 0.00687329052016139, + "learning_rate": 0.001, + "loss": 0.4187, + "step": 3977 + }, + { + "epoch": 0.10976177583406051, + "grad_norm": 0.009686012752354145, + "learning_rate": 0.001, + "loss": 0.3732, + "step": 3978 + }, + { + "epoch": 0.10978936803512487, + "grad_norm": 0.003329911269247532, + "learning_rate": 0.001, + "loss": 0.3573, + "step": 3979 + }, + { + "epoch": 0.10981696023618924, + "grad_norm": 0.003336430061608553, + "learning_rate": 0.001, + "loss": 0.4589, + "step": 3980 + }, + { + "epoch": 0.1098445524372536, + "grad_norm": 0.0021596092265099287, + "learning_rate": 0.001, + "loss": 0.4357, + "step": 3981 + }, + { + "epoch": 0.10987214463831797, + "grad_norm": 0.003178425133228302, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 3982 + }, + { + "epoch": 0.10989973683938235, + "grad_norm": 0.0019768651109188795, + "learning_rate": 0.001, + "loss": 0.3811, + "step": 3983 + }, + { + "epoch": 0.10992732904044672, + "grad_norm": 0.0026141603011637926, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 3984 + }, + { + "epoch": 0.10995492124151109, + "grad_norm": 0.00246097962372005, + "learning_rate": 0.001, + "loss": 0.3946, + "step": 3985 + }, + { + "epoch": 0.10998251344257545, + "grad_norm": 0.002583438763394952, + "learning_rate": 0.001, + "loss": 0.377, + "step": 3986 + }, + { + "epoch": 0.11001010564363982, + "grad_norm": 0.003509828122332692, + "learning_rate": 0.001, + "loss": 0.437, + "step": 3987 + }, + { + "epoch": 0.1100376978447042, + "grad_norm": 0.002548534655943513, + "learning_rate": 0.001, + "loss": 0.3785, + "step": 3988 + }, + { + "epoch": 0.11006529004576857, + "grad_norm": 0.002522020833566785, + "learning_rate": 0.001, + "loss": 0.379, + "step": 3989 + }, + { + "epoch": 0.11009288224683293, + "grad_norm": 0.0027163205668330193, + "learning_rate": 0.001, + "loss": 0.3933, + "step": 3990 + }, + { + "epoch": 0.1101204744478973, + "grad_norm": 0.0020943363197147846, + "learning_rate": 0.001, + "loss": 0.3713, + "step": 3991 + }, + { + "epoch": 0.11014806664896166, + "grad_norm": 0.002914323704317212, + "learning_rate": 0.001, + "loss": 0.4143, + "step": 3992 + }, + { + "epoch": 0.11017565885002605, + "grad_norm": 0.0023269762750715017, + "learning_rate": 0.001, + "loss": 0.4048, + "step": 3993 + }, + { + "epoch": 0.11020325105109041, + "grad_norm": 0.002514853375032544, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 3994 + }, + { + "epoch": 0.11023084325215478, + "grad_norm": 0.002852360252290964, + "learning_rate": 0.001, + "loss": 0.3768, + "step": 3995 + }, + { + "epoch": 0.11025843545321914, + "grad_norm": 0.00278305122628808, + "learning_rate": 0.001, + "loss": 0.399, + "step": 3996 + }, + { + "epoch": 0.11028602765428351, + "grad_norm": 0.0031666734721511602, + "learning_rate": 0.001, + "loss": 0.3916, + "step": 3997 + }, + { + "epoch": 0.11031361985534789, + "grad_norm": 0.006654566153883934, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 3998 + }, + { + "epoch": 0.11034121205641226, + "grad_norm": 0.002697068266570568, + "learning_rate": 0.001, + "loss": 0.4105, + "step": 3999 + }, + { + "epoch": 0.11036880425747662, + "grad_norm": 0.003131921635940671, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 4000 + }, + { + "epoch": 0.11036880425747662, + "eval_runtime": 24.9249, + "eval_samples_per_second": 1.284, + "eval_steps_per_second": 0.16, + "step": 4000 + }, + { + "epoch": 0.11039639645854099, + "grad_norm": 0.009388426318764687, + "learning_rate": 0.001, + "loss": 0.3925, + "step": 4001 + }, + { + "epoch": 0.11042398865960536, + "grad_norm": 0.002704363316297531, + "learning_rate": 0.001, + "loss": 0.3666, + "step": 4002 + }, + { + "epoch": 0.11045158086066974, + "grad_norm": 0.0023160416167229414, + "learning_rate": 0.001, + "loss": 0.4108, + "step": 4003 + }, + { + "epoch": 0.1104791730617341, + "grad_norm": 0.0025515344459563494, + "learning_rate": 0.001, + "loss": 0.4213, + "step": 4004 + }, + { + "epoch": 0.11050676526279847, + "grad_norm": 0.0036002611741423607, + "learning_rate": 0.001, + "loss": 0.4029, + "step": 4005 + }, + { + "epoch": 0.11053435746386284, + "grad_norm": 0.003040984272956848, + "learning_rate": 0.001, + "loss": 0.3941, + "step": 4006 + }, + { + "epoch": 0.1105619496649272, + "grad_norm": 0.00276200077496469, + "learning_rate": 0.001, + "loss": 0.3724, + "step": 4007 + }, + { + "epoch": 0.11058954186599158, + "grad_norm": 0.0028444197960197926, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 4008 + }, + { + "epoch": 0.11061713406705595, + "grad_norm": 0.0025242117699235678, + "learning_rate": 0.001, + "loss": 0.416, + "step": 4009 + }, + { + "epoch": 0.11064472626812032, + "grad_norm": 0.004357707686722279, + "learning_rate": 0.001, + "loss": 0.3811, + "step": 4010 + }, + { + "epoch": 0.11067231846918468, + "grad_norm": 0.01803600788116455, + "learning_rate": 0.001, + "loss": 0.4133, + "step": 4011 + }, + { + "epoch": 0.11069991067024905, + "grad_norm": 0.0028698742389678955, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 4012 + }, + { + "epoch": 0.11072750287131343, + "grad_norm": 0.0033235198352485895, + "learning_rate": 0.001, + "loss": 0.4138, + "step": 4013 + }, + { + "epoch": 0.1107550950723778, + "grad_norm": 0.0038872750010341406, + "learning_rate": 0.001, + "loss": 0.4174, + "step": 4014 + }, + { + "epoch": 0.11078268727344216, + "grad_norm": 0.002596453996375203, + "learning_rate": 0.001, + "loss": 0.4487, + "step": 4015 + }, + { + "epoch": 0.11081027947450653, + "grad_norm": 0.004450778476893902, + "learning_rate": 0.001, + "loss": 0.406, + "step": 4016 + }, + { + "epoch": 0.1108378716755709, + "grad_norm": 0.0031362990848720074, + "learning_rate": 0.001, + "loss": 0.405, + "step": 4017 + }, + { + "epoch": 0.11086546387663528, + "grad_norm": 0.0024727729614824057, + "learning_rate": 0.001, + "loss": 0.4156, + "step": 4018 + }, + { + "epoch": 0.11089305607769964, + "grad_norm": 0.003262293990701437, + "learning_rate": 0.001, + "loss": 0.3506, + "step": 4019 + }, + { + "epoch": 0.11092064827876401, + "grad_norm": 0.002469596453011036, + "learning_rate": 0.001, + "loss": 0.4224, + "step": 4020 + }, + { + "epoch": 0.11094824047982838, + "grad_norm": 0.0029696666169911623, + "learning_rate": 0.001, + "loss": 0.4081, + "step": 4021 + }, + { + "epoch": 0.11097583268089274, + "grad_norm": 0.0031070455443114042, + "learning_rate": 0.001, + "loss": 0.3808, + "step": 4022 + }, + { + "epoch": 0.11100342488195711, + "grad_norm": 0.002378135221078992, + "learning_rate": 0.001, + "loss": 0.3993, + "step": 4023 + }, + { + "epoch": 0.11103101708302149, + "grad_norm": 0.0020418402273207903, + "learning_rate": 0.001, + "loss": 0.3987, + "step": 4024 + }, + { + "epoch": 0.11105860928408585, + "grad_norm": 0.00241975043900311, + "learning_rate": 0.001, + "loss": 0.3835, + "step": 4025 + }, + { + "epoch": 0.11108620148515022, + "grad_norm": 0.0036054837983101606, + "learning_rate": 0.001, + "loss": 0.375, + "step": 4026 + }, + { + "epoch": 0.11111379368621459, + "grad_norm": 0.002273577032610774, + "learning_rate": 0.001, + "loss": 0.4122, + "step": 4027 + }, + { + "epoch": 0.11114138588727895, + "grad_norm": 0.008908730000257492, + "learning_rate": 0.001, + "loss": 0.4113, + "step": 4028 + }, + { + "epoch": 0.11116897808834333, + "grad_norm": 0.0027021903079003096, + "learning_rate": 0.001, + "loss": 0.3773, + "step": 4029 + }, + { + "epoch": 0.1111965702894077, + "grad_norm": 0.002809008816257119, + "learning_rate": 0.001, + "loss": 0.3948, + "step": 4030 + }, + { + "epoch": 0.11122416249047207, + "grad_norm": 0.004108759108930826, + "learning_rate": 0.001, + "loss": 0.3934, + "step": 4031 + }, + { + "epoch": 0.11125175469153643, + "grad_norm": 0.004110720008611679, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 4032 + }, + { + "epoch": 0.1112793468926008, + "grad_norm": 0.003450944786891341, + "learning_rate": 0.001, + "loss": 0.4001, + "step": 4033 + }, + { + "epoch": 0.11130693909366518, + "grad_norm": 0.0027327281422913074, + "learning_rate": 0.001, + "loss": 0.42, + "step": 4034 + }, + { + "epoch": 0.11133453129472955, + "grad_norm": 0.003815416479483247, + "learning_rate": 0.001, + "loss": 0.4287, + "step": 4035 + }, + { + "epoch": 0.11136212349579391, + "grad_norm": 0.0020829702261835337, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 4036 + }, + { + "epoch": 0.11138971569685828, + "grad_norm": 0.0024780267849564552, + "learning_rate": 0.001, + "loss": 0.4031, + "step": 4037 + }, + { + "epoch": 0.11141730789792265, + "grad_norm": 0.004010303877294064, + "learning_rate": 0.001, + "loss": 0.4179, + "step": 4038 + }, + { + "epoch": 0.11144490009898703, + "grad_norm": 0.0030929851345717907, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 4039 + }, + { + "epoch": 0.11147249230005139, + "grad_norm": 0.0041397190652787685, + "learning_rate": 0.001, + "loss": 0.3881, + "step": 4040 + }, + { + "epoch": 0.11150008450111576, + "grad_norm": 0.0039605977945029736, + "learning_rate": 0.001, + "loss": 0.3877, + "step": 4041 + }, + { + "epoch": 0.11152767670218013, + "grad_norm": 0.003413001075387001, + "learning_rate": 0.001, + "loss": 0.3726, + "step": 4042 + }, + { + "epoch": 0.11155526890324449, + "grad_norm": 0.01140381395816803, + "learning_rate": 0.001, + "loss": 0.3825, + "step": 4043 + }, + { + "epoch": 0.11158286110430887, + "grad_norm": 0.005510971415787935, + "learning_rate": 0.001, + "loss": 0.4163, + "step": 4044 + }, + { + "epoch": 0.11161045330537324, + "grad_norm": 0.007506036199629307, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 4045 + }, + { + "epoch": 0.1116380455064376, + "grad_norm": 0.007963884621858597, + "learning_rate": 0.001, + "loss": 0.3802, + "step": 4046 + }, + { + "epoch": 0.11166563770750197, + "grad_norm": 0.007050946820527315, + "learning_rate": 0.001, + "loss": 0.4122, + "step": 4047 + }, + { + "epoch": 0.11169322990856634, + "grad_norm": 0.00468503637239337, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 4048 + }, + { + "epoch": 0.11172082210963072, + "grad_norm": 0.003186695510521531, + "learning_rate": 0.001, + "loss": 0.4234, + "step": 4049 + }, + { + "epoch": 0.11174841431069509, + "grad_norm": 0.03650260344147682, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 4050 + }, + { + "epoch": 0.11177600651175945, + "grad_norm": 0.006556427571922541, + "learning_rate": 0.001, + "loss": 0.3908, + "step": 4051 + }, + { + "epoch": 0.11180359871282382, + "grad_norm": 0.003500849474221468, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 4052 + }, + { + "epoch": 0.11183119091388818, + "grad_norm": 0.005811590701341629, + "learning_rate": 0.001, + "loss": 0.4169, + "step": 4053 + }, + { + "epoch": 0.11185878311495256, + "grad_norm": 0.004504525102674961, + "learning_rate": 0.001, + "loss": 0.4123, + "step": 4054 + }, + { + "epoch": 0.11188637531601693, + "grad_norm": 0.00391266867518425, + "learning_rate": 0.001, + "loss": 0.3716, + "step": 4055 + }, + { + "epoch": 0.1119139675170813, + "grad_norm": 0.002217537024989724, + "learning_rate": 0.001, + "loss": 0.4385, + "step": 4056 + }, + { + "epoch": 0.11194155971814566, + "grad_norm": 0.0025836548302322626, + "learning_rate": 0.001, + "loss": 0.3797, + "step": 4057 + }, + { + "epoch": 0.11196915191921003, + "grad_norm": 0.0025196904316544533, + "learning_rate": 0.001, + "loss": 0.3861, + "step": 4058 + }, + { + "epoch": 0.11199674412027441, + "grad_norm": 0.0020293437410146, + "learning_rate": 0.001, + "loss": 0.4053, + "step": 4059 + }, + { + "epoch": 0.11202433632133878, + "grad_norm": 0.0022750168573111296, + "learning_rate": 0.001, + "loss": 0.4117, + "step": 4060 + }, + { + "epoch": 0.11205192852240314, + "grad_norm": 0.0026435197796672583, + "learning_rate": 0.001, + "loss": 0.4171, + "step": 4061 + }, + { + "epoch": 0.11207952072346751, + "grad_norm": 0.0026955234352499247, + "learning_rate": 0.001, + "loss": 0.4178, + "step": 4062 + }, + { + "epoch": 0.11210711292453188, + "grad_norm": 0.0022352158557623625, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 4063 + }, + { + "epoch": 0.11213470512559626, + "grad_norm": 0.00261326739564538, + "learning_rate": 0.001, + "loss": 0.431, + "step": 4064 + }, + { + "epoch": 0.11216229732666062, + "grad_norm": 0.0022083420772105455, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 4065 + }, + { + "epoch": 0.11218988952772499, + "grad_norm": 0.002633447526022792, + "learning_rate": 0.001, + "loss": 0.406, + "step": 4066 + }, + { + "epoch": 0.11221748172878936, + "grad_norm": 0.00373605964705348, + "learning_rate": 0.001, + "loss": 0.4227, + "step": 4067 + }, + { + "epoch": 0.11224507392985372, + "grad_norm": 0.00297834281809628, + "learning_rate": 0.001, + "loss": 0.3889, + "step": 4068 + }, + { + "epoch": 0.1122726661309181, + "grad_norm": 0.015468292869627476, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 4069 + }, + { + "epoch": 0.11230025833198247, + "grad_norm": 0.004757543094456196, + "learning_rate": 0.001, + "loss": 0.4245, + "step": 4070 + }, + { + "epoch": 0.11232785053304684, + "grad_norm": 0.004110514651983976, + "learning_rate": 0.001, + "loss": 0.4556, + "step": 4071 + }, + { + "epoch": 0.1123554427341112, + "grad_norm": 0.0029956744983792305, + "learning_rate": 0.001, + "loss": 0.4207, + "step": 4072 + }, + { + "epoch": 0.11238303493517557, + "grad_norm": 0.006121024955064058, + "learning_rate": 0.001, + "loss": 0.4228, + "step": 4073 + }, + { + "epoch": 0.11241062713623994, + "grad_norm": 0.011045140214264393, + "learning_rate": 0.001, + "loss": 0.377, + "step": 4074 + }, + { + "epoch": 0.11243821933730432, + "grad_norm": 0.002652381081134081, + "learning_rate": 0.001, + "loss": 0.3963, + "step": 4075 + }, + { + "epoch": 0.11246581153836868, + "grad_norm": 0.005784259643405676, + "learning_rate": 0.001, + "loss": 0.4095, + "step": 4076 + }, + { + "epoch": 0.11249340373943305, + "grad_norm": 0.004484557081013918, + "learning_rate": 0.001, + "loss": 0.4009, + "step": 4077 + }, + { + "epoch": 0.11252099594049741, + "grad_norm": 0.004235697444528341, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 4078 + }, + { + "epoch": 0.11254858814156178, + "grad_norm": 0.002368953777477145, + "learning_rate": 0.001, + "loss": 0.4364, + "step": 4079 + }, + { + "epoch": 0.11257618034262616, + "grad_norm": 0.002926155459135771, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 4080 + }, + { + "epoch": 0.11260377254369053, + "grad_norm": 0.0026046691928058863, + "learning_rate": 0.001, + "loss": 0.396, + "step": 4081 + }, + { + "epoch": 0.1126313647447549, + "grad_norm": 0.0025377613492310047, + "learning_rate": 0.001, + "loss": 0.3658, + "step": 4082 + }, + { + "epoch": 0.11265895694581926, + "grad_norm": 0.0027987242210656404, + "learning_rate": 0.001, + "loss": 0.398, + "step": 4083 + }, + { + "epoch": 0.11268654914688363, + "grad_norm": 0.002613126765936613, + "learning_rate": 0.001, + "loss": 0.4128, + "step": 4084 + }, + { + "epoch": 0.11271414134794801, + "grad_norm": 0.002449605381116271, + "learning_rate": 0.001, + "loss": 0.3999, + "step": 4085 + }, + { + "epoch": 0.11274173354901237, + "grad_norm": 0.0033523484598845243, + "learning_rate": 0.001, + "loss": 0.4057, + "step": 4086 + }, + { + "epoch": 0.11276932575007674, + "grad_norm": 0.003658158238977194, + "learning_rate": 0.001, + "loss": 0.3999, + "step": 4087 + }, + { + "epoch": 0.1127969179511411, + "grad_norm": 0.0031146903056651354, + "learning_rate": 0.001, + "loss": 0.4306, + "step": 4088 + }, + { + "epoch": 0.11282451015220547, + "grad_norm": 0.004147498402744532, + "learning_rate": 0.001, + "loss": 0.3737, + "step": 4089 + }, + { + "epoch": 0.11285210235326985, + "grad_norm": 0.0027466421015560627, + "learning_rate": 0.001, + "loss": 0.4213, + "step": 4090 + }, + { + "epoch": 0.11287969455433422, + "grad_norm": 0.00301600550301373, + "learning_rate": 0.001, + "loss": 0.3818, + "step": 4091 + }, + { + "epoch": 0.11290728675539859, + "grad_norm": 0.002960977843031287, + "learning_rate": 0.001, + "loss": 0.4128, + "step": 4092 + }, + { + "epoch": 0.11293487895646295, + "grad_norm": 0.0030630468390882015, + "learning_rate": 0.001, + "loss": 0.3792, + "step": 4093 + }, + { + "epoch": 0.11296247115752732, + "grad_norm": 0.002365349093452096, + "learning_rate": 0.001, + "loss": 0.4169, + "step": 4094 + }, + { + "epoch": 0.1129900633585917, + "grad_norm": 0.005711345002055168, + "learning_rate": 0.001, + "loss": 0.3701, + "step": 4095 + }, + { + "epoch": 0.11301765555965607, + "grad_norm": 0.003600204363465309, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 4096 + }, + { + "epoch": 0.11304524776072043, + "grad_norm": 0.003578650299459696, + "learning_rate": 0.001, + "loss": 0.4069, + "step": 4097 + }, + { + "epoch": 0.1130728399617848, + "grad_norm": 0.003018326824530959, + "learning_rate": 0.001, + "loss": 0.4217, + "step": 4098 + }, + { + "epoch": 0.11310043216284917, + "grad_norm": 0.0023286626674234867, + "learning_rate": 0.001, + "loss": 0.4477, + "step": 4099 + }, + { + "epoch": 0.11312802436391355, + "grad_norm": 0.0055235689505934715, + "learning_rate": 0.001, + "loss": 0.4097, + "step": 4100 + }, + { + "epoch": 0.11315561656497791, + "grad_norm": 0.003630418796092272, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 4101 + }, + { + "epoch": 0.11318320876604228, + "grad_norm": 0.0026847817935049534, + "learning_rate": 0.001, + "loss": 0.3564, + "step": 4102 + }, + { + "epoch": 0.11321080096710665, + "grad_norm": 0.0022519559133797884, + "learning_rate": 0.001, + "loss": 0.4048, + "step": 4103 + }, + { + "epoch": 0.11323839316817101, + "grad_norm": 0.002188954036682844, + "learning_rate": 0.001, + "loss": 0.4181, + "step": 4104 + }, + { + "epoch": 0.11326598536923539, + "grad_norm": 0.006972792092710733, + "learning_rate": 0.001, + "loss": 0.4001, + "step": 4105 + }, + { + "epoch": 0.11329357757029976, + "grad_norm": 0.0025407804641872644, + "learning_rate": 0.001, + "loss": 0.4251, + "step": 4106 + }, + { + "epoch": 0.11332116977136412, + "grad_norm": 0.006640659179538488, + "learning_rate": 0.001, + "loss": 0.3743, + "step": 4107 + }, + { + "epoch": 0.11334876197242849, + "grad_norm": 0.0022770599462091923, + "learning_rate": 0.001, + "loss": 0.4122, + "step": 4108 + }, + { + "epoch": 0.11337635417349286, + "grad_norm": 0.0027237252797931433, + "learning_rate": 0.001, + "loss": 0.3964, + "step": 4109 + }, + { + "epoch": 0.11340394637455724, + "grad_norm": 0.0026092720218002796, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 4110 + }, + { + "epoch": 0.1134315385756216, + "grad_norm": 0.0025469979736953974, + "learning_rate": 0.001, + "loss": 0.3901, + "step": 4111 + }, + { + "epoch": 0.11345913077668597, + "grad_norm": 0.004103371873497963, + "learning_rate": 0.001, + "loss": 0.4138, + "step": 4112 + }, + { + "epoch": 0.11348672297775034, + "grad_norm": 0.002415394876152277, + "learning_rate": 0.001, + "loss": 0.3828, + "step": 4113 + }, + { + "epoch": 0.1135143151788147, + "grad_norm": 0.0024486854672431946, + "learning_rate": 0.001, + "loss": 0.3623, + "step": 4114 + }, + { + "epoch": 0.11354190737987908, + "grad_norm": 0.003065606579184532, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 4115 + }, + { + "epoch": 0.11356949958094345, + "grad_norm": 0.0029552297201007605, + "learning_rate": 0.001, + "loss": 0.3854, + "step": 4116 + }, + { + "epoch": 0.11359709178200782, + "grad_norm": 0.0028423569165170193, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 4117 + }, + { + "epoch": 0.11362468398307218, + "grad_norm": 0.002471365500241518, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 4118 + }, + { + "epoch": 0.11365227618413655, + "grad_norm": 0.0026159638073295355, + "learning_rate": 0.001, + "loss": 0.4352, + "step": 4119 + }, + { + "epoch": 0.11367986838520092, + "grad_norm": 0.00349643686786294, + "learning_rate": 0.001, + "loss": 0.4144, + "step": 4120 + }, + { + "epoch": 0.1137074605862653, + "grad_norm": 0.003111085621640086, + "learning_rate": 0.001, + "loss": 0.3827, + "step": 4121 + }, + { + "epoch": 0.11373505278732966, + "grad_norm": 0.002907720860093832, + "learning_rate": 0.001, + "loss": 0.4288, + "step": 4122 + }, + { + "epoch": 0.11376264498839403, + "grad_norm": 0.004095634911209345, + "learning_rate": 0.001, + "loss": 0.3959, + "step": 4123 + }, + { + "epoch": 0.1137902371894584, + "grad_norm": 0.0028071727138012648, + "learning_rate": 0.001, + "loss": 0.4061, + "step": 4124 + }, + { + "epoch": 0.11381782939052276, + "grad_norm": 0.0032708507496863604, + "learning_rate": 0.001, + "loss": 0.3787, + "step": 4125 + }, + { + "epoch": 0.11384542159158714, + "grad_norm": 0.002617691410705447, + "learning_rate": 0.001, + "loss": 0.4297, + "step": 4126 + }, + { + "epoch": 0.11387301379265151, + "grad_norm": 0.0025609673466533422, + "learning_rate": 0.001, + "loss": 0.4136, + "step": 4127 + }, + { + "epoch": 0.11390060599371588, + "grad_norm": 0.003452820936217904, + "learning_rate": 0.001, + "loss": 0.4136, + "step": 4128 + }, + { + "epoch": 0.11392819819478024, + "grad_norm": 0.0029548651073127985, + "learning_rate": 0.001, + "loss": 0.4409, + "step": 4129 + }, + { + "epoch": 0.11395579039584461, + "grad_norm": 0.002860912587493658, + "learning_rate": 0.001, + "loss": 0.3902, + "step": 4130 + }, + { + "epoch": 0.11398338259690899, + "grad_norm": 0.0028807439375668764, + "learning_rate": 0.001, + "loss": 0.4236, + "step": 4131 + }, + { + "epoch": 0.11401097479797336, + "grad_norm": 0.0028092057909816504, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 4132 + }, + { + "epoch": 0.11403856699903772, + "grad_norm": 0.003919110633432865, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 4133 + }, + { + "epoch": 0.11406615920010209, + "grad_norm": 0.0034238446969538927, + "learning_rate": 0.001, + "loss": 0.3811, + "step": 4134 + }, + { + "epoch": 0.11409375140116645, + "grad_norm": 0.0033501333091408014, + "learning_rate": 0.001, + "loss": 0.4174, + "step": 4135 + }, + { + "epoch": 0.11412134360223083, + "grad_norm": 0.002704891376197338, + "learning_rate": 0.001, + "loss": 0.4056, + "step": 4136 + }, + { + "epoch": 0.1141489358032952, + "grad_norm": 0.003555488074198365, + "learning_rate": 0.001, + "loss": 0.3915, + "step": 4137 + }, + { + "epoch": 0.11417652800435957, + "grad_norm": 0.0075040231458842754, + "learning_rate": 0.001, + "loss": 0.4176, + "step": 4138 + }, + { + "epoch": 0.11420412020542393, + "grad_norm": 0.008047969080507755, + "learning_rate": 0.001, + "loss": 0.4179, + "step": 4139 + }, + { + "epoch": 0.1142317124064883, + "grad_norm": 0.003022789489477873, + "learning_rate": 0.001, + "loss": 0.4296, + "step": 4140 + }, + { + "epoch": 0.11425930460755268, + "grad_norm": 0.0042315032333135605, + "learning_rate": 0.001, + "loss": 0.4537, + "step": 4141 + }, + { + "epoch": 0.11428689680861705, + "grad_norm": 0.0028275889344513416, + "learning_rate": 0.001, + "loss": 0.449, + "step": 4142 + }, + { + "epoch": 0.11431448900968141, + "grad_norm": 0.002496670465916395, + "learning_rate": 0.001, + "loss": 0.3984, + "step": 4143 + }, + { + "epoch": 0.11434208121074578, + "grad_norm": 0.004964656662195921, + "learning_rate": 0.001, + "loss": 0.3953, + "step": 4144 + }, + { + "epoch": 0.11436967341181015, + "grad_norm": 0.0024401163682341576, + "learning_rate": 0.001, + "loss": 0.4282, + "step": 4145 + }, + { + "epoch": 0.11439726561287453, + "grad_norm": 0.004458567593246698, + "learning_rate": 0.001, + "loss": 0.3921, + "step": 4146 + }, + { + "epoch": 0.1144248578139389, + "grad_norm": 0.002872915705665946, + "learning_rate": 0.001, + "loss": 0.4262, + "step": 4147 + }, + { + "epoch": 0.11445245001500326, + "grad_norm": 0.004791326820850372, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 4148 + }, + { + "epoch": 0.11448004221606763, + "grad_norm": 0.0037970049306750298, + "learning_rate": 0.001, + "loss": 0.381, + "step": 4149 + }, + { + "epoch": 0.11450763441713199, + "grad_norm": 0.0025677571538835764, + "learning_rate": 0.001, + "loss": 0.4225, + "step": 4150 + }, + { + "epoch": 0.11453522661819637, + "grad_norm": 0.0031208605505526066, + "learning_rate": 0.001, + "loss": 0.4148, + "step": 4151 + }, + { + "epoch": 0.11456281881926074, + "grad_norm": 0.0031444996129721403, + "learning_rate": 0.001, + "loss": 0.3881, + "step": 4152 + }, + { + "epoch": 0.1145904110203251, + "grad_norm": 0.0035791201516985893, + "learning_rate": 0.001, + "loss": 0.4098, + "step": 4153 + }, + { + "epoch": 0.11461800322138947, + "grad_norm": 0.0027255616150796413, + "learning_rate": 0.001, + "loss": 0.3751, + "step": 4154 + }, + { + "epoch": 0.11464559542245384, + "grad_norm": 0.003225408960133791, + "learning_rate": 0.001, + "loss": 0.374, + "step": 4155 + }, + { + "epoch": 0.11467318762351822, + "grad_norm": 0.0035840212367475033, + "learning_rate": 0.001, + "loss": 0.3518, + "step": 4156 + }, + { + "epoch": 0.11470077982458259, + "grad_norm": 0.0035705710761249065, + "learning_rate": 0.001, + "loss": 0.3689, + "step": 4157 + }, + { + "epoch": 0.11472837202564695, + "grad_norm": 0.0029657899867743254, + "learning_rate": 0.001, + "loss": 0.4079, + "step": 4158 + }, + { + "epoch": 0.11475596422671132, + "grad_norm": 0.003734529484063387, + "learning_rate": 0.001, + "loss": 0.3859, + "step": 4159 + }, + { + "epoch": 0.11478355642777568, + "grad_norm": 0.00309072551317513, + "learning_rate": 0.001, + "loss": 0.3934, + "step": 4160 + }, + { + "epoch": 0.11481114862884007, + "grad_norm": 0.004012387245893478, + "learning_rate": 0.001, + "loss": 0.3629, + "step": 4161 + }, + { + "epoch": 0.11483874082990443, + "grad_norm": 0.002934156684204936, + "learning_rate": 0.001, + "loss": 0.4, + "step": 4162 + }, + { + "epoch": 0.1148663330309688, + "grad_norm": 0.004066895227879286, + "learning_rate": 0.001, + "loss": 0.4157, + "step": 4163 + }, + { + "epoch": 0.11489392523203316, + "grad_norm": 0.004248685669153929, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 4164 + }, + { + "epoch": 0.11492151743309753, + "grad_norm": 0.0024275535251945257, + "learning_rate": 0.001, + "loss": 0.4446, + "step": 4165 + }, + { + "epoch": 0.1149491096341619, + "grad_norm": 0.0025079327169805765, + "learning_rate": 0.001, + "loss": 0.4163, + "step": 4166 + }, + { + "epoch": 0.11497670183522628, + "grad_norm": 0.0024487797636538744, + "learning_rate": 0.001, + "loss": 0.3915, + "step": 4167 + }, + { + "epoch": 0.11500429403629064, + "grad_norm": 0.002970887813717127, + "learning_rate": 0.001, + "loss": 0.4153, + "step": 4168 + }, + { + "epoch": 0.11503188623735501, + "grad_norm": 0.0025243964046239853, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 4169 + }, + { + "epoch": 0.11505947843841938, + "grad_norm": 0.004984840750694275, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 4170 + }, + { + "epoch": 0.11508707063948374, + "grad_norm": 0.0044680144637823105, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 4171 + }, + { + "epoch": 0.11511466284054812, + "grad_norm": 0.0035056746564805508, + "learning_rate": 0.001, + "loss": 0.3963, + "step": 4172 + }, + { + "epoch": 0.11514225504161249, + "grad_norm": 0.0023045954294502735, + "learning_rate": 0.001, + "loss": 0.4143, + "step": 4173 + }, + { + "epoch": 0.11516984724267686, + "grad_norm": 0.0029870544094592333, + "learning_rate": 0.001, + "loss": 0.3857, + "step": 4174 + }, + { + "epoch": 0.11519743944374122, + "grad_norm": 0.002851466415449977, + "learning_rate": 0.001, + "loss": 0.4221, + "step": 4175 + }, + { + "epoch": 0.11522503164480559, + "grad_norm": 0.0037020803429186344, + "learning_rate": 0.001, + "loss": 0.3927, + "step": 4176 + }, + { + "epoch": 0.11525262384586997, + "grad_norm": 0.0025792547967284918, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 4177 + }, + { + "epoch": 0.11528021604693434, + "grad_norm": 0.005641425959765911, + "learning_rate": 0.001, + "loss": 0.3746, + "step": 4178 + }, + { + "epoch": 0.1153078082479987, + "grad_norm": 0.006405304651707411, + "learning_rate": 0.001, + "loss": 0.4176, + "step": 4179 + }, + { + "epoch": 0.11533540044906307, + "grad_norm": 0.003738192841410637, + "learning_rate": 0.001, + "loss": 0.4232, + "step": 4180 + }, + { + "epoch": 0.11536299265012744, + "grad_norm": 0.002722861710935831, + "learning_rate": 0.001, + "loss": 0.3712, + "step": 4181 + }, + { + "epoch": 0.11539058485119182, + "grad_norm": 0.003077869303524494, + "learning_rate": 0.001, + "loss": 0.3665, + "step": 4182 + }, + { + "epoch": 0.11541817705225618, + "grad_norm": 0.0023466874845325947, + "learning_rate": 0.001, + "loss": 0.391, + "step": 4183 + }, + { + "epoch": 0.11544576925332055, + "grad_norm": 0.0029894413892179728, + "learning_rate": 0.001, + "loss": 0.4229, + "step": 4184 + }, + { + "epoch": 0.11547336145438492, + "grad_norm": 0.005931466352194548, + "learning_rate": 0.001, + "loss": 0.4229, + "step": 4185 + }, + { + "epoch": 0.11550095365544928, + "grad_norm": 0.002602703869342804, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 4186 + }, + { + "epoch": 0.11552854585651366, + "grad_norm": 0.00304593937471509, + "learning_rate": 0.001, + "loss": 0.3973, + "step": 4187 + }, + { + "epoch": 0.11555613805757803, + "grad_norm": 0.002939543453976512, + "learning_rate": 0.001, + "loss": 0.3698, + "step": 4188 + }, + { + "epoch": 0.1155837302586424, + "grad_norm": 0.0026717365253716707, + "learning_rate": 0.001, + "loss": 0.4273, + "step": 4189 + }, + { + "epoch": 0.11561132245970676, + "grad_norm": 0.0039357393980026245, + "learning_rate": 0.001, + "loss": 0.3866, + "step": 4190 + }, + { + "epoch": 0.11563891466077113, + "grad_norm": 0.0033047099132090807, + "learning_rate": 0.001, + "loss": 0.3634, + "step": 4191 + }, + { + "epoch": 0.11566650686183551, + "grad_norm": 0.00248891394585371, + "learning_rate": 0.001, + "loss": 0.4038, + "step": 4192 + }, + { + "epoch": 0.11569409906289987, + "grad_norm": 0.003228937741369009, + "learning_rate": 0.001, + "loss": 0.4254, + "step": 4193 + }, + { + "epoch": 0.11572169126396424, + "grad_norm": 0.004180791787803173, + "learning_rate": 0.001, + "loss": 0.389, + "step": 4194 + }, + { + "epoch": 0.11574928346502861, + "grad_norm": 0.0026013990864157677, + "learning_rate": 0.001, + "loss": 0.396, + "step": 4195 + }, + { + "epoch": 0.11577687566609297, + "grad_norm": 0.003035642672330141, + "learning_rate": 0.001, + "loss": 0.3863, + "step": 4196 + }, + { + "epoch": 0.11580446786715735, + "grad_norm": 0.004497555084526539, + "learning_rate": 0.001, + "loss": 0.426, + "step": 4197 + }, + { + "epoch": 0.11583206006822172, + "grad_norm": 0.0024611612316221, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 4198 + }, + { + "epoch": 0.11585965226928609, + "grad_norm": 0.002211876679211855, + "learning_rate": 0.001, + "loss": 0.4061, + "step": 4199 + }, + { + "epoch": 0.11588724447035045, + "grad_norm": 0.0038244640454649925, + "learning_rate": 0.001, + "loss": 0.367, + "step": 4200 + }, + { + "epoch": 0.11591483667141482, + "grad_norm": 0.002824941882863641, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 4201 + }, + { + "epoch": 0.1159424288724792, + "grad_norm": 0.00297146150842309, + "learning_rate": 0.001, + "loss": 0.3969, + "step": 4202 + }, + { + "epoch": 0.11597002107354357, + "grad_norm": 0.003329311031848192, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 4203 + }, + { + "epoch": 0.11599761327460793, + "grad_norm": 0.0036552376113831997, + "learning_rate": 0.001, + "loss": 0.3593, + "step": 4204 + }, + { + "epoch": 0.1160252054756723, + "grad_norm": 0.003157126484438777, + "learning_rate": 0.001, + "loss": 0.3867, + "step": 4205 + }, + { + "epoch": 0.11605279767673667, + "grad_norm": 0.0030856141820549965, + "learning_rate": 0.001, + "loss": 0.3612, + "step": 4206 + }, + { + "epoch": 0.11608038987780105, + "grad_norm": 0.0030666659586131573, + "learning_rate": 0.001, + "loss": 0.4336, + "step": 4207 + }, + { + "epoch": 0.11610798207886541, + "grad_norm": 0.00340241938829422, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 4208 + }, + { + "epoch": 0.11613557427992978, + "grad_norm": 0.005646420642733574, + "learning_rate": 0.001, + "loss": 0.407, + "step": 4209 + }, + { + "epoch": 0.11616316648099415, + "grad_norm": 0.002865233225747943, + "learning_rate": 0.001, + "loss": 0.4253, + "step": 4210 + }, + { + "epoch": 0.11619075868205851, + "grad_norm": 0.006409808062016964, + "learning_rate": 0.001, + "loss": 0.3962, + "step": 4211 + }, + { + "epoch": 0.11621835088312288, + "grad_norm": 0.0022089029662311077, + "learning_rate": 0.001, + "loss": 0.4485, + "step": 4212 + }, + { + "epoch": 0.11624594308418726, + "grad_norm": 0.002593854209408164, + "learning_rate": 0.001, + "loss": 0.4342, + "step": 4213 + }, + { + "epoch": 0.11627353528525163, + "grad_norm": 0.004724476020783186, + "learning_rate": 0.001, + "loss": 0.3933, + "step": 4214 + }, + { + "epoch": 0.11630112748631599, + "grad_norm": 0.0066048745065927505, + "learning_rate": 0.001, + "loss": 0.3659, + "step": 4215 + }, + { + "epoch": 0.11632871968738036, + "grad_norm": 0.004728097002953291, + "learning_rate": 0.001, + "loss": 0.3812, + "step": 4216 + }, + { + "epoch": 0.11635631188844472, + "grad_norm": 0.003980184905230999, + "learning_rate": 0.001, + "loss": 0.4053, + "step": 4217 + }, + { + "epoch": 0.1163839040895091, + "grad_norm": 0.002691940637305379, + "learning_rate": 0.001, + "loss": 0.384, + "step": 4218 + }, + { + "epoch": 0.11641149629057347, + "grad_norm": 0.004341395106166601, + "learning_rate": 0.001, + "loss": 0.3608, + "step": 4219 + }, + { + "epoch": 0.11643908849163784, + "grad_norm": 0.0023949614260345697, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 4220 + }, + { + "epoch": 0.1164666806927022, + "grad_norm": 0.0021929224021732807, + "learning_rate": 0.001, + "loss": 0.3927, + "step": 4221 + }, + { + "epoch": 0.11649427289376657, + "grad_norm": 0.002848616801202297, + "learning_rate": 0.001, + "loss": 0.4129, + "step": 4222 + }, + { + "epoch": 0.11652186509483095, + "grad_norm": 0.0023948801681399345, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 4223 + }, + { + "epoch": 0.11654945729589532, + "grad_norm": 0.0029203318990767, + "learning_rate": 0.001, + "loss": 0.3962, + "step": 4224 + }, + { + "epoch": 0.11657704949695968, + "grad_norm": 0.0027072993107140064, + "learning_rate": 0.001, + "loss": 0.4326, + "step": 4225 + }, + { + "epoch": 0.11660464169802405, + "grad_norm": 0.007629503961652517, + "learning_rate": 0.001, + "loss": 0.4366, + "step": 4226 + }, + { + "epoch": 0.11663223389908842, + "grad_norm": 0.004317782819271088, + "learning_rate": 0.001, + "loss": 0.4254, + "step": 4227 + }, + { + "epoch": 0.1166598261001528, + "grad_norm": 0.004796279594302177, + "learning_rate": 0.001, + "loss": 0.4165, + "step": 4228 + }, + { + "epoch": 0.11668741830121716, + "grad_norm": 0.0048171901144087315, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 4229 + }, + { + "epoch": 0.11671501050228153, + "grad_norm": 0.005509024020284414, + "learning_rate": 0.001, + "loss": 0.3921, + "step": 4230 + }, + { + "epoch": 0.1167426027033459, + "grad_norm": 0.002434986876323819, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 4231 + }, + { + "epoch": 0.11677019490441026, + "grad_norm": 0.0024681081995368004, + "learning_rate": 0.001, + "loss": 0.3868, + "step": 4232 + }, + { + "epoch": 0.11679778710547464, + "grad_norm": 0.0029520918615162373, + "learning_rate": 0.001, + "loss": 0.4171, + "step": 4233 + }, + { + "epoch": 0.11682537930653901, + "grad_norm": 0.0039764223620295525, + "learning_rate": 0.001, + "loss": 0.4279, + "step": 4234 + }, + { + "epoch": 0.11685297150760338, + "grad_norm": 0.0033561012241989374, + "learning_rate": 0.001, + "loss": 0.3801, + "step": 4235 + }, + { + "epoch": 0.11688056370866774, + "grad_norm": 0.002082569058984518, + "learning_rate": 0.001, + "loss": 0.4333, + "step": 4236 + }, + { + "epoch": 0.11690815590973211, + "grad_norm": 0.002162642776966095, + "learning_rate": 0.001, + "loss": 0.3956, + "step": 4237 + }, + { + "epoch": 0.11693574811079649, + "grad_norm": 0.004019891377538443, + "learning_rate": 0.001, + "loss": 0.3855, + "step": 4238 + }, + { + "epoch": 0.11696334031186086, + "grad_norm": 0.0039974479004740715, + "learning_rate": 0.001, + "loss": 0.392, + "step": 4239 + }, + { + "epoch": 0.11699093251292522, + "grad_norm": 0.0032143592834472656, + "learning_rate": 0.001, + "loss": 0.3754, + "step": 4240 + }, + { + "epoch": 0.11701852471398959, + "grad_norm": 0.0021860511042177677, + "learning_rate": 0.001, + "loss": 0.4301, + "step": 4241 + }, + { + "epoch": 0.11704611691505395, + "grad_norm": 0.0034913863055408, + "learning_rate": 0.001, + "loss": 0.4232, + "step": 4242 + }, + { + "epoch": 0.11707370911611834, + "grad_norm": 0.002544558374211192, + "learning_rate": 0.001, + "loss": 0.4308, + "step": 4243 + }, + { + "epoch": 0.1171013013171827, + "grad_norm": 0.004531691782176495, + "learning_rate": 0.001, + "loss": 0.4217, + "step": 4244 + }, + { + "epoch": 0.11712889351824707, + "grad_norm": 0.002380553400143981, + "learning_rate": 0.001, + "loss": 0.3713, + "step": 4245 + }, + { + "epoch": 0.11715648571931143, + "grad_norm": 0.003138060914352536, + "learning_rate": 0.001, + "loss": 0.4387, + "step": 4246 + }, + { + "epoch": 0.1171840779203758, + "grad_norm": 0.0038936040364205837, + "learning_rate": 0.001, + "loss": 0.3675, + "step": 4247 + }, + { + "epoch": 0.11721167012144018, + "grad_norm": 0.0034198390785604715, + "learning_rate": 0.001, + "loss": 0.4041, + "step": 4248 + }, + { + "epoch": 0.11723926232250455, + "grad_norm": 0.004554110579192638, + "learning_rate": 0.001, + "loss": 0.4036, + "step": 4249 + }, + { + "epoch": 0.11726685452356891, + "grad_norm": 0.002692369045689702, + "learning_rate": 0.001, + "loss": 0.3955, + "step": 4250 + }, + { + "epoch": 0.11729444672463328, + "grad_norm": 0.0031687277369201183, + "learning_rate": 0.001, + "loss": 0.3685, + "step": 4251 + }, + { + "epoch": 0.11732203892569765, + "grad_norm": 0.0030465414747595787, + "learning_rate": 0.001, + "loss": 0.4197, + "step": 4252 + }, + { + "epoch": 0.11734963112676203, + "grad_norm": 0.0028786477632820606, + "learning_rate": 0.001, + "loss": 0.3941, + "step": 4253 + }, + { + "epoch": 0.1173772233278264, + "grad_norm": 0.0026728706434369087, + "learning_rate": 0.001, + "loss": 0.4235, + "step": 4254 + }, + { + "epoch": 0.11740481552889076, + "grad_norm": 0.0030022338032722473, + "learning_rate": 0.001, + "loss": 0.407, + "step": 4255 + }, + { + "epoch": 0.11743240772995513, + "grad_norm": 0.004039818421006203, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 4256 + }, + { + "epoch": 0.1174599999310195, + "grad_norm": 0.0031344417948275805, + "learning_rate": 0.001, + "loss": 0.4295, + "step": 4257 + }, + { + "epoch": 0.11748759213208387, + "grad_norm": 0.0020141347777098417, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 4258 + }, + { + "epoch": 0.11751518433314824, + "grad_norm": 0.003466543275862932, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 4259 + }, + { + "epoch": 0.1175427765342126, + "grad_norm": 0.006833492312580347, + "learning_rate": 0.001, + "loss": 0.4395, + "step": 4260 + }, + { + "epoch": 0.11757036873527697, + "grad_norm": 0.002628098940476775, + "learning_rate": 0.001, + "loss": 0.4086, + "step": 4261 + }, + { + "epoch": 0.11759796093634134, + "grad_norm": 0.0029284025076776743, + "learning_rate": 0.001, + "loss": 0.3746, + "step": 4262 + }, + { + "epoch": 0.1176255531374057, + "grad_norm": 0.002273330232128501, + "learning_rate": 0.001, + "loss": 0.4022, + "step": 4263 + }, + { + "epoch": 0.11765314533847009, + "grad_norm": 0.004239256959408522, + "learning_rate": 0.001, + "loss": 0.409, + "step": 4264 + }, + { + "epoch": 0.11768073753953445, + "grad_norm": 0.003706045914441347, + "learning_rate": 0.001, + "loss": 0.3751, + "step": 4265 + }, + { + "epoch": 0.11770832974059882, + "grad_norm": 0.004043275490403175, + "learning_rate": 0.001, + "loss": 0.3808, + "step": 4266 + }, + { + "epoch": 0.11773592194166319, + "grad_norm": 0.00288589159026742, + "learning_rate": 0.001, + "loss": 0.3952, + "step": 4267 + }, + { + "epoch": 0.11776351414272755, + "grad_norm": 0.002828913275152445, + "learning_rate": 0.001, + "loss": 0.3861, + "step": 4268 + }, + { + "epoch": 0.11779110634379193, + "grad_norm": 0.0026987362653017044, + "learning_rate": 0.001, + "loss": 0.4127, + "step": 4269 + }, + { + "epoch": 0.1178186985448563, + "grad_norm": 0.004176660440862179, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 4270 + }, + { + "epoch": 0.11784629074592066, + "grad_norm": 0.0025856473948806524, + "learning_rate": 0.001, + "loss": 0.4223, + "step": 4271 + }, + { + "epoch": 0.11787388294698503, + "grad_norm": 0.002688355278223753, + "learning_rate": 0.001, + "loss": 0.4427, + "step": 4272 + }, + { + "epoch": 0.1179014751480494, + "grad_norm": 0.0035168598406016827, + "learning_rate": 0.001, + "loss": 0.4053, + "step": 4273 + }, + { + "epoch": 0.11792906734911378, + "grad_norm": 0.00464267935603857, + "learning_rate": 0.001, + "loss": 0.3594, + "step": 4274 + }, + { + "epoch": 0.11795665955017814, + "grad_norm": 0.002608151640743017, + "learning_rate": 0.001, + "loss": 0.3941, + "step": 4275 + }, + { + "epoch": 0.11798425175124251, + "grad_norm": 0.003174230456352234, + "learning_rate": 0.001, + "loss": 0.3583, + "step": 4276 + }, + { + "epoch": 0.11801184395230688, + "grad_norm": 0.005926152691245079, + "learning_rate": 0.001, + "loss": 0.4115, + "step": 4277 + }, + { + "epoch": 0.11803943615337124, + "grad_norm": 0.0021453346125781536, + "learning_rate": 0.001, + "loss": 0.3648, + "step": 4278 + }, + { + "epoch": 0.11806702835443562, + "grad_norm": 0.003014147747308016, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 4279 + }, + { + "epoch": 0.11809462055549999, + "grad_norm": 0.007064585108309984, + "learning_rate": 0.001, + "loss": 0.3441, + "step": 4280 + }, + { + "epoch": 0.11812221275656436, + "grad_norm": 0.004232319537550211, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 4281 + }, + { + "epoch": 0.11814980495762872, + "grad_norm": 0.0022875110153108835, + "learning_rate": 0.001, + "loss": 0.3712, + "step": 4282 + }, + { + "epoch": 0.11817739715869309, + "grad_norm": 0.0034797275438904762, + "learning_rate": 0.001, + "loss": 0.4008, + "step": 4283 + }, + { + "epoch": 0.11820498935975747, + "grad_norm": 0.0038444402161985636, + "learning_rate": 0.001, + "loss": 0.3889, + "step": 4284 + }, + { + "epoch": 0.11823258156082184, + "grad_norm": 0.004799429327249527, + "learning_rate": 0.001, + "loss": 0.4482, + "step": 4285 + }, + { + "epoch": 0.1182601737618862, + "grad_norm": 0.0031683624256402254, + "learning_rate": 0.001, + "loss": 0.4242, + "step": 4286 + }, + { + "epoch": 0.11828776596295057, + "grad_norm": 0.002779352478682995, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 4287 + }, + { + "epoch": 0.11831535816401494, + "grad_norm": 0.00301496684551239, + "learning_rate": 0.001, + "loss": 0.452, + "step": 4288 + }, + { + "epoch": 0.11834295036507932, + "grad_norm": 0.0021808287128806114, + "learning_rate": 0.001, + "loss": 0.4234, + "step": 4289 + }, + { + "epoch": 0.11837054256614368, + "grad_norm": 0.002374732168391347, + "learning_rate": 0.001, + "loss": 0.4414, + "step": 4290 + }, + { + "epoch": 0.11839813476720805, + "grad_norm": 0.0025609827134758234, + "learning_rate": 0.001, + "loss": 0.41, + "step": 4291 + }, + { + "epoch": 0.11842572696827242, + "grad_norm": 0.0029708382207900286, + "learning_rate": 0.001, + "loss": 0.3762, + "step": 4292 + }, + { + "epoch": 0.11845331916933678, + "grad_norm": 0.002453922526910901, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 4293 + }, + { + "epoch": 0.11848091137040116, + "grad_norm": 0.003155443584546447, + "learning_rate": 0.001, + "loss": 0.4032, + "step": 4294 + }, + { + "epoch": 0.11850850357146553, + "grad_norm": 0.0028496377635747194, + "learning_rate": 0.001, + "loss": 0.4215, + "step": 4295 + }, + { + "epoch": 0.1185360957725299, + "grad_norm": 0.0024521294981241226, + "learning_rate": 0.001, + "loss": 0.4255, + "step": 4296 + }, + { + "epoch": 0.11856368797359426, + "grad_norm": 0.002795828739181161, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 4297 + }, + { + "epoch": 0.11859128017465863, + "grad_norm": 0.0030860670376569033, + "learning_rate": 0.001, + "loss": 0.3927, + "step": 4298 + }, + { + "epoch": 0.11861887237572301, + "grad_norm": 0.005245378706604242, + "learning_rate": 0.001, + "loss": 0.4086, + "step": 4299 + }, + { + "epoch": 0.11864646457678737, + "grad_norm": 0.003146842820569873, + "learning_rate": 0.001, + "loss": 0.3404, + "step": 4300 + }, + { + "epoch": 0.11867405677785174, + "grad_norm": 0.0026854875031858683, + "learning_rate": 0.001, + "loss": 0.3727, + "step": 4301 + }, + { + "epoch": 0.11870164897891611, + "grad_norm": 0.0026074268389493227, + "learning_rate": 0.001, + "loss": 0.3586, + "step": 4302 + }, + { + "epoch": 0.11872924117998047, + "grad_norm": 0.0035085766576230526, + "learning_rate": 0.001, + "loss": 0.4453, + "step": 4303 + }, + { + "epoch": 0.11875683338104485, + "grad_norm": 0.0026303378399461508, + "learning_rate": 0.001, + "loss": 0.3808, + "step": 4304 + }, + { + "epoch": 0.11878442558210922, + "grad_norm": 0.002744373632594943, + "learning_rate": 0.001, + "loss": 0.3987, + "step": 4305 + }, + { + "epoch": 0.11881201778317359, + "grad_norm": 0.002459439681842923, + "learning_rate": 0.001, + "loss": 0.424, + "step": 4306 + }, + { + "epoch": 0.11883960998423795, + "grad_norm": 0.004186084493994713, + "learning_rate": 0.001, + "loss": 0.3886, + "step": 4307 + }, + { + "epoch": 0.11886720218530232, + "grad_norm": 0.002484044060111046, + "learning_rate": 0.001, + "loss": 0.4072, + "step": 4308 + }, + { + "epoch": 0.11889479438636669, + "grad_norm": 0.002908499911427498, + "learning_rate": 0.001, + "loss": 0.3802, + "step": 4309 + }, + { + "epoch": 0.11892238658743107, + "grad_norm": 0.0024023684673011303, + "learning_rate": 0.001, + "loss": 0.3796, + "step": 4310 + }, + { + "epoch": 0.11894997878849543, + "grad_norm": 0.0033678016625344753, + "learning_rate": 0.001, + "loss": 0.3906, + "step": 4311 + }, + { + "epoch": 0.1189775709895598, + "grad_norm": 0.0029683737084269524, + "learning_rate": 0.001, + "loss": 0.3552, + "step": 4312 + }, + { + "epoch": 0.11900516319062417, + "grad_norm": 0.0027305546682327986, + "learning_rate": 0.001, + "loss": 0.422, + "step": 4313 + }, + { + "epoch": 0.11903275539168853, + "grad_norm": 0.002915582386776805, + "learning_rate": 0.001, + "loss": 0.4102, + "step": 4314 + }, + { + "epoch": 0.11906034759275291, + "grad_norm": 0.002866593888029456, + "learning_rate": 0.001, + "loss": 0.3765, + "step": 4315 + }, + { + "epoch": 0.11908793979381728, + "grad_norm": 0.0024439122062176466, + "learning_rate": 0.001, + "loss": 0.4118, + "step": 4316 + }, + { + "epoch": 0.11911553199488165, + "grad_norm": 0.003114825114607811, + "learning_rate": 0.001, + "loss": 0.3869, + "step": 4317 + }, + { + "epoch": 0.11914312419594601, + "grad_norm": 0.005244606640189886, + "learning_rate": 0.001, + "loss": 0.3832, + "step": 4318 + }, + { + "epoch": 0.11917071639701038, + "grad_norm": 0.0027887518517673016, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 4319 + }, + { + "epoch": 0.11919830859807476, + "grad_norm": 0.0033839023672044277, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 4320 + }, + { + "epoch": 0.11922590079913913, + "grad_norm": 0.00488735968247056, + "learning_rate": 0.001, + "loss": 0.4666, + "step": 4321 + }, + { + "epoch": 0.11925349300020349, + "grad_norm": 0.00358974770642817, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 4322 + }, + { + "epoch": 0.11928108520126786, + "grad_norm": 0.004664520733058453, + "learning_rate": 0.001, + "loss": 0.4276, + "step": 4323 + }, + { + "epoch": 0.11930867740233222, + "grad_norm": 0.0029046619310975075, + "learning_rate": 0.001, + "loss": 0.4279, + "step": 4324 + }, + { + "epoch": 0.1193362696033966, + "grad_norm": 0.005748322233557701, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 4325 + }, + { + "epoch": 0.11936386180446097, + "grad_norm": 0.004792370367795229, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 4326 + }, + { + "epoch": 0.11939145400552534, + "grad_norm": 0.002347557572647929, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 4327 + }, + { + "epoch": 0.1194190462065897, + "grad_norm": 0.009171836078166962, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 4328 + }, + { + "epoch": 0.11944663840765407, + "grad_norm": 0.0038018315099179745, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 4329 + }, + { + "epoch": 0.11947423060871845, + "grad_norm": 0.010487204417586327, + "learning_rate": 0.001, + "loss": 0.4135, + "step": 4330 + }, + { + "epoch": 0.11950182280978282, + "grad_norm": 0.0031547516118735075, + "learning_rate": 0.001, + "loss": 0.4237, + "step": 4331 + }, + { + "epoch": 0.11952941501084718, + "grad_norm": 0.008287927135825157, + "learning_rate": 0.001, + "loss": 0.4101, + "step": 4332 + }, + { + "epoch": 0.11955700721191155, + "grad_norm": 0.0035811339039355516, + "learning_rate": 0.001, + "loss": 0.4101, + "step": 4333 + }, + { + "epoch": 0.11958459941297592, + "grad_norm": 0.0023166469763964415, + "learning_rate": 0.001, + "loss": 0.4415, + "step": 4334 + }, + { + "epoch": 0.1196121916140403, + "grad_norm": 0.005073962267488241, + "learning_rate": 0.001, + "loss": 0.4063, + "step": 4335 + }, + { + "epoch": 0.11963978381510466, + "grad_norm": 0.003093453822657466, + "learning_rate": 0.001, + "loss": 0.3567, + "step": 4336 + }, + { + "epoch": 0.11966737601616903, + "grad_norm": 0.002334992168471217, + "learning_rate": 0.001, + "loss": 0.4305, + "step": 4337 + }, + { + "epoch": 0.1196949682172334, + "grad_norm": 0.0023138297256082296, + "learning_rate": 0.001, + "loss": 0.4247, + "step": 4338 + }, + { + "epoch": 0.11972256041829776, + "grad_norm": 0.002155238064005971, + "learning_rate": 0.001, + "loss": 0.4446, + "step": 4339 + }, + { + "epoch": 0.11975015261936214, + "grad_norm": 0.004164060112088919, + "learning_rate": 0.001, + "loss": 0.4138, + "step": 4340 + }, + { + "epoch": 0.11977774482042651, + "grad_norm": 0.0026546409353613853, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 4341 + }, + { + "epoch": 0.11980533702149088, + "grad_norm": 0.004223043564707041, + "learning_rate": 0.001, + "loss": 0.4106, + "step": 4342 + }, + { + "epoch": 0.11983292922255524, + "grad_norm": 0.002268544165417552, + "learning_rate": 0.001, + "loss": 0.4368, + "step": 4343 + }, + { + "epoch": 0.11986052142361961, + "grad_norm": 0.0023410958237946033, + "learning_rate": 0.001, + "loss": 0.439, + "step": 4344 + }, + { + "epoch": 0.11988811362468399, + "grad_norm": 0.003867893014103174, + "learning_rate": 0.001, + "loss": 0.399, + "step": 4345 + }, + { + "epoch": 0.11991570582574836, + "grad_norm": 0.0023783824872225523, + "learning_rate": 0.001, + "loss": 0.4041, + "step": 4346 + }, + { + "epoch": 0.11994329802681272, + "grad_norm": 0.002548053627833724, + "learning_rate": 0.001, + "loss": 0.4261, + "step": 4347 + }, + { + "epoch": 0.11997089022787709, + "grad_norm": 0.0028831157833337784, + "learning_rate": 0.001, + "loss": 0.3841, + "step": 4348 + }, + { + "epoch": 0.11999848242894146, + "grad_norm": 0.0029863922391086817, + "learning_rate": 0.001, + "loss": 0.3918, + "step": 4349 + }, + { + "epoch": 0.12002607463000584, + "grad_norm": 0.002305314177647233, + "learning_rate": 0.001, + "loss": 0.3892, + "step": 4350 + }, + { + "epoch": 0.1200536668310702, + "grad_norm": 0.006781649775803089, + "learning_rate": 0.001, + "loss": 0.4038, + "step": 4351 + }, + { + "epoch": 0.12008125903213457, + "grad_norm": 0.0042722891084849834, + "learning_rate": 0.001, + "loss": 0.3885, + "step": 4352 + }, + { + "epoch": 0.12010885123319893, + "grad_norm": 0.0033779507502913475, + "learning_rate": 0.001, + "loss": 0.3611, + "step": 4353 + }, + { + "epoch": 0.1201364434342633, + "grad_norm": 0.002541282679885626, + "learning_rate": 0.001, + "loss": 0.4365, + "step": 4354 + }, + { + "epoch": 0.12016403563532767, + "grad_norm": 0.002650998765602708, + "learning_rate": 0.001, + "loss": 0.3754, + "step": 4355 + }, + { + "epoch": 0.12019162783639205, + "grad_norm": 0.002635399578139186, + "learning_rate": 0.001, + "loss": 0.4553, + "step": 4356 + }, + { + "epoch": 0.12021922003745641, + "grad_norm": 0.0034205662086606026, + "learning_rate": 0.001, + "loss": 0.3805, + "step": 4357 + }, + { + "epoch": 0.12024681223852078, + "grad_norm": 0.0041115800850093365, + "learning_rate": 0.001, + "loss": 0.4095, + "step": 4358 + }, + { + "epoch": 0.12027440443958515, + "grad_norm": 0.003525017062202096, + "learning_rate": 0.001, + "loss": 0.4103, + "step": 4359 + }, + { + "epoch": 0.12030199664064951, + "grad_norm": 0.003658512607216835, + "learning_rate": 0.001, + "loss": 0.4322, + "step": 4360 + }, + { + "epoch": 0.1203295888417139, + "grad_norm": 0.00296131893992424, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 4361 + }, + { + "epoch": 0.12035718104277826, + "grad_norm": 0.0035319009330123663, + "learning_rate": 0.001, + "loss": 0.4197, + "step": 4362 + }, + { + "epoch": 0.12038477324384263, + "grad_norm": 0.0023319576866924763, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 4363 + }, + { + "epoch": 0.120412365444907, + "grad_norm": 0.002195805311203003, + "learning_rate": 0.001, + "loss": 0.4338, + "step": 4364 + }, + { + "epoch": 0.12043995764597136, + "grad_norm": 0.002451231935992837, + "learning_rate": 0.001, + "loss": 0.3916, + "step": 4365 + }, + { + "epoch": 0.12046754984703574, + "grad_norm": 0.00870354101061821, + "learning_rate": 0.001, + "loss": 0.3913, + "step": 4366 + }, + { + "epoch": 0.1204951420481001, + "grad_norm": 0.002587871393188834, + "learning_rate": 0.001, + "loss": 0.4032, + "step": 4367 + }, + { + "epoch": 0.12052273424916447, + "grad_norm": 0.004466624464839697, + "learning_rate": 0.001, + "loss": 0.3526, + "step": 4368 + }, + { + "epoch": 0.12055032645022884, + "grad_norm": 0.004007890820503235, + "learning_rate": 0.001, + "loss": 0.3694, + "step": 4369 + }, + { + "epoch": 0.1205779186512932, + "grad_norm": 0.015531973913311958, + "learning_rate": 0.001, + "loss": 0.4283, + "step": 4370 + }, + { + "epoch": 0.12060551085235759, + "grad_norm": 0.008623013272881508, + "learning_rate": 0.001, + "loss": 0.4336, + "step": 4371 + }, + { + "epoch": 0.12063310305342195, + "grad_norm": 0.006909455172717571, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 4372 + }, + { + "epoch": 0.12066069525448632, + "grad_norm": 0.009513488039374352, + "learning_rate": 0.001, + "loss": 0.3819, + "step": 4373 + }, + { + "epoch": 0.12068828745555069, + "grad_norm": 0.003246552310883999, + "learning_rate": 0.001, + "loss": 0.4136, + "step": 4374 + }, + { + "epoch": 0.12071587965661505, + "grad_norm": 0.00548326363787055, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 4375 + }, + { + "epoch": 0.12074347185767943, + "grad_norm": 0.004080353304743767, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 4376 + }, + { + "epoch": 0.1207710640587438, + "grad_norm": 0.004176739137619734, + "learning_rate": 0.001, + "loss": 0.3706, + "step": 4377 + }, + { + "epoch": 0.12079865625980817, + "grad_norm": 0.002543453825637698, + "learning_rate": 0.001, + "loss": 0.4096, + "step": 4378 + }, + { + "epoch": 0.12082624846087253, + "grad_norm": 0.0027144979685544968, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 4379 + }, + { + "epoch": 0.1208538406619369, + "grad_norm": 0.0035318818408995867, + "learning_rate": 0.001, + "loss": 0.3599, + "step": 4380 + }, + { + "epoch": 0.12088143286300128, + "grad_norm": 0.005954810418188572, + "learning_rate": 0.001, + "loss": 0.399, + "step": 4381 + }, + { + "epoch": 0.12090902506406564, + "grad_norm": 0.00457935081794858, + "learning_rate": 0.001, + "loss": 0.4323, + "step": 4382 + }, + { + "epoch": 0.12093661726513001, + "grad_norm": 0.004335826262831688, + "learning_rate": 0.001, + "loss": 0.3982, + "step": 4383 + }, + { + "epoch": 0.12096420946619438, + "grad_norm": 0.0032937126234173775, + "learning_rate": 0.001, + "loss": 0.4219, + "step": 4384 + }, + { + "epoch": 0.12099180166725874, + "grad_norm": 0.0024808261077851057, + "learning_rate": 0.001, + "loss": 0.4096, + "step": 4385 + }, + { + "epoch": 0.12101939386832312, + "grad_norm": 0.002847994677722454, + "learning_rate": 0.001, + "loss": 0.423, + "step": 4386 + }, + { + "epoch": 0.12104698606938749, + "grad_norm": 0.002378343604505062, + "learning_rate": 0.001, + "loss": 0.4155, + "step": 4387 + }, + { + "epoch": 0.12107457827045186, + "grad_norm": 0.002491983585059643, + "learning_rate": 0.001, + "loss": 0.4069, + "step": 4388 + }, + { + "epoch": 0.12110217047151622, + "grad_norm": 0.002309272298589349, + "learning_rate": 0.001, + "loss": 0.4009, + "step": 4389 + }, + { + "epoch": 0.12112976267258059, + "grad_norm": 0.0028091350104659796, + "learning_rate": 0.001, + "loss": 0.3731, + "step": 4390 + }, + { + "epoch": 0.12115735487364497, + "grad_norm": 0.00367850624024868, + "learning_rate": 0.001, + "loss": 0.4191, + "step": 4391 + }, + { + "epoch": 0.12118494707470934, + "grad_norm": 0.003562155645340681, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 4392 + }, + { + "epoch": 0.1212125392757737, + "grad_norm": 0.0025711546186357737, + "learning_rate": 0.001, + "loss": 0.3541, + "step": 4393 + }, + { + "epoch": 0.12124013147683807, + "grad_norm": 0.003351216670125723, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 4394 + }, + { + "epoch": 0.12126772367790244, + "grad_norm": 0.002373376628383994, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 4395 + }, + { + "epoch": 0.12129531587896682, + "grad_norm": 0.0035600061528384686, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 4396 + }, + { + "epoch": 0.12132290808003118, + "grad_norm": 0.0033355422783643007, + "learning_rate": 0.001, + "loss": 0.3438, + "step": 4397 + }, + { + "epoch": 0.12135050028109555, + "grad_norm": 0.004041003528982401, + "learning_rate": 0.001, + "loss": 0.3776, + "step": 4398 + }, + { + "epoch": 0.12137809248215992, + "grad_norm": 0.002868924057111144, + "learning_rate": 0.001, + "loss": 0.3742, + "step": 4399 + }, + { + "epoch": 0.12140568468322428, + "grad_norm": 0.003549615852534771, + "learning_rate": 0.001, + "loss": 0.3907, + "step": 4400 + }, + { + "epoch": 0.12143327688428865, + "grad_norm": 0.0034831890370696783, + "learning_rate": 0.001, + "loss": 0.4147, + "step": 4401 + }, + { + "epoch": 0.12146086908535303, + "grad_norm": 0.0029312497936189175, + "learning_rate": 0.001, + "loss": 0.4079, + "step": 4402 + }, + { + "epoch": 0.1214884612864174, + "grad_norm": 0.0037916668225079775, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 4403 + }, + { + "epoch": 0.12151605348748176, + "grad_norm": 0.0031889586243778467, + "learning_rate": 0.001, + "loss": 0.4114, + "step": 4404 + }, + { + "epoch": 0.12154364568854613, + "grad_norm": 0.003701084526255727, + "learning_rate": 0.001, + "loss": 0.363, + "step": 4405 + }, + { + "epoch": 0.1215712378896105, + "grad_norm": 0.00297721428796649, + "learning_rate": 0.001, + "loss": 0.3948, + "step": 4406 + }, + { + "epoch": 0.12159883009067488, + "grad_norm": 0.0032696991693228483, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 4407 + }, + { + "epoch": 0.12162642229173924, + "grad_norm": 0.0023237040732055902, + "learning_rate": 0.001, + "loss": 0.4037, + "step": 4408 + }, + { + "epoch": 0.12165401449280361, + "grad_norm": 0.0023669737856835127, + "learning_rate": 0.001, + "loss": 0.396, + "step": 4409 + }, + { + "epoch": 0.12168160669386797, + "grad_norm": 0.0029125306755304337, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 4410 + }, + { + "epoch": 0.12170919889493234, + "grad_norm": 0.003643943928182125, + "learning_rate": 0.001, + "loss": 0.4312, + "step": 4411 + }, + { + "epoch": 0.12173679109599672, + "grad_norm": 0.002598782768473029, + "learning_rate": 0.001, + "loss": 0.4217, + "step": 4412 + }, + { + "epoch": 0.12176438329706109, + "grad_norm": 0.00362451933324337, + "learning_rate": 0.001, + "loss": 0.3839, + "step": 4413 + }, + { + "epoch": 0.12179197549812545, + "grad_norm": 0.0032412779983133078, + "learning_rate": 0.001, + "loss": 0.371, + "step": 4414 + }, + { + "epoch": 0.12181956769918982, + "grad_norm": 0.0031498463358730078, + "learning_rate": 0.001, + "loss": 0.3834, + "step": 4415 + }, + { + "epoch": 0.12184715990025419, + "grad_norm": 0.0026793426368385553, + "learning_rate": 0.001, + "loss": 0.4164, + "step": 4416 + }, + { + "epoch": 0.12187475210131857, + "grad_norm": 0.002742303302511573, + "learning_rate": 0.001, + "loss": 0.3773, + "step": 4417 + }, + { + "epoch": 0.12190234430238293, + "grad_norm": 0.013603371568024158, + "learning_rate": 0.001, + "loss": 0.3664, + "step": 4418 + }, + { + "epoch": 0.1219299365034473, + "grad_norm": 0.002509272890165448, + "learning_rate": 0.001, + "loss": 0.3634, + "step": 4419 + }, + { + "epoch": 0.12195752870451167, + "grad_norm": 0.0029235216788947582, + "learning_rate": 0.001, + "loss": 0.3969, + "step": 4420 + }, + { + "epoch": 0.12198512090557603, + "grad_norm": 0.003002134384587407, + "learning_rate": 0.001, + "loss": 0.4022, + "step": 4421 + }, + { + "epoch": 0.12201271310664041, + "grad_norm": 0.0032670635264366865, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 4422 + }, + { + "epoch": 0.12204030530770478, + "grad_norm": 0.0023868351709097624, + "learning_rate": 0.001, + "loss": 0.3716, + "step": 4423 + }, + { + "epoch": 0.12206789750876915, + "grad_norm": 0.004251338541507721, + "learning_rate": 0.001, + "loss": 0.3969, + "step": 4424 + }, + { + "epoch": 0.12209548970983351, + "grad_norm": 0.011102253571152687, + "learning_rate": 0.001, + "loss": 0.3887, + "step": 4425 + }, + { + "epoch": 0.12212308191089788, + "grad_norm": 0.0027349465526640415, + "learning_rate": 0.001, + "loss": 0.3824, + "step": 4426 + }, + { + "epoch": 0.12215067411196226, + "grad_norm": 0.0032276634592562914, + "learning_rate": 0.001, + "loss": 0.4151, + "step": 4427 + }, + { + "epoch": 0.12217826631302663, + "grad_norm": 0.0035047761630266905, + "learning_rate": 0.001, + "loss": 0.3754, + "step": 4428 + }, + { + "epoch": 0.12220585851409099, + "grad_norm": 0.0022022626362740993, + "learning_rate": 0.001, + "loss": 0.4513, + "step": 4429 + }, + { + "epoch": 0.12223345071515536, + "grad_norm": 0.0028917898889631033, + "learning_rate": 0.001, + "loss": 0.4174, + "step": 4430 + }, + { + "epoch": 0.12226104291621973, + "grad_norm": 0.0040835002437233925, + "learning_rate": 0.001, + "loss": 0.3589, + "step": 4431 + }, + { + "epoch": 0.1222886351172841, + "grad_norm": 0.0042761219665408134, + "learning_rate": 0.001, + "loss": 0.4104, + "step": 4432 + }, + { + "epoch": 0.12231622731834847, + "grad_norm": 0.0031644178088754416, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 4433 + }, + { + "epoch": 0.12234381951941284, + "grad_norm": 0.005158254411071539, + "learning_rate": 0.001, + "loss": 0.387, + "step": 4434 + }, + { + "epoch": 0.1223714117204772, + "grad_norm": 0.00329927378334105, + "learning_rate": 0.001, + "loss": 0.3385, + "step": 4435 + }, + { + "epoch": 0.12239900392154157, + "grad_norm": 0.0028820266015827656, + "learning_rate": 0.001, + "loss": 0.3833, + "step": 4436 + }, + { + "epoch": 0.12242659612260595, + "grad_norm": 0.0035580925177782774, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 4437 + }, + { + "epoch": 0.12245418832367032, + "grad_norm": 0.0028019326273351908, + "learning_rate": 0.001, + "loss": 0.3772, + "step": 4438 + }, + { + "epoch": 0.12248178052473468, + "grad_norm": 0.002797899767756462, + "learning_rate": 0.001, + "loss": 0.4142, + "step": 4439 + }, + { + "epoch": 0.12250937272579905, + "grad_norm": 0.002947175409644842, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 4440 + }, + { + "epoch": 0.12253696492686342, + "grad_norm": 0.003999659325927496, + "learning_rate": 0.001, + "loss": 0.397, + "step": 4441 + }, + { + "epoch": 0.1225645571279278, + "grad_norm": 0.0022317490074783564, + "learning_rate": 0.001, + "loss": 0.4191, + "step": 4442 + }, + { + "epoch": 0.12259214932899216, + "grad_norm": 0.01325872354209423, + "learning_rate": 0.001, + "loss": 0.397, + "step": 4443 + }, + { + "epoch": 0.12261974153005653, + "grad_norm": 0.0043718283995985985, + "learning_rate": 0.001, + "loss": 0.4027, + "step": 4444 + }, + { + "epoch": 0.1226473337311209, + "grad_norm": 0.003364040283486247, + "learning_rate": 0.001, + "loss": 0.423, + "step": 4445 + }, + { + "epoch": 0.12267492593218526, + "grad_norm": 0.0032444903627038, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 4446 + }, + { + "epoch": 0.12270251813324963, + "grad_norm": 0.004509296268224716, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 4447 + }, + { + "epoch": 0.12273011033431401, + "grad_norm": 0.004130146466195583, + "learning_rate": 0.001, + "loss": 0.3735, + "step": 4448 + }, + { + "epoch": 0.12275770253537838, + "grad_norm": 0.003644294338300824, + "learning_rate": 0.001, + "loss": 0.4376, + "step": 4449 + }, + { + "epoch": 0.12278529473644274, + "grad_norm": 0.003993290476500988, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 4450 + }, + { + "epoch": 0.12281288693750711, + "grad_norm": 0.0026422853115946054, + "learning_rate": 0.001, + "loss": 0.4278, + "step": 4451 + }, + { + "epoch": 0.12284047913857148, + "grad_norm": 0.0023565879091620445, + "learning_rate": 0.001, + "loss": 0.4267, + "step": 4452 + }, + { + "epoch": 0.12286807133963586, + "grad_norm": 0.002863645553588867, + "learning_rate": 0.001, + "loss": 0.3796, + "step": 4453 + }, + { + "epoch": 0.12289566354070022, + "grad_norm": 0.0022721088025718927, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 4454 + }, + { + "epoch": 0.12292325574176459, + "grad_norm": 0.0033876546658575535, + "learning_rate": 0.001, + "loss": 0.3802, + "step": 4455 + }, + { + "epoch": 0.12295084794282896, + "grad_norm": 0.0030466553289443254, + "learning_rate": 0.001, + "loss": 0.4106, + "step": 4456 + }, + { + "epoch": 0.12297844014389332, + "grad_norm": 0.005601429846137762, + "learning_rate": 0.001, + "loss": 0.4128, + "step": 4457 + }, + { + "epoch": 0.1230060323449577, + "grad_norm": 0.0028402013704180717, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 4458 + }, + { + "epoch": 0.12303362454602207, + "grad_norm": 0.004646173678338528, + "learning_rate": 0.001, + "loss": 0.3447, + "step": 4459 + }, + { + "epoch": 0.12306121674708644, + "grad_norm": 0.002111830050125718, + "learning_rate": 0.001, + "loss": 0.4089, + "step": 4460 + }, + { + "epoch": 0.1230888089481508, + "grad_norm": 0.0025454754941165447, + "learning_rate": 0.001, + "loss": 0.3967, + "step": 4461 + }, + { + "epoch": 0.12311640114921517, + "grad_norm": 0.002491764025762677, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 4462 + }, + { + "epoch": 0.12314399335027955, + "grad_norm": 0.005627894774079323, + "learning_rate": 0.001, + "loss": 0.389, + "step": 4463 + }, + { + "epoch": 0.12317158555134392, + "grad_norm": 0.004091967828571796, + "learning_rate": 0.001, + "loss": 0.4273, + "step": 4464 + }, + { + "epoch": 0.12319917775240828, + "grad_norm": 0.0029696535784751177, + "learning_rate": 0.001, + "loss": 0.3962, + "step": 4465 + }, + { + "epoch": 0.12322676995347265, + "grad_norm": 0.004290423821657896, + "learning_rate": 0.001, + "loss": 0.3906, + "step": 4466 + }, + { + "epoch": 0.12325436215453701, + "grad_norm": 0.002825783099979162, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 4467 + }, + { + "epoch": 0.1232819543556014, + "grad_norm": 0.002694531576707959, + "learning_rate": 0.001, + "loss": 0.3918, + "step": 4468 + }, + { + "epoch": 0.12330954655666576, + "grad_norm": 0.002853821264579892, + "learning_rate": 0.001, + "loss": 0.3638, + "step": 4469 + }, + { + "epoch": 0.12333713875773013, + "grad_norm": 0.0025199069641530514, + "learning_rate": 0.001, + "loss": 0.384, + "step": 4470 + }, + { + "epoch": 0.1233647309587945, + "grad_norm": 0.003955396823585033, + "learning_rate": 0.001, + "loss": 0.3755, + "step": 4471 + }, + { + "epoch": 0.12339232315985886, + "grad_norm": 0.0027513199020177126, + "learning_rate": 0.001, + "loss": 0.3981, + "step": 4472 + }, + { + "epoch": 0.12341991536092324, + "grad_norm": 0.004522261209785938, + "learning_rate": 0.001, + "loss": 0.4623, + "step": 4473 + }, + { + "epoch": 0.12344750756198761, + "grad_norm": 0.003415697254240513, + "learning_rate": 0.001, + "loss": 0.3978, + "step": 4474 + }, + { + "epoch": 0.12347509976305197, + "grad_norm": 0.0028595994226634502, + "learning_rate": 0.001, + "loss": 0.3896, + "step": 4475 + }, + { + "epoch": 0.12350269196411634, + "grad_norm": 0.0041776299476623535, + "learning_rate": 0.001, + "loss": 0.4008, + "step": 4476 + }, + { + "epoch": 0.1235302841651807, + "grad_norm": 0.002562503796070814, + "learning_rate": 0.001, + "loss": 0.3921, + "step": 4477 + }, + { + "epoch": 0.12355787636624509, + "grad_norm": 0.0029480233788490295, + "learning_rate": 0.001, + "loss": 0.402, + "step": 4478 + }, + { + "epoch": 0.12358546856730945, + "grad_norm": 0.003219869453459978, + "learning_rate": 0.001, + "loss": 0.4032, + "step": 4479 + }, + { + "epoch": 0.12361306076837382, + "grad_norm": 0.002801501424983144, + "learning_rate": 0.001, + "loss": 0.3823, + "step": 4480 + }, + { + "epoch": 0.12364065296943819, + "grad_norm": 0.004376853816211224, + "learning_rate": 0.001, + "loss": 0.3591, + "step": 4481 + }, + { + "epoch": 0.12366824517050255, + "grad_norm": 0.0027625143993645906, + "learning_rate": 0.001, + "loss": 0.3813, + "step": 4482 + }, + { + "epoch": 0.12369583737156693, + "grad_norm": 0.0024436269886791706, + "learning_rate": 0.001, + "loss": 0.4358, + "step": 4483 + }, + { + "epoch": 0.1237234295726313, + "grad_norm": 0.0035639768466353416, + "learning_rate": 0.001, + "loss": 0.3796, + "step": 4484 + }, + { + "epoch": 0.12375102177369567, + "grad_norm": 0.0029148710891604424, + "learning_rate": 0.001, + "loss": 0.3849, + "step": 4485 + }, + { + "epoch": 0.12377861397476003, + "grad_norm": 0.0025241354014724493, + "learning_rate": 0.001, + "loss": 0.3841, + "step": 4486 + }, + { + "epoch": 0.1238062061758244, + "grad_norm": 0.003036417765542865, + "learning_rate": 0.001, + "loss": 0.398, + "step": 4487 + }, + { + "epoch": 0.12383379837688878, + "grad_norm": 0.002688635839149356, + "learning_rate": 0.001, + "loss": 0.3572, + "step": 4488 + }, + { + "epoch": 0.12386139057795315, + "grad_norm": 0.0027762525714933872, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 4489 + }, + { + "epoch": 0.12388898277901751, + "grad_norm": 0.0030563059262931347, + "learning_rate": 0.001, + "loss": 0.3528, + "step": 4490 + }, + { + "epoch": 0.12391657498008188, + "grad_norm": 0.006547185592353344, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 4491 + }, + { + "epoch": 0.12394416718114624, + "grad_norm": 0.0036082088481634855, + "learning_rate": 0.001, + "loss": 0.3909, + "step": 4492 + }, + { + "epoch": 0.12397175938221063, + "grad_norm": 0.007952166721224785, + "learning_rate": 0.001, + "loss": 0.3944, + "step": 4493 + }, + { + "epoch": 0.12399935158327499, + "grad_norm": 0.0032326721120625734, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 4494 + }, + { + "epoch": 0.12402694378433936, + "grad_norm": 0.003287110012024641, + "learning_rate": 0.001, + "loss": 0.3341, + "step": 4495 + }, + { + "epoch": 0.12405453598540372, + "grad_norm": 0.0022940190974622965, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 4496 + }, + { + "epoch": 0.12408212818646809, + "grad_norm": 0.007416036911308765, + "learning_rate": 0.001, + "loss": 0.404, + "step": 4497 + }, + { + "epoch": 0.12410972038753246, + "grad_norm": 0.002889924682676792, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 4498 + }, + { + "epoch": 0.12413731258859684, + "grad_norm": 0.002148964209482074, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 4499 + }, + { + "epoch": 0.1241649047896612, + "grad_norm": 0.004982593934983015, + "learning_rate": 0.001, + "loss": 0.4168, + "step": 4500 + }, + { + "epoch": 0.1241649047896612, + "eval_runtime": 24.5339, + "eval_samples_per_second": 1.304, + "eval_steps_per_second": 0.163, + "step": 4500 + }, + { + "epoch": 0.12419249699072557, + "grad_norm": 0.004382569808512926, + "learning_rate": 0.001, + "loss": 0.4112, + "step": 4501 + }, + { + "epoch": 0.12422008919178994, + "grad_norm": 0.003418430220335722, + "learning_rate": 0.001, + "loss": 0.4415, + "step": 4502 + }, + { + "epoch": 0.1242476813928543, + "grad_norm": 0.003198147751390934, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 4503 + }, + { + "epoch": 0.12427527359391868, + "grad_norm": 0.00246452703140676, + "learning_rate": 0.001, + "loss": 0.3847, + "step": 4504 + }, + { + "epoch": 0.12430286579498305, + "grad_norm": 0.0038448646664619446, + "learning_rate": 0.001, + "loss": 0.3798, + "step": 4505 + }, + { + "epoch": 0.12433045799604742, + "grad_norm": 0.0039044911973178387, + "learning_rate": 0.001, + "loss": 0.3727, + "step": 4506 + }, + { + "epoch": 0.12435805019711178, + "grad_norm": 0.003824865445494652, + "learning_rate": 0.001, + "loss": 0.3824, + "step": 4507 + }, + { + "epoch": 0.12438564239817615, + "grad_norm": 0.003275007475167513, + "learning_rate": 0.001, + "loss": 0.4156, + "step": 4508 + }, + { + "epoch": 0.12441323459924053, + "grad_norm": 0.00457676500082016, + "learning_rate": 0.001, + "loss": 0.3967, + "step": 4509 + }, + { + "epoch": 0.1244408268003049, + "grad_norm": 0.004399159457534552, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 4510 + }, + { + "epoch": 0.12446841900136926, + "grad_norm": 0.0027072380762547255, + "learning_rate": 0.001, + "loss": 0.4299, + "step": 4511 + }, + { + "epoch": 0.12449601120243363, + "grad_norm": 0.0025107194669544697, + "learning_rate": 0.001, + "loss": 0.3946, + "step": 4512 + }, + { + "epoch": 0.124523603403498, + "grad_norm": 0.004047545604407787, + "learning_rate": 0.001, + "loss": 0.3901, + "step": 4513 + }, + { + "epoch": 0.12455119560456238, + "grad_norm": 0.002424485282972455, + "learning_rate": 0.001, + "loss": 0.4101, + "step": 4514 + }, + { + "epoch": 0.12457878780562674, + "grad_norm": 0.00391024025157094, + "learning_rate": 0.001, + "loss": 0.3942, + "step": 4515 + }, + { + "epoch": 0.12460638000669111, + "grad_norm": 0.0026012531016021967, + "learning_rate": 0.001, + "loss": 0.4127, + "step": 4516 + }, + { + "epoch": 0.12463397220775548, + "grad_norm": 0.0023119868710637093, + "learning_rate": 0.001, + "loss": 0.4309, + "step": 4517 + }, + { + "epoch": 0.12466156440881984, + "grad_norm": 0.0023387623950839043, + "learning_rate": 0.001, + "loss": 0.4148, + "step": 4518 + }, + { + "epoch": 0.12468915660988422, + "grad_norm": 0.0031953216530382633, + "learning_rate": 0.001, + "loss": 0.394, + "step": 4519 + }, + { + "epoch": 0.12471674881094859, + "grad_norm": 0.002653286559507251, + "learning_rate": 0.001, + "loss": 0.3617, + "step": 4520 + }, + { + "epoch": 0.12474434101201295, + "grad_norm": 0.006683747284114361, + "learning_rate": 0.001, + "loss": 0.3744, + "step": 4521 + }, + { + "epoch": 0.12477193321307732, + "grad_norm": 0.002483023563399911, + "learning_rate": 0.001, + "loss": 0.3819, + "step": 4522 + }, + { + "epoch": 0.12479952541414169, + "grad_norm": 0.004147149156779051, + "learning_rate": 0.001, + "loss": 0.4002, + "step": 4523 + }, + { + "epoch": 0.12482711761520607, + "grad_norm": 0.0026530877221375704, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 4524 + }, + { + "epoch": 0.12485470981627043, + "grad_norm": 0.003452074248343706, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 4525 + }, + { + "epoch": 0.1248823020173348, + "grad_norm": 0.0028085915837436914, + "learning_rate": 0.001, + "loss": 0.3797, + "step": 4526 + }, + { + "epoch": 0.12490989421839917, + "grad_norm": 0.0034218342043459415, + "learning_rate": 0.001, + "loss": 0.4248, + "step": 4527 + }, + { + "epoch": 0.12493748641946353, + "grad_norm": 0.007770628668367863, + "learning_rate": 0.001, + "loss": 0.3624, + "step": 4528 + }, + { + "epoch": 0.12496507862052791, + "grad_norm": 0.0029227614868432283, + "learning_rate": 0.001, + "loss": 0.4236, + "step": 4529 + }, + { + "epoch": 0.12499267082159228, + "grad_norm": 0.0030319190118461847, + "learning_rate": 0.001, + "loss": 0.3845, + "step": 4530 + }, + { + "epoch": 0.12502026302265665, + "grad_norm": 0.002946640131995082, + "learning_rate": 0.001, + "loss": 0.3868, + "step": 4531 + }, + { + "epoch": 0.12504785522372103, + "grad_norm": 0.003349126083776355, + "learning_rate": 0.001, + "loss": 0.4028, + "step": 4532 + }, + { + "epoch": 0.12507544742478538, + "grad_norm": 0.0028201621025800705, + "learning_rate": 0.001, + "loss": 0.4228, + "step": 4533 + }, + { + "epoch": 0.12510303962584976, + "grad_norm": 0.002343755681067705, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 4534 + }, + { + "epoch": 0.1251306318269141, + "grad_norm": 0.004897142760455608, + "learning_rate": 0.001, + "loss": 0.4179, + "step": 4535 + }, + { + "epoch": 0.1251582240279785, + "grad_norm": 0.003420259803533554, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 4536 + }, + { + "epoch": 0.12518581622904287, + "grad_norm": 0.005415117833763361, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 4537 + }, + { + "epoch": 0.12521340843010723, + "grad_norm": 0.009312170557677746, + "learning_rate": 0.001, + "loss": 0.4319, + "step": 4538 + }, + { + "epoch": 0.1252410006311716, + "grad_norm": 0.0031622762326151133, + "learning_rate": 0.001, + "loss": 0.404, + "step": 4539 + }, + { + "epoch": 0.12526859283223596, + "grad_norm": 0.0035899661015719175, + "learning_rate": 0.001, + "loss": 0.4331, + "step": 4540 + }, + { + "epoch": 0.12529618503330034, + "grad_norm": 0.005790769122540951, + "learning_rate": 0.001, + "loss": 0.3559, + "step": 4541 + }, + { + "epoch": 0.12532377723436472, + "grad_norm": 0.003316226415336132, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 4542 + }, + { + "epoch": 0.12535136943542907, + "grad_norm": 0.003180850762873888, + "learning_rate": 0.001, + "loss": 0.3986, + "step": 4543 + }, + { + "epoch": 0.12537896163649345, + "grad_norm": 0.0034962312784045935, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 4544 + }, + { + "epoch": 0.1254065538375578, + "grad_norm": 0.0036736545152962208, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 4545 + }, + { + "epoch": 0.12543414603862219, + "grad_norm": 0.0032101564574986696, + "learning_rate": 0.001, + "loss": 0.4218, + "step": 4546 + }, + { + "epoch": 0.12546173823968657, + "grad_norm": 0.0029373078141361475, + "learning_rate": 0.001, + "loss": 0.4029, + "step": 4547 + }, + { + "epoch": 0.12548933044075092, + "grad_norm": 0.0039803143590688705, + "learning_rate": 0.001, + "loss": 0.4196, + "step": 4548 + }, + { + "epoch": 0.1255169226418153, + "grad_norm": 0.006120176054537296, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 4549 + }, + { + "epoch": 0.12554451484287965, + "grad_norm": 0.004431293345987797, + "learning_rate": 0.001, + "loss": 0.4117, + "step": 4550 + }, + { + "epoch": 0.12557210704394403, + "grad_norm": 0.0034864030312746763, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 4551 + }, + { + "epoch": 0.1255996992450084, + "grad_norm": 0.003251910675317049, + "learning_rate": 0.001, + "loss": 0.3791, + "step": 4552 + }, + { + "epoch": 0.12562729144607276, + "grad_norm": 0.0032173239160329103, + "learning_rate": 0.001, + "loss": 0.358, + "step": 4553 + }, + { + "epoch": 0.12565488364713714, + "grad_norm": 0.002825138159096241, + "learning_rate": 0.001, + "loss": 0.3771, + "step": 4554 + }, + { + "epoch": 0.1256824758482015, + "grad_norm": 0.0024809478782117367, + "learning_rate": 0.001, + "loss": 0.4102, + "step": 4555 + }, + { + "epoch": 0.12571006804926588, + "grad_norm": 0.0038729074876755476, + "learning_rate": 0.001, + "loss": 0.3765, + "step": 4556 + }, + { + "epoch": 0.12573766025033026, + "grad_norm": 0.0027885730378329754, + "learning_rate": 0.001, + "loss": 0.4325, + "step": 4557 + }, + { + "epoch": 0.1257652524513946, + "grad_norm": 0.002833339385688305, + "learning_rate": 0.001, + "loss": 0.4231, + "step": 4558 + }, + { + "epoch": 0.125792844652459, + "grad_norm": 0.00270766019821167, + "learning_rate": 0.001, + "loss": 0.3694, + "step": 4559 + }, + { + "epoch": 0.12582043685352334, + "grad_norm": 0.0024998581502586603, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 4560 + }, + { + "epoch": 0.12584802905458772, + "grad_norm": 0.010215898975729942, + "learning_rate": 0.001, + "loss": 0.359, + "step": 4561 + }, + { + "epoch": 0.1258756212556521, + "grad_norm": 0.015013232827186584, + "learning_rate": 0.001, + "loss": 0.3853, + "step": 4562 + }, + { + "epoch": 0.12590321345671646, + "grad_norm": 0.010075882077217102, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 4563 + }, + { + "epoch": 0.12593080565778084, + "grad_norm": 0.02185934968292713, + "learning_rate": 0.001, + "loss": 0.3701, + "step": 4564 + }, + { + "epoch": 0.1259583978588452, + "grad_norm": 0.003104067873209715, + "learning_rate": 0.001, + "loss": 0.4061, + "step": 4565 + }, + { + "epoch": 0.12598599005990957, + "grad_norm": 0.0032787187956273556, + "learning_rate": 0.001, + "loss": 0.397, + "step": 4566 + }, + { + "epoch": 0.12601358226097392, + "grad_norm": 0.002555015729740262, + "learning_rate": 0.001, + "loss": 0.4352, + "step": 4567 + }, + { + "epoch": 0.1260411744620383, + "grad_norm": 0.0023251352831721306, + "learning_rate": 0.001, + "loss": 0.4093, + "step": 4568 + }, + { + "epoch": 0.12606876666310268, + "grad_norm": 0.0028923735953867435, + "learning_rate": 0.001, + "loss": 0.4166, + "step": 4569 + }, + { + "epoch": 0.12609635886416704, + "grad_norm": 0.003060357179492712, + "learning_rate": 0.001, + "loss": 0.388, + "step": 4570 + }, + { + "epoch": 0.12612395106523142, + "grad_norm": 0.0036045622546225786, + "learning_rate": 0.001, + "loss": 0.3544, + "step": 4571 + }, + { + "epoch": 0.12615154326629577, + "grad_norm": 0.002739776624366641, + "learning_rate": 0.001, + "loss": 0.4174, + "step": 4572 + }, + { + "epoch": 0.12617913546736015, + "grad_norm": 0.0037477100268006325, + "learning_rate": 0.001, + "loss": 0.4281, + "step": 4573 + }, + { + "epoch": 0.12620672766842453, + "grad_norm": 0.002796964254230261, + "learning_rate": 0.001, + "loss": 0.3802, + "step": 4574 + }, + { + "epoch": 0.12623431986948888, + "grad_norm": 0.0034001306630671024, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 4575 + }, + { + "epoch": 0.12626191207055326, + "grad_norm": 0.004743472672998905, + "learning_rate": 0.001, + "loss": 0.4293, + "step": 4576 + }, + { + "epoch": 0.12628950427161761, + "grad_norm": 0.0023272617254406214, + "learning_rate": 0.001, + "loss": 0.3782, + "step": 4577 + }, + { + "epoch": 0.126317096472682, + "grad_norm": 0.0036957256961613894, + "learning_rate": 0.001, + "loss": 0.3981, + "step": 4578 + }, + { + "epoch": 0.12634468867374637, + "grad_norm": 0.002441568998619914, + "learning_rate": 0.001, + "loss": 0.4053, + "step": 4579 + }, + { + "epoch": 0.12637228087481073, + "grad_norm": 0.003656483720988035, + "learning_rate": 0.001, + "loss": 0.3826, + "step": 4580 + }, + { + "epoch": 0.1263998730758751, + "grad_norm": 0.005753074306994677, + "learning_rate": 0.001, + "loss": 0.3647, + "step": 4581 + }, + { + "epoch": 0.12642746527693946, + "grad_norm": 0.005195307079702616, + "learning_rate": 0.001, + "loss": 0.4113, + "step": 4582 + }, + { + "epoch": 0.12645505747800384, + "grad_norm": 0.003572377609089017, + "learning_rate": 0.001, + "loss": 0.4046, + "step": 4583 + }, + { + "epoch": 0.12648264967906822, + "grad_norm": 0.006414738483726978, + "learning_rate": 0.001, + "loss": 0.3894, + "step": 4584 + }, + { + "epoch": 0.12651024188013257, + "grad_norm": 0.003181818872690201, + "learning_rate": 0.001, + "loss": 0.376, + "step": 4585 + }, + { + "epoch": 0.12653783408119695, + "grad_norm": 0.0062148310244083405, + "learning_rate": 0.001, + "loss": 0.3956, + "step": 4586 + }, + { + "epoch": 0.1265654262822613, + "grad_norm": 0.00339969783090055, + "learning_rate": 0.001, + "loss": 0.4389, + "step": 4587 + }, + { + "epoch": 0.1265930184833257, + "grad_norm": 0.003279429627582431, + "learning_rate": 0.001, + "loss": 0.415, + "step": 4588 + }, + { + "epoch": 0.12662061068439007, + "grad_norm": 0.003928397316485643, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 4589 + }, + { + "epoch": 0.12664820288545442, + "grad_norm": 0.0071240440011024475, + "learning_rate": 0.001, + "loss": 0.371, + "step": 4590 + }, + { + "epoch": 0.1266757950865188, + "grad_norm": 0.0035491851158440113, + "learning_rate": 0.001, + "loss": 0.3887, + "step": 4591 + }, + { + "epoch": 0.12670338728758315, + "grad_norm": 0.003695262363180518, + "learning_rate": 0.001, + "loss": 0.3705, + "step": 4592 + }, + { + "epoch": 0.12673097948864753, + "grad_norm": 0.002730879234150052, + "learning_rate": 0.001, + "loss": 0.4148, + "step": 4593 + }, + { + "epoch": 0.1267585716897119, + "grad_norm": 0.0026994007639586926, + "learning_rate": 0.001, + "loss": 0.4262, + "step": 4594 + }, + { + "epoch": 0.12678616389077627, + "grad_norm": 0.0025865414645522833, + "learning_rate": 0.001, + "loss": 0.4126, + "step": 4595 + }, + { + "epoch": 0.12681375609184065, + "grad_norm": 0.003949947189539671, + "learning_rate": 0.001, + "loss": 0.3755, + "step": 4596 + }, + { + "epoch": 0.126841348292905, + "grad_norm": 0.0021580797620117664, + "learning_rate": 0.001, + "loss": 0.3898, + "step": 4597 + }, + { + "epoch": 0.12686894049396938, + "grad_norm": 0.003605265635997057, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 4598 + }, + { + "epoch": 0.12689653269503376, + "grad_norm": 0.0027530419174581766, + "learning_rate": 0.001, + "loss": 0.4002, + "step": 4599 + }, + { + "epoch": 0.1269241248960981, + "grad_norm": 0.002252806443721056, + "learning_rate": 0.001, + "loss": 0.4241, + "step": 4600 + }, + { + "epoch": 0.1269517170971625, + "grad_norm": 0.0026294796261936426, + "learning_rate": 0.001, + "loss": 0.4238, + "step": 4601 + }, + { + "epoch": 0.12697930929822684, + "grad_norm": 0.00271353917196393, + "learning_rate": 0.001, + "loss": 0.412, + "step": 4602 + }, + { + "epoch": 0.12700690149929122, + "grad_norm": 0.004706821870058775, + "learning_rate": 0.001, + "loss": 0.3532, + "step": 4603 + }, + { + "epoch": 0.1270344937003556, + "grad_norm": 0.0026412513107061386, + "learning_rate": 0.001, + "loss": 0.4048, + "step": 4604 + }, + { + "epoch": 0.12706208590141996, + "grad_norm": 0.00251567829400301, + "learning_rate": 0.001, + "loss": 0.3959, + "step": 4605 + }, + { + "epoch": 0.12708967810248434, + "grad_norm": 0.003459150902926922, + "learning_rate": 0.001, + "loss": 0.4001, + "step": 4606 + }, + { + "epoch": 0.1271172703035487, + "grad_norm": 0.002293146215379238, + "learning_rate": 0.001, + "loss": 0.3427, + "step": 4607 + }, + { + "epoch": 0.12714486250461307, + "grad_norm": 0.0029938959050923586, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 4608 + }, + { + "epoch": 0.12717245470567745, + "grad_norm": 0.0036902104038745165, + "learning_rate": 0.001, + "loss": 0.3973, + "step": 4609 + }, + { + "epoch": 0.1272000469067418, + "grad_norm": 0.004806086421012878, + "learning_rate": 0.001, + "loss": 0.4121, + "step": 4610 + }, + { + "epoch": 0.12722763910780618, + "grad_norm": 0.002169287297874689, + "learning_rate": 0.001, + "loss": 0.4396, + "step": 4611 + }, + { + "epoch": 0.12725523130887054, + "grad_norm": 0.0030692543368786573, + "learning_rate": 0.001, + "loss": 0.3516, + "step": 4612 + }, + { + "epoch": 0.12728282350993492, + "grad_norm": 0.004069609101861715, + "learning_rate": 0.001, + "loss": 0.4112, + "step": 4613 + }, + { + "epoch": 0.1273104157109993, + "grad_norm": 0.0051890406757593155, + "learning_rate": 0.001, + "loss": 0.3719, + "step": 4614 + }, + { + "epoch": 0.12733800791206365, + "grad_norm": 0.002785927150398493, + "learning_rate": 0.001, + "loss": 0.3835, + "step": 4615 + }, + { + "epoch": 0.12736560011312803, + "grad_norm": 0.0058522410690784454, + "learning_rate": 0.001, + "loss": 0.417, + "step": 4616 + }, + { + "epoch": 0.12739319231419238, + "grad_norm": 0.0026462904643267393, + "learning_rate": 0.001, + "loss": 0.4455, + "step": 4617 + }, + { + "epoch": 0.12742078451525676, + "grad_norm": 0.012372707948088646, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 4618 + }, + { + "epoch": 0.12744837671632114, + "grad_norm": 0.003128821961581707, + "learning_rate": 0.001, + "loss": 0.4198, + "step": 4619 + }, + { + "epoch": 0.1274759689173855, + "grad_norm": 0.004159901756793261, + "learning_rate": 0.001, + "loss": 0.4282, + "step": 4620 + }, + { + "epoch": 0.12750356111844988, + "grad_norm": 0.00412391172721982, + "learning_rate": 0.001, + "loss": 0.3741, + "step": 4621 + }, + { + "epoch": 0.12753115331951423, + "grad_norm": 0.002524176612496376, + "learning_rate": 0.001, + "loss": 0.4158, + "step": 4622 + }, + { + "epoch": 0.1275587455205786, + "grad_norm": 0.004930204711854458, + "learning_rate": 0.001, + "loss": 0.3922, + "step": 4623 + }, + { + "epoch": 0.127586337721643, + "grad_norm": 0.003738861531019211, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 4624 + }, + { + "epoch": 0.12761392992270734, + "grad_norm": 0.0171899925917387, + "learning_rate": 0.001, + "loss": 0.391, + "step": 4625 + }, + { + "epoch": 0.12764152212377172, + "grad_norm": 0.018499117344617844, + "learning_rate": 0.001, + "loss": 0.3724, + "step": 4626 + }, + { + "epoch": 0.12766911432483607, + "grad_norm": 0.0035764595959335566, + "learning_rate": 0.001, + "loss": 0.3737, + "step": 4627 + }, + { + "epoch": 0.12769670652590046, + "grad_norm": 0.00345508917234838, + "learning_rate": 0.001, + "loss": 0.3695, + "step": 4628 + }, + { + "epoch": 0.12772429872696484, + "grad_norm": 0.0037645609118044376, + "learning_rate": 0.001, + "loss": 0.3914, + "step": 4629 + }, + { + "epoch": 0.1277518909280292, + "grad_norm": 0.0043193078599870205, + "learning_rate": 0.001, + "loss": 0.3713, + "step": 4630 + }, + { + "epoch": 0.12777948312909357, + "grad_norm": 0.0038787908852100372, + "learning_rate": 0.001, + "loss": 0.4257, + "step": 4631 + }, + { + "epoch": 0.12780707533015792, + "grad_norm": 0.007828318513929844, + "learning_rate": 0.001, + "loss": 0.4126, + "step": 4632 + }, + { + "epoch": 0.1278346675312223, + "grad_norm": 0.0029059904627501965, + "learning_rate": 0.001, + "loss": 0.4446, + "step": 4633 + }, + { + "epoch": 0.12786225973228668, + "grad_norm": 0.005105176474899054, + "learning_rate": 0.001, + "loss": 0.3566, + "step": 4634 + }, + { + "epoch": 0.12788985193335103, + "grad_norm": 0.005164528265595436, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 4635 + }, + { + "epoch": 0.12791744413441541, + "grad_norm": 0.003127202857285738, + "learning_rate": 0.001, + "loss": 0.3882, + "step": 4636 + }, + { + "epoch": 0.12794503633547977, + "grad_norm": 0.004962852690368891, + "learning_rate": 0.001, + "loss": 0.3678, + "step": 4637 + }, + { + "epoch": 0.12797262853654415, + "grad_norm": 0.003928507212549448, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 4638 + }, + { + "epoch": 0.12800022073760853, + "grad_norm": 0.002523329108953476, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 4639 + }, + { + "epoch": 0.12802781293867288, + "grad_norm": 0.003745138179510832, + "learning_rate": 0.001, + "loss": 0.4143, + "step": 4640 + }, + { + "epoch": 0.12805540513973726, + "grad_norm": 0.007446894887834787, + "learning_rate": 0.001, + "loss": 0.4105, + "step": 4641 + }, + { + "epoch": 0.1280829973408016, + "grad_norm": 0.0024344183038920164, + "learning_rate": 0.001, + "loss": 0.4278, + "step": 4642 + }, + { + "epoch": 0.128110589541866, + "grad_norm": 0.0059473030269145966, + "learning_rate": 0.001, + "loss": 0.4239, + "step": 4643 + }, + { + "epoch": 0.12813818174293037, + "grad_norm": 0.00471229525282979, + "learning_rate": 0.001, + "loss": 0.3605, + "step": 4644 + }, + { + "epoch": 0.12816577394399473, + "grad_norm": 0.004949926398694515, + "learning_rate": 0.001, + "loss": 0.375, + "step": 4645 + }, + { + "epoch": 0.1281933661450591, + "grad_norm": 0.0033453593496233225, + "learning_rate": 0.001, + "loss": 0.3764, + "step": 4646 + }, + { + "epoch": 0.12822095834612346, + "grad_norm": 0.003853026544675231, + "learning_rate": 0.001, + "loss": 0.4356, + "step": 4647 + }, + { + "epoch": 0.12824855054718784, + "grad_norm": 0.0037174660246819258, + "learning_rate": 0.001, + "loss": 0.392, + "step": 4648 + }, + { + "epoch": 0.12827614274825222, + "grad_norm": 0.002829108154401183, + "learning_rate": 0.001, + "loss": 0.414, + "step": 4649 + }, + { + "epoch": 0.12830373494931657, + "grad_norm": 0.002781393239274621, + "learning_rate": 0.001, + "loss": 0.4145, + "step": 4650 + }, + { + "epoch": 0.12833132715038095, + "grad_norm": 0.012161584571003914, + "learning_rate": 0.001, + "loss": 0.4238, + "step": 4651 + }, + { + "epoch": 0.1283589193514453, + "grad_norm": 0.004447166807949543, + "learning_rate": 0.001, + "loss": 0.3697, + "step": 4652 + }, + { + "epoch": 0.12838651155250969, + "grad_norm": 0.010521035641431808, + "learning_rate": 0.001, + "loss": 0.3953, + "step": 4653 + }, + { + "epoch": 0.12841410375357407, + "grad_norm": 0.0034564679954200983, + "learning_rate": 0.001, + "loss": 0.4372, + "step": 4654 + }, + { + "epoch": 0.12844169595463842, + "grad_norm": 0.003944997675716877, + "learning_rate": 0.001, + "loss": 0.4111, + "step": 4655 + }, + { + "epoch": 0.1284692881557028, + "grad_norm": 0.0037498660385608673, + "learning_rate": 0.001, + "loss": 0.3737, + "step": 4656 + }, + { + "epoch": 0.12849688035676715, + "grad_norm": 0.0032498843502253294, + "learning_rate": 0.001, + "loss": 0.4231, + "step": 4657 + }, + { + "epoch": 0.12852447255783153, + "grad_norm": 0.007645866367965937, + "learning_rate": 0.001, + "loss": 0.3664, + "step": 4658 + }, + { + "epoch": 0.12855206475889588, + "grad_norm": 0.006332173477858305, + "learning_rate": 0.001, + "loss": 0.4242, + "step": 4659 + }, + { + "epoch": 0.12857965695996026, + "grad_norm": 0.004206281155347824, + "learning_rate": 0.001, + "loss": 0.4048, + "step": 4660 + }, + { + "epoch": 0.12860724916102464, + "grad_norm": 0.0026335117872804403, + "learning_rate": 0.001, + "loss": 0.4098, + "step": 4661 + }, + { + "epoch": 0.128634841362089, + "grad_norm": 0.004516151268035173, + "learning_rate": 0.001, + "loss": 0.3905, + "step": 4662 + }, + { + "epoch": 0.12866243356315338, + "grad_norm": 0.0032997119706124067, + "learning_rate": 0.001, + "loss": 0.4237, + "step": 4663 + }, + { + "epoch": 0.12869002576421773, + "grad_norm": 0.0038388820830732584, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 4664 + }, + { + "epoch": 0.1287176179652821, + "grad_norm": 0.0035206389147788286, + "learning_rate": 0.001, + "loss": 0.4026, + "step": 4665 + }, + { + "epoch": 0.1287452101663465, + "grad_norm": 0.005415271036326885, + "learning_rate": 0.001, + "loss": 0.4253, + "step": 4666 + }, + { + "epoch": 0.12877280236741084, + "grad_norm": 0.002756484318524599, + "learning_rate": 0.001, + "loss": 0.3805, + "step": 4667 + }, + { + "epoch": 0.12880039456847522, + "grad_norm": 0.0028146097902208567, + "learning_rate": 0.001, + "loss": 0.4216, + "step": 4668 + }, + { + "epoch": 0.12882798676953958, + "grad_norm": 0.004214905202388763, + "learning_rate": 0.001, + "loss": 0.4183, + "step": 4669 + }, + { + "epoch": 0.12885557897060396, + "grad_norm": 0.00516669312492013, + "learning_rate": 0.001, + "loss": 0.3785, + "step": 4670 + }, + { + "epoch": 0.12888317117166834, + "grad_norm": 0.0031740644481033087, + "learning_rate": 0.001, + "loss": 0.3944, + "step": 4671 + }, + { + "epoch": 0.1289107633727327, + "grad_norm": 0.004468605387955904, + "learning_rate": 0.001, + "loss": 0.4262, + "step": 4672 + }, + { + "epoch": 0.12893835557379707, + "grad_norm": 0.003522511338815093, + "learning_rate": 0.001, + "loss": 0.4624, + "step": 4673 + }, + { + "epoch": 0.12896594777486142, + "grad_norm": 0.002765173325315118, + "learning_rate": 0.001, + "loss": 0.4136, + "step": 4674 + }, + { + "epoch": 0.1289935399759258, + "grad_norm": 0.0032698458526283503, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 4675 + }, + { + "epoch": 0.12902113217699018, + "grad_norm": 0.0034274624194949865, + "learning_rate": 0.001, + "loss": 0.3779, + "step": 4676 + }, + { + "epoch": 0.12904872437805454, + "grad_norm": 0.003263387130573392, + "learning_rate": 0.001, + "loss": 0.4153, + "step": 4677 + }, + { + "epoch": 0.12907631657911892, + "grad_norm": 0.0030025255400687456, + "learning_rate": 0.001, + "loss": 0.406, + "step": 4678 + }, + { + "epoch": 0.12910390878018327, + "grad_norm": 0.003843993414193392, + "learning_rate": 0.001, + "loss": 0.3746, + "step": 4679 + }, + { + "epoch": 0.12913150098124765, + "grad_norm": 0.0030434427317231894, + "learning_rate": 0.001, + "loss": 0.4037, + "step": 4680 + }, + { + "epoch": 0.12915909318231203, + "grad_norm": 0.0034381363075226545, + "learning_rate": 0.001, + "loss": 0.3714, + "step": 4681 + }, + { + "epoch": 0.12918668538337638, + "grad_norm": 0.0032730584498494864, + "learning_rate": 0.001, + "loss": 0.388, + "step": 4682 + }, + { + "epoch": 0.12921427758444076, + "grad_norm": 0.004053408745676279, + "learning_rate": 0.001, + "loss": 0.3626, + "step": 4683 + }, + { + "epoch": 0.12924186978550511, + "grad_norm": 0.003272439120337367, + "learning_rate": 0.001, + "loss": 0.4149, + "step": 4684 + }, + { + "epoch": 0.1292694619865695, + "grad_norm": 0.0030873471405357122, + "learning_rate": 0.001, + "loss": 0.3804, + "step": 4685 + }, + { + "epoch": 0.12929705418763388, + "grad_norm": 0.0030633402056992054, + "learning_rate": 0.001, + "loss": 0.4443, + "step": 4686 + }, + { + "epoch": 0.12932464638869823, + "grad_norm": 0.003019932424649596, + "learning_rate": 0.001, + "loss": 0.3883, + "step": 4687 + }, + { + "epoch": 0.1293522385897626, + "grad_norm": 0.0027192363049834967, + "learning_rate": 0.001, + "loss": 0.4195, + "step": 4688 + }, + { + "epoch": 0.12937983079082696, + "grad_norm": 0.003051141509786248, + "learning_rate": 0.001, + "loss": 0.4528, + "step": 4689 + }, + { + "epoch": 0.12940742299189134, + "grad_norm": 0.004393309820443392, + "learning_rate": 0.001, + "loss": 0.3668, + "step": 4690 + }, + { + "epoch": 0.12943501519295572, + "grad_norm": 0.004716060124337673, + "learning_rate": 0.001, + "loss": 0.3668, + "step": 4691 + }, + { + "epoch": 0.12946260739402007, + "grad_norm": 0.0047832694835960865, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 4692 + }, + { + "epoch": 0.12949019959508445, + "grad_norm": 0.0040343874134123325, + "learning_rate": 0.001, + "loss": 0.4414, + "step": 4693 + }, + { + "epoch": 0.1295177917961488, + "grad_norm": 0.006127424072474241, + "learning_rate": 0.001, + "loss": 0.4169, + "step": 4694 + }, + { + "epoch": 0.1295453839972132, + "grad_norm": 0.002882004715502262, + "learning_rate": 0.001, + "loss": 0.4052, + "step": 4695 + }, + { + "epoch": 0.12957297619827757, + "grad_norm": 0.003973776008933783, + "learning_rate": 0.001, + "loss": 0.393, + "step": 4696 + }, + { + "epoch": 0.12960056839934192, + "grad_norm": 0.0027190132532268763, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 4697 + }, + { + "epoch": 0.1296281606004063, + "grad_norm": 0.0025696575175970793, + "learning_rate": 0.001, + "loss": 0.3987, + "step": 4698 + }, + { + "epoch": 0.12965575280147065, + "grad_norm": 0.0032083040568977594, + "learning_rate": 0.001, + "loss": 0.3845, + "step": 4699 + }, + { + "epoch": 0.12968334500253503, + "grad_norm": 0.0025488468818366528, + "learning_rate": 0.001, + "loss": 0.4216, + "step": 4700 + }, + { + "epoch": 0.1297109372035994, + "grad_norm": 0.009488187730312347, + "learning_rate": 0.001, + "loss": 0.4063, + "step": 4701 + }, + { + "epoch": 0.12973852940466377, + "grad_norm": 0.0029801761265844107, + "learning_rate": 0.001, + "loss": 0.4158, + "step": 4702 + }, + { + "epoch": 0.12976612160572815, + "grad_norm": 0.0247980747371912, + "learning_rate": 0.001, + "loss": 0.3803, + "step": 4703 + }, + { + "epoch": 0.1297937138067925, + "grad_norm": 0.002381372032687068, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 4704 + }, + { + "epoch": 0.12982130600785688, + "grad_norm": 0.0035768726374953985, + "learning_rate": 0.001, + "loss": 0.3934, + "step": 4705 + }, + { + "epoch": 0.12984889820892126, + "grad_norm": 0.003190788673236966, + "learning_rate": 0.001, + "loss": 0.4226, + "step": 4706 + }, + { + "epoch": 0.1298764904099856, + "grad_norm": 0.00331826857291162, + "learning_rate": 0.001, + "loss": 0.4116, + "step": 4707 + }, + { + "epoch": 0.12990408261105, + "grad_norm": 0.0032035168260335922, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 4708 + }, + { + "epoch": 0.12993167481211434, + "grad_norm": 0.0037546653766185045, + "learning_rate": 0.001, + "loss": 0.3856, + "step": 4709 + }, + { + "epoch": 0.12995926701317873, + "grad_norm": 0.0026541994884610176, + "learning_rate": 0.001, + "loss": 0.4312, + "step": 4710 + }, + { + "epoch": 0.1299868592142431, + "grad_norm": 0.004107889253646135, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 4711 + }, + { + "epoch": 0.13001445141530746, + "grad_norm": 0.002514521824195981, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 4712 + }, + { + "epoch": 0.13004204361637184, + "grad_norm": 0.0028199944645166397, + "learning_rate": 0.001, + "loss": 0.3967, + "step": 4713 + }, + { + "epoch": 0.1300696358174362, + "grad_norm": 0.0027020128909498453, + "learning_rate": 0.001, + "loss": 0.449, + "step": 4714 + }, + { + "epoch": 0.13009722801850057, + "grad_norm": 0.003255255287513137, + "learning_rate": 0.001, + "loss": 0.4046, + "step": 4715 + }, + { + "epoch": 0.13012482021956495, + "grad_norm": 0.0035547774750739336, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 4716 + }, + { + "epoch": 0.1301524124206293, + "grad_norm": 0.0025250576436519623, + "learning_rate": 0.001, + "loss": 0.4117, + "step": 4717 + }, + { + "epoch": 0.13018000462169368, + "grad_norm": 0.002805963857099414, + "learning_rate": 0.001, + "loss": 0.3898, + "step": 4718 + }, + { + "epoch": 0.13020759682275804, + "grad_norm": 0.005695881322026253, + "learning_rate": 0.001, + "loss": 0.4223, + "step": 4719 + }, + { + "epoch": 0.13023518902382242, + "grad_norm": 0.002897542668506503, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 4720 + }, + { + "epoch": 0.1302627812248868, + "grad_norm": 0.0037734811194241047, + "learning_rate": 0.001, + "loss": 0.3864, + "step": 4721 + }, + { + "epoch": 0.13029037342595115, + "grad_norm": 0.003529176115989685, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 4722 + }, + { + "epoch": 0.13031796562701553, + "grad_norm": 0.003130922093987465, + "learning_rate": 0.001, + "loss": 0.4175, + "step": 4723 + }, + { + "epoch": 0.13034555782807988, + "grad_norm": 0.0031659335363656282, + "learning_rate": 0.001, + "loss": 0.4084, + "step": 4724 + }, + { + "epoch": 0.13037315002914426, + "grad_norm": 0.002901850501075387, + "learning_rate": 0.001, + "loss": 0.3685, + "step": 4725 + }, + { + "epoch": 0.13040074223020864, + "grad_norm": 0.0034328114707022905, + "learning_rate": 0.001, + "loss": 0.4135, + "step": 4726 + }, + { + "epoch": 0.130428334431273, + "grad_norm": 0.0028659338131546974, + "learning_rate": 0.001, + "loss": 0.4166, + "step": 4727 + }, + { + "epoch": 0.13045592663233738, + "grad_norm": 0.002500742208212614, + "learning_rate": 0.001, + "loss": 0.431, + "step": 4728 + }, + { + "epoch": 0.13048351883340173, + "grad_norm": 0.004093059338629246, + "learning_rate": 0.001, + "loss": 0.4415, + "step": 4729 + }, + { + "epoch": 0.1305111110344661, + "grad_norm": 0.006217313464730978, + "learning_rate": 0.001, + "loss": 0.4253, + "step": 4730 + }, + { + "epoch": 0.1305387032355305, + "grad_norm": 0.003967654425650835, + "learning_rate": 0.001, + "loss": 0.398, + "step": 4731 + }, + { + "epoch": 0.13056629543659484, + "grad_norm": 0.002272986341267824, + "learning_rate": 0.001, + "loss": 0.4046, + "step": 4732 + }, + { + "epoch": 0.13059388763765922, + "grad_norm": 0.003741595894098282, + "learning_rate": 0.001, + "loss": 0.35, + "step": 4733 + }, + { + "epoch": 0.13062147983872358, + "grad_norm": 0.004285778850317001, + "learning_rate": 0.001, + "loss": 0.381, + "step": 4734 + }, + { + "epoch": 0.13064907203978796, + "grad_norm": 0.003829494584351778, + "learning_rate": 0.001, + "loss": 0.3918, + "step": 4735 + }, + { + "epoch": 0.13067666424085234, + "grad_norm": 0.0037094939034432173, + "learning_rate": 0.001, + "loss": 0.3885, + "step": 4736 + }, + { + "epoch": 0.1307042564419167, + "grad_norm": 0.005124232266098261, + "learning_rate": 0.001, + "loss": 0.4105, + "step": 4737 + }, + { + "epoch": 0.13073184864298107, + "grad_norm": 0.0033820930402725935, + "learning_rate": 0.001, + "loss": 0.3564, + "step": 4738 + }, + { + "epoch": 0.13075944084404542, + "grad_norm": 0.004075558390468359, + "learning_rate": 0.001, + "loss": 0.3771, + "step": 4739 + }, + { + "epoch": 0.1307870330451098, + "grad_norm": 0.003454482415691018, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 4740 + }, + { + "epoch": 0.13081462524617418, + "grad_norm": 0.003381206886842847, + "learning_rate": 0.001, + "loss": 0.3995, + "step": 4741 + }, + { + "epoch": 0.13084221744723853, + "grad_norm": 0.0028117720503360033, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 4742 + }, + { + "epoch": 0.13086980964830291, + "grad_norm": 0.002668531145900488, + "learning_rate": 0.001, + "loss": 0.3892, + "step": 4743 + }, + { + "epoch": 0.13089740184936727, + "grad_norm": 0.003135831095278263, + "learning_rate": 0.001, + "loss": 0.4174, + "step": 4744 + }, + { + "epoch": 0.13092499405043165, + "grad_norm": 0.008101669140160084, + "learning_rate": 0.001, + "loss": 0.3729, + "step": 4745 + }, + { + "epoch": 0.13095258625149603, + "grad_norm": 0.0035032073501497507, + "learning_rate": 0.001, + "loss": 0.4256, + "step": 4746 + }, + { + "epoch": 0.13098017845256038, + "grad_norm": 0.003943659830838442, + "learning_rate": 0.001, + "loss": 0.4057, + "step": 4747 + }, + { + "epoch": 0.13100777065362476, + "grad_norm": 0.0026659085415303707, + "learning_rate": 0.001, + "loss": 0.3986, + "step": 4748 + }, + { + "epoch": 0.1310353628546891, + "grad_norm": 0.004375234711915255, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 4749 + }, + { + "epoch": 0.1310629550557535, + "grad_norm": 0.0033472348004579544, + "learning_rate": 0.001, + "loss": 0.3845, + "step": 4750 + }, + { + "epoch": 0.13109054725681787, + "grad_norm": 0.005840728525072336, + "learning_rate": 0.001, + "loss": 0.4264, + "step": 4751 + }, + { + "epoch": 0.13111813945788223, + "grad_norm": 0.0032132677733898163, + "learning_rate": 0.001, + "loss": 0.4109, + "step": 4752 + }, + { + "epoch": 0.1311457316589466, + "grad_norm": 0.002400551922619343, + "learning_rate": 0.001, + "loss": 0.446, + "step": 4753 + }, + { + "epoch": 0.13117332386001096, + "grad_norm": 0.0028581595979630947, + "learning_rate": 0.001, + "loss": 0.4224, + "step": 4754 + }, + { + "epoch": 0.13120091606107534, + "grad_norm": 0.0029292525723576546, + "learning_rate": 0.001, + "loss": 0.3745, + "step": 4755 + }, + { + "epoch": 0.1312285082621397, + "grad_norm": 0.004841271787881851, + "learning_rate": 0.001, + "loss": 0.4376, + "step": 4756 + }, + { + "epoch": 0.13125610046320407, + "grad_norm": 0.004950361791998148, + "learning_rate": 0.001, + "loss": 0.3883, + "step": 4757 + }, + { + "epoch": 0.13128369266426845, + "grad_norm": 0.003650795202702284, + "learning_rate": 0.001, + "loss": 0.3823, + "step": 4758 + }, + { + "epoch": 0.1313112848653328, + "grad_norm": 0.005590515211224556, + "learning_rate": 0.001, + "loss": 0.3603, + "step": 4759 + }, + { + "epoch": 0.13133887706639719, + "grad_norm": 0.004784191958606243, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 4760 + }, + { + "epoch": 0.13136646926746154, + "grad_norm": 0.006027446128427982, + "learning_rate": 0.001, + "loss": 0.3782, + "step": 4761 + }, + { + "epoch": 0.13139406146852592, + "grad_norm": 0.0037396312691271305, + "learning_rate": 0.001, + "loss": 0.3694, + "step": 4762 + }, + { + "epoch": 0.1314216536695903, + "grad_norm": 0.0032113094348460436, + "learning_rate": 0.001, + "loss": 0.4085, + "step": 4763 + }, + { + "epoch": 0.13144924587065465, + "grad_norm": 0.00637141102924943, + "learning_rate": 0.001, + "loss": 0.4146, + "step": 4764 + }, + { + "epoch": 0.13147683807171903, + "grad_norm": 0.005106314085423946, + "learning_rate": 0.001, + "loss": 0.4162, + "step": 4765 + }, + { + "epoch": 0.13150443027278338, + "grad_norm": 0.00308993854559958, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 4766 + }, + { + "epoch": 0.13153202247384777, + "grad_norm": 0.007722698617726564, + "learning_rate": 0.001, + "loss": 0.3741, + "step": 4767 + }, + { + "epoch": 0.13155961467491215, + "grad_norm": 0.00279480149038136, + "learning_rate": 0.001, + "loss": 0.3943, + "step": 4768 + }, + { + "epoch": 0.1315872068759765, + "grad_norm": 0.00339706614613533, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 4769 + }, + { + "epoch": 0.13161479907704088, + "grad_norm": 0.0045981621369719505, + "learning_rate": 0.001, + "loss": 0.4086, + "step": 4770 + }, + { + "epoch": 0.13164239127810523, + "grad_norm": 0.0029820918571203947, + "learning_rate": 0.001, + "loss": 0.4246, + "step": 4771 + }, + { + "epoch": 0.1316699834791696, + "grad_norm": 0.0026766203809529543, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 4772 + }, + { + "epoch": 0.131697575680234, + "grad_norm": 0.004069194197654724, + "learning_rate": 0.001, + "loss": 0.3814, + "step": 4773 + }, + { + "epoch": 0.13172516788129834, + "grad_norm": 0.005836902651935816, + "learning_rate": 0.001, + "loss": 0.3624, + "step": 4774 + }, + { + "epoch": 0.13175276008236272, + "grad_norm": 0.002671006368473172, + "learning_rate": 0.001, + "loss": 0.4289, + "step": 4775 + }, + { + "epoch": 0.13178035228342708, + "grad_norm": 0.0025038421154022217, + "learning_rate": 0.001, + "loss": 0.426, + "step": 4776 + }, + { + "epoch": 0.13180794448449146, + "grad_norm": 0.0062514557503163815, + "learning_rate": 0.001, + "loss": 0.4202, + "step": 4777 + }, + { + "epoch": 0.13183553668555584, + "grad_norm": 0.005402828101068735, + "learning_rate": 0.001, + "loss": 0.4204, + "step": 4778 + }, + { + "epoch": 0.1318631288866202, + "grad_norm": 0.005721025634557009, + "learning_rate": 0.001, + "loss": 0.359, + "step": 4779 + }, + { + "epoch": 0.13189072108768457, + "grad_norm": 0.0032702479511499405, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 4780 + }, + { + "epoch": 0.13191831328874892, + "grad_norm": 0.0022660638205707073, + "learning_rate": 0.001, + "loss": 0.3813, + "step": 4781 + }, + { + "epoch": 0.1319459054898133, + "grad_norm": 0.008734694682061672, + "learning_rate": 0.001, + "loss": 0.4201, + "step": 4782 + }, + { + "epoch": 0.13197349769087768, + "grad_norm": 0.0025738070253282785, + "learning_rate": 0.001, + "loss": 0.4214, + "step": 4783 + }, + { + "epoch": 0.13200108989194204, + "grad_norm": 0.004817556589841843, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 4784 + }, + { + "epoch": 0.13202868209300642, + "grad_norm": 0.0032889090944081545, + "learning_rate": 0.001, + "loss": 0.3642, + "step": 4785 + }, + { + "epoch": 0.13205627429407077, + "grad_norm": 0.002885392401367426, + "learning_rate": 0.001, + "loss": 0.3915, + "step": 4786 + }, + { + "epoch": 0.13208386649513515, + "grad_norm": 0.002848939271643758, + "learning_rate": 0.001, + "loss": 0.3888, + "step": 4787 + }, + { + "epoch": 0.13211145869619953, + "grad_norm": 0.004833425395190716, + "learning_rate": 0.001, + "loss": 0.368, + "step": 4788 + }, + { + "epoch": 0.13213905089726388, + "grad_norm": 0.005285562947392464, + "learning_rate": 0.001, + "loss": 0.4323, + "step": 4789 + }, + { + "epoch": 0.13216664309832826, + "grad_norm": 0.002420694101601839, + "learning_rate": 0.001, + "loss": 0.4163, + "step": 4790 + }, + { + "epoch": 0.13219423529939262, + "grad_norm": 0.0041345711797475815, + "learning_rate": 0.001, + "loss": 0.4253, + "step": 4791 + }, + { + "epoch": 0.132221827500457, + "grad_norm": 0.00341211399063468, + "learning_rate": 0.001, + "loss": 0.441, + "step": 4792 + }, + { + "epoch": 0.13224941970152138, + "grad_norm": 0.003937454894185066, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 4793 + }, + { + "epoch": 0.13227701190258573, + "grad_norm": 0.0027174679562449455, + "learning_rate": 0.001, + "loss": 0.4081, + "step": 4794 + }, + { + "epoch": 0.1323046041036501, + "grad_norm": 0.004002917557954788, + "learning_rate": 0.001, + "loss": 0.3918, + "step": 4795 + }, + { + "epoch": 0.13233219630471446, + "grad_norm": 0.0035276333801448345, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 4796 + }, + { + "epoch": 0.13235978850577884, + "grad_norm": 0.0030461258720606565, + "learning_rate": 0.001, + "loss": 0.4176, + "step": 4797 + }, + { + "epoch": 0.13238738070684322, + "grad_norm": 0.003558572381734848, + "learning_rate": 0.001, + "loss": 0.4045, + "step": 4798 + }, + { + "epoch": 0.13241497290790757, + "grad_norm": 0.00515527231618762, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 4799 + }, + { + "epoch": 0.13244256510897195, + "grad_norm": 0.0030274007003754377, + "learning_rate": 0.001, + "loss": 0.3776, + "step": 4800 + }, + { + "epoch": 0.1324701573100363, + "grad_norm": 0.0033126547932624817, + "learning_rate": 0.001, + "loss": 0.4172, + "step": 4801 + }, + { + "epoch": 0.1324977495111007, + "grad_norm": 0.0031248959712684155, + "learning_rate": 0.001, + "loss": 0.4282, + "step": 4802 + }, + { + "epoch": 0.13252534171216507, + "grad_norm": 0.0032562301494181156, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 4803 + }, + { + "epoch": 0.13255293391322942, + "grad_norm": 0.005781130399554968, + "learning_rate": 0.001, + "loss": 0.3802, + "step": 4804 + }, + { + "epoch": 0.1325805261142938, + "grad_norm": 0.0039030571933835745, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 4805 + }, + { + "epoch": 0.13260811831535815, + "grad_norm": 0.002995892893522978, + "learning_rate": 0.001, + "loss": 0.4002, + "step": 4806 + }, + { + "epoch": 0.13263571051642253, + "grad_norm": 0.002603591652587056, + "learning_rate": 0.001, + "loss": 0.3841, + "step": 4807 + }, + { + "epoch": 0.13266330271748691, + "grad_norm": 0.00366300530731678, + "learning_rate": 0.001, + "loss": 0.404, + "step": 4808 + }, + { + "epoch": 0.13269089491855127, + "grad_norm": 0.0033524134196341038, + "learning_rate": 0.001, + "loss": 0.3889, + "step": 4809 + }, + { + "epoch": 0.13271848711961565, + "grad_norm": 0.003584874328225851, + "learning_rate": 0.001, + "loss": 0.3963, + "step": 4810 + }, + { + "epoch": 0.13274607932068, + "grad_norm": 0.004354959353804588, + "learning_rate": 0.001, + "loss": 0.3705, + "step": 4811 + }, + { + "epoch": 0.13277367152174438, + "grad_norm": 0.0041418191976845264, + "learning_rate": 0.001, + "loss": 0.4072, + "step": 4812 + }, + { + "epoch": 0.13280126372280876, + "grad_norm": 0.0028464416973292828, + "learning_rate": 0.001, + "loss": 0.3683, + "step": 4813 + }, + { + "epoch": 0.1328288559238731, + "grad_norm": 0.0049109673127532005, + "learning_rate": 0.001, + "loss": 0.3907, + "step": 4814 + }, + { + "epoch": 0.1328564481249375, + "grad_norm": 0.0039614904671907425, + "learning_rate": 0.001, + "loss": 0.3736, + "step": 4815 + }, + { + "epoch": 0.13288404032600185, + "grad_norm": 0.008978486992418766, + "learning_rate": 0.001, + "loss": 0.3636, + "step": 4816 + }, + { + "epoch": 0.13291163252706623, + "grad_norm": 0.006667922250926495, + "learning_rate": 0.001, + "loss": 0.417, + "step": 4817 + }, + { + "epoch": 0.1329392247281306, + "grad_norm": 0.004778092727065086, + "learning_rate": 0.001, + "loss": 0.4362, + "step": 4818 + }, + { + "epoch": 0.13296681692919496, + "grad_norm": 0.0056798397563397884, + "learning_rate": 0.001, + "loss": 0.4426, + "step": 4819 + }, + { + "epoch": 0.13299440913025934, + "grad_norm": 0.0033784289844334126, + "learning_rate": 0.001, + "loss": 0.4164, + "step": 4820 + }, + { + "epoch": 0.1330220013313237, + "grad_norm": 0.002719793003052473, + "learning_rate": 0.001, + "loss": 0.4182, + "step": 4821 + }, + { + "epoch": 0.13304959353238807, + "grad_norm": 0.003707107389345765, + "learning_rate": 0.001, + "loss": 0.4093, + "step": 4822 + }, + { + "epoch": 0.13307718573345245, + "grad_norm": 0.0044320011511445045, + "learning_rate": 0.001, + "loss": 0.3985, + "step": 4823 + }, + { + "epoch": 0.1331047779345168, + "grad_norm": 0.003244374878704548, + "learning_rate": 0.001, + "loss": 0.3985, + "step": 4824 + }, + { + "epoch": 0.13313237013558119, + "grad_norm": 0.006342526059597731, + "learning_rate": 0.001, + "loss": 0.4233, + "step": 4825 + }, + { + "epoch": 0.13315996233664554, + "grad_norm": 0.002801056718453765, + "learning_rate": 0.001, + "loss": 0.3753, + "step": 4826 + }, + { + "epoch": 0.13318755453770992, + "grad_norm": 0.004005104303359985, + "learning_rate": 0.001, + "loss": 0.3835, + "step": 4827 + }, + { + "epoch": 0.1332151467387743, + "grad_norm": 0.0026726596988737583, + "learning_rate": 0.001, + "loss": 0.3881, + "step": 4828 + }, + { + "epoch": 0.13324273893983865, + "grad_norm": 0.0022171332966536283, + "learning_rate": 0.001, + "loss": 0.395, + "step": 4829 + }, + { + "epoch": 0.13327033114090303, + "grad_norm": 0.00455888919532299, + "learning_rate": 0.001, + "loss": 0.3438, + "step": 4830 + }, + { + "epoch": 0.13329792334196738, + "grad_norm": 0.0036296145990490913, + "learning_rate": 0.001, + "loss": 0.4194, + "step": 4831 + }, + { + "epoch": 0.13332551554303176, + "grad_norm": 0.0030942729208618402, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 4832 + }, + { + "epoch": 0.13335310774409614, + "grad_norm": 0.0031570447608828545, + "learning_rate": 0.001, + "loss": 0.4177, + "step": 4833 + }, + { + "epoch": 0.1333806999451605, + "grad_norm": 0.0029306053183972836, + "learning_rate": 0.001, + "loss": 0.4032, + "step": 4834 + }, + { + "epoch": 0.13340829214622488, + "grad_norm": 0.002967880107462406, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 4835 + }, + { + "epoch": 0.13343588434728923, + "grad_norm": 0.0028411787934601307, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 4836 + }, + { + "epoch": 0.1334634765483536, + "grad_norm": 0.003896566806361079, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 4837 + }, + { + "epoch": 0.133491068749418, + "grad_norm": 0.0024064083117991686, + "learning_rate": 0.001, + "loss": 0.3944, + "step": 4838 + }, + { + "epoch": 0.13351866095048234, + "grad_norm": 0.0045156353153288364, + "learning_rate": 0.001, + "loss": 0.4272, + "step": 4839 + }, + { + "epoch": 0.13354625315154672, + "grad_norm": 0.002685493789613247, + "learning_rate": 0.001, + "loss": 0.4084, + "step": 4840 + }, + { + "epoch": 0.13357384535261108, + "grad_norm": 0.00366023276001215, + "learning_rate": 0.001, + "loss": 0.3942, + "step": 4841 + }, + { + "epoch": 0.13360143755367546, + "grad_norm": 0.0024847208987921476, + "learning_rate": 0.001, + "loss": 0.4506, + "step": 4842 + }, + { + "epoch": 0.13362902975473984, + "grad_norm": 0.004642703104764223, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 4843 + }, + { + "epoch": 0.1336566219558042, + "grad_norm": 0.004185870289802551, + "learning_rate": 0.001, + "loss": 0.417, + "step": 4844 + }, + { + "epoch": 0.13368421415686857, + "grad_norm": 0.0043914420530200005, + "learning_rate": 0.001, + "loss": 0.4254, + "step": 4845 + }, + { + "epoch": 0.13371180635793292, + "grad_norm": 0.0026118417736142874, + "learning_rate": 0.001, + "loss": 0.4118, + "step": 4846 + }, + { + "epoch": 0.1337393985589973, + "grad_norm": 0.0031845802441239357, + "learning_rate": 0.001, + "loss": 0.3999, + "step": 4847 + }, + { + "epoch": 0.13376699076006165, + "grad_norm": 0.0029715183191001415, + "learning_rate": 0.001, + "loss": 0.4254, + "step": 4848 + }, + { + "epoch": 0.13379458296112604, + "grad_norm": 0.0023027430288493633, + "learning_rate": 0.001, + "loss": 0.4288, + "step": 4849 + }, + { + "epoch": 0.13382217516219042, + "grad_norm": 0.002911708317697048, + "learning_rate": 0.001, + "loss": 0.425, + "step": 4850 + }, + { + "epoch": 0.13384976736325477, + "grad_norm": 0.002629220252856612, + "learning_rate": 0.001, + "loss": 0.4045, + "step": 4851 + }, + { + "epoch": 0.13387735956431915, + "grad_norm": 0.005986600182950497, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 4852 + }, + { + "epoch": 0.1339049517653835, + "grad_norm": 0.006612063851207495, + "learning_rate": 0.001, + "loss": 0.3881, + "step": 4853 + }, + { + "epoch": 0.13393254396644788, + "grad_norm": 0.003582441946491599, + "learning_rate": 0.001, + "loss": 0.3711, + "step": 4854 + }, + { + "epoch": 0.13396013616751226, + "grad_norm": 0.0032464175019413233, + "learning_rate": 0.001, + "loss": 0.3766, + "step": 4855 + }, + { + "epoch": 0.13398772836857661, + "grad_norm": 0.0038717519491910934, + "learning_rate": 0.001, + "loss": 0.3888, + "step": 4856 + }, + { + "epoch": 0.134015320569641, + "grad_norm": 0.00332628283649683, + "learning_rate": 0.001, + "loss": 0.375, + "step": 4857 + }, + { + "epoch": 0.13404291277070535, + "grad_norm": 0.004962892737239599, + "learning_rate": 0.001, + "loss": 0.412, + "step": 4858 + }, + { + "epoch": 0.13407050497176973, + "grad_norm": 0.0022890097461640835, + "learning_rate": 0.001, + "loss": 0.4177, + "step": 4859 + }, + { + "epoch": 0.1340980971728341, + "grad_norm": 0.0030799328815191984, + "learning_rate": 0.001, + "loss": 0.4119, + "step": 4860 + }, + { + "epoch": 0.13412568937389846, + "grad_norm": 0.003105921670794487, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 4861 + }, + { + "epoch": 0.13415328157496284, + "grad_norm": 0.0060799745842814445, + "learning_rate": 0.001, + "loss": 0.4202, + "step": 4862 + }, + { + "epoch": 0.1341808737760272, + "grad_norm": 0.005220312625169754, + "learning_rate": 0.001, + "loss": 0.403, + "step": 4863 + }, + { + "epoch": 0.13420846597709157, + "grad_norm": 0.005123880226165056, + "learning_rate": 0.001, + "loss": 0.4101, + "step": 4864 + }, + { + "epoch": 0.13423605817815595, + "grad_norm": 0.002265162765979767, + "learning_rate": 0.001, + "loss": 0.4223, + "step": 4865 + }, + { + "epoch": 0.1342636503792203, + "grad_norm": 0.004077676683664322, + "learning_rate": 0.001, + "loss": 0.3996, + "step": 4866 + }, + { + "epoch": 0.1342912425802847, + "grad_norm": 0.0035009863786399364, + "learning_rate": 0.001, + "loss": 0.3876, + "step": 4867 + }, + { + "epoch": 0.13431883478134904, + "grad_norm": 0.003494169097393751, + "learning_rate": 0.001, + "loss": 0.3784, + "step": 4868 + }, + { + "epoch": 0.13434642698241342, + "grad_norm": 0.0059214490465819836, + "learning_rate": 0.001, + "loss": 0.4206, + "step": 4869 + }, + { + "epoch": 0.1343740191834778, + "grad_norm": 0.0038102336693555117, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 4870 + }, + { + "epoch": 0.13440161138454215, + "grad_norm": 0.01034005731344223, + "learning_rate": 0.001, + "loss": 0.4085, + "step": 4871 + }, + { + "epoch": 0.13442920358560653, + "grad_norm": 0.003015523310750723, + "learning_rate": 0.001, + "loss": 0.4183, + "step": 4872 + }, + { + "epoch": 0.13445679578667089, + "grad_norm": 0.003311215667054057, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 4873 + }, + { + "epoch": 0.13448438798773527, + "grad_norm": 0.002569688018411398, + "learning_rate": 0.001, + "loss": 0.4212, + "step": 4874 + }, + { + "epoch": 0.13451198018879965, + "grad_norm": 0.002594457007944584, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 4875 + }, + { + "epoch": 0.134539572389864, + "grad_norm": 0.004568415228277445, + "learning_rate": 0.001, + "loss": 0.4062, + "step": 4876 + }, + { + "epoch": 0.13456716459092838, + "grad_norm": 0.0033749649301171303, + "learning_rate": 0.001, + "loss": 0.3851, + "step": 4877 + }, + { + "epoch": 0.13459475679199273, + "grad_norm": 0.003532090689986944, + "learning_rate": 0.001, + "loss": 0.4134, + "step": 4878 + }, + { + "epoch": 0.1346223489930571, + "grad_norm": 0.0033763758838176727, + "learning_rate": 0.001, + "loss": 0.4118, + "step": 4879 + }, + { + "epoch": 0.1346499411941215, + "grad_norm": 0.002815960207954049, + "learning_rate": 0.001, + "loss": 0.3919, + "step": 4880 + }, + { + "epoch": 0.13467753339518584, + "grad_norm": 0.0021568064112216234, + "learning_rate": 0.001, + "loss": 0.4046, + "step": 4881 + }, + { + "epoch": 0.13470512559625022, + "grad_norm": 0.002766069257631898, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 4882 + }, + { + "epoch": 0.13473271779731458, + "grad_norm": 0.004232902079820633, + "learning_rate": 0.001, + "loss": 0.4244, + "step": 4883 + }, + { + "epoch": 0.13476030999837896, + "grad_norm": 0.007129458710551262, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 4884 + }, + { + "epoch": 0.13478790219944334, + "grad_norm": 0.0027489245403558016, + "learning_rate": 0.001, + "loss": 0.4027, + "step": 4885 + }, + { + "epoch": 0.1348154944005077, + "grad_norm": 0.002653737785294652, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 4886 + }, + { + "epoch": 0.13484308660157207, + "grad_norm": 0.0031049742829054594, + "learning_rate": 0.001, + "loss": 0.4161, + "step": 4887 + }, + { + "epoch": 0.13487067880263642, + "grad_norm": 0.002779177390038967, + "learning_rate": 0.001, + "loss": 0.4165, + "step": 4888 + }, + { + "epoch": 0.1348982710037008, + "grad_norm": 0.003438913496211171, + "learning_rate": 0.001, + "loss": 0.378, + "step": 4889 + }, + { + "epoch": 0.13492586320476518, + "grad_norm": 0.003269481472671032, + "learning_rate": 0.001, + "loss": 0.3955, + "step": 4890 + }, + { + "epoch": 0.13495345540582954, + "grad_norm": 0.00541806360706687, + "learning_rate": 0.001, + "loss": 0.4346, + "step": 4891 + }, + { + "epoch": 0.13498104760689392, + "grad_norm": 0.007846074178814888, + "learning_rate": 0.001, + "loss": 0.3962, + "step": 4892 + }, + { + "epoch": 0.13500863980795827, + "grad_norm": 0.006200781557708979, + "learning_rate": 0.001, + "loss": 0.406, + "step": 4893 + }, + { + "epoch": 0.13503623200902265, + "grad_norm": 0.003382598515599966, + "learning_rate": 0.001, + "loss": 0.3839, + "step": 4894 + }, + { + "epoch": 0.13506382421008703, + "grad_norm": 0.002724557416513562, + "learning_rate": 0.001, + "loss": 0.422, + "step": 4895 + }, + { + "epoch": 0.13509141641115138, + "grad_norm": 0.0024383303243666887, + "learning_rate": 0.001, + "loss": 0.3955, + "step": 4896 + }, + { + "epoch": 0.13511900861221576, + "grad_norm": 0.0024829639587551355, + "learning_rate": 0.001, + "loss": 0.3831, + "step": 4897 + }, + { + "epoch": 0.13514660081328012, + "grad_norm": 0.002763790776953101, + "learning_rate": 0.001, + "loss": 0.4006, + "step": 4898 + }, + { + "epoch": 0.1351741930143445, + "grad_norm": 0.0031322850845754147, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 4899 + }, + { + "epoch": 0.13520178521540888, + "grad_norm": 0.002493941690772772, + "learning_rate": 0.001, + "loss": 0.3807, + "step": 4900 + }, + { + "epoch": 0.13522937741647323, + "grad_norm": 0.002849231008440256, + "learning_rate": 0.001, + "loss": 0.3888, + "step": 4901 + }, + { + "epoch": 0.1352569696175376, + "grad_norm": 0.0022846634965389967, + "learning_rate": 0.001, + "loss": 0.4765, + "step": 4902 + }, + { + "epoch": 0.13528456181860196, + "grad_norm": 0.003928142134100199, + "learning_rate": 0.001, + "loss": 0.395, + "step": 4903 + }, + { + "epoch": 0.13531215401966634, + "grad_norm": 0.0029574292711913586, + "learning_rate": 0.001, + "loss": 0.4335, + "step": 4904 + }, + { + "epoch": 0.13533974622073072, + "grad_norm": 0.0024456526152789593, + "learning_rate": 0.001, + "loss": 0.4359, + "step": 4905 + }, + { + "epoch": 0.13536733842179507, + "grad_norm": 0.005585736595094204, + "learning_rate": 0.001, + "loss": 0.4165, + "step": 4906 + }, + { + "epoch": 0.13539493062285946, + "grad_norm": 0.003591830376535654, + "learning_rate": 0.001, + "loss": 0.4199, + "step": 4907 + }, + { + "epoch": 0.1354225228239238, + "grad_norm": 0.0025483304634690285, + "learning_rate": 0.001, + "loss": 0.374, + "step": 4908 + }, + { + "epoch": 0.1354501150249882, + "grad_norm": 0.0023265292402356863, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 4909 + }, + { + "epoch": 0.13547770722605257, + "grad_norm": 0.0066468482837080956, + "learning_rate": 0.001, + "loss": 0.387, + "step": 4910 + }, + { + "epoch": 0.13550529942711692, + "grad_norm": 0.003016105853021145, + "learning_rate": 0.001, + "loss": 0.406, + "step": 4911 + }, + { + "epoch": 0.1355328916281813, + "grad_norm": 0.002601313404738903, + "learning_rate": 0.001, + "loss": 0.3942, + "step": 4912 + }, + { + "epoch": 0.13556048382924565, + "grad_norm": 0.0035740050952881575, + "learning_rate": 0.001, + "loss": 0.4112, + "step": 4913 + }, + { + "epoch": 0.13558807603031003, + "grad_norm": 0.0030930940993130207, + "learning_rate": 0.001, + "loss": 0.4203, + "step": 4914 + }, + { + "epoch": 0.13561566823137441, + "grad_norm": 0.0029879440553486347, + "learning_rate": 0.001, + "loss": 0.4089, + "step": 4915 + }, + { + "epoch": 0.13564326043243877, + "grad_norm": 0.003520643338561058, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 4916 + }, + { + "epoch": 0.13567085263350315, + "grad_norm": 0.0026404429227113724, + "learning_rate": 0.001, + "loss": 0.4151, + "step": 4917 + }, + { + "epoch": 0.1356984448345675, + "grad_norm": 0.0031609549187123775, + "learning_rate": 0.001, + "loss": 0.3546, + "step": 4918 + }, + { + "epoch": 0.13572603703563188, + "grad_norm": 0.002632451243698597, + "learning_rate": 0.001, + "loss": 0.3774, + "step": 4919 + }, + { + "epoch": 0.13575362923669626, + "grad_norm": 0.004959171637892723, + "learning_rate": 0.001, + "loss": 0.3821, + "step": 4920 + }, + { + "epoch": 0.1357812214377606, + "grad_norm": 0.0027791087049990892, + "learning_rate": 0.001, + "loss": 0.3692, + "step": 4921 + }, + { + "epoch": 0.135808813638825, + "grad_norm": 0.002658676588907838, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 4922 + }, + { + "epoch": 0.13583640583988935, + "grad_norm": 0.0022139514330774546, + "learning_rate": 0.001, + "loss": 0.42, + "step": 4923 + }, + { + "epoch": 0.13586399804095373, + "grad_norm": 0.0026424683164805174, + "learning_rate": 0.001, + "loss": 0.3854, + "step": 4924 + }, + { + "epoch": 0.1358915902420181, + "grad_norm": 0.0026688999496400356, + "learning_rate": 0.001, + "loss": 0.3799, + "step": 4925 + }, + { + "epoch": 0.13591918244308246, + "grad_norm": 0.003557176562026143, + "learning_rate": 0.001, + "loss": 0.404, + "step": 4926 + }, + { + "epoch": 0.13594677464414684, + "grad_norm": 0.002718021161854267, + "learning_rate": 0.001, + "loss": 0.4014, + "step": 4927 + }, + { + "epoch": 0.1359743668452112, + "grad_norm": 0.0034369598142802715, + "learning_rate": 0.001, + "loss": 0.4215, + "step": 4928 + }, + { + "epoch": 0.13600195904627557, + "grad_norm": 0.003354028332978487, + "learning_rate": 0.001, + "loss": 0.3488, + "step": 4929 + }, + { + "epoch": 0.13602955124733995, + "grad_norm": 0.0030789237935096025, + "learning_rate": 0.001, + "loss": 0.367, + "step": 4930 + }, + { + "epoch": 0.1360571434484043, + "grad_norm": 0.003499686485156417, + "learning_rate": 0.001, + "loss": 0.4172, + "step": 4931 + }, + { + "epoch": 0.13608473564946869, + "grad_norm": 0.003687124466523528, + "learning_rate": 0.001, + "loss": 0.4303, + "step": 4932 + }, + { + "epoch": 0.13611232785053304, + "grad_norm": 0.003492203541100025, + "learning_rate": 0.001, + "loss": 0.454, + "step": 4933 + }, + { + "epoch": 0.13613992005159742, + "grad_norm": 0.002475053770467639, + "learning_rate": 0.001, + "loss": 0.413, + "step": 4934 + }, + { + "epoch": 0.1361675122526618, + "grad_norm": 0.002234766026958823, + "learning_rate": 0.001, + "loss": 0.3814, + "step": 4935 + }, + { + "epoch": 0.13619510445372615, + "grad_norm": 0.002103250939399004, + "learning_rate": 0.001, + "loss": 0.4257, + "step": 4936 + }, + { + "epoch": 0.13622269665479053, + "grad_norm": 0.0036407755687832832, + "learning_rate": 0.001, + "loss": 0.3888, + "step": 4937 + }, + { + "epoch": 0.13625028885585488, + "grad_norm": 0.003084122436121106, + "learning_rate": 0.001, + "loss": 0.3908, + "step": 4938 + }, + { + "epoch": 0.13627788105691926, + "grad_norm": 0.0025477514136582613, + "learning_rate": 0.001, + "loss": 0.4523, + "step": 4939 + }, + { + "epoch": 0.13630547325798364, + "grad_norm": 0.002407640218734741, + "learning_rate": 0.001, + "loss": 0.4229, + "step": 4940 + }, + { + "epoch": 0.136333065459048, + "grad_norm": 0.004071627277880907, + "learning_rate": 0.001, + "loss": 0.393, + "step": 4941 + }, + { + "epoch": 0.13636065766011238, + "grad_norm": 0.004198818933218718, + "learning_rate": 0.001, + "loss": 0.3831, + "step": 4942 + }, + { + "epoch": 0.13638824986117673, + "grad_norm": 0.0038220062851905823, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 4943 + }, + { + "epoch": 0.1364158420622411, + "grad_norm": 0.003277060342952609, + "learning_rate": 0.001, + "loss": 0.427, + "step": 4944 + }, + { + "epoch": 0.13644343426330546, + "grad_norm": 0.0034869378432631493, + "learning_rate": 0.001, + "loss": 0.4056, + "step": 4945 + }, + { + "epoch": 0.13647102646436984, + "grad_norm": 0.00437937444075942, + "learning_rate": 0.001, + "loss": 0.3728, + "step": 4946 + }, + { + "epoch": 0.13649861866543422, + "grad_norm": 0.0038593048229813576, + "learning_rate": 0.001, + "loss": 0.3715, + "step": 4947 + }, + { + "epoch": 0.13652621086649858, + "grad_norm": 0.0027042904403060675, + "learning_rate": 0.001, + "loss": 0.4218, + "step": 4948 + }, + { + "epoch": 0.13655380306756296, + "grad_norm": 0.003592225257307291, + "learning_rate": 0.001, + "loss": 0.3708, + "step": 4949 + }, + { + "epoch": 0.1365813952686273, + "grad_norm": 0.005848668981343508, + "learning_rate": 0.001, + "loss": 0.3964, + "step": 4950 + }, + { + "epoch": 0.1366089874696917, + "grad_norm": 0.01795089617371559, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 4951 + }, + { + "epoch": 0.13663657967075607, + "grad_norm": 0.004261403810232878, + "learning_rate": 0.001, + "loss": 0.4031, + "step": 4952 + }, + { + "epoch": 0.13666417187182042, + "grad_norm": 0.0033142368774861097, + "learning_rate": 0.001, + "loss": 0.4232, + "step": 4953 + }, + { + "epoch": 0.1366917640728848, + "grad_norm": 0.003166605019941926, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 4954 + }, + { + "epoch": 0.13671935627394916, + "grad_norm": 0.003505310043692589, + "learning_rate": 0.001, + "loss": 0.3785, + "step": 4955 + }, + { + "epoch": 0.13674694847501354, + "grad_norm": 0.005368894897401333, + "learning_rate": 0.001, + "loss": 0.3984, + "step": 4956 + }, + { + "epoch": 0.13677454067607792, + "grad_norm": 0.00306013785302639, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 4957 + }, + { + "epoch": 0.13680213287714227, + "grad_norm": 0.005562109872698784, + "learning_rate": 0.001, + "loss": 0.3933, + "step": 4958 + }, + { + "epoch": 0.13682972507820665, + "grad_norm": 0.0037877087015658617, + "learning_rate": 0.001, + "loss": 0.4104, + "step": 4959 + }, + { + "epoch": 0.136857317279271, + "grad_norm": 0.0031170272268354893, + "learning_rate": 0.001, + "loss": 0.4073, + "step": 4960 + }, + { + "epoch": 0.13688490948033538, + "grad_norm": 0.003254547482356429, + "learning_rate": 0.001, + "loss": 0.4079, + "step": 4961 + }, + { + "epoch": 0.13691250168139976, + "grad_norm": 0.003371870843693614, + "learning_rate": 0.001, + "loss": 0.3904, + "step": 4962 + }, + { + "epoch": 0.13694009388246411, + "grad_norm": 0.003327438374981284, + "learning_rate": 0.001, + "loss": 0.3929, + "step": 4963 + }, + { + "epoch": 0.1369676860835285, + "grad_norm": 0.003542753402143717, + "learning_rate": 0.001, + "loss": 0.3767, + "step": 4964 + }, + { + "epoch": 0.13699527828459285, + "grad_norm": 0.0030288288835436106, + "learning_rate": 0.001, + "loss": 0.3652, + "step": 4965 + }, + { + "epoch": 0.13702287048565723, + "grad_norm": 0.0034882482141256332, + "learning_rate": 0.001, + "loss": 0.4056, + "step": 4966 + }, + { + "epoch": 0.1370504626867216, + "grad_norm": 0.005606031510978937, + "learning_rate": 0.001, + "loss": 0.394, + "step": 4967 + }, + { + "epoch": 0.13707805488778596, + "grad_norm": 0.004056036937981844, + "learning_rate": 0.001, + "loss": 0.4103, + "step": 4968 + }, + { + "epoch": 0.13710564708885034, + "grad_norm": 0.0032308900263160467, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 4969 + }, + { + "epoch": 0.1371332392899147, + "grad_norm": 0.006253343541175127, + "learning_rate": 0.001, + "loss": 0.3885, + "step": 4970 + }, + { + "epoch": 0.13716083149097907, + "grad_norm": 0.003451489843428135, + "learning_rate": 0.001, + "loss": 0.3788, + "step": 4971 + }, + { + "epoch": 0.13718842369204345, + "grad_norm": 0.0033425847068428993, + "learning_rate": 0.001, + "loss": 0.3824, + "step": 4972 + }, + { + "epoch": 0.1372160158931078, + "grad_norm": 0.0033426876179873943, + "learning_rate": 0.001, + "loss": 0.437, + "step": 4973 + }, + { + "epoch": 0.1372436080941722, + "grad_norm": 0.0031063987407833338, + "learning_rate": 0.001, + "loss": 0.3927, + "step": 4974 + }, + { + "epoch": 0.13727120029523654, + "grad_norm": 0.003505886532366276, + "learning_rate": 0.001, + "loss": 0.4198, + "step": 4975 + }, + { + "epoch": 0.13729879249630092, + "grad_norm": 0.0025633492041379213, + "learning_rate": 0.001, + "loss": 0.4173, + "step": 4976 + }, + { + "epoch": 0.1373263846973653, + "grad_norm": 0.003809051588177681, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 4977 + }, + { + "epoch": 0.13735397689842965, + "grad_norm": 0.004108694847673178, + "learning_rate": 0.001, + "loss": 0.3849, + "step": 4978 + }, + { + "epoch": 0.13738156909949403, + "grad_norm": 0.0032902159728109837, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 4979 + }, + { + "epoch": 0.13740916130055839, + "grad_norm": 0.004460613243281841, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 4980 + }, + { + "epoch": 0.13743675350162277, + "grad_norm": 0.0026315529830753803, + "learning_rate": 0.001, + "loss": 0.409, + "step": 4981 + }, + { + "epoch": 0.13746434570268715, + "grad_norm": 0.0023952291812747717, + "learning_rate": 0.001, + "loss": 0.4163, + "step": 4982 + }, + { + "epoch": 0.1374919379037515, + "grad_norm": 0.003244011662900448, + "learning_rate": 0.001, + "loss": 0.3681, + "step": 4983 + }, + { + "epoch": 0.13751953010481588, + "grad_norm": 0.003023752709850669, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 4984 + }, + { + "epoch": 0.13754712230588023, + "grad_norm": 0.002322046086192131, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 4985 + }, + { + "epoch": 0.1375747145069446, + "grad_norm": 0.0032940306700766087, + "learning_rate": 0.001, + "loss": 0.3987, + "step": 4986 + }, + { + "epoch": 0.137602306708009, + "grad_norm": 0.002528809243813157, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 4987 + }, + { + "epoch": 0.13762989890907334, + "grad_norm": 0.003885491518303752, + "learning_rate": 0.001, + "loss": 0.3912, + "step": 4988 + }, + { + "epoch": 0.13765749111013773, + "grad_norm": 0.0026430352590978146, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 4989 + }, + { + "epoch": 0.13768508331120208, + "grad_norm": 0.0037311362102627754, + "learning_rate": 0.001, + "loss": 0.3837, + "step": 4990 + }, + { + "epoch": 0.13771267551226646, + "grad_norm": 0.002789480844512582, + "learning_rate": 0.001, + "loss": 0.3944, + "step": 4991 + }, + { + "epoch": 0.13774026771333084, + "grad_norm": 0.0031957447063177824, + "learning_rate": 0.001, + "loss": 0.4011, + "step": 4992 + }, + { + "epoch": 0.1377678599143952, + "grad_norm": 0.0032790224067866802, + "learning_rate": 0.001, + "loss": 0.4207, + "step": 4993 + }, + { + "epoch": 0.13779545211545957, + "grad_norm": 0.0031076695304363966, + "learning_rate": 0.001, + "loss": 0.3813, + "step": 4994 + }, + { + "epoch": 0.13782304431652392, + "grad_norm": 0.0024428521282970905, + "learning_rate": 0.001, + "loss": 0.3828, + "step": 4995 + }, + { + "epoch": 0.1378506365175883, + "grad_norm": 0.003022199496626854, + "learning_rate": 0.001, + "loss": 0.4236, + "step": 4996 + }, + { + "epoch": 0.13787822871865268, + "grad_norm": 0.0028683668933808804, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 4997 + }, + { + "epoch": 0.13790582091971704, + "grad_norm": 0.003475520294159651, + "learning_rate": 0.001, + "loss": 0.395, + "step": 4998 + }, + { + "epoch": 0.13793341312078142, + "grad_norm": 0.003010012209415436, + "learning_rate": 0.001, + "loss": 0.4281, + "step": 4999 + }, + { + "epoch": 0.13796100532184577, + "grad_norm": 0.002013305900618434, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 5000 + }, + { + "epoch": 0.13796100532184577, + "eval_runtime": 23.8099, + "eval_samples_per_second": 1.344, + "eval_steps_per_second": 0.168, + "step": 5000 + }, + { + "epoch": 0.13798859752291015, + "grad_norm": 0.00346556818112731, + "learning_rate": 0.001, + "loss": 0.4065, + "step": 5001 + }, + { + "epoch": 0.13801618972397453, + "grad_norm": 0.003510264679789543, + "learning_rate": 0.001, + "loss": 0.4371, + "step": 5002 + }, + { + "epoch": 0.13804378192503888, + "grad_norm": 0.0026663157623261213, + "learning_rate": 0.001, + "loss": 0.4163, + "step": 5003 + }, + { + "epoch": 0.13807137412610326, + "grad_norm": 0.0029142978601157665, + "learning_rate": 0.001, + "loss": 0.3741, + "step": 5004 + }, + { + "epoch": 0.13809896632716762, + "grad_norm": 0.0032160128466784954, + "learning_rate": 0.001, + "loss": 0.3955, + "step": 5005 + }, + { + "epoch": 0.138126558528232, + "grad_norm": 0.0034054117277264595, + "learning_rate": 0.001, + "loss": 0.388, + "step": 5006 + }, + { + "epoch": 0.13815415072929638, + "grad_norm": 0.0025454023852944374, + "learning_rate": 0.001, + "loss": 0.4313, + "step": 5007 + }, + { + "epoch": 0.13818174293036073, + "grad_norm": 0.0025894741993397474, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 5008 + }, + { + "epoch": 0.1382093351314251, + "grad_norm": 0.0027122844476252794, + "learning_rate": 0.001, + "loss": 0.3894, + "step": 5009 + }, + { + "epoch": 0.13823692733248946, + "grad_norm": 0.0039015160873532295, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 5010 + }, + { + "epoch": 0.13826451953355384, + "grad_norm": 0.003185526467859745, + "learning_rate": 0.001, + "loss": 0.3548, + "step": 5011 + }, + { + "epoch": 0.13829211173461822, + "grad_norm": 0.004475030116736889, + "learning_rate": 0.001, + "loss": 0.3721, + "step": 5012 + }, + { + "epoch": 0.13831970393568258, + "grad_norm": 0.003455979051068425, + "learning_rate": 0.001, + "loss": 0.4008, + "step": 5013 + }, + { + "epoch": 0.13834729613674696, + "grad_norm": 0.002884593093767762, + "learning_rate": 0.001, + "loss": 0.3896, + "step": 5014 + }, + { + "epoch": 0.1383748883378113, + "grad_norm": 0.002263484289869666, + "learning_rate": 0.001, + "loss": 0.4356, + "step": 5015 + }, + { + "epoch": 0.1384024805388757, + "grad_norm": 0.003735289676114917, + "learning_rate": 0.001, + "loss": 0.3739, + "step": 5016 + }, + { + "epoch": 0.13843007273994007, + "grad_norm": 0.003224038053303957, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 5017 + }, + { + "epoch": 0.13845766494100442, + "grad_norm": 0.004887696355581284, + "learning_rate": 0.001, + "loss": 0.4223, + "step": 5018 + }, + { + "epoch": 0.1384852571420688, + "grad_norm": 0.004760736133903265, + "learning_rate": 0.001, + "loss": 0.3837, + "step": 5019 + }, + { + "epoch": 0.13851284934313315, + "grad_norm": 0.0037501987535506487, + "learning_rate": 0.001, + "loss": 0.4277, + "step": 5020 + }, + { + "epoch": 0.13854044154419753, + "grad_norm": 0.0027801606338471174, + "learning_rate": 0.001, + "loss": 0.4105, + "step": 5021 + }, + { + "epoch": 0.13856803374526191, + "grad_norm": 0.004847459960728884, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 5022 + }, + { + "epoch": 0.13859562594632627, + "grad_norm": 0.0057189855724573135, + "learning_rate": 0.001, + "loss": 0.3729, + "step": 5023 + }, + { + "epoch": 0.13862321814739065, + "grad_norm": 0.002882015658542514, + "learning_rate": 0.001, + "loss": 0.3725, + "step": 5024 + }, + { + "epoch": 0.138650810348455, + "grad_norm": 0.002629178110510111, + "learning_rate": 0.001, + "loss": 0.3793, + "step": 5025 + }, + { + "epoch": 0.13867840254951938, + "grad_norm": 0.0042918650433421135, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 5026 + }, + { + "epoch": 0.13870599475058376, + "grad_norm": 0.002527192234992981, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 5027 + }, + { + "epoch": 0.1387335869516481, + "grad_norm": 0.0032653820235282183, + "learning_rate": 0.001, + "loss": 0.4435, + "step": 5028 + }, + { + "epoch": 0.1387611791527125, + "grad_norm": 0.007327108643949032, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 5029 + }, + { + "epoch": 0.13878877135377685, + "grad_norm": 0.0036960134748369455, + "learning_rate": 0.001, + "loss": 0.4006, + "step": 5030 + }, + { + "epoch": 0.13881636355484123, + "grad_norm": 0.0025663794949650764, + "learning_rate": 0.001, + "loss": 0.3843, + "step": 5031 + }, + { + "epoch": 0.1388439557559056, + "grad_norm": 0.0026950971223413944, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 5032 + }, + { + "epoch": 0.13887154795696996, + "grad_norm": 0.002336070640012622, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 5033 + }, + { + "epoch": 0.13889914015803434, + "grad_norm": 0.0024258256889879704, + "learning_rate": 0.001, + "loss": 0.4286, + "step": 5034 + }, + { + "epoch": 0.1389267323590987, + "grad_norm": 0.002580720465630293, + "learning_rate": 0.001, + "loss": 0.3786, + "step": 5035 + }, + { + "epoch": 0.13895432456016307, + "grad_norm": 0.00329298572614789, + "learning_rate": 0.001, + "loss": 0.3764, + "step": 5036 + }, + { + "epoch": 0.13898191676122743, + "grad_norm": 0.0029764564242213964, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 5037 + }, + { + "epoch": 0.1390095089622918, + "grad_norm": 0.002715889597311616, + "learning_rate": 0.001, + "loss": 0.382, + "step": 5038 + }, + { + "epoch": 0.13903710116335619, + "grad_norm": 0.002393190050497651, + "learning_rate": 0.001, + "loss": 0.3743, + "step": 5039 + }, + { + "epoch": 0.13906469336442054, + "grad_norm": 0.0024876859970390797, + "learning_rate": 0.001, + "loss": 0.397, + "step": 5040 + }, + { + "epoch": 0.13909228556548492, + "grad_norm": 0.0032655515242367983, + "learning_rate": 0.001, + "loss": 0.3793, + "step": 5041 + }, + { + "epoch": 0.13911987776654927, + "grad_norm": 0.002442015800625086, + "learning_rate": 0.001, + "loss": 0.4281, + "step": 5042 + }, + { + "epoch": 0.13914746996761365, + "grad_norm": 0.002807747106999159, + "learning_rate": 0.001, + "loss": 0.4181, + "step": 5043 + }, + { + "epoch": 0.13917506216867803, + "grad_norm": 0.005945953540503979, + "learning_rate": 0.001, + "loss": 0.3867, + "step": 5044 + }, + { + "epoch": 0.13920265436974238, + "grad_norm": 0.0031765319872647524, + "learning_rate": 0.001, + "loss": 0.416, + "step": 5045 + }, + { + "epoch": 0.13923024657080676, + "grad_norm": 0.0037979150656610727, + "learning_rate": 0.001, + "loss": 0.4096, + "step": 5046 + }, + { + "epoch": 0.13925783877187112, + "grad_norm": 0.0030904149170964956, + "learning_rate": 0.001, + "loss": 0.4243, + "step": 5047 + }, + { + "epoch": 0.1392854309729355, + "grad_norm": 0.0037039672024548054, + "learning_rate": 0.001, + "loss": 0.3912, + "step": 5048 + }, + { + "epoch": 0.13931302317399988, + "grad_norm": 0.0033529032953083515, + "learning_rate": 0.001, + "loss": 0.3702, + "step": 5049 + }, + { + "epoch": 0.13934061537506423, + "grad_norm": 0.0024797532241791487, + "learning_rate": 0.001, + "loss": 0.4347, + "step": 5050 + }, + { + "epoch": 0.1393682075761286, + "grad_norm": 0.003208388341590762, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 5051 + }, + { + "epoch": 0.13939579977719296, + "grad_norm": 0.0024538834113627672, + "learning_rate": 0.001, + "loss": 0.4231, + "step": 5052 + }, + { + "epoch": 0.13942339197825734, + "grad_norm": 0.0028640846721827984, + "learning_rate": 0.001, + "loss": 0.3768, + "step": 5053 + }, + { + "epoch": 0.13945098417932172, + "grad_norm": 0.002653739182278514, + "learning_rate": 0.001, + "loss": 0.4074, + "step": 5054 + }, + { + "epoch": 0.13947857638038608, + "grad_norm": 0.003615357680246234, + "learning_rate": 0.001, + "loss": 0.419, + "step": 5055 + }, + { + "epoch": 0.13950616858145046, + "grad_norm": 0.002402637619525194, + "learning_rate": 0.001, + "loss": 0.409, + "step": 5056 + }, + { + "epoch": 0.1395337607825148, + "grad_norm": 0.0025258746463805437, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 5057 + }, + { + "epoch": 0.1395613529835792, + "grad_norm": 0.0028938499744981527, + "learning_rate": 0.001, + "loss": 0.3877, + "step": 5058 + }, + { + "epoch": 0.13958894518464357, + "grad_norm": 0.008427090011537075, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 5059 + }, + { + "epoch": 0.13961653738570792, + "grad_norm": 0.00399380037561059, + "learning_rate": 0.001, + "loss": 0.3757, + "step": 5060 + }, + { + "epoch": 0.1396441295867723, + "grad_norm": 0.002580770291388035, + "learning_rate": 0.001, + "loss": 0.3837, + "step": 5061 + }, + { + "epoch": 0.13967172178783666, + "grad_norm": 0.0026310610119253397, + "learning_rate": 0.001, + "loss": 0.4212, + "step": 5062 + }, + { + "epoch": 0.13969931398890104, + "grad_norm": 0.003709648735821247, + "learning_rate": 0.001, + "loss": 0.3849, + "step": 5063 + }, + { + "epoch": 0.13972690618996542, + "grad_norm": 0.0026174660306423903, + "learning_rate": 0.001, + "loss": 0.4129, + "step": 5064 + }, + { + "epoch": 0.13975449839102977, + "grad_norm": 0.002821024041622877, + "learning_rate": 0.001, + "loss": 0.4026, + "step": 5065 + }, + { + "epoch": 0.13978209059209415, + "grad_norm": 0.0045610954985022545, + "learning_rate": 0.001, + "loss": 0.3604, + "step": 5066 + }, + { + "epoch": 0.1398096827931585, + "grad_norm": 0.003461558138951659, + "learning_rate": 0.001, + "loss": 0.3955, + "step": 5067 + }, + { + "epoch": 0.13983727499422288, + "grad_norm": 0.00443047983571887, + "learning_rate": 0.001, + "loss": 0.4274, + "step": 5068 + }, + { + "epoch": 0.13986486719528726, + "grad_norm": 0.0035488642752170563, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 5069 + }, + { + "epoch": 0.13989245939635161, + "grad_norm": 0.005335607565939426, + "learning_rate": 0.001, + "loss": 0.4271, + "step": 5070 + }, + { + "epoch": 0.139920051597416, + "grad_norm": 0.003680909052491188, + "learning_rate": 0.001, + "loss": 0.4332, + "step": 5071 + }, + { + "epoch": 0.13994764379848035, + "grad_norm": 0.004192323889583349, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 5072 + }, + { + "epoch": 0.13997523599954473, + "grad_norm": 0.0035443026572465897, + "learning_rate": 0.001, + "loss": 0.4363, + "step": 5073 + }, + { + "epoch": 0.1400028282006091, + "grad_norm": 0.0038134013302624226, + "learning_rate": 0.001, + "loss": 0.3794, + "step": 5074 + }, + { + "epoch": 0.14003042040167346, + "grad_norm": 0.0036664907820522785, + "learning_rate": 0.001, + "loss": 0.428, + "step": 5075 + }, + { + "epoch": 0.14005801260273784, + "grad_norm": 0.0024540217127650976, + "learning_rate": 0.001, + "loss": 0.3829, + "step": 5076 + }, + { + "epoch": 0.1400856048038022, + "grad_norm": 0.0027097316924482584, + "learning_rate": 0.001, + "loss": 0.4205, + "step": 5077 + }, + { + "epoch": 0.14011319700486657, + "grad_norm": 0.002922753570601344, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 5078 + }, + { + "epoch": 0.14014078920593095, + "grad_norm": 0.003050590632483363, + "learning_rate": 0.001, + "loss": 0.4488, + "step": 5079 + }, + { + "epoch": 0.1401683814069953, + "grad_norm": 0.0027282421942800283, + "learning_rate": 0.001, + "loss": 0.406, + "step": 5080 + }, + { + "epoch": 0.1401959736080597, + "grad_norm": 0.0027401361148804426, + "learning_rate": 0.001, + "loss": 0.4186, + "step": 5081 + }, + { + "epoch": 0.14022356580912404, + "grad_norm": 0.006228750105947256, + "learning_rate": 0.001, + "loss": 0.3926, + "step": 5082 + }, + { + "epoch": 0.14025115801018842, + "grad_norm": 0.002542686415836215, + "learning_rate": 0.001, + "loss": 0.453, + "step": 5083 + }, + { + "epoch": 0.1402787502112528, + "grad_norm": 0.002365612657740712, + "learning_rate": 0.001, + "loss": 0.4089, + "step": 5084 + }, + { + "epoch": 0.14030634241231715, + "grad_norm": 0.008088381960988045, + "learning_rate": 0.001, + "loss": 0.3808, + "step": 5085 + }, + { + "epoch": 0.14033393461338153, + "grad_norm": 0.002785349264740944, + "learning_rate": 0.001, + "loss": 0.3973, + "step": 5086 + }, + { + "epoch": 0.14036152681444589, + "grad_norm": 0.00489590922370553, + "learning_rate": 0.001, + "loss": 0.4241, + "step": 5087 + }, + { + "epoch": 0.14038911901551027, + "grad_norm": 0.0027916128747165203, + "learning_rate": 0.001, + "loss": 0.3714, + "step": 5088 + }, + { + "epoch": 0.14041671121657465, + "grad_norm": 0.002344654407352209, + "learning_rate": 0.001, + "loss": 0.388, + "step": 5089 + }, + { + "epoch": 0.140444303417639, + "grad_norm": 0.0031531101558357477, + "learning_rate": 0.001, + "loss": 0.3841, + "step": 5090 + }, + { + "epoch": 0.14047189561870338, + "grad_norm": 0.0027754863258451223, + "learning_rate": 0.001, + "loss": 0.3897, + "step": 5091 + }, + { + "epoch": 0.14049948781976773, + "grad_norm": 0.003328294726088643, + "learning_rate": 0.001, + "loss": 0.4077, + "step": 5092 + }, + { + "epoch": 0.1405270800208321, + "grad_norm": 0.0021852529607713223, + "learning_rate": 0.001, + "loss": 0.4113, + "step": 5093 + }, + { + "epoch": 0.1405546722218965, + "grad_norm": 0.004019197542220354, + "learning_rate": 0.001, + "loss": 0.3825, + "step": 5094 + }, + { + "epoch": 0.14058226442296085, + "grad_norm": 0.0024554249830543995, + "learning_rate": 0.001, + "loss": 0.4263, + "step": 5095 + }, + { + "epoch": 0.14060985662402523, + "grad_norm": 0.0032448593992739916, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 5096 + }, + { + "epoch": 0.14063744882508958, + "grad_norm": 0.0027907416224479675, + "learning_rate": 0.001, + "loss": 0.4575, + "step": 5097 + }, + { + "epoch": 0.14066504102615396, + "grad_norm": 0.006815130822360516, + "learning_rate": 0.001, + "loss": 0.401, + "step": 5098 + }, + { + "epoch": 0.14069263322721834, + "grad_norm": 0.00370893650688231, + "learning_rate": 0.001, + "loss": 0.3962, + "step": 5099 + }, + { + "epoch": 0.1407202254282827, + "grad_norm": 0.002180712530389428, + "learning_rate": 0.001, + "loss": 0.4123, + "step": 5100 + }, + { + "epoch": 0.14074781762934707, + "grad_norm": 0.0035834074951708317, + "learning_rate": 0.001, + "loss": 0.3723, + "step": 5101 + }, + { + "epoch": 0.14077540983041142, + "grad_norm": 0.003018912160769105, + "learning_rate": 0.001, + "loss": 0.4232, + "step": 5102 + }, + { + "epoch": 0.1408030020314758, + "grad_norm": 0.002520001260563731, + "learning_rate": 0.001, + "loss": 0.3921, + "step": 5103 + }, + { + "epoch": 0.14083059423254018, + "grad_norm": 0.00585381593555212, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 5104 + }, + { + "epoch": 0.14085818643360454, + "grad_norm": 0.0028747236356139183, + "learning_rate": 0.001, + "loss": 0.4045, + "step": 5105 + }, + { + "epoch": 0.14088577863466892, + "grad_norm": 0.005821559578180313, + "learning_rate": 0.001, + "loss": 0.3755, + "step": 5106 + }, + { + "epoch": 0.14091337083573327, + "grad_norm": 0.0025346369948238134, + "learning_rate": 0.001, + "loss": 0.399, + "step": 5107 + }, + { + "epoch": 0.14094096303679765, + "grad_norm": 0.002172187902033329, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 5108 + }, + { + "epoch": 0.14096855523786203, + "grad_norm": 0.0038861907087266445, + "learning_rate": 0.001, + "loss": 0.4174, + "step": 5109 + }, + { + "epoch": 0.14099614743892638, + "grad_norm": 0.0024652110878378153, + "learning_rate": 0.001, + "loss": 0.3963, + "step": 5110 + }, + { + "epoch": 0.14102373963999076, + "grad_norm": 0.003532834816724062, + "learning_rate": 0.001, + "loss": 0.4111, + "step": 5111 + }, + { + "epoch": 0.14105133184105512, + "grad_norm": 0.006292164325714111, + "learning_rate": 0.001, + "loss": 0.3828, + "step": 5112 + }, + { + "epoch": 0.1410789240421195, + "grad_norm": 0.00268157827667892, + "learning_rate": 0.001, + "loss": 0.4055, + "step": 5113 + }, + { + "epoch": 0.14110651624318388, + "grad_norm": 0.0035765564534813166, + "learning_rate": 0.001, + "loss": 0.3599, + "step": 5114 + }, + { + "epoch": 0.14113410844424823, + "grad_norm": 0.0026933897752314806, + "learning_rate": 0.001, + "loss": 0.3904, + "step": 5115 + }, + { + "epoch": 0.1411617006453126, + "grad_norm": 0.0043404726311564445, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 5116 + }, + { + "epoch": 0.14118929284637696, + "grad_norm": 0.0029684489127248526, + "learning_rate": 0.001, + "loss": 0.3667, + "step": 5117 + }, + { + "epoch": 0.14121688504744134, + "grad_norm": 0.0028438603039830923, + "learning_rate": 0.001, + "loss": 0.3819, + "step": 5118 + }, + { + "epoch": 0.14124447724850572, + "grad_norm": 0.002665071515366435, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 5119 + }, + { + "epoch": 0.14127206944957008, + "grad_norm": 0.0035951558966189623, + "learning_rate": 0.001, + "loss": 0.3738, + "step": 5120 + }, + { + "epoch": 0.14129966165063446, + "grad_norm": 0.007121792994439602, + "learning_rate": 0.001, + "loss": 0.4219, + "step": 5121 + }, + { + "epoch": 0.1413272538516988, + "grad_norm": 0.0026034133043140173, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 5122 + }, + { + "epoch": 0.1413548460527632, + "grad_norm": 0.004363723564893007, + "learning_rate": 0.001, + "loss": 0.4695, + "step": 5123 + }, + { + "epoch": 0.14138243825382757, + "grad_norm": 0.002366288099437952, + "learning_rate": 0.001, + "loss": 0.4195, + "step": 5124 + }, + { + "epoch": 0.14141003045489192, + "grad_norm": 0.002647766610607505, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 5125 + }, + { + "epoch": 0.1414376226559563, + "grad_norm": 0.002447773702442646, + "learning_rate": 0.001, + "loss": 0.4414, + "step": 5126 + }, + { + "epoch": 0.14146521485702065, + "grad_norm": 0.0026955637149512768, + "learning_rate": 0.001, + "loss": 0.3644, + "step": 5127 + }, + { + "epoch": 0.14149280705808503, + "grad_norm": 0.004317718092352152, + "learning_rate": 0.001, + "loss": 0.3877, + "step": 5128 + }, + { + "epoch": 0.1415203992591494, + "grad_norm": 0.002862673718482256, + "learning_rate": 0.001, + "loss": 0.4218, + "step": 5129 + }, + { + "epoch": 0.14154799146021377, + "grad_norm": 0.004444323480129242, + "learning_rate": 0.001, + "loss": 0.417, + "step": 5130 + }, + { + "epoch": 0.14157558366127815, + "grad_norm": 0.008723779581487179, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 5131 + }, + { + "epoch": 0.1416031758623425, + "grad_norm": 0.0023829471319913864, + "learning_rate": 0.001, + "loss": 0.427, + "step": 5132 + }, + { + "epoch": 0.14163076806340688, + "grad_norm": 0.0023019250947982073, + "learning_rate": 0.001, + "loss": 0.404, + "step": 5133 + }, + { + "epoch": 0.14165836026447123, + "grad_norm": 0.002866733353585005, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 5134 + }, + { + "epoch": 0.14168595246553561, + "grad_norm": 0.002325895708054304, + "learning_rate": 0.001, + "loss": 0.3829, + "step": 5135 + }, + { + "epoch": 0.1417135446666, + "grad_norm": 0.0031419056467711926, + "learning_rate": 0.001, + "loss": 0.394, + "step": 5136 + }, + { + "epoch": 0.14174113686766435, + "grad_norm": 0.0049896384589374065, + "learning_rate": 0.001, + "loss": 0.4001, + "step": 5137 + }, + { + "epoch": 0.14176872906872873, + "grad_norm": 0.006104922853410244, + "learning_rate": 0.001, + "loss": 0.3583, + "step": 5138 + }, + { + "epoch": 0.14179632126979308, + "grad_norm": 0.005146206822246313, + "learning_rate": 0.001, + "loss": 0.3609, + "step": 5139 + }, + { + "epoch": 0.14182391347085746, + "grad_norm": 0.0036859933752566576, + "learning_rate": 0.001, + "loss": 0.3677, + "step": 5140 + }, + { + "epoch": 0.14185150567192184, + "grad_norm": 0.0021758072543889284, + "learning_rate": 0.001, + "loss": 0.401, + "step": 5141 + }, + { + "epoch": 0.1418790978729862, + "grad_norm": 0.002568889642134309, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 5142 + }, + { + "epoch": 0.14190669007405057, + "grad_norm": 0.003079349873587489, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 5143 + }, + { + "epoch": 0.14193428227511493, + "grad_norm": 0.006028784904628992, + "learning_rate": 0.001, + "loss": 0.3697, + "step": 5144 + }, + { + "epoch": 0.1419618744761793, + "grad_norm": 0.0028043135534971952, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 5145 + }, + { + "epoch": 0.1419894666772437, + "grad_norm": 0.0026649232022464275, + "learning_rate": 0.001, + "loss": 0.4122, + "step": 5146 + }, + { + "epoch": 0.14201705887830804, + "grad_norm": 0.003915826324373484, + "learning_rate": 0.001, + "loss": 0.382, + "step": 5147 + }, + { + "epoch": 0.14204465107937242, + "grad_norm": 0.0024036671966314316, + "learning_rate": 0.001, + "loss": 0.423, + "step": 5148 + }, + { + "epoch": 0.14207224328043677, + "grad_norm": 0.0042000822722911835, + "learning_rate": 0.001, + "loss": 0.3781, + "step": 5149 + }, + { + "epoch": 0.14209983548150115, + "grad_norm": 0.0055633410811424255, + "learning_rate": 0.001, + "loss": 0.4201, + "step": 5150 + }, + { + "epoch": 0.14212742768256553, + "grad_norm": 0.0037763305008411407, + "learning_rate": 0.001, + "loss": 0.4247, + "step": 5151 + }, + { + "epoch": 0.14215501988362989, + "grad_norm": 0.0034036405850201845, + "learning_rate": 0.001, + "loss": 0.399, + "step": 5152 + }, + { + "epoch": 0.14218261208469427, + "grad_norm": 0.005322457291185856, + "learning_rate": 0.001, + "loss": 0.4153, + "step": 5153 + }, + { + "epoch": 0.14221020428575862, + "grad_norm": 0.0037400966975837946, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 5154 + }, + { + "epoch": 0.142237796486823, + "grad_norm": 0.005500406958162785, + "learning_rate": 0.001, + "loss": 0.4355, + "step": 5155 + }, + { + "epoch": 0.14226538868788738, + "grad_norm": 0.003691243240609765, + "learning_rate": 0.001, + "loss": 0.4085, + "step": 5156 + }, + { + "epoch": 0.14229298088895173, + "grad_norm": 0.005320982076227665, + "learning_rate": 0.001, + "loss": 0.3824, + "step": 5157 + }, + { + "epoch": 0.1423205730900161, + "grad_norm": 0.0033200359903275967, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 5158 + }, + { + "epoch": 0.14234816529108046, + "grad_norm": 0.0034416653215885162, + "learning_rate": 0.001, + "loss": 0.398, + "step": 5159 + }, + { + "epoch": 0.14237575749214484, + "grad_norm": 0.0032915968913584948, + "learning_rate": 0.001, + "loss": 0.4318, + "step": 5160 + }, + { + "epoch": 0.14240334969320922, + "grad_norm": 0.00513947568833828, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 5161 + }, + { + "epoch": 0.14243094189427358, + "grad_norm": 0.005058473441749811, + "learning_rate": 0.001, + "loss": 0.3942, + "step": 5162 + }, + { + "epoch": 0.14245853409533796, + "grad_norm": 0.003918751142919064, + "learning_rate": 0.001, + "loss": 0.3813, + "step": 5163 + }, + { + "epoch": 0.1424861262964023, + "grad_norm": 0.0029717017896473408, + "learning_rate": 0.001, + "loss": 0.4186, + "step": 5164 + }, + { + "epoch": 0.1425137184974667, + "grad_norm": 0.004301862791180611, + "learning_rate": 0.001, + "loss": 0.3641, + "step": 5165 + }, + { + "epoch": 0.14254131069853107, + "grad_norm": 0.0028698716778308153, + "learning_rate": 0.001, + "loss": 0.4313, + "step": 5166 + }, + { + "epoch": 0.14256890289959542, + "grad_norm": 0.002715424867346883, + "learning_rate": 0.001, + "loss": 0.407, + "step": 5167 + }, + { + "epoch": 0.1425964951006598, + "grad_norm": 0.002695337636396289, + "learning_rate": 0.001, + "loss": 0.403, + "step": 5168 + }, + { + "epoch": 0.14262408730172416, + "grad_norm": 0.002396870171651244, + "learning_rate": 0.001, + "loss": 0.3978, + "step": 5169 + }, + { + "epoch": 0.14265167950278854, + "grad_norm": 0.0023134411312639713, + "learning_rate": 0.001, + "loss": 0.405, + "step": 5170 + }, + { + "epoch": 0.14267927170385292, + "grad_norm": 0.007215626537799835, + "learning_rate": 0.001, + "loss": 0.4436, + "step": 5171 + }, + { + "epoch": 0.14270686390491727, + "grad_norm": 0.0030377882067114115, + "learning_rate": 0.001, + "loss": 0.3841, + "step": 5172 + }, + { + "epoch": 0.14273445610598165, + "grad_norm": 0.0052809203043580055, + "learning_rate": 0.001, + "loss": 0.3806, + "step": 5173 + }, + { + "epoch": 0.142762048307046, + "grad_norm": 0.0030719470232725143, + "learning_rate": 0.001, + "loss": 0.3556, + "step": 5174 + }, + { + "epoch": 0.14278964050811038, + "grad_norm": 0.003323314245790243, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 5175 + }, + { + "epoch": 0.14281723270917476, + "grad_norm": 0.004299065563827753, + "learning_rate": 0.001, + "loss": 0.4111, + "step": 5176 + }, + { + "epoch": 0.14284482491023912, + "grad_norm": 0.006395953707396984, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 5177 + }, + { + "epoch": 0.1428724171113035, + "grad_norm": 0.0036343904212117195, + "learning_rate": 0.001, + "loss": 0.3925, + "step": 5178 + }, + { + "epoch": 0.14290000931236785, + "grad_norm": 0.003354752203449607, + "learning_rate": 0.001, + "loss": 0.4022, + "step": 5179 + }, + { + "epoch": 0.14292760151343223, + "grad_norm": 0.0029036353807896376, + "learning_rate": 0.001, + "loss": 0.4085, + "step": 5180 + }, + { + "epoch": 0.1429551937144966, + "grad_norm": 0.0033736219629645348, + "learning_rate": 0.001, + "loss": 0.3972, + "step": 5181 + }, + { + "epoch": 0.14298278591556096, + "grad_norm": 0.00228399527259171, + "learning_rate": 0.001, + "loss": 0.4349, + "step": 5182 + }, + { + "epoch": 0.14301037811662534, + "grad_norm": 0.0025460305623710155, + "learning_rate": 0.001, + "loss": 0.3771, + "step": 5183 + }, + { + "epoch": 0.1430379703176897, + "grad_norm": 0.0031551956199109554, + "learning_rate": 0.001, + "loss": 0.3833, + "step": 5184 + }, + { + "epoch": 0.14306556251875407, + "grad_norm": 0.003111305646598339, + "learning_rate": 0.001, + "loss": 0.399, + "step": 5185 + }, + { + "epoch": 0.14309315471981846, + "grad_norm": 0.0028325961902737617, + "learning_rate": 0.001, + "loss": 0.4014, + "step": 5186 + }, + { + "epoch": 0.1431207469208828, + "grad_norm": 0.0034688191954046488, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 5187 + }, + { + "epoch": 0.1431483391219472, + "grad_norm": 0.0022599126677960157, + "learning_rate": 0.001, + "loss": 0.4351, + "step": 5188 + }, + { + "epoch": 0.14317593132301154, + "grad_norm": 0.004831294994801283, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 5189 + }, + { + "epoch": 0.14320352352407592, + "grad_norm": 0.0029807155951857567, + "learning_rate": 0.001, + "loss": 0.3886, + "step": 5190 + }, + { + "epoch": 0.1432311157251403, + "grad_norm": 0.003226724686101079, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 5191 + }, + { + "epoch": 0.14325870792620465, + "grad_norm": 0.0025674651842564344, + "learning_rate": 0.001, + "loss": 0.4403, + "step": 5192 + }, + { + "epoch": 0.14328630012726903, + "grad_norm": 0.002636546269059181, + "learning_rate": 0.001, + "loss": 0.4202, + "step": 5193 + }, + { + "epoch": 0.1433138923283334, + "grad_norm": 0.003959611523896456, + "learning_rate": 0.001, + "loss": 0.4153, + "step": 5194 + }, + { + "epoch": 0.14334148452939777, + "grad_norm": 0.00245193880982697, + "learning_rate": 0.001, + "loss": 0.4404, + "step": 5195 + }, + { + "epoch": 0.14336907673046215, + "grad_norm": 0.0033164939377456903, + "learning_rate": 0.001, + "loss": 0.384, + "step": 5196 + }, + { + "epoch": 0.1433966689315265, + "grad_norm": 0.0031362988520413637, + "learning_rate": 0.001, + "loss": 0.3822, + "step": 5197 + }, + { + "epoch": 0.14342426113259088, + "grad_norm": 0.0031361919827759266, + "learning_rate": 0.001, + "loss": 0.387, + "step": 5198 + }, + { + "epoch": 0.14345185333365523, + "grad_norm": 0.0025203994009643793, + "learning_rate": 0.001, + "loss": 0.4244, + "step": 5199 + }, + { + "epoch": 0.1434794455347196, + "grad_norm": 0.0021688216365873814, + "learning_rate": 0.001, + "loss": 0.3967, + "step": 5200 + }, + { + "epoch": 0.143507037735784, + "grad_norm": 0.0022967627737671137, + "learning_rate": 0.001, + "loss": 0.443, + "step": 5201 + }, + { + "epoch": 0.14353462993684835, + "grad_norm": 0.002367371693253517, + "learning_rate": 0.001, + "loss": 0.4249, + "step": 5202 + }, + { + "epoch": 0.14356222213791273, + "grad_norm": 0.002286283066496253, + "learning_rate": 0.001, + "loss": 0.381, + "step": 5203 + }, + { + "epoch": 0.14358981433897708, + "grad_norm": 0.002556881867349148, + "learning_rate": 0.001, + "loss": 0.4153, + "step": 5204 + }, + { + "epoch": 0.14361740654004146, + "grad_norm": 0.002457389608025551, + "learning_rate": 0.001, + "loss": 0.4515, + "step": 5205 + }, + { + "epoch": 0.14364499874110584, + "grad_norm": 0.0026917587965726852, + "learning_rate": 0.001, + "loss": 0.4123, + "step": 5206 + }, + { + "epoch": 0.1436725909421702, + "grad_norm": 0.0024027330800890923, + "learning_rate": 0.001, + "loss": 0.4429, + "step": 5207 + }, + { + "epoch": 0.14370018314323457, + "grad_norm": 0.0036219994071871042, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 5208 + }, + { + "epoch": 0.14372777534429892, + "grad_norm": 0.0024791264440864325, + "learning_rate": 0.001, + "loss": 0.3944, + "step": 5209 + }, + { + "epoch": 0.1437553675453633, + "grad_norm": 0.002413226757198572, + "learning_rate": 0.001, + "loss": 0.4413, + "step": 5210 + }, + { + "epoch": 0.14378295974642769, + "grad_norm": 0.0027174693532288074, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 5211 + }, + { + "epoch": 0.14381055194749204, + "grad_norm": 0.002803023438900709, + "learning_rate": 0.001, + "loss": 0.3682, + "step": 5212 + }, + { + "epoch": 0.14383814414855642, + "grad_norm": 0.004972567781805992, + "learning_rate": 0.001, + "loss": 0.4127, + "step": 5213 + }, + { + "epoch": 0.14386573634962077, + "grad_norm": 0.003360881470143795, + "learning_rate": 0.001, + "loss": 0.414, + "step": 5214 + }, + { + "epoch": 0.14389332855068515, + "grad_norm": 0.0027532910462468863, + "learning_rate": 0.001, + "loss": 0.3699, + "step": 5215 + }, + { + "epoch": 0.14392092075174953, + "grad_norm": 0.003589104162529111, + "learning_rate": 0.001, + "loss": 0.3631, + "step": 5216 + }, + { + "epoch": 0.14394851295281388, + "grad_norm": 0.004771512001752853, + "learning_rate": 0.001, + "loss": 0.3912, + "step": 5217 + }, + { + "epoch": 0.14397610515387826, + "grad_norm": 0.0026405095122754574, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 5218 + }, + { + "epoch": 0.14400369735494262, + "grad_norm": 0.004283549264073372, + "learning_rate": 0.001, + "loss": 0.3746, + "step": 5219 + }, + { + "epoch": 0.144031289556007, + "grad_norm": 0.0045742373913526535, + "learning_rate": 0.001, + "loss": 0.3972, + "step": 5220 + }, + { + "epoch": 0.14405888175707138, + "grad_norm": 0.0042295148596167564, + "learning_rate": 0.001, + "loss": 0.392, + "step": 5221 + }, + { + "epoch": 0.14408647395813573, + "grad_norm": 0.004257339984178543, + "learning_rate": 0.001, + "loss": 0.42, + "step": 5222 + }, + { + "epoch": 0.1441140661592001, + "grad_norm": 0.002708716783672571, + "learning_rate": 0.001, + "loss": 0.4022, + "step": 5223 + }, + { + "epoch": 0.14414165836026446, + "grad_norm": 0.004137181676924229, + "learning_rate": 0.001, + "loss": 0.4084, + "step": 5224 + }, + { + "epoch": 0.14416925056132884, + "grad_norm": 0.005688484758138657, + "learning_rate": 0.001, + "loss": 0.4089, + "step": 5225 + }, + { + "epoch": 0.1441968427623932, + "grad_norm": 0.008182493969798088, + "learning_rate": 0.001, + "loss": 0.4253, + "step": 5226 + }, + { + "epoch": 0.14422443496345758, + "grad_norm": 0.003255674382671714, + "learning_rate": 0.001, + "loss": 0.4268, + "step": 5227 + }, + { + "epoch": 0.14425202716452196, + "grad_norm": 0.0024103031028062105, + "learning_rate": 0.001, + "loss": 0.4263, + "step": 5228 + }, + { + "epoch": 0.1442796193655863, + "grad_norm": 0.0030367637518793344, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 5229 + }, + { + "epoch": 0.1443072115666507, + "grad_norm": 0.004183416720479727, + "learning_rate": 0.001, + "loss": 0.42, + "step": 5230 + }, + { + "epoch": 0.14433480376771504, + "grad_norm": 0.002727340441197157, + "learning_rate": 0.001, + "loss": 0.3887, + "step": 5231 + }, + { + "epoch": 0.14436239596877942, + "grad_norm": 0.003591054119169712, + "learning_rate": 0.001, + "loss": 0.3937, + "step": 5232 + }, + { + "epoch": 0.1443899881698438, + "grad_norm": 0.0028464680071920156, + "learning_rate": 0.001, + "loss": 0.359, + "step": 5233 + }, + { + "epoch": 0.14441758037090816, + "grad_norm": 0.006335907150059938, + "learning_rate": 0.001, + "loss": 0.3894, + "step": 5234 + }, + { + "epoch": 0.14444517257197254, + "grad_norm": 0.0027192372363060713, + "learning_rate": 0.001, + "loss": 0.4119, + "step": 5235 + }, + { + "epoch": 0.1444727647730369, + "grad_norm": 0.003246571170166135, + "learning_rate": 0.001, + "loss": 0.385, + "step": 5236 + }, + { + "epoch": 0.14450035697410127, + "grad_norm": 0.0024179634638130665, + "learning_rate": 0.001, + "loss": 0.4123, + "step": 5237 + }, + { + "epoch": 0.14452794917516565, + "grad_norm": 0.006938959006220102, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 5238 + }, + { + "epoch": 0.14455554137623, + "grad_norm": 0.004923888016492128, + "learning_rate": 0.001, + "loss": 0.4235, + "step": 5239 + }, + { + "epoch": 0.14458313357729438, + "grad_norm": 0.0031706364825367928, + "learning_rate": 0.001, + "loss": 0.3494, + "step": 5240 + }, + { + "epoch": 0.14461072577835873, + "grad_norm": 0.002737032249569893, + "learning_rate": 0.001, + "loss": 0.4248, + "step": 5241 + }, + { + "epoch": 0.14463831797942311, + "grad_norm": 0.0025771090295165777, + "learning_rate": 0.001, + "loss": 0.4101, + "step": 5242 + }, + { + "epoch": 0.1446659101804875, + "grad_norm": 0.0030646645464003086, + "learning_rate": 0.001, + "loss": 0.3956, + "step": 5243 + }, + { + "epoch": 0.14469350238155185, + "grad_norm": 0.0036759376525878906, + "learning_rate": 0.001, + "loss": 0.3846, + "step": 5244 + }, + { + "epoch": 0.14472109458261623, + "grad_norm": 0.0030812059994786978, + "learning_rate": 0.001, + "loss": 0.4069, + "step": 5245 + }, + { + "epoch": 0.14474868678368058, + "grad_norm": 0.0022093786392360926, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 5246 + }, + { + "epoch": 0.14477627898474496, + "grad_norm": 0.009639604948461056, + "learning_rate": 0.001, + "loss": 0.4283, + "step": 5247 + }, + { + "epoch": 0.14480387118580934, + "grad_norm": 0.0032258466817438602, + "learning_rate": 0.001, + "loss": 0.4242, + "step": 5248 + }, + { + "epoch": 0.1448314633868737, + "grad_norm": 0.0022797686979174614, + "learning_rate": 0.001, + "loss": 0.4, + "step": 5249 + }, + { + "epoch": 0.14485905558793807, + "grad_norm": 0.004146902356296778, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 5250 + }, + { + "epoch": 0.14488664778900243, + "grad_norm": 0.006115823984146118, + "learning_rate": 0.001, + "loss": 0.4382, + "step": 5251 + }, + { + "epoch": 0.1449142399900668, + "grad_norm": 0.007470074575394392, + "learning_rate": 0.001, + "loss": 0.4258, + "step": 5252 + }, + { + "epoch": 0.1449418321911312, + "grad_norm": 0.005569383502006531, + "learning_rate": 0.001, + "loss": 0.4357, + "step": 5253 + }, + { + "epoch": 0.14496942439219554, + "grad_norm": 0.004871412180364132, + "learning_rate": 0.001, + "loss": 0.4048, + "step": 5254 + }, + { + "epoch": 0.14499701659325992, + "grad_norm": 0.0038668601773679256, + "learning_rate": 0.001, + "loss": 0.3764, + "step": 5255 + }, + { + "epoch": 0.14502460879432427, + "grad_norm": 0.00738053023815155, + "learning_rate": 0.001, + "loss": 0.4162, + "step": 5256 + }, + { + "epoch": 0.14505220099538865, + "grad_norm": 0.004165946505963802, + "learning_rate": 0.001, + "loss": 0.3671, + "step": 5257 + }, + { + "epoch": 0.14507979319645303, + "grad_norm": 0.004606361500918865, + "learning_rate": 0.001, + "loss": 0.3856, + "step": 5258 + }, + { + "epoch": 0.14510738539751739, + "grad_norm": 0.002536054700613022, + "learning_rate": 0.001, + "loss": 0.4243, + "step": 5259 + }, + { + "epoch": 0.14513497759858177, + "grad_norm": 0.005060167983174324, + "learning_rate": 0.001, + "loss": 0.4091, + "step": 5260 + }, + { + "epoch": 0.14516256979964612, + "grad_norm": 0.004944161977618933, + "learning_rate": 0.001, + "loss": 0.3806, + "step": 5261 + }, + { + "epoch": 0.1451901620007105, + "grad_norm": 0.0029536515939980745, + "learning_rate": 0.001, + "loss": 0.3705, + "step": 5262 + }, + { + "epoch": 0.14521775420177488, + "grad_norm": 0.0047346302308142185, + "learning_rate": 0.001, + "loss": 0.3741, + "step": 5263 + }, + { + "epoch": 0.14524534640283923, + "grad_norm": 0.0024177066516131163, + "learning_rate": 0.001, + "loss": 0.4059, + "step": 5264 + }, + { + "epoch": 0.1452729386039036, + "grad_norm": 0.0036898923572152853, + "learning_rate": 0.001, + "loss": 0.4304, + "step": 5265 + }, + { + "epoch": 0.14530053080496796, + "grad_norm": 0.0027802560944110155, + "learning_rate": 0.001, + "loss": 0.3662, + "step": 5266 + }, + { + "epoch": 0.14532812300603234, + "grad_norm": 0.0035621714778244495, + "learning_rate": 0.001, + "loss": 0.4256, + "step": 5267 + }, + { + "epoch": 0.14535571520709673, + "grad_norm": 0.004401834215968847, + "learning_rate": 0.001, + "loss": 0.427, + "step": 5268 + }, + { + "epoch": 0.14538330740816108, + "grad_norm": 0.0035394374281167984, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 5269 + }, + { + "epoch": 0.14541089960922546, + "grad_norm": 0.017217174172401428, + "learning_rate": 0.001, + "loss": 0.3675, + "step": 5270 + }, + { + "epoch": 0.1454384918102898, + "grad_norm": 0.0022773677483201027, + "learning_rate": 0.001, + "loss": 0.384, + "step": 5271 + }, + { + "epoch": 0.1454660840113542, + "grad_norm": 0.002403079532086849, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 5272 + }, + { + "epoch": 0.14549367621241857, + "grad_norm": 0.0033804429695010185, + "learning_rate": 0.001, + "loss": 0.3854, + "step": 5273 + }, + { + "epoch": 0.14552126841348292, + "grad_norm": 0.002910598646849394, + "learning_rate": 0.001, + "loss": 0.3705, + "step": 5274 + }, + { + "epoch": 0.1455488606145473, + "grad_norm": 0.008523901924490929, + "learning_rate": 0.001, + "loss": 0.4028, + "step": 5275 + }, + { + "epoch": 0.14557645281561166, + "grad_norm": 0.003371842671185732, + "learning_rate": 0.001, + "loss": 0.3933, + "step": 5276 + }, + { + "epoch": 0.14560404501667604, + "grad_norm": 0.009514998644590378, + "learning_rate": 0.001, + "loss": 0.38, + "step": 5277 + }, + { + "epoch": 0.14563163721774042, + "grad_norm": 0.002840045839548111, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 5278 + }, + { + "epoch": 0.14565922941880477, + "grad_norm": 0.0025841807946562767, + "learning_rate": 0.001, + "loss": 0.4236, + "step": 5279 + }, + { + "epoch": 0.14568682161986915, + "grad_norm": 0.003922698087990284, + "learning_rate": 0.001, + "loss": 0.3799, + "step": 5280 + }, + { + "epoch": 0.1457144138209335, + "grad_norm": 0.004982591141015291, + "learning_rate": 0.001, + "loss": 0.3799, + "step": 5281 + }, + { + "epoch": 0.14574200602199788, + "grad_norm": 0.003860799828544259, + "learning_rate": 0.001, + "loss": 0.3455, + "step": 5282 + }, + { + "epoch": 0.14576959822306226, + "grad_norm": 0.003407267387956381, + "learning_rate": 0.001, + "loss": 0.3778, + "step": 5283 + }, + { + "epoch": 0.14579719042412662, + "grad_norm": 0.003815464908257127, + "learning_rate": 0.001, + "loss": 0.4121, + "step": 5284 + }, + { + "epoch": 0.145824782625191, + "grad_norm": 0.002572751371189952, + "learning_rate": 0.001, + "loss": 0.4426, + "step": 5285 + }, + { + "epoch": 0.14585237482625535, + "grad_norm": 0.0026371986605226994, + "learning_rate": 0.001, + "loss": 0.4126, + "step": 5286 + }, + { + "epoch": 0.14587996702731973, + "grad_norm": 0.0028192142490297556, + "learning_rate": 0.001, + "loss": 0.3687, + "step": 5287 + }, + { + "epoch": 0.1459075592283841, + "grad_norm": 0.0033562618773430586, + "learning_rate": 0.001, + "loss": 0.384, + "step": 5288 + }, + { + "epoch": 0.14593515142944846, + "grad_norm": 0.0029199314303696156, + "learning_rate": 0.001, + "loss": 0.4294, + "step": 5289 + }, + { + "epoch": 0.14596274363051284, + "grad_norm": 0.0029893077444285154, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 5290 + }, + { + "epoch": 0.1459903358315772, + "grad_norm": 0.002518982160836458, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 5291 + }, + { + "epoch": 0.14601792803264158, + "grad_norm": 0.0025784457102417946, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 5292 + }, + { + "epoch": 0.14604552023370596, + "grad_norm": 0.002763275755569339, + "learning_rate": 0.001, + "loss": 0.3677, + "step": 5293 + }, + { + "epoch": 0.1460731124347703, + "grad_norm": 0.0020732074044644833, + "learning_rate": 0.001, + "loss": 0.38, + "step": 5294 + }, + { + "epoch": 0.1461007046358347, + "grad_norm": 0.002579174702987075, + "learning_rate": 0.001, + "loss": 0.3737, + "step": 5295 + }, + { + "epoch": 0.14612829683689904, + "grad_norm": 0.0026421458460390568, + "learning_rate": 0.001, + "loss": 0.4119, + "step": 5296 + }, + { + "epoch": 0.14615588903796342, + "grad_norm": 0.0025588644202798605, + "learning_rate": 0.001, + "loss": 0.3795, + "step": 5297 + }, + { + "epoch": 0.1461834812390278, + "grad_norm": 0.002680868376046419, + "learning_rate": 0.001, + "loss": 0.3794, + "step": 5298 + }, + { + "epoch": 0.14621107344009215, + "grad_norm": 0.002638003323227167, + "learning_rate": 0.001, + "loss": 0.372, + "step": 5299 + }, + { + "epoch": 0.14623866564115653, + "grad_norm": 0.002944410778582096, + "learning_rate": 0.001, + "loss": 0.3894, + "step": 5300 + }, + { + "epoch": 0.1462662578422209, + "grad_norm": 0.0032715783454477787, + "learning_rate": 0.001, + "loss": 0.4187, + "step": 5301 + }, + { + "epoch": 0.14629385004328527, + "grad_norm": 0.0038660853169858456, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 5302 + }, + { + "epoch": 0.14632144224434965, + "grad_norm": 0.0023569557815790176, + "learning_rate": 0.001, + "loss": 0.399, + "step": 5303 + }, + { + "epoch": 0.146349034445414, + "grad_norm": 0.01049228198826313, + "learning_rate": 0.001, + "loss": 0.3707, + "step": 5304 + }, + { + "epoch": 0.14637662664647838, + "grad_norm": 0.0023562125861644745, + "learning_rate": 0.001, + "loss": 0.4398, + "step": 5305 + }, + { + "epoch": 0.14640421884754273, + "grad_norm": 0.002021912718191743, + "learning_rate": 0.001, + "loss": 0.4416, + "step": 5306 + }, + { + "epoch": 0.1464318110486071, + "grad_norm": 0.002516190754249692, + "learning_rate": 0.001, + "loss": 0.4164, + "step": 5307 + }, + { + "epoch": 0.1464594032496715, + "grad_norm": 0.0023932051844894886, + "learning_rate": 0.001, + "loss": 0.3724, + "step": 5308 + }, + { + "epoch": 0.14648699545073585, + "grad_norm": 0.0037087369710206985, + "learning_rate": 0.001, + "loss": 0.3692, + "step": 5309 + }, + { + "epoch": 0.14651458765180023, + "grad_norm": 0.002698789816349745, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 5310 + }, + { + "epoch": 0.14654217985286458, + "grad_norm": 0.0025848429650068283, + "learning_rate": 0.001, + "loss": 0.379, + "step": 5311 + }, + { + "epoch": 0.14656977205392896, + "grad_norm": 0.00317647447809577, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 5312 + }, + { + "epoch": 0.14659736425499334, + "grad_norm": 0.0027970026712864637, + "learning_rate": 0.001, + "loss": 0.3687, + "step": 5313 + }, + { + "epoch": 0.1466249564560577, + "grad_norm": 0.0031060813926160336, + "learning_rate": 0.001, + "loss": 0.3773, + "step": 5314 + }, + { + "epoch": 0.14665254865712207, + "grad_norm": 0.004864480346441269, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 5315 + }, + { + "epoch": 0.14668014085818643, + "grad_norm": 0.0022926428355276585, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 5316 + }, + { + "epoch": 0.1467077330592508, + "grad_norm": 0.0029684528708457947, + "learning_rate": 0.001, + "loss": 0.3917, + "step": 5317 + }, + { + "epoch": 0.14673532526031516, + "grad_norm": 0.0022980044595897198, + "learning_rate": 0.001, + "loss": 0.4402, + "step": 5318 + }, + { + "epoch": 0.14676291746137954, + "grad_norm": 0.0021655671298503876, + "learning_rate": 0.001, + "loss": 0.4054, + "step": 5319 + }, + { + "epoch": 0.14679050966244392, + "grad_norm": 0.002639767248183489, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 5320 + }, + { + "epoch": 0.14681810186350827, + "grad_norm": 0.005498727783560753, + "learning_rate": 0.001, + "loss": 0.3696, + "step": 5321 + }, + { + "epoch": 0.14684569406457265, + "grad_norm": 0.0033152250107377768, + "learning_rate": 0.001, + "loss": 0.4278, + "step": 5322 + }, + { + "epoch": 0.146873286265637, + "grad_norm": 0.004971036221832037, + "learning_rate": 0.001, + "loss": 0.3751, + "step": 5323 + }, + { + "epoch": 0.14690087846670138, + "grad_norm": 0.0024375154171139, + "learning_rate": 0.001, + "loss": 0.3895, + "step": 5324 + }, + { + "epoch": 0.14692847066776576, + "grad_norm": 0.002927098423242569, + "learning_rate": 0.001, + "loss": 0.3463, + "step": 5325 + }, + { + "epoch": 0.14695606286883012, + "grad_norm": 0.0036146354395896196, + "learning_rate": 0.001, + "loss": 0.3996, + "step": 5326 + }, + { + "epoch": 0.1469836550698945, + "grad_norm": 0.0024644196964800358, + "learning_rate": 0.001, + "loss": 0.4235, + "step": 5327 + }, + { + "epoch": 0.14701124727095885, + "grad_norm": 0.0025911214761435986, + "learning_rate": 0.001, + "loss": 0.4279, + "step": 5328 + }, + { + "epoch": 0.14703883947202323, + "grad_norm": 0.0025596728082746267, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 5329 + }, + { + "epoch": 0.1470664316730876, + "grad_norm": 0.002164337085559964, + "learning_rate": 0.001, + "loss": 0.416, + "step": 5330 + }, + { + "epoch": 0.14709402387415196, + "grad_norm": 0.002624908462166786, + "learning_rate": 0.001, + "loss": 0.4399, + "step": 5331 + }, + { + "epoch": 0.14712161607521634, + "grad_norm": 0.003358762711286545, + "learning_rate": 0.001, + "loss": 0.3718, + "step": 5332 + }, + { + "epoch": 0.1471492082762807, + "grad_norm": 0.008317602798342705, + "learning_rate": 0.001, + "loss": 0.3849, + "step": 5333 + }, + { + "epoch": 0.14717680047734508, + "grad_norm": 0.002502370160073042, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 5334 + }, + { + "epoch": 0.14720439267840946, + "grad_norm": 0.002517363289371133, + "learning_rate": 0.001, + "loss": 0.4155, + "step": 5335 + }, + { + "epoch": 0.1472319848794738, + "grad_norm": 0.003344867378473282, + "learning_rate": 0.001, + "loss": 0.384, + "step": 5336 + }, + { + "epoch": 0.1472595770805382, + "grad_norm": 0.003851872170343995, + "learning_rate": 0.001, + "loss": 0.3729, + "step": 5337 + }, + { + "epoch": 0.14728716928160254, + "grad_norm": 0.002437508897855878, + "learning_rate": 0.001, + "loss": 0.4154, + "step": 5338 + }, + { + "epoch": 0.14731476148266692, + "grad_norm": 0.0034037036821246147, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 5339 + }, + { + "epoch": 0.1473423536837313, + "grad_norm": 0.0027613432612270117, + "learning_rate": 0.001, + "loss": 0.4394, + "step": 5340 + }, + { + "epoch": 0.14736994588479566, + "grad_norm": 0.002495642751455307, + "learning_rate": 0.001, + "loss": 0.4221, + "step": 5341 + }, + { + "epoch": 0.14739753808586004, + "grad_norm": 0.0029324067290872335, + "learning_rate": 0.001, + "loss": 0.3461, + "step": 5342 + }, + { + "epoch": 0.1474251302869244, + "grad_norm": 0.003891981905326247, + "learning_rate": 0.001, + "loss": 0.3634, + "step": 5343 + }, + { + "epoch": 0.14745272248798877, + "grad_norm": 0.005491616670042276, + "learning_rate": 0.001, + "loss": 0.3875, + "step": 5344 + }, + { + "epoch": 0.14748031468905315, + "grad_norm": 0.0028310040943324566, + "learning_rate": 0.001, + "loss": 0.3742, + "step": 5345 + }, + { + "epoch": 0.1475079068901175, + "grad_norm": 0.0035124190617352724, + "learning_rate": 0.001, + "loss": 0.4099, + "step": 5346 + }, + { + "epoch": 0.14753549909118188, + "grad_norm": 0.003958097193390131, + "learning_rate": 0.001, + "loss": 0.4097, + "step": 5347 + }, + { + "epoch": 0.14756309129224623, + "grad_norm": 0.0031724879518151283, + "learning_rate": 0.001, + "loss": 0.3901, + "step": 5348 + }, + { + "epoch": 0.14759068349331061, + "grad_norm": 0.006823610980063677, + "learning_rate": 0.001, + "loss": 0.3982, + "step": 5349 + }, + { + "epoch": 0.147618275694375, + "grad_norm": 0.0023501457180827856, + "learning_rate": 0.001, + "loss": 0.3836, + "step": 5350 + }, + { + "epoch": 0.14764586789543935, + "grad_norm": 0.0020646711345762014, + "learning_rate": 0.001, + "loss": 0.4265, + "step": 5351 + }, + { + "epoch": 0.14767346009650373, + "grad_norm": 0.002760658971965313, + "learning_rate": 0.001, + "loss": 0.4048, + "step": 5352 + }, + { + "epoch": 0.14770105229756808, + "grad_norm": 0.0029554900247603655, + "learning_rate": 0.001, + "loss": 0.4108, + "step": 5353 + }, + { + "epoch": 0.14772864449863246, + "grad_norm": 0.0038962808903306723, + "learning_rate": 0.001, + "loss": 0.4192, + "step": 5354 + }, + { + "epoch": 0.14775623669969684, + "grad_norm": 0.002745499601587653, + "learning_rate": 0.001, + "loss": 0.3895, + "step": 5355 + }, + { + "epoch": 0.1477838289007612, + "grad_norm": 0.004177890717983246, + "learning_rate": 0.001, + "loss": 0.3632, + "step": 5356 + }, + { + "epoch": 0.14781142110182557, + "grad_norm": 0.002319454913958907, + "learning_rate": 0.001, + "loss": 0.4225, + "step": 5357 + }, + { + "epoch": 0.14783901330288993, + "grad_norm": 0.0026902640238404274, + "learning_rate": 0.001, + "loss": 0.4026, + "step": 5358 + }, + { + "epoch": 0.1478666055039543, + "grad_norm": 0.003064702032133937, + "learning_rate": 0.001, + "loss": 0.4165, + "step": 5359 + }, + { + "epoch": 0.1478941977050187, + "grad_norm": 0.003410772420465946, + "learning_rate": 0.001, + "loss": 0.3601, + "step": 5360 + }, + { + "epoch": 0.14792178990608304, + "grad_norm": 0.0030657979659736156, + "learning_rate": 0.001, + "loss": 0.4095, + "step": 5361 + }, + { + "epoch": 0.14794938210714742, + "grad_norm": 0.008095750585198402, + "learning_rate": 0.001, + "loss": 0.4101, + "step": 5362 + }, + { + "epoch": 0.14797697430821177, + "grad_norm": 0.004853304475545883, + "learning_rate": 0.001, + "loss": 0.404, + "step": 5363 + }, + { + "epoch": 0.14800456650927615, + "grad_norm": 0.0032035887707024813, + "learning_rate": 0.001, + "loss": 0.4001, + "step": 5364 + }, + { + "epoch": 0.14803215871034053, + "grad_norm": 0.004527990240603685, + "learning_rate": 0.001, + "loss": 0.3729, + "step": 5365 + }, + { + "epoch": 0.14805975091140489, + "grad_norm": 0.006700141355395317, + "learning_rate": 0.001, + "loss": 0.375, + "step": 5366 + }, + { + "epoch": 0.14808734311246927, + "grad_norm": 0.00633378978818655, + "learning_rate": 0.001, + "loss": 0.4253, + "step": 5367 + }, + { + "epoch": 0.14811493531353362, + "grad_norm": 0.0023305609356611967, + "learning_rate": 0.001, + "loss": 0.4106, + "step": 5368 + }, + { + "epoch": 0.148142527514598, + "grad_norm": 0.0034147268161177635, + "learning_rate": 0.001, + "loss": 0.3556, + "step": 5369 + }, + { + "epoch": 0.14817011971566238, + "grad_norm": 0.003086140612140298, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 5370 + }, + { + "epoch": 0.14819771191672673, + "grad_norm": 0.0061439163982868195, + "learning_rate": 0.001, + "loss": 0.3608, + "step": 5371 + }, + { + "epoch": 0.1482253041177911, + "grad_norm": 0.0025781714357435703, + "learning_rate": 0.001, + "loss": 0.3895, + "step": 5372 + }, + { + "epoch": 0.14825289631885546, + "grad_norm": 0.0026104720309376717, + "learning_rate": 0.001, + "loss": 0.427, + "step": 5373 + }, + { + "epoch": 0.14828048851991985, + "grad_norm": 0.0037834334652870893, + "learning_rate": 0.001, + "loss": 0.391, + "step": 5374 + }, + { + "epoch": 0.14830808072098423, + "grad_norm": 0.005206381902098656, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 5375 + }, + { + "epoch": 0.14833567292204858, + "grad_norm": 0.002475143875926733, + "learning_rate": 0.001, + "loss": 0.4218, + "step": 5376 + }, + { + "epoch": 0.14836326512311296, + "grad_norm": 0.005634800065308809, + "learning_rate": 0.001, + "loss": 0.388, + "step": 5377 + }, + { + "epoch": 0.1483908573241773, + "grad_norm": 0.004982566460967064, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 5378 + }, + { + "epoch": 0.1484184495252417, + "grad_norm": 0.0032306865323334932, + "learning_rate": 0.001, + "loss": 0.4247, + "step": 5379 + }, + { + "epoch": 0.14844604172630607, + "grad_norm": 0.0074884905479848385, + "learning_rate": 0.001, + "loss": 0.405, + "step": 5380 + }, + { + "epoch": 0.14847363392737042, + "grad_norm": 0.003231652779504657, + "learning_rate": 0.001, + "loss": 0.4048, + "step": 5381 + }, + { + "epoch": 0.1485012261284348, + "grad_norm": 0.004374918062239885, + "learning_rate": 0.001, + "loss": 0.3829, + "step": 5382 + }, + { + "epoch": 0.14852881832949916, + "grad_norm": 0.003996079787611961, + "learning_rate": 0.001, + "loss": 0.3606, + "step": 5383 + }, + { + "epoch": 0.14855641053056354, + "grad_norm": 0.0023612806107848883, + "learning_rate": 0.001, + "loss": 0.4327, + "step": 5384 + }, + { + "epoch": 0.14858400273162792, + "grad_norm": 0.002323776250705123, + "learning_rate": 0.001, + "loss": 0.3969, + "step": 5385 + }, + { + "epoch": 0.14861159493269227, + "grad_norm": 0.003044640878215432, + "learning_rate": 0.001, + "loss": 0.4175, + "step": 5386 + }, + { + "epoch": 0.14863918713375665, + "grad_norm": 0.0029183158185333014, + "learning_rate": 0.001, + "loss": 0.3853, + "step": 5387 + }, + { + "epoch": 0.148666779334821, + "grad_norm": 0.0031076192390173674, + "learning_rate": 0.001, + "loss": 0.4054, + "step": 5388 + }, + { + "epoch": 0.14869437153588538, + "grad_norm": 0.0032536948565393686, + "learning_rate": 0.001, + "loss": 0.4417, + "step": 5389 + }, + { + "epoch": 0.14872196373694976, + "grad_norm": 0.0028777210973203182, + "learning_rate": 0.001, + "loss": 0.4035, + "step": 5390 + }, + { + "epoch": 0.14874955593801412, + "grad_norm": 0.003044008044525981, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 5391 + }, + { + "epoch": 0.1487771481390785, + "grad_norm": 0.004723208025097847, + "learning_rate": 0.001, + "loss": 0.3847, + "step": 5392 + }, + { + "epoch": 0.14880474034014285, + "grad_norm": 0.003199602710083127, + "learning_rate": 0.001, + "loss": 0.4212, + "step": 5393 + }, + { + "epoch": 0.14883233254120723, + "grad_norm": 0.0029654945246875286, + "learning_rate": 0.001, + "loss": 0.4177, + "step": 5394 + }, + { + "epoch": 0.1488599247422716, + "grad_norm": 0.003065300639718771, + "learning_rate": 0.001, + "loss": 0.4206, + "step": 5395 + }, + { + "epoch": 0.14888751694333596, + "grad_norm": 0.003566544270142913, + "learning_rate": 0.001, + "loss": 0.3763, + "step": 5396 + }, + { + "epoch": 0.14891510914440034, + "grad_norm": 0.0034122609067708254, + "learning_rate": 0.001, + "loss": 0.4046, + "step": 5397 + }, + { + "epoch": 0.1489427013454647, + "grad_norm": 0.0025029098615050316, + "learning_rate": 0.001, + "loss": 0.3565, + "step": 5398 + }, + { + "epoch": 0.14897029354652908, + "grad_norm": 0.0030023674480617046, + "learning_rate": 0.001, + "loss": 0.3885, + "step": 5399 + }, + { + "epoch": 0.14899788574759346, + "grad_norm": 0.00424955366179347, + "learning_rate": 0.001, + "loss": 0.3888, + "step": 5400 + }, + { + "epoch": 0.1490254779486578, + "grad_norm": 0.0030338119249790907, + "learning_rate": 0.001, + "loss": 0.3927, + "step": 5401 + }, + { + "epoch": 0.1490530701497222, + "grad_norm": 0.0028448712546378374, + "learning_rate": 0.001, + "loss": 0.432, + "step": 5402 + }, + { + "epoch": 0.14908066235078654, + "grad_norm": 0.0028374232351779938, + "learning_rate": 0.001, + "loss": 0.4157, + "step": 5403 + }, + { + "epoch": 0.14910825455185092, + "grad_norm": 0.002151310909539461, + "learning_rate": 0.001, + "loss": 0.4508, + "step": 5404 + }, + { + "epoch": 0.1491358467529153, + "grad_norm": 0.002506339456886053, + "learning_rate": 0.001, + "loss": 0.4381, + "step": 5405 + }, + { + "epoch": 0.14916343895397965, + "grad_norm": 0.002707758452743292, + "learning_rate": 0.001, + "loss": 0.4113, + "step": 5406 + }, + { + "epoch": 0.14919103115504403, + "grad_norm": 0.002374933334067464, + "learning_rate": 0.001, + "loss": 0.3643, + "step": 5407 + }, + { + "epoch": 0.1492186233561084, + "grad_norm": 0.0027373386546969414, + "learning_rate": 0.001, + "loss": 0.3823, + "step": 5408 + }, + { + "epoch": 0.14924621555717277, + "grad_norm": 0.004208944737911224, + "learning_rate": 0.001, + "loss": 0.4265, + "step": 5409 + }, + { + "epoch": 0.14927380775823715, + "grad_norm": 0.0031958618201315403, + "learning_rate": 0.001, + "loss": 0.3799, + "step": 5410 + }, + { + "epoch": 0.1493013999593015, + "grad_norm": 0.006793424021452665, + "learning_rate": 0.001, + "loss": 0.403, + "step": 5411 + }, + { + "epoch": 0.14932899216036588, + "grad_norm": 0.003309778869152069, + "learning_rate": 0.001, + "loss": 0.3925, + "step": 5412 + }, + { + "epoch": 0.14935658436143023, + "grad_norm": 0.002356642158702016, + "learning_rate": 0.001, + "loss": 0.4026, + "step": 5413 + }, + { + "epoch": 0.1493841765624946, + "grad_norm": 0.0032743606716394424, + "learning_rate": 0.001, + "loss": 0.3839, + "step": 5414 + }, + { + "epoch": 0.14941176876355897, + "grad_norm": 0.005772759672254324, + "learning_rate": 0.001, + "loss": 0.4099, + "step": 5415 + }, + { + "epoch": 0.14943936096462335, + "grad_norm": 0.0065681166015565395, + "learning_rate": 0.001, + "loss": 0.4149, + "step": 5416 + }, + { + "epoch": 0.14946695316568773, + "grad_norm": 0.0028679503593593836, + "learning_rate": 0.001, + "loss": 0.3978, + "step": 5417 + }, + { + "epoch": 0.14949454536675208, + "grad_norm": 0.003005877137184143, + "learning_rate": 0.001, + "loss": 0.3848, + "step": 5418 + }, + { + "epoch": 0.14952213756781646, + "grad_norm": 0.0022220280952751637, + "learning_rate": 0.001, + "loss": 0.4393, + "step": 5419 + }, + { + "epoch": 0.1495497297688808, + "grad_norm": 0.0033852518536150455, + "learning_rate": 0.001, + "loss": 0.3692, + "step": 5420 + }, + { + "epoch": 0.1495773219699452, + "grad_norm": 0.006787192076444626, + "learning_rate": 0.001, + "loss": 0.3895, + "step": 5421 + }, + { + "epoch": 0.14960491417100957, + "grad_norm": 0.0036864059511572123, + "learning_rate": 0.001, + "loss": 0.4334, + "step": 5422 + }, + { + "epoch": 0.14963250637207393, + "grad_norm": 0.002667534863576293, + "learning_rate": 0.001, + "loss": 0.4084, + "step": 5423 + }, + { + "epoch": 0.1496600985731383, + "grad_norm": 0.003278045216575265, + "learning_rate": 0.001, + "loss": 0.416, + "step": 5424 + }, + { + "epoch": 0.14968769077420266, + "grad_norm": 0.003076856257393956, + "learning_rate": 0.001, + "loss": 0.4169, + "step": 5425 + }, + { + "epoch": 0.14971528297526704, + "grad_norm": 0.0023882794193923473, + "learning_rate": 0.001, + "loss": 0.3999, + "step": 5426 + }, + { + "epoch": 0.14974287517633142, + "grad_norm": 0.003012517001479864, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 5427 + }, + { + "epoch": 0.14977046737739577, + "grad_norm": 0.0023737861774861813, + "learning_rate": 0.001, + "loss": 0.4265, + "step": 5428 + }, + { + "epoch": 0.14979805957846015, + "grad_norm": 0.0022319392301142216, + "learning_rate": 0.001, + "loss": 0.4013, + "step": 5429 + }, + { + "epoch": 0.1498256517795245, + "grad_norm": 0.0032811984419822693, + "learning_rate": 0.001, + "loss": 0.3898, + "step": 5430 + }, + { + "epoch": 0.14985324398058888, + "grad_norm": 0.005578971467912197, + "learning_rate": 0.001, + "loss": 0.39, + "step": 5431 + }, + { + "epoch": 0.14988083618165327, + "grad_norm": 0.003999901469796896, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 5432 + }, + { + "epoch": 0.14990842838271762, + "grad_norm": 0.002937354613095522, + "learning_rate": 0.001, + "loss": 0.4189, + "step": 5433 + }, + { + "epoch": 0.149936020583782, + "grad_norm": 0.004472947679460049, + "learning_rate": 0.001, + "loss": 0.3836, + "step": 5434 + }, + { + "epoch": 0.14996361278484635, + "grad_norm": 0.0043602860532701015, + "learning_rate": 0.001, + "loss": 0.3565, + "step": 5435 + }, + { + "epoch": 0.14999120498591073, + "grad_norm": 0.0026499300729483366, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 5436 + }, + { + "epoch": 0.1500187971869751, + "grad_norm": 0.003208121517673135, + "learning_rate": 0.001, + "loss": 0.4198, + "step": 5437 + }, + { + "epoch": 0.15004638938803946, + "grad_norm": 0.0024985617492347956, + "learning_rate": 0.001, + "loss": 0.4716, + "step": 5438 + }, + { + "epoch": 0.15007398158910384, + "grad_norm": 0.0026381593197584152, + "learning_rate": 0.001, + "loss": 0.3799, + "step": 5439 + }, + { + "epoch": 0.1501015737901682, + "grad_norm": 0.0035929230507463217, + "learning_rate": 0.001, + "loss": 0.3716, + "step": 5440 + }, + { + "epoch": 0.15012916599123258, + "grad_norm": 0.005476321559399366, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 5441 + }, + { + "epoch": 0.15015675819229696, + "grad_norm": 0.00393927376717329, + "learning_rate": 0.001, + "loss": 0.3844, + "step": 5442 + }, + { + "epoch": 0.1501843503933613, + "grad_norm": 0.003273927140980959, + "learning_rate": 0.001, + "loss": 0.403, + "step": 5443 + }, + { + "epoch": 0.1502119425944257, + "grad_norm": 0.015024540945887566, + "learning_rate": 0.001, + "loss": 0.3915, + "step": 5444 + }, + { + "epoch": 0.15023953479549004, + "grad_norm": 0.004785169847309589, + "learning_rate": 0.001, + "loss": 0.3738, + "step": 5445 + }, + { + "epoch": 0.15026712699655442, + "grad_norm": 0.00336294062435627, + "learning_rate": 0.001, + "loss": 0.406, + "step": 5446 + }, + { + "epoch": 0.1502947191976188, + "grad_norm": 0.0036235249135643244, + "learning_rate": 0.001, + "loss": 0.3839, + "step": 5447 + }, + { + "epoch": 0.15032231139868316, + "grad_norm": 0.002741090953350067, + "learning_rate": 0.001, + "loss": 0.3762, + "step": 5448 + }, + { + "epoch": 0.15034990359974754, + "grad_norm": 0.0024599130265414715, + "learning_rate": 0.001, + "loss": 0.4038, + "step": 5449 + }, + { + "epoch": 0.1503774958008119, + "grad_norm": 0.002163279801607132, + "learning_rate": 0.001, + "loss": 0.4581, + "step": 5450 + }, + { + "epoch": 0.15040508800187627, + "grad_norm": 0.005329553037881851, + "learning_rate": 0.001, + "loss": 0.375, + "step": 5451 + }, + { + "epoch": 0.15043268020294065, + "grad_norm": 0.004300289321690798, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 5452 + }, + { + "epoch": 0.150460272404005, + "grad_norm": 0.003734968136996031, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 5453 + }, + { + "epoch": 0.15048786460506938, + "grad_norm": 0.0053179883398115635, + "learning_rate": 0.001, + "loss": 0.3729, + "step": 5454 + }, + { + "epoch": 0.15051545680613374, + "grad_norm": 0.003352331230416894, + "learning_rate": 0.001, + "loss": 0.41, + "step": 5455 + }, + { + "epoch": 0.15054304900719812, + "grad_norm": 0.0027811650652438402, + "learning_rate": 0.001, + "loss": 0.4354, + "step": 5456 + }, + { + "epoch": 0.1505706412082625, + "grad_norm": 0.0069539607502520084, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 5457 + }, + { + "epoch": 0.15059823340932685, + "grad_norm": 0.002543016104027629, + "learning_rate": 0.001, + "loss": 0.475, + "step": 5458 + }, + { + "epoch": 0.15062582561039123, + "grad_norm": 0.004235788248479366, + "learning_rate": 0.001, + "loss": 0.3636, + "step": 5459 + }, + { + "epoch": 0.15065341781145558, + "grad_norm": 0.0021371094044297934, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 5460 + }, + { + "epoch": 0.15068101001251996, + "grad_norm": 0.0031575944740325212, + "learning_rate": 0.001, + "loss": 0.3835, + "step": 5461 + }, + { + "epoch": 0.15070860221358434, + "grad_norm": 0.002763645024970174, + "learning_rate": 0.001, + "loss": 0.3843, + "step": 5462 + }, + { + "epoch": 0.1507361944146487, + "grad_norm": 0.003907541744410992, + "learning_rate": 0.001, + "loss": 0.3748, + "step": 5463 + }, + { + "epoch": 0.15076378661571307, + "grad_norm": 0.004303582012653351, + "learning_rate": 0.001, + "loss": 0.3733, + "step": 5464 + }, + { + "epoch": 0.15079137881677743, + "grad_norm": 0.005774863064289093, + "learning_rate": 0.001, + "loss": 0.3817, + "step": 5465 + }, + { + "epoch": 0.1508189710178418, + "grad_norm": 0.00487649068236351, + "learning_rate": 0.001, + "loss": 0.3776, + "step": 5466 + }, + { + "epoch": 0.1508465632189062, + "grad_norm": 0.0033277124166488647, + "learning_rate": 0.001, + "loss": 0.418, + "step": 5467 + }, + { + "epoch": 0.15087415541997054, + "grad_norm": 0.003046071156859398, + "learning_rate": 0.001, + "loss": 0.3428, + "step": 5468 + }, + { + "epoch": 0.15090174762103492, + "grad_norm": 0.0024130302481353283, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 5469 + }, + { + "epoch": 0.15092933982209927, + "grad_norm": 0.002598168794065714, + "learning_rate": 0.001, + "loss": 0.3969, + "step": 5470 + }, + { + "epoch": 0.15095693202316365, + "grad_norm": 0.00266568036749959, + "learning_rate": 0.001, + "loss": 0.4135, + "step": 5471 + }, + { + "epoch": 0.15098452422422803, + "grad_norm": 0.0028029074892401695, + "learning_rate": 0.001, + "loss": 0.3714, + "step": 5472 + }, + { + "epoch": 0.1510121164252924, + "grad_norm": 0.008100908249616623, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 5473 + }, + { + "epoch": 0.15103970862635677, + "grad_norm": 0.00268415710888803, + "learning_rate": 0.001, + "loss": 0.4178, + "step": 5474 + }, + { + "epoch": 0.15106730082742112, + "grad_norm": 0.00282164104282856, + "learning_rate": 0.001, + "loss": 0.4258, + "step": 5475 + }, + { + "epoch": 0.1510948930284855, + "grad_norm": 0.0029998181853443384, + "learning_rate": 0.001, + "loss": 0.422, + "step": 5476 + }, + { + "epoch": 0.15112248522954988, + "grad_norm": 0.005322444252669811, + "learning_rate": 0.001, + "loss": 0.3662, + "step": 5477 + }, + { + "epoch": 0.15115007743061423, + "grad_norm": 0.0020565236918628216, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 5478 + }, + { + "epoch": 0.1511776696316786, + "grad_norm": 0.0021356120705604553, + "learning_rate": 0.001, + "loss": 0.4134, + "step": 5479 + }, + { + "epoch": 0.15120526183274297, + "grad_norm": 0.003150090342387557, + "learning_rate": 0.001, + "loss": 0.4225, + "step": 5480 + }, + { + "epoch": 0.15123285403380735, + "grad_norm": 0.002523067407310009, + "learning_rate": 0.001, + "loss": 0.4469, + "step": 5481 + }, + { + "epoch": 0.15126044623487173, + "grad_norm": 0.003910950850695372, + "learning_rate": 0.001, + "loss": 0.4317, + "step": 5482 + }, + { + "epoch": 0.15128803843593608, + "grad_norm": 0.0035898578353226185, + "learning_rate": 0.001, + "loss": 0.429, + "step": 5483 + }, + { + "epoch": 0.15131563063700046, + "grad_norm": 0.0025848867371678352, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 5484 + }, + { + "epoch": 0.1513432228380648, + "grad_norm": 0.0024130861274898052, + "learning_rate": 0.001, + "loss": 0.4291, + "step": 5485 + }, + { + "epoch": 0.1513708150391292, + "grad_norm": 0.005302855744957924, + "learning_rate": 0.001, + "loss": 0.3912, + "step": 5486 + }, + { + "epoch": 0.15139840724019357, + "grad_norm": 0.0032018995843827724, + "learning_rate": 0.001, + "loss": 0.3642, + "step": 5487 + }, + { + "epoch": 0.15142599944125792, + "grad_norm": 0.0025910425465554, + "learning_rate": 0.001, + "loss": 0.3785, + "step": 5488 + }, + { + "epoch": 0.1514535916423223, + "grad_norm": 0.003524304600432515, + "learning_rate": 0.001, + "loss": 0.4421, + "step": 5489 + }, + { + "epoch": 0.15148118384338666, + "grad_norm": 0.0029097420629113913, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 5490 + }, + { + "epoch": 0.15150877604445104, + "grad_norm": 0.003354964079335332, + "learning_rate": 0.001, + "loss": 0.4103, + "step": 5491 + }, + { + "epoch": 0.15153636824551542, + "grad_norm": 0.0032378071919083595, + "learning_rate": 0.001, + "loss": 0.4041, + "step": 5492 + }, + { + "epoch": 0.15156396044657977, + "grad_norm": 0.0037639355286955833, + "learning_rate": 0.001, + "loss": 0.3751, + "step": 5493 + }, + { + "epoch": 0.15159155264764415, + "grad_norm": 0.0043389336206018925, + "learning_rate": 0.001, + "loss": 0.3813, + "step": 5494 + }, + { + "epoch": 0.1516191448487085, + "grad_norm": 0.0026339939795434475, + "learning_rate": 0.001, + "loss": 0.3972, + "step": 5495 + }, + { + "epoch": 0.15164673704977288, + "grad_norm": 0.0039835344068706036, + "learning_rate": 0.001, + "loss": 0.3862, + "step": 5496 + }, + { + "epoch": 0.15167432925083726, + "grad_norm": 0.003993440419435501, + "learning_rate": 0.001, + "loss": 0.4029, + "step": 5497 + }, + { + "epoch": 0.15170192145190162, + "grad_norm": 0.003669349942356348, + "learning_rate": 0.001, + "loss": 0.3656, + "step": 5498 + }, + { + "epoch": 0.151729513652966, + "grad_norm": 0.005064620170742273, + "learning_rate": 0.001, + "loss": 0.3677, + "step": 5499 + }, + { + "epoch": 0.15175710585403035, + "grad_norm": 0.0034543632064014673, + "learning_rate": 0.001, + "loss": 0.4298, + "step": 5500 + }, + { + "epoch": 0.15175710585403035, + "eval_runtime": 27.0484, + "eval_samples_per_second": 1.183, + "eval_steps_per_second": 0.148, + "step": 5500 + }, + { + "epoch": 0.15178469805509473, + "grad_norm": 0.0021760701201856136, + "learning_rate": 0.001, + "loss": 0.4394, + "step": 5501 + }, + { + "epoch": 0.1518122902561591, + "grad_norm": 0.0033101621083915234, + "learning_rate": 0.001, + "loss": 0.392, + "step": 5502 + }, + { + "epoch": 0.15183988245722346, + "grad_norm": 0.005687963217496872, + "learning_rate": 0.001, + "loss": 0.3858, + "step": 5503 + }, + { + "epoch": 0.15186747465828784, + "grad_norm": 0.0031170856673270464, + "learning_rate": 0.001, + "loss": 0.3453, + "step": 5504 + }, + { + "epoch": 0.1518950668593522, + "grad_norm": 0.0025914981961250305, + "learning_rate": 0.001, + "loss": 0.3746, + "step": 5505 + }, + { + "epoch": 0.15192265906041658, + "grad_norm": 0.0037096338346600533, + "learning_rate": 0.001, + "loss": 0.384, + "step": 5506 + }, + { + "epoch": 0.15195025126148093, + "grad_norm": 0.0023521913681179285, + "learning_rate": 0.001, + "loss": 0.4116, + "step": 5507 + }, + { + "epoch": 0.1519778434625453, + "grad_norm": 0.0021949107758700848, + "learning_rate": 0.001, + "loss": 0.4032, + "step": 5508 + }, + { + "epoch": 0.1520054356636097, + "grad_norm": 0.00198304932564497, + "learning_rate": 0.001, + "loss": 0.4209, + "step": 5509 + }, + { + "epoch": 0.15203302786467404, + "grad_norm": 0.0021362160332500935, + "learning_rate": 0.001, + "loss": 0.4115, + "step": 5510 + }, + { + "epoch": 0.15206062006573842, + "grad_norm": 0.0028702253475785255, + "learning_rate": 0.001, + "loss": 0.402, + "step": 5511 + }, + { + "epoch": 0.15208821226680277, + "grad_norm": 0.003037867834791541, + "learning_rate": 0.001, + "loss": 0.3797, + "step": 5512 + }, + { + "epoch": 0.15211580446786716, + "grad_norm": 0.004557423759251833, + "learning_rate": 0.001, + "loss": 0.3621, + "step": 5513 + }, + { + "epoch": 0.15214339666893154, + "grad_norm": 0.0027468795888125896, + "learning_rate": 0.001, + "loss": 0.4179, + "step": 5514 + }, + { + "epoch": 0.1521709888699959, + "grad_norm": 0.004307581577450037, + "learning_rate": 0.001, + "loss": 0.3822, + "step": 5515 + }, + { + "epoch": 0.15219858107106027, + "grad_norm": 0.0030242314096540213, + "learning_rate": 0.001, + "loss": 0.3733, + "step": 5516 + }, + { + "epoch": 0.15222617327212462, + "grad_norm": 0.00433747936040163, + "learning_rate": 0.001, + "loss": 0.4184, + "step": 5517 + }, + { + "epoch": 0.152253765473189, + "grad_norm": 0.009036600589752197, + "learning_rate": 0.001, + "loss": 0.4181, + "step": 5518 + }, + { + "epoch": 0.15228135767425338, + "grad_norm": 0.012543873861432076, + "learning_rate": 0.001, + "loss": 0.4278, + "step": 5519 + }, + { + "epoch": 0.15230894987531773, + "grad_norm": 0.009851133450865746, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 5520 + }, + { + "epoch": 0.15233654207638211, + "grad_norm": 0.013184795156121254, + "learning_rate": 0.001, + "loss": 0.4147, + "step": 5521 + }, + { + "epoch": 0.15236413427744647, + "grad_norm": 0.017032135277986526, + "learning_rate": 0.001, + "loss": 0.3805, + "step": 5522 + }, + { + "epoch": 0.15239172647851085, + "grad_norm": 0.002919899532571435, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 5523 + }, + { + "epoch": 0.15241931867957523, + "grad_norm": 0.002256933366879821, + "learning_rate": 0.001, + "loss": 0.4279, + "step": 5524 + }, + { + "epoch": 0.15244691088063958, + "grad_norm": 0.0027415261138230562, + "learning_rate": 0.001, + "loss": 0.4324, + "step": 5525 + }, + { + "epoch": 0.15247450308170396, + "grad_norm": 0.0031243073754012585, + "learning_rate": 0.001, + "loss": 0.3676, + "step": 5526 + }, + { + "epoch": 0.1525020952827683, + "grad_norm": 0.0021765967831015587, + "learning_rate": 0.001, + "loss": 0.4111, + "step": 5527 + }, + { + "epoch": 0.1525296874838327, + "grad_norm": 0.003611369989812374, + "learning_rate": 0.001, + "loss": 0.3896, + "step": 5528 + }, + { + "epoch": 0.15255727968489707, + "grad_norm": 0.005910592619329691, + "learning_rate": 0.001, + "loss": 0.3917, + "step": 5529 + }, + { + "epoch": 0.15258487188596143, + "grad_norm": 0.01214669831097126, + "learning_rate": 0.001, + "loss": 0.3718, + "step": 5530 + }, + { + "epoch": 0.1526124640870258, + "grad_norm": 0.0060065449215471745, + "learning_rate": 0.001, + "loss": 0.402, + "step": 5531 + }, + { + "epoch": 0.15264005628809016, + "grad_norm": 0.0025461267214268446, + "learning_rate": 0.001, + "loss": 0.3835, + "step": 5532 + }, + { + "epoch": 0.15266764848915454, + "grad_norm": 0.002425495535135269, + "learning_rate": 0.001, + "loss": 0.381, + "step": 5533 + }, + { + "epoch": 0.15269524069021892, + "grad_norm": 0.004517144989222288, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 5534 + }, + { + "epoch": 0.15272283289128327, + "grad_norm": 0.003472298150882125, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 5535 + }, + { + "epoch": 0.15275042509234765, + "grad_norm": 0.002562028355896473, + "learning_rate": 0.001, + "loss": 0.3576, + "step": 5536 + }, + { + "epoch": 0.152778017293412, + "grad_norm": 0.0028723133727908134, + "learning_rate": 0.001, + "loss": 0.4317, + "step": 5537 + }, + { + "epoch": 0.15280560949447639, + "grad_norm": 0.0022548751439899206, + "learning_rate": 0.001, + "loss": 0.4182, + "step": 5538 + }, + { + "epoch": 0.15283320169554077, + "grad_norm": 0.0031076574232429266, + "learning_rate": 0.001, + "loss": 0.4241, + "step": 5539 + }, + { + "epoch": 0.15286079389660512, + "grad_norm": 0.00288601266220212, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 5540 + }, + { + "epoch": 0.1528883860976695, + "grad_norm": 0.004477125592529774, + "learning_rate": 0.001, + "loss": 0.3715, + "step": 5541 + }, + { + "epoch": 0.15291597829873385, + "grad_norm": 0.0031475056894123554, + "learning_rate": 0.001, + "loss": 0.4294, + "step": 5542 + }, + { + "epoch": 0.15294357049979823, + "grad_norm": 0.0023465659469366074, + "learning_rate": 0.001, + "loss": 0.4104, + "step": 5543 + }, + { + "epoch": 0.1529711627008626, + "grad_norm": 0.0032081464305520058, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 5544 + }, + { + "epoch": 0.15299875490192696, + "grad_norm": 0.002472371095791459, + "learning_rate": 0.001, + "loss": 0.4276, + "step": 5545 + }, + { + "epoch": 0.15302634710299134, + "grad_norm": 0.006937100552022457, + "learning_rate": 0.001, + "loss": 0.4047, + "step": 5546 + }, + { + "epoch": 0.1530539393040557, + "grad_norm": 0.0034694471396505833, + "learning_rate": 0.001, + "loss": 0.4476, + "step": 5547 + }, + { + "epoch": 0.15308153150512008, + "grad_norm": 0.0028970185667276382, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 5548 + }, + { + "epoch": 0.15310912370618446, + "grad_norm": 0.0024902718141674995, + "learning_rate": 0.001, + "loss": 0.449, + "step": 5549 + }, + { + "epoch": 0.1531367159072488, + "grad_norm": 0.0023354976437985897, + "learning_rate": 0.001, + "loss": 0.3648, + "step": 5550 + }, + { + "epoch": 0.1531643081083132, + "grad_norm": 0.004294807091355324, + "learning_rate": 0.001, + "loss": 0.4073, + "step": 5551 + }, + { + "epoch": 0.15319190030937754, + "grad_norm": 0.0020946157164871693, + "learning_rate": 0.001, + "loss": 0.3978, + "step": 5552 + }, + { + "epoch": 0.15321949251044192, + "grad_norm": 0.003225631546229124, + "learning_rate": 0.001, + "loss": 0.3803, + "step": 5553 + }, + { + "epoch": 0.1532470847115063, + "grad_norm": 0.0035047954879701138, + "learning_rate": 0.001, + "loss": 0.3462, + "step": 5554 + }, + { + "epoch": 0.15327467691257066, + "grad_norm": 0.0029047110583633184, + "learning_rate": 0.001, + "loss": 0.393, + "step": 5555 + }, + { + "epoch": 0.15330226911363504, + "grad_norm": 0.0030687206890434027, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 5556 + }, + { + "epoch": 0.1533298613146994, + "grad_norm": 0.0027540400624275208, + "learning_rate": 0.001, + "loss": 0.4126, + "step": 5557 + }, + { + "epoch": 0.15335745351576377, + "grad_norm": 0.002840878674760461, + "learning_rate": 0.001, + "loss": 0.3541, + "step": 5558 + }, + { + "epoch": 0.15338504571682815, + "grad_norm": 0.0027139533776789904, + "learning_rate": 0.001, + "loss": 0.4351, + "step": 5559 + }, + { + "epoch": 0.1534126379178925, + "grad_norm": 0.0027314014732837677, + "learning_rate": 0.001, + "loss": 0.4102, + "step": 5560 + }, + { + "epoch": 0.15344023011895688, + "grad_norm": 0.002478542272001505, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 5561 + }, + { + "epoch": 0.15346782232002124, + "grad_norm": 0.007170672062784433, + "learning_rate": 0.001, + "loss": 0.3703, + "step": 5562 + }, + { + "epoch": 0.15349541452108562, + "grad_norm": 0.004415068309754133, + "learning_rate": 0.001, + "loss": 0.4195, + "step": 5563 + }, + { + "epoch": 0.15352300672215, + "grad_norm": 0.004127574618905783, + "learning_rate": 0.001, + "loss": 0.3705, + "step": 5564 + }, + { + "epoch": 0.15355059892321435, + "grad_norm": 0.0033065411262214184, + "learning_rate": 0.001, + "loss": 0.3814, + "step": 5565 + }, + { + "epoch": 0.15357819112427873, + "grad_norm": 0.0036155430134385824, + "learning_rate": 0.001, + "loss": 0.4144, + "step": 5566 + }, + { + "epoch": 0.15360578332534308, + "grad_norm": 0.0034816188272088766, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 5567 + }, + { + "epoch": 0.15363337552640746, + "grad_norm": 0.003091090824455023, + "learning_rate": 0.001, + "loss": 0.4052, + "step": 5568 + }, + { + "epoch": 0.15366096772747184, + "grad_norm": 0.006985298823565245, + "learning_rate": 0.001, + "loss": 0.4364, + "step": 5569 + }, + { + "epoch": 0.1536885599285362, + "grad_norm": 0.0029920649249106646, + "learning_rate": 0.001, + "loss": 0.4319, + "step": 5570 + }, + { + "epoch": 0.15371615212960058, + "grad_norm": 0.004347649868577719, + "learning_rate": 0.001, + "loss": 0.361, + "step": 5571 + }, + { + "epoch": 0.15374374433066493, + "grad_norm": 0.002314548706635833, + "learning_rate": 0.001, + "loss": 0.4181, + "step": 5572 + }, + { + "epoch": 0.1537713365317293, + "grad_norm": 0.0023193899542093277, + "learning_rate": 0.001, + "loss": 0.4267, + "step": 5573 + }, + { + "epoch": 0.1537989287327937, + "grad_norm": 0.004895602352917194, + "learning_rate": 0.001, + "loss": 0.3866, + "step": 5574 + }, + { + "epoch": 0.15382652093385804, + "grad_norm": 0.003913875203579664, + "learning_rate": 0.001, + "loss": 0.3747, + "step": 5575 + }, + { + "epoch": 0.15385411313492242, + "grad_norm": 0.002599104307591915, + "learning_rate": 0.001, + "loss": 0.4489, + "step": 5576 + }, + { + "epoch": 0.15388170533598677, + "grad_norm": 0.0025326511822640896, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 5577 + }, + { + "epoch": 0.15390929753705115, + "grad_norm": 0.003771717194467783, + "learning_rate": 0.001, + "loss": 0.3915, + "step": 5578 + }, + { + "epoch": 0.15393688973811553, + "grad_norm": 0.004678527358919382, + "learning_rate": 0.001, + "loss": 0.3761, + "step": 5579 + }, + { + "epoch": 0.1539644819391799, + "grad_norm": 0.0024059766437858343, + "learning_rate": 0.001, + "loss": 0.3815, + "step": 5580 + }, + { + "epoch": 0.15399207414024427, + "grad_norm": 0.003771177725866437, + "learning_rate": 0.001, + "loss": 0.3776, + "step": 5581 + }, + { + "epoch": 0.15401966634130862, + "grad_norm": 0.002773088635876775, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 5582 + }, + { + "epoch": 0.154047258542373, + "grad_norm": 0.003286458784714341, + "learning_rate": 0.001, + "loss": 0.4284, + "step": 5583 + }, + { + "epoch": 0.15407485074343738, + "grad_norm": 0.007167492527514696, + "learning_rate": 0.001, + "loss": 0.3719, + "step": 5584 + }, + { + "epoch": 0.15410244294450173, + "grad_norm": 0.0022903766948729753, + "learning_rate": 0.001, + "loss": 0.4223, + "step": 5585 + }, + { + "epoch": 0.1541300351455661, + "grad_norm": 0.0034131724387407303, + "learning_rate": 0.001, + "loss": 0.3919, + "step": 5586 + }, + { + "epoch": 0.15415762734663047, + "grad_norm": 0.005761331412941217, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 5587 + }, + { + "epoch": 0.15418521954769485, + "grad_norm": 0.0029236627742648125, + "learning_rate": 0.001, + "loss": 0.3858, + "step": 5588 + }, + { + "epoch": 0.15421281174875923, + "grad_norm": 0.0033100962173193693, + "learning_rate": 0.001, + "loss": 0.405, + "step": 5589 + }, + { + "epoch": 0.15424040394982358, + "grad_norm": 0.003999118227511644, + "learning_rate": 0.001, + "loss": 0.4109, + "step": 5590 + }, + { + "epoch": 0.15426799615088796, + "grad_norm": 0.006767808459699154, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 5591 + }, + { + "epoch": 0.1542955883519523, + "grad_norm": 0.0028603686951100826, + "learning_rate": 0.001, + "loss": 0.4035, + "step": 5592 + }, + { + "epoch": 0.1543231805530167, + "grad_norm": 0.002447220031172037, + "learning_rate": 0.001, + "loss": 0.3855, + "step": 5593 + }, + { + "epoch": 0.15435077275408107, + "grad_norm": 0.00598937040194869, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 5594 + }, + { + "epoch": 0.15437836495514543, + "grad_norm": 0.0037801298312842846, + "learning_rate": 0.001, + "loss": 0.4432, + "step": 5595 + }, + { + "epoch": 0.1544059571562098, + "grad_norm": 0.00243676477111876, + "learning_rate": 0.001, + "loss": 0.4197, + "step": 5596 + }, + { + "epoch": 0.15443354935727416, + "grad_norm": 0.0022302521392703056, + "learning_rate": 0.001, + "loss": 0.4172, + "step": 5597 + }, + { + "epoch": 0.15446114155833854, + "grad_norm": 0.002573461504653096, + "learning_rate": 0.001, + "loss": 0.4177, + "step": 5598 + }, + { + "epoch": 0.15448873375940292, + "grad_norm": 0.004255924839526415, + "learning_rate": 0.001, + "loss": 0.3875, + "step": 5599 + }, + { + "epoch": 0.15451632596046727, + "grad_norm": 0.0026880300138145685, + "learning_rate": 0.001, + "loss": 0.3755, + "step": 5600 + }, + { + "epoch": 0.15454391816153165, + "grad_norm": 0.00242446456104517, + "learning_rate": 0.001, + "loss": 0.4213, + "step": 5601 + }, + { + "epoch": 0.154571510362596, + "grad_norm": 0.00247559929266572, + "learning_rate": 0.001, + "loss": 0.4105, + "step": 5602 + }, + { + "epoch": 0.15459910256366038, + "grad_norm": 0.003476825775578618, + "learning_rate": 0.001, + "loss": 0.3411, + "step": 5603 + }, + { + "epoch": 0.15462669476472474, + "grad_norm": 0.003844754071906209, + "learning_rate": 0.001, + "loss": 0.3922, + "step": 5604 + }, + { + "epoch": 0.15465428696578912, + "grad_norm": 0.0029344146605581045, + "learning_rate": 0.001, + "loss": 0.4211, + "step": 5605 + }, + { + "epoch": 0.1546818791668535, + "grad_norm": 0.0040620798245072365, + "learning_rate": 0.001, + "loss": 0.3821, + "step": 5606 + }, + { + "epoch": 0.15470947136791785, + "grad_norm": 0.0020004198886454105, + "learning_rate": 0.001, + "loss": 0.4069, + "step": 5607 + }, + { + "epoch": 0.15473706356898223, + "grad_norm": 0.002827103016898036, + "learning_rate": 0.001, + "loss": 0.376, + "step": 5608 + }, + { + "epoch": 0.15476465577004658, + "grad_norm": 0.0023741433396935463, + "learning_rate": 0.001, + "loss": 0.4229, + "step": 5609 + }, + { + "epoch": 0.15479224797111096, + "grad_norm": 0.004414036870002747, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 5610 + }, + { + "epoch": 0.15481984017217534, + "grad_norm": 0.0030676499009132385, + "learning_rate": 0.001, + "loss": 0.3843, + "step": 5611 + }, + { + "epoch": 0.1548474323732397, + "grad_norm": 0.0032093478366732597, + "learning_rate": 0.001, + "loss": 0.401, + "step": 5612 + }, + { + "epoch": 0.15487502457430408, + "grad_norm": 0.0024001640267670155, + "learning_rate": 0.001, + "loss": 0.4511, + "step": 5613 + }, + { + "epoch": 0.15490261677536843, + "grad_norm": 0.004838323220610619, + "learning_rate": 0.001, + "loss": 0.4094, + "step": 5614 + }, + { + "epoch": 0.1549302089764328, + "grad_norm": 0.003309109481051564, + "learning_rate": 0.001, + "loss": 0.3565, + "step": 5615 + }, + { + "epoch": 0.1549578011774972, + "grad_norm": 0.0025519407354295254, + "learning_rate": 0.001, + "loss": 0.4047, + "step": 5616 + }, + { + "epoch": 0.15498539337856154, + "grad_norm": 0.002377223689109087, + "learning_rate": 0.001, + "loss": 0.4146, + "step": 5617 + }, + { + "epoch": 0.15501298557962592, + "grad_norm": 0.002599177649244666, + "learning_rate": 0.001, + "loss": 0.4414, + "step": 5618 + }, + { + "epoch": 0.15504057778069028, + "grad_norm": 0.0029773900751024485, + "learning_rate": 0.001, + "loss": 0.4301, + "step": 5619 + }, + { + "epoch": 0.15506816998175466, + "grad_norm": 0.0042325593531131744, + "learning_rate": 0.001, + "loss": 0.4303, + "step": 5620 + }, + { + "epoch": 0.15509576218281904, + "grad_norm": 0.0032665403559803963, + "learning_rate": 0.001, + "loss": 0.4201, + "step": 5621 + }, + { + "epoch": 0.1551233543838834, + "grad_norm": 0.00355579168535769, + "learning_rate": 0.001, + "loss": 0.3707, + "step": 5622 + }, + { + "epoch": 0.15515094658494777, + "grad_norm": 0.0023576144594699144, + "learning_rate": 0.001, + "loss": 0.4181, + "step": 5623 + }, + { + "epoch": 0.15517853878601212, + "grad_norm": 0.0028574687894433737, + "learning_rate": 0.001, + "loss": 0.399, + "step": 5624 + }, + { + "epoch": 0.1552061309870765, + "grad_norm": 0.0038885304238647223, + "learning_rate": 0.001, + "loss": 0.4392, + "step": 5625 + }, + { + "epoch": 0.15523372318814088, + "grad_norm": 0.0036933154333382845, + "learning_rate": 0.001, + "loss": 0.375, + "step": 5626 + }, + { + "epoch": 0.15526131538920523, + "grad_norm": 0.0026938861701637506, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 5627 + }, + { + "epoch": 0.15528890759026961, + "grad_norm": 0.00399070093408227, + "learning_rate": 0.001, + "loss": 0.3792, + "step": 5628 + }, + { + "epoch": 0.15531649979133397, + "grad_norm": 0.004512227140367031, + "learning_rate": 0.001, + "loss": 0.4562, + "step": 5629 + }, + { + "epoch": 0.15534409199239835, + "grad_norm": 0.006776070687919855, + "learning_rate": 0.001, + "loss": 0.4011, + "step": 5630 + }, + { + "epoch": 0.15537168419346273, + "grad_norm": 0.006678242702037096, + "learning_rate": 0.001, + "loss": 0.4211, + "step": 5631 + }, + { + "epoch": 0.15539927639452708, + "grad_norm": 0.0036010188050568104, + "learning_rate": 0.001, + "loss": 0.4175, + "step": 5632 + }, + { + "epoch": 0.15542686859559146, + "grad_norm": 0.0073088183999061584, + "learning_rate": 0.001, + "loss": 0.3522, + "step": 5633 + }, + { + "epoch": 0.1554544607966558, + "grad_norm": 0.005029013846069574, + "learning_rate": 0.001, + "loss": 0.4045, + "step": 5634 + }, + { + "epoch": 0.1554820529977202, + "grad_norm": 0.002414390444755554, + "learning_rate": 0.001, + "loss": 0.4185, + "step": 5635 + }, + { + "epoch": 0.15550964519878457, + "grad_norm": 0.0036553817335516214, + "learning_rate": 0.001, + "loss": 0.3909, + "step": 5636 + }, + { + "epoch": 0.15553723739984893, + "grad_norm": 0.005032503046095371, + "learning_rate": 0.001, + "loss": 0.3812, + "step": 5637 + }, + { + "epoch": 0.1555648296009133, + "grad_norm": 0.0028513423167169094, + "learning_rate": 0.001, + "loss": 0.4276, + "step": 5638 + }, + { + "epoch": 0.15559242180197766, + "grad_norm": 0.004651295021176338, + "learning_rate": 0.001, + "loss": 0.4122, + "step": 5639 + }, + { + "epoch": 0.15562001400304204, + "grad_norm": 0.0026212476659566164, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 5640 + }, + { + "epoch": 0.15564760620410642, + "grad_norm": 0.0022126201074570417, + "learning_rate": 0.001, + "loss": 0.4105, + "step": 5641 + }, + { + "epoch": 0.15567519840517077, + "grad_norm": 0.002519601956009865, + "learning_rate": 0.001, + "loss": 0.3929, + "step": 5642 + }, + { + "epoch": 0.15570279060623515, + "grad_norm": 0.002760083880275488, + "learning_rate": 0.001, + "loss": 0.3819, + "step": 5643 + }, + { + "epoch": 0.1557303828072995, + "grad_norm": 0.003575262613594532, + "learning_rate": 0.001, + "loss": 0.401, + "step": 5644 + }, + { + "epoch": 0.15575797500836389, + "grad_norm": 0.0028452950064092875, + "learning_rate": 0.001, + "loss": 0.3747, + "step": 5645 + }, + { + "epoch": 0.15578556720942827, + "grad_norm": 0.0025837207213044167, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 5646 + }, + { + "epoch": 0.15581315941049262, + "grad_norm": 0.0023428683634847403, + "learning_rate": 0.001, + "loss": 0.4274, + "step": 5647 + }, + { + "epoch": 0.155840751611557, + "grad_norm": 0.0025679313112050295, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 5648 + }, + { + "epoch": 0.15586834381262135, + "grad_norm": 0.002796493237838149, + "learning_rate": 0.001, + "loss": 0.4153, + "step": 5649 + }, + { + "epoch": 0.15589593601368573, + "grad_norm": 0.002868711482733488, + "learning_rate": 0.001, + "loss": 0.3826, + "step": 5650 + }, + { + "epoch": 0.1559235282147501, + "grad_norm": 0.0034229152370244265, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 5651 + }, + { + "epoch": 0.15595112041581446, + "grad_norm": 0.0044869715347886086, + "learning_rate": 0.001, + "loss": 0.3901, + "step": 5652 + }, + { + "epoch": 0.15597871261687885, + "grad_norm": 0.0027872773353010416, + "learning_rate": 0.001, + "loss": 0.4223, + "step": 5653 + }, + { + "epoch": 0.1560063048179432, + "grad_norm": 0.007318846881389618, + "learning_rate": 0.001, + "loss": 0.4144, + "step": 5654 + }, + { + "epoch": 0.15603389701900758, + "grad_norm": 0.003985683433711529, + "learning_rate": 0.001, + "loss": 0.3814, + "step": 5655 + }, + { + "epoch": 0.15606148922007196, + "grad_norm": 0.002402941230684519, + "learning_rate": 0.001, + "loss": 0.4094, + "step": 5656 + }, + { + "epoch": 0.1560890814211363, + "grad_norm": 0.00238110963255167, + "learning_rate": 0.001, + "loss": 0.4056, + "step": 5657 + }, + { + "epoch": 0.1561166736222007, + "grad_norm": 0.00369979883544147, + "learning_rate": 0.001, + "loss": 0.3758, + "step": 5658 + }, + { + "epoch": 0.15614426582326504, + "grad_norm": 0.0022104033268988132, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 5659 + }, + { + "epoch": 0.15617185802432942, + "grad_norm": 0.0024638620670884848, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 5660 + }, + { + "epoch": 0.1561994502253938, + "grad_norm": 0.0023760111071169376, + "learning_rate": 0.001, + "loss": 0.4247, + "step": 5661 + }, + { + "epoch": 0.15622704242645816, + "grad_norm": 0.002687407424673438, + "learning_rate": 0.001, + "loss": 0.3877, + "step": 5662 + }, + { + "epoch": 0.15625463462752254, + "grad_norm": 0.002467637648805976, + "learning_rate": 0.001, + "loss": 0.4084, + "step": 5663 + }, + { + "epoch": 0.1562822268285869, + "grad_norm": 0.0020871292799711227, + "learning_rate": 0.001, + "loss": 0.3866, + "step": 5664 + }, + { + "epoch": 0.15630981902965127, + "grad_norm": 0.0048971157521009445, + "learning_rate": 0.001, + "loss": 0.4112, + "step": 5665 + }, + { + "epoch": 0.15633741123071565, + "grad_norm": 0.002223189687356353, + "learning_rate": 0.001, + "loss": 0.387, + "step": 5666 + }, + { + "epoch": 0.15636500343178, + "grad_norm": 0.0026094792410731316, + "learning_rate": 0.001, + "loss": 0.4407, + "step": 5667 + }, + { + "epoch": 0.15639259563284438, + "grad_norm": 0.004317654296755791, + "learning_rate": 0.001, + "loss": 0.4128, + "step": 5668 + }, + { + "epoch": 0.15642018783390874, + "grad_norm": 0.0028076712042093277, + "learning_rate": 0.001, + "loss": 0.3887, + "step": 5669 + }, + { + "epoch": 0.15644778003497312, + "grad_norm": 0.0033742813393473625, + "learning_rate": 0.001, + "loss": 0.4305, + "step": 5670 + }, + { + "epoch": 0.1564753722360375, + "grad_norm": 0.004012165125459433, + "learning_rate": 0.001, + "loss": 0.3531, + "step": 5671 + }, + { + "epoch": 0.15650296443710185, + "grad_norm": 0.0029937534127384424, + "learning_rate": 0.001, + "loss": 0.3737, + "step": 5672 + }, + { + "epoch": 0.15653055663816623, + "grad_norm": 0.004357090685516596, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 5673 + }, + { + "epoch": 0.15655814883923058, + "grad_norm": 0.006484336219727993, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 5674 + }, + { + "epoch": 0.15658574104029496, + "grad_norm": 0.0038871821016073227, + "learning_rate": 0.001, + "loss": 0.4232, + "step": 5675 + }, + { + "epoch": 0.15661333324135934, + "grad_norm": 0.008595773950219154, + "learning_rate": 0.001, + "loss": 0.3821, + "step": 5676 + }, + { + "epoch": 0.1566409254424237, + "grad_norm": 0.009347690269351006, + "learning_rate": 0.001, + "loss": 0.3448, + "step": 5677 + }, + { + "epoch": 0.15666851764348808, + "grad_norm": 0.002826757961884141, + "learning_rate": 0.001, + "loss": 0.3628, + "step": 5678 + }, + { + "epoch": 0.15669610984455243, + "grad_norm": 0.003515382297337055, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 5679 + }, + { + "epoch": 0.1567237020456168, + "grad_norm": 0.0032721932511776686, + "learning_rate": 0.001, + "loss": 0.4253, + "step": 5680 + }, + { + "epoch": 0.1567512942466812, + "grad_norm": 0.003463307162746787, + "learning_rate": 0.001, + "loss": 0.384, + "step": 5681 + }, + { + "epoch": 0.15677888644774554, + "grad_norm": 0.003177905920892954, + "learning_rate": 0.001, + "loss": 0.3769, + "step": 5682 + }, + { + "epoch": 0.15680647864880992, + "grad_norm": 0.003270684042945504, + "learning_rate": 0.001, + "loss": 0.4242, + "step": 5683 + }, + { + "epoch": 0.15683407084987427, + "grad_norm": 0.0029053555335849524, + "learning_rate": 0.001, + "loss": 0.4024, + "step": 5684 + }, + { + "epoch": 0.15686166305093865, + "grad_norm": 0.004513274412602186, + "learning_rate": 0.001, + "loss": 0.3607, + "step": 5685 + }, + { + "epoch": 0.15688925525200303, + "grad_norm": 0.012942062690854073, + "learning_rate": 0.001, + "loss": 0.408, + "step": 5686 + }, + { + "epoch": 0.1569168474530674, + "grad_norm": 0.004073042422533035, + "learning_rate": 0.001, + "loss": 0.3747, + "step": 5687 + }, + { + "epoch": 0.15694443965413177, + "grad_norm": 0.0024588184896856546, + "learning_rate": 0.001, + "loss": 0.4046, + "step": 5688 + }, + { + "epoch": 0.15697203185519612, + "grad_norm": 0.002686630468815565, + "learning_rate": 0.001, + "loss": 0.4188, + "step": 5689 + }, + { + "epoch": 0.1569996240562605, + "grad_norm": 0.0031974229495972395, + "learning_rate": 0.001, + "loss": 0.3942, + "step": 5690 + }, + { + "epoch": 0.15702721625732488, + "grad_norm": 0.0040585664100945, + "learning_rate": 0.001, + "loss": 0.3606, + "step": 5691 + }, + { + "epoch": 0.15705480845838923, + "grad_norm": 0.005035550333559513, + "learning_rate": 0.001, + "loss": 0.4055, + "step": 5692 + }, + { + "epoch": 0.1570824006594536, + "grad_norm": 0.005043357145041227, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 5693 + }, + { + "epoch": 0.15710999286051797, + "grad_norm": 0.002406257903203368, + "learning_rate": 0.001, + "loss": 0.3943, + "step": 5694 + }, + { + "epoch": 0.15713758506158235, + "grad_norm": 0.002501264214515686, + "learning_rate": 0.001, + "loss": 0.395, + "step": 5695 + }, + { + "epoch": 0.1571651772626467, + "grad_norm": 0.003250819630920887, + "learning_rate": 0.001, + "loss": 0.3747, + "step": 5696 + }, + { + "epoch": 0.15719276946371108, + "grad_norm": 0.0020628594793379307, + "learning_rate": 0.001, + "loss": 0.3828, + "step": 5697 + }, + { + "epoch": 0.15722036166477546, + "grad_norm": 0.002165653510019183, + "learning_rate": 0.001, + "loss": 0.3879, + "step": 5698 + }, + { + "epoch": 0.1572479538658398, + "grad_norm": 0.003081389470025897, + "learning_rate": 0.001, + "loss": 0.4097, + "step": 5699 + }, + { + "epoch": 0.1572755460669042, + "grad_norm": 0.0035273549146950245, + "learning_rate": 0.001, + "loss": 0.4187, + "step": 5700 + }, + { + "epoch": 0.15730313826796855, + "grad_norm": 0.0023170190397650003, + "learning_rate": 0.001, + "loss": 0.4429, + "step": 5701 + }, + { + "epoch": 0.15733073046903293, + "grad_norm": 0.0033377818763256073, + "learning_rate": 0.001, + "loss": 0.4085, + "step": 5702 + }, + { + "epoch": 0.1573583226700973, + "grad_norm": 0.0030271231662482023, + "learning_rate": 0.001, + "loss": 0.3799, + "step": 5703 + }, + { + "epoch": 0.15738591487116166, + "grad_norm": 0.002670741407200694, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 5704 + }, + { + "epoch": 0.15741350707222604, + "grad_norm": 0.0025286702439188957, + "learning_rate": 0.001, + "loss": 0.4247, + "step": 5705 + }, + { + "epoch": 0.1574410992732904, + "grad_norm": 0.002894239965826273, + "learning_rate": 0.001, + "loss": 0.4041, + "step": 5706 + }, + { + "epoch": 0.15746869147435477, + "grad_norm": 0.0030300067737698555, + "learning_rate": 0.001, + "loss": 0.4297, + "step": 5707 + }, + { + "epoch": 0.15749628367541915, + "grad_norm": 0.0026801263447850943, + "learning_rate": 0.001, + "loss": 0.3788, + "step": 5708 + }, + { + "epoch": 0.1575238758764835, + "grad_norm": 0.0029794182628393173, + "learning_rate": 0.001, + "loss": 0.3889, + "step": 5709 + }, + { + "epoch": 0.15755146807754788, + "grad_norm": 0.0026555589865893126, + "learning_rate": 0.001, + "loss": 0.4181, + "step": 5710 + }, + { + "epoch": 0.15757906027861224, + "grad_norm": 0.0025151160079985857, + "learning_rate": 0.001, + "loss": 0.3574, + "step": 5711 + }, + { + "epoch": 0.15760665247967662, + "grad_norm": 0.0026525617577135563, + "learning_rate": 0.001, + "loss": 0.3906, + "step": 5712 + }, + { + "epoch": 0.157634244680741, + "grad_norm": 0.002047165296971798, + "learning_rate": 0.001, + "loss": 0.4115, + "step": 5713 + }, + { + "epoch": 0.15766183688180535, + "grad_norm": 0.00422664824873209, + "learning_rate": 0.001, + "loss": 0.3657, + "step": 5714 + }, + { + "epoch": 0.15768942908286973, + "grad_norm": 0.005134572274982929, + "learning_rate": 0.001, + "loss": 0.3954, + "step": 5715 + }, + { + "epoch": 0.15771702128393408, + "grad_norm": 0.004460239317268133, + "learning_rate": 0.001, + "loss": 0.4211, + "step": 5716 + }, + { + "epoch": 0.15774461348499846, + "grad_norm": 0.0029927766881883144, + "learning_rate": 0.001, + "loss": 0.3724, + "step": 5717 + }, + { + "epoch": 0.15777220568606284, + "grad_norm": 0.005971200298517942, + "learning_rate": 0.001, + "loss": 0.3673, + "step": 5718 + }, + { + "epoch": 0.1577997978871272, + "grad_norm": 0.0025469562970101833, + "learning_rate": 0.001, + "loss": 0.381, + "step": 5719 + }, + { + "epoch": 0.15782739008819158, + "grad_norm": 0.0036222515627741814, + "learning_rate": 0.001, + "loss": 0.3634, + "step": 5720 + }, + { + "epoch": 0.15785498228925593, + "grad_norm": 0.002309228293597698, + "learning_rate": 0.001, + "loss": 0.399, + "step": 5721 + }, + { + "epoch": 0.1578825744903203, + "grad_norm": 0.006437615491449833, + "learning_rate": 0.001, + "loss": 0.3823, + "step": 5722 + }, + { + "epoch": 0.1579101666913847, + "grad_norm": 0.0027779482770711184, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 5723 + }, + { + "epoch": 0.15793775889244904, + "grad_norm": 0.006843385752290487, + "learning_rate": 0.001, + "loss": 0.4139, + "step": 5724 + }, + { + "epoch": 0.15796535109351342, + "grad_norm": 0.005818149074912071, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 5725 + }, + { + "epoch": 0.15799294329457778, + "grad_norm": 0.004674671217799187, + "learning_rate": 0.001, + "loss": 0.4334, + "step": 5726 + }, + { + "epoch": 0.15802053549564216, + "grad_norm": 0.0025879840832203627, + "learning_rate": 0.001, + "loss": 0.4182, + "step": 5727 + }, + { + "epoch": 0.15804812769670654, + "grad_norm": 0.0030769093427807093, + "learning_rate": 0.001, + "loss": 0.4101, + "step": 5728 + }, + { + "epoch": 0.1580757198977709, + "grad_norm": 0.0031094071455299854, + "learning_rate": 0.001, + "loss": 0.4282, + "step": 5729 + }, + { + "epoch": 0.15810331209883527, + "grad_norm": 0.0025110999122262, + "learning_rate": 0.001, + "loss": 0.4245, + "step": 5730 + }, + { + "epoch": 0.15813090429989962, + "grad_norm": 0.003311107400804758, + "learning_rate": 0.001, + "loss": 0.4133, + "step": 5731 + }, + { + "epoch": 0.158158496500964, + "grad_norm": 0.003340107621625066, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 5732 + }, + { + "epoch": 0.15818608870202838, + "grad_norm": 0.004575404338538647, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 5733 + }, + { + "epoch": 0.15821368090309273, + "grad_norm": 0.0022902588825672865, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 5734 + }, + { + "epoch": 0.15824127310415712, + "grad_norm": 0.0030541005544364452, + "learning_rate": 0.001, + "loss": 0.4236, + "step": 5735 + }, + { + "epoch": 0.15826886530522147, + "grad_norm": 0.002666509710252285, + "learning_rate": 0.001, + "loss": 0.399, + "step": 5736 + }, + { + "epoch": 0.15829645750628585, + "grad_norm": 0.0029707769863307476, + "learning_rate": 0.001, + "loss": 0.4407, + "step": 5737 + }, + { + "epoch": 0.15832404970735023, + "grad_norm": 0.0034186067059636116, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 5738 + }, + { + "epoch": 0.15835164190841458, + "grad_norm": 0.002819865709170699, + "learning_rate": 0.001, + "loss": 0.3952, + "step": 5739 + }, + { + "epoch": 0.15837923410947896, + "grad_norm": 0.004903051070868969, + "learning_rate": 0.001, + "loss": 0.3917, + "step": 5740 + }, + { + "epoch": 0.1584068263105433, + "grad_norm": 0.0036358917132019997, + "learning_rate": 0.001, + "loss": 0.4009, + "step": 5741 + }, + { + "epoch": 0.1584344185116077, + "grad_norm": 0.004073919262737036, + "learning_rate": 0.001, + "loss": 0.4096, + "step": 5742 + }, + { + "epoch": 0.15846201071267207, + "grad_norm": 0.0033164892811328173, + "learning_rate": 0.001, + "loss": 0.431, + "step": 5743 + }, + { + "epoch": 0.15848960291373643, + "grad_norm": 0.008852209895849228, + "learning_rate": 0.001, + "loss": 0.4249, + "step": 5744 + }, + { + "epoch": 0.1585171951148008, + "grad_norm": 0.004943115636706352, + "learning_rate": 0.001, + "loss": 0.3866, + "step": 5745 + }, + { + "epoch": 0.15854478731586516, + "grad_norm": 0.0025044376961886883, + "learning_rate": 0.001, + "loss": 0.4301, + "step": 5746 + }, + { + "epoch": 0.15857237951692954, + "grad_norm": 0.0032009256538003683, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 5747 + }, + { + "epoch": 0.15859997171799392, + "grad_norm": 0.002845719689503312, + "learning_rate": 0.001, + "loss": 0.4215, + "step": 5748 + }, + { + "epoch": 0.15862756391905827, + "grad_norm": 0.00325995241291821, + "learning_rate": 0.001, + "loss": 0.408, + "step": 5749 + }, + { + "epoch": 0.15865515612012265, + "grad_norm": 0.0026871482841670513, + "learning_rate": 0.001, + "loss": 0.4205, + "step": 5750 + }, + { + "epoch": 0.158682748321187, + "grad_norm": 0.0027641262859106064, + "learning_rate": 0.001, + "loss": 0.3618, + "step": 5751 + }, + { + "epoch": 0.1587103405222514, + "grad_norm": 0.003129362827166915, + "learning_rate": 0.001, + "loss": 0.4006, + "step": 5752 + }, + { + "epoch": 0.15873793272331577, + "grad_norm": 0.0031827734783291817, + "learning_rate": 0.001, + "loss": 0.3633, + "step": 5753 + }, + { + "epoch": 0.15876552492438012, + "grad_norm": 0.0046300506219267845, + "learning_rate": 0.001, + "loss": 0.3883, + "step": 5754 + }, + { + "epoch": 0.1587931171254445, + "grad_norm": 0.00367173389531672, + "learning_rate": 0.001, + "loss": 0.4506, + "step": 5755 + }, + { + "epoch": 0.15882070932650885, + "grad_norm": 0.0031661444809287786, + "learning_rate": 0.001, + "loss": 0.4077, + "step": 5756 + }, + { + "epoch": 0.15884830152757323, + "grad_norm": 0.002517246874049306, + "learning_rate": 0.001, + "loss": 0.366, + "step": 5757 + }, + { + "epoch": 0.1588758937286376, + "grad_norm": 0.0026858425699174404, + "learning_rate": 0.001, + "loss": 0.4244, + "step": 5758 + }, + { + "epoch": 0.15890348592970197, + "grad_norm": 0.0026828250847756863, + "learning_rate": 0.001, + "loss": 0.4196, + "step": 5759 + }, + { + "epoch": 0.15893107813076635, + "grad_norm": 0.002697891555726528, + "learning_rate": 0.001, + "loss": 0.389, + "step": 5760 + }, + { + "epoch": 0.1589586703318307, + "grad_norm": 0.004389958921819925, + "learning_rate": 0.001, + "loss": 0.4066, + "step": 5761 + }, + { + "epoch": 0.15898626253289508, + "grad_norm": 0.003987609874457121, + "learning_rate": 0.001, + "loss": 0.396, + "step": 5762 + }, + { + "epoch": 0.15901385473395946, + "grad_norm": 0.003649795660749078, + "learning_rate": 0.001, + "loss": 0.3842, + "step": 5763 + }, + { + "epoch": 0.1590414469350238, + "grad_norm": 0.0034654231276363134, + "learning_rate": 0.001, + "loss": 0.409, + "step": 5764 + }, + { + "epoch": 0.1590690391360882, + "grad_norm": 0.005718625150620937, + "learning_rate": 0.001, + "loss": 0.405, + "step": 5765 + }, + { + "epoch": 0.15909663133715254, + "grad_norm": 0.0025557433255016804, + "learning_rate": 0.001, + "loss": 0.4179, + "step": 5766 + }, + { + "epoch": 0.15912422353821692, + "grad_norm": 0.002460428746417165, + "learning_rate": 0.001, + "loss": 0.3853, + "step": 5767 + }, + { + "epoch": 0.1591518157392813, + "grad_norm": 0.007104699965566397, + "learning_rate": 0.001, + "loss": 0.3963, + "step": 5768 + }, + { + "epoch": 0.15917940794034566, + "grad_norm": 0.006836418528109789, + "learning_rate": 0.001, + "loss": 0.3664, + "step": 5769 + }, + { + "epoch": 0.15920700014141004, + "grad_norm": 0.0034382655285298824, + "learning_rate": 0.001, + "loss": 0.4309, + "step": 5770 + }, + { + "epoch": 0.1592345923424744, + "grad_norm": 0.0040556564927101135, + "learning_rate": 0.001, + "loss": 0.3581, + "step": 5771 + }, + { + "epoch": 0.15926218454353877, + "grad_norm": 0.0038001069333404303, + "learning_rate": 0.001, + "loss": 0.3918, + "step": 5772 + }, + { + "epoch": 0.15928977674460315, + "grad_norm": 0.004240202251821756, + "learning_rate": 0.001, + "loss": 0.4036, + "step": 5773 + }, + { + "epoch": 0.1593173689456675, + "grad_norm": 0.004947451408952475, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 5774 + }, + { + "epoch": 0.15934496114673188, + "grad_norm": 0.002301145112141967, + "learning_rate": 0.001, + "loss": 0.4065, + "step": 5775 + }, + { + "epoch": 0.15937255334779624, + "grad_norm": 0.003091468010097742, + "learning_rate": 0.001, + "loss": 0.4057, + "step": 5776 + }, + { + "epoch": 0.15940014554886062, + "grad_norm": 0.004508181009441614, + "learning_rate": 0.001, + "loss": 0.3781, + "step": 5777 + }, + { + "epoch": 0.159427737749925, + "grad_norm": 0.002182744676247239, + "learning_rate": 0.001, + "loss": 0.4129, + "step": 5778 + }, + { + "epoch": 0.15945532995098935, + "grad_norm": 0.00251766131259501, + "learning_rate": 0.001, + "loss": 0.4302, + "step": 5779 + }, + { + "epoch": 0.15948292215205373, + "grad_norm": 0.002740236232057214, + "learning_rate": 0.001, + "loss": 0.4014, + "step": 5780 + }, + { + "epoch": 0.15951051435311808, + "grad_norm": 0.005189212504774332, + "learning_rate": 0.001, + "loss": 0.4264, + "step": 5781 + }, + { + "epoch": 0.15953810655418246, + "grad_norm": 0.002994902664795518, + "learning_rate": 0.001, + "loss": 0.3908, + "step": 5782 + }, + { + "epoch": 0.15956569875524684, + "grad_norm": 0.003750283271074295, + "learning_rate": 0.001, + "loss": 0.4208, + "step": 5783 + }, + { + "epoch": 0.1595932909563112, + "grad_norm": 0.003004671772941947, + "learning_rate": 0.001, + "loss": 0.4253, + "step": 5784 + }, + { + "epoch": 0.15962088315737558, + "grad_norm": 0.004584962036460638, + "learning_rate": 0.001, + "loss": 0.4165, + "step": 5785 + }, + { + "epoch": 0.15964847535843993, + "grad_norm": 0.005403813440352678, + "learning_rate": 0.001, + "loss": 0.3689, + "step": 5786 + }, + { + "epoch": 0.1596760675595043, + "grad_norm": 0.0032318695448338985, + "learning_rate": 0.001, + "loss": 0.4165, + "step": 5787 + }, + { + "epoch": 0.15970365976056866, + "grad_norm": 0.002624890301376581, + "learning_rate": 0.001, + "loss": 0.369, + "step": 5788 + }, + { + "epoch": 0.15973125196163304, + "grad_norm": 0.004142098128795624, + "learning_rate": 0.001, + "loss": 0.3614, + "step": 5789 + }, + { + "epoch": 0.15975884416269742, + "grad_norm": 0.002787158126011491, + "learning_rate": 0.001, + "loss": 0.3912, + "step": 5790 + }, + { + "epoch": 0.15978643636376177, + "grad_norm": 0.004009711556136608, + "learning_rate": 0.001, + "loss": 0.4172, + "step": 5791 + }, + { + "epoch": 0.15981402856482615, + "grad_norm": 0.002381292637437582, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 5792 + }, + { + "epoch": 0.1598416207658905, + "grad_norm": 0.0021126163192093372, + "learning_rate": 0.001, + "loss": 0.4088, + "step": 5793 + }, + { + "epoch": 0.1598692129669549, + "grad_norm": 0.0049142311327159405, + "learning_rate": 0.001, + "loss": 0.3782, + "step": 5794 + }, + { + "epoch": 0.15989680516801927, + "grad_norm": 0.0029681632295250893, + "learning_rate": 0.001, + "loss": 0.3565, + "step": 5795 + }, + { + "epoch": 0.15992439736908362, + "grad_norm": 0.002932732691988349, + "learning_rate": 0.001, + "loss": 0.4096, + "step": 5796 + }, + { + "epoch": 0.159951989570148, + "grad_norm": 0.002819716464728117, + "learning_rate": 0.001, + "loss": 0.3889, + "step": 5797 + }, + { + "epoch": 0.15997958177121235, + "grad_norm": 0.0028946103993803263, + "learning_rate": 0.001, + "loss": 0.395, + "step": 5798 + }, + { + "epoch": 0.16000717397227673, + "grad_norm": 0.0024298951029777527, + "learning_rate": 0.001, + "loss": 0.3859, + "step": 5799 + }, + { + "epoch": 0.16003476617334111, + "grad_norm": 0.002949990564957261, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 5800 + }, + { + "epoch": 0.16006235837440547, + "grad_norm": 0.003484488232061267, + "learning_rate": 0.001, + "loss": 0.3983, + "step": 5801 + }, + { + "epoch": 0.16008995057546985, + "grad_norm": 0.004663324449211359, + "learning_rate": 0.001, + "loss": 0.4009, + "step": 5802 + }, + { + "epoch": 0.1601175427765342, + "grad_norm": 0.003363480092957616, + "learning_rate": 0.001, + "loss": 0.4289, + "step": 5803 + }, + { + "epoch": 0.16014513497759858, + "grad_norm": 0.0023565501905977726, + "learning_rate": 0.001, + "loss": 0.3825, + "step": 5804 + }, + { + "epoch": 0.16017272717866296, + "grad_norm": 0.002411758527159691, + "learning_rate": 0.001, + "loss": 0.4388, + "step": 5805 + }, + { + "epoch": 0.1602003193797273, + "grad_norm": 0.0026753805577754974, + "learning_rate": 0.001, + "loss": 0.4267, + "step": 5806 + }, + { + "epoch": 0.1602279115807917, + "grad_norm": 0.003161199390888214, + "learning_rate": 0.001, + "loss": 0.3725, + "step": 5807 + }, + { + "epoch": 0.16025550378185605, + "grad_norm": 0.0042272647842764854, + "learning_rate": 0.001, + "loss": 0.3912, + "step": 5808 + }, + { + "epoch": 0.16028309598292043, + "grad_norm": 0.0023989747278392315, + "learning_rate": 0.001, + "loss": 0.3888, + "step": 5809 + }, + { + "epoch": 0.1603106881839848, + "grad_norm": 0.003144536865875125, + "learning_rate": 0.001, + "loss": 0.4189, + "step": 5810 + }, + { + "epoch": 0.16033828038504916, + "grad_norm": 0.002298231702297926, + "learning_rate": 0.001, + "loss": 0.3817, + "step": 5811 + }, + { + "epoch": 0.16036587258611354, + "grad_norm": 0.005583870690315962, + "learning_rate": 0.001, + "loss": 0.4163, + "step": 5812 + }, + { + "epoch": 0.1603934647871779, + "grad_norm": 0.002705441555008292, + "learning_rate": 0.001, + "loss": 0.4142, + "step": 5813 + }, + { + "epoch": 0.16042105698824227, + "grad_norm": 0.0025329829659312963, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 5814 + }, + { + "epoch": 0.16044864918930665, + "grad_norm": 0.00348359951749444, + "learning_rate": 0.001, + "loss": 0.4062, + "step": 5815 + }, + { + "epoch": 0.160476241390371, + "grad_norm": 0.005613440182060003, + "learning_rate": 0.001, + "loss": 0.3973, + "step": 5816 + }, + { + "epoch": 0.16050383359143539, + "grad_norm": 0.005014996509999037, + "learning_rate": 0.001, + "loss": 0.3909, + "step": 5817 + }, + { + "epoch": 0.16053142579249974, + "grad_norm": 0.003465597052127123, + "learning_rate": 0.001, + "loss": 0.4292, + "step": 5818 + }, + { + "epoch": 0.16055901799356412, + "grad_norm": 0.003062979318201542, + "learning_rate": 0.001, + "loss": 0.3688, + "step": 5819 + }, + { + "epoch": 0.1605866101946285, + "grad_norm": 0.003151744371280074, + "learning_rate": 0.001, + "loss": 0.3744, + "step": 5820 + }, + { + "epoch": 0.16061420239569285, + "grad_norm": 0.0023994911462068558, + "learning_rate": 0.001, + "loss": 0.4317, + "step": 5821 + }, + { + "epoch": 0.16064179459675723, + "grad_norm": 0.002751079387962818, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 5822 + }, + { + "epoch": 0.16066938679782158, + "grad_norm": 0.0021729813888669014, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 5823 + }, + { + "epoch": 0.16069697899888596, + "grad_norm": 0.0027657474856823683, + "learning_rate": 0.001, + "loss": 0.4277, + "step": 5824 + }, + { + "epoch": 0.16072457119995034, + "grad_norm": 0.0026696647983044386, + "learning_rate": 0.001, + "loss": 0.3872, + "step": 5825 + }, + { + "epoch": 0.1607521634010147, + "grad_norm": 0.0028585640247911215, + "learning_rate": 0.001, + "loss": 0.3727, + "step": 5826 + }, + { + "epoch": 0.16077975560207908, + "grad_norm": 0.003272457979619503, + "learning_rate": 0.001, + "loss": 0.395, + "step": 5827 + }, + { + "epoch": 0.16080734780314343, + "grad_norm": 0.004849962890148163, + "learning_rate": 0.001, + "loss": 0.3662, + "step": 5828 + }, + { + "epoch": 0.1608349400042078, + "grad_norm": 0.0022566032130271196, + "learning_rate": 0.001, + "loss": 0.4437, + "step": 5829 + }, + { + "epoch": 0.1608625322052722, + "grad_norm": 0.0026730254758149385, + "learning_rate": 0.001, + "loss": 0.3895, + "step": 5830 + }, + { + "epoch": 0.16089012440633654, + "grad_norm": 0.005255574360489845, + "learning_rate": 0.001, + "loss": 0.4395, + "step": 5831 + }, + { + "epoch": 0.16091771660740092, + "grad_norm": 0.0031837450806051493, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 5832 + }, + { + "epoch": 0.16094530880846528, + "grad_norm": 0.0028844368644058704, + "learning_rate": 0.001, + "loss": 0.4413, + "step": 5833 + }, + { + "epoch": 0.16097290100952966, + "grad_norm": 0.0036064505111426115, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 5834 + }, + { + "epoch": 0.16100049321059404, + "grad_norm": 0.0031712886411696672, + "learning_rate": 0.001, + "loss": 0.3584, + "step": 5835 + }, + { + "epoch": 0.1610280854116584, + "grad_norm": 0.0029163220897316933, + "learning_rate": 0.001, + "loss": 0.4165, + "step": 5836 + }, + { + "epoch": 0.16105567761272277, + "grad_norm": 0.002263015601783991, + "learning_rate": 0.001, + "loss": 0.3831, + "step": 5837 + }, + { + "epoch": 0.16108326981378712, + "grad_norm": 0.0027008019387722015, + "learning_rate": 0.001, + "loss": 0.4, + "step": 5838 + }, + { + "epoch": 0.1611108620148515, + "grad_norm": 0.002291950862854719, + "learning_rate": 0.001, + "loss": 0.384, + "step": 5839 + }, + { + "epoch": 0.16113845421591588, + "grad_norm": 0.003169293748214841, + "learning_rate": 0.001, + "loss": 0.3912, + "step": 5840 + }, + { + "epoch": 0.16116604641698024, + "grad_norm": 0.0032045808620750904, + "learning_rate": 0.001, + "loss": 0.4201, + "step": 5841 + }, + { + "epoch": 0.16119363861804462, + "grad_norm": 0.00491705909371376, + "learning_rate": 0.001, + "loss": 0.3684, + "step": 5842 + }, + { + "epoch": 0.16122123081910897, + "grad_norm": 0.007085980381816626, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 5843 + }, + { + "epoch": 0.16124882302017335, + "grad_norm": 0.0026187379844486713, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 5844 + }, + { + "epoch": 0.16127641522123773, + "grad_norm": 0.0021291703451424837, + "learning_rate": 0.001, + "loss": 0.4321, + "step": 5845 + }, + { + "epoch": 0.16130400742230208, + "grad_norm": 0.0025390980299562216, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 5846 + }, + { + "epoch": 0.16133159962336646, + "grad_norm": 0.0024054215755313635, + "learning_rate": 0.001, + "loss": 0.3825, + "step": 5847 + }, + { + "epoch": 0.16135919182443081, + "grad_norm": 0.0030334896873682737, + "learning_rate": 0.001, + "loss": 0.3689, + "step": 5848 + }, + { + "epoch": 0.1613867840254952, + "grad_norm": 0.004912371281534433, + "learning_rate": 0.001, + "loss": 0.3735, + "step": 5849 + }, + { + "epoch": 0.16141437622655957, + "grad_norm": 0.00569628132507205, + "learning_rate": 0.001, + "loss": 0.4294, + "step": 5850 + }, + { + "epoch": 0.16144196842762393, + "grad_norm": 0.01722198724746704, + "learning_rate": 0.001, + "loss": 0.3729, + "step": 5851 + }, + { + "epoch": 0.1614695606286883, + "grad_norm": 0.00705372542142868, + "learning_rate": 0.001, + "loss": 0.3854, + "step": 5852 + }, + { + "epoch": 0.16149715282975266, + "grad_norm": 0.0025785481557250023, + "learning_rate": 0.001, + "loss": 0.4384, + "step": 5853 + }, + { + "epoch": 0.16152474503081704, + "grad_norm": 0.0024269360583275557, + "learning_rate": 0.001, + "loss": 0.3897, + "step": 5854 + }, + { + "epoch": 0.16155233723188142, + "grad_norm": 0.0034063782077282667, + "learning_rate": 0.001, + "loss": 0.4463, + "step": 5855 + }, + { + "epoch": 0.16157992943294577, + "grad_norm": 0.0034151726868003607, + "learning_rate": 0.001, + "loss": 0.4093, + "step": 5856 + }, + { + "epoch": 0.16160752163401015, + "grad_norm": 0.0038089072331786156, + "learning_rate": 0.001, + "loss": 0.386, + "step": 5857 + }, + { + "epoch": 0.1616351138350745, + "grad_norm": 0.0021995375864207745, + "learning_rate": 0.001, + "loss": 0.42, + "step": 5858 + }, + { + "epoch": 0.1616627060361389, + "grad_norm": 0.004200818948447704, + "learning_rate": 0.001, + "loss": 0.4232, + "step": 5859 + }, + { + "epoch": 0.16169029823720327, + "grad_norm": 0.002936551347374916, + "learning_rate": 0.001, + "loss": 0.3806, + "step": 5860 + }, + { + "epoch": 0.16171789043826762, + "grad_norm": 0.0025540448259562254, + "learning_rate": 0.001, + "loss": 0.4272, + "step": 5861 + }, + { + "epoch": 0.161745482639332, + "grad_norm": 0.0026053600013256073, + "learning_rate": 0.001, + "loss": 0.3827, + "step": 5862 + }, + { + "epoch": 0.16177307484039635, + "grad_norm": 0.0023329800460487604, + "learning_rate": 0.001, + "loss": 0.3666, + "step": 5863 + }, + { + "epoch": 0.16180066704146073, + "grad_norm": 0.004357020370662212, + "learning_rate": 0.001, + "loss": 0.3969, + "step": 5864 + }, + { + "epoch": 0.1618282592425251, + "grad_norm": 0.0025468000676482916, + "learning_rate": 0.001, + "loss": 0.4233, + "step": 5865 + }, + { + "epoch": 0.16185585144358947, + "grad_norm": 0.004209857899695635, + "learning_rate": 0.001, + "loss": 0.362, + "step": 5866 + }, + { + "epoch": 0.16188344364465385, + "grad_norm": 0.005030781961977482, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 5867 + }, + { + "epoch": 0.1619110358457182, + "grad_norm": 0.0026541538536548615, + "learning_rate": 0.001, + "loss": 0.4342, + "step": 5868 + }, + { + "epoch": 0.16193862804678258, + "grad_norm": 0.0038330042734742165, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 5869 + }, + { + "epoch": 0.16196622024784696, + "grad_norm": 0.004821475129574537, + "learning_rate": 0.001, + "loss": 0.413, + "step": 5870 + }, + { + "epoch": 0.1619938124489113, + "grad_norm": 0.0028450365643948317, + "learning_rate": 0.001, + "loss": 0.3942, + "step": 5871 + }, + { + "epoch": 0.1620214046499757, + "grad_norm": 0.0027043556328862906, + "learning_rate": 0.001, + "loss": 0.4009, + "step": 5872 + }, + { + "epoch": 0.16204899685104004, + "grad_norm": 0.0035783706698566675, + "learning_rate": 0.001, + "loss": 0.3574, + "step": 5873 + }, + { + "epoch": 0.16207658905210443, + "grad_norm": 0.0049888514913618565, + "learning_rate": 0.001, + "loss": 0.3756, + "step": 5874 + }, + { + "epoch": 0.1621041812531688, + "grad_norm": 0.0031105815432965755, + "learning_rate": 0.001, + "loss": 0.4, + "step": 5875 + }, + { + "epoch": 0.16213177345423316, + "grad_norm": 0.0034052070695906878, + "learning_rate": 0.001, + "loss": 0.4151, + "step": 5876 + }, + { + "epoch": 0.16215936565529754, + "grad_norm": 0.009355590678751469, + "learning_rate": 0.001, + "loss": 0.3995, + "step": 5877 + }, + { + "epoch": 0.1621869578563619, + "grad_norm": 0.0034844528418034315, + "learning_rate": 0.001, + "loss": 0.4489, + "step": 5878 + }, + { + "epoch": 0.16221455005742627, + "grad_norm": 0.0033237759489566088, + "learning_rate": 0.001, + "loss": 0.4136, + "step": 5879 + }, + { + "epoch": 0.16224214225849065, + "grad_norm": 0.0030154725536704063, + "learning_rate": 0.001, + "loss": 0.3679, + "step": 5880 + }, + { + "epoch": 0.162269734459555, + "grad_norm": 0.00340478727594018, + "learning_rate": 0.001, + "loss": 0.3674, + "step": 5881 + }, + { + "epoch": 0.16229732666061938, + "grad_norm": 0.003332821885123849, + "learning_rate": 0.001, + "loss": 0.3668, + "step": 5882 + }, + { + "epoch": 0.16232491886168374, + "grad_norm": 0.00302005629055202, + "learning_rate": 0.001, + "loss": 0.4317, + "step": 5883 + }, + { + "epoch": 0.16235251106274812, + "grad_norm": 0.0027085014153271914, + "learning_rate": 0.001, + "loss": 0.379, + "step": 5884 + }, + { + "epoch": 0.16238010326381247, + "grad_norm": 0.0031871707178652287, + "learning_rate": 0.001, + "loss": 0.3755, + "step": 5885 + }, + { + "epoch": 0.16240769546487685, + "grad_norm": 0.0026585953310132027, + "learning_rate": 0.001, + "loss": 0.3933, + "step": 5886 + }, + { + "epoch": 0.16243528766594123, + "grad_norm": 0.0022077590692788363, + "learning_rate": 0.001, + "loss": 0.3791, + "step": 5887 + }, + { + "epoch": 0.16246287986700558, + "grad_norm": 0.0099159125238657, + "learning_rate": 0.001, + "loss": 0.4363, + "step": 5888 + }, + { + "epoch": 0.16249047206806996, + "grad_norm": 0.004561326466500759, + "learning_rate": 0.001, + "loss": 0.4033, + "step": 5889 + }, + { + "epoch": 0.16251806426913432, + "grad_norm": 0.004134157672524452, + "learning_rate": 0.001, + "loss": 0.4241, + "step": 5890 + }, + { + "epoch": 0.1625456564701987, + "grad_norm": 0.0028744975570589304, + "learning_rate": 0.001, + "loss": 0.4235, + "step": 5891 + }, + { + "epoch": 0.16257324867126308, + "grad_norm": 0.0063214050605893135, + "learning_rate": 0.001, + "loss": 0.4302, + "step": 5892 + }, + { + "epoch": 0.16260084087232743, + "grad_norm": 0.0033373129554092884, + "learning_rate": 0.001, + "loss": 0.4211, + "step": 5893 + }, + { + "epoch": 0.1626284330733918, + "grad_norm": 0.007856364361941814, + "learning_rate": 0.001, + "loss": 0.3508, + "step": 5894 + }, + { + "epoch": 0.16265602527445616, + "grad_norm": 0.005610327236354351, + "learning_rate": 0.001, + "loss": 0.4022, + "step": 5895 + }, + { + "epoch": 0.16268361747552054, + "grad_norm": 0.0030173116829246283, + "learning_rate": 0.001, + "loss": 0.4308, + "step": 5896 + }, + { + "epoch": 0.16271120967658492, + "grad_norm": 0.007099445443600416, + "learning_rate": 0.001, + "loss": 0.4093, + "step": 5897 + }, + { + "epoch": 0.16273880187764928, + "grad_norm": 0.0066776215098798275, + "learning_rate": 0.001, + "loss": 0.425, + "step": 5898 + }, + { + "epoch": 0.16276639407871366, + "grad_norm": 0.0048042964190244675, + "learning_rate": 0.001, + "loss": 0.4283, + "step": 5899 + }, + { + "epoch": 0.162793986279778, + "grad_norm": 0.013856537640094757, + "learning_rate": 0.001, + "loss": 0.3666, + "step": 5900 + }, + { + "epoch": 0.1628215784808424, + "grad_norm": 0.00262752384878695, + "learning_rate": 0.001, + "loss": 0.4292, + "step": 5901 + }, + { + "epoch": 0.16284917068190677, + "grad_norm": 0.004112104419618845, + "learning_rate": 0.001, + "loss": 0.3826, + "step": 5902 + }, + { + "epoch": 0.16287676288297112, + "grad_norm": 0.0023975521326065063, + "learning_rate": 0.001, + "loss": 0.4522, + "step": 5903 + }, + { + "epoch": 0.1629043550840355, + "grad_norm": 0.003720303997397423, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 5904 + }, + { + "epoch": 0.16293194728509985, + "grad_norm": 0.0029375324957072735, + "learning_rate": 0.001, + "loss": 0.4242, + "step": 5905 + }, + { + "epoch": 0.16295953948616423, + "grad_norm": 0.003167448565363884, + "learning_rate": 0.001, + "loss": 0.4212, + "step": 5906 + }, + { + "epoch": 0.16298713168722861, + "grad_norm": 0.0032344404608011246, + "learning_rate": 0.001, + "loss": 0.4288, + "step": 5907 + }, + { + "epoch": 0.16301472388829297, + "grad_norm": 0.0028254149947315454, + "learning_rate": 0.001, + "loss": 0.3897, + "step": 5908 + }, + { + "epoch": 0.16304231608935735, + "grad_norm": 0.0026103025302290916, + "learning_rate": 0.001, + "loss": 0.3859, + "step": 5909 + }, + { + "epoch": 0.1630699082904217, + "grad_norm": 0.0032244473695755005, + "learning_rate": 0.001, + "loss": 0.4391, + "step": 5910 + }, + { + "epoch": 0.16309750049148608, + "grad_norm": 0.005498593673110008, + "learning_rate": 0.001, + "loss": 0.3777, + "step": 5911 + }, + { + "epoch": 0.16312509269255046, + "grad_norm": 0.002438473980873823, + "learning_rate": 0.001, + "loss": 0.3835, + "step": 5912 + }, + { + "epoch": 0.1631526848936148, + "grad_norm": 0.002533574588596821, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 5913 + }, + { + "epoch": 0.1631802770946792, + "grad_norm": 0.0030868607573211193, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 5914 + }, + { + "epoch": 0.16320786929574355, + "grad_norm": 0.002708959858864546, + "learning_rate": 0.001, + "loss": 0.4111, + "step": 5915 + }, + { + "epoch": 0.16323546149680793, + "grad_norm": 0.0037433947436511517, + "learning_rate": 0.001, + "loss": 0.4038, + "step": 5916 + }, + { + "epoch": 0.1632630536978723, + "grad_norm": 0.0034228821750730276, + "learning_rate": 0.001, + "loss": 0.397, + "step": 5917 + }, + { + "epoch": 0.16329064589893666, + "grad_norm": 0.0021487257909029722, + "learning_rate": 0.001, + "loss": 0.4127, + "step": 5918 + }, + { + "epoch": 0.16331823810000104, + "grad_norm": 0.008542514406144619, + "learning_rate": 0.001, + "loss": 0.3763, + "step": 5919 + }, + { + "epoch": 0.1633458303010654, + "grad_norm": 0.0028414383996278048, + "learning_rate": 0.001, + "loss": 0.3694, + "step": 5920 + }, + { + "epoch": 0.16337342250212977, + "grad_norm": 0.003626716323196888, + "learning_rate": 0.001, + "loss": 0.3819, + "step": 5921 + }, + { + "epoch": 0.16340101470319415, + "grad_norm": 0.004335957579314709, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 5922 + }, + { + "epoch": 0.1634286069042585, + "grad_norm": 0.003027878934517503, + "learning_rate": 0.001, + "loss": 0.4225, + "step": 5923 + }, + { + "epoch": 0.16345619910532289, + "grad_norm": 0.002936540637165308, + "learning_rate": 0.001, + "loss": 0.3906, + "step": 5924 + }, + { + "epoch": 0.16348379130638724, + "grad_norm": 0.004033949691802263, + "learning_rate": 0.001, + "loss": 0.4212, + "step": 5925 + }, + { + "epoch": 0.16351138350745162, + "grad_norm": 0.003175846766680479, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 5926 + }, + { + "epoch": 0.163538975708516, + "grad_norm": 0.002765644108876586, + "learning_rate": 0.001, + "loss": 0.4257, + "step": 5927 + }, + { + "epoch": 0.16356656790958035, + "grad_norm": 0.0026081749238073826, + "learning_rate": 0.001, + "loss": 0.46, + "step": 5928 + }, + { + "epoch": 0.16359416011064473, + "grad_norm": 0.002697083866223693, + "learning_rate": 0.001, + "loss": 0.4118, + "step": 5929 + }, + { + "epoch": 0.16362175231170908, + "grad_norm": 0.0040725781582295895, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 5930 + }, + { + "epoch": 0.16364934451277346, + "grad_norm": 0.0018469118513166904, + "learning_rate": 0.001, + "loss": 0.4157, + "step": 5931 + }, + { + "epoch": 0.16367693671383785, + "grad_norm": 0.0027053162921220064, + "learning_rate": 0.001, + "loss": 0.3853, + "step": 5932 + }, + { + "epoch": 0.1637045289149022, + "grad_norm": 0.0021168517414480448, + "learning_rate": 0.001, + "loss": 0.4013, + "step": 5933 + }, + { + "epoch": 0.16373212111596658, + "grad_norm": 0.002157708862796426, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 5934 + }, + { + "epoch": 0.16375971331703093, + "grad_norm": 0.00347045436501503, + "learning_rate": 0.001, + "loss": 0.3945, + "step": 5935 + }, + { + "epoch": 0.1637873055180953, + "grad_norm": 0.0034281827975064516, + "learning_rate": 0.001, + "loss": 0.4147, + "step": 5936 + }, + { + "epoch": 0.1638148977191597, + "grad_norm": 0.005607214290648699, + "learning_rate": 0.001, + "loss": 0.3952, + "step": 5937 + }, + { + "epoch": 0.16384248992022404, + "grad_norm": 0.0018512718379497528, + "learning_rate": 0.001, + "loss": 0.4062, + "step": 5938 + }, + { + "epoch": 0.16387008212128842, + "grad_norm": 0.005284006241708994, + "learning_rate": 0.001, + "loss": 0.4104, + "step": 5939 + }, + { + "epoch": 0.16389767432235278, + "grad_norm": 0.006320610176771879, + "learning_rate": 0.001, + "loss": 0.3999, + "step": 5940 + }, + { + "epoch": 0.16392526652341716, + "grad_norm": 0.0025526133831590414, + "learning_rate": 0.001, + "loss": 0.3816, + "step": 5941 + }, + { + "epoch": 0.16395285872448154, + "grad_norm": 0.0021038020495325327, + "learning_rate": 0.001, + "loss": 0.4013, + "step": 5942 + }, + { + "epoch": 0.1639804509255459, + "grad_norm": 0.0026413672603666782, + "learning_rate": 0.001, + "loss": 0.4036, + "step": 5943 + }, + { + "epoch": 0.16400804312661027, + "grad_norm": 0.004636452533304691, + "learning_rate": 0.001, + "loss": 0.4089, + "step": 5944 + }, + { + "epoch": 0.16403563532767462, + "grad_norm": 0.005048688966780901, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 5945 + }, + { + "epoch": 0.164063227528739, + "grad_norm": 0.0032856205943971872, + "learning_rate": 0.001, + "loss": 0.4095, + "step": 5946 + }, + { + "epoch": 0.16409081972980338, + "grad_norm": 0.0027474837843328714, + "learning_rate": 0.001, + "loss": 0.4642, + "step": 5947 + }, + { + "epoch": 0.16411841193086774, + "grad_norm": 0.004227146506309509, + "learning_rate": 0.001, + "loss": 0.3778, + "step": 5948 + }, + { + "epoch": 0.16414600413193212, + "grad_norm": 0.00409206748008728, + "learning_rate": 0.001, + "loss": 0.4036, + "step": 5949 + }, + { + "epoch": 0.16417359633299647, + "grad_norm": 0.004566535819321871, + "learning_rate": 0.001, + "loss": 0.4191, + "step": 5950 + }, + { + "epoch": 0.16420118853406085, + "grad_norm": 0.0042169406078755856, + "learning_rate": 0.001, + "loss": 0.384, + "step": 5951 + }, + { + "epoch": 0.16422878073512523, + "grad_norm": 0.0025692598428577185, + "learning_rate": 0.001, + "loss": 0.4062, + "step": 5952 + }, + { + "epoch": 0.16425637293618958, + "grad_norm": 0.0040415204130113125, + "learning_rate": 0.001, + "loss": 0.4316, + "step": 5953 + }, + { + "epoch": 0.16428396513725396, + "grad_norm": 0.002447271952405572, + "learning_rate": 0.001, + "loss": 0.3831, + "step": 5954 + }, + { + "epoch": 0.16431155733831831, + "grad_norm": 0.00370286637917161, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 5955 + }, + { + "epoch": 0.1643391495393827, + "grad_norm": 0.0034686720464378595, + "learning_rate": 0.001, + "loss": 0.3902, + "step": 5956 + }, + { + "epoch": 0.16436674174044708, + "grad_norm": 0.002705411519855261, + "learning_rate": 0.001, + "loss": 0.4178, + "step": 5957 + }, + { + "epoch": 0.16439433394151143, + "grad_norm": 0.003815694246441126, + "learning_rate": 0.001, + "loss": 0.4295, + "step": 5958 + }, + { + "epoch": 0.1644219261425758, + "grad_norm": 0.0033353432081639767, + "learning_rate": 0.001, + "loss": 0.4033, + "step": 5959 + }, + { + "epoch": 0.16444951834364016, + "grad_norm": 0.003151560202240944, + "learning_rate": 0.001, + "loss": 0.4065, + "step": 5960 + }, + { + "epoch": 0.16447711054470454, + "grad_norm": 0.0059262774884700775, + "learning_rate": 0.001, + "loss": 0.4036, + "step": 5961 + }, + { + "epoch": 0.16450470274576892, + "grad_norm": 0.0027054711245000362, + "learning_rate": 0.001, + "loss": 0.3902, + "step": 5962 + }, + { + "epoch": 0.16453229494683327, + "grad_norm": 0.005763984750956297, + "learning_rate": 0.001, + "loss": 0.3304, + "step": 5963 + }, + { + "epoch": 0.16455988714789765, + "grad_norm": 0.002855742583051324, + "learning_rate": 0.001, + "loss": 0.3459, + "step": 5964 + }, + { + "epoch": 0.164587479348962, + "grad_norm": 0.003007008694112301, + "learning_rate": 0.001, + "loss": 0.425, + "step": 5965 + }, + { + "epoch": 0.1646150715500264, + "grad_norm": 0.0030072838999330997, + "learning_rate": 0.001, + "loss": 0.3738, + "step": 5966 + }, + { + "epoch": 0.16464266375109077, + "grad_norm": 0.005015561822801828, + "learning_rate": 0.001, + "loss": 0.3966, + "step": 5967 + }, + { + "epoch": 0.16467025595215512, + "grad_norm": 0.0044816480949521065, + "learning_rate": 0.001, + "loss": 0.4192, + "step": 5968 + }, + { + "epoch": 0.1646978481532195, + "grad_norm": 0.0035232999362051487, + "learning_rate": 0.001, + "loss": 0.3555, + "step": 5969 + }, + { + "epoch": 0.16472544035428385, + "grad_norm": 0.0043287379667162895, + "learning_rate": 0.001, + "loss": 0.3861, + "step": 5970 + }, + { + "epoch": 0.16475303255534823, + "grad_norm": 0.00282704783603549, + "learning_rate": 0.001, + "loss": 0.4321, + "step": 5971 + }, + { + "epoch": 0.1647806247564126, + "grad_norm": 0.004027531016618013, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 5972 + }, + { + "epoch": 0.16480821695747697, + "grad_norm": 0.011107796803116798, + "learning_rate": 0.001, + "loss": 0.4055, + "step": 5973 + }, + { + "epoch": 0.16483580915854135, + "grad_norm": 0.004610290750861168, + "learning_rate": 0.001, + "loss": 0.4326, + "step": 5974 + }, + { + "epoch": 0.1648634013596057, + "grad_norm": 0.002399605233222246, + "learning_rate": 0.001, + "loss": 0.3964, + "step": 5975 + }, + { + "epoch": 0.16489099356067008, + "grad_norm": 0.004088590387254953, + "learning_rate": 0.001, + "loss": 0.414, + "step": 5976 + }, + { + "epoch": 0.16491858576173443, + "grad_norm": 0.0037160133942961693, + "learning_rate": 0.001, + "loss": 0.3613, + "step": 5977 + }, + { + "epoch": 0.1649461779627988, + "grad_norm": 0.004553014412522316, + "learning_rate": 0.001, + "loss": 0.414, + "step": 5978 + }, + { + "epoch": 0.1649737701638632, + "grad_norm": 0.0035007258411496878, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 5979 + }, + { + "epoch": 0.16500136236492755, + "grad_norm": 0.0026380368508398533, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 5980 + }, + { + "epoch": 0.16502895456599193, + "grad_norm": 0.0043465979397296906, + "learning_rate": 0.001, + "loss": 0.3681, + "step": 5981 + }, + { + "epoch": 0.16505654676705628, + "grad_norm": 0.00366704142652452, + "learning_rate": 0.001, + "loss": 0.3833, + "step": 5982 + }, + { + "epoch": 0.16508413896812066, + "grad_norm": 0.002497212029993534, + "learning_rate": 0.001, + "loss": 0.4077, + "step": 5983 + }, + { + "epoch": 0.16511173116918504, + "grad_norm": 0.007907644845545292, + "learning_rate": 0.001, + "loss": 0.407, + "step": 5984 + }, + { + "epoch": 0.1651393233702494, + "grad_norm": 0.0024078672286123037, + "learning_rate": 0.001, + "loss": 0.4098, + "step": 5985 + }, + { + "epoch": 0.16516691557131377, + "grad_norm": 0.003503212472423911, + "learning_rate": 0.001, + "loss": 0.4093, + "step": 5986 + }, + { + "epoch": 0.16519450777237812, + "grad_norm": 0.00796839315444231, + "learning_rate": 0.001, + "loss": 0.4138, + "step": 5987 + }, + { + "epoch": 0.1652220999734425, + "grad_norm": 0.007227026857435703, + "learning_rate": 0.001, + "loss": 0.367, + "step": 5988 + }, + { + "epoch": 0.16524969217450688, + "grad_norm": 0.0025876981671899557, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 5989 + }, + { + "epoch": 0.16527728437557124, + "grad_norm": 0.0023945660796016455, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 5990 + }, + { + "epoch": 0.16530487657663562, + "grad_norm": 0.0032521584071218967, + "learning_rate": 0.001, + "loss": 0.399, + "step": 5991 + }, + { + "epoch": 0.16533246877769997, + "grad_norm": 0.0025216443464159966, + "learning_rate": 0.001, + "loss": 0.4063, + "step": 5992 + }, + { + "epoch": 0.16536006097876435, + "grad_norm": 0.0021180277690291405, + "learning_rate": 0.001, + "loss": 0.4105, + "step": 5993 + }, + { + "epoch": 0.16538765317982873, + "grad_norm": 0.002108354354277253, + "learning_rate": 0.001, + "loss": 0.4072, + "step": 5994 + }, + { + "epoch": 0.16541524538089308, + "grad_norm": 0.0022766771726310253, + "learning_rate": 0.001, + "loss": 0.4213, + "step": 5995 + }, + { + "epoch": 0.16544283758195746, + "grad_norm": 0.0022483475040644407, + "learning_rate": 0.001, + "loss": 0.389, + "step": 5996 + }, + { + "epoch": 0.16547042978302182, + "grad_norm": 0.006487622391432524, + "learning_rate": 0.001, + "loss": 0.4283, + "step": 5997 + }, + { + "epoch": 0.1654980219840862, + "grad_norm": 0.00356247927993536, + "learning_rate": 0.001, + "loss": 0.3738, + "step": 5998 + }, + { + "epoch": 0.16552561418515058, + "grad_norm": 0.002978307893499732, + "learning_rate": 0.001, + "loss": 0.3778, + "step": 5999 + }, + { + "epoch": 0.16555320638621493, + "grad_norm": 0.007124268915504217, + "learning_rate": 0.001, + "loss": 0.3882, + "step": 6000 + }, + { + "epoch": 0.16555320638621493, + "eval_runtime": 24.3281, + "eval_samples_per_second": 1.315, + "eval_steps_per_second": 0.164, + "step": 6000 + }, + { + "epoch": 0.1655807985872793, + "grad_norm": 0.0031695945654064417, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 6001 + }, + { + "epoch": 0.16560839078834366, + "grad_norm": 0.0024594555143266916, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 6002 + }, + { + "epoch": 0.16563598298940804, + "grad_norm": 0.002371816663071513, + "learning_rate": 0.001, + "loss": 0.4138, + "step": 6003 + }, + { + "epoch": 0.16566357519047242, + "grad_norm": 0.0034170527942478657, + "learning_rate": 0.001, + "loss": 0.3857, + "step": 6004 + }, + { + "epoch": 0.16569116739153678, + "grad_norm": 0.0031782693695276976, + "learning_rate": 0.001, + "loss": 0.3954, + "step": 6005 + }, + { + "epoch": 0.16571875959260116, + "grad_norm": 0.0026943848934024572, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 6006 + }, + { + "epoch": 0.1657463517936655, + "grad_norm": 0.003218788420781493, + "learning_rate": 0.001, + "loss": 0.397, + "step": 6007 + }, + { + "epoch": 0.1657739439947299, + "grad_norm": 0.0070941089652478695, + "learning_rate": 0.001, + "loss": 0.3705, + "step": 6008 + }, + { + "epoch": 0.16580153619579427, + "grad_norm": 0.0025819384027272463, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 6009 + }, + { + "epoch": 0.16582912839685862, + "grad_norm": 0.003531514434143901, + "learning_rate": 0.001, + "loss": 0.383, + "step": 6010 + }, + { + "epoch": 0.165856720597923, + "grad_norm": 0.006012933328747749, + "learning_rate": 0.001, + "loss": 0.383, + "step": 6011 + }, + { + "epoch": 0.16588431279898735, + "grad_norm": 0.0036130910739302635, + "learning_rate": 0.001, + "loss": 0.3882, + "step": 6012 + }, + { + "epoch": 0.16591190500005173, + "grad_norm": 0.0030107556376606226, + "learning_rate": 0.001, + "loss": 0.3775, + "step": 6013 + }, + { + "epoch": 0.16593949720111612, + "grad_norm": 0.0032329028472304344, + "learning_rate": 0.001, + "loss": 0.3742, + "step": 6014 + }, + { + "epoch": 0.16596708940218047, + "grad_norm": 0.002732783555984497, + "learning_rate": 0.001, + "loss": 0.4053, + "step": 6015 + }, + { + "epoch": 0.16599468160324485, + "grad_norm": 0.004081232473254204, + "learning_rate": 0.001, + "loss": 0.3952, + "step": 6016 + }, + { + "epoch": 0.1660222738043092, + "grad_norm": 0.1196216568350792, + "learning_rate": 0.001, + "loss": 0.3624, + "step": 6017 + }, + { + "epoch": 0.16604986600537358, + "grad_norm": 0.005355612374842167, + "learning_rate": 0.001, + "loss": 0.3804, + "step": 6018 + }, + { + "epoch": 0.16607745820643796, + "grad_norm": 0.003674236126244068, + "learning_rate": 0.001, + "loss": 0.378, + "step": 6019 + }, + { + "epoch": 0.1661050504075023, + "grad_norm": 0.002634497359395027, + "learning_rate": 0.001, + "loss": 0.3923, + "step": 6020 + }, + { + "epoch": 0.1661326426085667, + "grad_norm": 0.0034477454610168934, + "learning_rate": 0.001, + "loss": 0.3945, + "step": 6021 + }, + { + "epoch": 0.16616023480963105, + "grad_norm": 0.006275518331676722, + "learning_rate": 0.001, + "loss": 0.39, + "step": 6022 + }, + { + "epoch": 0.16618782701069543, + "grad_norm": 0.0067554921843111515, + "learning_rate": 0.001, + "loss": 0.3807, + "step": 6023 + }, + { + "epoch": 0.1662154192117598, + "grad_norm": 0.0043805669993162155, + "learning_rate": 0.001, + "loss": 0.4091, + "step": 6024 + }, + { + "epoch": 0.16624301141282416, + "grad_norm": 0.0032864839304238558, + "learning_rate": 0.001, + "loss": 0.3718, + "step": 6025 + }, + { + "epoch": 0.16627060361388854, + "grad_norm": 0.0033326733391731977, + "learning_rate": 0.001, + "loss": 0.3709, + "step": 6026 + }, + { + "epoch": 0.1662981958149529, + "grad_norm": 0.00480731762945652, + "learning_rate": 0.001, + "loss": 0.4116, + "step": 6027 + }, + { + "epoch": 0.16632578801601727, + "grad_norm": 0.0029019401408731937, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 6028 + }, + { + "epoch": 0.16635338021708165, + "grad_norm": 0.002836265368387103, + "learning_rate": 0.001, + "loss": 0.3866, + "step": 6029 + }, + { + "epoch": 0.166380972418146, + "grad_norm": 0.002911055227741599, + "learning_rate": 0.001, + "loss": 0.4002, + "step": 6030 + }, + { + "epoch": 0.1664085646192104, + "grad_norm": 0.002560385735705495, + "learning_rate": 0.001, + "loss": 0.3841, + "step": 6031 + }, + { + "epoch": 0.16643615682027474, + "grad_norm": 0.003303447039797902, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 6032 + }, + { + "epoch": 0.16646374902133912, + "grad_norm": 0.0026523631531745195, + "learning_rate": 0.001, + "loss": 0.3821, + "step": 6033 + }, + { + "epoch": 0.1664913412224035, + "grad_norm": 0.0037518537137657404, + "learning_rate": 0.001, + "loss": 0.4, + "step": 6034 + }, + { + "epoch": 0.16651893342346785, + "grad_norm": 0.0027246454264968634, + "learning_rate": 0.001, + "loss": 0.4288, + "step": 6035 + }, + { + "epoch": 0.16654652562453223, + "grad_norm": 0.012342111207544804, + "learning_rate": 0.001, + "loss": 0.4144, + "step": 6036 + }, + { + "epoch": 0.16657411782559658, + "grad_norm": 0.005223266314715147, + "learning_rate": 0.001, + "loss": 0.4332, + "step": 6037 + }, + { + "epoch": 0.16660171002666097, + "grad_norm": 0.003073178231716156, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 6038 + }, + { + "epoch": 0.16662930222772535, + "grad_norm": 0.004849819000810385, + "learning_rate": 0.001, + "loss": 0.4385, + "step": 6039 + }, + { + "epoch": 0.1666568944287897, + "grad_norm": 0.002936877077445388, + "learning_rate": 0.001, + "loss": 0.3801, + "step": 6040 + }, + { + "epoch": 0.16668448662985408, + "grad_norm": 0.002828566124662757, + "learning_rate": 0.001, + "loss": 0.3962, + "step": 6041 + }, + { + "epoch": 0.16671207883091843, + "grad_norm": 0.0030576810240745544, + "learning_rate": 0.001, + "loss": 0.4031, + "step": 6042 + }, + { + "epoch": 0.1667396710319828, + "grad_norm": 0.0025680421385914087, + "learning_rate": 0.001, + "loss": 0.4002, + "step": 6043 + }, + { + "epoch": 0.1667672632330472, + "grad_norm": 0.004450993146747351, + "learning_rate": 0.001, + "loss": 0.3943, + "step": 6044 + }, + { + "epoch": 0.16679485543411154, + "grad_norm": 0.0038969400338828564, + "learning_rate": 0.001, + "loss": 0.4425, + "step": 6045 + }, + { + "epoch": 0.16682244763517592, + "grad_norm": 0.003842557780444622, + "learning_rate": 0.001, + "loss": 0.4202, + "step": 6046 + }, + { + "epoch": 0.16685003983624028, + "grad_norm": 0.002472433727234602, + "learning_rate": 0.001, + "loss": 0.391, + "step": 6047 + }, + { + "epoch": 0.16687763203730466, + "grad_norm": 0.00535456370562315, + "learning_rate": 0.001, + "loss": 0.4123, + "step": 6048 + }, + { + "epoch": 0.16690522423836904, + "grad_norm": 0.002608702052384615, + "learning_rate": 0.001, + "loss": 0.4108, + "step": 6049 + }, + { + "epoch": 0.1669328164394334, + "grad_norm": 0.0029743602499365807, + "learning_rate": 0.001, + "loss": 0.3424, + "step": 6050 + }, + { + "epoch": 0.16696040864049777, + "grad_norm": 0.0025020604953169823, + "learning_rate": 0.001, + "loss": 0.3778, + "step": 6051 + }, + { + "epoch": 0.16698800084156212, + "grad_norm": 0.0027822526171803474, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 6052 + }, + { + "epoch": 0.1670155930426265, + "grad_norm": 0.0028931559063494205, + "learning_rate": 0.001, + "loss": 0.4171, + "step": 6053 + }, + { + "epoch": 0.16704318524369088, + "grad_norm": 0.0022724780719727278, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 6054 + }, + { + "epoch": 0.16707077744475524, + "grad_norm": 0.004654435440897942, + "learning_rate": 0.001, + "loss": 0.3738, + "step": 6055 + }, + { + "epoch": 0.16709836964581962, + "grad_norm": 0.002734872279688716, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 6056 + }, + { + "epoch": 0.16712596184688397, + "grad_norm": 0.0025022958870977163, + "learning_rate": 0.001, + "loss": 0.4061, + "step": 6057 + }, + { + "epoch": 0.16715355404794835, + "grad_norm": 0.004890537820756435, + "learning_rate": 0.001, + "loss": 0.4065, + "step": 6058 + }, + { + "epoch": 0.16718114624901273, + "grad_norm": 0.0023612927179783583, + "learning_rate": 0.001, + "loss": 0.3985, + "step": 6059 + }, + { + "epoch": 0.16720873845007708, + "grad_norm": 0.002075622323900461, + "learning_rate": 0.001, + "loss": 0.4118, + "step": 6060 + }, + { + "epoch": 0.16723633065114146, + "grad_norm": 0.004314557649195194, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 6061 + }, + { + "epoch": 0.16726392285220582, + "grad_norm": 0.0022840569727122784, + "learning_rate": 0.001, + "loss": 0.3853, + "step": 6062 + }, + { + "epoch": 0.1672915150532702, + "grad_norm": 0.0027527734637260437, + "learning_rate": 0.001, + "loss": 0.3942, + "step": 6063 + }, + { + "epoch": 0.16731910725433458, + "grad_norm": 0.0024163657799363136, + "learning_rate": 0.001, + "loss": 0.4241, + "step": 6064 + }, + { + "epoch": 0.16734669945539893, + "grad_norm": 0.0026521470863372087, + "learning_rate": 0.001, + "loss": 0.4254, + "step": 6065 + }, + { + "epoch": 0.1673742916564633, + "grad_norm": 0.00407033134251833, + "learning_rate": 0.001, + "loss": 0.3867, + "step": 6066 + }, + { + "epoch": 0.16740188385752766, + "grad_norm": 0.0028122677467763424, + "learning_rate": 0.001, + "loss": 0.3722, + "step": 6067 + }, + { + "epoch": 0.16742947605859204, + "grad_norm": 0.0021444158628582954, + "learning_rate": 0.001, + "loss": 0.432, + "step": 6068 + }, + { + "epoch": 0.16745706825965642, + "grad_norm": 0.004711539018899202, + "learning_rate": 0.001, + "loss": 0.3673, + "step": 6069 + }, + { + "epoch": 0.16748466046072077, + "grad_norm": 0.005842708982527256, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 6070 + }, + { + "epoch": 0.16751225266178515, + "grad_norm": 0.005441974848508835, + "learning_rate": 0.001, + "loss": 0.4022, + "step": 6071 + }, + { + "epoch": 0.1675398448628495, + "grad_norm": 0.005875125993043184, + "learning_rate": 0.001, + "loss": 0.408, + "step": 6072 + }, + { + "epoch": 0.1675674370639139, + "grad_norm": 0.002280894201248884, + "learning_rate": 0.001, + "loss": 0.4282, + "step": 6073 + }, + { + "epoch": 0.16759502926497824, + "grad_norm": 0.0026473281905055046, + "learning_rate": 0.001, + "loss": 0.371, + "step": 6074 + }, + { + "epoch": 0.16762262146604262, + "grad_norm": 0.006329023279249668, + "learning_rate": 0.001, + "loss": 0.4011, + "step": 6075 + }, + { + "epoch": 0.167650213667107, + "grad_norm": 0.0029098165687173605, + "learning_rate": 0.001, + "loss": 0.3979, + "step": 6076 + }, + { + "epoch": 0.16767780586817135, + "grad_norm": 0.005095341708511114, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 6077 + }, + { + "epoch": 0.16770539806923573, + "grad_norm": 0.002626903122290969, + "learning_rate": 0.001, + "loss": 0.3767, + "step": 6078 + }, + { + "epoch": 0.1677329902703001, + "grad_norm": 0.006261000409722328, + "learning_rate": 0.001, + "loss": 0.4167, + "step": 6079 + }, + { + "epoch": 0.16776058247136447, + "grad_norm": 0.0027065426111221313, + "learning_rate": 0.001, + "loss": 0.3699, + "step": 6080 + }, + { + "epoch": 0.16778817467242885, + "grad_norm": 0.0033599373418837786, + "learning_rate": 0.001, + "loss": 0.3882, + "step": 6081 + }, + { + "epoch": 0.1678157668734932, + "grad_norm": 0.0021663676016032696, + "learning_rate": 0.001, + "loss": 0.3641, + "step": 6082 + }, + { + "epoch": 0.16784335907455758, + "grad_norm": 0.003881096374243498, + "learning_rate": 0.001, + "loss": 0.3787, + "step": 6083 + }, + { + "epoch": 0.16787095127562193, + "grad_norm": 0.0030875105876475573, + "learning_rate": 0.001, + "loss": 0.3872, + "step": 6084 + }, + { + "epoch": 0.1678985434766863, + "grad_norm": 0.003538950812071562, + "learning_rate": 0.001, + "loss": 0.4172, + "step": 6085 + }, + { + "epoch": 0.1679261356777507, + "grad_norm": 0.003060020739212632, + "learning_rate": 0.001, + "loss": 0.4165, + "step": 6086 + }, + { + "epoch": 0.16795372787881505, + "grad_norm": 0.0032102216500788927, + "learning_rate": 0.001, + "loss": 0.4264, + "step": 6087 + }, + { + "epoch": 0.16798132007987943, + "grad_norm": 0.004532152321189642, + "learning_rate": 0.001, + "loss": 0.3611, + "step": 6088 + }, + { + "epoch": 0.16800891228094378, + "grad_norm": 0.002951999893411994, + "learning_rate": 0.001, + "loss": 0.3728, + "step": 6089 + }, + { + "epoch": 0.16803650448200816, + "grad_norm": 0.002537887077778578, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 6090 + }, + { + "epoch": 0.16806409668307254, + "grad_norm": 0.003645245684310794, + "learning_rate": 0.001, + "loss": 0.4323, + "step": 6091 + }, + { + "epoch": 0.1680916888841369, + "grad_norm": 0.0038180460687726736, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 6092 + }, + { + "epoch": 0.16811928108520127, + "grad_norm": 0.002475427696481347, + "learning_rate": 0.001, + "loss": 0.4211, + "step": 6093 + }, + { + "epoch": 0.16814687328626562, + "grad_norm": 0.017842723056674004, + "learning_rate": 0.001, + "loss": 0.3785, + "step": 6094 + }, + { + "epoch": 0.16817446548733, + "grad_norm": 0.0030736385378986597, + "learning_rate": 0.001, + "loss": 0.4232, + "step": 6095 + }, + { + "epoch": 0.16820205768839439, + "grad_norm": 0.004124650731682777, + "learning_rate": 0.001, + "loss": 0.424, + "step": 6096 + }, + { + "epoch": 0.16822964988945874, + "grad_norm": 0.002600730862468481, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 6097 + }, + { + "epoch": 0.16825724209052312, + "grad_norm": 0.003001442411914468, + "learning_rate": 0.001, + "loss": 0.4263, + "step": 6098 + }, + { + "epoch": 0.16828483429158747, + "grad_norm": 0.002522877650335431, + "learning_rate": 0.001, + "loss": 0.4041, + "step": 6099 + }, + { + "epoch": 0.16831242649265185, + "grad_norm": 0.0038972371257841587, + "learning_rate": 0.001, + "loss": 0.3812, + "step": 6100 + }, + { + "epoch": 0.16834001869371623, + "grad_norm": 0.0026751782279461622, + "learning_rate": 0.001, + "loss": 0.3849, + "step": 6101 + }, + { + "epoch": 0.16836761089478058, + "grad_norm": 0.003704581642523408, + "learning_rate": 0.001, + "loss": 0.3573, + "step": 6102 + }, + { + "epoch": 0.16839520309584496, + "grad_norm": 0.00222760159522295, + "learning_rate": 0.001, + "loss": 0.4439, + "step": 6103 + }, + { + "epoch": 0.16842279529690932, + "grad_norm": 0.0037216665223240852, + "learning_rate": 0.001, + "loss": 0.3748, + "step": 6104 + }, + { + "epoch": 0.1684503874979737, + "grad_norm": 0.0030143833719193935, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 6105 + }, + { + "epoch": 0.16847797969903808, + "grad_norm": 0.0032768959645181894, + "learning_rate": 0.001, + "loss": 0.3986, + "step": 6106 + }, + { + "epoch": 0.16850557190010243, + "grad_norm": 0.0028334499802440405, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 6107 + }, + { + "epoch": 0.1685331641011668, + "grad_norm": 0.004226780030876398, + "learning_rate": 0.001, + "loss": 0.354, + "step": 6108 + }, + { + "epoch": 0.16856075630223116, + "grad_norm": 0.0034058301243931055, + "learning_rate": 0.001, + "loss": 0.407, + "step": 6109 + }, + { + "epoch": 0.16858834850329554, + "grad_norm": 0.003791308030486107, + "learning_rate": 0.001, + "loss": 0.3822, + "step": 6110 + }, + { + "epoch": 0.16861594070435992, + "grad_norm": 0.0050668418407440186, + "learning_rate": 0.001, + "loss": 0.395, + "step": 6111 + }, + { + "epoch": 0.16864353290542428, + "grad_norm": 0.01103867869824171, + "learning_rate": 0.001, + "loss": 0.3519, + "step": 6112 + }, + { + "epoch": 0.16867112510648866, + "grad_norm": 0.003474986646324396, + "learning_rate": 0.001, + "loss": 0.405, + "step": 6113 + }, + { + "epoch": 0.168698717307553, + "grad_norm": 0.0034254579804837704, + "learning_rate": 0.001, + "loss": 0.4403, + "step": 6114 + }, + { + "epoch": 0.1687263095086174, + "grad_norm": 0.0021024683956056833, + "learning_rate": 0.001, + "loss": 0.4198, + "step": 6115 + }, + { + "epoch": 0.16875390170968177, + "grad_norm": 0.0043007307685911655, + "learning_rate": 0.001, + "loss": 0.4036, + "step": 6116 + }, + { + "epoch": 0.16878149391074612, + "grad_norm": 0.004201627802103758, + "learning_rate": 0.001, + "loss": 0.3919, + "step": 6117 + }, + { + "epoch": 0.1688090861118105, + "grad_norm": 0.006456395611166954, + "learning_rate": 0.001, + "loss": 0.364, + "step": 6118 + }, + { + "epoch": 0.16883667831287485, + "grad_norm": 0.006011181510984898, + "learning_rate": 0.001, + "loss": 0.3847, + "step": 6119 + }, + { + "epoch": 0.16886427051393924, + "grad_norm": 0.0036535318940877914, + "learning_rate": 0.001, + "loss": 0.3934, + "step": 6120 + }, + { + "epoch": 0.16889186271500362, + "grad_norm": 0.0032456032931804657, + "learning_rate": 0.001, + "loss": 0.4035, + "step": 6121 + }, + { + "epoch": 0.16891945491606797, + "grad_norm": 0.0025099399499595165, + "learning_rate": 0.001, + "loss": 0.4001, + "step": 6122 + }, + { + "epoch": 0.16894704711713235, + "grad_norm": 0.003860413795337081, + "learning_rate": 0.001, + "loss": 0.3662, + "step": 6123 + }, + { + "epoch": 0.1689746393181967, + "grad_norm": 0.004212185274809599, + "learning_rate": 0.001, + "loss": 0.3793, + "step": 6124 + }, + { + "epoch": 0.16900223151926108, + "grad_norm": 0.0037831286899745464, + "learning_rate": 0.001, + "loss": 0.3896, + "step": 6125 + }, + { + "epoch": 0.16902982372032546, + "grad_norm": 0.002503293799236417, + "learning_rate": 0.001, + "loss": 0.3809, + "step": 6126 + }, + { + "epoch": 0.16905741592138981, + "grad_norm": 0.0032346874941140413, + "learning_rate": 0.001, + "loss": 0.3859, + "step": 6127 + }, + { + "epoch": 0.1690850081224542, + "grad_norm": 0.0029911173041909933, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 6128 + }, + { + "epoch": 0.16911260032351855, + "grad_norm": 0.003199785714969039, + "learning_rate": 0.001, + "loss": 0.3726, + "step": 6129 + }, + { + "epoch": 0.16914019252458293, + "grad_norm": 0.0029822427313774824, + "learning_rate": 0.001, + "loss": 0.4123, + "step": 6130 + }, + { + "epoch": 0.1691677847256473, + "grad_norm": 0.0030586915090680122, + "learning_rate": 0.001, + "loss": 0.4171, + "step": 6131 + }, + { + "epoch": 0.16919537692671166, + "grad_norm": 0.0046739643439650536, + "learning_rate": 0.001, + "loss": 0.3746, + "step": 6132 + }, + { + "epoch": 0.16922296912777604, + "grad_norm": 0.0033701432403177023, + "learning_rate": 0.001, + "loss": 0.3778, + "step": 6133 + }, + { + "epoch": 0.1692505613288404, + "grad_norm": 0.003320742631331086, + "learning_rate": 0.001, + "loss": 0.3892, + "step": 6134 + }, + { + "epoch": 0.16927815352990477, + "grad_norm": 0.005310896318405867, + "learning_rate": 0.001, + "loss": 0.4297, + "step": 6135 + }, + { + "epoch": 0.16930574573096915, + "grad_norm": 0.003215103643015027, + "learning_rate": 0.001, + "loss": 0.3855, + "step": 6136 + }, + { + "epoch": 0.1693333379320335, + "grad_norm": 0.002435446949675679, + "learning_rate": 0.001, + "loss": 0.4226, + "step": 6137 + }, + { + "epoch": 0.1693609301330979, + "grad_norm": 0.009125195443630219, + "learning_rate": 0.001, + "loss": 0.3945, + "step": 6138 + }, + { + "epoch": 0.16938852233416224, + "grad_norm": 0.002583020832389593, + "learning_rate": 0.001, + "loss": 0.4324, + "step": 6139 + }, + { + "epoch": 0.16941611453522662, + "grad_norm": 0.003045174991711974, + "learning_rate": 0.001, + "loss": 0.3868, + "step": 6140 + }, + { + "epoch": 0.169443706736291, + "grad_norm": 0.0020076248329132795, + "learning_rate": 0.001, + "loss": 0.3943, + "step": 6141 + }, + { + "epoch": 0.16947129893735535, + "grad_norm": 0.003063742769882083, + "learning_rate": 0.001, + "loss": 0.4329, + "step": 6142 + }, + { + "epoch": 0.16949889113841973, + "grad_norm": 0.00650634104385972, + "learning_rate": 0.001, + "loss": 0.4143, + "step": 6143 + }, + { + "epoch": 0.16952648333948409, + "grad_norm": 0.0031983335502445698, + "learning_rate": 0.001, + "loss": 0.4299, + "step": 6144 + }, + { + "epoch": 0.16955407554054847, + "grad_norm": 0.0037359180860221386, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 6145 + }, + { + "epoch": 0.16958166774161285, + "grad_norm": 0.003823197679594159, + "learning_rate": 0.001, + "loss": 0.3726, + "step": 6146 + }, + { + "epoch": 0.1696092599426772, + "grad_norm": 0.0030885476153343916, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 6147 + }, + { + "epoch": 0.16963685214374158, + "grad_norm": 0.005116578657180071, + "learning_rate": 0.001, + "loss": 0.3788, + "step": 6148 + }, + { + "epoch": 0.16966444434480593, + "grad_norm": 0.003294056048616767, + "learning_rate": 0.001, + "loss": 0.428, + "step": 6149 + }, + { + "epoch": 0.1696920365458703, + "grad_norm": 0.002693452872335911, + "learning_rate": 0.001, + "loss": 0.389, + "step": 6150 + }, + { + "epoch": 0.1697196287469347, + "grad_norm": 0.0032077941577881575, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 6151 + }, + { + "epoch": 0.16974722094799904, + "grad_norm": 0.002029215916991234, + "learning_rate": 0.001, + "loss": 0.4398, + "step": 6152 + }, + { + "epoch": 0.16977481314906342, + "grad_norm": 0.002842904767021537, + "learning_rate": 0.001, + "loss": 0.4213, + "step": 6153 + }, + { + "epoch": 0.16980240535012778, + "grad_norm": 0.004416587762534618, + "learning_rate": 0.001, + "loss": 0.3518, + "step": 6154 + }, + { + "epoch": 0.16982999755119216, + "grad_norm": 0.0031072101555764675, + "learning_rate": 0.001, + "loss": 0.3979, + "step": 6155 + }, + { + "epoch": 0.16985758975225654, + "grad_norm": 0.006052012555301189, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 6156 + }, + { + "epoch": 0.1698851819533209, + "grad_norm": 0.012233547866344452, + "learning_rate": 0.001, + "loss": 0.3828, + "step": 6157 + }, + { + "epoch": 0.16991277415438527, + "grad_norm": 0.004168129060417414, + "learning_rate": 0.001, + "loss": 0.4157, + "step": 6158 + }, + { + "epoch": 0.16994036635544962, + "grad_norm": 0.0026695001870393753, + "learning_rate": 0.001, + "loss": 0.4204, + "step": 6159 + }, + { + "epoch": 0.169967958556514, + "grad_norm": 0.0034213713370263577, + "learning_rate": 0.001, + "loss": 0.3999, + "step": 6160 + }, + { + "epoch": 0.16999555075757838, + "grad_norm": 0.004756170324981213, + "learning_rate": 0.001, + "loss": 0.3741, + "step": 6161 + }, + { + "epoch": 0.17002314295864274, + "grad_norm": 0.0028778575360774994, + "learning_rate": 0.001, + "loss": 0.4134, + "step": 6162 + }, + { + "epoch": 0.17005073515970712, + "grad_norm": 0.003623120253905654, + "learning_rate": 0.001, + "loss": 0.4147, + "step": 6163 + }, + { + "epoch": 0.17007832736077147, + "grad_norm": 0.0027707270346581936, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 6164 + }, + { + "epoch": 0.17010591956183585, + "grad_norm": 0.0021031824871897697, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 6165 + }, + { + "epoch": 0.1701335117629002, + "grad_norm": 0.0030407633166760206, + "learning_rate": 0.001, + "loss": 0.4014, + "step": 6166 + }, + { + "epoch": 0.17016110396396458, + "grad_norm": 0.004170509055256844, + "learning_rate": 0.001, + "loss": 0.3621, + "step": 6167 + }, + { + "epoch": 0.17018869616502896, + "grad_norm": 0.0025970737915486097, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 6168 + }, + { + "epoch": 0.17021628836609332, + "grad_norm": 0.003734288504347205, + "learning_rate": 0.001, + "loss": 0.4398, + "step": 6169 + }, + { + "epoch": 0.1702438805671577, + "grad_norm": 0.00284449546597898, + "learning_rate": 0.001, + "loss": 0.4262, + "step": 6170 + }, + { + "epoch": 0.17027147276822205, + "grad_norm": 0.0026834469754248857, + "learning_rate": 0.001, + "loss": 0.3816, + "step": 6171 + }, + { + "epoch": 0.17029906496928643, + "grad_norm": 0.002561383182182908, + "learning_rate": 0.001, + "loss": 0.4185, + "step": 6172 + }, + { + "epoch": 0.1703266571703508, + "grad_norm": 0.008724918588995934, + "learning_rate": 0.001, + "loss": 0.3888, + "step": 6173 + }, + { + "epoch": 0.17035424937141516, + "grad_norm": 0.004458330571651459, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 6174 + }, + { + "epoch": 0.17038184157247954, + "grad_norm": 0.004813158418983221, + "learning_rate": 0.001, + "loss": 0.4391, + "step": 6175 + }, + { + "epoch": 0.1704094337735439, + "grad_norm": 0.002779042813926935, + "learning_rate": 0.001, + "loss": 0.3619, + "step": 6176 + }, + { + "epoch": 0.17043702597460827, + "grad_norm": 0.004016479942947626, + "learning_rate": 0.001, + "loss": 0.3834, + "step": 6177 + }, + { + "epoch": 0.17046461817567266, + "grad_norm": 0.0036288578994572163, + "learning_rate": 0.001, + "loss": 0.4317, + "step": 6178 + }, + { + "epoch": 0.170492210376737, + "grad_norm": 0.003530343994498253, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 6179 + }, + { + "epoch": 0.1705198025778014, + "grad_norm": 0.005636625923216343, + "learning_rate": 0.001, + "loss": 0.3926, + "step": 6180 + }, + { + "epoch": 0.17054739477886574, + "grad_norm": 0.004850632511079311, + "learning_rate": 0.001, + "loss": 0.4102, + "step": 6181 + }, + { + "epoch": 0.17057498697993012, + "grad_norm": 0.0022970670834183693, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 6182 + }, + { + "epoch": 0.1706025791809945, + "grad_norm": 0.0032087513245642185, + "learning_rate": 0.001, + "loss": 0.3718, + "step": 6183 + }, + { + "epoch": 0.17063017138205885, + "grad_norm": 0.0027670126873999834, + "learning_rate": 0.001, + "loss": 0.4282, + "step": 6184 + }, + { + "epoch": 0.17065776358312323, + "grad_norm": 0.003269513137638569, + "learning_rate": 0.001, + "loss": 0.4121, + "step": 6185 + }, + { + "epoch": 0.1706853557841876, + "grad_norm": 0.0030671104323118925, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 6186 + }, + { + "epoch": 0.17071294798525197, + "grad_norm": 0.002466262085363269, + "learning_rate": 0.001, + "loss": 0.4055, + "step": 6187 + }, + { + "epoch": 0.17074054018631635, + "grad_norm": 0.0033186296932399273, + "learning_rate": 0.001, + "loss": 0.4113, + "step": 6188 + }, + { + "epoch": 0.1707681323873807, + "grad_norm": 0.004772027488797903, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 6189 + }, + { + "epoch": 0.17079572458844508, + "grad_norm": 0.003858543001115322, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 6190 + }, + { + "epoch": 0.17082331678950943, + "grad_norm": 0.003998824395239353, + "learning_rate": 0.001, + "loss": 0.3572, + "step": 6191 + }, + { + "epoch": 0.1708509089905738, + "grad_norm": 0.0029393411241471767, + "learning_rate": 0.001, + "loss": 0.3608, + "step": 6192 + }, + { + "epoch": 0.1708785011916382, + "grad_norm": 0.002925847191363573, + "learning_rate": 0.001, + "loss": 0.4047, + "step": 6193 + }, + { + "epoch": 0.17090609339270255, + "grad_norm": 0.006174467504024506, + "learning_rate": 0.001, + "loss": 0.3995, + "step": 6194 + }, + { + "epoch": 0.17093368559376693, + "grad_norm": 0.003490880597382784, + "learning_rate": 0.001, + "loss": 0.4366, + "step": 6195 + }, + { + "epoch": 0.17096127779483128, + "grad_norm": 0.00242701661773026, + "learning_rate": 0.001, + "loss": 0.3756, + "step": 6196 + }, + { + "epoch": 0.17098886999589566, + "grad_norm": 0.0028819721192121506, + "learning_rate": 0.001, + "loss": 0.4108, + "step": 6197 + }, + { + "epoch": 0.17101646219696004, + "grad_norm": 0.0025861901231110096, + "learning_rate": 0.001, + "loss": 0.4214, + "step": 6198 + }, + { + "epoch": 0.1710440543980244, + "grad_norm": 0.0020953891798853874, + "learning_rate": 0.001, + "loss": 0.4142, + "step": 6199 + }, + { + "epoch": 0.17107164659908877, + "grad_norm": 0.0024096069391816854, + "learning_rate": 0.001, + "loss": 0.3922, + "step": 6200 + }, + { + "epoch": 0.17109923880015313, + "grad_norm": 0.0030896812677383423, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 6201 + }, + { + "epoch": 0.1711268310012175, + "grad_norm": 0.0025605044793337584, + "learning_rate": 0.001, + "loss": 0.3839, + "step": 6202 + }, + { + "epoch": 0.17115442320228189, + "grad_norm": 0.0032270110677927732, + "learning_rate": 0.001, + "loss": 0.4318, + "step": 6203 + }, + { + "epoch": 0.17118201540334624, + "grad_norm": 0.002061746781691909, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 6204 + }, + { + "epoch": 0.17120960760441062, + "grad_norm": 0.002208675490692258, + "learning_rate": 0.001, + "loss": 0.4531, + "step": 6205 + }, + { + "epoch": 0.17123719980547497, + "grad_norm": 0.0019300546264275908, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 6206 + }, + { + "epoch": 0.17126479200653935, + "grad_norm": 0.002864664187654853, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 6207 + }, + { + "epoch": 0.17129238420760373, + "grad_norm": 0.003815412288531661, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 6208 + }, + { + "epoch": 0.17131997640866808, + "grad_norm": 0.0021196226589381695, + "learning_rate": 0.001, + "loss": 0.401, + "step": 6209 + }, + { + "epoch": 0.17134756860973246, + "grad_norm": 0.0040994128212332726, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 6210 + }, + { + "epoch": 0.17137516081079682, + "grad_norm": 0.0021242250222712755, + "learning_rate": 0.001, + "loss": 0.4115, + "step": 6211 + }, + { + "epoch": 0.1714027530118612, + "grad_norm": 0.0023374794982373714, + "learning_rate": 0.001, + "loss": 0.4438, + "step": 6212 + }, + { + "epoch": 0.17143034521292558, + "grad_norm": 0.004651496186852455, + "learning_rate": 0.001, + "loss": 0.3917, + "step": 6213 + }, + { + "epoch": 0.17145793741398993, + "grad_norm": 0.0039611500687897205, + "learning_rate": 0.001, + "loss": 0.3808, + "step": 6214 + }, + { + "epoch": 0.1714855296150543, + "grad_norm": 0.008256969973444939, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 6215 + }, + { + "epoch": 0.17151312181611866, + "grad_norm": 0.010161311365664005, + "learning_rate": 0.001, + "loss": 0.4106, + "step": 6216 + }, + { + "epoch": 0.17154071401718304, + "grad_norm": 0.010829792357981205, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 6217 + }, + { + "epoch": 0.17156830621824742, + "grad_norm": 0.0036437201779335737, + "learning_rate": 0.001, + "loss": 0.3786, + "step": 6218 + }, + { + "epoch": 0.17159589841931178, + "grad_norm": 0.0040598101913928986, + "learning_rate": 0.001, + "loss": 0.3926, + "step": 6219 + }, + { + "epoch": 0.17162349062037616, + "grad_norm": 0.0032655976247042418, + "learning_rate": 0.001, + "loss": 0.4099, + "step": 6220 + }, + { + "epoch": 0.1716510828214405, + "grad_norm": 0.0032314627896994352, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 6221 + }, + { + "epoch": 0.1716786750225049, + "grad_norm": 0.0044107455760240555, + "learning_rate": 0.001, + "loss": 0.3608, + "step": 6222 + }, + { + "epoch": 0.17170626722356927, + "grad_norm": 0.0073016672395169735, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 6223 + }, + { + "epoch": 0.17173385942463362, + "grad_norm": 0.0031636636704206467, + "learning_rate": 0.001, + "loss": 0.3902, + "step": 6224 + }, + { + "epoch": 0.171761451625698, + "grad_norm": 0.0030360522214323282, + "learning_rate": 0.001, + "loss": 0.3553, + "step": 6225 + }, + { + "epoch": 0.17178904382676236, + "grad_norm": 0.0032519621308892965, + "learning_rate": 0.001, + "loss": 0.4174, + "step": 6226 + }, + { + "epoch": 0.17181663602782674, + "grad_norm": 0.002763562835752964, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 6227 + }, + { + "epoch": 0.17184422822889112, + "grad_norm": 0.002963263774290681, + "learning_rate": 0.001, + "loss": 0.4079, + "step": 6228 + }, + { + "epoch": 0.17187182042995547, + "grad_norm": 0.0156992357224226, + "learning_rate": 0.001, + "loss": 0.4353, + "step": 6229 + }, + { + "epoch": 0.17189941263101985, + "grad_norm": 0.009238180704414845, + "learning_rate": 0.001, + "loss": 0.3851, + "step": 6230 + }, + { + "epoch": 0.1719270048320842, + "grad_norm": 0.003196639008820057, + "learning_rate": 0.001, + "loss": 0.416, + "step": 6231 + }, + { + "epoch": 0.17195459703314858, + "grad_norm": 0.004371176473796368, + "learning_rate": 0.001, + "loss": 0.4253, + "step": 6232 + }, + { + "epoch": 0.17198218923421296, + "grad_norm": 0.004302786663174629, + "learning_rate": 0.001, + "loss": 0.3993, + "step": 6233 + }, + { + "epoch": 0.17200978143527731, + "grad_norm": 0.0027876824606209993, + "learning_rate": 0.001, + "loss": 0.4164, + "step": 6234 + }, + { + "epoch": 0.1720373736363417, + "grad_norm": 0.004181624855846167, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 6235 + }, + { + "epoch": 0.17206496583740605, + "grad_norm": 0.0028202803805470467, + "learning_rate": 0.001, + "loss": 0.3772, + "step": 6236 + }, + { + "epoch": 0.17209255803847043, + "grad_norm": 0.002387853804975748, + "learning_rate": 0.001, + "loss": 0.4437, + "step": 6237 + }, + { + "epoch": 0.1721201502395348, + "grad_norm": 0.0061591207049787045, + "learning_rate": 0.001, + "loss": 0.3678, + "step": 6238 + }, + { + "epoch": 0.17214774244059916, + "grad_norm": 0.002573385601863265, + "learning_rate": 0.001, + "loss": 0.3835, + "step": 6239 + }, + { + "epoch": 0.17217533464166354, + "grad_norm": 0.0026830765418708324, + "learning_rate": 0.001, + "loss": 0.3773, + "step": 6240 + }, + { + "epoch": 0.1722029268427279, + "grad_norm": 0.005540953949093819, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 6241 + }, + { + "epoch": 0.17223051904379227, + "grad_norm": 0.0054961638525128365, + "learning_rate": 0.001, + "loss": 0.4079, + "step": 6242 + }, + { + "epoch": 0.17225811124485665, + "grad_norm": 0.007950017228722572, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 6243 + }, + { + "epoch": 0.172285703445921, + "grad_norm": 0.005001327488571405, + "learning_rate": 0.001, + "loss": 0.4595, + "step": 6244 + }, + { + "epoch": 0.1723132956469854, + "grad_norm": 0.011175988242030144, + "learning_rate": 0.001, + "loss": 0.3986, + "step": 6245 + }, + { + "epoch": 0.17234088784804974, + "grad_norm": 0.004182237666100264, + "learning_rate": 0.001, + "loss": 0.4204, + "step": 6246 + }, + { + "epoch": 0.17236848004911412, + "grad_norm": 0.004050822462886572, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 6247 + }, + { + "epoch": 0.1723960722501785, + "grad_norm": 0.0025094212032854557, + "learning_rate": 0.001, + "loss": 0.4113, + "step": 6248 + }, + { + "epoch": 0.17242366445124285, + "grad_norm": 0.0029853296000510454, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 6249 + }, + { + "epoch": 0.17245125665230723, + "grad_norm": 0.0035753315314650536, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 6250 + }, + { + "epoch": 0.17247884885337159, + "grad_norm": 0.0028684705030173063, + "learning_rate": 0.001, + "loss": 0.3864, + "step": 6251 + }, + { + "epoch": 0.17250644105443597, + "grad_norm": 0.0019036736339330673, + "learning_rate": 0.001, + "loss": 0.3883, + "step": 6252 + }, + { + "epoch": 0.17253403325550035, + "grad_norm": 0.0022934586741030216, + "learning_rate": 0.001, + "loss": 0.4147, + "step": 6253 + }, + { + "epoch": 0.1725616254565647, + "grad_norm": 0.0033066177275031805, + "learning_rate": 0.001, + "loss": 0.4162, + "step": 6254 + }, + { + "epoch": 0.17258921765762908, + "grad_norm": 0.004688805900514126, + "learning_rate": 0.001, + "loss": 0.3955, + "step": 6255 + }, + { + "epoch": 0.17261680985869343, + "grad_norm": 0.0033551587257534266, + "learning_rate": 0.001, + "loss": 0.4143, + "step": 6256 + }, + { + "epoch": 0.1726444020597578, + "grad_norm": 0.002868784125894308, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 6257 + }, + { + "epoch": 0.17267199426082216, + "grad_norm": 0.0017792254220694304, + "learning_rate": 0.001, + "loss": 0.3981, + "step": 6258 + }, + { + "epoch": 0.17269958646188655, + "grad_norm": 0.0052290805615484715, + "learning_rate": 0.001, + "loss": 0.374, + "step": 6259 + }, + { + "epoch": 0.17272717866295093, + "grad_norm": 0.003975987434387207, + "learning_rate": 0.001, + "loss": 0.3784, + "step": 6260 + }, + { + "epoch": 0.17275477086401528, + "grad_norm": 0.005119245033711195, + "learning_rate": 0.001, + "loss": 0.383, + "step": 6261 + }, + { + "epoch": 0.17278236306507966, + "grad_norm": 0.009563916362822056, + "learning_rate": 0.001, + "loss": 0.4202, + "step": 6262 + }, + { + "epoch": 0.172809955266144, + "grad_norm": 0.006325340364128351, + "learning_rate": 0.001, + "loss": 0.4045, + "step": 6263 + }, + { + "epoch": 0.1728375474672084, + "grad_norm": 0.00598013773560524, + "learning_rate": 0.001, + "loss": 0.414, + "step": 6264 + }, + { + "epoch": 0.17286513966827277, + "grad_norm": 0.0035727466456592083, + "learning_rate": 0.001, + "loss": 0.3876, + "step": 6265 + }, + { + "epoch": 0.17289273186933712, + "grad_norm": 0.009252941235899925, + "learning_rate": 0.001, + "loss": 0.4447, + "step": 6266 + }, + { + "epoch": 0.1729203240704015, + "grad_norm": 0.0041562700644135475, + "learning_rate": 0.001, + "loss": 0.4291, + "step": 6267 + }, + { + "epoch": 0.17294791627146586, + "grad_norm": 0.0030006521847099066, + "learning_rate": 0.001, + "loss": 0.3489, + "step": 6268 + }, + { + "epoch": 0.17297550847253024, + "grad_norm": 0.002229264471679926, + "learning_rate": 0.001, + "loss": 0.4225, + "step": 6269 + }, + { + "epoch": 0.17300310067359462, + "grad_norm": 0.004136356525123119, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 6270 + }, + { + "epoch": 0.17303069287465897, + "grad_norm": 0.005907285492867231, + "learning_rate": 0.001, + "loss": 0.3709, + "step": 6271 + }, + { + "epoch": 0.17305828507572335, + "grad_norm": 0.003040108596906066, + "learning_rate": 0.001, + "loss": 0.391, + "step": 6272 + }, + { + "epoch": 0.1730858772767877, + "grad_norm": 0.0026024484541267157, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 6273 + }, + { + "epoch": 0.17311346947785208, + "grad_norm": 0.002952686743810773, + "learning_rate": 0.001, + "loss": 0.3804, + "step": 6274 + }, + { + "epoch": 0.17314106167891646, + "grad_norm": 0.004544638562947512, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 6275 + }, + { + "epoch": 0.17316865387998082, + "grad_norm": 0.004679075442254543, + "learning_rate": 0.001, + "loss": 0.3623, + "step": 6276 + }, + { + "epoch": 0.1731962460810452, + "grad_norm": 0.002047475427389145, + "learning_rate": 0.001, + "loss": 0.4014, + "step": 6277 + }, + { + "epoch": 0.17322383828210955, + "grad_norm": 0.005213032476603985, + "learning_rate": 0.001, + "loss": 0.3915, + "step": 6278 + }, + { + "epoch": 0.17325143048317393, + "grad_norm": 0.00405911635607481, + "learning_rate": 0.001, + "loss": 0.4133, + "step": 6279 + }, + { + "epoch": 0.1732790226842383, + "grad_norm": 0.002308881375938654, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 6280 + }, + { + "epoch": 0.17330661488530266, + "grad_norm": 0.007890172302722931, + "learning_rate": 0.001, + "loss": 0.4045, + "step": 6281 + }, + { + "epoch": 0.17333420708636704, + "grad_norm": 0.0034512749407440424, + "learning_rate": 0.001, + "loss": 0.42, + "step": 6282 + }, + { + "epoch": 0.1733617992874314, + "grad_norm": 0.007768754381686449, + "learning_rate": 0.001, + "loss": 0.3901, + "step": 6283 + }, + { + "epoch": 0.17338939148849578, + "grad_norm": 0.004081700462847948, + "learning_rate": 0.001, + "loss": 0.3718, + "step": 6284 + }, + { + "epoch": 0.17341698368956016, + "grad_norm": 0.003280284348875284, + "learning_rate": 0.001, + "loss": 0.417, + "step": 6285 + }, + { + "epoch": 0.1734445758906245, + "grad_norm": 0.0031469378154724836, + "learning_rate": 0.001, + "loss": 0.3955, + "step": 6286 + }, + { + "epoch": 0.1734721680916889, + "grad_norm": 0.002462326781824231, + "learning_rate": 0.001, + "loss": 0.4204, + "step": 6287 + }, + { + "epoch": 0.17349976029275324, + "grad_norm": 0.0022118764463812113, + "learning_rate": 0.001, + "loss": 0.3788, + "step": 6288 + }, + { + "epoch": 0.17352735249381762, + "grad_norm": 0.0026676817797124386, + "learning_rate": 0.001, + "loss": 0.3755, + "step": 6289 + }, + { + "epoch": 0.173554944694882, + "grad_norm": 0.002649841830134392, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 6290 + }, + { + "epoch": 0.17358253689594635, + "grad_norm": 0.002229656558483839, + "learning_rate": 0.001, + "loss": 0.4231, + "step": 6291 + }, + { + "epoch": 0.17361012909701073, + "grad_norm": 0.0036838806699961424, + "learning_rate": 0.001, + "loss": 0.4267, + "step": 6292 + }, + { + "epoch": 0.1736377212980751, + "grad_norm": 0.015469814650714397, + "learning_rate": 0.001, + "loss": 0.3901, + "step": 6293 + }, + { + "epoch": 0.17366531349913947, + "grad_norm": 0.002747813006862998, + "learning_rate": 0.001, + "loss": 0.4272, + "step": 6294 + }, + { + "epoch": 0.17369290570020385, + "grad_norm": 0.0029808697290718555, + "learning_rate": 0.001, + "loss": 0.3918, + "step": 6295 + }, + { + "epoch": 0.1737204979012682, + "grad_norm": 0.0032657973933964968, + "learning_rate": 0.001, + "loss": 0.4121, + "step": 6296 + }, + { + "epoch": 0.17374809010233258, + "grad_norm": 0.005452743265777826, + "learning_rate": 0.001, + "loss": 0.4293, + "step": 6297 + }, + { + "epoch": 0.17377568230339693, + "grad_norm": 0.00484059751033783, + "learning_rate": 0.001, + "loss": 0.4184, + "step": 6298 + }, + { + "epoch": 0.1738032745044613, + "grad_norm": 0.003019709372892976, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 6299 + }, + { + "epoch": 0.1738308667055257, + "grad_norm": 0.003449161071330309, + "learning_rate": 0.001, + "loss": 0.3817, + "step": 6300 + }, + { + "epoch": 0.17385845890659005, + "grad_norm": 0.004382924642413855, + "learning_rate": 0.001, + "loss": 0.3602, + "step": 6301 + }, + { + "epoch": 0.17388605110765443, + "grad_norm": 0.00298452191054821, + "learning_rate": 0.001, + "loss": 0.3835, + "step": 6302 + }, + { + "epoch": 0.17391364330871878, + "grad_norm": 0.002635273849591613, + "learning_rate": 0.001, + "loss": 0.4109, + "step": 6303 + }, + { + "epoch": 0.17394123550978316, + "grad_norm": 0.0019398960284888744, + "learning_rate": 0.001, + "loss": 0.4097, + "step": 6304 + }, + { + "epoch": 0.17396882771084754, + "grad_norm": 0.002571891061961651, + "learning_rate": 0.001, + "loss": 0.3784, + "step": 6305 + }, + { + "epoch": 0.1739964199119119, + "grad_norm": 0.004911772906780243, + "learning_rate": 0.001, + "loss": 0.4232, + "step": 6306 + }, + { + "epoch": 0.17402401211297627, + "grad_norm": 0.004217732232064009, + "learning_rate": 0.001, + "loss": 0.4172, + "step": 6307 + }, + { + "epoch": 0.17405160431404063, + "grad_norm": 0.002490875544026494, + "learning_rate": 0.001, + "loss": 0.4339, + "step": 6308 + }, + { + "epoch": 0.174079196515105, + "grad_norm": 0.002398621989414096, + "learning_rate": 0.001, + "loss": 0.4136, + "step": 6309 + }, + { + "epoch": 0.17410678871616939, + "grad_norm": 0.0025024251081049442, + "learning_rate": 0.001, + "loss": 0.4176, + "step": 6310 + }, + { + "epoch": 0.17413438091723374, + "grad_norm": 0.003449508221819997, + "learning_rate": 0.001, + "loss": 0.3632, + "step": 6311 + }, + { + "epoch": 0.17416197311829812, + "grad_norm": 0.003372189588844776, + "learning_rate": 0.001, + "loss": 0.4212, + "step": 6312 + }, + { + "epoch": 0.17418956531936247, + "grad_norm": 0.004607321694493294, + "learning_rate": 0.001, + "loss": 0.3907, + "step": 6313 + }, + { + "epoch": 0.17421715752042685, + "grad_norm": 0.00431500980630517, + "learning_rate": 0.001, + "loss": 0.419, + "step": 6314 + }, + { + "epoch": 0.17424474972149123, + "grad_norm": 0.0045218681916594505, + "learning_rate": 0.001, + "loss": 0.339, + "step": 6315 + }, + { + "epoch": 0.17427234192255558, + "grad_norm": 0.005656406749039888, + "learning_rate": 0.001, + "loss": 0.3859, + "step": 6316 + }, + { + "epoch": 0.17429993412361997, + "grad_norm": 0.0031431603711098433, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 6317 + }, + { + "epoch": 0.17432752632468432, + "grad_norm": 0.0022704950533807278, + "learning_rate": 0.001, + "loss": 0.398, + "step": 6318 + }, + { + "epoch": 0.1743551185257487, + "grad_norm": 0.003445026697590947, + "learning_rate": 0.001, + "loss": 0.3907, + "step": 6319 + }, + { + "epoch": 0.17438271072681308, + "grad_norm": 0.002719600684940815, + "learning_rate": 0.001, + "loss": 0.3847, + "step": 6320 + }, + { + "epoch": 0.17441030292787743, + "grad_norm": 0.0020257963333278894, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 6321 + }, + { + "epoch": 0.1744378951289418, + "grad_norm": 0.0022240818943828344, + "learning_rate": 0.001, + "loss": 0.4141, + "step": 6322 + }, + { + "epoch": 0.17446548733000616, + "grad_norm": 0.002934374613687396, + "learning_rate": 0.001, + "loss": 0.4139, + "step": 6323 + }, + { + "epoch": 0.17449307953107054, + "grad_norm": 0.006465549115091562, + "learning_rate": 0.001, + "loss": 0.3782, + "step": 6324 + }, + { + "epoch": 0.17452067173213492, + "grad_norm": 0.005728098098188639, + "learning_rate": 0.001, + "loss": 0.3999, + "step": 6325 + }, + { + "epoch": 0.17454826393319928, + "grad_norm": 0.002171339699998498, + "learning_rate": 0.001, + "loss": 0.4177, + "step": 6326 + }, + { + "epoch": 0.17457585613426366, + "grad_norm": 0.0026950135361403227, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 6327 + }, + { + "epoch": 0.174603448335328, + "grad_norm": 0.006035445258021355, + "learning_rate": 0.001, + "loss": 0.4045, + "step": 6328 + }, + { + "epoch": 0.1746310405363924, + "grad_norm": 0.0035918874200433493, + "learning_rate": 0.001, + "loss": 0.3913, + "step": 6329 + }, + { + "epoch": 0.17465863273745677, + "grad_norm": 0.002868028124794364, + "learning_rate": 0.001, + "loss": 0.4279, + "step": 6330 + }, + { + "epoch": 0.17468622493852112, + "grad_norm": 0.00462915375828743, + "learning_rate": 0.001, + "loss": 0.405, + "step": 6331 + }, + { + "epoch": 0.1747138171395855, + "grad_norm": 0.0024865546729415655, + "learning_rate": 0.001, + "loss": 0.3877, + "step": 6332 + }, + { + "epoch": 0.17474140934064986, + "grad_norm": 0.002470286563038826, + "learning_rate": 0.001, + "loss": 0.4129, + "step": 6333 + }, + { + "epoch": 0.17476900154171424, + "grad_norm": 0.003509331261739135, + "learning_rate": 0.001, + "loss": 0.3881, + "step": 6334 + }, + { + "epoch": 0.17479659374277862, + "grad_norm": 0.0032379806507378817, + "learning_rate": 0.001, + "loss": 0.4061, + "step": 6335 + }, + { + "epoch": 0.17482418594384297, + "grad_norm": 0.003039590548723936, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 6336 + }, + { + "epoch": 0.17485177814490735, + "grad_norm": 0.0033747265115380287, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 6337 + }, + { + "epoch": 0.1748793703459717, + "grad_norm": 0.005611809901893139, + "learning_rate": 0.001, + "loss": 0.4113, + "step": 6338 + }, + { + "epoch": 0.17490696254703608, + "grad_norm": 0.002599924337118864, + "learning_rate": 0.001, + "loss": 0.4066, + "step": 6339 + }, + { + "epoch": 0.17493455474810046, + "grad_norm": 0.0026452033780515194, + "learning_rate": 0.001, + "loss": 0.3799, + "step": 6340 + }, + { + "epoch": 0.17496214694916482, + "grad_norm": 0.003480355953797698, + "learning_rate": 0.001, + "loss": 0.3805, + "step": 6341 + }, + { + "epoch": 0.1749897391502292, + "grad_norm": 0.0024892655201256275, + "learning_rate": 0.001, + "loss": 0.4077, + "step": 6342 + }, + { + "epoch": 0.17501733135129355, + "grad_norm": 0.0024968746583908796, + "learning_rate": 0.001, + "loss": 0.4035, + "step": 6343 + }, + { + "epoch": 0.17504492355235793, + "grad_norm": 0.00571107491850853, + "learning_rate": 0.001, + "loss": 0.398, + "step": 6344 + }, + { + "epoch": 0.1750725157534223, + "grad_norm": 0.0021744274999946356, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 6345 + }, + { + "epoch": 0.17510010795448666, + "grad_norm": 0.004079717211425304, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 6346 + }, + { + "epoch": 0.17512770015555104, + "grad_norm": 0.0027901085559278727, + "learning_rate": 0.001, + "loss": 0.3885, + "step": 6347 + }, + { + "epoch": 0.1751552923566154, + "grad_norm": 0.0030881750863045454, + "learning_rate": 0.001, + "loss": 0.4215, + "step": 6348 + }, + { + "epoch": 0.17518288455767977, + "grad_norm": 0.002648904686793685, + "learning_rate": 0.001, + "loss": 0.4207, + "step": 6349 + }, + { + "epoch": 0.17521047675874415, + "grad_norm": 0.004519915673881769, + "learning_rate": 0.001, + "loss": 0.413, + "step": 6350 + }, + { + "epoch": 0.1752380689598085, + "grad_norm": 0.004600946791470051, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 6351 + }, + { + "epoch": 0.1752656611608729, + "grad_norm": 0.003022734308615327, + "learning_rate": 0.001, + "loss": 0.4198, + "step": 6352 + }, + { + "epoch": 0.17529325336193724, + "grad_norm": 0.004520618822425604, + "learning_rate": 0.001, + "loss": 0.3863, + "step": 6353 + }, + { + "epoch": 0.17532084556300162, + "grad_norm": 0.0052781859412789345, + "learning_rate": 0.001, + "loss": 0.3592, + "step": 6354 + }, + { + "epoch": 0.17534843776406597, + "grad_norm": 0.0028431678656488657, + "learning_rate": 0.001, + "loss": 0.4286, + "step": 6355 + }, + { + "epoch": 0.17537602996513035, + "grad_norm": 0.004005215130746365, + "learning_rate": 0.001, + "loss": 0.4177, + "step": 6356 + }, + { + "epoch": 0.17540362216619473, + "grad_norm": 0.0038050967268645763, + "learning_rate": 0.001, + "loss": 0.3892, + "step": 6357 + }, + { + "epoch": 0.1754312143672591, + "grad_norm": 0.01202372182160616, + "learning_rate": 0.001, + "loss": 0.4263, + "step": 6358 + }, + { + "epoch": 0.17545880656832347, + "grad_norm": 0.0020817206241190434, + "learning_rate": 0.001, + "loss": 0.4242, + "step": 6359 + }, + { + "epoch": 0.17548639876938782, + "grad_norm": 0.0022702061105519533, + "learning_rate": 0.001, + "loss": 0.4136, + "step": 6360 + }, + { + "epoch": 0.1755139909704522, + "grad_norm": 0.002957484917715192, + "learning_rate": 0.001, + "loss": 0.3985, + "step": 6361 + }, + { + "epoch": 0.17554158317151658, + "grad_norm": 0.002842365065589547, + "learning_rate": 0.001, + "loss": 0.4062, + "step": 6362 + }, + { + "epoch": 0.17556917537258093, + "grad_norm": 0.002651175484061241, + "learning_rate": 0.001, + "loss": 0.448, + "step": 6363 + }, + { + "epoch": 0.1755967675736453, + "grad_norm": 0.0024952066596597433, + "learning_rate": 0.001, + "loss": 0.4306, + "step": 6364 + }, + { + "epoch": 0.17562435977470967, + "grad_norm": 0.0025687795132398605, + "learning_rate": 0.001, + "loss": 0.3934, + "step": 6365 + }, + { + "epoch": 0.17565195197577405, + "grad_norm": 0.002511841943487525, + "learning_rate": 0.001, + "loss": 0.4229, + "step": 6366 + }, + { + "epoch": 0.17567954417683843, + "grad_norm": 0.00360535248182714, + "learning_rate": 0.001, + "loss": 0.4178, + "step": 6367 + }, + { + "epoch": 0.17570713637790278, + "grad_norm": 0.002814951818436384, + "learning_rate": 0.001, + "loss": 0.3839, + "step": 6368 + }, + { + "epoch": 0.17573472857896716, + "grad_norm": 0.003035743487998843, + "learning_rate": 0.001, + "loss": 0.426, + "step": 6369 + }, + { + "epoch": 0.1757623207800315, + "grad_norm": 0.0033583652693778276, + "learning_rate": 0.001, + "loss": 0.3715, + "step": 6370 + }, + { + "epoch": 0.1757899129810959, + "grad_norm": 0.0023555420339107513, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 6371 + }, + { + "epoch": 0.17581750518216027, + "grad_norm": 0.0036680963821709156, + "learning_rate": 0.001, + "loss": 0.4161, + "step": 6372 + }, + { + "epoch": 0.17584509738322462, + "grad_norm": 0.00232534552924335, + "learning_rate": 0.001, + "loss": 0.422, + "step": 6373 + }, + { + "epoch": 0.175872689584289, + "grad_norm": 0.0050485520623624325, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 6374 + }, + { + "epoch": 0.17590028178535336, + "grad_norm": 0.004004286136478186, + "learning_rate": 0.001, + "loss": 0.3798, + "step": 6375 + }, + { + "epoch": 0.17592787398641774, + "grad_norm": 0.003267053049057722, + "learning_rate": 0.001, + "loss": 0.4085, + "step": 6376 + }, + { + "epoch": 0.17595546618748212, + "grad_norm": 0.002155203837901354, + "learning_rate": 0.001, + "loss": 0.4274, + "step": 6377 + }, + { + "epoch": 0.17598305838854647, + "grad_norm": 0.030459165573120117, + "learning_rate": 0.001, + "loss": 0.379, + "step": 6378 + }, + { + "epoch": 0.17601065058961085, + "grad_norm": 0.007769315037876368, + "learning_rate": 0.001, + "loss": 0.3776, + "step": 6379 + }, + { + "epoch": 0.1760382427906752, + "grad_norm": 0.00399011978879571, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 6380 + }, + { + "epoch": 0.17606583499173958, + "grad_norm": 0.003226799890398979, + "learning_rate": 0.001, + "loss": 0.4185, + "step": 6381 + }, + { + "epoch": 0.17609342719280396, + "grad_norm": 0.005244703497737646, + "learning_rate": 0.001, + "loss": 0.3881, + "step": 6382 + }, + { + "epoch": 0.17612101939386832, + "grad_norm": 0.00664431881159544, + "learning_rate": 0.001, + "loss": 0.3848, + "step": 6383 + }, + { + "epoch": 0.1761486115949327, + "grad_norm": 0.003033186076208949, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 6384 + }, + { + "epoch": 0.17617620379599705, + "grad_norm": 0.0022742394357919693, + "learning_rate": 0.001, + "loss": 0.4217, + "step": 6385 + }, + { + "epoch": 0.17620379599706143, + "grad_norm": 0.0029898136854171753, + "learning_rate": 0.001, + "loss": 0.4422, + "step": 6386 + }, + { + "epoch": 0.1762313881981258, + "grad_norm": 0.0022048363462090492, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 6387 + }, + { + "epoch": 0.17625898039919016, + "grad_norm": 0.0021618627943098545, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 6388 + }, + { + "epoch": 0.17628657260025454, + "grad_norm": 0.0029654805548489094, + "learning_rate": 0.001, + "loss": 0.3764, + "step": 6389 + }, + { + "epoch": 0.1763141648013189, + "grad_norm": 0.0045374371111392975, + "learning_rate": 0.001, + "loss": 0.4074, + "step": 6390 + }, + { + "epoch": 0.17634175700238328, + "grad_norm": 0.00429152837023139, + "learning_rate": 0.001, + "loss": 0.3708, + "step": 6391 + }, + { + "epoch": 0.17636934920344766, + "grad_norm": 0.004207782447338104, + "learning_rate": 0.001, + "loss": 0.4119, + "step": 6392 + }, + { + "epoch": 0.176396941404512, + "grad_norm": 0.0025363874156028032, + "learning_rate": 0.001, + "loss": 0.4166, + "step": 6393 + }, + { + "epoch": 0.1764245336055764, + "grad_norm": 0.004385404288768768, + "learning_rate": 0.001, + "loss": 0.3839, + "step": 6394 + }, + { + "epoch": 0.17645212580664074, + "grad_norm": 0.0033825349528342485, + "learning_rate": 0.001, + "loss": 0.407, + "step": 6395 + }, + { + "epoch": 0.17647971800770512, + "grad_norm": 0.002470629056915641, + "learning_rate": 0.001, + "loss": 0.3783, + "step": 6396 + }, + { + "epoch": 0.1765073102087695, + "grad_norm": 0.004038370680063963, + "learning_rate": 0.001, + "loss": 0.4126, + "step": 6397 + }, + { + "epoch": 0.17653490240983385, + "grad_norm": 0.002262383932247758, + "learning_rate": 0.001, + "loss": 0.44, + "step": 6398 + }, + { + "epoch": 0.17656249461089824, + "grad_norm": 0.0023851392325013876, + "learning_rate": 0.001, + "loss": 0.3765, + "step": 6399 + }, + { + "epoch": 0.1765900868119626, + "grad_norm": 0.0039879628457129, + "learning_rate": 0.001, + "loss": 0.3875, + "step": 6400 + }, + { + "epoch": 0.17661767901302697, + "grad_norm": 0.0036031249910593033, + "learning_rate": 0.001, + "loss": 0.4164, + "step": 6401 + }, + { + "epoch": 0.17664527121409135, + "grad_norm": 0.0038508297875523567, + "learning_rate": 0.001, + "loss": 0.3642, + "step": 6402 + }, + { + "epoch": 0.1766728634151557, + "grad_norm": 0.0030231124255806208, + "learning_rate": 0.001, + "loss": 0.439, + "step": 6403 + }, + { + "epoch": 0.17670045561622008, + "grad_norm": 0.003226878121495247, + "learning_rate": 0.001, + "loss": 0.3718, + "step": 6404 + }, + { + "epoch": 0.17672804781728443, + "grad_norm": 0.003266223007813096, + "learning_rate": 0.001, + "loss": 0.3595, + "step": 6405 + }, + { + "epoch": 0.17675564001834881, + "grad_norm": 0.003785705426707864, + "learning_rate": 0.001, + "loss": 0.4091, + "step": 6406 + }, + { + "epoch": 0.1767832322194132, + "grad_norm": 0.0025163183454424143, + "learning_rate": 0.001, + "loss": 0.4141, + "step": 6407 + }, + { + "epoch": 0.17681082442047755, + "grad_norm": 0.002644954714924097, + "learning_rate": 0.001, + "loss": 0.4054, + "step": 6408 + }, + { + "epoch": 0.17683841662154193, + "grad_norm": 0.003993362188339233, + "learning_rate": 0.001, + "loss": 0.3773, + "step": 6409 + }, + { + "epoch": 0.17686600882260628, + "grad_norm": 0.0027130364906042814, + "learning_rate": 0.001, + "loss": 0.3985, + "step": 6410 + }, + { + "epoch": 0.17689360102367066, + "grad_norm": 0.005328681785613298, + "learning_rate": 0.001, + "loss": 0.3954, + "step": 6411 + }, + { + "epoch": 0.17692119322473504, + "grad_norm": 0.0027323930989950895, + "learning_rate": 0.001, + "loss": 0.4151, + "step": 6412 + }, + { + "epoch": 0.1769487854257994, + "grad_norm": 0.002667306223884225, + "learning_rate": 0.001, + "loss": 0.3945, + "step": 6413 + }, + { + "epoch": 0.17697637762686377, + "grad_norm": 0.0038074366748332977, + "learning_rate": 0.001, + "loss": 0.4259, + "step": 6414 + }, + { + "epoch": 0.17700396982792813, + "grad_norm": 0.002215777989476919, + "learning_rate": 0.001, + "loss": 0.4414, + "step": 6415 + }, + { + "epoch": 0.1770315620289925, + "grad_norm": 0.002434387104585767, + "learning_rate": 0.001, + "loss": 0.3959, + "step": 6416 + }, + { + "epoch": 0.1770591542300569, + "grad_norm": 0.0024976793210953474, + "learning_rate": 0.001, + "loss": 0.4113, + "step": 6417 + }, + { + "epoch": 0.17708674643112124, + "grad_norm": 0.0029137188103049994, + "learning_rate": 0.001, + "loss": 0.4001, + "step": 6418 + }, + { + "epoch": 0.17711433863218562, + "grad_norm": 0.002975389827042818, + "learning_rate": 0.001, + "loss": 0.3774, + "step": 6419 + }, + { + "epoch": 0.17714193083324997, + "grad_norm": 0.002931037684902549, + "learning_rate": 0.001, + "loss": 0.4205, + "step": 6420 + }, + { + "epoch": 0.17716952303431435, + "grad_norm": 0.004573407117277384, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 6421 + }, + { + "epoch": 0.17719711523537873, + "grad_norm": 0.003066561883315444, + "learning_rate": 0.001, + "loss": 0.4224, + "step": 6422 + }, + { + "epoch": 0.17722470743644309, + "grad_norm": 0.003729255637153983, + "learning_rate": 0.001, + "loss": 0.4228, + "step": 6423 + }, + { + "epoch": 0.17725229963750747, + "grad_norm": 0.0029258355498313904, + "learning_rate": 0.001, + "loss": 0.3862, + "step": 6424 + }, + { + "epoch": 0.17727989183857182, + "grad_norm": 0.0034123340155929327, + "learning_rate": 0.001, + "loss": 0.405, + "step": 6425 + }, + { + "epoch": 0.1773074840396362, + "grad_norm": 0.0038247809279710054, + "learning_rate": 0.001, + "loss": 0.4374, + "step": 6426 + }, + { + "epoch": 0.17733507624070058, + "grad_norm": 0.003323949873447418, + "learning_rate": 0.001, + "loss": 0.3942, + "step": 6427 + }, + { + "epoch": 0.17736266844176493, + "grad_norm": 0.01532609574496746, + "learning_rate": 0.001, + "loss": 0.4212, + "step": 6428 + }, + { + "epoch": 0.1773902606428293, + "grad_norm": 0.0025842315517365932, + "learning_rate": 0.001, + "loss": 0.4241, + "step": 6429 + }, + { + "epoch": 0.17741785284389366, + "grad_norm": 0.003932729829102755, + "learning_rate": 0.001, + "loss": 0.4104, + "step": 6430 + }, + { + "epoch": 0.17744544504495804, + "grad_norm": 0.00350461108610034, + "learning_rate": 0.001, + "loss": 0.3995, + "step": 6431 + }, + { + "epoch": 0.17747303724602242, + "grad_norm": 0.0030087281484156847, + "learning_rate": 0.001, + "loss": 0.4106, + "step": 6432 + }, + { + "epoch": 0.17750062944708678, + "grad_norm": 0.006132917013019323, + "learning_rate": 0.001, + "loss": 0.354, + "step": 6433 + }, + { + "epoch": 0.17752822164815116, + "grad_norm": 0.0028762738220393658, + "learning_rate": 0.001, + "loss": 0.4173, + "step": 6434 + }, + { + "epoch": 0.1775558138492155, + "grad_norm": 0.0023419486824423075, + "learning_rate": 0.001, + "loss": 0.376, + "step": 6435 + }, + { + "epoch": 0.1775834060502799, + "grad_norm": 0.0026978617534041405, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 6436 + }, + { + "epoch": 0.17761099825134427, + "grad_norm": 0.003220965852960944, + "learning_rate": 0.001, + "loss": 0.4197, + "step": 6437 + }, + { + "epoch": 0.17763859045240862, + "grad_norm": 0.013487125746905804, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 6438 + }, + { + "epoch": 0.177666182653473, + "grad_norm": 0.0024956711567938328, + "learning_rate": 0.001, + "loss": 0.3742, + "step": 6439 + }, + { + "epoch": 0.17769377485453736, + "grad_norm": 0.002173262881115079, + "learning_rate": 0.001, + "loss": 0.4185, + "step": 6440 + }, + { + "epoch": 0.17772136705560174, + "grad_norm": 0.0020064099226146936, + "learning_rate": 0.001, + "loss": 0.4062, + "step": 6441 + }, + { + "epoch": 0.17774895925666612, + "grad_norm": 0.002172301523387432, + "learning_rate": 0.001, + "loss": 0.4002, + "step": 6442 + }, + { + "epoch": 0.17777655145773047, + "grad_norm": 0.0030487473122775555, + "learning_rate": 0.001, + "loss": 0.4101, + "step": 6443 + }, + { + "epoch": 0.17780414365879485, + "grad_norm": 0.0025476363953202963, + "learning_rate": 0.001, + "loss": 0.4063, + "step": 6444 + }, + { + "epoch": 0.1778317358598592, + "grad_norm": 0.002622386207804084, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 6445 + }, + { + "epoch": 0.17785932806092358, + "grad_norm": 0.00240062247030437, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 6446 + }, + { + "epoch": 0.17788692026198794, + "grad_norm": 0.0036130244843661785, + "learning_rate": 0.001, + "loss": 0.4177, + "step": 6447 + }, + { + "epoch": 0.17791451246305232, + "grad_norm": 0.002018724335357547, + "learning_rate": 0.001, + "loss": 0.4125, + "step": 6448 + }, + { + "epoch": 0.1779421046641167, + "grad_norm": 0.0024812505580484867, + "learning_rate": 0.001, + "loss": 0.3876, + "step": 6449 + }, + { + "epoch": 0.17796969686518105, + "grad_norm": 0.002273386111482978, + "learning_rate": 0.001, + "loss": 0.3907, + "step": 6450 + }, + { + "epoch": 0.17799728906624543, + "grad_norm": 0.0027766439598053694, + "learning_rate": 0.001, + "loss": 0.3812, + "step": 6451 + }, + { + "epoch": 0.17802488126730978, + "grad_norm": 0.003901482792571187, + "learning_rate": 0.001, + "loss": 0.3847, + "step": 6452 + }, + { + "epoch": 0.17805247346837416, + "grad_norm": 0.003208085196092725, + "learning_rate": 0.001, + "loss": 0.4277, + "step": 6453 + }, + { + "epoch": 0.17808006566943854, + "grad_norm": 0.0029840737115591764, + "learning_rate": 0.001, + "loss": 0.4254, + "step": 6454 + }, + { + "epoch": 0.1781076578705029, + "grad_norm": 0.00228834873996675, + "learning_rate": 0.001, + "loss": 0.3943, + "step": 6455 + }, + { + "epoch": 0.17813525007156727, + "grad_norm": 0.0035069435834884644, + "learning_rate": 0.001, + "loss": 0.3772, + "step": 6456 + }, + { + "epoch": 0.17816284227263163, + "grad_norm": 0.0046169045381248, + "learning_rate": 0.001, + "loss": 0.3904, + "step": 6457 + }, + { + "epoch": 0.178190434473696, + "grad_norm": 0.002904290799051523, + "learning_rate": 0.001, + "loss": 0.4014, + "step": 6458 + }, + { + "epoch": 0.1782180266747604, + "grad_norm": 0.004634745419025421, + "learning_rate": 0.001, + "loss": 0.3864, + "step": 6459 + }, + { + "epoch": 0.17824561887582474, + "grad_norm": 0.004808775614947081, + "learning_rate": 0.001, + "loss": 0.3519, + "step": 6460 + }, + { + "epoch": 0.17827321107688912, + "grad_norm": 0.005992877297103405, + "learning_rate": 0.001, + "loss": 0.3487, + "step": 6461 + }, + { + "epoch": 0.17830080327795347, + "grad_norm": 0.003151120152324438, + "learning_rate": 0.001, + "loss": 0.3892, + "step": 6462 + }, + { + "epoch": 0.17832839547901785, + "grad_norm": 0.0020630250219255686, + "learning_rate": 0.001, + "loss": 0.4114, + "step": 6463 + }, + { + "epoch": 0.17835598768008223, + "grad_norm": 0.0031087875831872225, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 6464 + }, + { + "epoch": 0.1783835798811466, + "grad_norm": 0.001903343596495688, + "learning_rate": 0.001, + "loss": 0.4283, + "step": 6465 + }, + { + "epoch": 0.17841117208221097, + "grad_norm": 0.0029442012310028076, + "learning_rate": 0.001, + "loss": 0.4059, + "step": 6466 + }, + { + "epoch": 0.17843876428327532, + "grad_norm": 0.003048170590773225, + "learning_rate": 0.001, + "loss": 0.3653, + "step": 6467 + }, + { + "epoch": 0.1784663564843397, + "grad_norm": 0.002289197174832225, + "learning_rate": 0.001, + "loss": 0.4146, + "step": 6468 + }, + { + "epoch": 0.17849394868540408, + "grad_norm": 0.001974297221750021, + "learning_rate": 0.001, + "loss": 0.4418, + "step": 6469 + }, + { + "epoch": 0.17852154088646843, + "grad_norm": 0.0037421276792883873, + "learning_rate": 0.001, + "loss": 0.3631, + "step": 6470 + }, + { + "epoch": 0.1785491330875328, + "grad_norm": 0.0027026592288166285, + "learning_rate": 0.001, + "loss": 0.4292, + "step": 6471 + }, + { + "epoch": 0.17857672528859717, + "grad_norm": 0.0024960192386060953, + "learning_rate": 0.001, + "loss": 0.4296, + "step": 6472 + }, + { + "epoch": 0.17860431748966155, + "grad_norm": 0.0029041236266493797, + "learning_rate": 0.001, + "loss": 0.3639, + "step": 6473 + }, + { + "epoch": 0.17863190969072593, + "grad_norm": 0.003906071186065674, + "learning_rate": 0.001, + "loss": 0.4274, + "step": 6474 + }, + { + "epoch": 0.17865950189179028, + "grad_norm": 0.002730944426730275, + "learning_rate": 0.001, + "loss": 0.3775, + "step": 6475 + }, + { + "epoch": 0.17868709409285466, + "grad_norm": 0.002463391749188304, + "learning_rate": 0.001, + "loss": 0.4227, + "step": 6476 + }, + { + "epoch": 0.178714686293919, + "grad_norm": 0.003006384475156665, + "learning_rate": 0.001, + "loss": 0.3643, + "step": 6477 + }, + { + "epoch": 0.1787422784949834, + "grad_norm": 0.002436541486531496, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 6478 + }, + { + "epoch": 0.17876987069604777, + "grad_norm": 0.007043438032269478, + "learning_rate": 0.001, + "loss": 0.42, + "step": 6479 + }, + { + "epoch": 0.17879746289711212, + "grad_norm": 0.0045332033187150955, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 6480 + }, + { + "epoch": 0.1788250550981765, + "grad_norm": 0.006813266780227423, + "learning_rate": 0.001, + "loss": 0.3835, + "step": 6481 + }, + { + "epoch": 0.17885264729924086, + "grad_norm": 0.003106794785708189, + "learning_rate": 0.001, + "loss": 0.4195, + "step": 6482 + }, + { + "epoch": 0.17888023950030524, + "grad_norm": 0.003665417432785034, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 6483 + }, + { + "epoch": 0.17890783170136962, + "grad_norm": 0.002157231792807579, + "learning_rate": 0.001, + "loss": 0.4128, + "step": 6484 + }, + { + "epoch": 0.17893542390243397, + "grad_norm": 0.0032816650345921516, + "learning_rate": 0.001, + "loss": 0.3797, + "step": 6485 + }, + { + "epoch": 0.17896301610349835, + "grad_norm": 0.003755836049094796, + "learning_rate": 0.001, + "loss": 0.3927, + "step": 6486 + }, + { + "epoch": 0.1789906083045627, + "grad_norm": 0.0028953254222869873, + "learning_rate": 0.001, + "loss": 0.3926, + "step": 6487 + }, + { + "epoch": 0.17901820050562708, + "grad_norm": 0.004839729517698288, + "learning_rate": 0.001, + "loss": 0.3948, + "step": 6488 + }, + { + "epoch": 0.17904579270669146, + "grad_norm": 0.003079955466091633, + "learning_rate": 0.001, + "loss": 0.4396, + "step": 6489 + }, + { + "epoch": 0.17907338490775582, + "grad_norm": 0.0024841863196343184, + "learning_rate": 0.001, + "loss": 0.3973, + "step": 6490 + }, + { + "epoch": 0.1791009771088202, + "grad_norm": 0.0028867730870842934, + "learning_rate": 0.001, + "loss": 0.3779, + "step": 6491 + }, + { + "epoch": 0.17912856930988455, + "grad_norm": 0.0027325463015586138, + "learning_rate": 0.001, + "loss": 0.4084, + "step": 6492 + }, + { + "epoch": 0.17915616151094893, + "grad_norm": 0.006061290390789509, + "learning_rate": 0.001, + "loss": 0.4036, + "step": 6493 + }, + { + "epoch": 0.1791837537120133, + "grad_norm": 0.002831445774063468, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 6494 + }, + { + "epoch": 0.17921134591307766, + "grad_norm": 0.004074187949299812, + "learning_rate": 0.001, + "loss": 0.3793, + "step": 6495 + }, + { + "epoch": 0.17923893811414204, + "grad_norm": 0.0037040761671960354, + "learning_rate": 0.001, + "loss": 0.3831, + "step": 6496 + }, + { + "epoch": 0.1792665303152064, + "grad_norm": 0.0032522703986614943, + "learning_rate": 0.001, + "loss": 0.4091, + "step": 6497 + }, + { + "epoch": 0.17929412251627078, + "grad_norm": 0.003379018511623144, + "learning_rate": 0.001, + "loss": 0.3869, + "step": 6498 + }, + { + "epoch": 0.17932171471733516, + "grad_norm": 0.004454959649592638, + "learning_rate": 0.001, + "loss": 0.3864, + "step": 6499 + }, + { + "epoch": 0.1793493069183995, + "grad_norm": 0.006603573448956013, + "learning_rate": 0.001, + "loss": 0.378, + "step": 6500 + }, + { + "epoch": 0.1793493069183995, + "eval_runtime": 25.1324, + "eval_samples_per_second": 1.273, + "eval_steps_per_second": 0.159, + "step": 6500 + }, + { + "epoch": 0.1793768991194639, + "grad_norm": 0.002346447203308344, + "learning_rate": 0.001, + "loss": 0.4272, + "step": 6501 + }, + { + "epoch": 0.17940449132052824, + "grad_norm": 0.004839559551328421, + "learning_rate": 0.001, + "loss": 0.3956, + "step": 6502 + }, + { + "epoch": 0.17943208352159262, + "grad_norm": 0.0033710715360939503, + "learning_rate": 0.001, + "loss": 0.4238, + "step": 6503 + }, + { + "epoch": 0.179459675722657, + "grad_norm": 0.003859972581267357, + "learning_rate": 0.001, + "loss": 0.4184, + "step": 6504 + }, + { + "epoch": 0.17948726792372136, + "grad_norm": 0.004193128552287817, + "learning_rate": 0.001, + "loss": 0.443, + "step": 6505 + }, + { + "epoch": 0.17951486012478574, + "grad_norm": 0.0029107420705258846, + "learning_rate": 0.001, + "loss": 0.4284, + "step": 6506 + }, + { + "epoch": 0.1795424523258501, + "grad_norm": 0.0028658658266067505, + "learning_rate": 0.001, + "loss": 0.4214, + "step": 6507 + }, + { + "epoch": 0.17957004452691447, + "grad_norm": 0.0021101958118379116, + "learning_rate": 0.001, + "loss": 0.4653, + "step": 6508 + }, + { + "epoch": 0.17959763672797885, + "grad_norm": 0.0034610512666404247, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 6509 + }, + { + "epoch": 0.1796252289290432, + "grad_norm": 0.004026256036013365, + "learning_rate": 0.001, + "loss": 0.3748, + "step": 6510 + }, + { + "epoch": 0.17965282113010758, + "grad_norm": 0.004078659228980541, + "learning_rate": 0.001, + "loss": 0.3741, + "step": 6511 + }, + { + "epoch": 0.17968041333117193, + "grad_norm": 0.006540537811815739, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 6512 + }, + { + "epoch": 0.17970800553223631, + "grad_norm": 0.002590343588963151, + "learning_rate": 0.001, + "loss": 0.3711, + "step": 6513 + }, + { + "epoch": 0.1797355977333007, + "grad_norm": 0.005247985944151878, + "learning_rate": 0.001, + "loss": 0.4139, + "step": 6514 + }, + { + "epoch": 0.17976318993436505, + "grad_norm": 0.0027244454249739647, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 6515 + }, + { + "epoch": 0.17979078213542943, + "grad_norm": 0.002999567426741123, + "learning_rate": 0.001, + "loss": 0.3676, + "step": 6516 + }, + { + "epoch": 0.17981837433649378, + "grad_norm": 0.002379565965384245, + "learning_rate": 0.001, + "loss": 0.3766, + "step": 6517 + }, + { + "epoch": 0.17984596653755816, + "grad_norm": 0.0031703212298452854, + "learning_rate": 0.001, + "loss": 0.4199, + "step": 6518 + }, + { + "epoch": 0.17987355873862254, + "grad_norm": 0.005782125052064657, + "learning_rate": 0.001, + "loss": 0.4311, + "step": 6519 + }, + { + "epoch": 0.1799011509396869, + "grad_norm": 0.002240552334114909, + "learning_rate": 0.001, + "loss": 0.3879, + "step": 6520 + }, + { + "epoch": 0.17992874314075127, + "grad_norm": 0.0027389421593397856, + "learning_rate": 0.001, + "loss": 0.3973, + "step": 6521 + }, + { + "epoch": 0.17995633534181563, + "grad_norm": 0.0032193311490118504, + "learning_rate": 0.001, + "loss": 0.389, + "step": 6522 + }, + { + "epoch": 0.17998392754288, + "grad_norm": 0.002220897702500224, + "learning_rate": 0.001, + "loss": 0.3952, + "step": 6523 + }, + { + "epoch": 0.1800115197439444, + "grad_norm": 0.0031423268374055624, + "learning_rate": 0.001, + "loss": 0.3548, + "step": 6524 + }, + { + "epoch": 0.18003911194500874, + "grad_norm": 0.0029932213947176933, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 6525 + }, + { + "epoch": 0.18006670414607312, + "grad_norm": 0.0042558046989142895, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 6526 + }, + { + "epoch": 0.18009429634713747, + "grad_norm": 0.006126770284026861, + "learning_rate": 0.001, + "loss": 0.392, + "step": 6527 + }, + { + "epoch": 0.18012188854820185, + "grad_norm": 0.0038677749689668417, + "learning_rate": 0.001, + "loss": 0.3591, + "step": 6528 + }, + { + "epoch": 0.18014948074926623, + "grad_norm": 0.004701892379671335, + "learning_rate": 0.001, + "loss": 0.4048, + "step": 6529 + }, + { + "epoch": 0.18017707295033059, + "grad_norm": 0.002978444332256913, + "learning_rate": 0.001, + "loss": 0.3636, + "step": 6530 + }, + { + "epoch": 0.18020466515139497, + "grad_norm": 0.003622008254751563, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 6531 + }, + { + "epoch": 0.18023225735245932, + "grad_norm": 0.004150434397161007, + "learning_rate": 0.001, + "loss": 0.3921, + "step": 6532 + }, + { + "epoch": 0.1802598495535237, + "grad_norm": 0.006446919869631529, + "learning_rate": 0.001, + "loss": 0.3843, + "step": 6533 + }, + { + "epoch": 0.18028744175458808, + "grad_norm": 0.0027283939998596907, + "learning_rate": 0.001, + "loss": 0.3764, + "step": 6534 + }, + { + "epoch": 0.18031503395565243, + "grad_norm": 0.0035466288682073355, + "learning_rate": 0.001, + "loss": 0.4308, + "step": 6535 + }, + { + "epoch": 0.1803426261567168, + "grad_norm": 0.0033509554341435432, + "learning_rate": 0.001, + "loss": 0.3809, + "step": 6536 + }, + { + "epoch": 0.18037021835778116, + "grad_norm": 0.0052613504230976105, + "learning_rate": 0.001, + "loss": 0.3677, + "step": 6537 + }, + { + "epoch": 0.18039781055884554, + "grad_norm": 0.0029608176555484533, + "learning_rate": 0.001, + "loss": 0.4364, + "step": 6538 + }, + { + "epoch": 0.18042540275990993, + "grad_norm": 0.002105949679389596, + "learning_rate": 0.001, + "loss": 0.3944, + "step": 6539 + }, + { + "epoch": 0.18045299496097428, + "grad_norm": 0.002690407680347562, + "learning_rate": 0.001, + "loss": 0.3894, + "step": 6540 + }, + { + "epoch": 0.18048058716203866, + "grad_norm": 0.0024390683975070715, + "learning_rate": 0.001, + "loss": 0.4373, + "step": 6541 + }, + { + "epoch": 0.180508179363103, + "grad_norm": 0.0034981141798198223, + "learning_rate": 0.001, + "loss": 0.3952, + "step": 6542 + }, + { + "epoch": 0.1805357715641674, + "grad_norm": 0.003039612900465727, + "learning_rate": 0.001, + "loss": 0.3999, + "step": 6543 + }, + { + "epoch": 0.18056336376523174, + "grad_norm": 0.00274271029047668, + "learning_rate": 0.001, + "loss": 0.3842, + "step": 6544 + }, + { + "epoch": 0.18059095596629612, + "grad_norm": 0.002432740991935134, + "learning_rate": 0.001, + "loss": 0.4024, + "step": 6545 + }, + { + "epoch": 0.1806185481673605, + "grad_norm": 0.003491653362289071, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 6546 + }, + { + "epoch": 0.18064614036842486, + "grad_norm": 0.0032173325307667255, + "learning_rate": 0.001, + "loss": 0.4048, + "step": 6547 + }, + { + "epoch": 0.18067373256948924, + "grad_norm": 0.0029747970402240753, + "learning_rate": 0.001, + "loss": 0.3918, + "step": 6548 + }, + { + "epoch": 0.1807013247705536, + "grad_norm": 0.007667601108551025, + "learning_rate": 0.001, + "loss": 0.3944, + "step": 6549 + }, + { + "epoch": 0.18072891697161797, + "grad_norm": 0.003929935861378908, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 6550 + }, + { + "epoch": 0.18075650917268235, + "grad_norm": 0.004057936370372772, + "learning_rate": 0.001, + "loss": 0.4289, + "step": 6551 + }, + { + "epoch": 0.1807841013737467, + "grad_norm": 0.003355666296556592, + "learning_rate": 0.001, + "loss": 0.3934, + "step": 6552 + }, + { + "epoch": 0.18081169357481108, + "grad_norm": 0.0018296247581019998, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 6553 + }, + { + "epoch": 0.18083928577587544, + "grad_norm": 0.0028984597884118557, + "learning_rate": 0.001, + "loss": 0.4098, + "step": 6554 + }, + { + "epoch": 0.18086687797693982, + "grad_norm": 0.004249675665050745, + "learning_rate": 0.001, + "loss": 0.3854, + "step": 6555 + }, + { + "epoch": 0.1808944701780042, + "grad_norm": 0.002462320262566209, + "learning_rate": 0.001, + "loss": 0.4299, + "step": 6556 + }, + { + "epoch": 0.18092206237906855, + "grad_norm": 0.003266765736043453, + "learning_rate": 0.001, + "loss": 0.3725, + "step": 6557 + }, + { + "epoch": 0.18094965458013293, + "grad_norm": 0.002418224699795246, + "learning_rate": 0.001, + "loss": 0.4059, + "step": 6558 + }, + { + "epoch": 0.18097724678119728, + "grad_norm": 0.002268604701384902, + "learning_rate": 0.001, + "loss": 0.4535, + "step": 6559 + }, + { + "epoch": 0.18100483898226166, + "grad_norm": 0.0030840514227747917, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 6560 + }, + { + "epoch": 0.18103243118332604, + "grad_norm": 0.003670538542792201, + "learning_rate": 0.001, + "loss": 0.3636, + "step": 6561 + }, + { + "epoch": 0.1810600233843904, + "grad_norm": 0.003579066600650549, + "learning_rate": 0.001, + "loss": 0.3943, + "step": 6562 + }, + { + "epoch": 0.18108761558545478, + "grad_norm": 0.008917649276554585, + "learning_rate": 0.001, + "loss": 0.4046, + "step": 6563 + }, + { + "epoch": 0.18111520778651913, + "grad_norm": 0.004259804729372263, + "learning_rate": 0.001, + "loss": 0.4289, + "step": 6564 + }, + { + "epoch": 0.1811427999875835, + "grad_norm": 0.0026127335149794817, + "learning_rate": 0.001, + "loss": 0.4319, + "step": 6565 + }, + { + "epoch": 0.1811703921886479, + "grad_norm": 0.00313868117518723, + "learning_rate": 0.001, + "loss": 0.3858, + "step": 6566 + }, + { + "epoch": 0.18119798438971224, + "grad_norm": 0.002577871782705188, + "learning_rate": 0.001, + "loss": 0.4148, + "step": 6567 + }, + { + "epoch": 0.18122557659077662, + "grad_norm": 0.0029946148861199617, + "learning_rate": 0.001, + "loss": 0.3963, + "step": 6568 + }, + { + "epoch": 0.18125316879184097, + "grad_norm": 0.0025365573819726706, + "learning_rate": 0.001, + "loss": 0.3689, + "step": 6569 + }, + { + "epoch": 0.18128076099290535, + "grad_norm": 0.002418767660856247, + "learning_rate": 0.001, + "loss": 0.3896, + "step": 6570 + }, + { + "epoch": 0.18130835319396973, + "grad_norm": 0.003545231418684125, + "learning_rate": 0.001, + "loss": 0.3633, + "step": 6571 + }, + { + "epoch": 0.1813359453950341, + "grad_norm": 0.0031614559702575207, + "learning_rate": 0.001, + "loss": 0.4208, + "step": 6572 + }, + { + "epoch": 0.18136353759609847, + "grad_norm": 0.004500371403992176, + "learning_rate": 0.001, + "loss": 0.3797, + "step": 6573 + }, + { + "epoch": 0.18139112979716282, + "grad_norm": 0.004762687720358372, + "learning_rate": 0.001, + "loss": 0.3754, + "step": 6574 + }, + { + "epoch": 0.1814187219982272, + "grad_norm": 0.0032746128272265196, + "learning_rate": 0.001, + "loss": 0.3721, + "step": 6575 + }, + { + "epoch": 0.18144631419929158, + "grad_norm": 0.002015149686485529, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 6576 + }, + { + "epoch": 0.18147390640035593, + "grad_norm": 0.003133730962872505, + "learning_rate": 0.001, + "loss": 0.3765, + "step": 6577 + }, + { + "epoch": 0.1815014986014203, + "grad_norm": 0.002634577453136444, + "learning_rate": 0.001, + "loss": 0.4255, + "step": 6578 + }, + { + "epoch": 0.18152909080248467, + "grad_norm": 0.002629198832437396, + "learning_rate": 0.001, + "loss": 0.3698, + "step": 6579 + }, + { + "epoch": 0.18155668300354905, + "grad_norm": 0.0025706025771796703, + "learning_rate": 0.001, + "loss": 0.4115, + "step": 6580 + }, + { + "epoch": 0.18158427520461343, + "grad_norm": 0.0024760281667113304, + "learning_rate": 0.001, + "loss": 0.3772, + "step": 6581 + }, + { + "epoch": 0.18161186740567778, + "grad_norm": 0.002749704523012042, + "learning_rate": 0.001, + "loss": 0.441, + "step": 6582 + }, + { + "epoch": 0.18163945960674216, + "grad_norm": 0.004137085285037756, + "learning_rate": 0.001, + "loss": 0.401, + "step": 6583 + }, + { + "epoch": 0.1816670518078065, + "grad_norm": 0.003452888922765851, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 6584 + }, + { + "epoch": 0.1816946440088709, + "grad_norm": 0.00280118640512228, + "learning_rate": 0.001, + "loss": 0.4069, + "step": 6585 + }, + { + "epoch": 0.18172223620993527, + "grad_norm": 0.002511856146156788, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 6586 + }, + { + "epoch": 0.18174982841099963, + "grad_norm": 0.003266239771619439, + "learning_rate": 0.001, + "loss": 0.3831, + "step": 6587 + }, + { + "epoch": 0.181777420612064, + "grad_norm": 0.004074828699231148, + "learning_rate": 0.001, + "loss": 0.4011, + "step": 6588 + }, + { + "epoch": 0.18180501281312836, + "grad_norm": 0.0032096183858811855, + "learning_rate": 0.001, + "loss": 0.3851, + "step": 6589 + }, + { + "epoch": 0.18183260501419274, + "grad_norm": 0.004038103390485048, + "learning_rate": 0.001, + "loss": 0.3757, + "step": 6590 + }, + { + "epoch": 0.18186019721525712, + "grad_norm": 0.002833410631865263, + "learning_rate": 0.001, + "loss": 0.412, + "step": 6591 + }, + { + "epoch": 0.18188778941632147, + "grad_norm": 0.0042181131429970264, + "learning_rate": 0.001, + "loss": 0.4179, + "step": 6592 + }, + { + "epoch": 0.18191538161738585, + "grad_norm": 0.0023462646640837193, + "learning_rate": 0.001, + "loss": 0.4394, + "step": 6593 + }, + { + "epoch": 0.1819429738184502, + "grad_norm": 0.00263848970644176, + "learning_rate": 0.001, + "loss": 0.4032, + "step": 6594 + }, + { + "epoch": 0.18197056601951458, + "grad_norm": 0.0036028139293193817, + "learning_rate": 0.001, + "loss": 0.4056, + "step": 6595 + }, + { + "epoch": 0.18199815822057896, + "grad_norm": 0.0028711268678307533, + "learning_rate": 0.001, + "loss": 0.3963, + "step": 6596 + }, + { + "epoch": 0.18202575042164332, + "grad_norm": 0.0030903166625648737, + "learning_rate": 0.001, + "loss": 0.4274, + "step": 6597 + }, + { + "epoch": 0.1820533426227077, + "grad_norm": 0.0024692462757229805, + "learning_rate": 0.001, + "loss": 0.3792, + "step": 6598 + }, + { + "epoch": 0.18208093482377205, + "grad_norm": 0.0031899542082101107, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 6599 + }, + { + "epoch": 0.18210852702483643, + "grad_norm": 0.003478053957223892, + "learning_rate": 0.001, + "loss": 0.399, + "step": 6600 + }, + { + "epoch": 0.1821361192259008, + "grad_norm": 0.00227729300968349, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 6601 + }, + { + "epoch": 0.18216371142696516, + "grad_norm": 0.002498425543308258, + "learning_rate": 0.001, + "loss": 0.3593, + "step": 6602 + }, + { + "epoch": 0.18219130362802954, + "grad_norm": 0.0035188214387744665, + "learning_rate": 0.001, + "loss": 0.3785, + "step": 6603 + }, + { + "epoch": 0.1822188958290939, + "grad_norm": 0.0029362791683524847, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 6604 + }, + { + "epoch": 0.18224648803015828, + "grad_norm": 0.0034354175440967083, + "learning_rate": 0.001, + "loss": 0.3582, + "step": 6605 + }, + { + "epoch": 0.18227408023122266, + "grad_norm": 0.0026759831234812737, + "learning_rate": 0.001, + "loss": 0.3607, + "step": 6606 + }, + { + "epoch": 0.182301672432287, + "grad_norm": 0.002816120395436883, + "learning_rate": 0.001, + "loss": 0.3837, + "step": 6607 + }, + { + "epoch": 0.1823292646333514, + "grad_norm": 0.0035022348165512085, + "learning_rate": 0.001, + "loss": 0.4098, + "step": 6608 + }, + { + "epoch": 0.18235685683441574, + "grad_norm": 0.0034114073496311903, + "learning_rate": 0.001, + "loss": 0.3654, + "step": 6609 + }, + { + "epoch": 0.18238444903548012, + "grad_norm": 0.0029557389207184315, + "learning_rate": 0.001, + "loss": 0.3873, + "step": 6610 + }, + { + "epoch": 0.1824120412365445, + "grad_norm": 0.0028255372308194637, + "learning_rate": 0.001, + "loss": 0.4066, + "step": 6611 + }, + { + "epoch": 0.18243963343760886, + "grad_norm": 0.003345831297338009, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 6612 + }, + { + "epoch": 0.18246722563867324, + "grad_norm": 0.0033113721292465925, + "learning_rate": 0.001, + "loss": 0.3894, + "step": 6613 + }, + { + "epoch": 0.1824948178397376, + "grad_norm": 0.0022858104202896357, + "learning_rate": 0.001, + "loss": 0.4601, + "step": 6614 + }, + { + "epoch": 0.18252241004080197, + "grad_norm": 0.0037697155494242907, + "learning_rate": 0.001, + "loss": 0.3842, + "step": 6615 + }, + { + "epoch": 0.18255000224186635, + "grad_norm": 0.003321718657389283, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 6616 + }, + { + "epoch": 0.1825775944429307, + "grad_norm": 0.00823116209357977, + "learning_rate": 0.001, + "loss": 0.408, + "step": 6617 + }, + { + "epoch": 0.18260518664399508, + "grad_norm": 0.009032117202877998, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 6618 + }, + { + "epoch": 0.18263277884505943, + "grad_norm": 0.012609437108039856, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 6619 + }, + { + "epoch": 0.18266037104612382, + "grad_norm": 0.0026686121709644794, + "learning_rate": 0.001, + "loss": 0.428, + "step": 6620 + }, + { + "epoch": 0.1826879632471882, + "grad_norm": 0.004215455148369074, + "learning_rate": 0.001, + "loss": 0.412, + "step": 6621 + }, + { + "epoch": 0.18271555544825255, + "grad_norm": 0.004420561250299215, + "learning_rate": 0.001, + "loss": 0.3867, + "step": 6622 + }, + { + "epoch": 0.18274314764931693, + "grad_norm": 0.004728737287223339, + "learning_rate": 0.001, + "loss": 0.39, + "step": 6623 + }, + { + "epoch": 0.18277073985038128, + "grad_norm": 0.0077790976502001286, + "learning_rate": 0.001, + "loss": 0.4597, + "step": 6624 + }, + { + "epoch": 0.18279833205144566, + "grad_norm": 0.005704334005713463, + "learning_rate": 0.001, + "loss": 0.38, + "step": 6625 + }, + { + "epoch": 0.18282592425251004, + "grad_norm": 0.0057289209216833115, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 6626 + }, + { + "epoch": 0.1828535164535744, + "grad_norm": 0.0031269195023924112, + "learning_rate": 0.001, + "loss": 0.3863, + "step": 6627 + }, + { + "epoch": 0.18288110865463877, + "grad_norm": 0.005130524281412363, + "learning_rate": 0.001, + "loss": 0.3633, + "step": 6628 + }, + { + "epoch": 0.18290870085570313, + "grad_norm": 0.003109897952526808, + "learning_rate": 0.001, + "loss": 0.4098, + "step": 6629 + }, + { + "epoch": 0.1829362930567675, + "grad_norm": 0.003781312145292759, + "learning_rate": 0.001, + "loss": 0.4178, + "step": 6630 + }, + { + "epoch": 0.1829638852578319, + "grad_norm": 0.002132704248651862, + "learning_rate": 0.001, + "loss": 0.4127, + "step": 6631 + }, + { + "epoch": 0.18299147745889624, + "grad_norm": 0.0032692537643015385, + "learning_rate": 0.001, + "loss": 0.4029, + "step": 6632 + }, + { + "epoch": 0.18301906965996062, + "grad_norm": 0.004832074046134949, + "learning_rate": 0.001, + "loss": 0.4062, + "step": 6633 + }, + { + "epoch": 0.18304666186102497, + "grad_norm": 0.002927541034296155, + "learning_rate": 0.001, + "loss": 0.4106, + "step": 6634 + }, + { + "epoch": 0.18307425406208935, + "grad_norm": 0.0030651381239295006, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 6635 + }, + { + "epoch": 0.1831018462631537, + "grad_norm": 0.005420647095888853, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 6636 + }, + { + "epoch": 0.1831294384642181, + "grad_norm": 0.0034076806623488665, + "learning_rate": 0.001, + "loss": 0.4476, + "step": 6637 + }, + { + "epoch": 0.18315703066528247, + "grad_norm": 0.0023421344812959433, + "learning_rate": 0.001, + "loss": 0.4285, + "step": 6638 + }, + { + "epoch": 0.18318462286634682, + "grad_norm": 0.0025167351122945547, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 6639 + }, + { + "epoch": 0.1832122150674112, + "grad_norm": 0.0035126416478306055, + "learning_rate": 0.001, + "loss": 0.3813, + "step": 6640 + }, + { + "epoch": 0.18323980726847555, + "grad_norm": 0.002516387961804867, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 6641 + }, + { + "epoch": 0.18326739946953993, + "grad_norm": 0.0032357322052121162, + "learning_rate": 0.001, + "loss": 0.3613, + "step": 6642 + }, + { + "epoch": 0.1832949916706043, + "grad_norm": 0.0042640650644898415, + "learning_rate": 0.001, + "loss": 0.4065, + "step": 6643 + }, + { + "epoch": 0.18332258387166867, + "grad_norm": 0.003650575876235962, + "learning_rate": 0.001, + "loss": 0.3848, + "step": 6644 + }, + { + "epoch": 0.18335017607273305, + "grad_norm": 0.00383795821107924, + "learning_rate": 0.001, + "loss": 0.4154, + "step": 6645 + }, + { + "epoch": 0.1833777682737974, + "grad_norm": 0.003271985799074173, + "learning_rate": 0.001, + "loss": 0.4281, + "step": 6646 + }, + { + "epoch": 0.18340536047486178, + "grad_norm": 0.004597705788910389, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 6647 + }, + { + "epoch": 0.18343295267592616, + "grad_norm": 0.0030701288487762213, + "learning_rate": 0.001, + "loss": 0.394, + "step": 6648 + }, + { + "epoch": 0.1834605448769905, + "grad_norm": 0.002716483548283577, + "learning_rate": 0.001, + "loss": 0.3562, + "step": 6649 + }, + { + "epoch": 0.1834881370780549, + "grad_norm": 0.00313906604424119, + "learning_rate": 0.001, + "loss": 0.4024, + "step": 6650 + }, + { + "epoch": 0.18351572927911924, + "grad_norm": 0.002156095113605261, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 6651 + }, + { + "epoch": 0.18354332148018362, + "grad_norm": 0.005598701070994139, + "learning_rate": 0.001, + "loss": 0.394, + "step": 6652 + }, + { + "epoch": 0.183570913681248, + "grad_norm": 0.002826511859893799, + "learning_rate": 0.001, + "loss": 0.4202, + "step": 6653 + }, + { + "epoch": 0.18359850588231236, + "grad_norm": 0.004248370416462421, + "learning_rate": 0.001, + "loss": 0.4006, + "step": 6654 + }, + { + "epoch": 0.18362609808337674, + "grad_norm": 0.002483610762283206, + "learning_rate": 0.001, + "loss": 0.3963, + "step": 6655 + }, + { + "epoch": 0.1836536902844411, + "grad_norm": 0.003579988144338131, + "learning_rate": 0.001, + "loss": 0.4175, + "step": 6656 + }, + { + "epoch": 0.18368128248550547, + "grad_norm": 0.002827717922627926, + "learning_rate": 0.001, + "loss": 0.393, + "step": 6657 + }, + { + "epoch": 0.18370887468656985, + "grad_norm": 0.001899820170365274, + "learning_rate": 0.001, + "loss": 0.4096, + "step": 6658 + }, + { + "epoch": 0.1837364668876342, + "grad_norm": 0.0031157906632870436, + "learning_rate": 0.001, + "loss": 0.3807, + "step": 6659 + }, + { + "epoch": 0.18376405908869858, + "grad_norm": 0.0026786159723997116, + "learning_rate": 0.001, + "loss": 0.3914, + "step": 6660 + }, + { + "epoch": 0.18379165128976294, + "grad_norm": 0.003711652010679245, + "learning_rate": 0.001, + "loss": 0.3857, + "step": 6661 + }, + { + "epoch": 0.18381924349082732, + "grad_norm": 0.009293664246797562, + "learning_rate": 0.001, + "loss": 0.3962, + "step": 6662 + }, + { + "epoch": 0.1838468356918917, + "grad_norm": 0.005516475066542625, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 6663 + }, + { + "epoch": 0.18387442789295605, + "grad_norm": 0.003258306998759508, + "learning_rate": 0.001, + "loss": 0.3742, + "step": 6664 + }, + { + "epoch": 0.18390202009402043, + "grad_norm": 0.0026423600502312183, + "learning_rate": 0.001, + "loss": 0.4059, + "step": 6665 + }, + { + "epoch": 0.18392961229508478, + "grad_norm": 0.0038184835575520992, + "learning_rate": 0.001, + "loss": 0.3844, + "step": 6666 + }, + { + "epoch": 0.18395720449614916, + "grad_norm": 0.007692532613873482, + "learning_rate": 0.001, + "loss": 0.3629, + "step": 6667 + }, + { + "epoch": 0.18398479669721354, + "grad_norm": 0.003459090832620859, + "learning_rate": 0.001, + "loss": 0.372, + "step": 6668 + }, + { + "epoch": 0.1840123888982779, + "grad_norm": 0.002234878484159708, + "learning_rate": 0.001, + "loss": 0.3923, + "step": 6669 + }, + { + "epoch": 0.18403998109934228, + "grad_norm": 0.004345033783465624, + "learning_rate": 0.001, + "loss": 0.4232, + "step": 6670 + }, + { + "epoch": 0.18406757330040663, + "grad_norm": 0.0024329267907887697, + "learning_rate": 0.001, + "loss": 0.4063, + "step": 6671 + }, + { + "epoch": 0.184095165501471, + "grad_norm": 0.004201768897473812, + "learning_rate": 0.001, + "loss": 0.3964, + "step": 6672 + }, + { + "epoch": 0.1841227577025354, + "grad_norm": 0.0029748522210866213, + "learning_rate": 0.001, + "loss": 0.3698, + "step": 6673 + }, + { + "epoch": 0.18415034990359974, + "grad_norm": 0.0033948393538594246, + "learning_rate": 0.001, + "loss": 0.4151, + "step": 6674 + }, + { + "epoch": 0.18417794210466412, + "grad_norm": 0.004755482543259859, + "learning_rate": 0.001, + "loss": 0.3606, + "step": 6675 + }, + { + "epoch": 0.18420553430572847, + "grad_norm": 0.004203738644719124, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 6676 + }, + { + "epoch": 0.18423312650679285, + "grad_norm": 0.0028434142004698515, + "learning_rate": 0.001, + "loss": 0.4026, + "step": 6677 + }, + { + "epoch": 0.18426071870785724, + "grad_norm": 0.0036475297529250383, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 6678 + }, + { + "epoch": 0.1842883109089216, + "grad_norm": 0.0032815190497785807, + "learning_rate": 0.001, + "loss": 0.3731, + "step": 6679 + }, + { + "epoch": 0.18431590310998597, + "grad_norm": 0.0025924514047801495, + "learning_rate": 0.001, + "loss": 0.4364, + "step": 6680 + }, + { + "epoch": 0.18434349531105032, + "grad_norm": 0.003826566506177187, + "learning_rate": 0.001, + "loss": 0.3723, + "step": 6681 + }, + { + "epoch": 0.1843710875121147, + "grad_norm": 0.002758833346888423, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 6682 + }, + { + "epoch": 0.18439867971317908, + "grad_norm": 0.003747826674953103, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 6683 + }, + { + "epoch": 0.18442627191424343, + "grad_norm": 0.005826046224683523, + "learning_rate": 0.001, + "loss": 0.3837, + "step": 6684 + }, + { + "epoch": 0.18445386411530781, + "grad_norm": 0.002600571606308222, + "learning_rate": 0.001, + "loss": 0.4378, + "step": 6685 + }, + { + "epoch": 0.18448145631637217, + "grad_norm": 0.0024368218146264553, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 6686 + }, + { + "epoch": 0.18450904851743655, + "grad_norm": 0.0028827337082475424, + "learning_rate": 0.001, + "loss": 0.4253, + "step": 6687 + }, + { + "epoch": 0.18453664071850093, + "grad_norm": 0.0031948471441864967, + "learning_rate": 0.001, + "loss": 0.3473, + "step": 6688 + }, + { + "epoch": 0.18456423291956528, + "grad_norm": 0.0033101870212703943, + "learning_rate": 0.001, + "loss": 0.3909, + "step": 6689 + }, + { + "epoch": 0.18459182512062966, + "grad_norm": 0.00872339028865099, + "learning_rate": 0.001, + "loss": 0.3876, + "step": 6690 + }, + { + "epoch": 0.184619417321694, + "grad_norm": 0.0024567311629652977, + "learning_rate": 0.001, + "loss": 0.4228, + "step": 6691 + }, + { + "epoch": 0.1846470095227584, + "grad_norm": 0.0036271214485168457, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 6692 + }, + { + "epoch": 0.18467460172382277, + "grad_norm": 0.004172285553067923, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 6693 + }, + { + "epoch": 0.18470219392488713, + "grad_norm": 0.002208105055615306, + "learning_rate": 0.001, + "loss": 0.3876, + "step": 6694 + }, + { + "epoch": 0.1847297861259515, + "grad_norm": 0.00242190295830369, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 6695 + }, + { + "epoch": 0.18475737832701586, + "grad_norm": 0.002299925545230508, + "learning_rate": 0.001, + "loss": 0.3984, + "step": 6696 + }, + { + "epoch": 0.18478497052808024, + "grad_norm": 0.0044403825886547565, + "learning_rate": 0.001, + "loss": 0.3807, + "step": 6697 + }, + { + "epoch": 0.18481256272914462, + "grad_norm": 0.002557777799665928, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 6698 + }, + { + "epoch": 0.18484015493020897, + "grad_norm": 0.003170351032167673, + "learning_rate": 0.001, + "loss": 0.3567, + "step": 6699 + }, + { + "epoch": 0.18486774713127335, + "grad_norm": 0.00265632476657629, + "learning_rate": 0.001, + "loss": 0.4175, + "step": 6700 + }, + { + "epoch": 0.1848953393323377, + "grad_norm": 0.00452793575823307, + "learning_rate": 0.001, + "loss": 0.413, + "step": 6701 + }, + { + "epoch": 0.18492293153340209, + "grad_norm": 0.003011427354067564, + "learning_rate": 0.001, + "loss": 0.3934, + "step": 6702 + }, + { + "epoch": 0.18495052373446647, + "grad_norm": 0.0024044597521424294, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 6703 + }, + { + "epoch": 0.18497811593553082, + "grad_norm": 0.0035888811107724905, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 6704 + }, + { + "epoch": 0.1850057081365952, + "grad_norm": 0.002353749005123973, + "learning_rate": 0.001, + "loss": 0.4378, + "step": 6705 + }, + { + "epoch": 0.18503330033765955, + "grad_norm": 0.002476689638569951, + "learning_rate": 0.001, + "loss": 0.3886, + "step": 6706 + }, + { + "epoch": 0.18506089253872393, + "grad_norm": 0.002863020868971944, + "learning_rate": 0.001, + "loss": 0.3952, + "step": 6707 + }, + { + "epoch": 0.1850884847397883, + "grad_norm": 0.003525110660120845, + "learning_rate": 0.001, + "loss": 0.3827, + "step": 6708 + }, + { + "epoch": 0.18511607694085266, + "grad_norm": 0.0031487741507589817, + "learning_rate": 0.001, + "loss": 0.4206, + "step": 6709 + }, + { + "epoch": 0.18514366914191704, + "grad_norm": 0.0025411078240722418, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 6710 + }, + { + "epoch": 0.1851712613429814, + "grad_norm": 0.003697034902870655, + "learning_rate": 0.001, + "loss": 0.3566, + "step": 6711 + }, + { + "epoch": 0.18519885354404578, + "grad_norm": 0.0027804269921034575, + "learning_rate": 0.001, + "loss": 0.3887, + "step": 6712 + }, + { + "epoch": 0.18522644574511016, + "grad_norm": 0.0025478231254965067, + "learning_rate": 0.001, + "loss": 0.379, + "step": 6713 + }, + { + "epoch": 0.1852540379461745, + "grad_norm": 0.005976776126772165, + "learning_rate": 0.001, + "loss": 0.4248, + "step": 6714 + }, + { + "epoch": 0.1852816301472389, + "grad_norm": 0.004181606695055962, + "learning_rate": 0.001, + "loss": 0.4066, + "step": 6715 + }, + { + "epoch": 0.18530922234830324, + "grad_norm": 0.002260936889797449, + "learning_rate": 0.001, + "loss": 0.4213, + "step": 6716 + }, + { + "epoch": 0.18533681454936762, + "grad_norm": 0.004087687470018864, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 6717 + }, + { + "epoch": 0.185364406750432, + "grad_norm": 0.004004262387752533, + "learning_rate": 0.001, + "loss": 0.3629, + "step": 6718 + }, + { + "epoch": 0.18539199895149636, + "grad_norm": 0.0027805129066109657, + "learning_rate": 0.001, + "loss": 0.3721, + "step": 6719 + }, + { + "epoch": 0.18541959115256074, + "grad_norm": 0.0028537947218865156, + "learning_rate": 0.001, + "loss": 0.4114, + "step": 6720 + }, + { + "epoch": 0.1854471833536251, + "grad_norm": 0.002309284871444106, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 6721 + }, + { + "epoch": 0.18547477555468947, + "grad_norm": 0.002640241291373968, + "learning_rate": 0.001, + "loss": 0.371, + "step": 6722 + }, + { + "epoch": 0.18550236775575385, + "grad_norm": 0.0032795467413961887, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 6723 + }, + { + "epoch": 0.1855299599568182, + "grad_norm": 0.0028754386585205793, + "learning_rate": 0.001, + "loss": 0.4008, + "step": 6724 + }, + { + "epoch": 0.18555755215788258, + "grad_norm": 0.003849068423733115, + "learning_rate": 0.001, + "loss": 0.3489, + "step": 6725 + }, + { + "epoch": 0.18558514435894694, + "grad_norm": 0.0026304719503968954, + "learning_rate": 0.001, + "loss": 0.3889, + "step": 6726 + }, + { + "epoch": 0.18561273656001132, + "grad_norm": 0.004114130511879921, + "learning_rate": 0.001, + "loss": 0.3534, + "step": 6727 + }, + { + "epoch": 0.1856403287610757, + "grad_norm": 0.0047980137169361115, + "learning_rate": 0.001, + "loss": 0.4002, + "step": 6728 + }, + { + "epoch": 0.18566792096214005, + "grad_norm": 0.0033457186073064804, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 6729 + }, + { + "epoch": 0.18569551316320443, + "grad_norm": 0.0024908706545829773, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 6730 + }, + { + "epoch": 0.18572310536426878, + "grad_norm": 0.003977627027779818, + "learning_rate": 0.001, + "loss": 0.4041, + "step": 6731 + }, + { + "epoch": 0.18575069756533316, + "grad_norm": 0.002345160348340869, + "learning_rate": 0.001, + "loss": 0.4135, + "step": 6732 + }, + { + "epoch": 0.18577828976639751, + "grad_norm": 0.0025549698621034622, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 6733 + }, + { + "epoch": 0.1858058819674619, + "grad_norm": 0.002513099228963256, + "learning_rate": 0.001, + "loss": 0.398, + "step": 6734 + }, + { + "epoch": 0.18583347416852627, + "grad_norm": 0.00270891678519547, + "learning_rate": 0.001, + "loss": 0.4206, + "step": 6735 + }, + { + "epoch": 0.18586106636959063, + "grad_norm": 0.002633742755278945, + "learning_rate": 0.001, + "loss": 0.3934, + "step": 6736 + }, + { + "epoch": 0.185888658570655, + "grad_norm": 0.0025300132110714912, + "learning_rate": 0.001, + "loss": 0.4546, + "step": 6737 + }, + { + "epoch": 0.18591625077171936, + "grad_norm": 0.002625576453283429, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 6738 + }, + { + "epoch": 0.18594384297278374, + "grad_norm": 0.002719470066949725, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 6739 + }, + { + "epoch": 0.18597143517384812, + "grad_norm": 0.002173792105168104, + "learning_rate": 0.001, + "loss": 0.388, + "step": 6740 + }, + { + "epoch": 0.18599902737491247, + "grad_norm": 0.0022414301056414843, + "learning_rate": 0.001, + "loss": 0.4405, + "step": 6741 + }, + { + "epoch": 0.18602661957597685, + "grad_norm": 0.0032112672924995422, + "learning_rate": 0.001, + "loss": 0.3803, + "step": 6742 + }, + { + "epoch": 0.1860542117770412, + "grad_norm": 0.002975397277623415, + "learning_rate": 0.001, + "loss": 0.4099, + "step": 6743 + }, + { + "epoch": 0.1860818039781056, + "grad_norm": 0.0024268808774650097, + "learning_rate": 0.001, + "loss": 0.3698, + "step": 6744 + }, + { + "epoch": 0.18610939617916997, + "grad_norm": 0.0031276950612664223, + "learning_rate": 0.001, + "loss": 0.4216, + "step": 6745 + }, + { + "epoch": 0.18613698838023432, + "grad_norm": 0.0024137012660503387, + "learning_rate": 0.001, + "loss": 0.4036, + "step": 6746 + }, + { + "epoch": 0.1861645805812987, + "grad_norm": 0.002811797196045518, + "learning_rate": 0.001, + "loss": 0.4148, + "step": 6747 + }, + { + "epoch": 0.18619217278236305, + "grad_norm": 0.004455705638974905, + "learning_rate": 0.001, + "loss": 0.394, + "step": 6748 + }, + { + "epoch": 0.18621976498342743, + "grad_norm": 0.0033044186420738697, + "learning_rate": 0.001, + "loss": 0.4225, + "step": 6749 + }, + { + "epoch": 0.1862473571844918, + "grad_norm": 0.0033213391434401274, + "learning_rate": 0.001, + "loss": 0.38, + "step": 6750 + }, + { + "epoch": 0.18627494938555617, + "grad_norm": 0.002652913797646761, + "learning_rate": 0.001, + "loss": 0.3944, + "step": 6751 + }, + { + "epoch": 0.18630254158662055, + "grad_norm": 0.003949091769754887, + "learning_rate": 0.001, + "loss": 0.3979, + "step": 6752 + }, + { + "epoch": 0.1863301337876849, + "grad_norm": 0.002760807517915964, + "learning_rate": 0.001, + "loss": 0.4183, + "step": 6753 + }, + { + "epoch": 0.18635772598874928, + "grad_norm": 0.0035468130372464657, + "learning_rate": 0.001, + "loss": 0.3776, + "step": 6754 + }, + { + "epoch": 0.18638531818981366, + "grad_norm": 0.005636704154312611, + "learning_rate": 0.001, + "loss": 0.3868, + "step": 6755 + }, + { + "epoch": 0.186412910390878, + "grad_norm": 0.002743509830906987, + "learning_rate": 0.001, + "loss": 0.4324, + "step": 6756 + }, + { + "epoch": 0.1864405025919424, + "grad_norm": 0.0033133768010884523, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 6757 + }, + { + "epoch": 0.18646809479300674, + "grad_norm": 0.004021006636321545, + "learning_rate": 0.001, + "loss": 0.3698, + "step": 6758 + }, + { + "epoch": 0.18649568699407112, + "grad_norm": 0.0037046188954263926, + "learning_rate": 0.001, + "loss": 0.3946, + "step": 6759 + }, + { + "epoch": 0.1865232791951355, + "grad_norm": 0.0019486859673634171, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 6760 + }, + { + "epoch": 0.18655087139619986, + "grad_norm": 0.005325522273778915, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 6761 + }, + { + "epoch": 0.18657846359726424, + "grad_norm": 0.003112213918939233, + "learning_rate": 0.001, + "loss": 0.4059, + "step": 6762 + }, + { + "epoch": 0.1866060557983286, + "grad_norm": 0.003908544313162565, + "learning_rate": 0.001, + "loss": 0.4135, + "step": 6763 + }, + { + "epoch": 0.18663364799939297, + "grad_norm": 0.0025811195373535156, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 6764 + }, + { + "epoch": 0.18666124020045735, + "grad_norm": 0.002601012121886015, + "learning_rate": 0.001, + "loss": 0.3972, + "step": 6765 + }, + { + "epoch": 0.1866888324015217, + "grad_norm": 0.0023691104725003242, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 6766 + }, + { + "epoch": 0.18671642460258608, + "grad_norm": 0.0021417406387627125, + "learning_rate": 0.001, + "loss": 0.3635, + "step": 6767 + }, + { + "epoch": 0.18674401680365044, + "grad_norm": 0.0022483544889837503, + "learning_rate": 0.001, + "loss": 0.415, + "step": 6768 + }, + { + "epoch": 0.18677160900471482, + "grad_norm": 0.0032693545799702406, + "learning_rate": 0.001, + "loss": 0.4228, + "step": 6769 + }, + { + "epoch": 0.1867992012057792, + "grad_norm": 0.002980321878567338, + "learning_rate": 0.001, + "loss": 0.4426, + "step": 6770 + }, + { + "epoch": 0.18682679340684355, + "grad_norm": 0.0027730639558285475, + "learning_rate": 0.001, + "loss": 0.4281, + "step": 6771 + }, + { + "epoch": 0.18685438560790793, + "grad_norm": 0.002590013900771737, + "learning_rate": 0.001, + "loss": 0.3585, + "step": 6772 + }, + { + "epoch": 0.18688197780897228, + "grad_norm": 0.00434154225513339, + "learning_rate": 0.001, + "loss": 0.3845, + "step": 6773 + }, + { + "epoch": 0.18690957001003666, + "grad_norm": 0.002608590293675661, + "learning_rate": 0.001, + "loss": 0.3641, + "step": 6774 + }, + { + "epoch": 0.18693716221110104, + "grad_norm": 0.002211198676377535, + "learning_rate": 0.001, + "loss": 0.3956, + "step": 6775 + }, + { + "epoch": 0.1869647544121654, + "grad_norm": 0.002349904738366604, + "learning_rate": 0.001, + "loss": 0.3773, + "step": 6776 + }, + { + "epoch": 0.18699234661322978, + "grad_norm": 0.002399194985628128, + "learning_rate": 0.001, + "loss": 0.4271, + "step": 6777 + }, + { + "epoch": 0.18701993881429413, + "grad_norm": 0.0026427030097693205, + "learning_rate": 0.001, + "loss": 0.386, + "step": 6778 + }, + { + "epoch": 0.1870475310153585, + "grad_norm": 0.00331774540245533, + "learning_rate": 0.001, + "loss": 0.4142, + "step": 6779 + }, + { + "epoch": 0.1870751232164229, + "grad_norm": 0.003382153809070587, + "learning_rate": 0.001, + "loss": 0.4269, + "step": 6780 + }, + { + "epoch": 0.18710271541748724, + "grad_norm": 0.002654004842042923, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 6781 + }, + { + "epoch": 0.18713030761855162, + "grad_norm": 0.005028711166232824, + "learning_rate": 0.001, + "loss": 0.3986, + "step": 6782 + }, + { + "epoch": 0.18715789981961597, + "grad_norm": 0.002557139378041029, + "learning_rate": 0.001, + "loss": 0.384, + "step": 6783 + }, + { + "epoch": 0.18718549202068036, + "grad_norm": 0.003317068563774228, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 6784 + }, + { + "epoch": 0.18721308422174474, + "grad_norm": 0.007066572085022926, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 6785 + }, + { + "epoch": 0.1872406764228091, + "grad_norm": 0.002671597758308053, + "learning_rate": 0.001, + "loss": 0.402, + "step": 6786 + }, + { + "epoch": 0.18726826862387347, + "grad_norm": 0.0031658501829952, + "learning_rate": 0.001, + "loss": 0.4196, + "step": 6787 + }, + { + "epoch": 0.18729586082493782, + "grad_norm": 0.004446005914360285, + "learning_rate": 0.001, + "loss": 0.387, + "step": 6788 + }, + { + "epoch": 0.1873234530260022, + "grad_norm": 0.004098923876881599, + "learning_rate": 0.001, + "loss": 0.3955, + "step": 6789 + }, + { + "epoch": 0.18735104522706658, + "grad_norm": 0.010191929526627064, + "learning_rate": 0.001, + "loss": 0.4261, + "step": 6790 + }, + { + "epoch": 0.18737863742813093, + "grad_norm": 0.0034303830470889807, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 6791 + }, + { + "epoch": 0.18740622962919531, + "grad_norm": 0.0028527514077723026, + "learning_rate": 0.001, + "loss": 0.4453, + "step": 6792 + }, + { + "epoch": 0.18743382183025967, + "grad_norm": 0.003259077901020646, + "learning_rate": 0.001, + "loss": 0.3772, + "step": 6793 + }, + { + "epoch": 0.18746141403132405, + "grad_norm": 0.003400780726224184, + "learning_rate": 0.001, + "loss": 0.3775, + "step": 6794 + }, + { + "epoch": 0.18748900623238843, + "grad_norm": 0.0031762244179844856, + "learning_rate": 0.001, + "loss": 0.398, + "step": 6795 + }, + { + "epoch": 0.18751659843345278, + "grad_norm": 0.0025605822447687387, + "learning_rate": 0.001, + "loss": 0.4467, + "step": 6796 + }, + { + "epoch": 0.18754419063451716, + "grad_norm": 0.0029185418970882893, + "learning_rate": 0.001, + "loss": 0.4008, + "step": 6797 + }, + { + "epoch": 0.1875717828355815, + "grad_norm": 0.0023754434660077095, + "learning_rate": 0.001, + "loss": 0.4238, + "step": 6798 + }, + { + "epoch": 0.1875993750366459, + "grad_norm": 0.002524258103221655, + "learning_rate": 0.001, + "loss": 0.4156, + "step": 6799 + }, + { + "epoch": 0.18762696723771027, + "grad_norm": 0.0024060863070189953, + "learning_rate": 0.001, + "loss": 0.3629, + "step": 6800 + }, + { + "epoch": 0.18765455943877463, + "grad_norm": 0.0025536813773214817, + "learning_rate": 0.001, + "loss": 0.3831, + "step": 6801 + }, + { + "epoch": 0.187682151639839, + "grad_norm": 0.0032576583325862885, + "learning_rate": 0.001, + "loss": 0.4294, + "step": 6802 + }, + { + "epoch": 0.18770974384090336, + "grad_norm": 0.004546053241938353, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 6803 + }, + { + "epoch": 0.18773733604196774, + "grad_norm": 0.0041918703354895115, + "learning_rate": 0.001, + "loss": 0.3851, + "step": 6804 + }, + { + "epoch": 0.18776492824303212, + "grad_norm": 0.0028781883884221315, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 6805 + }, + { + "epoch": 0.18779252044409647, + "grad_norm": 0.0031794614624232054, + "learning_rate": 0.001, + "loss": 0.413, + "step": 6806 + }, + { + "epoch": 0.18782011264516085, + "grad_norm": 0.0023967409506440163, + "learning_rate": 0.001, + "loss": 0.3906, + "step": 6807 + }, + { + "epoch": 0.1878477048462252, + "grad_norm": 0.0029673967510461807, + "learning_rate": 0.001, + "loss": 0.3996, + "step": 6808 + }, + { + "epoch": 0.18787529704728959, + "grad_norm": 0.0028512347489595413, + "learning_rate": 0.001, + "loss": 0.4211, + "step": 6809 + }, + { + "epoch": 0.18790288924835397, + "grad_norm": 0.0031782910227775574, + "learning_rate": 0.001, + "loss": 0.4095, + "step": 6810 + }, + { + "epoch": 0.18793048144941832, + "grad_norm": 0.00338121154345572, + "learning_rate": 0.001, + "loss": 0.3941, + "step": 6811 + }, + { + "epoch": 0.1879580736504827, + "grad_norm": 0.002484740223735571, + "learning_rate": 0.001, + "loss": 0.4319, + "step": 6812 + }, + { + "epoch": 0.18798566585154705, + "grad_norm": 0.0032946360297501087, + "learning_rate": 0.001, + "loss": 0.3824, + "step": 6813 + }, + { + "epoch": 0.18801325805261143, + "grad_norm": 0.003030691295862198, + "learning_rate": 0.001, + "loss": 0.4301, + "step": 6814 + }, + { + "epoch": 0.1880408502536758, + "grad_norm": 0.004018679261207581, + "learning_rate": 0.001, + "loss": 0.404, + "step": 6815 + }, + { + "epoch": 0.18806844245474016, + "grad_norm": 0.004374852403998375, + "learning_rate": 0.001, + "loss": 0.3796, + "step": 6816 + }, + { + "epoch": 0.18809603465580454, + "grad_norm": 0.003533913753926754, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 6817 + }, + { + "epoch": 0.1881236268568689, + "grad_norm": 0.0031644238624721766, + "learning_rate": 0.001, + "loss": 0.4073, + "step": 6818 + }, + { + "epoch": 0.18815121905793328, + "grad_norm": 0.003066874807700515, + "learning_rate": 0.001, + "loss": 0.4544, + "step": 6819 + }, + { + "epoch": 0.18817881125899766, + "grad_norm": 0.0024850773625075817, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 6820 + }, + { + "epoch": 0.188206403460062, + "grad_norm": 0.002926696091890335, + "learning_rate": 0.001, + "loss": 0.3464, + "step": 6821 + }, + { + "epoch": 0.1882339956611264, + "grad_norm": 0.00351322372443974, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 6822 + }, + { + "epoch": 0.18826158786219074, + "grad_norm": 0.0029151032213121653, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 6823 + }, + { + "epoch": 0.18828918006325512, + "grad_norm": 0.004052076023072004, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 6824 + }, + { + "epoch": 0.18831677226431948, + "grad_norm": 0.00337521662004292, + "learning_rate": 0.001, + "loss": 0.4122, + "step": 6825 + }, + { + "epoch": 0.18834436446538386, + "grad_norm": 0.003744855523109436, + "learning_rate": 0.001, + "loss": 0.3723, + "step": 6826 + }, + { + "epoch": 0.18837195666644824, + "grad_norm": 0.0029044130351394415, + "learning_rate": 0.001, + "loss": 0.388, + "step": 6827 + }, + { + "epoch": 0.1883995488675126, + "grad_norm": 0.003070064587518573, + "learning_rate": 0.001, + "loss": 0.397, + "step": 6828 + }, + { + "epoch": 0.18842714106857697, + "grad_norm": 0.0030712231528013945, + "learning_rate": 0.001, + "loss": 0.4087, + "step": 6829 + }, + { + "epoch": 0.18845473326964132, + "grad_norm": 0.0035601078998297453, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 6830 + }, + { + "epoch": 0.1884823254707057, + "grad_norm": 0.0034437361173331738, + "learning_rate": 0.001, + "loss": 0.4109, + "step": 6831 + }, + { + "epoch": 0.18850991767177008, + "grad_norm": 0.003156115999445319, + "learning_rate": 0.001, + "loss": 0.4126, + "step": 6832 + }, + { + "epoch": 0.18853750987283444, + "grad_norm": 0.0028527495451271534, + "learning_rate": 0.001, + "loss": 0.425, + "step": 6833 + }, + { + "epoch": 0.18856510207389882, + "grad_norm": 0.0044526634737849236, + "learning_rate": 0.001, + "loss": 0.3881, + "step": 6834 + }, + { + "epoch": 0.18859269427496317, + "grad_norm": 0.006967521738260984, + "learning_rate": 0.001, + "loss": 0.3879, + "step": 6835 + }, + { + "epoch": 0.18862028647602755, + "grad_norm": 0.0034451198298484087, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 6836 + }, + { + "epoch": 0.18864787867709193, + "grad_norm": 0.004877714905887842, + "learning_rate": 0.001, + "loss": 0.4206, + "step": 6837 + }, + { + "epoch": 0.18867547087815628, + "grad_norm": 0.00429321825504303, + "learning_rate": 0.001, + "loss": 0.3752, + "step": 6838 + }, + { + "epoch": 0.18870306307922066, + "grad_norm": 0.0028502768836915493, + "learning_rate": 0.001, + "loss": 0.3603, + "step": 6839 + }, + { + "epoch": 0.18873065528028501, + "grad_norm": 0.0023765196092426777, + "learning_rate": 0.001, + "loss": 0.4287, + "step": 6840 + }, + { + "epoch": 0.1887582474813494, + "grad_norm": 0.0028856396675109863, + "learning_rate": 0.001, + "loss": 0.3819, + "step": 6841 + }, + { + "epoch": 0.18878583968241378, + "grad_norm": 0.004223294090479612, + "learning_rate": 0.001, + "loss": 0.4277, + "step": 6842 + }, + { + "epoch": 0.18881343188347813, + "grad_norm": 0.003582458943128586, + "learning_rate": 0.001, + "loss": 0.3699, + "step": 6843 + }, + { + "epoch": 0.1888410240845425, + "grad_norm": 0.004069041460752487, + "learning_rate": 0.001, + "loss": 0.4196, + "step": 6844 + }, + { + "epoch": 0.18886861628560686, + "grad_norm": 0.004946923349052668, + "learning_rate": 0.001, + "loss": 0.4151, + "step": 6845 + }, + { + "epoch": 0.18889620848667124, + "grad_norm": 0.002650062320753932, + "learning_rate": 0.001, + "loss": 0.3993, + "step": 6846 + }, + { + "epoch": 0.18892380068773562, + "grad_norm": 0.002152357017621398, + "learning_rate": 0.001, + "loss": 0.452, + "step": 6847 + }, + { + "epoch": 0.18895139288879997, + "grad_norm": 0.0030690866988152266, + "learning_rate": 0.001, + "loss": 0.3782, + "step": 6848 + }, + { + "epoch": 0.18897898508986435, + "grad_norm": 0.0022116086911410093, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 6849 + }, + { + "epoch": 0.1890065772909287, + "grad_norm": 0.0023306317161768675, + "learning_rate": 0.001, + "loss": 0.4093, + "step": 6850 + }, + { + "epoch": 0.1890341694919931, + "grad_norm": 0.002590792253613472, + "learning_rate": 0.001, + "loss": 0.3973, + "step": 6851 + }, + { + "epoch": 0.18906176169305747, + "grad_norm": 0.005286765284836292, + "learning_rate": 0.001, + "loss": 0.4247, + "step": 6852 + }, + { + "epoch": 0.18908935389412182, + "grad_norm": 0.006548671051859856, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 6853 + }, + { + "epoch": 0.1891169460951862, + "grad_norm": 0.002917979843914509, + "learning_rate": 0.001, + "loss": 0.4222, + "step": 6854 + }, + { + "epoch": 0.18914453829625055, + "grad_norm": 0.002655414631590247, + "learning_rate": 0.001, + "loss": 0.3637, + "step": 6855 + }, + { + "epoch": 0.18917213049731493, + "grad_norm": 0.003176578553393483, + "learning_rate": 0.001, + "loss": 0.3973, + "step": 6856 + }, + { + "epoch": 0.1891997226983793, + "grad_norm": 0.0026494793128222227, + "learning_rate": 0.001, + "loss": 0.4069, + "step": 6857 + }, + { + "epoch": 0.18922731489944367, + "grad_norm": 0.0027524055913090706, + "learning_rate": 0.001, + "loss": 0.4352, + "step": 6858 + }, + { + "epoch": 0.18925490710050805, + "grad_norm": 0.0025463078636676073, + "learning_rate": 0.001, + "loss": 0.4272, + "step": 6859 + }, + { + "epoch": 0.1892824993015724, + "grad_norm": 0.011269212700426579, + "learning_rate": 0.001, + "loss": 0.4013, + "step": 6860 + }, + { + "epoch": 0.18931009150263678, + "grad_norm": 0.003725286340340972, + "learning_rate": 0.001, + "loss": 0.4062, + "step": 6861 + }, + { + "epoch": 0.18933768370370116, + "grad_norm": 0.0022669502068310976, + "learning_rate": 0.001, + "loss": 0.4321, + "step": 6862 + }, + { + "epoch": 0.1893652759047655, + "grad_norm": 0.0022833060938864946, + "learning_rate": 0.001, + "loss": 0.391, + "step": 6863 + }, + { + "epoch": 0.1893928681058299, + "grad_norm": 0.0027603027410805225, + "learning_rate": 0.001, + "loss": 0.4338, + "step": 6864 + }, + { + "epoch": 0.18942046030689424, + "grad_norm": 0.00340470764786005, + "learning_rate": 0.001, + "loss": 0.3815, + "step": 6865 + }, + { + "epoch": 0.18944805250795863, + "grad_norm": 0.003243909915909171, + "learning_rate": 0.001, + "loss": 0.3784, + "step": 6866 + }, + { + "epoch": 0.189475644709023, + "grad_norm": 0.003641802351921797, + "learning_rate": 0.001, + "loss": 0.3896, + "step": 6867 + }, + { + "epoch": 0.18950323691008736, + "grad_norm": 0.00207584910094738, + "learning_rate": 0.001, + "loss": 0.4182, + "step": 6868 + }, + { + "epoch": 0.18953082911115174, + "grad_norm": 0.0021775367204099894, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 6869 + }, + { + "epoch": 0.1895584213122161, + "grad_norm": 0.0023639260325580835, + "learning_rate": 0.001, + "loss": 0.3917, + "step": 6870 + }, + { + "epoch": 0.18958601351328047, + "grad_norm": 0.0024233164731413126, + "learning_rate": 0.001, + "loss": 0.4213, + "step": 6871 + }, + { + "epoch": 0.18961360571434485, + "grad_norm": 0.0024376865476369858, + "learning_rate": 0.001, + "loss": 0.4345, + "step": 6872 + }, + { + "epoch": 0.1896411979154092, + "grad_norm": 0.004064356442540884, + "learning_rate": 0.001, + "loss": 0.4221, + "step": 6873 + }, + { + "epoch": 0.18966879011647358, + "grad_norm": 0.0038676555268466473, + "learning_rate": 0.001, + "loss": 0.3978, + "step": 6874 + }, + { + "epoch": 0.18969638231753794, + "grad_norm": 0.002870697295293212, + "learning_rate": 0.001, + "loss": 0.3892, + "step": 6875 + }, + { + "epoch": 0.18972397451860232, + "grad_norm": 0.005860270466655493, + "learning_rate": 0.001, + "loss": 0.3993, + "step": 6876 + }, + { + "epoch": 0.1897515667196667, + "grad_norm": 0.0024599696043878794, + "learning_rate": 0.001, + "loss": 0.4116, + "step": 6877 + }, + { + "epoch": 0.18977915892073105, + "grad_norm": 0.002083956263959408, + "learning_rate": 0.001, + "loss": 0.4314, + "step": 6878 + }, + { + "epoch": 0.18980675112179543, + "grad_norm": 0.0021180741023272276, + "learning_rate": 0.001, + "loss": 0.3981, + "step": 6879 + }, + { + "epoch": 0.18983434332285978, + "grad_norm": 0.0028299293480813503, + "learning_rate": 0.001, + "loss": 0.3764, + "step": 6880 + }, + { + "epoch": 0.18986193552392416, + "grad_norm": 0.009525451809167862, + "learning_rate": 0.001, + "loss": 0.362, + "step": 6881 + }, + { + "epoch": 0.18988952772498854, + "grad_norm": 0.0036288495175540447, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 6882 + }, + { + "epoch": 0.1899171199260529, + "grad_norm": 0.0025045403745025396, + "learning_rate": 0.001, + "loss": 0.4469, + "step": 6883 + }, + { + "epoch": 0.18994471212711728, + "grad_norm": 0.006380067672580481, + "learning_rate": 0.001, + "loss": 0.4178, + "step": 6884 + }, + { + "epoch": 0.18997230432818163, + "grad_norm": 0.0035281891468912363, + "learning_rate": 0.001, + "loss": 0.3987, + "step": 6885 + }, + { + "epoch": 0.189999896529246, + "grad_norm": 0.012035722844302654, + "learning_rate": 0.001, + "loss": 0.4396, + "step": 6886 + }, + { + "epoch": 0.1900274887303104, + "grad_norm": 0.004180245101451874, + "learning_rate": 0.001, + "loss": 0.3978, + "step": 6887 + }, + { + "epoch": 0.19005508093137474, + "grad_norm": 0.002991945017129183, + "learning_rate": 0.001, + "loss": 0.3904, + "step": 6888 + }, + { + "epoch": 0.19008267313243912, + "grad_norm": 0.002699353964999318, + "learning_rate": 0.001, + "loss": 0.4174, + "step": 6889 + }, + { + "epoch": 0.19011026533350348, + "grad_norm": 0.009517242200672626, + "learning_rate": 0.001, + "loss": 0.3863, + "step": 6890 + }, + { + "epoch": 0.19013785753456786, + "grad_norm": 0.002533604623749852, + "learning_rate": 0.001, + "loss": 0.419, + "step": 6891 + }, + { + "epoch": 0.19016544973563224, + "grad_norm": 0.0020676185376942158, + "learning_rate": 0.001, + "loss": 0.4244, + "step": 6892 + }, + { + "epoch": 0.1901930419366966, + "grad_norm": 0.0027616149745881557, + "learning_rate": 0.001, + "loss": 0.4224, + "step": 6893 + }, + { + "epoch": 0.19022063413776097, + "grad_norm": 0.0025242832489311695, + "learning_rate": 0.001, + "loss": 0.3775, + "step": 6894 + }, + { + "epoch": 0.19024822633882532, + "grad_norm": 0.005067579913884401, + "learning_rate": 0.001, + "loss": 0.3848, + "step": 6895 + }, + { + "epoch": 0.1902758185398897, + "grad_norm": 0.005015381146222353, + "learning_rate": 0.001, + "loss": 0.4029, + "step": 6896 + }, + { + "epoch": 0.19030341074095408, + "grad_norm": 0.010791119188070297, + "learning_rate": 0.001, + "loss": 0.4127, + "step": 6897 + }, + { + "epoch": 0.19033100294201843, + "grad_norm": 0.00212705135345459, + "learning_rate": 0.001, + "loss": 0.4522, + "step": 6898 + }, + { + "epoch": 0.19035859514308281, + "grad_norm": 0.0018394197104498744, + "learning_rate": 0.001, + "loss": 0.4393, + "step": 6899 + }, + { + "epoch": 0.19038618734414717, + "grad_norm": 0.002272665733471513, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 6900 + }, + { + "epoch": 0.19041377954521155, + "grad_norm": 0.002068854635581374, + "learning_rate": 0.001, + "loss": 0.4175, + "step": 6901 + }, + { + "epoch": 0.19044137174627593, + "grad_norm": 0.003429130418226123, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 6902 + }, + { + "epoch": 0.19046896394734028, + "grad_norm": 0.006805672310292721, + "learning_rate": 0.001, + "loss": 0.4157, + "step": 6903 + }, + { + "epoch": 0.19049655614840466, + "grad_norm": 0.002861461602151394, + "learning_rate": 0.001, + "loss": 0.3941, + "step": 6904 + }, + { + "epoch": 0.190524148349469, + "grad_norm": 0.002513586077839136, + "learning_rate": 0.001, + "loss": 0.4403, + "step": 6905 + }, + { + "epoch": 0.1905517405505334, + "grad_norm": 0.003469777060672641, + "learning_rate": 0.001, + "loss": 0.3866, + "step": 6906 + }, + { + "epoch": 0.19057933275159777, + "grad_norm": 0.003417365485802293, + "learning_rate": 0.001, + "loss": 0.3806, + "step": 6907 + }, + { + "epoch": 0.19060692495266213, + "grad_norm": 0.0020143757574260235, + "learning_rate": 0.001, + "loss": 0.4343, + "step": 6908 + }, + { + "epoch": 0.1906345171537265, + "grad_norm": 0.002093646442517638, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 6909 + }, + { + "epoch": 0.19066210935479086, + "grad_norm": 0.001849993597716093, + "learning_rate": 0.001, + "loss": 0.4144, + "step": 6910 + }, + { + "epoch": 0.19068970155585524, + "grad_norm": 0.002308469032868743, + "learning_rate": 0.001, + "loss": 0.4141, + "step": 6911 + }, + { + "epoch": 0.19071729375691962, + "grad_norm": 0.00342297344468534, + "learning_rate": 0.001, + "loss": 0.3559, + "step": 6912 + }, + { + "epoch": 0.19074488595798397, + "grad_norm": 0.002367036882787943, + "learning_rate": 0.001, + "loss": 0.3956, + "step": 6913 + }, + { + "epoch": 0.19077247815904835, + "grad_norm": 0.003705595852807164, + "learning_rate": 0.001, + "loss": 0.3972, + "step": 6914 + }, + { + "epoch": 0.1908000703601127, + "grad_norm": 0.0037233191542327404, + "learning_rate": 0.001, + "loss": 0.4135, + "step": 6915 + }, + { + "epoch": 0.19082766256117709, + "grad_norm": 0.0024186784867197275, + "learning_rate": 0.001, + "loss": 0.4397, + "step": 6916 + }, + { + "epoch": 0.19085525476224144, + "grad_norm": 0.0020412628073245287, + "learning_rate": 0.001, + "loss": 0.4353, + "step": 6917 + }, + { + "epoch": 0.19088284696330582, + "grad_norm": 0.002167558530345559, + "learning_rate": 0.001, + "loss": 0.4022, + "step": 6918 + }, + { + "epoch": 0.1909104391643702, + "grad_norm": 0.002329624257981777, + "learning_rate": 0.001, + "loss": 0.4321, + "step": 6919 + }, + { + "epoch": 0.19093803136543455, + "grad_norm": 0.0029148284811526537, + "learning_rate": 0.001, + "loss": 0.395, + "step": 6920 + }, + { + "epoch": 0.19096562356649893, + "grad_norm": 0.002269695047289133, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 6921 + }, + { + "epoch": 0.19099321576756328, + "grad_norm": 0.002682054415345192, + "learning_rate": 0.001, + "loss": 0.3913, + "step": 6922 + }, + { + "epoch": 0.19102080796862766, + "grad_norm": 0.0024830945767462254, + "learning_rate": 0.001, + "loss": 0.4511, + "step": 6923 + }, + { + "epoch": 0.19104840016969205, + "grad_norm": 0.003123142756521702, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 6924 + }, + { + "epoch": 0.1910759923707564, + "grad_norm": 0.004371596500277519, + "learning_rate": 0.001, + "loss": 0.3956, + "step": 6925 + }, + { + "epoch": 0.19110358457182078, + "grad_norm": 0.0042950925417244434, + "learning_rate": 0.001, + "loss": 0.3804, + "step": 6926 + }, + { + "epoch": 0.19113117677288513, + "grad_norm": 0.0022360682487487793, + "learning_rate": 0.001, + "loss": 0.4616, + "step": 6927 + }, + { + "epoch": 0.1911587689739495, + "grad_norm": 0.0026784553192555904, + "learning_rate": 0.001, + "loss": 0.3839, + "step": 6928 + }, + { + "epoch": 0.1911863611750139, + "grad_norm": 0.003034649882465601, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 6929 + }, + { + "epoch": 0.19121395337607824, + "grad_norm": 0.006058567203581333, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 6930 + }, + { + "epoch": 0.19124154557714262, + "grad_norm": 0.003582082223147154, + "learning_rate": 0.001, + "loss": 0.3453, + "step": 6931 + }, + { + "epoch": 0.19126913777820698, + "grad_norm": 0.002336689503863454, + "learning_rate": 0.001, + "loss": 0.4432, + "step": 6932 + }, + { + "epoch": 0.19129672997927136, + "grad_norm": 0.0033224106300622225, + "learning_rate": 0.001, + "loss": 0.4139, + "step": 6933 + }, + { + "epoch": 0.19132432218033574, + "grad_norm": 0.005102561321109533, + "learning_rate": 0.001, + "loss": 0.4244, + "step": 6934 + }, + { + "epoch": 0.1913519143814001, + "grad_norm": 0.0050558787770569324, + "learning_rate": 0.001, + "loss": 0.4118, + "step": 6935 + }, + { + "epoch": 0.19137950658246447, + "grad_norm": 0.007568387780338526, + "learning_rate": 0.001, + "loss": 0.3964, + "step": 6936 + }, + { + "epoch": 0.19140709878352882, + "grad_norm": 0.003738366300240159, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 6937 + }, + { + "epoch": 0.1914346909845932, + "grad_norm": 0.0034411989618092775, + "learning_rate": 0.001, + "loss": 0.4202, + "step": 6938 + }, + { + "epoch": 0.19146228318565758, + "grad_norm": 0.0027622180059552193, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 6939 + }, + { + "epoch": 0.19148987538672194, + "grad_norm": 0.0026892200112342834, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 6940 + }, + { + "epoch": 0.19151746758778632, + "grad_norm": 0.002897805068641901, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 6941 + }, + { + "epoch": 0.19154505978885067, + "grad_norm": 0.0027761433739215136, + "learning_rate": 0.001, + "loss": 0.3846, + "step": 6942 + }, + { + "epoch": 0.19157265198991505, + "grad_norm": 0.0034833746030926704, + "learning_rate": 0.001, + "loss": 0.3867, + "step": 6943 + }, + { + "epoch": 0.19160024419097943, + "grad_norm": 0.004619366955012083, + "learning_rate": 0.001, + "loss": 0.4471, + "step": 6944 + }, + { + "epoch": 0.19162783639204378, + "grad_norm": 0.003163927001878619, + "learning_rate": 0.001, + "loss": 0.4286, + "step": 6945 + }, + { + "epoch": 0.19165542859310816, + "grad_norm": 0.005678548477590084, + "learning_rate": 0.001, + "loss": 0.3976, + "step": 6946 + }, + { + "epoch": 0.19168302079417252, + "grad_norm": 0.0026377500034868717, + "learning_rate": 0.001, + "loss": 0.4158, + "step": 6947 + }, + { + "epoch": 0.1917106129952369, + "grad_norm": 0.006021553184837103, + "learning_rate": 0.001, + "loss": 0.3781, + "step": 6948 + }, + { + "epoch": 0.19173820519630128, + "grad_norm": 0.0024780158419162035, + "learning_rate": 0.001, + "loss": 0.4146, + "step": 6949 + }, + { + "epoch": 0.19176579739736563, + "grad_norm": 0.002402723301202059, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 6950 + }, + { + "epoch": 0.19179338959843, + "grad_norm": 0.003152204444631934, + "learning_rate": 0.001, + "loss": 0.351, + "step": 6951 + }, + { + "epoch": 0.19182098179949436, + "grad_norm": 0.003795337863266468, + "learning_rate": 0.001, + "loss": 0.3941, + "step": 6952 + }, + { + "epoch": 0.19184857400055874, + "grad_norm": 0.002820979803800583, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 6953 + }, + { + "epoch": 0.19187616620162312, + "grad_norm": 0.003103316528722644, + "learning_rate": 0.001, + "loss": 0.4087, + "step": 6954 + }, + { + "epoch": 0.19190375840268747, + "grad_norm": 0.002751655410975218, + "learning_rate": 0.001, + "loss": 0.4173, + "step": 6955 + }, + { + "epoch": 0.19193135060375185, + "grad_norm": 0.002747628604993224, + "learning_rate": 0.001, + "loss": 0.3952, + "step": 6956 + }, + { + "epoch": 0.1919589428048162, + "grad_norm": 0.0033904961310327053, + "learning_rate": 0.001, + "loss": 0.382, + "step": 6957 + }, + { + "epoch": 0.1919865350058806, + "grad_norm": 0.002641551662236452, + "learning_rate": 0.001, + "loss": 0.3573, + "step": 6958 + }, + { + "epoch": 0.19201412720694497, + "grad_norm": 0.0025419299490749836, + "learning_rate": 0.001, + "loss": 0.3792, + "step": 6959 + }, + { + "epoch": 0.19204171940800932, + "grad_norm": 0.0028294960502535105, + "learning_rate": 0.001, + "loss": 0.3585, + "step": 6960 + }, + { + "epoch": 0.1920693116090737, + "grad_norm": 0.0031680443789809942, + "learning_rate": 0.001, + "loss": 0.4097, + "step": 6961 + }, + { + "epoch": 0.19209690381013805, + "grad_norm": 0.0024578890297561884, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 6962 + }, + { + "epoch": 0.19212449601120243, + "grad_norm": 0.002299124840646982, + "learning_rate": 0.001, + "loss": 0.4144, + "step": 6963 + }, + { + "epoch": 0.19215208821226681, + "grad_norm": 0.0031422816682606936, + "learning_rate": 0.001, + "loss": 0.4339, + "step": 6964 + }, + { + "epoch": 0.19217968041333117, + "grad_norm": 0.004103971645236015, + "learning_rate": 0.001, + "loss": 0.4197, + "step": 6965 + }, + { + "epoch": 0.19220727261439555, + "grad_norm": 0.002797805005684495, + "learning_rate": 0.001, + "loss": 0.415, + "step": 6966 + }, + { + "epoch": 0.1922348648154599, + "grad_norm": 0.006955363787710667, + "learning_rate": 0.001, + "loss": 0.384, + "step": 6967 + }, + { + "epoch": 0.19226245701652428, + "grad_norm": 0.0027699500788003206, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 6968 + }, + { + "epoch": 0.19229004921758866, + "grad_norm": 0.006645936518907547, + "learning_rate": 0.001, + "loss": 0.3798, + "step": 6969 + }, + { + "epoch": 0.192317641418653, + "grad_norm": 0.003216095734387636, + "learning_rate": 0.001, + "loss": 0.4473, + "step": 6970 + }, + { + "epoch": 0.1923452336197174, + "grad_norm": 0.002484220312908292, + "learning_rate": 0.001, + "loss": 0.4134, + "step": 6971 + }, + { + "epoch": 0.19237282582078175, + "grad_norm": 0.0034046086948364973, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 6972 + }, + { + "epoch": 0.19240041802184613, + "grad_norm": 0.0025209994055330753, + "learning_rate": 0.001, + "loss": 0.4586, + "step": 6973 + }, + { + "epoch": 0.1924280102229105, + "grad_norm": 0.004154059570282698, + "learning_rate": 0.001, + "loss": 0.3724, + "step": 6974 + }, + { + "epoch": 0.19245560242397486, + "grad_norm": 0.003831770271062851, + "learning_rate": 0.001, + "loss": 0.4142, + "step": 6975 + }, + { + "epoch": 0.19248319462503924, + "grad_norm": 0.005726317409425974, + "learning_rate": 0.001, + "loss": 0.4296, + "step": 6976 + }, + { + "epoch": 0.1925107868261036, + "grad_norm": 0.0032162668649107218, + "learning_rate": 0.001, + "loss": 0.3735, + "step": 6977 + }, + { + "epoch": 0.19253837902716797, + "grad_norm": 0.0027887041214853525, + "learning_rate": 0.001, + "loss": 0.4167, + "step": 6978 + }, + { + "epoch": 0.19256597122823235, + "grad_norm": 0.0031014943961054087, + "learning_rate": 0.001, + "loss": 0.3785, + "step": 6979 + }, + { + "epoch": 0.1925935634292967, + "grad_norm": 0.0026730517856776714, + "learning_rate": 0.001, + "loss": 0.4275, + "step": 6980 + }, + { + "epoch": 0.19262115563036109, + "grad_norm": 0.002030472969636321, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 6981 + }, + { + "epoch": 0.19264874783142544, + "grad_norm": 0.0030617276206612587, + "learning_rate": 0.001, + "loss": 0.4235, + "step": 6982 + }, + { + "epoch": 0.19267634003248982, + "grad_norm": 0.0025087720714509487, + "learning_rate": 0.001, + "loss": 0.4084, + "step": 6983 + }, + { + "epoch": 0.1927039322335542, + "grad_norm": 0.0020648448262363672, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 6984 + }, + { + "epoch": 0.19273152443461855, + "grad_norm": 0.002378196455538273, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 6985 + }, + { + "epoch": 0.19275911663568293, + "grad_norm": 0.0027894238010048866, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 6986 + }, + { + "epoch": 0.19278670883674728, + "grad_norm": 0.002425672486424446, + "learning_rate": 0.001, + "loss": 0.3512, + "step": 6987 + }, + { + "epoch": 0.19281430103781166, + "grad_norm": 0.0021580031607300043, + "learning_rate": 0.001, + "loss": 0.4396, + "step": 6988 + }, + { + "epoch": 0.19284189323887604, + "grad_norm": 0.0041187843307852745, + "learning_rate": 0.001, + "loss": 0.3748, + "step": 6989 + }, + { + "epoch": 0.1928694854399404, + "grad_norm": 0.0023956988006830215, + "learning_rate": 0.001, + "loss": 0.4488, + "step": 6990 + }, + { + "epoch": 0.19289707764100478, + "grad_norm": 0.0022000286262482405, + "learning_rate": 0.001, + "loss": 0.3773, + "step": 6991 + }, + { + "epoch": 0.19292466984206913, + "grad_norm": 0.0037109351251274347, + "learning_rate": 0.001, + "loss": 0.3982, + "step": 6992 + }, + { + "epoch": 0.1929522620431335, + "grad_norm": 0.0028746852185577154, + "learning_rate": 0.001, + "loss": 0.3912, + "step": 6993 + }, + { + "epoch": 0.1929798542441979, + "grad_norm": 0.0026120489928871393, + "learning_rate": 0.001, + "loss": 0.3846, + "step": 6994 + }, + { + "epoch": 0.19300744644526224, + "grad_norm": 0.0025121879298239946, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 6995 + }, + { + "epoch": 0.19303503864632662, + "grad_norm": 0.0026690547820180655, + "learning_rate": 0.001, + "loss": 0.4165, + "step": 6996 + }, + { + "epoch": 0.19306263084739098, + "grad_norm": 0.0021126086357980967, + "learning_rate": 0.001, + "loss": 0.3714, + "step": 6997 + }, + { + "epoch": 0.19309022304845536, + "grad_norm": 0.007225458975881338, + "learning_rate": 0.001, + "loss": 0.4177, + "step": 6998 + }, + { + "epoch": 0.19311781524951974, + "grad_norm": 0.002150959335267544, + "learning_rate": 0.001, + "loss": 0.4287, + "step": 6999 + }, + { + "epoch": 0.1931454074505841, + "grad_norm": 0.002550938166677952, + "learning_rate": 0.001, + "loss": 0.4182, + "step": 7000 + }, + { + "epoch": 0.1931454074505841, + "eval_runtime": 24.8881, + "eval_samples_per_second": 1.286, + "eval_steps_per_second": 0.161, + "step": 7000 + }, + { + "epoch": 0.19317299965164847, + "grad_norm": 0.00321765523403883, + "learning_rate": 0.001, + "loss": 0.4241, + "step": 7001 + }, + { + "epoch": 0.19320059185271282, + "grad_norm": 0.0028067470993846655, + "learning_rate": 0.001, + "loss": 0.4288, + "step": 7002 + }, + { + "epoch": 0.1932281840537772, + "grad_norm": 0.002428458072245121, + "learning_rate": 0.001, + "loss": 0.4036, + "step": 7003 + }, + { + "epoch": 0.19325577625484158, + "grad_norm": 0.003469350514933467, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 7004 + }, + { + "epoch": 0.19328336845590594, + "grad_norm": 0.0026082920376211405, + "learning_rate": 0.001, + "loss": 0.4113, + "step": 7005 + }, + { + "epoch": 0.19331096065697032, + "grad_norm": 0.002355298027396202, + "learning_rate": 0.001, + "loss": 0.4367, + "step": 7006 + }, + { + "epoch": 0.19333855285803467, + "grad_norm": 0.002677205018699169, + "learning_rate": 0.001, + "loss": 0.4009, + "step": 7007 + }, + { + "epoch": 0.19336614505909905, + "grad_norm": 0.002955394797027111, + "learning_rate": 0.001, + "loss": 0.4261, + "step": 7008 + }, + { + "epoch": 0.19339373726016343, + "grad_norm": 0.0033979052677750587, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 7009 + }, + { + "epoch": 0.19342132946122778, + "grad_norm": 0.004769494291394949, + "learning_rate": 0.001, + "loss": 0.4022, + "step": 7010 + }, + { + "epoch": 0.19344892166229216, + "grad_norm": 0.003045122604817152, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 7011 + }, + { + "epoch": 0.19347651386335651, + "grad_norm": 0.00237339548766613, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 7012 + }, + { + "epoch": 0.1935041060644209, + "grad_norm": 0.002918932121247053, + "learning_rate": 0.001, + "loss": 0.4166, + "step": 7013 + }, + { + "epoch": 0.19353169826548525, + "grad_norm": 0.003531453665345907, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 7014 + }, + { + "epoch": 0.19355929046654963, + "grad_norm": 0.002957909367978573, + "learning_rate": 0.001, + "loss": 0.4163, + "step": 7015 + }, + { + "epoch": 0.193586882667614, + "grad_norm": 0.002795727690681815, + "learning_rate": 0.001, + "loss": 0.423, + "step": 7016 + }, + { + "epoch": 0.19361447486867836, + "grad_norm": 0.002575729275122285, + "learning_rate": 0.001, + "loss": 0.3677, + "step": 7017 + }, + { + "epoch": 0.19364206706974274, + "grad_norm": 0.011985518038272858, + "learning_rate": 0.001, + "loss": 0.4104, + "step": 7018 + }, + { + "epoch": 0.1936696592708071, + "grad_norm": 0.005119143985211849, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 7019 + }, + { + "epoch": 0.19369725147187147, + "grad_norm": 0.002983895130455494, + "learning_rate": 0.001, + "loss": 0.382, + "step": 7020 + }, + { + "epoch": 0.19372484367293585, + "grad_norm": 0.0021343736443668604, + "learning_rate": 0.001, + "loss": 0.4559, + "step": 7021 + }, + { + "epoch": 0.1937524358740002, + "grad_norm": 0.0032467253040522337, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 7022 + }, + { + "epoch": 0.1937800280750646, + "grad_norm": 0.0023335155565291643, + "learning_rate": 0.001, + "loss": 0.4108, + "step": 7023 + }, + { + "epoch": 0.19380762027612894, + "grad_norm": 0.002503189956769347, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 7024 + }, + { + "epoch": 0.19383521247719332, + "grad_norm": 0.002620559884235263, + "learning_rate": 0.001, + "loss": 0.3917, + "step": 7025 + }, + { + "epoch": 0.1938628046782577, + "grad_norm": 0.002399984747171402, + "learning_rate": 0.001, + "loss": 0.3945, + "step": 7026 + }, + { + "epoch": 0.19389039687932205, + "grad_norm": 0.0024000145494937897, + "learning_rate": 0.001, + "loss": 0.4327, + "step": 7027 + }, + { + "epoch": 0.19391798908038643, + "grad_norm": 0.0024858317337930202, + "learning_rate": 0.001, + "loss": 0.3851, + "step": 7028 + }, + { + "epoch": 0.19394558128145079, + "grad_norm": 0.0028986751567572355, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 7029 + }, + { + "epoch": 0.19397317348251517, + "grad_norm": 0.0020534794311970472, + "learning_rate": 0.001, + "loss": 0.4198, + "step": 7030 + }, + { + "epoch": 0.19400076568357955, + "grad_norm": 0.0021818815730512142, + "learning_rate": 0.001, + "loss": 0.4122, + "step": 7031 + }, + { + "epoch": 0.1940283578846439, + "grad_norm": 0.003896314650774002, + "learning_rate": 0.001, + "loss": 0.3739, + "step": 7032 + }, + { + "epoch": 0.19405595008570828, + "grad_norm": 0.003106046002358198, + "learning_rate": 0.001, + "loss": 0.4117, + "step": 7033 + }, + { + "epoch": 0.19408354228677263, + "grad_norm": 0.003942525014281273, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 7034 + }, + { + "epoch": 0.194111134487837, + "grad_norm": 0.006781655363738537, + "learning_rate": 0.001, + "loss": 0.3489, + "step": 7035 + }, + { + "epoch": 0.1941387266889014, + "grad_norm": 0.0025264392606914043, + "learning_rate": 0.001, + "loss": 0.403, + "step": 7036 + }, + { + "epoch": 0.19416631888996574, + "grad_norm": 0.0023680292069911957, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 7037 + }, + { + "epoch": 0.19419391109103012, + "grad_norm": 0.0024297498166561127, + "learning_rate": 0.001, + "loss": 0.3862, + "step": 7038 + }, + { + "epoch": 0.19422150329209448, + "grad_norm": 0.0028899710159748793, + "learning_rate": 0.001, + "loss": 0.38, + "step": 7039 + }, + { + "epoch": 0.19424909549315886, + "grad_norm": 0.00262429122813046, + "learning_rate": 0.001, + "loss": 0.4411, + "step": 7040 + }, + { + "epoch": 0.19427668769422324, + "grad_norm": 0.0034140751231461763, + "learning_rate": 0.001, + "loss": 0.4207, + "step": 7041 + }, + { + "epoch": 0.1943042798952876, + "grad_norm": 0.003651238512247801, + "learning_rate": 0.001, + "loss": 0.3855, + "step": 7042 + }, + { + "epoch": 0.19433187209635197, + "grad_norm": 0.0027614810969680548, + "learning_rate": 0.001, + "loss": 0.3686, + "step": 7043 + }, + { + "epoch": 0.19435946429741632, + "grad_norm": 0.0027164684142917395, + "learning_rate": 0.001, + "loss": 0.4135, + "step": 7044 + }, + { + "epoch": 0.1943870564984807, + "grad_norm": 0.0031832915265113115, + "learning_rate": 0.001, + "loss": 0.402, + "step": 7045 + }, + { + "epoch": 0.19441464869954508, + "grad_norm": 0.0033719497732818127, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 7046 + }, + { + "epoch": 0.19444224090060944, + "grad_norm": 0.002569133648648858, + "learning_rate": 0.001, + "loss": 0.399, + "step": 7047 + }, + { + "epoch": 0.19446983310167382, + "grad_norm": 0.0024267893750220537, + "learning_rate": 0.001, + "loss": 0.3761, + "step": 7048 + }, + { + "epoch": 0.19449742530273817, + "grad_norm": 0.002893506083637476, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 7049 + }, + { + "epoch": 0.19452501750380255, + "grad_norm": 0.002697143005207181, + "learning_rate": 0.001, + "loss": 0.3907, + "step": 7050 + }, + { + "epoch": 0.19455260970486693, + "grad_norm": 0.0032814224250614643, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 7051 + }, + { + "epoch": 0.19458020190593128, + "grad_norm": 0.002781797433272004, + "learning_rate": 0.001, + "loss": 0.3976, + "step": 7052 + }, + { + "epoch": 0.19460779410699566, + "grad_norm": 0.003478818805888295, + "learning_rate": 0.001, + "loss": 0.3787, + "step": 7053 + }, + { + "epoch": 0.19463538630806002, + "grad_norm": 0.0027507366612553596, + "learning_rate": 0.001, + "loss": 0.3745, + "step": 7054 + }, + { + "epoch": 0.1946629785091244, + "grad_norm": 0.0034008813090622425, + "learning_rate": 0.001, + "loss": 0.3999, + "step": 7055 + }, + { + "epoch": 0.19469057071018878, + "grad_norm": 0.0030989109072834253, + "learning_rate": 0.001, + "loss": 0.393, + "step": 7056 + }, + { + "epoch": 0.19471816291125313, + "grad_norm": 0.003060347633436322, + "learning_rate": 0.001, + "loss": 0.3682, + "step": 7057 + }, + { + "epoch": 0.1947457551123175, + "grad_norm": 0.0040310220792889595, + "learning_rate": 0.001, + "loss": 0.3729, + "step": 7058 + }, + { + "epoch": 0.19477334731338186, + "grad_norm": 0.0034857376012951136, + "learning_rate": 0.001, + "loss": 0.3781, + "step": 7059 + }, + { + "epoch": 0.19480093951444624, + "grad_norm": 0.0030391844920814037, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 7060 + }, + { + "epoch": 0.19482853171551062, + "grad_norm": 0.0030424713622778654, + "learning_rate": 0.001, + "loss": 0.3882, + "step": 7061 + }, + { + "epoch": 0.19485612391657497, + "grad_norm": 0.0027415102813392878, + "learning_rate": 0.001, + "loss": 0.394, + "step": 7062 + }, + { + "epoch": 0.19488371611763936, + "grad_norm": 0.002916666679084301, + "learning_rate": 0.001, + "loss": 0.4095, + "step": 7063 + }, + { + "epoch": 0.1949113083187037, + "grad_norm": 0.0023576219100505114, + "learning_rate": 0.001, + "loss": 0.4512, + "step": 7064 + }, + { + "epoch": 0.1949389005197681, + "grad_norm": 0.002573948120698333, + "learning_rate": 0.001, + "loss": 0.3731, + "step": 7065 + }, + { + "epoch": 0.19496649272083247, + "grad_norm": 0.004156920593231916, + "learning_rate": 0.001, + "loss": 0.4234, + "step": 7066 + }, + { + "epoch": 0.19499408492189682, + "grad_norm": 0.0037690969184041023, + "learning_rate": 0.001, + "loss": 0.3681, + "step": 7067 + }, + { + "epoch": 0.1950216771229612, + "grad_norm": 0.0031203448306769133, + "learning_rate": 0.001, + "loss": 0.3632, + "step": 7068 + }, + { + "epoch": 0.19504926932402555, + "grad_norm": 0.002718303119763732, + "learning_rate": 0.001, + "loss": 0.4197, + "step": 7069 + }, + { + "epoch": 0.19507686152508993, + "grad_norm": 0.002880721352994442, + "learning_rate": 0.001, + "loss": 0.3749, + "step": 7070 + }, + { + "epoch": 0.19510445372615431, + "grad_norm": 0.0021646865643560886, + "learning_rate": 0.001, + "loss": 0.3722, + "step": 7071 + }, + { + "epoch": 0.19513204592721867, + "grad_norm": 0.004858906380832195, + "learning_rate": 0.001, + "loss": 0.3744, + "step": 7072 + }, + { + "epoch": 0.19515963812828305, + "grad_norm": 0.006143823266029358, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 7073 + }, + { + "epoch": 0.1951872303293474, + "grad_norm": 0.002867503557354212, + "learning_rate": 0.001, + "loss": 0.3752, + "step": 7074 + }, + { + "epoch": 0.19521482253041178, + "grad_norm": 0.0057819196954369545, + "learning_rate": 0.001, + "loss": 0.4177, + "step": 7075 + }, + { + "epoch": 0.19524241473147616, + "grad_norm": 0.003106197575107217, + "learning_rate": 0.001, + "loss": 0.4094, + "step": 7076 + }, + { + "epoch": 0.1952700069325405, + "grad_norm": 0.0021443332079797983, + "learning_rate": 0.001, + "loss": 0.3717, + "step": 7077 + }, + { + "epoch": 0.1952975991336049, + "grad_norm": 0.003420140827074647, + "learning_rate": 0.001, + "loss": 0.3775, + "step": 7078 + }, + { + "epoch": 0.19532519133466925, + "grad_norm": 0.0017785170348361135, + "learning_rate": 0.001, + "loss": 0.4202, + "step": 7079 + }, + { + "epoch": 0.19535278353573363, + "grad_norm": 0.003068318823352456, + "learning_rate": 0.001, + "loss": 0.3768, + "step": 7080 + }, + { + "epoch": 0.195380375736798, + "grad_norm": 0.0028662248514592648, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 7081 + }, + { + "epoch": 0.19540796793786236, + "grad_norm": 0.002816071966663003, + "learning_rate": 0.001, + "loss": 0.3827, + "step": 7082 + }, + { + "epoch": 0.19543556013892674, + "grad_norm": 0.003606053302064538, + "learning_rate": 0.001, + "loss": 0.3817, + "step": 7083 + }, + { + "epoch": 0.1954631523399911, + "grad_norm": 0.002966247033327818, + "learning_rate": 0.001, + "loss": 0.352, + "step": 7084 + }, + { + "epoch": 0.19549074454105547, + "grad_norm": 0.0033741393126547337, + "learning_rate": 0.001, + "loss": 0.4045, + "step": 7085 + }, + { + "epoch": 0.19551833674211985, + "grad_norm": 0.0050892336294054985, + "learning_rate": 0.001, + "loss": 0.4047, + "step": 7086 + }, + { + "epoch": 0.1955459289431842, + "grad_norm": 0.0030292568262666464, + "learning_rate": 0.001, + "loss": 0.4108, + "step": 7087 + }, + { + "epoch": 0.19557352114424859, + "grad_norm": 0.0051439255475997925, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 7088 + }, + { + "epoch": 0.19560111334531294, + "grad_norm": 0.0027856396045535803, + "learning_rate": 0.001, + "loss": 0.45, + "step": 7089 + }, + { + "epoch": 0.19562870554637732, + "grad_norm": 0.0030972841195762157, + "learning_rate": 0.001, + "loss": 0.4173, + "step": 7090 + }, + { + "epoch": 0.1956562977474417, + "grad_norm": 0.003249438712373376, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 7091 + }, + { + "epoch": 0.19568388994850605, + "grad_norm": 0.0027454618830233812, + "learning_rate": 0.001, + "loss": 0.4095, + "step": 7092 + }, + { + "epoch": 0.19571148214957043, + "grad_norm": 0.0027419314719736576, + "learning_rate": 0.001, + "loss": 0.3803, + "step": 7093 + }, + { + "epoch": 0.19573907435063478, + "grad_norm": 0.0027157592121511698, + "learning_rate": 0.001, + "loss": 0.3786, + "step": 7094 + }, + { + "epoch": 0.19576666655169916, + "grad_norm": 0.006076755467802286, + "learning_rate": 0.001, + "loss": 0.3998, + "step": 7095 + }, + { + "epoch": 0.19579425875276354, + "grad_norm": 0.0034049993846565485, + "learning_rate": 0.001, + "loss": 0.3862, + "step": 7096 + }, + { + "epoch": 0.1958218509538279, + "grad_norm": 0.004467803984880447, + "learning_rate": 0.001, + "loss": 0.3943, + "step": 7097 + }, + { + "epoch": 0.19584944315489228, + "grad_norm": 0.0059601617977023125, + "learning_rate": 0.001, + "loss": 0.3972, + "step": 7098 + }, + { + "epoch": 0.19587703535595663, + "grad_norm": 0.0025327398907393217, + "learning_rate": 0.001, + "loss": 0.4213, + "step": 7099 + }, + { + "epoch": 0.195904627557021, + "grad_norm": 0.002435739152133465, + "learning_rate": 0.001, + "loss": 0.3729, + "step": 7100 + }, + { + "epoch": 0.1959322197580854, + "grad_norm": 0.002827326999977231, + "learning_rate": 0.001, + "loss": 0.4195, + "step": 7101 + }, + { + "epoch": 0.19595981195914974, + "grad_norm": 0.003259490942582488, + "learning_rate": 0.001, + "loss": 0.4202, + "step": 7102 + }, + { + "epoch": 0.19598740416021412, + "grad_norm": 0.0026045122649520636, + "learning_rate": 0.001, + "loss": 0.3684, + "step": 7103 + }, + { + "epoch": 0.19601499636127848, + "grad_norm": 0.0032783085480332375, + "learning_rate": 0.001, + "loss": 0.382, + "step": 7104 + }, + { + "epoch": 0.19604258856234286, + "grad_norm": 0.004205625504255295, + "learning_rate": 0.001, + "loss": 0.3918, + "step": 7105 + }, + { + "epoch": 0.1960701807634072, + "grad_norm": 0.004856889136135578, + "learning_rate": 0.001, + "loss": 0.4006, + "step": 7106 + }, + { + "epoch": 0.1960977729644716, + "grad_norm": 0.005419073160737753, + "learning_rate": 0.001, + "loss": 0.4515, + "step": 7107 + }, + { + "epoch": 0.19612536516553597, + "grad_norm": 0.002779455156996846, + "learning_rate": 0.001, + "loss": 0.3753, + "step": 7108 + }, + { + "epoch": 0.19615295736660032, + "grad_norm": 0.005004864651709795, + "learning_rate": 0.001, + "loss": 0.3849, + "step": 7109 + }, + { + "epoch": 0.1961805495676647, + "grad_norm": 0.0031100206542760134, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 7110 + }, + { + "epoch": 0.19620814176872906, + "grad_norm": 0.002827596152201295, + "learning_rate": 0.001, + "loss": 0.3784, + "step": 7111 + }, + { + "epoch": 0.19623573396979344, + "grad_norm": 0.003027317812666297, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 7112 + }, + { + "epoch": 0.19626332617085782, + "grad_norm": 0.004092982038855553, + "learning_rate": 0.001, + "loss": 0.3499, + "step": 7113 + }, + { + "epoch": 0.19629091837192217, + "grad_norm": 0.0034322801511734724, + "learning_rate": 0.001, + "loss": 0.39, + "step": 7114 + }, + { + "epoch": 0.19631851057298655, + "grad_norm": 0.0031399535946547985, + "learning_rate": 0.001, + "loss": 0.3798, + "step": 7115 + }, + { + "epoch": 0.1963461027740509, + "grad_norm": 0.004002594854682684, + "learning_rate": 0.001, + "loss": 0.4014, + "step": 7116 + }, + { + "epoch": 0.19637369497511528, + "grad_norm": 0.0028446640353649855, + "learning_rate": 0.001, + "loss": 0.4235, + "step": 7117 + }, + { + "epoch": 0.19640128717617966, + "grad_norm": 0.0026851436123251915, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 7118 + }, + { + "epoch": 0.19642887937724401, + "grad_norm": 0.004198794718831778, + "learning_rate": 0.001, + "loss": 0.3929, + "step": 7119 + }, + { + "epoch": 0.1964564715783084, + "grad_norm": 0.0026877266354858875, + "learning_rate": 0.001, + "loss": 0.4258, + "step": 7120 + }, + { + "epoch": 0.19648406377937275, + "grad_norm": 0.0036120673175901175, + "learning_rate": 0.001, + "loss": 0.4328, + "step": 7121 + }, + { + "epoch": 0.19651165598043713, + "grad_norm": 0.004937993362545967, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 7122 + }, + { + "epoch": 0.1965392481815015, + "grad_norm": 0.0036189809907227755, + "learning_rate": 0.001, + "loss": 0.3815, + "step": 7123 + }, + { + "epoch": 0.19656684038256586, + "grad_norm": 0.002798637840896845, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 7124 + }, + { + "epoch": 0.19659443258363024, + "grad_norm": 0.005139973945915699, + "learning_rate": 0.001, + "loss": 0.402, + "step": 7125 + }, + { + "epoch": 0.1966220247846946, + "grad_norm": 0.006130583584308624, + "learning_rate": 0.001, + "loss": 0.401, + "step": 7126 + }, + { + "epoch": 0.19664961698575897, + "grad_norm": 0.005488388240337372, + "learning_rate": 0.001, + "loss": 0.3898, + "step": 7127 + }, + { + "epoch": 0.19667720918682335, + "grad_norm": 0.004060831852257252, + "learning_rate": 0.001, + "loss": 0.4456, + "step": 7128 + }, + { + "epoch": 0.1967048013878877, + "grad_norm": 0.0029782892670482397, + "learning_rate": 0.001, + "loss": 0.394, + "step": 7129 + }, + { + "epoch": 0.1967323935889521, + "grad_norm": 0.0036948262713849545, + "learning_rate": 0.001, + "loss": 0.4239, + "step": 7130 + }, + { + "epoch": 0.19675998579001644, + "grad_norm": 0.004071654751896858, + "learning_rate": 0.001, + "loss": 0.4095, + "step": 7131 + }, + { + "epoch": 0.19678757799108082, + "grad_norm": 0.0027887504547834396, + "learning_rate": 0.001, + "loss": 0.404, + "step": 7132 + }, + { + "epoch": 0.1968151701921452, + "grad_norm": 0.003311131615191698, + "learning_rate": 0.001, + "loss": 0.3998, + "step": 7133 + }, + { + "epoch": 0.19684276239320955, + "grad_norm": 0.002101517515257001, + "learning_rate": 0.001, + "loss": 0.4199, + "step": 7134 + }, + { + "epoch": 0.19687035459427393, + "grad_norm": 0.002868801588192582, + "learning_rate": 0.001, + "loss": 0.4191, + "step": 7135 + }, + { + "epoch": 0.19689794679533829, + "grad_norm": 0.003476998768746853, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 7136 + }, + { + "epoch": 0.19692553899640267, + "grad_norm": 0.0028729683253914118, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 7137 + }, + { + "epoch": 0.19695313119746705, + "grad_norm": 0.0023318929597735405, + "learning_rate": 0.001, + "loss": 0.3955, + "step": 7138 + }, + { + "epoch": 0.1969807233985314, + "grad_norm": 0.0027352285105735064, + "learning_rate": 0.001, + "loss": 0.3787, + "step": 7139 + }, + { + "epoch": 0.19700831559959578, + "grad_norm": 0.006381817627698183, + "learning_rate": 0.001, + "loss": 0.3775, + "step": 7140 + }, + { + "epoch": 0.19703590780066013, + "grad_norm": 0.0036294516175985336, + "learning_rate": 0.001, + "loss": 0.3923, + "step": 7141 + }, + { + "epoch": 0.1970635000017245, + "grad_norm": 0.0023575150407850742, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 7142 + }, + { + "epoch": 0.1970910922027889, + "grad_norm": 0.0027103947941213846, + "learning_rate": 0.001, + "loss": 0.3956, + "step": 7143 + }, + { + "epoch": 0.19711868440385324, + "grad_norm": 0.003010998945683241, + "learning_rate": 0.001, + "loss": 0.4132, + "step": 7144 + }, + { + "epoch": 0.19714627660491763, + "grad_norm": 0.0021617074962705374, + "learning_rate": 0.001, + "loss": 0.4136, + "step": 7145 + }, + { + "epoch": 0.19717386880598198, + "grad_norm": 0.008347420953214169, + "learning_rate": 0.001, + "loss": 0.4099, + "step": 7146 + }, + { + "epoch": 0.19720146100704636, + "grad_norm": 0.002271142089739442, + "learning_rate": 0.001, + "loss": 0.3984, + "step": 7147 + }, + { + "epoch": 0.19722905320811074, + "grad_norm": 0.005453981924802065, + "learning_rate": 0.001, + "loss": 0.3863, + "step": 7148 + }, + { + "epoch": 0.1972566454091751, + "grad_norm": 0.003245376283302903, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 7149 + }, + { + "epoch": 0.19728423761023947, + "grad_norm": 0.0026226479094475508, + "learning_rate": 0.001, + "loss": 0.3647, + "step": 7150 + }, + { + "epoch": 0.19731182981130382, + "grad_norm": 0.003077802946791053, + "learning_rate": 0.001, + "loss": 0.4184, + "step": 7151 + }, + { + "epoch": 0.1973394220123682, + "grad_norm": 0.0025755036622285843, + "learning_rate": 0.001, + "loss": 0.3937, + "step": 7152 + }, + { + "epoch": 0.19736701421343258, + "grad_norm": 0.0020703410264104605, + "learning_rate": 0.001, + "loss": 0.4281, + "step": 7153 + }, + { + "epoch": 0.19739460641449694, + "grad_norm": 0.0022496734745800495, + "learning_rate": 0.001, + "loss": 0.3912, + "step": 7154 + }, + { + "epoch": 0.19742219861556132, + "grad_norm": 0.004225686192512512, + "learning_rate": 0.001, + "loss": 0.3843, + "step": 7155 + }, + { + "epoch": 0.19744979081662567, + "grad_norm": 0.0025360104627907276, + "learning_rate": 0.001, + "loss": 0.4148, + "step": 7156 + }, + { + "epoch": 0.19747738301769005, + "grad_norm": 0.00297233066521585, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 7157 + }, + { + "epoch": 0.19750497521875443, + "grad_norm": 0.0028523015789687634, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 7158 + }, + { + "epoch": 0.19753256741981878, + "grad_norm": 0.002735694171860814, + "learning_rate": 0.001, + "loss": 0.4205, + "step": 7159 + }, + { + "epoch": 0.19756015962088316, + "grad_norm": 0.005362628027796745, + "learning_rate": 0.001, + "loss": 0.4041, + "step": 7160 + }, + { + "epoch": 0.19758775182194752, + "grad_norm": 0.0036659168545156717, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 7161 + }, + { + "epoch": 0.1976153440230119, + "grad_norm": 0.0036525116302073, + "learning_rate": 0.001, + "loss": 0.4265, + "step": 7162 + }, + { + "epoch": 0.19764293622407628, + "grad_norm": 0.002731502987444401, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 7163 + }, + { + "epoch": 0.19767052842514063, + "grad_norm": 0.006773337721824646, + "learning_rate": 0.001, + "loss": 0.4441, + "step": 7164 + }, + { + "epoch": 0.197698120626205, + "grad_norm": 0.0025534844025969505, + "learning_rate": 0.001, + "loss": 0.4235, + "step": 7165 + }, + { + "epoch": 0.19772571282726936, + "grad_norm": 0.003402253147214651, + "learning_rate": 0.001, + "loss": 0.3757, + "step": 7166 + }, + { + "epoch": 0.19775330502833374, + "grad_norm": 0.003048243233934045, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 7167 + }, + { + "epoch": 0.19778089722939812, + "grad_norm": 0.004743688274174929, + "learning_rate": 0.001, + "loss": 0.3636, + "step": 7168 + }, + { + "epoch": 0.19780848943046248, + "grad_norm": 0.0041982936672866344, + "learning_rate": 0.001, + "loss": 0.3869, + "step": 7169 + }, + { + "epoch": 0.19783608163152686, + "grad_norm": 0.0023896710481494665, + "learning_rate": 0.001, + "loss": 0.4214, + "step": 7170 + }, + { + "epoch": 0.1978636738325912, + "grad_norm": 0.0028549651615321636, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 7171 + }, + { + "epoch": 0.1978912660336556, + "grad_norm": 0.002721426310017705, + "learning_rate": 0.001, + "loss": 0.4072, + "step": 7172 + }, + { + "epoch": 0.19791885823471997, + "grad_norm": 0.0029002949595451355, + "learning_rate": 0.001, + "loss": 0.3972, + "step": 7173 + }, + { + "epoch": 0.19794645043578432, + "grad_norm": 0.00278780166991055, + "learning_rate": 0.001, + "loss": 0.3967, + "step": 7174 + }, + { + "epoch": 0.1979740426368487, + "grad_norm": 0.0019508522236719728, + "learning_rate": 0.001, + "loss": 0.3739, + "step": 7175 + }, + { + "epoch": 0.19800163483791305, + "grad_norm": 0.002591001568362117, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 7176 + }, + { + "epoch": 0.19802922703897743, + "grad_norm": 0.00309981987811625, + "learning_rate": 0.001, + "loss": 0.3692, + "step": 7177 + }, + { + "epoch": 0.19805681924004181, + "grad_norm": 0.003413395956158638, + "learning_rate": 0.001, + "loss": 0.3945, + "step": 7178 + }, + { + "epoch": 0.19808441144110617, + "grad_norm": 0.0031371319200843573, + "learning_rate": 0.001, + "loss": 0.3773, + "step": 7179 + }, + { + "epoch": 0.19811200364217055, + "grad_norm": 0.0040471418760716915, + "learning_rate": 0.001, + "loss": 0.379, + "step": 7180 + }, + { + "epoch": 0.1981395958432349, + "grad_norm": 0.002860469277948141, + "learning_rate": 0.001, + "loss": 0.4273, + "step": 7181 + }, + { + "epoch": 0.19816718804429928, + "grad_norm": 0.0035744935739785433, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 7182 + }, + { + "epoch": 0.19819478024536366, + "grad_norm": 0.0021571393590420485, + "learning_rate": 0.001, + "loss": 0.4033, + "step": 7183 + }, + { + "epoch": 0.198222372446428, + "grad_norm": 0.0027684883680194616, + "learning_rate": 0.001, + "loss": 0.398, + "step": 7184 + }, + { + "epoch": 0.1982499646474924, + "grad_norm": 0.004502817988395691, + "learning_rate": 0.001, + "loss": 0.397, + "step": 7185 + }, + { + "epoch": 0.19827755684855675, + "grad_norm": 0.0026800809428095818, + "learning_rate": 0.001, + "loss": 0.4417, + "step": 7186 + }, + { + "epoch": 0.19830514904962113, + "grad_norm": 0.004524301737546921, + "learning_rate": 0.001, + "loss": 0.392, + "step": 7187 + }, + { + "epoch": 0.1983327412506855, + "grad_norm": 0.002851566532626748, + "learning_rate": 0.001, + "loss": 0.3904, + "step": 7188 + }, + { + "epoch": 0.19836033345174986, + "grad_norm": 0.004874248988926411, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 7189 + }, + { + "epoch": 0.19838792565281424, + "grad_norm": 0.0025999664794653654, + "learning_rate": 0.001, + "loss": 0.4053, + "step": 7190 + }, + { + "epoch": 0.1984155178538786, + "grad_norm": 0.0053477040491998196, + "learning_rate": 0.001, + "loss": 0.4298, + "step": 7191 + }, + { + "epoch": 0.19844311005494297, + "grad_norm": 0.002580095548182726, + "learning_rate": 0.001, + "loss": 0.3889, + "step": 7192 + }, + { + "epoch": 0.19847070225600735, + "grad_norm": 0.006665955297648907, + "learning_rate": 0.001, + "loss": 0.4013, + "step": 7193 + }, + { + "epoch": 0.1984982944570717, + "grad_norm": 0.002255852334201336, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 7194 + }, + { + "epoch": 0.19852588665813609, + "grad_norm": 0.0027929407078772783, + "learning_rate": 0.001, + "loss": 0.3832, + "step": 7195 + }, + { + "epoch": 0.19855347885920044, + "grad_norm": 0.002101206686347723, + "learning_rate": 0.001, + "loss": 0.4199, + "step": 7196 + }, + { + "epoch": 0.19858107106026482, + "grad_norm": 0.0027232288848608732, + "learning_rate": 0.001, + "loss": 0.435, + "step": 7197 + }, + { + "epoch": 0.1986086632613292, + "grad_norm": 0.007063394878059626, + "learning_rate": 0.001, + "loss": 0.385, + "step": 7198 + }, + { + "epoch": 0.19863625546239355, + "grad_norm": 0.004317095037549734, + "learning_rate": 0.001, + "loss": 0.4079, + "step": 7199 + }, + { + "epoch": 0.19866384766345793, + "grad_norm": 0.0018634272273629904, + "learning_rate": 0.001, + "loss": 0.3987, + "step": 7200 + }, + { + "epoch": 0.19869143986452228, + "grad_norm": 0.0029656426049768925, + "learning_rate": 0.001, + "loss": 0.3785, + "step": 7201 + }, + { + "epoch": 0.19871903206558666, + "grad_norm": 0.0018411249620839953, + "learning_rate": 0.001, + "loss": 0.44, + "step": 7202 + }, + { + "epoch": 0.19874662426665102, + "grad_norm": 0.0027455666568130255, + "learning_rate": 0.001, + "loss": 0.4132, + "step": 7203 + }, + { + "epoch": 0.1987742164677154, + "grad_norm": 0.003438533516600728, + "learning_rate": 0.001, + "loss": 0.423, + "step": 7204 + }, + { + "epoch": 0.19880180866877978, + "grad_norm": 0.0023013537283986807, + "learning_rate": 0.001, + "loss": 0.4364, + "step": 7205 + }, + { + "epoch": 0.19882940086984413, + "grad_norm": 0.003259762190282345, + "learning_rate": 0.001, + "loss": 0.3781, + "step": 7206 + }, + { + "epoch": 0.1988569930709085, + "grad_norm": 0.002609924878925085, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 7207 + }, + { + "epoch": 0.19888458527197286, + "grad_norm": 0.002380332676693797, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 7208 + }, + { + "epoch": 0.19891217747303724, + "grad_norm": 0.0030413048807531595, + "learning_rate": 0.001, + "loss": 0.3996, + "step": 7209 + }, + { + "epoch": 0.19893976967410162, + "grad_norm": 0.0022222718689590693, + "learning_rate": 0.001, + "loss": 0.3882, + "step": 7210 + }, + { + "epoch": 0.19896736187516598, + "grad_norm": 0.0030871161725372076, + "learning_rate": 0.001, + "loss": 0.4214, + "step": 7211 + }, + { + "epoch": 0.19899495407623036, + "grad_norm": 0.005348892416805029, + "learning_rate": 0.001, + "loss": 0.4101, + "step": 7212 + }, + { + "epoch": 0.1990225462772947, + "grad_norm": 0.005420180968940258, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 7213 + }, + { + "epoch": 0.1990501384783591, + "grad_norm": 0.003726621624082327, + "learning_rate": 0.001, + "loss": 0.3649, + "step": 7214 + }, + { + "epoch": 0.19907773067942347, + "grad_norm": 0.0031097896862775087, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 7215 + }, + { + "epoch": 0.19910532288048782, + "grad_norm": 0.0029584530275315046, + "learning_rate": 0.001, + "loss": 0.3914, + "step": 7216 + }, + { + "epoch": 0.1991329150815522, + "grad_norm": 0.0022907305974513292, + "learning_rate": 0.001, + "loss": 0.4146, + "step": 7217 + }, + { + "epoch": 0.19916050728261656, + "grad_norm": 0.0031007654033601284, + "learning_rate": 0.001, + "loss": 0.3831, + "step": 7218 + }, + { + "epoch": 0.19918809948368094, + "grad_norm": 0.0021812678314745426, + "learning_rate": 0.001, + "loss": 0.4274, + "step": 7219 + }, + { + "epoch": 0.19921569168474532, + "grad_norm": 0.0023966538719832897, + "learning_rate": 0.001, + "loss": 0.394, + "step": 7220 + }, + { + "epoch": 0.19924328388580967, + "grad_norm": 0.0031528030522167683, + "learning_rate": 0.001, + "loss": 0.3718, + "step": 7221 + }, + { + "epoch": 0.19927087608687405, + "grad_norm": 0.0022123432718217373, + "learning_rate": 0.001, + "loss": 0.3737, + "step": 7222 + }, + { + "epoch": 0.1992984682879384, + "grad_norm": 0.0031831758096814156, + "learning_rate": 0.001, + "loss": 0.3719, + "step": 7223 + }, + { + "epoch": 0.19932606048900278, + "grad_norm": 0.002463659970089793, + "learning_rate": 0.001, + "loss": 0.406, + "step": 7224 + }, + { + "epoch": 0.19935365269006716, + "grad_norm": 0.0026505514979362488, + "learning_rate": 0.001, + "loss": 0.3892, + "step": 7225 + }, + { + "epoch": 0.19938124489113151, + "grad_norm": 0.0024356083013117313, + "learning_rate": 0.001, + "loss": 0.3873, + "step": 7226 + }, + { + "epoch": 0.1994088370921959, + "grad_norm": 0.009376121684908867, + "learning_rate": 0.001, + "loss": 0.3801, + "step": 7227 + }, + { + "epoch": 0.19943642929326025, + "grad_norm": 0.002203370677307248, + "learning_rate": 0.001, + "loss": 0.4149, + "step": 7228 + }, + { + "epoch": 0.19946402149432463, + "grad_norm": 0.0029552599880844355, + "learning_rate": 0.001, + "loss": 0.3918, + "step": 7229 + }, + { + "epoch": 0.199491613695389, + "grad_norm": 0.0028194712940603495, + "learning_rate": 0.001, + "loss": 0.3809, + "step": 7230 + }, + { + "epoch": 0.19951920589645336, + "grad_norm": 0.0036550972145050764, + "learning_rate": 0.001, + "loss": 0.4238, + "step": 7231 + }, + { + "epoch": 0.19954679809751774, + "grad_norm": 0.002419488737359643, + "learning_rate": 0.001, + "loss": 0.378, + "step": 7232 + }, + { + "epoch": 0.1995743902985821, + "grad_norm": 0.002697533695027232, + "learning_rate": 0.001, + "loss": 0.4295, + "step": 7233 + }, + { + "epoch": 0.19960198249964647, + "grad_norm": 0.0025205162819474936, + "learning_rate": 0.001, + "loss": 0.3649, + "step": 7234 + }, + { + "epoch": 0.19962957470071085, + "grad_norm": 0.002819119254127145, + "learning_rate": 0.001, + "loss": 0.3823, + "step": 7235 + }, + { + "epoch": 0.1996571669017752, + "grad_norm": 0.002348159672692418, + "learning_rate": 0.001, + "loss": 0.4296, + "step": 7236 + }, + { + "epoch": 0.1996847591028396, + "grad_norm": 0.002327647991478443, + "learning_rate": 0.001, + "loss": 0.4258, + "step": 7237 + }, + { + "epoch": 0.19971235130390394, + "grad_norm": 0.003467888105660677, + "learning_rate": 0.001, + "loss": 0.375, + "step": 7238 + }, + { + "epoch": 0.19973994350496832, + "grad_norm": 0.002749866805970669, + "learning_rate": 0.001, + "loss": 0.3894, + "step": 7239 + }, + { + "epoch": 0.1997675357060327, + "grad_norm": 0.002804868621751666, + "learning_rate": 0.001, + "loss": 0.4616, + "step": 7240 + }, + { + "epoch": 0.19979512790709705, + "grad_norm": 0.0025544483214616776, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 7241 + }, + { + "epoch": 0.19982272010816143, + "grad_norm": 0.004032348282635212, + "learning_rate": 0.001, + "loss": 0.3877, + "step": 7242 + }, + { + "epoch": 0.19985031230922579, + "grad_norm": 0.0025702861603349447, + "learning_rate": 0.001, + "loss": 0.4052, + "step": 7243 + }, + { + "epoch": 0.19987790451029017, + "grad_norm": 0.0029868450947105885, + "learning_rate": 0.001, + "loss": 0.3724, + "step": 7244 + }, + { + "epoch": 0.19990549671135455, + "grad_norm": 0.0034900156315416098, + "learning_rate": 0.001, + "loss": 0.3749, + "step": 7245 + }, + { + "epoch": 0.1999330889124189, + "grad_norm": 0.004184171557426453, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 7246 + }, + { + "epoch": 0.19996068111348328, + "grad_norm": 0.003094634972512722, + "learning_rate": 0.001, + "loss": 0.3703, + "step": 7247 + }, + { + "epoch": 0.19998827331454763, + "grad_norm": 0.004218010231852531, + "learning_rate": 0.001, + "loss": 0.4223, + "step": 7248 + }, + { + "epoch": 0.200015865515612, + "grad_norm": 0.002849703887477517, + "learning_rate": 0.001, + "loss": 0.4061, + "step": 7249 + }, + { + "epoch": 0.2000434577166764, + "grad_norm": 0.002104248385876417, + "learning_rate": 0.001, + "loss": 0.4376, + "step": 7250 + }, + { + "epoch": 0.20007104991774075, + "grad_norm": 0.0028338278643786907, + "learning_rate": 0.001, + "loss": 0.4377, + "step": 7251 + }, + { + "epoch": 0.20009864211880513, + "grad_norm": 0.0070443847216665745, + "learning_rate": 0.001, + "loss": 0.3685, + "step": 7252 + }, + { + "epoch": 0.20012623431986948, + "grad_norm": 0.002888858551159501, + "learning_rate": 0.001, + "loss": 0.4073, + "step": 7253 + }, + { + "epoch": 0.20015382652093386, + "grad_norm": 0.0023948305752128363, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 7254 + }, + { + "epoch": 0.20018141872199824, + "grad_norm": 0.0023272959515452385, + "learning_rate": 0.001, + "loss": 0.4205, + "step": 7255 + }, + { + "epoch": 0.2002090109230626, + "grad_norm": 0.0038700769655406475, + "learning_rate": 0.001, + "loss": 0.4253, + "step": 7256 + }, + { + "epoch": 0.20023660312412697, + "grad_norm": 0.002915510907769203, + "learning_rate": 0.001, + "loss": 0.3901, + "step": 7257 + }, + { + "epoch": 0.20026419532519132, + "grad_norm": 0.0024133019614964724, + "learning_rate": 0.001, + "loss": 0.4259, + "step": 7258 + }, + { + "epoch": 0.2002917875262557, + "grad_norm": 0.002601010724902153, + "learning_rate": 0.001, + "loss": 0.4087, + "step": 7259 + }, + { + "epoch": 0.20031937972732008, + "grad_norm": 0.004203200805932283, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 7260 + }, + { + "epoch": 0.20034697192838444, + "grad_norm": 0.0025508387479931116, + "learning_rate": 0.001, + "loss": 0.4179, + "step": 7261 + }, + { + "epoch": 0.20037456412944882, + "grad_norm": 0.0040724920108914375, + "learning_rate": 0.001, + "loss": 0.4209, + "step": 7262 + }, + { + "epoch": 0.20040215633051317, + "grad_norm": 0.003506725886836648, + "learning_rate": 0.001, + "loss": 0.3999, + "step": 7263 + }, + { + "epoch": 0.20042974853157755, + "grad_norm": 0.003731567645445466, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 7264 + }, + { + "epoch": 0.20045734073264193, + "grad_norm": 0.0050995261408388615, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 7265 + }, + { + "epoch": 0.20048493293370628, + "grad_norm": 0.00341955223120749, + "learning_rate": 0.001, + "loss": 0.4045, + "step": 7266 + }, + { + "epoch": 0.20051252513477066, + "grad_norm": 0.003466901136562228, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 7267 + }, + { + "epoch": 0.20054011733583502, + "grad_norm": 0.0026405456010252237, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 7268 + }, + { + "epoch": 0.2005677095368994, + "grad_norm": 0.002517679473385215, + "learning_rate": 0.001, + "loss": 0.3902, + "step": 7269 + }, + { + "epoch": 0.20059530173796378, + "grad_norm": 0.002916789846494794, + "learning_rate": 0.001, + "loss": 0.4504, + "step": 7270 + }, + { + "epoch": 0.20062289393902813, + "grad_norm": 0.0029924395494163036, + "learning_rate": 0.001, + "loss": 0.3836, + "step": 7271 + }, + { + "epoch": 0.2006504861400925, + "grad_norm": 0.004070238210260868, + "learning_rate": 0.001, + "loss": 0.4133, + "step": 7272 + }, + { + "epoch": 0.20067807834115686, + "grad_norm": 0.002408135449513793, + "learning_rate": 0.001, + "loss": 0.4481, + "step": 7273 + }, + { + "epoch": 0.20070567054222124, + "grad_norm": 0.005004410166293383, + "learning_rate": 0.001, + "loss": 0.4515, + "step": 7274 + }, + { + "epoch": 0.20073326274328562, + "grad_norm": 0.002634943462908268, + "learning_rate": 0.001, + "loss": 0.4032, + "step": 7275 + }, + { + "epoch": 0.20076085494434998, + "grad_norm": 0.0034329029731452465, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 7276 + }, + { + "epoch": 0.20078844714541436, + "grad_norm": 0.003150471020489931, + "learning_rate": 0.001, + "loss": 0.4292, + "step": 7277 + }, + { + "epoch": 0.2008160393464787, + "grad_norm": 0.0030349427834153175, + "learning_rate": 0.001, + "loss": 0.3582, + "step": 7278 + }, + { + "epoch": 0.2008436315475431, + "grad_norm": 0.0024701536167412996, + "learning_rate": 0.001, + "loss": 0.3926, + "step": 7279 + }, + { + "epoch": 0.20087122374860747, + "grad_norm": 0.0022775549441576004, + "learning_rate": 0.001, + "loss": 0.4324, + "step": 7280 + }, + { + "epoch": 0.20089881594967182, + "grad_norm": 0.0037725751753896475, + "learning_rate": 0.001, + "loss": 0.4061, + "step": 7281 + }, + { + "epoch": 0.2009264081507362, + "grad_norm": 0.0029782191850245, + "learning_rate": 0.001, + "loss": 0.4382, + "step": 7282 + }, + { + "epoch": 0.20095400035180055, + "grad_norm": 0.0028811590746045113, + "learning_rate": 0.001, + "loss": 0.3666, + "step": 7283 + }, + { + "epoch": 0.20098159255286493, + "grad_norm": 0.0027202339842915535, + "learning_rate": 0.001, + "loss": 0.4074, + "step": 7284 + }, + { + "epoch": 0.20100918475392932, + "grad_norm": 0.0026378172915428877, + "learning_rate": 0.001, + "loss": 0.3846, + "step": 7285 + }, + { + "epoch": 0.20103677695499367, + "grad_norm": 0.0031549364794045687, + "learning_rate": 0.001, + "loss": 0.409, + "step": 7286 + }, + { + "epoch": 0.20106436915605805, + "grad_norm": 0.0036540657747536898, + "learning_rate": 0.001, + "loss": 0.3454, + "step": 7287 + }, + { + "epoch": 0.2010919613571224, + "grad_norm": 0.0027373498305678368, + "learning_rate": 0.001, + "loss": 0.4287, + "step": 7288 + }, + { + "epoch": 0.20111955355818678, + "grad_norm": 0.0032555065117776394, + "learning_rate": 0.001, + "loss": 0.3941, + "step": 7289 + }, + { + "epoch": 0.20114714575925116, + "grad_norm": 0.0048676906153559685, + "learning_rate": 0.001, + "loss": 0.3526, + "step": 7290 + }, + { + "epoch": 0.20117473796031551, + "grad_norm": 0.0023562663700431585, + "learning_rate": 0.001, + "loss": 0.3755, + "step": 7291 + }, + { + "epoch": 0.2012023301613799, + "grad_norm": 0.00272022164426744, + "learning_rate": 0.001, + "loss": 0.384, + "step": 7292 + }, + { + "epoch": 0.20122992236244425, + "grad_norm": 0.002916688099503517, + "learning_rate": 0.001, + "loss": 0.3676, + "step": 7293 + }, + { + "epoch": 0.20125751456350863, + "grad_norm": 0.003989284858107567, + "learning_rate": 0.001, + "loss": 0.3717, + "step": 7294 + }, + { + "epoch": 0.20128510676457298, + "grad_norm": 0.0029025361873209476, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 7295 + }, + { + "epoch": 0.20131269896563736, + "grad_norm": 0.003632181789726019, + "learning_rate": 0.001, + "loss": 0.4167, + "step": 7296 + }, + { + "epoch": 0.20134029116670174, + "grad_norm": 0.004145762883126736, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 7297 + }, + { + "epoch": 0.2013678833677661, + "grad_norm": 0.0024826403241604567, + "learning_rate": 0.001, + "loss": 0.4606, + "step": 7298 + }, + { + "epoch": 0.20139547556883047, + "grad_norm": 0.002547920448705554, + "learning_rate": 0.001, + "loss": 0.4127, + "step": 7299 + }, + { + "epoch": 0.20142306776989483, + "grad_norm": 0.0030286472756415606, + "learning_rate": 0.001, + "loss": 0.4077, + "step": 7300 + }, + { + "epoch": 0.2014506599709592, + "grad_norm": 0.008298000320792198, + "learning_rate": 0.001, + "loss": 0.3692, + "step": 7301 + }, + { + "epoch": 0.2014782521720236, + "grad_norm": 0.003152182325720787, + "learning_rate": 0.001, + "loss": 0.4169, + "step": 7302 + }, + { + "epoch": 0.20150584437308794, + "grad_norm": 0.002809209283441305, + "learning_rate": 0.001, + "loss": 0.4033, + "step": 7303 + }, + { + "epoch": 0.20153343657415232, + "grad_norm": 0.004826799966394901, + "learning_rate": 0.001, + "loss": 0.4196, + "step": 7304 + }, + { + "epoch": 0.20156102877521667, + "grad_norm": 0.0035292392130941153, + "learning_rate": 0.001, + "loss": 0.4192, + "step": 7305 + }, + { + "epoch": 0.20158862097628105, + "grad_norm": 0.002209904370829463, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 7306 + }, + { + "epoch": 0.20161621317734543, + "grad_norm": 0.002997803036123514, + "learning_rate": 0.001, + "loss": 0.4296, + "step": 7307 + }, + { + "epoch": 0.20164380537840979, + "grad_norm": 0.004406254272907972, + "learning_rate": 0.001, + "loss": 0.4045, + "step": 7308 + }, + { + "epoch": 0.20167139757947417, + "grad_norm": 0.0043180338107049465, + "learning_rate": 0.001, + "loss": 0.4384, + "step": 7309 + }, + { + "epoch": 0.20169898978053852, + "grad_norm": 0.07319663465023041, + "learning_rate": 0.001, + "loss": 0.3987, + "step": 7310 + }, + { + "epoch": 0.2017265819816029, + "grad_norm": 0.003474163357168436, + "learning_rate": 0.001, + "loss": 0.4134, + "step": 7311 + }, + { + "epoch": 0.20175417418266728, + "grad_norm": 0.005750832613557577, + "learning_rate": 0.001, + "loss": 0.3754, + "step": 7312 + }, + { + "epoch": 0.20178176638373163, + "grad_norm": 0.0026583108119666576, + "learning_rate": 0.001, + "loss": 0.4371, + "step": 7313 + }, + { + "epoch": 0.201809358584796, + "grad_norm": 0.004087739158421755, + "learning_rate": 0.001, + "loss": 0.4412, + "step": 7314 + }, + { + "epoch": 0.20183695078586036, + "grad_norm": 0.0027278910856693983, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 7315 + }, + { + "epoch": 0.20186454298692474, + "grad_norm": 0.002498122164979577, + "learning_rate": 0.001, + "loss": 0.4084, + "step": 7316 + }, + { + "epoch": 0.20189213518798912, + "grad_norm": 0.004829999525099993, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 7317 + }, + { + "epoch": 0.20191972738905348, + "grad_norm": 0.0025493940338492393, + "learning_rate": 0.001, + "loss": 0.4411, + "step": 7318 + }, + { + "epoch": 0.20194731959011786, + "grad_norm": 0.0027906931936740875, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 7319 + }, + { + "epoch": 0.2019749117911822, + "grad_norm": 0.0034205715637654066, + "learning_rate": 0.001, + "loss": 0.3953, + "step": 7320 + }, + { + "epoch": 0.2020025039922466, + "grad_norm": 0.0030028745532035828, + "learning_rate": 0.001, + "loss": 0.4153, + "step": 7321 + }, + { + "epoch": 0.20203009619331097, + "grad_norm": 0.006040586158633232, + "learning_rate": 0.001, + "loss": 0.4089, + "step": 7322 + }, + { + "epoch": 0.20205768839437532, + "grad_norm": 0.0025682656560093164, + "learning_rate": 0.001, + "loss": 0.4298, + "step": 7323 + }, + { + "epoch": 0.2020852805954397, + "grad_norm": 0.003085241885855794, + "learning_rate": 0.001, + "loss": 0.44, + "step": 7324 + }, + { + "epoch": 0.20211287279650406, + "grad_norm": 0.0035471986047923565, + "learning_rate": 0.001, + "loss": 0.4127, + "step": 7325 + }, + { + "epoch": 0.20214046499756844, + "grad_norm": 0.0031859648879617453, + "learning_rate": 0.001, + "loss": 0.4442, + "step": 7326 + }, + { + "epoch": 0.20216805719863282, + "grad_norm": 0.003315818263217807, + "learning_rate": 0.001, + "loss": 0.4262, + "step": 7327 + }, + { + "epoch": 0.20219564939969717, + "grad_norm": 0.004785994999110699, + "learning_rate": 0.001, + "loss": 0.346, + "step": 7328 + }, + { + "epoch": 0.20222324160076155, + "grad_norm": 0.004568551201373339, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 7329 + }, + { + "epoch": 0.2022508338018259, + "grad_norm": 0.00264363712631166, + "learning_rate": 0.001, + "loss": 0.4275, + "step": 7330 + }, + { + "epoch": 0.20227842600289028, + "grad_norm": 0.003292328678071499, + "learning_rate": 0.001, + "loss": 0.416, + "step": 7331 + }, + { + "epoch": 0.20230601820395466, + "grad_norm": 0.004438953939825296, + "learning_rate": 0.001, + "loss": 0.3979, + "step": 7332 + }, + { + "epoch": 0.20233361040501902, + "grad_norm": 0.0032826552633196115, + "learning_rate": 0.001, + "loss": 0.4065, + "step": 7333 + }, + { + "epoch": 0.2023612026060834, + "grad_norm": 0.03612243011593819, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 7334 + }, + { + "epoch": 0.20238879480714775, + "grad_norm": 0.01141429878771305, + "learning_rate": 0.001, + "loss": 0.3783, + "step": 7335 + }, + { + "epoch": 0.20241638700821213, + "grad_norm": 0.0034477144945412874, + "learning_rate": 0.001, + "loss": 0.4199, + "step": 7336 + }, + { + "epoch": 0.2024439792092765, + "grad_norm": 0.007367158308625221, + "learning_rate": 0.001, + "loss": 0.37, + "step": 7337 + }, + { + "epoch": 0.20247157141034086, + "grad_norm": 0.0035362287890166044, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 7338 + }, + { + "epoch": 0.20249916361140524, + "grad_norm": 0.002065723529085517, + "learning_rate": 0.001, + "loss": 0.3783, + "step": 7339 + }, + { + "epoch": 0.2025267558124696, + "grad_norm": 0.004578362684696913, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 7340 + }, + { + "epoch": 0.20255434801353397, + "grad_norm": 0.002685924293473363, + "learning_rate": 0.001, + "loss": 0.3728, + "step": 7341 + }, + { + "epoch": 0.20258194021459835, + "grad_norm": 0.0038794514257460833, + "learning_rate": 0.001, + "loss": 0.3913, + "step": 7342 + }, + { + "epoch": 0.2026095324156627, + "grad_norm": 0.002912055002525449, + "learning_rate": 0.001, + "loss": 0.4266, + "step": 7343 + }, + { + "epoch": 0.2026371246167271, + "grad_norm": 0.003003375604748726, + "learning_rate": 0.001, + "loss": 0.376, + "step": 7344 + }, + { + "epoch": 0.20266471681779144, + "grad_norm": 0.003948207478970289, + "learning_rate": 0.001, + "loss": 0.4227, + "step": 7345 + }, + { + "epoch": 0.20269230901885582, + "grad_norm": 0.0033744163811206818, + "learning_rate": 0.001, + "loss": 0.4091, + "step": 7346 + }, + { + "epoch": 0.2027199012199202, + "grad_norm": 0.004838820081204176, + "learning_rate": 0.001, + "loss": 0.4642, + "step": 7347 + }, + { + "epoch": 0.20274749342098455, + "grad_norm": 0.0024067736230790615, + "learning_rate": 0.001, + "loss": 0.3415, + "step": 7348 + }, + { + "epoch": 0.20277508562204893, + "grad_norm": 0.0045340536162257195, + "learning_rate": 0.001, + "loss": 0.3561, + "step": 7349 + }, + { + "epoch": 0.2028026778231133, + "grad_norm": 0.003165213158354163, + "learning_rate": 0.001, + "loss": 0.372, + "step": 7350 + }, + { + "epoch": 0.20283027002417767, + "grad_norm": 0.004642155487090349, + "learning_rate": 0.001, + "loss": 0.3858, + "step": 7351 + }, + { + "epoch": 0.20285786222524205, + "grad_norm": 0.005825064145028591, + "learning_rate": 0.001, + "loss": 0.4276, + "step": 7352 + }, + { + "epoch": 0.2028854544263064, + "grad_norm": 0.002787059871479869, + "learning_rate": 0.001, + "loss": 0.407, + "step": 7353 + }, + { + "epoch": 0.20291304662737078, + "grad_norm": 0.004571388475596905, + "learning_rate": 0.001, + "loss": 0.3881, + "step": 7354 + }, + { + "epoch": 0.20294063882843513, + "grad_norm": 0.0032351436093449593, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 7355 + }, + { + "epoch": 0.2029682310294995, + "grad_norm": 0.0041768900118768215, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 7356 + }, + { + "epoch": 0.2029958232305639, + "grad_norm": 0.0028741308487951756, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 7357 + }, + { + "epoch": 0.20302341543162825, + "grad_norm": 0.00324125774204731, + "learning_rate": 0.001, + "loss": 0.3766, + "step": 7358 + }, + { + "epoch": 0.20305100763269263, + "grad_norm": 0.003516893368214369, + "learning_rate": 0.001, + "loss": 0.3873, + "step": 7359 + }, + { + "epoch": 0.20307859983375698, + "grad_norm": 0.0020891185849905014, + "learning_rate": 0.001, + "loss": 0.422, + "step": 7360 + }, + { + "epoch": 0.20310619203482136, + "grad_norm": 0.0024756919592618942, + "learning_rate": 0.001, + "loss": 0.4182, + "step": 7361 + }, + { + "epoch": 0.20313378423588574, + "grad_norm": 0.007347720675170422, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 7362 + }, + { + "epoch": 0.2031613764369501, + "grad_norm": 0.009618041105568409, + "learning_rate": 0.001, + "loss": 0.4081, + "step": 7363 + }, + { + "epoch": 0.20318896863801447, + "grad_norm": 0.004037544596940279, + "learning_rate": 0.001, + "loss": 0.4009, + "step": 7364 + }, + { + "epoch": 0.20321656083907882, + "grad_norm": 0.002381223253905773, + "learning_rate": 0.001, + "loss": 0.427, + "step": 7365 + }, + { + "epoch": 0.2032441530401432, + "grad_norm": 0.0031598431523889303, + "learning_rate": 0.001, + "loss": 0.4249, + "step": 7366 + }, + { + "epoch": 0.20327174524120759, + "grad_norm": 0.00809874664992094, + "learning_rate": 0.001, + "loss": 0.4179, + "step": 7367 + }, + { + "epoch": 0.20329933744227194, + "grad_norm": 0.002093277871608734, + "learning_rate": 0.001, + "loss": 0.4407, + "step": 7368 + }, + { + "epoch": 0.20332692964333632, + "grad_norm": 0.002877402352169156, + "learning_rate": 0.001, + "loss": 0.4162, + "step": 7369 + }, + { + "epoch": 0.20335452184440067, + "grad_norm": 0.004072779323905706, + "learning_rate": 0.001, + "loss": 0.3556, + "step": 7370 + }, + { + "epoch": 0.20338211404546505, + "grad_norm": 0.0036910090129822493, + "learning_rate": 0.001, + "loss": 0.3764, + "step": 7371 + }, + { + "epoch": 0.20340970624652943, + "grad_norm": 0.002574736950919032, + "learning_rate": 0.001, + "loss": 0.4183, + "step": 7372 + }, + { + "epoch": 0.20343729844759378, + "grad_norm": 0.0024004147853702307, + "learning_rate": 0.001, + "loss": 0.4241, + "step": 7373 + }, + { + "epoch": 0.20346489064865816, + "grad_norm": 0.003890526946634054, + "learning_rate": 0.001, + "loss": 0.4319, + "step": 7374 + }, + { + "epoch": 0.20349248284972252, + "grad_norm": 0.0030879208352416754, + "learning_rate": 0.001, + "loss": 0.4009, + "step": 7375 + }, + { + "epoch": 0.2035200750507869, + "grad_norm": 0.003786006011068821, + "learning_rate": 0.001, + "loss": 0.4258, + "step": 7376 + }, + { + "epoch": 0.20354766725185128, + "grad_norm": 0.0021713408641517162, + "learning_rate": 0.001, + "loss": 0.3805, + "step": 7377 + }, + { + "epoch": 0.20357525945291563, + "grad_norm": 0.00433305511251092, + "learning_rate": 0.001, + "loss": 0.392, + "step": 7378 + }, + { + "epoch": 0.20360285165398, + "grad_norm": 0.0030764697585254908, + "learning_rate": 0.001, + "loss": 0.4037, + "step": 7379 + }, + { + "epoch": 0.20363044385504436, + "grad_norm": 0.005284723825752735, + "learning_rate": 0.001, + "loss": 0.3861, + "step": 7380 + }, + { + "epoch": 0.20365803605610874, + "grad_norm": 0.004528792109340429, + "learning_rate": 0.001, + "loss": 0.3937, + "step": 7381 + }, + { + "epoch": 0.20368562825717312, + "grad_norm": 0.029749277979135513, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 7382 + }, + { + "epoch": 0.20371322045823748, + "grad_norm": 0.004927974659949541, + "learning_rate": 0.001, + "loss": 0.4053, + "step": 7383 + }, + { + "epoch": 0.20374081265930186, + "grad_norm": 0.004339797887951136, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 7384 + }, + { + "epoch": 0.2037684048603662, + "grad_norm": 0.002485482720658183, + "learning_rate": 0.001, + "loss": 0.4244, + "step": 7385 + }, + { + "epoch": 0.2037959970614306, + "grad_norm": 0.004456434864550829, + "learning_rate": 0.001, + "loss": 0.3738, + "step": 7386 + }, + { + "epoch": 0.20382358926249497, + "grad_norm": 0.0022889727260917425, + "learning_rate": 0.001, + "loss": 0.4261, + "step": 7387 + }, + { + "epoch": 0.20385118146355932, + "grad_norm": 0.006221712101250887, + "learning_rate": 0.001, + "loss": 0.3579, + "step": 7388 + }, + { + "epoch": 0.2038787736646237, + "grad_norm": 0.003358406713232398, + "learning_rate": 0.001, + "loss": 0.3869, + "step": 7389 + }, + { + "epoch": 0.20390636586568806, + "grad_norm": 0.0029171998612582684, + "learning_rate": 0.001, + "loss": 0.4179, + "step": 7390 + }, + { + "epoch": 0.20393395806675244, + "grad_norm": 0.0030932044610381126, + "learning_rate": 0.001, + "loss": 0.415, + "step": 7391 + }, + { + "epoch": 0.2039615502678168, + "grad_norm": 0.009446857497096062, + "learning_rate": 0.001, + "loss": 0.411, + "step": 7392 + }, + { + "epoch": 0.20398914246888117, + "grad_norm": 0.003646930679678917, + "learning_rate": 0.001, + "loss": 0.3674, + "step": 7393 + }, + { + "epoch": 0.20401673466994555, + "grad_norm": 0.002314311685040593, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 7394 + }, + { + "epoch": 0.2040443268710099, + "grad_norm": 0.0032062383834272623, + "learning_rate": 0.001, + "loss": 0.378, + "step": 7395 + }, + { + "epoch": 0.20407191907207428, + "grad_norm": 0.005731469485908747, + "learning_rate": 0.001, + "loss": 0.42, + "step": 7396 + }, + { + "epoch": 0.20409951127313863, + "grad_norm": 0.002991395303979516, + "learning_rate": 0.001, + "loss": 0.4357, + "step": 7397 + }, + { + "epoch": 0.20412710347420301, + "grad_norm": 0.0036581414751708508, + "learning_rate": 0.001, + "loss": 0.401, + "step": 7398 + }, + { + "epoch": 0.2041546956752674, + "grad_norm": 0.00450851721689105, + "learning_rate": 0.001, + "loss": 0.3855, + "step": 7399 + }, + { + "epoch": 0.20418228787633175, + "grad_norm": 0.003067772602662444, + "learning_rate": 0.001, + "loss": 0.392, + "step": 7400 + }, + { + "epoch": 0.20420988007739613, + "grad_norm": 0.0030468387994915247, + "learning_rate": 0.001, + "loss": 0.3904, + "step": 7401 + }, + { + "epoch": 0.20423747227846048, + "grad_norm": 0.004612901713699102, + "learning_rate": 0.001, + "loss": 0.3639, + "step": 7402 + }, + { + "epoch": 0.20426506447952486, + "grad_norm": 0.0034134613815695047, + "learning_rate": 0.001, + "loss": 0.4229, + "step": 7403 + }, + { + "epoch": 0.20429265668058924, + "grad_norm": 0.009274942800402641, + "learning_rate": 0.001, + "loss": 0.4011, + "step": 7404 + }, + { + "epoch": 0.2043202488816536, + "grad_norm": 0.0026235580444335938, + "learning_rate": 0.001, + "loss": 0.3719, + "step": 7405 + }, + { + "epoch": 0.20434784108271797, + "grad_norm": 0.006790754850953817, + "learning_rate": 0.001, + "loss": 0.3867, + "step": 7406 + }, + { + "epoch": 0.20437543328378233, + "grad_norm": 0.0022785153705626726, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 7407 + }, + { + "epoch": 0.2044030254848467, + "grad_norm": 0.005909627769142389, + "learning_rate": 0.001, + "loss": 0.3857, + "step": 7408 + }, + { + "epoch": 0.2044306176859111, + "grad_norm": 0.0032420544885098934, + "learning_rate": 0.001, + "loss": 0.3786, + "step": 7409 + }, + { + "epoch": 0.20445820988697544, + "grad_norm": 0.003111346159130335, + "learning_rate": 0.001, + "loss": 0.3925, + "step": 7410 + }, + { + "epoch": 0.20448580208803982, + "grad_norm": 0.003899330273270607, + "learning_rate": 0.001, + "loss": 0.3813, + "step": 7411 + }, + { + "epoch": 0.20451339428910417, + "grad_norm": 0.007644087076187134, + "learning_rate": 0.001, + "loss": 0.4065, + "step": 7412 + }, + { + "epoch": 0.20454098649016855, + "grad_norm": 0.0028403971809893847, + "learning_rate": 0.001, + "loss": 0.4052, + "step": 7413 + }, + { + "epoch": 0.20456857869123293, + "grad_norm": 0.00496717169880867, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 7414 + }, + { + "epoch": 0.20459617089229729, + "grad_norm": 0.003765889909118414, + "learning_rate": 0.001, + "loss": 0.4465, + "step": 7415 + }, + { + "epoch": 0.20462376309336167, + "grad_norm": 0.0031095000449568033, + "learning_rate": 0.001, + "loss": 0.3615, + "step": 7416 + }, + { + "epoch": 0.20465135529442602, + "grad_norm": 0.0027119989972561598, + "learning_rate": 0.001, + "loss": 0.4084, + "step": 7417 + }, + { + "epoch": 0.2046789474954904, + "grad_norm": 0.0027941481675952673, + "learning_rate": 0.001, + "loss": 0.4315, + "step": 7418 + }, + { + "epoch": 0.20470653969655478, + "grad_norm": 0.002864205278456211, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 7419 + }, + { + "epoch": 0.20473413189761913, + "grad_norm": 0.0032913892064243555, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 7420 + }, + { + "epoch": 0.2047617240986835, + "grad_norm": 0.0031653158366680145, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 7421 + }, + { + "epoch": 0.20478931629974786, + "grad_norm": 0.002829028759151697, + "learning_rate": 0.001, + "loss": 0.3897, + "step": 7422 + }, + { + "epoch": 0.20481690850081224, + "grad_norm": 0.0026797757018357515, + "learning_rate": 0.001, + "loss": 0.4038, + "step": 7423 + }, + { + "epoch": 0.20484450070187663, + "grad_norm": 0.0033115644473582506, + "learning_rate": 0.001, + "loss": 0.4109, + "step": 7424 + }, + { + "epoch": 0.20487209290294098, + "grad_norm": 0.0037570816930383444, + "learning_rate": 0.001, + "loss": 0.3904, + "step": 7425 + }, + { + "epoch": 0.20489968510400536, + "grad_norm": 0.0022269757464528084, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 7426 + }, + { + "epoch": 0.2049272773050697, + "grad_norm": 0.003650048514828086, + "learning_rate": 0.001, + "loss": 0.391, + "step": 7427 + }, + { + "epoch": 0.2049548695061341, + "grad_norm": 0.003723046975210309, + "learning_rate": 0.001, + "loss": 0.4122, + "step": 7428 + }, + { + "epoch": 0.20498246170719847, + "grad_norm": 0.0031359000131487846, + "learning_rate": 0.001, + "loss": 0.3824, + "step": 7429 + }, + { + "epoch": 0.20501005390826282, + "grad_norm": 0.0026418371126055717, + "learning_rate": 0.001, + "loss": 0.4462, + "step": 7430 + }, + { + "epoch": 0.2050376461093272, + "grad_norm": 0.005113274324685335, + "learning_rate": 0.001, + "loss": 0.4356, + "step": 7431 + }, + { + "epoch": 0.20506523831039156, + "grad_norm": 0.0034433556720614433, + "learning_rate": 0.001, + "loss": 0.4077, + "step": 7432 + }, + { + "epoch": 0.20509283051145594, + "grad_norm": 0.0039060013368725777, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 7433 + }, + { + "epoch": 0.20512042271252032, + "grad_norm": 0.0031034150160849094, + "learning_rate": 0.001, + "loss": 0.4195, + "step": 7434 + }, + { + "epoch": 0.20514801491358467, + "grad_norm": 0.004134570714086294, + "learning_rate": 0.001, + "loss": 0.4333, + "step": 7435 + }, + { + "epoch": 0.20517560711464905, + "grad_norm": 0.002315821126103401, + "learning_rate": 0.001, + "loss": 0.4121, + "step": 7436 + }, + { + "epoch": 0.2052031993157134, + "grad_norm": 0.004337753169238567, + "learning_rate": 0.001, + "loss": 0.3908, + "step": 7437 + }, + { + "epoch": 0.20523079151677778, + "grad_norm": 0.014801721088588238, + "learning_rate": 0.001, + "loss": 0.3668, + "step": 7438 + }, + { + "epoch": 0.20525838371784216, + "grad_norm": 0.003172953613102436, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 7439 + }, + { + "epoch": 0.20528597591890652, + "grad_norm": 0.0031124092638492584, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 7440 + }, + { + "epoch": 0.2053135681199709, + "grad_norm": 0.004870596341788769, + "learning_rate": 0.001, + "loss": 0.3706, + "step": 7441 + }, + { + "epoch": 0.20534116032103525, + "grad_norm": 0.002806989708915353, + "learning_rate": 0.001, + "loss": 0.3714, + "step": 7442 + }, + { + "epoch": 0.20536875252209963, + "grad_norm": 0.002985073020681739, + "learning_rate": 0.001, + "loss": 0.3927, + "step": 7443 + }, + { + "epoch": 0.205396344723164, + "grad_norm": 0.002774279797449708, + "learning_rate": 0.001, + "loss": 0.378, + "step": 7444 + }, + { + "epoch": 0.20542393692422836, + "grad_norm": 0.0028050506953150034, + "learning_rate": 0.001, + "loss": 0.3541, + "step": 7445 + }, + { + "epoch": 0.20545152912529274, + "grad_norm": 0.004381167236715555, + "learning_rate": 0.001, + "loss": 0.4285, + "step": 7446 + }, + { + "epoch": 0.2054791213263571, + "grad_norm": 0.003191079944372177, + "learning_rate": 0.001, + "loss": 0.4055, + "step": 7447 + }, + { + "epoch": 0.20550671352742148, + "grad_norm": 0.0027379656676203012, + "learning_rate": 0.001, + "loss": 0.385, + "step": 7448 + }, + { + "epoch": 0.20553430572848586, + "grad_norm": 0.0034181999508291483, + "learning_rate": 0.001, + "loss": 0.4061, + "step": 7449 + }, + { + "epoch": 0.2055618979295502, + "grad_norm": 0.003427153918892145, + "learning_rate": 0.001, + "loss": 0.3757, + "step": 7450 + }, + { + "epoch": 0.2055894901306146, + "grad_norm": 0.002515499945729971, + "learning_rate": 0.001, + "loss": 0.3551, + "step": 7451 + }, + { + "epoch": 0.20561708233167894, + "grad_norm": 0.002670654794201255, + "learning_rate": 0.001, + "loss": 0.4392, + "step": 7452 + }, + { + "epoch": 0.20564467453274332, + "grad_norm": 0.003271377645432949, + "learning_rate": 0.001, + "loss": 0.3804, + "step": 7453 + }, + { + "epoch": 0.2056722667338077, + "grad_norm": 0.0034607460256665945, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 7454 + }, + { + "epoch": 0.20569985893487205, + "grad_norm": 0.002583264373242855, + "learning_rate": 0.001, + "loss": 0.4314, + "step": 7455 + }, + { + "epoch": 0.20572745113593643, + "grad_norm": 0.003357719397172332, + "learning_rate": 0.001, + "loss": 0.3857, + "step": 7456 + }, + { + "epoch": 0.2057550433370008, + "grad_norm": 0.005699231754988432, + "learning_rate": 0.001, + "loss": 0.3983, + "step": 7457 + }, + { + "epoch": 0.20578263553806517, + "grad_norm": 0.00468082819133997, + "learning_rate": 0.001, + "loss": 0.4166, + "step": 7458 + }, + { + "epoch": 0.20581022773912955, + "grad_norm": 0.002804639982059598, + "learning_rate": 0.001, + "loss": 0.4128, + "step": 7459 + }, + { + "epoch": 0.2058378199401939, + "grad_norm": 0.0031567709520459175, + "learning_rate": 0.001, + "loss": 0.452, + "step": 7460 + }, + { + "epoch": 0.20586541214125828, + "grad_norm": 0.0023213198874145746, + "learning_rate": 0.001, + "loss": 0.4353, + "step": 7461 + }, + { + "epoch": 0.20589300434232263, + "grad_norm": 0.003251513699069619, + "learning_rate": 0.001, + "loss": 0.3908, + "step": 7462 + }, + { + "epoch": 0.205920596543387, + "grad_norm": 0.0036114819813519716, + "learning_rate": 0.001, + "loss": 0.4622, + "step": 7463 + }, + { + "epoch": 0.2059481887444514, + "grad_norm": 0.0032585756853222847, + "learning_rate": 0.001, + "loss": 0.3934, + "step": 7464 + }, + { + "epoch": 0.20597578094551575, + "grad_norm": 0.004090276546776295, + "learning_rate": 0.001, + "loss": 0.369, + "step": 7465 + }, + { + "epoch": 0.20600337314658013, + "grad_norm": 0.006538175046443939, + "learning_rate": 0.001, + "loss": 0.3902, + "step": 7466 + }, + { + "epoch": 0.20603096534764448, + "grad_norm": 0.006117241457104683, + "learning_rate": 0.001, + "loss": 0.3976, + "step": 7467 + }, + { + "epoch": 0.20605855754870886, + "grad_norm": 0.004128247033804655, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 7468 + }, + { + "epoch": 0.20608614974977324, + "grad_norm": 0.0023199093993753195, + "learning_rate": 0.001, + "loss": 0.3613, + "step": 7469 + }, + { + "epoch": 0.2061137419508376, + "grad_norm": 0.002994114300236106, + "learning_rate": 0.001, + "loss": 0.396, + "step": 7470 + }, + { + "epoch": 0.20614133415190197, + "grad_norm": 0.00449094595387578, + "learning_rate": 0.001, + "loss": 0.3779, + "step": 7471 + }, + { + "epoch": 0.20616892635296633, + "grad_norm": 0.002262924797832966, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 7472 + }, + { + "epoch": 0.2061965185540307, + "grad_norm": 0.0033367322757840157, + "learning_rate": 0.001, + "loss": 0.3902, + "step": 7473 + }, + { + "epoch": 0.20622411075509509, + "grad_norm": 0.004621635656803846, + "learning_rate": 0.001, + "loss": 0.3829, + "step": 7474 + }, + { + "epoch": 0.20625170295615944, + "grad_norm": 0.0024976639542728662, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 7475 + }, + { + "epoch": 0.20627929515722382, + "grad_norm": 0.0032991066109389067, + "learning_rate": 0.001, + "loss": 0.3909, + "step": 7476 + }, + { + "epoch": 0.20630688735828817, + "grad_norm": 0.004945872817188501, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 7477 + }, + { + "epoch": 0.20633447955935255, + "grad_norm": 0.003072677878662944, + "learning_rate": 0.001, + "loss": 0.3846, + "step": 7478 + }, + { + "epoch": 0.20636207176041693, + "grad_norm": 0.004622003063559532, + "learning_rate": 0.001, + "loss": 0.4524, + "step": 7479 + }, + { + "epoch": 0.20638966396148128, + "grad_norm": 0.005966620985418558, + "learning_rate": 0.001, + "loss": 0.3915, + "step": 7480 + }, + { + "epoch": 0.20641725616254566, + "grad_norm": 0.002969125984236598, + "learning_rate": 0.001, + "loss": 0.4081, + "step": 7481 + }, + { + "epoch": 0.20644484836361002, + "grad_norm": 0.020619425922632217, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 7482 + }, + { + "epoch": 0.2064724405646744, + "grad_norm": 0.005815454758703709, + "learning_rate": 0.001, + "loss": 0.3981, + "step": 7483 + }, + { + "epoch": 0.20650003276573875, + "grad_norm": 0.0029577158857136965, + "learning_rate": 0.001, + "loss": 0.4203, + "step": 7484 + }, + { + "epoch": 0.20652762496680313, + "grad_norm": 0.0039966655895113945, + "learning_rate": 0.001, + "loss": 0.3665, + "step": 7485 + }, + { + "epoch": 0.2065552171678675, + "grad_norm": 0.00554103497415781, + "learning_rate": 0.001, + "loss": 0.4024, + "step": 7486 + }, + { + "epoch": 0.20658280936893186, + "grad_norm": 0.00304683530703187, + "learning_rate": 0.001, + "loss": 0.4444, + "step": 7487 + }, + { + "epoch": 0.20661040156999624, + "grad_norm": 0.007905455306172371, + "learning_rate": 0.001, + "loss": 0.357, + "step": 7488 + }, + { + "epoch": 0.2066379937710606, + "grad_norm": 0.0040861996822059155, + "learning_rate": 0.001, + "loss": 0.4093, + "step": 7489 + }, + { + "epoch": 0.20666558597212498, + "grad_norm": 0.00287861586548388, + "learning_rate": 0.001, + "loss": 0.4079, + "step": 7490 + }, + { + "epoch": 0.20669317817318936, + "grad_norm": 0.00307285669259727, + "learning_rate": 0.001, + "loss": 0.4153, + "step": 7491 + }, + { + "epoch": 0.2067207703742537, + "grad_norm": 0.0033920174464583397, + "learning_rate": 0.001, + "loss": 0.3732, + "step": 7492 + }, + { + "epoch": 0.2067483625753181, + "grad_norm": 0.0028167800046503544, + "learning_rate": 0.001, + "loss": 0.4377, + "step": 7493 + }, + { + "epoch": 0.20677595477638244, + "grad_norm": 0.0022106487303972244, + "learning_rate": 0.001, + "loss": 0.4182, + "step": 7494 + }, + { + "epoch": 0.20680354697744682, + "grad_norm": 0.002651102375239134, + "learning_rate": 0.001, + "loss": 0.3978, + "step": 7495 + }, + { + "epoch": 0.2068311391785112, + "grad_norm": 0.003722158260643482, + "learning_rate": 0.001, + "loss": 0.4121, + "step": 7496 + }, + { + "epoch": 0.20685873137957556, + "grad_norm": 0.004339053761214018, + "learning_rate": 0.001, + "loss": 0.4251, + "step": 7497 + }, + { + "epoch": 0.20688632358063994, + "grad_norm": 0.002610723953694105, + "learning_rate": 0.001, + "loss": 0.4134, + "step": 7498 + }, + { + "epoch": 0.2069139157817043, + "grad_norm": 0.0024759217631071806, + "learning_rate": 0.001, + "loss": 0.4228, + "step": 7499 + }, + { + "epoch": 0.20694150798276867, + "grad_norm": 0.005409894045442343, + "learning_rate": 0.001, + "loss": 0.3883, + "step": 7500 + }, + { + "epoch": 0.20694150798276867, + "eval_runtime": 24.8623, + "eval_samples_per_second": 1.287, + "eval_steps_per_second": 0.161, + "step": 7500 + }, + { + "epoch": 0.20696910018383305, + "grad_norm": 0.0043741133995354176, + "learning_rate": 0.001, + "loss": 0.3872, + "step": 7501 + }, + { + "epoch": 0.2069966923848974, + "grad_norm": 0.0034702117554843426, + "learning_rate": 0.001, + "loss": 0.3996, + "step": 7502 + }, + { + "epoch": 0.20702428458596178, + "grad_norm": 0.002621249994263053, + "learning_rate": 0.001, + "loss": 0.3799, + "step": 7503 + }, + { + "epoch": 0.20705187678702613, + "grad_norm": 0.00401890417560935, + "learning_rate": 0.001, + "loss": 0.3787, + "step": 7504 + }, + { + "epoch": 0.20707946898809051, + "grad_norm": 0.005101599730551243, + "learning_rate": 0.001, + "loss": 0.4362, + "step": 7505 + }, + { + "epoch": 0.2071070611891549, + "grad_norm": 0.006792318541556597, + "learning_rate": 0.001, + "loss": 0.4084, + "step": 7506 + }, + { + "epoch": 0.20713465339021925, + "grad_norm": 0.0032163949217647314, + "learning_rate": 0.001, + "loss": 0.3853, + "step": 7507 + }, + { + "epoch": 0.20716224559128363, + "grad_norm": 0.006187311839312315, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 7508 + }, + { + "epoch": 0.20718983779234798, + "grad_norm": 0.003395832609385252, + "learning_rate": 0.001, + "loss": 0.3804, + "step": 7509 + }, + { + "epoch": 0.20721742999341236, + "grad_norm": 0.007058777846395969, + "learning_rate": 0.001, + "loss": 0.3545, + "step": 7510 + }, + { + "epoch": 0.20724502219447674, + "grad_norm": 0.0022814422845840454, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 7511 + }, + { + "epoch": 0.2072726143955411, + "grad_norm": 0.003063853131607175, + "learning_rate": 0.001, + "loss": 0.4, + "step": 7512 + }, + { + "epoch": 0.20730020659660547, + "grad_norm": 0.003978427965193987, + "learning_rate": 0.001, + "loss": 0.3732, + "step": 7513 + }, + { + "epoch": 0.20732779879766983, + "grad_norm": 0.002310180803760886, + "learning_rate": 0.001, + "loss": 0.4172, + "step": 7514 + }, + { + "epoch": 0.2073553909987342, + "grad_norm": 0.0026651090011000633, + "learning_rate": 0.001, + "loss": 0.3855, + "step": 7515 + }, + { + "epoch": 0.2073829831997986, + "grad_norm": 0.0029477854259312153, + "learning_rate": 0.001, + "loss": 0.4296, + "step": 7516 + }, + { + "epoch": 0.20741057540086294, + "grad_norm": 0.0022027394734323025, + "learning_rate": 0.001, + "loss": 0.4052, + "step": 7517 + }, + { + "epoch": 0.20743816760192732, + "grad_norm": 0.0025718417018651962, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 7518 + }, + { + "epoch": 0.20746575980299167, + "grad_norm": 0.002921158680692315, + "learning_rate": 0.001, + "loss": 0.383, + "step": 7519 + }, + { + "epoch": 0.20749335200405605, + "grad_norm": 0.0037957970052957535, + "learning_rate": 0.001, + "loss": 0.4122, + "step": 7520 + }, + { + "epoch": 0.20752094420512043, + "grad_norm": 0.003919035196304321, + "learning_rate": 0.001, + "loss": 0.4063, + "step": 7521 + }, + { + "epoch": 0.20754853640618479, + "grad_norm": 0.0024592061527073383, + "learning_rate": 0.001, + "loss": 0.4195, + "step": 7522 + }, + { + "epoch": 0.20757612860724917, + "grad_norm": 0.0034799345303326845, + "learning_rate": 0.001, + "loss": 0.3907, + "step": 7523 + }, + { + "epoch": 0.20760372080831352, + "grad_norm": 0.00930013321340084, + "learning_rate": 0.001, + "loss": 0.3504, + "step": 7524 + }, + { + "epoch": 0.2076313130093779, + "grad_norm": 0.004809627775102854, + "learning_rate": 0.001, + "loss": 0.3851, + "step": 7525 + }, + { + "epoch": 0.20765890521044228, + "grad_norm": 0.0031031654216349125, + "learning_rate": 0.001, + "loss": 0.3746, + "step": 7526 + }, + { + "epoch": 0.20768649741150663, + "grad_norm": 0.0024205984082072973, + "learning_rate": 0.001, + "loss": 0.414, + "step": 7527 + }, + { + "epoch": 0.207714089612571, + "grad_norm": 0.003100008238106966, + "learning_rate": 0.001, + "loss": 0.4384, + "step": 7528 + }, + { + "epoch": 0.20774168181363536, + "grad_norm": 0.0023790469858795404, + "learning_rate": 0.001, + "loss": 0.4258, + "step": 7529 + }, + { + "epoch": 0.20776927401469975, + "grad_norm": 0.0028267705347388983, + "learning_rate": 0.001, + "loss": 0.3943, + "step": 7530 + }, + { + "epoch": 0.20779686621576413, + "grad_norm": 0.002445453545078635, + "learning_rate": 0.001, + "loss": 0.4182, + "step": 7531 + }, + { + "epoch": 0.20782445841682848, + "grad_norm": 0.0038315069396048784, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 7532 + }, + { + "epoch": 0.20785205061789286, + "grad_norm": 0.002911431947723031, + "learning_rate": 0.001, + "loss": 0.3953, + "step": 7533 + }, + { + "epoch": 0.2078796428189572, + "grad_norm": 0.024655209854245186, + "learning_rate": 0.001, + "loss": 0.4144, + "step": 7534 + }, + { + "epoch": 0.2079072350200216, + "grad_norm": 0.0030850169714540243, + "learning_rate": 0.001, + "loss": 0.3774, + "step": 7535 + }, + { + "epoch": 0.20793482722108597, + "grad_norm": 0.0029573985375463963, + "learning_rate": 0.001, + "loss": 0.4237, + "step": 7536 + }, + { + "epoch": 0.20796241942215032, + "grad_norm": 0.002838612301275134, + "learning_rate": 0.001, + "loss": 0.4398, + "step": 7537 + }, + { + "epoch": 0.2079900116232147, + "grad_norm": 0.0038796397857367992, + "learning_rate": 0.001, + "loss": 0.3815, + "step": 7538 + }, + { + "epoch": 0.20801760382427906, + "grad_norm": 0.002909380476921797, + "learning_rate": 0.001, + "loss": 0.3851, + "step": 7539 + }, + { + "epoch": 0.20804519602534344, + "grad_norm": 0.005611411761492491, + "learning_rate": 0.001, + "loss": 0.3835, + "step": 7540 + }, + { + "epoch": 0.20807278822640782, + "grad_norm": 0.005777435377240181, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 7541 + }, + { + "epoch": 0.20810038042747217, + "grad_norm": 0.002417078008875251, + "learning_rate": 0.001, + "loss": 0.4334, + "step": 7542 + }, + { + "epoch": 0.20812797262853655, + "grad_norm": 0.0039400262758135796, + "learning_rate": 0.001, + "loss": 0.4242, + "step": 7543 + }, + { + "epoch": 0.2081555648296009, + "grad_norm": 0.005697792861610651, + "learning_rate": 0.001, + "loss": 0.3795, + "step": 7544 + }, + { + "epoch": 0.20818315703066528, + "grad_norm": 0.0035569635219872, + "learning_rate": 0.001, + "loss": 0.4059, + "step": 7545 + }, + { + "epoch": 0.20821074923172966, + "grad_norm": 0.0035249588545411825, + "learning_rate": 0.001, + "loss": 0.4174, + "step": 7546 + }, + { + "epoch": 0.20823834143279402, + "grad_norm": 0.003973274026066065, + "learning_rate": 0.001, + "loss": 0.3732, + "step": 7547 + }, + { + "epoch": 0.2082659336338584, + "grad_norm": 0.010858539491891861, + "learning_rate": 0.001, + "loss": 0.3627, + "step": 7548 + }, + { + "epoch": 0.20829352583492275, + "grad_norm": 0.002799051348119974, + "learning_rate": 0.001, + "loss": 0.4172, + "step": 7549 + }, + { + "epoch": 0.20832111803598713, + "grad_norm": 0.0032943717669695616, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 7550 + }, + { + "epoch": 0.2083487102370515, + "grad_norm": 0.003968504723161459, + "learning_rate": 0.001, + "loss": 0.4046, + "step": 7551 + }, + { + "epoch": 0.20837630243811586, + "grad_norm": 0.0025952784344553947, + "learning_rate": 0.001, + "loss": 0.4116, + "step": 7552 + }, + { + "epoch": 0.20840389463918024, + "grad_norm": 0.0023193256929516792, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 7553 + }, + { + "epoch": 0.2084314868402446, + "grad_norm": 0.020232345908880234, + "learning_rate": 0.001, + "loss": 0.3861, + "step": 7554 + }, + { + "epoch": 0.20845907904130898, + "grad_norm": 0.005101238377392292, + "learning_rate": 0.001, + "loss": 0.3826, + "step": 7555 + }, + { + "epoch": 0.20848667124237336, + "grad_norm": 0.0028565346729010344, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 7556 + }, + { + "epoch": 0.2085142634434377, + "grad_norm": 0.0023610927164554596, + "learning_rate": 0.001, + "loss": 0.4383, + "step": 7557 + }, + { + "epoch": 0.2085418556445021, + "grad_norm": 0.0025352907832711935, + "learning_rate": 0.001, + "loss": 0.4298, + "step": 7558 + }, + { + "epoch": 0.20856944784556644, + "grad_norm": 0.002582514425739646, + "learning_rate": 0.001, + "loss": 0.3841, + "step": 7559 + }, + { + "epoch": 0.20859704004663082, + "grad_norm": 0.0051224916242063046, + "learning_rate": 0.001, + "loss": 0.3857, + "step": 7560 + }, + { + "epoch": 0.2086246322476952, + "grad_norm": 0.004121377132833004, + "learning_rate": 0.001, + "loss": 0.3783, + "step": 7561 + }, + { + "epoch": 0.20865222444875955, + "grad_norm": 0.0028835702687501907, + "learning_rate": 0.001, + "loss": 0.4232, + "step": 7562 + }, + { + "epoch": 0.20867981664982393, + "grad_norm": 0.0035318653099238873, + "learning_rate": 0.001, + "loss": 0.38, + "step": 7563 + }, + { + "epoch": 0.2087074088508883, + "grad_norm": 0.003049545455724001, + "learning_rate": 0.001, + "loss": 0.3697, + "step": 7564 + }, + { + "epoch": 0.20873500105195267, + "grad_norm": 0.01318820845335722, + "learning_rate": 0.001, + "loss": 0.4478, + "step": 7565 + }, + { + "epoch": 0.20876259325301705, + "grad_norm": 0.0028140349313616753, + "learning_rate": 0.001, + "loss": 0.3822, + "step": 7566 + }, + { + "epoch": 0.2087901854540814, + "grad_norm": 0.0023844155948609114, + "learning_rate": 0.001, + "loss": 0.4497, + "step": 7567 + }, + { + "epoch": 0.20881777765514578, + "grad_norm": 0.0021965601481497288, + "learning_rate": 0.001, + "loss": 0.3985, + "step": 7568 + }, + { + "epoch": 0.20884536985621013, + "grad_norm": 0.003476344281807542, + "learning_rate": 0.001, + "loss": 0.4209, + "step": 7569 + }, + { + "epoch": 0.2088729620572745, + "grad_norm": 0.004417444113641977, + "learning_rate": 0.001, + "loss": 0.3787, + "step": 7570 + }, + { + "epoch": 0.2089005542583389, + "grad_norm": 0.002342290710657835, + "learning_rate": 0.001, + "loss": 0.4322, + "step": 7571 + }, + { + "epoch": 0.20892814645940325, + "grad_norm": 0.002702725352719426, + "learning_rate": 0.001, + "loss": 0.4292, + "step": 7572 + }, + { + "epoch": 0.20895573866046763, + "grad_norm": 0.003713875776156783, + "learning_rate": 0.001, + "loss": 0.387, + "step": 7573 + }, + { + "epoch": 0.20898333086153198, + "grad_norm": 0.002027127193287015, + "learning_rate": 0.001, + "loss": 0.4431, + "step": 7574 + }, + { + "epoch": 0.20901092306259636, + "grad_norm": 0.0038261881563812494, + "learning_rate": 0.001, + "loss": 0.3868, + "step": 7575 + }, + { + "epoch": 0.2090385152636607, + "grad_norm": 0.002401645528152585, + "learning_rate": 0.001, + "loss": 0.4568, + "step": 7576 + }, + { + "epoch": 0.2090661074647251, + "grad_norm": 0.002996440976858139, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 7577 + }, + { + "epoch": 0.20909369966578947, + "grad_norm": 0.0030510788783431053, + "learning_rate": 0.001, + "loss": 0.3791, + "step": 7578 + }, + { + "epoch": 0.20912129186685383, + "grad_norm": 0.0031568347476422787, + "learning_rate": 0.001, + "loss": 0.3908, + "step": 7579 + }, + { + "epoch": 0.2091488840679182, + "grad_norm": 0.0025101054925471544, + "learning_rate": 0.001, + "loss": 0.4009, + "step": 7580 + }, + { + "epoch": 0.20917647626898256, + "grad_norm": 0.002685755491256714, + "learning_rate": 0.001, + "loss": 0.4096, + "step": 7581 + }, + { + "epoch": 0.20920406847004694, + "grad_norm": 0.00574698718264699, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 7582 + }, + { + "epoch": 0.20923166067111132, + "grad_norm": 0.0023178094998002052, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 7583 + }, + { + "epoch": 0.20925925287217567, + "grad_norm": 0.0048984200693666935, + "learning_rate": 0.001, + "loss": 0.3566, + "step": 7584 + }, + { + "epoch": 0.20928684507324005, + "grad_norm": 0.0027060145512223244, + "learning_rate": 0.001, + "loss": 0.3853, + "step": 7585 + }, + { + "epoch": 0.2093144372743044, + "grad_norm": 0.00470772897824645, + "learning_rate": 0.001, + "loss": 0.385, + "step": 7586 + }, + { + "epoch": 0.20934202947536878, + "grad_norm": 0.002918388694524765, + "learning_rate": 0.001, + "loss": 0.4374, + "step": 7587 + }, + { + "epoch": 0.20936962167643317, + "grad_norm": 0.002451063599437475, + "learning_rate": 0.001, + "loss": 0.4166, + "step": 7588 + }, + { + "epoch": 0.20939721387749752, + "grad_norm": 0.0024748530704528093, + "learning_rate": 0.001, + "loss": 0.4419, + "step": 7589 + }, + { + "epoch": 0.2094248060785619, + "grad_norm": 0.0029519018717110157, + "learning_rate": 0.001, + "loss": 0.3743, + "step": 7590 + }, + { + "epoch": 0.20945239827962625, + "grad_norm": 0.004285396076738834, + "learning_rate": 0.001, + "loss": 0.3722, + "step": 7591 + }, + { + "epoch": 0.20947999048069063, + "grad_norm": 0.0032341107726097107, + "learning_rate": 0.001, + "loss": 0.3655, + "step": 7592 + }, + { + "epoch": 0.209507582681755, + "grad_norm": 0.0032684197649359703, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 7593 + }, + { + "epoch": 0.20953517488281936, + "grad_norm": 0.002590345684438944, + "learning_rate": 0.001, + "loss": 0.4147, + "step": 7594 + }, + { + "epoch": 0.20956276708388374, + "grad_norm": 0.0075867660343647, + "learning_rate": 0.001, + "loss": 0.4297, + "step": 7595 + }, + { + "epoch": 0.2095903592849481, + "grad_norm": 0.011841630563139915, + "learning_rate": 0.001, + "loss": 0.3941, + "step": 7596 + }, + { + "epoch": 0.20961795148601248, + "grad_norm": 0.002727978862822056, + "learning_rate": 0.001, + "loss": 0.4122, + "step": 7597 + }, + { + "epoch": 0.20964554368707686, + "grad_norm": 0.0028050565160810947, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 7598 + }, + { + "epoch": 0.2096731358881412, + "grad_norm": 0.0022304770536720753, + "learning_rate": 0.001, + "loss": 0.4186, + "step": 7599 + }, + { + "epoch": 0.2097007280892056, + "grad_norm": 0.0042723724618554115, + "learning_rate": 0.001, + "loss": 0.409, + "step": 7600 + }, + { + "epoch": 0.20972832029026994, + "grad_norm": 0.00373181514441967, + "learning_rate": 0.001, + "loss": 0.4158, + "step": 7601 + }, + { + "epoch": 0.20975591249133432, + "grad_norm": 0.0032532603945583105, + "learning_rate": 0.001, + "loss": 0.4188, + "step": 7602 + }, + { + "epoch": 0.2097835046923987, + "grad_norm": 0.002218460664153099, + "learning_rate": 0.001, + "loss": 0.4221, + "step": 7603 + }, + { + "epoch": 0.20981109689346306, + "grad_norm": 0.00374056794680655, + "learning_rate": 0.001, + "loss": 0.412, + "step": 7604 + }, + { + "epoch": 0.20983868909452744, + "grad_norm": 0.0029005371034145355, + "learning_rate": 0.001, + "loss": 0.3753, + "step": 7605 + }, + { + "epoch": 0.2098662812955918, + "grad_norm": 0.0028765443712472916, + "learning_rate": 0.001, + "loss": 0.4026, + "step": 7606 + }, + { + "epoch": 0.20989387349665617, + "grad_norm": 0.0032596492674201727, + "learning_rate": 0.001, + "loss": 0.4057, + "step": 7607 + }, + { + "epoch": 0.20992146569772055, + "grad_norm": 0.0027232312131673098, + "learning_rate": 0.001, + "loss": 0.4062, + "step": 7608 + }, + { + "epoch": 0.2099490578987849, + "grad_norm": 0.0047914572060108185, + "learning_rate": 0.001, + "loss": 0.383, + "step": 7609 + }, + { + "epoch": 0.20997665009984928, + "grad_norm": 0.002376759424805641, + "learning_rate": 0.001, + "loss": 0.4403, + "step": 7610 + }, + { + "epoch": 0.21000424230091363, + "grad_norm": 0.0035623747389763594, + "learning_rate": 0.001, + "loss": 0.3995, + "step": 7611 + }, + { + "epoch": 0.21003183450197802, + "grad_norm": 0.003275655210018158, + "learning_rate": 0.001, + "loss": 0.4302, + "step": 7612 + }, + { + "epoch": 0.2100594267030424, + "grad_norm": 0.0037639643996953964, + "learning_rate": 0.001, + "loss": 0.4191, + "step": 7613 + }, + { + "epoch": 0.21008701890410675, + "grad_norm": 0.010343240574002266, + "learning_rate": 0.001, + "loss": 0.3744, + "step": 7614 + }, + { + "epoch": 0.21011461110517113, + "grad_norm": 0.002769758924841881, + "learning_rate": 0.001, + "loss": 0.4141, + "step": 7615 + }, + { + "epoch": 0.21014220330623548, + "grad_norm": 0.0020274778362363577, + "learning_rate": 0.001, + "loss": 0.4406, + "step": 7616 + }, + { + "epoch": 0.21016979550729986, + "grad_norm": 0.0034504812210798264, + "learning_rate": 0.001, + "loss": 0.4136, + "step": 7617 + }, + { + "epoch": 0.21019738770836424, + "grad_norm": 0.0032671205699443817, + "learning_rate": 0.001, + "loss": 0.3769, + "step": 7618 + }, + { + "epoch": 0.2102249799094286, + "grad_norm": 0.0033758427016437054, + "learning_rate": 0.001, + "loss": 0.4191, + "step": 7619 + }, + { + "epoch": 0.21025257211049297, + "grad_norm": 0.005549068097025156, + "learning_rate": 0.001, + "loss": 0.383, + "step": 7620 + }, + { + "epoch": 0.21028016431155733, + "grad_norm": 0.0029898162465542555, + "learning_rate": 0.001, + "loss": 0.4265, + "step": 7621 + }, + { + "epoch": 0.2103077565126217, + "grad_norm": 0.0020999349653720856, + "learning_rate": 0.001, + "loss": 0.405, + "step": 7622 + }, + { + "epoch": 0.2103353487136861, + "grad_norm": 0.0028715464286506176, + "learning_rate": 0.001, + "loss": 0.3823, + "step": 7623 + }, + { + "epoch": 0.21036294091475044, + "grad_norm": 0.0032411713618785143, + "learning_rate": 0.001, + "loss": 0.3973, + "step": 7624 + }, + { + "epoch": 0.21039053311581482, + "grad_norm": 0.006850194651633501, + "learning_rate": 0.001, + "loss": 0.3864, + "step": 7625 + }, + { + "epoch": 0.21041812531687917, + "grad_norm": 0.0031761995051056147, + "learning_rate": 0.001, + "loss": 0.3689, + "step": 7626 + }, + { + "epoch": 0.21044571751794355, + "grad_norm": 0.0024642094504088163, + "learning_rate": 0.001, + "loss": 0.4037, + "step": 7627 + }, + { + "epoch": 0.21047330971900793, + "grad_norm": 0.0035544894635677338, + "learning_rate": 0.001, + "loss": 0.3876, + "step": 7628 + }, + { + "epoch": 0.2105009019200723, + "grad_norm": 0.009451251477003098, + "learning_rate": 0.001, + "loss": 0.394, + "step": 7629 + }, + { + "epoch": 0.21052849412113667, + "grad_norm": 0.002780449576675892, + "learning_rate": 0.001, + "loss": 0.4073, + "step": 7630 + }, + { + "epoch": 0.21055608632220102, + "grad_norm": 0.0032124193385243416, + "learning_rate": 0.001, + "loss": 0.4213, + "step": 7631 + }, + { + "epoch": 0.2105836785232654, + "grad_norm": 0.004741044715046883, + "learning_rate": 0.001, + "loss": 0.3862, + "step": 7632 + }, + { + "epoch": 0.21061127072432978, + "grad_norm": 0.005938271526247263, + "learning_rate": 0.001, + "loss": 0.3802, + "step": 7633 + }, + { + "epoch": 0.21063886292539413, + "grad_norm": 0.003253635484725237, + "learning_rate": 0.001, + "loss": 0.362, + "step": 7634 + }, + { + "epoch": 0.2106664551264585, + "grad_norm": 0.0031698490492999554, + "learning_rate": 0.001, + "loss": 0.4288, + "step": 7635 + }, + { + "epoch": 0.21069404732752287, + "grad_norm": 0.0025998263154178858, + "learning_rate": 0.001, + "loss": 0.3606, + "step": 7636 + }, + { + "epoch": 0.21072163952858725, + "grad_norm": 0.0023143659345805645, + "learning_rate": 0.001, + "loss": 0.4475, + "step": 7637 + }, + { + "epoch": 0.21074923172965163, + "grad_norm": 0.0022685672156512737, + "learning_rate": 0.001, + "loss": 0.3564, + "step": 7638 + }, + { + "epoch": 0.21077682393071598, + "grad_norm": 0.003365386975929141, + "learning_rate": 0.001, + "loss": 0.4087, + "step": 7639 + }, + { + "epoch": 0.21080441613178036, + "grad_norm": 0.0031616310589015484, + "learning_rate": 0.001, + "loss": 0.3569, + "step": 7640 + }, + { + "epoch": 0.2108320083328447, + "grad_norm": 0.0061267949640750885, + "learning_rate": 0.001, + "loss": 0.3689, + "step": 7641 + }, + { + "epoch": 0.2108596005339091, + "grad_norm": 0.0033039050176739693, + "learning_rate": 0.001, + "loss": 0.4077, + "step": 7642 + }, + { + "epoch": 0.21088719273497347, + "grad_norm": 0.002562372013926506, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 7643 + }, + { + "epoch": 0.21091478493603782, + "grad_norm": 0.0026763584464788437, + "learning_rate": 0.001, + "loss": 0.3675, + "step": 7644 + }, + { + "epoch": 0.2109423771371022, + "grad_norm": 0.0032096514478325844, + "learning_rate": 0.001, + "loss": 0.3803, + "step": 7645 + }, + { + "epoch": 0.21096996933816656, + "grad_norm": 0.002220802940428257, + "learning_rate": 0.001, + "loss": 0.4069, + "step": 7646 + }, + { + "epoch": 0.21099756153923094, + "grad_norm": 0.0039492035284638405, + "learning_rate": 0.001, + "loss": 0.3918, + "step": 7647 + }, + { + "epoch": 0.21102515374029532, + "grad_norm": 0.0027708462439477444, + "learning_rate": 0.001, + "loss": 0.4026, + "step": 7648 + }, + { + "epoch": 0.21105274594135967, + "grad_norm": 0.002891121432185173, + "learning_rate": 0.001, + "loss": 0.4248, + "step": 7649 + }, + { + "epoch": 0.21108033814242405, + "grad_norm": 0.008183618076145649, + "learning_rate": 0.001, + "loss": 0.3889, + "step": 7650 + }, + { + "epoch": 0.2111079303434884, + "grad_norm": 0.0033024440053850412, + "learning_rate": 0.001, + "loss": 0.408, + "step": 7651 + }, + { + "epoch": 0.21113552254455278, + "grad_norm": 0.0020405303221195936, + "learning_rate": 0.001, + "loss": 0.421, + "step": 7652 + }, + { + "epoch": 0.21116311474561716, + "grad_norm": 0.004159392323344946, + "learning_rate": 0.001, + "loss": 0.3637, + "step": 7653 + }, + { + "epoch": 0.21119070694668152, + "grad_norm": 0.004359352868050337, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 7654 + }, + { + "epoch": 0.2112182991477459, + "grad_norm": 0.003729519434273243, + "learning_rate": 0.001, + "loss": 0.4094, + "step": 7655 + }, + { + "epoch": 0.21124589134881025, + "grad_norm": 0.0033560399897396564, + "learning_rate": 0.001, + "loss": 0.3683, + "step": 7656 + }, + { + "epoch": 0.21127348354987463, + "grad_norm": 0.008382863365113735, + "learning_rate": 0.001, + "loss": 0.3876, + "step": 7657 + }, + { + "epoch": 0.211301075750939, + "grad_norm": 0.002670932561159134, + "learning_rate": 0.001, + "loss": 0.3864, + "step": 7658 + }, + { + "epoch": 0.21132866795200336, + "grad_norm": 0.0026596023235470057, + "learning_rate": 0.001, + "loss": 0.3925, + "step": 7659 + }, + { + "epoch": 0.21135626015306774, + "grad_norm": 0.0030948955100029707, + "learning_rate": 0.001, + "loss": 0.4074, + "step": 7660 + }, + { + "epoch": 0.2113838523541321, + "grad_norm": 0.005249246954917908, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 7661 + }, + { + "epoch": 0.21141144455519648, + "grad_norm": 0.0028843062464147806, + "learning_rate": 0.001, + "loss": 0.3791, + "step": 7662 + }, + { + "epoch": 0.21143903675626086, + "grad_norm": 0.0027450129855424166, + "learning_rate": 0.001, + "loss": 0.3956, + "step": 7663 + }, + { + "epoch": 0.2114666289573252, + "grad_norm": 0.01026302482932806, + "learning_rate": 0.001, + "loss": 0.3887, + "step": 7664 + }, + { + "epoch": 0.2114942211583896, + "grad_norm": 0.0032274313271045685, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 7665 + }, + { + "epoch": 0.21152181335945394, + "grad_norm": 0.005412998143583536, + "learning_rate": 0.001, + "loss": 0.4233, + "step": 7666 + }, + { + "epoch": 0.21154940556051832, + "grad_norm": 0.00447877449914813, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 7667 + }, + { + "epoch": 0.2115769977615827, + "grad_norm": 0.0035270792432129383, + "learning_rate": 0.001, + "loss": 0.3559, + "step": 7668 + }, + { + "epoch": 0.21160458996264706, + "grad_norm": 0.009330617263913155, + "learning_rate": 0.001, + "loss": 0.4233, + "step": 7669 + }, + { + "epoch": 0.21163218216371144, + "grad_norm": 0.003103942610323429, + "learning_rate": 0.001, + "loss": 0.3885, + "step": 7670 + }, + { + "epoch": 0.2116597743647758, + "grad_norm": 0.0024018525145947933, + "learning_rate": 0.001, + "loss": 0.384, + "step": 7671 + }, + { + "epoch": 0.21168736656584017, + "grad_norm": 0.0029490841552615166, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 7672 + }, + { + "epoch": 0.21171495876690452, + "grad_norm": 0.003537841374054551, + "learning_rate": 0.001, + "loss": 0.4237, + "step": 7673 + }, + { + "epoch": 0.2117425509679689, + "grad_norm": 0.007800383027642965, + "learning_rate": 0.001, + "loss": 0.4145, + "step": 7674 + }, + { + "epoch": 0.21177014316903328, + "grad_norm": 0.0025011899415403605, + "learning_rate": 0.001, + "loss": 0.4257, + "step": 7675 + }, + { + "epoch": 0.21179773537009763, + "grad_norm": 0.004059432540088892, + "learning_rate": 0.001, + "loss": 0.3877, + "step": 7676 + }, + { + "epoch": 0.21182532757116201, + "grad_norm": 0.0030776297207921743, + "learning_rate": 0.001, + "loss": 0.3552, + "step": 7677 + }, + { + "epoch": 0.21185291977222637, + "grad_norm": 0.004162721801549196, + "learning_rate": 0.001, + "loss": 0.3905, + "step": 7678 + }, + { + "epoch": 0.21188051197329075, + "grad_norm": 0.003027817001566291, + "learning_rate": 0.001, + "loss": 0.3937, + "step": 7679 + }, + { + "epoch": 0.21190810417435513, + "grad_norm": 0.004329508636146784, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 7680 + }, + { + "epoch": 0.21193569637541948, + "grad_norm": 0.003321508876979351, + "learning_rate": 0.001, + "loss": 0.4219, + "step": 7681 + }, + { + "epoch": 0.21196328857648386, + "grad_norm": 0.003910355735570192, + "learning_rate": 0.001, + "loss": 0.3825, + "step": 7682 + }, + { + "epoch": 0.2119908807775482, + "grad_norm": 0.002138720592483878, + "learning_rate": 0.001, + "loss": 0.4045, + "step": 7683 + }, + { + "epoch": 0.2120184729786126, + "grad_norm": 0.003873582696542144, + "learning_rate": 0.001, + "loss": 0.4121, + "step": 7684 + }, + { + "epoch": 0.21204606517967697, + "grad_norm": 0.008897165767848492, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 7685 + }, + { + "epoch": 0.21207365738074133, + "grad_norm": 0.009049713611602783, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 7686 + }, + { + "epoch": 0.2121012495818057, + "grad_norm": 0.00294255162589252, + "learning_rate": 0.001, + "loss": 0.3908, + "step": 7687 + }, + { + "epoch": 0.21212884178287006, + "grad_norm": 0.003320804564282298, + "learning_rate": 0.001, + "loss": 0.3711, + "step": 7688 + }, + { + "epoch": 0.21215643398393444, + "grad_norm": 0.006188528146594763, + "learning_rate": 0.001, + "loss": 0.404, + "step": 7689 + }, + { + "epoch": 0.21218402618499882, + "grad_norm": 0.002375251380726695, + "learning_rate": 0.001, + "loss": 0.4214, + "step": 7690 + }, + { + "epoch": 0.21221161838606317, + "grad_norm": 0.00397599907591939, + "learning_rate": 0.001, + "loss": 0.4141, + "step": 7691 + }, + { + "epoch": 0.21223921058712755, + "grad_norm": 0.002846440998837352, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 7692 + }, + { + "epoch": 0.2122668027881919, + "grad_norm": 0.005194092635065317, + "learning_rate": 0.001, + "loss": 0.3759, + "step": 7693 + }, + { + "epoch": 0.21229439498925629, + "grad_norm": 0.004019029904156923, + "learning_rate": 0.001, + "loss": 0.4079, + "step": 7694 + }, + { + "epoch": 0.21232198719032067, + "grad_norm": 0.0032543425913900137, + "learning_rate": 0.001, + "loss": 0.4172, + "step": 7695 + }, + { + "epoch": 0.21234957939138502, + "grad_norm": 0.004645978100597858, + "learning_rate": 0.001, + "loss": 0.4443, + "step": 7696 + }, + { + "epoch": 0.2123771715924494, + "grad_norm": 0.003786055836826563, + "learning_rate": 0.001, + "loss": 0.4281, + "step": 7697 + }, + { + "epoch": 0.21240476379351375, + "grad_norm": 0.005406087264418602, + "learning_rate": 0.001, + "loss": 0.4052, + "step": 7698 + }, + { + "epoch": 0.21243235599457813, + "grad_norm": 0.005549001973122358, + "learning_rate": 0.001, + "loss": 0.4038, + "step": 7699 + }, + { + "epoch": 0.2124599481956425, + "grad_norm": 0.002728297607973218, + "learning_rate": 0.001, + "loss": 0.4014, + "step": 7700 + }, + { + "epoch": 0.21248754039670686, + "grad_norm": 0.005214143078774214, + "learning_rate": 0.001, + "loss": 0.4125, + "step": 7701 + }, + { + "epoch": 0.21251513259777124, + "grad_norm": 0.0025736193638294935, + "learning_rate": 0.001, + "loss": 0.4219, + "step": 7702 + }, + { + "epoch": 0.2125427247988356, + "grad_norm": 0.00296932109631598, + "learning_rate": 0.001, + "loss": 0.3849, + "step": 7703 + }, + { + "epoch": 0.21257031699989998, + "grad_norm": 0.003445189446210861, + "learning_rate": 0.001, + "loss": 0.3584, + "step": 7704 + }, + { + "epoch": 0.21259790920096436, + "grad_norm": 0.0035377286840230227, + "learning_rate": 0.001, + "loss": 0.3979, + "step": 7705 + }, + { + "epoch": 0.2126255014020287, + "grad_norm": 0.0029369608964771032, + "learning_rate": 0.001, + "loss": 0.4084, + "step": 7706 + }, + { + "epoch": 0.2126530936030931, + "grad_norm": 0.0030892151407897472, + "learning_rate": 0.001, + "loss": 0.395, + "step": 7707 + }, + { + "epoch": 0.21268068580415744, + "grad_norm": 0.004821309354156256, + "learning_rate": 0.001, + "loss": 0.4165, + "step": 7708 + }, + { + "epoch": 0.21270827800522182, + "grad_norm": 0.003210441442206502, + "learning_rate": 0.001, + "loss": 0.4002, + "step": 7709 + }, + { + "epoch": 0.2127358702062862, + "grad_norm": 0.0030719011556357145, + "learning_rate": 0.001, + "loss": 0.399, + "step": 7710 + }, + { + "epoch": 0.21276346240735056, + "grad_norm": 0.0028634623158723116, + "learning_rate": 0.001, + "loss": 0.3503, + "step": 7711 + }, + { + "epoch": 0.21279105460841494, + "grad_norm": 0.004046322777867317, + "learning_rate": 0.001, + "loss": 0.4125, + "step": 7712 + }, + { + "epoch": 0.2128186468094793, + "grad_norm": 0.00294592441059649, + "learning_rate": 0.001, + "loss": 0.3417, + "step": 7713 + }, + { + "epoch": 0.21284623901054367, + "grad_norm": 0.0021102267783135176, + "learning_rate": 0.001, + "loss": 0.4369, + "step": 7714 + }, + { + "epoch": 0.21287383121160805, + "grad_norm": 0.0028041282203048468, + "learning_rate": 0.001, + "loss": 0.3816, + "step": 7715 + }, + { + "epoch": 0.2129014234126724, + "grad_norm": 0.0034821259323507547, + "learning_rate": 0.001, + "loss": 0.3934, + "step": 7716 + }, + { + "epoch": 0.21292901561373678, + "grad_norm": 0.003997510299086571, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 7717 + }, + { + "epoch": 0.21295660781480114, + "grad_norm": 0.00297334766946733, + "learning_rate": 0.001, + "loss": 0.4262, + "step": 7718 + }, + { + "epoch": 0.21298420001586552, + "grad_norm": 0.0029889491852372885, + "learning_rate": 0.001, + "loss": 0.3567, + "step": 7719 + }, + { + "epoch": 0.2130117922169299, + "grad_norm": 0.005800141021609306, + "learning_rate": 0.001, + "loss": 0.3693, + "step": 7720 + }, + { + "epoch": 0.21303938441799425, + "grad_norm": 0.004063557833433151, + "learning_rate": 0.001, + "loss": 0.4006, + "step": 7721 + }, + { + "epoch": 0.21306697661905863, + "grad_norm": 0.0052527179941535, + "learning_rate": 0.001, + "loss": 0.3863, + "step": 7722 + }, + { + "epoch": 0.21309456882012298, + "grad_norm": 0.004150967579334974, + "learning_rate": 0.001, + "loss": 0.3818, + "step": 7723 + }, + { + "epoch": 0.21312216102118736, + "grad_norm": 0.003142510773614049, + "learning_rate": 0.001, + "loss": 0.4367, + "step": 7724 + }, + { + "epoch": 0.21314975322225174, + "grad_norm": 0.0024571139365434647, + "learning_rate": 0.001, + "loss": 0.397, + "step": 7725 + }, + { + "epoch": 0.2131773454233161, + "grad_norm": 0.007491269148886204, + "learning_rate": 0.001, + "loss": 0.4254, + "step": 7726 + }, + { + "epoch": 0.21320493762438048, + "grad_norm": 0.003059527836740017, + "learning_rate": 0.001, + "loss": 0.4296, + "step": 7727 + }, + { + "epoch": 0.21323252982544483, + "grad_norm": 0.0030870058108121157, + "learning_rate": 0.001, + "loss": 0.4143, + "step": 7728 + }, + { + "epoch": 0.2132601220265092, + "grad_norm": 0.002307609189301729, + "learning_rate": 0.001, + "loss": 0.404, + "step": 7729 + }, + { + "epoch": 0.2132877142275736, + "grad_norm": 0.004910447634756565, + "learning_rate": 0.001, + "loss": 0.3676, + "step": 7730 + }, + { + "epoch": 0.21331530642863794, + "grad_norm": 0.0032054877374321222, + "learning_rate": 0.001, + "loss": 0.4157, + "step": 7731 + }, + { + "epoch": 0.21334289862970232, + "grad_norm": 0.004739253781735897, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 7732 + }, + { + "epoch": 0.21337049083076667, + "grad_norm": 0.0027026189491152763, + "learning_rate": 0.001, + "loss": 0.3723, + "step": 7733 + }, + { + "epoch": 0.21339808303183105, + "grad_norm": 0.0031935579609125853, + "learning_rate": 0.001, + "loss": 0.4053, + "step": 7734 + }, + { + "epoch": 0.21342567523289543, + "grad_norm": 0.0026129090692847967, + "learning_rate": 0.001, + "loss": 0.3873, + "step": 7735 + }, + { + "epoch": 0.2134532674339598, + "grad_norm": 0.0030374531634151936, + "learning_rate": 0.001, + "loss": 0.3681, + "step": 7736 + }, + { + "epoch": 0.21348085963502417, + "grad_norm": 0.0035555511713027954, + "learning_rate": 0.001, + "loss": 0.4156, + "step": 7737 + }, + { + "epoch": 0.21350845183608852, + "grad_norm": 0.005884343292564154, + "learning_rate": 0.001, + "loss": 0.3908, + "step": 7738 + }, + { + "epoch": 0.2135360440371529, + "grad_norm": 0.0030709283892065287, + "learning_rate": 0.001, + "loss": 0.376, + "step": 7739 + }, + { + "epoch": 0.21356363623821728, + "grad_norm": 0.0027664625085890293, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 7740 + }, + { + "epoch": 0.21359122843928163, + "grad_norm": 0.005600049160420895, + "learning_rate": 0.001, + "loss": 0.3438, + "step": 7741 + }, + { + "epoch": 0.213618820640346, + "grad_norm": 0.00319514493457973, + "learning_rate": 0.001, + "loss": 0.3608, + "step": 7742 + }, + { + "epoch": 0.21364641284141037, + "grad_norm": 0.0033522641751915216, + "learning_rate": 0.001, + "loss": 0.4022, + "step": 7743 + }, + { + "epoch": 0.21367400504247475, + "grad_norm": 0.0030811361502856016, + "learning_rate": 0.001, + "loss": 0.3919, + "step": 7744 + }, + { + "epoch": 0.21370159724353913, + "grad_norm": 0.0026056889910250902, + "learning_rate": 0.001, + "loss": 0.4102, + "step": 7745 + }, + { + "epoch": 0.21372918944460348, + "grad_norm": 0.0046222517266869545, + "learning_rate": 0.001, + "loss": 0.3717, + "step": 7746 + }, + { + "epoch": 0.21375678164566786, + "grad_norm": 0.005610965192317963, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 7747 + }, + { + "epoch": 0.2137843738467322, + "grad_norm": 0.004591417033225298, + "learning_rate": 0.001, + "loss": 0.3818, + "step": 7748 + }, + { + "epoch": 0.2138119660477966, + "grad_norm": 0.0033903804142028093, + "learning_rate": 0.001, + "loss": 0.396, + "step": 7749 + }, + { + "epoch": 0.21383955824886097, + "grad_norm": 0.0037878809962421656, + "learning_rate": 0.001, + "loss": 0.4143, + "step": 7750 + }, + { + "epoch": 0.21386715044992533, + "grad_norm": 0.006682871840894222, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 7751 + }, + { + "epoch": 0.2138947426509897, + "grad_norm": 0.0054921298287808895, + "learning_rate": 0.001, + "loss": 0.4035, + "step": 7752 + }, + { + "epoch": 0.21392233485205406, + "grad_norm": 0.002533350605517626, + "learning_rate": 0.001, + "loss": 0.415, + "step": 7753 + }, + { + "epoch": 0.21394992705311844, + "grad_norm": 0.004424331709742546, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 7754 + }, + { + "epoch": 0.21397751925418282, + "grad_norm": 0.003484759945422411, + "learning_rate": 0.001, + "loss": 0.4224, + "step": 7755 + }, + { + "epoch": 0.21400511145524717, + "grad_norm": 0.0028570671565830708, + "learning_rate": 0.001, + "loss": 0.3774, + "step": 7756 + }, + { + "epoch": 0.21403270365631155, + "grad_norm": 0.0032984951976686716, + "learning_rate": 0.001, + "loss": 0.405, + "step": 7757 + }, + { + "epoch": 0.2140602958573759, + "grad_norm": 0.002993236295878887, + "learning_rate": 0.001, + "loss": 0.381, + "step": 7758 + }, + { + "epoch": 0.21408788805844028, + "grad_norm": 0.0039431145414710045, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 7759 + }, + { + "epoch": 0.21411548025950466, + "grad_norm": 0.0023782916832715273, + "learning_rate": 0.001, + "loss": 0.3813, + "step": 7760 + }, + { + "epoch": 0.21414307246056902, + "grad_norm": 0.005132326390594244, + "learning_rate": 0.001, + "loss": 0.3922, + "step": 7761 + }, + { + "epoch": 0.2141706646616334, + "grad_norm": 0.004260215442627668, + "learning_rate": 0.001, + "loss": 0.3811, + "step": 7762 + }, + { + "epoch": 0.21419825686269775, + "grad_norm": 0.00576009601354599, + "learning_rate": 0.001, + "loss": 0.431, + "step": 7763 + }, + { + "epoch": 0.21422584906376213, + "grad_norm": 0.002464310498908162, + "learning_rate": 0.001, + "loss": 0.4194, + "step": 7764 + }, + { + "epoch": 0.21425344126482648, + "grad_norm": 0.0022607767023146152, + "learning_rate": 0.001, + "loss": 0.4046, + "step": 7765 + }, + { + "epoch": 0.21428103346589086, + "grad_norm": 0.002766657853499055, + "learning_rate": 0.001, + "loss": 0.3995, + "step": 7766 + }, + { + "epoch": 0.21430862566695524, + "grad_norm": 0.002656952477991581, + "learning_rate": 0.001, + "loss": 0.3916, + "step": 7767 + }, + { + "epoch": 0.2143362178680196, + "grad_norm": 0.005709274671971798, + "learning_rate": 0.001, + "loss": 0.3811, + "step": 7768 + }, + { + "epoch": 0.21436381006908398, + "grad_norm": 0.0032649333588778973, + "learning_rate": 0.001, + "loss": 0.4037, + "step": 7769 + }, + { + "epoch": 0.21439140227014833, + "grad_norm": 0.0026809549890458584, + "learning_rate": 0.001, + "loss": 0.3998, + "step": 7770 + }, + { + "epoch": 0.2144189944712127, + "grad_norm": 0.010057074949145317, + "learning_rate": 0.001, + "loss": 0.3962, + "step": 7771 + }, + { + "epoch": 0.2144465866722771, + "grad_norm": 0.003958344459533691, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 7772 + }, + { + "epoch": 0.21447417887334144, + "grad_norm": 0.004084006417542696, + "learning_rate": 0.001, + "loss": 0.4316, + "step": 7773 + }, + { + "epoch": 0.21450177107440582, + "grad_norm": 0.0051500373519957066, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 7774 + }, + { + "epoch": 0.21452936327547018, + "grad_norm": 0.003794713644310832, + "learning_rate": 0.001, + "loss": 0.4239, + "step": 7775 + }, + { + "epoch": 0.21455695547653456, + "grad_norm": 0.02025657705962658, + "learning_rate": 0.001, + "loss": 0.4091, + "step": 7776 + }, + { + "epoch": 0.21458454767759894, + "grad_norm": 0.0035089454613626003, + "learning_rate": 0.001, + "loss": 0.3933, + "step": 7777 + }, + { + "epoch": 0.2146121398786633, + "grad_norm": 0.0029882427770644426, + "learning_rate": 0.001, + "loss": 0.4157, + "step": 7778 + }, + { + "epoch": 0.21463973207972767, + "grad_norm": 0.0025343652814626694, + "learning_rate": 0.001, + "loss": 0.391, + "step": 7779 + }, + { + "epoch": 0.21466732428079202, + "grad_norm": 0.0028338837437331676, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 7780 + }, + { + "epoch": 0.2146949164818564, + "grad_norm": 0.0033451595809310675, + "learning_rate": 0.001, + "loss": 0.4014, + "step": 7781 + }, + { + "epoch": 0.21472250868292078, + "grad_norm": 0.0033898449037224054, + "learning_rate": 0.001, + "loss": 0.3728, + "step": 7782 + }, + { + "epoch": 0.21475010088398513, + "grad_norm": 0.0038336054421961308, + "learning_rate": 0.001, + "loss": 0.4243, + "step": 7783 + }, + { + "epoch": 0.21477769308504951, + "grad_norm": 0.012188595719635487, + "learning_rate": 0.001, + "loss": 0.4066, + "step": 7784 + }, + { + "epoch": 0.21480528528611387, + "grad_norm": 0.005000379402190447, + "learning_rate": 0.001, + "loss": 0.3877, + "step": 7785 + }, + { + "epoch": 0.21483287748717825, + "grad_norm": 0.00532375555485487, + "learning_rate": 0.001, + "loss": 0.3766, + "step": 7786 + }, + { + "epoch": 0.21486046968824263, + "grad_norm": 0.004247152712196112, + "learning_rate": 0.001, + "loss": 0.4009, + "step": 7787 + }, + { + "epoch": 0.21488806188930698, + "grad_norm": 0.0024253970477730036, + "learning_rate": 0.001, + "loss": 0.4233, + "step": 7788 + }, + { + "epoch": 0.21491565409037136, + "grad_norm": 0.0028955123852938414, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 7789 + }, + { + "epoch": 0.2149432462914357, + "grad_norm": 0.0019871145486831665, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 7790 + }, + { + "epoch": 0.2149708384925001, + "grad_norm": 0.0026270560920238495, + "learning_rate": 0.001, + "loss": 0.4235, + "step": 7791 + }, + { + "epoch": 0.21499843069356447, + "grad_norm": 0.0022613939363509417, + "learning_rate": 0.001, + "loss": 0.399, + "step": 7792 + }, + { + "epoch": 0.21502602289462883, + "grad_norm": 0.0022467582020908594, + "learning_rate": 0.001, + "loss": 0.4139, + "step": 7793 + }, + { + "epoch": 0.2150536150956932, + "grad_norm": 0.003144500544294715, + "learning_rate": 0.001, + "loss": 0.4121, + "step": 7794 + }, + { + "epoch": 0.21508120729675756, + "grad_norm": 0.004727689083665609, + "learning_rate": 0.001, + "loss": 0.3643, + "step": 7795 + }, + { + "epoch": 0.21510879949782194, + "grad_norm": 0.0030138723086565733, + "learning_rate": 0.001, + "loss": 0.4194, + "step": 7796 + }, + { + "epoch": 0.21513639169888632, + "grad_norm": 0.003997731953859329, + "learning_rate": 0.001, + "loss": 0.4116, + "step": 7797 + }, + { + "epoch": 0.21516398389995067, + "grad_norm": 0.0034538882318884134, + "learning_rate": 0.001, + "loss": 0.4138, + "step": 7798 + }, + { + "epoch": 0.21519157610101505, + "grad_norm": 0.0030911024659872055, + "learning_rate": 0.001, + "loss": 0.4247, + "step": 7799 + }, + { + "epoch": 0.2152191683020794, + "grad_norm": 0.006776158697903156, + "learning_rate": 0.001, + "loss": 0.406, + "step": 7800 + }, + { + "epoch": 0.21524676050314379, + "grad_norm": 0.0032637538388371468, + "learning_rate": 0.001, + "loss": 0.392, + "step": 7801 + }, + { + "epoch": 0.21527435270420817, + "grad_norm": 0.0028168826829642057, + "learning_rate": 0.001, + "loss": 0.3902, + "step": 7802 + }, + { + "epoch": 0.21530194490527252, + "grad_norm": 0.002768394071608782, + "learning_rate": 0.001, + "loss": 0.3795, + "step": 7803 + }, + { + "epoch": 0.2153295371063369, + "grad_norm": 0.004282908979803324, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 7804 + }, + { + "epoch": 0.21535712930740125, + "grad_norm": 0.002293472411110997, + "learning_rate": 0.001, + "loss": 0.4139, + "step": 7805 + }, + { + "epoch": 0.21538472150846563, + "grad_norm": 0.003117901738733053, + "learning_rate": 0.001, + "loss": 0.4014, + "step": 7806 + }, + { + "epoch": 0.21541231370953, + "grad_norm": 0.0030702080111950636, + "learning_rate": 0.001, + "loss": 0.3675, + "step": 7807 + }, + { + "epoch": 0.21543990591059436, + "grad_norm": 0.0037037963047623634, + "learning_rate": 0.001, + "loss": 0.4097, + "step": 7808 + }, + { + "epoch": 0.21546749811165875, + "grad_norm": 0.003569400403648615, + "learning_rate": 0.001, + "loss": 0.4125, + "step": 7809 + }, + { + "epoch": 0.2154950903127231, + "grad_norm": 0.005185401998460293, + "learning_rate": 0.001, + "loss": 0.4272, + "step": 7810 + }, + { + "epoch": 0.21552268251378748, + "grad_norm": 0.0026427856646478176, + "learning_rate": 0.001, + "loss": 0.4301, + "step": 7811 + }, + { + "epoch": 0.21555027471485186, + "grad_norm": 0.0026996051892638206, + "learning_rate": 0.001, + "loss": 0.4272, + "step": 7812 + }, + { + "epoch": 0.2155778669159162, + "grad_norm": 0.0026872565504163504, + "learning_rate": 0.001, + "loss": 0.3894, + "step": 7813 + }, + { + "epoch": 0.2156054591169806, + "grad_norm": 0.0033470892813056707, + "learning_rate": 0.001, + "loss": 0.3964, + "step": 7814 + }, + { + "epoch": 0.21563305131804494, + "grad_norm": 0.002979893935844302, + "learning_rate": 0.001, + "loss": 0.4182, + "step": 7815 + }, + { + "epoch": 0.21566064351910932, + "grad_norm": 0.002338412217795849, + "learning_rate": 0.001, + "loss": 0.4119, + "step": 7816 + }, + { + "epoch": 0.2156882357201737, + "grad_norm": 0.0026039378717541695, + "learning_rate": 0.001, + "loss": 0.4217, + "step": 7817 + }, + { + "epoch": 0.21571582792123806, + "grad_norm": 0.004928927402943373, + "learning_rate": 0.001, + "loss": 0.3915, + "step": 7818 + }, + { + "epoch": 0.21574342012230244, + "grad_norm": 0.003969724290072918, + "learning_rate": 0.001, + "loss": 0.4052, + "step": 7819 + }, + { + "epoch": 0.2157710123233668, + "grad_norm": 0.004132222384214401, + "learning_rate": 0.001, + "loss": 0.4446, + "step": 7820 + }, + { + "epoch": 0.21579860452443117, + "grad_norm": 0.003034410998225212, + "learning_rate": 0.001, + "loss": 0.4339, + "step": 7821 + }, + { + "epoch": 0.21582619672549555, + "grad_norm": 0.0034465952776372433, + "learning_rate": 0.001, + "loss": 0.4024, + "step": 7822 + }, + { + "epoch": 0.2158537889265599, + "grad_norm": 0.0028153613675385714, + "learning_rate": 0.001, + "loss": 0.4302, + "step": 7823 + }, + { + "epoch": 0.21588138112762428, + "grad_norm": 0.007028549909591675, + "learning_rate": 0.001, + "loss": 0.3774, + "step": 7824 + }, + { + "epoch": 0.21590897332868864, + "grad_norm": 0.0035245222970843315, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 7825 + }, + { + "epoch": 0.21593656552975302, + "grad_norm": 0.0027619630564004183, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 7826 + }, + { + "epoch": 0.2159641577308174, + "grad_norm": 0.0029921175446361303, + "learning_rate": 0.001, + "loss": 0.3609, + "step": 7827 + }, + { + "epoch": 0.21599174993188175, + "grad_norm": 0.004075322765856981, + "learning_rate": 0.001, + "loss": 0.3674, + "step": 7828 + }, + { + "epoch": 0.21601934213294613, + "grad_norm": 0.004721477162092924, + "learning_rate": 0.001, + "loss": 0.4119, + "step": 7829 + }, + { + "epoch": 0.21604693433401048, + "grad_norm": 0.00825839675962925, + "learning_rate": 0.001, + "loss": 0.3915, + "step": 7830 + }, + { + "epoch": 0.21607452653507486, + "grad_norm": 0.0029653101228177547, + "learning_rate": 0.001, + "loss": 0.4062, + "step": 7831 + }, + { + "epoch": 0.21610211873613924, + "grad_norm": 0.004584647715091705, + "learning_rate": 0.001, + "loss": 0.3687, + "step": 7832 + }, + { + "epoch": 0.2161297109372036, + "grad_norm": 0.0019515285966917872, + "learning_rate": 0.001, + "loss": 0.4817, + "step": 7833 + }, + { + "epoch": 0.21615730313826798, + "grad_norm": 0.0029263091273605824, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 7834 + }, + { + "epoch": 0.21618489533933233, + "grad_norm": 0.002956991782411933, + "learning_rate": 0.001, + "loss": 0.3953, + "step": 7835 + }, + { + "epoch": 0.2162124875403967, + "grad_norm": 0.0037960107438266277, + "learning_rate": 0.001, + "loss": 0.3888, + "step": 7836 + }, + { + "epoch": 0.2162400797414611, + "grad_norm": 0.0029722515027970076, + "learning_rate": 0.001, + "loss": 0.3817, + "step": 7837 + }, + { + "epoch": 0.21626767194252544, + "grad_norm": 0.0032150924671441317, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 7838 + }, + { + "epoch": 0.21629526414358982, + "grad_norm": 0.005019112955778837, + "learning_rate": 0.001, + "loss": 0.4123, + "step": 7839 + }, + { + "epoch": 0.21632285634465417, + "grad_norm": 0.0028452936094254255, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 7840 + }, + { + "epoch": 0.21635044854571855, + "grad_norm": 0.0035461215302348137, + "learning_rate": 0.001, + "loss": 0.4054, + "step": 7841 + }, + { + "epoch": 0.21637804074678293, + "grad_norm": 0.0051398370414972305, + "learning_rate": 0.001, + "loss": 0.3803, + "step": 7842 + }, + { + "epoch": 0.2164056329478473, + "grad_norm": 0.0029704368207603693, + "learning_rate": 0.001, + "loss": 0.4225, + "step": 7843 + }, + { + "epoch": 0.21643322514891167, + "grad_norm": 0.004192579071968794, + "learning_rate": 0.001, + "loss": 0.3837, + "step": 7844 + }, + { + "epoch": 0.21646081734997602, + "grad_norm": 0.0036786114796996117, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 7845 + }, + { + "epoch": 0.2164884095510404, + "grad_norm": 0.0032894443720579147, + "learning_rate": 0.001, + "loss": 0.3671, + "step": 7846 + }, + { + "epoch": 0.21651600175210478, + "grad_norm": 0.00437202537432313, + "learning_rate": 0.001, + "loss": 0.4354, + "step": 7847 + }, + { + "epoch": 0.21654359395316913, + "grad_norm": 0.004263371229171753, + "learning_rate": 0.001, + "loss": 0.3765, + "step": 7848 + }, + { + "epoch": 0.2165711861542335, + "grad_norm": 0.0023139826953411102, + "learning_rate": 0.001, + "loss": 0.3926, + "step": 7849 + }, + { + "epoch": 0.21659877835529787, + "grad_norm": 0.002766036195680499, + "learning_rate": 0.001, + "loss": 0.3959, + "step": 7850 + }, + { + "epoch": 0.21662637055636225, + "grad_norm": 0.004952535964548588, + "learning_rate": 0.001, + "loss": 0.3839, + "step": 7851 + }, + { + "epoch": 0.21665396275742663, + "grad_norm": 0.008287088945508003, + "learning_rate": 0.001, + "loss": 0.3569, + "step": 7852 + }, + { + "epoch": 0.21668155495849098, + "grad_norm": 0.019219983369112015, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 7853 + }, + { + "epoch": 0.21670914715955536, + "grad_norm": 0.0029021003283560276, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 7854 + }, + { + "epoch": 0.2167367393606197, + "grad_norm": 0.004268075339496136, + "learning_rate": 0.001, + "loss": 0.3729, + "step": 7855 + }, + { + "epoch": 0.2167643315616841, + "grad_norm": 0.0038819487672299147, + "learning_rate": 0.001, + "loss": 0.366, + "step": 7856 + }, + { + "epoch": 0.21679192376274847, + "grad_norm": 0.0033485370222479105, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 7857 + }, + { + "epoch": 0.21681951596381283, + "grad_norm": 0.004031899850815535, + "learning_rate": 0.001, + "loss": 0.3687, + "step": 7858 + }, + { + "epoch": 0.2168471081648772, + "grad_norm": 0.0028309531044214964, + "learning_rate": 0.001, + "loss": 0.3855, + "step": 7859 + }, + { + "epoch": 0.21687470036594156, + "grad_norm": 0.005679224617779255, + "learning_rate": 0.001, + "loss": 0.3675, + "step": 7860 + }, + { + "epoch": 0.21690229256700594, + "grad_norm": 0.0028747431933879852, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 7861 + }, + { + "epoch": 0.2169298847680703, + "grad_norm": 0.0022720531560480595, + "learning_rate": 0.001, + "loss": 0.4386, + "step": 7862 + }, + { + "epoch": 0.21695747696913467, + "grad_norm": 0.0037069146055728197, + "learning_rate": 0.001, + "loss": 0.399, + "step": 7863 + }, + { + "epoch": 0.21698506917019905, + "grad_norm": 0.004593148361891508, + "learning_rate": 0.001, + "loss": 0.3713, + "step": 7864 + }, + { + "epoch": 0.2170126613712634, + "grad_norm": 0.004522950388491154, + "learning_rate": 0.001, + "loss": 0.3679, + "step": 7865 + }, + { + "epoch": 0.21704025357232778, + "grad_norm": 0.0038752169348299503, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 7866 + }, + { + "epoch": 0.21706784577339214, + "grad_norm": 0.002294728998094797, + "learning_rate": 0.001, + "loss": 0.4457, + "step": 7867 + }, + { + "epoch": 0.21709543797445652, + "grad_norm": 0.002397543750703335, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 7868 + }, + { + "epoch": 0.2171230301755209, + "grad_norm": 0.009867599233984947, + "learning_rate": 0.001, + "loss": 0.3669, + "step": 7869 + }, + { + "epoch": 0.21715062237658525, + "grad_norm": 0.003990727476775646, + "learning_rate": 0.001, + "loss": 0.3495, + "step": 7870 + }, + { + "epoch": 0.21717821457764963, + "grad_norm": 0.003901657648384571, + "learning_rate": 0.001, + "loss": 0.3531, + "step": 7871 + }, + { + "epoch": 0.21720580677871398, + "grad_norm": 0.00325774191878736, + "learning_rate": 0.001, + "loss": 0.3673, + "step": 7872 + }, + { + "epoch": 0.21723339897977836, + "grad_norm": 0.003350186161696911, + "learning_rate": 0.001, + "loss": 0.4241, + "step": 7873 + }, + { + "epoch": 0.21726099118084274, + "grad_norm": 0.002645864151418209, + "learning_rate": 0.001, + "loss": 0.3833, + "step": 7874 + }, + { + "epoch": 0.2172885833819071, + "grad_norm": 0.006232825573533773, + "learning_rate": 0.001, + "loss": 0.3636, + "step": 7875 + }, + { + "epoch": 0.21731617558297148, + "grad_norm": 0.0021961445454508066, + "learning_rate": 0.001, + "loss": 0.3775, + "step": 7876 + }, + { + "epoch": 0.21734376778403583, + "grad_norm": 0.0030032650101929903, + "learning_rate": 0.001, + "loss": 0.3906, + "step": 7877 + }, + { + "epoch": 0.2173713599851002, + "grad_norm": 0.006241418421268463, + "learning_rate": 0.001, + "loss": 0.3826, + "step": 7878 + }, + { + "epoch": 0.2173989521861646, + "grad_norm": 0.003943136427551508, + "learning_rate": 0.001, + "loss": 0.4231, + "step": 7879 + }, + { + "epoch": 0.21742654438722894, + "grad_norm": 0.0024433995131403208, + "learning_rate": 0.001, + "loss": 0.4239, + "step": 7880 + }, + { + "epoch": 0.21745413658829332, + "grad_norm": 0.0043607852421700954, + "learning_rate": 0.001, + "loss": 0.4227, + "step": 7881 + }, + { + "epoch": 0.21748172878935768, + "grad_norm": 0.004530392121523619, + "learning_rate": 0.001, + "loss": 0.4202, + "step": 7882 + }, + { + "epoch": 0.21750932099042206, + "grad_norm": 0.0024341184180229902, + "learning_rate": 0.001, + "loss": 0.4491, + "step": 7883 + }, + { + "epoch": 0.21753691319148644, + "grad_norm": 0.003600149182602763, + "learning_rate": 0.001, + "loss": 0.3859, + "step": 7884 + }, + { + "epoch": 0.2175645053925508, + "grad_norm": 0.002762843621894717, + "learning_rate": 0.001, + "loss": 0.4081, + "step": 7885 + }, + { + "epoch": 0.21759209759361517, + "grad_norm": 0.0029462978709489107, + "learning_rate": 0.001, + "loss": 0.375, + "step": 7886 + }, + { + "epoch": 0.21761968979467952, + "grad_norm": 0.0023480509407818317, + "learning_rate": 0.001, + "loss": 0.4116, + "step": 7887 + }, + { + "epoch": 0.2176472819957439, + "grad_norm": 0.002522410824894905, + "learning_rate": 0.001, + "loss": 0.434, + "step": 7888 + }, + { + "epoch": 0.21767487419680828, + "grad_norm": 0.0022286747116595507, + "learning_rate": 0.001, + "loss": 0.4345, + "step": 7889 + }, + { + "epoch": 0.21770246639787263, + "grad_norm": 0.002925209002569318, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 7890 + }, + { + "epoch": 0.21773005859893702, + "grad_norm": 0.003066236851736903, + "learning_rate": 0.001, + "loss": 0.3512, + "step": 7891 + }, + { + "epoch": 0.21775765080000137, + "grad_norm": 0.003831464098766446, + "learning_rate": 0.001, + "loss": 0.4169, + "step": 7892 + }, + { + "epoch": 0.21778524300106575, + "grad_norm": 0.0033246371895074844, + "learning_rate": 0.001, + "loss": 0.4158, + "step": 7893 + }, + { + "epoch": 0.21781283520213013, + "grad_norm": 0.0038893541786819696, + "learning_rate": 0.001, + "loss": 0.3877, + "step": 7894 + }, + { + "epoch": 0.21784042740319448, + "grad_norm": 0.003152028191834688, + "learning_rate": 0.001, + "loss": 0.3895, + "step": 7895 + }, + { + "epoch": 0.21786801960425886, + "grad_norm": 0.004073096439242363, + "learning_rate": 0.001, + "loss": 0.4466, + "step": 7896 + }, + { + "epoch": 0.2178956118053232, + "grad_norm": 0.00838442798703909, + "learning_rate": 0.001, + "loss": 0.43, + "step": 7897 + }, + { + "epoch": 0.2179232040063876, + "grad_norm": 0.004715254995971918, + "learning_rate": 0.001, + "loss": 0.4276, + "step": 7898 + }, + { + "epoch": 0.21795079620745197, + "grad_norm": 0.005644774530082941, + "learning_rate": 0.001, + "loss": 0.3796, + "step": 7899 + }, + { + "epoch": 0.21797838840851633, + "grad_norm": 0.0035621551796793938, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 7900 + }, + { + "epoch": 0.2180059806095807, + "grad_norm": 0.003715448547154665, + "learning_rate": 0.001, + "loss": 0.4418, + "step": 7901 + }, + { + "epoch": 0.21803357281064506, + "grad_norm": 0.0025230993051081896, + "learning_rate": 0.001, + "loss": 0.389, + "step": 7902 + }, + { + "epoch": 0.21806116501170944, + "grad_norm": 0.0039021014235913754, + "learning_rate": 0.001, + "loss": 0.3779, + "step": 7903 + }, + { + "epoch": 0.21808875721277382, + "grad_norm": 0.004864577203989029, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 7904 + }, + { + "epoch": 0.21811634941383817, + "grad_norm": 0.007110804785043001, + "learning_rate": 0.001, + "loss": 0.4061, + "step": 7905 + }, + { + "epoch": 0.21814394161490255, + "grad_norm": 0.004145526327192783, + "learning_rate": 0.001, + "loss": 0.398, + "step": 7906 + }, + { + "epoch": 0.2181715338159669, + "grad_norm": 0.003214552765712142, + "learning_rate": 0.001, + "loss": 0.4061, + "step": 7907 + }, + { + "epoch": 0.2181991260170313, + "grad_norm": 0.0025902027264237404, + "learning_rate": 0.001, + "loss": 0.4325, + "step": 7908 + }, + { + "epoch": 0.21822671821809567, + "grad_norm": 0.005056621506810188, + "learning_rate": 0.001, + "loss": 0.384, + "step": 7909 + }, + { + "epoch": 0.21825431041916002, + "grad_norm": 0.0027781168464571238, + "learning_rate": 0.001, + "loss": 0.3889, + "step": 7910 + }, + { + "epoch": 0.2182819026202244, + "grad_norm": 0.002909269416704774, + "learning_rate": 0.001, + "loss": 0.409, + "step": 7911 + }, + { + "epoch": 0.21830949482128875, + "grad_norm": 0.003969018347561359, + "learning_rate": 0.001, + "loss": 0.415, + "step": 7912 + }, + { + "epoch": 0.21833708702235313, + "grad_norm": 0.002928847912698984, + "learning_rate": 0.001, + "loss": 0.4139, + "step": 7913 + }, + { + "epoch": 0.2183646792234175, + "grad_norm": 0.003366072429344058, + "learning_rate": 0.001, + "loss": 0.3787, + "step": 7914 + }, + { + "epoch": 0.21839227142448187, + "grad_norm": 0.003718828782439232, + "learning_rate": 0.001, + "loss": 0.4136, + "step": 7915 + }, + { + "epoch": 0.21841986362554625, + "grad_norm": 0.0029193959198892117, + "learning_rate": 0.001, + "loss": 0.3749, + "step": 7916 + }, + { + "epoch": 0.2184474558266106, + "grad_norm": 0.002285629976540804, + "learning_rate": 0.001, + "loss": 0.4077, + "step": 7917 + }, + { + "epoch": 0.21847504802767498, + "grad_norm": 0.002450938569381833, + "learning_rate": 0.001, + "loss": 0.3821, + "step": 7918 + }, + { + "epoch": 0.21850264022873936, + "grad_norm": 0.002341889077797532, + "learning_rate": 0.001, + "loss": 0.4225, + "step": 7919 + }, + { + "epoch": 0.2185302324298037, + "grad_norm": 0.003041157266125083, + "learning_rate": 0.001, + "loss": 0.4094, + "step": 7920 + }, + { + "epoch": 0.2185578246308681, + "grad_norm": 0.004077001940459013, + "learning_rate": 0.001, + "loss": 0.4035, + "step": 7921 + }, + { + "epoch": 0.21858541683193244, + "grad_norm": 0.002720333868637681, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 7922 + }, + { + "epoch": 0.21861300903299682, + "grad_norm": 0.005403697956353426, + "learning_rate": 0.001, + "loss": 0.3741, + "step": 7923 + }, + { + "epoch": 0.2186406012340612, + "grad_norm": 0.004800851922482252, + "learning_rate": 0.001, + "loss": 0.3859, + "step": 7924 + }, + { + "epoch": 0.21866819343512556, + "grad_norm": 0.003321266733109951, + "learning_rate": 0.001, + "loss": 0.4121, + "step": 7925 + }, + { + "epoch": 0.21869578563618994, + "grad_norm": 0.003606748068705201, + "learning_rate": 0.001, + "loss": 0.3809, + "step": 7926 + }, + { + "epoch": 0.2187233778372543, + "grad_norm": 0.002816277788951993, + "learning_rate": 0.001, + "loss": 0.4194, + "step": 7927 + }, + { + "epoch": 0.21875097003831867, + "grad_norm": 0.0028528219554573298, + "learning_rate": 0.001, + "loss": 0.423, + "step": 7928 + }, + { + "epoch": 0.21877856223938305, + "grad_norm": 0.00370188825763762, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 7929 + }, + { + "epoch": 0.2188061544404474, + "grad_norm": 0.00391001021489501, + "learning_rate": 0.001, + "loss": 0.357, + "step": 7930 + }, + { + "epoch": 0.21883374664151178, + "grad_norm": 0.002469925908371806, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 7931 + }, + { + "epoch": 0.21886133884257614, + "grad_norm": 0.004554017446935177, + "learning_rate": 0.001, + "loss": 0.4178, + "step": 7932 + }, + { + "epoch": 0.21888893104364052, + "grad_norm": 0.005216664168983698, + "learning_rate": 0.001, + "loss": 0.404, + "step": 7933 + }, + { + "epoch": 0.2189165232447049, + "grad_norm": 0.002773087937384844, + "learning_rate": 0.001, + "loss": 0.3708, + "step": 7934 + }, + { + "epoch": 0.21894411544576925, + "grad_norm": 0.002156183123588562, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 7935 + }, + { + "epoch": 0.21897170764683363, + "grad_norm": 0.0025752519723027945, + "learning_rate": 0.001, + "loss": 0.4342, + "step": 7936 + }, + { + "epoch": 0.21899929984789798, + "grad_norm": 0.002592964330688119, + "learning_rate": 0.001, + "loss": 0.3793, + "step": 7937 + }, + { + "epoch": 0.21902689204896236, + "grad_norm": 0.0023423507809638977, + "learning_rate": 0.001, + "loss": 0.4115, + "step": 7938 + }, + { + "epoch": 0.21905448425002674, + "grad_norm": 0.0025791767984628677, + "learning_rate": 0.001, + "loss": 0.3848, + "step": 7939 + }, + { + "epoch": 0.2190820764510911, + "grad_norm": 0.0030768734868615866, + "learning_rate": 0.001, + "loss": 0.384, + "step": 7940 + }, + { + "epoch": 0.21910966865215548, + "grad_norm": 0.0035807385575026274, + "learning_rate": 0.001, + "loss": 0.4136, + "step": 7941 + }, + { + "epoch": 0.21913726085321983, + "grad_norm": 0.0031572608277201653, + "learning_rate": 0.001, + "loss": 0.3611, + "step": 7942 + }, + { + "epoch": 0.2191648530542842, + "grad_norm": 0.0027867392636835575, + "learning_rate": 0.001, + "loss": 0.3867, + "step": 7943 + }, + { + "epoch": 0.2191924452553486, + "grad_norm": 0.0025052884593605995, + "learning_rate": 0.001, + "loss": 0.4148, + "step": 7944 + }, + { + "epoch": 0.21922003745641294, + "grad_norm": 0.0027730814181268215, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 7945 + }, + { + "epoch": 0.21924762965747732, + "grad_norm": 0.0026065954007208347, + "learning_rate": 0.001, + "loss": 0.4361, + "step": 7946 + }, + { + "epoch": 0.21927522185854167, + "grad_norm": 0.00736627820879221, + "learning_rate": 0.001, + "loss": 0.3858, + "step": 7947 + }, + { + "epoch": 0.21930281405960605, + "grad_norm": 0.003984878305345774, + "learning_rate": 0.001, + "loss": 0.4282, + "step": 7948 + }, + { + "epoch": 0.21933040626067044, + "grad_norm": 0.005414186976850033, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 7949 + }, + { + "epoch": 0.2193579984617348, + "grad_norm": 0.0026020752266049385, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 7950 + }, + { + "epoch": 0.21938559066279917, + "grad_norm": 0.0026629299391061068, + "learning_rate": 0.001, + "loss": 0.4099, + "step": 7951 + }, + { + "epoch": 0.21941318286386352, + "grad_norm": 0.0022089278791099787, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 7952 + }, + { + "epoch": 0.2194407750649279, + "grad_norm": 0.0028548568952828646, + "learning_rate": 0.001, + "loss": 0.3881, + "step": 7953 + }, + { + "epoch": 0.21946836726599225, + "grad_norm": 0.0029203901067376137, + "learning_rate": 0.001, + "loss": 0.4204, + "step": 7954 + }, + { + "epoch": 0.21949595946705663, + "grad_norm": 0.0032568967435508966, + "learning_rate": 0.001, + "loss": 0.4321, + "step": 7955 + }, + { + "epoch": 0.21952355166812101, + "grad_norm": 0.005700434558093548, + "learning_rate": 0.001, + "loss": 0.3849, + "step": 7956 + }, + { + "epoch": 0.21955114386918537, + "grad_norm": 0.0026754315476864576, + "learning_rate": 0.001, + "loss": 0.4113, + "step": 7957 + }, + { + "epoch": 0.21957873607024975, + "grad_norm": 0.004874562378972769, + "learning_rate": 0.001, + "loss": 0.3982, + "step": 7958 + }, + { + "epoch": 0.2196063282713141, + "grad_norm": 0.0051938071846961975, + "learning_rate": 0.001, + "loss": 0.412, + "step": 7959 + }, + { + "epoch": 0.21963392047237848, + "grad_norm": 0.005463124252855778, + "learning_rate": 0.001, + "loss": 0.392, + "step": 7960 + }, + { + "epoch": 0.21966151267344286, + "grad_norm": 0.0026376366149634123, + "learning_rate": 0.001, + "loss": 0.3492, + "step": 7961 + }, + { + "epoch": 0.2196891048745072, + "grad_norm": 0.0029911526944488287, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 7962 + }, + { + "epoch": 0.2197166970755716, + "grad_norm": 0.002725818660110235, + "learning_rate": 0.001, + "loss": 0.3863, + "step": 7963 + }, + { + "epoch": 0.21974428927663595, + "grad_norm": 0.0030495068058371544, + "learning_rate": 0.001, + "loss": 0.4246, + "step": 7964 + }, + { + "epoch": 0.21977188147770033, + "grad_norm": 0.0032879766076803207, + "learning_rate": 0.001, + "loss": 0.3787, + "step": 7965 + }, + { + "epoch": 0.2197994736787647, + "grad_norm": 0.005279941018670797, + "learning_rate": 0.001, + "loss": 0.3665, + "step": 7966 + }, + { + "epoch": 0.21982706587982906, + "grad_norm": 0.004807317163795233, + "learning_rate": 0.001, + "loss": 0.4117, + "step": 7967 + }, + { + "epoch": 0.21985465808089344, + "grad_norm": 0.004063909407705069, + "learning_rate": 0.001, + "loss": 0.38, + "step": 7968 + }, + { + "epoch": 0.2198822502819578, + "grad_norm": 0.002503841184079647, + "learning_rate": 0.001, + "loss": 0.3815, + "step": 7969 + }, + { + "epoch": 0.21990984248302217, + "grad_norm": 0.002414180664345622, + "learning_rate": 0.001, + "loss": 0.3916, + "step": 7970 + }, + { + "epoch": 0.21993743468408655, + "grad_norm": 0.004260845948010683, + "learning_rate": 0.001, + "loss": 0.3956, + "step": 7971 + }, + { + "epoch": 0.2199650268851509, + "grad_norm": 0.003588800085708499, + "learning_rate": 0.001, + "loss": 0.3539, + "step": 7972 + }, + { + "epoch": 0.21999261908621529, + "grad_norm": 0.01770518720149994, + "learning_rate": 0.001, + "loss": 0.3468, + "step": 7973 + }, + { + "epoch": 0.22002021128727964, + "grad_norm": 0.004121637437492609, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 7974 + }, + { + "epoch": 0.22004780348834402, + "grad_norm": 0.0025279296096414328, + "learning_rate": 0.001, + "loss": 0.4385, + "step": 7975 + }, + { + "epoch": 0.2200753956894084, + "grad_norm": 0.0031378697603940964, + "learning_rate": 0.001, + "loss": 0.4171, + "step": 7976 + }, + { + "epoch": 0.22010298789047275, + "grad_norm": 0.006243572104722261, + "learning_rate": 0.001, + "loss": 0.3851, + "step": 7977 + }, + { + "epoch": 0.22013058009153713, + "grad_norm": 0.005349922459572554, + "learning_rate": 0.001, + "loss": 0.3876, + "step": 7978 + }, + { + "epoch": 0.22015817229260148, + "grad_norm": 0.004400680772960186, + "learning_rate": 0.001, + "loss": 0.399, + "step": 7979 + }, + { + "epoch": 0.22018576449366586, + "grad_norm": 0.005931714083999395, + "learning_rate": 0.001, + "loss": 0.3864, + "step": 7980 + }, + { + "epoch": 0.22021335669473024, + "grad_norm": 0.006180520635098219, + "learning_rate": 0.001, + "loss": 0.3868, + "step": 7981 + }, + { + "epoch": 0.2202409488957946, + "grad_norm": 0.002432264154776931, + "learning_rate": 0.001, + "loss": 0.3742, + "step": 7982 + }, + { + "epoch": 0.22026854109685898, + "grad_norm": 0.002539379522204399, + "learning_rate": 0.001, + "loss": 0.4761, + "step": 7983 + }, + { + "epoch": 0.22029613329792333, + "grad_norm": 0.0032303177285939455, + "learning_rate": 0.001, + "loss": 0.4102, + "step": 7984 + }, + { + "epoch": 0.2203237254989877, + "grad_norm": 0.004244436044245958, + "learning_rate": 0.001, + "loss": 0.3844, + "step": 7985 + }, + { + "epoch": 0.2203513177000521, + "grad_norm": 0.003201343584805727, + "learning_rate": 0.001, + "loss": 0.4399, + "step": 7986 + }, + { + "epoch": 0.22037890990111644, + "grad_norm": 0.004487950354814529, + "learning_rate": 0.001, + "loss": 0.4543, + "step": 7987 + }, + { + "epoch": 0.22040650210218082, + "grad_norm": 0.002907637506723404, + "learning_rate": 0.001, + "loss": 0.4088, + "step": 7988 + }, + { + "epoch": 0.22043409430324518, + "grad_norm": 0.002862214343622327, + "learning_rate": 0.001, + "loss": 0.4378, + "step": 7989 + }, + { + "epoch": 0.22046168650430956, + "grad_norm": 0.011252621188759804, + "learning_rate": 0.001, + "loss": 0.4219, + "step": 7990 + }, + { + "epoch": 0.22048927870537394, + "grad_norm": 0.002715484006330371, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 7991 + }, + { + "epoch": 0.2205168709064383, + "grad_norm": 0.0029305212665349245, + "learning_rate": 0.001, + "loss": 0.4262, + "step": 7992 + }, + { + "epoch": 0.22054446310750267, + "grad_norm": 0.0029706000350415707, + "learning_rate": 0.001, + "loss": 0.348, + "step": 7993 + }, + { + "epoch": 0.22057205530856702, + "grad_norm": 0.00246282946318388, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 7994 + }, + { + "epoch": 0.2205996475096314, + "grad_norm": 0.0036219728644937277, + "learning_rate": 0.001, + "loss": 0.3819, + "step": 7995 + }, + { + "epoch": 0.22062723971069578, + "grad_norm": 0.0028112917207181454, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 7996 + }, + { + "epoch": 0.22065483191176014, + "grad_norm": 0.005280990153551102, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 7997 + }, + { + "epoch": 0.22068242411282452, + "grad_norm": 0.004479492083191872, + "learning_rate": 0.001, + "loss": 0.3864, + "step": 7998 + }, + { + "epoch": 0.22071001631388887, + "grad_norm": 0.0030434499494731426, + "learning_rate": 0.001, + "loss": 0.3753, + "step": 7999 + }, + { + "epoch": 0.22073760851495325, + "grad_norm": 0.016081584617495537, + "learning_rate": 0.001, + "loss": 0.3885, + "step": 8000 + }, + { + "epoch": 0.22073760851495325, + "eval_runtime": 24.424, + "eval_samples_per_second": 1.31, + "eval_steps_per_second": 0.164, + "step": 8000 + }, + { + "epoch": 0.22076520071601763, + "grad_norm": 0.004563974682241678, + "learning_rate": 0.001, + "loss": 0.3883, + "step": 8001 + }, + { + "epoch": 0.22079279291708198, + "grad_norm": 0.002752038650214672, + "learning_rate": 0.001, + "loss": 0.3746, + "step": 8002 + }, + { + "epoch": 0.22082038511814636, + "grad_norm": 0.0030077442061156034, + "learning_rate": 0.001, + "loss": 0.403, + "step": 8003 + }, + { + "epoch": 0.22084797731921071, + "grad_norm": 0.0024848515167832375, + "learning_rate": 0.001, + "loss": 0.387, + "step": 8004 + }, + { + "epoch": 0.2208755695202751, + "grad_norm": 0.0023180947173386812, + "learning_rate": 0.001, + "loss": 0.4063, + "step": 8005 + }, + { + "epoch": 0.22090316172133947, + "grad_norm": 0.0037670242600142956, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 8006 + }, + { + "epoch": 0.22093075392240383, + "grad_norm": 0.003507862566038966, + "learning_rate": 0.001, + "loss": 0.3705, + "step": 8007 + }, + { + "epoch": 0.2209583461234682, + "grad_norm": 0.004687591455876827, + "learning_rate": 0.001, + "loss": 0.3816, + "step": 8008 + }, + { + "epoch": 0.22098593832453256, + "grad_norm": 0.002483742544427514, + "learning_rate": 0.001, + "loss": 0.4235, + "step": 8009 + }, + { + "epoch": 0.22101353052559694, + "grad_norm": 0.004836369771510363, + "learning_rate": 0.001, + "loss": 0.4485, + "step": 8010 + }, + { + "epoch": 0.22104112272666132, + "grad_norm": 0.003374388674274087, + "learning_rate": 0.001, + "loss": 0.398, + "step": 8011 + }, + { + "epoch": 0.22106871492772567, + "grad_norm": 0.004700181540101767, + "learning_rate": 0.001, + "loss": 0.3449, + "step": 8012 + }, + { + "epoch": 0.22109630712879005, + "grad_norm": 0.011050062254071236, + "learning_rate": 0.001, + "loss": 0.3755, + "step": 8013 + }, + { + "epoch": 0.2211238993298544, + "grad_norm": 0.0021428498439490795, + "learning_rate": 0.001, + "loss": 0.3979, + "step": 8014 + }, + { + "epoch": 0.2211514915309188, + "grad_norm": 0.00233366503380239, + "learning_rate": 0.001, + "loss": 0.4079, + "step": 8015 + }, + { + "epoch": 0.22117908373198317, + "grad_norm": 0.0025947142858058214, + "learning_rate": 0.001, + "loss": 0.4434, + "step": 8016 + }, + { + "epoch": 0.22120667593304752, + "grad_norm": 0.00214517954736948, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 8017 + }, + { + "epoch": 0.2212342681341119, + "grad_norm": 0.003991144709289074, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 8018 + }, + { + "epoch": 0.22126186033517625, + "grad_norm": 0.003229195484891534, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 8019 + }, + { + "epoch": 0.22128945253624063, + "grad_norm": 0.002979196375235915, + "learning_rate": 0.001, + "loss": 0.4029, + "step": 8020 + }, + { + "epoch": 0.221317044737305, + "grad_norm": 0.004278901033103466, + "learning_rate": 0.001, + "loss": 0.4478, + "step": 8021 + }, + { + "epoch": 0.22134463693836937, + "grad_norm": 0.007357322610914707, + "learning_rate": 0.001, + "loss": 0.4118, + "step": 8022 + }, + { + "epoch": 0.22137222913943375, + "grad_norm": 0.0035520815290510654, + "learning_rate": 0.001, + "loss": 0.368, + "step": 8023 + }, + { + "epoch": 0.2213998213404981, + "grad_norm": 0.004246349446475506, + "learning_rate": 0.001, + "loss": 0.3497, + "step": 8024 + }, + { + "epoch": 0.22142741354156248, + "grad_norm": 0.0035226894542574883, + "learning_rate": 0.001, + "loss": 0.3883, + "step": 8025 + }, + { + "epoch": 0.22145500574262686, + "grad_norm": 0.0033106855116784573, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 8026 + }, + { + "epoch": 0.2214825979436912, + "grad_norm": 0.004675147123634815, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 8027 + }, + { + "epoch": 0.2215101901447556, + "grad_norm": 0.004575135186314583, + "learning_rate": 0.001, + "loss": 0.3664, + "step": 8028 + }, + { + "epoch": 0.22153778234581994, + "grad_norm": 0.002557947300374508, + "learning_rate": 0.001, + "loss": 0.4291, + "step": 8029 + }, + { + "epoch": 0.22156537454688432, + "grad_norm": 0.0031390858348459005, + "learning_rate": 0.001, + "loss": 0.454, + "step": 8030 + }, + { + "epoch": 0.2215929667479487, + "grad_norm": 0.004276024177670479, + "learning_rate": 0.001, + "loss": 0.3828, + "step": 8031 + }, + { + "epoch": 0.22162055894901306, + "grad_norm": 0.00613689050078392, + "learning_rate": 0.001, + "loss": 0.3498, + "step": 8032 + }, + { + "epoch": 0.22164815115007744, + "grad_norm": 0.0052116867154836655, + "learning_rate": 0.001, + "loss": 0.4038, + "step": 8033 + }, + { + "epoch": 0.2216757433511418, + "grad_norm": 0.0032780475448817015, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 8034 + }, + { + "epoch": 0.22170333555220617, + "grad_norm": 0.003026155522093177, + "learning_rate": 0.001, + "loss": 0.4079, + "step": 8035 + }, + { + "epoch": 0.22173092775327055, + "grad_norm": 0.0075735533609986305, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 8036 + }, + { + "epoch": 0.2217585199543349, + "grad_norm": 0.010422303341329098, + "learning_rate": 0.001, + "loss": 0.3969, + "step": 8037 + }, + { + "epoch": 0.22178611215539928, + "grad_norm": 0.009534427896142006, + "learning_rate": 0.001, + "loss": 0.3807, + "step": 8038 + }, + { + "epoch": 0.22181370435646364, + "grad_norm": 0.004428492859005928, + "learning_rate": 0.001, + "loss": 0.3755, + "step": 8039 + }, + { + "epoch": 0.22184129655752802, + "grad_norm": 0.006164777558296919, + "learning_rate": 0.001, + "loss": 0.3625, + "step": 8040 + }, + { + "epoch": 0.2218688887585924, + "grad_norm": 0.003359799971804023, + "learning_rate": 0.001, + "loss": 0.3913, + "step": 8041 + }, + { + "epoch": 0.22189648095965675, + "grad_norm": 0.006137318443506956, + "learning_rate": 0.001, + "loss": 0.3905, + "step": 8042 + }, + { + "epoch": 0.22192407316072113, + "grad_norm": 0.002756676636636257, + "learning_rate": 0.001, + "loss": 0.4066, + "step": 8043 + }, + { + "epoch": 0.22195166536178548, + "grad_norm": 0.004392385482788086, + "learning_rate": 0.001, + "loss": 0.3889, + "step": 8044 + }, + { + "epoch": 0.22197925756284986, + "grad_norm": 0.002814218634739518, + "learning_rate": 0.001, + "loss": 0.4143, + "step": 8045 + }, + { + "epoch": 0.22200684976391422, + "grad_norm": 0.002874415833503008, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 8046 + }, + { + "epoch": 0.2220344419649786, + "grad_norm": 0.005475836340337992, + "learning_rate": 0.001, + "loss": 0.3815, + "step": 8047 + }, + { + "epoch": 0.22206203416604298, + "grad_norm": 0.002302660373970866, + "learning_rate": 0.001, + "loss": 0.3921, + "step": 8048 + }, + { + "epoch": 0.22208962636710733, + "grad_norm": 0.003013992914929986, + "learning_rate": 0.001, + "loss": 0.3822, + "step": 8049 + }, + { + "epoch": 0.2221172185681717, + "grad_norm": 0.0038887159898877144, + "learning_rate": 0.001, + "loss": 0.3634, + "step": 8050 + }, + { + "epoch": 0.22214481076923606, + "grad_norm": 0.004871399141848087, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 8051 + }, + { + "epoch": 0.22217240297030044, + "grad_norm": 0.002852171426638961, + "learning_rate": 0.001, + "loss": 0.4119, + "step": 8052 + }, + { + "epoch": 0.22219999517136482, + "grad_norm": 0.0025132293812930584, + "learning_rate": 0.001, + "loss": 0.4383, + "step": 8053 + }, + { + "epoch": 0.22222758737242918, + "grad_norm": 0.002570350421592593, + "learning_rate": 0.001, + "loss": 0.4199, + "step": 8054 + }, + { + "epoch": 0.22225517957349356, + "grad_norm": 0.005364352371543646, + "learning_rate": 0.001, + "loss": 0.3952, + "step": 8055 + }, + { + "epoch": 0.2222827717745579, + "grad_norm": 0.0030977013520896435, + "learning_rate": 0.001, + "loss": 0.3853, + "step": 8056 + }, + { + "epoch": 0.2223103639756223, + "grad_norm": 0.0027198875322937965, + "learning_rate": 0.001, + "loss": 0.4575, + "step": 8057 + }, + { + "epoch": 0.22233795617668667, + "grad_norm": 0.011028733104467392, + "learning_rate": 0.001, + "loss": 0.3959, + "step": 8058 + }, + { + "epoch": 0.22236554837775102, + "grad_norm": 0.0037885240744799376, + "learning_rate": 0.001, + "loss": 0.4189, + "step": 8059 + }, + { + "epoch": 0.2223931405788154, + "grad_norm": 0.005188632290810347, + "learning_rate": 0.001, + "loss": 0.3937, + "step": 8060 + }, + { + "epoch": 0.22242073277987975, + "grad_norm": 0.003573206951841712, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 8061 + }, + { + "epoch": 0.22244832498094413, + "grad_norm": 0.0036306334659457207, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 8062 + }, + { + "epoch": 0.22247591718200851, + "grad_norm": 0.0033542602322995663, + "learning_rate": 0.001, + "loss": 0.3792, + "step": 8063 + }, + { + "epoch": 0.22250350938307287, + "grad_norm": 0.004111922346055508, + "learning_rate": 0.001, + "loss": 0.424, + "step": 8064 + }, + { + "epoch": 0.22253110158413725, + "grad_norm": 0.004905781242996454, + "learning_rate": 0.001, + "loss": 0.4319, + "step": 8065 + }, + { + "epoch": 0.2225586937852016, + "grad_norm": 0.009073971770703793, + "learning_rate": 0.001, + "loss": 0.3574, + "step": 8066 + }, + { + "epoch": 0.22258628598626598, + "grad_norm": 0.0031212870962917805, + "learning_rate": 0.001, + "loss": 0.4205, + "step": 8067 + }, + { + "epoch": 0.22261387818733036, + "grad_norm": 0.0061768037267029285, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 8068 + }, + { + "epoch": 0.2226414703883947, + "grad_norm": 0.004057316109538078, + "learning_rate": 0.001, + "loss": 0.4284, + "step": 8069 + }, + { + "epoch": 0.2226690625894591, + "grad_norm": 0.002062835730612278, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 8070 + }, + { + "epoch": 0.22269665479052345, + "grad_norm": 0.010553326457738876, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 8071 + }, + { + "epoch": 0.22272424699158783, + "grad_norm": 0.004648920148611069, + "learning_rate": 0.001, + "loss": 0.3822, + "step": 8072 + }, + { + "epoch": 0.2227518391926522, + "grad_norm": 0.002884708344936371, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 8073 + }, + { + "epoch": 0.22277943139371656, + "grad_norm": 0.002019941108301282, + "learning_rate": 0.001, + "loss": 0.4259, + "step": 8074 + }, + { + "epoch": 0.22280702359478094, + "grad_norm": 0.004767908249050379, + "learning_rate": 0.001, + "loss": 0.423, + "step": 8075 + }, + { + "epoch": 0.2228346157958453, + "grad_norm": 0.003971953876316547, + "learning_rate": 0.001, + "loss": 0.3976, + "step": 8076 + }, + { + "epoch": 0.22286220799690967, + "grad_norm": 0.004503239411860704, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 8077 + }, + { + "epoch": 0.22288980019797405, + "grad_norm": 0.002689438173547387, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 8078 + }, + { + "epoch": 0.2229173923990384, + "grad_norm": 0.003111765021458268, + "learning_rate": 0.001, + "loss": 0.3963, + "step": 8079 + }, + { + "epoch": 0.22294498460010279, + "grad_norm": 0.003488035872578621, + "learning_rate": 0.001, + "loss": 0.3488, + "step": 8080 + }, + { + "epoch": 0.22297257680116714, + "grad_norm": 0.004513781983405352, + "learning_rate": 0.001, + "loss": 0.3627, + "step": 8081 + }, + { + "epoch": 0.22300016900223152, + "grad_norm": 0.0029607918113470078, + "learning_rate": 0.001, + "loss": 0.4006, + "step": 8082 + }, + { + "epoch": 0.2230277612032959, + "grad_norm": 0.0026686247438192368, + "learning_rate": 0.001, + "loss": 0.4311, + "step": 8083 + }, + { + "epoch": 0.22305535340436025, + "grad_norm": 0.003172766650095582, + "learning_rate": 0.001, + "loss": 0.4155, + "step": 8084 + }, + { + "epoch": 0.22308294560542463, + "grad_norm": 0.002642567502334714, + "learning_rate": 0.001, + "loss": 0.4093, + "step": 8085 + }, + { + "epoch": 0.22311053780648898, + "grad_norm": 0.002758550923317671, + "learning_rate": 0.001, + "loss": 0.3962, + "step": 8086 + }, + { + "epoch": 0.22313813000755336, + "grad_norm": 0.004465687554329634, + "learning_rate": 0.001, + "loss": 0.3925, + "step": 8087 + }, + { + "epoch": 0.22316572220861775, + "grad_norm": 0.003773334203287959, + "learning_rate": 0.001, + "loss": 0.3736, + "step": 8088 + }, + { + "epoch": 0.2231933144096821, + "grad_norm": 0.0027832011692225933, + "learning_rate": 0.001, + "loss": 0.3967, + "step": 8089 + }, + { + "epoch": 0.22322090661074648, + "grad_norm": 0.00280310888774693, + "learning_rate": 0.001, + "loss": 0.3727, + "step": 8090 + }, + { + "epoch": 0.22324849881181083, + "grad_norm": 0.003244071500375867, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 8091 + }, + { + "epoch": 0.2232760910128752, + "grad_norm": 0.0024930760264396667, + "learning_rate": 0.001, + "loss": 0.4322, + "step": 8092 + }, + { + "epoch": 0.2233036832139396, + "grad_norm": 0.002686891006305814, + "learning_rate": 0.001, + "loss": 0.3844, + "step": 8093 + }, + { + "epoch": 0.22333127541500394, + "grad_norm": 0.00237724045291543, + "learning_rate": 0.001, + "loss": 0.437, + "step": 8094 + }, + { + "epoch": 0.22335886761606832, + "grad_norm": 0.004243023227900267, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 8095 + }, + { + "epoch": 0.22338645981713268, + "grad_norm": 0.004333519376814365, + "learning_rate": 0.001, + "loss": 0.4245, + "step": 8096 + }, + { + "epoch": 0.22341405201819706, + "grad_norm": 0.0025725301820784807, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 8097 + }, + { + "epoch": 0.22344164421926144, + "grad_norm": 0.004863837733864784, + "learning_rate": 0.001, + "loss": 0.4261, + "step": 8098 + }, + { + "epoch": 0.2234692364203258, + "grad_norm": 0.0025464182253926992, + "learning_rate": 0.001, + "loss": 0.412, + "step": 8099 + }, + { + "epoch": 0.22349682862139017, + "grad_norm": 0.003454705933108926, + "learning_rate": 0.001, + "loss": 0.3631, + "step": 8100 + }, + { + "epoch": 0.22352442082245452, + "grad_norm": 0.004076403100043535, + "learning_rate": 0.001, + "loss": 0.4161, + "step": 8101 + }, + { + "epoch": 0.2235520130235189, + "grad_norm": 0.002538996748626232, + "learning_rate": 0.001, + "loss": 0.4246, + "step": 8102 + }, + { + "epoch": 0.22357960522458328, + "grad_norm": 0.002853696933016181, + "learning_rate": 0.001, + "loss": 0.3856, + "step": 8103 + }, + { + "epoch": 0.22360719742564764, + "grad_norm": 0.005618890281766653, + "learning_rate": 0.001, + "loss": 0.4101, + "step": 8104 + }, + { + "epoch": 0.22363478962671202, + "grad_norm": 0.0023889478761702776, + "learning_rate": 0.001, + "loss": 0.407, + "step": 8105 + }, + { + "epoch": 0.22366238182777637, + "grad_norm": 0.0049384357407689095, + "learning_rate": 0.001, + "loss": 0.3995, + "step": 8106 + }, + { + "epoch": 0.22368997402884075, + "grad_norm": 0.0029198869597166777, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 8107 + }, + { + "epoch": 0.22371756622990513, + "grad_norm": 0.00721718929708004, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 8108 + }, + { + "epoch": 0.22374515843096948, + "grad_norm": 0.0038066962733864784, + "learning_rate": 0.001, + "loss": 0.3987, + "step": 8109 + }, + { + "epoch": 0.22377275063203386, + "grad_norm": 0.0029343971982598305, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 8110 + }, + { + "epoch": 0.22380034283309821, + "grad_norm": 0.004783669952303171, + "learning_rate": 0.001, + "loss": 0.4032, + "step": 8111 + }, + { + "epoch": 0.2238279350341626, + "grad_norm": 0.005818706471472979, + "learning_rate": 0.001, + "loss": 0.3737, + "step": 8112 + }, + { + "epoch": 0.22385552723522698, + "grad_norm": 0.0027874866500496864, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 8113 + }, + { + "epoch": 0.22388311943629133, + "grad_norm": 0.006304432172328234, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 8114 + }, + { + "epoch": 0.2239107116373557, + "grad_norm": 0.006212018895894289, + "learning_rate": 0.001, + "loss": 0.3632, + "step": 8115 + }, + { + "epoch": 0.22393830383842006, + "grad_norm": 0.0021557544823735952, + "learning_rate": 0.001, + "loss": 0.4262, + "step": 8116 + }, + { + "epoch": 0.22396589603948444, + "grad_norm": 0.0035563178826123476, + "learning_rate": 0.001, + "loss": 0.4008, + "step": 8117 + }, + { + "epoch": 0.22399348824054882, + "grad_norm": 0.002528556389734149, + "learning_rate": 0.001, + "loss": 0.3833, + "step": 8118 + }, + { + "epoch": 0.22402108044161317, + "grad_norm": 0.0022855051793158054, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 8119 + }, + { + "epoch": 0.22404867264267755, + "grad_norm": 0.0027871252968907356, + "learning_rate": 0.001, + "loss": 0.3744, + "step": 8120 + }, + { + "epoch": 0.2240762648437419, + "grad_norm": 0.003169616684317589, + "learning_rate": 0.001, + "loss": 0.3901, + "step": 8121 + }, + { + "epoch": 0.2241038570448063, + "grad_norm": 0.003899726551026106, + "learning_rate": 0.001, + "loss": 0.4177, + "step": 8122 + }, + { + "epoch": 0.22413144924587067, + "grad_norm": 0.0040968372486531734, + "learning_rate": 0.001, + "loss": 0.3875, + "step": 8123 + }, + { + "epoch": 0.22415904144693502, + "grad_norm": 0.002130861859768629, + "learning_rate": 0.001, + "loss": 0.4066, + "step": 8124 + }, + { + "epoch": 0.2241866336479994, + "grad_norm": 0.0030744632240384817, + "learning_rate": 0.001, + "loss": 0.3703, + "step": 8125 + }, + { + "epoch": 0.22421422584906375, + "grad_norm": 0.0023388941772282124, + "learning_rate": 0.001, + "loss": 0.428, + "step": 8126 + }, + { + "epoch": 0.22424181805012813, + "grad_norm": 0.005775371100753546, + "learning_rate": 0.001, + "loss": 0.3984, + "step": 8127 + }, + { + "epoch": 0.2242694102511925, + "grad_norm": 0.002374644624069333, + "learning_rate": 0.001, + "loss": 0.4077, + "step": 8128 + }, + { + "epoch": 0.22429700245225687, + "grad_norm": 0.0058793784119188786, + "learning_rate": 0.001, + "loss": 0.3693, + "step": 8129 + }, + { + "epoch": 0.22432459465332125, + "grad_norm": 0.0027781794779002666, + "learning_rate": 0.001, + "loss": 0.4153, + "step": 8130 + }, + { + "epoch": 0.2243521868543856, + "grad_norm": 0.004530200269073248, + "learning_rate": 0.001, + "loss": 0.3909, + "step": 8131 + }, + { + "epoch": 0.22437977905544998, + "grad_norm": 0.002897880505770445, + "learning_rate": 0.001, + "loss": 0.3793, + "step": 8132 + }, + { + "epoch": 0.22440737125651436, + "grad_norm": 0.0025317175313830376, + "learning_rate": 0.001, + "loss": 0.398, + "step": 8133 + }, + { + "epoch": 0.2244349634575787, + "grad_norm": 0.004969809204339981, + "learning_rate": 0.001, + "loss": 0.3771, + "step": 8134 + }, + { + "epoch": 0.2244625556586431, + "grad_norm": 0.019586963579058647, + "learning_rate": 0.001, + "loss": 0.4059, + "step": 8135 + }, + { + "epoch": 0.22449014785970745, + "grad_norm": 0.003650497179478407, + "learning_rate": 0.001, + "loss": 0.4429, + "step": 8136 + }, + { + "epoch": 0.22451774006077183, + "grad_norm": 0.004681742750108242, + "learning_rate": 0.001, + "loss": 0.3727, + "step": 8137 + }, + { + "epoch": 0.2245453322618362, + "grad_norm": 0.0019167111022397876, + "learning_rate": 0.001, + "loss": 0.4284, + "step": 8138 + }, + { + "epoch": 0.22457292446290056, + "grad_norm": 0.002770308405160904, + "learning_rate": 0.001, + "loss": 0.393, + "step": 8139 + }, + { + "epoch": 0.22460051666396494, + "grad_norm": 0.002731415443122387, + "learning_rate": 0.001, + "loss": 0.373, + "step": 8140 + }, + { + "epoch": 0.2246281088650293, + "grad_norm": 0.0024662816431373358, + "learning_rate": 0.001, + "loss": 0.3751, + "step": 8141 + }, + { + "epoch": 0.22465570106609367, + "grad_norm": 0.00265320367179811, + "learning_rate": 0.001, + "loss": 0.3967, + "step": 8142 + }, + { + "epoch": 0.22468329326715802, + "grad_norm": 0.0031822596210986376, + "learning_rate": 0.001, + "loss": 0.3825, + "step": 8143 + }, + { + "epoch": 0.2247108854682224, + "grad_norm": 0.005078164394944906, + "learning_rate": 0.001, + "loss": 0.3827, + "step": 8144 + }, + { + "epoch": 0.22473847766928678, + "grad_norm": 0.003918538335710764, + "learning_rate": 0.001, + "loss": 0.3767, + "step": 8145 + }, + { + "epoch": 0.22476606987035114, + "grad_norm": 0.0030788714066147804, + "learning_rate": 0.001, + "loss": 0.4253, + "step": 8146 + }, + { + "epoch": 0.22479366207141552, + "grad_norm": 0.0029016491025686264, + "learning_rate": 0.001, + "loss": 0.4175, + "step": 8147 + }, + { + "epoch": 0.22482125427247987, + "grad_norm": 0.0023613544180989265, + "learning_rate": 0.001, + "loss": 0.4157, + "step": 8148 + }, + { + "epoch": 0.22484884647354425, + "grad_norm": 0.0021770396269857883, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 8149 + }, + { + "epoch": 0.22487643867460863, + "grad_norm": 0.004851729609072208, + "learning_rate": 0.001, + "loss": 0.3996, + "step": 8150 + }, + { + "epoch": 0.22490403087567298, + "grad_norm": 0.0023205161560326815, + "learning_rate": 0.001, + "loss": 0.4151, + "step": 8151 + }, + { + "epoch": 0.22493162307673736, + "grad_norm": 0.0027498926501721144, + "learning_rate": 0.001, + "loss": 0.3587, + "step": 8152 + }, + { + "epoch": 0.22495921527780172, + "grad_norm": 0.004693325143307447, + "learning_rate": 0.001, + "loss": 0.3944, + "step": 8153 + }, + { + "epoch": 0.2249868074788661, + "grad_norm": 0.0030215794686228037, + "learning_rate": 0.001, + "loss": 0.4149, + "step": 8154 + }, + { + "epoch": 0.22501439967993048, + "grad_norm": 0.0024281737860292196, + "learning_rate": 0.001, + "loss": 0.4235, + "step": 8155 + }, + { + "epoch": 0.22504199188099483, + "grad_norm": 0.006052395328879356, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 8156 + }, + { + "epoch": 0.2250695840820592, + "grad_norm": 0.005301719065755606, + "learning_rate": 0.001, + "loss": 0.3867, + "step": 8157 + }, + { + "epoch": 0.22509717628312356, + "grad_norm": 0.0024900834541767836, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 8158 + }, + { + "epoch": 0.22512476848418794, + "grad_norm": 0.00321084912866354, + "learning_rate": 0.001, + "loss": 0.3843, + "step": 8159 + }, + { + "epoch": 0.22515236068525232, + "grad_norm": 0.002374766394495964, + "learning_rate": 0.001, + "loss": 0.3948, + "step": 8160 + }, + { + "epoch": 0.22517995288631668, + "grad_norm": 0.002608582377433777, + "learning_rate": 0.001, + "loss": 0.409, + "step": 8161 + }, + { + "epoch": 0.22520754508738106, + "grad_norm": 0.0029289585072547197, + "learning_rate": 0.001, + "loss": 0.3982, + "step": 8162 + }, + { + "epoch": 0.2252351372884454, + "grad_norm": 0.0028822184540331364, + "learning_rate": 0.001, + "loss": 0.3814, + "step": 8163 + }, + { + "epoch": 0.2252627294895098, + "grad_norm": 0.004350913688540459, + "learning_rate": 0.001, + "loss": 0.3946, + "step": 8164 + }, + { + "epoch": 0.22529032169057417, + "grad_norm": 0.0049639069475233555, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 8165 + }, + { + "epoch": 0.22531791389163852, + "grad_norm": 0.005849256180226803, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 8166 + }, + { + "epoch": 0.2253455060927029, + "grad_norm": 0.007307564374059439, + "learning_rate": 0.001, + "loss": 0.4227, + "step": 8167 + }, + { + "epoch": 0.22537309829376725, + "grad_norm": 0.0058277989737689495, + "learning_rate": 0.001, + "loss": 0.4153, + "step": 8168 + }, + { + "epoch": 0.22540069049483163, + "grad_norm": 0.0024729755241423845, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 8169 + }, + { + "epoch": 0.22542828269589602, + "grad_norm": 0.0030476911924779415, + "learning_rate": 0.001, + "loss": 0.4271, + "step": 8170 + }, + { + "epoch": 0.22545587489696037, + "grad_norm": 0.003712064353749156, + "learning_rate": 0.001, + "loss": 0.405, + "step": 8171 + }, + { + "epoch": 0.22548346709802475, + "grad_norm": 0.0031436847057193518, + "learning_rate": 0.001, + "loss": 0.3763, + "step": 8172 + }, + { + "epoch": 0.2255110592990891, + "grad_norm": 0.002069181762635708, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 8173 + }, + { + "epoch": 0.22553865150015348, + "grad_norm": 0.0023455459158867598, + "learning_rate": 0.001, + "loss": 0.4161, + "step": 8174 + }, + { + "epoch": 0.22556624370121786, + "grad_norm": 0.005130277015268803, + "learning_rate": 0.001, + "loss": 0.3735, + "step": 8175 + }, + { + "epoch": 0.2255938359022822, + "grad_norm": 0.002300563734024763, + "learning_rate": 0.001, + "loss": 0.4008, + "step": 8176 + }, + { + "epoch": 0.2256214281033466, + "grad_norm": 0.002703635022044182, + "learning_rate": 0.001, + "loss": 0.4045, + "step": 8177 + }, + { + "epoch": 0.22564902030441095, + "grad_norm": 0.00716767180711031, + "learning_rate": 0.001, + "loss": 0.4063, + "step": 8178 + }, + { + "epoch": 0.22567661250547533, + "grad_norm": 0.002465134486556053, + "learning_rate": 0.001, + "loss": 0.4011, + "step": 8179 + }, + { + "epoch": 0.2257042047065397, + "grad_norm": 0.004032150376588106, + "learning_rate": 0.001, + "loss": 0.4256, + "step": 8180 + }, + { + "epoch": 0.22573179690760406, + "grad_norm": 0.0025379545986652374, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 8181 + }, + { + "epoch": 0.22575938910866844, + "grad_norm": 0.0032701275777071714, + "learning_rate": 0.001, + "loss": 0.4011, + "step": 8182 + }, + { + "epoch": 0.2257869813097328, + "grad_norm": 0.002877070102840662, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 8183 + }, + { + "epoch": 0.22581457351079717, + "grad_norm": 0.003329310566186905, + "learning_rate": 0.001, + "loss": 0.4441, + "step": 8184 + }, + { + "epoch": 0.22584216571186155, + "grad_norm": 0.003345692064613104, + "learning_rate": 0.001, + "loss": 0.3866, + "step": 8185 + }, + { + "epoch": 0.2258697579129259, + "grad_norm": 0.002473951783031225, + "learning_rate": 0.001, + "loss": 0.4265, + "step": 8186 + }, + { + "epoch": 0.2258973501139903, + "grad_norm": 0.0027697773184627295, + "learning_rate": 0.001, + "loss": 0.4272, + "step": 8187 + }, + { + "epoch": 0.22592494231505464, + "grad_norm": 0.00608485285192728, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 8188 + }, + { + "epoch": 0.22595253451611902, + "grad_norm": 0.0026021813973784447, + "learning_rate": 0.001, + "loss": 0.3898, + "step": 8189 + }, + { + "epoch": 0.2259801267171834, + "grad_norm": 0.003049626713618636, + "learning_rate": 0.001, + "loss": 0.3972, + "step": 8190 + }, + { + "epoch": 0.22600771891824775, + "grad_norm": 0.0028316755779087543, + "learning_rate": 0.001, + "loss": 0.402, + "step": 8191 + }, + { + "epoch": 0.22603531111931213, + "grad_norm": 0.0024335982743650675, + "learning_rate": 0.001, + "loss": 0.3873, + "step": 8192 + }, + { + "epoch": 0.22606290332037648, + "grad_norm": 0.0019406350329518318, + "learning_rate": 0.001, + "loss": 0.3972, + "step": 8193 + }, + { + "epoch": 0.22609049552144087, + "grad_norm": 0.0031156607437878847, + "learning_rate": 0.001, + "loss": 0.4109, + "step": 8194 + }, + { + "epoch": 0.22611808772250525, + "grad_norm": 0.0024029607884585857, + "learning_rate": 0.001, + "loss": 0.3982, + "step": 8195 + }, + { + "epoch": 0.2261456799235696, + "grad_norm": 0.002428837353363633, + "learning_rate": 0.001, + "loss": 0.3735, + "step": 8196 + }, + { + "epoch": 0.22617327212463398, + "grad_norm": 0.003429250791668892, + "learning_rate": 0.001, + "loss": 0.4288, + "step": 8197 + }, + { + "epoch": 0.22620086432569833, + "grad_norm": 0.003104103496298194, + "learning_rate": 0.001, + "loss": 0.3953, + "step": 8198 + }, + { + "epoch": 0.2262284565267627, + "grad_norm": 0.002276889281347394, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 8199 + }, + { + "epoch": 0.2262560487278271, + "grad_norm": 0.002815461019054055, + "learning_rate": 0.001, + "loss": 0.378, + "step": 8200 + }, + { + "epoch": 0.22628364092889144, + "grad_norm": 0.0021125515922904015, + "learning_rate": 0.001, + "loss": 0.416, + "step": 8201 + }, + { + "epoch": 0.22631123312995582, + "grad_norm": 0.0026191947981715202, + "learning_rate": 0.001, + "loss": 0.3934, + "step": 8202 + }, + { + "epoch": 0.22633882533102018, + "grad_norm": 0.002689339919015765, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 8203 + }, + { + "epoch": 0.22636641753208456, + "grad_norm": 0.00354541908018291, + "learning_rate": 0.001, + "loss": 0.3831, + "step": 8204 + }, + { + "epoch": 0.22639400973314894, + "grad_norm": 0.0037923678755760193, + "learning_rate": 0.001, + "loss": 0.4343, + "step": 8205 + }, + { + "epoch": 0.2264216019342133, + "grad_norm": 0.004423064645379782, + "learning_rate": 0.001, + "loss": 0.4002, + "step": 8206 + }, + { + "epoch": 0.22644919413527767, + "grad_norm": 0.004195543006062508, + "learning_rate": 0.001, + "loss": 0.4197, + "step": 8207 + }, + { + "epoch": 0.22647678633634202, + "grad_norm": 0.0025238455273211002, + "learning_rate": 0.001, + "loss": 0.4091, + "step": 8208 + }, + { + "epoch": 0.2265043785374064, + "grad_norm": 0.0026775554288178682, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 8209 + }, + { + "epoch": 0.22653197073847078, + "grad_norm": 0.004028724506497383, + "learning_rate": 0.001, + "loss": 0.3778, + "step": 8210 + }, + { + "epoch": 0.22655956293953514, + "grad_norm": 0.002054669661447406, + "learning_rate": 0.001, + "loss": 0.3787, + "step": 8211 + }, + { + "epoch": 0.22658715514059952, + "grad_norm": 0.003321393858641386, + "learning_rate": 0.001, + "loss": 0.3685, + "step": 8212 + }, + { + "epoch": 0.22661474734166387, + "grad_norm": 0.0035699696745723486, + "learning_rate": 0.001, + "loss": 0.4127, + "step": 8213 + }, + { + "epoch": 0.22664233954272825, + "grad_norm": 0.004189506638795137, + "learning_rate": 0.001, + "loss": 0.383, + "step": 8214 + }, + { + "epoch": 0.22666993174379263, + "grad_norm": 0.003797245444729924, + "learning_rate": 0.001, + "loss": 0.3812, + "step": 8215 + }, + { + "epoch": 0.22669752394485698, + "grad_norm": 0.0045156171545386314, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 8216 + }, + { + "epoch": 0.22672511614592136, + "grad_norm": 0.0026707893703132868, + "learning_rate": 0.001, + "loss": 0.3575, + "step": 8217 + }, + { + "epoch": 0.22675270834698572, + "grad_norm": 0.0025783004239201546, + "learning_rate": 0.001, + "loss": 0.4373, + "step": 8218 + }, + { + "epoch": 0.2267803005480501, + "grad_norm": 0.004405698273330927, + "learning_rate": 0.001, + "loss": 0.4001, + "step": 8219 + }, + { + "epoch": 0.22680789274911448, + "grad_norm": 0.00231956597417593, + "learning_rate": 0.001, + "loss": 0.3762, + "step": 8220 + }, + { + "epoch": 0.22683548495017883, + "grad_norm": 0.0033627781085669994, + "learning_rate": 0.001, + "loss": 0.4201, + "step": 8221 + }, + { + "epoch": 0.2268630771512432, + "grad_norm": 0.003316913964226842, + "learning_rate": 0.001, + "loss": 0.4167, + "step": 8222 + }, + { + "epoch": 0.22689066935230756, + "grad_norm": 0.0026452546007931232, + "learning_rate": 0.001, + "loss": 0.3952, + "step": 8223 + }, + { + "epoch": 0.22691826155337194, + "grad_norm": 0.004451108165085316, + "learning_rate": 0.001, + "loss": 0.3784, + "step": 8224 + }, + { + "epoch": 0.22694585375443632, + "grad_norm": 0.0040152668952941895, + "learning_rate": 0.001, + "loss": 0.3664, + "step": 8225 + }, + { + "epoch": 0.22697344595550067, + "grad_norm": 0.003606460290029645, + "learning_rate": 0.001, + "loss": 0.441, + "step": 8226 + }, + { + "epoch": 0.22700103815656505, + "grad_norm": 0.003297601593658328, + "learning_rate": 0.001, + "loss": 0.4248, + "step": 8227 + }, + { + "epoch": 0.2270286303576294, + "grad_norm": 0.003477993654087186, + "learning_rate": 0.001, + "loss": 0.3738, + "step": 8228 + }, + { + "epoch": 0.2270562225586938, + "grad_norm": 0.002349859569221735, + "learning_rate": 0.001, + "loss": 0.4143, + "step": 8229 + }, + { + "epoch": 0.22708381475975817, + "grad_norm": 0.003242079634219408, + "learning_rate": 0.001, + "loss": 0.4205, + "step": 8230 + }, + { + "epoch": 0.22711140696082252, + "grad_norm": 0.007759904023259878, + "learning_rate": 0.001, + "loss": 0.3954, + "step": 8231 + }, + { + "epoch": 0.2271389991618869, + "grad_norm": 0.003948306664824486, + "learning_rate": 0.001, + "loss": 0.3749, + "step": 8232 + }, + { + "epoch": 0.22716659136295125, + "grad_norm": 0.002385435625910759, + "learning_rate": 0.001, + "loss": 0.3925, + "step": 8233 + }, + { + "epoch": 0.22719418356401563, + "grad_norm": 0.0032237351406365633, + "learning_rate": 0.001, + "loss": 0.398, + "step": 8234 + }, + { + "epoch": 0.22722177576508, + "grad_norm": 0.0027539869770407677, + "learning_rate": 0.001, + "loss": 0.4185, + "step": 8235 + }, + { + "epoch": 0.22724936796614437, + "grad_norm": 0.002958692843094468, + "learning_rate": 0.001, + "loss": 0.3964, + "step": 8236 + }, + { + "epoch": 0.22727696016720875, + "grad_norm": 0.005276213400065899, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 8237 + }, + { + "epoch": 0.2273045523682731, + "grad_norm": 0.0028638257645070553, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 8238 + }, + { + "epoch": 0.22733214456933748, + "grad_norm": 0.0034719184041023254, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 8239 + }, + { + "epoch": 0.22735973677040183, + "grad_norm": 0.002775615779682994, + "learning_rate": 0.001, + "loss": 0.4081, + "step": 8240 + }, + { + "epoch": 0.2273873289714662, + "grad_norm": 0.0028845765627920628, + "learning_rate": 0.001, + "loss": 0.3774, + "step": 8241 + }, + { + "epoch": 0.2274149211725306, + "grad_norm": 0.002906092908233404, + "learning_rate": 0.001, + "loss": 0.3827, + "step": 8242 + }, + { + "epoch": 0.22744251337359495, + "grad_norm": 0.0031754986848682165, + "learning_rate": 0.001, + "loss": 0.4096, + "step": 8243 + }, + { + "epoch": 0.22747010557465933, + "grad_norm": 0.00252131768502295, + "learning_rate": 0.001, + "loss": 0.4347, + "step": 8244 + }, + { + "epoch": 0.22749769777572368, + "grad_norm": 0.0025290907360613346, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 8245 + }, + { + "epoch": 0.22752528997678806, + "grad_norm": 0.0034006794448941946, + "learning_rate": 0.001, + "loss": 0.3981, + "step": 8246 + }, + { + "epoch": 0.22755288217785244, + "grad_norm": 0.002173987217247486, + "learning_rate": 0.001, + "loss": 0.4244, + "step": 8247 + }, + { + "epoch": 0.2275804743789168, + "grad_norm": 0.002650484209880233, + "learning_rate": 0.001, + "loss": 0.4232, + "step": 8248 + }, + { + "epoch": 0.22760806657998117, + "grad_norm": 0.0069848657585680485, + "learning_rate": 0.001, + "loss": 0.3648, + "step": 8249 + }, + { + "epoch": 0.22763565878104552, + "grad_norm": 0.002296354155987501, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 8250 + }, + { + "epoch": 0.2276632509821099, + "grad_norm": 0.0023510761093348265, + "learning_rate": 0.001, + "loss": 0.348, + "step": 8251 + }, + { + "epoch": 0.22769084318317429, + "grad_norm": 0.002062094397842884, + "learning_rate": 0.001, + "loss": 0.3983, + "step": 8252 + }, + { + "epoch": 0.22771843538423864, + "grad_norm": 0.004330387804657221, + "learning_rate": 0.001, + "loss": 0.3546, + "step": 8253 + }, + { + "epoch": 0.22774602758530302, + "grad_norm": 0.0027810055762529373, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 8254 + }, + { + "epoch": 0.22777361978636737, + "grad_norm": 0.0025015720166265965, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 8255 + }, + { + "epoch": 0.22780121198743175, + "grad_norm": 0.004701843950897455, + "learning_rate": 0.001, + "loss": 0.3982, + "step": 8256 + }, + { + "epoch": 0.22782880418849613, + "grad_norm": 0.0027081798762083054, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 8257 + }, + { + "epoch": 0.22785639638956048, + "grad_norm": 0.00656101293861866, + "learning_rate": 0.001, + "loss": 0.3426, + "step": 8258 + }, + { + "epoch": 0.22788398859062486, + "grad_norm": 0.004077386576682329, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 8259 + }, + { + "epoch": 0.22791158079168922, + "grad_norm": 0.005094864405691624, + "learning_rate": 0.001, + "loss": 0.3737, + "step": 8260 + }, + { + "epoch": 0.2279391729927536, + "grad_norm": 0.003425776259973645, + "learning_rate": 0.001, + "loss": 0.3538, + "step": 8261 + }, + { + "epoch": 0.22796676519381798, + "grad_norm": 0.0027753396425396204, + "learning_rate": 0.001, + "loss": 0.4208, + "step": 8262 + }, + { + "epoch": 0.22799435739488233, + "grad_norm": 0.00263264961540699, + "learning_rate": 0.001, + "loss": 0.4395, + "step": 8263 + }, + { + "epoch": 0.2280219495959467, + "grad_norm": 0.002466138917952776, + "learning_rate": 0.001, + "loss": 0.402, + "step": 8264 + }, + { + "epoch": 0.22804954179701106, + "grad_norm": 0.0038233925588428974, + "learning_rate": 0.001, + "loss": 0.3909, + "step": 8265 + }, + { + "epoch": 0.22807713399807544, + "grad_norm": 0.004745953716337681, + "learning_rate": 0.001, + "loss": 0.3964, + "step": 8266 + }, + { + "epoch": 0.22810472619913982, + "grad_norm": 0.0024268226698040962, + "learning_rate": 0.001, + "loss": 0.4141, + "step": 8267 + }, + { + "epoch": 0.22813231840020418, + "grad_norm": 0.002283599926158786, + "learning_rate": 0.001, + "loss": 0.4234, + "step": 8268 + }, + { + "epoch": 0.22815991060126856, + "grad_norm": 0.003483585547655821, + "learning_rate": 0.001, + "loss": 0.3517, + "step": 8269 + }, + { + "epoch": 0.2281875028023329, + "grad_norm": 0.002310832031071186, + "learning_rate": 0.001, + "loss": 0.4246, + "step": 8270 + }, + { + "epoch": 0.2282150950033973, + "grad_norm": 0.0025413238909095526, + "learning_rate": 0.001, + "loss": 0.4106, + "step": 8271 + }, + { + "epoch": 0.22824268720446167, + "grad_norm": 0.002345605753362179, + "learning_rate": 0.001, + "loss": 0.4098, + "step": 8272 + }, + { + "epoch": 0.22827027940552602, + "grad_norm": 0.005003250669687986, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 8273 + }, + { + "epoch": 0.2282978716065904, + "grad_norm": 0.002230096375569701, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 8274 + }, + { + "epoch": 0.22832546380765475, + "grad_norm": 0.0022804015316069126, + "learning_rate": 0.001, + "loss": 0.429, + "step": 8275 + }, + { + "epoch": 0.22835305600871914, + "grad_norm": 0.004967406392097473, + "learning_rate": 0.001, + "loss": 0.3755, + "step": 8276 + }, + { + "epoch": 0.22838064820978352, + "grad_norm": 0.003264637663960457, + "learning_rate": 0.001, + "loss": 0.3904, + "step": 8277 + }, + { + "epoch": 0.22840824041084787, + "grad_norm": 0.0022652260959148407, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 8278 + }, + { + "epoch": 0.22843583261191225, + "grad_norm": 0.002336147939786315, + "learning_rate": 0.001, + "loss": 0.4132, + "step": 8279 + }, + { + "epoch": 0.2284634248129766, + "grad_norm": 0.00243670167401433, + "learning_rate": 0.001, + "loss": 0.3809, + "step": 8280 + }, + { + "epoch": 0.22849101701404098, + "grad_norm": 0.0036559735890477896, + "learning_rate": 0.001, + "loss": 0.4095, + "step": 8281 + }, + { + "epoch": 0.22851860921510536, + "grad_norm": 0.007312767673283815, + "learning_rate": 0.001, + "loss": 0.4186, + "step": 8282 + }, + { + "epoch": 0.22854620141616971, + "grad_norm": 0.008099487982690334, + "learning_rate": 0.001, + "loss": 0.356, + "step": 8283 + }, + { + "epoch": 0.2285737936172341, + "grad_norm": 0.0025754349771887064, + "learning_rate": 0.001, + "loss": 0.3802, + "step": 8284 + }, + { + "epoch": 0.22860138581829845, + "grad_norm": 0.002983193611726165, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 8285 + }, + { + "epoch": 0.22862897801936283, + "grad_norm": 0.0024407468736171722, + "learning_rate": 0.001, + "loss": 0.4239, + "step": 8286 + }, + { + "epoch": 0.2286565702204272, + "grad_norm": 0.004473252687603235, + "learning_rate": 0.001, + "loss": 0.3746, + "step": 8287 + }, + { + "epoch": 0.22868416242149156, + "grad_norm": 0.002675954718142748, + "learning_rate": 0.001, + "loss": 0.4297, + "step": 8288 + }, + { + "epoch": 0.22871175462255594, + "grad_norm": 0.0025287093594670296, + "learning_rate": 0.001, + "loss": 0.395, + "step": 8289 + }, + { + "epoch": 0.2287393468236203, + "grad_norm": 0.002895546378567815, + "learning_rate": 0.001, + "loss": 0.3759, + "step": 8290 + }, + { + "epoch": 0.22876693902468467, + "grad_norm": 0.003705421229824424, + "learning_rate": 0.001, + "loss": 0.4142, + "step": 8291 + }, + { + "epoch": 0.22879453122574905, + "grad_norm": 0.003449185285717249, + "learning_rate": 0.001, + "loss": 0.4024, + "step": 8292 + }, + { + "epoch": 0.2288221234268134, + "grad_norm": 0.0022859005257487297, + "learning_rate": 0.001, + "loss": 0.3739, + "step": 8293 + }, + { + "epoch": 0.2288497156278778, + "grad_norm": 0.0023443542886525393, + "learning_rate": 0.001, + "loss": 0.391, + "step": 8294 + }, + { + "epoch": 0.22887730782894214, + "grad_norm": 0.0021697822958230972, + "learning_rate": 0.001, + "loss": 0.401, + "step": 8295 + }, + { + "epoch": 0.22890490003000652, + "grad_norm": 0.0023217375855892897, + "learning_rate": 0.001, + "loss": 0.3897, + "step": 8296 + }, + { + "epoch": 0.2289324922310709, + "grad_norm": 0.005043079610913992, + "learning_rate": 0.001, + "loss": 0.3887, + "step": 8297 + }, + { + "epoch": 0.22896008443213525, + "grad_norm": 0.004579662811011076, + "learning_rate": 0.001, + "loss": 0.3543, + "step": 8298 + }, + { + "epoch": 0.22898767663319963, + "grad_norm": 0.0032538543455302715, + "learning_rate": 0.001, + "loss": 0.3964, + "step": 8299 + }, + { + "epoch": 0.22901526883426399, + "grad_norm": 0.00212521362118423, + "learning_rate": 0.001, + "loss": 0.4059, + "step": 8300 + }, + { + "epoch": 0.22904286103532837, + "grad_norm": 0.003112571081146598, + "learning_rate": 0.001, + "loss": 0.4247, + "step": 8301 + }, + { + "epoch": 0.22907045323639275, + "grad_norm": 0.0030887876637279987, + "learning_rate": 0.001, + "loss": 0.4055, + "step": 8302 + }, + { + "epoch": 0.2290980454374571, + "grad_norm": 0.002820172579959035, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 8303 + }, + { + "epoch": 0.22912563763852148, + "grad_norm": 0.003904817858710885, + "learning_rate": 0.001, + "loss": 0.4197, + "step": 8304 + }, + { + "epoch": 0.22915322983958583, + "grad_norm": 0.003779762424528599, + "learning_rate": 0.001, + "loss": 0.434, + "step": 8305 + }, + { + "epoch": 0.2291808220406502, + "grad_norm": 0.0025575195904821157, + "learning_rate": 0.001, + "loss": 0.3972, + "step": 8306 + }, + { + "epoch": 0.2292084142417146, + "grad_norm": 0.0039753662422299385, + "learning_rate": 0.001, + "loss": 0.401, + "step": 8307 + }, + { + "epoch": 0.22923600644277894, + "grad_norm": 0.0023605753667652607, + "learning_rate": 0.001, + "loss": 0.3817, + "step": 8308 + }, + { + "epoch": 0.22926359864384332, + "grad_norm": 0.0050887493416666985, + "learning_rate": 0.001, + "loss": 0.3755, + "step": 8309 + }, + { + "epoch": 0.22929119084490768, + "grad_norm": 0.0026991376653313637, + "learning_rate": 0.001, + "loss": 0.3734, + "step": 8310 + }, + { + "epoch": 0.22931878304597206, + "grad_norm": 0.0030844821594655514, + "learning_rate": 0.001, + "loss": 0.3769, + "step": 8311 + }, + { + "epoch": 0.22934637524703644, + "grad_norm": 0.0029710521921515465, + "learning_rate": 0.001, + "loss": 0.4106, + "step": 8312 + }, + { + "epoch": 0.2293739674481008, + "grad_norm": 0.0033121525775641203, + "learning_rate": 0.001, + "loss": 0.3688, + "step": 8313 + }, + { + "epoch": 0.22940155964916517, + "grad_norm": 0.002841067034751177, + "learning_rate": 0.001, + "loss": 0.3627, + "step": 8314 + }, + { + "epoch": 0.22942915185022952, + "grad_norm": 0.003009919775649905, + "learning_rate": 0.001, + "loss": 0.3875, + "step": 8315 + }, + { + "epoch": 0.2294567440512939, + "grad_norm": 0.007487253285944462, + "learning_rate": 0.001, + "loss": 0.3973, + "step": 8316 + }, + { + "epoch": 0.22948433625235828, + "grad_norm": 0.003841083962470293, + "learning_rate": 0.001, + "loss": 0.4063, + "step": 8317 + }, + { + "epoch": 0.22951192845342264, + "grad_norm": 0.003769118804484606, + "learning_rate": 0.001, + "loss": 0.4195, + "step": 8318 + }, + { + "epoch": 0.22953952065448702, + "grad_norm": 0.0026735500432550907, + "learning_rate": 0.001, + "loss": 0.4123, + "step": 8319 + }, + { + "epoch": 0.22956711285555137, + "grad_norm": 0.0035852077417075634, + "learning_rate": 0.001, + "loss": 0.376, + "step": 8320 + }, + { + "epoch": 0.22959470505661575, + "grad_norm": 0.004404496867209673, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 8321 + }, + { + "epoch": 0.22962229725768013, + "grad_norm": 0.004810912534594536, + "learning_rate": 0.001, + "loss": 0.3728, + "step": 8322 + }, + { + "epoch": 0.22964988945874448, + "grad_norm": 0.002430463209748268, + "learning_rate": 0.001, + "loss": 0.4139, + "step": 8323 + }, + { + "epoch": 0.22967748165980886, + "grad_norm": 0.0034299069084227085, + "learning_rate": 0.001, + "loss": 0.3697, + "step": 8324 + }, + { + "epoch": 0.22970507386087322, + "grad_norm": 0.002617501188069582, + "learning_rate": 0.001, + "loss": 0.408, + "step": 8325 + }, + { + "epoch": 0.2297326660619376, + "grad_norm": 0.002159700496122241, + "learning_rate": 0.001, + "loss": 0.4001, + "step": 8326 + }, + { + "epoch": 0.22976025826300198, + "grad_norm": 0.003289289539679885, + "learning_rate": 0.001, + "loss": 0.4243, + "step": 8327 + }, + { + "epoch": 0.22978785046406633, + "grad_norm": 0.0034644966945052147, + "learning_rate": 0.001, + "loss": 0.3866, + "step": 8328 + }, + { + "epoch": 0.2298154426651307, + "grad_norm": 0.005093908403068781, + "learning_rate": 0.001, + "loss": 0.3948, + "step": 8329 + }, + { + "epoch": 0.22984303486619506, + "grad_norm": 0.0033900076523423195, + "learning_rate": 0.001, + "loss": 0.3983, + "step": 8330 + }, + { + "epoch": 0.22987062706725944, + "grad_norm": 0.0024000401608645916, + "learning_rate": 0.001, + "loss": 0.4097, + "step": 8331 + }, + { + "epoch": 0.2298982192683238, + "grad_norm": 0.0034197689965367317, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 8332 + }, + { + "epoch": 0.22992581146938817, + "grad_norm": 0.0020301672630012035, + "learning_rate": 0.001, + "loss": 0.4249, + "step": 8333 + }, + { + "epoch": 0.22995340367045256, + "grad_norm": 0.002956526121124625, + "learning_rate": 0.001, + "loss": 0.4192, + "step": 8334 + }, + { + "epoch": 0.2299809958715169, + "grad_norm": 0.003122934838756919, + "learning_rate": 0.001, + "loss": 0.3925, + "step": 8335 + }, + { + "epoch": 0.2300085880725813, + "grad_norm": 0.005318638868629932, + "learning_rate": 0.001, + "loss": 0.4263, + "step": 8336 + }, + { + "epoch": 0.23003618027364564, + "grad_norm": 0.0033183018676936626, + "learning_rate": 0.001, + "loss": 0.3753, + "step": 8337 + }, + { + "epoch": 0.23006377247471002, + "grad_norm": 0.0026557277888059616, + "learning_rate": 0.001, + "loss": 0.4063, + "step": 8338 + }, + { + "epoch": 0.2300913646757744, + "grad_norm": 0.002340905833989382, + "learning_rate": 0.001, + "loss": 0.388, + "step": 8339 + }, + { + "epoch": 0.23011895687683875, + "grad_norm": 0.002298276871442795, + "learning_rate": 0.001, + "loss": 0.4251, + "step": 8340 + }, + { + "epoch": 0.23014654907790313, + "grad_norm": 0.004820041824132204, + "learning_rate": 0.001, + "loss": 0.3819, + "step": 8341 + }, + { + "epoch": 0.2301741412789675, + "grad_norm": 0.00302408030256629, + "learning_rate": 0.001, + "loss": 0.3706, + "step": 8342 + }, + { + "epoch": 0.23020173348003187, + "grad_norm": 0.002940029837191105, + "learning_rate": 0.001, + "loss": 0.3724, + "step": 8343 + }, + { + "epoch": 0.23022932568109625, + "grad_norm": 0.007366549223661423, + "learning_rate": 0.001, + "loss": 0.3765, + "step": 8344 + }, + { + "epoch": 0.2302569178821606, + "grad_norm": 0.00624980591237545, + "learning_rate": 0.001, + "loss": 0.4048, + "step": 8345 + }, + { + "epoch": 0.23028451008322498, + "grad_norm": 0.003244641702622175, + "learning_rate": 0.001, + "loss": 0.3637, + "step": 8346 + }, + { + "epoch": 0.23031210228428933, + "grad_norm": 0.003316201502457261, + "learning_rate": 0.001, + "loss": 0.409, + "step": 8347 + }, + { + "epoch": 0.2303396944853537, + "grad_norm": 0.004240705166012049, + "learning_rate": 0.001, + "loss": 0.3763, + "step": 8348 + }, + { + "epoch": 0.2303672866864181, + "grad_norm": 0.004042000509798527, + "learning_rate": 0.001, + "loss": 0.392, + "step": 8349 + }, + { + "epoch": 0.23039487888748245, + "grad_norm": 0.013157385401427746, + "learning_rate": 0.001, + "loss": 0.4483, + "step": 8350 + }, + { + "epoch": 0.23042247108854683, + "grad_norm": 0.014354042708873749, + "learning_rate": 0.001, + "loss": 0.3777, + "step": 8351 + }, + { + "epoch": 0.23045006328961118, + "grad_norm": 0.003511092159897089, + "learning_rate": 0.001, + "loss": 0.417, + "step": 8352 + }, + { + "epoch": 0.23047765549067556, + "grad_norm": 0.0031483755446970463, + "learning_rate": 0.001, + "loss": 0.3863, + "step": 8353 + }, + { + "epoch": 0.23050524769173994, + "grad_norm": 0.0037154678720980883, + "learning_rate": 0.001, + "loss": 0.3836, + "step": 8354 + }, + { + "epoch": 0.2305328398928043, + "grad_norm": 0.004443106707185507, + "learning_rate": 0.001, + "loss": 0.4128, + "step": 8355 + }, + { + "epoch": 0.23056043209386867, + "grad_norm": 0.0030360580421984196, + "learning_rate": 0.001, + "loss": 0.4006, + "step": 8356 + }, + { + "epoch": 0.23058802429493302, + "grad_norm": 0.0023466795682907104, + "learning_rate": 0.001, + "loss": 0.3841, + "step": 8357 + }, + { + "epoch": 0.2306156164959974, + "grad_norm": 0.005221103318035603, + "learning_rate": 0.001, + "loss": 0.3873, + "step": 8358 + }, + { + "epoch": 0.23064320869706179, + "grad_norm": 0.0028962953947484493, + "learning_rate": 0.001, + "loss": 0.4029, + "step": 8359 + }, + { + "epoch": 0.23067080089812614, + "grad_norm": 0.0029752026312053204, + "learning_rate": 0.001, + "loss": 0.3917, + "step": 8360 + }, + { + "epoch": 0.23069839309919052, + "grad_norm": 0.005493995267897844, + "learning_rate": 0.001, + "loss": 0.3873, + "step": 8361 + }, + { + "epoch": 0.23072598530025487, + "grad_norm": 0.0037254132330417633, + "learning_rate": 0.001, + "loss": 0.3481, + "step": 8362 + }, + { + "epoch": 0.23075357750131925, + "grad_norm": 0.0025524902157485485, + "learning_rate": 0.001, + "loss": 0.3797, + "step": 8363 + }, + { + "epoch": 0.23078116970238363, + "grad_norm": 0.0027644087094813585, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 8364 + }, + { + "epoch": 0.23080876190344798, + "grad_norm": 0.0027628885582089424, + "learning_rate": 0.001, + "loss": 0.4303, + "step": 8365 + }, + { + "epoch": 0.23083635410451236, + "grad_norm": 0.0039875865913927555, + "learning_rate": 0.001, + "loss": 0.3986, + "step": 8366 + }, + { + "epoch": 0.23086394630557672, + "grad_norm": 0.003650275059044361, + "learning_rate": 0.001, + "loss": 0.386, + "step": 8367 + }, + { + "epoch": 0.2308915385066411, + "grad_norm": 0.0026859240606427193, + "learning_rate": 0.001, + "loss": 0.4035, + "step": 8368 + }, + { + "epoch": 0.23091913070770548, + "grad_norm": 0.004638598766177893, + "learning_rate": 0.001, + "loss": 0.4052, + "step": 8369 + }, + { + "epoch": 0.23094672290876983, + "grad_norm": 0.0029127905145287514, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 8370 + }, + { + "epoch": 0.2309743151098342, + "grad_norm": 0.01036902703344822, + "learning_rate": 0.001, + "loss": 0.3835, + "step": 8371 + }, + { + "epoch": 0.23100190731089856, + "grad_norm": 0.0038089959416538477, + "learning_rate": 0.001, + "loss": 0.3554, + "step": 8372 + }, + { + "epoch": 0.23102949951196294, + "grad_norm": 0.0037768797483295202, + "learning_rate": 0.001, + "loss": 0.3817, + "step": 8373 + }, + { + "epoch": 0.23105709171302732, + "grad_norm": 0.006050050258636475, + "learning_rate": 0.001, + "loss": 0.3826, + "step": 8374 + }, + { + "epoch": 0.23108468391409168, + "grad_norm": 0.0037368466146290302, + "learning_rate": 0.001, + "loss": 0.4804, + "step": 8375 + }, + { + "epoch": 0.23111227611515606, + "grad_norm": 0.002388233318924904, + "learning_rate": 0.001, + "loss": 0.4179, + "step": 8376 + }, + { + "epoch": 0.2311398683162204, + "grad_norm": 0.002968754153698683, + "learning_rate": 0.001, + "loss": 0.393, + "step": 8377 + }, + { + "epoch": 0.2311674605172848, + "grad_norm": 0.005483376793563366, + "learning_rate": 0.001, + "loss": 0.418, + "step": 8378 + }, + { + "epoch": 0.23119505271834917, + "grad_norm": 0.0025941620115190744, + "learning_rate": 0.001, + "loss": 0.392, + "step": 8379 + }, + { + "epoch": 0.23122264491941352, + "grad_norm": 0.0066278777085244656, + "learning_rate": 0.001, + "loss": 0.386, + "step": 8380 + }, + { + "epoch": 0.2312502371204779, + "grad_norm": 0.00500098429620266, + "learning_rate": 0.001, + "loss": 0.3955, + "step": 8381 + }, + { + "epoch": 0.23127782932154226, + "grad_norm": 0.002849952783435583, + "learning_rate": 0.001, + "loss": 0.4072, + "step": 8382 + }, + { + "epoch": 0.23130542152260664, + "grad_norm": 0.0047418465837836266, + "learning_rate": 0.001, + "loss": 0.3982, + "step": 8383 + }, + { + "epoch": 0.23133301372367102, + "grad_norm": 0.005365386605262756, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 8384 + }, + { + "epoch": 0.23136060592473537, + "grad_norm": 0.002952918875962496, + "learning_rate": 0.001, + "loss": 0.4072, + "step": 8385 + }, + { + "epoch": 0.23138819812579975, + "grad_norm": 0.003814739640802145, + "learning_rate": 0.001, + "loss": 0.4248, + "step": 8386 + }, + { + "epoch": 0.2314157903268641, + "grad_norm": 0.0031040378380566835, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 8387 + }, + { + "epoch": 0.23144338252792848, + "grad_norm": 0.0033247387036681175, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 8388 + }, + { + "epoch": 0.23147097472899286, + "grad_norm": 0.002116522518917918, + "learning_rate": 0.001, + "loss": 0.408, + "step": 8389 + }, + { + "epoch": 0.23149856693005721, + "grad_norm": 0.0033864439465105534, + "learning_rate": 0.001, + "loss": 0.3803, + "step": 8390 + }, + { + "epoch": 0.2315261591311216, + "grad_norm": 0.0037893191911280155, + "learning_rate": 0.001, + "loss": 0.4271, + "step": 8391 + }, + { + "epoch": 0.23155375133218595, + "grad_norm": 0.0026662342716008425, + "learning_rate": 0.001, + "loss": 0.3892, + "step": 8392 + }, + { + "epoch": 0.23158134353325033, + "grad_norm": 0.002584991976618767, + "learning_rate": 0.001, + "loss": 0.3913, + "step": 8393 + }, + { + "epoch": 0.2316089357343147, + "grad_norm": 0.002258981578052044, + "learning_rate": 0.001, + "loss": 0.4066, + "step": 8394 + }, + { + "epoch": 0.23163652793537906, + "grad_norm": 0.004243454430252314, + "learning_rate": 0.001, + "loss": 0.3886, + "step": 8395 + }, + { + "epoch": 0.23166412013644344, + "grad_norm": 0.004912800621241331, + "learning_rate": 0.001, + "loss": 0.3912, + "step": 8396 + }, + { + "epoch": 0.2316917123375078, + "grad_norm": 0.0038783997297286987, + "learning_rate": 0.001, + "loss": 0.3873, + "step": 8397 + }, + { + "epoch": 0.23171930453857217, + "grad_norm": 0.0028120707720518112, + "learning_rate": 0.001, + "loss": 0.3917, + "step": 8398 + }, + { + "epoch": 0.23174689673963655, + "grad_norm": 0.003508375259116292, + "learning_rate": 0.001, + "loss": 0.3738, + "step": 8399 + }, + { + "epoch": 0.2317744889407009, + "grad_norm": 0.0044393884018063545, + "learning_rate": 0.001, + "loss": 0.3629, + "step": 8400 + }, + { + "epoch": 0.2318020811417653, + "grad_norm": 0.006484494544565678, + "learning_rate": 0.001, + "loss": 0.3978, + "step": 8401 + }, + { + "epoch": 0.23182967334282964, + "grad_norm": 0.0041907550767064095, + "learning_rate": 0.001, + "loss": 0.3908, + "step": 8402 + }, + { + "epoch": 0.23185726554389402, + "grad_norm": 0.002757348818704486, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 8403 + }, + { + "epoch": 0.2318848577449584, + "grad_norm": 0.004819660913199186, + "learning_rate": 0.001, + "loss": 0.3842, + "step": 8404 + }, + { + "epoch": 0.23191244994602275, + "grad_norm": 0.0022516471799463034, + "learning_rate": 0.001, + "loss": 0.4485, + "step": 8405 + }, + { + "epoch": 0.23194004214708713, + "grad_norm": 0.002369027817621827, + "learning_rate": 0.001, + "loss": 0.3904, + "step": 8406 + }, + { + "epoch": 0.23196763434815149, + "grad_norm": 0.0035298550501465797, + "learning_rate": 0.001, + "loss": 0.3857, + "step": 8407 + }, + { + "epoch": 0.23199522654921587, + "grad_norm": 0.002744413213804364, + "learning_rate": 0.001, + "loss": 0.3847, + "step": 8408 + }, + { + "epoch": 0.23202281875028025, + "grad_norm": 0.002334202639758587, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 8409 + }, + { + "epoch": 0.2320504109513446, + "grad_norm": 0.0026494518388062716, + "learning_rate": 0.001, + "loss": 0.4027, + "step": 8410 + }, + { + "epoch": 0.23207800315240898, + "grad_norm": 0.002644699066877365, + "learning_rate": 0.001, + "loss": 0.3781, + "step": 8411 + }, + { + "epoch": 0.23210559535347333, + "grad_norm": 0.0030237159226089716, + "learning_rate": 0.001, + "loss": 0.4239, + "step": 8412 + }, + { + "epoch": 0.2321331875545377, + "grad_norm": 0.003086669836193323, + "learning_rate": 0.001, + "loss": 0.3747, + "step": 8413 + }, + { + "epoch": 0.2321607797556021, + "grad_norm": 0.002666539279744029, + "learning_rate": 0.001, + "loss": 0.4265, + "step": 8414 + }, + { + "epoch": 0.23218837195666645, + "grad_norm": 0.0034365833271294832, + "learning_rate": 0.001, + "loss": 0.3816, + "step": 8415 + }, + { + "epoch": 0.23221596415773083, + "grad_norm": 0.0030753256287425756, + "learning_rate": 0.001, + "loss": 0.3846, + "step": 8416 + }, + { + "epoch": 0.23224355635879518, + "grad_norm": 0.003509074915200472, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 8417 + }, + { + "epoch": 0.23227114855985956, + "grad_norm": 0.005115900654345751, + "learning_rate": 0.001, + "loss": 0.4373, + "step": 8418 + }, + { + "epoch": 0.23229874076092394, + "grad_norm": 0.005185318179428577, + "learning_rate": 0.001, + "loss": 0.4104, + "step": 8419 + }, + { + "epoch": 0.2323263329619883, + "grad_norm": 0.0024755573831498623, + "learning_rate": 0.001, + "loss": 0.4156, + "step": 8420 + }, + { + "epoch": 0.23235392516305267, + "grad_norm": 0.005173765122890472, + "learning_rate": 0.001, + "loss": 0.3588, + "step": 8421 + }, + { + "epoch": 0.23238151736411702, + "grad_norm": 0.004211380612105131, + "learning_rate": 0.001, + "loss": 0.3902, + "step": 8422 + }, + { + "epoch": 0.2324091095651814, + "grad_norm": 0.002343039261177182, + "learning_rate": 0.001, + "loss": 0.3848, + "step": 8423 + }, + { + "epoch": 0.23243670176624576, + "grad_norm": 0.004708436783403158, + "learning_rate": 0.001, + "loss": 0.3721, + "step": 8424 + }, + { + "epoch": 0.23246429396731014, + "grad_norm": 0.004656490869820118, + "learning_rate": 0.001, + "loss": 0.3696, + "step": 8425 + }, + { + "epoch": 0.23249188616837452, + "grad_norm": 0.002325586276128888, + "learning_rate": 0.001, + "loss": 0.3799, + "step": 8426 + }, + { + "epoch": 0.23251947836943887, + "grad_norm": 0.00335716363042593, + "learning_rate": 0.001, + "loss": 0.3875, + "step": 8427 + }, + { + "epoch": 0.23254707057050325, + "grad_norm": 0.00340313115157187, + "learning_rate": 0.001, + "loss": 0.3978, + "step": 8428 + }, + { + "epoch": 0.2325746627715676, + "grad_norm": 0.0026581473648548126, + "learning_rate": 0.001, + "loss": 0.399, + "step": 8429 + }, + { + "epoch": 0.23260225497263198, + "grad_norm": 0.019147882238030434, + "learning_rate": 0.001, + "loss": 0.4087, + "step": 8430 + }, + { + "epoch": 0.23262984717369636, + "grad_norm": 0.0027826984878629446, + "learning_rate": 0.001, + "loss": 0.3854, + "step": 8431 + }, + { + "epoch": 0.23265743937476072, + "grad_norm": 0.002678795251995325, + "learning_rate": 0.001, + "loss": 0.3954, + "step": 8432 + }, + { + "epoch": 0.2326850315758251, + "grad_norm": 0.002674340968951583, + "learning_rate": 0.001, + "loss": 0.3785, + "step": 8433 + }, + { + "epoch": 0.23271262377688945, + "grad_norm": 0.01514797005802393, + "learning_rate": 0.001, + "loss": 0.3665, + "step": 8434 + }, + { + "epoch": 0.23274021597795383, + "grad_norm": 0.005059736780822277, + "learning_rate": 0.001, + "loss": 0.4126, + "step": 8435 + }, + { + "epoch": 0.2327678081790182, + "grad_norm": 0.0021096577402204275, + "learning_rate": 0.001, + "loss": 0.4087, + "step": 8436 + }, + { + "epoch": 0.23279540038008256, + "grad_norm": 0.003909014165401459, + "learning_rate": 0.001, + "loss": 0.4205, + "step": 8437 + }, + { + "epoch": 0.23282299258114694, + "grad_norm": 0.003196495585143566, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 8438 + }, + { + "epoch": 0.2328505847822113, + "grad_norm": 0.009719678200781345, + "learning_rate": 0.001, + "loss": 0.3553, + "step": 8439 + }, + { + "epoch": 0.23287817698327568, + "grad_norm": 0.0031904377974569798, + "learning_rate": 0.001, + "loss": 0.4286, + "step": 8440 + }, + { + "epoch": 0.23290576918434006, + "grad_norm": 0.0028641929384320974, + "learning_rate": 0.001, + "loss": 0.4295, + "step": 8441 + }, + { + "epoch": 0.2329333613854044, + "grad_norm": 0.0037782087456434965, + "learning_rate": 0.001, + "loss": 0.3953, + "step": 8442 + }, + { + "epoch": 0.2329609535864688, + "grad_norm": 0.002453066175803542, + "learning_rate": 0.001, + "loss": 0.3873, + "step": 8443 + }, + { + "epoch": 0.23298854578753314, + "grad_norm": 0.0037556791212409735, + "learning_rate": 0.001, + "loss": 0.3777, + "step": 8444 + }, + { + "epoch": 0.23301613798859752, + "grad_norm": 0.0029747902881354094, + "learning_rate": 0.001, + "loss": 0.3621, + "step": 8445 + }, + { + "epoch": 0.2330437301896619, + "grad_norm": 0.004175838083028793, + "learning_rate": 0.001, + "loss": 0.3412, + "step": 8446 + }, + { + "epoch": 0.23307132239072625, + "grad_norm": 0.003063888754695654, + "learning_rate": 0.001, + "loss": 0.3858, + "step": 8447 + }, + { + "epoch": 0.23309891459179063, + "grad_norm": 0.0027233557775616646, + "learning_rate": 0.001, + "loss": 0.3872, + "step": 8448 + }, + { + "epoch": 0.233126506792855, + "grad_norm": 0.002462203847244382, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 8449 + }, + { + "epoch": 0.23315409899391937, + "grad_norm": 0.003968900069594383, + "learning_rate": 0.001, + "loss": 0.3769, + "step": 8450 + }, + { + "epoch": 0.23318169119498375, + "grad_norm": 0.004251559264957905, + "learning_rate": 0.001, + "loss": 0.4029, + "step": 8451 + }, + { + "epoch": 0.2332092833960481, + "grad_norm": 0.0033609773963689804, + "learning_rate": 0.001, + "loss": 0.3802, + "step": 8452 + }, + { + "epoch": 0.23323687559711248, + "grad_norm": 0.0026037050411105156, + "learning_rate": 0.001, + "loss": 0.4176, + "step": 8453 + }, + { + "epoch": 0.23326446779817683, + "grad_norm": 0.0028592885937541723, + "learning_rate": 0.001, + "loss": 0.4125, + "step": 8454 + }, + { + "epoch": 0.2332920599992412, + "grad_norm": 0.0032447976991534233, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 8455 + }, + { + "epoch": 0.2333196522003056, + "grad_norm": 0.0034741810522973537, + "learning_rate": 0.001, + "loss": 0.4033, + "step": 8456 + }, + { + "epoch": 0.23334724440136995, + "grad_norm": 0.0030226546805351973, + "learning_rate": 0.001, + "loss": 0.4209, + "step": 8457 + }, + { + "epoch": 0.23337483660243433, + "grad_norm": 0.002790014259517193, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 8458 + }, + { + "epoch": 0.23340242880349868, + "grad_norm": 0.0024738421197980642, + "learning_rate": 0.001, + "loss": 0.4226, + "step": 8459 + }, + { + "epoch": 0.23343002100456306, + "grad_norm": 0.002386496402323246, + "learning_rate": 0.001, + "loss": 0.4101, + "step": 8460 + }, + { + "epoch": 0.23345761320562744, + "grad_norm": 0.005026910919696093, + "learning_rate": 0.001, + "loss": 0.4357, + "step": 8461 + }, + { + "epoch": 0.2334852054066918, + "grad_norm": 0.002900604158639908, + "learning_rate": 0.001, + "loss": 0.3655, + "step": 8462 + }, + { + "epoch": 0.23351279760775617, + "grad_norm": 0.002177110407501459, + "learning_rate": 0.001, + "loss": 0.3966, + "step": 8463 + }, + { + "epoch": 0.23354038980882053, + "grad_norm": 0.003278181655332446, + "learning_rate": 0.001, + "loss": 0.3813, + "step": 8464 + }, + { + "epoch": 0.2335679820098849, + "grad_norm": 0.002810918027535081, + "learning_rate": 0.001, + "loss": 0.4054, + "step": 8465 + }, + { + "epoch": 0.23359557421094929, + "grad_norm": 0.002259747590869665, + "learning_rate": 0.001, + "loss": 0.4319, + "step": 8466 + }, + { + "epoch": 0.23362316641201364, + "grad_norm": 0.0023482360411435366, + "learning_rate": 0.001, + "loss": 0.4158, + "step": 8467 + }, + { + "epoch": 0.23365075861307802, + "grad_norm": 0.004843599628657103, + "learning_rate": 0.001, + "loss": 0.3671, + "step": 8468 + }, + { + "epoch": 0.23367835081414237, + "grad_norm": 0.0035332750994712114, + "learning_rate": 0.001, + "loss": 0.379, + "step": 8469 + }, + { + "epoch": 0.23370594301520675, + "grad_norm": 0.009748833253979683, + "learning_rate": 0.001, + "loss": 0.4217, + "step": 8470 + }, + { + "epoch": 0.23373353521627113, + "grad_norm": 0.002617282560095191, + "learning_rate": 0.001, + "loss": 0.407, + "step": 8471 + }, + { + "epoch": 0.23376112741733548, + "grad_norm": 0.0023585897870361805, + "learning_rate": 0.001, + "loss": 0.4036, + "step": 8472 + }, + { + "epoch": 0.23378871961839987, + "grad_norm": 0.004000307526439428, + "learning_rate": 0.001, + "loss": 0.4238, + "step": 8473 + }, + { + "epoch": 0.23381631181946422, + "grad_norm": 0.003987070173025131, + "learning_rate": 0.001, + "loss": 0.3792, + "step": 8474 + }, + { + "epoch": 0.2338439040205286, + "grad_norm": 0.004123962949961424, + "learning_rate": 0.001, + "loss": 0.3982, + "step": 8475 + }, + { + "epoch": 0.23387149622159298, + "grad_norm": 0.014491601847112179, + "learning_rate": 0.001, + "loss": 0.4144, + "step": 8476 + }, + { + "epoch": 0.23389908842265733, + "grad_norm": 0.0026626239996403456, + "learning_rate": 0.001, + "loss": 0.3685, + "step": 8477 + }, + { + "epoch": 0.2339266806237217, + "grad_norm": 0.006109694018959999, + "learning_rate": 0.001, + "loss": 0.3845, + "step": 8478 + }, + { + "epoch": 0.23395427282478606, + "grad_norm": 0.00370441609993577, + "learning_rate": 0.001, + "loss": 0.3962, + "step": 8479 + }, + { + "epoch": 0.23398186502585044, + "grad_norm": 0.004282459616661072, + "learning_rate": 0.001, + "loss": 0.4074, + "step": 8480 + }, + { + "epoch": 0.23400945722691482, + "grad_norm": 0.003556980052962899, + "learning_rate": 0.001, + "loss": 0.3753, + "step": 8481 + }, + { + "epoch": 0.23403704942797918, + "grad_norm": 0.00320242578163743, + "learning_rate": 0.001, + "loss": 0.4397, + "step": 8482 + }, + { + "epoch": 0.23406464162904356, + "grad_norm": 0.0037783649750053883, + "learning_rate": 0.001, + "loss": 0.3802, + "step": 8483 + }, + { + "epoch": 0.2340922338301079, + "grad_norm": 0.003380347043275833, + "learning_rate": 0.001, + "loss": 0.4028, + "step": 8484 + }, + { + "epoch": 0.2341198260311723, + "grad_norm": 0.0029710596427321434, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 8485 + }, + { + "epoch": 0.23414741823223667, + "grad_norm": 0.003365810727700591, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 8486 + }, + { + "epoch": 0.23417501043330102, + "grad_norm": 0.0027016180101782084, + "learning_rate": 0.001, + "loss": 0.4146, + "step": 8487 + }, + { + "epoch": 0.2342026026343654, + "grad_norm": 0.002262349473312497, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 8488 + }, + { + "epoch": 0.23423019483542976, + "grad_norm": 0.002855840837582946, + "learning_rate": 0.001, + "loss": 0.3857, + "step": 8489 + }, + { + "epoch": 0.23425778703649414, + "grad_norm": 0.004416047595441341, + "learning_rate": 0.001, + "loss": 0.389, + "step": 8490 + }, + { + "epoch": 0.23428537923755852, + "grad_norm": 0.0035155070945620537, + "learning_rate": 0.001, + "loss": 0.392, + "step": 8491 + }, + { + "epoch": 0.23431297143862287, + "grad_norm": 0.003230678616091609, + "learning_rate": 0.001, + "loss": 0.3429, + "step": 8492 + }, + { + "epoch": 0.23434056363968725, + "grad_norm": 0.0038198174443095922, + "learning_rate": 0.001, + "loss": 0.4244, + "step": 8493 + }, + { + "epoch": 0.2343681558407516, + "grad_norm": 0.003024979494512081, + "learning_rate": 0.001, + "loss": 0.3822, + "step": 8494 + }, + { + "epoch": 0.23439574804181598, + "grad_norm": 0.0022407593205571175, + "learning_rate": 0.001, + "loss": 0.4219, + "step": 8495 + }, + { + "epoch": 0.23442334024288036, + "grad_norm": 0.0024072916712611914, + "learning_rate": 0.001, + "loss": 0.4466, + "step": 8496 + }, + { + "epoch": 0.23445093244394472, + "grad_norm": 0.004507563542574644, + "learning_rate": 0.001, + "loss": 0.3714, + "step": 8497 + }, + { + "epoch": 0.2344785246450091, + "grad_norm": 0.0035954902414232492, + "learning_rate": 0.001, + "loss": 0.3927, + "step": 8498 + }, + { + "epoch": 0.23450611684607345, + "grad_norm": 0.003051930107176304, + "learning_rate": 0.001, + "loss": 0.4189, + "step": 8499 + }, + { + "epoch": 0.23453370904713783, + "grad_norm": 0.0034672280307859182, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 8500 + }, + { + "epoch": 0.23453370904713783, + "eval_runtime": 23.4916, + "eval_samples_per_second": 1.362, + "eval_steps_per_second": 0.17, + "step": 8500 + }, + { + "epoch": 0.2345613012482022, + "grad_norm": 0.004859286360442638, + "learning_rate": 0.001, + "loss": 0.3772, + "step": 8501 + }, + { + "epoch": 0.23458889344926656, + "grad_norm": 0.005819413810968399, + "learning_rate": 0.001, + "loss": 0.3959, + "step": 8502 + }, + { + "epoch": 0.23461648565033094, + "grad_norm": 0.0030995369888842106, + "learning_rate": 0.001, + "loss": 0.3793, + "step": 8503 + }, + { + "epoch": 0.2346440778513953, + "grad_norm": 0.0032427343539893627, + "learning_rate": 0.001, + "loss": 0.3817, + "step": 8504 + }, + { + "epoch": 0.23467167005245967, + "grad_norm": 0.0034287397284060717, + "learning_rate": 0.001, + "loss": 0.3937, + "step": 8505 + }, + { + "epoch": 0.23469926225352405, + "grad_norm": 0.003676005406305194, + "learning_rate": 0.001, + "loss": 0.422, + "step": 8506 + }, + { + "epoch": 0.2347268544545884, + "grad_norm": 0.0029224164318293333, + "learning_rate": 0.001, + "loss": 0.3695, + "step": 8507 + }, + { + "epoch": 0.2347544466556528, + "grad_norm": 0.00621433649212122, + "learning_rate": 0.001, + "loss": 0.412, + "step": 8508 + }, + { + "epoch": 0.23478203885671714, + "grad_norm": 0.0034033150877803564, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 8509 + }, + { + "epoch": 0.23480963105778152, + "grad_norm": 0.006222172640264034, + "learning_rate": 0.001, + "loss": 0.381, + "step": 8510 + }, + { + "epoch": 0.2348372232588459, + "grad_norm": 0.0026931515894830227, + "learning_rate": 0.001, + "loss": 0.3948, + "step": 8511 + }, + { + "epoch": 0.23486481545991025, + "grad_norm": 0.004435943905264139, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 8512 + }, + { + "epoch": 0.23489240766097463, + "grad_norm": 0.003721091663464904, + "learning_rate": 0.001, + "loss": 0.384, + "step": 8513 + }, + { + "epoch": 0.234919999862039, + "grad_norm": 0.002308727940544486, + "learning_rate": 0.001, + "loss": 0.3725, + "step": 8514 + }, + { + "epoch": 0.23494759206310337, + "grad_norm": 0.0038359523750841618, + "learning_rate": 0.001, + "loss": 0.3872, + "step": 8515 + }, + { + "epoch": 0.23497518426416775, + "grad_norm": 0.002997712232172489, + "learning_rate": 0.001, + "loss": 0.3917, + "step": 8516 + }, + { + "epoch": 0.2350027764652321, + "grad_norm": 0.0034595001488924026, + "learning_rate": 0.001, + "loss": 0.3658, + "step": 8517 + }, + { + "epoch": 0.23503036866629648, + "grad_norm": 0.0026987416204065084, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 8518 + }, + { + "epoch": 0.23505796086736083, + "grad_norm": 0.00263298605568707, + "learning_rate": 0.001, + "loss": 0.4276, + "step": 8519 + }, + { + "epoch": 0.2350855530684252, + "grad_norm": 0.0024717003107070923, + "learning_rate": 0.001, + "loss": 0.438, + "step": 8520 + }, + { + "epoch": 0.23511314526948957, + "grad_norm": 0.003266090527176857, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 8521 + }, + { + "epoch": 0.23514073747055395, + "grad_norm": 0.007142478600144386, + "learning_rate": 0.001, + "loss": 0.3845, + "step": 8522 + }, + { + "epoch": 0.23516832967161833, + "grad_norm": 0.003394266590476036, + "learning_rate": 0.001, + "loss": 0.3885, + "step": 8523 + }, + { + "epoch": 0.23519592187268268, + "grad_norm": 0.0034649951849132776, + "learning_rate": 0.001, + "loss": 0.3894, + "step": 8524 + }, + { + "epoch": 0.23522351407374706, + "grad_norm": 0.0022391905076801777, + "learning_rate": 0.001, + "loss": 0.4333, + "step": 8525 + }, + { + "epoch": 0.2352511062748114, + "grad_norm": 0.002735828747972846, + "learning_rate": 0.001, + "loss": 0.396, + "step": 8526 + }, + { + "epoch": 0.2352786984758758, + "grad_norm": 0.00296612735837698, + "learning_rate": 0.001, + "loss": 0.3983, + "step": 8527 + }, + { + "epoch": 0.23530629067694017, + "grad_norm": 0.0029764720238745213, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 8528 + }, + { + "epoch": 0.23533388287800452, + "grad_norm": 0.002517016837373376, + "learning_rate": 0.001, + "loss": 0.3849, + "step": 8529 + }, + { + "epoch": 0.2353614750790689, + "grad_norm": 0.004660237114876509, + "learning_rate": 0.001, + "loss": 0.3877, + "step": 8530 + }, + { + "epoch": 0.23538906728013326, + "grad_norm": 0.01403157226741314, + "learning_rate": 0.001, + "loss": 0.3797, + "step": 8531 + }, + { + "epoch": 0.23541665948119764, + "grad_norm": 0.002752164611592889, + "learning_rate": 0.001, + "loss": 0.3705, + "step": 8532 + }, + { + "epoch": 0.23544425168226202, + "grad_norm": 0.004661790560930967, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 8533 + }, + { + "epoch": 0.23547184388332637, + "grad_norm": 0.0029863761737942696, + "learning_rate": 0.001, + "loss": 0.4253, + "step": 8534 + }, + { + "epoch": 0.23549943608439075, + "grad_norm": 0.002214206848293543, + "learning_rate": 0.001, + "loss": 0.4207, + "step": 8535 + }, + { + "epoch": 0.2355270282854551, + "grad_norm": 0.00356200966052711, + "learning_rate": 0.001, + "loss": 0.3813, + "step": 8536 + }, + { + "epoch": 0.23555462048651948, + "grad_norm": 0.0029001201037317514, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 8537 + }, + { + "epoch": 0.23558221268758386, + "grad_norm": 0.0023703081533312798, + "learning_rate": 0.001, + "loss": 0.4531, + "step": 8538 + }, + { + "epoch": 0.23560980488864822, + "grad_norm": 0.0027728870045393705, + "learning_rate": 0.001, + "loss": 0.4478, + "step": 8539 + }, + { + "epoch": 0.2356373970897126, + "grad_norm": 0.002979228738695383, + "learning_rate": 0.001, + "loss": 0.4161, + "step": 8540 + }, + { + "epoch": 0.23566498929077695, + "grad_norm": 0.0030399069655686617, + "learning_rate": 0.001, + "loss": 0.4239, + "step": 8541 + }, + { + "epoch": 0.23569258149184133, + "grad_norm": 0.0033329655416309834, + "learning_rate": 0.001, + "loss": 0.3837, + "step": 8542 + }, + { + "epoch": 0.2357201736929057, + "grad_norm": 0.006761785596609116, + "learning_rate": 0.001, + "loss": 0.4624, + "step": 8543 + }, + { + "epoch": 0.23574776589397006, + "grad_norm": 0.0030948342755436897, + "learning_rate": 0.001, + "loss": 0.4237, + "step": 8544 + }, + { + "epoch": 0.23577535809503444, + "grad_norm": 0.003666854929178953, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 8545 + }, + { + "epoch": 0.2358029502960988, + "grad_norm": 0.0034998871851712465, + "learning_rate": 0.001, + "loss": 0.4306, + "step": 8546 + }, + { + "epoch": 0.23583054249716318, + "grad_norm": 0.002683976199477911, + "learning_rate": 0.001, + "loss": 0.4485, + "step": 8547 + }, + { + "epoch": 0.23585813469822756, + "grad_norm": 0.003037314862012863, + "learning_rate": 0.001, + "loss": 0.38, + "step": 8548 + }, + { + "epoch": 0.2358857268992919, + "grad_norm": 0.002266938565298915, + "learning_rate": 0.001, + "loss": 0.3883, + "step": 8549 + }, + { + "epoch": 0.2359133191003563, + "grad_norm": 0.004051513969898224, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 8550 + }, + { + "epoch": 0.23594091130142064, + "grad_norm": 0.0025013620033860207, + "learning_rate": 0.001, + "loss": 0.4445, + "step": 8551 + }, + { + "epoch": 0.23596850350248502, + "grad_norm": 0.0030788013245910406, + "learning_rate": 0.001, + "loss": 0.3761, + "step": 8552 + }, + { + "epoch": 0.2359960957035494, + "grad_norm": 0.003969093784689903, + "learning_rate": 0.001, + "loss": 0.3943, + "step": 8553 + }, + { + "epoch": 0.23602368790461375, + "grad_norm": 0.003966695163398981, + "learning_rate": 0.001, + "loss": 0.3912, + "step": 8554 + }, + { + "epoch": 0.23605128010567814, + "grad_norm": 0.009804654866456985, + "learning_rate": 0.001, + "loss": 0.425, + "step": 8555 + }, + { + "epoch": 0.2360788723067425, + "grad_norm": 0.0031127145048230886, + "learning_rate": 0.001, + "loss": 0.3728, + "step": 8556 + }, + { + "epoch": 0.23610646450780687, + "grad_norm": 0.0036678691394627094, + "learning_rate": 0.001, + "loss": 0.3702, + "step": 8557 + }, + { + "epoch": 0.23613405670887125, + "grad_norm": 0.003199309343472123, + "learning_rate": 0.001, + "loss": 0.4266, + "step": 8558 + }, + { + "epoch": 0.2361616489099356, + "grad_norm": 0.0028935642912983894, + "learning_rate": 0.001, + "loss": 0.3864, + "step": 8559 + }, + { + "epoch": 0.23618924111099998, + "grad_norm": 0.0033331215381622314, + "learning_rate": 0.001, + "loss": 0.3761, + "step": 8560 + }, + { + "epoch": 0.23621683331206433, + "grad_norm": 0.004135193768888712, + "learning_rate": 0.001, + "loss": 0.4333, + "step": 8561 + }, + { + "epoch": 0.23624442551312871, + "grad_norm": 0.008535767905414104, + "learning_rate": 0.001, + "loss": 0.4235, + "step": 8562 + }, + { + "epoch": 0.2362720177141931, + "grad_norm": 0.005464480258524418, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 8563 + }, + { + "epoch": 0.23629960991525745, + "grad_norm": 0.0026870830915868282, + "learning_rate": 0.001, + "loss": 0.4111, + "step": 8564 + }, + { + "epoch": 0.23632720211632183, + "grad_norm": 0.002947178203612566, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 8565 + }, + { + "epoch": 0.23635479431738618, + "grad_norm": 0.004835926927626133, + "learning_rate": 0.001, + "loss": 0.44, + "step": 8566 + }, + { + "epoch": 0.23638238651845056, + "grad_norm": 0.004437744617462158, + "learning_rate": 0.001, + "loss": 0.4173, + "step": 8567 + }, + { + "epoch": 0.23640997871951494, + "grad_norm": 0.0033282919321209192, + "learning_rate": 0.001, + "loss": 0.3913, + "step": 8568 + }, + { + "epoch": 0.2364375709205793, + "grad_norm": 0.0035205124877393246, + "learning_rate": 0.001, + "loss": 0.3725, + "step": 8569 + }, + { + "epoch": 0.23646516312164367, + "grad_norm": 0.004044828005135059, + "learning_rate": 0.001, + "loss": 0.4158, + "step": 8570 + }, + { + "epoch": 0.23649275532270803, + "grad_norm": 0.008543507196009159, + "learning_rate": 0.001, + "loss": 0.4246, + "step": 8571 + }, + { + "epoch": 0.2365203475237724, + "grad_norm": 0.005565674975514412, + "learning_rate": 0.001, + "loss": 0.4339, + "step": 8572 + }, + { + "epoch": 0.2365479397248368, + "grad_norm": 0.0032614252995699644, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 8573 + }, + { + "epoch": 0.23657553192590114, + "grad_norm": 0.0036154796835035086, + "learning_rate": 0.001, + "loss": 0.4157, + "step": 8574 + }, + { + "epoch": 0.23660312412696552, + "grad_norm": 0.003126727417111397, + "learning_rate": 0.001, + "loss": 0.375, + "step": 8575 + }, + { + "epoch": 0.23663071632802987, + "grad_norm": 0.002990932669490576, + "learning_rate": 0.001, + "loss": 0.4282, + "step": 8576 + }, + { + "epoch": 0.23665830852909425, + "grad_norm": 0.0033934074454009533, + "learning_rate": 0.001, + "loss": 0.3771, + "step": 8577 + }, + { + "epoch": 0.23668590073015863, + "grad_norm": 0.002845100359991193, + "learning_rate": 0.001, + "loss": 0.4331, + "step": 8578 + }, + { + "epoch": 0.23671349293122299, + "grad_norm": 0.005931117571890354, + "learning_rate": 0.001, + "loss": 0.4225, + "step": 8579 + }, + { + "epoch": 0.23674108513228737, + "grad_norm": 0.0030062024015933275, + "learning_rate": 0.001, + "loss": 0.4249, + "step": 8580 + }, + { + "epoch": 0.23676867733335172, + "grad_norm": 0.004353534895926714, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 8581 + }, + { + "epoch": 0.2367962695344161, + "grad_norm": 0.007340231444686651, + "learning_rate": 0.001, + "loss": 0.368, + "step": 8582 + }, + { + "epoch": 0.23682386173548048, + "grad_norm": 0.0033272774890065193, + "learning_rate": 0.001, + "loss": 0.3833, + "step": 8583 + }, + { + "epoch": 0.23685145393654483, + "grad_norm": 0.002648326801136136, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 8584 + }, + { + "epoch": 0.2368790461376092, + "grad_norm": 0.0026433998718857765, + "learning_rate": 0.001, + "loss": 0.3819, + "step": 8585 + }, + { + "epoch": 0.23690663833867356, + "grad_norm": 0.0039000390097498894, + "learning_rate": 0.001, + "loss": 0.3706, + "step": 8586 + }, + { + "epoch": 0.23693423053973794, + "grad_norm": 0.00217796815559268, + "learning_rate": 0.001, + "loss": 0.4424, + "step": 8587 + }, + { + "epoch": 0.23696182274080232, + "grad_norm": 0.007604559417814016, + "learning_rate": 0.001, + "loss": 0.4186, + "step": 8588 + }, + { + "epoch": 0.23698941494186668, + "grad_norm": 0.003747879760339856, + "learning_rate": 0.001, + "loss": 0.4283, + "step": 8589 + }, + { + "epoch": 0.23701700714293106, + "grad_norm": 0.0028189239092171192, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 8590 + }, + { + "epoch": 0.2370445993439954, + "grad_norm": 0.0033880542032420635, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 8591 + }, + { + "epoch": 0.2370721915450598, + "grad_norm": 0.004370769020169973, + "learning_rate": 0.001, + "loss": 0.4062, + "step": 8592 + }, + { + "epoch": 0.23709978374612417, + "grad_norm": 0.0029521717224270105, + "learning_rate": 0.001, + "loss": 0.3783, + "step": 8593 + }, + { + "epoch": 0.23712737594718852, + "grad_norm": 0.0032671017106622458, + "learning_rate": 0.001, + "loss": 0.4022, + "step": 8594 + }, + { + "epoch": 0.2371549681482529, + "grad_norm": 0.002473097527399659, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 8595 + }, + { + "epoch": 0.23718256034931726, + "grad_norm": 0.0034314331132918596, + "learning_rate": 0.001, + "loss": 0.4229, + "step": 8596 + }, + { + "epoch": 0.23721015255038164, + "grad_norm": 0.003747452748939395, + "learning_rate": 0.001, + "loss": 0.3886, + "step": 8597 + }, + { + "epoch": 0.23723774475144602, + "grad_norm": 0.0053464872762560844, + "learning_rate": 0.001, + "loss": 0.3828, + "step": 8598 + }, + { + "epoch": 0.23726533695251037, + "grad_norm": 0.004069317597895861, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 8599 + }, + { + "epoch": 0.23729292915357475, + "grad_norm": 0.00245997728779912, + "learning_rate": 0.001, + "loss": 0.3766, + "step": 8600 + }, + { + "epoch": 0.2373205213546391, + "grad_norm": 0.0024410157930105925, + "learning_rate": 0.001, + "loss": 0.4366, + "step": 8601 + }, + { + "epoch": 0.23734811355570348, + "grad_norm": 0.002567254938185215, + "learning_rate": 0.001, + "loss": 0.3979, + "step": 8602 + }, + { + "epoch": 0.23737570575676786, + "grad_norm": 0.003037130692973733, + "learning_rate": 0.001, + "loss": 0.366, + "step": 8603 + }, + { + "epoch": 0.23740329795783222, + "grad_norm": 0.0020308520179241896, + "learning_rate": 0.001, + "loss": 0.3905, + "step": 8604 + }, + { + "epoch": 0.2374308901588966, + "grad_norm": 0.008115851320326328, + "learning_rate": 0.001, + "loss": 0.3839, + "step": 8605 + }, + { + "epoch": 0.23745848235996095, + "grad_norm": 0.002992655150592327, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 8606 + }, + { + "epoch": 0.23748607456102533, + "grad_norm": 0.004864429123699665, + "learning_rate": 0.001, + "loss": 0.4524, + "step": 8607 + }, + { + "epoch": 0.2375136667620897, + "grad_norm": 0.0024918217677623034, + "learning_rate": 0.001, + "loss": 0.4263, + "step": 8608 + }, + { + "epoch": 0.23754125896315406, + "grad_norm": 0.0024185108486562967, + "learning_rate": 0.001, + "loss": 0.4077, + "step": 8609 + }, + { + "epoch": 0.23756885116421844, + "grad_norm": 0.0024980721063911915, + "learning_rate": 0.001, + "loss": 0.4185, + "step": 8610 + }, + { + "epoch": 0.2375964433652828, + "grad_norm": 0.002587017137557268, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 8611 + }, + { + "epoch": 0.23762403556634717, + "grad_norm": 0.002519281581044197, + "learning_rate": 0.001, + "loss": 0.3612, + "step": 8612 + }, + { + "epoch": 0.23765162776741153, + "grad_norm": 0.0031468705274164677, + "learning_rate": 0.001, + "loss": 0.3984, + "step": 8613 + }, + { + "epoch": 0.2376792199684759, + "grad_norm": 0.0039115361869335175, + "learning_rate": 0.001, + "loss": 0.3788, + "step": 8614 + }, + { + "epoch": 0.2377068121695403, + "grad_norm": 0.002601664513349533, + "learning_rate": 0.001, + "loss": 0.4121, + "step": 8615 + }, + { + "epoch": 0.23773440437060464, + "grad_norm": 0.0024887637700885534, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 8616 + }, + { + "epoch": 0.23776199657166902, + "grad_norm": 0.0022282125428318977, + "learning_rate": 0.001, + "loss": 0.4265, + "step": 8617 + }, + { + "epoch": 0.23778958877273337, + "grad_norm": 0.00493327621370554, + "learning_rate": 0.001, + "loss": 0.3815, + "step": 8618 + }, + { + "epoch": 0.23781718097379775, + "grad_norm": 0.003642312716692686, + "learning_rate": 0.001, + "loss": 0.3982, + "step": 8619 + }, + { + "epoch": 0.23784477317486213, + "grad_norm": 0.002251163125038147, + "learning_rate": 0.001, + "loss": 0.4593, + "step": 8620 + }, + { + "epoch": 0.2378723653759265, + "grad_norm": 0.0038927237037569284, + "learning_rate": 0.001, + "loss": 0.4109, + "step": 8621 + }, + { + "epoch": 0.23789995757699087, + "grad_norm": 0.002186572877690196, + "learning_rate": 0.001, + "loss": 0.4184, + "step": 8622 + }, + { + "epoch": 0.23792754977805522, + "grad_norm": 0.008558029308915138, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 8623 + }, + { + "epoch": 0.2379551419791196, + "grad_norm": 0.0047720009461045265, + "learning_rate": 0.001, + "loss": 0.395, + "step": 8624 + }, + { + "epoch": 0.23798273418018398, + "grad_norm": 0.007841572165489197, + "learning_rate": 0.001, + "loss": 0.3685, + "step": 8625 + }, + { + "epoch": 0.23801032638124833, + "grad_norm": 0.006973532494157553, + "learning_rate": 0.001, + "loss": 0.3952, + "step": 8626 + }, + { + "epoch": 0.2380379185823127, + "grad_norm": 0.002645769389346242, + "learning_rate": 0.001, + "loss": 0.3651, + "step": 8627 + }, + { + "epoch": 0.23806551078337707, + "grad_norm": 0.013875133357942104, + "learning_rate": 0.001, + "loss": 0.4192, + "step": 8628 + }, + { + "epoch": 0.23809310298444145, + "grad_norm": 0.005976676940917969, + "learning_rate": 0.001, + "loss": 0.4268, + "step": 8629 + }, + { + "epoch": 0.23812069518550583, + "grad_norm": 0.0029498045332729816, + "learning_rate": 0.001, + "loss": 0.368, + "step": 8630 + }, + { + "epoch": 0.23814828738657018, + "grad_norm": 0.00462282495573163, + "learning_rate": 0.001, + "loss": 0.445, + "step": 8631 + }, + { + "epoch": 0.23817587958763456, + "grad_norm": 0.003022623248398304, + "learning_rate": 0.001, + "loss": 0.3804, + "step": 8632 + }, + { + "epoch": 0.2382034717886989, + "grad_norm": 0.003926178906112909, + "learning_rate": 0.001, + "loss": 0.3651, + "step": 8633 + }, + { + "epoch": 0.2382310639897633, + "grad_norm": 0.0035945766139775515, + "learning_rate": 0.001, + "loss": 0.3681, + "step": 8634 + }, + { + "epoch": 0.23825865619082767, + "grad_norm": 0.00597176980227232, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 8635 + }, + { + "epoch": 0.23828624839189202, + "grad_norm": 0.004440873861312866, + "learning_rate": 0.001, + "loss": 0.3614, + "step": 8636 + }, + { + "epoch": 0.2383138405929564, + "grad_norm": 0.002808907302096486, + "learning_rate": 0.001, + "loss": 0.3908, + "step": 8637 + }, + { + "epoch": 0.23834143279402076, + "grad_norm": 0.002455639885738492, + "learning_rate": 0.001, + "loss": 0.3661, + "step": 8638 + }, + { + "epoch": 0.23836902499508514, + "grad_norm": 0.003995021339505911, + "learning_rate": 0.001, + "loss": 0.4142, + "step": 8639 + }, + { + "epoch": 0.23839661719614952, + "grad_norm": 0.0029346742667257786, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 8640 + }, + { + "epoch": 0.23842420939721387, + "grad_norm": 0.0027218139730393887, + "learning_rate": 0.001, + "loss": 0.4117, + "step": 8641 + }, + { + "epoch": 0.23845180159827825, + "grad_norm": 0.002403157763183117, + "learning_rate": 0.001, + "loss": 0.3999, + "step": 8642 + }, + { + "epoch": 0.2384793937993426, + "grad_norm": 0.0038983020931482315, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 8643 + }, + { + "epoch": 0.23850698600040698, + "grad_norm": 0.002467036945745349, + "learning_rate": 0.001, + "loss": 0.4214, + "step": 8644 + }, + { + "epoch": 0.23853457820147136, + "grad_norm": 0.003569959197193384, + "learning_rate": 0.001, + "loss": 0.4, + "step": 8645 + }, + { + "epoch": 0.23856217040253572, + "grad_norm": 0.0020964080467820168, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 8646 + }, + { + "epoch": 0.2385897626036001, + "grad_norm": 0.0025515384040772915, + "learning_rate": 0.001, + "loss": 0.3767, + "step": 8647 + }, + { + "epoch": 0.23861735480466445, + "grad_norm": 0.0025259265676140785, + "learning_rate": 0.001, + "loss": 0.3789, + "step": 8648 + }, + { + "epoch": 0.23864494700572883, + "grad_norm": 0.002351469825953245, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 8649 + }, + { + "epoch": 0.2386725392067932, + "grad_norm": 0.002201459836214781, + "learning_rate": 0.001, + "loss": 0.4112, + "step": 8650 + }, + { + "epoch": 0.23870013140785756, + "grad_norm": 0.0032044921535998583, + "learning_rate": 0.001, + "loss": 0.3948, + "step": 8651 + }, + { + "epoch": 0.23872772360892194, + "grad_norm": 0.0036108682397753, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 8652 + }, + { + "epoch": 0.2387553158099863, + "grad_norm": 0.0031174325849860907, + "learning_rate": 0.001, + "loss": 0.3674, + "step": 8653 + }, + { + "epoch": 0.23878290801105068, + "grad_norm": 0.0035186016466468573, + "learning_rate": 0.001, + "loss": 0.3882, + "step": 8654 + }, + { + "epoch": 0.23881050021211506, + "grad_norm": 0.002634822390973568, + "learning_rate": 0.001, + "loss": 0.4128, + "step": 8655 + }, + { + "epoch": 0.2388380924131794, + "grad_norm": 0.0029784373473376036, + "learning_rate": 0.001, + "loss": 0.3823, + "step": 8656 + }, + { + "epoch": 0.2388656846142438, + "grad_norm": 0.0027631595730781555, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 8657 + }, + { + "epoch": 0.23889327681530814, + "grad_norm": 0.007283089216798544, + "learning_rate": 0.001, + "loss": 0.3752, + "step": 8658 + }, + { + "epoch": 0.23892086901637252, + "grad_norm": 0.0024284112732857466, + "learning_rate": 0.001, + "loss": 0.3813, + "step": 8659 + }, + { + "epoch": 0.2389484612174369, + "grad_norm": 0.0027097521815449, + "learning_rate": 0.001, + "loss": 0.3926, + "step": 8660 + }, + { + "epoch": 0.23897605341850126, + "grad_norm": 0.0025627308059483767, + "learning_rate": 0.001, + "loss": 0.4046, + "step": 8661 + }, + { + "epoch": 0.23900364561956564, + "grad_norm": 0.003014184534549713, + "learning_rate": 0.001, + "loss": 0.3913, + "step": 8662 + }, + { + "epoch": 0.23903123782063, + "grad_norm": 0.0035087834112346172, + "learning_rate": 0.001, + "loss": 0.3609, + "step": 8663 + }, + { + "epoch": 0.23905883002169437, + "grad_norm": 0.004521367605775595, + "learning_rate": 0.001, + "loss": 0.423, + "step": 8664 + }, + { + "epoch": 0.23908642222275875, + "grad_norm": 0.003859465243294835, + "learning_rate": 0.001, + "loss": 0.4388, + "step": 8665 + }, + { + "epoch": 0.2391140144238231, + "grad_norm": 0.0035421785432845354, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 8666 + }, + { + "epoch": 0.23914160662488748, + "grad_norm": 0.0022301869466900826, + "learning_rate": 0.001, + "loss": 0.4046, + "step": 8667 + }, + { + "epoch": 0.23916919882595183, + "grad_norm": 0.0030494078528136015, + "learning_rate": 0.001, + "loss": 0.4232, + "step": 8668 + }, + { + "epoch": 0.23919679102701621, + "grad_norm": 0.0031116558238863945, + "learning_rate": 0.001, + "loss": 0.411, + "step": 8669 + }, + { + "epoch": 0.2392243832280806, + "grad_norm": 0.002859594067558646, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 8670 + }, + { + "epoch": 0.23925197542914495, + "grad_norm": 0.002495800144970417, + "learning_rate": 0.001, + "loss": 0.4545, + "step": 8671 + }, + { + "epoch": 0.23927956763020933, + "grad_norm": 0.0030453058425337076, + "learning_rate": 0.001, + "loss": 0.4287, + "step": 8672 + }, + { + "epoch": 0.23930715983127368, + "grad_norm": 0.0048926519230008125, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 8673 + }, + { + "epoch": 0.23933475203233806, + "grad_norm": 0.0029445511754602194, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 8674 + }, + { + "epoch": 0.23936234423340244, + "grad_norm": 0.00260615604929626, + "learning_rate": 0.001, + "loss": 0.4355, + "step": 8675 + }, + { + "epoch": 0.2393899364344668, + "grad_norm": 0.003435730002820492, + "learning_rate": 0.001, + "loss": 0.4164, + "step": 8676 + }, + { + "epoch": 0.23941752863553117, + "grad_norm": 0.0024756945203989744, + "learning_rate": 0.001, + "loss": 0.4061, + "step": 8677 + }, + { + "epoch": 0.23944512083659553, + "grad_norm": 0.0030621823389083147, + "learning_rate": 0.001, + "loss": 0.3652, + "step": 8678 + }, + { + "epoch": 0.2394727130376599, + "grad_norm": 0.003644311334937811, + "learning_rate": 0.001, + "loss": 0.3834, + "step": 8679 + }, + { + "epoch": 0.2395003052387243, + "grad_norm": 0.0026737714651972055, + "learning_rate": 0.001, + "loss": 0.4288, + "step": 8680 + }, + { + "epoch": 0.23952789743978864, + "grad_norm": 0.0027650066185742617, + "learning_rate": 0.001, + "loss": 0.3592, + "step": 8681 + }, + { + "epoch": 0.23955548964085302, + "grad_norm": 0.010126309469342232, + "learning_rate": 0.001, + "loss": 0.4196, + "step": 8682 + }, + { + "epoch": 0.23958308184191737, + "grad_norm": 0.008825338445603848, + "learning_rate": 0.001, + "loss": 0.41, + "step": 8683 + }, + { + "epoch": 0.23961067404298175, + "grad_norm": 0.0028963929507881403, + "learning_rate": 0.001, + "loss": 0.3777, + "step": 8684 + }, + { + "epoch": 0.23963826624404613, + "grad_norm": 0.006523852702230215, + "learning_rate": 0.001, + "loss": 0.3625, + "step": 8685 + }, + { + "epoch": 0.23966585844511049, + "grad_norm": 0.0027353796176612377, + "learning_rate": 0.001, + "loss": 0.3735, + "step": 8686 + }, + { + "epoch": 0.23969345064617487, + "grad_norm": 0.0034795296378433704, + "learning_rate": 0.001, + "loss": 0.3765, + "step": 8687 + }, + { + "epoch": 0.23972104284723922, + "grad_norm": 0.002904376946389675, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 8688 + }, + { + "epoch": 0.2397486350483036, + "grad_norm": 0.00341939739882946, + "learning_rate": 0.001, + "loss": 0.3886, + "step": 8689 + }, + { + "epoch": 0.23977622724936798, + "grad_norm": 0.00255574774928391, + "learning_rate": 0.001, + "loss": 0.4052, + "step": 8690 + }, + { + "epoch": 0.23980381945043233, + "grad_norm": 0.002928397851064801, + "learning_rate": 0.001, + "loss": 0.3964, + "step": 8691 + }, + { + "epoch": 0.2398314116514967, + "grad_norm": 0.003814896335825324, + "learning_rate": 0.001, + "loss": 0.3993, + "step": 8692 + }, + { + "epoch": 0.23985900385256106, + "grad_norm": 0.004918345715850592, + "learning_rate": 0.001, + "loss": 0.3962, + "step": 8693 + }, + { + "epoch": 0.23988659605362544, + "grad_norm": 0.00214549177326262, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 8694 + }, + { + "epoch": 0.23991418825468983, + "grad_norm": 0.00271332124248147, + "learning_rate": 0.001, + "loss": 0.3752, + "step": 8695 + }, + { + "epoch": 0.23994178045575418, + "grad_norm": 0.005185315851122141, + "learning_rate": 0.001, + "loss": 0.4045, + "step": 8696 + }, + { + "epoch": 0.23996937265681856, + "grad_norm": 0.003353232517838478, + "learning_rate": 0.001, + "loss": 0.4263, + "step": 8697 + }, + { + "epoch": 0.2399969648578829, + "grad_norm": 0.0031625647097826004, + "learning_rate": 0.001, + "loss": 0.3659, + "step": 8698 + }, + { + "epoch": 0.2400245570589473, + "grad_norm": 0.0053766947239637375, + "learning_rate": 0.001, + "loss": 0.4522, + "step": 8699 + }, + { + "epoch": 0.24005214926001167, + "grad_norm": 0.005576374474912882, + "learning_rate": 0.001, + "loss": 0.431, + "step": 8700 + }, + { + "epoch": 0.24007974146107602, + "grad_norm": 0.003967816010117531, + "learning_rate": 0.001, + "loss": 0.3623, + "step": 8701 + }, + { + "epoch": 0.2401073336621404, + "grad_norm": 0.0026094361674040556, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 8702 + }, + { + "epoch": 0.24013492586320476, + "grad_norm": 0.049603283405303955, + "learning_rate": 0.001, + "loss": 0.4175, + "step": 8703 + }, + { + "epoch": 0.24016251806426914, + "grad_norm": 0.0027699440252035856, + "learning_rate": 0.001, + "loss": 0.388, + "step": 8704 + }, + { + "epoch": 0.2401901102653335, + "grad_norm": 0.0032966039143502712, + "learning_rate": 0.001, + "loss": 0.3743, + "step": 8705 + }, + { + "epoch": 0.24021770246639787, + "grad_norm": 0.0029616530518978834, + "learning_rate": 0.001, + "loss": 0.4218, + "step": 8706 + }, + { + "epoch": 0.24024529466746225, + "grad_norm": 0.0031759492121636868, + "learning_rate": 0.001, + "loss": 0.3906, + "step": 8707 + }, + { + "epoch": 0.2402728868685266, + "grad_norm": 0.002272370969876647, + "learning_rate": 0.001, + "loss": 0.4132, + "step": 8708 + }, + { + "epoch": 0.24030047906959098, + "grad_norm": 0.004187437240034342, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 8709 + }, + { + "epoch": 0.24032807127065534, + "grad_norm": 0.0026218758430331945, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 8710 + }, + { + "epoch": 0.24035566347171972, + "grad_norm": 0.0024012320209294558, + "learning_rate": 0.001, + "loss": 0.4175, + "step": 8711 + }, + { + "epoch": 0.2403832556727841, + "grad_norm": 0.004809997510164976, + "learning_rate": 0.001, + "loss": 0.4036, + "step": 8712 + }, + { + "epoch": 0.24041084787384845, + "grad_norm": 0.0030046291649341583, + "learning_rate": 0.001, + "loss": 0.4037, + "step": 8713 + }, + { + "epoch": 0.24043844007491283, + "grad_norm": 0.0032057911157608032, + "learning_rate": 0.001, + "loss": 0.4252, + "step": 8714 + }, + { + "epoch": 0.24046603227597718, + "grad_norm": 0.0034800011198967695, + "learning_rate": 0.001, + "loss": 0.404, + "step": 8715 + }, + { + "epoch": 0.24049362447704156, + "grad_norm": 0.0029743260238319635, + "learning_rate": 0.001, + "loss": 0.3756, + "step": 8716 + }, + { + "epoch": 0.24052121667810594, + "grad_norm": 0.0031264862045645714, + "learning_rate": 0.001, + "loss": 0.3836, + "step": 8717 + }, + { + "epoch": 0.2405488088791703, + "grad_norm": 0.0027079239953309298, + "learning_rate": 0.001, + "loss": 0.3933, + "step": 8718 + }, + { + "epoch": 0.24057640108023468, + "grad_norm": 0.0026093865744769573, + "learning_rate": 0.001, + "loss": 0.3818, + "step": 8719 + }, + { + "epoch": 0.24060399328129903, + "grad_norm": 0.004578443244099617, + "learning_rate": 0.001, + "loss": 0.3829, + "step": 8720 + }, + { + "epoch": 0.2406315854823634, + "grad_norm": 0.0024112521205097437, + "learning_rate": 0.001, + "loss": 0.4194, + "step": 8721 + }, + { + "epoch": 0.2406591776834278, + "grad_norm": 0.0029260313604027033, + "learning_rate": 0.001, + "loss": 0.3582, + "step": 8722 + }, + { + "epoch": 0.24068676988449214, + "grad_norm": 0.003243402810767293, + "learning_rate": 0.001, + "loss": 0.3539, + "step": 8723 + }, + { + "epoch": 0.24071436208555652, + "grad_norm": 0.03149477019906044, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 8724 + }, + { + "epoch": 0.24074195428662087, + "grad_norm": 0.002621316583827138, + "learning_rate": 0.001, + "loss": 0.3835, + "step": 8725 + }, + { + "epoch": 0.24076954648768525, + "grad_norm": 0.007181955501437187, + "learning_rate": 0.001, + "loss": 0.374, + "step": 8726 + }, + { + "epoch": 0.24079713868874963, + "grad_norm": 0.006243562325835228, + "learning_rate": 0.001, + "loss": 0.4195, + "step": 8727 + }, + { + "epoch": 0.240824730889814, + "grad_norm": 0.005759544670581818, + "learning_rate": 0.001, + "loss": 0.3862, + "step": 8728 + }, + { + "epoch": 0.24085232309087837, + "grad_norm": 0.002845682902261615, + "learning_rate": 0.001, + "loss": 0.4156, + "step": 8729 + }, + { + "epoch": 0.24087991529194272, + "grad_norm": 0.00525195337831974, + "learning_rate": 0.001, + "loss": 0.4125, + "step": 8730 + }, + { + "epoch": 0.2409075074930071, + "grad_norm": 0.004564746282994747, + "learning_rate": 0.001, + "loss": 0.3639, + "step": 8731 + }, + { + "epoch": 0.24093509969407148, + "grad_norm": 0.006655642297118902, + "learning_rate": 0.001, + "loss": 0.4284, + "step": 8732 + }, + { + "epoch": 0.24096269189513583, + "grad_norm": 0.003846411593258381, + "learning_rate": 0.001, + "loss": 0.4351, + "step": 8733 + }, + { + "epoch": 0.2409902840962002, + "grad_norm": 0.003326327772811055, + "learning_rate": 0.001, + "loss": 0.3892, + "step": 8734 + }, + { + "epoch": 0.24101787629726457, + "grad_norm": 0.013138518668711185, + "learning_rate": 0.001, + "loss": 0.3856, + "step": 8735 + }, + { + "epoch": 0.24104546849832895, + "grad_norm": 0.007260841317474842, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 8736 + }, + { + "epoch": 0.24107306069939333, + "grad_norm": 0.0034342585131525993, + "learning_rate": 0.001, + "loss": 0.3759, + "step": 8737 + }, + { + "epoch": 0.24110065290045768, + "grad_norm": 0.0038822691421955824, + "learning_rate": 0.001, + "loss": 0.4026, + "step": 8738 + }, + { + "epoch": 0.24112824510152206, + "grad_norm": 0.0024222838692367077, + "learning_rate": 0.001, + "loss": 0.4189, + "step": 8739 + }, + { + "epoch": 0.2411558373025864, + "grad_norm": 0.0026801645290106535, + "learning_rate": 0.001, + "loss": 0.3915, + "step": 8740 + }, + { + "epoch": 0.2411834295036508, + "grad_norm": 0.007557088974863291, + "learning_rate": 0.001, + "loss": 0.4343, + "step": 8741 + }, + { + "epoch": 0.24121102170471517, + "grad_norm": 0.02677319385111332, + "learning_rate": 0.001, + "loss": 0.4009, + "step": 8742 + }, + { + "epoch": 0.24123861390577953, + "grad_norm": 0.0026736543513834476, + "learning_rate": 0.001, + "loss": 0.3915, + "step": 8743 + }, + { + "epoch": 0.2412662061068439, + "grad_norm": 0.0023438511416316032, + "learning_rate": 0.001, + "loss": 0.4387, + "step": 8744 + }, + { + "epoch": 0.24129379830790826, + "grad_norm": 0.004221671260893345, + "learning_rate": 0.001, + "loss": 0.3227, + "step": 8745 + }, + { + "epoch": 0.24132139050897264, + "grad_norm": 0.004964245017617941, + "learning_rate": 0.001, + "loss": 0.3824, + "step": 8746 + }, + { + "epoch": 0.24134898271003702, + "grad_norm": 0.004307127092033625, + "learning_rate": 0.001, + "loss": 0.3757, + "step": 8747 + }, + { + "epoch": 0.24137657491110137, + "grad_norm": 0.004791958257555962, + "learning_rate": 0.001, + "loss": 0.38, + "step": 8748 + }, + { + "epoch": 0.24140416711216575, + "grad_norm": 0.004529708996415138, + "learning_rate": 0.001, + "loss": 0.3639, + "step": 8749 + }, + { + "epoch": 0.2414317593132301, + "grad_norm": 0.004783578682690859, + "learning_rate": 0.001, + "loss": 0.3648, + "step": 8750 + }, + { + "epoch": 0.24145935151429448, + "grad_norm": 0.003613299923017621, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 8751 + }, + { + "epoch": 0.24148694371535886, + "grad_norm": 0.003846370615065098, + "learning_rate": 0.001, + "loss": 0.4148, + "step": 8752 + }, + { + "epoch": 0.24151453591642322, + "grad_norm": 0.002877620980143547, + "learning_rate": 0.001, + "loss": 0.3835, + "step": 8753 + }, + { + "epoch": 0.2415421281174876, + "grad_norm": 0.004218699410557747, + "learning_rate": 0.001, + "loss": 0.4264, + "step": 8754 + }, + { + "epoch": 0.24156972031855195, + "grad_norm": 0.003392399987205863, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 8755 + }, + { + "epoch": 0.24159731251961633, + "grad_norm": 0.0036800485104322433, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 8756 + }, + { + "epoch": 0.2416249047206807, + "grad_norm": 0.0023208935745060444, + "learning_rate": 0.001, + "loss": 0.4371, + "step": 8757 + }, + { + "epoch": 0.24165249692174506, + "grad_norm": 0.0029652283992618322, + "learning_rate": 0.001, + "loss": 0.3543, + "step": 8758 + }, + { + "epoch": 0.24168008912280944, + "grad_norm": 0.00395188620314002, + "learning_rate": 0.001, + "loss": 0.398, + "step": 8759 + }, + { + "epoch": 0.2417076813238738, + "grad_norm": 0.0028020909521728754, + "learning_rate": 0.001, + "loss": 0.423, + "step": 8760 + }, + { + "epoch": 0.24173527352493818, + "grad_norm": 0.002905269619077444, + "learning_rate": 0.001, + "loss": 0.3582, + "step": 8761 + }, + { + "epoch": 0.24176286572600256, + "grad_norm": 0.0026353024877607822, + "learning_rate": 0.001, + "loss": 0.3987, + "step": 8762 + }, + { + "epoch": 0.2417904579270669, + "grad_norm": 0.009191271848976612, + "learning_rate": 0.001, + "loss": 0.3785, + "step": 8763 + }, + { + "epoch": 0.2418180501281313, + "grad_norm": 0.002872015815228224, + "learning_rate": 0.001, + "loss": 0.4074, + "step": 8764 + }, + { + "epoch": 0.24184564232919564, + "grad_norm": 0.0035799501929432154, + "learning_rate": 0.001, + "loss": 0.3845, + "step": 8765 + }, + { + "epoch": 0.24187323453026002, + "grad_norm": 0.0039528412744402885, + "learning_rate": 0.001, + "loss": 0.3875, + "step": 8766 + }, + { + "epoch": 0.2419008267313244, + "grad_norm": 0.004701048135757446, + "learning_rate": 0.001, + "loss": 0.3817, + "step": 8767 + }, + { + "epoch": 0.24192841893238876, + "grad_norm": 0.0029811752028763294, + "learning_rate": 0.001, + "loss": 0.3966, + "step": 8768 + }, + { + "epoch": 0.24195601113345314, + "grad_norm": 0.00273152650333941, + "learning_rate": 0.001, + "loss": 0.3978, + "step": 8769 + }, + { + "epoch": 0.2419836033345175, + "grad_norm": 0.0036260110791772604, + "learning_rate": 0.001, + "loss": 0.4006, + "step": 8770 + }, + { + "epoch": 0.24201119553558187, + "grad_norm": 0.0030035688541829586, + "learning_rate": 0.001, + "loss": 0.4333, + "step": 8771 + }, + { + "epoch": 0.24203878773664625, + "grad_norm": 0.003084450261667371, + "learning_rate": 0.001, + "loss": 0.4114, + "step": 8772 + }, + { + "epoch": 0.2420663799377106, + "grad_norm": 0.0035567975137382746, + "learning_rate": 0.001, + "loss": 0.3657, + "step": 8773 + }, + { + "epoch": 0.24209397213877498, + "grad_norm": 0.0024042977020144463, + "learning_rate": 0.001, + "loss": 0.4216, + "step": 8774 + }, + { + "epoch": 0.24212156433983933, + "grad_norm": 0.005611390806734562, + "learning_rate": 0.001, + "loss": 0.4223, + "step": 8775 + }, + { + "epoch": 0.24214915654090371, + "grad_norm": 0.0022727535106241703, + "learning_rate": 0.001, + "loss": 0.403, + "step": 8776 + }, + { + "epoch": 0.2421767487419681, + "grad_norm": 0.004948400892317295, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 8777 + }, + { + "epoch": 0.24220434094303245, + "grad_norm": 0.005593698471784592, + "learning_rate": 0.001, + "loss": 0.4204, + "step": 8778 + }, + { + "epoch": 0.24223193314409683, + "grad_norm": 0.002849163953214884, + "learning_rate": 0.001, + "loss": 0.375, + "step": 8779 + }, + { + "epoch": 0.24225952534516118, + "grad_norm": 0.0032104626297950745, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 8780 + }, + { + "epoch": 0.24228711754622556, + "grad_norm": 0.0051482305862009525, + "learning_rate": 0.001, + "loss": 0.3794, + "step": 8781 + }, + { + "epoch": 0.24231470974728994, + "grad_norm": 0.0035567518789321184, + "learning_rate": 0.001, + "loss": 0.3672, + "step": 8782 + }, + { + "epoch": 0.2423423019483543, + "grad_norm": 0.0023606910835951567, + "learning_rate": 0.001, + "loss": 0.4256, + "step": 8783 + }, + { + "epoch": 0.24236989414941867, + "grad_norm": 0.0028458137530833483, + "learning_rate": 0.001, + "loss": 0.3894, + "step": 8784 + }, + { + "epoch": 0.24239748635048303, + "grad_norm": 0.0029930644668638706, + "learning_rate": 0.001, + "loss": 0.4275, + "step": 8785 + }, + { + "epoch": 0.2424250785515474, + "grad_norm": 0.0022928270045667887, + "learning_rate": 0.001, + "loss": 0.399, + "step": 8786 + }, + { + "epoch": 0.2424526707526118, + "grad_norm": 0.003093563485890627, + "learning_rate": 0.001, + "loss": 0.3978, + "step": 8787 + }, + { + "epoch": 0.24248026295367614, + "grad_norm": 0.002772730076685548, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 8788 + }, + { + "epoch": 0.24250785515474052, + "grad_norm": 0.00274568609893322, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 8789 + }, + { + "epoch": 0.24253544735580487, + "grad_norm": 0.0022818988654762506, + "learning_rate": 0.001, + "loss": 0.3854, + "step": 8790 + }, + { + "epoch": 0.24256303955686925, + "grad_norm": 0.004985783249139786, + "learning_rate": 0.001, + "loss": 0.3894, + "step": 8791 + }, + { + "epoch": 0.24259063175793363, + "grad_norm": 0.00211884337477386, + "learning_rate": 0.001, + "loss": 0.4173, + "step": 8792 + }, + { + "epoch": 0.24261822395899799, + "grad_norm": 0.00450787041336298, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 8793 + }, + { + "epoch": 0.24264581616006237, + "grad_norm": 0.003899297444149852, + "learning_rate": 0.001, + "loss": 0.3815, + "step": 8794 + }, + { + "epoch": 0.24267340836112672, + "grad_norm": 0.002133857225999236, + "learning_rate": 0.001, + "loss": 0.3983, + "step": 8795 + }, + { + "epoch": 0.2427010005621911, + "grad_norm": 0.002347117057070136, + "learning_rate": 0.001, + "loss": 0.4273, + "step": 8796 + }, + { + "epoch": 0.24272859276325548, + "grad_norm": 0.0024913426022976637, + "learning_rate": 0.001, + "loss": 0.4102, + "step": 8797 + }, + { + "epoch": 0.24275618496431983, + "grad_norm": 0.0028214750345796347, + "learning_rate": 0.001, + "loss": 0.4053, + "step": 8798 + }, + { + "epoch": 0.2427837771653842, + "grad_norm": 0.002642587758600712, + "learning_rate": 0.001, + "loss": 0.3987, + "step": 8799 + }, + { + "epoch": 0.24281136936644857, + "grad_norm": 0.002706260420382023, + "learning_rate": 0.001, + "loss": 0.42, + "step": 8800 + }, + { + "epoch": 0.24283896156751295, + "grad_norm": 0.0023238682188093662, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 8801 + }, + { + "epoch": 0.2428665537685773, + "grad_norm": 0.003259580582380295, + "learning_rate": 0.001, + "loss": 0.3964, + "step": 8802 + }, + { + "epoch": 0.24289414596964168, + "grad_norm": 0.007316852454096079, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 8803 + }, + { + "epoch": 0.24292173817070606, + "grad_norm": 0.003550121560692787, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 8804 + }, + { + "epoch": 0.2429493303717704, + "grad_norm": 0.002780807903036475, + "learning_rate": 0.001, + "loss": 0.3873, + "step": 8805 + }, + { + "epoch": 0.2429769225728348, + "grad_norm": 0.0035296916030347347, + "learning_rate": 0.001, + "loss": 0.3655, + "step": 8806 + }, + { + "epoch": 0.24300451477389914, + "grad_norm": 0.005781420040875673, + "learning_rate": 0.001, + "loss": 0.4002, + "step": 8807 + }, + { + "epoch": 0.24303210697496352, + "grad_norm": 0.0032384470105171204, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 8808 + }, + { + "epoch": 0.2430596991760279, + "grad_norm": 0.007727981545031071, + "learning_rate": 0.001, + "loss": 0.4077, + "step": 8809 + }, + { + "epoch": 0.24308729137709226, + "grad_norm": 0.024563631042838097, + "learning_rate": 0.001, + "loss": 0.4065, + "step": 8810 + }, + { + "epoch": 0.24311488357815664, + "grad_norm": 0.00322868674993515, + "learning_rate": 0.001, + "loss": 0.401, + "step": 8811 + }, + { + "epoch": 0.243142475779221, + "grad_norm": 0.004947735462337732, + "learning_rate": 0.001, + "loss": 0.3812, + "step": 8812 + }, + { + "epoch": 0.24317006798028537, + "grad_norm": 0.004064179491251707, + "learning_rate": 0.001, + "loss": 0.3854, + "step": 8813 + }, + { + "epoch": 0.24319766018134975, + "grad_norm": 0.005366888828575611, + "learning_rate": 0.001, + "loss": 0.4295, + "step": 8814 + }, + { + "epoch": 0.2432252523824141, + "grad_norm": 0.012629834935069084, + "learning_rate": 0.001, + "loss": 0.3826, + "step": 8815 + }, + { + "epoch": 0.24325284458347848, + "grad_norm": 0.004325262736529112, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 8816 + }, + { + "epoch": 0.24328043678454284, + "grad_norm": 0.002770826453343034, + "learning_rate": 0.001, + "loss": 0.4106, + "step": 8817 + }, + { + "epoch": 0.24330802898560722, + "grad_norm": 0.005562702193856239, + "learning_rate": 0.001, + "loss": 0.3986, + "step": 8818 + }, + { + "epoch": 0.2433356211866716, + "grad_norm": 0.002646266482770443, + "learning_rate": 0.001, + "loss": 0.4243, + "step": 8819 + }, + { + "epoch": 0.24336321338773595, + "grad_norm": 0.003909732680767775, + "learning_rate": 0.001, + "loss": 0.3725, + "step": 8820 + }, + { + "epoch": 0.24339080558880033, + "grad_norm": 0.0035097142681479454, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 8821 + }, + { + "epoch": 0.24341839778986468, + "grad_norm": 0.002032148651778698, + "learning_rate": 0.001, + "loss": 0.4234, + "step": 8822 + }, + { + "epoch": 0.24344598999092906, + "grad_norm": 0.00408203387632966, + "learning_rate": 0.001, + "loss": 0.4452, + "step": 8823 + }, + { + "epoch": 0.24347358219199344, + "grad_norm": 0.0028013025876134634, + "learning_rate": 0.001, + "loss": 0.3754, + "step": 8824 + }, + { + "epoch": 0.2435011743930578, + "grad_norm": 0.0023070168681442738, + "learning_rate": 0.001, + "loss": 0.3962, + "step": 8825 + }, + { + "epoch": 0.24352876659412218, + "grad_norm": 0.006049746181815863, + "learning_rate": 0.001, + "loss": 0.3658, + "step": 8826 + }, + { + "epoch": 0.24355635879518653, + "grad_norm": 0.004005006048828363, + "learning_rate": 0.001, + "loss": 0.4135, + "step": 8827 + }, + { + "epoch": 0.2435839509962509, + "grad_norm": 0.003078661160543561, + "learning_rate": 0.001, + "loss": 0.3725, + "step": 8828 + }, + { + "epoch": 0.2436115431973153, + "grad_norm": 0.0025324684102088213, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 8829 + }, + { + "epoch": 0.24363913539837964, + "grad_norm": 0.0028970125131309032, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 8830 + }, + { + "epoch": 0.24366672759944402, + "grad_norm": 0.0022717502433806658, + "learning_rate": 0.001, + "loss": 0.3677, + "step": 8831 + }, + { + "epoch": 0.24369431980050837, + "grad_norm": 0.0029859100468456745, + "learning_rate": 0.001, + "loss": 0.3481, + "step": 8832 + }, + { + "epoch": 0.24372191200157275, + "grad_norm": 0.002873897785320878, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 8833 + }, + { + "epoch": 0.24374950420263714, + "grad_norm": 0.00388680980540812, + "learning_rate": 0.001, + "loss": 0.3748, + "step": 8834 + }, + { + "epoch": 0.2437770964037015, + "grad_norm": 0.0036059573758393526, + "learning_rate": 0.001, + "loss": 0.4325, + "step": 8835 + }, + { + "epoch": 0.24380468860476587, + "grad_norm": 0.004464290104806423, + "learning_rate": 0.001, + "loss": 0.3621, + "step": 8836 + }, + { + "epoch": 0.24383228080583022, + "grad_norm": 0.002686945255845785, + "learning_rate": 0.001, + "loss": 0.4247, + "step": 8837 + }, + { + "epoch": 0.2438598730068946, + "grad_norm": 0.0031961945351213217, + "learning_rate": 0.001, + "loss": 0.3662, + "step": 8838 + }, + { + "epoch": 0.24388746520795898, + "grad_norm": 0.002320102881640196, + "learning_rate": 0.001, + "loss": 0.3876, + "step": 8839 + }, + { + "epoch": 0.24391505740902333, + "grad_norm": 0.003384110052138567, + "learning_rate": 0.001, + "loss": 0.3677, + "step": 8840 + }, + { + "epoch": 0.24394264961008771, + "grad_norm": 0.002216142136603594, + "learning_rate": 0.001, + "loss": 0.399, + "step": 8841 + }, + { + "epoch": 0.24397024181115207, + "grad_norm": 0.0023850633297115564, + "learning_rate": 0.001, + "loss": 0.4719, + "step": 8842 + }, + { + "epoch": 0.24399783401221645, + "grad_norm": 0.0023910165764391422, + "learning_rate": 0.001, + "loss": 0.3848, + "step": 8843 + }, + { + "epoch": 0.24402542621328083, + "grad_norm": 0.002412325469776988, + "learning_rate": 0.001, + "loss": 0.3571, + "step": 8844 + }, + { + "epoch": 0.24405301841434518, + "grad_norm": 0.003925090655684471, + "learning_rate": 0.001, + "loss": 0.358, + "step": 8845 + }, + { + "epoch": 0.24408061061540956, + "grad_norm": 0.002139107324182987, + "learning_rate": 0.001, + "loss": 0.4108, + "step": 8846 + }, + { + "epoch": 0.2441082028164739, + "grad_norm": 0.003215159522369504, + "learning_rate": 0.001, + "loss": 0.3964, + "step": 8847 + }, + { + "epoch": 0.2441357950175383, + "grad_norm": 0.0029652882367372513, + "learning_rate": 0.001, + "loss": 0.3926, + "step": 8848 + }, + { + "epoch": 0.24416338721860267, + "grad_norm": 0.0026338063180446625, + "learning_rate": 0.001, + "loss": 0.392, + "step": 8849 + }, + { + "epoch": 0.24419097941966703, + "grad_norm": 0.00244903308339417, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 8850 + }, + { + "epoch": 0.2442185716207314, + "grad_norm": 0.002739574061706662, + "learning_rate": 0.001, + "loss": 0.384, + "step": 8851 + }, + { + "epoch": 0.24424616382179576, + "grad_norm": 0.003445478854700923, + "learning_rate": 0.001, + "loss": 0.409, + "step": 8852 + }, + { + "epoch": 0.24427375602286014, + "grad_norm": 0.0026570416521281004, + "learning_rate": 0.001, + "loss": 0.4162, + "step": 8853 + }, + { + "epoch": 0.24430134822392452, + "grad_norm": 0.0024555225390940905, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 8854 + }, + { + "epoch": 0.24432894042498887, + "grad_norm": 0.004189019091427326, + "learning_rate": 0.001, + "loss": 0.3589, + "step": 8855 + }, + { + "epoch": 0.24435653262605325, + "grad_norm": 0.0035908452700823545, + "learning_rate": 0.001, + "loss": 0.3956, + "step": 8856 + }, + { + "epoch": 0.2443841248271176, + "grad_norm": 0.002694958820939064, + "learning_rate": 0.001, + "loss": 0.3764, + "step": 8857 + }, + { + "epoch": 0.24441171702818199, + "grad_norm": 0.002945966785773635, + "learning_rate": 0.001, + "loss": 0.4166, + "step": 8858 + }, + { + "epoch": 0.24443930922924637, + "grad_norm": 0.002466941252350807, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 8859 + }, + { + "epoch": 0.24446690143031072, + "grad_norm": 0.003148929215967655, + "learning_rate": 0.001, + "loss": 0.4356, + "step": 8860 + }, + { + "epoch": 0.2444944936313751, + "grad_norm": 0.0029636970721185207, + "learning_rate": 0.001, + "loss": 0.3543, + "step": 8861 + }, + { + "epoch": 0.24452208583243945, + "grad_norm": 0.002561190165579319, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 8862 + }, + { + "epoch": 0.24454967803350383, + "grad_norm": 0.0031816980335861444, + "learning_rate": 0.001, + "loss": 0.3796, + "step": 8863 + }, + { + "epoch": 0.2445772702345682, + "grad_norm": 0.0030445698648691177, + "learning_rate": 0.001, + "loss": 0.4252, + "step": 8864 + }, + { + "epoch": 0.24460486243563256, + "grad_norm": 0.003067183308303356, + "learning_rate": 0.001, + "loss": 0.3822, + "step": 8865 + }, + { + "epoch": 0.24463245463669694, + "grad_norm": 0.002353479852899909, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 8866 + }, + { + "epoch": 0.2446600468377613, + "grad_norm": 0.0058910418301820755, + "learning_rate": 0.001, + "loss": 0.4077, + "step": 8867 + }, + { + "epoch": 0.24468763903882568, + "grad_norm": 0.003310910891741514, + "learning_rate": 0.001, + "loss": 0.38, + "step": 8868 + }, + { + "epoch": 0.24471523123989006, + "grad_norm": 0.003525955369696021, + "learning_rate": 0.001, + "loss": 0.4132, + "step": 8869 + }, + { + "epoch": 0.2447428234409544, + "grad_norm": 0.00252323760651052, + "learning_rate": 0.001, + "loss": 0.3562, + "step": 8870 + }, + { + "epoch": 0.2447704156420188, + "grad_norm": 0.0025553128216415644, + "learning_rate": 0.001, + "loss": 0.41, + "step": 8871 + }, + { + "epoch": 0.24479800784308314, + "grad_norm": 0.003628962906077504, + "learning_rate": 0.001, + "loss": 0.4232, + "step": 8872 + }, + { + "epoch": 0.24482560004414752, + "grad_norm": 0.00299929385073483, + "learning_rate": 0.001, + "loss": 0.384, + "step": 8873 + }, + { + "epoch": 0.2448531922452119, + "grad_norm": 0.004137910902500153, + "learning_rate": 0.001, + "loss": 0.3851, + "step": 8874 + }, + { + "epoch": 0.24488078444627626, + "grad_norm": 0.0027973924297839403, + "learning_rate": 0.001, + "loss": 0.3781, + "step": 8875 + }, + { + "epoch": 0.24490837664734064, + "grad_norm": 0.0029198199044913054, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 8876 + }, + { + "epoch": 0.244935968848405, + "grad_norm": 0.0027383132837712765, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 8877 + }, + { + "epoch": 0.24496356104946937, + "grad_norm": 0.00903650838881731, + "learning_rate": 0.001, + "loss": 0.3843, + "step": 8878 + }, + { + "epoch": 0.24499115325053375, + "grad_norm": 0.01571609638631344, + "learning_rate": 0.001, + "loss": 0.3934, + "step": 8879 + }, + { + "epoch": 0.2450187454515981, + "grad_norm": 0.0059790583327412605, + "learning_rate": 0.001, + "loss": 0.372, + "step": 8880 + }, + { + "epoch": 0.24504633765266248, + "grad_norm": 0.0021693052258342505, + "learning_rate": 0.001, + "loss": 0.4197, + "step": 8881 + }, + { + "epoch": 0.24507392985372684, + "grad_norm": 0.002195873064920306, + "learning_rate": 0.001, + "loss": 0.4027, + "step": 8882 + }, + { + "epoch": 0.24510152205479122, + "grad_norm": 0.0032636993564665318, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 8883 + }, + { + "epoch": 0.2451291142558556, + "grad_norm": 0.005068526603281498, + "learning_rate": 0.001, + "loss": 0.4195, + "step": 8884 + }, + { + "epoch": 0.24515670645691995, + "grad_norm": 0.0020894519984722137, + "learning_rate": 0.001, + "loss": 0.4464, + "step": 8885 + }, + { + "epoch": 0.24518429865798433, + "grad_norm": 0.0024781054817140102, + "learning_rate": 0.001, + "loss": 0.3872, + "step": 8886 + }, + { + "epoch": 0.24521189085904868, + "grad_norm": 0.002742476761341095, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 8887 + }, + { + "epoch": 0.24523948306011306, + "grad_norm": 0.0031709850300103426, + "learning_rate": 0.001, + "loss": 0.3621, + "step": 8888 + }, + { + "epoch": 0.24526707526117744, + "grad_norm": 0.00277856457978487, + "learning_rate": 0.001, + "loss": 0.4364, + "step": 8889 + }, + { + "epoch": 0.2452946674622418, + "grad_norm": 0.004140378907322884, + "learning_rate": 0.001, + "loss": 0.4148, + "step": 8890 + }, + { + "epoch": 0.24532225966330617, + "grad_norm": 0.0283675380051136, + "learning_rate": 0.001, + "loss": 0.3886, + "step": 8891 + }, + { + "epoch": 0.24534985186437053, + "grad_norm": 0.010619306936860085, + "learning_rate": 0.001, + "loss": 0.4202, + "step": 8892 + }, + { + "epoch": 0.2453774440654349, + "grad_norm": 0.0036299237981438637, + "learning_rate": 0.001, + "loss": 0.4419, + "step": 8893 + }, + { + "epoch": 0.24540503626649926, + "grad_norm": 0.00281274551525712, + "learning_rate": 0.001, + "loss": 0.4136, + "step": 8894 + }, + { + "epoch": 0.24543262846756364, + "grad_norm": 0.0032774037681519985, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 8895 + }, + { + "epoch": 0.24546022066862802, + "grad_norm": 0.004956527147442102, + "learning_rate": 0.001, + "loss": 0.3636, + "step": 8896 + }, + { + "epoch": 0.24548781286969237, + "grad_norm": 0.003114642109721899, + "learning_rate": 0.001, + "loss": 0.3618, + "step": 8897 + }, + { + "epoch": 0.24551540507075675, + "grad_norm": 0.012209619395434856, + "learning_rate": 0.001, + "loss": 0.4098, + "step": 8898 + }, + { + "epoch": 0.2455429972718211, + "grad_norm": 0.005186257418245077, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 8899 + }, + { + "epoch": 0.2455705894728855, + "grad_norm": 0.003014913760125637, + "learning_rate": 0.001, + "loss": 0.3578, + "step": 8900 + }, + { + "epoch": 0.24559818167394987, + "grad_norm": 0.0029307794757187366, + "learning_rate": 0.001, + "loss": 0.3941, + "step": 8901 + }, + { + "epoch": 0.24562577387501422, + "grad_norm": 0.0025628958828747272, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 8902 + }, + { + "epoch": 0.2456533660760786, + "grad_norm": 0.0026851852890104055, + "learning_rate": 0.001, + "loss": 0.4178, + "step": 8903 + }, + { + "epoch": 0.24568095827714295, + "grad_norm": 0.0029029424767941236, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 8904 + }, + { + "epoch": 0.24570855047820733, + "grad_norm": 0.0035970478784292936, + "learning_rate": 0.001, + "loss": 0.3955, + "step": 8905 + }, + { + "epoch": 0.2457361426792717, + "grad_norm": 0.006312237121164799, + "learning_rate": 0.001, + "loss": 0.4197, + "step": 8906 + }, + { + "epoch": 0.24576373488033607, + "grad_norm": 0.005121266935020685, + "learning_rate": 0.001, + "loss": 0.4264, + "step": 8907 + }, + { + "epoch": 0.24579132708140045, + "grad_norm": 0.003577583469450474, + "learning_rate": 0.001, + "loss": 0.3669, + "step": 8908 + }, + { + "epoch": 0.2458189192824648, + "grad_norm": 0.0027252668514847755, + "learning_rate": 0.001, + "loss": 0.3777, + "step": 8909 + }, + { + "epoch": 0.24584651148352918, + "grad_norm": 0.0026001029182225466, + "learning_rate": 0.001, + "loss": 0.3571, + "step": 8910 + }, + { + "epoch": 0.24587410368459356, + "grad_norm": 0.0025208101142197847, + "learning_rate": 0.001, + "loss": 0.4442, + "step": 8911 + }, + { + "epoch": 0.2459016958856579, + "grad_norm": 0.004707024432718754, + "learning_rate": 0.001, + "loss": 0.404, + "step": 8912 + }, + { + "epoch": 0.2459292880867223, + "grad_norm": 0.0031555844470858574, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 8913 + }, + { + "epoch": 0.24595688028778664, + "grad_norm": 0.002120368182659149, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 8914 + }, + { + "epoch": 0.24598447248885102, + "grad_norm": 0.002101072110235691, + "learning_rate": 0.001, + "loss": 0.4142, + "step": 8915 + }, + { + "epoch": 0.2460120646899154, + "grad_norm": 0.0030265292152762413, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 8916 + }, + { + "epoch": 0.24603965689097976, + "grad_norm": 0.0027274913154542446, + "learning_rate": 0.001, + "loss": 0.3491, + "step": 8917 + }, + { + "epoch": 0.24606724909204414, + "grad_norm": 0.0028532680589705706, + "learning_rate": 0.001, + "loss": 0.3599, + "step": 8918 + }, + { + "epoch": 0.2460948412931085, + "grad_norm": 0.003202751511707902, + "learning_rate": 0.001, + "loss": 0.4211, + "step": 8919 + }, + { + "epoch": 0.24612243349417287, + "grad_norm": 0.00257576210424304, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 8920 + }, + { + "epoch": 0.24615002569523725, + "grad_norm": 0.002339770086109638, + "learning_rate": 0.001, + "loss": 0.4208, + "step": 8921 + }, + { + "epoch": 0.2461776178963016, + "grad_norm": 0.0026211580261588097, + "learning_rate": 0.001, + "loss": 0.4038, + "step": 8922 + }, + { + "epoch": 0.24620521009736598, + "grad_norm": 0.002310456009581685, + "learning_rate": 0.001, + "loss": 0.4027, + "step": 8923 + }, + { + "epoch": 0.24623280229843034, + "grad_norm": 0.0026841405779123306, + "learning_rate": 0.001, + "loss": 0.3782, + "step": 8924 + }, + { + "epoch": 0.24626039449949472, + "grad_norm": 0.0026302135083824396, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 8925 + }, + { + "epoch": 0.2462879867005591, + "grad_norm": 0.0022160590160638094, + "learning_rate": 0.001, + "loss": 0.4267, + "step": 8926 + }, + { + "epoch": 0.24631557890162345, + "grad_norm": 0.0031570009887218475, + "learning_rate": 0.001, + "loss": 0.3834, + "step": 8927 + }, + { + "epoch": 0.24634317110268783, + "grad_norm": 0.0042785764671862125, + "learning_rate": 0.001, + "loss": 0.3567, + "step": 8928 + }, + { + "epoch": 0.24637076330375218, + "grad_norm": 0.00619478989392519, + "learning_rate": 0.001, + "loss": 0.3987, + "step": 8929 + }, + { + "epoch": 0.24639835550481656, + "grad_norm": 0.0054093278013169765, + "learning_rate": 0.001, + "loss": 0.4192, + "step": 8930 + }, + { + "epoch": 0.24642594770588094, + "grad_norm": 0.004795154556632042, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 8931 + }, + { + "epoch": 0.2464535399069453, + "grad_norm": 0.003340116934850812, + "learning_rate": 0.001, + "loss": 0.4441, + "step": 8932 + }, + { + "epoch": 0.24648113210800968, + "grad_norm": 0.0024299235083162785, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 8933 + }, + { + "epoch": 0.24650872430907403, + "grad_norm": 0.0033976933918893337, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 8934 + }, + { + "epoch": 0.2465363165101384, + "grad_norm": 0.004061064217239618, + "learning_rate": 0.001, + "loss": 0.4087, + "step": 8935 + }, + { + "epoch": 0.2465639087112028, + "grad_norm": 0.002939441241323948, + "learning_rate": 0.001, + "loss": 0.3877, + "step": 8936 + }, + { + "epoch": 0.24659150091226714, + "grad_norm": 0.003068729070946574, + "learning_rate": 0.001, + "loss": 0.4232, + "step": 8937 + }, + { + "epoch": 0.24661909311333152, + "grad_norm": 0.003941199276596308, + "learning_rate": 0.001, + "loss": 0.4244, + "step": 8938 + }, + { + "epoch": 0.24664668531439587, + "grad_norm": 0.0029029848519712687, + "learning_rate": 0.001, + "loss": 0.4147, + "step": 8939 + }, + { + "epoch": 0.24667427751546026, + "grad_norm": 0.0041121323592960835, + "learning_rate": 0.001, + "loss": 0.4205, + "step": 8940 + }, + { + "epoch": 0.24670186971652464, + "grad_norm": 0.002195686800405383, + "learning_rate": 0.001, + "loss": 0.4237, + "step": 8941 + }, + { + "epoch": 0.246729461917589, + "grad_norm": 0.00511584896594286, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 8942 + }, + { + "epoch": 0.24675705411865337, + "grad_norm": 0.0033936495892703533, + "learning_rate": 0.001, + "loss": 0.4118, + "step": 8943 + }, + { + "epoch": 0.24678464631971772, + "grad_norm": 0.002642909763380885, + "learning_rate": 0.001, + "loss": 0.3948, + "step": 8944 + }, + { + "epoch": 0.2468122385207821, + "grad_norm": 0.007020313758403063, + "learning_rate": 0.001, + "loss": 0.3746, + "step": 8945 + }, + { + "epoch": 0.24683983072184648, + "grad_norm": 0.0026212832890450954, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 8946 + }, + { + "epoch": 0.24686742292291083, + "grad_norm": 0.00335301854647696, + "learning_rate": 0.001, + "loss": 0.3873, + "step": 8947 + }, + { + "epoch": 0.24689501512397521, + "grad_norm": 0.004139469005167484, + "learning_rate": 0.001, + "loss": 0.4164, + "step": 8948 + }, + { + "epoch": 0.24692260732503957, + "grad_norm": 0.004907173104584217, + "learning_rate": 0.001, + "loss": 0.4066, + "step": 8949 + }, + { + "epoch": 0.24695019952610395, + "grad_norm": 0.0036863782443106174, + "learning_rate": 0.001, + "loss": 0.4069, + "step": 8950 + }, + { + "epoch": 0.24697779172716833, + "grad_norm": 0.004586333874613047, + "learning_rate": 0.001, + "loss": 0.3842, + "step": 8951 + }, + { + "epoch": 0.24700538392823268, + "grad_norm": 0.002670851768925786, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 8952 + }, + { + "epoch": 0.24703297612929706, + "grad_norm": 0.003045836230739951, + "learning_rate": 0.001, + "loss": 0.3962, + "step": 8953 + }, + { + "epoch": 0.2470605683303614, + "grad_norm": 0.004562459886074066, + "learning_rate": 0.001, + "loss": 0.3694, + "step": 8954 + }, + { + "epoch": 0.2470881605314258, + "grad_norm": 0.003939764108508825, + "learning_rate": 0.001, + "loss": 0.3917, + "step": 8955 + }, + { + "epoch": 0.24711575273249017, + "grad_norm": 0.00309895072132349, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 8956 + }, + { + "epoch": 0.24714334493355453, + "grad_norm": 0.00386620219796896, + "learning_rate": 0.001, + "loss": 0.3703, + "step": 8957 + }, + { + "epoch": 0.2471709371346189, + "grad_norm": 0.01034584641456604, + "learning_rate": 0.001, + "loss": 0.403, + "step": 8958 + }, + { + "epoch": 0.24719852933568326, + "grad_norm": 0.002349577145650983, + "learning_rate": 0.001, + "loss": 0.3831, + "step": 8959 + }, + { + "epoch": 0.24722612153674764, + "grad_norm": 0.003580132033675909, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 8960 + }, + { + "epoch": 0.24725371373781202, + "grad_norm": 0.0027131785172969103, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 8961 + }, + { + "epoch": 0.24728130593887637, + "grad_norm": 0.0030202209018170834, + "learning_rate": 0.001, + "loss": 0.3688, + "step": 8962 + }, + { + "epoch": 0.24730889813994075, + "grad_norm": 0.002842100802809, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 8963 + }, + { + "epoch": 0.2473364903410051, + "grad_norm": 0.002618222264572978, + "learning_rate": 0.001, + "loss": 0.4299, + "step": 8964 + }, + { + "epoch": 0.24736408254206949, + "grad_norm": 0.0024017065297812223, + "learning_rate": 0.001, + "loss": 0.37, + "step": 8965 + }, + { + "epoch": 0.24739167474313387, + "grad_norm": 0.0025306292809545994, + "learning_rate": 0.001, + "loss": 0.4096, + "step": 8966 + }, + { + "epoch": 0.24741926694419822, + "grad_norm": 0.004213306587189436, + "learning_rate": 0.001, + "loss": 0.4261, + "step": 8967 + }, + { + "epoch": 0.2474468591452626, + "grad_norm": 0.0023831671569496393, + "learning_rate": 0.001, + "loss": 0.4304, + "step": 8968 + }, + { + "epoch": 0.24747445134632695, + "grad_norm": 0.002815242623910308, + "learning_rate": 0.001, + "loss": 0.4499, + "step": 8969 + }, + { + "epoch": 0.24750204354739133, + "grad_norm": 0.0026342514902353287, + "learning_rate": 0.001, + "loss": 0.3721, + "step": 8970 + }, + { + "epoch": 0.2475296357484557, + "grad_norm": 0.004273373633623123, + "learning_rate": 0.001, + "loss": 0.3637, + "step": 8971 + }, + { + "epoch": 0.24755722794952006, + "grad_norm": 0.003125197486951947, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 8972 + }, + { + "epoch": 0.24758482015058444, + "grad_norm": 0.004526606295257807, + "learning_rate": 0.001, + "loss": 0.3571, + "step": 8973 + }, + { + "epoch": 0.2476124123516488, + "grad_norm": 0.0031581746879965067, + "learning_rate": 0.001, + "loss": 0.3923, + "step": 8974 + }, + { + "epoch": 0.24764000455271318, + "grad_norm": 0.0028788463678210974, + "learning_rate": 0.001, + "loss": 0.364, + "step": 8975 + }, + { + "epoch": 0.24766759675377756, + "grad_norm": 0.004946670029312372, + "learning_rate": 0.001, + "loss": 0.4213, + "step": 8976 + }, + { + "epoch": 0.2476951889548419, + "grad_norm": 0.004649583250284195, + "learning_rate": 0.001, + "loss": 0.419, + "step": 8977 + }, + { + "epoch": 0.2477227811559063, + "grad_norm": 0.0023594009689986706, + "learning_rate": 0.001, + "loss": 0.407, + "step": 8978 + }, + { + "epoch": 0.24775037335697064, + "grad_norm": 0.0028385408222675323, + "learning_rate": 0.001, + "loss": 0.3887, + "step": 8979 + }, + { + "epoch": 0.24777796555803502, + "grad_norm": 0.0029238059651106596, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 8980 + }, + { + "epoch": 0.2478055577590994, + "grad_norm": 0.002410429297015071, + "learning_rate": 0.001, + "loss": 0.4243, + "step": 8981 + }, + { + "epoch": 0.24783314996016376, + "grad_norm": 0.0036725676618516445, + "learning_rate": 0.001, + "loss": 0.4202, + "step": 8982 + }, + { + "epoch": 0.24786074216122814, + "grad_norm": 0.0026173696387559175, + "learning_rate": 0.001, + "loss": 0.4119, + "step": 8983 + }, + { + "epoch": 0.2478883343622925, + "grad_norm": 0.0019191295141354203, + "learning_rate": 0.001, + "loss": 0.411, + "step": 8984 + }, + { + "epoch": 0.24791592656335687, + "grad_norm": 0.003160592168569565, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 8985 + }, + { + "epoch": 0.24794351876442125, + "grad_norm": 0.002665070816874504, + "learning_rate": 0.001, + "loss": 0.3889, + "step": 8986 + }, + { + "epoch": 0.2479711109654856, + "grad_norm": 0.0035125231370329857, + "learning_rate": 0.001, + "loss": 0.3749, + "step": 8987 + }, + { + "epoch": 0.24799870316654998, + "grad_norm": 0.0034389530774205923, + "learning_rate": 0.001, + "loss": 0.3914, + "step": 8988 + }, + { + "epoch": 0.24802629536761434, + "grad_norm": 0.00577655341476202, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 8989 + }, + { + "epoch": 0.24805388756867872, + "grad_norm": 0.002529504243284464, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 8990 + }, + { + "epoch": 0.24808147976974307, + "grad_norm": 0.003672944149002433, + "learning_rate": 0.001, + "loss": 0.3858, + "step": 8991 + }, + { + "epoch": 0.24810907197080745, + "grad_norm": 0.004433492664247751, + "learning_rate": 0.001, + "loss": 0.3826, + "step": 8992 + }, + { + "epoch": 0.24813666417187183, + "grad_norm": 0.0036997883580625057, + "learning_rate": 0.001, + "loss": 0.4014, + "step": 8993 + }, + { + "epoch": 0.24816425637293618, + "grad_norm": 0.0071926964446902275, + "learning_rate": 0.001, + "loss": 0.4054, + "step": 8994 + }, + { + "epoch": 0.24819184857400056, + "grad_norm": 0.002511198166757822, + "learning_rate": 0.001, + "loss": 0.413, + "step": 8995 + }, + { + "epoch": 0.24821944077506491, + "grad_norm": 0.0027406965382397175, + "learning_rate": 0.001, + "loss": 0.4401, + "step": 8996 + }, + { + "epoch": 0.2482470329761293, + "grad_norm": 0.0022513847798109055, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 8997 + }, + { + "epoch": 0.24827462517719368, + "grad_norm": 0.011038105934858322, + "learning_rate": 0.001, + "loss": 0.3784, + "step": 8998 + }, + { + "epoch": 0.24830221737825803, + "grad_norm": 0.002501958515495062, + "learning_rate": 0.001, + "loss": 0.3689, + "step": 8999 + }, + { + "epoch": 0.2483298095793224, + "grad_norm": 0.0029480638913810253, + "learning_rate": 0.001, + "loss": 0.406, + "step": 9000 + }, + { + "epoch": 0.2483298095793224, + "eval_runtime": 24.7639, + "eval_samples_per_second": 1.292, + "eval_steps_per_second": 0.162, + "step": 9000 + }, + { + "epoch": 0.24835740178038676, + "grad_norm": 0.0028752442449331284, + "learning_rate": 0.001, + "loss": 0.4275, + "step": 9001 + }, + { + "epoch": 0.24838499398145114, + "grad_norm": 0.002456225221976638, + "learning_rate": 0.001, + "loss": 0.4138, + "step": 9002 + }, + { + "epoch": 0.24841258618251552, + "grad_norm": 0.0035426050890237093, + "learning_rate": 0.001, + "loss": 0.385, + "step": 9003 + }, + { + "epoch": 0.24844017838357987, + "grad_norm": 0.0030952002853155136, + "learning_rate": 0.001, + "loss": 0.3833, + "step": 9004 + }, + { + "epoch": 0.24846777058464425, + "grad_norm": 0.004819031804800034, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 9005 + }, + { + "epoch": 0.2484953627857086, + "grad_norm": 0.007780123967677355, + "learning_rate": 0.001, + "loss": 0.3896, + "step": 9006 + }, + { + "epoch": 0.248522954986773, + "grad_norm": 0.005544982384890318, + "learning_rate": 0.001, + "loss": 0.3812, + "step": 9007 + }, + { + "epoch": 0.24855054718783737, + "grad_norm": 0.00238407077267766, + "learning_rate": 0.001, + "loss": 0.4002, + "step": 9008 + }, + { + "epoch": 0.24857813938890172, + "grad_norm": 0.0024830265901982784, + "learning_rate": 0.001, + "loss": 0.3894, + "step": 9009 + }, + { + "epoch": 0.2486057315899661, + "grad_norm": 0.001999202184379101, + "learning_rate": 0.001, + "loss": 0.4219, + "step": 9010 + }, + { + "epoch": 0.24863332379103045, + "grad_norm": 0.0038518370129168034, + "learning_rate": 0.001, + "loss": 0.39, + "step": 9011 + }, + { + "epoch": 0.24866091599209483, + "grad_norm": 0.0025100288912653923, + "learning_rate": 0.001, + "loss": 0.4523, + "step": 9012 + }, + { + "epoch": 0.2486885081931592, + "grad_norm": 0.0034927844535559416, + "learning_rate": 0.001, + "loss": 0.3875, + "step": 9013 + }, + { + "epoch": 0.24871610039422357, + "grad_norm": 0.0020984727889299393, + "learning_rate": 0.001, + "loss": 0.4255, + "step": 9014 + }, + { + "epoch": 0.24874369259528795, + "grad_norm": 0.003706206800416112, + "learning_rate": 0.001, + "loss": 0.3502, + "step": 9015 + }, + { + "epoch": 0.2487712847963523, + "grad_norm": 0.002861752174794674, + "learning_rate": 0.001, + "loss": 0.3768, + "step": 9016 + }, + { + "epoch": 0.24879887699741668, + "grad_norm": 0.0027587637305259705, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 9017 + }, + { + "epoch": 0.24882646919848106, + "grad_norm": 0.002627007896080613, + "learning_rate": 0.001, + "loss": 0.4274, + "step": 9018 + }, + { + "epoch": 0.2488540613995454, + "grad_norm": 0.0047552213072776794, + "learning_rate": 0.001, + "loss": 0.3754, + "step": 9019 + }, + { + "epoch": 0.2488816536006098, + "grad_norm": 0.002344539389014244, + "learning_rate": 0.001, + "loss": 0.4069, + "step": 9020 + }, + { + "epoch": 0.24890924580167414, + "grad_norm": 0.0031450914684683084, + "learning_rate": 0.001, + "loss": 0.3758, + "step": 9021 + }, + { + "epoch": 0.24893683800273853, + "grad_norm": 0.0026123332791030407, + "learning_rate": 0.001, + "loss": 0.4125, + "step": 9022 + }, + { + "epoch": 0.2489644302038029, + "grad_norm": 0.005729448515921831, + "learning_rate": 0.001, + "loss": 0.405, + "step": 9023 + }, + { + "epoch": 0.24899202240486726, + "grad_norm": 0.003826259169727564, + "learning_rate": 0.001, + "loss": 0.4105, + "step": 9024 + }, + { + "epoch": 0.24901961460593164, + "grad_norm": 0.005589526146650314, + "learning_rate": 0.001, + "loss": 0.3767, + "step": 9025 + }, + { + "epoch": 0.249047206806996, + "grad_norm": 0.003237026045098901, + "learning_rate": 0.001, + "loss": 0.3738, + "step": 9026 + }, + { + "epoch": 0.24907479900806037, + "grad_norm": 0.0024905288591980934, + "learning_rate": 0.001, + "loss": 0.418, + "step": 9027 + }, + { + "epoch": 0.24910239120912475, + "grad_norm": 0.0029746349900960922, + "learning_rate": 0.001, + "loss": 0.437, + "step": 9028 + }, + { + "epoch": 0.2491299834101891, + "grad_norm": 0.003939430229365826, + "learning_rate": 0.001, + "loss": 0.3604, + "step": 9029 + }, + { + "epoch": 0.24915757561125348, + "grad_norm": 0.0025192126631736755, + "learning_rate": 0.001, + "loss": 0.4166, + "step": 9030 + }, + { + "epoch": 0.24918516781231784, + "grad_norm": 0.0019102268852293491, + "learning_rate": 0.001, + "loss": 0.4205, + "step": 9031 + }, + { + "epoch": 0.24921276001338222, + "grad_norm": 0.008570538833737373, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 9032 + }, + { + "epoch": 0.2492403522144466, + "grad_norm": 0.0023212565574795008, + "learning_rate": 0.001, + "loss": 0.4246, + "step": 9033 + }, + { + "epoch": 0.24926794441551095, + "grad_norm": 0.0023950086906552315, + "learning_rate": 0.001, + "loss": 0.3955, + "step": 9034 + }, + { + "epoch": 0.24929553661657533, + "grad_norm": 0.002595582278445363, + "learning_rate": 0.001, + "loss": 0.3921, + "step": 9035 + }, + { + "epoch": 0.24932312881763968, + "grad_norm": 0.012406972236931324, + "learning_rate": 0.001, + "loss": 0.3759, + "step": 9036 + }, + { + "epoch": 0.24935072101870406, + "grad_norm": 0.0029333438724279404, + "learning_rate": 0.001, + "loss": 0.3703, + "step": 9037 + }, + { + "epoch": 0.24937831321976844, + "grad_norm": 0.004842236638069153, + "learning_rate": 0.001, + "loss": 0.3641, + "step": 9038 + }, + { + "epoch": 0.2494059054208328, + "grad_norm": 0.0062524196691811085, + "learning_rate": 0.001, + "loss": 0.397, + "step": 9039 + }, + { + "epoch": 0.24943349762189718, + "grad_norm": 0.004324330948293209, + "learning_rate": 0.001, + "loss": 0.4325, + "step": 9040 + }, + { + "epoch": 0.24946108982296153, + "grad_norm": 0.0026493435725569725, + "learning_rate": 0.001, + "loss": 0.4121, + "step": 9041 + }, + { + "epoch": 0.2494886820240259, + "grad_norm": 0.003301947610452771, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 9042 + }, + { + "epoch": 0.2495162742250903, + "grad_norm": 0.0035235814284533262, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 9043 + }, + { + "epoch": 0.24954386642615464, + "grad_norm": 0.002071363152936101, + "learning_rate": 0.001, + "loss": 0.3916, + "step": 9044 + }, + { + "epoch": 0.24957145862721902, + "grad_norm": 0.0021638255566358566, + "learning_rate": 0.001, + "loss": 0.3837, + "step": 9045 + }, + { + "epoch": 0.24959905082828338, + "grad_norm": 0.0027805992867797613, + "learning_rate": 0.001, + "loss": 0.3732, + "step": 9046 + }, + { + "epoch": 0.24962664302934776, + "grad_norm": 0.004270479548722506, + "learning_rate": 0.001, + "loss": 0.3954, + "step": 9047 + }, + { + "epoch": 0.24965423523041214, + "grad_norm": 0.002844201633706689, + "learning_rate": 0.001, + "loss": 0.4189, + "step": 9048 + }, + { + "epoch": 0.2496818274314765, + "grad_norm": 0.0030719267670065165, + "learning_rate": 0.001, + "loss": 0.3851, + "step": 9049 + }, + { + "epoch": 0.24970941963254087, + "grad_norm": 0.008080038242042065, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 9050 + }, + { + "epoch": 0.24973701183360522, + "grad_norm": 0.0022514360025525093, + "learning_rate": 0.001, + "loss": 0.4391, + "step": 9051 + }, + { + "epoch": 0.2497646040346696, + "grad_norm": 0.0026551985647529364, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 9052 + }, + { + "epoch": 0.24979219623573398, + "grad_norm": 0.002240521600469947, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 9053 + }, + { + "epoch": 0.24981978843679833, + "grad_norm": 0.0024243989028036594, + "learning_rate": 0.001, + "loss": 0.3804, + "step": 9054 + }, + { + "epoch": 0.24984738063786271, + "grad_norm": 0.004180778283625841, + "learning_rate": 0.001, + "loss": 0.4338, + "step": 9055 + }, + { + "epoch": 0.24987497283892707, + "grad_norm": 0.003084244905039668, + "learning_rate": 0.001, + "loss": 0.4469, + "step": 9056 + }, + { + "epoch": 0.24990256503999145, + "grad_norm": 0.0029490680899471045, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 9057 + }, + { + "epoch": 0.24993015724105583, + "grad_norm": 0.002250351244583726, + "learning_rate": 0.001, + "loss": 0.4503, + "step": 9058 + }, + { + "epoch": 0.24995774944212018, + "grad_norm": 0.0033541766460984945, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 9059 + }, + { + "epoch": 0.24998534164318456, + "grad_norm": 0.005710989236831665, + "learning_rate": 0.001, + "loss": 0.3707, + "step": 9060 + }, + { + "epoch": 0.25001293384424894, + "grad_norm": 0.002218688605353236, + "learning_rate": 0.001, + "loss": 0.4303, + "step": 9061 + }, + { + "epoch": 0.2500405260453133, + "grad_norm": 0.014134782366454601, + "learning_rate": 0.001, + "loss": 0.3534, + "step": 9062 + }, + { + "epoch": 0.25006811824637765, + "grad_norm": 0.003705043811351061, + "learning_rate": 0.001, + "loss": 0.4308, + "step": 9063 + }, + { + "epoch": 0.25009571044744205, + "grad_norm": 0.0030994920525699854, + "learning_rate": 0.001, + "loss": 0.4176, + "step": 9064 + }, + { + "epoch": 0.2501233026485064, + "grad_norm": 0.002610408468171954, + "learning_rate": 0.001, + "loss": 0.3696, + "step": 9065 + }, + { + "epoch": 0.25015089484957076, + "grad_norm": 0.0018913348903879523, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 9066 + }, + { + "epoch": 0.2501784870506351, + "grad_norm": 0.002292625606060028, + "learning_rate": 0.001, + "loss": 0.4287, + "step": 9067 + }, + { + "epoch": 0.2502060792516995, + "grad_norm": 0.0036623626947402954, + "learning_rate": 0.001, + "loss": 0.3768, + "step": 9068 + }, + { + "epoch": 0.2502336714527639, + "grad_norm": 0.002552927238866687, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 9069 + }, + { + "epoch": 0.2502612636538282, + "grad_norm": 0.005650006700307131, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 9070 + }, + { + "epoch": 0.25028885585489263, + "grad_norm": 0.0022689776960760355, + "learning_rate": 0.001, + "loss": 0.4568, + "step": 9071 + }, + { + "epoch": 0.250316448055957, + "grad_norm": 0.002632437041029334, + "learning_rate": 0.001, + "loss": 0.4326, + "step": 9072 + }, + { + "epoch": 0.25034404025702134, + "grad_norm": 0.004504368640482426, + "learning_rate": 0.001, + "loss": 0.3648, + "step": 9073 + }, + { + "epoch": 0.25037163245808575, + "grad_norm": 0.0025573919992893934, + "learning_rate": 0.001, + "loss": 0.4094, + "step": 9074 + }, + { + "epoch": 0.2503992246591501, + "grad_norm": 0.00638620974496007, + "learning_rate": 0.001, + "loss": 0.3792, + "step": 9075 + }, + { + "epoch": 0.25042681686021445, + "grad_norm": 0.002866227412596345, + "learning_rate": 0.001, + "loss": 0.4103, + "step": 9076 + }, + { + "epoch": 0.2504544090612788, + "grad_norm": 0.002649690955877304, + "learning_rate": 0.001, + "loss": 0.3756, + "step": 9077 + }, + { + "epoch": 0.2504820012623432, + "grad_norm": 0.002525368705391884, + "learning_rate": 0.001, + "loss": 0.444, + "step": 9078 + }, + { + "epoch": 0.25050959346340756, + "grad_norm": 0.008922641165554523, + "learning_rate": 0.001, + "loss": 0.4392, + "step": 9079 + }, + { + "epoch": 0.2505371856644719, + "grad_norm": 0.0028822512831538916, + "learning_rate": 0.001, + "loss": 0.4405, + "step": 9080 + }, + { + "epoch": 0.2505647778655363, + "grad_norm": 0.0031333775259554386, + "learning_rate": 0.001, + "loss": 0.3556, + "step": 9081 + }, + { + "epoch": 0.2505923700666007, + "grad_norm": 0.0029146878514438868, + "learning_rate": 0.001, + "loss": 0.3944, + "step": 9082 + }, + { + "epoch": 0.25061996226766503, + "grad_norm": 0.005818530917167664, + "learning_rate": 0.001, + "loss": 0.4303, + "step": 9083 + }, + { + "epoch": 0.25064755446872944, + "grad_norm": 0.0026978689711540937, + "learning_rate": 0.001, + "loss": 0.4441, + "step": 9084 + }, + { + "epoch": 0.2506751466697938, + "grad_norm": 0.005949284881353378, + "learning_rate": 0.001, + "loss": 0.3697, + "step": 9085 + }, + { + "epoch": 0.25070273887085814, + "grad_norm": 0.003694251412525773, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 9086 + }, + { + "epoch": 0.2507303310719225, + "grad_norm": 0.005075919907540083, + "learning_rate": 0.001, + "loss": 0.4207, + "step": 9087 + }, + { + "epoch": 0.2507579232729869, + "grad_norm": 0.002228903118520975, + "learning_rate": 0.001, + "loss": 0.4303, + "step": 9088 + }, + { + "epoch": 0.25078551547405126, + "grad_norm": 0.002552257152274251, + "learning_rate": 0.001, + "loss": 0.4026, + "step": 9089 + }, + { + "epoch": 0.2508131076751156, + "grad_norm": 0.004208039958029985, + "learning_rate": 0.001, + "loss": 0.4442, + "step": 9090 + }, + { + "epoch": 0.25084069987618, + "grad_norm": 0.004520753864198923, + "learning_rate": 0.001, + "loss": 0.3442, + "step": 9091 + }, + { + "epoch": 0.25086829207724437, + "grad_norm": 0.002299990737810731, + "learning_rate": 0.001, + "loss": 0.4183, + "step": 9092 + }, + { + "epoch": 0.2508958842783087, + "grad_norm": 0.003526929300278425, + "learning_rate": 0.001, + "loss": 0.3555, + "step": 9093 + }, + { + "epoch": 0.25092347647937313, + "grad_norm": 0.0026393721345812082, + "learning_rate": 0.001, + "loss": 0.4189, + "step": 9094 + }, + { + "epoch": 0.2509510686804375, + "grad_norm": 0.0042528389021754265, + "learning_rate": 0.001, + "loss": 0.4108, + "step": 9095 + }, + { + "epoch": 0.25097866088150184, + "grad_norm": 0.004804633557796478, + "learning_rate": 0.001, + "loss": 0.3721, + "step": 9096 + }, + { + "epoch": 0.2510062530825662, + "grad_norm": 0.002310951007530093, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 9097 + }, + { + "epoch": 0.2510338452836306, + "grad_norm": 0.0024983736220747232, + "learning_rate": 0.001, + "loss": 0.4178, + "step": 9098 + }, + { + "epoch": 0.25106143748469495, + "grad_norm": 0.0018971741665154696, + "learning_rate": 0.001, + "loss": 0.4081, + "step": 9099 + }, + { + "epoch": 0.2510890296857593, + "grad_norm": 0.002223706105723977, + "learning_rate": 0.001, + "loss": 0.4055, + "step": 9100 + }, + { + "epoch": 0.2511166218868237, + "grad_norm": 0.0028201346285641193, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 9101 + }, + { + "epoch": 0.25114421408788806, + "grad_norm": 0.0036441651172935963, + "learning_rate": 0.001, + "loss": 0.3565, + "step": 9102 + }, + { + "epoch": 0.2511718062889524, + "grad_norm": 0.002532258862629533, + "learning_rate": 0.001, + "loss": 0.4385, + "step": 9103 + }, + { + "epoch": 0.2511993984900168, + "grad_norm": 0.0029783290810883045, + "learning_rate": 0.001, + "loss": 0.3741, + "step": 9104 + }, + { + "epoch": 0.2512269906910812, + "grad_norm": 0.003864760510623455, + "learning_rate": 0.001, + "loss": 0.401, + "step": 9105 + }, + { + "epoch": 0.25125458289214553, + "grad_norm": 0.004010128788650036, + "learning_rate": 0.001, + "loss": 0.3796, + "step": 9106 + }, + { + "epoch": 0.2512821750932099, + "grad_norm": 0.004032640252262354, + "learning_rate": 0.001, + "loss": 0.373, + "step": 9107 + }, + { + "epoch": 0.2513097672942743, + "grad_norm": 0.003456498496234417, + "learning_rate": 0.001, + "loss": 0.3834, + "step": 9108 + }, + { + "epoch": 0.25133735949533864, + "grad_norm": 0.0041527096182107925, + "learning_rate": 0.001, + "loss": 0.4172, + "step": 9109 + }, + { + "epoch": 0.251364951696403, + "grad_norm": 0.0033256958704441786, + "learning_rate": 0.001, + "loss": 0.4157, + "step": 9110 + }, + { + "epoch": 0.2513925438974674, + "grad_norm": 0.009706409648060799, + "learning_rate": 0.001, + "loss": 0.3826, + "step": 9111 + }, + { + "epoch": 0.25142013609853175, + "grad_norm": 0.005892944522202015, + "learning_rate": 0.001, + "loss": 0.4361, + "step": 9112 + }, + { + "epoch": 0.2514477282995961, + "grad_norm": 0.0027680047787725925, + "learning_rate": 0.001, + "loss": 0.4273, + "step": 9113 + }, + { + "epoch": 0.2514753205006605, + "grad_norm": 0.0046938760206103325, + "learning_rate": 0.001, + "loss": 0.3873, + "step": 9114 + }, + { + "epoch": 0.25150291270172487, + "grad_norm": 0.00436344463378191, + "learning_rate": 0.001, + "loss": 0.4194, + "step": 9115 + }, + { + "epoch": 0.2515305049027892, + "grad_norm": 0.002917677629739046, + "learning_rate": 0.001, + "loss": 0.4181, + "step": 9116 + }, + { + "epoch": 0.2515580971038536, + "grad_norm": 0.0041982559487223625, + "learning_rate": 0.001, + "loss": 0.3528, + "step": 9117 + }, + { + "epoch": 0.251585689304918, + "grad_norm": 0.003178415121510625, + "learning_rate": 0.001, + "loss": 0.3877, + "step": 9118 + }, + { + "epoch": 0.25161328150598233, + "grad_norm": 0.00351191614754498, + "learning_rate": 0.001, + "loss": 0.3981, + "step": 9119 + }, + { + "epoch": 0.2516408737070467, + "grad_norm": 0.002716810442507267, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 9120 + }, + { + "epoch": 0.2516684659081111, + "grad_norm": 0.00262506608851254, + "learning_rate": 0.001, + "loss": 0.4212, + "step": 9121 + }, + { + "epoch": 0.25169605810917545, + "grad_norm": 0.005152091383934021, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 9122 + }, + { + "epoch": 0.2517236503102398, + "grad_norm": 0.003531603142619133, + "learning_rate": 0.001, + "loss": 0.379, + "step": 9123 + }, + { + "epoch": 0.2517512425113042, + "grad_norm": 0.0037356463726609945, + "learning_rate": 0.001, + "loss": 0.4145, + "step": 9124 + }, + { + "epoch": 0.25177883471236856, + "grad_norm": 0.02328849956393242, + "learning_rate": 0.001, + "loss": 0.4216, + "step": 9125 + }, + { + "epoch": 0.2518064269134329, + "grad_norm": 0.0030895329546183348, + "learning_rate": 0.001, + "loss": 0.3952, + "step": 9126 + }, + { + "epoch": 0.25183401911449727, + "grad_norm": 0.002636708552017808, + "learning_rate": 0.001, + "loss": 0.3837, + "step": 9127 + }, + { + "epoch": 0.2518616113155617, + "grad_norm": 0.0028558552730828524, + "learning_rate": 0.001, + "loss": 0.3888, + "step": 9128 + }, + { + "epoch": 0.251889203516626, + "grad_norm": 0.003687650430947542, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 9129 + }, + { + "epoch": 0.2519167957176904, + "grad_norm": 0.002363695530220866, + "learning_rate": 0.001, + "loss": 0.4072, + "step": 9130 + }, + { + "epoch": 0.2519443879187548, + "grad_norm": 0.0024190808180719614, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 9131 + }, + { + "epoch": 0.25197198011981914, + "grad_norm": 0.002729938132688403, + "learning_rate": 0.001, + "loss": 0.3879, + "step": 9132 + }, + { + "epoch": 0.2519995723208835, + "grad_norm": 0.003316995920613408, + "learning_rate": 0.001, + "loss": 0.3628, + "step": 9133 + }, + { + "epoch": 0.25202716452194784, + "grad_norm": 0.002829955890774727, + "learning_rate": 0.001, + "loss": 0.3945, + "step": 9134 + }, + { + "epoch": 0.25205475672301225, + "grad_norm": 0.0023533031344413757, + "learning_rate": 0.001, + "loss": 0.4032, + "step": 9135 + }, + { + "epoch": 0.2520823489240766, + "grad_norm": 0.002993488684296608, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 9136 + }, + { + "epoch": 0.25210994112514096, + "grad_norm": 0.002113811671733856, + "learning_rate": 0.001, + "loss": 0.4032, + "step": 9137 + }, + { + "epoch": 0.25213753332620537, + "grad_norm": 0.00470153009518981, + "learning_rate": 0.001, + "loss": 0.3624, + "step": 9138 + }, + { + "epoch": 0.2521651255272697, + "grad_norm": 0.0033217284362763166, + "learning_rate": 0.001, + "loss": 0.4062, + "step": 9139 + }, + { + "epoch": 0.25219271772833407, + "grad_norm": 0.0025939196348190308, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 9140 + }, + { + "epoch": 0.2522203099293985, + "grad_norm": 0.0032115280628204346, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 9141 + }, + { + "epoch": 0.25224790213046283, + "grad_norm": 0.0023531445767730474, + "learning_rate": 0.001, + "loss": 0.4231, + "step": 9142 + }, + { + "epoch": 0.2522754943315272, + "grad_norm": 0.0035788915120065212, + "learning_rate": 0.001, + "loss": 0.386, + "step": 9143 + }, + { + "epoch": 0.25230308653259154, + "grad_norm": 0.002639509504660964, + "learning_rate": 0.001, + "loss": 0.4204, + "step": 9144 + }, + { + "epoch": 0.25233067873365594, + "grad_norm": 0.0037151076830923557, + "learning_rate": 0.001, + "loss": 0.4048, + "step": 9145 + }, + { + "epoch": 0.2523582709347203, + "grad_norm": 0.002693182323127985, + "learning_rate": 0.001, + "loss": 0.4174, + "step": 9146 + }, + { + "epoch": 0.25238586313578465, + "grad_norm": 0.00298108346760273, + "learning_rate": 0.001, + "loss": 0.4452, + "step": 9147 + }, + { + "epoch": 0.25241345533684906, + "grad_norm": 0.005775284022092819, + "learning_rate": 0.001, + "loss": 0.381, + "step": 9148 + }, + { + "epoch": 0.2524410475379134, + "grad_norm": 0.004025799687951803, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 9149 + }, + { + "epoch": 0.25246863973897776, + "grad_norm": 0.0029659196734428406, + "learning_rate": 0.001, + "loss": 0.4274, + "step": 9150 + }, + { + "epoch": 0.25249623194004217, + "grad_norm": 0.006109724286943674, + "learning_rate": 0.001, + "loss": 0.4322, + "step": 9151 + }, + { + "epoch": 0.2525238241411065, + "grad_norm": 0.004189697094261646, + "learning_rate": 0.001, + "loss": 0.3635, + "step": 9152 + }, + { + "epoch": 0.2525514163421709, + "grad_norm": 0.004084098618477583, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 9153 + }, + { + "epoch": 0.25257900854323523, + "grad_norm": 0.0035397496540099382, + "learning_rate": 0.001, + "loss": 0.4127, + "step": 9154 + }, + { + "epoch": 0.25260660074429964, + "grad_norm": 0.002793220803141594, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 9155 + }, + { + "epoch": 0.252634192945364, + "grad_norm": 0.0030049553606659174, + "learning_rate": 0.001, + "loss": 0.4367, + "step": 9156 + }, + { + "epoch": 0.25266178514642834, + "grad_norm": 0.003896713722497225, + "learning_rate": 0.001, + "loss": 0.3657, + "step": 9157 + }, + { + "epoch": 0.25268937734749275, + "grad_norm": 0.0028666965663433075, + "learning_rate": 0.001, + "loss": 0.4524, + "step": 9158 + }, + { + "epoch": 0.2527169695485571, + "grad_norm": 0.0026138313114643097, + "learning_rate": 0.001, + "loss": 0.4174, + "step": 9159 + }, + { + "epoch": 0.25274456174962145, + "grad_norm": 0.0023572929203510284, + "learning_rate": 0.001, + "loss": 0.4086, + "step": 9160 + }, + { + "epoch": 0.25277215395068586, + "grad_norm": 0.003126518102362752, + "learning_rate": 0.001, + "loss": 0.3762, + "step": 9161 + }, + { + "epoch": 0.2527997461517502, + "grad_norm": 0.0024071959778666496, + "learning_rate": 0.001, + "loss": 0.4011, + "step": 9162 + }, + { + "epoch": 0.25282733835281457, + "grad_norm": 0.0023764509242028, + "learning_rate": 0.001, + "loss": 0.4642, + "step": 9163 + }, + { + "epoch": 0.2528549305538789, + "grad_norm": 0.002558658830821514, + "learning_rate": 0.001, + "loss": 0.387, + "step": 9164 + }, + { + "epoch": 0.25288252275494333, + "grad_norm": 0.004064169712364674, + "learning_rate": 0.001, + "loss": 0.3882, + "step": 9165 + }, + { + "epoch": 0.2529101149560077, + "grad_norm": 0.0038082520477473736, + "learning_rate": 0.001, + "loss": 0.3963, + "step": 9166 + }, + { + "epoch": 0.25293770715707203, + "grad_norm": 0.002633225405588746, + "learning_rate": 0.001, + "loss": 0.3788, + "step": 9167 + }, + { + "epoch": 0.25296529935813644, + "grad_norm": 0.002508282894268632, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 9168 + }, + { + "epoch": 0.2529928915592008, + "grad_norm": 0.0038721607998013496, + "learning_rate": 0.001, + "loss": 0.3782, + "step": 9169 + }, + { + "epoch": 0.25302048376026515, + "grad_norm": 0.0033644256182014942, + "learning_rate": 0.001, + "loss": 0.3851, + "step": 9170 + }, + { + "epoch": 0.25304807596132955, + "grad_norm": 0.002658225130289793, + "learning_rate": 0.001, + "loss": 0.3945, + "step": 9171 + }, + { + "epoch": 0.2530756681623939, + "grad_norm": 0.0030783566180616617, + "learning_rate": 0.001, + "loss": 0.3867, + "step": 9172 + }, + { + "epoch": 0.25310326036345826, + "grad_norm": 0.0024686295073479414, + "learning_rate": 0.001, + "loss": 0.4195, + "step": 9173 + }, + { + "epoch": 0.2531308525645226, + "grad_norm": 0.0030403723940253258, + "learning_rate": 0.001, + "loss": 0.3803, + "step": 9174 + }, + { + "epoch": 0.253158444765587, + "grad_norm": 0.0033355674240738153, + "learning_rate": 0.001, + "loss": 0.4257, + "step": 9175 + }, + { + "epoch": 0.2531860369666514, + "grad_norm": 0.0023434923496097326, + "learning_rate": 0.001, + "loss": 0.4126, + "step": 9176 + }, + { + "epoch": 0.2532136291677157, + "grad_norm": 0.0037734145298600197, + "learning_rate": 0.001, + "loss": 0.3675, + "step": 9177 + }, + { + "epoch": 0.25324122136878013, + "grad_norm": 0.0024306117556989193, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 9178 + }, + { + "epoch": 0.2532688135698445, + "grad_norm": 0.002472655149176717, + "learning_rate": 0.001, + "loss": 0.4281, + "step": 9179 + }, + { + "epoch": 0.25329640577090884, + "grad_norm": 0.0033675595186650753, + "learning_rate": 0.001, + "loss": 0.3934, + "step": 9180 + }, + { + "epoch": 0.25332399797197325, + "grad_norm": 0.00425712438300252, + "learning_rate": 0.001, + "loss": 0.396, + "step": 9181 + }, + { + "epoch": 0.2533515901730376, + "grad_norm": 0.002620971528813243, + "learning_rate": 0.001, + "loss": 0.4149, + "step": 9182 + }, + { + "epoch": 0.25337918237410195, + "grad_norm": 0.002496513072401285, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 9183 + }, + { + "epoch": 0.2534067745751663, + "grad_norm": 0.0037882968317717314, + "learning_rate": 0.001, + "loss": 0.3728, + "step": 9184 + }, + { + "epoch": 0.2534343667762307, + "grad_norm": 0.0028284622821956873, + "learning_rate": 0.001, + "loss": 0.3959, + "step": 9185 + }, + { + "epoch": 0.25346195897729507, + "grad_norm": 0.0031743019353598356, + "learning_rate": 0.001, + "loss": 0.4024, + "step": 9186 + }, + { + "epoch": 0.2534895511783594, + "grad_norm": 0.003066749544814229, + "learning_rate": 0.001, + "loss": 0.371, + "step": 9187 + }, + { + "epoch": 0.2535171433794238, + "grad_norm": 0.002977808238938451, + "learning_rate": 0.001, + "loss": 0.4166, + "step": 9188 + }, + { + "epoch": 0.2535447355804882, + "grad_norm": 0.0039002352859824896, + "learning_rate": 0.001, + "loss": 0.3788, + "step": 9189 + }, + { + "epoch": 0.25357232778155253, + "grad_norm": 0.003922617062926292, + "learning_rate": 0.001, + "loss": 0.4162, + "step": 9190 + }, + { + "epoch": 0.25359991998261694, + "grad_norm": 0.004719828721135855, + "learning_rate": 0.001, + "loss": 0.3872, + "step": 9191 + }, + { + "epoch": 0.2536275121836813, + "grad_norm": 0.0065970043651759624, + "learning_rate": 0.001, + "loss": 0.4595, + "step": 9192 + }, + { + "epoch": 0.25365510438474564, + "grad_norm": 0.002771706786006689, + "learning_rate": 0.001, + "loss": 0.4079, + "step": 9193 + }, + { + "epoch": 0.25368269658581, + "grad_norm": 0.002968127839267254, + "learning_rate": 0.001, + "loss": 0.4369, + "step": 9194 + }, + { + "epoch": 0.2537102887868744, + "grad_norm": 0.002558299573138356, + "learning_rate": 0.001, + "loss": 0.4117, + "step": 9195 + }, + { + "epoch": 0.25373788098793876, + "grad_norm": 0.003306181402876973, + "learning_rate": 0.001, + "loss": 0.3655, + "step": 9196 + }, + { + "epoch": 0.2537654731890031, + "grad_norm": 0.005295965354889631, + "learning_rate": 0.001, + "loss": 0.4163, + "step": 9197 + }, + { + "epoch": 0.2537930653900675, + "grad_norm": 0.00258263130672276, + "learning_rate": 0.001, + "loss": 0.4246, + "step": 9198 + }, + { + "epoch": 0.25382065759113187, + "grad_norm": 0.003539256053045392, + "learning_rate": 0.001, + "loss": 0.427, + "step": 9199 + }, + { + "epoch": 0.2538482497921962, + "grad_norm": 0.004078635014593601, + "learning_rate": 0.001, + "loss": 0.417, + "step": 9200 + }, + { + "epoch": 0.25387584199326063, + "grad_norm": 0.003579481039196253, + "learning_rate": 0.001, + "loss": 0.3628, + "step": 9201 + }, + { + "epoch": 0.253903434194325, + "grad_norm": 0.011261108331382275, + "learning_rate": 0.001, + "loss": 0.3793, + "step": 9202 + }, + { + "epoch": 0.25393102639538934, + "grad_norm": 0.005823037121444941, + "learning_rate": 0.001, + "loss": 0.3973, + "step": 9203 + }, + { + "epoch": 0.2539586185964537, + "grad_norm": 0.0032197630498558283, + "learning_rate": 0.001, + "loss": 0.4038, + "step": 9204 + }, + { + "epoch": 0.2539862107975181, + "grad_norm": 0.007648573722690344, + "learning_rate": 0.001, + "loss": 0.4349, + "step": 9205 + }, + { + "epoch": 0.25401380299858245, + "grad_norm": 0.0039948225021362305, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 9206 + }, + { + "epoch": 0.2540413951996468, + "grad_norm": 0.008907606825232506, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 9207 + }, + { + "epoch": 0.2540689874007112, + "grad_norm": 0.0032174009829759598, + "learning_rate": 0.001, + "loss": 0.3872, + "step": 9208 + }, + { + "epoch": 0.25409657960177556, + "grad_norm": 0.0036075576208531857, + "learning_rate": 0.001, + "loss": 0.4109, + "step": 9209 + }, + { + "epoch": 0.2541241718028399, + "grad_norm": 0.002537550637498498, + "learning_rate": 0.001, + "loss": 0.3815, + "step": 9210 + }, + { + "epoch": 0.2541517640039043, + "grad_norm": 0.004352409392595291, + "learning_rate": 0.001, + "loss": 0.394, + "step": 9211 + }, + { + "epoch": 0.2541793562049687, + "grad_norm": 0.004635980818420649, + "learning_rate": 0.001, + "loss": 0.3785, + "step": 9212 + }, + { + "epoch": 0.25420694840603303, + "grad_norm": 0.0028103957884013653, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 9213 + }, + { + "epoch": 0.2542345406070974, + "grad_norm": 0.0023787037935107946, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 9214 + }, + { + "epoch": 0.2542621328081618, + "grad_norm": 0.002404349623247981, + "learning_rate": 0.001, + "loss": 0.4314, + "step": 9215 + }, + { + "epoch": 0.25428972500922614, + "grad_norm": 0.0025460943579673767, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 9216 + }, + { + "epoch": 0.2543173172102905, + "grad_norm": 0.003394089639186859, + "learning_rate": 0.001, + "loss": 0.3982, + "step": 9217 + }, + { + "epoch": 0.2543449094113549, + "grad_norm": 0.003400506917387247, + "learning_rate": 0.001, + "loss": 0.386, + "step": 9218 + }, + { + "epoch": 0.25437250161241926, + "grad_norm": 0.007282086182385683, + "learning_rate": 0.001, + "loss": 0.394, + "step": 9219 + }, + { + "epoch": 0.2544000938134836, + "grad_norm": 0.0025983904488384724, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 9220 + }, + { + "epoch": 0.254427686014548, + "grad_norm": 0.0024682951625436544, + "learning_rate": 0.001, + "loss": 0.4116, + "step": 9221 + }, + { + "epoch": 0.25445527821561237, + "grad_norm": 0.002498073736205697, + "learning_rate": 0.001, + "loss": 0.381, + "step": 9222 + }, + { + "epoch": 0.2544828704166767, + "grad_norm": 0.004814106039702892, + "learning_rate": 0.001, + "loss": 0.4072, + "step": 9223 + }, + { + "epoch": 0.2545104626177411, + "grad_norm": 0.004485031124204397, + "learning_rate": 0.001, + "loss": 0.4057, + "step": 9224 + }, + { + "epoch": 0.2545380548188055, + "grad_norm": 0.002533281221985817, + "learning_rate": 0.001, + "loss": 0.4129, + "step": 9225 + }, + { + "epoch": 0.25456564701986983, + "grad_norm": 0.0037844269536435604, + "learning_rate": 0.001, + "loss": 0.3937, + "step": 9226 + }, + { + "epoch": 0.2545932392209342, + "grad_norm": 0.0033858432434499264, + "learning_rate": 0.001, + "loss": 0.3821, + "step": 9227 + }, + { + "epoch": 0.2546208314219986, + "grad_norm": 0.006454948335886002, + "learning_rate": 0.001, + "loss": 0.3779, + "step": 9228 + }, + { + "epoch": 0.25464842362306295, + "grad_norm": 0.0032489451114088297, + "learning_rate": 0.001, + "loss": 0.4399, + "step": 9229 + }, + { + "epoch": 0.2546760158241273, + "grad_norm": 0.0027783990371972322, + "learning_rate": 0.001, + "loss": 0.4115, + "step": 9230 + }, + { + "epoch": 0.25470360802519165, + "grad_norm": 0.0032559032551944256, + "learning_rate": 0.001, + "loss": 0.4134, + "step": 9231 + }, + { + "epoch": 0.25473120022625606, + "grad_norm": 0.006878309417515993, + "learning_rate": 0.001, + "loss": 0.4187, + "step": 9232 + }, + { + "epoch": 0.2547587924273204, + "grad_norm": 0.003954887855798006, + "learning_rate": 0.001, + "loss": 0.4045, + "step": 9233 + }, + { + "epoch": 0.25478638462838477, + "grad_norm": 0.0032106926664710045, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 9234 + }, + { + "epoch": 0.2548139768294492, + "grad_norm": 0.002547104610130191, + "learning_rate": 0.001, + "loss": 0.4171, + "step": 9235 + }, + { + "epoch": 0.2548415690305135, + "grad_norm": 0.00274961581453681, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 9236 + }, + { + "epoch": 0.2548691612315779, + "grad_norm": 0.002504730597138405, + "learning_rate": 0.001, + "loss": 0.3737, + "step": 9237 + }, + { + "epoch": 0.2548967534326423, + "grad_norm": 0.00348614901304245, + "learning_rate": 0.001, + "loss": 0.3953, + "step": 9238 + }, + { + "epoch": 0.25492434563370664, + "grad_norm": 0.0030160502064973116, + "learning_rate": 0.001, + "loss": 0.4211, + "step": 9239 + }, + { + "epoch": 0.254951937834771, + "grad_norm": 0.0032628930639475584, + "learning_rate": 0.001, + "loss": 0.4242, + "step": 9240 + }, + { + "epoch": 0.25497953003583534, + "grad_norm": 0.0022108100820332766, + "learning_rate": 0.001, + "loss": 0.4129, + "step": 9241 + }, + { + "epoch": 0.25500712223689975, + "grad_norm": 0.003723006695508957, + "learning_rate": 0.001, + "loss": 0.4371, + "step": 9242 + }, + { + "epoch": 0.2550347144379641, + "grad_norm": 0.00470739183947444, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 9243 + }, + { + "epoch": 0.25506230663902846, + "grad_norm": 0.00285819242708385, + "learning_rate": 0.001, + "loss": 0.4128, + "step": 9244 + }, + { + "epoch": 0.25508989884009287, + "grad_norm": 0.0027255092281848192, + "learning_rate": 0.001, + "loss": 0.4063, + "step": 9245 + }, + { + "epoch": 0.2551174910411572, + "grad_norm": 0.0023286372888833284, + "learning_rate": 0.001, + "loss": 0.3999, + "step": 9246 + }, + { + "epoch": 0.25514508324222157, + "grad_norm": 0.006774048320949078, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 9247 + }, + { + "epoch": 0.255172675443286, + "grad_norm": 0.003444313770160079, + "learning_rate": 0.001, + "loss": 0.3923, + "step": 9248 + }, + { + "epoch": 0.25520026764435033, + "grad_norm": 0.003187762573361397, + "learning_rate": 0.001, + "loss": 0.4253, + "step": 9249 + }, + { + "epoch": 0.2552278598454147, + "grad_norm": 0.00499644735828042, + "learning_rate": 0.001, + "loss": 0.4378, + "step": 9250 + }, + { + "epoch": 0.25525545204647904, + "grad_norm": 0.00488440552726388, + "learning_rate": 0.001, + "loss": 0.4275, + "step": 9251 + }, + { + "epoch": 0.25528304424754344, + "grad_norm": 0.0026155610103160143, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 9252 + }, + { + "epoch": 0.2553106364486078, + "grad_norm": 0.0035269884392619133, + "learning_rate": 0.001, + "loss": 0.406, + "step": 9253 + }, + { + "epoch": 0.25533822864967215, + "grad_norm": 0.005496086087077856, + "learning_rate": 0.001, + "loss": 0.3742, + "step": 9254 + }, + { + "epoch": 0.25536582085073656, + "grad_norm": 0.003939601127058268, + "learning_rate": 0.001, + "loss": 0.4224, + "step": 9255 + }, + { + "epoch": 0.2553934130518009, + "grad_norm": 0.0052981991320848465, + "learning_rate": 0.001, + "loss": 0.3673, + "step": 9256 + }, + { + "epoch": 0.25542100525286526, + "grad_norm": 0.0025523051153868437, + "learning_rate": 0.001, + "loss": 0.4095, + "step": 9257 + }, + { + "epoch": 0.25544859745392967, + "grad_norm": 0.004392458591610193, + "learning_rate": 0.001, + "loss": 0.4057, + "step": 9258 + }, + { + "epoch": 0.255476189654994, + "grad_norm": 0.005645066034048796, + "learning_rate": 0.001, + "loss": 0.3755, + "step": 9259 + }, + { + "epoch": 0.2555037818560584, + "grad_norm": 0.00333485659211874, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 9260 + }, + { + "epoch": 0.25553137405712273, + "grad_norm": 0.003476017387583852, + "learning_rate": 0.001, + "loss": 0.3854, + "step": 9261 + }, + { + "epoch": 0.25555896625818714, + "grad_norm": 0.007261955179274082, + "learning_rate": 0.001, + "loss": 0.4203, + "step": 9262 + }, + { + "epoch": 0.2555865584592515, + "grad_norm": 0.004596728831529617, + "learning_rate": 0.001, + "loss": 0.3866, + "step": 9263 + }, + { + "epoch": 0.25561415066031584, + "grad_norm": 0.0033560562878847122, + "learning_rate": 0.001, + "loss": 0.3764, + "step": 9264 + }, + { + "epoch": 0.25564174286138025, + "grad_norm": 0.003836827352643013, + "learning_rate": 0.001, + "loss": 0.3753, + "step": 9265 + }, + { + "epoch": 0.2556693350624446, + "grad_norm": 0.0031401992309838533, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 9266 + }, + { + "epoch": 0.25569692726350896, + "grad_norm": 0.003163114422932267, + "learning_rate": 0.001, + "loss": 0.3708, + "step": 9267 + }, + { + "epoch": 0.25572451946457336, + "grad_norm": 0.002614476252347231, + "learning_rate": 0.001, + "loss": 0.4264, + "step": 9268 + }, + { + "epoch": 0.2557521116656377, + "grad_norm": 0.0029321182519197464, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 9269 + }, + { + "epoch": 0.25577970386670207, + "grad_norm": 0.0028678993694484234, + "learning_rate": 0.001, + "loss": 0.3902, + "step": 9270 + }, + { + "epoch": 0.2558072960677664, + "grad_norm": 0.0034806549083441496, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 9271 + }, + { + "epoch": 0.25583488826883083, + "grad_norm": 0.0030649881809949875, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 9272 + }, + { + "epoch": 0.2558624804698952, + "grad_norm": 0.004192110616713762, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 9273 + }, + { + "epoch": 0.25589007267095953, + "grad_norm": 0.0024996623396873474, + "learning_rate": 0.001, + "loss": 0.3587, + "step": 9274 + }, + { + "epoch": 0.25591766487202394, + "grad_norm": 0.002356313867494464, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 9275 + }, + { + "epoch": 0.2559452570730883, + "grad_norm": 0.0032441243529319763, + "learning_rate": 0.001, + "loss": 0.4102, + "step": 9276 + }, + { + "epoch": 0.25597284927415265, + "grad_norm": 0.003050130559131503, + "learning_rate": 0.001, + "loss": 0.3998, + "step": 9277 + }, + { + "epoch": 0.25600044147521706, + "grad_norm": 0.0028984618838876486, + "learning_rate": 0.001, + "loss": 0.4252, + "step": 9278 + }, + { + "epoch": 0.2560280336762814, + "grad_norm": 0.0024825683794915676, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 9279 + }, + { + "epoch": 0.25605562587734576, + "grad_norm": 0.007617509458214045, + "learning_rate": 0.001, + "loss": 0.3855, + "step": 9280 + }, + { + "epoch": 0.2560832180784101, + "grad_norm": 0.002734134206548333, + "learning_rate": 0.001, + "loss": 0.411, + "step": 9281 + }, + { + "epoch": 0.2561108102794745, + "grad_norm": 0.0032497511710971594, + "learning_rate": 0.001, + "loss": 0.3858, + "step": 9282 + }, + { + "epoch": 0.2561384024805389, + "grad_norm": 0.0038412590511143208, + "learning_rate": 0.001, + "loss": 0.4383, + "step": 9283 + }, + { + "epoch": 0.2561659946816032, + "grad_norm": 0.002717244438827038, + "learning_rate": 0.001, + "loss": 0.3815, + "step": 9284 + }, + { + "epoch": 0.25619358688266763, + "grad_norm": 0.002895118435844779, + "learning_rate": 0.001, + "loss": 0.4379, + "step": 9285 + }, + { + "epoch": 0.256221179083732, + "grad_norm": 0.0027081493753939867, + "learning_rate": 0.001, + "loss": 0.3759, + "step": 9286 + }, + { + "epoch": 0.25624877128479634, + "grad_norm": 0.002953689079731703, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 9287 + }, + { + "epoch": 0.25627636348586075, + "grad_norm": 0.0018930652877315879, + "learning_rate": 0.001, + "loss": 0.4461, + "step": 9288 + }, + { + "epoch": 0.2563039556869251, + "grad_norm": 0.0027619446627795696, + "learning_rate": 0.001, + "loss": 0.393, + "step": 9289 + }, + { + "epoch": 0.25633154788798945, + "grad_norm": 0.0028445671778172255, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 9290 + }, + { + "epoch": 0.2563591400890538, + "grad_norm": 0.004813406616449356, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 9291 + }, + { + "epoch": 0.2563867322901182, + "grad_norm": 0.0027178998570889235, + "learning_rate": 0.001, + "loss": 0.3842, + "step": 9292 + }, + { + "epoch": 0.25641432449118257, + "grad_norm": 0.0027016105595976114, + "learning_rate": 0.001, + "loss": 0.409, + "step": 9293 + }, + { + "epoch": 0.2564419166922469, + "grad_norm": 0.0038917434867471457, + "learning_rate": 0.001, + "loss": 0.391, + "step": 9294 + }, + { + "epoch": 0.2564695088933113, + "grad_norm": 0.004160382319241762, + "learning_rate": 0.001, + "loss": 0.3978, + "step": 9295 + }, + { + "epoch": 0.2564971010943757, + "grad_norm": 0.004111155401915312, + "learning_rate": 0.001, + "loss": 0.3929, + "step": 9296 + }, + { + "epoch": 0.25652469329544003, + "grad_norm": 0.00411924347281456, + "learning_rate": 0.001, + "loss": 0.3913, + "step": 9297 + }, + { + "epoch": 0.25655228549650444, + "grad_norm": 0.0026549468748271465, + "learning_rate": 0.001, + "loss": 0.4281, + "step": 9298 + }, + { + "epoch": 0.2565798776975688, + "grad_norm": 0.004998568445444107, + "learning_rate": 0.001, + "loss": 0.3905, + "step": 9299 + }, + { + "epoch": 0.25660746989863314, + "grad_norm": 0.002522986149415374, + "learning_rate": 0.001, + "loss": 0.4141, + "step": 9300 + }, + { + "epoch": 0.2566350620996975, + "grad_norm": 0.0028745641466230154, + "learning_rate": 0.001, + "loss": 0.3847, + "step": 9301 + }, + { + "epoch": 0.2566626543007619, + "grad_norm": 0.0056059998460114, + "learning_rate": 0.001, + "loss": 0.4198, + "step": 9302 + }, + { + "epoch": 0.25669024650182626, + "grad_norm": 0.002385946689173579, + "learning_rate": 0.001, + "loss": 0.4059, + "step": 9303 + }, + { + "epoch": 0.2567178387028906, + "grad_norm": 0.0057910652831196785, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 9304 + }, + { + "epoch": 0.256745430903955, + "grad_norm": 0.003377356566488743, + "learning_rate": 0.001, + "loss": 0.4097, + "step": 9305 + }, + { + "epoch": 0.25677302310501937, + "grad_norm": 0.002434053458273411, + "learning_rate": 0.001, + "loss": 0.4205, + "step": 9306 + }, + { + "epoch": 0.2568006153060837, + "grad_norm": 0.0029004281386733055, + "learning_rate": 0.001, + "loss": 0.3941, + "step": 9307 + }, + { + "epoch": 0.25682820750714813, + "grad_norm": 0.0032230631913989782, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 9308 + }, + { + "epoch": 0.2568557997082125, + "grad_norm": 0.00245184195227921, + "learning_rate": 0.001, + "loss": 0.4297, + "step": 9309 + }, + { + "epoch": 0.25688339190927684, + "grad_norm": 0.0030043041333556175, + "learning_rate": 0.001, + "loss": 0.3851, + "step": 9310 + }, + { + "epoch": 0.2569109841103412, + "grad_norm": 0.007739334367215633, + "learning_rate": 0.001, + "loss": 0.3723, + "step": 9311 + }, + { + "epoch": 0.2569385763114056, + "grad_norm": 0.002734127687290311, + "learning_rate": 0.001, + "loss": 0.4155, + "step": 9312 + }, + { + "epoch": 0.25696616851246995, + "grad_norm": 0.004879895132035017, + "learning_rate": 0.001, + "loss": 0.3962, + "step": 9313 + }, + { + "epoch": 0.2569937607135343, + "grad_norm": 0.002906453562900424, + "learning_rate": 0.001, + "loss": 0.4332, + "step": 9314 + }, + { + "epoch": 0.2570213529145987, + "grad_norm": 0.004463196266442537, + "learning_rate": 0.001, + "loss": 0.3455, + "step": 9315 + }, + { + "epoch": 0.25704894511566306, + "grad_norm": 0.004677020013332367, + "learning_rate": 0.001, + "loss": 0.3984, + "step": 9316 + }, + { + "epoch": 0.2570765373167274, + "grad_norm": 0.006812858395278454, + "learning_rate": 0.001, + "loss": 0.415, + "step": 9317 + }, + { + "epoch": 0.25710412951779177, + "grad_norm": 0.0026047630235552788, + "learning_rate": 0.001, + "loss": 0.4258, + "step": 9318 + }, + { + "epoch": 0.2571317217188562, + "grad_norm": 0.004368063993752003, + "learning_rate": 0.001, + "loss": 0.3617, + "step": 9319 + }, + { + "epoch": 0.25715931391992053, + "grad_norm": 0.0024550901725888252, + "learning_rate": 0.001, + "loss": 0.4326, + "step": 9320 + }, + { + "epoch": 0.2571869061209849, + "grad_norm": 0.003168846946209669, + "learning_rate": 0.001, + "loss": 0.4061, + "step": 9321 + }, + { + "epoch": 0.2572144983220493, + "grad_norm": 0.004993709735572338, + "learning_rate": 0.001, + "loss": 0.3897, + "step": 9322 + }, + { + "epoch": 0.25724209052311364, + "grad_norm": 0.0027966087218374014, + "learning_rate": 0.001, + "loss": 0.3959, + "step": 9323 + }, + { + "epoch": 0.257269682724178, + "grad_norm": 0.0036490256898105145, + "learning_rate": 0.001, + "loss": 0.3662, + "step": 9324 + }, + { + "epoch": 0.2572972749252424, + "grad_norm": 0.006444889586418867, + "learning_rate": 0.001, + "loss": 0.3677, + "step": 9325 + }, + { + "epoch": 0.25732486712630676, + "grad_norm": 0.005108532030135393, + "learning_rate": 0.001, + "loss": 0.4314, + "step": 9326 + }, + { + "epoch": 0.2573524593273711, + "grad_norm": 0.004126682877540588, + "learning_rate": 0.001, + "loss": 0.3793, + "step": 9327 + }, + { + "epoch": 0.25738005152843546, + "grad_norm": 0.002818217733874917, + "learning_rate": 0.001, + "loss": 0.4286, + "step": 9328 + }, + { + "epoch": 0.25740764372949987, + "grad_norm": 0.0042424676939845085, + "learning_rate": 0.001, + "loss": 0.4339, + "step": 9329 + }, + { + "epoch": 0.2574352359305642, + "grad_norm": 0.003638759721070528, + "learning_rate": 0.001, + "loss": 0.4323, + "step": 9330 + }, + { + "epoch": 0.2574628281316286, + "grad_norm": 0.003619089489802718, + "learning_rate": 0.001, + "loss": 0.366, + "step": 9331 + }, + { + "epoch": 0.257490420332693, + "grad_norm": 0.0029553293716162443, + "learning_rate": 0.001, + "loss": 0.3955, + "step": 9332 + }, + { + "epoch": 0.25751801253375733, + "grad_norm": 0.003728683805093169, + "learning_rate": 0.001, + "loss": 0.4057, + "step": 9333 + }, + { + "epoch": 0.2575456047348217, + "grad_norm": 0.006002207286655903, + "learning_rate": 0.001, + "loss": 0.3933, + "step": 9334 + }, + { + "epoch": 0.2575731969358861, + "grad_norm": 0.005744462367147207, + "learning_rate": 0.001, + "loss": 0.3832, + "step": 9335 + }, + { + "epoch": 0.25760078913695045, + "grad_norm": 0.006803566124290228, + "learning_rate": 0.001, + "loss": 0.4151, + "step": 9336 + }, + { + "epoch": 0.2576283813380148, + "grad_norm": 0.0028065780643373728, + "learning_rate": 0.001, + "loss": 0.4086, + "step": 9337 + }, + { + "epoch": 0.25765597353907915, + "grad_norm": 0.003158635227009654, + "learning_rate": 0.001, + "loss": 0.4321, + "step": 9338 + }, + { + "epoch": 0.25768356574014356, + "grad_norm": 0.0043748971074819565, + "learning_rate": 0.001, + "loss": 0.3857, + "step": 9339 + }, + { + "epoch": 0.2577111579412079, + "grad_norm": 0.0038016666658222675, + "learning_rate": 0.001, + "loss": 0.4322, + "step": 9340 + }, + { + "epoch": 0.25773875014227227, + "grad_norm": 0.005203484557569027, + "learning_rate": 0.001, + "loss": 0.3926, + "step": 9341 + }, + { + "epoch": 0.2577663423433367, + "grad_norm": 0.0031318182591348886, + "learning_rate": 0.001, + "loss": 0.3828, + "step": 9342 + }, + { + "epoch": 0.257793934544401, + "grad_norm": 0.0021927165798842907, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 9343 + }, + { + "epoch": 0.2578215267454654, + "grad_norm": 0.008451412431895733, + "learning_rate": 0.001, + "loss": 0.3882, + "step": 9344 + }, + { + "epoch": 0.2578491189465298, + "grad_norm": 0.012602372094988823, + "learning_rate": 0.001, + "loss": 0.3981, + "step": 9345 + }, + { + "epoch": 0.25787671114759414, + "grad_norm": 0.004658313933759928, + "learning_rate": 0.001, + "loss": 0.3969, + "step": 9346 + }, + { + "epoch": 0.2579043033486585, + "grad_norm": 0.029415573924779892, + "learning_rate": 0.001, + "loss": 0.3801, + "step": 9347 + }, + { + "epoch": 0.25793189554972284, + "grad_norm": 0.005347141530364752, + "learning_rate": 0.001, + "loss": 0.395, + "step": 9348 + }, + { + "epoch": 0.25795948775078725, + "grad_norm": 0.008220274932682514, + "learning_rate": 0.001, + "loss": 0.3721, + "step": 9349 + }, + { + "epoch": 0.2579870799518516, + "grad_norm": 0.0034864528570324183, + "learning_rate": 0.001, + "loss": 0.396, + "step": 9350 + }, + { + "epoch": 0.25801467215291596, + "grad_norm": 0.0023793810978531837, + "learning_rate": 0.001, + "loss": 0.4201, + "step": 9351 + }, + { + "epoch": 0.25804226435398037, + "grad_norm": 0.00437847338616848, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 9352 + }, + { + "epoch": 0.2580698565550447, + "grad_norm": 0.011434728279709816, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 9353 + }, + { + "epoch": 0.25809744875610907, + "grad_norm": 0.002974115777760744, + "learning_rate": 0.001, + "loss": 0.3866, + "step": 9354 + }, + { + "epoch": 0.2581250409571735, + "grad_norm": 0.0029822078067809343, + "learning_rate": 0.001, + "loss": 0.3945, + "step": 9355 + }, + { + "epoch": 0.25815263315823783, + "grad_norm": 0.004817003384232521, + "learning_rate": 0.001, + "loss": 0.413, + "step": 9356 + }, + { + "epoch": 0.2581802253593022, + "grad_norm": 0.005547203589230776, + "learning_rate": 0.001, + "loss": 0.3835, + "step": 9357 + }, + { + "epoch": 0.25820781756036654, + "grad_norm": 0.002377490047365427, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 9358 + }, + { + "epoch": 0.25823540976143095, + "grad_norm": 0.0028770265635102987, + "learning_rate": 0.001, + "loss": 0.376, + "step": 9359 + }, + { + "epoch": 0.2582630019624953, + "grad_norm": 0.003406745847314596, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 9360 + }, + { + "epoch": 0.25829059416355965, + "grad_norm": 0.002956562442705035, + "learning_rate": 0.001, + "loss": 0.3923, + "step": 9361 + }, + { + "epoch": 0.25831818636462406, + "grad_norm": 0.0038173554930835962, + "learning_rate": 0.001, + "loss": 0.4468, + "step": 9362 + }, + { + "epoch": 0.2583457785656884, + "grad_norm": 0.003059924114495516, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 9363 + }, + { + "epoch": 0.25837337076675276, + "grad_norm": 0.002305653179064393, + "learning_rate": 0.001, + "loss": 0.4215, + "step": 9364 + }, + { + "epoch": 0.25840096296781717, + "grad_norm": 0.00254283519461751, + "learning_rate": 0.001, + "loss": 0.4171, + "step": 9365 + }, + { + "epoch": 0.2584285551688815, + "grad_norm": 0.00530956732109189, + "learning_rate": 0.001, + "loss": 0.3919, + "step": 9366 + }, + { + "epoch": 0.2584561473699459, + "grad_norm": 0.0036723476368933916, + "learning_rate": 0.001, + "loss": 0.3638, + "step": 9367 + }, + { + "epoch": 0.25848373957101023, + "grad_norm": 0.008182425051927567, + "learning_rate": 0.001, + "loss": 0.4096, + "step": 9368 + }, + { + "epoch": 0.25851133177207464, + "grad_norm": 0.01312658004462719, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 9369 + }, + { + "epoch": 0.258538923973139, + "grad_norm": 0.0029170040506869555, + "learning_rate": 0.001, + "loss": 0.3817, + "step": 9370 + }, + { + "epoch": 0.25856651617420334, + "grad_norm": 0.00417022779583931, + "learning_rate": 0.001, + "loss": 0.3712, + "step": 9371 + }, + { + "epoch": 0.25859410837526775, + "grad_norm": 0.007455759681761265, + "learning_rate": 0.001, + "loss": 0.377, + "step": 9372 + }, + { + "epoch": 0.2586217005763321, + "grad_norm": 0.0027683484368026257, + "learning_rate": 0.001, + "loss": 0.4119, + "step": 9373 + }, + { + "epoch": 0.25864929277739646, + "grad_norm": 0.005918456707149744, + "learning_rate": 0.001, + "loss": 0.3668, + "step": 9374 + }, + { + "epoch": 0.25867688497846086, + "grad_norm": 0.002821930916979909, + "learning_rate": 0.001, + "loss": 0.4186, + "step": 9375 + }, + { + "epoch": 0.2587044771795252, + "grad_norm": 0.003820334793999791, + "learning_rate": 0.001, + "loss": 0.4133, + "step": 9376 + }, + { + "epoch": 0.25873206938058957, + "grad_norm": 0.003256513038650155, + "learning_rate": 0.001, + "loss": 0.3759, + "step": 9377 + }, + { + "epoch": 0.2587596615816539, + "grad_norm": 0.010328397154808044, + "learning_rate": 0.001, + "loss": 0.3832, + "step": 9378 + }, + { + "epoch": 0.25878725378271833, + "grad_norm": 0.002478259615600109, + "learning_rate": 0.001, + "loss": 0.4065, + "step": 9379 + }, + { + "epoch": 0.2588148459837827, + "grad_norm": 0.0034857571590691805, + "learning_rate": 0.001, + "loss": 0.3946, + "step": 9380 + }, + { + "epoch": 0.25884243818484703, + "grad_norm": 0.00216446490958333, + "learning_rate": 0.001, + "loss": 0.4009, + "step": 9381 + }, + { + "epoch": 0.25887003038591144, + "grad_norm": 0.0028666919097304344, + "learning_rate": 0.001, + "loss": 0.3897, + "step": 9382 + }, + { + "epoch": 0.2588976225869758, + "grad_norm": 0.002316189929842949, + "learning_rate": 0.001, + "loss": 0.3896, + "step": 9383 + }, + { + "epoch": 0.25892521478804015, + "grad_norm": 0.0033454406075179577, + "learning_rate": 0.001, + "loss": 0.3701, + "step": 9384 + }, + { + "epoch": 0.25895280698910456, + "grad_norm": 0.0029785428196191788, + "learning_rate": 0.001, + "loss": 0.379, + "step": 9385 + }, + { + "epoch": 0.2589803991901689, + "grad_norm": 0.0043611531145870686, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 9386 + }, + { + "epoch": 0.25900799139123326, + "grad_norm": 0.004450157284736633, + "learning_rate": 0.001, + "loss": 0.4026, + "step": 9387 + }, + { + "epoch": 0.2590355835922976, + "grad_norm": 0.003939083311706781, + "learning_rate": 0.001, + "loss": 0.3898, + "step": 9388 + }, + { + "epoch": 0.259063175793362, + "grad_norm": 0.0035476302728056908, + "learning_rate": 0.001, + "loss": 0.3751, + "step": 9389 + }, + { + "epoch": 0.2590907679944264, + "grad_norm": 0.005521733313798904, + "learning_rate": 0.001, + "loss": 0.4249, + "step": 9390 + }, + { + "epoch": 0.2591183601954907, + "grad_norm": 0.0055731479078531265, + "learning_rate": 0.001, + "loss": 0.3996, + "step": 9391 + }, + { + "epoch": 0.25914595239655513, + "grad_norm": 0.003587324172258377, + "learning_rate": 0.001, + "loss": 0.374, + "step": 9392 + }, + { + "epoch": 0.2591735445976195, + "grad_norm": 0.018497616052627563, + "learning_rate": 0.001, + "loss": 0.3912, + "step": 9393 + }, + { + "epoch": 0.25920113679868384, + "grad_norm": 0.005223001353442669, + "learning_rate": 0.001, + "loss": 0.4013, + "step": 9394 + }, + { + "epoch": 0.25922872899974825, + "grad_norm": 0.0024736267514526844, + "learning_rate": 0.001, + "loss": 0.3715, + "step": 9395 + }, + { + "epoch": 0.2592563212008126, + "grad_norm": 0.0023903397377580404, + "learning_rate": 0.001, + "loss": 0.4097, + "step": 9396 + }, + { + "epoch": 0.25928391340187695, + "grad_norm": 0.0038790139369666576, + "learning_rate": 0.001, + "loss": 0.3862, + "step": 9397 + }, + { + "epoch": 0.2593115056029413, + "grad_norm": 0.0027185981161892414, + "learning_rate": 0.001, + "loss": 0.3727, + "step": 9398 + }, + { + "epoch": 0.2593390978040057, + "grad_norm": 0.007733003702014685, + "learning_rate": 0.001, + "loss": 0.368, + "step": 9399 + }, + { + "epoch": 0.25936669000507007, + "grad_norm": 0.003809612710028887, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 9400 + }, + { + "epoch": 0.2593942822061344, + "grad_norm": 0.004529821220785379, + "learning_rate": 0.001, + "loss": 0.4428, + "step": 9401 + }, + { + "epoch": 0.2594218744071988, + "grad_norm": 0.0025704330764710903, + "learning_rate": 0.001, + "loss": 0.4154, + "step": 9402 + }, + { + "epoch": 0.2594494666082632, + "grad_norm": 0.0027423694264143705, + "learning_rate": 0.001, + "loss": 0.4307, + "step": 9403 + }, + { + "epoch": 0.25947705880932753, + "grad_norm": 0.0036277722101658583, + "learning_rate": 0.001, + "loss": 0.3904, + "step": 9404 + }, + { + "epoch": 0.25950465101039194, + "grad_norm": 0.003103514201939106, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 9405 + }, + { + "epoch": 0.2595322432114563, + "grad_norm": 0.005228511989116669, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 9406 + }, + { + "epoch": 0.25955983541252065, + "grad_norm": 0.0027565350756049156, + "learning_rate": 0.001, + "loss": 0.4142, + "step": 9407 + }, + { + "epoch": 0.259587427613585, + "grad_norm": 0.005467361304908991, + "learning_rate": 0.001, + "loss": 0.38, + "step": 9408 + }, + { + "epoch": 0.2596150198146494, + "grad_norm": 0.002642360283061862, + "learning_rate": 0.001, + "loss": 0.3789, + "step": 9409 + }, + { + "epoch": 0.25964261201571376, + "grad_norm": 0.00389938335865736, + "learning_rate": 0.001, + "loss": 0.416, + "step": 9410 + }, + { + "epoch": 0.2596702042167781, + "grad_norm": 0.0047807167284190655, + "learning_rate": 0.001, + "loss": 0.41, + "step": 9411 + }, + { + "epoch": 0.2596977964178425, + "grad_norm": 0.017033278942108154, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 9412 + }, + { + "epoch": 0.25972538861890687, + "grad_norm": 0.014154274947941303, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 9413 + }, + { + "epoch": 0.2597529808199712, + "grad_norm": 0.009238173253834248, + "learning_rate": 0.001, + "loss": 0.3483, + "step": 9414 + }, + { + "epoch": 0.2597805730210356, + "grad_norm": 0.008399531245231628, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 9415 + }, + { + "epoch": 0.2598081652221, + "grad_norm": 0.002651994815096259, + "learning_rate": 0.001, + "loss": 0.389, + "step": 9416 + }, + { + "epoch": 0.25983575742316434, + "grad_norm": 0.003266576211899519, + "learning_rate": 0.001, + "loss": 0.3792, + "step": 9417 + }, + { + "epoch": 0.2598633496242287, + "grad_norm": 0.0043223886750638485, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 9418 + }, + { + "epoch": 0.2598909418252931, + "grad_norm": 0.004282352980226278, + "learning_rate": 0.001, + "loss": 0.382, + "step": 9419 + }, + { + "epoch": 0.25991853402635745, + "grad_norm": 0.006613946054130793, + "learning_rate": 0.001, + "loss": 0.3995, + "step": 9420 + }, + { + "epoch": 0.2599461262274218, + "grad_norm": 0.0021815176587551832, + "learning_rate": 0.001, + "loss": 0.415, + "step": 9421 + }, + { + "epoch": 0.2599737184284862, + "grad_norm": 0.0028681547846645117, + "learning_rate": 0.001, + "loss": 0.4084, + "step": 9422 + }, + { + "epoch": 0.26000131062955056, + "grad_norm": 0.0035515769850462675, + "learning_rate": 0.001, + "loss": 0.3948, + "step": 9423 + }, + { + "epoch": 0.2600289028306149, + "grad_norm": 0.004106239881366491, + "learning_rate": 0.001, + "loss": 0.3822, + "step": 9424 + }, + { + "epoch": 0.26005649503167927, + "grad_norm": 0.005105991847813129, + "learning_rate": 0.001, + "loss": 0.3999, + "step": 9425 + }, + { + "epoch": 0.2600840872327437, + "grad_norm": 0.004727422725409269, + "learning_rate": 0.001, + "loss": 0.391, + "step": 9426 + }, + { + "epoch": 0.26011167943380803, + "grad_norm": 0.005350995343178511, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 9427 + }, + { + "epoch": 0.2601392716348724, + "grad_norm": 0.005172072909772396, + "learning_rate": 0.001, + "loss": 0.3803, + "step": 9428 + }, + { + "epoch": 0.2601668638359368, + "grad_norm": 0.004639607388526201, + "learning_rate": 0.001, + "loss": 0.4244, + "step": 9429 + }, + { + "epoch": 0.26019445603700114, + "grad_norm": 0.004586922936141491, + "learning_rate": 0.001, + "loss": 0.3792, + "step": 9430 + }, + { + "epoch": 0.2602220482380655, + "grad_norm": 0.004610531497746706, + "learning_rate": 0.001, + "loss": 0.3964, + "step": 9431 + }, + { + "epoch": 0.2602496404391299, + "grad_norm": 0.0026218774728477, + "learning_rate": 0.001, + "loss": 0.4178, + "step": 9432 + }, + { + "epoch": 0.26027723264019426, + "grad_norm": 0.006149280350655317, + "learning_rate": 0.001, + "loss": 0.3892, + "step": 9433 + }, + { + "epoch": 0.2603048248412586, + "grad_norm": 0.0048711239360272884, + "learning_rate": 0.001, + "loss": 0.4503, + "step": 9434 + }, + { + "epoch": 0.26033241704232296, + "grad_norm": 0.03840470314025879, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 9435 + }, + { + "epoch": 0.26036000924338737, + "grad_norm": 0.0037160839419811964, + "learning_rate": 0.001, + "loss": 0.3907, + "step": 9436 + }, + { + "epoch": 0.2603876014444517, + "grad_norm": 0.0032652821391820908, + "learning_rate": 0.001, + "loss": 0.4121, + "step": 9437 + }, + { + "epoch": 0.2604151936455161, + "grad_norm": 0.007966517470777035, + "learning_rate": 0.001, + "loss": 0.375, + "step": 9438 + }, + { + "epoch": 0.2604427858465805, + "grad_norm": 0.006473064888268709, + "learning_rate": 0.001, + "loss": 0.4054, + "step": 9439 + }, + { + "epoch": 0.26047037804764483, + "grad_norm": 0.005094799678772688, + "learning_rate": 0.001, + "loss": 0.3895, + "step": 9440 + }, + { + "epoch": 0.2604979702487092, + "grad_norm": 0.049805864691734314, + "learning_rate": 0.001, + "loss": 0.3923, + "step": 9441 + }, + { + "epoch": 0.2605255624497736, + "grad_norm": 0.0036678947508335114, + "learning_rate": 0.001, + "loss": 0.4396, + "step": 9442 + }, + { + "epoch": 0.26055315465083795, + "grad_norm": 0.004897846840322018, + "learning_rate": 0.001, + "loss": 0.366, + "step": 9443 + }, + { + "epoch": 0.2605807468519023, + "grad_norm": 0.00426288740709424, + "learning_rate": 0.001, + "loss": 0.3929, + "step": 9444 + }, + { + "epoch": 0.26060833905296665, + "grad_norm": 0.009089482948184013, + "learning_rate": 0.001, + "loss": 0.3799, + "step": 9445 + }, + { + "epoch": 0.26063593125403106, + "grad_norm": 0.00743220467120409, + "learning_rate": 0.001, + "loss": 0.3701, + "step": 9446 + }, + { + "epoch": 0.2606635234550954, + "grad_norm": 0.004399383440613747, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 9447 + }, + { + "epoch": 0.26069111565615977, + "grad_norm": 0.0023268854711204767, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 9448 + }, + { + "epoch": 0.2607187078572242, + "grad_norm": 0.003956654574722052, + "learning_rate": 0.001, + "loss": 0.3912, + "step": 9449 + }, + { + "epoch": 0.2607463000582885, + "grad_norm": 0.00389308063313365, + "learning_rate": 0.001, + "loss": 0.3898, + "step": 9450 + }, + { + "epoch": 0.2607738922593529, + "grad_norm": 0.0036460221745073795, + "learning_rate": 0.001, + "loss": 0.3999, + "step": 9451 + }, + { + "epoch": 0.2608014844604173, + "grad_norm": 0.003133069258183241, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 9452 + }, + { + "epoch": 0.26082907666148164, + "grad_norm": 0.0027508933562785387, + "learning_rate": 0.001, + "loss": 0.3912, + "step": 9453 + }, + { + "epoch": 0.260856668862546, + "grad_norm": 0.003370395628735423, + "learning_rate": 0.001, + "loss": 0.4176, + "step": 9454 + }, + { + "epoch": 0.26088426106361035, + "grad_norm": 0.0036432910710573196, + "learning_rate": 0.001, + "loss": 0.4114, + "step": 9455 + }, + { + "epoch": 0.26091185326467475, + "grad_norm": 0.004142069257795811, + "learning_rate": 0.001, + "loss": 0.3825, + "step": 9456 + }, + { + "epoch": 0.2609394454657391, + "grad_norm": 0.003075930755585432, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 9457 + }, + { + "epoch": 0.26096703766680346, + "grad_norm": 0.002679844619706273, + "learning_rate": 0.001, + "loss": 0.371, + "step": 9458 + }, + { + "epoch": 0.26099462986786787, + "grad_norm": 0.0034837801940739155, + "learning_rate": 0.001, + "loss": 0.3892, + "step": 9459 + }, + { + "epoch": 0.2610222220689322, + "grad_norm": 0.002534442814067006, + "learning_rate": 0.001, + "loss": 0.408, + "step": 9460 + }, + { + "epoch": 0.26104981426999657, + "grad_norm": 0.005891683977097273, + "learning_rate": 0.001, + "loss": 0.3751, + "step": 9461 + }, + { + "epoch": 0.261077406471061, + "grad_norm": 0.005468081682920456, + "learning_rate": 0.001, + "loss": 0.353, + "step": 9462 + }, + { + "epoch": 0.26110499867212533, + "grad_norm": 0.002956255106255412, + "learning_rate": 0.001, + "loss": 0.4211, + "step": 9463 + }, + { + "epoch": 0.2611325908731897, + "grad_norm": 0.003849986707791686, + "learning_rate": 0.001, + "loss": 0.3962, + "step": 9464 + }, + { + "epoch": 0.26116018307425404, + "grad_norm": 0.002705160528421402, + "learning_rate": 0.001, + "loss": 0.4156, + "step": 9465 + }, + { + "epoch": 0.26118777527531845, + "grad_norm": 0.002549894852563739, + "learning_rate": 0.001, + "loss": 0.4244, + "step": 9466 + }, + { + "epoch": 0.2612153674763828, + "grad_norm": 0.003938885871320963, + "learning_rate": 0.001, + "loss": 0.3799, + "step": 9467 + }, + { + "epoch": 0.26124295967744715, + "grad_norm": 0.002657047938555479, + "learning_rate": 0.001, + "loss": 0.3941, + "step": 9468 + }, + { + "epoch": 0.26127055187851156, + "grad_norm": 0.004167834762483835, + "learning_rate": 0.001, + "loss": 0.3993, + "step": 9469 + }, + { + "epoch": 0.2612981440795759, + "grad_norm": 0.0034376648254692554, + "learning_rate": 0.001, + "loss": 0.3597, + "step": 9470 + }, + { + "epoch": 0.26132573628064026, + "grad_norm": 0.0034596873447299004, + "learning_rate": 0.001, + "loss": 0.4028, + "step": 9471 + }, + { + "epoch": 0.26135332848170467, + "grad_norm": 0.002702021738514304, + "learning_rate": 0.001, + "loss": 0.3876, + "step": 9472 + }, + { + "epoch": 0.261380920682769, + "grad_norm": 0.0027853166684508324, + "learning_rate": 0.001, + "loss": 0.4026, + "step": 9473 + }, + { + "epoch": 0.2614085128838334, + "grad_norm": 0.0032341419719159603, + "learning_rate": 0.001, + "loss": 0.4335, + "step": 9474 + }, + { + "epoch": 0.26143610508489773, + "grad_norm": 0.0034284372813999653, + "learning_rate": 0.001, + "loss": 0.3815, + "step": 9475 + }, + { + "epoch": 0.26146369728596214, + "grad_norm": 0.003640861948952079, + "learning_rate": 0.001, + "loss": 0.4027, + "step": 9476 + }, + { + "epoch": 0.2614912894870265, + "grad_norm": 0.002866639755666256, + "learning_rate": 0.001, + "loss": 0.3967, + "step": 9477 + }, + { + "epoch": 0.26151888168809084, + "grad_norm": 0.0023725773207843304, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 9478 + }, + { + "epoch": 0.26154647388915525, + "grad_norm": 0.0032590448390692472, + "learning_rate": 0.001, + "loss": 0.3543, + "step": 9479 + }, + { + "epoch": 0.2615740660902196, + "grad_norm": 0.004378579091280699, + "learning_rate": 0.001, + "loss": 0.3743, + "step": 9480 + }, + { + "epoch": 0.26160165829128396, + "grad_norm": 0.0037949122488498688, + "learning_rate": 0.001, + "loss": 0.4333, + "step": 9481 + }, + { + "epoch": 0.26162925049234836, + "grad_norm": 0.002791197504848242, + "learning_rate": 0.001, + "loss": 0.3499, + "step": 9482 + }, + { + "epoch": 0.2616568426934127, + "grad_norm": 0.0038585460279136896, + "learning_rate": 0.001, + "loss": 0.3701, + "step": 9483 + }, + { + "epoch": 0.26168443489447707, + "grad_norm": 0.002629459137097001, + "learning_rate": 0.001, + "loss": 0.423, + "step": 9484 + }, + { + "epoch": 0.2617120270955414, + "grad_norm": 0.0032473283354192972, + "learning_rate": 0.001, + "loss": 0.3866, + "step": 9485 + }, + { + "epoch": 0.26173961929660583, + "grad_norm": 0.0029964474961161613, + "learning_rate": 0.001, + "loss": 0.3792, + "step": 9486 + }, + { + "epoch": 0.2617672114976702, + "grad_norm": 0.0031551928259432316, + "learning_rate": 0.001, + "loss": 0.4245, + "step": 9487 + }, + { + "epoch": 0.26179480369873454, + "grad_norm": 0.00244794599711895, + "learning_rate": 0.001, + "loss": 0.3962, + "step": 9488 + }, + { + "epoch": 0.26182239589979894, + "grad_norm": 0.003302481258288026, + "learning_rate": 0.001, + "loss": 0.4174, + "step": 9489 + }, + { + "epoch": 0.2618499881008633, + "grad_norm": 0.0027688101399689913, + "learning_rate": 0.001, + "loss": 0.4026, + "step": 9490 + }, + { + "epoch": 0.26187758030192765, + "grad_norm": 0.0031254065688699484, + "learning_rate": 0.001, + "loss": 0.3972, + "step": 9491 + }, + { + "epoch": 0.26190517250299206, + "grad_norm": 0.00236527225933969, + "learning_rate": 0.001, + "loss": 0.4213, + "step": 9492 + }, + { + "epoch": 0.2619327647040564, + "grad_norm": 0.0028193267062306404, + "learning_rate": 0.001, + "loss": 0.3627, + "step": 9493 + }, + { + "epoch": 0.26196035690512076, + "grad_norm": 0.002419235184788704, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 9494 + }, + { + "epoch": 0.2619879491061851, + "grad_norm": 0.006053110118955374, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 9495 + }, + { + "epoch": 0.2620155413072495, + "grad_norm": 0.004557557869702578, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 9496 + }, + { + "epoch": 0.2620431335083139, + "grad_norm": 0.002896302379667759, + "learning_rate": 0.001, + "loss": 0.3909, + "step": 9497 + }, + { + "epoch": 0.2620707257093782, + "grad_norm": 0.002858472056686878, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 9498 + }, + { + "epoch": 0.26209831791044264, + "grad_norm": 0.004048534668982029, + "learning_rate": 0.001, + "loss": 0.3825, + "step": 9499 + }, + { + "epoch": 0.262125910111507, + "grad_norm": 0.003960879985243082, + "learning_rate": 0.001, + "loss": 0.4, + "step": 9500 + }, + { + "epoch": 0.262125910111507, + "eval_runtime": 25.194, + "eval_samples_per_second": 1.27, + "eval_steps_per_second": 0.159, + "step": 9500 + }, + { + "epoch": 0.26215350231257134, + "grad_norm": 0.0026859226636588573, + "learning_rate": 0.001, + "loss": 0.3963, + "step": 9501 + }, + { + "epoch": 0.26218109451363575, + "grad_norm": 0.004340642131865025, + "learning_rate": 0.001, + "loss": 0.3599, + "step": 9502 + }, + { + "epoch": 0.2622086867147001, + "grad_norm": 0.003308952320367098, + "learning_rate": 0.001, + "loss": 0.3751, + "step": 9503 + }, + { + "epoch": 0.26223627891576445, + "grad_norm": 0.003607766004279256, + "learning_rate": 0.001, + "loss": 0.4285, + "step": 9504 + }, + { + "epoch": 0.2622638711168288, + "grad_norm": 0.004522815812379122, + "learning_rate": 0.001, + "loss": 0.3602, + "step": 9505 + }, + { + "epoch": 0.2622914633178932, + "grad_norm": 0.004235134460031986, + "learning_rate": 0.001, + "loss": 0.3815, + "step": 9506 + }, + { + "epoch": 0.26231905551895757, + "grad_norm": 0.010735023766756058, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 9507 + }, + { + "epoch": 0.2623466477200219, + "grad_norm": 0.004076390992850065, + "learning_rate": 0.001, + "loss": 0.4218, + "step": 9508 + }, + { + "epoch": 0.2623742399210863, + "grad_norm": 0.007312237750738859, + "learning_rate": 0.001, + "loss": 0.4081, + "step": 9509 + }, + { + "epoch": 0.2624018321221507, + "grad_norm": 0.007191124372184277, + "learning_rate": 0.001, + "loss": 0.38, + "step": 9510 + }, + { + "epoch": 0.26242942432321503, + "grad_norm": 0.0035255979746580124, + "learning_rate": 0.001, + "loss": 0.4116, + "step": 9511 + }, + { + "epoch": 0.2624570165242794, + "grad_norm": 0.004981547594070435, + "learning_rate": 0.001, + "loss": 0.4575, + "step": 9512 + }, + { + "epoch": 0.2624846087253438, + "grad_norm": 0.006985159125179052, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 9513 + }, + { + "epoch": 0.26251220092640815, + "grad_norm": 0.005956006236374378, + "learning_rate": 0.001, + "loss": 0.3624, + "step": 9514 + }, + { + "epoch": 0.2625397931274725, + "grad_norm": 0.005447957664728165, + "learning_rate": 0.001, + "loss": 0.4185, + "step": 9515 + }, + { + "epoch": 0.2625673853285369, + "grad_norm": 0.007735871244221926, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 9516 + }, + { + "epoch": 0.26259497752960126, + "grad_norm": 0.004465447273105383, + "learning_rate": 0.001, + "loss": 0.4081, + "step": 9517 + }, + { + "epoch": 0.2626225697306656, + "grad_norm": 0.0033752690069377422, + "learning_rate": 0.001, + "loss": 0.3773, + "step": 9518 + }, + { + "epoch": 0.26265016193173, + "grad_norm": 0.006316347047686577, + "learning_rate": 0.001, + "loss": 0.4389, + "step": 9519 + }, + { + "epoch": 0.26267775413279437, + "grad_norm": 0.019447481259703636, + "learning_rate": 0.001, + "loss": 0.3625, + "step": 9520 + }, + { + "epoch": 0.2627053463338587, + "grad_norm": 0.0039499313570559025, + "learning_rate": 0.001, + "loss": 0.42, + "step": 9521 + }, + { + "epoch": 0.2627329385349231, + "grad_norm": 0.005296787712723017, + "learning_rate": 0.001, + "loss": 0.397, + "step": 9522 + }, + { + "epoch": 0.2627605307359875, + "grad_norm": 0.0028091988060623407, + "learning_rate": 0.001, + "loss": 0.3897, + "step": 9523 + }, + { + "epoch": 0.26278812293705184, + "grad_norm": 0.004335075616836548, + "learning_rate": 0.001, + "loss": 0.3796, + "step": 9524 + }, + { + "epoch": 0.2628157151381162, + "grad_norm": 0.0030884994193911552, + "learning_rate": 0.001, + "loss": 0.415, + "step": 9525 + }, + { + "epoch": 0.2628433073391806, + "grad_norm": 0.004353479482233524, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 9526 + }, + { + "epoch": 0.26287089954024495, + "grad_norm": 0.004048333968967199, + "learning_rate": 0.001, + "loss": 0.4178, + "step": 9527 + }, + { + "epoch": 0.2628984917413093, + "grad_norm": 0.0034877778962254524, + "learning_rate": 0.001, + "loss": 0.3853, + "step": 9528 + }, + { + "epoch": 0.2629260839423737, + "grad_norm": 0.0037483058404177427, + "learning_rate": 0.001, + "loss": 0.399, + "step": 9529 + }, + { + "epoch": 0.26295367614343806, + "grad_norm": 0.005867067724466324, + "learning_rate": 0.001, + "loss": 0.3985, + "step": 9530 + }, + { + "epoch": 0.2629812683445024, + "grad_norm": 0.004038537386804819, + "learning_rate": 0.001, + "loss": 0.3983, + "step": 9531 + }, + { + "epoch": 0.26300886054556677, + "grad_norm": 0.005649103317409754, + "learning_rate": 0.001, + "loss": 0.4183, + "step": 9532 + }, + { + "epoch": 0.2630364527466312, + "grad_norm": 0.0033814937341958284, + "learning_rate": 0.001, + "loss": 0.3688, + "step": 9533 + }, + { + "epoch": 0.26306404494769553, + "grad_norm": 0.0035686511546373367, + "learning_rate": 0.001, + "loss": 0.3774, + "step": 9534 + }, + { + "epoch": 0.2630916371487599, + "grad_norm": 0.004245868884027004, + "learning_rate": 0.001, + "loss": 0.3896, + "step": 9535 + }, + { + "epoch": 0.2631192293498243, + "grad_norm": 0.006375094410032034, + "learning_rate": 0.001, + "loss": 0.3639, + "step": 9536 + }, + { + "epoch": 0.26314682155088864, + "grad_norm": 0.008856172673404217, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 9537 + }, + { + "epoch": 0.263174413751953, + "grad_norm": 0.005304283462464809, + "learning_rate": 0.001, + "loss": 0.3915, + "step": 9538 + }, + { + "epoch": 0.2632020059530174, + "grad_norm": 0.003037388902157545, + "learning_rate": 0.001, + "loss": 0.4013, + "step": 9539 + }, + { + "epoch": 0.26322959815408176, + "grad_norm": 0.00462722172960639, + "learning_rate": 0.001, + "loss": 0.3675, + "step": 9540 + }, + { + "epoch": 0.2632571903551461, + "grad_norm": 0.009490979835391045, + "learning_rate": 0.001, + "loss": 0.3635, + "step": 9541 + }, + { + "epoch": 0.26328478255621046, + "grad_norm": 0.0032088488806039095, + "learning_rate": 0.001, + "loss": 0.3793, + "step": 9542 + }, + { + "epoch": 0.26331237475727487, + "grad_norm": 0.006467896047979593, + "learning_rate": 0.001, + "loss": 0.3777, + "step": 9543 + }, + { + "epoch": 0.2633399669583392, + "grad_norm": 0.005221541505306959, + "learning_rate": 0.001, + "loss": 0.395, + "step": 9544 + }, + { + "epoch": 0.2633675591594036, + "grad_norm": 0.0027723468374460936, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 9545 + }, + { + "epoch": 0.263395151360468, + "grad_norm": 0.0038048315327614546, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 9546 + }, + { + "epoch": 0.26342274356153234, + "grad_norm": 0.004077599383890629, + "learning_rate": 0.001, + "loss": 0.438, + "step": 9547 + }, + { + "epoch": 0.2634503357625967, + "grad_norm": 0.0045372615568339825, + "learning_rate": 0.001, + "loss": 0.3785, + "step": 9548 + }, + { + "epoch": 0.2634779279636611, + "grad_norm": 0.0027856752276420593, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 9549 + }, + { + "epoch": 0.26350552016472545, + "grad_norm": 0.0026442657690495253, + "learning_rate": 0.001, + "loss": 0.4157, + "step": 9550 + }, + { + "epoch": 0.2635331123657898, + "grad_norm": 0.003073024796321988, + "learning_rate": 0.001, + "loss": 0.4273, + "step": 9551 + }, + { + "epoch": 0.26356070456685415, + "grad_norm": 0.0030135842971503735, + "learning_rate": 0.001, + "loss": 0.4209, + "step": 9552 + }, + { + "epoch": 0.26358829676791856, + "grad_norm": 0.004977943375706673, + "learning_rate": 0.001, + "loss": 0.4246, + "step": 9553 + }, + { + "epoch": 0.2636158889689829, + "grad_norm": 0.0030254484154284, + "learning_rate": 0.001, + "loss": 0.3803, + "step": 9554 + }, + { + "epoch": 0.26364348117004727, + "grad_norm": 0.003810320282354951, + "learning_rate": 0.001, + "loss": 0.3984, + "step": 9555 + }, + { + "epoch": 0.2636710733711117, + "grad_norm": 0.002874776953831315, + "learning_rate": 0.001, + "loss": 0.4319, + "step": 9556 + }, + { + "epoch": 0.263698665572176, + "grad_norm": 0.00934076588600874, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 9557 + }, + { + "epoch": 0.2637262577732404, + "grad_norm": 0.0022208630107343197, + "learning_rate": 0.001, + "loss": 0.4178, + "step": 9558 + }, + { + "epoch": 0.2637538499743048, + "grad_norm": 0.002492060884833336, + "learning_rate": 0.001, + "loss": 0.4334, + "step": 9559 + }, + { + "epoch": 0.26378144217536914, + "grad_norm": 0.005849645473062992, + "learning_rate": 0.001, + "loss": 0.3854, + "step": 9560 + }, + { + "epoch": 0.2638090343764335, + "grad_norm": 0.004015061073005199, + "learning_rate": 0.001, + "loss": 0.3674, + "step": 9561 + }, + { + "epoch": 0.26383662657749785, + "grad_norm": 0.0033112233504652977, + "learning_rate": 0.001, + "loss": 0.3804, + "step": 9562 + }, + { + "epoch": 0.26386421877856225, + "grad_norm": 0.002749896375462413, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 9563 + }, + { + "epoch": 0.2638918109796266, + "grad_norm": 0.0027315288316458464, + "learning_rate": 0.001, + "loss": 0.3964, + "step": 9564 + }, + { + "epoch": 0.26391940318069096, + "grad_norm": 0.008834882639348507, + "learning_rate": 0.001, + "loss": 0.3689, + "step": 9565 + }, + { + "epoch": 0.26394699538175537, + "grad_norm": 0.01456514373421669, + "learning_rate": 0.001, + "loss": 0.3782, + "step": 9566 + }, + { + "epoch": 0.2639745875828197, + "grad_norm": 0.0028064576908946037, + "learning_rate": 0.001, + "loss": 0.4158, + "step": 9567 + }, + { + "epoch": 0.2640021797838841, + "grad_norm": 0.00802356656640768, + "learning_rate": 0.001, + "loss": 0.415, + "step": 9568 + }, + { + "epoch": 0.2640297719849485, + "grad_norm": 0.0031302161514759064, + "learning_rate": 0.001, + "loss": 0.4031, + "step": 9569 + }, + { + "epoch": 0.26405736418601283, + "grad_norm": 0.0035431873984634876, + "learning_rate": 0.001, + "loss": 0.3863, + "step": 9570 + }, + { + "epoch": 0.2640849563870772, + "grad_norm": 0.002664680825546384, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 9571 + }, + { + "epoch": 0.26411254858814154, + "grad_norm": 0.006673896219581366, + "learning_rate": 0.001, + "loss": 0.3966, + "step": 9572 + }, + { + "epoch": 0.26414014078920595, + "grad_norm": 0.003176551777869463, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 9573 + }, + { + "epoch": 0.2641677329902703, + "grad_norm": 0.01657606102526188, + "learning_rate": 0.001, + "loss": 0.4126, + "step": 9574 + }, + { + "epoch": 0.26419532519133465, + "grad_norm": 0.0069385310634970665, + "learning_rate": 0.001, + "loss": 0.4318, + "step": 9575 + }, + { + "epoch": 0.26422291739239906, + "grad_norm": 0.009939515963196754, + "learning_rate": 0.001, + "loss": 0.4081, + "step": 9576 + }, + { + "epoch": 0.2642505095934634, + "grad_norm": 0.002801012946292758, + "learning_rate": 0.001, + "loss": 0.4001, + "step": 9577 + }, + { + "epoch": 0.26427810179452776, + "grad_norm": 0.003218452911823988, + "learning_rate": 0.001, + "loss": 0.3829, + "step": 9578 + }, + { + "epoch": 0.2643056939955922, + "grad_norm": 0.0029445989057421684, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 9579 + }, + { + "epoch": 0.2643332861966565, + "grad_norm": 0.003089828649535775, + "learning_rate": 0.001, + "loss": 0.4091, + "step": 9580 + }, + { + "epoch": 0.2643608783977209, + "grad_norm": 0.0025015585124492645, + "learning_rate": 0.001, + "loss": 0.4079, + "step": 9581 + }, + { + "epoch": 0.26438847059878523, + "grad_norm": 0.0021664395462721586, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 9582 + }, + { + "epoch": 0.26441606279984964, + "grad_norm": 0.003756561316549778, + "learning_rate": 0.001, + "loss": 0.388, + "step": 9583 + }, + { + "epoch": 0.264443655000914, + "grad_norm": 0.003459861734881997, + "learning_rate": 0.001, + "loss": 0.41, + "step": 9584 + }, + { + "epoch": 0.26447124720197834, + "grad_norm": 0.0028741243295371532, + "learning_rate": 0.001, + "loss": 0.4028, + "step": 9585 + }, + { + "epoch": 0.26449883940304275, + "grad_norm": 0.00595883559435606, + "learning_rate": 0.001, + "loss": 0.4392, + "step": 9586 + }, + { + "epoch": 0.2645264316041071, + "grad_norm": 0.011750889010727406, + "learning_rate": 0.001, + "loss": 0.402, + "step": 9587 + }, + { + "epoch": 0.26455402380517146, + "grad_norm": 0.005314968526363373, + "learning_rate": 0.001, + "loss": 0.354, + "step": 9588 + }, + { + "epoch": 0.26458161600623586, + "grad_norm": 0.007636393420398235, + "learning_rate": 0.001, + "loss": 0.4063, + "step": 9589 + }, + { + "epoch": 0.2646092082073002, + "grad_norm": 0.003898282302543521, + "learning_rate": 0.001, + "loss": 0.3966, + "step": 9590 + }, + { + "epoch": 0.26463680040836457, + "grad_norm": 0.0038661775179207325, + "learning_rate": 0.001, + "loss": 0.4047, + "step": 9591 + }, + { + "epoch": 0.2646643926094289, + "grad_norm": 0.006659380160272121, + "learning_rate": 0.001, + "loss": 0.3853, + "step": 9592 + }, + { + "epoch": 0.26469198481049333, + "grad_norm": 0.0044357809238135815, + "learning_rate": 0.001, + "loss": 0.3755, + "step": 9593 + }, + { + "epoch": 0.2647195770115577, + "grad_norm": 0.0028593826573342085, + "learning_rate": 0.001, + "loss": 0.3733, + "step": 9594 + }, + { + "epoch": 0.26474716921262204, + "grad_norm": 0.003539180150255561, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 9595 + }, + { + "epoch": 0.26477476141368644, + "grad_norm": 0.002942041726782918, + "learning_rate": 0.001, + "loss": 0.3914, + "step": 9596 + }, + { + "epoch": 0.2648023536147508, + "grad_norm": 0.0028285589069128036, + "learning_rate": 0.001, + "loss": 0.3797, + "step": 9597 + }, + { + "epoch": 0.26482994581581515, + "grad_norm": 0.0025839281734079123, + "learning_rate": 0.001, + "loss": 0.4079, + "step": 9598 + }, + { + "epoch": 0.2648575380168795, + "grad_norm": 0.00478973425924778, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 9599 + }, + { + "epoch": 0.2648851302179439, + "grad_norm": 0.00439714128151536, + "learning_rate": 0.001, + "loss": 0.3596, + "step": 9600 + }, + { + "epoch": 0.26491272241900826, + "grad_norm": 0.0023904817644506693, + "learning_rate": 0.001, + "loss": 0.3985, + "step": 9601 + }, + { + "epoch": 0.2649403146200726, + "grad_norm": 0.0025250171311199665, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 9602 + }, + { + "epoch": 0.264967906821137, + "grad_norm": 0.0065780081786215305, + "learning_rate": 0.001, + "loss": 0.3759, + "step": 9603 + }, + { + "epoch": 0.2649954990222014, + "grad_norm": 0.0026748040691018105, + "learning_rate": 0.001, + "loss": 0.4106, + "step": 9604 + }, + { + "epoch": 0.2650230912232657, + "grad_norm": 0.0031709959730505943, + "learning_rate": 0.001, + "loss": 0.4583, + "step": 9605 + }, + { + "epoch": 0.26505068342433014, + "grad_norm": 0.0026485987473279238, + "learning_rate": 0.001, + "loss": 0.4135, + "step": 9606 + }, + { + "epoch": 0.2650782756253945, + "grad_norm": 0.0029246548656374216, + "learning_rate": 0.001, + "loss": 0.4141, + "step": 9607 + }, + { + "epoch": 0.26510586782645884, + "grad_norm": 0.0031324047595262527, + "learning_rate": 0.001, + "loss": 0.3857, + "step": 9608 + }, + { + "epoch": 0.2651334600275232, + "grad_norm": 0.0024871290661394596, + "learning_rate": 0.001, + "loss": 0.4121, + "step": 9609 + }, + { + "epoch": 0.2651610522285876, + "grad_norm": 0.00248337653465569, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 9610 + }, + { + "epoch": 0.26518864442965195, + "grad_norm": 0.002592930570244789, + "learning_rate": 0.001, + "loss": 0.4398, + "step": 9611 + }, + { + "epoch": 0.2652162366307163, + "grad_norm": 0.0031219925731420517, + "learning_rate": 0.001, + "loss": 0.4022, + "step": 9612 + }, + { + "epoch": 0.2652438288317807, + "grad_norm": 0.002096372190862894, + "learning_rate": 0.001, + "loss": 0.4162, + "step": 9613 + }, + { + "epoch": 0.26527142103284507, + "grad_norm": 0.0021825828589498997, + "learning_rate": 0.001, + "loss": 0.4235, + "step": 9614 + }, + { + "epoch": 0.2652990132339094, + "grad_norm": 0.0035961021203547716, + "learning_rate": 0.001, + "loss": 0.3995, + "step": 9615 + }, + { + "epoch": 0.26532660543497383, + "grad_norm": 0.003122879657894373, + "learning_rate": 0.001, + "loss": 0.4624, + "step": 9616 + }, + { + "epoch": 0.2653541976360382, + "grad_norm": 0.004705473314970732, + "learning_rate": 0.001, + "loss": 0.4148, + "step": 9617 + }, + { + "epoch": 0.26538178983710253, + "grad_norm": 0.0024439608678221703, + "learning_rate": 0.001, + "loss": 0.4063, + "step": 9618 + }, + { + "epoch": 0.2654093820381669, + "grad_norm": 0.0027413556817919016, + "learning_rate": 0.001, + "loss": 0.3999, + "step": 9619 + }, + { + "epoch": 0.2654369742392313, + "grad_norm": 0.004719111602753401, + "learning_rate": 0.001, + "loss": 0.4197, + "step": 9620 + }, + { + "epoch": 0.26546456644029565, + "grad_norm": 0.003787653986364603, + "learning_rate": 0.001, + "loss": 0.4061, + "step": 9621 + }, + { + "epoch": 0.26549215864136, + "grad_norm": 0.007587883621454239, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 9622 + }, + { + "epoch": 0.2655197508424244, + "grad_norm": 0.0023858651984483004, + "learning_rate": 0.001, + "loss": 0.3952, + "step": 9623 + }, + { + "epoch": 0.26554734304348876, + "grad_norm": 0.005965366493910551, + "learning_rate": 0.001, + "loss": 0.3829, + "step": 9624 + }, + { + "epoch": 0.2655749352445531, + "grad_norm": 0.003585060592740774, + "learning_rate": 0.001, + "loss": 0.4503, + "step": 9625 + }, + { + "epoch": 0.2656025274456175, + "grad_norm": 0.006620787549763918, + "learning_rate": 0.001, + "loss": 0.3972, + "step": 9626 + }, + { + "epoch": 0.2656301196466819, + "grad_norm": 0.002199475420638919, + "learning_rate": 0.001, + "loss": 0.4479, + "step": 9627 + }, + { + "epoch": 0.2656577118477462, + "grad_norm": 0.003352593397721648, + "learning_rate": 0.001, + "loss": 0.4099, + "step": 9628 + }, + { + "epoch": 0.2656853040488106, + "grad_norm": 0.00883631780743599, + "learning_rate": 0.001, + "loss": 0.4001, + "step": 9629 + }, + { + "epoch": 0.265712896249875, + "grad_norm": 0.0033785421401262283, + "learning_rate": 0.001, + "loss": 0.4285, + "step": 9630 + }, + { + "epoch": 0.26574048845093934, + "grad_norm": 0.00507304398342967, + "learning_rate": 0.001, + "loss": 0.3795, + "step": 9631 + }, + { + "epoch": 0.2657680806520037, + "grad_norm": 0.010923560708761215, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 9632 + }, + { + "epoch": 0.2657956728530681, + "grad_norm": 0.003360697068274021, + "learning_rate": 0.001, + "loss": 0.3868, + "step": 9633 + }, + { + "epoch": 0.26582326505413245, + "grad_norm": 0.003309650346636772, + "learning_rate": 0.001, + "loss": 0.4029, + "step": 9634 + }, + { + "epoch": 0.2658508572551968, + "grad_norm": 0.003346043173223734, + "learning_rate": 0.001, + "loss": 0.395, + "step": 9635 + }, + { + "epoch": 0.2658784494562612, + "grad_norm": 0.002929107751697302, + "learning_rate": 0.001, + "loss": 0.4192, + "step": 9636 + }, + { + "epoch": 0.26590604165732556, + "grad_norm": 0.0035214154049754143, + "learning_rate": 0.001, + "loss": 0.4382, + "step": 9637 + }, + { + "epoch": 0.2659336338583899, + "grad_norm": 0.004796061664819717, + "learning_rate": 0.001, + "loss": 0.3684, + "step": 9638 + }, + { + "epoch": 0.26596122605945427, + "grad_norm": 0.002335017779842019, + "learning_rate": 0.001, + "loss": 0.4417, + "step": 9639 + }, + { + "epoch": 0.2659888182605187, + "grad_norm": 0.002536866581067443, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 9640 + }, + { + "epoch": 0.26601641046158303, + "grad_norm": 0.0021245151292532682, + "learning_rate": 0.001, + "loss": 0.4175, + "step": 9641 + }, + { + "epoch": 0.2660440026626474, + "grad_norm": 0.002907783491536975, + "learning_rate": 0.001, + "loss": 0.3827, + "step": 9642 + }, + { + "epoch": 0.2660715948637118, + "grad_norm": 0.00233438890427351, + "learning_rate": 0.001, + "loss": 0.4061, + "step": 9643 + }, + { + "epoch": 0.26609918706477614, + "grad_norm": 0.002872066106647253, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 9644 + }, + { + "epoch": 0.2661267792658405, + "grad_norm": 0.002991445129737258, + "learning_rate": 0.001, + "loss": 0.3818, + "step": 9645 + }, + { + "epoch": 0.2661543714669049, + "grad_norm": 0.004284476395696402, + "learning_rate": 0.001, + "loss": 0.3798, + "step": 9646 + }, + { + "epoch": 0.26618196366796926, + "grad_norm": 0.0026576148811727762, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 9647 + }, + { + "epoch": 0.2662095558690336, + "grad_norm": 0.0023093062918633223, + "learning_rate": 0.001, + "loss": 0.4186, + "step": 9648 + }, + { + "epoch": 0.26623714807009796, + "grad_norm": 0.0027488505002111197, + "learning_rate": 0.001, + "loss": 0.3849, + "step": 9649 + }, + { + "epoch": 0.26626474027116237, + "grad_norm": 0.002484068041667342, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 9650 + }, + { + "epoch": 0.2662923324722267, + "grad_norm": 0.003056896850466728, + "learning_rate": 0.001, + "loss": 0.4155, + "step": 9651 + }, + { + "epoch": 0.2663199246732911, + "grad_norm": 0.0031155794858932495, + "learning_rate": 0.001, + "loss": 0.4079, + "step": 9652 + }, + { + "epoch": 0.2663475168743555, + "grad_norm": 0.002967652166262269, + "learning_rate": 0.001, + "loss": 0.3771, + "step": 9653 + }, + { + "epoch": 0.26637510907541984, + "grad_norm": 0.004259769804775715, + "learning_rate": 0.001, + "loss": 0.3793, + "step": 9654 + }, + { + "epoch": 0.2664027012764842, + "grad_norm": 0.0025000995956361294, + "learning_rate": 0.001, + "loss": 0.4392, + "step": 9655 + }, + { + "epoch": 0.2664302934775486, + "grad_norm": 0.0023649544455111027, + "learning_rate": 0.001, + "loss": 0.4357, + "step": 9656 + }, + { + "epoch": 0.26645788567861295, + "grad_norm": 0.005546201951801777, + "learning_rate": 0.001, + "loss": 0.3846, + "step": 9657 + }, + { + "epoch": 0.2664854778796773, + "grad_norm": 0.005744127556681633, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 9658 + }, + { + "epoch": 0.26651307008074165, + "grad_norm": 0.004911984317004681, + "learning_rate": 0.001, + "loss": 0.4594, + "step": 9659 + }, + { + "epoch": 0.26654066228180606, + "grad_norm": 0.016794539988040924, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 9660 + }, + { + "epoch": 0.2665682544828704, + "grad_norm": 0.004329861607402563, + "learning_rate": 0.001, + "loss": 0.4239, + "step": 9661 + }, + { + "epoch": 0.26659584668393477, + "grad_norm": 0.002347084926441312, + "learning_rate": 0.001, + "loss": 0.4268, + "step": 9662 + }, + { + "epoch": 0.2666234388849992, + "grad_norm": 0.003959767986088991, + "learning_rate": 0.001, + "loss": 0.4072, + "step": 9663 + }, + { + "epoch": 0.26665103108606353, + "grad_norm": 0.0030292284209281206, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 9664 + }, + { + "epoch": 0.2666786232871279, + "grad_norm": 0.0031279707327485085, + "learning_rate": 0.001, + "loss": 0.4021, + "step": 9665 + }, + { + "epoch": 0.2667062154881923, + "grad_norm": 0.0038146632723510265, + "learning_rate": 0.001, + "loss": 0.3889, + "step": 9666 + }, + { + "epoch": 0.26673380768925664, + "grad_norm": 0.002462350530549884, + "learning_rate": 0.001, + "loss": 0.4424, + "step": 9667 + }, + { + "epoch": 0.266761399890321, + "grad_norm": 0.0025544725358486176, + "learning_rate": 0.001, + "loss": 0.4037, + "step": 9668 + }, + { + "epoch": 0.26678899209138535, + "grad_norm": 0.008749466389417648, + "learning_rate": 0.001, + "loss": 0.388, + "step": 9669 + }, + { + "epoch": 0.26681658429244975, + "grad_norm": 0.004481594543904066, + "learning_rate": 0.001, + "loss": 0.387, + "step": 9670 + }, + { + "epoch": 0.2668441764935141, + "grad_norm": 0.0027323602698743343, + "learning_rate": 0.001, + "loss": 0.4081, + "step": 9671 + }, + { + "epoch": 0.26687176869457846, + "grad_norm": 0.005246414337307215, + "learning_rate": 0.001, + "loss": 0.4027, + "step": 9672 + }, + { + "epoch": 0.26689936089564287, + "grad_norm": 0.0037931909319013357, + "learning_rate": 0.001, + "loss": 0.361, + "step": 9673 + }, + { + "epoch": 0.2669269530967072, + "grad_norm": 0.0028204875998198986, + "learning_rate": 0.001, + "loss": 0.3814, + "step": 9674 + }, + { + "epoch": 0.2669545452977716, + "grad_norm": 0.002369094407185912, + "learning_rate": 0.001, + "loss": 0.4217, + "step": 9675 + }, + { + "epoch": 0.266982137498836, + "grad_norm": 0.0028346215840429068, + "learning_rate": 0.001, + "loss": 0.4193, + "step": 9676 + }, + { + "epoch": 0.26700972969990033, + "grad_norm": 0.002556850900873542, + "learning_rate": 0.001, + "loss": 0.4062, + "step": 9677 + }, + { + "epoch": 0.2670373219009647, + "grad_norm": 0.0030708981212228537, + "learning_rate": 0.001, + "loss": 0.4116, + "step": 9678 + }, + { + "epoch": 0.26706491410202904, + "grad_norm": 0.003195315832272172, + "learning_rate": 0.001, + "loss": 0.397, + "step": 9679 + }, + { + "epoch": 0.26709250630309345, + "grad_norm": 0.0025848529767245054, + "learning_rate": 0.001, + "loss": 0.3751, + "step": 9680 + }, + { + "epoch": 0.2671200985041578, + "grad_norm": 0.0033659334294497967, + "learning_rate": 0.001, + "loss": 0.3981, + "step": 9681 + }, + { + "epoch": 0.26714769070522215, + "grad_norm": 0.023203924298286438, + "learning_rate": 0.001, + "loss": 0.4258, + "step": 9682 + }, + { + "epoch": 0.26717528290628656, + "grad_norm": 0.016148481518030167, + "learning_rate": 0.001, + "loss": 0.3905, + "step": 9683 + }, + { + "epoch": 0.2672028751073509, + "grad_norm": 0.003742114407941699, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 9684 + }, + { + "epoch": 0.26723046730841526, + "grad_norm": 0.003448138013482094, + "learning_rate": 0.001, + "loss": 0.379, + "step": 9685 + }, + { + "epoch": 0.2672580595094797, + "grad_norm": 0.004335745703428984, + "learning_rate": 0.001, + "loss": 0.406, + "step": 9686 + }, + { + "epoch": 0.267285651710544, + "grad_norm": 0.0025457171723246574, + "learning_rate": 0.001, + "loss": 0.4174, + "step": 9687 + }, + { + "epoch": 0.2673132439116084, + "grad_norm": 0.0024251220747828484, + "learning_rate": 0.001, + "loss": 0.4321, + "step": 9688 + }, + { + "epoch": 0.26734083611267273, + "grad_norm": 0.00228850357234478, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 9689 + }, + { + "epoch": 0.26736842831373714, + "grad_norm": 0.0042780437506735325, + "learning_rate": 0.001, + "loss": 0.387, + "step": 9690 + }, + { + "epoch": 0.2673960205148015, + "grad_norm": 0.003844790393486619, + "learning_rate": 0.001, + "loss": 0.3944, + "step": 9691 + }, + { + "epoch": 0.26742361271586584, + "grad_norm": 0.005739064887166023, + "learning_rate": 0.001, + "loss": 0.4403, + "step": 9692 + }, + { + "epoch": 0.26745120491693025, + "grad_norm": 0.0037738836836069822, + "learning_rate": 0.001, + "loss": 0.3757, + "step": 9693 + }, + { + "epoch": 0.2674787971179946, + "grad_norm": 0.006218787748366594, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 9694 + }, + { + "epoch": 0.26750638931905896, + "grad_norm": 0.002998180454596877, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 9695 + }, + { + "epoch": 0.2675339815201233, + "grad_norm": 0.008532087318599224, + "learning_rate": 0.001, + "loss": 0.3896, + "step": 9696 + }, + { + "epoch": 0.2675615737211877, + "grad_norm": 0.002803490497171879, + "learning_rate": 0.001, + "loss": 0.3901, + "step": 9697 + }, + { + "epoch": 0.26758916592225207, + "grad_norm": 0.003659422043710947, + "learning_rate": 0.001, + "loss": 0.3915, + "step": 9698 + }, + { + "epoch": 0.2676167581233164, + "grad_norm": 0.004317516926676035, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 9699 + }, + { + "epoch": 0.26764435032438083, + "grad_norm": 0.004090850241482258, + "learning_rate": 0.001, + "loss": 0.3826, + "step": 9700 + }, + { + "epoch": 0.2676719425254452, + "grad_norm": 0.005415142513811588, + "learning_rate": 0.001, + "loss": 0.4334, + "step": 9701 + }, + { + "epoch": 0.26769953472650954, + "grad_norm": 0.0036655161529779434, + "learning_rate": 0.001, + "loss": 0.3982, + "step": 9702 + }, + { + "epoch": 0.26772712692757394, + "grad_norm": 0.008789146319031715, + "learning_rate": 0.001, + "loss": 0.4141, + "step": 9703 + }, + { + "epoch": 0.2677547191286383, + "grad_norm": 0.004173342138528824, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 9704 + }, + { + "epoch": 0.26778231132970265, + "grad_norm": 0.0031271290499716997, + "learning_rate": 0.001, + "loss": 0.424, + "step": 9705 + }, + { + "epoch": 0.267809903530767, + "grad_norm": 0.0030106983613222837, + "learning_rate": 0.001, + "loss": 0.3698, + "step": 9706 + }, + { + "epoch": 0.2678374957318314, + "grad_norm": 0.004071452189236879, + "learning_rate": 0.001, + "loss": 0.3773, + "step": 9707 + }, + { + "epoch": 0.26786508793289576, + "grad_norm": 0.002735607326030731, + "learning_rate": 0.001, + "loss": 0.4445, + "step": 9708 + }, + { + "epoch": 0.2678926801339601, + "grad_norm": 0.0036248862743377686, + "learning_rate": 0.001, + "loss": 0.3786, + "step": 9709 + }, + { + "epoch": 0.2679202723350245, + "grad_norm": 0.004495333880186081, + "learning_rate": 0.001, + "loss": 0.4288, + "step": 9710 + }, + { + "epoch": 0.2679478645360889, + "grad_norm": 0.003465922549366951, + "learning_rate": 0.001, + "loss": 0.4178, + "step": 9711 + }, + { + "epoch": 0.26797545673715323, + "grad_norm": 0.0050760298036038876, + "learning_rate": 0.001, + "loss": 0.3898, + "step": 9712 + }, + { + "epoch": 0.26800304893821764, + "grad_norm": 0.0025128766428679228, + "learning_rate": 0.001, + "loss": 0.4293, + "step": 9713 + }, + { + "epoch": 0.268030641139282, + "grad_norm": 0.0030928144697099924, + "learning_rate": 0.001, + "loss": 0.4024, + "step": 9714 + }, + { + "epoch": 0.26805823334034634, + "grad_norm": 0.0031707047019153833, + "learning_rate": 0.001, + "loss": 0.4027, + "step": 9715 + }, + { + "epoch": 0.2680858255414107, + "grad_norm": 0.0074263643473386765, + "learning_rate": 0.001, + "loss": 0.392, + "step": 9716 + }, + { + "epoch": 0.2681134177424751, + "grad_norm": 0.004428863059729338, + "learning_rate": 0.001, + "loss": 0.4157, + "step": 9717 + }, + { + "epoch": 0.26814100994353945, + "grad_norm": 0.005300541874021292, + "learning_rate": 0.001, + "loss": 0.421, + "step": 9718 + }, + { + "epoch": 0.2681686021446038, + "grad_norm": 0.013318234123289585, + "learning_rate": 0.001, + "loss": 0.3608, + "step": 9719 + }, + { + "epoch": 0.2681961943456682, + "grad_norm": 0.0030859310645610094, + "learning_rate": 0.001, + "loss": 0.4177, + "step": 9720 + }, + { + "epoch": 0.26822378654673257, + "grad_norm": 0.00345474760979414, + "learning_rate": 0.001, + "loss": 0.3964, + "step": 9721 + }, + { + "epoch": 0.2682513787477969, + "grad_norm": 0.006549532990902662, + "learning_rate": 0.001, + "loss": 0.3835, + "step": 9722 + }, + { + "epoch": 0.26827897094886133, + "grad_norm": 0.013787861913442612, + "learning_rate": 0.001, + "loss": 0.4332, + "step": 9723 + }, + { + "epoch": 0.2683065631499257, + "grad_norm": 0.00343018164858222, + "learning_rate": 0.001, + "loss": 0.4035, + "step": 9724 + }, + { + "epoch": 0.26833415535099003, + "grad_norm": 0.002628603018820286, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 9725 + }, + { + "epoch": 0.2683617475520544, + "grad_norm": 0.004094823729246855, + "learning_rate": 0.001, + "loss": 0.3817, + "step": 9726 + }, + { + "epoch": 0.2683893397531188, + "grad_norm": 0.002913734642788768, + "learning_rate": 0.001, + "loss": 0.3751, + "step": 9727 + }, + { + "epoch": 0.26841693195418315, + "grad_norm": 0.002792428247630596, + "learning_rate": 0.001, + "loss": 0.3806, + "step": 9728 + }, + { + "epoch": 0.2684445241552475, + "grad_norm": 0.0024083973839879036, + "learning_rate": 0.001, + "loss": 0.4336, + "step": 9729 + }, + { + "epoch": 0.2684721163563119, + "grad_norm": 0.002965459832921624, + "learning_rate": 0.001, + "loss": 0.4155, + "step": 9730 + }, + { + "epoch": 0.26849970855737626, + "grad_norm": 0.01316264271736145, + "learning_rate": 0.001, + "loss": 0.4539, + "step": 9731 + }, + { + "epoch": 0.2685273007584406, + "grad_norm": 0.002591692376881838, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 9732 + }, + { + "epoch": 0.268554892959505, + "grad_norm": 0.002965483581647277, + "learning_rate": 0.001, + "loss": 0.3652, + "step": 9733 + }, + { + "epoch": 0.2685824851605694, + "grad_norm": 0.004762920085340738, + "learning_rate": 0.001, + "loss": 0.4115, + "step": 9734 + }, + { + "epoch": 0.2686100773616337, + "grad_norm": 0.0021797195076942444, + "learning_rate": 0.001, + "loss": 0.3907, + "step": 9735 + }, + { + "epoch": 0.2686376695626981, + "grad_norm": 0.0029116403311491013, + "learning_rate": 0.001, + "loss": 0.3816, + "step": 9736 + }, + { + "epoch": 0.2686652617637625, + "grad_norm": 0.003725858870893717, + "learning_rate": 0.001, + "loss": 0.3588, + "step": 9737 + }, + { + "epoch": 0.26869285396482684, + "grad_norm": 0.004337244667112827, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 9738 + }, + { + "epoch": 0.2687204461658912, + "grad_norm": 0.0027076283004134893, + "learning_rate": 0.001, + "loss": 0.391, + "step": 9739 + }, + { + "epoch": 0.2687480383669556, + "grad_norm": 0.0021667422261089087, + "learning_rate": 0.001, + "loss": 0.4089, + "step": 9740 + }, + { + "epoch": 0.26877563056801995, + "grad_norm": 0.0021627771202474833, + "learning_rate": 0.001, + "loss": 0.4135, + "step": 9741 + }, + { + "epoch": 0.2688032227690843, + "grad_norm": 0.0031696807127445936, + "learning_rate": 0.001, + "loss": 0.3744, + "step": 9742 + }, + { + "epoch": 0.2688308149701487, + "grad_norm": 0.0033821675460785627, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 9743 + }, + { + "epoch": 0.26885840717121307, + "grad_norm": 0.0039416286163032055, + "learning_rate": 0.001, + "loss": 0.3728, + "step": 9744 + }, + { + "epoch": 0.2688859993722774, + "grad_norm": 0.003314357250928879, + "learning_rate": 0.001, + "loss": 0.3387, + "step": 9745 + }, + { + "epoch": 0.26891359157334177, + "grad_norm": 0.0031994907185435295, + "learning_rate": 0.001, + "loss": 0.4292, + "step": 9746 + }, + { + "epoch": 0.2689411837744062, + "grad_norm": 0.006605618633329868, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 9747 + }, + { + "epoch": 0.26896877597547053, + "grad_norm": 0.0040326593443751335, + "learning_rate": 0.001, + "loss": 0.3823, + "step": 9748 + }, + { + "epoch": 0.2689963681765349, + "grad_norm": 0.004000399261713028, + "learning_rate": 0.001, + "loss": 0.4014, + "step": 9749 + }, + { + "epoch": 0.2690239603775993, + "grad_norm": 0.005057289730757475, + "learning_rate": 0.001, + "loss": 0.4383, + "step": 9750 + }, + { + "epoch": 0.26905155257866364, + "grad_norm": 0.006951628718525171, + "learning_rate": 0.001, + "loss": 0.4397, + "step": 9751 + }, + { + "epoch": 0.269079144779728, + "grad_norm": 0.012159796431660652, + "learning_rate": 0.001, + "loss": 0.3827, + "step": 9752 + }, + { + "epoch": 0.2691067369807924, + "grad_norm": 0.012929181568324566, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 9753 + }, + { + "epoch": 0.26913432918185676, + "grad_norm": 0.028701601549983025, + "learning_rate": 0.001, + "loss": 0.4264, + "step": 9754 + }, + { + "epoch": 0.2691619213829211, + "grad_norm": 0.00337967392988503, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 9755 + }, + { + "epoch": 0.26918951358398546, + "grad_norm": 0.004440324380993843, + "learning_rate": 0.001, + "loss": 0.4315, + "step": 9756 + }, + { + "epoch": 0.26921710578504987, + "grad_norm": 0.009946012869477272, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 9757 + }, + { + "epoch": 0.2692446979861142, + "grad_norm": 0.0047539942897856236, + "learning_rate": 0.001, + "loss": 0.3814, + "step": 9758 + }, + { + "epoch": 0.2692722901871786, + "grad_norm": 0.002247569151222706, + "learning_rate": 0.001, + "loss": 0.4414, + "step": 9759 + }, + { + "epoch": 0.269299882388243, + "grad_norm": 0.004046887159347534, + "learning_rate": 0.001, + "loss": 0.3617, + "step": 9760 + }, + { + "epoch": 0.26932747458930734, + "grad_norm": 0.0023645355831831694, + "learning_rate": 0.001, + "loss": 0.4105, + "step": 9761 + }, + { + "epoch": 0.2693550667903717, + "grad_norm": 0.006252918858081102, + "learning_rate": 0.001, + "loss": 0.3906, + "step": 9762 + }, + { + "epoch": 0.2693826589914361, + "grad_norm": 0.004668123554438353, + "learning_rate": 0.001, + "loss": 0.4173, + "step": 9763 + }, + { + "epoch": 0.26941025119250045, + "grad_norm": 0.005385317839682102, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 9764 + }, + { + "epoch": 0.2694378433935648, + "grad_norm": 0.004070324823260307, + "learning_rate": 0.001, + "loss": 0.409, + "step": 9765 + }, + { + "epoch": 0.26946543559462915, + "grad_norm": 0.002345689572393894, + "learning_rate": 0.001, + "loss": 0.4281, + "step": 9766 + }, + { + "epoch": 0.26949302779569356, + "grad_norm": 0.004202275536954403, + "learning_rate": 0.001, + "loss": 0.4266, + "step": 9767 + }, + { + "epoch": 0.2695206199967579, + "grad_norm": 0.003104708855971694, + "learning_rate": 0.001, + "loss": 0.3805, + "step": 9768 + }, + { + "epoch": 0.26954821219782227, + "grad_norm": 0.003290161257609725, + "learning_rate": 0.001, + "loss": 0.4162, + "step": 9769 + }, + { + "epoch": 0.2695758043988867, + "grad_norm": 0.0024769490119069815, + "learning_rate": 0.001, + "loss": 0.3946, + "step": 9770 + }, + { + "epoch": 0.26960339659995103, + "grad_norm": 0.002376476302742958, + "learning_rate": 0.001, + "loss": 0.3567, + "step": 9771 + }, + { + "epoch": 0.2696309888010154, + "grad_norm": 0.00581009779125452, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 9772 + }, + { + "epoch": 0.2696585810020798, + "grad_norm": 0.005047387443482876, + "learning_rate": 0.001, + "loss": 0.4187, + "step": 9773 + }, + { + "epoch": 0.26968617320314414, + "grad_norm": 0.002559883752837777, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 9774 + }, + { + "epoch": 0.2697137654042085, + "grad_norm": 0.002292527351528406, + "learning_rate": 0.001, + "loss": 0.402, + "step": 9775 + }, + { + "epoch": 0.26974135760527285, + "grad_norm": 0.002428713021799922, + "learning_rate": 0.001, + "loss": 0.393, + "step": 9776 + }, + { + "epoch": 0.26976894980633725, + "grad_norm": 0.0029613785445690155, + "learning_rate": 0.001, + "loss": 0.387, + "step": 9777 + }, + { + "epoch": 0.2697965420074016, + "grad_norm": 0.00751551054418087, + "learning_rate": 0.001, + "loss": 0.383, + "step": 9778 + }, + { + "epoch": 0.26982413420846596, + "grad_norm": 0.0034091894049197435, + "learning_rate": 0.001, + "loss": 0.4048, + "step": 9779 + }, + { + "epoch": 0.26985172640953037, + "grad_norm": 0.003550524590536952, + "learning_rate": 0.001, + "loss": 0.3828, + "step": 9780 + }, + { + "epoch": 0.2698793186105947, + "grad_norm": 0.019099680706858635, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 9781 + }, + { + "epoch": 0.2699069108116591, + "grad_norm": 0.013346359133720398, + "learning_rate": 0.001, + "loss": 0.4195, + "step": 9782 + }, + { + "epoch": 0.2699345030127235, + "grad_norm": 0.0031708034221082926, + "learning_rate": 0.001, + "loss": 0.4188, + "step": 9783 + }, + { + "epoch": 0.26996209521378783, + "grad_norm": 0.009070201776921749, + "learning_rate": 0.001, + "loss": 0.3902, + "step": 9784 + }, + { + "epoch": 0.2699896874148522, + "grad_norm": 0.003416346851736307, + "learning_rate": 0.001, + "loss": 0.3885, + "step": 9785 + }, + { + "epoch": 0.27001727961591654, + "grad_norm": 0.0025843738112598658, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 9786 + }, + { + "epoch": 0.27004487181698095, + "grad_norm": 0.024039220064878464, + "learning_rate": 0.001, + "loss": 0.3799, + "step": 9787 + }, + { + "epoch": 0.2700724640180453, + "grad_norm": 0.00454790610820055, + "learning_rate": 0.001, + "loss": 0.3843, + "step": 9788 + }, + { + "epoch": 0.27010005621910965, + "grad_norm": 0.0040943981148302555, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 9789 + }, + { + "epoch": 0.27012764842017406, + "grad_norm": 0.01507536880671978, + "learning_rate": 0.001, + "loss": 0.401, + "step": 9790 + }, + { + "epoch": 0.2701552406212384, + "grad_norm": 0.007708291057497263, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 9791 + }, + { + "epoch": 0.27018283282230277, + "grad_norm": 0.002290240256115794, + "learning_rate": 0.001, + "loss": 0.4356, + "step": 9792 + }, + { + "epoch": 0.2702104250233671, + "grad_norm": 0.006000965368002653, + "learning_rate": 0.001, + "loss": 0.3904, + "step": 9793 + }, + { + "epoch": 0.2702380172244315, + "grad_norm": 0.0025645066052675247, + "learning_rate": 0.001, + "loss": 0.3787, + "step": 9794 + }, + { + "epoch": 0.2702656094254959, + "grad_norm": 0.004800410475581884, + "learning_rate": 0.001, + "loss": 0.3921, + "step": 9795 + }, + { + "epoch": 0.27029320162656023, + "grad_norm": 0.002543856855481863, + "learning_rate": 0.001, + "loss": 0.4212, + "step": 9796 + }, + { + "epoch": 0.27032079382762464, + "grad_norm": 0.0024924466852098703, + "learning_rate": 0.001, + "loss": 0.4508, + "step": 9797 + }, + { + "epoch": 0.270348386028689, + "grad_norm": 0.017285561189055443, + "learning_rate": 0.001, + "loss": 0.4157, + "step": 9798 + }, + { + "epoch": 0.27037597822975334, + "grad_norm": 0.0034071647096425295, + "learning_rate": 0.001, + "loss": 0.4031, + "step": 9799 + }, + { + "epoch": 0.27040357043081775, + "grad_norm": 0.0026173749938607216, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 9800 + }, + { + "epoch": 0.2704311626318821, + "grad_norm": 0.0027858337853103876, + "learning_rate": 0.001, + "loss": 0.3942, + "step": 9801 + }, + { + "epoch": 0.27045875483294646, + "grad_norm": 0.003459551138803363, + "learning_rate": 0.001, + "loss": 0.3888, + "step": 9802 + }, + { + "epoch": 0.2704863470340108, + "grad_norm": 0.0028464416973292828, + "learning_rate": 0.001, + "loss": 0.3334, + "step": 9803 + }, + { + "epoch": 0.2705139392350752, + "grad_norm": 0.0025196147616952658, + "learning_rate": 0.001, + "loss": 0.4198, + "step": 9804 + }, + { + "epoch": 0.27054153143613957, + "grad_norm": 0.002594658173620701, + "learning_rate": 0.001, + "loss": 0.3921, + "step": 9805 + }, + { + "epoch": 0.2705691236372039, + "grad_norm": 0.0055243829265236855, + "learning_rate": 0.001, + "loss": 0.4163, + "step": 9806 + }, + { + "epoch": 0.27059671583826833, + "grad_norm": 0.003879512194544077, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 9807 + }, + { + "epoch": 0.2706243080393327, + "grad_norm": 0.0029453851748257875, + "learning_rate": 0.001, + "loss": 0.3562, + "step": 9808 + }, + { + "epoch": 0.27065190024039704, + "grad_norm": 0.004673081450164318, + "learning_rate": 0.001, + "loss": 0.394, + "step": 9809 + }, + { + "epoch": 0.27067949244146144, + "grad_norm": 0.002838741522282362, + "learning_rate": 0.001, + "loss": 0.4154, + "step": 9810 + }, + { + "epoch": 0.2707070846425258, + "grad_norm": 0.002747466554865241, + "learning_rate": 0.001, + "loss": 0.3861, + "step": 9811 + }, + { + "epoch": 0.27073467684359015, + "grad_norm": 0.003146926872432232, + "learning_rate": 0.001, + "loss": 0.4179, + "step": 9812 + }, + { + "epoch": 0.2707622690446545, + "grad_norm": 0.007232631091028452, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 9813 + }, + { + "epoch": 0.2707898612457189, + "grad_norm": 0.0033664864022284746, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 9814 + }, + { + "epoch": 0.27081745344678326, + "grad_norm": 0.002574306447058916, + "learning_rate": 0.001, + "loss": 0.4027, + "step": 9815 + }, + { + "epoch": 0.2708450456478476, + "grad_norm": 0.0030514101963490248, + "learning_rate": 0.001, + "loss": 0.3745, + "step": 9816 + }, + { + "epoch": 0.270872637848912, + "grad_norm": 0.0026982761919498444, + "learning_rate": 0.001, + "loss": 0.4389, + "step": 9817 + }, + { + "epoch": 0.2709002300499764, + "grad_norm": 0.0034780215937644243, + "learning_rate": 0.001, + "loss": 0.4446, + "step": 9818 + }, + { + "epoch": 0.27092782225104073, + "grad_norm": 0.0022655725479125977, + "learning_rate": 0.001, + "loss": 0.391, + "step": 9819 + }, + { + "epoch": 0.27095541445210514, + "grad_norm": 0.0031723659485578537, + "learning_rate": 0.001, + "loss": 0.3482, + "step": 9820 + }, + { + "epoch": 0.2709830066531695, + "grad_norm": 0.004059515427798033, + "learning_rate": 0.001, + "loss": 0.4105, + "step": 9821 + }, + { + "epoch": 0.27101059885423384, + "grad_norm": 0.003187219612300396, + "learning_rate": 0.001, + "loss": 0.3748, + "step": 9822 + }, + { + "epoch": 0.2710381910552982, + "grad_norm": 0.002115658251568675, + "learning_rate": 0.001, + "loss": 0.4208, + "step": 9823 + }, + { + "epoch": 0.2710657832563626, + "grad_norm": 0.004135797265917063, + "learning_rate": 0.001, + "loss": 0.4439, + "step": 9824 + }, + { + "epoch": 0.27109337545742695, + "grad_norm": 0.0046222819946706295, + "learning_rate": 0.001, + "loss": 0.4101, + "step": 9825 + }, + { + "epoch": 0.2711209676584913, + "grad_norm": 0.00371656590141356, + "learning_rate": 0.001, + "loss": 0.3868, + "step": 9826 + }, + { + "epoch": 0.2711485598595557, + "grad_norm": 0.0033854972571134567, + "learning_rate": 0.001, + "loss": 0.3818, + "step": 9827 + }, + { + "epoch": 0.27117615206062007, + "grad_norm": 0.0034048438537865877, + "learning_rate": 0.001, + "loss": 0.3669, + "step": 9828 + }, + { + "epoch": 0.2712037442616844, + "grad_norm": 0.005359988659620285, + "learning_rate": 0.001, + "loss": 0.3695, + "step": 9829 + }, + { + "epoch": 0.27123133646274883, + "grad_norm": 0.0037438932340592146, + "learning_rate": 0.001, + "loss": 0.3743, + "step": 9830 + }, + { + "epoch": 0.2712589286638132, + "grad_norm": 0.002518385648727417, + "learning_rate": 0.001, + "loss": 0.4065, + "step": 9831 + }, + { + "epoch": 0.27128652086487753, + "grad_norm": 0.0029225589241832495, + "learning_rate": 0.001, + "loss": 0.414, + "step": 9832 + }, + { + "epoch": 0.2713141130659419, + "grad_norm": 0.003086130367591977, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 9833 + }, + { + "epoch": 0.2713417052670063, + "grad_norm": 0.005153126548975706, + "learning_rate": 0.001, + "loss": 0.37, + "step": 9834 + }, + { + "epoch": 0.27136929746807065, + "grad_norm": 0.0035248089116066694, + "learning_rate": 0.001, + "loss": 0.4109, + "step": 9835 + }, + { + "epoch": 0.271396889669135, + "grad_norm": 0.006128870882093906, + "learning_rate": 0.001, + "loss": 0.4036, + "step": 9836 + }, + { + "epoch": 0.2714244818701994, + "grad_norm": 0.003302632598206401, + "learning_rate": 0.001, + "loss": 0.3738, + "step": 9837 + }, + { + "epoch": 0.27145207407126376, + "grad_norm": 0.004240743815898895, + "learning_rate": 0.001, + "loss": 0.4279, + "step": 9838 + }, + { + "epoch": 0.2714796662723281, + "grad_norm": 0.014292960986495018, + "learning_rate": 0.001, + "loss": 0.4009, + "step": 9839 + }, + { + "epoch": 0.2715072584733925, + "grad_norm": 0.002007813658565283, + "learning_rate": 0.001, + "loss": 0.4097, + "step": 9840 + }, + { + "epoch": 0.2715348506744569, + "grad_norm": 0.004010828211903572, + "learning_rate": 0.001, + "loss": 0.3683, + "step": 9841 + }, + { + "epoch": 0.2715624428755212, + "grad_norm": 0.00426107831299305, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 9842 + }, + { + "epoch": 0.2715900350765856, + "grad_norm": 0.0019332681549713016, + "learning_rate": 0.001, + "loss": 0.4438, + "step": 9843 + }, + { + "epoch": 0.27161762727765, + "grad_norm": 0.0026610195636749268, + "learning_rate": 0.001, + "loss": 0.406, + "step": 9844 + }, + { + "epoch": 0.27164521947871434, + "grad_norm": 0.0026648433413356543, + "learning_rate": 0.001, + "loss": 0.3922, + "step": 9845 + }, + { + "epoch": 0.2716728116797787, + "grad_norm": 0.0038692099042236805, + "learning_rate": 0.001, + "loss": 0.423, + "step": 9846 + }, + { + "epoch": 0.2717004038808431, + "grad_norm": 0.0028008136432617903, + "learning_rate": 0.001, + "loss": 0.3998, + "step": 9847 + }, + { + "epoch": 0.27172799608190745, + "grad_norm": 0.0038147938903421164, + "learning_rate": 0.001, + "loss": 0.394, + "step": 9848 + }, + { + "epoch": 0.2717555882829718, + "grad_norm": 0.0787762999534607, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 9849 + }, + { + "epoch": 0.2717831804840362, + "grad_norm": 0.004193917848169804, + "learning_rate": 0.001, + "loss": 0.4197, + "step": 9850 + }, + { + "epoch": 0.27181077268510057, + "grad_norm": 0.0029792841523885727, + "learning_rate": 0.001, + "loss": 0.3763, + "step": 9851 + }, + { + "epoch": 0.2718383648861649, + "grad_norm": 0.002425075275823474, + "learning_rate": 0.001, + "loss": 0.4264, + "step": 9852 + }, + { + "epoch": 0.27186595708722927, + "grad_norm": 0.0051060812547802925, + "learning_rate": 0.001, + "loss": 0.3555, + "step": 9853 + }, + { + "epoch": 0.2718935492882937, + "grad_norm": 0.024291977286338806, + "learning_rate": 0.001, + "loss": 0.3906, + "step": 9854 + }, + { + "epoch": 0.27192114148935803, + "grad_norm": 0.005702044349163771, + "learning_rate": 0.001, + "loss": 0.4261, + "step": 9855 + }, + { + "epoch": 0.2719487336904224, + "grad_norm": 0.0037687926087528467, + "learning_rate": 0.001, + "loss": 0.3845, + "step": 9856 + }, + { + "epoch": 0.2719763258914868, + "grad_norm": 0.007429871242493391, + "learning_rate": 0.001, + "loss": 0.4084, + "step": 9857 + }, + { + "epoch": 0.27200391809255114, + "grad_norm": 0.00363955763168633, + "learning_rate": 0.001, + "loss": 0.3385, + "step": 9858 + }, + { + "epoch": 0.2720315102936155, + "grad_norm": 0.00260857748799026, + "learning_rate": 0.001, + "loss": 0.4518, + "step": 9859 + }, + { + "epoch": 0.2720591024946799, + "grad_norm": 0.002581104403361678, + "learning_rate": 0.001, + "loss": 0.3907, + "step": 9860 + }, + { + "epoch": 0.27208669469574426, + "grad_norm": 0.0025525682140141726, + "learning_rate": 0.001, + "loss": 0.4144, + "step": 9861 + }, + { + "epoch": 0.2721142868968086, + "grad_norm": 0.00219151028431952, + "learning_rate": 0.001, + "loss": 0.381, + "step": 9862 + }, + { + "epoch": 0.27214187909787296, + "grad_norm": 0.0028392940293997526, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 9863 + }, + { + "epoch": 0.27216947129893737, + "grad_norm": 0.005825422238558531, + "learning_rate": 0.001, + "loss": 0.3668, + "step": 9864 + }, + { + "epoch": 0.2721970635000017, + "grad_norm": 0.00334284920245409, + "learning_rate": 0.001, + "loss": 0.3837, + "step": 9865 + }, + { + "epoch": 0.2722246557010661, + "grad_norm": 0.0023583846632391214, + "learning_rate": 0.001, + "loss": 0.4264, + "step": 9866 + }, + { + "epoch": 0.2722522479021305, + "grad_norm": 0.004898847080767155, + "learning_rate": 0.001, + "loss": 0.4439, + "step": 9867 + }, + { + "epoch": 0.27227984010319484, + "grad_norm": 0.004352132324129343, + "learning_rate": 0.001, + "loss": 0.3966, + "step": 9868 + }, + { + "epoch": 0.2723074323042592, + "grad_norm": 0.003136218059808016, + "learning_rate": 0.001, + "loss": 0.4208, + "step": 9869 + }, + { + "epoch": 0.2723350245053236, + "grad_norm": 0.003824204672127962, + "learning_rate": 0.001, + "loss": 0.3764, + "step": 9870 + }, + { + "epoch": 0.27236261670638795, + "grad_norm": 0.0033238916657865047, + "learning_rate": 0.001, + "loss": 0.4085, + "step": 9871 + }, + { + "epoch": 0.2723902089074523, + "grad_norm": 0.005290450528264046, + "learning_rate": 0.001, + "loss": 0.3666, + "step": 9872 + }, + { + "epoch": 0.27241780110851666, + "grad_norm": 0.0039006490260362625, + "learning_rate": 0.001, + "loss": 0.3767, + "step": 9873 + }, + { + "epoch": 0.27244539330958106, + "grad_norm": 0.006280507426708937, + "learning_rate": 0.001, + "loss": 0.4119, + "step": 9874 + }, + { + "epoch": 0.2724729855106454, + "grad_norm": 0.010091300122439861, + "learning_rate": 0.001, + "loss": 0.3789, + "step": 9875 + }, + { + "epoch": 0.27250057771170977, + "grad_norm": 0.012804842554032803, + "learning_rate": 0.001, + "loss": 0.3659, + "step": 9876 + }, + { + "epoch": 0.2725281699127742, + "grad_norm": 0.005947540979832411, + "learning_rate": 0.001, + "loss": 0.3872, + "step": 9877 + }, + { + "epoch": 0.27255576211383853, + "grad_norm": 0.007248189765959978, + "learning_rate": 0.001, + "loss": 0.4269, + "step": 9878 + }, + { + "epoch": 0.2725833543149029, + "grad_norm": 0.00791088119149208, + "learning_rate": 0.001, + "loss": 0.3778, + "step": 9879 + }, + { + "epoch": 0.2726109465159673, + "grad_norm": 0.011914456263184547, + "learning_rate": 0.001, + "loss": 0.415, + "step": 9880 + }, + { + "epoch": 0.27263853871703164, + "grad_norm": 0.005593923386186361, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 9881 + }, + { + "epoch": 0.272666130918096, + "grad_norm": 0.0036203390918672085, + "learning_rate": 0.001, + "loss": 0.4013, + "step": 9882 + }, + { + "epoch": 0.27269372311916035, + "grad_norm": 0.0030619618482887745, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 9883 + }, + { + "epoch": 0.27272131532022476, + "grad_norm": 0.002730242908000946, + "learning_rate": 0.001, + "loss": 0.3723, + "step": 9884 + }, + { + "epoch": 0.2727489075212891, + "grad_norm": 0.004023353569209576, + "learning_rate": 0.001, + "loss": 0.4212, + "step": 9885 + }, + { + "epoch": 0.27277649972235346, + "grad_norm": 0.0054342905059456825, + "learning_rate": 0.001, + "loss": 0.3853, + "step": 9886 + }, + { + "epoch": 0.27280409192341787, + "grad_norm": 0.004413217771798372, + "learning_rate": 0.001, + "loss": 0.3759, + "step": 9887 + }, + { + "epoch": 0.2728316841244822, + "grad_norm": 0.0039059999398887157, + "learning_rate": 0.001, + "loss": 0.4253, + "step": 9888 + }, + { + "epoch": 0.2728592763255466, + "grad_norm": 0.003089727135375142, + "learning_rate": 0.001, + "loss": 0.4385, + "step": 9889 + }, + { + "epoch": 0.2728868685266109, + "grad_norm": 0.002594609744846821, + "learning_rate": 0.001, + "loss": 0.3901, + "step": 9890 + }, + { + "epoch": 0.27291446072767533, + "grad_norm": 0.004234743770211935, + "learning_rate": 0.001, + "loss": 0.3582, + "step": 9891 + }, + { + "epoch": 0.2729420529287397, + "grad_norm": 0.002928930101916194, + "learning_rate": 0.001, + "loss": 0.3743, + "step": 9892 + }, + { + "epoch": 0.27296964512980404, + "grad_norm": 0.002950299996882677, + "learning_rate": 0.001, + "loss": 0.4439, + "step": 9893 + }, + { + "epoch": 0.27299723733086845, + "grad_norm": 0.002796403132379055, + "learning_rate": 0.001, + "loss": 0.4327, + "step": 9894 + }, + { + "epoch": 0.2730248295319328, + "grad_norm": 0.0047624544240534306, + "learning_rate": 0.001, + "loss": 0.416, + "step": 9895 + }, + { + "epoch": 0.27305242173299715, + "grad_norm": 0.002439835574477911, + "learning_rate": 0.001, + "loss": 0.4364, + "step": 9896 + }, + { + "epoch": 0.27308001393406156, + "grad_norm": 0.002326715039089322, + "learning_rate": 0.001, + "loss": 0.4155, + "step": 9897 + }, + { + "epoch": 0.2731076061351259, + "grad_norm": 0.00256901397369802, + "learning_rate": 0.001, + "loss": 0.4314, + "step": 9898 + }, + { + "epoch": 0.27313519833619027, + "grad_norm": 0.002579151652753353, + "learning_rate": 0.001, + "loss": 0.408, + "step": 9899 + }, + { + "epoch": 0.2731627905372546, + "grad_norm": 0.00309234787710011, + "learning_rate": 0.001, + "loss": 0.3471, + "step": 9900 + }, + { + "epoch": 0.273190382738319, + "grad_norm": 0.0036706968676298857, + "learning_rate": 0.001, + "loss": 0.3779, + "step": 9901 + }, + { + "epoch": 0.2732179749393834, + "grad_norm": 0.0024913379456847906, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 9902 + }, + { + "epoch": 0.27324556714044773, + "grad_norm": 0.004764418583363295, + "learning_rate": 0.001, + "loss": 0.432, + "step": 9903 + }, + { + "epoch": 0.27327315934151214, + "grad_norm": 0.0027252251747995615, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 9904 + }, + { + "epoch": 0.2733007515425765, + "grad_norm": 0.0033297077752649784, + "learning_rate": 0.001, + "loss": 0.4315, + "step": 9905 + }, + { + "epoch": 0.27332834374364084, + "grad_norm": 0.0033139032311737537, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 9906 + }, + { + "epoch": 0.27335593594470525, + "grad_norm": 0.004935343284159899, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 9907 + }, + { + "epoch": 0.2733835281457696, + "grad_norm": 0.0046791452914476395, + "learning_rate": 0.001, + "loss": 0.4204, + "step": 9908 + }, + { + "epoch": 0.27341112034683396, + "grad_norm": 0.004267543088644743, + "learning_rate": 0.001, + "loss": 0.4563, + "step": 9909 + }, + { + "epoch": 0.2734387125478983, + "grad_norm": 0.002623759675770998, + "learning_rate": 0.001, + "loss": 0.3976, + "step": 9910 + }, + { + "epoch": 0.2734663047489627, + "grad_norm": 0.002590791555121541, + "learning_rate": 0.001, + "loss": 0.3993, + "step": 9911 + }, + { + "epoch": 0.27349389695002707, + "grad_norm": 0.002841070294380188, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 9912 + }, + { + "epoch": 0.2735214891510914, + "grad_norm": 0.009606373496353626, + "learning_rate": 0.001, + "loss": 0.4181, + "step": 9913 + }, + { + "epoch": 0.27354908135215583, + "grad_norm": 0.0030742899980396032, + "learning_rate": 0.001, + "loss": 0.3942, + "step": 9914 + }, + { + "epoch": 0.2735766735532202, + "grad_norm": 0.00670670298859477, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 9915 + }, + { + "epoch": 0.27360426575428454, + "grad_norm": 0.002301518339663744, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 9916 + }, + { + "epoch": 0.27363185795534894, + "grad_norm": 0.00346144987270236, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 9917 + }, + { + "epoch": 0.2736594501564133, + "grad_norm": 0.0030207394156605005, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 9918 + }, + { + "epoch": 0.27368704235747765, + "grad_norm": 0.005349605809897184, + "learning_rate": 0.001, + "loss": 0.3826, + "step": 9919 + }, + { + "epoch": 0.273714634558542, + "grad_norm": 0.004348041955381632, + "learning_rate": 0.001, + "loss": 0.3966, + "step": 9920 + }, + { + "epoch": 0.2737422267596064, + "grad_norm": 0.0030986126512289047, + "learning_rate": 0.001, + "loss": 0.4256, + "step": 9921 + }, + { + "epoch": 0.27376981896067076, + "grad_norm": 0.0026239063590765, + "learning_rate": 0.001, + "loss": 0.3946, + "step": 9922 + }, + { + "epoch": 0.2737974111617351, + "grad_norm": 0.0023042093962430954, + "learning_rate": 0.001, + "loss": 0.3869, + "step": 9923 + }, + { + "epoch": 0.2738250033627995, + "grad_norm": 0.007671200204640627, + "learning_rate": 0.001, + "loss": 0.3721, + "step": 9924 + }, + { + "epoch": 0.2738525955638639, + "grad_norm": 0.00476741511374712, + "learning_rate": 0.001, + "loss": 0.367, + "step": 9925 + }, + { + "epoch": 0.27388018776492823, + "grad_norm": 0.004138735588639975, + "learning_rate": 0.001, + "loss": 0.4208, + "step": 9926 + }, + { + "epoch": 0.27390777996599264, + "grad_norm": 0.007656489033252001, + "learning_rate": 0.001, + "loss": 0.3586, + "step": 9927 + }, + { + "epoch": 0.273935372167057, + "grad_norm": 0.003876802045851946, + "learning_rate": 0.001, + "loss": 0.3793, + "step": 9928 + }, + { + "epoch": 0.27396296436812134, + "grad_norm": 0.010240191593766212, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 9929 + }, + { + "epoch": 0.2739905565691857, + "grad_norm": 0.005146909970790148, + "learning_rate": 0.001, + "loss": 0.3728, + "step": 9930 + }, + { + "epoch": 0.2740181487702501, + "grad_norm": 0.00523938424885273, + "learning_rate": 0.001, + "loss": 0.3844, + "step": 9931 + }, + { + "epoch": 0.27404574097131446, + "grad_norm": 0.00330050359480083, + "learning_rate": 0.001, + "loss": 0.3892, + "step": 9932 + }, + { + "epoch": 0.2740733331723788, + "grad_norm": 0.00420812563970685, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 9933 + }, + { + "epoch": 0.2741009253734432, + "grad_norm": 0.00954358745366335, + "learning_rate": 0.001, + "loss": 0.3833, + "step": 9934 + }, + { + "epoch": 0.27412851757450757, + "grad_norm": 0.005676385015249252, + "learning_rate": 0.001, + "loss": 0.401, + "step": 9935 + }, + { + "epoch": 0.2741561097755719, + "grad_norm": 0.00360414432361722, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 9936 + }, + { + "epoch": 0.27418370197663633, + "grad_norm": 0.002916174242272973, + "learning_rate": 0.001, + "loss": 0.4373, + "step": 9937 + }, + { + "epoch": 0.2742112941777007, + "grad_norm": 0.0041490960866212845, + "learning_rate": 0.001, + "loss": 0.3927, + "step": 9938 + }, + { + "epoch": 0.27423888637876503, + "grad_norm": 0.03103417344391346, + "learning_rate": 0.001, + "loss": 0.3987, + "step": 9939 + }, + { + "epoch": 0.2742664785798294, + "grad_norm": 0.00923039298504591, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 9940 + }, + { + "epoch": 0.2742940707808938, + "grad_norm": 0.0029965273570269346, + "learning_rate": 0.001, + "loss": 0.439, + "step": 9941 + }, + { + "epoch": 0.27432166298195815, + "grad_norm": 0.003088477300480008, + "learning_rate": 0.001, + "loss": 0.3813, + "step": 9942 + }, + { + "epoch": 0.2743492551830225, + "grad_norm": 0.0032554971985518932, + "learning_rate": 0.001, + "loss": 0.4156, + "step": 9943 + }, + { + "epoch": 0.2743768473840869, + "grad_norm": 0.004142871592193842, + "learning_rate": 0.001, + "loss": 0.4189, + "step": 9944 + }, + { + "epoch": 0.27440443958515126, + "grad_norm": 0.003104181494563818, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 9945 + }, + { + "epoch": 0.2744320317862156, + "grad_norm": 0.0025867957156151533, + "learning_rate": 0.001, + "loss": 0.4087, + "step": 9946 + }, + { + "epoch": 0.27445962398728, + "grad_norm": 0.002821350237354636, + "learning_rate": 0.001, + "loss": 0.4176, + "step": 9947 + }, + { + "epoch": 0.2744872161883444, + "grad_norm": 0.003803239669650793, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 9948 + }, + { + "epoch": 0.2745148083894087, + "grad_norm": 0.002869410440325737, + "learning_rate": 0.001, + "loss": 0.3895, + "step": 9949 + }, + { + "epoch": 0.2745424005904731, + "grad_norm": 0.004910988733172417, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 9950 + }, + { + "epoch": 0.2745699927915375, + "grad_norm": 0.00325010740198195, + "learning_rate": 0.001, + "loss": 0.4318, + "step": 9951 + }, + { + "epoch": 0.27459758499260184, + "grad_norm": 0.0021431315690279007, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 9952 + }, + { + "epoch": 0.2746251771936662, + "grad_norm": 0.003677117172628641, + "learning_rate": 0.001, + "loss": 0.4193, + "step": 9953 + }, + { + "epoch": 0.2746527693947306, + "grad_norm": 0.002277594292536378, + "learning_rate": 0.001, + "loss": 0.4206, + "step": 9954 + }, + { + "epoch": 0.27468036159579495, + "grad_norm": 0.004701568745076656, + "learning_rate": 0.001, + "loss": 0.3822, + "step": 9955 + }, + { + "epoch": 0.2747079537968593, + "grad_norm": 0.002484973520040512, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 9956 + }, + { + "epoch": 0.2747355459979237, + "grad_norm": 0.0033851531334221363, + "learning_rate": 0.001, + "loss": 0.3915, + "step": 9957 + }, + { + "epoch": 0.27476313819898807, + "grad_norm": 0.0025149055290967226, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 9958 + }, + { + "epoch": 0.2747907304000524, + "grad_norm": 0.0030154273845255375, + "learning_rate": 0.001, + "loss": 0.3872, + "step": 9959 + }, + { + "epoch": 0.27481832260111677, + "grad_norm": 0.010692945681512356, + "learning_rate": 0.001, + "loss": 0.3861, + "step": 9960 + }, + { + "epoch": 0.2748459148021812, + "grad_norm": 0.0041275848634541035, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 9961 + }, + { + "epoch": 0.27487350700324553, + "grad_norm": 0.0029004793614149094, + "learning_rate": 0.001, + "loss": 0.4236, + "step": 9962 + }, + { + "epoch": 0.2749010992043099, + "grad_norm": 0.009717054665088654, + "learning_rate": 0.001, + "loss": 0.38, + "step": 9963 + }, + { + "epoch": 0.2749286914053743, + "grad_norm": 0.00394978653639555, + "learning_rate": 0.001, + "loss": 0.3602, + "step": 9964 + }, + { + "epoch": 0.27495628360643865, + "grad_norm": 0.0029732300899922848, + "learning_rate": 0.001, + "loss": 0.4, + "step": 9965 + }, + { + "epoch": 0.274983875807503, + "grad_norm": 0.002955190371721983, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 9966 + }, + { + "epoch": 0.2750114680085674, + "grad_norm": 0.0038855448365211487, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 9967 + }, + { + "epoch": 0.27503906020963176, + "grad_norm": 0.002460125368088484, + "learning_rate": 0.001, + "loss": 0.4013, + "step": 9968 + }, + { + "epoch": 0.2750666524106961, + "grad_norm": 0.002936316654086113, + "learning_rate": 0.001, + "loss": 0.3468, + "step": 9969 + }, + { + "epoch": 0.27509424461176046, + "grad_norm": 0.003299242816865444, + "learning_rate": 0.001, + "loss": 0.4191, + "step": 9970 + }, + { + "epoch": 0.27512183681282487, + "grad_norm": 0.0037559715565294027, + "learning_rate": 0.001, + "loss": 0.362, + "step": 9971 + }, + { + "epoch": 0.2751494290138892, + "grad_norm": 0.0035624271258711815, + "learning_rate": 0.001, + "loss": 0.3944, + "step": 9972 + }, + { + "epoch": 0.2751770212149536, + "grad_norm": 0.00262457481585443, + "learning_rate": 0.001, + "loss": 0.3985, + "step": 9973 + }, + { + "epoch": 0.275204613416018, + "grad_norm": 0.0050546471029520035, + "learning_rate": 0.001, + "loss": 0.3629, + "step": 9974 + }, + { + "epoch": 0.27523220561708234, + "grad_norm": 0.004298574756830931, + "learning_rate": 0.001, + "loss": 0.4059, + "step": 9975 + }, + { + "epoch": 0.2752597978181467, + "grad_norm": 0.0042538209818303585, + "learning_rate": 0.001, + "loss": 0.402, + "step": 9976 + }, + { + "epoch": 0.27528739001921104, + "grad_norm": 0.003947499208152294, + "learning_rate": 0.001, + "loss": 0.3789, + "step": 9977 + }, + { + "epoch": 0.27531498222027545, + "grad_norm": 0.0038300196174532175, + "learning_rate": 0.001, + "loss": 0.3953, + "step": 9978 + }, + { + "epoch": 0.2753425744213398, + "grad_norm": 0.005689695011824369, + "learning_rate": 0.001, + "loss": 0.3856, + "step": 9979 + }, + { + "epoch": 0.27537016662240416, + "grad_norm": 0.0022208907175809145, + "learning_rate": 0.001, + "loss": 0.3766, + "step": 9980 + }, + { + "epoch": 0.27539775882346856, + "grad_norm": 0.0022134091705083847, + "learning_rate": 0.001, + "loss": 0.3945, + "step": 9981 + }, + { + "epoch": 0.2754253510245329, + "grad_norm": 0.0045527201145887375, + "learning_rate": 0.001, + "loss": 0.3707, + "step": 9982 + }, + { + "epoch": 0.27545294322559727, + "grad_norm": 0.004349110182374716, + "learning_rate": 0.001, + "loss": 0.4218, + "step": 9983 + }, + { + "epoch": 0.2754805354266617, + "grad_norm": 0.0028283181600272655, + "learning_rate": 0.001, + "loss": 0.4415, + "step": 9984 + }, + { + "epoch": 0.27550812762772603, + "grad_norm": 0.0024474281817674637, + "learning_rate": 0.001, + "loss": 0.368, + "step": 9985 + }, + { + "epoch": 0.2755357198287904, + "grad_norm": 0.0033072589430958033, + "learning_rate": 0.001, + "loss": 0.4011, + "step": 9986 + }, + { + "epoch": 0.27556331202985473, + "grad_norm": 0.002837040927261114, + "learning_rate": 0.001, + "loss": 0.369, + "step": 9987 + }, + { + "epoch": 0.27559090423091914, + "grad_norm": 0.014132903888821602, + "learning_rate": 0.001, + "loss": 0.3712, + "step": 9988 + }, + { + "epoch": 0.2756184964319835, + "grad_norm": 0.002668624045327306, + "learning_rate": 0.001, + "loss": 0.3964, + "step": 9989 + }, + { + "epoch": 0.27564608863304785, + "grad_norm": 0.003035599598661065, + "learning_rate": 0.001, + "loss": 0.3895, + "step": 9990 + }, + { + "epoch": 0.27567368083411226, + "grad_norm": 0.0033727851696312428, + "learning_rate": 0.001, + "loss": 0.402, + "step": 9991 + }, + { + "epoch": 0.2757012730351766, + "grad_norm": 0.0022625040728598833, + "learning_rate": 0.001, + "loss": 0.4287, + "step": 9992 + }, + { + "epoch": 0.27572886523624096, + "grad_norm": 0.004006414674222469, + "learning_rate": 0.001, + "loss": 0.3875, + "step": 9993 + }, + { + "epoch": 0.27575645743730537, + "grad_norm": 0.002454617992043495, + "learning_rate": 0.001, + "loss": 0.424, + "step": 9994 + }, + { + "epoch": 0.2757840496383697, + "grad_norm": 0.003350407350808382, + "learning_rate": 0.001, + "loss": 0.4022, + "step": 9995 + }, + { + "epoch": 0.2758116418394341, + "grad_norm": 0.004660541657358408, + "learning_rate": 0.001, + "loss": 0.3739, + "step": 9996 + }, + { + "epoch": 0.2758392340404984, + "grad_norm": 0.002644259948283434, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 9997 + }, + { + "epoch": 0.27586682624156283, + "grad_norm": 0.004398273769766092, + "learning_rate": 0.001, + "loss": 0.3655, + "step": 9998 + }, + { + "epoch": 0.2758944184426272, + "grad_norm": 0.013148190453648567, + "learning_rate": 0.001, + "loss": 0.3823, + "step": 9999 + }, + { + "epoch": 0.27592201064369154, + "grad_norm": 0.002878269413486123, + "learning_rate": 0.001, + "loss": 0.3966, + "step": 10000 + }, + { + "epoch": 0.27592201064369154, + "eval_runtime": 24.4086, + "eval_samples_per_second": 1.311, + "eval_steps_per_second": 0.164, + "step": 10000 + }, + { + "epoch": 0.27594960284475595, + "grad_norm": 0.0022266616579145193, + "learning_rate": 0.001, + "loss": 0.4605, + "step": 10001 + }, + { + "epoch": 0.2759771950458203, + "grad_norm": 0.0019374669063836336, + "learning_rate": 0.001, + "loss": 0.4417, + "step": 10002 + }, + { + "epoch": 0.27600478724688465, + "grad_norm": 0.0026133570354431868, + "learning_rate": 0.001, + "loss": 0.408, + "step": 10003 + }, + { + "epoch": 0.27603237944794906, + "grad_norm": 0.003653626423329115, + "learning_rate": 0.001, + "loss": 0.3964, + "step": 10004 + }, + { + "epoch": 0.2760599716490134, + "grad_norm": 0.002634751843288541, + "learning_rate": 0.001, + "loss": 0.3812, + "step": 10005 + }, + { + "epoch": 0.27608756385007777, + "grad_norm": 0.012561113573610783, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 10006 + }, + { + "epoch": 0.2761151560511421, + "grad_norm": 0.01040496677160263, + "learning_rate": 0.001, + "loss": 0.3894, + "step": 10007 + }, + { + "epoch": 0.2761427482522065, + "grad_norm": 0.005907583516091108, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 10008 + }, + { + "epoch": 0.2761703404532709, + "grad_norm": 0.0033373169135302305, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 10009 + }, + { + "epoch": 0.27619793265433523, + "grad_norm": 0.003724615555256605, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 10010 + }, + { + "epoch": 0.27622552485539964, + "grad_norm": 0.0035398202016949654, + "learning_rate": 0.001, + "loss": 0.403, + "step": 10011 + }, + { + "epoch": 0.276253117056464, + "grad_norm": 0.003467197297140956, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 10012 + }, + { + "epoch": 0.27628070925752835, + "grad_norm": 0.004464291967451572, + "learning_rate": 0.001, + "loss": 0.3839, + "step": 10013 + }, + { + "epoch": 0.27630830145859275, + "grad_norm": 0.0036106444895267487, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 10014 + }, + { + "epoch": 0.2763358936596571, + "grad_norm": 0.004417051561176777, + "learning_rate": 0.001, + "loss": 0.3908, + "step": 10015 + }, + { + "epoch": 0.27636348586072146, + "grad_norm": 0.0054782601073384285, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 10016 + }, + { + "epoch": 0.2763910780617858, + "grad_norm": 0.004640686791390181, + "learning_rate": 0.001, + "loss": 0.384, + "step": 10017 + }, + { + "epoch": 0.2764186702628502, + "grad_norm": 0.005945026874542236, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 10018 + }, + { + "epoch": 0.27644626246391457, + "grad_norm": 0.002607600996270776, + "learning_rate": 0.001, + "loss": 0.3993, + "step": 10019 + }, + { + "epoch": 0.2764738546649789, + "grad_norm": 0.0032611433416604996, + "learning_rate": 0.001, + "loss": 0.4207, + "step": 10020 + }, + { + "epoch": 0.27650144686604333, + "grad_norm": 0.0037529822438955307, + "learning_rate": 0.001, + "loss": 0.372, + "step": 10021 + }, + { + "epoch": 0.2765290390671077, + "grad_norm": 0.002805948257446289, + "learning_rate": 0.001, + "loss": 0.4327, + "step": 10022 + }, + { + "epoch": 0.27655663126817204, + "grad_norm": 0.003274905029684305, + "learning_rate": 0.001, + "loss": 0.4127, + "step": 10023 + }, + { + "epoch": 0.27658422346923645, + "grad_norm": 0.00208392646163702, + "learning_rate": 0.001, + "loss": 0.4326, + "step": 10024 + }, + { + "epoch": 0.2766118156703008, + "grad_norm": 0.007121518719941378, + "learning_rate": 0.001, + "loss": 0.3495, + "step": 10025 + }, + { + "epoch": 0.27663940787136515, + "grad_norm": 0.0040301973931491375, + "learning_rate": 0.001, + "loss": 0.4095, + "step": 10026 + }, + { + "epoch": 0.2766670000724295, + "grad_norm": 0.0036494045052677393, + "learning_rate": 0.001, + "loss": 0.3918, + "step": 10027 + }, + { + "epoch": 0.2766945922734939, + "grad_norm": 0.0036077816039323807, + "learning_rate": 0.001, + "loss": 0.3922, + "step": 10028 + }, + { + "epoch": 0.27672218447455826, + "grad_norm": 0.010453345254063606, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 10029 + }, + { + "epoch": 0.2767497766756226, + "grad_norm": 0.004115840420126915, + "learning_rate": 0.001, + "loss": 0.4289, + "step": 10030 + }, + { + "epoch": 0.276777368876687, + "grad_norm": 0.0033005299046635628, + "learning_rate": 0.001, + "loss": 0.4526, + "step": 10031 + }, + { + "epoch": 0.2768049610777514, + "grad_norm": 0.010283264331519604, + "learning_rate": 0.001, + "loss": 0.3695, + "step": 10032 + }, + { + "epoch": 0.27683255327881573, + "grad_norm": 0.0024423354770988226, + "learning_rate": 0.001, + "loss": 0.4325, + "step": 10033 + }, + { + "epoch": 0.27686014547988014, + "grad_norm": 0.002194769913330674, + "learning_rate": 0.001, + "loss": 0.3853, + "step": 10034 + }, + { + "epoch": 0.2768877376809445, + "grad_norm": 0.00277558621019125, + "learning_rate": 0.001, + "loss": 0.407, + "step": 10035 + }, + { + "epoch": 0.27691532988200884, + "grad_norm": 0.0023043700493872166, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 10036 + }, + { + "epoch": 0.2769429220830732, + "grad_norm": 0.0019729414489120245, + "learning_rate": 0.001, + "loss": 0.3824, + "step": 10037 + }, + { + "epoch": 0.2769705142841376, + "grad_norm": 0.0028671245090663433, + "learning_rate": 0.001, + "loss": 0.407, + "step": 10038 + }, + { + "epoch": 0.27699810648520196, + "grad_norm": 0.0039373342879116535, + "learning_rate": 0.001, + "loss": 0.4104, + "step": 10039 + }, + { + "epoch": 0.2770256986862663, + "grad_norm": 0.009182474575936794, + "learning_rate": 0.001, + "loss": 0.3806, + "step": 10040 + }, + { + "epoch": 0.2770532908873307, + "grad_norm": 0.0029172920621931553, + "learning_rate": 0.001, + "loss": 0.3708, + "step": 10041 + }, + { + "epoch": 0.27708088308839507, + "grad_norm": 0.0023737072478979826, + "learning_rate": 0.001, + "loss": 0.4056, + "step": 10042 + }, + { + "epoch": 0.2771084752894594, + "grad_norm": 0.002812950173392892, + "learning_rate": 0.001, + "loss": 0.3643, + "step": 10043 + }, + { + "epoch": 0.27713606749052383, + "grad_norm": 0.002222485141828656, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 10044 + }, + { + "epoch": 0.2771636596915882, + "grad_norm": 0.0037747088354080915, + "learning_rate": 0.001, + "loss": 0.4252, + "step": 10045 + }, + { + "epoch": 0.27719125189265253, + "grad_norm": 0.00997960101813078, + "learning_rate": 0.001, + "loss": 0.3705, + "step": 10046 + }, + { + "epoch": 0.2772188440937169, + "grad_norm": 0.0026916342321783304, + "learning_rate": 0.001, + "loss": 0.4323, + "step": 10047 + }, + { + "epoch": 0.2772464362947813, + "grad_norm": 0.002582460641860962, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 10048 + }, + { + "epoch": 0.27727402849584565, + "grad_norm": 0.0027102259919047356, + "learning_rate": 0.001, + "loss": 0.4339, + "step": 10049 + }, + { + "epoch": 0.27730162069691, + "grad_norm": 0.0034718411043286324, + "learning_rate": 0.001, + "loss": 0.4187, + "step": 10050 + }, + { + "epoch": 0.2773292128979744, + "grad_norm": 0.0027413431089371443, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 10051 + }, + { + "epoch": 0.27735680509903876, + "grad_norm": 0.003133729798719287, + "learning_rate": 0.001, + "loss": 0.4117, + "step": 10052 + }, + { + "epoch": 0.2773843973001031, + "grad_norm": 0.002180175157263875, + "learning_rate": 0.001, + "loss": 0.4286, + "step": 10053 + }, + { + "epoch": 0.2774119895011675, + "grad_norm": 0.0031354373786598444, + "learning_rate": 0.001, + "loss": 0.412, + "step": 10054 + }, + { + "epoch": 0.2774395817022319, + "grad_norm": 0.0018822277197614312, + "learning_rate": 0.001, + "loss": 0.3981, + "step": 10055 + }, + { + "epoch": 0.2774671739032962, + "grad_norm": 0.013822532258927822, + "learning_rate": 0.001, + "loss": 0.4179, + "step": 10056 + }, + { + "epoch": 0.2774947661043606, + "grad_norm": 0.0037403854075819254, + "learning_rate": 0.001, + "loss": 0.3872, + "step": 10057 + }, + { + "epoch": 0.277522358305425, + "grad_norm": 0.005285004619508982, + "learning_rate": 0.001, + "loss": 0.3919, + "step": 10058 + }, + { + "epoch": 0.27754995050648934, + "grad_norm": 0.0025517537724226713, + "learning_rate": 0.001, + "loss": 0.4253, + "step": 10059 + }, + { + "epoch": 0.2775775427075537, + "grad_norm": 0.005149201024323702, + "learning_rate": 0.001, + "loss": 0.3967, + "step": 10060 + }, + { + "epoch": 0.2776051349086181, + "grad_norm": 0.012182818725705147, + "learning_rate": 0.001, + "loss": 0.3998, + "step": 10061 + }, + { + "epoch": 0.27763272710968245, + "grad_norm": 0.003710835939273238, + "learning_rate": 0.001, + "loss": 0.3726, + "step": 10062 + }, + { + "epoch": 0.2776603193107468, + "grad_norm": 0.005408014170825481, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 10063 + }, + { + "epoch": 0.2776879115118112, + "grad_norm": 0.0025339778512716293, + "learning_rate": 0.001, + "loss": 0.3499, + "step": 10064 + }, + { + "epoch": 0.27771550371287557, + "grad_norm": 0.0028167536947876215, + "learning_rate": 0.001, + "loss": 0.379, + "step": 10065 + }, + { + "epoch": 0.2777430959139399, + "grad_norm": 0.0029440028592944145, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 10066 + }, + { + "epoch": 0.27777068811500427, + "grad_norm": 0.0024950318038463593, + "learning_rate": 0.001, + "loss": 0.3861, + "step": 10067 + }, + { + "epoch": 0.2777982803160687, + "grad_norm": 0.002304122317582369, + "learning_rate": 0.001, + "loss": 0.4027, + "step": 10068 + }, + { + "epoch": 0.27782587251713303, + "grad_norm": 0.003218509955331683, + "learning_rate": 0.001, + "loss": 0.407, + "step": 10069 + }, + { + "epoch": 0.2778534647181974, + "grad_norm": 0.003081993665546179, + "learning_rate": 0.001, + "loss": 0.4053, + "step": 10070 + }, + { + "epoch": 0.2778810569192618, + "grad_norm": 0.003274043556302786, + "learning_rate": 0.001, + "loss": 0.3837, + "step": 10071 + }, + { + "epoch": 0.27790864912032615, + "grad_norm": 0.011977877467870712, + "learning_rate": 0.001, + "loss": 0.4057, + "step": 10072 + }, + { + "epoch": 0.2779362413213905, + "grad_norm": 0.004248856566846371, + "learning_rate": 0.001, + "loss": 0.4282, + "step": 10073 + }, + { + "epoch": 0.27796383352245485, + "grad_norm": 0.0030563895124942064, + "learning_rate": 0.001, + "loss": 0.3861, + "step": 10074 + }, + { + "epoch": 0.27799142572351926, + "grad_norm": 0.0027819809038192034, + "learning_rate": 0.001, + "loss": 0.3906, + "step": 10075 + }, + { + "epoch": 0.2780190179245836, + "grad_norm": 0.003203164553269744, + "learning_rate": 0.001, + "loss": 0.4168, + "step": 10076 + }, + { + "epoch": 0.27804661012564796, + "grad_norm": 0.003680006368085742, + "learning_rate": 0.001, + "loss": 0.3929, + "step": 10077 + }, + { + "epoch": 0.27807420232671237, + "grad_norm": 0.0054307919926941395, + "learning_rate": 0.001, + "loss": 0.4098, + "step": 10078 + }, + { + "epoch": 0.2781017945277767, + "grad_norm": 0.00977605115622282, + "learning_rate": 0.001, + "loss": 0.4088, + "step": 10079 + }, + { + "epoch": 0.2781293867288411, + "grad_norm": 0.003044104902073741, + "learning_rate": 0.001, + "loss": 0.4212, + "step": 10080 + }, + { + "epoch": 0.2781569789299055, + "grad_norm": 0.009494941681623459, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 10081 + }, + { + "epoch": 0.27818457113096984, + "grad_norm": 0.04275614395737648, + "learning_rate": 0.001, + "loss": 0.3941, + "step": 10082 + }, + { + "epoch": 0.2782121633320342, + "grad_norm": 0.0026185614988207817, + "learning_rate": 0.001, + "loss": 0.4145, + "step": 10083 + }, + { + "epoch": 0.27823975553309854, + "grad_norm": 0.003999069333076477, + "learning_rate": 0.001, + "loss": 0.4262, + "step": 10084 + }, + { + "epoch": 0.27826734773416295, + "grad_norm": 0.00245444243773818, + "learning_rate": 0.001, + "loss": 0.386, + "step": 10085 + }, + { + "epoch": 0.2782949399352273, + "grad_norm": 0.00522113312035799, + "learning_rate": 0.001, + "loss": 0.4011, + "step": 10086 + }, + { + "epoch": 0.27832253213629166, + "grad_norm": 0.0037140315398573875, + "learning_rate": 0.001, + "loss": 0.3612, + "step": 10087 + }, + { + "epoch": 0.27835012433735606, + "grad_norm": 0.0034693137276917696, + "learning_rate": 0.001, + "loss": 0.4073, + "step": 10088 + }, + { + "epoch": 0.2783777165384204, + "grad_norm": 0.00494216475635767, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 10089 + }, + { + "epoch": 0.27840530873948477, + "grad_norm": 0.0024820046965032816, + "learning_rate": 0.001, + "loss": 0.4066, + "step": 10090 + }, + { + "epoch": 0.2784329009405492, + "grad_norm": 0.0024154249113053083, + "learning_rate": 0.001, + "loss": 0.3744, + "step": 10091 + }, + { + "epoch": 0.27846049314161353, + "grad_norm": 0.0025213605258613825, + "learning_rate": 0.001, + "loss": 0.3724, + "step": 10092 + }, + { + "epoch": 0.2784880853426779, + "grad_norm": 0.003560342825949192, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 10093 + }, + { + "epoch": 0.27851567754374223, + "grad_norm": 0.0023579909466207027, + "learning_rate": 0.001, + "loss": 0.3771, + "step": 10094 + }, + { + "epoch": 0.27854326974480664, + "grad_norm": 0.0032963070552796125, + "learning_rate": 0.001, + "loss": 0.3797, + "step": 10095 + }, + { + "epoch": 0.278570861945871, + "grad_norm": 0.0023323865607380867, + "learning_rate": 0.001, + "loss": 0.4111, + "step": 10096 + }, + { + "epoch": 0.27859845414693535, + "grad_norm": 0.0035246131010353565, + "learning_rate": 0.001, + "loss": 0.3901, + "step": 10097 + }, + { + "epoch": 0.27862604634799976, + "grad_norm": 0.004129378125071526, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 10098 + }, + { + "epoch": 0.2786536385490641, + "grad_norm": 0.0030813373159617186, + "learning_rate": 0.001, + "loss": 0.3724, + "step": 10099 + }, + { + "epoch": 0.27868123075012846, + "grad_norm": 0.002692109439522028, + "learning_rate": 0.001, + "loss": 0.4205, + "step": 10100 + }, + { + "epoch": 0.27870882295119287, + "grad_norm": 0.0024737734347581863, + "learning_rate": 0.001, + "loss": 0.4175, + "step": 10101 + }, + { + "epoch": 0.2787364151522572, + "grad_norm": 0.0027061791624873877, + "learning_rate": 0.001, + "loss": 0.3762, + "step": 10102 + }, + { + "epoch": 0.2787640073533216, + "grad_norm": 0.002315426943823695, + "learning_rate": 0.001, + "loss": 0.3813, + "step": 10103 + }, + { + "epoch": 0.2787915995543859, + "grad_norm": 0.005130481906235218, + "learning_rate": 0.001, + "loss": 0.3783, + "step": 10104 + }, + { + "epoch": 0.27881919175545034, + "grad_norm": 0.006558484397828579, + "learning_rate": 0.001, + "loss": 0.4091, + "step": 10105 + }, + { + "epoch": 0.2788467839565147, + "grad_norm": 0.004167409613728523, + "learning_rate": 0.001, + "loss": 0.3981, + "step": 10106 + }, + { + "epoch": 0.27887437615757904, + "grad_norm": 0.0057708099484443665, + "learning_rate": 0.001, + "loss": 0.3973, + "step": 10107 + }, + { + "epoch": 0.27890196835864345, + "grad_norm": 0.0028495537117123604, + "learning_rate": 0.001, + "loss": 0.3616, + "step": 10108 + }, + { + "epoch": 0.2789295605597078, + "grad_norm": 0.002703807782381773, + "learning_rate": 0.001, + "loss": 0.4447, + "step": 10109 + }, + { + "epoch": 0.27895715276077215, + "grad_norm": 0.0030382112599909306, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 10110 + }, + { + "epoch": 0.27898474496183656, + "grad_norm": 0.0030904843006283045, + "learning_rate": 0.001, + "loss": 0.4234, + "step": 10111 + }, + { + "epoch": 0.2790123371629009, + "grad_norm": 0.003971985075622797, + "learning_rate": 0.001, + "loss": 0.4184, + "step": 10112 + }, + { + "epoch": 0.27903992936396527, + "grad_norm": 0.002467118203639984, + "learning_rate": 0.001, + "loss": 0.4372, + "step": 10113 + }, + { + "epoch": 0.2790675215650296, + "grad_norm": 0.006953855976462364, + "learning_rate": 0.001, + "loss": 0.4285, + "step": 10114 + }, + { + "epoch": 0.279095113766094, + "grad_norm": 0.0055779386311769485, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 10115 + }, + { + "epoch": 0.2791227059671584, + "grad_norm": 0.004655706230551004, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 10116 + }, + { + "epoch": 0.27915029816822273, + "grad_norm": 0.004876443184912205, + "learning_rate": 0.001, + "loss": 0.3755, + "step": 10117 + }, + { + "epoch": 0.27917789036928714, + "grad_norm": 0.004932073410600424, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 10118 + }, + { + "epoch": 0.2792054825703515, + "grad_norm": 0.007771969772875309, + "learning_rate": 0.001, + "loss": 0.4056, + "step": 10119 + }, + { + "epoch": 0.27923307477141585, + "grad_norm": 0.005660747177898884, + "learning_rate": 0.001, + "loss": 0.3796, + "step": 10120 + }, + { + "epoch": 0.27926066697248025, + "grad_norm": 0.0034438660368323326, + "learning_rate": 0.001, + "loss": 0.4038, + "step": 10121 + }, + { + "epoch": 0.2792882591735446, + "grad_norm": 0.003013996873050928, + "learning_rate": 0.001, + "loss": 0.444, + "step": 10122 + }, + { + "epoch": 0.27931585137460896, + "grad_norm": 0.0033822916448116302, + "learning_rate": 0.001, + "loss": 0.3583, + "step": 10123 + }, + { + "epoch": 0.2793434435756733, + "grad_norm": 0.0029740110039711, + "learning_rate": 0.001, + "loss": 0.3905, + "step": 10124 + }, + { + "epoch": 0.2793710357767377, + "grad_norm": 0.005846134852617979, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 10125 + }, + { + "epoch": 0.27939862797780207, + "grad_norm": 0.0038693509995937347, + "learning_rate": 0.001, + "loss": 0.4188, + "step": 10126 + }, + { + "epoch": 0.2794262201788664, + "grad_norm": 0.023018015548586845, + "learning_rate": 0.001, + "loss": 0.4024, + "step": 10127 + }, + { + "epoch": 0.27945381237993083, + "grad_norm": 0.004444723483175039, + "learning_rate": 0.001, + "loss": 0.3736, + "step": 10128 + }, + { + "epoch": 0.2794814045809952, + "grad_norm": 0.00677674962207675, + "learning_rate": 0.001, + "loss": 0.3877, + "step": 10129 + }, + { + "epoch": 0.27950899678205954, + "grad_norm": 0.004375552758574486, + "learning_rate": 0.001, + "loss": 0.4132, + "step": 10130 + }, + { + "epoch": 0.27953658898312395, + "grad_norm": 0.003335924819111824, + "learning_rate": 0.001, + "loss": 0.3811, + "step": 10131 + }, + { + "epoch": 0.2795641811841883, + "grad_norm": 0.0037330735940486193, + "learning_rate": 0.001, + "loss": 0.4207, + "step": 10132 + }, + { + "epoch": 0.27959177338525265, + "grad_norm": 0.004045455250889063, + "learning_rate": 0.001, + "loss": 0.3541, + "step": 10133 + }, + { + "epoch": 0.279619365586317, + "grad_norm": 0.004944518208503723, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 10134 + }, + { + "epoch": 0.2796469577873814, + "grad_norm": 0.003303113393485546, + "learning_rate": 0.001, + "loss": 0.4458, + "step": 10135 + }, + { + "epoch": 0.27967454998844576, + "grad_norm": 0.002501634182408452, + "learning_rate": 0.001, + "loss": 0.4063, + "step": 10136 + }, + { + "epoch": 0.2797021421895101, + "grad_norm": 0.008220963180065155, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 10137 + }, + { + "epoch": 0.2797297343905745, + "grad_norm": 0.015138577669858932, + "learning_rate": 0.001, + "loss": 0.4192, + "step": 10138 + }, + { + "epoch": 0.2797573265916389, + "grad_norm": 0.00455261766910553, + "learning_rate": 0.001, + "loss": 0.377, + "step": 10139 + }, + { + "epoch": 0.27978491879270323, + "grad_norm": 0.003458748571574688, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 10140 + }, + { + "epoch": 0.27981251099376764, + "grad_norm": 0.0061928508803248405, + "learning_rate": 0.001, + "loss": 0.4298, + "step": 10141 + }, + { + "epoch": 0.279840103194832, + "grad_norm": 0.004049024078994989, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 10142 + }, + { + "epoch": 0.27986769539589634, + "grad_norm": 0.0035144255962222815, + "learning_rate": 0.001, + "loss": 0.404, + "step": 10143 + }, + { + "epoch": 0.2798952875969607, + "grad_norm": 0.004885702393949032, + "learning_rate": 0.001, + "loss": 0.3764, + "step": 10144 + }, + { + "epoch": 0.2799228797980251, + "grad_norm": 0.004949494265019894, + "learning_rate": 0.001, + "loss": 0.3898, + "step": 10145 + }, + { + "epoch": 0.27995047199908946, + "grad_norm": 0.003847777610644698, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 10146 + }, + { + "epoch": 0.2799780642001538, + "grad_norm": 0.0029670281801372766, + "learning_rate": 0.001, + "loss": 0.388, + "step": 10147 + }, + { + "epoch": 0.2800056564012182, + "grad_norm": 0.0038127326406538486, + "learning_rate": 0.001, + "loss": 0.3912, + "step": 10148 + }, + { + "epoch": 0.28003324860228257, + "grad_norm": 0.002967024687677622, + "learning_rate": 0.001, + "loss": 0.3976, + "step": 10149 + }, + { + "epoch": 0.2800608408033469, + "grad_norm": 0.004500363487750292, + "learning_rate": 0.001, + "loss": 0.4126, + "step": 10150 + }, + { + "epoch": 0.28008843300441133, + "grad_norm": 0.0036577654536813498, + "learning_rate": 0.001, + "loss": 0.3864, + "step": 10151 + }, + { + "epoch": 0.2801160252054757, + "grad_norm": 0.00338082667440176, + "learning_rate": 0.001, + "loss": 0.4216, + "step": 10152 + }, + { + "epoch": 0.28014361740654004, + "grad_norm": 0.004477594047784805, + "learning_rate": 0.001, + "loss": 0.3779, + "step": 10153 + }, + { + "epoch": 0.2801712096076044, + "grad_norm": 0.005331623367965221, + "learning_rate": 0.001, + "loss": 0.3674, + "step": 10154 + }, + { + "epoch": 0.2801988018086688, + "grad_norm": 0.0034174472093582153, + "learning_rate": 0.001, + "loss": 0.3847, + "step": 10155 + }, + { + "epoch": 0.28022639400973315, + "grad_norm": 0.004699235316365957, + "learning_rate": 0.001, + "loss": 0.3842, + "step": 10156 + }, + { + "epoch": 0.2802539862107975, + "grad_norm": 0.0027989267837256193, + "learning_rate": 0.001, + "loss": 0.3757, + "step": 10157 + }, + { + "epoch": 0.2802815784118619, + "grad_norm": 0.003362043295055628, + "learning_rate": 0.001, + "loss": 0.3973, + "step": 10158 + }, + { + "epoch": 0.28030917061292626, + "grad_norm": 0.0024922748561948538, + "learning_rate": 0.001, + "loss": 0.4422, + "step": 10159 + }, + { + "epoch": 0.2803367628139906, + "grad_norm": 0.0043379804119467735, + "learning_rate": 0.001, + "loss": 0.391, + "step": 10160 + }, + { + "epoch": 0.280364355015055, + "grad_norm": 0.0037757758982479572, + "learning_rate": 0.001, + "loss": 0.4115, + "step": 10161 + }, + { + "epoch": 0.2803919472161194, + "grad_norm": 0.003047212492674589, + "learning_rate": 0.001, + "loss": 0.4139, + "step": 10162 + }, + { + "epoch": 0.2804195394171837, + "grad_norm": 0.0027652375865727663, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 10163 + }, + { + "epoch": 0.2804471316182481, + "grad_norm": 0.0034420695155858994, + "learning_rate": 0.001, + "loss": 0.3819, + "step": 10164 + }, + { + "epoch": 0.2804747238193125, + "grad_norm": 0.0024248689878731966, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 10165 + }, + { + "epoch": 0.28050231602037684, + "grad_norm": 0.0045005762949585915, + "learning_rate": 0.001, + "loss": 0.3774, + "step": 10166 + }, + { + "epoch": 0.2805299082214412, + "grad_norm": 0.002247242256999016, + "learning_rate": 0.001, + "loss": 0.409, + "step": 10167 + }, + { + "epoch": 0.2805575004225056, + "grad_norm": 0.004416101146489382, + "learning_rate": 0.001, + "loss": 0.4311, + "step": 10168 + }, + { + "epoch": 0.28058509262356995, + "grad_norm": 0.003067211015149951, + "learning_rate": 0.001, + "loss": 0.3713, + "step": 10169 + }, + { + "epoch": 0.2806126848246343, + "grad_norm": 0.0030722361989319324, + "learning_rate": 0.001, + "loss": 0.446, + "step": 10170 + }, + { + "epoch": 0.28064027702569866, + "grad_norm": 0.003344991710036993, + "learning_rate": 0.001, + "loss": 0.3825, + "step": 10171 + }, + { + "epoch": 0.28066786922676307, + "grad_norm": 0.0028220931999385357, + "learning_rate": 0.001, + "loss": 0.4381, + "step": 10172 + }, + { + "epoch": 0.2806954614278274, + "grad_norm": 0.0032366979867219925, + "learning_rate": 0.001, + "loss": 0.3769, + "step": 10173 + }, + { + "epoch": 0.28072305362889177, + "grad_norm": 0.004435013514012098, + "learning_rate": 0.001, + "loss": 0.41, + "step": 10174 + }, + { + "epoch": 0.2807506458299562, + "grad_norm": 0.002561743138357997, + "learning_rate": 0.001, + "loss": 0.4298, + "step": 10175 + }, + { + "epoch": 0.28077823803102053, + "grad_norm": 0.0032213281374424696, + "learning_rate": 0.001, + "loss": 0.4292, + "step": 10176 + }, + { + "epoch": 0.2808058302320849, + "grad_norm": 0.0033837456721812487, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 10177 + }, + { + "epoch": 0.2808334224331493, + "grad_norm": 0.0026573874056339264, + "learning_rate": 0.001, + "loss": 0.4523, + "step": 10178 + }, + { + "epoch": 0.28086101463421365, + "grad_norm": 0.004164142068475485, + "learning_rate": 0.001, + "loss": 0.3699, + "step": 10179 + }, + { + "epoch": 0.280888606835278, + "grad_norm": 0.005075597669929266, + "learning_rate": 0.001, + "loss": 0.3859, + "step": 10180 + }, + { + "epoch": 0.28091619903634235, + "grad_norm": 0.007850431837141514, + "learning_rate": 0.001, + "loss": 0.3754, + "step": 10181 + }, + { + "epoch": 0.28094379123740676, + "grad_norm": 0.0037918195594102144, + "learning_rate": 0.001, + "loss": 0.4183, + "step": 10182 + }, + { + "epoch": 0.2809713834384711, + "grad_norm": 0.004340451210737228, + "learning_rate": 0.001, + "loss": 0.399, + "step": 10183 + }, + { + "epoch": 0.28099897563953546, + "grad_norm": 0.003013703739270568, + "learning_rate": 0.001, + "loss": 0.4117, + "step": 10184 + }, + { + "epoch": 0.2810265678405999, + "grad_norm": 0.004570307210087776, + "learning_rate": 0.001, + "loss": 0.3913, + "step": 10185 + }, + { + "epoch": 0.2810541600416642, + "grad_norm": 0.0028304762672632933, + "learning_rate": 0.001, + "loss": 0.3712, + "step": 10186 + }, + { + "epoch": 0.2810817522427286, + "grad_norm": 0.003872218308970332, + "learning_rate": 0.001, + "loss": 0.3755, + "step": 10187 + }, + { + "epoch": 0.281109344443793, + "grad_norm": 0.003236171556636691, + "learning_rate": 0.001, + "loss": 0.4066, + "step": 10188 + }, + { + "epoch": 0.28113693664485734, + "grad_norm": 0.00254503614269197, + "learning_rate": 0.001, + "loss": 0.3824, + "step": 10189 + }, + { + "epoch": 0.2811645288459217, + "grad_norm": 0.0019429237581789494, + "learning_rate": 0.001, + "loss": 0.3883, + "step": 10190 + }, + { + "epoch": 0.28119212104698604, + "grad_norm": 0.003968310542404652, + "learning_rate": 0.001, + "loss": 0.4173, + "step": 10191 + }, + { + "epoch": 0.28121971324805045, + "grad_norm": 0.0034456211142241955, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 10192 + }, + { + "epoch": 0.2812473054491148, + "grad_norm": 0.00856733787804842, + "learning_rate": 0.001, + "loss": 0.3771, + "step": 10193 + }, + { + "epoch": 0.28127489765017916, + "grad_norm": 0.00205041142180562, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 10194 + }, + { + "epoch": 0.28130248985124356, + "grad_norm": 0.0054842522367835045, + "learning_rate": 0.001, + "loss": 0.3786, + "step": 10195 + }, + { + "epoch": 0.2813300820523079, + "grad_norm": 0.0026425907853990793, + "learning_rate": 0.001, + "loss": 0.415, + "step": 10196 + }, + { + "epoch": 0.28135767425337227, + "grad_norm": 0.0030602633487433195, + "learning_rate": 0.001, + "loss": 0.4006, + "step": 10197 + }, + { + "epoch": 0.2813852664544367, + "grad_norm": 0.00611920328810811, + "learning_rate": 0.001, + "loss": 0.3823, + "step": 10198 + }, + { + "epoch": 0.28141285865550103, + "grad_norm": 0.0028095582965761423, + "learning_rate": 0.001, + "loss": 0.4303, + "step": 10199 + }, + { + "epoch": 0.2814404508565654, + "grad_norm": 0.002753883833065629, + "learning_rate": 0.001, + "loss": 0.408, + "step": 10200 + }, + { + "epoch": 0.28146804305762974, + "grad_norm": 0.0056452443823218346, + "learning_rate": 0.001, + "loss": 0.3858, + "step": 10201 + }, + { + "epoch": 0.28149563525869414, + "grad_norm": 0.004540273919701576, + "learning_rate": 0.001, + "loss": 0.4322, + "step": 10202 + }, + { + "epoch": 0.2815232274597585, + "grad_norm": 0.0029490333981812, + "learning_rate": 0.001, + "loss": 0.4101, + "step": 10203 + }, + { + "epoch": 0.28155081966082285, + "grad_norm": 0.0022447453811764717, + "learning_rate": 0.001, + "loss": 0.4199, + "step": 10204 + }, + { + "epoch": 0.28157841186188726, + "grad_norm": 0.004265769850462675, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 10205 + }, + { + "epoch": 0.2816060040629516, + "grad_norm": 0.007797228638082743, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 10206 + }, + { + "epoch": 0.28163359626401596, + "grad_norm": 0.0029594229999929667, + "learning_rate": 0.001, + "loss": 0.3981, + "step": 10207 + }, + { + "epoch": 0.28166118846508037, + "grad_norm": 0.006314094644039869, + "learning_rate": 0.001, + "loss": 0.3699, + "step": 10208 + }, + { + "epoch": 0.2816887806661447, + "grad_norm": 0.002639149548485875, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 10209 + }, + { + "epoch": 0.2817163728672091, + "grad_norm": 0.0027635907754302025, + "learning_rate": 0.001, + "loss": 0.4056, + "step": 10210 + }, + { + "epoch": 0.2817439650682734, + "grad_norm": 0.006702120881527662, + "learning_rate": 0.001, + "loss": 0.3668, + "step": 10211 + }, + { + "epoch": 0.28177155726933784, + "grad_norm": 0.008459938690066338, + "learning_rate": 0.001, + "loss": 0.4373, + "step": 10212 + }, + { + "epoch": 0.2817991494704022, + "grad_norm": 0.005659305490553379, + "learning_rate": 0.001, + "loss": 0.3908, + "step": 10213 + }, + { + "epoch": 0.28182674167146654, + "grad_norm": 0.002557772444561124, + "learning_rate": 0.001, + "loss": 0.4227, + "step": 10214 + }, + { + "epoch": 0.28185433387253095, + "grad_norm": 0.0043907626532018185, + "learning_rate": 0.001, + "loss": 0.3726, + "step": 10215 + }, + { + "epoch": 0.2818819260735953, + "grad_norm": 0.004590220283716917, + "learning_rate": 0.001, + "loss": 0.4098, + "step": 10216 + }, + { + "epoch": 0.28190951827465965, + "grad_norm": 0.05343657732009888, + "learning_rate": 0.001, + "loss": 0.4197, + "step": 10217 + }, + { + "epoch": 0.28193711047572406, + "grad_norm": 0.008130939677357674, + "learning_rate": 0.001, + "loss": 0.3902, + "step": 10218 + }, + { + "epoch": 0.2819647026767884, + "grad_norm": 0.002707968931645155, + "learning_rate": 0.001, + "loss": 0.3788, + "step": 10219 + }, + { + "epoch": 0.28199229487785277, + "grad_norm": 0.002904132939875126, + "learning_rate": 0.001, + "loss": 0.3735, + "step": 10220 + }, + { + "epoch": 0.2820198870789171, + "grad_norm": 0.002991893794387579, + "learning_rate": 0.001, + "loss": 0.419, + "step": 10221 + }, + { + "epoch": 0.28204747927998153, + "grad_norm": 0.0030220167245715857, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 10222 + }, + { + "epoch": 0.2820750714810459, + "grad_norm": 0.002117662690579891, + "learning_rate": 0.001, + "loss": 0.4007, + "step": 10223 + }, + { + "epoch": 0.28210266368211023, + "grad_norm": 0.003019734052941203, + "learning_rate": 0.001, + "loss": 0.3892, + "step": 10224 + }, + { + "epoch": 0.28213025588317464, + "grad_norm": 0.003011964727193117, + "learning_rate": 0.001, + "loss": 0.3953, + "step": 10225 + }, + { + "epoch": 0.282157848084239, + "grad_norm": 0.0034174530301243067, + "learning_rate": 0.001, + "loss": 0.4179, + "step": 10226 + }, + { + "epoch": 0.28218544028530335, + "grad_norm": 0.0024363279808312654, + "learning_rate": 0.001, + "loss": 0.3984, + "step": 10227 + }, + { + "epoch": 0.28221303248636775, + "grad_norm": 0.002497728681191802, + "learning_rate": 0.001, + "loss": 0.4095, + "step": 10228 + }, + { + "epoch": 0.2822406246874321, + "grad_norm": 0.0034284652210772038, + "learning_rate": 0.001, + "loss": 0.3882, + "step": 10229 + }, + { + "epoch": 0.28226821688849646, + "grad_norm": 0.002253099577501416, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 10230 + }, + { + "epoch": 0.2822958090895608, + "grad_norm": 0.003588828956708312, + "learning_rate": 0.001, + "loss": 0.3769, + "step": 10231 + }, + { + "epoch": 0.2823234012906252, + "grad_norm": 0.0027390895411372185, + "learning_rate": 0.001, + "loss": 0.3883, + "step": 10232 + }, + { + "epoch": 0.2823509934916896, + "grad_norm": 0.002828077645972371, + "learning_rate": 0.001, + "loss": 0.3915, + "step": 10233 + }, + { + "epoch": 0.2823785856927539, + "grad_norm": 0.006078117527067661, + "learning_rate": 0.001, + "loss": 0.3746, + "step": 10234 + }, + { + "epoch": 0.28240617789381833, + "grad_norm": 0.004316994454711676, + "learning_rate": 0.001, + "loss": 0.3866, + "step": 10235 + }, + { + "epoch": 0.2824337700948827, + "grad_norm": 0.0036562427412718534, + "learning_rate": 0.001, + "loss": 0.4069, + "step": 10236 + }, + { + "epoch": 0.28246136229594704, + "grad_norm": 0.004819520283490419, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 10237 + }, + { + "epoch": 0.28248895449701145, + "grad_norm": 0.0036955669056624174, + "learning_rate": 0.001, + "loss": 0.376, + "step": 10238 + }, + { + "epoch": 0.2825165466980758, + "grad_norm": 0.004412720445543528, + "learning_rate": 0.001, + "loss": 0.4036, + "step": 10239 + }, + { + "epoch": 0.28254413889914015, + "grad_norm": 0.0021497290581464767, + "learning_rate": 0.001, + "loss": 0.4165, + "step": 10240 + }, + { + "epoch": 0.2825717311002045, + "grad_norm": 0.003762908047065139, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 10241 + }, + { + "epoch": 0.2825993233012689, + "grad_norm": 0.0028142263181507587, + "learning_rate": 0.001, + "loss": 0.3492, + "step": 10242 + }, + { + "epoch": 0.28262691550233326, + "grad_norm": 0.002374440897256136, + "learning_rate": 0.001, + "loss": 0.4136, + "step": 10243 + }, + { + "epoch": 0.2826545077033976, + "grad_norm": 0.002582703484222293, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 10244 + }, + { + "epoch": 0.282682099904462, + "grad_norm": 0.003294025780633092, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 10245 + }, + { + "epoch": 0.2827096921055264, + "grad_norm": 0.0025567803531885147, + "learning_rate": 0.001, + "loss": 0.4282, + "step": 10246 + }, + { + "epoch": 0.28273728430659073, + "grad_norm": 0.002444926183670759, + "learning_rate": 0.001, + "loss": 0.4432, + "step": 10247 + }, + { + "epoch": 0.28276487650765514, + "grad_norm": 0.002646154025569558, + "learning_rate": 0.001, + "loss": 0.404, + "step": 10248 + }, + { + "epoch": 0.2827924687087195, + "grad_norm": 0.003058774396777153, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 10249 + }, + { + "epoch": 0.28282006090978384, + "grad_norm": 0.004560539964586496, + "learning_rate": 0.001, + "loss": 0.3797, + "step": 10250 + }, + { + "epoch": 0.2828476531108482, + "grad_norm": 0.005327416118234396, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 10251 + }, + { + "epoch": 0.2828752453119126, + "grad_norm": 0.002578763058409095, + "learning_rate": 0.001, + "loss": 0.3966, + "step": 10252 + }, + { + "epoch": 0.28290283751297696, + "grad_norm": 0.0030044415034353733, + "learning_rate": 0.001, + "loss": 0.4102, + "step": 10253 + }, + { + "epoch": 0.2829304297140413, + "grad_norm": 0.002438440453261137, + "learning_rate": 0.001, + "loss": 0.4276, + "step": 10254 + }, + { + "epoch": 0.2829580219151057, + "grad_norm": 0.002474286826327443, + "learning_rate": 0.001, + "loss": 0.4097, + "step": 10255 + }, + { + "epoch": 0.28298561411617007, + "grad_norm": 0.0036326062399894, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 10256 + }, + { + "epoch": 0.2830132063172344, + "grad_norm": 0.006661332678049803, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 10257 + }, + { + "epoch": 0.2830407985182988, + "grad_norm": 0.005072845611721277, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 10258 + }, + { + "epoch": 0.2830683907193632, + "grad_norm": 0.004820408299565315, + "learning_rate": 0.001, + "loss": 0.373, + "step": 10259 + }, + { + "epoch": 0.28309598292042754, + "grad_norm": 0.00312256021425128, + "learning_rate": 0.001, + "loss": 0.4164, + "step": 10260 + }, + { + "epoch": 0.2831235751214919, + "grad_norm": 0.0029510208405554295, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 10261 + }, + { + "epoch": 0.2831511673225563, + "grad_norm": 0.0040268669836223125, + "learning_rate": 0.001, + "loss": 0.3614, + "step": 10262 + }, + { + "epoch": 0.28317875952362065, + "grad_norm": 0.006543302442878485, + "learning_rate": 0.001, + "loss": 0.384, + "step": 10263 + }, + { + "epoch": 0.283206351724685, + "grad_norm": 0.003965241368860006, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 10264 + }, + { + "epoch": 0.2832339439257494, + "grad_norm": 0.0032414360903203487, + "learning_rate": 0.001, + "loss": 0.3433, + "step": 10265 + }, + { + "epoch": 0.28326153612681376, + "grad_norm": 0.002474940847605467, + "learning_rate": 0.001, + "loss": 0.393, + "step": 10266 + }, + { + "epoch": 0.2832891283278781, + "grad_norm": 0.0031417065765708685, + "learning_rate": 0.001, + "loss": 0.393, + "step": 10267 + }, + { + "epoch": 0.28331672052894247, + "grad_norm": 0.0028232133481651545, + "learning_rate": 0.001, + "loss": 0.4599, + "step": 10268 + }, + { + "epoch": 0.2833443127300069, + "grad_norm": 0.002691833535209298, + "learning_rate": 0.001, + "loss": 0.3892, + "step": 10269 + }, + { + "epoch": 0.28337190493107123, + "grad_norm": 0.005066230893135071, + "learning_rate": 0.001, + "loss": 0.3908, + "step": 10270 + }, + { + "epoch": 0.2833994971321356, + "grad_norm": 0.004286530893296003, + "learning_rate": 0.001, + "loss": 0.3913, + "step": 10271 + }, + { + "epoch": 0.2834270893332, + "grad_norm": 0.004262128844857216, + "learning_rate": 0.001, + "loss": 0.4195, + "step": 10272 + }, + { + "epoch": 0.28345468153426434, + "grad_norm": 0.0027960515581071377, + "learning_rate": 0.001, + "loss": 0.4211, + "step": 10273 + }, + { + "epoch": 0.2834822737353287, + "grad_norm": 0.004192339722067118, + "learning_rate": 0.001, + "loss": 0.377, + "step": 10274 + }, + { + "epoch": 0.2835098659363931, + "grad_norm": 0.0032848825212568045, + "learning_rate": 0.001, + "loss": 0.362, + "step": 10275 + }, + { + "epoch": 0.28353745813745745, + "grad_norm": 0.0027345793787389994, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 10276 + }, + { + "epoch": 0.2835650503385218, + "grad_norm": 0.0030612831469625235, + "learning_rate": 0.001, + "loss": 0.3779, + "step": 10277 + }, + { + "epoch": 0.28359264253958616, + "grad_norm": 0.002645805710926652, + "learning_rate": 0.001, + "loss": 0.3881, + "step": 10278 + }, + { + "epoch": 0.28362023474065057, + "grad_norm": 0.0025237167719751596, + "learning_rate": 0.001, + "loss": 0.3954, + "step": 10279 + }, + { + "epoch": 0.2836478269417149, + "grad_norm": 0.0036604590713977814, + "learning_rate": 0.001, + "loss": 0.4253, + "step": 10280 + }, + { + "epoch": 0.2836754191427793, + "grad_norm": 0.007117830216884613, + "learning_rate": 0.001, + "loss": 0.3755, + "step": 10281 + }, + { + "epoch": 0.2837030113438437, + "grad_norm": 0.05505690723657608, + "learning_rate": 0.001, + "loss": 0.3955, + "step": 10282 + }, + { + "epoch": 0.28373060354490803, + "grad_norm": 0.006176114547997713, + "learning_rate": 0.001, + "loss": 0.352, + "step": 10283 + }, + { + "epoch": 0.2837581957459724, + "grad_norm": 0.003028157399967313, + "learning_rate": 0.001, + "loss": 0.3814, + "step": 10284 + }, + { + "epoch": 0.2837857879470368, + "grad_norm": 0.0032485162373632193, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 10285 + }, + { + "epoch": 0.28381338014810115, + "grad_norm": 0.002797128167003393, + "learning_rate": 0.001, + "loss": 0.4038, + "step": 10286 + }, + { + "epoch": 0.2838409723491655, + "grad_norm": 0.003049092134460807, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 10287 + }, + { + "epoch": 0.28386856455022985, + "grad_norm": 0.0031948923133313656, + "learning_rate": 0.001, + "loss": 0.3943, + "step": 10288 + }, + { + "epoch": 0.28389615675129426, + "grad_norm": 0.003836139803752303, + "learning_rate": 0.001, + "loss": 0.3844, + "step": 10289 + }, + { + "epoch": 0.2839237489523586, + "grad_norm": 0.002338884864002466, + "learning_rate": 0.001, + "loss": 0.4126, + "step": 10290 + }, + { + "epoch": 0.28395134115342296, + "grad_norm": 0.009832640178501606, + "learning_rate": 0.001, + "loss": 0.4102, + "step": 10291 + }, + { + "epoch": 0.2839789333544874, + "grad_norm": 0.0026350358966737986, + "learning_rate": 0.001, + "loss": 0.43, + "step": 10292 + }, + { + "epoch": 0.2840065255555517, + "grad_norm": 0.003660572227090597, + "learning_rate": 0.001, + "loss": 0.4099, + "step": 10293 + }, + { + "epoch": 0.2840341177566161, + "grad_norm": 0.00275686988607049, + "learning_rate": 0.001, + "loss": 0.3934, + "step": 10294 + }, + { + "epoch": 0.2840617099576805, + "grad_norm": 0.0033077567350119352, + "learning_rate": 0.001, + "loss": 0.3926, + "step": 10295 + }, + { + "epoch": 0.28408930215874484, + "grad_norm": 0.006942362524569035, + "learning_rate": 0.001, + "loss": 0.354, + "step": 10296 + }, + { + "epoch": 0.2841168943598092, + "grad_norm": 0.0036332812160253525, + "learning_rate": 0.001, + "loss": 0.3707, + "step": 10297 + }, + { + "epoch": 0.28414448656087354, + "grad_norm": 0.012020766735076904, + "learning_rate": 0.001, + "loss": 0.3754, + "step": 10298 + }, + { + "epoch": 0.28417207876193795, + "grad_norm": 0.0028270173352211714, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 10299 + }, + { + "epoch": 0.2841996709630023, + "grad_norm": 0.011626332998275757, + "learning_rate": 0.001, + "loss": 0.3823, + "step": 10300 + }, + { + "epoch": 0.28422726316406666, + "grad_norm": 0.004878376144915819, + "learning_rate": 0.001, + "loss": 0.4427, + "step": 10301 + }, + { + "epoch": 0.28425485536513106, + "grad_norm": 0.004530386067926884, + "learning_rate": 0.001, + "loss": 0.3858, + "step": 10302 + }, + { + "epoch": 0.2842824475661954, + "grad_norm": 0.004930171649903059, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 10303 + }, + { + "epoch": 0.28431003976725977, + "grad_norm": 0.009053234942257404, + "learning_rate": 0.001, + "loss": 0.3703, + "step": 10304 + }, + { + "epoch": 0.2843376319683242, + "grad_norm": 0.004205284174531698, + "learning_rate": 0.001, + "loss": 0.3538, + "step": 10305 + }, + { + "epoch": 0.28436522416938853, + "grad_norm": 0.003883173456415534, + "learning_rate": 0.001, + "loss": 0.4267, + "step": 10306 + }, + { + "epoch": 0.2843928163704529, + "grad_norm": 0.0031415647827088833, + "learning_rate": 0.001, + "loss": 0.4154, + "step": 10307 + }, + { + "epoch": 0.28442040857151724, + "grad_norm": 0.003464039647951722, + "learning_rate": 0.001, + "loss": 0.39, + "step": 10308 + }, + { + "epoch": 0.28444800077258164, + "grad_norm": 0.004332481883466244, + "learning_rate": 0.001, + "loss": 0.3642, + "step": 10309 + }, + { + "epoch": 0.284475592973646, + "grad_norm": 0.00255342829041183, + "learning_rate": 0.001, + "loss": 0.4375, + "step": 10310 + }, + { + "epoch": 0.28450318517471035, + "grad_norm": 0.0028760903514921665, + "learning_rate": 0.001, + "loss": 0.4054, + "step": 10311 + }, + { + "epoch": 0.28453077737577476, + "grad_norm": 0.005956695880740881, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 10312 + }, + { + "epoch": 0.2845583695768391, + "grad_norm": 0.004237044602632523, + "learning_rate": 0.001, + "loss": 0.3803, + "step": 10313 + }, + { + "epoch": 0.28458596177790346, + "grad_norm": 0.005159912630915642, + "learning_rate": 0.001, + "loss": 0.3598, + "step": 10314 + }, + { + "epoch": 0.28461355397896787, + "grad_norm": 0.0030028277542442083, + "learning_rate": 0.001, + "loss": 0.4357, + "step": 10315 + }, + { + "epoch": 0.2846411461800322, + "grad_norm": 0.002339400118216872, + "learning_rate": 0.001, + "loss": 0.4295, + "step": 10316 + }, + { + "epoch": 0.2846687383810966, + "grad_norm": 0.0021588336676359177, + "learning_rate": 0.001, + "loss": 0.4477, + "step": 10317 + }, + { + "epoch": 0.28469633058216093, + "grad_norm": 0.002696117153391242, + "learning_rate": 0.001, + "loss": 0.39, + "step": 10318 + }, + { + "epoch": 0.28472392278322534, + "grad_norm": 0.0029791551642119884, + "learning_rate": 0.001, + "loss": 0.4123, + "step": 10319 + }, + { + "epoch": 0.2847515149842897, + "grad_norm": 0.00416885782033205, + "learning_rate": 0.001, + "loss": 0.419, + "step": 10320 + }, + { + "epoch": 0.28477910718535404, + "grad_norm": 0.005709108896553516, + "learning_rate": 0.001, + "loss": 0.4096, + "step": 10321 + }, + { + "epoch": 0.28480669938641845, + "grad_norm": 0.002943917643278837, + "learning_rate": 0.001, + "loss": 0.3675, + "step": 10322 + }, + { + "epoch": 0.2848342915874828, + "grad_norm": 0.0025159113574773073, + "learning_rate": 0.001, + "loss": 0.3841, + "step": 10323 + }, + { + "epoch": 0.28486188378854715, + "grad_norm": 0.004362201318144798, + "learning_rate": 0.001, + "loss": 0.3772, + "step": 10324 + }, + { + "epoch": 0.28488947598961156, + "grad_norm": 0.0049796863459050655, + "learning_rate": 0.001, + "loss": 0.4253, + "step": 10325 + }, + { + "epoch": 0.2849170681906759, + "grad_norm": 0.004486286547034979, + "learning_rate": 0.001, + "loss": 0.4077, + "step": 10326 + }, + { + "epoch": 0.28494466039174027, + "grad_norm": 0.006912431679666042, + "learning_rate": 0.001, + "loss": 0.4178, + "step": 10327 + }, + { + "epoch": 0.2849722525928046, + "grad_norm": 0.0031216132920235395, + "learning_rate": 0.001, + "loss": 0.4226, + "step": 10328 + }, + { + "epoch": 0.28499984479386903, + "grad_norm": 0.004195013549178839, + "learning_rate": 0.001, + "loss": 0.3776, + "step": 10329 + }, + { + "epoch": 0.2850274369949334, + "grad_norm": 0.0029478815849870443, + "learning_rate": 0.001, + "loss": 0.3897, + "step": 10330 + }, + { + "epoch": 0.28505502919599773, + "grad_norm": 0.004067423287779093, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 10331 + }, + { + "epoch": 0.28508262139706214, + "grad_norm": 0.004478363785892725, + "learning_rate": 0.001, + "loss": 0.4146, + "step": 10332 + }, + { + "epoch": 0.2851102135981265, + "grad_norm": 0.009275195188820362, + "learning_rate": 0.001, + "loss": 0.3937, + "step": 10333 + }, + { + "epoch": 0.28513780579919085, + "grad_norm": 0.0037428569048643112, + "learning_rate": 0.001, + "loss": 0.3955, + "step": 10334 + }, + { + "epoch": 0.28516539800025525, + "grad_norm": 0.004474049899727106, + "learning_rate": 0.001, + "loss": 0.4112, + "step": 10335 + }, + { + "epoch": 0.2851929902013196, + "grad_norm": 0.004371373914182186, + "learning_rate": 0.001, + "loss": 0.3571, + "step": 10336 + }, + { + "epoch": 0.28522058240238396, + "grad_norm": 0.0037502245977520943, + "learning_rate": 0.001, + "loss": 0.3624, + "step": 10337 + }, + { + "epoch": 0.2852481746034483, + "grad_norm": 0.002311047865077853, + "learning_rate": 0.001, + "loss": 0.3721, + "step": 10338 + }, + { + "epoch": 0.2852757668045127, + "grad_norm": 0.006753645371645689, + "learning_rate": 0.001, + "loss": 0.3764, + "step": 10339 + }, + { + "epoch": 0.2853033590055771, + "grad_norm": 0.0031517634633928537, + "learning_rate": 0.001, + "loss": 0.4182, + "step": 10340 + }, + { + "epoch": 0.2853309512066414, + "grad_norm": 0.0038493776228278875, + "learning_rate": 0.001, + "loss": 0.3738, + "step": 10341 + }, + { + "epoch": 0.28535854340770583, + "grad_norm": 0.0019644973799586296, + "learning_rate": 0.001, + "loss": 0.4105, + "step": 10342 + }, + { + "epoch": 0.2853861356087702, + "grad_norm": 0.0025845207273960114, + "learning_rate": 0.001, + "loss": 0.394, + "step": 10343 + }, + { + "epoch": 0.28541372780983454, + "grad_norm": 0.0030156485736370087, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 10344 + }, + { + "epoch": 0.28544132001089895, + "grad_norm": 0.0033527929335832596, + "learning_rate": 0.001, + "loss": 0.4045, + "step": 10345 + }, + { + "epoch": 0.2854689122119633, + "grad_norm": 0.0047877393662929535, + "learning_rate": 0.001, + "loss": 0.4196, + "step": 10346 + }, + { + "epoch": 0.28549650441302765, + "grad_norm": 0.002556293271481991, + "learning_rate": 0.001, + "loss": 0.4108, + "step": 10347 + }, + { + "epoch": 0.285524096614092, + "grad_norm": 0.0028672493062913418, + "learning_rate": 0.001, + "loss": 0.3888, + "step": 10348 + }, + { + "epoch": 0.2855516888151564, + "grad_norm": 0.0026332384441047907, + "learning_rate": 0.001, + "loss": 0.4436, + "step": 10349 + }, + { + "epoch": 0.28557928101622077, + "grad_norm": 0.0033898288384079933, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 10350 + }, + { + "epoch": 0.2856068732172851, + "grad_norm": 0.003938847221434116, + "learning_rate": 0.001, + "loss": 0.4113, + "step": 10351 + }, + { + "epoch": 0.2856344654183495, + "grad_norm": 0.004278605338186026, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 10352 + }, + { + "epoch": 0.2856620576194139, + "grad_norm": 0.009001370519399643, + "learning_rate": 0.001, + "loss": 0.4223, + "step": 10353 + }, + { + "epoch": 0.28568964982047823, + "grad_norm": 0.0058325412683188915, + "learning_rate": 0.001, + "loss": 0.386, + "step": 10354 + }, + { + "epoch": 0.2857172420215426, + "grad_norm": 0.0033330926671624184, + "learning_rate": 0.001, + "loss": 0.3984, + "step": 10355 + }, + { + "epoch": 0.285744834222607, + "grad_norm": 0.0033065094612538815, + "learning_rate": 0.001, + "loss": 0.4256, + "step": 10356 + }, + { + "epoch": 0.28577242642367134, + "grad_norm": 0.002417915500700474, + "learning_rate": 0.001, + "loss": 0.4, + "step": 10357 + }, + { + "epoch": 0.2858000186247357, + "grad_norm": 0.0036074428353458643, + "learning_rate": 0.001, + "loss": 0.3477, + "step": 10358 + }, + { + "epoch": 0.2858276108258001, + "grad_norm": 0.0028410926461219788, + "learning_rate": 0.001, + "loss": 0.3941, + "step": 10359 + }, + { + "epoch": 0.28585520302686446, + "grad_norm": 0.0024444633163511753, + "learning_rate": 0.001, + "loss": 0.4072, + "step": 10360 + }, + { + "epoch": 0.2858827952279288, + "grad_norm": 0.0035574915818870068, + "learning_rate": 0.001, + "loss": 0.3702, + "step": 10361 + }, + { + "epoch": 0.2859103874289932, + "grad_norm": 0.003090563230216503, + "learning_rate": 0.001, + "loss": 0.3829, + "step": 10362 + }, + { + "epoch": 0.28593797963005757, + "grad_norm": 0.003158135572448373, + "learning_rate": 0.001, + "loss": 0.3842, + "step": 10363 + }, + { + "epoch": 0.2859655718311219, + "grad_norm": 0.004847416654229164, + "learning_rate": 0.001, + "loss": 0.3727, + "step": 10364 + }, + { + "epoch": 0.2859931640321863, + "grad_norm": 0.0035511176101863384, + "learning_rate": 0.001, + "loss": 0.4061, + "step": 10365 + }, + { + "epoch": 0.2860207562332507, + "grad_norm": 0.0049200220964848995, + "learning_rate": 0.001, + "loss": 0.399, + "step": 10366 + }, + { + "epoch": 0.28604834843431504, + "grad_norm": 0.0024868594482541084, + "learning_rate": 0.001, + "loss": 0.374, + "step": 10367 + }, + { + "epoch": 0.2860759406353794, + "grad_norm": 0.004043731838464737, + "learning_rate": 0.001, + "loss": 0.4011, + "step": 10368 + }, + { + "epoch": 0.2861035328364438, + "grad_norm": 0.0034267944283783436, + "learning_rate": 0.001, + "loss": 0.4056, + "step": 10369 + }, + { + "epoch": 0.28613112503750815, + "grad_norm": 0.003376040840521455, + "learning_rate": 0.001, + "loss": 0.4189, + "step": 10370 + }, + { + "epoch": 0.2861587172385725, + "grad_norm": 0.0024833050556480885, + "learning_rate": 0.001, + "loss": 0.4246, + "step": 10371 + }, + { + "epoch": 0.2861863094396369, + "grad_norm": 0.0030002393759787083, + "learning_rate": 0.001, + "loss": 0.4271, + "step": 10372 + }, + { + "epoch": 0.28621390164070126, + "grad_norm": 0.0029513488989323378, + "learning_rate": 0.001, + "loss": 0.3705, + "step": 10373 + }, + { + "epoch": 0.2862414938417656, + "grad_norm": 0.0033245980739593506, + "learning_rate": 0.001, + "loss": 0.3759, + "step": 10374 + }, + { + "epoch": 0.28626908604282997, + "grad_norm": 0.002725611673668027, + "learning_rate": 0.001, + "loss": 0.3741, + "step": 10375 + }, + { + "epoch": 0.2862966782438944, + "grad_norm": 0.0023878002539277077, + "learning_rate": 0.001, + "loss": 0.4046, + "step": 10376 + }, + { + "epoch": 0.28632427044495873, + "grad_norm": 0.0076403240673244, + "learning_rate": 0.001, + "loss": 0.4184, + "step": 10377 + }, + { + "epoch": 0.2863518626460231, + "grad_norm": 0.00309283216483891, + "learning_rate": 0.001, + "loss": 0.4104, + "step": 10378 + }, + { + "epoch": 0.2863794548470875, + "grad_norm": 0.002792463870719075, + "learning_rate": 0.001, + "loss": 0.3792, + "step": 10379 + }, + { + "epoch": 0.28640704704815184, + "grad_norm": 0.002797079971060157, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 10380 + }, + { + "epoch": 0.2864346392492162, + "grad_norm": 0.0023108189925551414, + "learning_rate": 0.001, + "loss": 0.4203, + "step": 10381 + }, + { + "epoch": 0.2864622314502806, + "grad_norm": 0.0021790589671581984, + "learning_rate": 0.001, + "loss": 0.4065, + "step": 10382 + }, + { + "epoch": 0.28648982365134495, + "grad_norm": 0.003151221200823784, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 10383 + }, + { + "epoch": 0.2865174158524093, + "grad_norm": 0.0043502976186573505, + "learning_rate": 0.001, + "loss": 0.4219, + "step": 10384 + }, + { + "epoch": 0.28654500805347366, + "grad_norm": 0.0038338969461619854, + "learning_rate": 0.001, + "loss": 0.427, + "step": 10385 + }, + { + "epoch": 0.28657260025453807, + "grad_norm": 0.004629583563655615, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 10386 + }, + { + "epoch": 0.2866001924556024, + "grad_norm": 0.008832892403006554, + "learning_rate": 0.001, + "loss": 0.409, + "step": 10387 + }, + { + "epoch": 0.2866277846566668, + "grad_norm": 0.005081677809357643, + "learning_rate": 0.001, + "loss": 0.3842, + "step": 10388 + }, + { + "epoch": 0.2866553768577312, + "grad_norm": 0.005641726311296225, + "learning_rate": 0.001, + "loss": 0.3621, + "step": 10389 + }, + { + "epoch": 0.28668296905879553, + "grad_norm": 0.005556876305490732, + "learning_rate": 0.001, + "loss": 0.4187, + "step": 10390 + }, + { + "epoch": 0.2867105612598599, + "grad_norm": 0.004483111668378115, + "learning_rate": 0.001, + "loss": 0.3967, + "step": 10391 + }, + { + "epoch": 0.2867381534609243, + "grad_norm": 0.004832763224840164, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 10392 + }, + { + "epoch": 0.28676574566198865, + "grad_norm": 0.0030382846016436815, + "learning_rate": 0.001, + "loss": 0.4291, + "step": 10393 + }, + { + "epoch": 0.286793337863053, + "grad_norm": 0.003834576578810811, + "learning_rate": 0.001, + "loss": 0.3803, + "step": 10394 + }, + { + "epoch": 0.28682093006411735, + "grad_norm": 0.003595273941755295, + "learning_rate": 0.001, + "loss": 0.4179, + "step": 10395 + }, + { + "epoch": 0.28684852226518176, + "grad_norm": 0.006260840687900782, + "learning_rate": 0.001, + "loss": 0.4191, + "step": 10396 + }, + { + "epoch": 0.2868761144662461, + "grad_norm": 0.00411889236420393, + "learning_rate": 0.001, + "loss": 0.4024, + "step": 10397 + }, + { + "epoch": 0.28690370666731047, + "grad_norm": 0.0030989821534603834, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 10398 + }, + { + "epoch": 0.2869312988683749, + "grad_norm": 0.003285297192633152, + "learning_rate": 0.001, + "loss": 0.4113, + "step": 10399 + }, + { + "epoch": 0.2869588910694392, + "grad_norm": 0.0032312078401446342, + "learning_rate": 0.001, + "loss": 0.4149, + "step": 10400 + }, + { + "epoch": 0.2869864832705036, + "grad_norm": 0.004018446430563927, + "learning_rate": 0.001, + "loss": 0.3976, + "step": 10401 + }, + { + "epoch": 0.287014075471568, + "grad_norm": 0.0027517497073858976, + "learning_rate": 0.001, + "loss": 0.4206, + "step": 10402 + }, + { + "epoch": 0.28704166767263234, + "grad_norm": 0.0023782148491591215, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 10403 + }, + { + "epoch": 0.2870692598736967, + "grad_norm": 0.002795133274048567, + "learning_rate": 0.001, + "loss": 0.416, + "step": 10404 + }, + { + "epoch": 0.28709685207476104, + "grad_norm": 0.0036653687711805105, + "learning_rate": 0.001, + "loss": 0.3434, + "step": 10405 + }, + { + "epoch": 0.28712444427582545, + "grad_norm": 0.003916740883141756, + "learning_rate": 0.001, + "loss": 0.3793, + "step": 10406 + }, + { + "epoch": 0.2871520364768898, + "grad_norm": 0.004016435705125332, + "learning_rate": 0.001, + "loss": 0.4219, + "step": 10407 + }, + { + "epoch": 0.28717962867795416, + "grad_norm": 0.0035471522714942694, + "learning_rate": 0.001, + "loss": 0.4374, + "step": 10408 + }, + { + "epoch": 0.28720722087901857, + "grad_norm": 0.010290498845279217, + "learning_rate": 0.001, + "loss": 0.4291, + "step": 10409 + }, + { + "epoch": 0.2872348130800829, + "grad_norm": 0.004986819811165333, + "learning_rate": 0.001, + "loss": 0.4183, + "step": 10410 + }, + { + "epoch": 0.28726240528114727, + "grad_norm": 0.003227797569707036, + "learning_rate": 0.001, + "loss": 0.3876, + "step": 10411 + }, + { + "epoch": 0.2872899974822117, + "grad_norm": 0.0036358062643557787, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 10412 + }, + { + "epoch": 0.28731758968327603, + "grad_norm": 0.002349398098886013, + "learning_rate": 0.001, + "loss": 0.4514, + "step": 10413 + }, + { + "epoch": 0.2873451818843404, + "grad_norm": 0.004284038674086332, + "learning_rate": 0.001, + "loss": 0.3808, + "step": 10414 + }, + { + "epoch": 0.28737277408540474, + "grad_norm": 0.0026736604049801826, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 10415 + }, + { + "epoch": 0.28740036628646914, + "grad_norm": 0.002364154439419508, + "learning_rate": 0.001, + "loss": 0.4356, + "step": 10416 + }, + { + "epoch": 0.2874279584875335, + "grad_norm": 0.0029544299468398094, + "learning_rate": 0.001, + "loss": 0.4061, + "step": 10417 + }, + { + "epoch": 0.28745555068859785, + "grad_norm": 0.0052476017735898495, + "learning_rate": 0.001, + "loss": 0.3952, + "step": 10418 + }, + { + "epoch": 0.28748314288966226, + "grad_norm": 0.0021597561426460743, + "learning_rate": 0.001, + "loss": 0.4266, + "step": 10419 + }, + { + "epoch": 0.2875107350907266, + "grad_norm": 0.006407409440726042, + "learning_rate": 0.001, + "loss": 0.4276, + "step": 10420 + }, + { + "epoch": 0.28753832729179096, + "grad_norm": 0.004578462336212397, + "learning_rate": 0.001, + "loss": 0.3882, + "step": 10421 + }, + { + "epoch": 0.28756591949285537, + "grad_norm": 0.0024405980948358774, + "learning_rate": 0.001, + "loss": 0.4355, + "step": 10422 + }, + { + "epoch": 0.2875935116939197, + "grad_norm": 0.0061929537914693356, + "learning_rate": 0.001, + "loss": 0.3499, + "step": 10423 + }, + { + "epoch": 0.2876211038949841, + "grad_norm": 0.002528023673221469, + "learning_rate": 0.001, + "loss": 0.4066, + "step": 10424 + }, + { + "epoch": 0.28764869609604843, + "grad_norm": 0.005064224358648062, + "learning_rate": 0.001, + "loss": 0.4323, + "step": 10425 + }, + { + "epoch": 0.28767628829711284, + "grad_norm": 0.0034656894858926535, + "learning_rate": 0.001, + "loss": 0.436, + "step": 10426 + }, + { + "epoch": 0.2877038804981772, + "grad_norm": 0.013309495523571968, + "learning_rate": 0.001, + "loss": 0.3463, + "step": 10427 + }, + { + "epoch": 0.28773147269924154, + "grad_norm": 0.0033926607575267553, + "learning_rate": 0.001, + "loss": 0.3824, + "step": 10428 + }, + { + "epoch": 0.28775906490030595, + "grad_norm": 0.003299700329080224, + "learning_rate": 0.001, + "loss": 0.3592, + "step": 10429 + }, + { + "epoch": 0.2877866571013703, + "grad_norm": 0.00263997376896441, + "learning_rate": 0.001, + "loss": 0.383, + "step": 10430 + }, + { + "epoch": 0.28781424930243465, + "grad_norm": 0.002664315514266491, + "learning_rate": 0.001, + "loss": 0.4104, + "step": 10431 + }, + { + "epoch": 0.28784184150349906, + "grad_norm": 0.003565206192433834, + "learning_rate": 0.001, + "loss": 0.395, + "step": 10432 + }, + { + "epoch": 0.2878694337045634, + "grad_norm": 0.0028061075136065483, + "learning_rate": 0.001, + "loss": 0.4317, + "step": 10433 + }, + { + "epoch": 0.28789702590562777, + "grad_norm": 0.005139282438904047, + "learning_rate": 0.001, + "loss": 0.3856, + "step": 10434 + }, + { + "epoch": 0.2879246181066921, + "grad_norm": 0.003619102295488119, + "learning_rate": 0.001, + "loss": 0.4045, + "step": 10435 + }, + { + "epoch": 0.28795221030775653, + "grad_norm": 0.0031147655099630356, + "learning_rate": 0.001, + "loss": 0.4198, + "step": 10436 + }, + { + "epoch": 0.2879798025088209, + "grad_norm": 0.0035453352611511946, + "learning_rate": 0.001, + "loss": 0.414, + "step": 10437 + }, + { + "epoch": 0.28800739470988523, + "grad_norm": 0.013630262576043606, + "learning_rate": 0.001, + "loss": 0.394, + "step": 10438 + }, + { + "epoch": 0.28803498691094964, + "grad_norm": 0.006643341388553381, + "learning_rate": 0.001, + "loss": 0.3848, + "step": 10439 + }, + { + "epoch": 0.288062579112014, + "grad_norm": 0.0038181054405868053, + "learning_rate": 0.001, + "loss": 0.3982, + "step": 10440 + }, + { + "epoch": 0.28809017131307835, + "grad_norm": 0.00438200868666172, + "learning_rate": 0.001, + "loss": 0.4341, + "step": 10441 + }, + { + "epoch": 0.28811776351414276, + "grad_norm": 0.009693934582173824, + "learning_rate": 0.001, + "loss": 0.4069, + "step": 10442 + }, + { + "epoch": 0.2881453557152071, + "grad_norm": 0.0037193638272583485, + "learning_rate": 0.001, + "loss": 0.3944, + "step": 10443 + }, + { + "epoch": 0.28817294791627146, + "grad_norm": 0.005814881529659033, + "learning_rate": 0.001, + "loss": 0.4172, + "step": 10444 + }, + { + "epoch": 0.2882005401173358, + "grad_norm": 0.003509842325001955, + "learning_rate": 0.001, + "loss": 0.404, + "step": 10445 + }, + { + "epoch": 0.2882281323184002, + "grad_norm": 0.048714328557252884, + "learning_rate": 0.001, + "loss": 0.4366, + "step": 10446 + }, + { + "epoch": 0.2882557245194646, + "grad_norm": 0.003179010935127735, + "learning_rate": 0.001, + "loss": 0.3806, + "step": 10447 + }, + { + "epoch": 0.2882833167205289, + "grad_norm": 0.004650244489312172, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 10448 + }, + { + "epoch": 0.28831090892159333, + "grad_norm": 0.010246813297271729, + "learning_rate": 0.001, + "loss": 0.3738, + "step": 10449 + }, + { + "epoch": 0.2883385011226577, + "grad_norm": 0.0024331400636583567, + "learning_rate": 0.001, + "loss": 0.437, + "step": 10450 + }, + { + "epoch": 0.28836609332372204, + "grad_norm": 0.0032423040829598904, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 10451 + }, + { + "epoch": 0.2883936855247864, + "grad_norm": 0.0032161688432097435, + "learning_rate": 0.001, + "loss": 0.3756, + "step": 10452 + }, + { + "epoch": 0.2884212777258508, + "grad_norm": 0.009974083863198757, + "learning_rate": 0.001, + "loss": 0.4151, + "step": 10453 + }, + { + "epoch": 0.28844886992691515, + "grad_norm": 0.0024438994005322456, + "learning_rate": 0.001, + "loss": 0.3818, + "step": 10454 + }, + { + "epoch": 0.2884764621279795, + "grad_norm": 0.0024701599031686783, + "learning_rate": 0.001, + "loss": 0.4116, + "step": 10455 + }, + { + "epoch": 0.2885040543290439, + "grad_norm": 0.0037678529042750597, + "learning_rate": 0.001, + "loss": 0.4239, + "step": 10456 + }, + { + "epoch": 0.28853164653010827, + "grad_norm": 0.0021011261269450188, + "learning_rate": 0.001, + "loss": 0.44, + "step": 10457 + }, + { + "epoch": 0.2885592387311726, + "grad_norm": 0.0035511725582182407, + "learning_rate": 0.001, + "loss": 0.3732, + "step": 10458 + }, + { + "epoch": 0.288586830932237, + "grad_norm": 0.002597698476165533, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 10459 + }, + { + "epoch": 0.2886144231333014, + "grad_norm": 0.006349804345518351, + "learning_rate": 0.001, + "loss": 0.3979, + "step": 10460 + }, + { + "epoch": 0.28864201533436573, + "grad_norm": 0.003128377255052328, + "learning_rate": 0.001, + "loss": 0.4154, + "step": 10461 + }, + { + "epoch": 0.2886696075354301, + "grad_norm": 0.005628944840282202, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 10462 + }, + { + "epoch": 0.2886971997364945, + "grad_norm": 0.009223297238349915, + "learning_rate": 0.001, + "loss": 0.4119, + "step": 10463 + }, + { + "epoch": 0.28872479193755884, + "grad_norm": 0.002501624170690775, + "learning_rate": 0.001, + "loss": 0.4228, + "step": 10464 + }, + { + "epoch": 0.2887523841386232, + "grad_norm": 0.0031182433012872934, + "learning_rate": 0.001, + "loss": 0.4293, + "step": 10465 + }, + { + "epoch": 0.2887799763396876, + "grad_norm": 0.003218629164621234, + "learning_rate": 0.001, + "loss": 0.396, + "step": 10466 + }, + { + "epoch": 0.28880756854075196, + "grad_norm": 0.0033797677606344223, + "learning_rate": 0.001, + "loss": 0.4258, + "step": 10467 + }, + { + "epoch": 0.2888351607418163, + "grad_norm": 0.005143940914422274, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 10468 + }, + { + "epoch": 0.2888627529428807, + "grad_norm": 0.02191130630671978, + "learning_rate": 0.001, + "loss": 0.3642, + "step": 10469 + }, + { + "epoch": 0.28889034514394507, + "grad_norm": 0.013004349544644356, + "learning_rate": 0.001, + "loss": 0.3792, + "step": 10470 + }, + { + "epoch": 0.2889179373450094, + "grad_norm": 0.0044490983709692955, + "learning_rate": 0.001, + "loss": 0.411, + "step": 10471 + }, + { + "epoch": 0.2889455295460738, + "grad_norm": 0.004338787868618965, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 10472 + }, + { + "epoch": 0.2889731217471382, + "grad_norm": 0.0022168918512761593, + "learning_rate": 0.001, + "loss": 0.4162, + "step": 10473 + }, + { + "epoch": 0.28900071394820254, + "grad_norm": 0.0039255740121006966, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 10474 + }, + { + "epoch": 0.2890283061492669, + "grad_norm": 0.0024870086926966906, + "learning_rate": 0.001, + "loss": 0.4359, + "step": 10475 + }, + { + "epoch": 0.2890558983503313, + "grad_norm": 0.009762310422956944, + "learning_rate": 0.001, + "loss": 0.4262, + "step": 10476 + }, + { + "epoch": 0.28908349055139565, + "grad_norm": 0.0031546815298497677, + "learning_rate": 0.001, + "loss": 0.3929, + "step": 10477 + }, + { + "epoch": 0.28911108275246, + "grad_norm": 0.0028812165837734938, + "learning_rate": 0.001, + "loss": 0.3788, + "step": 10478 + }, + { + "epoch": 0.2891386749535244, + "grad_norm": 0.003111601574346423, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 10479 + }, + { + "epoch": 0.28916626715458876, + "grad_norm": 0.0035458174534142017, + "learning_rate": 0.001, + "loss": 0.4125, + "step": 10480 + }, + { + "epoch": 0.2891938593556531, + "grad_norm": 0.0034652771428227425, + "learning_rate": 0.001, + "loss": 0.3833, + "step": 10481 + }, + { + "epoch": 0.28922145155671747, + "grad_norm": 0.004158776253461838, + "learning_rate": 0.001, + "loss": 0.3449, + "step": 10482 + }, + { + "epoch": 0.2892490437577819, + "grad_norm": 0.0031947793904691935, + "learning_rate": 0.001, + "loss": 0.4176, + "step": 10483 + }, + { + "epoch": 0.28927663595884623, + "grad_norm": 0.0023631020449101925, + "learning_rate": 0.001, + "loss": 0.4212, + "step": 10484 + }, + { + "epoch": 0.2893042281599106, + "grad_norm": 0.0029194431845098734, + "learning_rate": 0.001, + "loss": 0.3617, + "step": 10485 + }, + { + "epoch": 0.289331820360975, + "grad_norm": 0.003637043060734868, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 10486 + }, + { + "epoch": 0.28935941256203934, + "grad_norm": 0.002662140876054764, + "learning_rate": 0.001, + "loss": 0.3454, + "step": 10487 + }, + { + "epoch": 0.2893870047631037, + "grad_norm": 0.002793428720906377, + "learning_rate": 0.001, + "loss": 0.398, + "step": 10488 + }, + { + "epoch": 0.2894145969641681, + "grad_norm": 0.0031368432100862265, + "learning_rate": 0.001, + "loss": 0.4084, + "step": 10489 + }, + { + "epoch": 0.28944218916523246, + "grad_norm": 0.003831058507785201, + "learning_rate": 0.001, + "loss": 0.3796, + "step": 10490 + }, + { + "epoch": 0.2894697813662968, + "grad_norm": 0.0026088710874319077, + "learning_rate": 0.001, + "loss": 0.3848, + "step": 10491 + }, + { + "epoch": 0.28949737356736116, + "grad_norm": 0.004104997497051954, + "learning_rate": 0.001, + "loss": 0.431, + "step": 10492 + }, + { + "epoch": 0.28952496576842557, + "grad_norm": 0.004247668199241161, + "learning_rate": 0.001, + "loss": 0.3998, + "step": 10493 + }, + { + "epoch": 0.2895525579694899, + "grad_norm": 0.0025502184871584177, + "learning_rate": 0.001, + "loss": 0.391, + "step": 10494 + }, + { + "epoch": 0.2895801501705543, + "grad_norm": 0.0031428837683051825, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 10495 + }, + { + "epoch": 0.2896077423716187, + "grad_norm": 0.00442470470443368, + "learning_rate": 0.001, + "loss": 0.3862, + "step": 10496 + }, + { + "epoch": 0.28963533457268303, + "grad_norm": 0.0065785483457148075, + "learning_rate": 0.001, + "loss": 0.4503, + "step": 10497 + }, + { + "epoch": 0.2896629267737474, + "grad_norm": 0.0034503082279115915, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 10498 + }, + { + "epoch": 0.2896905189748118, + "grad_norm": 0.0028073948342353106, + "learning_rate": 0.001, + "loss": 0.3872, + "step": 10499 + }, + { + "epoch": 0.28971811117587615, + "grad_norm": 0.006891318131238222, + "learning_rate": 0.001, + "loss": 0.4123, + "step": 10500 + }, + { + "epoch": 0.28971811117587615, + "eval_runtime": 23.9893, + "eval_samples_per_second": 1.334, + "eval_steps_per_second": 0.167, + "step": 10500 + }, + { + "epoch": 0.2897457033769405, + "grad_norm": 0.0038485617842525244, + "learning_rate": 0.001, + "loss": 0.4164, + "step": 10501 + }, + { + "epoch": 0.28977329557800485, + "grad_norm": 0.004269871395081282, + "learning_rate": 0.001, + "loss": 0.4047, + "step": 10502 + }, + { + "epoch": 0.28980088777906926, + "grad_norm": 0.00325970770791173, + "learning_rate": 0.001, + "loss": 0.4373, + "step": 10503 + }, + { + "epoch": 0.2898284799801336, + "grad_norm": 0.0031080457847565413, + "learning_rate": 0.001, + "loss": 0.4237, + "step": 10504 + }, + { + "epoch": 0.28985607218119797, + "grad_norm": 0.003122104099020362, + "learning_rate": 0.001, + "loss": 0.3966, + "step": 10505 + }, + { + "epoch": 0.2898836643822624, + "grad_norm": 0.003600063733756542, + "learning_rate": 0.001, + "loss": 0.3981, + "step": 10506 + }, + { + "epoch": 0.2899112565833267, + "grad_norm": 0.004156127572059631, + "learning_rate": 0.001, + "loss": 0.38, + "step": 10507 + }, + { + "epoch": 0.2899388487843911, + "grad_norm": 0.0035865693353116512, + "learning_rate": 0.001, + "loss": 0.4205, + "step": 10508 + }, + { + "epoch": 0.2899664409854555, + "grad_norm": 0.004097979050129652, + "learning_rate": 0.001, + "loss": 0.4112, + "step": 10509 + }, + { + "epoch": 0.28999403318651984, + "grad_norm": 0.006173746194690466, + "learning_rate": 0.001, + "loss": 0.4409, + "step": 10510 + }, + { + "epoch": 0.2900216253875842, + "grad_norm": 0.0032481453381478786, + "learning_rate": 0.001, + "loss": 0.4437, + "step": 10511 + }, + { + "epoch": 0.29004921758864854, + "grad_norm": 0.009451840072870255, + "learning_rate": 0.001, + "loss": 0.3547, + "step": 10512 + }, + { + "epoch": 0.29007680978971295, + "grad_norm": 0.0031670001335442066, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 10513 + }, + { + "epoch": 0.2901044019907773, + "grad_norm": 0.0040013547986745834, + "learning_rate": 0.001, + "loss": 0.4047, + "step": 10514 + }, + { + "epoch": 0.29013199419184166, + "grad_norm": 0.01435109507292509, + "learning_rate": 0.001, + "loss": 0.3811, + "step": 10515 + }, + { + "epoch": 0.29015958639290607, + "grad_norm": 0.014031428843736649, + "learning_rate": 0.001, + "loss": 0.3909, + "step": 10516 + }, + { + "epoch": 0.2901871785939704, + "grad_norm": 0.021386094391345978, + "learning_rate": 0.001, + "loss": 0.4344, + "step": 10517 + }, + { + "epoch": 0.29021477079503477, + "grad_norm": 0.01621176116168499, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 10518 + }, + { + "epoch": 0.2902423629960992, + "grad_norm": 0.1645982414484024, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 10519 + }, + { + "epoch": 0.29026995519716353, + "grad_norm": 0.009323320351541042, + "learning_rate": 0.001, + "loss": 0.3349, + "step": 10520 + }, + { + "epoch": 0.2902975473982279, + "grad_norm": 0.010149064473807812, + "learning_rate": 0.001, + "loss": 0.4171, + "step": 10521 + }, + { + "epoch": 0.29032513959929224, + "grad_norm": 0.004645455162972212, + "learning_rate": 0.001, + "loss": 0.4127, + "step": 10522 + }, + { + "epoch": 0.29035273180035664, + "grad_norm": 0.005608486942946911, + "learning_rate": 0.001, + "loss": 0.41, + "step": 10523 + }, + { + "epoch": 0.290380324001421, + "grad_norm": 0.020055659115314484, + "learning_rate": 0.001, + "loss": 0.4182, + "step": 10524 + }, + { + "epoch": 0.29040791620248535, + "grad_norm": 0.00331861968152225, + "learning_rate": 0.001, + "loss": 0.3792, + "step": 10525 + }, + { + "epoch": 0.29043550840354976, + "grad_norm": 0.0030799147207289934, + "learning_rate": 0.001, + "loss": 0.4441, + "step": 10526 + }, + { + "epoch": 0.2904631006046141, + "grad_norm": 0.003175821155309677, + "learning_rate": 0.001, + "loss": 0.4228, + "step": 10527 + }, + { + "epoch": 0.29049069280567846, + "grad_norm": 0.002744150348007679, + "learning_rate": 0.001, + "loss": 0.394, + "step": 10528 + }, + { + "epoch": 0.29051828500674287, + "grad_norm": 0.0017547330353409052, + "learning_rate": 0.001, + "loss": 0.4104, + "step": 10529 + }, + { + "epoch": 0.2905458772078072, + "grad_norm": 0.002442681696265936, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 10530 + }, + { + "epoch": 0.2905734694088716, + "grad_norm": 0.0028025461360812187, + "learning_rate": 0.001, + "loss": 0.3758, + "step": 10531 + }, + { + "epoch": 0.29060106160993593, + "grad_norm": 0.003767822403460741, + "learning_rate": 0.001, + "loss": 0.3827, + "step": 10532 + }, + { + "epoch": 0.29062865381100034, + "grad_norm": 0.031069811433553696, + "learning_rate": 0.001, + "loss": 0.3772, + "step": 10533 + }, + { + "epoch": 0.2906562460120647, + "grad_norm": 0.005183485336601734, + "learning_rate": 0.001, + "loss": 0.3973, + "step": 10534 + }, + { + "epoch": 0.29068383821312904, + "grad_norm": 0.004126776475459337, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 10535 + }, + { + "epoch": 0.29071143041419345, + "grad_norm": 0.0029067141003906727, + "learning_rate": 0.001, + "loss": 0.4026, + "step": 10536 + }, + { + "epoch": 0.2907390226152578, + "grad_norm": 0.00455186702311039, + "learning_rate": 0.001, + "loss": 0.413, + "step": 10537 + }, + { + "epoch": 0.29076661481632216, + "grad_norm": 0.0032677233684808016, + "learning_rate": 0.001, + "loss": 0.4204, + "step": 10538 + }, + { + "epoch": 0.29079420701738656, + "grad_norm": 0.0033605170901864767, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 10539 + }, + { + "epoch": 0.2908217992184509, + "grad_norm": 0.0031016338616609573, + "learning_rate": 0.001, + "loss": 0.4089, + "step": 10540 + }, + { + "epoch": 0.29084939141951527, + "grad_norm": 0.0022766580805182457, + "learning_rate": 0.001, + "loss": 0.4186, + "step": 10541 + }, + { + "epoch": 0.2908769836205796, + "grad_norm": 0.0034821683075278997, + "learning_rate": 0.001, + "loss": 0.3844, + "step": 10542 + }, + { + "epoch": 0.29090457582164403, + "grad_norm": 0.0025649424642324448, + "learning_rate": 0.001, + "loss": 0.4215, + "step": 10543 + }, + { + "epoch": 0.2909321680227084, + "grad_norm": 0.004783533047884703, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 10544 + }, + { + "epoch": 0.29095976022377273, + "grad_norm": 0.007414715830236673, + "learning_rate": 0.001, + "loss": 0.4077, + "step": 10545 + }, + { + "epoch": 0.29098735242483714, + "grad_norm": 0.010034749284386635, + "learning_rate": 0.001, + "loss": 0.3879, + "step": 10546 + }, + { + "epoch": 0.2910149446259015, + "grad_norm": 0.0025835982523858547, + "learning_rate": 0.001, + "loss": 0.3929, + "step": 10547 + }, + { + "epoch": 0.29104253682696585, + "grad_norm": 0.0022810695227235556, + "learning_rate": 0.001, + "loss": 0.4089, + "step": 10548 + }, + { + "epoch": 0.2910701290280302, + "grad_norm": 0.0027416511438786983, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 10549 + }, + { + "epoch": 0.2910977212290946, + "grad_norm": 0.004057244397699833, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 10550 + }, + { + "epoch": 0.29112531343015896, + "grad_norm": 0.00185355672147125, + "learning_rate": 0.001, + "loss": 0.4296, + "step": 10551 + }, + { + "epoch": 0.2911529056312233, + "grad_norm": 0.0033400054089725018, + "learning_rate": 0.001, + "loss": 0.4106, + "step": 10552 + }, + { + "epoch": 0.2911804978322877, + "grad_norm": 0.005987055134028196, + "learning_rate": 0.001, + "loss": 0.3716, + "step": 10553 + }, + { + "epoch": 0.2912080900333521, + "grad_norm": 0.007598403375595808, + "learning_rate": 0.001, + "loss": 0.3706, + "step": 10554 + }, + { + "epoch": 0.2912356822344164, + "grad_norm": 0.00337135954760015, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 10555 + }, + { + "epoch": 0.29126327443548083, + "grad_norm": 0.003520526457577944, + "learning_rate": 0.001, + "loss": 0.4189, + "step": 10556 + }, + { + "epoch": 0.2912908666365452, + "grad_norm": 0.002943966304883361, + "learning_rate": 0.001, + "loss": 0.3459, + "step": 10557 + }, + { + "epoch": 0.29131845883760954, + "grad_norm": 0.004878449719399214, + "learning_rate": 0.001, + "loss": 0.3735, + "step": 10558 + }, + { + "epoch": 0.2913460510386739, + "grad_norm": 0.0023713402915745974, + "learning_rate": 0.001, + "loss": 0.4028, + "step": 10559 + }, + { + "epoch": 0.2913736432397383, + "grad_norm": 0.0031815902329981327, + "learning_rate": 0.001, + "loss": 0.3708, + "step": 10560 + }, + { + "epoch": 0.29140123544080265, + "grad_norm": 0.002473505213856697, + "learning_rate": 0.001, + "loss": 0.4208, + "step": 10561 + }, + { + "epoch": 0.291428827641867, + "grad_norm": 0.002570673357695341, + "learning_rate": 0.001, + "loss": 0.3712, + "step": 10562 + }, + { + "epoch": 0.2914564198429314, + "grad_norm": 0.004323242697864771, + "learning_rate": 0.001, + "loss": 0.4001, + "step": 10563 + }, + { + "epoch": 0.29148401204399577, + "grad_norm": 0.003748292336240411, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 10564 + }, + { + "epoch": 0.2915116042450601, + "grad_norm": 0.0039474996738135815, + "learning_rate": 0.001, + "loss": 0.424, + "step": 10565 + }, + { + "epoch": 0.2915391964461245, + "grad_norm": 0.003185428213328123, + "learning_rate": 0.001, + "loss": 0.4258, + "step": 10566 + }, + { + "epoch": 0.2915667886471889, + "grad_norm": 0.008764101192355156, + "learning_rate": 0.001, + "loss": 0.4073, + "step": 10567 + }, + { + "epoch": 0.29159438084825323, + "grad_norm": 0.006038394290953875, + "learning_rate": 0.001, + "loss": 0.4069, + "step": 10568 + }, + { + "epoch": 0.2916219730493176, + "grad_norm": 0.00498878164216876, + "learning_rate": 0.001, + "loss": 0.4099, + "step": 10569 + }, + { + "epoch": 0.291649565250382, + "grad_norm": 0.002792202867567539, + "learning_rate": 0.001, + "loss": 0.4132, + "step": 10570 + }, + { + "epoch": 0.29167715745144634, + "grad_norm": 0.004675569478422403, + "learning_rate": 0.001, + "loss": 0.4226, + "step": 10571 + }, + { + "epoch": 0.2917047496525107, + "grad_norm": 0.00361837400123477, + "learning_rate": 0.001, + "loss": 0.3538, + "step": 10572 + }, + { + "epoch": 0.2917323418535751, + "grad_norm": 0.00293007493019104, + "learning_rate": 0.001, + "loss": 0.4157, + "step": 10573 + }, + { + "epoch": 0.29175993405463946, + "grad_norm": 0.0027430958580225706, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 10574 + }, + { + "epoch": 0.2917875262557038, + "grad_norm": 0.0023098003584891558, + "learning_rate": 0.001, + "loss": 0.4127, + "step": 10575 + }, + { + "epoch": 0.2918151184567682, + "grad_norm": 0.0030539468862116337, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 10576 + }, + { + "epoch": 0.29184271065783257, + "grad_norm": 0.004506285302340984, + "learning_rate": 0.001, + "loss": 0.3835, + "step": 10577 + }, + { + "epoch": 0.2918703028588969, + "grad_norm": 0.00386527506634593, + "learning_rate": 0.001, + "loss": 0.4258, + "step": 10578 + }, + { + "epoch": 0.2918978950599613, + "grad_norm": 0.0044982582330703735, + "learning_rate": 0.001, + "loss": 0.3744, + "step": 10579 + }, + { + "epoch": 0.2919254872610257, + "grad_norm": 0.005035079549998045, + "learning_rate": 0.001, + "loss": 0.3732, + "step": 10580 + }, + { + "epoch": 0.29195307946209004, + "grad_norm": 0.007419761270284653, + "learning_rate": 0.001, + "loss": 0.3572, + "step": 10581 + }, + { + "epoch": 0.2919806716631544, + "grad_norm": 0.003058833070099354, + "learning_rate": 0.001, + "loss": 0.4311, + "step": 10582 + }, + { + "epoch": 0.2920082638642188, + "grad_norm": 0.008622899651527405, + "learning_rate": 0.001, + "loss": 0.405, + "step": 10583 + }, + { + "epoch": 0.29203585606528315, + "grad_norm": 0.0038068420253694057, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 10584 + }, + { + "epoch": 0.2920634482663475, + "grad_norm": 0.0027387929148972034, + "learning_rate": 0.001, + "loss": 0.3948, + "step": 10585 + }, + { + "epoch": 0.2920910404674119, + "grad_norm": 0.004177779424935579, + "learning_rate": 0.001, + "loss": 0.3726, + "step": 10586 + }, + { + "epoch": 0.29211863266847626, + "grad_norm": 0.0050719305872917175, + "learning_rate": 0.001, + "loss": 0.4282, + "step": 10587 + }, + { + "epoch": 0.2921462248695406, + "grad_norm": 0.00421450100839138, + "learning_rate": 0.001, + "loss": 0.3963, + "step": 10588 + }, + { + "epoch": 0.29217381707060497, + "grad_norm": 0.031578365713357925, + "learning_rate": 0.001, + "loss": 0.3792, + "step": 10589 + }, + { + "epoch": 0.2922014092716694, + "grad_norm": 0.008609611541032791, + "learning_rate": 0.001, + "loss": 0.3712, + "step": 10590 + }, + { + "epoch": 0.29222900147273373, + "grad_norm": 0.005752986762672663, + "learning_rate": 0.001, + "loss": 0.38, + "step": 10591 + }, + { + "epoch": 0.2922565936737981, + "grad_norm": 0.004760433454066515, + "learning_rate": 0.001, + "loss": 0.4197, + "step": 10592 + }, + { + "epoch": 0.2922841858748625, + "grad_norm": 0.0037690645549446344, + "learning_rate": 0.001, + "loss": 0.3877, + "step": 10593 + }, + { + "epoch": 0.29231177807592684, + "grad_norm": 0.0029250262305140495, + "learning_rate": 0.001, + "loss": 0.4383, + "step": 10594 + }, + { + "epoch": 0.2923393702769912, + "grad_norm": 0.002340389881283045, + "learning_rate": 0.001, + "loss": 0.4113, + "step": 10595 + }, + { + "epoch": 0.2923669624780556, + "grad_norm": 0.002268069889396429, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 10596 + }, + { + "epoch": 0.29239455467911996, + "grad_norm": 0.004916078876703978, + "learning_rate": 0.001, + "loss": 0.4144, + "step": 10597 + }, + { + "epoch": 0.2924221468801843, + "grad_norm": 0.0026404778473079205, + "learning_rate": 0.001, + "loss": 0.3631, + "step": 10598 + }, + { + "epoch": 0.29244973908124866, + "grad_norm": 0.003348248079419136, + "learning_rate": 0.001, + "loss": 0.387, + "step": 10599 + }, + { + "epoch": 0.29247733128231307, + "grad_norm": 0.0029468834400177, + "learning_rate": 0.001, + "loss": 0.3734, + "step": 10600 + }, + { + "epoch": 0.2925049234833774, + "grad_norm": 0.00501684146001935, + "learning_rate": 0.001, + "loss": 0.4221, + "step": 10601 + }, + { + "epoch": 0.2925325156844418, + "grad_norm": 0.002596599282696843, + "learning_rate": 0.001, + "loss": 0.412, + "step": 10602 + }, + { + "epoch": 0.2925601078855062, + "grad_norm": 0.004242388531565666, + "learning_rate": 0.001, + "loss": 0.4014, + "step": 10603 + }, + { + "epoch": 0.29258770008657053, + "grad_norm": 0.003474390832707286, + "learning_rate": 0.001, + "loss": 0.378, + "step": 10604 + }, + { + "epoch": 0.2926152922876349, + "grad_norm": 0.0028386737685650587, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 10605 + }, + { + "epoch": 0.2926428844886993, + "grad_norm": 0.005176781211048365, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 10606 + }, + { + "epoch": 0.29267047668976365, + "grad_norm": 0.002886307192966342, + "learning_rate": 0.001, + "loss": 0.395, + "step": 10607 + }, + { + "epoch": 0.292698068890828, + "grad_norm": 0.0048707169480621815, + "learning_rate": 0.001, + "loss": 0.405, + "step": 10608 + }, + { + "epoch": 0.29272566109189235, + "grad_norm": 0.003168995026499033, + "learning_rate": 0.001, + "loss": 0.377, + "step": 10609 + }, + { + "epoch": 0.29275325329295676, + "grad_norm": 0.00348648545332253, + "learning_rate": 0.001, + "loss": 0.3736, + "step": 10610 + }, + { + "epoch": 0.2927808454940211, + "grad_norm": 0.002645922591909766, + "learning_rate": 0.001, + "loss": 0.4314, + "step": 10611 + }, + { + "epoch": 0.29280843769508547, + "grad_norm": 0.002931213239207864, + "learning_rate": 0.001, + "loss": 0.3761, + "step": 10612 + }, + { + "epoch": 0.2928360298961499, + "grad_norm": 0.003571485634893179, + "learning_rate": 0.001, + "loss": 0.3484, + "step": 10613 + }, + { + "epoch": 0.2928636220972142, + "grad_norm": 0.002395814750343561, + "learning_rate": 0.001, + "loss": 0.4037, + "step": 10614 + }, + { + "epoch": 0.2928912142982786, + "grad_norm": 0.007757443469017744, + "learning_rate": 0.001, + "loss": 0.3728, + "step": 10615 + }, + { + "epoch": 0.292918806499343, + "grad_norm": 0.003894525347277522, + "learning_rate": 0.001, + "loss": 0.3976, + "step": 10616 + }, + { + "epoch": 0.29294639870040734, + "grad_norm": 0.005945149809122086, + "learning_rate": 0.001, + "loss": 0.3822, + "step": 10617 + }, + { + "epoch": 0.2929739909014717, + "grad_norm": 0.005653233267366886, + "learning_rate": 0.001, + "loss": 0.3896, + "step": 10618 + }, + { + "epoch": 0.29300158310253605, + "grad_norm": 0.006182766519486904, + "learning_rate": 0.001, + "loss": 0.4181, + "step": 10619 + }, + { + "epoch": 0.29302917530360045, + "grad_norm": 0.004933160729706287, + "learning_rate": 0.001, + "loss": 0.4209, + "step": 10620 + }, + { + "epoch": 0.2930567675046648, + "grad_norm": 0.004339593928307295, + "learning_rate": 0.001, + "loss": 0.3796, + "step": 10621 + }, + { + "epoch": 0.29308435970572916, + "grad_norm": 0.0027681838255375624, + "learning_rate": 0.001, + "loss": 0.4027, + "step": 10622 + }, + { + "epoch": 0.29311195190679357, + "grad_norm": 0.002985588042065501, + "learning_rate": 0.001, + "loss": 0.4367, + "step": 10623 + }, + { + "epoch": 0.2931395441078579, + "grad_norm": 0.003226140746846795, + "learning_rate": 0.001, + "loss": 0.4223, + "step": 10624 + }, + { + "epoch": 0.29316713630892227, + "grad_norm": 0.00455946521833539, + "learning_rate": 0.001, + "loss": 0.4024, + "step": 10625 + }, + { + "epoch": 0.2931947285099867, + "grad_norm": 0.004617081955075264, + "learning_rate": 0.001, + "loss": 0.3963, + "step": 10626 + }, + { + "epoch": 0.29322232071105103, + "grad_norm": 0.006010159850120544, + "learning_rate": 0.001, + "loss": 0.4247, + "step": 10627 + }, + { + "epoch": 0.2932499129121154, + "grad_norm": 0.0038819455076009035, + "learning_rate": 0.001, + "loss": 0.4062, + "step": 10628 + }, + { + "epoch": 0.29327750511317974, + "grad_norm": 0.0035864575766026974, + "learning_rate": 0.001, + "loss": 0.3823, + "step": 10629 + }, + { + "epoch": 0.29330509731424415, + "grad_norm": 0.002191227860748768, + "learning_rate": 0.001, + "loss": 0.408, + "step": 10630 + }, + { + "epoch": 0.2933326895153085, + "grad_norm": 0.004282178822904825, + "learning_rate": 0.001, + "loss": 0.3453, + "step": 10631 + }, + { + "epoch": 0.29336028171637285, + "grad_norm": 0.0022272500209510326, + "learning_rate": 0.001, + "loss": 0.4057, + "step": 10632 + }, + { + "epoch": 0.29338787391743726, + "grad_norm": 0.002555177314206958, + "learning_rate": 0.001, + "loss": 0.3953, + "step": 10633 + }, + { + "epoch": 0.2934154661185016, + "grad_norm": 0.006331083830446005, + "learning_rate": 0.001, + "loss": 0.3625, + "step": 10634 + }, + { + "epoch": 0.29344305831956596, + "grad_norm": 0.002670933725312352, + "learning_rate": 0.001, + "loss": 0.4096, + "step": 10635 + }, + { + "epoch": 0.2934706505206303, + "grad_norm": 0.004304058384150267, + "learning_rate": 0.001, + "loss": 0.3682, + "step": 10636 + }, + { + "epoch": 0.2934982427216947, + "grad_norm": 0.0024331146851181984, + "learning_rate": 0.001, + "loss": 0.393, + "step": 10637 + }, + { + "epoch": 0.2935258349227591, + "grad_norm": 0.003931795712560415, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 10638 + }, + { + "epoch": 0.29355342712382343, + "grad_norm": 0.004206740763038397, + "learning_rate": 0.001, + "loss": 0.3946, + "step": 10639 + }, + { + "epoch": 0.29358101932488784, + "grad_norm": 0.0031811101362109184, + "learning_rate": 0.001, + "loss": 0.4167, + "step": 10640 + }, + { + "epoch": 0.2936086115259522, + "grad_norm": 0.002255258383229375, + "learning_rate": 0.001, + "loss": 0.3808, + "step": 10641 + }, + { + "epoch": 0.29363620372701654, + "grad_norm": 0.002453514840453863, + "learning_rate": 0.001, + "loss": 0.4223, + "step": 10642 + }, + { + "epoch": 0.29366379592808095, + "grad_norm": 0.002259905217215419, + "learning_rate": 0.001, + "loss": 0.3815, + "step": 10643 + }, + { + "epoch": 0.2936913881291453, + "grad_norm": 0.0024826505687087774, + "learning_rate": 0.001, + "loss": 0.3844, + "step": 10644 + }, + { + "epoch": 0.29371898033020966, + "grad_norm": 0.0024130498059093952, + "learning_rate": 0.001, + "loss": 0.39, + "step": 10645 + }, + { + "epoch": 0.293746572531274, + "grad_norm": 0.002940199337899685, + "learning_rate": 0.001, + "loss": 0.4167, + "step": 10646 + }, + { + "epoch": 0.2937741647323384, + "grad_norm": 0.0037300034891813993, + "learning_rate": 0.001, + "loss": 0.3452, + "step": 10647 + }, + { + "epoch": 0.29380175693340277, + "grad_norm": 0.0023701984900981188, + "learning_rate": 0.001, + "loss": 0.4345, + "step": 10648 + }, + { + "epoch": 0.2938293491344671, + "grad_norm": 0.0023255145642906427, + "learning_rate": 0.001, + "loss": 0.3798, + "step": 10649 + }, + { + "epoch": 0.29385694133553153, + "grad_norm": 0.001929348916746676, + "learning_rate": 0.001, + "loss": 0.4208, + "step": 10650 + }, + { + "epoch": 0.2938845335365959, + "grad_norm": 0.003764538327232003, + "learning_rate": 0.001, + "loss": 0.4241, + "step": 10651 + }, + { + "epoch": 0.29391212573766023, + "grad_norm": 0.0031687645241618156, + "learning_rate": 0.001, + "loss": 0.3864, + "step": 10652 + }, + { + "epoch": 0.29393971793872464, + "grad_norm": 0.0023800362832844257, + "learning_rate": 0.001, + "loss": 0.4062, + "step": 10653 + }, + { + "epoch": 0.293967310139789, + "grad_norm": 0.003322491655126214, + "learning_rate": 0.001, + "loss": 0.3802, + "step": 10654 + }, + { + "epoch": 0.29399490234085335, + "grad_norm": 0.003406172851100564, + "learning_rate": 0.001, + "loss": 0.3784, + "step": 10655 + }, + { + "epoch": 0.2940224945419177, + "grad_norm": 0.0024194566067308187, + "learning_rate": 0.001, + "loss": 0.3699, + "step": 10656 + }, + { + "epoch": 0.2940500867429821, + "grad_norm": 0.002812894992530346, + "learning_rate": 0.001, + "loss": 0.3738, + "step": 10657 + }, + { + "epoch": 0.29407767894404646, + "grad_norm": 0.004144003614783287, + "learning_rate": 0.001, + "loss": 0.3811, + "step": 10658 + }, + { + "epoch": 0.2941052711451108, + "grad_norm": 0.0025214876513928175, + "learning_rate": 0.001, + "loss": 0.3895, + "step": 10659 + }, + { + "epoch": 0.2941328633461752, + "grad_norm": 0.0037252691108733416, + "learning_rate": 0.001, + "loss": 0.3846, + "step": 10660 + }, + { + "epoch": 0.2941604555472396, + "grad_norm": 0.0028469781391322613, + "learning_rate": 0.001, + "loss": 0.4039, + "step": 10661 + }, + { + "epoch": 0.2941880477483039, + "grad_norm": 0.004957746714353561, + "learning_rate": 0.001, + "loss": 0.3672, + "step": 10662 + }, + { + "epoch": 0.29421563994936833, + "grad_norm": 0.0030585613567382097, + "learning_rate": 0.001, + "loss": 0.383, + "step": 10663 + }, + { + "epoch": 0.2942432321504327, + "grad_norm": 0.003800381440669298, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 10664 + }, + { + "epoch": 0.29427082435149704, + "grad_norm": 0.0074830930680036545, + "learning_rate": 0.001, + "loss": 0.3925, + "step": 10665 + }, + { + "epoch": 0.2942984165525614, + "grad_norm": 0.0026420445647090673, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 10666 + }, + { + "epoch": 0.2943260087536258, + "grad_norm": 0.0033852518536150455, + "learning_rate": 0.001, + "loss": 0.3826, + "step": 10667 + }, + { + "epoch": 0.29435360095469015, + "grad_norm": 0.004352671559900045, + "learning_rate": 0.001, + "loss": 0.4167, + "step": 10668 + }, + { + "epoch": 0.2943811931557545, + "grad_norm": 0.002693094778805971, + "learning_rate": 0.001, + "loss": 0.4035, + "step": 10669 + }, + { + "epoch": 0.2944087853568189, + "grad_norm": 0.0030384764540940523, + "learning_rate": 0.001, + "loss": 0.3952, + "step": 10670 + }, + { + "epoch": 0.29443637755788327, + "grad_norm": 0.0021408821921795607, + "learning_rate": 0.001, + "loss": 0.4297, + "step": 10671 + }, + { + "epoch": 0.2944639697589476, + "grad_norm": 0.0028972120489925146, + "learning_rate": 0.001, + "loss": 0.3687, + "step": 10672 + }, + { + "epoch": 0.294491561960012, + "grad_norm": 0.0024992034304887056, + "learning_rate": 0.001, + "loss": 0.4118, + "step": 10673 + }, + { + "epoch": 0.2945191541610764, + "grad_norm": 0.004567326512187719, + "learning_rate": 0.001, + "loss": 0.4199, + "step": 10674 + }, + { + "epoch": 0.29454674636214073, + "grad_norm": 0.0030367637518793344, + "learning_rate": 0.001, + "loss": 0.4081, + "step": 10675 + }, + { + "epoch": 0.2945743385632051, + "grad_norm": 0.003327249316498637, + "learning_rate": 0.001, + "loss": 0.382, + "step": 10676 + }, + { + "epoch": 0.2946019307642695, + "grad_norm": 0.00256183254532516, + "learning_rate": 0.001, + "loss": 0.3798, + "step": 10677 + }, + { + "epoch": 0.29462952296533385, + "grad_norm": 0.004928114358335733, + "learning_rate": 0.001, + "loss": 0.3755, + "step": 10678 + }, + { + "epoch": 0.2946571151663982, + "grad_norm": 0.0023404727689921856, + "learning_rate": 0.001, + "loss": 0.3945, + "step": 10679 + }, + { + "epoch": 0.2946847073674626, + "grad_norm": 0.0023969600442796946, + "learning_rate": 0.001, + "loss": 0.4028, + "step": 10680 + }, + { + "epoch": 0.29471229956852696, + "grad_norm": 0.0030103707686066628, + "learning_rate": 0.001, + "loss": 0.406, + "step": 10681 + }, + { + "epoch": 0.2947398917695913, + "grad_norm": 0.003063908079639077, + "learning_rate": 0.001, + "loss": 0.4101, + "step": 10682 + }, + { + "epoch": 0.2947674839706557, + "grad_norm": 0.0045205047354102135, + "learning_rate": 0.001, + "loss": 0.3845, + "step": 10683 + }, + { + "epoch": 0.29479507617172007, + "grad_norm": 0.0176579337567091, + "learning_rate": 0.001, + "loss": 0.4263, + "step": 10684 + }, + { + "epoch": 0.2948226683727844, + "grad_norm": 0.011839848011732101, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 10685 + }, + { + "epoch": 0.2948502605738488, + "grad_norm": 0.01756063662469387, + "learning_rate": 0.001, + "loss": 0.4065, + "step": 10686 + }, + { + "epoch": 0.2948778527749132, + "grad_norm": 0.009184034541249275, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 10687 + }, + { + "epoch": 0.29490544497597754, + "grad_norm": 0.005395642481744289, + "learning_rate": 0.001, + "loss": 0.391, + "step": 10688 + }, + { + "epoch": 0.2949330371770419, + "grad_norm": 0.016659358516335487, + "learning_rate": 0.001, + "loss": 0.3581, + "step": 10689 + }, + { + "epoch": 0.2949606293781063, + "grad_norm": 0.01040132250636816, + "learning_rate": 0.001, + "loss": 0.3883, + "step": 10690 + }, + { + "epoch": 0.29498822157917065, + "grad_norm": 0.003244626335799694, + "learning_rate": 0.001, + "loss": 0.3769, + "step": 10691 + }, + { + "epoch": 0.295015813780235, + "grad_norm": 0.004035422578454018, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 10692 + }, + { + "epoch": 0.2950434059812994, + "grad_norm": 0.00780785595998168, + "learning_rate": 0.001, + "loss": 0.4043, + "step": 10693 + }, + { + "epoch": 0.29507099818236376, + "grad_norm": 0.010562491603195667, + "learning_rate": 0.001, + "loss": 0.4504, + "step": 10694 + }, + { + "epoch": 0.2950985903834281, + "grad_norm": 0.003506020875647664, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 10695 + }, + { + "epoch": 0.29512618258449247, + "grad_norm": 0.0033239612821489573, + "learning_rate": 0.001, + "loss": 0.4266, + "step": 10696 + }, + { + "epoch": 0.2951537747855569, + "grad_norm": 0.0031016524881124496, + "learning_rate": 0.001, + "loss": 0.445, + "step": 10697 + }, + { + "epoch": 0.29518136698662123, + "grad_norm": 0.0037100305780768394, + "learning_rate": 0.001, + "loss": 0.4156, + "step": 10698 + }, + { + "epoch": 0.2952089591876856, + "grad_norm": 0.003935493528842926, + "learning_rate": 0.001, + "loss": 0.3695, + "step": 10699 + }, + { + "epoch": 0.29523655138875, + "grad_norm": 0.0022481977939605713, + "learning_rate": 0.001, + "loss": 0.4624, + "step": 10700 + }, + { + "epoch": 0.29526414358981434, + "grad_norm": 0.003701514797285199, + "learning_rate": 0.001, + "loss": 0.4009, + "step": 10701 + }, + { + "epoch": 0.2952917357908787, + "grad_norm": 0.004181206226348877, + "learning_rate": 0.001, + "loss": 0.3765, + "step": 10702 + }, + { + "epoch": 0.2953193279919431, + "grad_norm": 0.0034305029548704624, + "learning_rate": 0.001, + "loss": 0.391, + "step": 10703 + }, + { + "epoch": 0.29534692019300746, + "grad_norm": 0.013461762107908726, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 10704 + }, + { + "epoch": 0.2953745123940718, + "grad_norm": 0.006567132659256458, + "learning_rate": 0.001, + "loss": 0.4033, + "step": 10705 + }, + { + "epoch": 0.29540210459513616, + "grad_norm": 0.006370114628225565, + "learning_rate": 0.001, + "loss": 0.3926, + "step": 10706 + }, + { + "epoch": 0.29542969679620057, + "grad_norm": 0.0061858720146119595, + "learning_rate": 0.001, + "loss": 0.4074, + "step": 10707 + }, + { + "epoch": 0.2954572889972649, + "grad_norm": 0.0025320991408079863, + "learning_rate": 0.001, + "loss": 0.4031, + "step": 10708 + }, + { + "epoch": 0.2954848811983293, + "grad_norm": 0.0028619118966162205, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 10709 + }, + { + "epoch": 0.2955124733993937, + "grad_norm": 0.0026225673500448465, + "learning_rate": 0.001, + "loss": 0.4244, + "step": 10710 + }, + { + "epoch": 0.29554006560045804, + "grad_norm": 0.0035380006302148104, + "learning_rate": 0.001, + "loss": 0.3842, + "step": 10711 + }, + { + "epoch": 0.2955676578015224, + "grad_norm": 0.0031377810519188643, + "learning_rate": 0.001, + "loss": 0.4318, + "step": 10712 + }, + { + "epoch": 0.2955952500025868, + "grad_norm": 0.0046783569268882275, + "learning_rate": 0.001, + "loss": 0.3563, + "step": 10713 + }, + { + "epoch": 0.29562284220365115, + "grad_norm": 0.0029727064538747072, + "learning_rate": 0.001, + "loss": 0.4166, + "step": 10714 + }, + { + "epoch": 0.2956504344047155, + "grad_norm": 0.0036277880426496267, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 10715 + }, + { + "epoch": 0.29567802660577985, + "grad_norm": 0.002516635926440358, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 10716 + }, + { + "epoch": 0.29570561880684426, + "grad_norm": 0.0029823719523847103, + "learning_rate": 0.001, + "loss": 0.4081, + "step": 10717 + }, + { + "epoch": 0.2957332110079086, + "grad_norm": 0.008971653878688812, + "learning_rate": 0.001, + "loss": 0.4054, + "step": 10718 + }, + { + "epoch": 0.29576080320897297, + "grad_norm": 0.0029265042394399643, + "learning_rate": 0.001, + "loss": 0.4077, + "step": 10719 + }, + { + "epoch": 0.2957883954100374, + "grad_norm": 0.003302923170849681, + "learning_rate": 0.001, + "loss": 0.4356, + "step": 10720 + }, + { + "epoch": 0.2958159876111017, + "grad_norm": 0.0031180151272565126, + "learning_rate": 0.001, + "loss": 0.4354, + "step": 10721 + }, + { + "epoch": 0.2958435798121661, + "grad_norm": 0.009599697776138783, + "learning_rate": 0.001, + "loss": 0.3762, + "step": 10722 + }, + { + "epoch": 0.2958711720132305, + "grad_norm": 0.0027971549425274134, + "learning_rate": 0.001, + "loss": 0.3832, + "step": 10723 + }, + { + "epoch": 0.29589876421429484, + "grad_norm": 0.0056414068676531315, + "learning_rate": 0.001, + "loss": 0.3877, + "step": 10724 + }, + { + "epoch": 0.2959263564153592, + "grad_norm": 0.004426025785505772, + "learning_rate": 0.001, + "loss": 0.4151, + "step": 10725 + }, + { + "epoch": 0.29595394861642355, + "grad_norm": 0.0021196724846959114, + "learning_rate": 0.001, + "loss": 0.4232, + "step": 10726 + }, + { + "epoch": 0.29598154081748795, + "grad_norm": 0.002627891954034567, + "learning_rate": 0.001, + "loss": 0.4296, + "step": 10727 + }, + { + "epoch": 0.2960091330185523, + "grad_norm": 0.006108472123742104, + "learning_rate": 0.001, + "loss": 0.3671, + "step": 10728 + }, + { + "epoch": 0.29603672521961666, + "grad_norm": 0.0034290605690330267, + "learning_rate": 0.001, + "loss": 0.3751, + "step": 10729 + }, + { + "epoch": 0.29606431742068107, + "grad_norm": 0.0039821164682507515, + "learning_rate": 0.001, + "loss": 0.3657, + "step": 10730 + }, + { + "epoch": 0.2960919096217454, + "grad_norm": 0.0026045297272503376, + "learning_rate": 0.001, + "loss": 0.4055, + "step": 10731 + }, + { + "epoch": 0.29611950182280977, + "grad_norm": 0.0025215772911906242, + "learning_rate": 0.001, + "loss": 0.3925, + "step": 10732 + }, + { + "epoch": 0.2961470940238741, + "grad_norm": 0.0033087488263845444, + "learning_rate": 0.001, + "loss": 0.3933, + "step": 10733 + }, + { + "epoch": 0.29617468622493853, + "grad_norm": 0.002935874741524458, + "learning_rate": 0.001, + "loss": 0.3783, + "step": 10734 + }, + { + "epoch": 0.2962022784260029, + "grad_norm": 0.00557227386161685, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 10735 + }, + { + "epoch": 0.29622987062706724, + "grad_norm": 0.0036772515159100294, + "learning_rate": 0.001, + "loss": 0.403, + "step": 10736 + }, + { + "epoch": 0.29625746282813165, + "grad_norm": 0.004311760421842337, + "learning_rate": 0.001, + "loss": 0.3504, + "step": 10737 + }, + { + "epoch": 0.296285055029196, + "grad_norm": 0.0028161678928881884, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 10738 + }, + { + "epoch": 0.29631264723026035, + "grad_norm": 0.00484466552734375, + "learning_rate": 0.001, + "loss": 0.3902, + "step": 10739 + }, + { + "epoch": 0.29634023943132476, + "grad_norm": 0.0023396711330860853, + "learning_rate": 0.001, + "loss": 0.4433, + "step": 10740 + }, + { + "epoch": 0.2963678316323891, + "grad_norm": 0.0026779670733958483, + "learning_rate": 0.001, + "loss": 0.3784, + "step": 10741 + }, + { + "epoch": 0.29639542383345346, + "grad_norm": 0.002949584275484085, + "learning_rate": 0.001, + "loss": 0.34, + "step": 10742 + }, + { + "epoch": 0.2964230160345178, + "grad_norm": 0.00291722291149199, + "learning_rate": 0.001, + "loss": 0.3756, + "step": 10743 + }, + { + "epoch": 0.2964506082355822, + "grad_norm": 0.00298800109885633, + "learning_rate": 0.001, + "loss": 0.3669, + "step": 10744 + }, + { + "epoch": 0.2964782004366466, + "grad_norm": 0.003106021322309971, + "learning_rate": 0.001, + "loss": 0.4179, + "step": 10745 + }, + { + "epoch": 0.29650579263771093, + "grad_norm": 0.002655046060681343, + "learning_rate": 0.001, + "loss": 0.3925, + "step": 10746 + }, + { + "epoch": 0.29653338483877534, + "grad_norm": 0.003087179269641638, + "learning_rate": 0.001, + "loss": 0.418, + "step": 10747 + }, + { + "epoch": 0.2965609770398397, + "grad_norm": 0.004109977278858423, + "learning_rate": 0.001, + "loss": 0.4055, + "step": 10748 + }, + { + "epoch": 0.29658856924090404, + "grad_norm": 0.0026674012187868357, + "learning_rate": 0.001, + "loss": 0.423, + "step": 10749 + }, + { + "epoch": 0.29661616144196845, + "grad_norm": 0.004147304221987724, + "learning_rate": 0.001, + "loss": 0.4138, + "step": 10750 + }, + { + "epoch": 0.2966437536430328, + "grad_norm": 0.0035869008861482143, + "learning_rate": 0.001, + "loss": 0.3942, + "step": 10751 + }, + { + "epoch": 0.29667134584409716, + "grad_norm": 0.0047487919218838215, + "learning_rate": 0.001, + "loss": 0.4298, + "step": 10752 + }, + { + "epoch": 0.2966989380451615, + "grad_norm": 0.004151053261011839, + "learning_rate": 0.001, + "loss": 0.4299, + "step": 10753 + }, + { + "epoch": 0.2967265302462259, + "grad_norm": 0.007814058102667332, + "learning_rate": 0.001, + "loss": 0.3843, + "step": 10754 + }, + { + "epoch": 0.29675412244729027, + "grad_norm": 0.011301838792860508, + "learning_rate": 0.001, + "loss": 0.3942, + "step": 10755 + }, + { + "epoch": 0.2967817146483546, + "grad_norm": 0.00466828653588891, + "learning_rate": 0.001, + "loss": 0.411, + "step": 10756 + }, + { + "epoch": 0.29680930684941903, + "grad_norm": 0.002339472994208336, + "learning_rate": 0.001, + "loss": 0.4028, + "step": 10757 + }, + { + "epoch": 0.2968368990504834, + "grad_norm": 0.0024191855918616056, + "learning_rate": 0.001, + "loss": 0.411, + "step": 10758 + }, + { + "epoch": 0.29686449125154774, + "grad_norm": 0.003175315447151661, + "learning_rate": 0.001, + "loss": 0.4209, + "step": 10759 + }, + { + "epoch": 0.29689208345261214, + "grad_norm": 0.010462275706231594, + "learning_rate": 0.001, + "loss": 0.3814, + "step": 10760 + }, + { + "epoch": 0.2969196756536765, + "grad_norm": 0.002783535746857524, + "learning_rate": 0.001, + "loss": 0.4265, + "step": 10761 + }, + { + "epoch": 0.29694726785474085, + "grad_norm": 0.0035620226990431547, + "learning_rate": 0.001, + "loss": 0.4312, + "step": 10762 + }, + { + "epoch": 0.2969748600558052, + "grad_norm": 0.003987728152424097, + "learning_rate": 0.001, + "loss": 0.385, + "step": 10763 + }, + { + "epoch": 0.2970024522568696, + "grad_norm": 0.0033255494199693203, + "learning_rate": 0.001, + "loss": 0.448, + "step": 10764 + }, + { + "epoch": 0.29703004445793396, + "grad_norm": 0.0032936478964984417, + "learning_rate": 0.001, + "loss": 0.3832, + "step": 10765 + }, + { + "epoch": 0.2970576366589983, + "grad_norm": 0.0029676929116249084, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 10766 + }, + { + "epoch": 0.2970852288600627, + "grad_norm": 0.0031973712611943483, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 10767 + }, + { + "epoch": 0.2971128210611271, + "grad_norm": 0.00365530326962471, + "learning_rate": 0.001, + "loss": 0.3773, + "step": 10768 + }, + { + "epoch": 0.2971404132621914, + "grad_norm": 0.0027721880469471216, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 10769 + }, + { + "epoch": 0.29716800546325584, + "grad_norm": 0.003800489939749241, + "learning_rate": 0.001, + "loss": 0.4081, + "step": 10770 + }, + { + "epoch": 0.2971955976643202, + "grad_norm": 0.0025981413200497627, + "learning_rate": 0.001, + "loss": 0.3701, + "step": 10771 + }, + { + "epoch": 0.29722318986538454, + "grad_norm": 0.002971008885651827, + "learning_rate": 0.001, + "loss": 0.3769, + "step": 10772 + }, + { + "epoch": 0.2972507820664489, + "grad_norm": 0.0038061172235757113, + "learning_rate": 0.001, + "loss": 0.3942, + "step": 10773 + }, + { + "epoch": 0.2972783742675133, + "grad_norm": 0.006774569861590862, + "learning_rate": 0.001, + "loss": 0.4156, + "step": 10774 + }, + { + "epoch": 0.29730596646857765, + "grad_norm": 0.002431475091725588, + "learning_rate": 0.001, + "loss": 0.3845, + "step": 10775 + }, + { + "epoch": 0.297333558669642, + "grad_norm": 0.0034196816850453615, + "learning_rate": 0.001, + "loss": 0.4186, + "step": 10776 + }, + { + "epoch": 0.2973611508707064, + "grad_norm": 0.0041458397172391415, + "learning_rate": 0.001, + "loss": 0.3907, + "step": 10777 + }, + { + "epoch": 0.29738874307177077, + "grad_norm": 0.0031936741434037685, + "learning_rate": 0.001, + "loss": 0.3969, + "step": 10778 + }, + { + "epoch": 0.2974163352728351, + "grad_norm": 0.0030546991620212793, + "learning_rate": 0.001, + "loss": 0.3683, + "step": 10779 + }, + { + "epoch": 0.2974439274738995, + "grad_norm": 0.0025455260183662176, + "learning_rate": 0.001, + "loss": 0.4408, + "step": 10780 + }, + { + "epoch": 0.2974715196749639, + "grad_norm": 0.002307919319719076, + "learning_rate": 0.001, + "loss": 0.4057, + "step": 10781 + }, + { + "epoch": 0.29749911187602823, + "grad_norm": 0.0028700283728539944, + "learning_rate": 0.001, + "loss": 0.3809, + "step": 10782 + }, + { + "epoch": 0.2975267040770926, + "grad_norm": 0.0026034286711364985, + "learning_rate": 0.001, + "loss": 0.4032, + "step": 10783 + }, + { + "epoch": 0.297554296278157, + "grad_norm": 0.005439944099634886, + "learning_rate": 0.001, + "loss": 0.3849, + "step": 10784 + }, + { + "epoch": 0.29758188847922135, + "grad_norm": 0.004825190175324678, + "learning_rate": 0.001, + "loss": 0.3851, + "step": 10785 + }, + { + "epoch": 0.2976094806802857, + "grad_norm": 0.0020885509438812733, + "learning_rate": 0.001, + "loss": 0.3967, + "step": 10786 + }, + { + "epoch": 0.2976370728813501, + "grad_norm": 0.0027620510663837194, + "learning_rate": 0.001, + "loss": 0.436, + "step": 10787 + }, + { + "epoch": 0.29766466508241446, + "grad_norm": 0.002639887621626258, + "learning_rate": 0.001, + "loss": 0.3654, + "step": 10788 + }, + { + "epoch": 0.2976922572834788, + "grad_norm": 0.0022529284469783306, + "learning_rate": 0.001, + "loss": 0.4011, + "step": 10789 + }, + { + "epoch": 0.2977198494845432, + "grad_norm": 0.0026258870493620634, + "learning_rate": 0.001, + "loss": 0.4102, + "step": 10790 + }, + { + "epoch": 0.2977474416856076, + "grad_norm": 0.0029656942933797836, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 10791 + }, + { + "epoch": 0.2977750338866719, + "grad_norm": 0.0020525925792753696, + "learning_rate": 0.001, + "loss": 0.4201, + "step": 10792 + }, + { + "epoch": 0.2978026260877363, + "grad_norm": 0.0030268540140241385, + "learning_rate": 0.001, + "loss": 0.4068, + "step": 10793 + }, + { + "epoch": 0.2978302182888007, + "grad_norm": 0.003296051872894168, + "learning_rate": 0.001, + "loss": 0.3897, + "step": 10794 + }, + { + "epoch": 0.29785781048986504, + "grad_norm": 0.002267766511067748, + "learning_rate": 0.001, + "loss": 0.4141, + "step": 10795 + }, + { + "epoch": 0.2978854026909294, + "grad_norm": 0.0039958711713552475, + "learning_rate": 0.001, + "loss": 0.3643, + "step": 10796 + }, + { + "epoch": 0.2979129948919938, + "grad_norm": 0.003096002619713545, + "learning_rate": 0.001, + "loss": 0.356, + "step": 10797 + }, + { + "epoch": 0.29794058709305815, + "grad_norm": 0.004919158294796944, + "learning_rate": 0.001, + "loss": 0.3677, + "step": 10798 + }, + { + "epoch": 0.2979681792941225, + "grad_norm": 0.002651217393577099, + "learning_rate": 0.001, + "loss": 0.397, + "step": 10799 + }, + { + "epoch": 0.2979957714951869, + "grad_norm": 0.0031349128112196922, + "learning_rate": 0.001, + "loss": 0.3819, + "step": 10800 + }, + { + "epoch": 0.29802336369625126, + "grad_norm": 0.0057730907574296, + "learning_rate": 0.001, + "loss": 0.3844, + "step": 10801 + }, + { + "epoch": 0.2980509558973156, + "grad_norm": 0.004934222903102636, + "learning_rate": 0.001, + "loss": 0.3653, + "step": 10802 + }, + { + "epoch": 0.29807854809837997, + "grad_norm": 0.003473837161436677, + "learning_rate": 0.001, + "loss": 0.3967, + "step": 10803 + }, + { + "epoch": 0.2981061402994444, + "grad_norm": 0.0036314206663519144, + "learning_rate": 0.001, + "loss": 0.3985, + "step": 10804 + }, + { + "epoch": 0.29813373250050873, + "grad_norm": 0.004540051333606243, + "learning_rate": 0.001, + "loss": 0.3885, + "step": 10805 + }, + { + "epoch": 0.2981613247015731, + "grad_norm": 0.002627008128911257, + "learning_rate": 0.001, + "loss": 0.4118, + "step": 10806 + }, + { + "epoch": 0.2981889169026375, + "grad_norm": 0.0027296175248920918, + "learning_rate": 0.001, + "loss": 0.3905, + "step": 10807 + }, + { + "epoch": 0.29821650910370184, + "grad_norm": 0.0026647706981748343, + "learning_rate": 0.001, + "loss": 0.4231, + "step": 10808 + }, + { + "epoch": 0.2982441013047662, + "grad_norm": 0.003241637721657753, + "learning_rate": 0.001, + "loss": 0.3983, + "step": 10809 + }, + { + "epoch": 0.2982716935058306, + "grad_norm": 0.0028002928011119366, + "learning_rate": 0.001, + "loss": 0.4148, + "step": 10810 + }, + { + "epoch": 0.29829928570689496, + "grad_norm": 0.0028025200590491295, + "learning_rate": 0.001, + "loss": 0.4006, + "step": 10811 + }, + { + "epoch": 0.2983268779079593, + "grad_norm": 0.002906485227867961, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 10812 + }, + { + "epoch": 0.29835447010902366, + "grad_norm": 0.0039535220712423325, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 10813 + }, + { + "epoch": 0.29838206231008807, + "grad_norm": 0.0033603734336793423, + "learning_rate": 0.001, + "loss": 0.3703, + "step": 10814 + }, + { + "epoch": 0.2984096545111524, + "grad_norm": 0.0031661302782595158, + "learning_rate": 0.001, + "loss": 0.3992, + "step": 10815 + }, + { + "epoch": 0.2984372467122168, + "grad_norm": 0.002344726352021098, + "learning_rate": 0.001, + "loss": 0.3914, + "step": 10816 + }, + { + "epoch": 0.2984648389132812, + "grad_norm": 0.0025039922911673784, + "learning_rate": 0.001, + "loss": 0.398, + "step": 10817 + }, + { + "epoch": 0.29849243111434554, + "grad_norm": 0.002225354313850403, + "learning_rate": 0.001, + "loss": 0.425, + "step": 10818 + }, + { + "epoch": 0.2985200233154099, + "grad_norm": 0.0027313402388244867, + "learning_rate": 0.001, + "loss": 0.3782, + "step": 10819 + }, + { + "epoch": 0.2985476155164743, + "grad_norm": 0.017960865050554276, + "learning_rate": 0.001, + "loss": 0.4091, + "step": 10820 + }, + { + "epoch": 0.29857520771753865, + "grad_norm": 0.011598210781812668, + "learning_rate": 0.001, + "loss": 0.3751, + "step": 10821 + }, + { + "epoch": 0.298602799918603, + "grad_norm": 0.00278772902674973, + "learning_rate": 0.001, + "loss": 0.4128, + "step": 10822 + }, + { + "epoch": 0.29863039211966735, + "grad_norm": 0.0026171396020799875, + "learning_rate": 0.001, + "loss": 0.4718, + "step": 10823 + }, + { + "epoch": 0.29865798432073176, + "grad_norm": 0.0038320801686495543, + "learning_rate": 0.001, + "loss": 0.3419, + "step": 10824 + }, + { + "epoch": 0.2986855765217961, + "grad_norm": 0.002889628754928708, + "learning_rate": 0.001, + "loss": 0.4093, + "step": 10825 + }, + { + "epoch": 0.29871316872286047, + "grad_norm": 0.003588633146136999, + "learning_rate": 0.001, + "loss": 0.3945, + "step": 10826 + }, + { + "epoch": 0.2987407609239249, + "grad_norm": 0.009326872415840626, + "learning_rate": 0.001, + "loss": 0.4037, + "step": 10827 + }, + { + "epoch": 0.2987683531249892, + "grad_norm": 0.003909088671207428, + "learning_rate": 0.001, + "loss": 0.407, + "step": 10828 + }, + { + "epoch": 0.2987959453260536, + "grad_norm": 0.002797875087708235, + "learning_rate": 0.001, + "loss": 0.3514, + "step": 10829 + }, + { + "epoch": 0.29882353752711793, + "grad_norm": 0.003469844814389944, + "learning_rate": 0.001, + "loss": 0.4057, + "step": 10830 + }, + { + "epoch": 0.29885112972818234, + "grad_norm": 0.0033456028904765844, + "learning_rate": 0.001, + "loss": 0.3879, + "step": 10831 + }, + { + "epoch": 0.2988787219292467, + "grad_norm": 0.034299980849027634, + "learning_rate": 0.001, + "loss": 0.3644, + "step": 10832 + }, + { + "epoch": 0.29890631413031105, + "grad_norm": 0.003411337733268738, + "learning_rate": 0.001, + "loss": 0.3837, + "step": 10833 + }, + { + "epoch": 0.29893390633137545, + "grad_norm": 0.0031848186627030373, + "learning_rate": 0.001, + "loss": 0.3845, + "step": 10834 + }, + { + "epoch": 0.2989614985324398, + "grad_norm": 0.004304205067455769, + "learning_rate": 0.001, + "loss": 0.3718, + "step": 10835 + }, + { + "epoch": 0.29898909073350416, + "grad_norm": 0.002746276091784239, + "learning_rate": 0.001, + "loss": 0.3903, + "step": 10836 + }, + { + "epoch": 0.29901668293456857, + "grad_norm": 0.002491393592208624, + "learning_rate": 0.001, + "loss": 0.4229, + "step": 10837 + }, + { + "epoch": 0.2990442751356329, + "grad_norm": 0.0038965316489338875, + "learning_rate": 0.001, + "loss": 0.3433, + "step": 10838 + }, + { + "epoch": 0.2990718673366973, + "grad_norm": 0.0028245735447853804, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 10839 + }, + { + "epoch": 0.2990994595377616, + "grad_norm": 0.0021889859344810247, + "learning_rate": 0.001, + "loss": 0.4225, + "step": 10840 + }, + { + "epoch": 0.29912705173882603, + "grad_norm": 0.0026167482137680054, + "learning_rate": 0.001, + "loss": 0.4401, + "step": 10841 + }, + { + "epoch": 0.2991546439398904, + "grad_norm": 0.0028761785943061113, + "learning_rate": 0.001, + "loss": 0.3929, + "step": 10842 + }, + { + "epoch": 0.29918223614095474, + "grad_norm": 0.0031041146721690893, + "learning_rate": 0.001, + "loss": 0.3791, + "step": 10843 + }, + { + "epoch": 0.29920982834201915, + "grad_norm": 0.002183483447879553, + "learning_rate": 0.001, + "loss": 0.4201, + "step": 10844 + }, + { + "epoch": 0.2992374205430835, + "grad_norm": 0.0021024607121944427, + "learning_rate": 0.001, + "loss": 0.3916, + "step": 10845 + }, + { + "epoch": 0.29926501274414785, + "grad_norm": 0.004469338804483414, + "learning_rate": 0.001, + "loss": 0.3661, + "step": 10846 + }, + { + "epoch": 0.29929260494521226, + "grad_norm": 0.002203304087743163, + "learning_rate": 0.001, + "loss": 0.3981, + "step": 10847 + }, + { + "epoch": 0.2993201971462766, + "grad_norm": 0.002491022925823927, + "learning_rate": 0.001, + "loss": 0.3879, + "step": 10848 + }, + { + "epoch": 0.29934778934734096, + "grad_norm": 0.005152471829205751, + "learning_rate": 0.001, + "loss": 0.3898, + "step": 10849 + }, + { + "epoch": 0.2993753815484053, + "grad_norm": 0.002667092252522707, + "learning_rate": 0.001, + "loss": 0.4063, + "step": 10850 + }, + { + "epoch": 0.2994029737494697, + "grad_norm": 0.002084558829665184, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 10851 + }, + { + "epoch": 0.2994305659505341, + "grad_norm": 0.004005138296633959, + "learning_rate": 0.001, + "loss": 0.3918, + "step": 10852 + }, + { + "epoch": 0.29945815815159843, + "grad_norm": 0.002981225959956646, + "learning_rate": 0.001, + "loss": 0.4126, + "step": 10853 + }, + { + "epoch": 0.29948575035266284, + "grad_norm": 0.002931734314188361, + "learning_rate": 0.001, + "loss": 0.3983, + "step": 10854 + }, + { + "epoch": 0.2995133425537272, + "grad_norm": 0.002261395798996091, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 10855 + }, + { + "epoch": 0.29954093475479154, + "grad_norm": 0.0047109718434512615, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 10856 + }, + { + "epoch": 0.29956852695585595, + "grad_norm": 0.003906469792127609, + "learning_rate": 0.001, + "loss": 0.4029, + "step": 10857 + }, + { + "epoch": 0.2995961191569203, + "grad_norm": 0.003466276917606592, + "learning_rate": 0.001, + "loss": 0.3953, + "step": 10858 + }, + { + "epoch": 0.29962371135798466, + "grad_norm": 0.005029084160923958, + "learning_rate": 0.001, + "loss": 0.4143, + "step": 10859 + }, + { + "epoch": 0.299651303559049, + "grad_norm": 0.0030740953516215086, + "learning_rate": 0.001, + "loss": 0.402, + "step": 10860 + }, + { + "epoch": 0.2996788957601134, + "grad_norm": 0.00288767390884459, + "learning_rate": 0.001, + "loss": 0.3996, + "step": 10861 + }, + { + "epoch": 0.29970648796117777, + "grad_norm": 0.004084376618266106, + "learning_rate": 0.001, + "loss": 0.408, + "step": 10862 + }, + { + "epoch": 0.2997340801622421, + "grad_norm": 0.008121359162032604, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 10863 + }, + { + "epoch": 0.29976167236330653, + "grad_norm": 0.0030457088723778725, + "learning_rate": 0.001, + "loss": 0.406, + "step": 10864 + }, + { + "epoch": 0.2997892645643709, + "grad_norm": 0.0024807127192616463, + "learning_rate": 0.001, + "loss": 0.4243, + "step": 10865 + }, + { + "epoch": 0.29981685676543524, + "grad_norm": 0.0041881585493683815, + "learning_rate": 0.001, + "loss": 0.3817, + "step": 10866 + }, + { + "epoch": 0.29984444896649964, + "grad_norm": 0.005836515221744776, + "learning_rate": 0.001, + "loss": 0.4146, + "step": 10867 + }, + { + "epoch": 0.299872041167564, + "grad_norm": 0.003004920668900013, + "learning_rate": 0.001, + "loss": 0.377, + "step": 10868 + }, + { + "epoch": 0.29989963336862835, + "grad_norm": 0.0062048486433923244, + "learning_rate": 0.001, + "loss": 0.3794, + "step": 10869 + }, + { + "epoch": 0.2999272255696927, + "grad_norm": 0.0027396834921091795, + "learning_rate": 0.001, + "loss": 0.375, + "step": 10870 + }, + { + "epoch": 0.2999548177707571, + "grad_norm": 0.0027734714094549417, + "learning_rate": 0.001, + "loss": 0.3982, + "step": 10871 + }, + { + "epoch": 0.29998240997182146, + "grad_norm": 0.0023340985644608736, + "learning_rate": 0.001, + "loss": 0.4122, + "step": 10872 + }, + { + "epoch": 0.3000100021728858, + "grad_norm": 0.005321372300386429, + "learning_rate": 0.001, + "loss": 0.4037, + "step": 10873 + }, + { + "epoch": 0.3000375943739502, + "grad_norm": 0.002819483634084463, + "learning_rate": 0.001, + "loss": 0.3661, + "step": 10874 + }, + { + "epoch": 0.3000651865750146, + "grad_norm": 0.0026980694383382797, + "learning_rate": 0.001, + "loss": 0.4059, + "step": 10875 + }, + { + "epoch": 0.30009277877607893, + "grad_norm": 0.003244214691221714, + "learning_rate": 0.001, + "loss": 0.3792, + "step": 10876 + }, + { + "epoch": 0.30012037097714334, + "grad_norm": 0.004197717644274235, + "learning_rate": 0.001, + "loss": 0.4162, + "step": 10877 + }, + { + "epoch": 0.3001479631782077, + "grad_norm": 0.0026915615890175104, + "learning_rate": 0.001, + "loss": 0.3901, + "step": 10878 + }, + { + "epoch": 0.30017555537927204, + "grad_norm": 0.003251213114708662, + "learning_rate": 0.001, + "loss": 0.4024, + "step": 10879 + }, + { + "epoch": 0.3002031475803364, + "grad_norm": 0.004156854934990406, + "learning_rate": 0.001, + "loss": 0.4144, + "step": 10880 + }, + { + "epoch": 0.3002307397814008, + "grad_norm": 0.0028878510929644108, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 10881 + }, + { + "epoch": 0.30025833198246515, + "grad_norm": 0.003199911443516612, + "learning_rate": 0.001, + "loss": 0.3568, + "step": 10882 + }, + { + "epoch": 0.3002859241835295, + "grad_norm": 0.002910776762291789, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 10883 + }, + { + "epoch": 0.3003135163845939, + "grad_norm": 0.004030926618725061, + "learning_rate": 0.001, + "loss": 0.394, + "step": 10884 + }, + { + "epoch": 0.30034110858565827, + "grad_norm": 0.004950513131916523, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 10885 + }, + { + "epoch": 0.3003687007867226, + "grad_norm": 0.0032172142527997494, + "learning_rate": 0.001, + "loss": 0.4344, + "step": 10886 + }, + { + "epoch": 0.30039629298778703, + "grad_norm": 0.002721422351896763, + "learning_rate": 0.001, + "loss": 0.3552, + "step": 10887 + }, + { + "epoch": 0.3004238851888514, + "grad_norm": 0.0029736789874732494, + "learning_rate": 0.001, + "loss": 0.4153, + "step": 10888 + }, + { + "epoch": 0.30045147738991573, + "grad_norm": 0.0037150525022298098, + "learning_rate": 0.001, + "loss": 0.4114, + "step": 10889 + }, + { + "epoch": 0.3004790695909801, + "grad_norm": 0.002336825244128704, + "learning_rate": 0.001, + "loss": 0.3982, + "step": 10890 + }, + { + "epoch": 0.3005066617920445, + "grad_norm": 0.0033320027869194746, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 10891 + }, + { + "epoch": 0.30053425399310885, + "grad_norm": 0.002994644921272993, + "learning_rate": 0.001, + "loss": 0.3913, + "step": 10892 + }, + { + "epoch": 0.3005618461941732, + "grad_norm": 0.002780053298920393, + "learning_rate": 0.001, + "loss": 0.4264, + "step": 10893 + }, + { + "epoch": 0.3005894383952376, + "grad_norm": 0.0025280967820435762, + "learning_rate": 0.001, + "loss": 0.4108, + "step": 10894 + }, + { + "epoch": 0.30061703059630196, + "grad_norm": 0.003201976651325822, + "learning_rate": 0.001, + "loss": 0.4263, + "step": 10895 + }, + { + "epoch": 0.3006446227973663, + "grad_norm": 0.0020535339135676622, + "learning_rate": 0.001, + "loss": 0.4297, + "step": 10896 + }, + { + "epoch": 0.3006722149984307, + "grad_norm": 0.0021851370111107826, + "learning_rate": 0.001, + "loss": 0.4011, + "step": 10897 + }, + { + "epoch": 0.3006998071994951, + "grad_norm": 0.003310294123366475, + "learning_rate": 0.001, + "loss": 0.3851, + "step": 10898 + }, + { + "epoch": 0.3007273994005594, + "grad_norm": 0.002345843706279993, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 10899 + }, + { + "epoch": 0.3007549916016238, + "grad_norm": 0.004679036792367697, + "learning_rate": 0.001, + "loss": 0.389, + "step": 10900 + }, + { + "epoch": 0.3007825838026882, + "grad_norm": 0.0026619029231369495, + "learning_rate": 0.001, + "loss": 0.3923, + "step": 10901 + }, + { + "epoch": 0.30081017600375254, + "grad_norm": 0.003363067051395774, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 10902 + }, + { + "epoch": 0.3008377682048169, + "grad_norm": 0.0038764209020882845, + "learning_rate": 0.001, + "loss": 0.3845, + "step": 10903 + }, + { + "epoch": 0.3008653604058813, + "grad_norm": 0.005055352114140987, + "learning_rate": 0.001, + "loss": 0.382, + "step": 10904 + }, + { + "epoch": 0.30089295260694565, + "grad_norm": 0.0033133430406451225, + "learning_rate": 0.001, + "loss": 0.3822, + "step": 10905 + }, + { + "epoch": 0.30092054480801, + "grad_norm": 0.002643629675731063, + "learning_rate": 0.001, + "loss": 0.4331, + "step": 10906 + }, + { + "epoch": 0.3009481370090744, + "grad_norm": 0.010275435633957386, + "learning_rate": 0.001, + "loss": 0.42, + "step": 10907 + }, + { + "epoch": 0.30097572921013876, + "grad_norm": 0.0022841233294457197, + "learning_rate": 0.001, + "loss": 0.4112, + "step": 10908 + }, + { + "epoch": 0.3010033214112031, + "grad_norm": 0.00588460685685277, + "learning_rate": 0.001, + "loss": 0.4, + "step": 10909 + }, + { + "epoch": 0.30103091361226747, + "grad_norm": 0.003090951358899474, + "learning_rate": 0.001, + "loss": 0.3985, + "step": 10910 + }, + { + "epoch": 0.3010585058133319, + "grad_norm": 0.011329708620905876, + "learning_rate": 0.001, + "loss": 0.4228, + "step": 10911 + }, + { + "epoch": 0.30108609801439623, + "grad_norm": 0.002050732960924506, + "learning_rate": 0.001, + "loss": 0.4334, + "step": 10912 + }, + { + "epoch": 0.3011136902154606, + "grad_norm": 0.005159840919077396, + "learning_rate": 0.001, + "loss": 0.3852, + "step": 10913 + }, + { + "epoch": 0.301141282416525, + "grad_norm": 0.002075582044199109, + "learning_rate": 0.001, + "loss": 0.3713, + "step": 10914 + }, + { + "epoch": 0.30116887461758934, + "grad_norm": 0.0043141464702785015, + "learning_rate": 0.001, + "loss": 0.3896, + "step": 10915 + }, + { + "epoch": 0.3011964668186537, + "grad_norm": 0.0034079302567988634, + "learning_rate": 0.001, + "loss": 0.4032, + "step": 10916 + }, + { + "epoch": 0.30122405901971805, + "grad_norm": 0.0023484264966100454, + "learning_rate": 0.001, + "loss": 0.3907, + "step": 10917 + }, + { + "epoch": 0.30125165122078246, + "grad_norm": 0.0037031807005405426, + "learning_rate": 0.001, + "loss": 0.4304, + "step": 10918 + }, + { + "epoch": 0.3012792434218468, + "grad_norm": 0.0023840791545808315, + "learning_rate": 0.001, + "loss": 0.4139, + "step": 10919 + }, + { + "epoch": 0.30130683562291116, + "grad_norm": 0.002175979781895876, + "learning_rate": 0.001, + "loss": 0.4065, + "step": 10920 + }, + { + "epoch": 0.30133442782397557, + "grad_norm": 0.0025980628561228514, + "learning_rate": 0.001, + "loss": 0.4119, + "step": 10921 + }, + { + "epoch": 0.3013620200250399, + "grad_norm": 0.012000390328466892, + "learning_rate": 0.001, + "loss": 0.3716, + "step": 10922 + }, + { + "epoch": 0.3013896122261043, + "grad_norm": 0.002536133164539933, + "learning_rate": 0.001, + "loss": 0.372, + "step": 10923 + }, + { + "epoch": 0.3014172044271687, + "grad_norm": 0.002884393557906151, + "learning_rate": 0.001, + "loss": 0.377, + "step": 10924 + }, + { + "epoch": 0.30144479662823304, + "grad_norm": 0.002150443848222494, + "learning_rate": 0.001, + "loss": 0.4188, + "step": 10925 + }, + { + "epoch": 0.3014723888292974, + "grad_norm": 0.0027890305500477552, + "learning_rate": 0.001, + "loss": 0.3784, + "step": 10926 + }, + { + "epoch": 0.30149998103036174, + "grad_norm": 0.0049143158830702305, + "learning_rate": 0.001, + "loss": 0.3985, + "step": 10927 + }, + { + "epoch": 0.30152757323142615, + "grad_norm": 0.003542589023709297, + "learning_rate": 0.001, + "loss": 0.3984, + "step": 10928 + }, + { + "epoch": 0.3015551654324905, + "grad_norm": 0.002202375093474984, + "learning_rate": 0.001, + "loss": 0.4332, + "step": 10929 + }, + { + "epoch": 0.30158275763355485, + "grad_norm": 0.0022870609536767006, + "learning_rate": 0.001, + "loss": 0.4504, + "step": 10930 + }, + { + "epoch": 0.30161034983461926, + "grad_norm": 0.0055159348994493484, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 10931 + }, + { + "epoch": 0.3016379420356836, + "grad_norm": 0.03571107238531113, + "learning_rate": 0.001, + "loss": 0.3786, + "step": 10932 + }, + { + "epoch": 0.30166553423674797, + "grad_norm": 0.004229827784001827, + "learning_rate": 0.001, + "loss": 0.3811, + "step": 10933 + }, + { + "epoch": 0.3016931264378124, + "grad_norm": 0.0025944889057427645, + "learning_rate": 0.001, + "loss": 0.4141, + "step": 10934 + }, + { + "epoch": 0.30172071863887673, + "grad_norm": 0.007089374121278524, + "learning_rate": 0.001, + "loss": 0.403, + "step": 10935 + }, + { + "epoch": 0.3017483108399411, + "grad_norm": 0.002950213151052594, + "learning_rate": 0.001, + "loss": 0.3948, + "step": 10936 + }, + { + "epoch": 0.30177590304100543, + "grad_norm": 0.006799718365073204, + "learning_rate": 0.001, + "loss": 0.4585, + "step": 10937 + }, + { + "epoch": 0.30180349524206984, + "grad_norm": 0.004761769436299801, + "learning_rate": 0.001, + "loss": 0.4065, + "step": 10938 + }, + { + "epoch": 0.3018310874431342, + "grad_norm": 0.004374027717858553, + "learning_rate": 0.001, + "loss": 0.3667, + "step": 10939 + }, + { + "epoch": 0.30185867964419855, + "grad_norm": 0.007258753292262554, + "learning_rate": 0.001, + "loss": 0.3262, + "step": 10940 + }, + { + "epoch": 0.30188627184526295, + "grad_norm": 0.0028893048875033855, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 10941 + }, + { + "epoch": 0.3019138640463273, + "grad_norm": 0.003643547184765339, + "learning_rate": 0.001, + "loss": 0.3969, + "step": 10942 + }, + { + "epoch": 0.30194145624739166, + "grad_norm": 0.0045185210183262825, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 10943 + }, + { + "epoch": 0.30196904844845607, + "grad_norm": 0.0024632923305034637, + "learning_rate": 0.001, + "loss": 0.4148, + "step": 10944 + }, + { + "epoch": 0.3019966406495204, + "grad_norm": 0.0027886577881872654, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 10945 + }, + { + "epoch": 0.3020242328505848, + "grad_norm": 0.0049835750833153725, + "learning_rate": 0.001, + "loss": 0.3886, + "step": 10946 + }, + { + "epoch": 0.3020518250516491, + "grad_norm": 0.003602303098887205, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 10947 + }, + { + "epoch": 0.30207941725271353, + "grad_norm": 0.0027262901421636343, + "learning_rate": 0.001, + "loss": 0.4128, + "step": 10948 + }, + { + "epoch": 0.3021070094537779, + "grad_norm": 0.0026782192289829254, + "learning_rate": 0.001, + "loss": 0.4065, + "step": 10949 + }, + { + "epoch": 0.30213460165484224, + "grad_norm": 0.0033344633411616087, + "learning_rate": 0.001, + "loss": 0.37, + "step": 10950 + }, + { + "epoch": 0.30216219385590665, + "grad_norm": 0.007024961058050394, + "learning_rate": 0.001, + "loss": 0.4372, + "step": 10951 + }, + { + "epoch": 0.302189786056971, + "grad_norm": 0.0027252105064690113, + "learning_rate": 0.001, + "loss": 0.435, + "step": 10952 + }, + { + "epoch": 0.30221737825803535, + "grad_norm": 0.0036616462748497725, + "learning_rate": 0.001, + "loss": 0.3623, + "step": 10953 + }, + { + "epoch": 0.30224497045909976, + "grad_norm": 0.0024184328503906727, + "learning_rate": 0.001, + "loss": 0.3791, + "step": 10954 + }, + { + "epoch": 0.3022725626601641, + "grad_norm": 0.002676165895536542, + "learning_rate": 0.001, + "loss": 0.3811, + "step": 10955 + }, + { + "epoch": 0.30230015486122847, + "grad_norm": 0.00226954510435462, + "learning_rate": 0.001, + "loss": 0.3708, + "step": 10956 + }, + { + "epoch": 0.3023277470622928, + "grad_norm": 0.002427699277177453, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 10957 + }, + { + "epoch": 0.3023553392633572, + "grad_norm": 0.002578388201072812, + "learning_rate": 0.001, + "loss": 0.3645, + "step": 10958 + }, + { + "epoch": 0.3023829314644216, + "grad_norm": 0.0028099005576223135, + "learning_rate": 0.001, + "loss": 0.3625, + "step": 10959 + }, + { + "epoch": 0.30241052366548593, + "grad_norm": 0.0038160127587616444, + "learning_rate": 0.001, + "loss": 0.4079, + "step": 10960 + }, + { + "epoch": 0.30243811586655034, + "grad_norm": 0.004297412466257811, + "learning_rate": 0.001, + "loss": 0.3963, + "step": 10961 + }, + { + "epoch": 0.3024657080676147, + "grad_norm": 0.004003297537565231, + "learning_rate": 0.001, + "loss": 0.3661, + "step": 10962 + }, + { + "epoch": 0.30249330026867904, + "grad_norm": 0.003333853790536523, + "learning_rate": 0.001, + "loss": 0.3919, + "step": 10963 + }, + { + "epoch": 0.30252089246974345, + "grad_norm": 0.007552805822342634, + "learning_rate": 0.001, + "loss": 0.3821, + "step": 10964 + }, + { + "epoch": 0.3025484846708078, + "grad_norm": 0.002244778210297227, + "learning_rate": 0.001, + "loss": 0.4141, + "step": 10965 + }, + { + "epoch": 0.30257607687187216, + "grad_norm": 0.0028706330340355635, + "learning_rate": 0.001, + "loss": 0.4069, + "step": 10966 + }, + { + "epoch": 0.3026036690729365, + "grad_norm": 0.002905101515352726, + "learning_rate": 0.001, + "loss": 0.3895, + "step": 10967 + }, + { + "epoch": 0.3026312612740009, + "grad_norm": 0.0025956053286790848, + "learning_rate": 0.001, + "loss": 0.3999, + "step": 10968 + }, + { + "epoch": 0.30265885347506527, + "grad_norm": 0.0038647784385830164, + "learning_rate": 0.001, + "loss": 0.3859, + "step": 10969 + }, + { + "epoch": 0.3026864456761296, + "grad_norm": 0.004919635597616434, + "learning_rate": 0.001, + "loss": 0.4256, + "step": 10970 + }, + { + "epoch": 0.30271403787719403, + "grad_norm": 0.0035008483100682497, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 10971 + }, + { + "epoch": 0.3027416300782584, + "grad_norm": 0.0027946606278419495, + "learning_rate": 0.001, + "loss": 0.4249, + "step": 10972 + }, + { + "epoch": 0.30276922227932274, + "grad_norm": 0.0031994767487049103, + "learning_rate": 0.001, + "loss": 0.3945, + "step": 10973 + }, + { + "epoch": 0.30279681448038714, + "grad_norm": 0.006230973172932863, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 10974 + }, + { + "epoch": 0.3028244066814515, + "grad_norm": 0.003084323601797223, + "learning_rate": 0.001, + "loss": 0.4212, + "step": 10975 + }, + { + "epoch": 0.30285199888251585, + "grad_norm": 0.0022413854021579027, + "learning_rate": 0.001, + "loss": 0.4093, + "step": 10976 + }, + { + "epoch": 0.3028795910835802, + "grad_norm": 0.0032844298984855413, + "learning_rate": 0.001, + "loss": 0.4187, + "step": 10977 + }, + { + "epoch": 0.3029071832846446, + "grad_norm": 0.002548948395997286, + "learning_rate": 0.001, + "loss": 0.395, + "step": 10978 + }, + { + "epoch": 0.30293477548570896, + "grad_norm": 0.0052003706805408, + "learning_rate": 0.001, + "loss": 0.4372, + "step": 10979 + }, + { + "epoch": 0.3029623676867733, + "grad_norm": 0.003729903372004628, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 10980 + }, + { + "epoch": 0.3029899598878377, + "grad_norm": 0.0028468905948102474, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 10981 + }, + { + "epoch": 0.3030175520889021, + "grad_norm": 0.0020310785621404648, + "learning_rate": 0.001, + "loss": 0.4303, + "step": 10982 + }, + { + "epoch": 0.30304514428996643, + "grad_norm": 0.002875153673812747, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 10983 + }, + { + "epoch": 0.30307273649103084, + "grad_norm": 0.003797962563112378, + "learning_rate": 0.001, + "loss": 0.4152, + "step": 10984 + }, + { + "epoch": 0.3031003286920952, + "grad_norm": 0.003318538423627615, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 10985 + }, + { + "epoch": 0.30312792089315954, + "grad_norm": 0.0027376478537917137, + "learning_rate": 0.001, + "loss": 0.3777, + "step": 10986 + }, + { + "epoch": 0.3031555130942239, + "grad_norm": 0.0021703215315937996, + "learning_rate": 0.001, + "loss": 0.4027, + "step": 10987 + }, + { + "epoch": 0.3031831052952883, + "grad_norm": 0.00254058837890625, + "learning_rate": 0.001, + "loss": 0.3964, + "step": 10988 + }, + { + "epoch": 0.30321069749635265, + "grad_norm": 0.0031664727721363306, + "learning_rate": 0.001, + "loss": 0.3666, + "step": 10989 + }, + { + "epoch": 0.303238289697417, + "grad_norm": 0.003056199988350272, + "learning_rate": 0.001, + "loss": 0.3985, + "step": 10990 + }, + { + "epoch": 0.3032658818984814, + "grad_norm": 0.002997304080054164, + "learning_rate": 0.001, + "loss": 0.4234, + "step": 10991 + }, + { + "epoch": 0.30329347409954577, + "grad_norm": 0.0028167872224003077, + "learning_rate": 0.001, + "loss": 0.4345, + "step": 10992 + }, + { + "epoch": 0.3033210663006101, + "grad_norm": 0.0038003227673470974, + "learning_rate": 0.001, + "loss": 0.4154, + "step": 10993 + }, + { + "epoch": 0.30334865850167453, + "grad_norm": 0.010225886479020119, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 10994 + }, + { + "epoch": 0.3033762507027389, + "grad_norm": 0.004211194813251495, + "learning_rate": 0.001, + "loss": 0.3673, + "step": 10995 + }, + { + "epoch": 0.30340384290380323, + "grad_norm": 0.0034563657827675343, + "learning_rate": 0.001, + "loss": 0.4128, + "step": 10996 + }, + { + "epoch": 0.3034314351048676, + "grad_norm": 0.0032761215697973967, + "learning_rate": 0.001, + "loss": 0.3444, + "step": 10997 + }, + { + "epoch": 0.303459027305932, + "grad_norm": 0.0042040105909109116, + "learning_rate": 0.001, + "loss": 0.3889, + "step": 10998 + }, + { + "epoch": 0.30348661950699635, + "grad_norm": 0.005281214602291584, + "learning_rate": 0.001, + "loss": 0.4308, + "step": 10999 + }, + { + "epoch": 0.3035142117080607, + "grad_norm": 0.0027140434831380844, + "learning_rate": 0.001, + "loss": 0.414, + "step": 11000 + }, + { + "epoch": 0.3035142117080607, + "eval_runtime": 23.9221, + "eval_samples_per_second": 1.338, + "eval_steps_per_second": 0.167, + "step": 11000 + }, + { + "epoch": 0.3035418039091251, + "grad_norm": 0.0059446897357702255, + "learning_rate": 0.001, + "loss": 0.3585, + "step": 11001 + }, + { + "epoch": 0.30356939611018946, + "grad_norm": 0.002668326487764716, + "learning_rate": 0.001, + "loss": 0.3736, + "step": 11002 + }, + { + "epoch": 0.3035969883112538, + "grad_norm": 0.002568204887211323, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 11003 + }, + { + "epoch": 0.3036245805123182, + "grad_norm": 0.002712225541472435, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 11004 + }, + { + "epoch": 0.3036521727133826, + "grad_norm": 0.004501170478761196, + "learning_rate": 0.001, + "loss": 0.3691, + "step": 11005 + }, + { + "epoch": 0.3036797649144469, + "grad_norm": 0.005471101030707359, + "learning_rate": 0.001, + "loss": 0.3584, + "step": 11006 + }, + { + "epoch": 0.3037073571155113, + "grad_norm": 0.0026964505668729544, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 11007 + }, + { + "epoch": 0.3037349493165757, + "grad_norm": 0.003707374446094036, + "learning_rate": 0.001, + "loss": 0.3823, + "step": 11008 + }, + { + "epoch": 0.30376254151764004, + "grad_norm": 0.012931153178215027, + "learning_rate": 0.001, + "loss": 0.4105, + "step": 11009 + }, + { + "epoch": 0.3037901337187044, + "grad_norm": 0.009963351301848888, + "learning_rate": 0.001, + "loss": 0.3633, + "step": 11010 + }, + { + "epoch": 0.3038177259197688, + "grad_norm": 0.0042528132908046246, + "learning_rate": 0.001, + "loss": 0.4651, + "step": 11011 + }, + { + "epoch": 0.30384531812083315, + "grad_norm": 0.004563808441162109, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 11012 + }, + { + "epoch": 0.3038729103218975, + "grad_norm": 0.0027470001950860023, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 11013 + }, + { + "epoch": 0.30390050252296186, + "grad_norm": 0.0026821244973689318, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 11014 + }, + { + "epoch": 0.30392809472402627, + "grad_norm": 0.002528084209188819, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 11015 + }, + { + "epoch": 0.3039556869250906, + "grad_norm": 0.005667414516210556, + "learning_rate": 0.001, + "loss": 0.3563, + "step": 11016 + }, + { + "epoch": 0.30398327912615497, + "grad_norm": 0.0031073635909706354, + "learning_rate": 0.001, + "loss": 0.3783, + "step": 11017 + }, + { + "epoch": 0.3040108713272194, + "grad_norm": 0.0026901857927441597, + "learning_rate": 0.001, + "loss": 0.3659, + "step": 11018 + }, + { + "epoch": 0.30403846352828373, + "grad_norm": 0.0030291874427348375, + "learning_rate": 0.001, + "loss": 0.373, + "step": 11019 + }, + { + "epoch": 0.3040660557293481, + "grad_norm": 0.0024688898120075464, + "learning_rate": 0.001, + "loss": 0.3581, + "step": 11020 + }, + { + "epoch": 0.3040936479304125, + "grad_norm": 0.0029477437492460012, + "learning_rate": 0.001, + "loss": 0.3858, + "step": 11021 + }, + { + "epoch": 0.30412124013147684, + "grad_norm": 0.0034905215725302696, + "learning_rate": 0.001, + "loss": 0.4062, + "step": 11022 + }, + { + "epoch": 0.3041488323325412, + "grad_norm": 0.0028976472094655037, + "learning_rate": 0.001, + "loss": 0.4163, + "step": 11023 + }, + { + "epoch": 0.30417642453360555, + "grad_norm": 0.002226645825430751, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 11024 + }, + { + "epoch": 0.30420401673466996, + "grad_norm": 0.00522614223882556, + "learning_rate": 0.001, + "loss": 0.3829, + "step": 11025 + }, + { + "epoch": 0.3042316089357343, + "grad_norm": 0.004471330437809229, + "learning_rate": 0.001, + "loss": 0.4359, + "step": 11026 + }, + { + "epoch": 0.30425920113679866, + "grad_norm": 0.0025410952512174845, + "learning_rate": 0.001, + "loss": 0.4367, + "step": 11027 + }, + { + "epoch": 0.30428679333786307, + "grad_norm": 0.0039034727960824966, + "learning_rate": 0.001, + "loss": 0.3898, + "step": 11028 + }, + { + "epoch": 0.3043143855389274, + "grad_norm": 0.0032912297174334526, + "learning_rate": 0.001, + "loss": 0.4122, + "step": 11029 + }, + { + "epoch": 0.3043419777399918, + "grad_norm": 0.009197532199323177, + "learning_rate": 0.001, + "loss": 0.4057, + "step": 11030 + }, + { + "epoch": 0.3043695699410562, + "grad_norm": 0.003327523358166218, + "learning_rate": 0.001, + "loss": 0.3578, + "step": 11031 + }, + { + "epoch": 0.30439716214212054, + "grad_norm": 0.0036153208930045366, + "learning_rate": 0.001, + "loss": 0.3604, + "step": 11032 + }, + { + "epoch": 0.3044247543431849, + "grad_norm": 0.0028802400920540094, + "learning_rate": 0.001, + "loss": 0.408, + "step": 11033 + }, + { + "epoch": 0.30445234654424924, + "grad_norm": 0.0023775347508490086, + "learning_rate": 0.001, + "loss": 0.4018, + "step": 11034 + }, + { + "epoch": 0.30447993874531365, + "grad_norm": 0.007655430119484663, + "learning_rate": 0.001, + "loss": 0.3631, + "step": 11035 + }, + { + "epoch": 0.304507530946378, + "grad_norm": 0.0038870847783982754, + "learning_rate": 0.001, + "loss": 0.4255, + "step": 11036 + }, + { + "epoch": 0.30453512314744235, + "grad_norm": 0.010876862332224846, + "learning_rate": 0.001, + "loss": 0.3784, + "step": 11037 + }, + { + "epoch": 0.30456271534850676, + "grad_norm": 0.01851513236761093, + "learning_rate": 0.001, + "loss": 0.4014, + "step": 11038 + }, + { + "epoch": 0.3045903075495711, + "grad_norm": 0.0071109263226389885, + "learning_rate": 0.001, + "loss": 0.3756, + "step": 11039 + }, + { + "epoch": 0.30461789975063547, + "grad_norm": 0.002972143003717065, + "learning_rate": 0.001, + "loss": 0.3919, + "step": 11040 + }, + { + "epoch": 0.3046454919516999, + "grad_norm": 0.0042697228491306305, + "learning_rate": 0.001, + "loss": 0.3751, + "step": 11041 + }, + { + "epoch": 0.30467308415276423, + "grad_norm": 0.0034229194279760122, + "learning_rate": 0.001, + "loss": 0.4196, + "step": 11042 + }, + { + "epoch": 0.3047006763538286, + "grad_norm": 0.0030748506542295218, + "learning_rate": 0.001, + "loss": 0.371, + "step": 11043 + }, + { + "epoch": 0.30472826855489293, + "grad_norm": 0.003974152263253927, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 11044 + }, + { + "epoch": 0.30475586075595734, + "grad_norm": 0.0025529295671731234, + "learning_rate": 0.001, + "loss": 0.3786, + "step": 11045 + }, + { + "epoch": 0.3047834529570217, + "grad_norm": 0.002749155042693019, + "learning_rate": 0.001, + "loss": 0.4034, + "step": 11046 + }, + { + "epoch": 0.30481104515808605, + "grad_norm": 0.004938180558383465, + "learning_rate": 0.001, + "loss": 0.3687, + "step": 11047 + }, + { + "epoch": 0.30483863735915046, + "grad_norm": 0.0023326098453253508, + "learning_rate": 0.001, + "loss": 0.4322, + "step": 11048 + }, + { + "epoch": 0.3048662295602148, + "grad_norm": 0.0040608481504023075, + "learning_rate": 0.001, + "loss": 0.409, + "step": 11049 + }, + { + "epoch": 0.30489382176127916, + "grad_norm": 0.004212150815874338, + "learning_rate": 0.001, + "loss": 0.3593, + "step": 11050 + }, + { + "epoch": 0.30492141396234357, + "grad_norm": 0.0022898244205862284, + "learning_rate": 0.001, + "loss": 0.4135, + "step": 11051 + }, + { + "epoch": 0.3049490061634079, + "grad_norm": 0.009196506813168526, + "learning_rate": 0.001, + "loss": 0.4093, + "step": 11052 + }, + { + "epoch": 0.3049765983644723, + "grad_norm": 0.0035148763563483953, + "learning_rate": 0.001, + "loss": 0.4245, + "step": 11053 + }, + { + "epoch": 0.3050041905655366, + "grad_norm": 0.002594373654574156, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 11054 + }, + { + "epoch": 0.30503178276660103, + "grad_norm": 0.0025384812615811825, + "learning_rate": 0.001, + "loss": 0.3966, + "step": 11055 + }, + { + "epoch": 0.3050593749676654, + "grad_norm": 0.0027979982551187277, + "learning_rate": 0.001, + "loss": 0.3702, + "step": 11056 + }, + { + "epoch": 0.30508696716872974, + "grad_norm": 0.0027982275933027267, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 11057 + }, + { + "epoch": 0.30511455936979415, + "grad_norm": 0.002686277497559786, + "learning_rate": 0.001, + "loss": 0.3885, + "step": 11058 + }, + { + "epoch": 0.3051421515708585, + "grad_norm": 0.0058397226966917515, + "learning_rate": 0.001, + "loss": 0.4108, + "step": 11059 + }, + { + "epoch": 0.30516974377192285, + "grad_norm": 0.0036498201079666615, + "learning_rate": 0.001, + "loss": 0.4319, + "step": 11060 + }, + { + "epoch": 0.30519733597298726, + "grad_norm": 0.0023296461440622807, + "learning_rate": 0.001, + "loss": 0.4174, + "step": 11061 + }, + { + "epoch": 0.3052249281740516, + "grad_norm": 0.024197006598114967, + "learning_rate": 0.001, + "loss": 0.3755, + "step": 11062 + }, + { + "epoch": 0.30525252037511597, + "grad_norm": 0.005508562549948692, + "learning_rate": 0.001, + "loss": 0.402, + "step": 11063 + }, + { + "epoch": 0.3052801125761803, + "grad_norm": 0.009980360977351665, + "learning_rate": 0.001, + "loss": 0.4399, + "step": 11064 + }, + { + "epoch": 0.3053077047772447, + "grad_norm": 0.006406312808394432, + "learning_rate": 0.001, + "loss": 0.4074, + "step": 11065 + }, + { + "epoch": 0.3053352969783091, + "grad_norm": 0.00675487145781517, + "learning_rate": 0.001, + "loss": 0.3866, + "step": 11066 + }, + { + "epoch": 0.30536288917937343, + "grad_norm": 0.00432139215990901, + "learning_rate": 0.001, + "loss": 0.4053, + "step": 11067 + }, + { + "epoch": 0.30539048138043784, + "grad_norm": 0.004939731676131487, + "learning_rate": 0.001, + "loss": 0.3877, + "step": 11068 + }, + { + "epoch": 0.3054180735815022, + "grad_norm": 0.048318974673748016, + "learning_rate": 0.001, + "loss": 0.3609, + "step": 11069 + }, + { + "epoch": 0.30544566578256654, + "grad_norm": 0.002895533572882414, + "learning_rate": 0.001, + "loss": 0.4055, + "step": 11070 + }, + { + "epoch": 0.30547325798363095, + "grad_norm": 0.0036742573138326406, + "learning_rate": 0.001, + "loss": 0.3945, + "step": 11071 + }, + { + "epoch": 0.3055008501846953, + "grad_norm": 0.0037072747945785522, + "learning_rate": 0.001, + "loss": 0.4032, + "step": 11072 + }, + { + "epoch": 0.30552844238575966, + "grad_norm": 0.003319724928587675, + "learning_rate": 0.001, + "loss": 0.4217, + "step": 11073 + }, + { + "epoch": 0.305556034586824, + "grad_norm": 0.002916824072599411, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 11074 + }, + { + "epoch": 0.3055836267878884, + "grad_norm": 0.00400786055251956, + "learning_rate": 0.001, + "loss": 0.4225, + "step": 11075 + }, + { + "epoch": 0.30561121898895277, + "grad_norm": 0.003279311815276742, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 11076 + }, + { + "epoch": 0.3056388111900171, + "grad_norm": 0.003970045130699873, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 11077 + }, + { + "epoch": 0.30566640339108153, + "grad_norm": 0.0034616603516042233, + "learning_rate": 0.001, + "loss": 0.386, + "step": 11078 + }, + { + "epoch": 0.3056939955921459, + "grad_norm": 0.003935270942747593, + "learning_rate": 0.001, + "loss": 0.3998, + "step": 11079 + }, + { + "epoch": 0.30572158779321024, + "grad_norm": 0.005806989502161741, + "learning_rate": 0.001, + "loss": 0.3649, + "step": 11080 + }, + { + "epoch": 0.30574917999427464, + "grad_norm": 0.004018415231257677, + "learning_rate": 0.001, + "loss": 0.3918, + "step": 11081 + }, + { + "epoch": 0.305776772195339, + "grad_norm": 0.0026464585680514574, + "learning_rate": 0.001, + "loss": 0.3754, + "step": 11082 + }, + { + "epoch": 0.30580436439640335, + "grad_norm": 0.006566127296537161, + "learning_rate": 0.001, + "loss": 0.378, + "step": 11083 + }, + { + "epoch": 0.3058319565974677, + "grad_norm": 0.0035378343891352415, + "learning_rate": 0.001, + "loss": 0.3585, + "step": 11084 + }, + { + "epoch": 0.3058595487985321, + "grad_norm": 0.005152801051735878, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 11085 + }, + { + "epoch": 0.30588714099959646, + "grad_norm": 0.0025191842578351498, + "learning_rate": 0.001, + "loss": 0.4112, + "step": 11086 + }, + { + "epoch": 0.3059147332006608, + "grad_norm": 0.002810958307236433, + "learning_rate": 0.001, + "loss": 0.3963, + "step": 11087 + }, + { + "epoch": 0.3059423254017252, + "grad_norm": 0.003942539449781179, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 11088 + }, + { + "epoch": 0.3059699176027896, + "grad_norm": 0.0032198168337345123, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 11089 + }, + { + "epoch": 0.30599750980385393, + "grad_norm": 0.004471073392778635, + "learning_rate": 0.001, + "loss": 0.3834, + "step": 11090 + }, + { + "epoch": 0.30602510200491834, + "grad_norm": 0.0025380223523825407, + "learning_rate": 0.001, + "loss": 0.4236, + "step": 11091 + }, + { + "epoch": 0.3060526942059827, + "grad_norm": 0.003936004359275103, + "learning_rate": 0.001, + "loss": 0.3769, + "step": 11092 + }, + { + "epoch": 0.30608028640704704, + "grad_norm": 0.0035589123144745827, + "learning_rate": 0.001, + "loss": 0.3807, + "step": 11093 + }, + { + "epoch": 0.3061078786081114, + "grad_norm": 0.003829971654340625, + "learning_rate": 0.001, + "loss": 0.4002, + "step": 11094 + }, + { + "epoch": 0.3061354708091758, + "grad_norm": 0.0033564860932528973, + "learning_rate": 0.001, + "loss": 0.3849, + "step": 11095 + }, + { + "epoch": 0.30616306301024016, + "grad_norm": 0.006199992261826992, + "learning_rate": 0.001, + "loss": 0.4171, + "step": 11096 + }, + { + "epoch": 0.3061906552113045, + "grad_norm": 0.0038411279674619436, + "learning_rate": 0.001, + "loss": 0.4201, + "step": 11097 + }, + { + "epoch": 0.3062182474123689, + "grad_norm": 0.0028849910013377666, + "learning_rate": 0.001, + "loss": 0.4105, + "step": 11098 + }, + { + "epoch": 0.30624583961343327, + "grad_norm": 0.005419904366135597, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 11099 + }, + { + "epoch": 0.3062734318144976, + "grad_norm": 0.004621229134500027, + "learning_rate": 0.001, + "loss": 0.4035, + "step": 11100 + }, + { + "epoch": 0.30630102401556203, + "grad_norm": 0.004461618140339851, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 11101 + }, + { + "epoch": 0.3063286162166264, + "grad_norm": 0.0031210975721478462, + "learning_rate": 0.001, + "loss": 0.402, + "step": 11102 + }, + { + "epoch": 0.30635620841769073, + "grad_norm": 0.0036159909795969725, + "learning_rate": 0.001, + "loss": 0.3989, + "step": 11103 + }, + { + "epoch": 0.3063838006187551, + "grad_norm": 0.0030616209842264652, + "learning_rate": 0.001, + "loss": 0.3793, + "step": 11104 + }, + { + "epoch": 0.3064113928198195, + "grad_norm": 0.004985187668353319, + "learning_rate": 0.001, + "loss": 0.3857, + "step": 11105 + }, + { + "epoch": 0.30643898502088385, + "grad_norm": 0.0028018758166581392, + "learning_rate": 0.001, + "loss": 0.4123, + "step": 11106 + }, + { + "epoch": 0.3064665772219482, + "grad_norm": 0.004196144640445709, + "learning_rate": 0.001, + "loss": 0.3706, + "step": 11107 + }, + { + "epoch": 0.3064941694230126, + "grad_norm": 0.0038290584925562143, + "learning_rate": 0.001, + "loss": 0.3843, + "step": 11108 + }, + { + "epoch": 0.30652176162407696, + "grad_norm": 0.0028051917906850576, + "learning_rate": 0.001, + "loss": 0.4059, + "step": 11109 + }, + { + "epoch": 0.3065493538251413, + "grad_norm": 0.003199791768565774, + "learning_rate": 0.001, + "loss": 0.3826, + "step": 11110 + }, + { + "epoch": 0.30657694602620567, + "grad_norm": 0.0040592108853161335, + "learning_rate": 0.001, + "loss": 0.4014, + "step": 11111 + }, + { + "epoch": 0.3066045382272701, + "grad_norm": 0.005883520934730768, + "learning_rate": 0.001, + "loss": 0.3825, + "step": 11112 + }, + { + "epoch": 0.3066321304283344, + "grad_norm": 0.0024851111229509115, + "learning_rate": 0.001, + "loss": 0.4243, + "step": 11113 + }, + { + "epoch": 0.3066597226293988, + "grad_norm": 0.003925004508346319, + "learning_rate": 0.001, + "loss": 0.3919, + "step": 11114 + }, + { + "epoch": 0.3066873148304632, + "grad_norm": 0.0040878294967114925, + "learning_rate": 0.001, + "loss": 0.3982, + "step": 11115 + }, + { + "epoch": 0.30671490703152754, + "grad_norm": 0.0032062202226370573, + "learning_rate": 0.001, + "loss": 0.3999, + "step": 11116 + }, + { + "epoch": 0.3067424992325919, + "grad_norm": 0.004021904431283474, + "learning_rate": 0.001, + "loss": 0.4104, + "step": 11117 + }, + { + "epoch": 0.3067700914336563, + "grad_norm": 0.002459887880831957, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 11118 + }, + { + "epoch": 0.30679768363472065, + "grad_norm": 0.0019991539884358644, + "learning_rate": 0.001, + "loss": 0.4321, + "step": 11119 + }, + { + "epoch": 0.306825275835785, + "grad_norm": 0.002835212042555213, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 11120 + }, + { + "epoch": 0.30685286803684936, + "grad_norm": 0.003514475654810667, + "learning_rate": 0.001, + "loss": 0.4045, + "step": 11121 + }, + { + "epoch": 0.30688046023791377, + "grad_norm": 0.0025088752154260874, + "learning_rate": 0.001, + "loss": 0.3913, + "step": 11122 + }, + { + "epoch": 0.3069080524389781, + "grad_norm": 0.0025806506164371967, + "learning_rate": 0.001, + "loss": 0.425, + "step": 11123 + }, + { + "epoch": 0.30693564464004247, + "grad_norm": 0.0025829877704381943, + "learning_rate": 0.001, + "loss": 0.3979, + "step": 11124 + }, + { + "epoch": 0.3069632368411069, + "grad_norm": 0.0033166445791721344, + "learning_rate": 0.001, + "loss": 0.3884, + "step": 11125 + }, + { + "epoch": 0.30699082904217123, + "grad_norm": 0.0023280063178390265, + "learning_rate": 0.001, + "loss": 0.4089, + "step": 11126 + }, + { + "epoch": 0.3070184212432356, + "grad_norm": 0.004174636676907539, + "learning_rate": 0.001, + "loss": 0.3766, + "step": 11127 + }, + { + "epoch": 0.3070460134443, + "grad_norm": 0.002358420053496957, + "learning_rate": 0.001, + "loss": 0.4037, + "step": 11128 + }, + { + "epoch": 0.30707360564536434, + "grad_norm": 0.002632340881973505, + "learning_rate": 0.001, + "loss": 0.4251, + "step": 11129 + }, + { + "epoch": 0.3071011978464287, + "grad_norm": 0.004848247393965721, + "learning_rate": 0.001, + "loss": 0.4432, + "step": 11130 + }, + { + "epoch": 0.30712879004749305, + "grad_norm": 0.002614147262647748, + "learning_rate": 0.001, + "loss": 0.3682, + "step": 11131 + }, + { + "epoch": 0.30715638224855746, + "grad_norm": 0.0026096473447978497, + "learning_rate": 0.001, + "loss": 0.4032, + "step": 11132 + }, + { + "epoch": 0.3071839744496218, + "grad_norm": 0.002825048053637147, + "learning_rate": 0.001, + "loss": 0.3376, + "step": 11133 + }, + { + "epoch": 0.30721156665068616, + "grad_norm": 0.002993338042870164, + "learning_rate": 0.001, + "loss": 0.3784, + "step": 11134 + }, + { + "epoch": 0.30723915885175057, + "grad_norm": 0.0025206785649061203, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 11135 + }, + { + "epoch": 0.3072667510528149, + "grad_norm": 0.0064323414117097855, + "learning_rate": 0.001, + "loss": 0.3892, + "step": 11136 + }, + { + "epoch": 0.3072943432538793, + "grad_norm": 0.006029163021594286, + "learning_rate": 0.001, + "loss": 0.4013, + "step": 11137 + }, + { + "epoch": 0.3073219354549437, + "grad_norm": 0.003802549559623003, + "learning_rate": 0.001, + "loss": 0.4149, + "step": 11138 + }, + { + "epoch": 0.30734952765600804, + "grad_norm": 0.0036848250310868025, + "learning_rate": 0.001, + "loss": 0.408, + "step": 11139 + }, + { + "epoch": 0.3073771198570724, + "grad_norm": 0.002685883082449436, + "learning_rate": 0.001, + "loss": 0.3675, + "step": 11140 + }, + { + "epoch": 0.30740471205813674, + "grad_norm": 0.004660370759665966, + "learning_rate": 0.001, + "loss": 0.4285, + "step": 11141 + }, + { + "epoch": 0.30743230425920115, + "grad_norm": 0.004408833105117083, + "learning_rate": 0.001, + "loss": 0.4147, + "step": 11142 + }, + { + "epoch": 0.3074598964602655, + "grad_norm": 0.003984685987234116, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 11143 + }, + { + "epoch": 0.30748748866132986, + "grad_norm": 0.0034449570812284946, + "learning_rate": 0.001, + "loss": 0.392, + "step": 11144 + }, + { + "epoch": 0.30751508086239426, + "grad_norm": 0.0050663729198277, + "learning_rate": 0.001, + "loss": 0.4088, + "step": 11145 + }, + { + "epoch": 0.3075426730634586, + "grad_norm": 0.0025960092898458242, + "learning_rate": 0.001, + "loss": 0.3755, + "step": 11146 + }, + { + "epoch": 0.30757026526452297, + "grad_norm": 0.003925015218555927, + "learning_rate": 0.001, + "loss": 0.4243, + "step": 11147 + }, + { + "epoch": 0.3075978574655874, + "grad_norm": 0.003075996646657586, + "learning_rate": 0.001, + "loss": 0.3816, + "step": 11148 + }, + { + "epoch": 0.30762544966665173, + "grad_norm": 0.0028860154561698437, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 11149 + }, + { + "epoch": 0.3076530418677161, + "grad_norm": 0.0029483395628631115, + "learning_rate": 0.001, + "loss": 0.3814, + "step": 11150 + }, + { + "epoch": 0.30768063406878043, + "grad_norm": 0.006685647647827864, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 11151 + }, + { + "epoch": 0.30770822626984484, + "grad_norm": 0.0028818738646805286, + "learning_rate": 0.001, + "loss": 0.3885, + "step": 11152 + }, + { + "epoch": 0.3077358184709092, + "grad_norm": 0.003373377723619342, + "learning_rate": 0.001, + "loss": 0.3959, + "step": 11153 + }, + { + "epoch": 0.30776341067197355, + "grad_norm": 0.005747408606112003, + "learning_rate": 0.001, + "loss": 0.4192, + "step": 11154 + }, + { + "epoch": 0.30779100287303796, + "grad_norm": 0.003609336679801345, + "learning_rate": 0.001, + "loss": 0.4214, + "step": 11155 + }, + { + "epoch": 0.3078185950741023, + "grad_norm": 0.005563123617321253, + "learning_rate": 0.001, + "loss": 0.4268, + "step": 11156 + }, + { + "epoch": 0.30784618727516666, + "grad_norm": 0.003597275586798787, + "learning_rate": 0.001, + "loss": 0.4104, + "step": 11157 + }, + { + "epoch": 0.30787377947623107, + "grad_norm": 0.0027966846246272326, + "learning_rate": 0.001, + "loss": 0.4379, + "step": 11158 + }, + { + "epoch": 0.3079013716772954, + "grad_norm": 0.002950585214421153, + "learning_rate": 0.001, + "loss": 0.3762, + "step": 11159 + }, + { + "epoch": 0.3079289638783598, + "grad_norm": 0.0030036913231015205, + "learning_rate": 0.001, + "loss": 0.4352, + "step": 11160 + }, + { + "epoch": 0.3079565560794241, + "grad_norm": 0.004463705699890852, + "learning_rate": 0.001, + "loss": 0.4056, + "step": 11161 + }, + { + "epoch": 0.30798414828048853, + "grad_norm": 0.0020041607785969973, + "learning_rate": 0.001, + "loss": 0.4238, + "step": 11162 + }, + { + "epoch": 0.3080117404815529, + "grad_norm": 0.003834531642496586, + "learning_rate": 0.001, + "loss": 0.3812, + "step": 11163 + }, + { + "epoch": 0.30803933268261724, + "grad_norm": 0.002345136133953929, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 11164 + }, + { + "epoch": 0.30806692488368165, + "grad_norm": 0.0028496733866631985, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 11165 + }, + { + "epoch": 0.308094517084746, + "grad_norm": 0.003474792465567589, + "learning_rate": 0.001, + "loss": 0.4264, + "step": 11166 + }, + { + "epoch": 0.30812210928581035, + "grad_norm": 0.0036203833296895027, + "learning_rate": 0.001, + "loss": 0.4084, + "step": 11167 + }, + { + "epoch": 0.30814970148687476, + "grad_norm": 0.004971282556653023, + "learning_rate": 0.001, + "loss": 0.3704, + "step": 11168 + }, + { + "epoch": 0.3081772936879391, + "grad_norm": 0.007116068620234728, + "learning_rate": 0.001, + "loss": 0.4281, + "step": 11169 + }, + { + "epoch": 0.30820488588900347, + "grad_norm": 0.005289649125188589, + "learning_rate": 0.001, + "loss": 0.3739, + "step": 11170 + }, + { + "epoch": 0.3082324780900678, + "grad_norm": 0.0034187568817287683, + "learning_rate": 0.001, + "loss": 0.3981, + "step": 11171 + }, + { + "epoch": 0.3082600702911322, + "grad_norm": 0.002938291057944298, + "learning_rate": 0.001, + "loss": 0.432, + "step": 11172 + }, + { + "epoch": 0.3082876624921966, + "grad_norm": 0.004787916783243418, + "learning_rate": 0.001, + "loss": 0.4073, + "step": 11173 + }, + { + "epoch": 0.30831525469326093, + "grad_norm": 0.0029457362834364176, + "learning_rate": 0.001, + "loss": 0.4207, + "step": 11174 + }, + { + "epoch": 0.30834284689432534, + "grad_norm": 0.006731043569743633, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 11175 + }, + { + "epoch": 0.3083704390953897, + "grad_norm": 0.005668407306075096, + "learning_rate": 0.001, + "loss": 0.4199, + "step": 11176 + }, + { + "epoch": 0.30839803129645404, + "grad_norm": 0.0028704048600047827, + "learning_rate": 0.001, + "loss": 0.4402, + "step": 11177 + }, + { + "epoch": 0.30842562349751845, + "grad_norm": 0.00468471460044384, + "learning_rate": 0.001, + "loss": 0.4205, + "step": 11178 + }, + { + "epoch": 0.3084532156985828, + "grad_norm": 0.003784140106290579, + "learning_rate": 0.001, + "loss": 0.42, + "step": 11179 + }, + { + "epoch": 0.30848080789964716, + "grad_norm": 0.0028519639745354652, + "learning_rate": 0.001, + "loss": 0.4257, + "step": 11180 + }, + { + "epoch": 0.3085084001007115, + "grad_norm": 0.004001649562269449, + "learning_rate": 0.001, + "loss": 0.3933, + "step": 11181 + }, + { + "epoch": 0.3085359923017759, + "grad_norm": 0.0034179389476776123, + "learning_rate": 0.001, + "loss": 0.3608, + "step": 11182 + }, + { + "epoch": 0.30856358450284027, + "grad_norm": 0.004161709453910589, + "learning_rate": 0.001, + "loss": 0.3561, + "step": 11183 + }, + { + "epoch": 0.3085911767039046, + "grad_norm": 0.0021271705627441406, + "learning_rate": 0.001, + "loss": 0.4045, + "step": 11184 + }, + { + "epoch": 0.30861876890496903, + "grad_norm": 0.004352390766143799, + "learning_rate": 0.001, + "loss": 0.4272, + "step": 11185 + }, + { + "epoch": 0.3086463611060334, + "grad_norm": 0.0035197525285184383, + "learning_rate": 0.001, + "loss": 0.3793, + "step": 11186 + }, + { + "epoch": 0.30867395330709774, + "grad_norm": 0.003885299200192094, + "learning_rate": 0.001, + "loss": 0.4038, + "step": 11187 + }, + { + "epoch": 0.30870154550816215, + "grad_norm": 0.004519937559962273, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 11188 + }, + { + "epoch": 0.3087291377092265, + "grad_norm": 0.001987120136618614, + "learning_rate": 0.001, + "loss": 0.4137, + "step": 11189 + }, + { + "epoch": 0.30875672991029085, + "grad_norm": 0.003369292477145791, + "learning_rate": 0.001, + "loss": 0.3709, + "step": 11190 + }, + { + "epoch": 0.3087843221113552, + "grad_norm": 0.0031297069508582354, + "learning_rate": 0.001, + "loss": 0.3769, + "step": 11191 + }, + { + "epoch": 0.3088119143124196, + "grad_norm": 0.0024408060126006603, + "learning_rate": 0.001, + "loss": 0.4602, + "step": 11192 + }, + { + "epoch": 0.30883950651348396, + "grad_norm": 0.0031313635408878326, + "learning_rate": 0.001, + "loss": 0.4325, + "step": 11193 + }, + { + "epoch": 0.3088670987145483, + "grad_norm": 0.0029580635018646717, + "learning_rate": 0.001, + "loss": 0.3781, + "step": 11194 + }, + { + "epoch": 0.3088946909156127, + "grad_norm": 0.0027078099083155394, + "learning_rate": 0.001, + "loss": 0.3863, + "step": 11195 + }, + { + "epoch": 0.3089222831166771, + "grad_norm": 0.0038466129917651415, + "learning_rate": 0.001, + "loss": 0.4153, + "step": 11196 + }, + { + "epoch": 0.30894987531774143, + "grad_norm": 0.003398042405024171, + "learning_rate": 0.001, + "loss": 0.3995, + "step": 11197 + }, + { + "epoch": 0.30897746751880584, + "grad_norm": 0.007685355842113495, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 11198 + }, + { + "epoch": 0.3090050597198702, + "grad_norm": 0.005074410233646631, + "learning_rate": 0.001, + "loss": 0.3998, + "step": 11199 + }, + { + "epoch": 0.30903265192093454, + "grad_norm": 0.003894890658557415, + "learning_rate": 0.001, + "loss": 0.4394, + "step": 11200 + }, + { + "epoch": 0.3090602441219989, + "grad_norm": 0.0026795901358127594, + "learning_rate": 0.001, + "loss": 0.4315, + "step": 11201 + }, + { + "epoch": 0.3090878363230633, + "grad_norm": 0.003955294843763113, + "learning_rate": 0.001, + "loss": 0.3723, + "step": 11202 + }, + { + "epoch": 0.30911542852412766, + "grad_norm": 0.0025795320980250835, + "learning_rate": 0.001, + "loss": 0.4317, + "step": 11203 + }, + { + "epoch": 0.309143020725192, + "grad_norm": 0.004697353113442659, + "learning_rate": 0.001, + "loss": 0.368, + "step": 11204 + }, + { + "epoch": 0.3091706129262564, + "grad_norm": 0.0036036588717252016, + "learning_rate": 0.001, + "loss": 0.376, + "step": 11205 + }, + { + "epoch": 0.30919820512732077, + "grad_norm": 0.002920630620792508, + "learning_rate": 0.001, + "loss": 0.4238, + "step": 11206 + }, + { + "epoch": 0.3092257973283851, + "grad_norm": 0.00252629560418427, + "learning_rate": 0.001, + "loss": 0.4474, + "step": 11207 + }, + { + "epoch": 0.3092533895294495, + "grad_norm": 0.002593503100797534, + "learning_rate": 0.001, + "loss": 0.4349, + "step": 11208 + }, + { + "epoch": 0.3092809817305139, + "grad_norm": 0.002563952933996916, + "learning_rate": 0.001, + "loss": 0.3945, + "step": 11209 + }, + { + "epoch": 0.30930857393157823, + "grad_norm": 0.00308457319624722, + "learning_rate": 0.001, + "loss": 0.3698, + "step": 11210 + }, + { + "epoch": 0.3093361661326426, + "grad_norm": 0.004335676319897175, + "learning_rate": 0.001, + "loss": 0.3702, + "step": 11211 + }, + { + "epoch": 0.309363758333707, + "grad_norm": 0.004092890303581953, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 11212 + }, + { + "epoch": 0.30939135053477135, + "grad_norm": 0.003627408528700471, + "learning_rate": 0.001, + "loss": 0.4175, + "step": 11213 + }, + { + "epoch": 0.3094189427358357, + "grad_norm": 0.0031013472471386194, + "learning_rate": 0.001, + "loss": 0.3755, + "step": 11214 + }, + { + "epoch": 0.3094465349369001, + "grad_norm": 0.002774468855932355, + "learning_rate": 0.001, + "loss": 0.434, + "step": 11215 + }, + { + "epoch": 0.30947412713796446, + "grad_norm": 0.0022229300811886787, + "learning_rate": 0.001, + "loss": 0.4303, + "step": 11216 + }, + { + "epoch": 0.3095017193390288, + "grad_norm": 0.004068805370479822, + "learning_rate": 0.001, + "loss": 0.3823, + "step": 11217 + }, + { + "epoch": 0.30952931154009317, + "grad_norm": 0.0022410049568861723, + "learning_rate": 0.001, + "loss": 0.4067, + "step": 11218 + }, + { + "epoch": 0.3095569037411576, + "grad_norm": 0.002807425567880273, + "learning_rate": 0.001, + "loss": 0.409, + "step": 11219 + }, + { + "epoch": 0.3095844959422219, + "grad_norm": 0.008168449625372887, + "learning_rate": 0.001, + "loss": 0.4257, + "step": 11220 + }, + { + "epoch": 0.3096120881432863, + "grad_norm": 0.007224308326840401, + "learning_rate": 0.001, + "loss": 0.3927, + "step": 11221 + }, + { + "epoch": 0.3096396803443507, + "grad_norm": 0.004867930430918932, + "learning_rate": 0.001, + "loss": 0.4163, + "step": 11222 + }, + { + "epoch": 0.30966727254541504, + "grad_norm": 0.004449460655450821, + "learning_rate": 0.001, + "loss": 0.4065, + "step": 11223 + }, + { + "epoch": 0.3096948647464794, + "grad_norm": 0.003997356165200472, + "learning_rate": 0.001, + "loss": 0.3887, + "step": 11224 + }, + { + "epoch": 0.3097224569475438, + "grad_norm": 0.0036069692578166723, + "learning_rate": 0.001, + "loss": 0.3726, + "step": 11225 + }, + { + "epoch": 0.30975004914860815, + "grad_norm": 0.003934454172849655, + "learning_rate": 0.001, + "loss": 0.4149, + "step": 11226 + }, + { + "epoch": 0.3097776413496725, + "grad_norm": 0.01457425020635128, + "learning_rate": 0.001, + "loss": 0.3672, + "step": 11227 + }, + { + "epoch": 0.30980523355073686, + "grad_norm": 0.004006854724138975, + "learning_rate": 0.001, + "loss": 0.3849, + "step": 11228 + }, + { + "epoch": 0.30983282575180127, + "grad_norm": 0.0053675188682973385, + "learning_rate": 0.001, + "loss": 0.3782, + "step": 11229 + }, + { + "epoch": 0.3098604179528656, + "grad_norm": 0.0030079265125095844, + "learning_rate": 0.001, + "loss": 0.3465, + "step": 11230 + }, + { + "epoch": 0.30988801015392997, + "grad_norm": 0.004947451408952475, + "learning_rate": 0.001, + "loss": 0.4309, + "step": 11231 + }, + { + "epoch": 0.3099156023549944, + "grad_norm": 0.0025891209952533245, + "learning_rate": 0.001, + "loss": 0.3676, + "step": 11232 + }, + { + "epoch": 0.30994319455605873, + "grad_norm": 0.0022305548191070557, + "learning_rate": 0.001, + "loss": 0.4611, + "step": 11233 + }, + { + "epoch": 0.3099707867571231, + "grad_norm": 0.0024289439897984266, + "learning_rate": 0.001, + "loss": 0.416, + "step": 11234 + }, + { + "epoch": 0.3099983789581875, + "grad_norm": 0.005738126579672098, + "learning_rate": 0.001, + "loss": 0.392, + "step": 11235 + }, + { + "epoch": 0.31002597115925185, + "grad_norm": 0.005195004399865866, + "learning_rate": 0.001, + "loss": 0.3711, + "step": 11236 + }, + { + "epoch": 0.3100535633603162, + "grad_norm": 0.003794547636061907, + "learning_rate": 0.001, + "loss": 0.4293, + "step": 11237 + }, + { + "epoch": 0.31008115556138055, + "grad_norm": 0.0029551167972385883, + "learning_rate": 0.001, + "loss": 0.3751, + "step": 11238 + }, + { + "epoch": 0.31010874776244496, + "grad_norm": 0.002617292571812868, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 11239 + }, + { + "epoch": 0.3101363399635093, + "grad_norm": 0.013282334432005882, + "learning_rate": 0.001, + "loss": 0.3738, + "step": 11240 + }, + { + "epoch": 0.31016393216457366, + "grad_norm": 0.0025673380587249994, + "learning_rate": 0.001, + "loss": 0.4433, + "step": 11241 + }, + { + "epoch": 0.31019152436563807, + "grad_norm": 0.0032431413419544697, + "learning_rate": 0.001, + "loss": 0.3933, + "step": 11242 + }, + { + "epoch": 0.3102191165667024, + "grad_norm": 0.006165751256048679, + "learning_rate": 0.001, + "loss": 0.4224, + "step": 11243 + }, + { + "epoch": 0.3102467087677668, + "grad_norm": 0.0051593780517578125, + "learning_rate": 0.001, + "loss": 0.4081, + "step": 11244 + }, + { + "epoch": 0.3102743009688312, + "grad_norm": 0.004621399566531181, + "learning_rate": 0.001, + "loss": 0.4194, + "step": 11245 + }, + { + "epoch": 0.31030189316989554, + "grad_norm": 0.007019545882940292, + "learning_rate": 0.001, + "loss": 0.3769, + "step": 11246 + }, + { + "epoch": 0.3103294853709599, + "grad_norm": 0.0030706804245710373, + "learning_rate": 0.001, + "loss": 0.4098, + "step": 11247 + }, + { + "epoch": 0.31035707757202424, + "grad_norm": 0.009759355336427689, + "learning_rate": 0.001, + "loss": 0.4047, + "step": 11248 + }, + { + "epoch": 0.31038466977308865, + "grad_norm": 0.004515976645052433, + "learning_rate": 0.001, + "loss": 0.3796, + "step": 11249 + }, + { + "epoch": 0.310412261974153, + "grad_norm": 0.005143680609762669, + "learning_rate": 0.001, + "loss": 0.3672, + "step": 11250 + }, + { + "epoch": 0.31043985417521736, + "grad_norm": 0.004736714996397495, + "learning_rate": 0.001, + "loss": 0.3894, + "step": 11251 + }, + { + "epoch": 0.31046744637628176, + "grad_norm": 0.004402066580951214, + "learning_rate": 0.001, + "loss": 0.3817, + "step": 11252 + }, + { + "epoch": 0.3104950385773461, + "grad_norm": 0.004078419879078865, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 11253 + }, + { + "epoch": 0.31052263077841047, + "grad_norm": 0.005986323114484549, + "learning_rate": 0.001, + "loss": 0.4077, + "step": 11254 + }, + { + "epoch": 0.3105502229794749, + "grad_norm": 0.0041157579980790615, + "learning_rate": 0.001, + "loss": 0.4147, + "step": 11255 + }, + { + "epoch": 0.31057781518053923, + "grad_norm": 0.005283708218485117, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 11256 + }, + { + "epoch": 0.3106054073816036, + "grad_norm": 0.0037576963659375906, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 11257 + }, + { + "epoch": 0.31063299958266793, + "grad_norm": 0.004866046831011772, + "learning_rate": 0.001, + "loss": 0.3901, + "step": 11258 + }, + { + "epoch": 0.31066059178373234, + "grad_norm": 0.007358568720519543, + "learning_rate": 0.001, + "loss": 0.4208, + "step": 11259 + }, + { + "epoch": 0.3106881839847967, + "grad_norm": 0.004052911419421434, + "learning_rate": 0.001, + "loss": 0.412, + "step": 11260 + }, + { + "epoch": 0.31071577618586105, + "grad_norm": 0.004355463664978743, + "learning_rate": 0.001, + "loss": 0.3823, + "step": 11261 + }, + { + "epoch": 0.31074336838692546, + "grad_norm": 0.006888225674629211, + "learning_rate": 0.001, + "loss": 0.3888, + "step": 11262 + }, + { + "epoch": 0.3107709605879898, + "grad_norm": 0.004396567586809397, + "learning_rate": 0.001, + "loss": 0.3983, + "step": 11263 + }, + { + "epoch": 0.31079855278905416, + "grad_norm": 0.003929099999368191, + "learning_rate": 0.001, + "loss": 0.3827, + "step": 11264 + }, + { + "epoch": 0.31082614499011857, + "grad_norm": 0.0028630662709474564, + "learning_rate": 0.001, + "loss": 0.4243, + "step": 11265 + }, + { + "epoch": 0.3108537371911829, + "grad_norm": 0.003565790131688118, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 11266 + }, + { + "epoch": 0.3108813293922473, + "grad_norm": 0.004160342272371054, + "learning_rate": 0.001, + "loss": 0.3897, + "step": 11267 + }, + { + "epoch": 0.3109089215933116, + "grad_norm": 0.004211884923279285, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 11268 + }, + { + "epoch": 0.31093651379437603, + "grad_norm": 0.004373464733362198, + "learning_rate": 0.001, + "loss": 0.4307, + "step": 11269 + }, + { + "epoch": 0.3109641059954404, + "grad_norm": 0.0034809240605682135, + "learning_rate": 0.001, + "loss": 0.3354, + "step": 11270 + }, + { + "epoch": 0.31099169819650474, + "grad_norm": 0.004237370565533638, + "learning_rate": 0.001, + "loss": 0.3768, + "step": 11271 + }, + { + "epoch": 0.31101929039756915, + "grad_norm": 0.003246934851631522, + "learning_rate": 0.001, + "loss": 0.4406, + "step": 11272 + }, + { + "epoch": 0.3110468825986335, + "grad_norm": 0.0036773579195141792, + "learning_rate": 0.001, + "loss": 0.3831, + "step": 11273 + }, + { + "epoch": 0.31107447479969785, + "grad_norm": 0.005854573100805283, + "learning_rate": 0.001, + "loss": 0.3478, + "step": 11274 + }, + { + "epoch": 0.31110206700076226, + "grad_norm": 0.004920937120914459, + "learning_rate": 0.001, + "loss": 0.4125, + "step": 11275 + }, + { + "epoch": 0.3111296592018266, + "grad_norm": 0.0030272614676505327, + "learning_rate": 0.001, + "loss": 0.4373, + "step": 11276 + }, + { + "epoch": 0.31115725140289097, + "grad_norm": 0.002614455996081233, + "learning_rate": 0.001, + "loss": 0.3873, + "step": 11277 + }, + { + "epoch": 0.3111848436039553, + "grad_norm": 0.006632671691477299, + "learning_rate": 0.001, + "loss": 0.3863, + "step": 11278 + }, + { + "epoch": 0.3112124358050197, + "grad_norm": 0.005778672639280558, + "learning_rate": 0.001, + "loss": 0.3869, + "step": 11279 + }, + { + "epoch": 0.3112400280060841, + "grad_norm": 0.0035203301813453436, + "learning_rate": 0.001, + "loss": 0.3743, + "step": 11280 + }, + { + "epoch": 0.31126762020714843, + "grad_norm": 0.002483597956597805, + "learning_rate": 0.001, + "loss": 0.3692, + "step": 11281 + }, + { + "epoch": 0.31129521240821284, + "grad_norm": 0.004106397274881601, + "learning_rate": 0.001, + "loss": 0.3901, + "step": 11282 + }, + { + "epoch": 0.3113228046092772, + "grad_norm": 0.002346999244764447, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 11283 + }, + { + "epoch": 0.31135039681034155, + "grad_norm": 0.005101846065372229, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 11284 + }, + { + "epoch": 0.31137798901140595, + "grad_norm": 0.0027893667574971914, + "learning_rate": 0.001, + "loss": 0.4077, + "step": 11285 + }, + { + "epoch": 0.3114055812124703, + "grad_norm": 0.0029769500251859426, + "learning_rate": 0.001, + "loss": 0.4126, + "step": 11286 + }, + { + "epoch": 0.31143317341353466, + "grad_norm": 0.0033861526753753424, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 11287 + }, + { + "epoch": 0.311460765614599, + "grad_norm": 0.006664988584816456, + "learning_rate": 0.001, + "loss": 0.3681, + "step": 11288 + }, + { + "epoch": 0.3114883578156634, + "grad_norm": 0.004560802131891251, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 11289 + }, + { + "epoch": 0.31151595001672777, + "grad_norm": 0.005946944933384657, + "learning_rate": 0.001, + "loss": 0.3888, + "step": 11290 + }, + { + "epoch": 0.3115435422177921, + "grad_norm": 0.0032140284311026335, + "learning_rate": 0.001, + "loss": 0.3937, + "step": 11291 + }, + { + "epoch": 0.31157113441885653, + "grad_norm": 0.011185484007000923, + "learning_rate": 0.001, + "loss": 0.392, + "step": 11292 + }, + { + "epoch": 0.3115987266199209, + "grad_norm": 0.0024210652336478233, + "learning_rate": 0.001, + "loss": 0.3568, + "step": 11293 + }, + { + "epoch": 0.31162631882098524, + "grad_norm": 0.00252347718924284, + "learning_rate": 0.001, + "loss": 0.404, + "step": 11294 + }, + { + "epoch": 0.3116539110220496, + "grad_norm": 0.0039216154254972935, + "learning_rate": 0.001, + "loss": 0.405, + "step": 11295 + }, + { + "epoch": 0.311681503223114, + "grad_norm": 0.0024138404987752438, + "learning_rate": 0.001, + "loss": 0.3934, + "step": 11296 + }, + { + "epoch": 0.31170909542417835, + "grad_norm": 0.0026965364813804626, + "learning_rate": 0.001, + "loss": 0.4213, + "step": 11297 + }, + { + "epoch": 0.3117366876252427, + "grad_norm": 0.0034357013646513224, + "learning_rate": 0.001, + "loss": 0.409, + "step": 11298 + }, + { + "epoch": 0.3117642798263071, + "grad_norm": 0.0027775561902672052, + "learning_rate": 0.001, + "loss": 0.4162, + "step": 11299 + }, + { + "epoch": 0.31179187202737146, + "grad_norm": 0.002832148689776659, + "learning_rate": 0.001, + "loss": 0.3689, + "step": 11300 + }, + { + "epoch": 0.3118194642284358, + "grad_norm": 0.002956314478069544, + "learning_rate": 0.001, + "loss": 0.4185, + "step": 11301 + }, + { + "epoch": 0.3118470564295002, + "grad_norm": 0.006010833196341991, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 11302 + }, + { + "epoch": 0.3118746486305646, + "grad_norm": 0.002660473110154271, + "learning_rate": 0.001, + "loss": 0.426, + "step": 11303 + }, + { + "epoch": 0.31190224083162893, + "grad_norm": 0.002595923375338316, + "learning_rate": 0.001, + "loss": 0.3733, + "step": 11304 + }, + { + "epoch": 0.3119298330326933, + "grad_norm": 0.004651275463402271, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 11305 + }, + { + "epoch": 0.3119574252337577, + "grad_norm": 0.0029690570663660765, + "learning_rate": 0.001, + "loss": 0.353, + "step": 11306 + }, + { + "epoch": 0.31198501743482204, + "grad_norm": 0.0036981538869440556, + "learning_rate": 0.001, + "loss": 0.4236, + "step": 11307 + }, + { + "epoch": 0.3120126096358864, + "grad_norm": 0.0027398637030273676, + "learning_rate": 0.001, + "loss": 0.4292, + "step": 11308 + }, + { + "epoch": 0.3120402018369508, + "grad_norm": 0.003324718214571476, + "learning_rate": 0.001, + "loss": 0.4538, + "step": 11309 + }, + { + "epoch": 0.31206779403801516, + "grad_norm": 0.003862992627546191, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 11310 + }, + { + "epoch": 0.3120953862390795, + "grad_norm": 0.0027167177759110928, + "learning_rate": 0.001, + "loss": 0.361, + "step": 11311 + }, + { + "epoch": 0.3121229784401439, + "grad_norm": 0.005694078281521797, + "learning_rate": 0.001, + "loss": 0.3341, + "step": 11312 + }, + { + "epoch": 0.31215057064120827, + "grad_norm": 0.0032801406923681498, + "learning_rate": 0.001, + "loss": 0.3755, + "step": 11313 + }, + { + "epoch": 0.3121781628422726, + "grad_norm": 0.0072447690181434155, + "learning_rate": 0.001, + "loss": 0.3904, + "step": 11314 + }, + { + "epoch": 0.312205755043337, + "grad_norm": 0.00575765548273921, + "learning_rate": 0.001, + "loss": 0.3801, + "step": 11315 + }, + { + "epoch": 0.3122333472444014, + "grad_norm": 0.0030593755654990673, + "learning_rate": 0.001, + "loss": 0.3693, + "step": 11316 + }, + { + "epoch": 0.31226093944546573, + "grad_norm": 0.0034721451811492443, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 11317 + }, + { + "epoch": 0.3122885316465301, + "grad_norm": 0.0029739809688180685, + "learning_rate": 0.001, + "loss": 0.4125, + "step": 11318 + }, + { + "epoch": 0.3123161238475945, + "grad_norm": 0.004178667441010475, + "learning_rate": 0.001, + "loss": 0.4132, + "step": 11319 + }, + { + "epoch": 0.31234371604865885, + "grad_norm": 0.00276771723292768, + "learning_rate": 0.001, + "loss": 0.4108, + "step": 11320 + }, + { + "epoch": 0.3123713082497232, + "grad_norm": 0.0033821798861026764, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 11321 + }, + { + "epoch": 0.3123989004507876, + "grad_norm": 0.003055412322282791, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 11322 + }, + { + "epoch": 0.31242649265185196, + "grad_norm": 0.006147374864667654, + "learning_rate": 0.001, + "loss": 0.4117, + "step": 11323 + }, + { + "epoch": 0.3124540848529163, + "grad_norm": 0.002903368789702654, + "learning_rate": 0.001, + "loss": 0.3949, + "step": 11324 + }, + { + "epoch": 0.31248167705398067, + "grad_norm": 0.002361924620345235, + "learning_rate": 0.001, + "loss": 0.3739, + "step": 11325 + }, + { + "epoch": 0.3125092692550451, + "grad_norm": 0.002768756588920951, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 11326 + }, + { + "epoch": 0.3125368614561094, + "grad_norm": 0.009059234522283077, + "learning_rate": 0.001, + "loss": 0.3981, + "step": 11327 + }, + { + "epoch": 0.3125644536571738, + "grad_norm": 0.0029197093099355698, + "learning_rate": 0.001, + "loss": 0.4243, + "step": 11328 + }, + { + "epoch": 0.3125920458582382, + "grad_norm": 0.003418351523578167, + "learning_rate": 0.001, + "loss": 0.3994, + "step": 11329 + }, + { + "epoch": 0.31261963805930254, + "grad_norm": 0.004701963625848293, + "learning_rate": 0.001, + "loss": 0.3557, + "step": 11330 + }, + { + "epoch": 0.3126472302603669, + "grad_norm": 0.008553891442716122, + "learning_rate": 0.001, + "loss": 0.3819, + "step": 11331 + }, + { + "epoch": 0.3126748224614313, + "grad_norm": 0.0062170871533453465, + "learning_rate": 0.001, + "loss": 0.3824, + "step": 11332 + }, + { + "epoch": 0.31270241466249565, + "grad_norm": 0.0068610128946602345, + "learning_rate": 0.001, + "loss": 0.4345, + "step": 11333 + }, + { + "epoch": 0.31273000686356, + "grad_norm": 0.008722824975848198, + "learning_rate": 0.001, + "loss": 0.377, + "step": 11334 + }, + { + "epoch": 0.31275759906462436, + "grad_norm": 0.004026619717478752, + "learning_rate": 0.001, + "loss": 0.4284, + "step": 11335 + }, + { + "epoch": 0.31278519126568877, + "grad_norm": 0.006060839165002108, + "learning_rate": 0.001, + "loss": 0.3568, + "step": 11336 + }, + { + "epoch": 0.3128127834667531, + "grad_norm": 0.0033903757575899363, + "learning_rate": 0.001, + "loss": 0.4015, + "step": 11337 + }, + { + "epoch": 0.31284037566781747, + "grad_norm": 0.004284377209842205, + "learning_rate": 0.001, + "loss": 0.3676, + "step": 11338 + }, + { + "epoch": 0.3128679678688819, + "grad_norm": 0.002431008731946349, + "learning_rate": 0.001, + "loss": 0.4198, + "step": 11339 + }, + { + "epoch": 0.31289556006994623, + "grad_norm": 0.0026608938351273537, + "learning_rate": 0.001, + "loss": 0.4473, + "step": 11340 + }, + { + "epoch": 0.3129231522710106, + "grad_norm": 0.0032837912440299988, + "learning_rate": 0.001, + "loss": 0.3772, + "step": 11341 + }, + { + "epoch": 0.312950744472075, + "grad_norm": 0.0023711349349468946, + "learning_rate": 0.001, + "loss": 0.43, + "step": 11342 + }, + { + "epoch": 0.31297833667313935, + "grad_norm": 0.003046794096007943, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 11343 + }, + { + "epoch": 0.3130059288742037, + "grad_norm": 0.0024662583600729704, + "learning_rate": 0.001, + "loss": 0.4026, + "step": 11344 + }, + { + "epoch": 0.31303352107526805, + "grad_norm": 0.002765703946352005, + "learning_rate": 0.001, + "loss": 0.3953, + "step": 11345 + }, + { + "epoch": 0.31306111327633246, + "grad_norm": 0.0024844948202371597, + "learning_rate": 0.001, + "loss": 0.3844, + "step": 11346 + }, + { + "epoch": 0.3130887054773968, + "grad_norm": 0.0025793337263166904, + "learning_rate": 0.001, + "loss": 0.3831, + "step": 11347 + }, + { + "epoch": 0.31311629767846116, + "grad_norm": 0.003900103038176894, + "learning_rate": 0.001, + "loss": 0.3882, + "step": 11348 + }, + { + "epoch": 0.31314388987952557, + "grad_norm": 0.0021257405169308186, + "learning_rate": 0.001, + "loss": 0.4056, + "step": 11349 + }, + { + "epoch": 0.3131714820805899, + "grad_norm": 0.00406573386862874, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 11350 + }, + { + "epoch": 0.3131990742816543, + "grad_norm": 0.003567320527508855, + "learning_rate": 0.001, + "loss": 0.4156, + "step": 11351 + }, + { + "epoch": 0.3132266664827187, + "grad_norm": 0.0028111112769693136, + "learning_rate": 0.001, + "loss": 0.3966, + "step": 11352 + }, + { + "epoch": 0.31325425868378304, + "grad_norm": 0.004948635585606098, + "learning_rate": 0.001, + "loss": 0.4094, + "step": 11353 + }, + { + "epoch": 0.3132818508848474, + "grad_norm": 0.0023954228963702917, + "learning_rate": 0.001, + "loss": 0.4198, + "step": 11354 + }, + { + "epoch": 0.31330944308591174, + "grad_norm": 0.0028324832674115896, + "learning_rate": 0.001, + "loss": 0.3635, + "step": 11355 + }, + { + "epoch": 0.31333703528697615, + "grad_norm": 0.0026057560462504625, + "learning_rate": 0.001, + "loss": 0.4259, + "step": 11356 + }, + { + "epoch": 0.3133646274880405, + "grad_norm": 0.006796353030949831, + "learning_rate": 0.001, + "loss": 0.4002, + "step": 11357 + }, + { + "epoch": 0.31339221968910486, + "grad_norm": 0.0028146374970674515, + "learning_rate": 0.001, + "loss": 0.4088, + "step": 11358 + }, + { + "epoch": 0.31341981189016926, + "grad_norm": 0.0030071684159338474, + "learning_rate": 0.001, + "loss": 0.4035, + "step": 11359 + }, + { + "epoch": 0.3134474040912336, + "grad_norm": 0.0034612752497196198, + "learning_rate": 0.001, + "loss": 0.423, + "step": 11360 + }, + { + "epoch": 0.31347499629229797, + "grad_norm": 0.0025981247890740633, + "learning_rate": 0.001, + "loss": 0.3929, + "step": 11361 + }, + { + "epoch": 0.3135025884933624, + "grad_norm": 0.0051524159498512745, + "learning_rate": 0.001, + "loss": 0.3775, + "step": 11362 + }, + { + "epoch": 0.31353018069442673, + "grad_norm": 0.008304521441459656, + "learning_rate": 0.001, + "loss": 0.3742, + "step": 11363 + }, + { + "epoch": 0.3135577728954911, + "grad_norm": 0.0033939266577363014, + "learning_rate": 0.001, + "loss": 0.409, + "step": 11364 + }, + { + "epoch": 0.31358536509655544, + "grad_norm": 0.0036061664577573538, + "learning_rate": 0.001, + "loss": 0.4467, + "step": 11365 + }, + { + "epoch": 0.31361295729761984, + "grad_norm": 0.005402529612183571, + "learning_rate": 0.001, + "loss": 0.3777, + "step": 11366 + }, + { + "epoch": 0.3136405494986842, + "grad_norm": 0.005045123398303986, + "learning_rate": 0.001, + "loss": 0.3801, + "step": 11367 + }, + { + "epoch": 0.31366814169974855, + "grad_norm": 0.00310247833840549, + "learning_rate": 0.001, + "loss": 0.4249, + "step": 11368 + }, + { + "epoch": 0.31369573390081296, + "grad_norm": 0.002901574596762657, + "learning_rate": 0.001, + "loss": 0.4057, + "step": 11369 + }, + { + "epoch": 0.3137233261018773, + "grad_norm": 0.0029007482808083296, + "learning_rate": 0.001, + "loss": 0.3855, + "step": 11370 + }, + { + "epoch": 0.31375091830294166, + "grad_norm": 0.0036781311500817537, + "learning_rate": 0.001, + "loss": 0.4041, + "step": 11371 + }, + { + "epoch": 0.31377851050400607, + "grad_norm": 0.004679156932979822, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 11372 + }, + { + "epoch": 0.3138061027050704, + "grad_norm": 0.002834693994373083, + "learning_rate": 0.001, + "loss": 0.4189, + "step": 11373 + }, + { + "epoch": 0.3138336949061348, + "grad_norm": 0.0029626016039401293, + "learning_rate": 0.001, + "loss": 0.4246, + "step": 11374 + }, + { + "epoch": 0.3138612871071991, + "grad_norm": 0.0032038032077252865, + "learning_rate": 0.001, + "loss": 0.3963, + "step": 11375 + }, + { + "epoch": 0.31388887930826354, + "grad_norm": 0.004681065212935209, + "learning_rate": 0.001, + "loss": 0.4177, + "step": 11376 + }, + { + "epoch": 0.3139164715093279, + "grad_norm": 0.004868704825639725, + "learning_rate": 0.001, + "loss": 0.4064, + "step": 11377 + }, + { + "epoch": 0.31394406371039224, + "grad_norm": 0.005584596190601587, + "learning_rate": 0.001, + "loss": 0.3819, + "step": 11378 + }, + { + "epoch": 0.31397165591145665, + "grad_norm": 0.002676298376172781, + "learning_rate": 0.001, + "loss": 0.4062, + "step": 11379 + }, + { + "epoch": 0.313999248112521, + "grad_norm": 0.003069305093958974, + "learning_rate": 0.001, + "loss": 0.3809, + "step": 11380 + }, + { + "epoch": 0.31402684031358535, + "grad_norm": 0.002781213726848364, + "learning_rate": 0.001, + "loss": 0.4287, + "step": 11381 + }, + { + "epoch": 0.31405443251464976, + "grad_norm": 0.0022207528818398714, + "learning_rate": 0.001, + "loss": 0.4073, + "step": 11382 + }, + { + "epoch": 0.3140820247157141, + "grad_norm": 0.0025241211988031864, + "learning_rate": 0.001, + "loss": 0.3887, + "step": 11383 + }, + { + "epoch": 0.31410961691677847, + "grad_norm": 0.004623900167644024, + "learning_rate": 0.001, + "loss": 0.3849, + "step": 11384 + }, + { + "epoch": 0.3141372091178428, + "grad_norm": 0.0028964534867554903, + "learning_rate": 0.001, + "loss": 0.4217, + "step": 11385 + }, + { + "epoch": 0.3141648013189072, + "grad_norm": 0.01115355733782053, + "learning_rate": 0.001, + "loss": 0.4417, + "step": 11386 + }, + { + "epoch": 0.3141923935199716, + "grad_norm": 0.003189532784745097, + "learning_rate": 0.001, + "loss": 0.3955, + "step": 11387 + }, + { + "epoch": 0.31421998572103593, + "grad_norm": 0.005057721398770809, + "learning_rate": 0.001, + "loss": 0.4194, + "step": 11388 + }, + { + "epoch": 0.31424757792210034, + "grad_norm": 0.005494655575603247, + "learning_rate": 0.001, + "loss": 0.4234, + "step": 11389 + }, + { + "epoch": 0.3142751701231647, + "grad_norm": 0.0033896707464009523, + "learning_rate": 0.001, + "loss": 0.3345, + "step": 11390 + }, + { + "epoch": 0.31430276232422905, + "grad_norm": 0.0024348304141312838, + "learning_rate": 0.001, + "loss": 0.413, + "step": 11391 + }, + { + "epoch": 0.3143303545252934, + "grad_norm": 0.002852590288966894, + "learning_rate": 0.001, + "loss": 0.4514, + "step": 11392 + }, + { + "epoch": 0.3143579467263578, + "grad_norm": 0.0025470538530498743, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 11393 + }, + { + "epoch": 0.31438553892742216, + "grad_norm": 0.003952103201299906, + "learning_rate": 0.001, + "loss": 0.3673, + "step": 11394 + }, + { + "epoch": 0.3144131311284865, + "grad_norm": 0.0030062925070524216, + "learning_rate": 0.001, + "loss": 0.4349, + "step": 11395 + }, + { + "epoch": 0.3144407233295509, + "grad_norm": 0.008497409522533417, + "learning_rate": 0.001, + "loss": 0.4031, + "step": 11396 + }, + { + "epoch": 0.31446831553061527, + "grad_norm": 0.0024450889322906733, + "learning_rate": 0.001, + "loss": 0.3983, + "step": 11397 + }, + { + "epoch": 0.3144959077316796, + "grad_norm": 0.002428788226097822, + "learning_rate": 0.001, + "loss": 0.3698, + "step": 11398 + }, + { + "epoch": 0.31452349993274403, + "grad_norm": 0.0026488315779715776, + "learning_rate": 0.001, + "loss": 0.4082, + "step": 11399 + }, + { + "epoch": 0.3145510921338084, + "grad_norm": 0.003063708543777466, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 11400 + }, + { + "epoch": 0.31457868433487274, + "grad_norm": 0.0036700747441500425, + "learning_rate": 0.001, + "loss": 0.3839, + "step": 11401 + }, + { + "epoch": 0.3146062765359371, + "grad_norm": 0.0024674683809280396, + "learning_rate": 0.001, + "loss": 0.3815, + "step": 11402 + }, + { + "epoch": 0.3146338687370015, + "grad_norm": 0.002942080609500408, + "learning_rate": 0.001, + "loss": 0.4228, + "step": 11403 + }, + { + "epoch": 0.31466146093806585, + "grad_norm": 0.0024689743295311928, + "learning_rate": 0.001, + "loss": 0.3945, + "step": 11404 + }, + { + "epoch": 0.3146890531391302, + "grad_norm": 0.003294649999588728, + "learning_rate": 0.001, + "loss": 0.4543, + "step": 11405 + }, + { + "epoch": 0.3147166453401946, + "grad_norm": 0.0027752909809350967, + "learning_rate": 0.001, + "loss": 0.3795, + "step": 11406 + }, + { + "epoch": 0.31474423754125896, + "grad_norm": 0.0031409088987857103, + "learning_rate": 0.001, + "loss": 0.3476, + "step": 11407 + }, + { + "epoch": 0.3147718297423233, + "grad_norm": 0.004619597923010588, + "learning_rate": 0.001, + "loss": 0.3996, + "step": 11408 + }, + { + "epoch": 0.3147994219433877, + "grad_norm": 0.005096206441521645, + "learning_rate": 0.001, + "loss": 0.4164, + "step": 11409 + }, + { + "epoch": 0.3148270141444521, + "grad_norm": 0.003756036050617695, + "learning_rate": 0.001, + "loss": 0.3912, + "step": 11410 + }, + { + "epoch": 0.31485460634551643, + "grad_norm": 0.0021125138737261295, + "learning_rate": 0.001, + "loss": 0.4238, + "step": 11411 + }, + { + "epoch": 0.3148821985465808, + "grad_norm": 0.002177385613322258, + "learning_rate": 0.001, + "loss": 0.381, + "step": 11412 + }, + { + "epoch": 0.3149097907476452, + "grad_norm": 0.0027410865295678377, + "learning_rate": 0.001, + "loss": 0.4251, + "step": 11413 + }, + { + "epoch": 0.31493738294870954, + "grad_norm": 0.003590056672692299, + "learning_rate": 0.001, + "loss": 0.4169, + "step": 11414 + }, + { + "epoch": 0.3149649751497739, + "grad_norm": 0.002279781037941575, + "learning_rate": 0.001, + "loss": 0.3946, + "step": 11415 + }, + { + "epoch": 0.3149925673508383, + "grad_norm": 0.0023333400022238493, + "learning_rate": 0.001, + "loss": 0.4085, + "step": 11416 + }, + { + "epoch": 0.31502015955190266, + "grad_norm": 0.004322835244238377, + "learning_rate": 0.001, + "loss": 0.3835, + "step": 11417 + }, + { + "epoch": 0.315047751752967, + "grad_norm": 0.0034005579072982073, + "learning_rate": 0.001, + "loss": 0.4159, + "step": 11418 + }, + { + "epoch": 0.3150753439540314, + "grad_norm": 0.007161610759794712, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 11419 + }, + { + "epoch": 0.31510293615509577, + "grad_norm": 0.007838692516088486, + "learning_rate": 0.001, + "loss": 0.4317, + "step": 11420 + }, + { + "epoch": 0.3151305283561601, + "grad_norm": 0.004573942627757788, + "learning_rate": 0.001, + "loss": 0.3715, + "step": 11421 + }, + { + "epoch": 0.3151581205572245, + "grad_norm": 0.005383932497352362, + "learning_rate": 0.001, + "loss": 0.4437, + "step": 11422 + }, + { + "epoch": 0.3151857127582889, + "grad_norm": 0.004969790577888489, + "learning_rate": 0.001, + "loss": 0.3845, + "step": 11423 + }, + { + "epoch": 0.31521330495935324, + "grad_norm": 0.0022800557781010866, + "learning_rate": 0.001, + "loss": 0.3946, + "step": 11424 + }, + { + "epoch": 0.3152408971604176, + "grad_norm": 0.0025320611894130707, + "learning_rate": 0.001, + "loss": 0.4653, + "step": 11425 + }, + { + "epoch": 0.315268489361482, + "grad_norm": 0.002388893160969019, + "learning_rate": 0.001, + "loss": 0.4077, + "step": 11426 + }, + { + "epoch": 0.31529608156254635, + "grad_norm": 0.008318680338561535, + "learning_rate": 0.001, + "loss": 0.397, + "step": 11427 + }, + { + "epoch": 0.3153236737636107, + "grad_norm": 0.007277218624949455, + "learning_rate": 0.001, + "loss": 0.3586, + "step": 11428 + }, + { + "epoch": 0.3153512659646751, + "grad_norm": 0.0038948303554207087, + "learning_rate": 0.001, + "loss": 0.3747, + "step": 11429 + }, + { + "epoch": 0.31537885816573946, + "grad_norm": 0.003630647901445627, + "learning_rate": 0.001, + "loss": 0.4183, + "step": 11430 + }, + { + "epoch": 0.3154064503668038, + "grad_norm": 0.002882918808609247, + "learning_rate": 0.001, + "loss": 0.4038, + "step": 11431 + }, + { + "epoch": 0.31543404256786817, + "grad_norm": 0.0030421330593526363, + "learning_rate": 0.001, + "loss": 0.367, + "step": 11432 + }, + { + "epoch": 0.3154616347689326, + "grad_norm": 0.00291895167902112, + "learning_rate": 0.001, + "loss": 0.427, + "step": 11433 + }, + { + "epoch": 0.3154892269699969, + "grad_norm": 0.003857139963656664, + "learning_rate": 0.001, + "loss": 0.4236, + "step": 11434 + }, + { + "epoch": 0.3155168191710613, + "grad_norm": 0.0034651218447834253, + "learning_rate": 0.001, + "loss": 0.4063, + "step": 11435 + }, + { + "epoch": 0.3155444113721257, + "grad_norm": 0.0023657598067075014, + "learning_rate": 0.001, + "loss": 0.4227, + "step": 11436 + }, + { + "epoch": 0.31557200357319004, + "grad_norm": 0.002382135484367609, + "learning_rate": 0.001, + "loss": 0.3864, + "step": 11437 + }, + { + "epoch": 0.3155995957742544, + "grad_norm": 0.0034222460817545652, + "learning_rate": 0.001, + "loss": 0.418, + "step": 11438 + }, + { + "epoch": 0.3156271879753188, + "grad_norm": 0.004517734050750732, + "learning_rate": 0.001, + "loss": 0.3899, + "step": 11439 + }, + { + "epoch": 0.31565478017638315, + "grad_norm": 0.005458935163915157, + "learning_rate": 0.001, + "loss": 0.4125, + "step": 11440 + }, + { + "epoch": 0.3156823723774475, + "grad_norm": 0.00377734680660069, + "learning_rate": 0.001, + "loss": 0.3623, + "step": 11441 + }, + { + "epoch": 0.31570996457851186, + "grad_norm": 0.006429413799196482, + "learning_rate": 0.001, + "loss": 0.3875, + "step": 11442 + }, + { + "epoch": 0.31573755677957627, + "grad_norm": 0.002111859619617462, + "learning_rate": 0.001, + "loss": 0.3918, + "step": 11443 + }, + { + "epoch": 0.3157651489806406, + "grad_norm": 0.005765847861766815, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 11444 + }, + { + "epoch": 0.315792741181705, + "grad_norm": 0.0038511583115905523, + "learning_rate": 0.001, + "loss": 0.3508, + "step": 11445 + }, + { + "epoch": 0.3158203333827694, + "grad_norm": 0.006009151693433523, + "learning_rate": 0.001, + "loss": 0.4063, + "step": 11446 + }, + { + "epoch": 0.31584792558383373, + "grad_norm": 0.0027272317092865705, + "learning_rate": 0.001, + "loss": 0.4205, + "step": 11447 + }, + { + "epoch": 0.3158755177848981, + "grad_norm": 0.0022494541481137276, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 11448 + }, + { + "epoch": 0.3159031099859625, + "grad_norm": 0.0025459870230406523, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 11449 + }, + { + "epoch": 0.31593070218702685, + "grad_norm": 0.008022737689316273, + "learning_rate": 0.001, + "loss": 0.3628, + "step": 11450 + }, + { + "epoch": 0.3159582943880912, + "grad_norm": 0.0043347179889678955, + "learning_rate": 0.001, + "loss": 0.3537, + "step": 11451 + }, + { + "epoch": 0.31598588658915555, + "grad_norm": 0.003317405702546239, + "learning_rate": 0.001, + "loss": 0.4351, + "step": 11452 + }, + { + "epoch": 0.31601347879021996, + "grad_norm": 0.003171857912093401, + "learning_rate": 0.001, + "loss": 0.3757, + "step": 11453 + }, + { + "epoch": 0.3160410709912843, + "grad_norm": 0.0032373506110161543, + "learning_rate": 0.001, + "loss": 0.4016, + "step": 11454 + }, + { + "epoch": 0.31606866319234866, + "grad_norm": 0.003428233554586768, + "learning_rate": 0.001, + "loss": 0.3685, + "step": 11455 + }, + { + "epoch": 0.3160962553934131, + "grad_norm": 0.0027770015876740217, + "learning_rate": 0.001, + "loss": 0.4151, + "step": 11456 + }, + { + "epoch": 0.3161238475944774, + "grad_norm": 0.002452113199979067, + "learning_rate": 0.001, + "loss": 0.4296, + "step": 11457 + }, + { + "epoch": 0.3161514397955418, + "grad_norm": 0.0024971971288323402, + "learning_rate": 0.001, + "loss": 0.413, + "step": 11458 + }, + { + "epoch": 0.3161790319966062, + "grad_norm": 0.002314744982868433, + "learning_rate": 0.001, + "loss": 0.4278, + "step": 11459 + }, + { + "epoch": 0.31620662419767054, + "grad_norm": 0.004431735258549452, + "learning_rate": 0.001, + "loss": 0.3828, + "step": 11460 + }, + { + "epoch": 0.3162342163987349, + "grad_norm": 0.007284036837518215, + "learning_rate": 0.001, + "loss": 0.3353, + "step": 11461 + }, + { + "epoch": 0.31626180859979924, + "grad_norm": 0.004431730601936579, + "learning_rate": 0.001, + "loss": 0.3701, + "step": 11462 + }, + { + "epoch": 0.31628940080086365, + "grad_norm": 0.003810502588748932, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 11463 + }, + { + "epoch": 0.316316993001928, + "grad_norm": 0.004132286179810762, + "learning_rate": 0.001, + "loss": 0.379, + "step": 11464 + }, + { + "epoch": 0.31634458520299236, + "grad_norm": 0.0026423779781907797, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 11465 + }, + { + "epoch": 0.31637217740405676, + "grad_norm": 0.0030175880528986454, + "learning_rate": 0.001, + "loss": 0.3806, + "step": 11466 + }, + { + "epoch": 0.3163997696051211, + "grad_norm": 0.0023432145826518536, + "learning_rate": 0.001, + "loss": 0.422, + "step": 11467 + }, + { + "epoch": 0.31642736180618547, + "grad_norm": 0.004060344770550728, + "learning_rate": 0.001, + "loss": 0.3897, + "step": 11468 + }, + { + "epoch": 0.3164549540072499, + "grad_norm": 0.00273421430028975, + "learning_rate": 0.001, + "loss": 0.3729, + "step": 11469 + }, + { + "epoch": 0.31648254620831423, + "grad_norm": 0.0024940611328929663, + "learning_rate": 0.001, + "loss": 0.3923, + "step": 11470 + }, + { + "epoch": 0.3165101384093786, + "grad_norm": 0.005460789427161217, + "learning_rate": 0.001, + "loss": 0.4153, + "step": 11471 + }, + { + "epoch": 0.31653773061044294, + "grad_norm": 0.0026238500140607357, + "learning_rate": 0.001, + "loss": 0.4299, + "step": 11472 + }, + { + "epoch": 0.31656532281150734, + "grad_norm": 0.005485900212079287, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 11473 + }, + { + "epoch": 0.3165929150125717, + "grad_norm": 0.0072882408276200294, + "learning_rate": 0.001, + "loss": 0.4056, + "step": 11474 + }, + { + "epoch": 0.31662050721363605, + "grad_norm": 0.002355532720685005, + "learning_rate": 0.001, + "loss": 0.4526, + "step": 11475 + }, + { + "epoch": 0.31664809941470046, + "grad_norm": 0.0029103090055286884, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 11476 + }, + { + "epoch": 0.3166756916157648, + "grad_norm": 0.005752061028033495, + "learning_rate": 0.001, + "loss": 0.4086, + "step": 11477 + }, + { + "epoch": 0.31670328381682916, + "grad_norm": 0.0035546228755265474, + "learning_rate": 0.001, + "loss": 0.4, + "step": 11478 + }, + { + "epoch": 0.31673087601789357, + "grad_norm": 0.007723371963948011, + "learning_rate": 0.001, + "loss": 0.4098, + "step": 11479 + }, + { + "epoch": 0.3167584682189579, + "grad_norm": 0.005489727947860956, + "learning_rate": 0.001, + "loss": 0.3787, + "step": 11480 + }, + { + "epoch": 0.3167860604200223, + "grad_norm": 0.0035039815120399, + "learning_rate": 0.001, + "loss": 0.4038, + "step": 11481 + }, + { + "epoch": 0.3168136526210866, + "grad_norm": 0.00970650278031826, + "learning_rate": 0.001, + "loss": 0.4075, + "step": 11482 + }, + { + "epoch": 0.31684124482215104, + "grad_norm": 0.002773486776277423, + "learning_rate": 0.001, + "loss": 0.4455, + "step": 11483 + }, + { + "epoch": 0.3168688370232154, + "grad_norm": 0.002964557381346822, + "learning_rate": 0.001, + "loss": 0.389, + "step": 11484 + }, + { + "epoch": 0.31689642922427974, + "grad_norm": 0.002768107457086444, + "learning_rate": 0.001, + "loss": 0.4399, + "step": 11485 + }, + { + "epoch": 0.31692402142534415, + "grad_norm": 0.003310150234028697, + "learning_rate": 0.001, + "loss": 0.3921, + "step": 11486 + }, + { + "epoch": 0.3169516136264085, + "grad_norm": 0.0038422702345997095, + "learning_rate": 0.001, + "loss": 0.3947, + "step": 11487 + }, + { + "epoch": 0.31697920582747285, + "grad_norm": 0.0059617673978209496, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 11488 + }, + { + "epoch": 0.3170067980285372, + "grad_norm": 0.004942035768181086, + "learning_rate": 0.001, + "loss": 0.4235, + "step": 11489 + }, + { + "epoch": 0.3170343902296016, + "grad_norm": 0.005482214502990246, + "learning_rate": 0.001, + "loss": 0.3917, + "step": 11490 + }, + { + "epoch": 0.31706198243066597, + "grad_norm": 0.004851524252444506, + "learning_rate": 0.001, + "loss": 0.3555, + "step": 11491 + }, + { + "epoch": 0.3170895746317303, + "grad_norm": 0.0029763453640043736, + "learning_rate": 0.001, + "loss": 0.4055, + "step": 11492 + }, + { + "epoch": 0.31711716683279473, + "grad_norm": 0.0036718971095979214, + "learning_rate": 0.001, + "loss": 0.3886, + "step": 11493 + }, + { + "epoch": 0.3171447590338591, + "grad_norm": 0.003948654048144817, + "learning_rate": 0.001, + "loss": 0.4239, + "step": 11494 + }, + { + "epoch": 0.31717235123492343, + "grad_norm": 0.011904267594218254, + "learning_rate": 0.001, + "loss": 0.3985, + "step": 11495 + }, + { + "epoch": 0.31719994343598784, + "grad_norm": 0.003118073334917426, + "learning_rate": 0.001, + "loss": 0.383, + "step": 11496 + }, + { + "epoch": 0.3172275356370522, + "grad_norm": 0.0038996157236397266, + "learning_rate": 0.001, + "loss": 0.3929, + "step": 11497 + }, + { + "epoch": 0.31725512783811655, + "grad_norm": 0.003080842550843954, + "learning_rate": 0.001, + "loss": 0.379, + "step": 11498 + }, + { + "epoch": 0.3172827200391809, + "grad_norm": 0.0027819147799164057, + "learning_rate": 0.001, + "loss": 0.3748, + "step": 11499 + }, + { + "epoch": 0.3173103122402453, + "grad_norm": 0.0029706538189202547, + "learning_rate": 0.001, + "loss": 0.3841, + "step": 11500 + }, + { + "epoch": 0.3173103122402453, + "eval_runtime": 23.5592, + "eval_samples_per_second": 1.358, + "eval_steps_per_second": 0.17, + "step": 11500 + }, + { + "epoch": 0.31733790444130966, + "grad_norm": 0.002240256406366825, + "learning_rate": 0.001, + "loss": 0.4069, + "step": 11501 + }, + { + "epoch": 0.317365496642374, + "grad_norm": 0.0034430986270308495, + "learning_rate": 0.001, + "loss": 0.3758, + "step": 11502 + }, + { + "epoch": 0.3173930888434384, + "grad_norm": 0.01547628827393055, + "learning_rate": 0.001, + "loss": 0.3948, + "step": 11503 + }, + { + "epoch": 0.3174206810445028, + "grad_norm": 0.003593276022002101, + "learning_rate": 0.001, + "loss": 0.41, + "step": 11504 + }, + { + "epoch": 0.3174482732455671, + "grad_norm": 0.00244096084497869, + "learning_rate": 0.001, + "loss": 0.4129, + "step": 11505 + }, + { + "epoch": 0.31747586544663153, + "grad_norm": 0.002778639318421483, + "learning_rate": 0.001, + "loss": 0.3911, + "step": 11506 + }, + { + "epoch": 0.3175034576476959, + "grad_norm": 0.018177034333348274, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 11507 + }, + { + "epoch": 0.31753104984876024, + "grad_norm": 0.005340322386473417, + "learning_rate": 0.001, + "loss": 0.4106, + "step": 11508 + }, + { + "epoch": 0.3175586420498246, + "grad_norm": 0.0037258395459502935, + "learning_rate": 0.001, + "loss": 0.4093, + "step": 11509 + }, + { + "epoch": 0.317586234250889, + "grad_norm": 0.004941198974847794, + "learning_rate": 0.001, + "loss": 0.3956, + "step": 11510 + }, + { + "epoch": 0.31761382645195335, + "grad_norm": 0.0029092850163578987, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 11511 + }, + { + "epoch": 0.3176414186530177, + "grad_norm": 0.0023247981444001198, + "learning_rate": 0.001, + "loss": 0.3985, + "step": 11512 + }, + { + "epoch": 0.3176690108540821, + "grad_norm": 0.0024109010118991137, + "learning_rate": 0.001, + "loss": 0.3925, + "step": 11513 + }, + { + "epoch": 0.31769660305514646, + "grad_norm": 0.004136643372476101, + "learning_rate": 0.001, + "loss": 0.4308, + "step": 11514 + }, + { + "epoch": 0.3177241952562108, + "grad_norm": 0.002965535270050168, + "learning_rate": 0.001, + "loss": 0.3983, + "step": 11515 + }, + { + "epoch": 0.3177517874572752, + "grad_norm": 0.0028476836159825325, + "learning_rate": 0.001, + "loss": 0.4078, + "step": 11516 + }, + { + "epoch": 0.3177793796583396, + "grad_norm": 0.002511124825105071, + "learning_rate": 0.001, + "loss": 0.3867, + "step": 11517 + }, + { + "epoch": 0.31780697185940393, + "grad_norm": 0.002977583557367325, + "learning_rate": 0.001, + "loss": 0.403, + "step": 11518 + }, + { + "epoch": 0.3178345640604683, + "grad_norm": 0.0023715696297585964, + "learning_rate": 0.001, + "loss": 0.3795, + "step": 11519 + }, + { + "epoch": 0.3178621562615327, + "grad_norm": 0.007563414517790079, + "learning_rate": 0.001, + "loss": 0.388, + "step": 11520 + }, + { + "epoch": 0.31788974846259704, + "grad_norm": 0.004035325720906258, + "learning_rate": 0.001, + "loss": 0.379, + "step": 11521 + }, + { + "epoch": 0.3179173406636614, + "grad_norm": 0.00366159132681787, + "learning_rate": 0.001, + "loss": 0.4218, + "step": 11522 + }, + { + "epoch": 0.3179449328647258, + "grad_norm": 0.0030379300005733967, + "learning_rate": 0.001, + "loss": 0.4373, + "step": 11523 + }, + { + "epoch": 0.31797252506579016, + "grad_norm": 0.002500406000763178, + "learning_rate": 0.001, + "loss": 0.4019, + "step": 11524 + }, + { + "epoch": 0.3180001172668545, + "grad_norm": 0.0029437800403684378, + "learning_rate": 0.001, + "loss": 0.4073, + "step": 11525 + }, + { + "epoch": 0.3180277094679189, + "grad_norm": 0.003492504358291626, + "learning_rate": 0.001, + "loss": 0.3619, + "step": 11526 + }, + { + "epoch": 0.31805530166898327, + "grad_norm": 0.002901204163208604, + "learning_rate": 0.001, + "loss": 0.3956, + "step": 11527 + }, + { + "epoch": 0.3180828938700476, + "grad_norm": 0.003703712485730648, + "learning_rate": 0.001, + "loss": 0.4045, + "step": 11528 + }, + { + "epoch": 0.318110486071112, + "grad_norm": 0.003048625076189637, + "learning_rate": 0.001, + "loss": 0.3834, + "step": 11529 + }, + { + "epoch": 0.3181380782721764, + "grad_norm": 0.0022534143645316362, + "learning_rate": 0.001, + "loss": 0.4213, + "step": 11530 + }, + { + "epoch": 0.31816567047324074, + "grad_norm": 0.0026876600459218025, + "learning_rate": 0.001, + "loss": 0.3785, + "step": 11531 + }, + { + "epoch": 0.3181932626743051, + "grad_norm": 0.00236628670245409, + "learning_rate": 0.001, + "loss": 0.4376, + "step": 11532 + }, + { + "epoch": 0.3182208548753695, + "grad_norm": 0.004099288955330849, + "learning_rate": 0.001, + "loss": 0.3916, + "step": 11533 + }, + { + "epoch": 0.31824844707643385, + "grad_norm": 0.0041442555375397205, + "learning_rate": 0.001, + "loss": 0.358, + "step": 11534 + }, + { + "epoch": 0.3182760392774982, + "grad_norm": 0.003338357200846076, + "learning_rate": 0.001, + "loss": 0.4307, + "step": 11535 + }, + { + "epoch": 0.3183036314785626, + "grad_norm": 0.002358140889555216, + "learning_rate": 0.001, + "loss": 0.4438, + "step": 11536 + }, + { + "epoch": 0.31833122367962696, + "grad_norm": 0.007097299210727215, + "learning_rate": 0.001, + "loss": 0.3917, + "step": 11537 + }, + { + "epoch": 0.3183588158806913, + "grad_norm": 0.003606355981901288, + "learning_rate": 0.001, + "loss": 0.4283, + "step": 11538 + }, + { + "epoch": 0.31838640808175567, + "grad_norm": 0.0027023248840123415, + "learning_rate": 0.001, + "loss": 0.379, + "step": 11539 + }, + { + "epoch": 0.3184140002828201, + "grad_norm": 0.0069968136958777905, + "learning_rate": 0.001, + "loss": 0.3748, + "step": 11540 + }, + { + "epoch": 0.31844159248388443, + "grad_norm": 0.0028446821961551905, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 11541 + }, + { + "epoch": 0.3184691846849488, + "grad_norm": 0.003783198306336999, + "learning_rate": 0.001, + "loss": 0.3773, + "step": 11542 + }, + { + "epoch": 0.3184967768860132, + "grad_norm": 0.0042613414116203785, + "learning_rate": 0.001, + "loss": 0.4125, + "step": 11543 + }, + { + "epoch": 0.31852436908707754, + "grad_norm": 0.023959210142493248, + "learning_rate": 0.001, + "loss": 0.4045, + "step": 11544 + }, + { + "epoch": 0.3185519612881419, + "grad_norm": 0.02399979531764984, + "learning_rate": 0.001, + "loss": 0.3862, + "step": 11545 + }, + { + "epoch": 0.3185795534892063, + "grad_norm": 0.003035551868379116, + "learning_rate": 0.001, + "loss": 0.427, + "step": 11546 + }, + { + "epoch": 0.31860714569027065, + "grad_norm": 0.0028156766202300787, + "learning_rate": 0.001, + "loss": 0.394, + "step": 11547 + }, + { + "epoch": 0.318634737891335, + "grad_norm": 0.0029186957981437445, + "learning_rate": 0.001, + "loss": 0.4204, + "step": 11548 + }, + { + "epoch": 0.31866233009239936, + "grad_norm": 0.0037356752436608076, + "learning_rate": 0.001, + "loss": 0.4317, + "step": 11549 + }, + { + "epoch": 0.31868992229346377, + "grad_norm": 0.0047831544652581215, + "learning_rate": 0.001, + "loss": 0.3895, + "step": 11550 + }, + { + "epoch": 0.3187175144945281, + "grad_norm": 0.0023849967401474714, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 11551 + }, + { + "epoch": 0.3187451066955925, + "grad_norm": 0.004315240308642387, + "learning_rate": 0.001, + "loss": 0.3765, + "step": 11552 + }, + { + "epoch": 0.3187726988966569, + "grad_norm": 0.003803263884037733, + "learning_rate": 0.001, + "loss": 0.393, + "step": 11553 + }, + { + "epoch": 0.31880029109772123, + "grad_norm": 0.004018161911517382, + "learning_rate": 0.001, + "loss": 0.3923, + "step": 11554 + }, + { + "epoch": 0.3188278832987856, + "grad_norm": 0.0036311408039182425, + "learning_rate": 0.001, + "loss": 0.3995, + "step": 11555 + }, + { + "epoch": 0.31885547549985, + "grad_norm": 0.004630459472537041, + "learning_rate": 0.001, + "loss": 0.4245, + "step": 11556 + }, + { + "epoch": 0.31888306770091435, + "grad_norm": 0.0048345946706831455, + "learning_rate": 0.001, + "loss": 0.3803, + "step": 11557 + }, + { + "epoch": 0.3189106599019787, + "grad_norm": 0.005161342676728964, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 11558 + }, + { + "epoch": 0.31893825210304305, + "grad_norm": 0.0023576943203806877, + "learning_rate": 0.001, + "loss": 0.4402, + "step": 11559 + }, + { + "epoch": 0.31896584430410746, + "grad_norm": 0.004130503162741661, + "learning_rate": 0.001, + "loss": 0.3545, + "step": 11560 + }, + { + "epoch": 0.3189934365051718, + "grad_norm": 0.005308313295245171, + "learning_rate": 0.001, + "loss": 0.3788, + "step": 11561 + }, + { + "epoch": 0.31902102870623616, + "grad_norm": 0.0037142925430089235, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 11562 + }, + { + "epoch": 0.3190486209073006, + "grad_norm": 0.0037768762558698654, + "learning_rate": 0.001, + "loss": 0.3944, + "step": 11563 + }, + { + "epoch": 0.3190762131083649, + "grad_norm": 0.004423100501298904, + "learning_rate": 0.001, + "loss": 0.3734, + "step": 11564 + }, + { + "epoch": 0.3191038053094293, + "grad_norm": 0.0066397832706570625, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 11565 + }, + { + "epoch": 0.3191313975104937, + "grad_norm": 0.003907995298504829, + "learning_rate": 0.001, + "loss": 0.3999, + "step": 11566 + }, + { + "epoch": 0.31915898971155804, + "grad_norm": 0.003112133825197816, + "learning_rate": 0.001, + "loss": 0.4219, + "step": 11567 + }, + { + "epoch": 0.3191865819126224, + "grad_norm": 0.002988254651427269, + "learning_rate": 0.001, + "loss": 0.3865, + "step": 11568 + }, + { + "epoch": 0.31921417411368674, + "grad_norm": 0.023827673867344856, + "learning_rate": 0.001, + "loss": 0.4147, + "step": 11569 + }, + { + "epoch": 0.31924176631475115, + "grad_norm": 0.0068152598105371, + "learning_rate": 0.001, + "loss": 0.433, + "step": 11570 + }, + { + "epoch": 0.3192693585158155, + "grad_norm": 0.004217714536935091, + "learning_rate": 0.001, + "loss": 0.3661, + "step": 11571 + }, + { + "epoch": 0.31929695071687986, + "grad_norm": 0.002245552372187376, + "learning_rate": 0.001, + "loss": 0.3537, + "step": 11572 + }, + { + "epoch": 0.31932454291794427, + "grad_norm": 0.0026237708516418934, + "learning_rate": 0.001, + "loss": 0.3694, + "step": 11573 + }, + { + "epoch": 0.3193521351190086, + "grad_norm": 0.0031141149811446667, + "learning_rate": 0.001, + "loss": 0.4304, + "step": 11574 + }, + { + "epoch": 0.31937972732007297, + "grad_norm": 0.004059195052832365, + "learning_rate": 0.001, + "loss": 0.4158, + "step": 11575 + }, + { + "epoch": 0.3194073195211373, + "grad_norm": 0.002976267132908106, + "learning_rate": 0.001, + "loss": 0.4212, + "step": 11576 + }, + { + "epoch": 0.31943491172220173, + "grad_norm": 0.0048133903183043, + "learning_rate": 0.001, + "loss": 0.3741, + "step": 11577 + }, + { + "epoch": 0.3194625039232661, + "grad_norm": 0.002898376202210784, + "learning_rate": 0.001, + "loss": 0.3809, + "step": 11578 + }, + { + "epoch": 0.31949009612433044, + "grad_norm": 0.0033135826233774424, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 11579 + }, + { + "epoch": 0.31951768832539484, + "grad_norm": 0.005857154726982117, + "learning_rate": 0.001, + "loss": 0.3859, + "step": 11580 + }, + { + "epoch": 0.3195452805264592, + "grad_norm": 0.0028846007771790028, + "learning_rate": 0.001, + "loss": 0.3835, + "step": 11581 + }, + { + "epoch": 0.31957287272752355, + "grad_norm": 0.002914070850238204, + "learning_rate": 0.001, + "loss": 0.3684, + "step": 11582 + }, + { + "epoch": 0.31960046492858796, + "grad_norm": 0.002994397422298789, + "learning_rate": 0.001, + "loss": 0.3825, + "step": 11583 + }, + { + "epoch": 0.3196280571296523, + "grad_norm": 0.0033264674711972475, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 11584 + }, + { + "epoch": 0.31965564933071666, + "grad_norm": 0.002203144831582904, + "learning_rate": 0.001, + "loss": 0.4649, + "step": 11585 + }, + { + "epoch": 0.319683241531781, + "grad_norm": 0.004528568591922522, + "learning_rate": 0.001, + "loss": 0.3861, + "step": 11586 + }, + { + "epoch": 0.3197108337328454, + "grad_norm": 0.003971675876528025, + "learning_rate": 0.001, + "loss": 0.387, + "step": 11587 + }, + { + "epoch": 0.3197384259339098, + "grad_norm": 0.003969928715378046, + "learning_rate": 0.001, + "loss": 0.3971, + "step": 11588 + }, + { + "epoch": 0.31976601813497413, + "grad_norm": 0.0036615943536162376, + "learning_rate": 0.001, + "loss": 0.3822, + "step": 11589 + }, + { + "epoch": 0.31979361033603854, + "grad_norm": 0.0023915197234600782, + "learning_rate": 0.001, + "loss": 0.4089, + "step": 11590 + }, + { + "epoch": 0.3198212025371029, + "grad_norm": 0.0024923242162913084, + "learning_rate": 0.001, + "loss": 0.4314, + "step": 11591 + }, + { + "epoch": 0.31984879473816724, + "grad_norm": 0.002602427499368787, + "learning_rate": 0.001, + "loss": 0.3861, + "step": 11592 + }, + { + "epoch": 0.31987638693923165, + "grad_norm": 0.005880540236830711, + "learning_rate": 0.001, + "loss": 0.3874, + "step": 11593 + }, + { + "epoch": 0.319903979140296, + "grad_norm": 0.00876854732632637, + "learning_rate": 0.001, + "loss": 0.3895, + "step": 11594 + }, + { + "epoch": 0.31993157134136035, + "grad_norm": 0.006111782975494862, + "learning_rate": 0.001, + "loss": 0.4055, + "step": 11595 + }, + { + "epoch": 0.3199591635424247, + "grad_norm": 0.006039989646524191, + "learning_rate": 0.001, + "loss": 0.4061, + "step": 11596 + }, + { + "epoch": 0.3199867557434891, + "grad_norm": 0.008590095676481724, + "learning_rate": 0.001, + "loss": 0.3448, + "step": 11597 + }, + { + "epoch": 0.32001434794455347, + "grad_norm": 0.015925157815217972, + "learning_rate": 0.001, + "loss": 0.409, + "step": 11598 + }, + { + "epoch": 0.3200419401456178, + "grad_norm": 0.003673270810395479, + "learning_rate": 0.001, + "loss": 0.3928, + "step": 11599 + }, + { + "epoch": 0.32006953234668223, + "grad_norm": 0.0026890907902270555, + "learning_rate": 0.001, + "loss": 0.4251, + "step": 11600 + }, + { + "epoch": 0.3200971245477466, + "grad_norm": 0.0037518092431128025, + "learning_rate": 0.001, + "loss": 0.4057, + "step": 11601 + }, + { + "epoch": 0.32012471674881093, + "grad_norm": 0.003528765868395567, + "learning_rate": 0.001, + "loss": 0.4144, + "step": 11602 + }, + { + "epoch": 0.32015230894987534, + "grad_norm": 0.0036971168592572212, + "learning_rate": 0.001, + "loss": 0.3964, + "step": 11603 + }, + { + "epoch": 0.3201799011509397, + "grad_norm": 0.003124906914308667, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 11604 + }, + { + "epoch": 0.32020749335200405, + "grad_norm": 0.0033891245257109404, + "learning_rate": 0.001, + "loss": 0.4146, + "step": 11605 + }, + { + "epoch": 0.3202350855530684, + "grad_norm": 0.002698018215596676, + "learning_rate": 0.001, + "loss": 0.4173, + "step": 11606 + }, + { + "epoch": 0.3202626777541328, + "grad_norm": 0.0034322156570851803, + "learning_rate": 0.001, + "loss": 0.3921, + "step": 11607 + }, + { + "epoch": 0.32029026995519716, + "grad_norm": 0.0033172587864100933, + "learning_rate": 0.001, + "loss": 0.3796, + "step": 11608 + }, + { + "epoch": 0.3203178621562615, + "grad_norm": 0.0030292505398392677, + "learning_rate": 0.001, + "loss": 0.3811, + "step": 11609 + }, + { + "epoch": 0.3203454543573259, + "grad_norm": 0.0033198799937963486, + "learning_rate": 0.001, + "loss": 0.4154, + "step": 11610 + }, + { + "epoch": 0.3203730465583903, + "grad_norm": 0.003341773757711053, + "learning_rate": 0.001, + "loss": 0.3965, + "step": 11611 + }, + { + "epoch": 0.3204006387594546, + "grad_norm": 0.004839817062020302, + "learning_rate": 0.001, + "loss": 0.3685, + "step": 11612 + }, + { + "epoch": 0.32042823096051903, + "grad_norm": 0.0031633905600756407, + "learning_rate": 0.001, + "loss": 0.3595, + "step": 11613 + }, + { + "epoch": 0.3204558231615834, + "grad_norm": 0.011446718126535416, + "learning_rate": 0.001, + "loss": 0.4126, + "step": 11614 + }, + { + "epoch": 0.32048341536264774, + "grad_norm": 0.011381502263247967, + "learning_rate": 0.001, + "loss": 0.3986, + "step": 11615 + }, + { + "epoch": 0.3205110075637121, + "grad_norm": 0.013346023857593536, + "learning_rate": 0.001, + "loss": 0.4054, + "step": 11616 + }, + { + "epoch": 0.3205385997647765, + "grad_norm": 0.004878256935626268, + "learning_rate": 0.001, + "loss": 0.4247, + "step": 11617 + }, + { + "epoch": 0.32056619196584085, + "grad_norm": 0.005075674969702959, + "learning_rate": 0.001, + "loss": 0.406, + "step": 11618 + }, + { + "epoch": 0.3205937841669052, + "grad_norm": 0.005384589079767466, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 11619 + }, + { + "epoch": 0.3206213763679696, + "grad_norm": 0.004261939786374569, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 11620 + }, + { + "epoch": 0.32064896856903397, + "grad_norm": 0.008900157175958157, + "learning_rate": 0.001, + "loss": 0.4182, + "step": 11621 + }, + { + "epoch": 0.3206765607700983, + "grad_norm": 0.0024166591465473175, + "learning_rate": 0.001, + "loss": 0.4527, + "step": 11622 + }, + { + "epoch": 0.3207041529711627, + "grad_norm": 0.0027629456017166376, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 11623 + }, + { + "epoch": 0.3207317451722271, + "grad_norm": 0.002562351291999221, + "learning_rate": 0.001, + "loss": 0.3683, + "step": 11624 + }, + { + "epoch": 0.32075933737329143, + "grad_norm": 0.0035787278320640326, + "learning_rate": 0.001, + "loss": 0.388, + "step": 11625 + }, + { + "epoch": 0.3207869295743558, + "grad_norm": 0.0025681303814053535, + "learning_rate": 0.001, + "loss": 0.414, + "step": 11626 + }, + { + "epoch": 0.3208145217754202, + "grad_norm": 0.003248112043365836, + "learning_rate": 0.001, + "loss": 0.4581, + "step": 11627 + }, + { + "epoch": 0.32084211397648454, + "grad_norm": 0.001800362253561616, + "learning_rate": 0.001, + "loss": 0.4295, + "step": 11628 + }, + { + "epoch": 0.3208697061775489, + "grad_norm": 0.002697241958230734, + "learning_rate": 0.001, + "loss": 0.3712, + "step": 11629 + }, + { + "epoch": 0.3208972983786133, + "grad_norm": 0.0029813311994075775, + "learning_rate": 0.001, + "loss": 0.3826, + "step": 11630 + }, + { + "epoch": 0.32092489057967766, + "grad_norm": 0.006515763234347105, + "learning_rate": 0.001, + "loss": 0.3937, + "step": 11631 + }, + { + "epoch": 0.320952482780742, + "grad_norm": 0.002813748549669981, + "learning_rate": 0.001, + "loss": 0.3896, + "step": 11632 + }, + { + "epoch": 0.3209800749818064, + "grad_norm": 0.00456245755776763, + "learning_rate": 0.001, + "loss": 0.4173, + "step": 11633 + }, + { + "epoch": 0.32100766718287077, + "grad_norm": 0.0027176521252840757, + "learning_rate": 0.001, + "loss": 0.4196, + "step": 11634 + }, + { + "epoch": 0.3210352593839351, + "grad_norm": 0.003955816384404898, + "learning_rate": 0.001, + "loss": 0.4418, + "step": 11635 + }, + { + "epoch": 0.3210628515849995, + "grad_norm": 0.005285539198666811, + "learning_rate": 0.001, + "loss": 0.3808, + "step": 11636 + }, + { + "epoch": 0.3210904437860639, + "grad_norm": 0.0025900506880134344, + "learning_rate": 0.001, + "loss": 0.4138, + "step": 11637 + }, + { + "epoch": 0.32111803598712824, + "grad_norm": 0.006391063332557678, + "learning_rate": 0.001, + "loss": 0.4104, + "step": 11638 + }, + { + "epoch": 0.3211456281881926, + "grad_norm": 0.0032550417818129063, + "learning_rate": 0.001, + "loss": 0.3613, + "step": 11639 + }, + { + "epoch": 0.321173220389257, + "grad_norm": 0.0030542064923793077, + "learning_rate": 0.001, + "loss": 0.4094, + "step": 11640 + }, + { + "epoch": 0.32120081259032135, + "grad_norm": 0.0037448366638273, + "learning_rate": 0.001, + "loss": 0.3858, + "step": 11641 + }, + { + "epoch": 0.3212284047913857, + "grad_norm": 0.003444848582148552, + "learning_rate": 0.001, + "loss": 0.4203, + "step": 11642 + }, + { + "epoch": 0.3212559969924501, + "grad_norm": 0.004159749951213598, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 11643 + }, + { + "epoch": 0.32128358919351446, + "grad_norm": 0.002784531796351075, + "learning_rate": 0.001, + "loss": 0.4541, + "step": 11644 + }, + { + "epoch": 0.3213111813945788, + "grad_norm": 0.004061343614012003, + "learning_rate": 0.001, + "loss": 0.377, + "step": 11645 + }, + { + "epoch": 0.32133877359564317, + "grad_norm": 0.003336769063025713, + "learning_rate": 0.001, + "loss": 0.3717, + "step": 11646 + }, + { + "epoch": 0.3213663657967076, + "grad_norm": 0.002995783928781748, + "learning_rate": 0.001, + "loss": 0.3825, + "step": 11647 + }, + { + "epoch": 0.32139395799777193, + "grad_norm": 0.0039530228823423386, + "learning_rate": 0.001, + "loss": 0.3861, + "step": 11648 + }, + { + "epoch": 0.3214215501988363, + "grad_norm": 0.0023765622172504663, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 11649 + }, + { + "epoch": 0.3214491423999007, + "grad_norm": 0.006312607321888208, + "learning_rate": 0.001, + "loss": 0.3821, + "step": 11650 + }, + { + "epoch": 0.32147673460096504, + "grad_norm": 0.013422048650681973, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 11651 + }, + { + "epoch": 0.3215043268020294, + "grad_norm": 0.0030129451770335436, + "learning_rate": 0.001, + "loss": 0.4134, + "step": 11652 + }, + { + "epoch": 0.3215319190030938, + "grad_norm": 0.003410701872780919, + "learning_rate": 0.001, + "loss": 0.3984, + "step": 11653 + }, + { + "epoch": 0.32155951120415815, + "grad_norm": 0.004033817909657955, + "learning_rate": 0.001, + "loss": 0.4192, + "step": 11654 + }, + { + "epoch": 0.3215871034052225, + "grad_norm": 0.01334238052368164, + "learning_rate": 0.001, + "loss": 0.4287, + "step": 11655 + }, + { + "epoch": 0.32161469560628686, + "grad_norm": 0.0067011150531470776, + "learning_rate": 0.001, + "loss": 0.3896, + "step": 11656 + }, + { + "epoch": 0.32164228780735127, + "grad_norm": 0.022604364901781082, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 11657 + }, + { + "epoch": 0.3216698800084156, + "grad_norm": 0.00880459789186716, + "learning_rate": 0.001, + "loss": 0.4127, + "step": 11658 + }, + { + "epoch": 0.32169747220948, + "grad_norm": 0.00856684148311615, + "learning_rate": 0.001, + "loss": 0.4005, + "step": 11659 + }, + { + "epoch": 0.3217250644105444, + "grad_norm": 0.009844634681940079, + "learning_rate": 0.001, + "loss": 0.4195, + "step": 11660 + }, + { + "epoch": 0.32175265661160873, + "grad_norm": 0.002705506980419159, + "learning_rate": 0.001, + "loss": 0.4083, + "step": 11661 + }, + { + "epoch": 0.3217802488126731, + "grad_norm": 0.004265344236046076, + "learning_rate": 0.001, + "loss": 0.3886, + "step": 11662 + }, + { + "epoch": 0.3218078410137375, + "grad_norm": 0.0042647975496947765, + "learning_rate": 0.001, + "loss": 0.3847, + "step": 11663 + }, + { + "epoch": 0.32183543321480185, + "grad_norm": 0.0029369243420660496, + "learning_rate": 0.001, + "loss": 0.4092, + "step": 11664 + }, + { + "epoch": 0.3218630254158662, + "grad_norm": 0.00970220472663641, + "learning_rate": 0.001, + "loss": 0.4123, + "step": 11665 + }, + { + "epoch": 0.32189061761693055, + "grad_norm": 0.006463578902184963, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 11666 + }, + { + "epoch": 0.32191820981799496, + "grad_norm": 0.0018640676280483603, + "learning_rate": 0.001, + "loss": 0.4162, + "step": 11667 + }, + { + "epoch": 0.3219458020190593, + "grad_norm": 0.002777427202090621, + "learning_rate": 0.001, + "loss": 0.4171, + "step": 11668 + }, + { + "epoch": 0.32197339422012367, + "grad_norm": 0.004610841162502766, + "learning_rate": 0.001, + "loss": 0.3625, + "step": 11669 + }, + { + "epoch": 0.3220009864211881, + "grad_norm": 0.006540005095303059, + "learning_rate": 0.001, + "loss": 0.4204, + "step": 11670 + }, + { + "epoch": 0.3220285786222524, + "grad_norm": 0.005402629263699055, + "learning_rate": 0.001, + "loss": 0.4004, + "step": 11671 + }, + { + "epoch": 0.3220561708233168, + "grad_norm": 0.00336782680824399, + "learning_rate": 0.001, + "loss": 0.4244, + "step": 11672 + }, + { + "epoch": 0.32208376302438113, + "grad_norm": 0.005407973658293486, + "learning_rate": 0.001, + "loss": 0.388, + "step": 11673 + }, + { + "epoch": 0.32211135522544554, + "grad_norm": 0.002696128562092781, + "learning_rate": 0.001, + "loss": 0.3825, + "step": 11674 + }, + { + "epoch": 0.3221389474265099, + "grad_norm": 0.0032647987827658653, + "learning_rate": 0.001, + "loss": 0.3783, + "step": 11675 + }, + { + "epoch": 0.32216653962757424, + "grad_norm": 0.005423584952950478, + "learning_rate": 0.001, + "loss": 0.3873, + "step": 11676 + }, + { + "epoch": 0.32219413182863865, + "grad_norm": 0.003348333528265357, + "learning_rate": 0.001, + "loss": 0.3715, + "step": 11677 + }, + { + "epoch": 0.322221724029703, + "grad_norm": 0.002469028811901808, + "learning_rate": 0.001, + "loss": 0.3968, + "step": 11678 + }, + { + "epoch": 0.32224931623076736, + "grad_norm": 0.0023266442585736513, + "learning_rate": 0.001, + "loss": 0.398, + "step": 11679 + }, + { + "epoch": 0.32227690843183177, + "grad_norm": 0.003974389284849167, + "learning_rate": 0.001, + "loss": 0.379, + "step": 11680 + }, + { + "epoch": 0.3223045006328961, + "grad_norm": 0.002209985861554742, + "learning_rate": 0.001, + "loss": 0.4158, + "step": 11681 + }, + { + "epoch": 0.32233209283396047, + "grad_norm": 0.002755221212282777, + "learning_rate": 0.001, + "loss": 0.4107, + "step": 11682 + }, + { + "epoch": 0.3223596850350248, + "grad_norm": 0.003273082198575139, + "learning_rate": 0.001, + "loss": 0.3741, + "step": 11683 + }, + { + "epoch": 0.32238727723608923, + "grad_norm": 0.003454297548159957, + "learning_rate": 0.001, + "loss": 0.4128, + "step": 11684 + }, + { + "epoch": 0.3224148694371536, + "grad_norm": 0.0033135886769741774, + "learning_rate": 0.001, + "loss": 0.4422, + "step": 11685 + }, + { + "epoch": 0.32244246163821794, + "grad_norm": 0.00300540286116302, + "learning_rate": 0.001, + "loss": 0.3943, + "step": 11686 + }, + { + "epoch": 0.32247005383928234, + "grad_norm": 0.0025454771239310503, + "learning_rate": 0.001, + "loss": 0.4337, + "step": 11687 + }, + { + "epoch": 0.3224976460403467, + "grad_norm": 0.0030585303902626038, + "learning_rate": 0.001, + "loss": 0.4475, + "step": 11688 + }, + { + "epoch": 0.32252523824141105, + "grad_norm": 0.0024806379806250334, + "learning_rate": 0.001, + "loss": 0.3628, + "step": 11689 + }, + { + "epoch": 0.32255283044247546, + "grad_norm": 0.002650330774486065, + "learning_rate": 0.001, + "loss": 0.4029, + "step": 11690 + }, + { + "epoch": 0.3225804226435398, + "grad_norm": 0.003308363724499941, + "learning_rate": 0.001, + "loss": 0.4157, + "step": 11691 + }, + { + "epoch": 0.32260801484460416, + "grad_norm": 0.0027979854494333267, + "learning_rate": 0.001, + "loss": 0.424, + "step": 11692 + }, + { + "epoch": 0.3226356070456685, + "grad_norm": 0.0035791087429970503, + "learning_rate": 0.001, + "loss": 0.3991, + "step": 11693 + }, + { + "epoch": 0.3226631992467329, + "grad_norm": 0.004163493402302265, + "learning_rate": 0.001, + "loss": 0.3944, + "step": 11694 + }, + { + "epoch": 0.3226907914477973, + "grad_norm": 0.0028729864861816168, + "learning_rate": 0.001, + "loss": 0.4305, + "step": 11695 + }, + { + "epoch": 0.32271838364886163, + "grad_norm": 0.018418997526168823, + "learning_rate": 0.001, + "loss": 0.4063, + "step": 11696 + }, + { + "epoch": 0.32274597584992604, + "grad_norm": 0.0038654280360788107, + "learning_rate": 0.001, + "loss": 0.3831, + "step": 11697 + }, + { + "epoch": 0.3227735680509904, + "grad_norm": 0.002807804848998785, + "learning_rate": 0.001, + "loss": 0.4042, + "step": 11698 + }, + { + "epoch": 0.32280116025205474, + "grad_norm": 0.003091311315074563, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 11699 + }, + { + "epoch": 0.32282875245311915, + "grad_norm": 0.002582644810900092, + "learning_rate": 0.001, + "loss": 0.3888, + "step": 11700 + }, + { + "epoch": 0.3228563446541835, + "grad_norm": 0.0029077474027872086, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 11701 + }, + { + "epoch": 0.32288393685524786, + "grad_norm": 0.004372213501483202, + "learning_rate": 0.001, + "loss": 0.4026, + "step": 11702 + }, + { + "epoch": 0.3229115290563122, + "grad_norm": 0.005009527318179607, + "learning_rate": 0.001, + "loss": 0.4202, + "step": 11703 + }, + { + "epoch": 0.3229391212573766, + "grad_norm": 0.002494605490937829, + "learning_rate": 0.001, + "loss": 0.4044, + "step": 11704 + }, + { + "epoch": 0.32296671345844097, + "grad_norm": 0.0030128166545182467, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 11705 + }, + { + "epoch": 0.3229943056595053, + "grad_norm": 0.005615463946014643, + "learning_rate": 0.001, + "loss": 0.3868, + "step": 11706 + }, + { + "epoch": 0.32302189786056973, + "grad_norm": 0.0026556167285889387, + "learning_rate": 0.001, + "loss": 0.4277, + "step": 11707 + }, + { + "epoch": 0.3230494900616341, + "grad_norm": 0.004147225525230169, + "learning_rate": 0.001, + "loss": 0.4191, + "step": 11708 + }, + { + "epoch": 0.32307708226269843, + "grad_norm": 0.004111787304282188, + "learning_rate": 0.001, + "loss": 0.3782, + "step": 11709 + }, + { + "epoch": 0.32310467446376284, + "grad_norm": 0.0030196192674338818, + "learning_rate": 0.001, + "loss": 0.4388, + "step": 11710 + }, + { + "epoch": 0.3231322666648272, + "grad_norm": 0.0029588905163109303, + "learning_rate": 0.001, + "loss": 0.4076, + "step": 11711 + }, + { + "epoch": 0.32315985886589155, + "grad_norm": 0.0030484474264085293, + "learning_rate": 0.001, + "loss": 0.4257, + "step": 11712 + }, + { + "epoch": 0.3231874510669559, + "grad_norm": 0.0031253311317414045, + "learning_rate": 0.001, + "loss": 0.3743, + "step": 11713 + }, + { + "epoch": 0.3232150432680203, + "grad_norm": 0.004489220213145018, + "learning_rate": 0.001, + "loss": 0.3902, + "step": 11714 + }, + { + "epoch": 0.32324263546908466, + "grad_norm": 0.0034049181267619133, + "learning_rate": 0.001, + "loss": 0.4251, + "step": 11715 + }, + { + "epoch": 0.323270227670149, + "grad_norm": 0.0045754867605865, + "learning_rate": 0.001, + "loss": 0.3577, + "step": 11716 + }, + { + "epoch": 0.3232978198712134, + "grad_norm": 0.0035644634626805782, + "learning_rate": 0.001, + "loss": 0.4277, + "step": 11717 + }, + { + "epoch": 0.3233254120722778, + "grad_norm": 0.0033633566927164793, + "learning_rate": 0.001, + "loss": 0.3759, + "step": 11718 + }, + { + "epoch": 0.3233530042733421, + "grad_norm": 0.003968440927565098, + "learning_rate": 0.001, + "loss": 0.3516, + "step": 11719 + }, + { + "epoch": 0.32338059647440653, + "grad_norm": 0.003708978882059455, + "learning_rate": 0.001, + "loss": 0.3837, + "step": 11720 + }, + { + "epoch": 0.3234081886754709, + "grad_norm": 0.002603841945528984, + "learning_rate": 0.001, + "loss": 0.4278, + "step": 11721 + }, + { + "epoch": 0.32343578087653524, + "grad_norm": 0.0024553535040467978, + "learning_rate": 0.001, + "loss": 0.3641, + "step": 11722 + }, + { + "epoch": 0.3234633730775996, + "grad_norm": 0.0033427192829549313, + "learning_rate": 0.001, + "loss": 0.3833, + "step": 11723 + }, + { + "epoch": 0.323490965278664, + "grad_norm": 0.008662039414048195, + "learning_rate": 0.001, + "loss": 0.4312, + "step": 11724 + }, + { + "epoch": 0.32351855747972835, + "grad_norm": 0.002312082564458251, + "learning_rate": 0.001, + "loss": 0.4505, + "step": 11725 + }, + { + "epoch": 0.3235461496807927, + "grad_norm": 0.0026776569429785013, + "learning_rate": 0.001, + "loss": 0.3886, + "step": 11726 + }, + { + "epoch": 0.3235737418818571, + "grad_norm": 0.003623252734541893, + "learning_rate": 0.001, + "loss": 0.4441, + "step": 11727 + }, + { + "epoch": 0.32360133408292147, + "grad_norm": 0.004855128470808268, + "learning_rate": 0.001, + "loss": 0.4372, + "step": 11728 + }, + { + "epoch": 0.3236289262839858, + "grad_norm": 0.0022675027139484882, + "learning_rate": 0.001, + "loss": 0.4128, + "step": 11729 + }, + { + "epoch": 0.3236565184850502, + "grad_norm": 0.003087605582550168, + "learning_rate": 0.001, + "loss": 0.3768, + "step": 11730 + }, + { + "epoch": 0.3236841106861146, + "grad_norm": 0.003196093952283263, + "learning_rate": 0.001, + "loss": 0.411, + "step": 11731 + }, + { + "epoch": 0.32371170288717893, + "grad_norm": 0.002098439959809184, + "learning_rate": 0.001, + "loss": 0.4549, + "step": 11732 + }, + { + "epoch": 0.3237392950882433, + "grad_norm": 0.0027286570984870195, + "learning_rate": 0.001, + "loss": 0.4001, + "step": 11733 + }, + { + "epoch": 0.3237668872893077, + "grad_norm": 0.0031044287607073784, + "learning_rate": 0.001, + "loss": 0.4143, + "step": 11734 + }, + { + "epoch": 0.32379447949037204, + "grad_norm": 0.002601184183731675, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 11735 + }, + { + "epoch": 0.3238220716914364, + "grad_norm": 0.003185172798112035, + "learning_rate": 0.001, + "loss": 0.37, + "step": 11736 + }, + { + "epoch": 0.3238496638925008, + "grad_norm": 0.0032646963372826576, + "learning_rate": 0.001, + "loss": 0.4073, + "step": 11737 + }, + { + "epoch": 0.32387725609356516, + "grad_norm": 0.004382995422929525, + "learning_rate": 0.001, + "loss": 0.3969, + "step": 11738 + }, + { + "epoch": 0.3239048482946295, + "grad_norm": 0.004330406431108713, + "learning_rate": 0.001, + "loss": 0.3801, + "step": 11739 + }, + { + "epoch": 0.3239324404956939, + "grad_norm": 0.01158521044999361, + "learning_rate": 0.001, + "loss": 0.3747, + "step": 11740 + }, + { + "epoch": 0.32396003269675827, + "grad_norm": 0.002962679835036397, + "learning_rate": 0.001, + "loss": 0.4056, + "step": 11741 + }, + { + "epoch": 0.3239876248978226, + "grad_norm": 0.0029328917153179646, + "learning_rate": 0.001, + "loss": 0.4242, + "step": 11742 + }, + { + "epoch": 0.324015217098887, + "grad_norm": 0.004508418962359428, + "learning_rate": 0.001, + "loss": 0.4069, + "step": 11743 + }, + { + "epoch": 0.3240428092999514, + "grad_norm": 0.0027705549728125334, + "learning_rate": 0.001, + "loss": 0.4, + "step": 11744 + }, + { + "epoch": 0.32407040150101574, + "grad_norm": 0.0022474832367151976, + "learning_rate": 0.001, + "loss": 0.4098, + "step": 11745 + }, + { + "epoch": 0.3240979937020801, + "grad_norm": 0.003890387015417218, + "learning_rate": 0.001, + "loss": 0.3878, + "step": 11746 + }, + { + "epoch": 0.3241255859031445, + "grad_norm": 0.0027710788417607546, + "learning_rate": 0.001, + "loss": 0.3838, + "step": 11747 + }, + { + "epoch": 0.32415317810420885, + "grad_norm": 0.002632656367495656, + "learning_rate": 0.001, + "loss": 0.3931, + "step": 11748 + }, + { + "epoch": 0.3241807703052732, + "grad_norm": 0.0026942237745970488, + "learning_rate": 0.001, + "loss": 0.3633, + "step": 11749 + }, + { + "epoch": 0.3242083625063376, + "grad_norm": 0.0025526436511427164, + "learning_rate": 0.001, + "loss": 0.404, + "step": 11750 + }, + { + "epoch": 0.32423595470740196, + "grad_norm": 0.002429589629173279, + "learning_rate": 0.001, + "loss": 0.3975, + "step": 11751 + }, + { + "epoch": 0.3242635469084663, + "grad_norm": 0.003830046160146594, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 11752 + }, + { + "epoch": 0.32429113910953067, + "grad_norm": 0.003458908060565591, + "learning_rate": 0.001, + "loss": 0.4292, + "step": 11753 + }, + { + "epoch": 0.3243187313105951, + "grad_norm": 0.002460696967318654, + "learning_rate": 0.001, + "loss": 0.3917, + "step": 11754 + }, + { + "epoch": 0.32434632351165943, + "grad_norm": 0.004534050356596708, + "learning_rate": 0.001, + "loss": 0.4216, + "step": 11755 + }, + { + "epoch": 0.3243739157127238, + "grad_norm": 0.0023591818753629923, + "learning_rate": 0.001, + "loss": 0.3782, + "step": 11756 + }, + { + "epoch": 0.3244015079137882, + "grad_norm": 0.0029876313637942076, + "learning_rate": 0.001, + "loss": 0.3675, + "step": 11757 + }, + { + "epoch": 0.32442910011485254, + "grad_norm": 0.003414266975596547, + "learning_rate": 0.001, + "loss": 0.3941, + "step": 11758 + }, + { + "epoch": 0.3244566923159169, + "grad_norm": 0.004012831952422857, + "learning_rate": 0.001, + "loss": 0.3652, + "step": 11759 + }, + { + "epoch": 0.3244842845169813, + "grad_norm": 0.0034918745514005423, + "learning_rate": 0.001, + "loss": 0.419, + "step": 11760 + }, + { + "epoch": 0.32451187671804566, + "grad_norm": 0.002764191012829542, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 11761 + }, + { + "epoch": 0.32453946891911, + "grad_norm": 0.003841676749289036, + "learning_rate": 0.001, + "loss": 0.3758, + "step": 11762 + }, + { + "epoch": 0.32456706112017436, + "grad_norm": 0.004220995120704174, + "learning_rate": 0.001, + "loss": 0.422, + "step": 11763 + }, + { + "epoch": 0.32459465332123877, + "grad_norm": 0.003612626576796174, + "learning_rate": 0.001, + "loss": 0.4017, + "step": 11764 + }, + { + "epoch": 0.3246222455223031, + "grad_norm": 0.0025354884564876556, + "learning_rate": 0.001, + "loss": 0.3927, + "step": 11765 + }, + { + "epoch": 0.3246498377233675, + "grad_norm": 0.003477931022644043, + "learning_rate": 0.001, + "loss": 0.3919, + "step": 11766 + }, + { + "epoch": 0.3246774299244319, + "grad_norm": 0.004952993709594011, + "learning_rate": 0.001, + "loss": 0.4138, + "step": 11767 + }, + { + "epoch": 0.32470502212549623, + "grad_norm": 0.0031629884615540504, + "learning_rate": 0.001, + "loss": 0.358, + "step": 11768 + }, + { + "epoch": 0.3247326143265606, + "grad_norm": 0.002113747876137495, + "learning_rate": 0.001, + "loss": 0.4527, + "step": 11769 + }, + { + "epoch": 0.32476020652762494, + "grad_norm": 0.002285837195813656, + "learning_rate": 0.001, + "loss": 0.4368, + "step": 11770 + }, + { + "epoch": 0.32478779872868935, + "grad_norm": 0.003418036038056016, + "learning_rate": 0.001, + "loss": 0.4313, + "step": 11771 + }, + { + "epoch": 0.3248153909297537, + "grad_norm": 0.00575110362842679, + "learning_rate": 0.001, + "loss": 0.399, + "step": 11772 + }, + { + "epoch": 0.32484298313081805, + "grad_norm": 0.0031514798756688833, + "learning_rate": 0.001, + "loss": 0.4403, + "step": 11773 + }, + { + "epoch": 0.32487057533188246, + "grad_norm": 0.0029524280689656734, + "learning_rate": 0.001, + "loss": 0.3897, + "step": 11774 + }, + { + "epoch": 0.3248981675329468, + "grad_norm": 0.0038776015862822533, + "learning_rate": 0.001, + "loss": 0.4326, + "step": 11775 + }, + { + "epoch": 0.32492575973401117, + "grad_norm": 0.006274119019508362, + "learning_rate": 0.001, + "loss": 0.3959, + "step": 11776 + }, + { + "epoch": 0.3249533519350756, + "grad_norm": 0.00265169283375144, + "learning_rate": 0.001, + "loss": 0.395, + "step": 11777 + }, + { + "epoch": 0.3249809441361399, + "grad_norm": 0.0040013487450778484, + "learning_rate": 0.001, + "loss": 0.3982, + "step": 11778 + }, + { + "epoch": 0.3250085363372043, + "grad_norm": 0.004476572852581739, + "learning_rate": 0.001, + "loss": 0.426, + "step": 11779 + }, + { + "epoch": 0.32503612853826863, + "grad_norm": 0.00346595561131835, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 11780 + }, + { + "epoch": 0.32506372073933304, + "grad_norm": 0.004196068271994591, + "learning_rate": 0.001, + "loss": 0.3683, + "step": 11781 + }, + { + "epoch": 0.3250913129403974, + "grad_norm": 0.0036759015638381243, + "learning_rate": 0.001, + "loss": 0.3842, + "step": 11782 + }, + { + "epoch": 0.32511890514146174, + "grad_norm": 0.0028389294166117907, + "learning_rate": 0.001, + "loss": 0.3608, + "step": 11783 + }, + { + "epoch": 0.32514649734252615, + "grad_norm": 0.00459603127092123, + "learning_rate": 0.001, + "loss": 0.4131, + "step": 11784 + }, + { + "epoch": 0.3251740895435905, + "grad_norm": 0.005619920324534178, + "learning_rate": 0.001, + "loss": 0.3996, + "step": 11785 + }, + { + "epoch": 0.32520168174465486, + "grad_norm": 0.009859694167971611, + "learning_rate": 0.001, + "loss": 0.3829, + "step": 11786 + }, + { + "epoch": 0.32522927394571927, + "grad_norm": 0.006388423964381218, + "learning_rate": 0.001, + "loss": 0.4, + "step": 11787 + }, + { + "epoch": 0.3252568661467836, + "grad_norm": 0.006722534541040659, + "learning_rate": 0.001, + "loss": 0.3912, + "step": 11788 + }, + { + "epoch": 0.32528445834784797, + "grad_norm": 0.008275543339550495, + "learning_rate": 0.001, + "loss": 0.3877, + "step": 11789 + }, + { + "epoch": 0.3253120505489123, + "grad_norm": 0.004540057387202978, + "learning_rate": 0.001, + "loss": 0.43, + "step": 11790 + }, + { + "epoch": 0.32533964274997673, + "grad_norm": 0.0027223837096244097, + "learning_rate": 0.001, + "loss": 0.427, + "step": 11791 + }, + { + "epoch": 0.3253672349510411, + "grad_norm": 0.003166953567415476, + "learning_rate": 0.001, + "loss": 0.4142, + "step": 11792 + }, + { + "epoch": 0.32539482715210544, + "grad_norm": 0.004159968346357346, + "learning_rate": 0.001, + "loss": 0.3904, + "step": 11793 + }, + { + "epoch": 0.32542241935316985, + "grad_norm": 0.002168782986700535, + "learning_rate": 0.001, + "loss": 0.4455, + "step": 11794 + }, + { + "epoch": 0.3254500115542342, + "grad_norm": 0.0049192942678928375, + "learning_rate": 0.001, + "loss": 0.3977, + "step": 11795 + }, + { + "epoch": 0.32547760375529855, + "grad_norm": 0.0027885152958333492, + "learning_rate": 0.001, + "loss": 0.3801, + "step": 11796 + }, + { + "epoch": 0.32550519595636296, + "grad_norm": 0.005841918755322695, + "learning_rate": 0.001, + "loss": 0.3976, + "step": 11797 + }, + { + "epoch": 0.3255327881574273, + "grad_norm": 0.00430692546069622, + "learning_rate": 0.001, + "loss": 0.4142, + "step": 11798 + }, + { + "epoch": 0.32556038035849166, + "grad_norm": 0.003325339872390032, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 11799 + }, + { + "epoch": 0.325587972559556, + "grad_norm": 0.0032596606761217117, + "learning_rate": 0.001, + "loss": 0.3914, + "step": 11800 + }, + { + "epoch": 0.3256155647606204, + "grad_norm": 0.004303377121686935, + "learning_rate": 0.001, + "loss": 0.3912, + "step": 11801 + }, + { + "epoch": 0.3256431569616848, + "grad_norm": 0.002758364425972104, + "learning_rate": 0.001, + "loss": 0.4049, + "step": 11802 + }, + { + "epoch": 0.32567074916274913, + "grad_norm": 0.00543051864951849, + "learning_rate": 0.001, + "loss": 0.3508, + "step": 11803 + }, + { + "epoch": 0.32569834136381354, + "grad_norm": 0.003291371511295438, + "learning_rate": 0.001, + "loss": 0.4341, + "step": 11804 + }, + { + "epoch": 0.3257259335648779, + "grad_norm": 0.0028754028026014566, + "learning_rate": 0.001, + "loss": 0.3812, + "step": 11805 + }, + { + "epoch": 0.32575352576594224, + "grad_norm": 0.002680160803720355, + "learning_rate": 0.001, + "loss": 0.3901, + "step": 11806 + }, + { + "epoch": 0.32578111796700665, + "grad_norm": 0.0026779999025166035, + "learning_rate": 0.001, + "loss": 0.3638, + "step": 11807 + }, + { + "epoch": 0.325808710168071, + "grad_norm": 0.0029692722018808126, + "learning_rate": 0.001, + "loss": 0.3997, + "step": 11808 + }, + { + "epoch": 0.32583630236913536, + "grad_norm": 0.00280319363810122, + "learning_rate": 0.001, + "loss": 0.4106, + "step": 11809 + }, + { + "epoch": 0.3258638945701997, + "grad_norm": 0.002986667212098837, + "learning_rate": 0.001, + "loss": 0.4094, + "step": 11810 + }, + { + "epoch": 0.3258914867712641, + "grad_norm": 0.003013553563505411, + "learning_rate": 0.001, + "loss": 0.4467, + "step": 11811 + }, + { + "epoch": 0.32591907897232847, + "grad_norm": 0.002967800246551633, + "learning_rate": 0.001, + "loss": 0.4084, + "step": 11812 + }, + { + "epoch": 0.3259466711733928, + "grad_norm": 0.0031372224912047386, + "learning_rate": 0.001, + "loss": 0.3709, + "step": 11813 + }, + { + "epoch": 0.32597426337445723, + "grad_norm": 0.0031039847526699305, + "learning_rate": 0.001, + "loss": 0.3604, + "step": 11814 + }, + { + "epoch": 0.3260018555755216, + "grad_norm": 0.003049139864742756, + "learning_rate": 0.001, + "loss": 0.377, + "step": 11815 + }, + { + "epoch": 0.32602944777658593, + "grad_norm": 0.0029599741101264954, + "learning_rate": 0.001, + "loss": 0.4062, + "step": 11816 + }, + { + "epoch": 0.32605703997765034, + "grad_norm": 0.003724336624145508, + "learning_rate": 0.001, + "loss": 0.407, + "step": 11817 + }, + { + "epoch": 0.3260846321787147, + "grad_norm": 0.0028906487859785557, + "learning_rate": 0.001, + "loss": 0.414, + "step": 11818 + }, + { + "epoch": 0.32611222437977905, + "grad_norm": 0.003816339885815978, + "learning_rate": 0.001, + "loss": 0.3857, + "step": 11819 + }, + { + "epoch": 0.3261398165808434, + "grad_norm": 0.005886279512196779, + "learning_rate": 0.001, + "loss": 0.4197, + "step": 11820 + }, + { + "epoch": 0.3261674087819078, + "grad_norm": 0.0029236190021038055, + "learning_rate": 0.001, + "loss": 0.4003, + "step": 11821 + }, + { + "epoch": 0.32619500098297216, + "grad_norm": 0.005116250831633806, + "learning_rate": 0.001, + "loss": 0.3837, + "step": 11822 + }, + { + "epoch": 0.3262225931840365, + "grad_norm": 0.0027175480499863625, + "learning_rate": 0.001, + "loss": 0.3772, + "step": 11823 + }, + { + "epoch": 0.3262501853851009, + "grad_norm": 0.005795364733785391, + "learning_rate": 0.001, + "loss": 0.3854, + "step": 11824 + }, + { + "epoch": 0.3262777775861653, + "grad_norm": 0.0028781124856323004, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 11825 + }, + { + "epoch": 0.3263053697872296, + "grad_norm": 0.008585303090512753, + "learning_rate": 0.001, + "loss": 0.3939, + "step": 11826 + }, + { + "epoch": 0.32633296198829403, + "grad_norm": 0.0027091645170003176, + "learning_rate": 0.001, + "loss": 0.3958, + "step": 11827 + }, + { + "epoch": 0.3263605541893584, + "grad_norm": 0.0028620229568332434, + "learning_rate": 0.001, + "loss": 0.4246, + "step": 11828 + }, + { + "epoch": 0.32638814639042274, + "grad_norm": 0.0028989813290536404, + "learning_rate": 0.001, + "loss": 0.3798, + "step": 11829 + }, + { + "epoch": 0.3264157385914871, + "grad_norm": 0.005880692508071661, + "learning_rate": 0.001, + "loss": 0.3887, + "step": 11830 + }, + { + "epoch": 0.3264433307925515, + "grad_norm": 0.005375871900469065, + "learning_rate": 0.001, + "loss": 0.3917, + "step": 11831 + }, + { + "epoch": 0.32647092299361585, + "grad_norm": 0.00993763655424118, + "learning_rate": 0.001, + "loss": 0.3602, + "step": 11832 + }, + { + "epoch": 0.3264985151946802, + "grad_norm": 0.004657833371311426, + "learning_rate": 0.001, + "loss": 0.408, + "step": 11833 + }, + { + "epoch": 0.3265261073957446, + "grad_norm": 0.0023800579365342855, + "learning_rate": 0.001, + "loss": 0.3682, + "step": 11834 + }, + { + "epoch": 0.32655369959680897, + "grad_norm": 0.003295092610642314, + "learning_rate": 0.001, + "loss": 0.405, + "step": 11835 + }, + { + "epoch": 0.3265812917978733, + "grad_norm": 0.0033699970226734877, + "learning_rate": 0.001, + "loss": 0.4249, + "step": 11836 + }, + { + "epoch": 0.3266088839989377, + "grad_norm": 0.0023069006856530905, + "learning_rate": 0.001, + "loss": 0.381, + "step": 11837 + }, + { + "epoch": 0.3266364762000021, + "grad_norm": 0.002622458152472973, + "learning_rate": 0.001, + "loss": 0.4079, + "step": 11838 + }, + { + "epoch": 0.32666406840106643, + "grad_norm": 0.0072345067746937275, + "learning_rate": 0.001, + "loss": 0.3669, + "step": 11839 + }, + { + "epoch": 0.3266916606021308, + "grad_norm": 0.002343923319131136, + "learning_rate": 0.001, + "loss": 0.4473, + "step": 11840 + }, + { + "epoch": 0.3267192528031952, + "grad_norm": 0.003922601230442524, + "learning_rate": 0.001, + "loss": 0.3833, + "step": 11841 + }, + { + "epoch": 0.32674684500425955, + "grad_norm": 0.002412164816632867, + "learning_rate": 0.001, + "loss": 0.43, + "step": 11842 + }, + { + "epoch": 0.3267744372053239, + "grad_norm": 0.004316416569054127, + "learning_rate": 0.001, + "loss": 0.4081, + "step": 11843 + }, + { + "epoch": 0.3268020294063883, + "grad_norm": 0.0025885479990392923, + "learning_rate": 0.001, + "loss": 0.3644, + "step": 11844 + }, + { + "epoch": 0.32682962160745266, + "grad_norm": 0.008287443779408932, + "learning_rate": 0.001, + "loss": 0.3932, + "step": 11845 + }, + { + "epoch": 0.326857213808517, + "grad_norm": 0.0027159906458109617, + "learning_rate": 0.001, + "loss": 0.4268, + "step": 11846 + }, + { + "epoch": 0.3268848060095814, + "grad_norm": 0.002340937964618206, + "learning_rate": 0.001, + "loss": 0.3868, + "step": 11847 + }, + { + "epoch": 0.32691239821064577, + "grad_norm": 0.0027317411731928587, + "learning_rate": 0.001, + "loss": 0.3732, + "step": 11848 + }, + { + "epoch": 0.3269399904117101, + "grad_norm": 0.0028033903799951077, + "learning_rate": 0.001, + "loss": 0.3836, + "step": 11849 + }, + { + "epoch": 0.3269675826127745, + "grad_norm": 0.004709617234766483, + "learning_rate": 0.001, + "loss": 0.3827, + "step": 11850 + }, + { + "epoch": 0.3269951748138389, + "grad_norm": 0.005631160456687212, + "learning_rate": 0.001, + "loss": 0.4108, + "step": 11851 + }, + { + "epoch": 0.32702276701490324, + "grad_norm": 0.016221268102526665, + "learning_rate": 0.001, + "loss": 0.4047, + "step": 11852 + }, + { + "epoch": 0.3270503592159676, + "grad_norm": 0.0028514659497886896, + "learning_rate": 0.001, + "loss": 0.4052, + "step": 11853 + }, + { + "epoch": 0.327077951417032, + "grad_norm": 0.00452115572988987, + "learning_rate": 0.001, + "loss": 0.3871, + "step": 11854 + }, + { + "epoch": 0.32710554361809635, + "grad_norm": 0.00457549886777997, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 11855 + }, + { + "epoch": 0.3271331358191607, + "grad_norm": 0.0027922464068979025, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 11856 + }, + { + "epoch": 0.3271607280202251, + "grad_norm": 0.005448077339679003, + "learning_rate": 0.001, + "loss": 0.3526, + "step": 11857 + }, + { + "epoch": 0.32718832022128946, + "grad_norm": 0.007846580818295479, + "learning_rate": 0.001, + "loss": 0.4052, + "step": 11858 + }, + { + "epoch": 0.3272159124223538, + "grad_norm": 0.0035153308417648077, + "learning_rate": 0.001, + "loss": 0.3719, + "step": 11859 + }, + { + "epoch": 0.32724350462341817, + "grad_norm": 0.0028227800503373146, + "learning_rate": 0.001, + "loss": 0.3836, + "step": 11860 + }, + { + "epoch": 0.3272710968244826, + "grad_norm": 0.004079067148268223, + "learning_rate": 0.001, + "loss": 0.3951, + "step": 11861 + }, + { + "epoch": 0.32729868902554693, + "grad_norm": 0.007391565479338169, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 11862 + }, + { + "epoch": 0.3273262812266113, + "grad_norm": 0.0028963603544980288, + "learning_rate": 0.001, + "loss": 0.4337, + "step": 11863 + }, + { + "epoch": 0.3273538734276757, + "grad_norm": 0.002462986158207059, + "learning_rate": 0.001, + "loss": 0.3778, + "step": 11864 + }, + { + "epoch": 0.32738146562874004, + "grad_norm": 0.0026601895224303007, + "learning_rate": 0.001, + "loss": 0.4071, + "step": 11865 + }, + { + "epoch": 0.3274090578298044, + "grad_norm": 0.0028002276085317135, + "learning_rate": 0.001, + "loss": 0.3891, + "step": 11866 + }, + { + "epoch": 0.32743665003086875, + "grad_norm": 0.0025977008044719696, + "learning_rate": 0.001, + "loss": 0.4384, + "step": 11867 + }, + { + "epoch": 0.32746424223193316, + "grad_norm": 0.0031972804572433233, + "learning_rate": 0.001, + "loss": 0.4012, + "step": 11868 + }, + { + "epoch": 0.3274918344329975, + "grad_norm": 0.0031719047110527754, + "learning_rate": 0.001, + "loss": 0.386, + "step": 11869 + }, + { + "epoch": 0.32751942663406186, + "grad_norm": 0.021741317585110664, + "learning_rate": 0.001, + "loss": 0.3954, + "step": 11870 + }, + { + "epoch": 0.32754701883512627, + "grad_norm": 0.005185617599636316, + "learning_rate": 0.001, + "loss": 0.3913, + "step": 11871 + }, + { + "epoch": 0.3275746110361906, + "grad_norm": 0.004073983523994684, + "learning_rate": 0.001, + "loss": 0.3319, + "step": 11872 + }, + { + "epoch": 0.327602203237255, + "grad_norm": 0.003672317136079073, + "learning_rate": 0.001, + "loss": 0.3986, + "step": 11873 + }, + { + "epoch": 0.3276297954383194, + "grad_norm": 0.003211831208318472, + "learning_rate": 0.001, + "loss": 0.3866, + "step": 11874 + }, + { + "epoch": 0.32765738763938373, + "grad_norm": 0.005349930375814438, + "learning_rate": 0.001, + "loss": 0.3879, + "step": 11875 + }, + { + "epoch": 0.3276849798404481, + "grad_norm": 0.001996625680476427, + "learning_rate": 0.001, + "loss": 0.4215, + "step": 11876 + }, + { + "epoch": 0.32771257204151244, + "grad_norm": 0.0034636626951396465, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 11877 + }, + { + "epoch": 0.32774016424257685, + "grad_norm": 0.006974721793085337, + "learning_rate": 0.001, + "loss": 0.3961, + "step": 11878 + }, + { + "epoch": 0.3277677564436412, + "grad_norm": 0.004640425555408001, + "learning_rate": 0.001, + "loss": 0.4197, + "step": 11879 + }, + { + "epoch": 0.32779534864470555, + "grad_norm": 0.0025745509192347527, + "learning_rate": 0.001, + "loss": 0.3604, + "step": 11880 + }, + { + "epoch": 0.32782294084576996, + "grad_norm": 0.0033829696476459503, + "learning_rate": 0.001, + "loss": 0.3901, + "step": 11881 + }, + { + "epoch": 0.3278505330468343, + "grad_norm": 0.003960746806114912, + "learning_rate": 0.001, + "loss": 0.3708, + "step": 11882 + }, + { + "epoch": 0.32787812524789867, + "grad_norm": 0.006613335572183132, + "learning_rate": 0.001, + "loss": 0.3652, + "step": 11883 + }, + { + "epoch": 0.3279057174489631, + "grad_norm": 0.003319015959277749, + "learning_rate": 0.001, + "loss": 0.389, + "step": 11884 + }, + { + "epoch": 0.3279333096500274, + "grad_norm": 0.0034291057381778955, + "learning_rate": 0.001, + "loss": 0.393, + "step": 11885 + }, + { + "epoch": 0.3279609018510918, + "grad_norm": 0.0027653754223138094, + "learning_rate": 0.001, + "loss": 0.3873, + "step": 11886 + }, + { + "epoch": 0.32798849405215613, + "grad_norm": 0.00384811544790864, + "learning_rate": 0.001, + "loss": 0.3987, + "step": 11887 + }, + { + "epoch": 0.32801608625322054, + "grad_norm": 0.0028639482334256172, + "learning_rate": 0.001, + "loss": 0.3981, + "step": 11888 + }, + { + "epoch": 0.3280436784542849, + "grad_norm": 0.0026863133534789085, + "learning_rate": 0.001, + "loss": 0.4167, + "step": 11889 + }, + { + "epoch": 0.32807127065534925, + "grad_norm": 0.0028144351672381163, + "learning_rate": 0.001, + "loss": 0.3815, + "step": 11890 + }, + { + "epoch": 0.32809886285641365, + "grad_norm": 0.010337802581489086, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 11891 + }, + { + "epoch": 0.328126455057478, + "grad_norm": 0.01729881763458252, + "learning_rate": 0.001, + "loss": 0.4123, + "step": 11892 + }, + { + "epoch": 0.32815404725854236, + "grad_norm": 0.01979982666671276, + "learning_rate": 0.001, + "loss": 0.3943, + "step": 11893 + }, + { + "epoch": 0.32818163945960677, + "grad_norm": 0.003908053506165743, + "learning_rate": 0.001, + "loss": 0.3937, + "step": 11894 + }, + { + "epoch": 0.3282092316606711, + "grad_norm": 0.0044098771177232265, + "learning_rate": 0.001, + "loss": 0.4198, + "step": 11895 + }, + { + "epoch": 0.32823682386173547, + "grad_norm": 0.0028675422072410583, + "learning_rate": 0.001, + "loss": 0.4231, + "step": 11896 + }, + { + "epoch": 0.3282644160627998, + "grad_norm": 0.006255595479160547, + "learning_rate": 0.001, + "loss": 0.3649, + "step": 11897 + }, + { + "epoch": 0.32829200826386423, + "grad_norm": 0.0025888034142553806, + "learning_rate": 0.001, + "loss": 0.4051, + "step": 11898 + }, + { + "epoch": 0.3283196004649286, + "grad_norm": 0.0021037342958152294, + "learning_rate": 0.001, + "loss": 0.4037, + "step": 11899 + }, + { + "epoch": 0.32834719266599294, + "grad_norm": 0.003093705978244543, + "learning_rate": 0.001, + "loss": 0.3568, + "step": 11900 + }, + { + "epoch": 0.32837478486705735, + "grad_norm": 0.0024948539212346077, + "learning_rate": 0.001, + "loss": 0.4112, + "step": 11901 + }, + { + "epoch": 0.3284023770681217, + "grad_norm": 0.0028005533386021852, + "learning_rate": 0.001, + "loss": 0.3761, + "step": 11902 + }, + { + "epoch": 0.32842996926918605, + "grad_norm": 0.006543453317135572, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 11903 + }, + { + "epoch": 0.32845756147025046, + "grad_norm": 0.0032540878746658564, + "learning_rate": 0.001, + "loss": 0.4036, + "step": 11904 + }, + { + "epoch": 0.3284851536713148, + "grad_norm": 0.017330944538116455, + "learning_rate": 0.001, + "loss": 0.392, + "step": 11905 + }, + { + "epoch": 0.32851274587237916, + "grad_norm": 0.015614539384841919, + "learning_rate": 0.001, + "loss": 0.3827, + "step": 11906 + }, + { + "epoch": 0.3285403380734435, + "grad_norm": 0.04733549430966377, + "learning_rate": 0.001, + "loss": 0.4134, + "step": 11907 + }, + { + "epoch": 0.3285679302745079, + "grad_norm": 0.062019579112529755, + "learning_rate": 0.001, + "loss": 0.371, + "step": 11908 + }, + { + "epoch": 0.3285955224755723, + "grad_norm": 0.008806095458567142, + "learning_rate": 0.001, + "loss": 0.4204, + "step": 11909 + }, + { + "epoch": 0.32862311467663663, + "grad_norm": 0.052441034466028214, + "learning_rate": 0.001, + "loss": 0.3895, + "step": 11910 + }, + { + "epoch": 0.32865070687770104, + "grad_norm": 0.004011763259768486, + "learning_rate": 0.001, + "loss": 0.3924, + "step": 11911 + }, + { + "epoch": 0.3286782990787654, + "grad_norm": 0.004210361745208502, + "learning_rate": 0.001, + "loss": 0.3974, + "step": 11912 + }, + { + "epoch": 0.32870589127982974, + "grad_norm": 0.009359506890177727, + "learning_rate": 0.001, + "loss": 0.352, + "step": 11913 + }, + { + "epoch": 0.32873348348089415, + "grad_norm": 0.007341593038290739, + "learning_rate": 0.001, + "loss": 0.3809, + "step": 11914 + }, + { + "epoch": 0.3287610756819585, + "grad_norm": 0.005077636335045099, + "learning_rate": 0.001, + "loss": 0.3935, + "step": 11915 + }, + { + "epoch": 0.32878866788302286, + "grad_norm": 0.01000258419662714, + "learning_rate": 0.001, + "loss": 0.407, + "step": 11916 + }, + { + "epoch": 0.3288162600840872, + "grad_norm": 0.030418379232287407, + "learning_rate": 0.001, + "loss": 0.3816, + "step": 11917 + }, + { + "epoch": 0.3288438522851516, + "grad_norm": 0.003933771047741175, + "learning_rate": 0.001, + "loss": 0.4188, + "step": 11918 + }, + { + "epoch": 0.32887144448621597, + "grad_norm": 0.002575295278802514, + "learning_rate": 0.001, + "loss": 0.4108, + "step": 11919 + }, + { + "epoch": 0.3288990366872803, + "grad_norm": 0.002374958945438266, + "learning_rate": 0.001, + "loss": 0.429, + "step": 11920 + }, + { + "epoch": 0.32892662888834473, + "grad_norm": 0.003249475732445717, + "learning_rate": 0.001, + "loss": 0.3692, + "step": 11921 + }, + { + "epoch": 0.3289542210894091, + "grad_norm": 0.002772397128865123, + "learning_rate": 0.001, + "loss": 0.3916, + "step": 11922 + }, + { + "epoch": 0.32898181329047343, + "grad_norm": 0.0042383186519145966, + "learning_rate": 0.001, + "loss": 0.3551, + "step": 11923 + }, + { + "epoch": 0.32900940549153784, + "grad_norm": 0.003252743510529399, + "learning_rate": 0.001, + "loss": 0.4146, + "step": 11924 + }, + { + "epoch": 0.3290369976926022, + "grad_norm": 0.0025390556547790766, + "learning_rate": 0.001, + "loss": 0.3918, + "step": 11925 + }, + { + "epoch": 0.32906458989366655, + "grad_norm": 0.002343092579394579, + "learning_rate": 0.001, + "loss": 0.4287, + "step": 11926 + }, + { + "epoch": 0.3290921820947309, + "grad_norm": 0.0033002211712300777, + "learning_rate": 0.001, + "loss": 0.3726, + "step": 11927 + }, + { + "epoch": 0.3291197742957953, + "grad_norm": 0.003043625270947814, + "learning_rate": 0.001, + "loss": 0.4038, + "step": 11928 + }, + { + "epoch": 0.32914736649685966, + "grad_norm": 0.006221500225365162, + "learning_rate": 0.001, + "loss": 0.4217, + "step": 11929 + }, + { + "epoch": 0.329174958697924, + "grad_norm": 0.004655203316360712, + "learning_rate": 0.001, + "loss": 0.4154, + "step": 11930 + }, + { + "epoch": 0.3292025508989884, + "grad_norm": 0.0021311945747584105, + "learning_rate": 0.001, + "loss": 0.3988, + "step": 11931 + }, + { + "epoch": 0.3292301431000528, + "grad_norm": 0.0027006971649825573, + "learning_rate": 0.001, + "loss": 0.4346, + "step": 11932 + }, + { + "epoch": 0.3292577353011171, + "grad_norm": 0.002889286493882537, + "learning_rate": 0.001, + "loss": 0.3541, + "step": 11933 + }, + { + "epoch": 0.32928532750218154, + "grad_norm": 0.007051974534988403, + "learning_rate": 0.001, + "loss": 0.4284, + "step": 11934 + }, + { + "epoch": 0.3293129197032459, + "grad_norm": 0.002629111986607313, + "learning_rate": 0.001, + "loss": 0.409, + "step": 11935 + }, + { + "epoch": 0.32934051190431024, + "grad_norm": 0.004683708306401968, + "learning_rate": 0.001, + "loss": 0.3419, + "step": 11936 + }, + { + "epoch": 0.3293681041053746, + "grad_norm": 0.003245966276153922, + "learning_rate": 0.001, + "loss": 0.394, + "step": 11937 + }, + { + "epoch": 0.329395696306439, + "grad_norm": 0.0032094584312289953, + "learning_rate": 0.001, + "loss": 0.3986, + "step": 11938 + }, + { + "epoch": 0.32942328850750335, + "grad_norm": 0.0030683856457471848, + "learning_rate": 0.001, + "loss": 0.3863, + "step": 11939 + }, + { + "epoch": 0.3294508807085677, + "grad_norm": 0.0035426607355475426, + "learning_rate": 0.001, + "loss": 0.4418, + "step": 11940 + }, + { + "epoch": 0.3294784729096321, + "grad_norm": 0.003129177028313279, + "learning_rate": 0.001, + "loss": 0.3972, + "step": 11941 + }, + { + "epoch": 0.32950606511069647, + "grad_norm": 0.003153977682814002, + "learning_rate": 0.001, + "loss": 0.3795, + "step": 11942 + }, + { + "epoch": 0.3295336573117608, + "grad_norm": 0.0056450688280165195, + "learning_rate": 0.001, + "loss": 0.3759, + "step": 11943 + }, + { + "epoch": 0.3295612495128252, + "grad_norm": 0.0025312695652246475, + "learning_rate": 0.001, + "loss": 0.3729, + "step": 11944 + }, + { + "epoch": 0.3295888417138896, + "grad_norm": 0.0025766692124307156, + "learning_rate": 0.001, + "loss": 0.4299, + "step": 11945 + }, + { + "epoch": 0.32961643391495393, + "grad_norm": 0.005080906208604574, + "learning_rate": 0.001, + "loss": 0.4023, + "step": 11946 + }, + { + "epoch": 0.3296440261160183, + "grad_norm": 0.0058040316216647625, + "learning_rate": 0.001, + "loss": 0.3528, + "step": 11947 + }, + { + "epoch": 0.3296716183170827, + "grad_norm": 0.0035558012314140797, + "learning_rate": 0.001, + "loss": 0.3893, + "step": 11948 + }, + { + "epoch": 0.32969921051814705, + "grad_norm": 0.003042223397642374, + "learning_rate": 0.001, + "loss": 0.3936, + "step": 11949 + }, + { + "epoch": 0.3297268027192114, + "grad_norm": 0.0033221363555639982, + "learning_rate": 0.001, + "loss": 0.419, + "step": 11950 + }, + { + "epoch": 0.3297543949202758, + "grad_norm": 0.0032754812855273485, + "learning_rate": 0.001, + "loss": 0.3948, + "step": 11951 + }, + { + "epoch": 0.32978198712134016, + "grad_norm": 0.002569601172581315, + "learning_rate": 0.001, + "loss": 0.3701, + "step": 11952 + }, + { + "epoch": 0.3298095793224045, + "grad_norm": 0.003200158243998885, + "learning_rate": 0.001, + "loss": 0.4138, + "step": 11953 + }, + { + "epoch": 0.32983717152346886, + "grad_norm": 0.003082757582888007, + "learning_rate": 0.001, + "loss": 0.369, + "step": 11954 + }, + { + "epoch": 0.32986476372453327, + "grad_norm": 0.002875220263376832, + "learning_rate": 0.001, + "loss": 0.3803, + "step": 11955 + }, + { + "epoch": 0.3298923559255976, + "grad_norm": 0.002740328898653388, + "learning_rate": 0.001, + "loss": 0.3595, + "step": 11956 + }, + { + "epoch": 0.329919948126662, + "grad_norm": 0.004781671334058046, + "learning_rate": 0.001, + "loss": 0.4139, + "step": 11957 + }, + { + "epoch": 0.3299475403277264, + "grad_norm": 0.002997961826622486, + "learning_rate": 0.001, + "loss": 0.401, + "step": 11958 + }, + { + "epoch": 0.32997513252879074, + "grad_norm": 0.002518662018701434, + "learning_rate": 0.001, + "loss": 0.3957, + "step": 11959 + }, + { + "epoch": 0.3300027247298551, + "grad_norm": 0.0048488592728972435, + "learning_rate": 0.001, + "loss": 0.4122, + "step": 11960 + }, + { + "epoch": 0.3300303169309195, + "grad_norm": 0.005046727601438761, + "learning_rate": 0.001, + "loss": 0.388, + "step": 11961 + }, + { + "epoch": 0.33005790913198385, + "grad_norm": 0.0023717847652733326, + "learning_rate": 0.001, + "loss": 0.4206, + "step": 11962 + }, + { + "epoch": 0.3300855013330482, + "grad_norm": 0.002914504613727331, + "learning_rate": 0.001, + "loss": 0.4109, + "step": 11963 + }, + { + "epoch": 0.33011309353411256, + "grad_norm": 0.006090945564210415, + "learning_rate": 0.001, + "loss": 0.3835, + "step": 11964 + }, + { + "epoch": 0.33014068573517696, + "grad_norm": 0.0030452022328972816, + "learning_rate": 0.001, + "loss": 0.3956, + "step": 11965 + }, + { + "epoch": 0.3301682779362413, + "grad_norm": 0.0026935841888189316, + "learning_rate": 0.001, + "loss": 0.414, + "step": 11966 + }, + { + "epoch": 0.33019587013730567, + "grad_norm": 0.003028253326192498, + "learning_rate": 0.001, + "loss": 0.3754, + "step": 11967 + }, + { + "epoch": 0.3302234623383701, + "grad_norm": 0.004835574887692928, + "learning_rate": 0.001, + "loss": 0.3862, + "step": 11968 + }, + { + "epoch": 0.33025105453943443, + "grad_norm": 0.0029693404212594032, + "learning_rate": 0.001, + "loss": 0.403, + "step": 11969 + }, + { + "epoch": 0.3302786467404988, + "grad_norm": 0.0037660168018192053, + "learning_rate": 0.001, + "loss": 0.4376, + "step": 11970 + }, + { + "epoch": 0.3303062389415632, + "grad_norm": 0.0026343679055571556, + "learning_rate": 0.001, + "loss": 0.4401, + "step": 11971 + }, + { + "epoch": 0.33033383114262754, + "grad_norm": 0.002500650705769658, + "learning_rate": 0.001, + "loss": 0.4046, + "step": 11972 + }, + { + "epoch": 0.3303614233436919, + "grad_norm": 0.0031541388016194105, + "learning_rate": 0.001, + "loss": 0.4058, + "step": 11973 + }, + { + "epoch": 0.33038901554475625, + "grad_norm": 0.0035852715373039246, + "learning_rate": 0.001, + "loss": 0.3842, + "step": 11974 + }, + { + "epoch": 0.33041660774582066, + "grad_norm": 0.0040337685495615005, + "learning_rate": 0.001, + "loss": 0.4281, + "step": 11975 + }, + { + "epoch": 0.330444199946885, + "grad_norm": 0.0031562617514282465, + "learning_rate": 0.001, + "loss": 0.3547, + "step": 11976 + }, + { + "epoch": 0.33047179214794936, + "grad_norm": 0.004418304655700922, + "learning_rate": 0.001, + "loss": 0.3938, + "step": 11977 + }, + { + "epoch": 0.33049938434901377, + "grad_norm": 0.0028235798235982656, + "learning_rate": 0.001, + "loss": 0.3749, + "step": 11978 + }, + { + "epoch": 0.3305269765500781, + "grad_norm": 0.0024365021381527185, + "learning_rate": 0.001, + "loss": 0.4025, + "step": 11979 + }, + { + "epoch": 0.3305545687511425, + "grad_norm": 0.003930834122002125, + "learning_rate": 0.001, + "loss": 0.3923, + "step": 11980 + }, + { + "epoch": 0.3305821609522069, + "grad_norm": 0.002820041496306658, + "learning_rate": 0.001, + "loss": 0.4124, + "step": 11981 + }, + { + "epoch": 0.33060975315327124, + "grad_norm": 0.0039013477507978678, + "learning_rate": 0.001, + "loss": 0.4256, + "step": 11982 + }, + { + "epoch": 0.3306373453543356, + "grad_norm": 0.002920904429629445, + "learning_rate": 0.001, + "loss": 0.389, + "step": 11983 + }, + { + "epoch": 0.33066493755539994, + "grad_norm": 0.0036882515996694565, + "learning_rate": 0.001, + "loss": 0.3485, + "step": 11984 + }, + { + "epoch": 0.33069252975646435, + "grad_norm": 0.0034662606194615364, + "learning_rate": 0.001, + "loss": 0.4022, + "step": 11985 + }, + { + "epoch": 0.3307201219575287, + "grad_norm": 0.003996389918029308, + "learning_rate": 0.001, + "loss": 0.4232, + "step": 11986 + }, + { + "epoch": 0.33074771415859305, + "grad_norm": 0.002916965400800109, + "learning_rate": 0.001, + "loss": 0.4265, + "step": 11987 + }, + { + "epoch": 0.33077530635965746, + "grad_norm": 0.0024744670372456312, + "learning_rate": 0.001, + "loss": 0.4377, + "step": 11988 + }, + { + "epoch": 0.3308028985607218, + "grad_norm": 0.005105322692543268, + "learning_rate": 0.001, + "loss": 0.412, + "step": 11989 + }, + { + "epoch": 0.33083049076178617, + "grad_norm": 0.0023484451230615377, + "learning_rate": 0.001, + "loss": 0.4235, + "step": 11990 + }, + { + "epoch": 0.3308580829628506, + "grad_norm": 0.0028626341372728348, + "learning_rate": 0.001, + "loss": 0.3979, + "step": 11991 + }, + { + "epoch": 0.3308856751639149, + "grad_norm": 0.0024867048487067223, + "learning_rate": 0.001, + "loss": 0.4314, + "step": 11992 + }, + { + "epoch": 0.3309132673649793, + "grad_norm": 0.0038186991587281227, + "learning_rate": 0.001, + "loss": 0.3815, + "step": 11993 + }, + { + "epoch": 0.33094085956604363, + "grad_norm": 0.002811636310070753, + "learning_rate": 0.001, + "loss": 0.4059, + "step": 11994 + }, + { + "epoch": 0.33096845176710804, + "grad_norm": 0.0019408464431762695, + "learning_rate": 0.001, + "loss": 0.4154, + "step": 11995 + }, + { + "epoch": 0.3309960439681724, + "grad_norm": 0.0032817174214869738, + "learning_rate": 0.001, + "loss": 0.39, + "step": 11996 + }, + { + "epoch": 0.33102363616923675, + "grad_norm": 0.005393838509917259, + "learning_rate": 0.001, + "loss": 0.3443, + "step": 11997 + }, + { + "epoch": 0.33105122837030115, + "grad_norm": 0.002987619722262025, + "learning_rate": 0.001, + "loss": 0.4412, + "step": 11998 + }, + { + "epoch": 0.3310788205713655, + "grad_norm": 0.004077446181327105, + "learning_rate": 0.001, + "loss": 0.3921, + "step": 11999 + }, + { + "epoch": 0.33110641277242986, + "grad_norm": 0.0028092057909816504, + "learning_rate": 0.001, + "loss": 0.3888, + "step": 12000 + }, + { + "epoch": 0.33110641277242986, + "eval_runtime": 23.4032, + "eval_samples_per_second": 1.367, + "eval_steps_per_second": 0.171, + "step": 12000 + } + ], + "logging_steps": 1, + "max_steps": 36242, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.647879841911721e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}