diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,86221 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 12313, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 8.121497604158207e-05, + "grad_norm": 6.562383818863084, + "learning_rate": 1.3513513513513516e-08, + "loss": 1.0675, + "step": 1 + }, + { + "epoch": 0.00016242995208316414, + "grad_norm": 8.277066733441483, + "learning_rate": 2.702702702702703e-08, + "loss": 1.3477, + "step": 2 + }, + { + "epoch": 0.0002436449281247462, + "grad_norm": 5.566945299122935, + "learning_rate": 4.0540540540540545e-08, + "loss": 0.982, + "step": 3 + }, + { + "epoch": 0.0003248599041663283, + "grad_norm": 20.123631677375503, + "learning_rate": 5.405405405405406e-08, + "loss": 1.1009, + "step": 4 + }, + { + "epoch": 0.00040607488020791033, + "grad_norm": 7.516929368064332, + "learning_rate": 6.756756756756757e-08, + "loss": 1.1767, + "step": 5 + }, + { + "epoch": 0.0004872898562494924, + "grad_norm": 6.534930634637095, + "learning_rate": 8.108108108108109e-08, + "loss": 0.9279, + "step": 6 + }, + { + "epoch": 0.0005685048322910744, + "grad_norm": 8.547137659048731, + "learning_rate": 9.459459459459461e-08, + "loss": 1.1229, + "step": 7 + }, + { + "epoch": 0.0006497198083326566, + "grad_norm": 8.40063081083683, + "learning_rate": 1.0810810810810812e-07, + "loss": 0.9784, + "step": 8 + }, + { + "epoch": 0.0007309347843742387, + "grad_norm": 7.113188011014463, + "learning_rate": 1.2162162162162163e-07, + "loss": 1.007, + "step": 9 + }, + { + "epoch": 0.0008121497604158207, + "grad_norm": 9.511299992179831, + "learning_rate": 1.3513513513513515e-07, + "loss": 1.0264, + "step": 10 + }, + { + "epoch": 0.0008933647364574028, + "grad_norm": 6.234366333859408, + "learning_rate": 1.4864864864864866e-07, + "loss": 1.1152, + "step": 11 + }, + { + "epoch": 0.0009745797124989848, + "grad_norm": 8.300258843057527, + "learning_rate": 1.6216216216216218e-07, + "loss": 0.9787, + "step": 12 + }, + { + "epoch": 0.0010557946885405669, + "grad_norm": 6.394938331200344, + "learning_rate": 1.756756756756757e-07, + "loss": 1.0764, + "step": 13 + }, + { + "epoch": 0.0011370096645821489, + "grad_norm": 5.924100005331387, + "learning_rate": 1.8918918918918921e-07, + "loss": 1.0187, + "step": 14 + }, + { + "epoch": 0.001218224640623731, + "grad_norm": 7.076697509559219, + "learning_rate": 2.0270270270270273e-07, + "loss": 1.3016, + "step": 15 + }, + { + "epoch": 0.001299439616665313, + "grad_norm": 7.949795337583316, + "learning_rate": 2.1621621621621625e-07, + "loss": 1.1151, + "step": 16 + }, + { + "epoch": 0.001380654592706895, + "grad_norm": 7.324802587536972, + "learning_rate": 2.2972972972972977e-07, + "loss": 1.0339, + "step": 17 + }, + { + "epoch": 0.0014618695687484773, + "grad_norm": 7.644757636442122, + "learning_rate": 2.4324324324324326e-07, + "loss": 1.0653, + "step": 18 + }, + { + "epoch": 0.0015430845447900593, + "grad_norm": 19.41567123436543, + "learning_rate": 2.567567567567568e-07, + "loss": 1.2616, + "step": 19 + }, + { + "epoch": 0.0016242995208316413, + "grad_norm": 7.096406348798535, + "learning_rate": 2.702702702702703e-07, + "loss": 1.0637, + "step": 20 + }, + { + "epoch": 0.0017055144968732233, + "grad_norm": 9.633919383798274, + "learning_rate": 2.837837837837838e-07, + "loss": 1.185, + "step": 21 + }, + { + "epoch": 0.0017867294729148055, + "grad_norm": 10.180555284457757, + "learning_rate": 2.972972972972973e-07, + "loss": 1.0474, + "step": 22 + }, + { + "epoch": 0.0018679444489563875, + "grad_norm": 8.634935943189125, + "learning_rate": 3.1081081081081084e-07, + "loss": 0.9087, + "step": 23 + }, + { + "epoch": 0.0019491594249979695, + "grad_norm": 7.712256121372059, + "learning_rate": 3.2432432432432436e-07, + "loss": 1.2098, + "step": 24 + }, + { + "epoch": 0.0020303744010395515, + "grad_norm": 7.212382477186713, + "learning_rate": 3.378378378378379e-07, + "loss": 0.9682, + "step": 25 + }, + { + "epoch": 0.0021115893770811338, + "grad_norm": 8.965382654928913, + "learning_rate": 3.513513513513514e-07, + "loss": 0.9742, + "step": 26 + }, + { + "epoch": 0.002192804353122716, + "grad_norm": 9.333155078594814, + "learning_rate": 3.648648648648649e-07, + "loss": 0.9463, + "step": 27 + }, + { + "epoch": 0.0022740193291642978, + "grad_norm": 5.770329415577018, + "learning_rate": 3.7837837837837843e-07, + "loss": 0.9216, + "step": 28 + }, + { + "epoch": 0.00235523430520588, + "grad_norm": 7.19909286290371, + "learning_rate": 3.9189189189189195e-07, + "loss": 0.9299, + "step": 29 + }, + { + "epoch": 0.002436449281247462, + "grad_norm": 8.198959079831749, + "learning_rate": 4.0540540540540546e-07, + "loss": 1.2135, + "step": 30 + }, + { + "epoch": 0.002517664257289044, + "grad_norm": 7.469329596056761, + "learning_rate": 4.18918918918919e-07, + "loss": 1.162, + "step": 31 + }, + { + "epoch": 0.002598879233330626, + "grad_norm": 6.064802318250161, + "learning_rate": 4.324324324324325e-07, + "loss": 1.0726, + "step": 32 + }, + { + "epoch": 0.0026800942093722084, + "grad_norm": 7.561733885808792, + "learning_rate": 4.45945945945946e-07, + "loss": 0.9132, + "step": 33 + }, + { + "epoch": 0.00276130918541379, + "grad_norm": 5.510848606396675, + "learning_rate": 4.5945945945945953e-07, + "loss": 0.8835, + "step": 34 + }, + { + "epoch": 0.0028425241614553724, + "grad_norm": 5.05147854422008, + "learning_rate": 4.7297297297297305e-07, + "loss": 1.025, + "step": 35 + }, + { + "epoch": 0.0029237391374969546, + "grad_norm": 7.449384043723888, + "learning_rate": 4.864864864864865e-07, + "loss": 1.0634, + "step": 36 + }, + { + "epoch": 0.0030049541135385364, + "grad_norm": 5.554970539302449, + "learning_rate": 5.000000000000001e-07, + "loss": 0.9037, + "step": 37 + }, + { + "epoch": 0.0030861690895801186, + "grad_norm": 5.648676923146146, + "learning_rate": 5.135135135135135e-07, + "loss": 1.0197, + "step": 38 + }, + { + "epoch": 0.0031673840656217004, + "grad_norm": 5.581517917957071, + "learning_rate": 5.270270270270271e-07, + "loss": 0.9723, + "step": 39 + }, + { + "epoch": 0.0032485990416632827, + "grad_norm": 6.811513573858421, + "learning_rate": 5.405405405405406e-07, + "loss": 1.0647, + "step": 40 + }, + { + "epoch": 0.003329814017704865, + "grad_norm": 13.02023304402827, + "learning_rate": 5.540540540540542e-07, + "loss": 0.9535, + "step": 41 + }, + { + "epoch": 0.0034110289937464467, + "grad_norm": 5.903154525353516, + "learning_rate": 5.675675675675676e-07, + "loss": 0.9207, + "step": 42 + }, + { + "epoch": 0.003492243969788029, + "grad_norm": 5.713506629308762, + "learning_rate": 5.810810810810812e-07, + "loss": 0.901, + "step": 43 + }, + { + "epoch": 0.003573458945829611, + "grad_norm": 7.568618399596037, + "learning_rate": 5.945945945945947e-07, + "loss": 0.8387, + "step": 44 + }, + { + "epoch": 0.003654673921871193, + "grad_norm": 4.684278865041181, + "learning_rate": 6.081081081081082e-07, + "loss": 0.9369, + "step": 45 + }, + { + "epoch": 0.003735888897912775, + "grad_norm": 4.719765340896398, + "learning_rate": 6.216216216216217e-07, + "loss": 0.9138, + "step": 46 + }, + { + "epoch": 0.0038171038739543573, + "grad_norm": 6.215441613276706, + "learning_rate": 6.351351351351353e-07, + "loss": 0.8665, + "step": 47 + }, + { + "epoch": 0.003898318849995939, + "grad_norm": 5.80882937072084, + "learning_rate": 6.486486486486487e-07, + "loss": 0.8277, + "step": 48 + }, + { + "epoch": 0.003979533826037522, + "grad_norm": 4.4336706590480786, + "learning_rate": 6.621621621621623e-07, + "loss": 0.9194, + "step": 49 + }, + { + "epoch": 0.004060748802079103, + "grad_norm": 6.287882016956921, + "learning_rate": 6.756756756756758e-07, + "loss": 0.997, + "step": 50 + }, + { + "epoch": 0.004141963778120685, + "grad_norm": 5.639526953982635, + "learning_rate": 6.891891891891893e-07, + "loss": 0.9637, + "step": 51 + }, + { + "epoch": 0.0042231787541622675, + "grad_norm": 5.51310604984265, + "learning_rate": 7.027027027027028e-07, + "loss": 0.8232, + "step": 52 + }, + { + "epoch": 0.00430439373020385, + "grad_norm": 4.517639703226352, + "learning_rate": 7.162162162162164e-07, + "loss": 0.8983, + "step": 53 + }, + { + "epoch": 0.004385608706245432, + "grad_norm": 6.666343453939387, + "learning_rate": 7.297297297297298e-07, + "loss": 0.7985, + "step": 54 + }, + { + "epoch": 0.004466823682287013, + "grad_norm": 5.422191276786231, + "learning_rate": 7.432432432432434e-07, + "loss": 0.9058, + "step": 55 + }, + { + "epoch": 0.0045480386583285955, + "grad_norm": 4.624429643253304, + "learning_rate": 7.567567567567569e-07, + "loss": 0.7943, + "step": 56 + }, + { + "epoch": 0.004629253634370178, + "grad_norm": 4.391972463636353, + "learning_rate": 7.702702702702704e-07, + "loss": 0.8046, + "step": 57 + }, + { + "epoch": 0.00471046861041176, + "grad_norm": 7.0317366536589745, + "learning_rate": 7.837837837837839e-07, + "loss": 0.96, + "step": 58 + }, + { + "epoch": 0.004791683586453342, + "grad_norm": 6.253750275868285, + "learning_rate": 7.972972972972974e-07, + "loss": 0.8267, + "step": 59 + }, + { + "epoch": 0.004872898562494924, + "grad_norm": 4.2004573670066545, + "learning_rate": 8.108108108108109e-07, + "loss": 0.7541, + "step": 60 + }, + { + "epoch": 0.004954113538536506, + "grad_norm": 4.843922828461304, + "learning_rate": 8.243243243243244e-07, + "loss": 0.8652, + "step": 61 + }, + { + "epoch": 0.005035328514578088, + "grad_norm": 6.663532165513732, + "learning_rate": 8.37837837837838e-07, + "loss": 0.9765, + "step": 62 + }, + { + "epoch": 0.00511654349061967, + "grad_norm": 5.518475179846077, + "learning_rate": 8.513513513513514e-07, + "loss": 0.8343, + "step": 63 + }, + { + "epoch": 0.005197758466661252, + "grad_norm": 4.16743993524973, + "learning_rate": 8.64864864864865e-07, + "loss": 0.8835, + "step": 64 + }, + { + "epoch": 0.005278973442702835, + "grad_norm": 4.724546088137992, + "learning_rate": 8.783783783783785e-07, + "loss": 0.8031, + "step": 65 + }, + { + "epoch": 0.005360188418744417, + "grad_norm": 5.965079554752788, + "learning_rate": 8.91891891891892e-07, + "loss": 0.8464, + "step": 66 + }, + { + "epoch": 0.005441403394785998, + "grad_norm": 7.388225521826432, + "learning_rate": 9.054054054054055e-07, + "loss": 0.9254, + "step": 67 + }, + { + "epoch": 0.00552261837082758, + "grad_norm": 4.486467374218727, + "learning_rate": 9.189189189189191e-07, + "loss": 0.7852, + "step": 68 + }, + { + "epoch": 0.005603833346869163, + "grad_norm": 4.818486349074325, + "learning_rate": 9.324324324324325e-07, + "loss": 0.6409, + "step": 69 + }, + { + "epoch": 0.005685048322910745, + "grad_norm": 12.591926984529453, + "learning_rate": 9.459459459459461e-07, + "loss": 0.6555, + "step": 70 + }, + { + "epoch": 0.005766263298952327, + "grad_norm": 3.582506346281597, + "learning_rate": 9.594594594594596e-07, + "loss": 0.7594, + "step": 71 + }, + { + "epoch": 0.005847478274993909, + "grad_norm": 6.316272242206502, + "learning_rate": 9.72972972972973e-07, + "loss": 0.8258, + "step": 72 + }, + { + "epoch": 0.005928693251035491, + "grad_norm": 7.768203703731759, + "learning_rate": 9.864864864864867e-07, + "loss": 0.7343, + "step": 73 + }, + { + "epoch": 0.006009908227077073, + "grad_norm": 4.411865830101082, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.8984, + "step": 74 + }, + { + "epoch": 0.006091123203118655, + "grad_norm": 6.221797107498061, + "learning_rate": 1.0135135135135136e-06, + "loss": 0.8646, + "step": 75 + }, + { + "epoch": 0.006172338179160237, + "grad_norm": 4.668292899364404, + "learning_rate": 1.027027027027027e-06, + "loss": 0.7082, + "step": 76 + }, + { + "epoch": 0.0062535531552018195, + "grad_norm": 5.307505469950394, + "learning_rate": 1.0405405405405408e-06, + "loss": 0.7584, + "step": 77 + }, + { + "epoch": 0.006334768131243401, + "grad_norm": 5.411978113961716, + "learning_rate": 1.0540540540540542e-06, + "loss": 0.8715, + "step": 78 + }, + { + "epoch": 0.006415983107284983, + "grad_norm": 11.725200388824204, + "learning_rate": 1.0675675675675677e-06, + "loss": 0.8187, + "step": 79 + }, + { + "epoch": 0.006497198083326565, + "grad_norm": 7.066619804350216, + "learning_rate": 1.0810810810810812e-06, + "loss": 0.7657, + "step": 80 + }, + { + "epoch": 0.0065784130593681475, + "grad_norm": 5.206447831213838, + "learning_rate": 1.0945945945945948e-06, + "loss": 1.0916, + "step": 81 + }, + { + "epoch": 0.00665962803540973, + "grad_norm": 6.270913752130497, + "learning_rate": 1.1081081081081083e-06, + "loss": 0.695, + "step": 82 + }, + { + "epoch": 0.006740843011451312, + "grad_norm": 5.602218933207931, + "learning_rate": 1.1216216216216218e-06, + "loss": 0.7361, + "step": 83 + }, + { + "epoch": 0.006822057987492893, + "grad_norm": 5.592916777746583, + "learning_rate": 1.1351351351351352e-06, + "loss": 0.6823, + "step": 84 + }, + { + "epoch": 0.0069032729635344755, + "grad_norm": 6.7335919368870565, + "learning_rate": 1.148648648648649e-06, + "loss": 0.7927, + "step": 85 + }, + { + "epoch": 0.006984487939576058, + "grad_norm": 7.941076590684681, + "learning_rate": 1.1621621621621624e-06, + "loss": 0.8417, + "step": 86 + }, + { + "epoch": 0.00706570291561764, + "grad_norm": 5.969244160464252, + "learning_rate": 1.1756756756756758e-06, + "loss": 0.689, + "step": 87 + }, + { + "epoch": 0.007146917891659222, + "grad_norm": 5.118618058340234, + "learning_rate": 1.1891891891891893e-06, + "loss": 0.9608, + "step": 88 + }, + { + "epoch": 0.007228132867700804, + "grad_norm": 8.55651778885645, + "learning_rate": 1.2027027027027028e-06, + "loss": 0.6376, + "step": 89 + }, + { + "epoch": 0.007309347843742386, + "grad_norm": 8.9312254966113, + "learning_rate": 1.2162162162162164e-06, + "loss": 0.8781, + "step": 90 + }, + { + "epoch": 0.007390562819783968, + "grad_norm": 5.435421842215974, + "learning_rate": 1.22972972972973e-06, + "loss": 0.7296, + "step": 91 + }, + { + "epoch": 0.00747177779582555, + "grad_norm": 7.00575424705863, + "learning_rate": 1.2432432432432434e-06, + "loss": 0.7004, + "step": 92 + }, + { + "epoch": 0.007552992771867132, + "grad_norm": 5.126051571596671, + "learning_rate": 1.2567567567567568e-06, + "loss": 0.6933, + "step": 93 + }, + { + "epoch": 0.007634207747908715, + "grad_norm": 7.546722570854549, + "learning_rate": 1.2702702702702705e-06, + "loss": 0.7443, + "step": 94 + }, + { + "epoch": 0.007715422723950297, + "grad_norm": 5.82298529351105, + "learning_rate": 1.2837837837837838e-06, + "loss": 0.6609, + "step": 95 + }, + { + "epoch": 0.007796637699991878, + "grad_norm": 5.395134658996866, + "learning_rate": 1.2972972972972974e-06, + "loss": 0.6966, + "step": 96 + }, + { + "epoch": 0.007877852676033461, + "grad_norm": 5.715845533979447, + "learning_rate": 1.310810810810811e-06, + "loss": 0.8075, + "step": 97 + }, + { + "epoch": 0.007959067652075043, + "grad_norm": 4.346663727077859, + "learning_rate": 1.3243243243243246e-06, + "loss": 0.8382, + "step": 98 + }, + { + "epoch": 0.008040282628116624, + "grad_norm": 4.510360891483355, + "learning_rate": 1.3378378378378378e-06, + "loss": 0.8409, + "step": 99 + }, + { + "epoch": 0.008121497604158206, + "grad_norm": 6.252214797052545, + "learning_rate": 1.3513513513513515e-06, + "loss": 0.8968, + "step": 100 + }, + { + "epoch": 0.008202712580199788, + "grad_norm": 4.133668140760274, + "learning_rate": 1.364864864864865e-06, + "loss": 0.7697, + "step": 101 + }, + { + "epoch": 0.00828392755624137, + "grad_norm": 5.13995005645122, + "learning_rate": 1.3783783783783786e-06, + "loss": 0.8302, + "step": 102 + }, + { + "epoch": 0.008365142532282953, + "grad_norm": 5.645133972443234, + "learning_rate": 1.391891891891892e-06, + "loss": 0.8553, + "step": 103 + }, + { + "epoch": 0.008446357508324535, + "grad_norm": 5.768137707230809, + "learning_rate": 1.4054054054054056e-06, + "loss": 0.7269, + "step": 104 + }, + { + "epoch": 0.008527572484366117, + "grad_norm": 3.9384737579894358, + "learning_rate": 1.418918918918919e-06, + "loss": 0.8465, + "step": 105 + }, + { + "epoch": 0.0086087874604077, + "grad_norm": 6.608951862752225, + "learning_rate": 1.4324324324324327e-06, + "loss": 0.7358, + "step": 106 + }, + { + "epoch": 0.008690002436449282, + "grad_norm": 4.8856541626722425, + "learning_rate": 1.445945945945946e-06, + "loss": 0.9122, + "step": 107 + }, + { + "epoch": 0.008771217412490864, + "grad_norm": 3.744812822198274, + "learning_rate": 1.4594594594594596e-06, + "loss": 0.7847, + "step": 108 + }, + { + "epoch": 0.008852432388532446, + "grad_norm": 4.555012637892476, + "learning_rate": 1.4729729729729731e-06, + "loss": 0.6125, + "step": 109 + }, + { + "epoch": 0.008933647364574027, + "grad_norm": 5.417265029398178, + "learning_rate": 1.4864864864864868e-06, + "loss": 0.6426, + "step": 110 + }, + { + "epoch": 0.009014862340615609, + "grad_norm": 6.978404404546592, + "learning_rate": 1.5e-06, + "loss": 0.8351, + "step": 111 + }, + { + "epoch": 0.009096077316657191, + "grad_norm": 7.486793674946231, + "learning_rate": 1.5135135135135137e-06, + "loss": 0.9104, + "step": 112 + }, + { + "epoch": 0.009177292292698773, + "grad_norm": 6.427379250554024, + "learning_rate": 1.5270270270270272e-06, + "loss": 0.7182, + "step": 113 + }, + { + "epoch": 0.009258507268740356, + "grad_norm": 7.581127803220287, + "learning_rate": 1.5405405405405409e-06, + "loss": 0.7966, + "step": 114 + }, + { + "epoch": 0.009339722244781938, + "grad_norm": 4.51084883429586, + "learning_rate": 1.5540540540540541e-06, + "loss": 0.7349, + "step": 115 + }, + { + "epoch": 0.00942093722082352, + "grad_norm": 5.521276273858657, + "learning_rate": 1.5675675675675678e-06, + "loss": 0.6476, + "step": 116 + }, + { + "epoch": 0.009502152196865102, + "grad_norm": 10.232542435537884, + "learning_rate": 1.5810810810810812e-06, + "loss": 0.7003, + "step": 117 + }, + { + "epoch": 0.009583367172906684, + "grad_norm": 5.670487468601197, + "learning_rate": 1.5945945945945947e-06, + "loss": 0.921, + "step": 118 + }, + { + "epoch": 0.009664582148948267, + "grad_norm": 5.136554782606984, + "learning_rate": 1.6081081081081082e-06, + "loss": 0.6932, + "step": 119 + }, + { + "epoch": 0.009745797124989849, + "grad_norm": 3.81643952542769, + "learning_rate": 1.6216216216216219e-06, + "loss": 0.7676, + "step": 120 + }, + { + "epoch": 0.009827012101031431, + "grad_norm": 7.129104413729071, + "learning_rate": 1.6351351351351353e-06, + "loss": 0.7903, + "step": 121 + }, + { + "epoch": 0.009908227077073012, + "grad_norm": 5.89283670831236, + "learning_rate": 1.6486486486486488e-06, + "loss": 0.6741, + "step": 122 + }, + { + "epoch": 0.009989442053114594, + "grad_norm": 4.728889968775353, + "learning_rate": 1.6621621621621622e-06, + "loss": 0.7493, + "step": 123 + }, + { + "epoch": 0.010070657029156176, + "grad_norm": 5.400043416105325, + "learning_rate": 1.675675675675676e-06, + "loss": 0.6254, + "step": 124 + }, + { + "epoch": 0.010151872005197758, + "grad_norm": 5.7442992971209605, + "learning_rate": 1.6891891891891894e-06, + "loss": 0.6588, + "step": 125 + }, + { + "epoch": 0.01023308698123934, + "grad_norm": 5.225132595645112, + "learning_rate": 1.7027027027027028e-06, + "loss": 0.721, + "step": 126 + }, + { + "epoch": 0.010314301957280923, + "grad_norm": 4.917827752874381, + "learning_rate": 1.7162162162162163e-06, + "loss": 0.6719, + "step": 127 + }, + { + "epoch": 0.010395516933322505, + "grad_norm": 5.96105392795946, + "learning_rate": 1.72972972972973e-06, + "loss": 0.8192, + "step": 128 + }, + { + "epoch": 0.010476731909364087, + "grad_norm": 5.348491477184876, + "learning_rate": 1.7432432432432432e-06, + "loss": 0.741, + "step": 129 + }, + { + "epoch": 0.01055794688540567, + "grad_norm": 4.046650669596968, + "learning_rate": 1.756756756756757e-06, + "loss": 0.6943, + "step": 130 + }, + { + "epoch": 0.010639161861447251, + "grad_norm": 5.045700573097575, + "learning_rate": 1.7702702702702704e-06, + "loss": 0.5737, + "step": 131 + }, + { + "epoch": 0.010720376837488834, + "grad_norm": 4.332973323075942, + "learning_rate": 1.783783783783784e-06, + "loss": 0.7721, + "step": 132 + }, + { + "epoch": 0.010801591813530414, + "grad_norm": 12.053968494279287, + "learning_rate": 1.7972972972972973e-06, + "loss": 0.7081, + "step": 133 + }, + { + "epoch": 0.010882806789571996, + "grad_norm": 4.314977931658641, + "learning_rate": 1.810810810810811e-06, + "loss": 0.7476, + "step": 134 + }, + { + "epoch": 0.010964021765613579, + "grad_norm": 4.372992825514325, + "learning_rate": 1.8243243243243245e-06, + "loss": 0.5623, + "step": 135 + }, + { + "epoch": 0.01104523674165516, + "grad_norm": 5.06501074840964, + "learning_rate": 1.8378378378378381e-06, + "loss": 0.8409, + "step": 136 + }, + { + "epoch": 0.011126451717696743, + "grad_norm": 5.486341378807148, + "learning_rate": 1.8513513513513514e-06, + "loss": 0.7097, + "step": 137 + }, + { + "epoch": 0.011207666693738325, + "grad_norm": 5.885624763272731, + "learning_rate": 1.864864864864865e-06, + "loss": 0.8605, + "step": 138 + }, + { + "epoch": 0.011288881669779907, + "grad_norm": 5.810017768183461, + "learning_rate": 1.8783783783783785e-06, + "loss": 0.7589, + "step": 139 + }, + { + "epoch": 0.01137009664582149, + "grad_norm": 5.241258116276019, + "learning_rate": 1.8918918918918922e-06, + "loss": 0.6821, + "step": 140 + }, + { + "epoch": 0.011451311621863072, + "grad_norm": 5.141543904486299, + "learning_rate": 1.9054054054054054e-06, + "loss": 0.6702, + "step": 141 + }, + { + "epoch": 0.011532526597904654, + "grad_norm": 6.614627634564522, + "learning_rate": 1.918918918918919e-06, + "loss": 0.7541, + "step": 142 + }, + { + "epoch": 0.011613741573946236, + "grad_norm": 4.270716146835968, + "learning_rate": 1.9324324324324326e-06, + "loss": 0.6118, + "step": 143 + }, + { + "epoch": 0.011694956549987819, + "grad_norm": 4.833439173611212, + "learning_rate": 1.945945945945946e-06, + "loss": 0.6631, + "step": 144 + }, + { + "epoch": 0.011776171526029399, + "grad_norm": 6.670666111898159, + "learning_rate": 1.9594594594594595e-06, + "loss": 0.5792, + "step": 145 + }, + { + "epoch": 0.011857386502070981, + "grad_norm": 5.320699935552583, + "learning_rate": 1.9729729729729734e-06, + "loss": 0.6649, + "step": 146 + }, + { + "epoch": 0.011938601478112564, + "grad_norm": 10.048210540269071, + "learning_rate": 1.9864864864864864e-06, + "loss": 0.6055, + "step": 147 + }, + { + "epoch": 0.012019816454154146, + "grad_norm": 4.8358064531705525, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7224, + "step": 148 + }, + { + "epoch": 0.012101031430195728, + "grad_norm": 8.585169822222198, + "learning_rate": 2.013513513513514e-06, + "loss": 0.7546, + "step": 149 + }, + { + "epoch": 0.01218224640623731, + "grad_norm": 4.847625342598578, + "learning_rate": 2.0270270270270273e-06, + "loss": 0.6867, + "step": 150 + }, + { + "epoch": 0.012263461382278892, + "grad_norm": 3.7722782135223594, + "learning_rate": 2.0405405405405407e-06, + "loss": 0.7262, + "step": 151 + }, + { + "epoch": 0.012344676358320475, + "grad_norm": 5.149292525910975, + "learning_rate": 2.054054054054054e-06, + "loss": 0.8792, + "step": 152 + }, + { + "epoch": 0.012425891334362057, + "grad_norm": 6.26503383092344, + "learning_rate": 2.0675675675675677e-06, + "loss": 0.7837, + "step": 153 + }, + { + "epoch": 0.012507106310403639, + "grad_norm": 4.188912763837703, + "learning_rate": 2.0810810810810815e-06, + "loss": 0.5692, + "step": 154 + }, + { + "epoch": 0.012588321286445221, + "grad_norm": 5.04456436284514, + "learning_rate": 2.0945945945945946e-06, + "loss": 0.652, + "step": 155 + }, + { + "epoch": 0.012669536262486802, + "grad_norm": 7.405968257695848, + "learning_rate": 2.1081081081081085e-06, + "loss": 0.6266, + "step": 156 + }, + { + "epoch": 0.012750751238528384, + "grad_norm": 5.310169429627866, + "learning_rate": 2.121621621621622e-06, + "loss": 0.6542, + "step": 157 + }, + { + "epoch": 0.012831966214569966, + "grad_norm": 5.046936455927635, + "learning_rate": 2.1351351351351354e-06, + "loss": 0.8108, + "step": 158 + }, + { + "epoch": 0.012913181190611548, + "grad_norm": 4.036458244440137, + "learning_rate": 2.148648648648649e-06, + "loss": 0.6339, + "step": 159 + }, + { + "epoch": 0.01299439616665313, + "grad_norm": 5.254253720134002, + "learning_rate": 2.1621621621621623e-06, + "loss": 0.7129, + "step": 160 + }, + { + "epoch": 0.013075611142694713, + "grad_norm": 4.842001153737952, + "learning_rate": 2.175675675675676e-06, + "loss": 0.7669, + "step": 161 + }, + { + "epoch": 0.013156826118736295, + "grad_norm": 3.45883013736874, + "learning_rate": 2.1891891891891897e-06, + "loss": 0.7403, + "step": 162 + }, + { + "epoch": 0.013238041094777877, + "grad_norm": 6.615557242600742, + "learning_rate": 2.2027027027027027e-06, + "loss": 0.6072, + "step": 163 + }, + { + "epoch": 0.01331925607081946, + "grad_norm": 5.295390317262873, + "learning_rate": 2.2162162162162166e-06, + "loss": 0.4518, + "step": 164 + }, + { + "epoch": 0.013400471046861042, + "grad_norm": 4.553708607425031, + "learning_rate": 2.22972972972973e-06, + "loss": 0.7405, + "step": 165 + }, + { + "epoch": 0.013481686022902624, + "grad_norm": 5.730983978090373, + "learning_rate": 2.2432432432432435e-06, + "loss": 0.7216, + "step": 166 + }, + { + "epoch": 0.013562900998944206, + "grad_norm": 6.28068035640207, + "learning_rate": 2.256756756756757e-06, + "loss": 0.543, + "step": 167 + }, + { + "epoch": 0.013644115974985787, + "grad_norm": 5.534240505988875, + "learning_rate": 2.2702702702702705e-06, + "loss": 0.7273, + "step": 168 + }, + { + "epoch": 0.013725330951027369, + "grad_norm": 10.022692075346896, + "learning_rate": 2.283783783783784e-06, + "loss": 0.6259, + "step": 169 + }, + { + "epoch": 0.013806545927068951, + "grad_norm": 4.133335838910277, + "learning_rate": 2.297297297297298e-06, + "loss": 0.5538, + "step": 170 + }, + { + "epoch": 0.013887760903110533, + "grad_norm": 6.849412426150848, + "learning_rate": 2.310810810810811e-06, + "loss": 0.5594, + "step": 171 + }, + { + "epoch": 0.013968975879152115, + "grad_norm": 7.2240081386637955, + "learning_rate": 2.3243243243243247e-06, + "loss": 0.6406, + "step": 172 + }, + { + "epoch": 0.014050190855193698, + "grad_norm": 4.948498140450114, + "learning_rate": 2.337837837837838e-06, + "loss": 0.697, + "step": 173 + }, + { + "epoch": 0.01413140583123528, + "grad_norm": 3.8421671175611465, + "learning_rate": 2.3513513513513517e-06, + "loss": 0.709, + "step": 174 + }, + { + "epoch": 0.014212620807276862, + "grad_norm": 4.9263286743150925, + "learning_rate": 2.364864864864865e-06, + "loss": 0.5944, + "step": 175 + }, + { + "epoch": 0.014293835783318444, + "grad_norm": 4.181352300851268, + "learning_rate": 2.3783783783783786e-06, + "loss": 0.7951, + "step": 176 + }, + { + "epoch": 0.014375050759360027, + "grad_norm": 5.217958869841234, + "learning_rate": 2.391891891891892e-06, + "loss": 0.8847, + "step": 177 + }, + { + "epoch": 0.014456265735401609, + "grad_norm": 5.53287305794427, + "learning_rate": 2.4054054054054055e-06, + "loss": 0.6506, + "step": 178 + }, + { + "epoch": 0.01453748071144319, + "grad_norm": 4.977958862414195, + "learning_rate": 2.418918918918919e-06, + "loss": 0.649, + "step": 179 + }, + { + "epoch": 0.014618695687484771, + "grad_norm": 7.079003805106819, + "learning_rate": 2.432432432432433e-06, + "loss": 0.5173, + "step": 180 + }, + { + "epoch": 0.014699910663526354, + "grad_norm": 6.4006738662761675, + "learning_rate": 2.4459459459459463e-06, + "loss": 0.7425, + "step": 181 + }, + { + "epoch": 0.014781125639567936, + "grad_norm": 7.223867008336038, + "learning_rate": 2.45945945945946e-06, + "loss": 0.7116, + "step": 182 + }, + { + "epoch": 0.014862340615609518, + "grad_norm": 4.74150217223079, + "learning_rate": 2.4729729729729733e-06, + "loss": 0.8202, + "step": 183 + }, + { + "epoch": 0.0149435555916511, + "grad_norm": 5.838162332180424, + "learning_rate": 2.4864864864864867e-06, + "loss": 0.7202, + "step": 184 + }, + { + "epoch": 0.015024770567692683, + "grad_norm": 7.3164412014837845, + "learning_rate": 2.5e-06, + "loss": 0.6318, + "step": 185 + }, + { + "epoch": 0.015105985543734265, + "grad_norm": 4.0466470539819674, + "learning_rate": 2.5135135135135137e-06, + "loss": 0.6161, + "step": 186 + }, + { + "epoch": 0.015187200519775847, + "grad_norm": 4.20696930651959, + "learning_rate": 2.527027027027027e-06, + "loss": 0.8844, + "step": 187 + }, + { + "epoch": 0.01526841549581743, + "grad_norm": 4.619062654793844, + "learning_rate": 2.540540540540541e-06, + "loss": 0.7893, + "step": 188 + }, + { + "epoch": 0.015349630471859011, + "grad_norm": 4.028095158003242, + "learning_rate": 2.554054054054054e-06, + "loss": 0.9053, + "step": 189 + }, + { + "epoch": 0.015430845447900594, + "grad_norm": 5.474147278368468, + "learning_rate": 2.5675675675675675e-06, + "loss": 0.6785, + "step": 190 + }, + { + "epoch": 0.015512060423942174, + "grad_norm": 7.068401125839219, + "learning_rate": 2.581081081081081e-06, + "loss": 0.6353, + "step": 191 + }, + { + "epoch": 0.015593275399983756, + "grad_norm": 6.211801926350057, + "learning_rate": 2.594594594594595e-06, + "loss": 0.6171, + "step": 192 + }, + { + "epoch": 0.01567449037602534, + "grad_norm": 6.8766856866833805, + "learning_rate": 2.6081081081081083e-06, + "loss": 0.6291, + "step": 193 + }, + { + "epoch": 0.015755705352066923, + "grad_norm": 4.450746282218491, + "learning_rate": 2.621621621621622e-06, + "loss": 0.7322, + "step": 194 + }, + { + "epoch": 0.015836920328108503, + "grad_norm": 6.737626992175194, + "learning_rate": 2.6351351351351353e-06, + "loss": 0.8431, + "step": 195 + }, + { + "epoch": 0.015918135304150087, + "grad_norm": 5.202217528227061, + "learning_rate": 2.648648648648649e-06, + "loss": 0.6027, + "step": 196 + }, + { + "epoch": 0.015999350280191667, + "grad_norm": 6.136140183171829, + "learning_rate": 2.662162162162162e-06, + "loss": 0.7227, + "step": 197 + }, + { + "epoch": 0.016080565256233248, + "grad_norm": 6.20592135137412, + "learning_rate": 2.6756756756756757e-06, + "loss": 0.8206, + "step": 198 + }, + { + "epoch": 0.016161780232274832, + "grad_norm": 5.542234344965935, + "learning_rate": 2.689189189189189e-06, + "loss": 0.5755, + "step": 199 + }, + { + "epoch": 0.016242995208316412, + "grad_norm": 6.481129594142135, + "learning_rate": 2.702702702702703e-06, + "loss": 0.9124, + "step": 200 + }, + { + "epoch": 0.016324210184357996, + "grad_norm": 8.819932985194717, + "learning_rate": 2.7162162162162165e-06, + "loss": 0.7077, + "step": 201 + }, + { + "epoch": 0.016405425160399577, + "grad_norm": 3.7133046294352496, + "learning_rate": 2.72972972972973e-06, + "loss": 0.7187, + "step": 202 + }, + { + "epoch": 0.01648664013644116, + "grad_norm": 4.768904441511917, + "learning_rate": 2.7432432432432434e-06, + "loss": 0.7079, + "step": 203 + }, + { + "epoch": 0.01656785511248274, + "grad_norm": 5.4109232849671445, + "learning_rate": 2.7567567567567573e-06, + "loss": 0.8741, + "step": 204 + }, + { + "epoch": 0.016649070088524325, + "grad_norm": 11.784275280928684, + "learning_rate": 2.7702702702702703e-06, + "loss": 0.6088, + "step": 205 + }, + { + "epoch": 0.016730285064565906, + "grad_norm": 6.596444076197084, + "learning_rate": 2.783783783783784e-06, + "loss": 0.7085, + "step": 206 + }, + { + "epoch": 0.01681150004060749, + "grad_norm": 4.7347497389094055, + "learning_rate": 2.7972972972972973e-06, + "loss": 0.7641, + "step": 207 + }, + { + "epoch": 0.01689271501664907, + "grad_norm": 4.895873979702855, + "learning_rate": 2.810810810810811e-06, + "loss": 0.613, + "step": 208 + }, + { + "epoch": 0.01697392999269065, + "grad_norm": 4.112688584540271, + "learning_rate": 2.8243243243243246e-06, + "loss": 0.6551, + "step": 209 + }, + { + "epoch": 0.017055144968732235, + "grad_norm": 4.280798819098727, + "learning_rate": 2.837837837837838e-06, + "loss": 0.8091, + "step": 210 + }, + { + "epoch": 0.017136359944773815, + "grad_norm": 4.953151146240816, + "learning_rate": 2.851351351351351e-06, + "loss": 0.6225, + "step": 211 + }, + { + "epoch": 0.0172175749208154, + "grad_norm": 4.858934778537565, + "learning_rate": 2.8648648648648654e-06, + "loss": 0.7894, + "step": 212 + }, + { + "epoch": 0.01729878989685698, + "grad_norm": 3.776464089377743, + "learning_rate": 2.8783783783783785e-06, + "loss": 0.6714, + "step": 213 + }, + { + "epoch": 0.017380004872898563, + "grad_norm": 5.927211431887128, + "learning_rate": 2.891891891891892e-06, + "loss": 0.6115, + "step": 214 + }, + { + "epoch": 0.017461219848940144, + "grad_norm": 4.290882863865101, + "learning_rate": 2.9054054054054054e-06, + "loss": 0.8019, + "step": 215 + }, + { + "epoch": 0.017542434824981728, + "grad_norm": 5.014996919723652, + "learning_rate": 2.9189189189189193e-06, + "loss": 0.9165, + "step": 216 + }, + { + "epoch": 0.01762364980102331, + "grad_norm": 4.678677528211619, + "learning_rate": 2.9324324324324328e-06, + "loss": 0.7034, + "step": 217 + }, + { + "epoch": 0.017704864777064892, + "grad_norm": 5.856721537107752, + "learning_rate": 2.9459459459459462e-06, + "loss": 0.6187, + "step": 218 + }, + { + "epoch": 0.017786079753106473, + "grad_norm": 5.675292275975791, + "learning_rate": 2.9594594594594593e-06, + "loss": 0.6282, + "step": 219 + }, + { + "epoch": 0.017867294729148053, + "grad_norm": 4.026148472844569, + "learning_rate": 2.9729729729729736e-06, + "loss": 0.6984, + "step": 220 + }, + { + "epoch": 0.017948509705189637, + "grad_norm": 3.7553668002568115, + "learning_rate": 2.9864864864864866e-06, + "loss": 0.7059, + "step": 221 + }, + { + "epoch": 0.018029724681231218, + "grad_norm": 5.010656407185249, + "learning_rate": 3e-06, + "loss": 0.6676, + "step": 222 + }, + { + "epoch": 0.0181109396572728, + "grad_norm": 5.174403860939526, + "learning_rate": 3.0135135135135135e-06, + "loss": 0.6528, + "step": 223 + }, + { + "epoch": 0.018192154633314382, + "grad_norm": 9.300176785765524, + "learning_rate": 3.0270270270270274e-06, + "loss": 0.6981, + "step": 224 + }, + { + "epoch": 0.018273369609355966, + "grad_norm": 6.1866583084404185, + "learning_rate": 3.040540540540541e-06, + "loss": 0.704, + "step": 225 + }, + { + "epoch": 0.018354584585397547, + "grad_norm": 7.4629748217872605, + "learning_rate": 3.0540540540540544e-06, + "loss": 0.593, + "step": 226 + }, + { + "epoch": 0.01843579956143913, + "grad_norm": 4.530659236075315, + "learning_rate": 3.0675675675675674e-06, + "loss": 0.5932, + "step": 227 + }, + { + "epoch": 0.01851701453748071, + "grad_norm": 5.748440319813538, + "learning_rate": 3.0810810810810817e-06, + "loss": 0.7238, + "step": 228 + }, + { + "epoch": 0.018598229513522295, + "grad_norm": 40.72028295804037, + "learning_rate": 3.0945945945945947e-06, + "loss": 0.7017, + "step": 229 + }, + { + "epoch": 0.018679444489563875, + "grad_norm": 8.886281188290711, + "learning_rate": 3.1081081081081082e-06, + "loss": 0.5761, + "step": 230 + }, + { + "epoch": 0.018760659465605456, + "grad_norm": 4.343869317099014, + "learning_rate": 3.1216216216216217e-06, + "loss": 0.8567, + "step": 231 + }, + { + "epoch": 0.01884187444164704, + "grad_norm": 4.744970167283222, + "learning_rate": 3.1351351351351356e-06, + "loss": 0.7763, + "step": 232 + }, + { + "epoch": 0.01892308941768862, + "grad_norm": 7.495714776851687, + "learning_rate": 3.148648648648649e-06, + "loss": 0.8186, + "step": 233 + }, + { + "epoch": 0.019004304393730204, + "grad_norm": 4.102840820567404, + "learning_rate": 3.1621621621621625e-06, + "loss": 0.7489, + "step": 234 + }, + { + "epoch": 0.019085519369771785, + "grad_norm": 3.8421345026888027, + "learning_rate": 3.1756756756756755e-06, + "loss": 0.6933, + "step": 235 + }, + { + "epoch": 0.01916673434581337, + "grad_norm": 7.387916364375026, + "learning_rate": 3.1891891891891894e-06, + "loss": 0.6052, + "step": 236 + }, + { + "epoch": 0.01924794932185495, + "grad_norm": 3.849533522847183, + "learning_rate": 3.202702702702703e-06, + "loss": 0.7804, + "step": 237 + }, + { + "epoch": 0.019329164297896533, + "grad_norm": 5.396298934498324, + "learning_rate": 3.2162162162162164e-06, + "loss": 0.6641, + "step": 238 + }, + { + "epoch": 0.019410379273938114, + "grad_norm": 4.940398668509269, + "learning_rate": 3.22972972972973e-06, + "loss": 0.5532, + "step": 239 + }, + { + "epoch": 0.019491594249979698, + "grad_norm": 6.577994445331441, + "learning_rate": 3.2432432432432437e-06, + "loss": 0.7118, + "step": 240 + }, + { + "epoch": 0.019572809226021278, + "grad_norm": 5.519259337961804, + "learning_rate": 3.256756756756757e-06, + "loss": 0.7421, + "step": 241 + }, + { + "epoch": 0.019654024202062862, + "grad_norm": 6.0576611783888215, + "learning_rate": 3.2702702702702706e-06, + "loss": 0.6083, + "step": 242 + }, + { + "epoch": 0.019735239178104443, + "grad_norm": 8.457628617936836, + "learning_rate": 3.2837837837837837e-06, + "loss": 0.678, + "step": 243 + }, + { + "epoch": 0.019816454154146023, + "grad_norm": 7.364746035461257, + "learning_rate": 3.2972972972972976e-06, + "loss": 0.5896, + "step": 244 + }, + { + "epoch": 0.019897669130187607, + "grad_norm": 7.750984536568317, + "learning_rate": 3.310810810810811e-06, + "loss": 0.6219, + "step": 245 + }, + { + "epoch": 0.019978884106229187, + "grad_norm": 4.981685980750705, + "learning_rate": 3.3243243243243245e-06, + "loss": 0.6942, + "step": 246 + }, + { + "epoch": 0.02006009908227077, + "grad_norm": 6.278603739216485, + "learning_rate": 3.337837837837838e-06, + "loss": 0.7533, + "step": 247 + }, + { + "epoch": 0.020141314058312352, + "grad_norm": 4.801507815370232, + "learning_rate": 3.351351351351352e-06, + "loss": 0.6608, + "step": 248 + }, + { + "epoch": 0.020222529034353936, + "grad_norm": 7.021222735394233, + "learning_rate": 3.3648648648648653e-06, + "loss": 0.8164, + "step": 249 + }, + { + "epoch": 0.020303744010395516, + "grad_norm": 5.832060700035103, + "learning_rate": 3.3783783783783788e-06, + "loss": 0.6886, + "step": 250 + }, + { + "epoch": 0.0203849589864371, + "grad_norm": 4.387381573209668, + "learning_rate": 3.391891891891892e-06, + "loss": 0.6018, + "step": 251 + }, + { + "epoch": 0.02046617396247868, + "grad_norm": 4.588515154439844, + "learning_rate": 3.4054054054054057e-06, + "loss": 0.8009, + "step": 252 + }, + { + "epoch": 0.020547388938520265, + "grad_norm": 5.9906722613866865, + "learning_rate": 3.418918918918919e-06, + "loss": 0.6694, + "step": 253 + }, + { + "epoch": 0.020628603914561845, + "grad_norm": 4.955892303287632, + "learning_rate": 3.4324324324324326e-06, + "loss": 0.8309, + "step": 254 + }, + { + "epoch": 0.020709818890603426, + "grad_norm": 4.824146659749785, + "learning_rate": 3.445945945945946e-06, + "loss": 0.8692, + "step": 255 + }, + { + "epoch": 0.02079103386664501, + "grad_norm": 5.5270982189809015, + "learning_rate": 3.45945945945946e-06, + "loss": 0.5992, + "step": 256 + }, + { + "epoch": 0.02087224884268659, + "grad_norm": 6.204903985337891, + "learning_rate": 3.4729729729729734e-06, + "loss": 0.777, + "step": 257 + }, + { + "epoch": 0.020953463818728174, + "grad_norm": 3.695647833466083, + "learning_rate": 3.4864864864864865e-06, + "loss": 0.6435, + "step": 258 + }, + { + "epoch": 0.021034678794769755, + "grad_norm": 5.291080353938334, + "learning_rate": 3.5e-06, + "loss": 0.6796, + "step": 259 + }, + { + "epoch": 0.02111589377081134, + "grad_norm": 4.5185266685147285, + "learning_rate": 3.513513513513514e-06, + "loss": 0.6448, + "step": 260 + }, + { + "epoch": 0.02119710874685292, + "grad_norm": 4.851027601896295, + "learning_rate": 3.5270270270270273e-06, + "loss": 0.682, + "step": 261 + }, + { + "epoch": 0.021278323722894503, + "grad_norm": 5.6100378122852925, + "learning_rate": 3.5405405405405408e-06, + "loss": 0.7084, + "step": 262 + }, + { + "epoch": 0.021359538698936083, + "grad_norm": 5.135851719796815, + "learning_rate": 3.5540540540540542e-06, + "loss": 0.8671, + "step": 263 + }, + { + "epoch": 0.021440753674977667, + "grad_norm": 5.735868784715722, + "learning_rate": 3.567567567567568e-06, + "loss": 0.5465, + "step": 264 + }, + { + "epoch": 0.021521968651019248, + "grad_norm": 4.041664795202519, + "learning_rate": 3.5810810810810816e-06, + "loss": 0.6117, + "step": 265 + }, + { + "epoch": 0.02160318362706083, + "grad_norm": 4.742239360128328, + "learning_rate": 3.5945945945945946e-06, + "loss": 0.6543, + "step": 266 + }, + { + "epoch": 0.021684398603102412, + "grad_norm": 5.6376075863303114, + "learning_rate": 3.608108108108108e-06, + "loss": 0.7132, + "step": 267 + }, + { + "epoch": 0.021765613579143993, + "grad_norm": 4.108413552937677, + "learning_rate": 3.621621621621622e-06, + "loss": 0.9163, + "step": 268 + }, + { + "epoch": 0.021846828555185577, + "grad_norm": 7.767807764665652, + "learning_rate": 3.6351351351351354e-06, + "loss": 0.6385, + "step": 269 + }, + { + "epoch": 0.021928043531227157, + "grad_norm": 5.864609873584057, + "learning_rate": 3.648648648648649e-06, + "loss": 0.6538, + "step": 270 + }, + { + "epoch": 0.02200925850726874, + "grad_norm": 4.820511968405664, + "learning_rate": 3.6621621621621624e-06, + "loss": 0.6993, + "step": 271 + }, + { + "epoch": 0.02209047348331032, + "grad_norm": 4.967071259608292, + "learning_rate": 3.6756756756756763e-06, + "loss": 0.8915, + "step": 272 + }, + { + "epoch": 0.022171688459351906, + "grad_norm": 3.7176252188194723, + "learning_rate": 3.6891891891891897e-06, + "loss": 0.906, + "step": 273 + }, + { + "epoch": 0.022252903435393486, + "grad_norm": 7.432800095587209, + "learning_rate": 3.7027027027027028e-06, + "loss": 0.5487, + "step": 274 + }, + { + "epoch": 0.02233411841143507, + "grad_norm": 4.725582603425111, + "learning_rate": 3.7162162162162162e-06, + "loss": 0.6497, + "step": 275 + }, + { + "epoch": 0.02241533338747665, + "grad_norm": 7.338793955644148, + "learning_rate": 3.72972972972973e-06, + "loss": 0.6416, + "step": 276 + }, + { + "epoch": 0.02249654836351823, + "grad_norm": 5.306407989783936, + "learning_rate": 3.7432432432432436e-06, + "loss": 0.6278, + "step": 277 + }, + { + "epoch": 0.022577763339559815, + "grad_norm": 5.1495205257748164, + "learning_rate": 3.756756756756757e-06, + "loss": 0.7679, + "step": 278 + }, + { + "epoch": 0.022658978315601395, + "grad_norm": 5.708997533226292, + "learning_rate": 3.7702702702702705e-06, + "loss": 0.7532, + "step": 279 + }, + { + "epoch": 0.02274019329164298, + "grad_norm": 8.83003188884847, + "learning_rate": 3.7837837837837844e-06, + "loss": 0.7133, + "step": 280 + }, + { + "epoch": 0.02282140826768456, + "grad_norm": 5.539341071050379, + "learning_rate": 3.797297297297298e-06, + "loss": 0.7097, + "step": 281 + }, + { + "epoch": 0.022902623243726144, + "grad_norm": 8.240407300947401, + "learning_rate": 3.810810810810811e-06, + "loss": 0.6048, + "step": 282 + }, + { + "epoch": 0.022983838219767724, + "grad_norm": 4.587467280833204, + "learning_rate": 3.824324324324324e-06, + "loss": 0.8175, + "step": 283 + }, + { + "epoch": 0.02306505319580931, + "grad_norm": 7.179515697903426, + "learning_rate": 3.837837837837838e-06, + "loss": 0.8436, + "step": 284 + }, + { + "epoch": 0.02314626817185089, + "grad_norm": 5.309912678695326, + "learning_rate": 3.851351351351352e-06, + "loss": 0.625, + "step": 285 + }, + { + "epoch": 0.023227483147892473, + "grad_norm": 4.941339777332484, + "learning_rate": 3.864864864864865e-06, + "loss": 0.6989, + "step": 286 + }, + { + "epoch": 0.023308698123934053, + "grad_norm": 5.2052407828367215, + "learning_rate": 3.878378378378378e-06, + "loss": 0.5503, + "step": 287 + }, + { + "epoch": 0.023389913099975637, + "grad_norm": 4.238862573234204, + "learning_rate": 3.891891891891892e-06, + "loss": 0.8404, + "step": 288 + }, + { + "epoch": 0.023471128076017218, + "grad_norm": 9.415691207617115, + "learning_rate": 3.905405405405406e-06, + "loss": 0.6679, + "step": 289 + }, + { + "epoch": 0.023552343052058798, + "grad_norm": 4.698876605885647, + "learning_rate": 3.918918918918919e-06, + "loss": 0.5115, + "step": 290 + }, + { + "epoch": 0.023633558028100382, + "grad_norm": 8.590289244453471, + "learning_rate": 3.932432432432433e-06, + "loss": 0.8333, + "step": 291 + }, + { + "epoch": 0.023714773004141963, + "grad_norm": 3.9911010631765764, + "learning_rate": 3.945945945945947e-06, + "loss": 0.5686, + "step": 292 + }, + { + "epoch": 0.023795987980183547, + "grad_norm": 4.926542248398489, + "learning_rate": 3.95945945945946e-06, + "loss": 0.6576, + "step": 293 + }, + { + "epoch": 0.023877202956225127, + "grad_norm": 7.808087195520982, + "learning_rate": 3.972972972972973e-06, + "loss": 0.7971, + "step": 294 + }, + { + "epoch": 0.02395841793226671, + "grad_norm": 4.140299491048756, + "learning_rate": 3.986486486486487e-06, + "loss": 0.649, + "step": 295 + }, + { + "epoch": 0.02403963290830829, + "grad_norm": 6.598136262319362, + "learning_rate": 4.000000000000001e-06, + "loss": 0.6101, + "step": 296 + }, + { + "epoch": 0.024120847884349875, + "grad_norm": 6.657448338847534, + "learning_rate": 4.013513513513514e-06, + "loss": 0.5807, + "step": 297 + }, + { + "epoch": 0.024202062860391456, + "grad_norm": 6.049362925798347, + "learning_rate": 4.027027027027028e-06, + "loss": 0.614, + "step": 298 + }, + { + "epoch": 0.02428327783643304, + "grad_norm": 7.1798282609319415, + "learning_rate": 4.040540540540541e-06, + "loss": 0.6665, + "step": 299 + }, + { + "epoch": 0.02436449281247462, + "grad_norm": 15.049209513289172, + "learning_rate": 4.0540540540540545e-06, + "loss": 0.6214, + "step": 300 + }, + { + "epoch": 0.0244457077885162, + "grad_norm": 3.8053242555293796, + "learning_rate": 4.067567567567568e-06, + "loss": 0.9301, + "step": 301 + }, + { + "epoch": 0.024526922764557785, + "grad_norm": 5.676808515237133, + "learning_rate": 4.0810810810810815e-06, + "loss": 0.7352, + "step": 302 + }, + { + "epoch": 0.024608137740599365, + "grad_norm": 3.930706133352666, + "learning_rate": 4.0945945945945945e-06, + "loss": 0.6721, + "step": 303 + }, + { + "epoch": 0.02468935271664095, + "grad_norm": 4.260405802198655, + "learning_rate": 4.108108108108108e-06, + "loss": 0.6061, + "step": 304 + }, + { + "epoch": 0.02477056769268253, + "grad_norm": 3.8039307407950544, + "learning_rate": 4.121621621621622e-06, + "loss": 0.7034, + "step": 305 + }, + { + "epoch": 0.024851782668724114, + "grad_norm": 4.487160345392378, + "learning_rate": 4.135135135135135e-06, + "loss": 0.703, + "step": 306 + }, + { + "epoch": 0.024932997644765694, + "grad_norm": 7.679594903469531, + "learning_rate": 4.148648648648649e-06, + "loss": 0.6759, + "step": 307 + }, + { + "epoch": 0.025014212620807278, + "grad_norm": 3.8651431834889203, + "learning_rate": 4.162162162162163e-06, + "loss": 0.6642, + "step": 308 + }, + { + "epoch": 0.02509542759684886, + "grad_norm": 7.531093080992881, + "learning_rate": 4.175675675675676e-06, + "loss": 0.6803, + "step": 309 + }, + { + "epoch": 0.025176642572890442, + "grad_norm": 5.135386924538244, + "learning_rate": 4.189189189189189e-06, + "loss": 0.9062, + "step": 310 + }, + { + "epoch": 0.025257857548932023, + "grad_norm": 6.817605774173451, + "learning_rate": 4.202702702702703e-06, + "loss": 0.5363, + "step": 311 + }, + { + "epoch": 0.025339072524973603, + "grad_norm": 4.999220532436134, + "learning_rate": 4.216216216216217e-06, + "loss": 0.7114, + "step": 312 + }, + { + "epoch": 0.025420287501015187, + "grad_norm": 5.1328146811650495, + "learning_rate": 4.22972972972973e-06, + "loss": 0.6691, + "step": 313 + }, + { + "epoch": 0.025501502477056768, + "grad_norm": 5.421243989150761, + "learning_rate": 4.243243243243244e-06, + "loss": 0.5742, + "step": 314 + }, + { + "epoch": 0.025582717453098352, + "grad_norm": 5.473638240229686, + "learning_rate": 4.256756756756757e-06, + "loss": 0.6078, + "step": 315 + }, + { + "epoch": 0.025663932429139932, + "grad_norm": 4.005836520564742, + "learning_rate": 4.270270270270271e-06, + "loss": 0.6261, + "step": 316 + }, + { + "epoch": 0.025745147405181516, + "grad_norm": 4.474936034323712, + "learning_rate": 4.283783783783784e-06, + "loss": 0.7862, + "step": 317 + }, + { + "epoch": 0.025826362381223097, + "grad_norm": 7.022757009649874, + "learning_rate": 4.297297297297298e-06, + "loss": 0.7268, + "step": 318 + }, + { + "epoch": 0.02590757735726468, + "grad_norm": 4.756298282781381, + "learning_rate": 4.310810810810811e-06, + "loss": 0.9672, + "step": 319 + }, + { + "epoch": 0.02598879233330626, + "grad_norm": 3.7972155322987797, + "learning_rate": 4.324324324324325e-06, + "loss": 0.7338, + "step": 320 + }, + { + "epoch": 0.026070007309347845, + "grad_norm": 6.4304814258504, + "learning_rate": 4.3378378378378385e-06, + "loss": 0.618, + "step": 321 + }, + { + "epoch": 0.026151222285389426, + "grad_norm": 4.985880984448781, + "learning_rate": 4.351351351351352e-06, + "loss": 0.752, + "step": 322 + }, + { + "epoch": 0.02623243726143101, + "grad_norm": 6.965800077960176, + "learning_rate": 4.364864864864865e-06, + "loss": 0.6961, + "step": 323 + }, + { + "epoch": 0.02631365223747259, + "grad_norm": 5.990584472824659, + "learning_rate": 4.378378378378379e-06, + "loss": 0.68, + "step": 324 + }, + { + "epoch": 0.02639486721351417, + "grad_norm": 3.9564290883110647, + "learning_rate": 4.391891891891892e-06, + "loss": 0.6824, + "step": 325 + }, + { + "epoch": 0.026476082189555755, + "grad_norm": 4.712991398333253, + "learning_rate": 4.4054054054054054e-06, + "loss": 0.7004, + "step": 326 + }, + { + "epoch": 0.026557297165597335, + "grad_norm": 4.011893314732997, + "learning_rate": 4.418918918918919e-06, + "loss": 0.5713, + "step": 327 + }, + { + "epoch": 0.02663851214163892, + "grad_norm": 10.789632918330955, + "learning_rate": 4.432432432432433e-06, + "loss": 0.5751, + "step": 328 + }, + { + "epoch": 0.0267197271176805, + "grad_norm": 5.480368608439147, + "learning_rate": 4.445945945945946e-06, + "loss": 0.621, + "step": 329 + }, + { + "epoch": 0.026800942093722083, + "grad_norm": 5.7028472331846, + "learning_rate": 4.45945945945946e-06, + "loss": 0.6287, + "step": 330 + }, + { + "epoch": 0.026882157069763664, + "grad_norm": 4.278443647971252, + "learning_rate": 4.472972972972973e-06, + "loss": 0.7512, + "step": 331 + }, + { + "epoch": 0.026963372045805248, + "grad_norm": 5.540973909463085, + "learning_rate": 4.486486486486487e-06, + "loss": 0.6958, + "step": 332 + }, + { + "epoch": 0.02704458702184683, + "grad_norm": 8.48363442658351, + "learning_rate": 4.5e-06, + "loss": 0.6622, + "step": 333 + }, + { + "epoch": 0.027125801997888412, + "grad_norm": 5.265277541309016, + "learning_rate": 4.513513513513514e-06, + "loss": 0.7521, + "step": 334 + }, + { + "epoch": 0.027207016973929993, + "grad_norm": 6.782051797558551, + "learning_rate": 4.527027027027027e-06, + "loss": 0.5695, + "step": 335 + }, + { + "epoch": 0.027288231949971573, + "grad_norm": 7.707736747999393, + "learning_rate": 4.540540540540541e-06, + "loss": 0.7251, + "step": 336 + }, + { + "epoch": 0.027369446926013157, + "grad_norm": 9.996464962694214, + "learning_rate": 4.554054054054055e-06, + "loss": 0.7112, + "step": 337 + }, + { + "epoch": 0.027450661902054738, + "grad_norm": 5.660779752011064, + "learning_rate": 4.567567567567568e-06, + "loss": 0.7041, + "step": 338 + }, + { + "epoch": 0.02753187687809632, + "grad_norm": 6.58420296764706, + "learning_rate": 4.581081081081081e-06, + "loss": 0.7357, + "step": 339 + }, + { + "epoch": 0.027613091854137902, + "grad_norm": 6.154533828404867, + "learning_rate": 4.594594594594596e-06, + "loss": 0.8672, + "step": 340 + }, + { + "epoch": 0.027694306830179486, + "grad_norm": 3.6916825043347874, + "learning_rate": 4.608108108108109e-06, + "loss": 0.7116, + "step": 341 + }, + { + "epoch": 0.027775521806221067, + "grad_norm": 5.536606059040381, + "learning_rate": 4.621621621621622e-06, + "loss": 0.5388, + "step": 342 + }, + { + "epoch": 0.02785673678226265, + "grad_norm": 5.0131032158949305, + "learning_rate": 4.635135135135136e-06, + "loss": 0.63, + "step": 343 + }, + { + "epoch": 0.02793795175830423, + "grad_norm": 8.645264812499297, + "learning_rate": 4.6486486486486495e-06, + "loss": 0.6898, + "step": 344 + }, + { + "epoch": 0.028019166734345815, + "grad_norm": 8.845007826985789, + "learning_rate": 4.6621621621621625e-06, + "loss": 0.7366, + "step": 345 + }, + { + "epoch": 0.028100381710387395, + "grad_norm": 6.800329318246896, + "learning_rate": 4.675675675675676e-06, + "loss": 0.6626, + "step": 346 + }, + { + "epoch": 0.028181596686428976, + "grad_norm": 3.843752947734245, + "learning_rate": 4.6891891891891895e-06, + "loss": 0.9352, + "step": 347 + }, + { + "epoch": 0.02826281166247056, + "grad_norm": 4.717520289005672, + "learning_rate": 4.702702702702703e-06, + "loss": 0.558, + "step": 348 + }, + { + "epoch": 0.02834402663851214, + "grad_norm": 5.18843657142752, + "learning_rate": 4.716216216216216e-06, + "loss": 0.6733, + "step": 349 + }, + { + "epoch": 0.028425241614553724, + "grad_norm": 6.203969455338302, + "learning_rate": 4.72972972972973e-06, + "loss": 0.7618, + "step": 350 + }, + { + "epoch": 0.028506456590595305, + "grad_norm": 5.787025528310779, + "learning_rate": 4.743243243243243e-06, + "loss": 0.6203, + "step": 351 + }, + { + "epoch": 0.02858767156663689, + "grad_norm": 5.74015922758883, + "learning_rate": 4.756756756756757e-06, + "loss": 0.7899, + "step": 352 + }, + { + "epoch": 0.02866888654267847, + "grad_norm": 3.468789863957611, + "learning_rate": 4.770270270270271e-06, + "loss": 0.5911, + "step": 353 + }, + { + "epoch": 0.028750101518720053, + "grad_norm": 4.222539599204078, + "learning_rate": 4.783783783783784e-06, + "loss": 0.6077, + "step": 354 + }, + { + "epoch": 0.028831316494761634, + "grad_norm": 5.987825387101823, + "learning_rate": 4.797297297297297e-06, + "loss": 0.631, + "step": 355 + }, + { + "epoch": 0.028912531470803218, + "grad_norm": 3.494510978483797, + "learning_rate": 4.810810810810811e-06, + "loss": 0.6175, + "step": 356 + }, + { + "epoch": 0.028993746446844798, + "grad_norm": 5.341804933211992, + "learning_rate": 4.824324324324325e-06, + "loss": 0.8241, + "step": 357 + }, + { + "epoch": 0.02907496142288638, + "grad_norm": 5.332752685486192, + "learning_rate": 4.837837837837838e-06, + "loss": 0.6647, + "step": 358 + }, + { + "epoch": 0.029156176398927963, + "grad_norm": 4.174093461478239, + "learning_rate": 4.851351351351352e-06, + "loss": 0.681, + "step": 359 + }, + { + "epoch": 0.029237391374969543, + "grad_norm": 4.796111353710757, + "learning_rate": 4.864864864864866e-06, + "loss": 0.7803, + "step": 360 + }, + { + "epoch": 0.029318606351011127, + "grad_norm": 4.430094054311749, + "learning_rate": 4.878378378378379e-06, + "loss": 0.5628, + "step": 361 + }, + { + "epoch": 0.029399821327052707, + "grad_norm": 4.736436195711088, + "learning_rate": 4.891891891891893e-06, + "loss": 0.6637, + "step": 362 + }, + { + "epoch": 0.02948103630309429, + "grad_norm": 5.52908335708342, + "learning_rate": 4.905405405405406e-06, + "loss": 0.6167, + "step": 363 + }, + { + "epoch": 0.029562251279135872, + "grad_norm": 5.203669203589008, + "learning_rate": 4.91891891891892e-06, + "loss": 0.7164, + "step": 364 + }, + { + "epoch": 0.029643466255177456, + "grad_norm": 5.632857406640382, + "learning_rate": 4.932432432432433e-06, + "loss": 0.6265, + "step": 365 + }, + { + "epoch": 0.029724681231219036, + "grad_norm": 4.525898082221645, + "learning_rate": 4.9459459459459466e-06, + "loss": 0.6532, + "step": 366 + }, + { + "epoch": 0.02980589620726062, + "grad_norm": 3.993279600481055, + "learning_rate": 4.95945945945946e-06, + "loss": 0.6465, + "step": 367 + }, + { + "epoch": 0.0298871111833022, + "grad_norm": 4.976392222325483, + "learning_rate": 4.9729729729729735e-06, + "loss": 0.6768, + "step": 368 + }, + { + "epoch": 0.029968326159343785, + "grad_norm": 7.027194066182947, + "learning_rate": 4.986486486486487e-06, + "loss": 0.7201, + "step": 369 + }, + { + "epoch": 0.030049541135385365, + "grad_norm": 5.6159327874829605, + "learning_rate": 5e-06, + "loss": 0.6772, + "step": 370 + }, + { + "epoch": 0.030130756111426946, + "grad_norm": 4.876745048221957, + "learning_rate": 4.999999913506616e-06, + "loss": 0.6551, + "step": 371 + }, + { + "epoch": 0.03021197108746853, + "grad_norm": 11.157591685761751, + "learning_rate": 4.999999654026468e-06, + "loss": 0.5551, + "step": 372 + }, + { + "epoch": 0.03029318606351011, + "grad_norm": 4.729581532775085, + "learning_rate": 4.999999221559576e-06, + "loss": 0.7666, + "step": 373 + }, + { + "epoch": 0.030374401039551694, + "grad_norm": 5.683222450900519, + "learning_rate": 4.9999986161059685e-06, + "loss": 0.7107, + "step": 374 + }, + { + "epoch": 0.030455616015593275, + "grad_norm": 20.13916455542101, + "learning_rate": 4.9999978376656875e-06, + "loss": 0.5651, + "step": 375 + }, + { + "epoch": 0.03053683099163486, + "grad_norm": 4.663636959572572, + "learning_rate": 4.999996886238788e-06, + "loss": 0.6166, + "step": 376 + }, + { + "epoch": 0.03061804596767644, + "grad_norm": 3.547301470431729, + "learning_rate": 4.999995761825335e-06, + "loss": 0.6379, + "step": 377 + }, + { + "epoch": 0.030699260943718023, + "grad_norm": 4.026761047432411, + "learning_rate": 4.999994464425406e-06, + "loss": 0.6795, + "step": 378 + }, + { + "epoch": 0.030780475919759603, + "grad_norm": 4.177399049954423, + "learning_rate": 4.99999299403909e-06, + "loss": 0.6356, + "step": 379 + }, + { + "epoch": 0.030861690895801187, + "grad_norm": 8.689971440287565, + "learning_rate": 4.999991350666491e-06, + "loss": 0.5655, + "step": 380 + }, + { + "epoch": 0.030942905871842768, + "grad_norm": 4.286085314862286, + "learning_rate": 4.999989534307722e-06, + "loss": 0.7988, + "step": 381 + }, + { + "epoch": 0.03102412084788435, + "grad_norm": 4.468190941115001, + "learning_rate": 4.999987544962908e-06, + "loss": 0.6375, + "step": 382 + }, + { + "epoch": 0.031105335823925932, + "grad_norm": 11.088261247896137, + "learning_rate": 4.999985382632186e-06, + "loss": 0.6834, + "step": 383 + }, + { + "epoch": 0.031186550799967513, + "grad_norm": 4.746884755745409, + "learning_rate": 4.9999830473157065e-06, + "loss": 0.6825, + "step": 384 + }, + { + "epoch": 0.03126776577600909, + "grad_norm": 5.042936323367531, + "learning_rate": 4.9999805390136315e-06, + "loss": 0.6341, + "step": 385 + }, + { + "epoch": 0.03134898075205068, + "grad_norm": 4.6209559647373375, + "learning_rate": 4.999977857726135e-06, + "loss": 0.5923, + "step": 386 + }, + { + "epoch": 0.03143019572809226, + "grad_norm": 5.563156193085517, + "learning_rate": 4.999975003453401e-06, + "loss": 0.6442, + "step": 387 + }, + { + "epoch": 0.031511410704133845, + "grad_norm": 5.076759188588105, + "learning_rate": 4.999971976195628e-06, + "loss": 0.6856, + "step": 388 + }, + { + "epoch": 0.03159262568017542, + "grad_norm": 3.752599670476179, + "learning_rate": 4.999968775953025e-06, + "loss": 0.7078, + "step": 389 + }, + { + "epoch": 0.031673840656217006, + "grad_norm": 4.540276130175966, + "learning_rate": 4.999965402725812e-06, + "loss": 0.6747, + "step": 390 + }, + { + "epoch": 0.03175505563225859, + "grad_norm": 6.85554781706532, + "learning_rate": 4.999961856514226e-06, + "loss": 0.58, + "step": 391 + }, + { + "epoch": 0.031836270608300174, + "grad_norm": 4.470561799528824, + "learning_rate": 4.99995813731851e-06, + "loss": 0.544, + "step": 392 + }, + { + "epoch": 0.03191748558434175, + "grad_norm": 3.381515929810499, + "learning_rate": 4.999954245138921e-06, + "loss": 0.6551, + "step": 393 + }, + { + "epoch": 0.031998700560383335, + "grad_norm": 5.642691704207667, + "learning_rate": 4.99995017997573e-06, + "loss": 0.7476, + "step": 394 + }, + { + "epoch": 0.03207991553642492, + "grad_norm": 4.423437336138073, + "learning_rate": 4.999945941829217e-06, + "loss": 0.6501, + "step": 395 + }, + { + "epoch": 0.032161130512466496, + "grad_norm": 5.470944485898885, + "learning_rate": 4.999941530699675e-06, + "loss": 0.5093, + "step": 396 + }, + { + "epoch": 0.03224234548850808, + "grad_norm": 6.234514653013104, + "learning_rate": 4.999936946587412e-06, + "loss": 0.5785, + "step": 397 + }, + { + "epoch": 0.032323560464549664, + "grad_norm": 4.217022001267068, + "learning_rate": 4.999932189492741e-06, + "loss": 0.6715, + "step": 398 + }, + { + "epoch": 0.03240477544059125, + "grad_norm": 3.997512319922242, + "learning_rate": 4.999927259415994e-06, + "loss": 0.6798, + "step": 399 + }, + { + "epoch": 0.032485990416632825, + "grad_norm": 5.231675871788692, + "learning_rate": 4.99992215635751e-06, + "loss": 0.581, + "step": 400 + }, + { + "epoch": 0.03256720539267441, + "grad_norm": 6.544773650290789, + "learning_rate": 4.999916880317645e-06, + "loss": 0.608, + "step": 401 + }, + { + "epoch": 0.03264842036871599, + "grad_norm": 5.966992739092824, + "learning_rate": 4.999911431296762e-06, + "loss": 0.6306, + "step": 402 + }, + { + "epoch": 0.03272963534475758, + "grad_norm": 9.143165980782102, + "learning_rate": 4.999905809295239e-06, + "loss": 0.7834, + "step": 403 + }, + { + "epoch": 0.032810850320799154, + "grad_norm": 5.104500215594029, + "learning_rate": 4.999900014313464e-06, + "loss": 0.5444, + "step": 404 + }, + { + "epoch": 0.03289206529684074, + "grad_norm": 3.982825484617581, + "learning_rate": 4.999894046351839e-06, + "loss": 0.6663, + "step": 405 + }, + { + "epoch": 0.03297328027288232, + "grad_norm": 9.429533561374097, + "learning_rate": 4.999887905410775e-06, + "loss": 0.6111, + "step": 406 + }, + { + "epoch": 0.0330544952489239, + "grad_norm": 6.206743996051552, + "learning_rate": 4.9998815914907e-06, + "loss": 0.6354, + "step": 407 + }, + { + "epoch": 0.03313571022496548, + "grad_norm": 8.266987795691529, + "learning_rate": 4.9998751045920494e-06, + "loss": 0.5287, + "step": 408 + }, + { + "epoch": 0.033216925201007066, + "grad_norm": 5.87817379773943, + "learning_rate": 4.999868444715271e-06, + "loss": 0.7346, + "step": 409 + }, + { + "epoch": 0.03329814017704865, + "grad_norm": 5.524749698493624, + "learning_rate": 4.999861611860827e-06, + "loss": 0.702, + "step": 410 + }, + { + "epoch": 0.03337935515309023, + "grad_norm": 4.172513273585029, + "learning_rate": 4.99985460602919e-06, + "loss": 0.6599, + "step": 411 + }, + { + "epoch": 0.03346057012913181, + "grad_norm": 4.750265866846706, + "learning_rate": 4.9998474272208445e-06, + "loss": 0.5904, + "step": 412 + }, + { + "epoch": 0.033541785105173395, + "grad_norm": 5.899694142476412, + "learning_rate": 4.999840075436286e-06, + "loss": 0.5972, + "step": 413 + }, + { + "epoch": 0.03362300008121498, + "grad_norm": 8.320548493491238, + "learning_rate": 4.999832550676026e-06, + "loss": 0.5492, + "step": 414 + }, + { + "epoch": 0.033704215057256556, + "grad_norm": 4.509397796756542, + "learning_rate": 4.999824852940583e-06, + "loss": 0.7786, + "step": 415 + }, + { + "epoch": 0.03378543003329814, + "grad_norm": 5.616284004220855, + "learning_rate": 4.999816982230491e-06, + "loss": 0.7635, + "step": 416 + }, + { + "epoch": 0.033866645009339724, + "grad_norm": 5.599669292826977, + "learning_rate": 4.999808938546294e-06, + "loss": 0.759, + "step": 417 + }, + { + "epoch": 0.0339478599853813, + "grad_norm": 6.6642180967818945, + "learning_rate": 4.999800721888548e-06, + "loss": 0.6626, + "step": 418 + }, + { + "epoch": 0.034029074961422885, + "grad_norm": 5.882885604144428, + "learning_rate": 4.999792332257822e-06, + "loss": 0.8418, + "step": 419 + }, + { + "epoch": 0.03411028993746447, + "grad_norm": 4.363979980545484, + "learning_rate": 4.999783769654697e-06, + "loss": 0.6134, + "step": 420 + }, + { + "epoch": 0.03419150491350605, + "grad_norm": 4.776238235905251, + "learning_rate": 4.999775034079765e-06, + "loss": 0.7331, + "step": 421 + }, + { + "epoch": 0.03427271988954763, + "grad_norm": 4.945197802004096, + "learning_rate": 4.99976612553363e-06, + "loss": 0.692, + "step": 422 + }, + { + "epoch": 0.034353934865589214, + "grad_norm": 3.1538233404162628, + "learning_rate": 4.999757044016909e-06, + "loss": 0.7029, + "step": 423 + }, + { + "epoch": 0.0344351498416308, + "grad_norm": 5.660874509416241, + "learning_rate": 4.99974778953023e-06, + "loss": 0.6652, + "step": 424 + }, + { + "epoch": 0.03451636481767238, + "grad_norm": 3.5726277574942857, + "learning_rate": 4.9997383620742354e-06, + "loss": 0.7189, + "step": 425 + }, + { + "epoch": 0.03459757979371396, + "grad_norm": 5.076928873914476, + "learning_rate": 4.9997287616495745e-06, + "loss": 0.6376, + "step": 426 + }, + { + "epoch": 0.03467879476975554, + "grad_norm": 7.805331425608438, + "learning_rate": 4.999718988256913e-06, + "loss": 0.6691, + "step": 427 + }, + { + "epoch": 0.03476000974579713, + "grad_norm": 6.422161317682913, + "learning_rate": 4.999709041896927e-06, + "loss": 0.6528, + "step": 428 + }, + { + "epoch": 0.034841224721838704, + "grad_norm": 4.561480725928831, + "learning_rate": 4.9996989225703055e-06, + "loss": 0.5714, + "step": 429 + }, + { + "epoch": 0.03492243969788029, + "grad_norm": 5.799407616311238, + "learning_rate": 4.9996886302777466e-06, + "loss": 0.6321, + "step": 430 + }, + { + "epoch": 0.03500365467392187, + "grad_norm": 6.86134449223638, + "learning_rate": 4.9996781650199655e-06, + "loss": 0.6045, + "step": 431 + }, + { + "epoch": 0.035084869649963456, + "grad_norm": 4.7834450971379034, + "learning_rate": 4.999667526797685e-06, + "loss": 0.8719, + "step": 432 + }, + { + "epoch": 0.03516608462600503, + "grad_norm": 8.624024638101803, + "learning_rate": 4.9996567156116395e-06, + "loss": 0.7095, + "step": 433 + }, + { + "epoch": 0.03524729960204662, + "grad_norm": 4.366180557531383, + "learning_rate": 4.9996457314625794e-06, + "loss": 0.5537, + "step": 434 + }, + { + "epoch": 0.0353285145780882, + "grad_norm": 5.128282050398408, + "learning_rate": 4.9996345743512635e-06, + "loss": 0.8241, + "step": 435 + }, + { + "epoch": 0.035409729554129785, + "grad_norm": 5.611359253780588, + "learning_rate": 4.999623244278464e-06, + "loss": 0.6241, + "step": 436 + }, + { + "epoch": 0.03549094453017136, + "grad_norm": 14.430464729028463, + "learning_rate": 4.999611741244965e-06, + "loss": 0.5835, + "step": 437 + }, + { + "epoch": 0.035572159506212946, + "grad_norm": 5.4664011456664285, + "learning_rate": 4.999600065251563e-06, + "loss": 0.6353, + "step": 438 + }, + { + "epoch": 0.03565337448225453, + "grad_norm": 3.9622611695813172, + "learning_rate": 4.999588216299065e-06, + "loss": 0.5976, + "step": 439 + }, + { + "epoch": 0.03573458945829611, + "grad_norm": 5.613516770561363, + "learning_rate": 4.999576194388292e-06, + "loss": 0.7192, + "step": 440 + }, + { + "epoch": 0.03581580443433769, + "grad_norm": 4.503094326677596, + "learning_rate": 4.999563999520075e-06, + "loss": 0.587, + "step": 441 + }, + { + "epoch": 0.035897019410379274, + "grad_norm": 4.636439079968228, + "learning_rate": 4.999551631695257e-06, + "loss": 0.4959, + "step": 442 + }, + { + "epoch": 0.03597823438642086, + "grad_norm": 4.842745847770766, + "learning_rate": 4.999539090914696e-06, + "loss": 0.8282, + "step": 443 + }, + { + "epoch": 0.036059449362462435, + "grad_norm": 19.890875208483, + "learning_rate": 4.999526377179259e-06, + "loss": 0.6707, + "step": 444 + }, + { + "epoch": 0.03614066433850402, + "grad_norm": 4.903212810732332, + "learning_rate": 4.999513490489824e-06, + "loss": 0.5994, + "step": 445 + }, + { + "epoch": 0.0362218793145456, + "grad_norm": 4.394920629425536, + "learning_rate": 4.999500430847284e-06, + "loss": 0.667, + "step": 446 + }, + { + "epoch": 0.03630309429058719, + "grad_norm": 3.62470443817531, + "learning_rate": 4.9994871982525425e-06, + "loss": 0.703, + "step": 447 + }, + { + "epoch": 0.036384309266628764, + "grad_norm": 6.417116392361057, + "learning_rate": 4.999473792706516e-06, + "loss": 0.6643, + "step": 448 + }, + { + "epoch": 0.03646552424267035, + "grad_norm": 4.999543742426304, + "learning_rate": 4.999460214210131e-06, + "loss": 0.7914, + "step": 449 + }, + { + "epoch": 0.03654673921871193, + "grad_norm": 5.788280407646206, + "learning_rate": 4.999446462764327e-06, + "loss": 0.6302, + "step": 450 + }, + { + "epoch": 0.03662795419475351, + "grad_norm": 4.2978695329143495, + "learning_rate": 4.999432538370057e-06, + "loss": 0.6356, + "step": 451 + }, + { + "epoch": 0.03670916917079509, + "grad_norm": 4.224970563305071, + "learning_rate": 4.999418441028283e-06, + "loss": 0.7923, + "step": 452 + }, + { + "epoch": 0.03679038414683668, + "grad_norm": 6.438067501026122, + "learning_rate": 4.9994041707399794e-06, + "loss": 0.4936, + "step": 453 + }, + { + "epoch": 0.03687159912287826, + "grad_norm": 4.579554653548933, + "learning_rate": 4.999389727506137e-06, + "loss": 0.58, + "step": 454 + }, + { + "epoch": 0.03695281409891984, + "grad_norm": 4.4684771869350035, + "learning_rate": 4.999375111327753e-06, + "loss": 0.5475, + "step": 455 + }, + { + "epoch": 0.03703402907496142, + "grad_norm": 7.907710639614374, + "learning_rate": 4.999360322205838e-06, + "loss": 0.6459, + "step": 456 + }, + { + "epoch": 0.037115244051003006, + "grad_norm": 5.848886945428208, + "learning_rate": 4.999345360141417e-06, + "loss": 0.6723, + "step": 457 + }, + { + "epoch": 0.03719645902704459, + "grad_norm": 3.055736509730621, + "learning_rate": 4.999330225135525e-06, + "loss": 0.7079, + "step": 458 + }, + { + "epoch": 0.03727767400308617, + "grad_norm": 4.84395863310794, + "learning_rate": 4.999314917189209e-06, + "loss": 0.8212, + "step": 459 + }, + { + "epoch": 0.03735888897912775, + "grad_norm": 4.4914093742426, + "learning_rate": 4.999299436303527e-06, + "loss": 0.7551, + "step": 460 + }, + { + "epoch": 0.037440103955169335, + "grad_norm": 5.747535926175868, + "learning_rate": 4.999283782479552e-06, + "loss": 0.755, + "step": 461 + }, + { + "epoch": 0.03752131893121091, + "grad_norm": 4.124807094235786, + "learning_rate": 4.999267955718367e-06, + "loss": 0.6151, + "step": 462 + }, + { + "epoch": 0.037602533907252496, + "grad_norm": 4.951699574302496, + "learning_rate": 4.999251956021066e-06, + "loss": 0.6636, + "step": 463 + }, + { + "epoch": 0.03768374888329408, + "grad_norm": 3.634905291149469, + "learning_rate": 4.999235783388757e-06, + "loss": 0.7115, + "step": 464 + }, + { + "epoch": 0.037764963859335664, + "grad_norm": 7.068387813096013, + "learning_rate": 4.999219437822559e-06, + "loss": 0.689, + "step": 465 + }, + { + "epoch": 0.03784617883537724, + "grad_norm": 4.404856833972122, + "learning_rate": 4.999202919323603e-06, + "loss": 0.6214, + "step": 466 + }, + { + "epoch": 0.037927393811418825, + "grad_norm": 4.578684856908467, + "learning_rate": 4.9991862278930315e-06, + "loss": 0.7647, + "step": 467 + }, + { + "epoch": 0.03800860878746041, + "grad_norm": 4.898513292964633, + "learning_rate": 4.9991693635320005e-06, + "loss": 0.6884, + "step": 468 + }, + { + "epoch": 0.03808982376350199, + "grad_norm": 4.575368944744301, + "learning_rate": 4.999152326241675e-06, + "loss": 0.6647, + "step": 469 + }, + { + "epoch": 0.03817103873954357, + "grad_norm": 4.238070688134001, + "learning_rate": 4.999135116023236e-06, + "loss": 0.6667, + "step": 470 + }, + { + "epoch": 0.038252253715585154, + "grad_norm": 6.035566032840847, + "learning_rate": 4.999117732877873e-06, + "loss": 0.6754, + "step": 471 + }, + { + "epoch": 0.03833346869162674, + "grad_norm": 3.607010706319493, + "learning_rate": 4.9991001768067895e-06, + "loss": 0.8115, + "step": 472 + }, + { + "epoch": 0.03841468366766832, + "grad_norm": 6.191780527301594, + "learning_rate": 4.9990824478112e-06, + "loss": 0.6837, + "step": 473 + }, + { + "epoch": 0.0384958986437099, + "grad_norm": 5.355617747355118, + "learning_rate": 4.999064545892331e-06, + "loss": 0.7097, + "step": 474 + }, + { + "epoch": 0.03857711361975148, + "grad_norm": 4.233673909331852, + "learning_rate": 4.999046471051422e-06, + "loss": 0.5915, + "step": 475 + }, + { + "epoch": 0.038658328595793066, + "grad_norm": 4.780173065160517, + "learning_rate": 4.999028223289724e-06, + "loss": 0.9316, + "step": 476 + }, + { + "epoch": 0.03873954357183464, + "grad_norm": 4.063084409048435, + "learning_rate": 4.999009802608497e-06, + "loss": 0.6671, + "step": 477 + }, + { + "epoch": 0.03882075854787623, + "grad_norm": 4.851439856688829, + "learning_rate": 4.998991209009019e-06, + "loss": 0.8034, + "step": 478 + }, + { + "epoch": 0.03890197352391781, + "grad_norm": 7.105889513775342, + "learning_rate": 4.998972442492575e-06, + "loss": 0.532, + "step": 479 + }, + { + "epoch": 0.038983188499959395, + "grad_norm": 3.8451222439770683, + "learning_rate": 4.9989535030604615e-06, + "loss": 0.5993, + "step": 480 + }, + { + "epoch": 0.03906440347600097, + "grad_norm": 5.621997119878575, + "learning_rate": 4.998934390713994e-06, + "loss": 0.5273, + "step": 481 + }, + { + "epoch": 0.039145618452042556, + "grad_norm": 4.81307738692358, + "learning_rate": 4.9989151054544905e-06, + "loss": 0.6318, + "step": 482 + }, + { + "epoch": 0.03922683342808414, + "grad_norm": 3.849217538759433, + "learning_rate": 4.998895647283287e-06, + "loss": 0.6886, + "step": 483 + }, + { + "epoch": 0.039308048404125724, + "grad_norm": 3.909888024088152, + "learning_rate": 4.99887601620173e-06, + "loss": 0.7443, + "step": 484 + }, + { + "epoch": 0.0393892633801673, + "grad_norm": 5.4327964670836835, + "learning_rate": 4.9988562122111785e-06, + "loss": 0.6783, + "step": 485 + }, + { + "epoch": 0.039470478356208885, + "grad_norm": 4.60325431470408, + "learning_rate": 4.998836235313001e-06, + "loss": 0.6589, + "step": 486 + }, + { + "epoch": 0.03955169333225047, + "grad_norm": 4.5476339338388945, + "learning_rate": 4.998816085508582e-06, + "loss": 0.5747, + "step": 487 + }, + { + "epoch": 0.039632908308292046, + "grad_norm": 4.970415173334207, + "learning_rate": 4.9987957627993145e-06, + "loss": 0.6026, + "step": 488 + }, + { + "epoch": 0.03971412328433363, + "grad_norm": 4.315065332489597, + "learning_rate": 4.998775267186605e-06, + "loss": 0.7375, + "step": 489 + }, + { + "epoch": 0.039795338260375214, + "grad_norm": 5.5992123809226, + "learning_rate": 4.998754598671871e-06, + "loss": 0.6427, + "step": 490 + }, + { + "epoch": 0.0398765532364168, + "grad_norm": 3.7014622620168405, + "learning_rate": 4.998733757256544e-06, + "loss": 0.7897, + "step": 491 + }, + { + "epoch": 0.039957768212458375, + "grad_norm": 6.7379942751786235, + "learning_rate": 4.998712742942065e-06, + "loss": 0.6393, + "step": 492 + }, + { + "epoch": 0.04003898318849996, + "grad_norm": 4.857719068955753, + "learning_rate": 4.998691555729888e-06, + "loss": 0.6606, + "step": 493 + }, + { + "epoch": 0.04012019816454154, + "grad_norm": 4.0539838554657885, + "learning_rate": 4.9986701956214804e-06, + "loss": 0.7552, + "step": 494 + }, + { + "epoch": 0.04020141314058313, + "grad_norm": 4.957439145405436, + "learning_rate": 4.998648662618318e-06, + "loss": 0.666, + "step": 495 + }, + { + "epoch": 0.040282628116624704, + "grad_norm": 4.953757290265213, + "learning_rate": 4.998626956721894e-06, + "loss": 0.6508, + "step": 496 + }, + { + "epoch": 0.04036384309266629, + "grad_norm": 3.84200026766362, + "learning_rate": 4.998605077933706e-06, + "loss": 0.6307, + "step": 497 + }, + { + "epoch": 0.04044505806870787, + "grad_norm": 6.038380469894045, + "learning_rate": 4.998583026255272e-06, + "loss": 0.794, + "step": 498 + }, + { + "epoch": 0.04052627304474945, + "grad_norm": 5.862309248926957, + "learning_rate": 4.998560801688116e-06, + "loss": 0.5471, + "step": 499 + }, + { + "epoch": 0.04060748802079103, + "grad_norm": 5.376571375980098, + "learning_rate": 4.998538404233776e-06, + "loss": 0.6685, + "step": 500 + }, + { + "epoch": 0.04068870299683262, + "grad_norm": 3.972813169207523, + "learning_rate": 4.998515833893801e-06, + "loss": 0.5943, + "step": 501 + }, + { + "epoch": 0.0407699179728742, + "grad_norm": 5.457428585886831, + "learning_rate": 4.998493090669754e-06, + "loss": 0.6821, + "step": 502 + }, + { + "epoch": 0.04085113294891578, + "grad_norm": 5.526128969258129, + "learning_rate": 4.998470174563208e-06, + "loss": 0.6268, + "step": 503 + }, + { + "epoch": 0.04093234792495736, + "grad_norm": 5.00358557141072, + "learning_rate": 4.9984470855757485e-06, + "loss": 0.5511, + "step": 504 + }, + { + "epoch": 0.041013562900998946, + "grad_norm": 9.228964746118006, + "learning_rate": 4.998423823708974e-06, + "loss": 0.7192, + "step": 505 + }, + { + "epoch": 0.04109477787704053, + "grad_norm": 7.043937824113614, + "learning_rate": 4.998400388964494e-06, + "loss": 0.6898, + "step": 506 + }, + { + "epoch": 0.041175992853082106, + "grad_norm": 7.469052434654404, + "learning_rate": 4.998376781343929e-06, + "loss": 0.6673, + "step": 507 + }, + { + "epoch": 0.04125720782912369, + "grad_norm": 5.819947119265823, + "learning_rate": 4.998353000848913e-06, + "loss": 0.8352, + "step": 508 + }, + { + "epoch": 0.041338422805165274, + "grad_norm": 4.632267764249478, + "learning_rate": 4.998329047481093e-06, + "loss": 0.6627, + "step": 509 + }, + { + "epoch": 0.04141963778120685, + "grad_norm": 5.04622538690542, + "learning_rate": 4.998304921242124e-06, + "loss": 0.6928, + "step": 510 + }, + { + "epoch": 0.041500852757248435, + "grad_norm": 4.929054550494181, + "learning_rate": 4.998280622133677e-06, + "loss": 0.6864, + "step": 511 + }, + { + "epoch": 0.04158206773329002, + "grad_norm": 5.630142199208348, + "learning_rate": 4.998256150157433e-06, + "loss": 0.5752, + "step": 512 + }, + { + "epoch": 0.0416632827093316, + "grad_norm": 4.327018939734424, + "learning_rate": 4.998231505315085e-06, + "loss": 0.6679, + "step": 513 + }, + { + "epoch": 0.04174449768537318, + "grad_norm": 5.207414191020043, + "learning_rate": 4.998206687608339e-06, + "loss": 0.5235, + "step": 514 + }, + { + "epoch": 0.041825712661414764, + "grad_norm": 5.078181936565571, + "learning_rate": 4.998181697038912e-06, + "loss": 0.5066, + "step": 515 + }, + { + "epoch": 0.04190692763745635, + "grad_norm": 4.596975319564036, + "learning_rate": 4.998156533608531e-06, + "loss": 0.5846, + "step": 516 + }, + { + "epoch": 0.04198814261349793, + "grad_norm": 6.0532848525291545, + "learning_rate": 4.998131197318942e-06, + "loss": 0.5737, + "step": 517 + }, + { + "epoch": 0.04206935758953951, + "grad_norm": 4.35820106724424, + "learning_rate": 4.998105688171893e-06, + "loss": 0.6762, + "step": 518 + }, + { + "epoch": 0.04215057256558109, + "grad_norm": 6.664678597912458, + "learning_rate": 4.998080006169153e-06, + "loss": 0.5852, + "step": 519 + }, + { + "epoch": 0.04223178754162268, + "grad_norm": 8.897090441557413, + "learning_rate": 4.9980541513124966e-06, + "loss": 0.7535, + "step": 520 + }, + { + "epoch": 0.042313002517664254, + "grad_norm": 5.576078105545307, + "learning_rate": 4.998028123603714e-06, + "loss": 0.5836, + "step": 521 + }, + { + "epoch": 0.04239421749370584, + "grad_norm": 5.498839425247008, + "learning_rate": 4.998001923044605e-06, + "loss": 0.638, + "step": 522 + }, + { + "epoch": 0.04247543246974742, + "grad_norm": 7.495880701431175, + "learning_rate": 4.997975549636985e-06, + "loss": 0.6051, + "step": 523 + }, + { + "epoch": 0.042556647445789006, + "grad_norm": 8.634014617651163, + "learning_rate": 4.997949003382676e-06, + "loss": 0.5373, + "step": 524 + }, + { + "epoch": 0.04263786242183058, + "grad_norm": 3.6876707339560153, + "learning_rate": 4.997922284283517e-06, + "loss": 0.8439, + "step": 525 + }, + { + "epoch": 0.04271907739787217, + "grad_norm": 4.386768122021177, + "learning_rate": 4.997895392341356e-06, + "loss": 0.9273, + "step": 526 + }, + { + "epoch": 0.04280029237391375, + "grad_norm": 4.887505724579102, + "learning_rate": 4.997868327558053e-06, + "loss": 1.0235, + "step": 527 + }, + { + "epoch": 0.042881507349955335, + "grad_norm": 4.002537001522704, + "learning_rate": 4.997841089935482e-06, + "loss": 0.7851, + "step": 528 + }, + { + "epoch": 0.04296272232599691, + "grad_norm": 7.16699779080668, + "learning_rate": 4.997813679475528e-06, + "loss": 0.7571, + "step": 529 + }, + { + "epoch": 0.043043937302038496, + "grad_norm": 4.775024972507428, + "learning_rate": 4.997786096180086e-06, + "loss": 0.721, + "step": 530 + }, + { + "epoch": 0.04312515227808008, + "grad_norm": 5.099681936395401, + "learning_rate": 4.997758340051066e-06, + "loss": 0.5692, + "step": 531 + }, + { + "epoch": 0.04320636725412166, + "grad_norm": 4.818659941047612, + "learning_rate": 4.997730411090387e-06, + "loss": 0.643, + "step": 532 + }, + { + "epoch": 0.04328758223016324, + "grad_norm": 6.604728440380809, + "learning_rate": 4.997702309299983e-06, + "loss": 0.7149, + "step": 533 + }, + { + "epoch": 0.043368797206204825, + "grad_norm": 5.6047753258513415, + "learning_rate": 4.997674034681799e-06, + "loss": 0.6094, + "step": 534 + }, + { + "epoch": 0.04345001218224641, + "grad_norm": 5.772050070771113, + "learning_rate": 4.99764558723779e-06, + "loss": 0.5527, + "step": 535 + }, + { + "epoch": 0.043531227158287986, + "grad_norm": 6.695215883564865, + "learning_rate": 4.997616966969925e-06, + "loss": 0.8073, + "step": 536 + }, + { + "epoch": 0.04361244213432957, + "grad_norm": 5.0342584361934035, + "learning_rate": 4.997588173880184e-06, + "loss": 0.7011, + "step": 537 + }, + { + "epoch": 0.043693657110371154, + "grad_norm": 4.746592585436567, + "learning_rate": 4.99755920797056e-06, + "loss": 0.6319, + "step": 538 + }, + { + "epoch": 0.04377487208641274, + "grad_norm": 5.2897074099894095, + "learning_rate": 4.997530069243057e-06, + "loss": 0.5789, + "step": 539 + }, + { + "epoch": 0.043856087062454314, + "grad_norm": 3.8007318490045416, + "learning_rate": 4.997500757699691e-06, + "loss": 0.6953, + "step": 540 + }, + { + "epoch": 0.0439373020384959, + "grad_norm": 5.144861861634209, + "learning_rate": 4.9974712733424905e-06, + "loss": 0.621, + "step": 541 + }, + { + "epoch": 0.04401851701453748, + "grad_norm": 5.905282410348868, + "learning_rate": 4.997441616173495e-06, + "loss": 0.6511, + "step": 542 + }, + { + "epoch": 0.04409973199057906, + "grad_norm": 5.309964350881816, + "learning_rate": 4.997411786194758e-06, + "loss": 0.5526, + "step": 543 + }, + { + "epoch": 0.04418094696662064, + "grad_norm": 11.149541991220845, + "learning_rate": 4.997381783408343e-06, + "loss": 0.6067, + "step": 544 + }, + { + "epoch": 0.04426216194266223, + "grad_norm": 5.410844505382304, + "learning_rate": 4.9973516078163256e-06, + "loss": 0.6418, + "step": 545 + }, + { + "epoch": 0.04434337691870381, + "grad_norm": 5.430764606424544, + "learning_rate": 4.997321259420793e-06, + "loss": 0.6921, + "step": 546 + }, + { + "epoch": 0.04442459189474539, + "grad_norm": 11.412874034754271, + "learning_rate": 4.997290738223847e-06, + "loss": 0.5558, + "step": 547 + }, + { + "epoch": 0.04450580687078697, + "grad_norm": 4.133154872685264, + "learning_rate": 4.9972600442275985e-06, + "loss": 0.6049, + "step": 548 + }, + { + "epoch": 0.044587021846828556, + "grad_norm": 5.592148797903198, + "learning_rate": 4.997229177434171e-06, + "loss": 0.5138, + "step": 549 + }, + { + "epoch": 0.04466823682287014, + "grad_norm": 8.487263757967648, + "learning_rate": 4.997198137845702e-06, + "loss": 0.7193, + "step": 550 + }, + { + "epoch": 0.04474945179891172, + "grad_norm": 7.830506522230207, + "learning_rate": 4.997166925464337e-06, + "loss": 0.7594, + "step": 551 + }, + { + "epoch": 0.0448306667749533, + "grad_norm": 8.206186247100188, + "learning_rate": 4.997135540292237e-06, + "loss": 0.5762, + "step": 552 + }, + { + "epoch": 0.044911881750994885, + "grad_norm": 4.437021611163288, + "learning_rate": 4.997103982331574e-06, + "loss": 0.5914, + "step": 553 + }, + { + "epoch": 0.04499309672703646, + "grad_norm": 5.38722516332683, + "learning_rate": 4.997072251584531e-06, + "loss": 0.5335, + "step": 554 + }, + { + "epoch": 0.045074311703078046, + "grad_norm": 7.791956117401669, + "learning_rate": 4.997040348053304e-06, + "loss": 0.6441, + "step": 555 + }, + { + "epoch": 0.04515552667911963, + "grad_norm": 4.66481520418914, + "learning_rate": 4.9970082717401e-06, + "loss": 0.8322, + "step": 556 + }, + { + "epoch": 0.045236741655161214, + "grad_norm": 5.615716066994191, + "learning_rate": 4.9969760226471385e-06, + "loss": 0.6918, + "step": 557 + }, + { + "epoch": 0.04531795663120279, + "grad_norm": 5.424151701577734, + "learning_rate": 4.9969436007766514e-06, + "loss": 0.5777, + "step": 558 + }, + { + "epoch": 0.045399171607244375, + "grad_norm": 4.7203985695060195, + "learning_rate": 4.9969110061308826e-06, + "loss": 0.7466, + "step": 559 + }, + { + "epoch": 0.04548038658328596, + "grad_norm": 4.730334252790068, + "learning_rate": 4.996878238712087e-06, + "loss": 0.679, + "step": 560 + }, + { + "epoch": 0.04556160155932754, + "grad_norm": 3.979712937320014, + "learning_rate": 4.996845298522531e-06, + "loss": 0.5825, + "step": 561 + }, + { + "epoch": 0.04564281653536912, + "grad_norm": 6.730522946218954, + "learning_rate": 4.996812185564496e-06, + "loss": 0.6467, + "step": 562 + }, + { + "epoch": 0.045724031511410704, + "grad_norm": 3.33620867884503, + "learning_rate": 4.99677889984027e-06, + "loss": 0.7073, + "step": 563 + }, + { + "epoch": 0.04580524648745229, + "grad_norm": 5.926130876855334, + "learning_rate": 4.996745441352159e-06, + "loss": 0.667, + "step": 564 + }, + { + "epoch": 0.04588646146349387, + "grad_norm": 5.572153026812714, + "learning_rate": 4.996711810102478e-06, + "loss": 0.8168, + "step": 565 + }, + { + "epoch": 0.04596767643953545, + "grad_norm": 8.174580316906296, + "learning_rate": 4.996678006093553e-06, + "loss": 0.6412, + "step": 566 + }, + { + "epoch": 0.04604889141557703, + "grad_norm": 4.09571754078007, + "learning_rate": 4.996644029327723e-06, + "loss": 0.6742, + "step": 567 + }, + { + "epoch": 0.04613010639161862, + "grad_norm": 4.109112417478182, + "learning_rate": 4.996609879807341e-06, + "loss": 0.8325, + "step": 568 + }, + { + "epoch": 0.046211321367660194, + "grad_norm": 4.752061462760559, + "learning_rate": 4.9965755575347665e-06, + "loss": 0.6972, + "step": 569 + }, + { + "epoch": 0.04629253634370178, + "grad_norm": 3.9521387750727524, + "learning_rate": 4.996541062512377e-06, + "loss": 0.5081, + "step": 570 + }, + { + "epoch": 0.04637375131974336, + "grad_norm": 7.487630136380704, + "learning_rate": 4.996506394742559e-06, + "loss": 0.5242, + "step": 571 + }, + { + "epoch": 0.046454966295784945, + "grad_norm": 5.41182957872003, + "learning_rate": 4.996471554227711e-06, + "loss": 0.6066, + "step": 572 + }, + { + "epoch": 0.04653618127182652, + "grad_norm": 4.448050180267281, + "learning_rate": 4.996436540970243e-06, + "loss": 0.647, + "step": 573 + }, + { + "epoch": 0.046617396247868106, + "grad_norm": 7.285104243616152, + "learning_rate": 4.99640135497258e-06, + "loss": 0.5929, + "step": 574 + }, + { + "epoch": 0.04669861122390969, + "grad_norm": 3.4657873419662413, + "learning_rate": 4.996365996237155e-06, + "loss": 0.616, + "step": 575 + }, + { + "epoch": 0.046779826199951274, + "grad_norm": 6.130204069422916, + "learning_rate": 4.996330464766414e-06, + "loss": 0.7037, + "step": 576 + }, + { + "epoch": 0.04686104117599285, + "grad_norm": 4.045238400737803, + "learning_rate": 4.996294760562817e-06, + "loss": 0.735, + "step": 577 + }, + { + "epoch": 0.046942256152034435, + "grad_norm": 4.541687240117253, + "learning_rate": 4.996258883628834e-06, + "loss": 0.5942, + "step": 578 + }, + { + "epoch": 0.04702347112807602, + "grad_norm": 3.8724612963620326, + "learning_rate": 4.996222833966947e-06, + "loss": 0.5935, + "step": 579 + }, + { + "epoch": 0.047104686104117596, + "grad_norm": 6.548773937549966, + "learning_rate": 4.996186611579652e-06, + "loss": 0.5016, + "step": 580 + }, + { + "epoch": 0.04718590108015918, + "grad_norm": 4.80061928529426, + "learning_rate": 4.996150216469454e-06, + "loss": 0.7185, + "step": 581 + }, + { + "epoch": 0.047267116056200764, + "grad_norm": 3.5697724511020974, + "learning_rate": 4.996113648638872e-06, + "loss": 0.6544, + "step": 582 + }, + { + "epoch": 0.04734833103224235, + "grad_norm": 8.657160084841168, + "learning_rate": 4.996076908090435e-06, + "loss": 0.712, + "step": 583 + }, + { + "epoch": 0.047429546008283925, + "grad_norm": 5.811863937951256, + "learning_rate": 4.9960399948266865e-06, + "loss": 0.6283, + "step": 584 + }, + { + "epoch": 0.04751076098432551, + "grad_norm": 4.144290801020178, + "learning_rate": 4.9960029088501814e-06, + "loss": 0.5942, + "step": 585 + }, + { + "epoch": 0.04759197596036709, + "grad_norm": 11.532211557025384, + "learning_rate": 4.995965650163485e-06, + "loss": 0.7404, + "step": 586 + }, + { + "epoch": 0.04767319093640868, + "grad_norm": 4.498137515882452, + "learning_rate": 4.995928218769174e-06, + "loss": 0.5951, + "step": 587 + }, + { + "epoch": 0.047754405912450254, + "grad_norm": 5.1824043128703465, + "learning_rate": 4.99589061466984e-06, + "loss": 0.6123, + "step": 588 + }, + { + "epoch": 0.04783562088849184, + "grad_norm": 5.787634937924542, + "learning_rate": 4.995852837868086e-06, + "loss": 0.5376, + "step": 589 + }, + { + "epoch": 0.04791683586453342, + "grad_norm": 7.146634422767184, + "learning_rate": 4.995814888366523e-06, + "loss": 0.599, + "step": 590 + }, + { + "epoch": 0.047998050840575, + "grad_norm": 4.672683860091777, + "learning_rate": 4.995776766167781e-06, + "loss": 0.7119, + "step": 591 + }, + { + "epoch": 0.04807926581661658, + "grad_norm": 4.546708179061631, + "learning_rate": 4.9957384712744935e-06, + "loss": 0.5744, + "step": 592 + }, + { + "epoch": 0.04816048079265817, + "grad_norm": 5.406893771451819, + "learning_rate": 4.9957000036893124e-06, + "loss": 0.6749, + "step": 593 + }, + { + "epoch": 0.04824169576869975, + "grad_norm": 4.6310196800989685, + "learning_rate": 4.9956613634149e-06, + "loss": 0.9589, + "step": 594 + }, + { + "epoch": 0.04832291074474133, + "grad_norm": 5.64526302845874, + "learning_rate": 4.995622550453929e-06, + "loss": 0.7278, + "step": 595 + }, + { + "epoch": 0.04840412572078291, + "grad_norm": 7.582504845192103, + "learning_rate": 4.995583564809086e-06, + "loss": 0.56, + "step": 596 + }, + { + "epoch": 0.048485340696824496, + "grad_norm": 4.631867662632177, + "learning_rate": 4.995544406483067e-06, + "loss": 0.7885, + "step": 597 + }, + { + "epoch": 0.04856655567286608, + "grad_norm": 4.646870659330431, + "learning_rate": 4.9955050754785835e-06, + "loss": 0.7185, + "step": 598 + }, + { + "epoch": 0.04864777064890766, + "grad_norm": 4.724376677010658, + "learning_rate": 4.995465571798356e-06, + "loss": 0.6945, + "step": 599 + }, + { + "epoch": 0.04872898562494924, + "grad_norm": 3.7877986447817684, + "learning_rate": 4.995425895445118e-06, + "loss": 0.7329, + "step": 600 + }, + { + "epoch": 0.048810200600990825, + "grad_norm": 4.6352382568877335, + "learning_rate": 4.995386046421614e-06, + "loss": 0.6972, + "step": 601 + }, + { + "epoch": 0.0488914155770324, + "grad_norm": 4.7603716353361305, + "learning_rate": 4.9953460247306035e-06, + "loss": 0.502, + "step": 602 + }, + { + "epoch": 0.048972630553073986, + "grad_norm": 4.752679614641838, + "learning_rate": 4.995305830374854e-06, + "loss": 0.9242, + "step": 603 + }, + { + "epoch": 0.04905384552911557, + "grad_norm": 3.4417838568029855, + "learning_rate": 4.995265463357147e-06, + "loss": 0.7566, + "step": 604 + }, + { + "epoch": 0.04913506050515715, + "grad_norm": 6.888094078901035, + "learning_rate": 4.995224923680277e-06, + "loss": 0.6201, + "step": 605 + }, + { + "epoch": 0.04921627548119873, + "grad_norm": 4.161212593195377, + "learning_rate": 4.995184211347046e-06, + "loss": 0.7392, + "step": 606 + }, + { + "epoch": 0.049297490457240314, + "grad_norm": 7.5719084378043515, + "learning_rate": 4.995143326360274e-06, + "loss": 0.6514, + "step": 607 + }, + { + "epoch": 0.0493787054332819, + "grad_norm": 10.224202349098311, + "learning_rate": 4.99510226872279e-06, + "loss": 0.5716, + "step": 608 + }, + { + "epoch": 0.04945992040932348, + "grad_norm": 5.767252392060593, + "learning_rate": 4.995061038437434e-06, + "loss": 0.6206, + "step": 609 + }, + { + "epoch": 0.04954113538536506, + "grad_norm": 5.065547686279448, + "learning_rate": 4.995019635507059e-06, + "loss": 0.5774, + "step": 610 + }, + { + "epoch": 0.04962235036140664, + "grad_norm": 5.65097481771209, + "learning_rate": 4.9949780599345295e-06, + "loss": 0.5498, + "step": 611 + }, + { + "epoch": 0.04970356533744823, + "grad_norm": 6.523300211900963, + "learning_rate": 4.994936311722723e-06, + "loss": 0.6711, + "step": 612 + }, + { + "epoch": 0.049784780313489804, + "grad_norm": 6.2556246954187715, + "learning_rate": 4.994894390874527e-06, + "loss": 0.7809, + "step": 613 + }, + { + "epoch": 0.04986599528953139, + "grad_norm": 5.630465680189662, + "learning_rate": 4.994852297392845e-06, + "loss": 0.5909, + "step": 614 + }, + { + "epoch": 0.04994721026557297, + "grad_norm": 5.019843596472891, + "learning_rate": 4.994810031280587e-06, + "loss": 0.8296, + "step": 615 + }, + { + "epoch": 0.050028425241614556, + "grad_norm": 6.614977325258197, + "learning_rate": 4.994767592540678e-06, + "loss": 0.5424, + "step": 616 + }, + { + "epoch": 0.05010964021765613, + "grad_norm": 6.173835775193037, + "learning_rate": 4.9947249811760555e-06, + "loss": 0.6781, + "step": 617 + }, + { + "epoch": 0.05019085519369772, + "grad_norm": 7.520930623536426, + "learning_rate": 4.994682197189667e-06, + "loss": 0.6022, + "step": 618 + }, + { + "epoch": 0.0502720701697393, + "grad_norm": 8.187916646181824, + "learning_rate": 4.994639240584474e-06, + "loss": 0.6612, + "step": 619 + }, + { + "epoch": 0.050353285145780885, + "grad_norm": 7.453069836355382, + "learning_rate": 4.994596111363448e-06, + "loss": 0.7443, + "step": 620 + }, + { + "epoch": 0.05043450012182246, + "grad_norm": 7.747733385771382, + "learning_rate": 4.994552809529573e-06, + "loss": 0.5906, + "step": 621 + }, + { + "epoch": 0.050515715097864046, + "grad_norm": 9.01741469170529, + "learning_rate": 4.994509335085847e-06, + "loss": 0.591, + "step": 622 + }, + { + "epoch": 0.05059693007390563, + "grad_norm": 6.323489129605991, + "learning_rate": 4.994465688035276e-06, + "loss": 0.9078, + "step": 623 + }, + { + "epoch": 0.05067814504994721, + "grad_norm": 13.299317733180853, + "learning_rate": 4.994421868380881e-06, + "loss": 0.525, + "step": 624 + }, + { + "epoch": 0.05075936002598879, + "grad_norm": 4.296125414620111, + "learning_rate": 4.994377876125695e-06, + "loss": 0.5857, + "step": 625 + }, + { + "epoch": 0.050840575002030375, + "grad_norm": 3.877492783088978, + "learning_rate": 4.994333711272761e-06, + "loss": 0.6115, + "step": 626 + }, + { + "epoch": 0.05092178997807196, + "grad_norm": 7.011247326243034, + "learning_rate": 4.9942893738251355e-06, + "loss": 0.5045, + "step": 627 + }, + { + "epoch": 0.051003004954113536, + "grad_norm": 7.046298816782971, + "learning_rate": 4.994244863785887e-06, + "loss": 0.6668, + "step": 628 + }, + { + "epoch": 0.05108421993015512, + "grad_norm": 13.568359594461246, + "learning_rate": 4.994200181158093e-06, + "loss": 0.5775, + "step": 629 + }, + { + "epoch": 0.051165434906196704, + "grad_norm": 7.5605209242829545, + "learning_rate": 4.9941553259448475e-06, + "loss": 0.5511, + "step": 630 + }, + { + "epoch": 0.05124664988223829, + "grad_norm": 7.085727266285509, + "learning_rate": 4.994110298149253e-06, + "loss": 0.6272, + "step": 631 + }, + { + "epoch": 0.051327864858279865, + "grad_norm": 4.890560092384863, + "learning_rate": 4.994065097774426e-06, + "loss": 0.6843, + "step": 632 + }, + { + "epoch": 0.05140907983432145, + "grad_norm": 5.665397086701724, + "learning_rate": 4.994019724823495e-06, + "loss": 0.9322, + "step": 633 + }, + { + "epoch": 0.05149029481036303, + "grad_norm": 12.520866191934164, + "learning_rate": 4.993974179299597e-06, + "loss": 0.9683, + "step": 634 + }, + { + "epoch": 0.05157150978640461, + "grad_norm": 6.353771093218972, + "learning_rate": 4.993928461205885e-06, + "loss": 0.7176, + "step": 635 + }, + { + "epoch": 0.051652724762446194, + "grad_norm": 6.398842006345174, + "learning_rate": 4.993882570545523e-06, + "loss": 0.6833, + "step": 636 + }, + { + "epoch": 0.05173393973848778, + "grad_norm": 4.720771899259599, + "learning_rate": 4.993836507321686e-06, + "loss": 0.554, + "step": 637 + }, + { + "epoch": 0.05181515471452936, + "grad_norm": 6.5529107721965945, + "learning_rate": 4.9937902715375605e-06, + "loss": 0.8592, + "step": 638 + }, + { + "epoch": 0.05189636969057094, + "grad_norm": 6.335582258551991, + "learning_rate": 4.993743863196348e-06, + "loss": 0.6642, + "step": 639 + }, + { + "epoch": 0.05197758466661252, + "grad_norm": 5.488427009047834, + "learning_rate": 4.993697282301256e-06, + "loss": 0.6354, + "step": 640 + }, + { + "epoch": 0.052058799642654106, + "grad_norm": 4.435230831471148, + "learning_rate": 4.99365052885551e-06, + "loss": 0.6361, + "step": 641 + }, + { + "epoch": 0.05214001461869569, + "grad_norm": 3.973892603431311, + "learning_rate": 4.9936036028623465e-06, + "loss": 0.6415, + "step": 642 + }, + { + "epoch": 0.05222122959473727, + "grad_norm": 11.239148633823167, + "learning_rate": 4.99355650432501e-06, + "loss": 0.6895, + "step": 643 + }, + { + "epoch": 0.05230244457077885, + "grad_norm": 4.274914346041189, + "learning_rate": 4.993509233246761e-06, + "loss": 0.5007, + "step": 644 + }, + { + "epoch": 0.052383659546820435, + "grad_norm": 6.237193126781422, + "learning_rate": 4.9934617896308675e-06, + "loss": 0.6882, + "step": 645 + }, + { + "epoch": 0.05246487452286202, + "grad_norm": 3.98788426140263, + "learning_rate": 4.993414173480617e-06, + "loss": 0.6089, + "step": 646 + }, + { + "epoch": 0.052546089498903596, + "grad_norm": 14.335138449996226, + "learning_rate": 4.9933663847993005e-06, + "loss": 0.7351, + "step": 647 + }, + { + "epoch": 0.05262730447494518, + "grad_norm": 7.149335425005816, + "learning_rate": 4.9933184235902275e-06, + "loss": 0.7582, + "step": 648 + }, + { + "epoch": 0.052708519450986764, + "grad_norm": 4.593224187186037, + "learning_rate": 4.993270289856714e-06, + "loss": 0.5349, + "step": 649 + }, + { + "epoch": 0.05278973442702834, + "grad_norm": 11.086443268562299, + "learning_rate": 4.993221983602093e-06, + "loss": 0.5782, + "step": 650 + }, + { + "epoch": 0.052870949403069925, + "grad_norm": 4.566408185176231, + "learning_rate": 4.993173504829705e-06, + "loss": 0.5413, + "step": 651 + }, + { + "epoch": 0.05295216437911151, + "grad_norm": 4.3786114129219165, + "learning_rate": 4.993124853542906e-06, + "loss": 0.7764, + "step": 652 + }, + { + "epoch": 0.05303337935515309, + "grad_norm": 4.4667797149749955, + "learning_rate": 4.993076029745061e-06, + "loss": 0.5315, + "step": 653 + }, + { + "epoch": 0.05311459433119467, + "grad_norm": 4.402424708224538, + "learning_rate": 4.99302703343955e-06, + "loss": 0.6889, + "step": 654 + }, + { + "epoch": 0.053195809307236254, + "grad_norm": 5.124489236968013, + "learning_rate": 4.992977864629762e-06, + "loss": 0.7257, + "step": 655 + }, + { + "epoch": 0.05327702428327784, + "grad_norm": 5.821195473769332, + "learning_rate": 4.9929285233191005e-06, + "loss": 0.6547, + "step": 656 + }, + { + "epoch": 0.05335823925931942, + "grad_norm": 6.888571998941501, + "learning_rate": 4.992879009510978e-06, + "loss": 0.5126, + "step": 657 + }, + { + "epoch": 0.053439454235361, + "grad_norm": 5.619280693409271, + "learning_rate": 4.992829323208822e-06, + "loss": 0.6526, + "step": 658 + }, + { + "epoch": 0.05352066921140258, + "grad_norm": 6.967418216632179, + "learning_rate": 4.992779464416069e-06, + "loss": 0.596, + "step": 659 + }, + { + "epoch": 0.05360188418744417, + "grad_norm": 4.9434745741217645, + "learning_rate": 4.992729433136171e-06, + "loss": 0.6319, + "step": 660 + }, + { + "epoch": 0.053683099163485744, + "grad_norm": 8.648403799491454, + "learning_rate": 4.992679229372588e-06, + "loss": 0.5502, + "step": 661 + }, + { + "epoch": 0.05376431413952733, + "grad_norm": 11.185178358108589, + "learning_rate": 4.9926288531287946e-06, + "loss": 0.6938, + "step": 662 + }, + { + "epoch": 0.05384552911556891, + "grad_norm": 5.6934860327185115, + "learning_rate": 4.992578304408278e-06, + "loss": 0.6992, + "step": 663 + }, + { + "epoch": 0.053926744091610496, + "grad_norm": 5.313881451433935, + "learning_rate": 4.992527583214533e-06, + "loss": 0.738, + "step": 664 + }, + { + "epoch": 0.05400795906765207, + "grad_norm": 6.054202350907215, + "learning_rate": 4.992476689551071e-06, + "loss": 0.6326, + "step": 665 + }, + { + "epoch": 0.05408917404369366, + "grad_norm": 7.4133417376434725, + "learning_rate": 4.992425623421414e-06, + "loss": 0.7014, + "step": 666 + }, + { + "epoch": 0.05417038901973524, + "grad_norm": 4.221490444985979, + "learning_rate": 4.992374384829094e-06, + "loss": 0.7564, + "step": 667 + }, + { + "epoch": 0.054251603995776825, + "grad_norm": 10.257715188715267, + "learning_rate": 4.992322973777658e-06, + "loss": 0.6686, + "step": 668 + }, + { + "epoch": 0.0543328189718184, + "grad_norm": 4.701020797968543, + "learning_rate": 4.992271390270662e-06, + "loss": 0.5668, + "step": 669 + }, + { + "epoch": 0.054414033947859985, + "grad_norm": 5.172562987556883, + "learning_rate": 4.992219634311677e-06, + "loss": 0.563, + "step": 670 + }, + { + "epoch": 0.05449524892390157, + "grad_norm": 8.921514201488444, + "learning_rate": 4.992167705904282e-06, + "loss": 0.694, + "step": 671 + }, + { + "epoch": 0.054576463899943146, + "grad_norm": 3.807857859349466, + "learning_rate": 4.992115605052072e-06, + "loss": 0.5912, + "step": 672 + }, + { + "epoch": 0.05465767887598473, + "grad_norm": 3.2898787814816357, + "learning_rate": 4.992063331758651e-06, + "loss": 0.4573, + "step": 673 + }, + { + "epoch": 0.054738893852026314, + "grad_norm": 3.4263761182268526, + "learning_rate": 4.9920108860276375e-06, + "loss": 0.6525, + "step": 674 + }, + { + "epoch": 0.0548201088280679, + "grad_norm": 4.749436044976401, + "learning_rate": 4.991958267862659e-06, + "loss": 0.51, + "step": 675 + }, + { + "epoch": 0.054901323804109475, + "grad_norm": 5.875542464144077, + "learning_rate": 4.991905477267356e-06, + "loss": 0.6024, + "step": 676 + }, + { + "epoch": 0.05498253878015106, + "grad_norm": 5.5252942081115695, + "learning_rate": 4.991852514245384e-06, + "loss": 0.6975, + "step": 677 + }, + { + "epoch": 0.05506375375619264, + "grad_norm": 3.5503049002474336, + "learning_rate": 4.991799378800404e-06, + "loss": 0.7191, + "step": 678 + }, + { + "epoch": 0.05514496873223423, + "grad_norm": 3.9287521299019947, + "learning_rate": 4.9917460709360955e-06, + "loss": 0.6743, + "step": 679 + }, + { + "epoch": 0.055226183708275804, + "grad_norm": 7.458997142187155, + "learning_rate": 4.991692590656146e-06, + "loss": 0.6277, + "step": 680 + }, + { + "epoch": 0.05530739868431739, + "grad_norm": 5.196449898312582, + "learning_rate": 4.991638937964257e-06, + "loss": 0.5941, + "step": 681 + }, + { + "epoch": 0.05538861366035897, + "grad_norm": 7.808673495814518, + "learning_rate": 4.9915851128641405e-06, + "loss": 0.4626, + "step": 682 + }, + { + "epoch": 0.05546982863640055, + "grad_norm": 10.233712958106928, + "learning_rate": 4.991531115359519e-06, + "loss": 0.6285, + "step": 683 + }, + { + "epoch": 0.05555104361244213, + "grad_norm": 4.467794770096275, + "learning_rate": 4.991476945454133e-06, + "loss": 0.5607, + "step": 684 + }, + { + "epoch": 0.05563225858848372, + "grad_norm": 5.521835573749263, + "learning_rate": 4.991422603151727e-06, + "loss": 0.5919, + "step": 685 + }, + { + "epoch": 0.0557134735645253, + "grad_norm": 5.041631495422818, + "learning_rate": 4.991368088456062e-06, + "loss": 0.5745, + "step": 686 + }, + { + "epoch": 0.05579468854056688, + "grad_norm": 4.822878210770882, + "learning_rate": 4.99131340137091e-06, + "loss": 0.7395, + "step": 687 + }, + { + "epoch": 0.05587590351660846, + "grad_norm": 8.016561518905647, + "learning_rate": 4.991258541900058e-06, + "loss": 0.587, + "step": 688 + }, + { + "epoch": 0.055957118492650046, + "grad_norm": 8.66516164876571, + "learning_rate": 4.991203510047299e-06, + "loss": 0.7209, + "step": 689 + }, + { + "epoch": 0.05603833346869163, + "grad_norm": 4.158292161529495, + "learning_rate": 4.991148305816441e-06, + "loss": 0.5705, + "step": 690 + }, + { + "epoch": 0.05611954844473321, + "grad_norm": 8.004317430680866, + "learning_rate": 4.991092929211305e-06, + "loss": 0.6297, + "step": 691 + }, + { + "epoch": 0.05620076342077479, + "grad_norm": 7.148231924748558, + "learning_rate": 4.9910373802357214e-06, + "loss": 0.5579, + "step": 692 + }, + { + "epoch": 0.056281978396816375, + "grad_norm": 4.294720627427241, + "learning_rate": 4.990981658893535e-06, + "loss": 0.5622, + "step": 693 + }, + { + "epoch": 0.05636319337285795, + "grad_norm": 7.230336818000405, + "learning_rate": 4.990925765188602e-06, + "loss": 0.8874, + "step": 694 + }, + { + "epoch": 0.056444408348899536, + "grad_norm": 3.9400202583073662, + "learning_rate": 4.9908696991247885e-06, + "loss": 0.6678, + "step": 695 + }, + { + "epoch": 0.05652562332494112, + "grad_norm": 4.144892106492086, + "learning_rate": 4.990813460705975e-06, + "loss": 0.6598, + "step": 696 + }, + { + "epoch": 0.056606838300982704, + "grad_norm": 5.029300780914533, + "learning_rate": 4.990757049936051e-06, + "loss": 0.5704, + "step": 697 + }, + { + "epoch": 0.05668805327702428, + "grad_norm": 5.440381358402857, + "learning_rate": 4.990700466818923e-06, + "loss": 0.4935, + "step": 698 + }, + { + "epoch": 0.056769268253065865, + "grad_norm": 64.88422852309522, + "learning_rate": 4.990643711358504e-06, + "loss": 0.6541, + "step": 699 + }, + { + "epoch": 0.05685048322910745, + "grad_norm": 9.761223335464411, + "learning_rate": 4.990586783558722e-06, + "loss": 0.6356, + "step": 700 + }, + { + "epoch": 0.05693169820514903, + "grad_norm": 8.471075449596462, + "learning_rate": 4.990529683423515e-06, + "loss": 0.6032, + "step": 701 + }, + { + "epoch": 0.05701291318119061, + "grad_norm": 8.610029295506125, + "learning_rate": 4.990472410956835e-06, + "loss": 0.6641, + "step": 702 + }, + { + "epoch": 0.057094128157232193, + "grad_norm": 8.074345676422041, + "learning_rate": 4.9904149661626456e-06, + "loss": 0.7349, + "step": 703 + }, + { + "epoch": 0.05717534313327378, + "grad_norm": 6.475287058051118, + "learning_rate": 4.99035734904492e-06, + "loss": 0.601, + "step": 704 + }, + { + "epoch": 0.057256558109315354, + "grad_norm": 5.1928926651854335, + "learning_rate": 4.990299559607646e-06, + "loss": 0.6079, + "step": 705 + }, + { + "epoch": 0.05733777308535694, + "grad_norm": 5.520091117650318, + "learning_rate": 4.990241597854822e-06, + "loss": 0.7949, + "step": 706 + }, + { + "epoch": 0.05741898806139852, + "grad_norm": 6.225321905485309, + "learning_rate": 4.99018346379046e-06, + "loss": 0.5618, + "step": 707 + }, + { + "epoch": 0.057500203037440106, + "grad_norm": 4.444351207370442, + "learning_rate": 4.99012515741858e-06, + "loss": 0.5987, + "step": 708 + }, + { + "epoch": 0.05758141801348168, + "grad_norm": 11.383212330047579, + "learning_rate": 4.990066678743219e-06, + "loss": 0.5409, + "step": 709 + }, + { + "epoch": 0.05766263298952327, + "grad_norm": 5.0174724436449765, + "learning_rate": 4.9900080277684224e-06, + "loss": 0.8077, + "step": 710 + }, + { + "epoch": 0.05774384796556485, + "grad_norm": 6.82758331108689, + "learning_rate": 4.989949204498248e-06, + "loss": 0.7131, + "step": 711 + }, + { + "epoch": 0.057825062941606435, + "grad_norm": 4.599805153792023, + "learning_rate": 4.989890208936767e-06, + "loss": 0.7143, + "step": 712 + }, + { + "epoch": 0.05790627791764801, + "grad_norm": 5.615806754693292, + "learning_rate": 4.98983104108806e-06, + "loss": 0.4989, + "step": 713 + }, + { + "epoch": 0.057987492893689596, + "grad_norm": 5.944167262991299, + "learning_rate": 4.989771700956223e-06, + "loss": 0.6002, + "step": 714 + }, + { + "epoch": 0.05806870786973118, + "grad_norm": 5.363075574799061, + "learning_rate": 4.989712188545362e-06, + "loss": 0.5249, + "step": 715 + }, + { + "epoch": 0.05814992284577276, + "grad_norm": 5.71188897749546, + "learning_rate": 4.989652503859592e-06, + "loss": 0.6282, + "step": 716 + }, + { + "epoch": 0.05823113782181434, + "grad_norm": 6.4060919441849835, + "learning_rate": 4.989592646903047e-06, + "loss": 0.8009, + "step": 717 + }, + { + "epoch": 0.058312352797855925, + "grad_norm": 5.806526365359627, + "learning_rate": 4.989532617679866e-06, + "loss": 0.5443, + "step": 718 + }, + { + "epoch": 0.05839356777389751, + "grad_norm": 5.459430108939258, + "learning_rate": 4.989472416194204e-06, + "loss": 0.5308, + "step": 719 + }, + { + "epoch": 0.058474782749939086, + "grad_norm": 5.688152424438068, + "learning_rate": 4.9894120424502254e-06, + "loss": 0.6284, + "step": 720 + }, + { + "epoch": 0.05855599772598067, + "grad_norm": 7.784647717876923, + "learning_rate": 4.989351496452109e-06, + "loss": 0.5055, + "step": 721 + }, + { + "epoch": 0.058637212702022254, + "grad_norm": 4.6787370535165245, + "learning_rate": 4.9892907782040435e-06, + "loss": 0.6513, + "step": 722 + }, + { + "epoch": 0.05871842767806384, + "grad_norm": 6.799832486685082, + "learning_rate": 4.9892298877102305e-06, + "loss": 0.7293, + "step": 723 + }, + { + "epoch": 0.058799642654105415, + "grad_norm": 5.66494166268401, + "learning_rate": 4.989168824974884e-06, + "loss": 0.5597, + "step": 724 + }, + { + "epoch": 0.058880857630147, + "grad_norm": 6.3608187122909206, + "learning_rate": 4.989107590002228e-06, + "loss": 0.6171, + "step": 725 + }, + { + "epoch": 0.05896207260618858, + "grad_norm": 6.134461895108829, + "learning_rate": 4.989046182796501e-06, + "loss": 0.5414, + "step": 726 + }, + { + "epoch": 0.05904328758223017, + "grad_norm": 7.100189240185222, + "learning_rate": 4.988984603361949e-06, + "loss": 0.6406, + "step": 727 + }, + { + "epoch": 0.059124502558271744, + "grad_norm": 5.1992788570507535, + "learning_rate": 4.988922851702837e-06, + "loss": 0.6249, + "step": 728 + }, + { + "epoch": 0.05920571753431333, + "grad_norm": 4.3545253550275635, + "learning_rate": 4.988860927823436e-06, + "loss": 0.8036, + "step": 729 + }, + { + "epoch": 0.05928693251035491, + "grad_norm": 5.3206883020912, + "learning_rate": 4.988798831728031e-06, + "loss": 0.5943, + "step": 730 + }, + { + "epoch": 0.05936814748639649, + "grad_norm": 6.089917627663601, + "learning_rate": 4.9887365634209186e-06, + "loss": 0.7094, + "step": 731 + }, + { + "epoch": 0.05944936246243807, + "grad_norm": 3.956275748674879, + "learning_rate": 4.9886741229064075e-06, + "loss": 0.5626, + "step": 732 + }, + { + "epoch": 0.05953057743847966, + "grad_norm": 4.511627354156277, + "learning_rate": 4.988611510188818e-06, + "loss": 0.5764, + "step": 733 + }, + { + "epoch": 0.05961179241452124, + "grad_norm": 6.4116489379834025, + "learning_rate": 4.988548725272482e-06, + "loss": 0.5982, + "step": 734 + }, + { + "epoch": 0.05969300739056282, + "grad_norm": 6.311885375734862, + "learning_rate": 4.988485768161746e-06, + "loss": 0.5339, + "step": 735 + }, + { + "epoch": 0.0597742223666044, + "grad_norm": 3.9308120481243902, + "learning_rate": 4.988422638860964e-06, + "loss": 0.584, + "step": 736 + }, + { + "epoch": 0.059855437342645985, + "grad_norm": 4.880711425301184, + "learning_rate": 4.988359337374505e-06, + "loss": 0.5078, + "step": 737 + }, + { + "epoch": 0.05993665231868757, + "grad_norm": 5.281870836321762, + "learning_rate": 4.988295863706751e-06, + "loss": 0.5754, + "step": 738 + }, + { + "epoch": 0.060017867294729146, + "grad_norm": 5.0230810112537485, + "learning_rate": 4.988232217862091e-06, + "loss": 0.6391, + "step": 739 + }, + { + "epoch": 0.06009908227077073, + "grad_norm": 6.955537743254657, + "learning_rate": 4.988168399844931e-06, + "loss": 0.5283, + "step": 740 + }, + { + "epoch": 0.060180297246812314, + "grad_norm": 4.147704339162062, + "learning_rate": 4.988104409659685e-06, + "loss": 0.653, + "step": 741 + }, + { + "epoch": 0.06026151222285389, + "grad_norm": 7.537477012145328, + "learning_rate": 4.988040247310783e-06, + "loss": 0.6525, + "step": 742 + }, + { + "epoch": 0.060342727198895475, + "grad_norm": 4.863974888473347, + "learning_rate": 4.987975912802663e-06, + "loss": 0.5931, + "step": 743 + }, + { + "epoch": 0.06042394217493706, + "grad_norm": 5.366327422788624, + "learning_rate": 4.9879114061397784e-06, + "loss": 0.4838, + "step": 744 + }, + { + "epoch": 0.06050515715097864, + "grad_norm": 4.656417177398882, + "learning_rate": 4.987846727326591e-06, + "loss": 0.681, + "step": 745 + }, + { + "epoch": 0.06058637212702022, + "grad_norm": 20.788942947113796, + "learning_rate": 4.987781876367576e-06, + "loss": 0.6331, + "step": 746 + }, + { + "epoch": 0.060667587103061804, + "grad_norm": 7.459258545982763, + "learning_rate": 4.987716853267222e-06, + "loss": 0.5166, + "step": 747 + }, + { + "epoch": 0.06074880207910339, + "grad_norm": 7.51821304798715, + "learning_rate": 4.9876516580300285e-06, + "loss": 0.4729, + "step": 748 + }, + { + "epoch": 0.06083001705514497, + "grad_norm": 6.15090790030722, + "learning_rate": 4.987586290660506e-06, + "loss": 0.5615, + "step": 749 + }, + { + "epoch": 0.06091123203118655, + "grad_norm": 5.226446559415324, + "learning_rate": 4.987520751163176e-06, + "loss": 0.4638, + "step": 750 + }, + { + "epoch": 0.06099244700722813, + "grad_norm": 6.752685506869791, + "learning_rate": 4.9874550395425764e-06, + "loss": 0.5201, + "step": 751 + }, + { + "epoch": 0.06107366198326972, + "grad_norm": 4.989408238882795, + "learning_rate": 4.987389155803252e-06, + "loss": 0.7494, + "step": 752 + }, + { + "epoch": 0.061154876959311294, + "grad_norm": 6.750783797733722, + "learning_rate": 4.987323099949763e-06, + "loss": 0.65, + "step": 753 + }, + { + "epoch": 0.06123609193535288, + "grad_norm": 7.609585837462416, + "learning_rate": 4.9872568719866795e-06, + "loss": 0.6736, + "step": 754 + }, + { + "epoch": 0.06131730691139446, + "grad_norm": 4.452864412048146, + "learning_rate": 4.987190471918584e-06, + "loss": 0.5907, + "step": 755 + }, + { + "epoch": 0.061398521887436046, + "grad_norm": 4.734835124703949, + "learning_rate": 4.98712389975007e-06, + "loss": 0.653, + "step": 756 + }, + { + "epoch": 0.06147973686347762, + "grad_norm": 5.928107826920242, + "learning_rate": 4.987057155485746e-06, + "loss": 0.7772, + "step": 757 + }, + { + "epoch": 0.06156095183951921, + "grad_norm": 9.884828624607355, + "learning_rate": 4.98699023913023e-06, + "loss": 0.5529, + "step": 758 + }, + { + "epoch": 0.06164216681556079, + "grad_norm": 4.2764549097941, + "learning_rate": 4.986923150688151e-06, + "loss": 0.6011, + "step": 759 + }, + { + "epoch": 0.061723381791602375, + "grad_norm": 7.233144313683277, + "learning_rate": 4.986855890164152e-06, + "loss": 0.6427, + "step": 760 + }, + { + "epoch": 0.06180459676764395, + "grad_norm": 5.536503055820394, + "learning_rate": 4.986788457562887e-06, + "loss": 0.6915, + "step": 761 + }, + { + "epoch": 0.061885811743685536, + "grad_norm": 5.309270582449383, + "learning_rate": 4.986720852889021e-06, + "loss": 0.5991, + "step": 762 + }, + { + "epoch": 0.06196702671972712, + "grad_norm": 4.284356637879479, + "learning_rate": 4.9866530761472335e-06, + "loss": 0.8062, + "step": 763 + }, + { + "epoch": 0.0620482416957687, + "grad_norm": 5.643793776418088, + "learning_rate": 4.986585127342214e-06, + "loss": 0.7215, + "step": 764 + }, + { + "epoch": 0.06212945667181028, + "grad_norm": 4.295093117802471, + "learning_rate": 4.986517006478663e-06, + "loss": 0.4926, + "step": 765 + }, + { + "epoch": 0.062210671647851865, + "grad_norm": 5.881778143353826, + "learning_rate": 4.986448713561295e-06, + "loss": 0.5777, + "step": 766 + }, + { + "epoch": 0.06229188662389345, + "grad_norm": 3.5376491877561533, + "learning_rate": 4.986380248594835e-06, + "loss": 0.8141, + "step": 767 + }, + { + "epoch": 0.062373101599935026, + "grad_norm": 15.197506269463709, + "learning_rate": 4.9863116115840215e-06, + "loss": 0.6347, + "step": 768 + }, + { + "epoch": 0.06245431657597661, + "grad_norm": 5.5635913852411765, + "learning_rate": 4.986242802533603e-06, + "loss": 0.7381, + "step": 769 + }, + { + "epoch": 0.06253553155201819, + "grad_norm": 4.921640572698924, + "learning_rate": 4.986173821448341e-06, + "loss": 0.5331, + "step": 770 + }, + { + "epoch": 0.06261674652805978, + "grad_norm": 4.243356448186626, + "learning_rate": 4.9861046683330085e-06, + "loss": 0.7414, + "step": 771 + }, + { + "epoch": 0.06269796150410135, + "grad_norm": 4.7676286369533685, + "learning_rate": 4.986035343192389e-06, + "loss": 0.7152, + "step": 772 + }, + { + "epoch": 0.06277917648014295, + "grad_norm": 5.052963380949943, + "learning_rate": 4.985965846031283e-06, + "loss": 0.5741, + "step": 773 + }, + { + "epoch": 0.06286039145618452, + "grad_norm": 5.830703109939469, + "learning_rate": 4.985896176854496e-06, + "loss": 0.5442, + "step": 774 + }, + { + "epoch": 0.0629416064322261, + "grad_norm": 28.565367004080375, + "learning_rate": 4.9858263356668505e-06, + "loss": 0.6674, + "step": 775 + }, + { + "epoch": 0.06302282140826769, + "grad_norm": 5.2243621774870475, + "learning_rate": 4.985756322473178e-06, + "loss": 0.5452, + "step": 776 + }, + { + "epoch": 0.06310403638430927, + "grad_norm": 5.380977330836419, + "learning_rate": 4.9856861372783236e-06, + "loss": 0.5991, + "step": 777 + }, + { + "epoch": 0.06318525136035084, + "grad_norm": 6.151320235729201, + "learning_rate": 4.9856157800871455e-06, + "loss": 0.661, + "step": 778 + }, + { + "epoch": 0.06326646633639244, + "grad_norm": 4.520649099692102, + "learning_rate": 4.985545250904509e-06, + "loss": 0.5931, + "step": 779 + }, + { + "epoch": 0.06334768131243401, + "grad_norm": 3.8505363861712336, + "learning_rate": 4.985474549735296e-06, + "loss": 0.6474, + "step": 780 + }, + { + "epoch": 0.06342889628847559, + "grad_norm": 4.383396583650249, + "learning_rate": 4.985403676584397e-06, + "loss": 0.7205, + "step": 781 + }, + { + "epoch": 0.06351011126451718, + "grad_norm": 5.432225271309565, + "learning_rate": 4.985332631456719e-06, + "loss": 0.6348, + "step": 782 + }, + { + "epoch": 0.06359132624055876, + "grad_norm": 8.317084663016479, + "learning_rate": 4.9852614143571755e-06, + "loss": 0.6672, + "step": 783 + }, + { + "epoch": 0.06367254121660035, + "grad_norm": 6.050699233785058, + "learning_rate": 4.985190025290696e-06, + "loss": 0.5816, + "step": 784 + }, + { + "epoch": 0.06375375619264192, + "grad_norm": 4.054546774991104, + "learning_rate": 4.985118464262219e-06, + "loss": 0.5229, + "step": 785 + }, + { + "epoch": 0.0638349711686835, + "grad_norm": 6.331916697325334, + "learning_rate": 4.985046731276697e-06, + "loss": 0.6044, + "step": 786 + }, + { + "epoch": 0.06391618614472509, + "grad_norm": 5.448136315607667, + "learning_rate": 4.984974826339093e-06, + "loss": 0.6758, + "step": 787 + }, + { + "epoch": 0.06399740112076667, + "grad_norm": 3.0701662409934065, + "learning_rate": 4.984902749454382e-06, + "loss": 0.5999, + "step": 788 + }, + { + "epoch": 0.06407861609680825, + "grad_norm": 4.6802757341120556, + "learning_rate": 4.9848305006275525e-06, + "loss": 0.6773, + "step": 789 + }, + { + "epoch": 0.06415983107284984, + "grad_norm": 5.59540917186297, + "learning_rate": 4.984758079863603e-06, + "loss": 0.5526, + "step": 790 + }, + { + "epoch": 0.06424104604889141, + "grad_norm": 4.319537345372907, + "learning_rate": 4.984685487167544e-06, + "loss": 0.4746, + "step": 791 + }, + { + "epoch": 0.06432226102493299, + "grad_norm": 4.812410544795939, + "learning_rate": 4.9846127225444e-06, + "loss": 0.5985, + "step": 792 + }, + { + "epoch": 0.06440347600097458, + "grad_norm": 3.8747262789638, + "learning_rate": 4.984539785999205e-06, + "loss": 0.8711, + "step": 793 + }, + { + "epoch": 0.06448469097701616, + "grad_norm": 9.168257138239873, + "learning_rate": 4.984466677537007e-06, + "loss": 0.6323, + "step": 794 + }, + { + "epoch": 0.06456590595305775, + "grad_norm": 9.621282105988284, + "learning_rate": 4.984393397162862e-06, + "loss": 0.6117, + "step": 795 + }, + { + "epoch": 0.06464712092909933, + "grad_norm": 4.3783026160070895, + "learning_rate": 4.984319944881844e-06, + "loss": 0.6991, + "step": 796 + }, + { + "epoch": 0.0647283359051409, + "grad_norm": 4.35590663167429, + "learning_rate": 4.984246320699033e-06, + "loss": 0.7034, + "step": 797 + }, + { + "epoch": 0.0648095508811825, + "grad_norm": 4.746827833135994, + "learning_rate": 4.984172524619525e-06, + "loss": 0.6776, + "step": 798 + }, + { + "epoch": 0.06489076585722407, + "grad_norm": 5.905725252167514, + "learning_rate": 4.984098556648425e-06, + "loss": 0.5483, + "step": 799 + }, + { + "epoch": 0.06497198083326565, + "grad_norm": 4.596246589585779, + "learning_rate": 4.984024416790852e-06, + "loss": 0.688, + "step": 800 + }, + { + "epoch": 0.06505319580930724, + "grad_norm": 4.9534579923413595, + "learning_rate": 4.983950105051936e-06, + "loss": 0.6562, + "step": 801 + }, + { + "epoch": 0.06513441078534882, + "grad_norm": 5.123985851109989, + "learning_rate": 4.9838756214368185e-06, + "loss": 0.6707, + "step": 802 + }, + { + "epoch": 0.0652156257613904, + "grad_norm": 5.402799215185337, + "learning_rate": 4.9838009659506535e-06, + "loss": 0.6378, + "step": 803 + }, + { + "epoch": 0.06529684073743199, + "grad_norm": 4.492757760289716, + "learning_rate": 4.983726138598608e-06, + "loss": 0.5916, + "step": 804 + }, + { + "epoch": 0.06537805571347356, + "grad_norm": 3.95999131962753, + "learning_rate": 4.9836511393858575e-06, + "loss": 0.5023, + "step": 805 + }, + { + "epoch": 0.06545927068951515, + "grad_norm": 3.920207373866997, + "learning_rate": 4.983575968317593e-06, + "loss": 0.7991, + "step": 806 + }, + { + "epoch": 0.06554048566555673, + "grad_norm": 7.315668961691789, + "learning_rate": 4.983500625399017e-06, + "loss": 0.5301, + "step": 807 + }, + { + "epoch": 0.06562170064159831, + "grad_norm": 3.912121127147346, + "learning_rate": 4.98342511063534e-06, + "loss": 0.7372, + "step": 808 + }, + { + "epoch": 0.0657029156176399, + "grad_norm": 7.552130637632774, + "learning_rate": 4.983349424031789e-06, + "loss": 0.5946, + "step": 809 + }, + { + "epoch": 0.06578413059368148, + "grad_norm": 6.533691238009157, + "learning_rate": 4.983273565593601e-06, + "loss": 0.642, + "step": 810 + }, + { + "epoch": 0.06586534556972305, + "grad_norm": 9.916532299872616, + "learning_rate": 4.983197535326024e-06, + "loss": 0.6117, + "step": 811 + }, + { + "epoch": 0.06594656054576464, + "grad_norm": 5.29822144361094, + "learning_rate": 4.983121333234321e-06, + "loss": 0.576, + "step": 812 + }, + { + "epoch": 0.06602777552180622, + "grad_norm": 7.011546578891193, + "learning_rate": 4.983044959323763e-06, + "loss": 0.6305, + "step": 813 + }, + { + "epoch": 0.0661089904978478, + "grad_norm": 7.6729419826090695, + "learning_rate": 4.982968413599635e-06, + "loss": 0.5282, + "step": 814 + }, + { + "epoch": 0.06619020547388939, + "grad_norm": 3.568801069591388, + "learning_rate": 4.982891696067234e-06, + "loss": 0.5766, + "step": 815 + }, + { + "epoch": 0.06627142044993097, + "grad_norm": 3.822836645624077, + "learning_rate": 4.9828148067318675e-06, + "loss": 0.647, + "step": 816 + }, + { + "epoch": 0.06635263542597256, + "grad_norm": 4.578694026237904, + "learning_rate": 4.982737745598857e-06, + "loss": 0.8134, + "step": 817 + }, + { + "epoch": 0.06643385040201413, + "grad_norm": 6.7533794019944065, + "learning_rate": 4.982660512673534e-06, + "loss": 0.6404, + "step": 818 + }, + { + "epoch": 0.06651506537805571, + "grad_norm": 5.791281720391061, + "learning_rate": 4.982583107961243e-06, + "loss": 0.5909, + "step": 819 + }, + { + "epoch": 0.0665962803540973, + "grad_norm": 4.161592546084501, + "learning_rate": 4.982505531467339e-06, + "loss": 0.5977, + "step": 820 + }, + { + "epoch": 0.06667749533013888, + "grad_norm": 6.394531347329611, + "learning_rate": 4.982427783197191e-06, + "loss": 0.5928, + "step": 821 + }, + { + "epoch": 0.06675871030618045, + "grad_norm": 8.488158051535715, + "learning_rate": 4.982349863156179e-06, + "loss": 0.7508, + "step": 822 + }, + { + "epoch": 0.06683992528222205, + "grad_norm": 5.428324952116931, + "learning_rate": 4.982271771349694e-06, + "loss": 0.5392, + "step": 823 + }, + { + "epoch": 0.06692114025826362, + "grad_norm": 4.025855738456331, + "learning_rate": 4.98219350778314e-06, + "loss": 0.4991, + "step": 824 + }, + { + "epoch": 0.0670023552343052, + "grad_norm": 5.231237358948681, + "learning_rate": 4.982115072461932e-06, + "loss": 0.522, + "step": 825 + }, + { + "epoch": 0.06708357021034679, + "grad_norm": 3.4702942997245616, + "learning_rate": 4.9820364653914964e-06, + "loss": 0.6092, + "step": 826 + }, + { + "epoch": 0.06716478518638837, + "grad_norm": 8.455649789106133, + "learning_rate": 4.981957686577275e-06, + "loss": 0.8485, + "step": 827 + }, + { + "epoch": 0.06724600016242996, + "grad_norm": 6.0384535186038955, + "learning_rate": 4.981878736024716e-06, + "loss": 0.5669, + "step": 828 + }, + { + "epoch": 0.06732721513847154, + "grad_norm": 3.4751635293718732, + "learning_rate": 4.981799613739284e-06, + "loss": 0.5404, + "step": 829 + }, + { + "epoch": 0.06740843011451311, + "grad_norm": 4.516175343460089, + "learning_rate": 4.981720319726453e-06, + "loss": 0.512, + "step": 830 + }, + { + "epoch": 0.0674896450905547, + "grad_norm": 6.121636308285373, + "learning_rate": 4.981640853991712e-06, + "loss": 0.7, + "step": 831 + }, + { + "epoch": 0.06757086006659628, + "grad_norm": 4.916730473198547, + "learning_rate": 4.981561216540556e-06, + "loss": 0.6203, + "step": 832 + }, + { + "epoch": 0.06765207504263786, + "grad_norm": 6.140736960651221, + "learning_rate": 4.981481407378498e-06, + "loss": 0.6689, + "step": 833 + }, + { + "epoch": 0.06773329001867945, + "grad_norm": 3.651255962877865, + "learning_rate": 4.981401426511059e-06, + "loss": 0.6711, + "step": 834 + }, + { + "epoch": 0.06781450499472103, + "grad_norm": 4.433646055135197, + "learning_rate": 4.981321273943775e-06, + "loss": 0.5962, + "step": 835 + }, + { + "epoch": 0.0678957199707626, + "grad_norm": 4.214671257503501, + "learning_rate": 4.98124094968219e-06, + "loss": 0.6696, + "step": 836 + }, + { + "epoch": 0.0679769349468042, + "grad_norm": 8.938425585700115, + "learning_rate": 4.981160453731864e-06, + "loss": 0.5597, + "step": 837 + }, + { + "epoch": 0.06805814992284577, + "grad_norm": 5.5327509204918215, + "learning_rate": 4.981079786098365e-06, + "loss": 0.6733, + "step": 838 + }, + { + "epoch": 0.06813936489888736, + "grad_norm": 4.203367930591512, + "learning_rate": 4.980998946787276e-06, + "loss": 0.6717, + "step": 839 + }, + { + "epoch": 0.06822057987492894, + "grad_norm": 4.743933277670276, + "learning_rate": 4.98091793580419e-06, + "loss": 0.6467, + "step": 840 + }, + { + "epoch": 0.06830179485097052, + "grad_norm": 5.519327116976564, + "learning_rate": 4.9808367531547144e-06, + "loss": 0.5875, + "step": 841 + }, + { + "epoch": 0.0683830098270121, + "grad_norm": 5.421051052782078, + "learning_rate": 4.980755398844464e-06, + "loss": 0.5742, + "step": 842 + }, + { + "epoch": 0.06846422480305368, + "grad_norm": 5.205786851548457, + "learning_rate": 4.980673872879069e-06, + "loss": 0.734, + "step": 843 + }, + { + "epoch": 0.06854543977909526, + "grad_norm": 7.795985083428892, + "learning_rate": 4.980592175264172e-06, + "loss": 0.6009, + "step": 844 + }, + { + "epoch": 0.06862665475513685, + "grad_norm": 4.005038090148275, + "learning_rate": 4.9805103060054235e-06, + "loss": 0.5548, + "step": 845 + }, + { + "epoch": 0.06870786973117843, + "grad_norm": 5.440366650852374, + "learning_rate": 4.980428265108491e-06, + "loss": 0.5983, + "step": 846 + }, + { + "epoch": 0.06878908470722, + "grad_norm": 6.265163613283854, + "learning_rate": 4.980346052579049e-06, + "loss": 0.5767, + "step": 847 + }, + { + "epoch": 0.0688702996832616, + "grad_norm": 4.18497253484256, + "learning_rate": 4.9802636684227875e-06, + "loss": 0.6295, + "step": 848 + }, + { + "epoch": 0.06895151465930317, + "grad_norm": 36.607837712422636, + "learning_rate": 4.980181112645407e-06, + "loss": 0.5392, + "step": 849 + }, + { + "epoch": 0.06903272963534476, + "grad_norm": 4.250875496889967, + "learning_rate": 4.9800983852526195e-06, + "loss": 0.7019, + "step": 850 + }, + { + "epoch": 0.06911394461138634, + "grad_norm": 4.20052946210975, + "learning_rate": 4.980015486250149e-06, + "loss": 0.6024, + "step": 851 + }, + { + "epoch": 0.06919515958742792, + "grad_norm": 8.36262034310868, + "learning_rate": 4.979932415643733e-06, + "loss": 0.505, + "step": 852 + }, + { + "epoch": 0.06927637456346951, + "grad_norm": 3.9569054991402206, + "learning_rate": 4.9798491734391185e-06, + "loss": 0.6134, + "step": 853 + }, + { + "epoch": 0.06935758953951109, + "grad_norm": 8.784347823002246, + "learning_rate": 4.9797657596420655e-06, + "loss": 0.6008, + "step": 854 + }, + { + "epoch": 0.06943880451555266, + "grad_norm": 6.296297313209, + "learning_rate": 4.979682174258346e-06, + "loss": 0.5597, + "step": 855 + }, + { + "epoch": 0.06952001949159425, + "grad_norm": 5.72675894986214, + "learning_rate": 4.979598417293743e-06, + "loss": 0.7964, + "step": 856 + }, + { + "epoch": 0.06960123446763583, + "grad_norm": 9.153121757631224, + "learning_rate": 4.979514488754053e-06, + "loss": 0.6276, + "step": 857 + }, + { + "epoch": 0.06968244944367741, + "grad_norm": 3.4432015000564156, + "learning_rate": 4.979430388645083e-06, + "loss": 0.6616, + "step": 858 + }, + { + "epoch": 0.069763664419719, + "grad_norm": 4.626309196229603, + "learning_rate": 4.979346116972653e-06, + "loss": 0.6686, + "step": 859 + }, + { + "epoch": 0.06984487939576058, + "grad_norm": 4.3625603630162555, + "learning_rate": 4.979261673742592e-06, + "loss": 0.7034, + "step": 860 + }, + { + "epoch": 0.06992609437180217, + "grad_norm": 3.987951551641592, + "learning_rate": 4.9791770589607455e-06, + "loss": 0.7321, + "step": 861 + }, + { + "epoch": 0.07000730934784374, + "grad_norm": 4.806090340673884, + "learning_rate": 4.979092272632968e-06, + "loss": 0.6409, + "step": 862 + }, + { + "epoch": 0.07008852432388532, + "grad_norm": 8.252678583793005, + "learning_rate": 4.979007314765124e-06, + "loss": 0.6183, + "step": 863 + }, + { + "epoch": 0.07016973929992691, + "grad_norm": 7.048613756795895, + "learning_rate": 4.978922185363095e-06, + "loss": 0.5649, + "step": 864 + }, + { + "epoch": 0.07025095427596849, + "grad_norm": 7.107709811977146, + "learning_rate": 4.97883688443277e-06, + "loss": 0.7172, + "step": 865 + }, + { + "epoch": 0.07033216925201007, + "grad_norm": 5.254387891978319, + "learning_rate": 4.9787514119800515e-06, + "loss": 0.756, + "step": 866 + }, + { + "epoch": 0.07041338422805166, + "grad_norm": 4.158496862178936, + "learning_rate": 4.9786657680108545e-06, + "loss": 0.4973, + "step": 867 + }, + { + "epoch": 0.07049459920409323, + "grad_norm": 4.086601599664647, + "learning_rate": 4.978579952531104e-06, + "loss": 0.7624, + "step": 868 + }, + { + "epoch": 0.07057581418013481, + "grad_norm": 4.066778591214372, + "learning_rate": 4.978493965546738e-06, + "loss": 0.4797, + "step": 869 + }, + { + "epoch": 0.0706570291561764, + "grad_norm": 4.458730486420523, + "learning_rate": 4.9784078070637076e-06, + "loss": 0.7739, + "step": 870 + }, + { + "epoch": 0.07073824413221798, + "grad_norm": 4.2724723822092425, + "learning_rate": 4.978321477087974e-06, + "loss": 0.5737, + "step": 871 + }, + { + "epoch": 0.07081945910825957, + "grad_norm": 4.54951709242148, + "learning_rate": 4.97823497562551e-06, + "loss": 0.5447, + "step": 872 + }, + { + "epoch": 0.07090067408430115, + "grad_norm": 4.839083574408698, + "learning_rate": 4.978148302682301e-06, + "loss": 0.7224, + "step": 873 + }, + { + "epoch": 0.07098188906034272, + "grad_norm": 5.90958403636814, + "learning_rate": 4.978061458264346e-06, + "loss": 0.6231, + "step": 874 + }, + { + "epoch": 0.07106310403638431, + "grad_norm": 4.950954034585027, + "learning_rate": 4.977974442377652e-06, + "loss": 0.5298, + "step": 875 + }, + { + "epoch": 0.07114431901242589, + "grad_norm": 4.705158462536222, + "learning_rate": 4.977887255028241e-06, + "loss": 0.5893, + "step": 876 + }, + { + "epoch": 0.07122553398846747, + "grad_norm": 6.237199358592749, + "learning_rate": 4.977799896222148e-06, + "loss": 0.6342, + "step": 877 + }, + { + "epoch": 0.07130674896450906, + "grad_norm": 5.842398936618045, + "learning_rate": 4.977712365965414e-06, + "loss": 0.6228, + "step": 878 + }, + { + "epoch": 0.07138796394055064, + "grad_norm": 4.674400039131552, + "learning_rate": 4.9776246642640965e-06, + "loss": 0.639, + "step": 879 + }, + { + "epoch": 0.07146917891659221, + "grad_norm": 6.64025918584984, + "learning_rate": 4.977536791124267e-06, + "loss": 0.6026, + "step": 880 + }, + { + "epoch": 0.0715503938926338, + "grad_norm": 6.036395710793269, + "learning_rate": 4.9774487465520025e-06, + "loss": 0.8198, + "step": 881 + }, + { + "epoch": 0.07163160886867538, + "grad_norm": 4.761294703683568, + "learning_rate": 4.977360530553397e-06, + "loss": 0.5251, + "step": 882 + }, + { + "epoch": 0.07171282384471697, + "grad_norm": 4.922109235469366, + "learning_rate": 4.977272143134554e-06, + "loss": 0.6486, + "step": 883 + }, + { + "epoch": 0.07179403882075855, + "grad_norm": 4.672737077545084, + "learning_rate": 4.97718358430159e-06, + "loss": 0.6457, + "step": 884 + }, + { + "epoch": 0.07187525379680013, + "grad_norm": 8.289465047137561, + "learning_rate": 4.977094854060631e-06, + "loss": 0.6234, + "step": 885 + }, + { + "epoch": 0.07195646877284172, + "grad_norm": 5.587306517848276, + "learning_rate": 4.977005952417818e-06, + "loss": 0.6662, + "step": 886 + }, + { + "epoch": 0.0720376837488833, + "grad_norm": 3.711622831241775, + "learning_rate": 4.9769168793793036e-06, + "loss": 0.7314, + "step": 887 + }, + { + "epoch": 0.07211889872492487, + "grad_norm": 7.5781498502969145, + "learning_rate": 4.976827634951249e-06, + "loss": 0.6133, + "step": 888 + }, + { + "epoch": 0.07220011370096646, + "grad_norm": 4.823514400373474, + "learning_rate": 4.976738219139831e-06, + "loss": 0.6408, + "step": 889 + }, + { + "epoch": 0.07228132867700804, + "grad_norm": 8.761667850604423, + "learning_rate": 4.976648631951236e-06, + "loss": 0.6452, + "step": 890 + }, + { + "epoch": 0.07236254365304962, + "grad_norm": 3.571671073730977, + "learning_rate": 4.976558873391663e-06, + "loss": 0.6598, + "step": 891 + }, + { + "epoch": 0.0724437586290912, + "grad_norm": 5.322524151802511, + "learning_rate": 4.976468943467323e-06, + "loss": 0.5386, + "step": 892 + }, + { + "epoch": 0.07252497360513278, + "grad_norm": 5.66163748773043, + "learning_rate": 4.976378842184439e-06, + "loss": 0.5516, + "step": 893 + }, + { + "epoch": 0.07260618858117437, + "grad_norm": 5.353033651730894, + "learning_rate": 4.9762885695492454e-06, + "loss": 0.6012, + "step": 894 + }, + { + "epoch": 0.07268740355721595, + "grad_norm": 6.855863397519137, + "learning_rate": 4.976198125567988e-06, + "loss": 0.6303, + "step": 895 + }, + { + "epoch": 0.07276861853325753, + "grad_norm": 4.84169944440175, + "learning_rate": 4.976107510246925e-06, + "loss": 0.528, + "step": 896 + }, + { + "epoch": 0.07284983350929912, + "grad_norm": 5.092992144693499, + "learning_rate": 4.976016723592328e-06, + "loss": 0.5648, + "step": 897 + }, + { + "epoch": 0.0729310484853407, + "grad_norm": 4.858637678276697, + "learning_rate": 4.975925765610476e-06, + "loss": 0.5921, + "step": 898 + }, + { + "epoch": 0.07301226346138227, + "grad_norm": 4.995201775650152, + "learning_rate": 4.975834636307667e-06, + "loss": 0.6145, + "step": 899 + }, + { + "epoch": 0.07309347843742386, + "grad_norm": 6.2150093032360365, + "learning_rate": 4.975743335690203e-06, + "loss": 0.5049, + "step": 900 + }, + { + "epoch": 0.07317469341346544, + "grad_norm": 5.737916586640195, + "learning_rate": 4.975651863764403e-06, + "loss": 0.5949, + "step": 901 + }, + { + "epoch": 0.07325590838950702, + "grad_norm": 5.21474627504135, + "learning_rate": 4.975560220536596e-06, + "loss": 0.8498, + "step": 902 + }, + { + "epoch": 0.07333712336554861, + "grad_norm": 3.9495934594747877, + "learning_rate": 4.975468406013124e-06, + "loss": 0.6854, + "step": 903 + }, + { + "epoch": 0.07341833834159019, + "grad_norm": 7.580396023436531, + "learning_rate": 4.97537642020034e-06, + "loss": 0.585, + "step": 904 + }, + { + "epoch": 0.07349955331763178, + "grad_norm": 6.081348176726435, + "learning_rate": 4.9752842631046075e-06, + "loss": 0.5681, + "step": 905 + }, + { + "epoch": 0.07358076829367335, + "grad_norm": 7.6219497191818695, + "learning_rate": 4.975191934732304e-06, + "loss": 0.5283, + "step": 906 + }, + { + "epoch": 0.07366198326971493, + "grad_norm": 7.644719180949304, + "learning_rate": 4.975099435089819e-06, + "loss": 0.544, + "step": 907 + }, + { + "epoch": 0.07374319824575652, + "grad_norm": 4.967279914364186, + "learning_rate": 4.975006764183552e-06, + "loss": 0.6976, + "step": 908 + }, + { + "epoch": 0.0738244132217981, + "grad_norm": 4.727348872362556, + "learning_rate": 4.974913922019916e-06, + "loss": 0.6466, + "step": 909 + }, + { + "epoch": 0.07390562819783968, + "grad_norm": 6.6416921364354895, + "learning_rate": 4.974820908605336e-06, + "loss": 0.5407, + "step": 910 + }, + { + "epoch": 0.07398684317388127, + "grad_norm": 4.934581673705169, + "learning_rate": 4.974727723946245e-06, + "loss": 0.6653, + "step": 911 + }, + { + "epoch": 0.07406805814992284, + "grad_norm": 3.2488987014947286, + "learning_rate": 4.974634368049094e-06, + "loss": 0.5007, + "step": 912 + }, + { + "epoch": 0.07414927312596442, + "grad_norm": 6.227255018738214, + "learning_rate": 4.974540840920341e-06, + "loss": 0.5501, + "step": 913 + }, + { + "epoch": 0.07423048810200601, + "grad_norm": 4.977844807059578, + "learning_rate": 4.974447142566458e-06, + "loss": 0.7246, + "step": 914 + }, + { + "epoch": 0.07431170307804759, + "grad_norm": 4.9582732140375105, + "learning_rate": 4.974353272993929e-06, + "loss": 0.6714, + "step": 915 + }, + { + "epoch": 0.07439291805408918, + "grad_norm": 5.123191354056708, + "learning_rate": 4.974259232209249e-06, + "loss": 0.7354, + "step": 916 + }, + { + "epoch": 0.07447413303013076, + "grad_norm": 5.206554108430837, + "learning_rate": 4.9741650202189245e-06, + "loss": 0.6421, + "step": 917 + }, + { + "epoch": 0.07455534800617233, + "grad_norm": 8.459450047833085, + "learning_rate": 4.9740706370294755e-06, + "loss": 0.8359, + "step": 918 + }, + { + "epoch": 0.07463656298221392, + "grad_norm": 4.78007740463755, + "learning_rate": 4.973976082647432e-06, + "loss": 0.6941, + "step": 919 + }, + { + "epoch": 0.0747177779582555, + "grad_norm": 4.06407753833687, + "learning_rate": 4.9738813570793365e-06, + "loss": 0.6078, + "step": 920 + }, + { + "epoch": 0.07479899293429708, + "grad_norm": 6.4634082754205915, + "learning_rate": 4.973786460331744e-06, + "loss": 0.6072, + "step": 921 + }, + { + "epoch": 0.07488020791033867, + "grad_norm": 4.794171216936538, + "learning_rate": 4.973691392411221e-06, + "loss": 0.6489, + "step": 922 + }, + { + "epoch": 0.07496142288638025, + "grad_norm": 4.048880670381879, + "learning_rate": 4.973596153324346e-06, + "loss": 0.6415, + "step": 923 + }, + { + "epoch": 0.07504263786242182, + "grad_norm": 4.260077508939967, + "learning_rate": 4.973500743077707e-06, + "loss": 0.5398, + "step": 924 + }, + { + "epoch": 0.07512385283846341, + "grad_norm": 4.630184924560269, + "learning_rate": 4.9734051616779085e-06, + "loss": 0.5155, + "step": 925 + }, + { + "epoch": 0.07520506781450499, + "grad_norm": 6.107378791436437, + "learning_rate": 4.973309409131564e-06, + "loss": 0.5784, + "step": 926 + }, + { + "epoch": 0.07528628279054658, + "grad_norm": 4.991440001564157, + "learning_rate": 4.973213485445298e-06, + "loss": 0.6463, + "step": 927 + }, + { + "epoch": 0.07536749776658816, + "grad_norm": 3.9596877989124017, + "learning_rate": 4.973117390625746e-06, + "loss": 0.5694, + "step": 928 + }, + { + "epoch": 0.07544871274262974, + "grad_norm": 5.9210563584390385, + "learning_rate": 4.9730211246795614e-06, + "loss": 0.5044, + "step": 929 + }, + { + "epoch": 0.07552992771867133, + "grad_norm": 6.400935356399938, + "learning_rate": 4.9729246876134015e-06, + "loss": 0.5711, + "step": 930 + }, + { + "epoch": 0.0756111426947129, + "grad_norm": 6.074570162257794, + "learning_rate": 4.9728280794339426e-06, + "loss": 0.7171, + "step": 931 + }, + { + "epoch": 0.07569235767075448, + "grad_norm": 7.525931570888455, + "learning_rate": 4.972731300147867e-06, + "loss": 0.5338, + "step": 932 + }, + { + "epoch": 0.07577357264679607, + "grad_norm": 5.22558586071865, + "learning_rate": 4.972634349761873e-06, + "loss": 0.5591, + "step": 933 + }, + { + "epoch": 0.07585478762283765, + "grad_norm": 10.047768244682082, + "learning_rate": 4.972537228282668e-06, + "loss": 0.6477, + "step": 934 + }, + { + "epoch": 0.07593600259887923, + "grad_norm": 6.876714921725991, + "learning_rate": 4.972439935716972e-06, + "loss": 0.5482, + "step": 935 + }, + { + "epoch": 0.07601721757492082, + "grad_norm": 6.550177057547305, + "learning_rate": 4.972342472071518e-06, + "loss": 0.5923, + "step": 936 + }, + { + "epoch": 0.0760984325509624, + "grad_norm": 5.709520994341491, + "learning_rate": 4.97224483735305e-06, + "loss": 0.6293, + "step": 937 + }, + { + "epoch": 0.07617964752700399, + "grad_norm": 4.727570596744493, + "learning_rate": 4.972147031568322e-06, + "loss": 0.5793, + "step": 938 + }, + { + "epoch": 0.07626086250304556, + "grad_norm": 5.251994860118902, + "learning_rate": 4.972049054724104e-06, + "loss": 0.7869, + "step": 939 + }, + { + "epoch": 0.07634207747908714, + "grad_norm": 7.152014425413055, + "learning_rate": 4.9719509068271755e-06, + "loss": 0.5164, + "step": 940 + }, + { + "epoch": 0.07642329245512873, + "grad_norm": 9.90794133427569, + "learning_rate": 4.971852587884325e-06, + "loss": 0.5625, + "step": 941 + }, + { + "epoch": 0.07650450743117031, + "grad_norm": 5.980631957938455, + "learning_rate": 4.97175409790236e-06, + "loss": 0.6737, + "step": 942 + }, + { + "epoch": 0.07658572240721188, + "grad_norm": 11.229665313768436, + "learning_rate": 4.97165543688809e-06, + "loss": 0.6511, + "step": 943 + }, + { + "epoch": 0.07666693738325348, + "grad_norm": 15.507076859457626, + "learning_rate": 4.971556604848346e-06, + "loss": 0.595, + "step": 944 + }, + { + "epoch": 0.07674815235929505, + "grad_norm": 4.875894631175207, + "learning_rate": 4.971457601789966e-06, + "loss": 0.5992, + "step": 945 + }, + { + "epoch": 0.07682936733533664, + "grad_norm": 4.406952012061629, + "learning_rate": 4.9713584277198e-06, + "loss": 0.5121, + "step": 946 + }, + { + "epoch": 0.07691058231137822, + "grad_norm": 5.211822267384949, + "learning_rate": 4.97125908264471e-06, + "loss": 0.521, + "step": 947 + }, + { + "epoch": 0.0769917972874198, + "grad_norm": 6.646319909846443, + "learning_rate": 4.97115956657157e-06, + "loss": 0.5977, + "step": 948 + }, + { + "epoch": 0.07707301226346139, + "grad_norm": 5.951258269254235, + "learning_rate": 4.971059879507268e-06, + "loss": 0.6193, + "step": 949 + }, + { + "epoch": 0.07715422723950296, + "grad_norm": 15.849437156314497, + "learning_rate": 4.970960021458699e-06, + "loss": 0.7991, + "step": 950 + }, + { + "epoch": 0.07723544221554454, + "grad_norm": 4.470056142523615, + "learning_rate": 4.9708599924327735e-06, + "loss": 0.5992, + "step": 951 + }, + { + "epoch": 0.07731665719158613, + "grad_norm": 7.357239226247135, + "learning_rate": 4.970759792436414e-06, + "loss": 0.574, + "step": 952 + }, + { + "epoch": 0.07739787216762771, + "grad_norm": 6.193524739615741, + "learning_rate": 4.970659421476553e-06, + "loss": 0.6871, + "step": 953 + }, + { + "epoch": 0.07747908714366929, + "grad_norm": 6.192316599355833, + "learning_rate": 4.970558879560137e-06, + "loss": 0.6069, + "step": 954 + }, + { + "epoch": 0.07756030211971088, + "grad_norm": 7.025324708867777, + "learning_rate": 4.97045816669412e-06, + "loss": 0.45, + "step": 955 + }, + { + "epoch": 0.07764151709575245, + "grad_norm": 5.808792647688984, + "learning_rate": 4.970357282885473e-06, + "loss": 0.7007, + "step": 956 + }, + { + "epoch": 0.07772273207179405, + "grad_norm": 5.800444379893061, + "learning_rate": 4.970256228141177e-06, + "loss": 0.5379, + "step": 957 + }, + { + "epoch": 0.07780394704783562, + "grad_norm": 4.764091635977128, + "learning_rate": 4.970155002468223e-06, + "loss": 0.6805, + "step": 958 + }, + { + "epoch": 0.0778851620238772, + "grad_norm": 5.223528426735964, + "learning_rate": 4.970053605873616e-06, + "loss": 0.6757, + "step": 959 + }, + { + "epoch": 0.07796637699991879, + "grad_norm": 7.117293048534837, + "learning_rate": 4.969952038364372e-06, + "loss": 0.5716, + "step": 960 + }, + { + "epoch": 0.07804759197596037, + "grad_norm": 8.849272822881796, + "learning_rate": 4.96985029994752e-06, + "loss": 0.7158, + "step": 961 + }, + { + "epoch": 0.07812880695200194, + "grad_norm": 4.323247038552405, + "learning_rate": 4.969748390630097e-06, + "loss": 0.7014, + "step": 962 + }, + { + "epoch": 0.07821002192804354, + "grad_norm": 3.15078482423997, + "learning_rate": 4.969646310419157e-06, + "loss": 0.6552, + "step": 963 + }, + { + "epoch": 0.07829123690408511, + "grad_norm": 5.142051674034742, + "learning_rate": 4.9695440593217635e-06, + "loss": 0.5731, + "step": 964 + }, + { + "epoch": 0.07837245188012669, + "grad_norm": 4.530754006814891, + "learning_rate": 4.96944163734499e-06, + "loss": 0.4648, + "step": 965 + }, + { + "epoch": 0.07845366685616828, + "grad_norm": 5.999396192869684, + "learning_rate": 4.969339044495925e-06, + "loss": 0.6981, + "step": 966 + }, + { + "epoch": 0.07853488183220986, + "grad_norm": 4.570316628434694, + "learning_rate": 4.969236280781667e-06, + "loss": 0.5834, + "step": 967 + }, + { + "epoch": 0.07861609680825145, + "grad_norm": 29.36059594145154, + "learning_rate": 4.9691333462093264e-06, + "loss": 0.6442, + "step": 968 + }, + { + "epoch": 0.07869731178429303, + "grad_norm": 4.95470560540118, + "learning_rate": 4.969030240786026e-06, + "loss": 0.4758, + "step": 969 + }, + { + "epoch": 0.0787785267603346, + "grad_norm": 3.893402872277116, + "learning_rate": 4.9689269645189e-06, + "loss": 0.6544, + "step": 970 + }, + { + "epoch": 0.0788597417363762, + "grad_norm": 5.3148301362547885, + "learning_rate": 4.968823517415095e-06, + "loss": 0.611, + "step": 971 + }, + { + "epoch": 0.07894095671241777, + "grad_norm": 5.03652657763463, + "learning_rate": 4.9687198994817685e-06, + "loss": 0.6703, + "step": 972 + }, + { + "epoch": 0.07902217168845935, + "grad_norm": 4.100966531743051, + "learning_rate": 4.9686161107260906e-06, + "loss": 0.5816, + "step": 973 + }, + { + "epoch": 0.07910338666450094, + "grad_norm": 5.116827473240173, + "learning_rate": 4.968512151155242e-06, + "loss": 0.5917, + "step": 974 + }, + { + "epoch": 0.07918460164054252, + "grad_norm": 3.2335435030359596, + "learning_rate": 4.968408020776419e-06, + "loss": 0.5538, + "step": 975 + }, + { + "epoch": 0.07926581661658409, + "grad_norm": 7.5591035161901035, + "learning_rate": 4.968303719596823e-06, + "loss": 0.461, + "step": 976 + }, + { + "epoch": 0.07934703159262568, + "grad_norm": 10.075832303176476, + "learning_rate": 4.9681992476236725e-06, + "loss": 0.6354, + "step": 977 + }, + { + "epoch": 0.07942824656866726, + "grad_norm": 5.561625086524589, + "learning_rate": 4.968094604864198e-06, + "loss": 0.5383, + "step": 978 + }, + { + "epoch": 0.07950946154470885, + "grad_norm": 5.6568874477353095, + "learning_rate": 4.967989791325639e-06, + "loss": 0.6593, + "step": 979 + }, + { + "epoch": 0.07959067652075043, + "grad_norm": 6.416186383122935, + "learning_rate": 4.967884807015247e-06, + "loss": 0.833, + "step": 980 + }, + { + "epoch": 0.079671891496792, + "grad_norm": 5.7530564697103355, + "learning_rate": 4.967779651940289e-06, + "loss": 0.7025, + "step": 981 + }, + { + "epoch": 0.0797531064728336, + "grad_norm": 6.850642228085424, + "learning_rate": 4.967674326108039e-06, + "loss": 0.5582, + "step": 982 + }, + { + "epoch": 0.07983432144887517, + "grad_norm": 4.208095970275836, + "learning_rate": 4.9675688295257855e-06, + "loss": 0.514, + "step": 983 + }, + { + "epoch": 0.07991553642491675, + "grad_norm": 5.1148065579680075, + "learning_rate": 4.967463162200828e-06, + "loss": 0.6708, + "step": 984 + }, + { + "epoch": 0.07999675140095834, + "grad_norm": 7.806584044546556, + "learning_rate": 4.967357324140479e-06, + "loss": 0.639, + "step": 985 + }, + { + "epoch": 0.08007796637699992, + "grad_norm": 7.822904916898733, + "learning_rate": 4.967251315352062e-06, + "loss": 0.7296, + "step": 986 + }, + { + "epoch": 0.0801591813530415, + "grad_norm": 4.251509260453048, + "learning_rate": 4.9671451358429115e-06, + "loss": 0.6169, + "step": 987 + }, + { + "epoch": 0.08024039632908309, + "grad_norm": 3.9712450920352014, + "learning_rate": 4.967038785620374e-06, + "loss": 0.6307, + "step": 988 + }, + { + "epoch": 0.08032161130512466, + "grad_norm": 5.013282143950785, + "learning_rate": 4.96693226469181e-06, + "loss": 0.6896, + "step": 989 + }, + { + "epoch": 0.08040282628116625, + "grad_norm": 6.309239309855388, + "learning_rate": 4.966825573064589e-06, + "loss": 0.5816, + "step": 990 + }, + { + "epoch": 0.08048404125720783, + "grad_norm": 17.18761480554951, + "learning_rate": 4.9667187107460934e-06, + "loss": 0.7024, + "step": 991 + }, + { + "epoch": 0.08056525623324941, + "grad_norm": 8.511668880504516, + "learning_rate": 4.966611677743719e-06, + "loss": 0.6545, + "step": 992 + }, + { + "epoch": 0.080646471209291, + "grad_norm": 7.07740712784662, + "learning_rate": 4.96650447406487e-06, + "loss": 0.6241, + "step": 993 + }, + { + "epoch": 0.08072768618533258, + "grad_norm": 6.4092146264144505, + "learning_rate": 4.966397099716965e-06, + "loss": 0.6096, + "step": 994 + }, + { + "epoch": 0.08080890116137415, + "grad_norm": 3.371066265022369, + "learning_rate": 4.9662895547074345e-06, + "loss": 0.6807, + "step": 995 + }, + { + "epoch": 0.08089011613741574, + "grad_norm": 11.15283861894202, + "learning_rate": 4.96618183904372e-06, + "loss": 0.69, + "step": 996 + }, + { + "epoch": 0.08097133111345732, + "grad_norm": 5.584728402833689, + "learning_rate": 4.966073952733273e-06, + "loss": 0.6545, + "step": 997 + }, + { + "epoch": 0.0810525460894989, + "grad_norm": 4.928404998187488, + "learning_rate": 4.965965895783561e-06, + "loss": 0.7363, + "step": 998 + }, + { + "epoch": 0.08113376106554049, + "grad_norm": 5.18162526836767, + "learning_rate": 4.96585766820206e-06, + "loss": 0.7405, + "step": 999 + }, + { + "epoch": 0.08121497604158207, + "grad_norm": 5.108535543984558, + "learning_rate": 4.965749269996258e-06, + "loss": 0.4666, + "step": 1000 + }, + { + "epoch": 0.08129619101762366, + "grad_norm": 4.081215207318422, + "learning_rate": 4.965640701173657e-06, + "loss": 0.6248, + "step": 1001 + }, + { + "epoch": 0.08137740599366523, + "grad_norm": 4.819602648972772, + "learning_rate": 4.9655319617417674e-06, + "loss": 0.6029, + "step": 1002 + }, + { + "epoch": 0.08145862096970681, + "grad_norm": 3.672836794950573, + "learning_rate": 4.965423051708116e-06, + "loss": 0.7782, + "step": 1003 + }, + { + "epoch": 0.0815398359457484, + "grad_norm": 4.726346322374398, + "learning_rate": 4.965313971080237e-06, + "loss": 0.5324, + "step": 1004 + }, + { + "epoch": 0.08162105092178998, + "grad_norm": 5.155470667426197, + "learning_rate": 4.96520471986568e-06, + "loss": 0.666, + "step": 1005 + }, + { + "epoch": 0.08170226589783156, + "grad_norm": 4.923143643344893, + "learning_rate": 4.965095298072001e-06, + "loss": 0.5542, + "step": 1006 + }, + { + "epoch": 0.08178348087387315, + "grad_norm": 4.1341959962667305, + "learning_rate": 4.964985705706775e-06, + "loss": 0.5911, + "step": 1007 + }, + { + "epoch": 0.08186469584991472, + "grad_norm": 5.790946529810634, + "learning_rate": 4.964875942777584e-06, + "loss": 0.7514, + "step": 1008 + }, + { + "epoch": 0.0819459108259563, + "grad_norm": 5.188744184884793, + "learning_rate": 4.964766009292022e-06, + "loss": 0.704, + "step": 1009 + }, + { + "epoch": 0.08202712580199789, + "grad_norm": 9.318471720754397, + "learning_rate": 4.9646559052576985e-06, + "loss": 0.6622, + "step": 1010 + }, + { + "epoch": 0.08210834077803947, + "grad_norm": 4.902314685975934, + "learning_rate": 4.9645456306822285e-06, + "loss": 0.5447, + "step": 1011 + }, + { + "epoch": 0.08218955575408106, + "grad_norm": 5.828651116573771, + "learning_rate": 4.964435185573245e-06, + "loss": 0.5288, + "step": 1012 + }, + { + "epoch": 0.08227077073012264, + "grad_norm": 2.864291283393185, + "learning_rate": 4.96432456993839e-06, + "loss": 0.6403, + "step": 1013 + }, + { + "epoch": 0.08235198570616421, + "grad_norm": 5.284219787159016, + "learning_rate": 4.964213783785317e-06, + "loss": 0.6291, + "step": 1014 + }, + { + "epoch": 0.0824332006822058, + "grad_norm": 4.43415065344841, + "learning_rate": 4.9641028271216905e-06, + "loss": 0.8372, + "step": 1015 + }, + { + "epoch": 0.08251441565824738, + "grad_norm": 4.37746143116903, + "learning_rate": 4.9639916999551905e-06, + "loss": 0.5823, + "step": 1016 + }, + { + "epoch": 0.08259563063428896, + "grad_norm": 15.987254695136924, + "learning_rate": 4.963880402293506e-06, + "loss": 0.6104, + "step": 1017 + }, + { + "epoch": 0.08267684561033055, + "grad_norm": 5.7244481019012925, + "learning_rate": 4.963768934144336e-06, + "loss": 0.7552, + "step": 1018 + }, + { + "epoch": 0.08275806058637213, + "grad_norm": 4.217271904051636, + "learning_rate": 4.963657295515396e-06, + "loss": 0.6159, + "step": 1019 + }, + { + "epoch": 0.0828392755624137, + "grad_norm": 7.382613542004634, + "learning_rate": 4.963545486414411e-06, + "loss": 0.6277, + "step": 1020 + }, + { + "epoch": 0.0829204905384553, + "grad_norm": 5.353844268940195, + "learning_rate": 4.963433506849115e-06, + "loss": 0.738, + "step": 1021 + }, + { + "epoch": 0.08300170551449687, + "grad_norm": 10.18166117587427, + "learning_rate": 4.963321356827258e-06, + "loss": 0.6173, + "step": 1022 + }, + { + "epoch": 0.08308292049053846, + "grad_norm": 7.319414111585189, + "learning_rate": 4.9632090363565995e-06, + "loss": 0.6377, + "step": 1023 + }, + { + "epoch": 0.08316413546658004, + "grad_norm": 6.252416872046548, + "learning_rate": 4.963096545444913e-06, + "loss": 0.5095, + "step": 1024 + }, + { + "epoch": 0.08324535044262162, + "grad_norm": 4.976076953773711, + "learning_rate": 4.962983884099981e-06, + "loss": 0.5225, + "step": 1025 + }, + { + "epoch": 0.0833265654186632, + "grad_norm": 6.865653287550914, + "learning_rate": 4.9628710523296e-06, + "loss": 0.5425, + "step": 1026 + }, + { + "epoch": 0.08340778039470478, + "grad_norm": 5.6514607039180005, + "learning_rate": 4.962758050141576e-06, + "loss": 0.5492, + "step": 1027 + }, + { + "epoch": 0.08348899537074636, + "grad_norm": 8.200634823957119, + "learning_rate": 4.962644877543729e-06, + "loss": 0.5683, + "step": 1028 + }, + { + "epoch": 0.08357021034678795, + "grad_norm": 4.089877382417978, + "learning_rate": 4.96253153454389e-06, + "loss": 0.7186, + "step": 1029 + }, + { + "epoch": 0.08365142532282953, + "grad_norm": 4.918100288171265, + "learning_rate": 4.9624180211499004e-06, + "loss": 0.4817, + "step": 1030 + }, + { + "epoch": 0.0837326402988711, + "grad_norm": 5.979359137308161, + "learning_rate": 4.962304337369618e-06, + "loss": 0.575, + "step": 1031 + }, + { + "epoch": 0.0838138552749127, + "grad_norm": 5.9147018999524645, + "learning_rate": 4.962190483210906e-06, + "loss": 0.5979, + "step": 1032 + }, + { + "epoch": 0.08389507025095427, + "grad_norm": 3.8882956927363614, + "learning_rate": 4.962076458681642e-06, + "loss": 0.5231, + "step": 1033 + }, + { + "epoch": 0.08397628522699586, + "grad_norm": 8.61999834460005, + "learning_rate": 4.96196226378972e-06, + "loss": 0.6006, + "step": 1034 + }, + { + "epoch": 0.08405750020303744, + "grad_norm": 4.813530840618122, + "learning_rate": 4.961847898543038e-06, + "loss": 0.7872, + "step": 1035 + }, + { + "epoch": 0.08413871517907902, + "grad_norm": 6.218523263233032, + "learning_rate": 4.96173336294951e-06, + "loss": 0.6882, + "step": 1036 + }, + { + "epoch": 0.08421993015512061, + "grad_norm": 5.495336585299689, + "learning_rate": 4.961618657017063e-06, + "loss": 0.828, + "step": 1037 + }, + { + "epoch": 0.08430114513116219, + "grad_norm": 6.714525494166789, + "learning_rate": 4.961503780753633e-06, + "loss": 0.8362, + "step": 1038 + }, + { + "epoch": 0.08438236010720376, + "grad_norm": 3.212772209979488, + "learning_rate": 4.9613887341671675e-06, + "loss": 0.4224, + "step": 1039 + }, + { + "epoch": 0.08446357508324535, + "grad_norm": 7.183836025951974, + "learning_rate": 4.961273517265629e-06, + "loss": 0.5646, + "step": 1040 + }, + { + "epoch": 0.08454479005928693, + "grad_norm": 6.286950241807605, + "learning_rate": 4.961158130056989e-06, + "loss": 0.5711, + "step": 1041 + }, + { + "epoch": 0.08462600503532851, + "grad_norm": 5.85967031646826, + "learning_rate": 4.961042572549232e-06, + "loss": 0.4819, + "step": 1042 + }, + { + "epoch": 0.0847072200113701, + "grad_norm": 4.788980234170179, + "learning_rate": 4.960926844750353e-06, + "loss": 0.5843, + "step": 1043 + }, + { + "epoch": 0.08478843498741168, + "grad_norm": 6.955485093136964, + "learning_rate": 4.960810946668362e-06, + "loss": 0.5618, + "step": 1044 + }, + { + "epoch": 0.08486964996345327, + "grad_norm": 5.484890003960757, + "learning_rate": 4.960694878311276e-06, + "loss": 0.6486, + "step": 1045 + }, + { + "epoch": 0.08495086493949484, + "grad_norm": 5.542258816933063, + "learning_rate": 4.960578639687129e-06, + "loss": 0.4978, + "step": 1046 + }, + { + "epoch": 0.08503207991553642, + "grad_norm": 5.931712433067937, + "learning_rate": 4.960462230803961e-06, + "loss": 0.6843, + "step": 1047 + }, + { + "epoch": 0.08511329489157801, + "grad_norm": 5.759837139562491, + "learning_rate": 4.960345651669829e-06, + "loss": 0.5678, + "step": 1048 + }, + { + "epoch": 0.08519450986761959, + "grad_norm": 8.019301092937635, + "learning_rate": 4.960228902292799e-06, + "loss": 0.5629, + "step": 1049 + }, + { + "epoch": 0.08527572484366117, + "grad_norm": 4.806150258046436, + "learning_rate": 4.96011198268095e-06, + "loss": 0.4966, + "step": 1050 + }, + { + "epoch": 0.08535693981970276, + "grad_norm": 6.278721005688989, + "learning_rate": 4.959994892842371e-06, + "loss": 0.6009, + "step": 1051 + }, + { + "epoch": 0.08543815479574433, + "grad_norm": 4.006330485950786, + "learning_rate": 4.959877632785166e-06, + "loss": 0.538, + "step": 1052 + }, + { + "epoch": 0.08551936977178591, + "grad_norm": 6.0015668910114, + "learning_rate": 4.959760202517446e-06, + "loss": 0.5693, + "step": 1053 + }, + { + "epoch": 0.0856005847478275, + "grad_norm": 17.347793241402837, + "learning_rate": 4.959642602047339e-06, + "loss": 0.5231, + "step": 1054 + }, + { + "epoch": 0.08568179972386908, + "grad_norm": 4.788609767567765, + "learning_rate": 4.959524831382981e-06, + "loss": 0.6282, + "step": 1055 + }, + { + "epoch": 0.08576301469991067, + "grad_norm": 10.272501989404129, + "learning_rate": 4.9594068905325225e-06, + "loss": 0.7099, + "step": 1056 + }, + { + "epoch": 0.08584422967595225, + "grad_norm": 5.360284497181169, + "learning_rate": 4.959288779504122e-06, + "loss": 0.5964, + "step": 1057 + }, + { + "epoch": 0.08592544465199382, + "grad_norm": 4.376816971311538, + "learning_rate": 4.959170498305955e-06, + "loss": 0.6034, + "step": 1058 + }, + { + "epoch": 0.08600665962803541, + "grad_norm": 7.410246905461928, + "learning_rate": 4.959052046946203e-06, + "loss": 0.5496, + "step": 1059 + }, + { + "epoch": 0.08608787460407699, + "grad_norm": 3.455502795284755, + "learning_rate": 4.958933425433065e-06, + "loss": 0.6964, + "step": 1060 + }, + { + "epoch": 0.08616908958011857, + "grad_norm": 5.754779871120552, + "learning_rate": 4.958814633774747e-06, + "loss": 0.5759, + "step": 1061 + }, + { + "epoch": 0.08625030455616016, + "grad_norm": 5.304809027548407, + "learning_rate": 4.95869567197947e-06, + "loss": 0.807, + "step": 1062 + }, + { + "epoch": 0.08633151953220174, + "grad_norm": 6.315153689798026, + "learning_rate": 4.958576540055464e-06, + "loss": 0.6127, + "step": 1063 + }, + { + "epoch": 0.08641273450824331, + "grad_norm": 5.159058407796672, + "learning_rate": 4.958457238010974e-06, + "loss": 0.5677, + "step": 1064 + }, + { + "epoch": 0.0864939494842849, + "grad_norm": 6.440705718742274, + "learning_rate": 4.958337765854254e-06, + "loss": 0.6875, + "step": 1065 + }, + { + "epoch": 0.08657516446032648, + "grad_norm": 5.084236342123366, + "learning_rate": 4.958218123593572e-06, + "loss": 0.7366, + "step": 1066 + }, + { + "epoch": 0.08665637943636807, + "grad_norm": 3.7240825747904447, + "learning_rate": 4.958098311237205e-06, + "loss": 0.674, + "step": 1067 + }, + { + "epoch": 0.08673759441240965, + "grad_norm": 5.058020669894491, + "learning_rate": 4.9579783287934445e-06, + "loss": 0.6569, + "step": 1068 + }, + { + "epoch": 0.08681880938845123, + "grad_norm": 4.0274556261885515, + "learning_rate": 4.957858176270591e-06, + "loss": 0.6855, + "step": 1069 + }, + { + "epoch": 0.08690002436449282, + "grad_norm": 7.179073356157699, + "learning_rate": 4.957737853676961e-06, + "loss": 0.58, + "step": 1070 + }, + { + "epoch": 0.0869812393405344, + "grad_norm": 4.03973215146239, + "learning_rate": 4.957617361020879e-06, + "loss": 0.515, + "step": 1071 + }, + { + "epoch": 0.08706245431657597, + "grad_norm": 7.191602127334415, + "learning_rate": 4.9574966983106824e-06, + "loss": 0.7152, + "step": 1072 + }, + { + "epoch": 0.08714366929261756, + "grad_norm": 4.274773215099363, + "learning_rate": 4.95737586555472e-06, + "loss": 0.6789, + "step": 1073 + }, + { + "epoch": 0.08722488426865914, + "grad_norm": 4.114285344199829, + "learning_rate": 4.957254862761354e-06, + "loss": 0.7913, + "step": 1074 + }, + { + "epoch": 0.08730609924470072, + "grad_norm": 6.861324021554996, + "learning_rate": 4.957133689938955e-06, + "loss": 0.7393, + "step": 1075 + }, + { + "epoch": 0.08738731422074231, + "grad_norm": 6.978671101537034, + "learning_rate": 4.95701234709591e-06, + "loss": 0.6303, + "step": 1076 + }, + { + "epoch": 0.08746852919678388, + "grad_norm": 9.578370134137916, + "learning_rate": 4.956890834240613e-06, + "loss": 0.6739, + "step": 1077 + }, + { + "epoch": 0.08754974417282547, + "grad_norm": 4.953780236460816, + "learning_rate": 4.956769151381474e-06, + "loss": 0.6609, + "step": 1078 + }, + { + "epoch": 0.08763095914886705, + "grad_norm": 6.102476761076953, + "learning_rate": 4.9566472985269125e-06, + "loss": 0.5512, + "step": 1079 + }, + { + "epoch": 0.08771217412490863, + "grad_norm": 4.373415052928376, + "learning_rate": 4.956525275685358e-06, + "loss": 0.5459, + "step": 1080 + }, + { + "epoch": 0.08779338910095022, + "grad_norm": 5.532793766061284, + "learning_rate": 4.9564030828652565e-06, + "loss": 0.6867, + "step": 1081 + }, + { + "epoch": 0.0878746040769918, + "grad_norm": 3.8285573439072254, + "learning_rate": 4.956280720075062e-06, + "loss": 0.7538, + "step": 1082 + }, + { + "epoch": 0.08795581905303337, + "grad_norm": 6.492766812771604, + "learning_rate": 4.9561581873232415e-06, + "loss": 0.5405, + "step": 1083 + }, + { + "epoch": 0.08803703402907496, + "grad_norm": 12.542445075286674, + "learning_rate": 4.956035484618272e-06, + "loss": 0.5867, + "step": 1084 + }, + { + "epoch": 0.08811824900511654, + "grad_norm": 5.8788039197140005, + "learning_rate": 4.955912611968648e-06, + "loss": 0.5364, + "step": 1085 + }, + { + "epoch": 0.08819946398115812, + "grad_norm": 6.820020186044584, + "learning_rate": 4.955789569382866e-06, + "loss": 0.5554, + "step": 1086 + }, + { + "epoch": 0.08828067895719971, + "grad_norm": 5.44097898708547, + "learning_rate": 4.955666356869445e-06, + "loss": 0.6424, + "step": 1087 + }, + { + "epoch": 0.08836189393324129, + "grad_norm": 6.451838410522676, + "learning_rate": 4.955542974436908e-06, + "loss": 0.6376, + "step": 1088 + }, + { + "epoch": 0.08844310890928288, + "grad_norm": 16.847875335305353, + "learning_rate": 4.955419422093792e-06, + "loss": 0.5683, + "step": 1089 + }, + { + "epoch": 0.08852432388532445, + "grad_norm": 5.250679262222032, + "learning_rate": 4.955295699848649e-06, + "loss": 0.5726, + "step": 1090 + }, + { + "epoch": 0.08860553886136603, + "grad_norm": 3.708739891574932, + "learning_rate": 4.955171807710037e-06, + "loss": 0.664, + "step": 1091 + }, + { + "epoch": 0.08868675383740762, + "grad_norm": 6.291250688681625, + "learning_rate": 4.955047745686529e-06, + "loss": 0.5727, + "step": 1092 + }, + { + "epoch": 0.0887679688134492, + "grad_norm": 9.32506667375153, + "learning_rate": 4.954923513786711e-06, + "loss": 0.6444, + "step": 1093 + }, + { + "epoch": 0.08884918378949078, + "grad_norm": 5.9130602901334735, + "learning_rate": 4.954799112019178e-06, + "loss": 0.6713, + "step": 1094 + }, + { + "epoch": 0.08893039876553237, + "grad_norm": 7.27169259231534, + "learning_rate": 4.9546745403925385e-06, + "loss": 0.598, + "step": 1095 + }, + { + "epoch": 0.08901161374157394, + "grad_norm": 4.39443693321864, + "learning_rate": 4.954549798915412e-06, + "loss": 0.5987, + "step": 1096 + }, + { + "epoch": 0.08909282871761552, + "grad_norm": 5.108191362159325, + "learning_rate": 4.95442488759643e-06, + "loss": 0.5794, + "step": 1097 + }, + { + "epoch": 0.08917404369365711, + "grad_norm": 4.394532094744748, + "learning_rate": 4.954299806444236e-06, + "loss": 0.6292, + "step": 1098 + }, + { + "epoch": 0.08925525866969869, + "grad_norm": 5.785973974044764, + "learning_rate": 4.954174555467484e-06, + "loss": 0.7976, + "step": 1099 + }, + { + "epoch": 0.08933647364574028, + "grad_norm": 8.854881338118322, + "learning_rate": 4.954049134674842e-06, + "loss": 0.4992, + "step": 1100 + }, + { + "epoch": 0.08941768862178186, + "grad_norm": 10.211232540471157, + "learning_rate": 4.953923544074987e-06, + "loss": 0.612, + "step": 1101 + }, + { + "epoch": 0.08949890359782343, + "grad_norm": 3.8402408710819724, + "learning_rate": 4.953797783676611e-06, + "loss": 0.7663, + "step": 1102 + }, + { + "epoch": 0.08958011857386503, + "grad_norm": 4.979965541200029, + "learning_rate": 4.9536718534884136e-06, + "loss": 0.5334, + "step": 1103 + }, + { + "epoch": 0.0896613335499066, + "grad_norm": 5.601550825200515, + "learning_rate": 4.9535457535191104e-06, + "loss": 0.7872, + "step": 1104 + }, + { + "epoch": 0.08974254852594818, + "grad_norm": 7.030742327798493, + "learning_rate": 4.953419483777427e-06, + "loss": 0.5551, + "step": 1105 + }, + { + "epoch": 0.08982376350198977, + "grad_norm": 11.753328029891325, + "learning_rate": 4.953293044272099e-06, + "loss": 0.5917, + "step": 1106 + }, + { + "epoch": 0.08990497847803135, + "grad_norm": 4.23164973424406, + "learning_rate": 4.953166435011876e-06, + "loss": 0.7211, + "step": 1107 + }, + { + "epoch": 0.08998619345407292, + "grad_norm": 4.427467426240099, + "learning_rate": 4.953039656005519e-06, + "loss": 0.725, + "step": 1108 + }, + { + "epoch": 0.09006740843011452, + "grad_norm": 6.656857308336458, + "learning_rate": 4.9529127072618e-06, + "loss": 0.7282, + "step": 1109 + }, + { + "epoch": 0.09014862340615609, + "grad_norm": 4.606494946768081, + "learning_rate": 4.952785588789504e-06, + "loss": 0.6008, + "step": 1110 + }, + { + "epoch": 0.09022983838219768, + "grad_norm": 4.861377291580884, + "learning_rate": 4.9526583005974275e-06, + "loss": 0.7185, + "step": 1111 + }, + { + "epoch": 0.09031105335823926, + "grad_norm": 5.1172810841957785, + "learning_rate": 4.952530842694375e-06, + "loss": 0.451, + "step": 1112 + }, + { + "epoch": 0.09039226833428084, + "grad_norm": 5.226532353483265, + "learning_rate": 4.95240321508917e-06, + "loss": 0.658, + "step": 1113 + }, + { + "epoch": 0.09047348331032243, + "grad_norm": 5.970819410829736, + "learning_rate": 4.952275417790641e-06, + "loss": 0.6415, + "step": 1114 + }, + { + "epoch": 0.090554698286364, + "grad_norm": 5.235598768790702, + "learning_rate": 4.95214745080763e-06, + "loss": 0.5784, + "step": 1115 + }, + { + "epoch": 0.09063591326240558, + "grad_norm": 4.456877671742799, + "learning_rate": 4.952019314148995e-06, + "loss": 0.6458, + "step": 1116 + }, + { + "epoch": 0.09071712823844717, + "grad_norm": 5.531350494822164, + "learning_rate": 4.951891007823601e-06, + "loss": 0.7039, + "step": 1117 + }, + { + "epoch": 0.09079834321448875, + "grad_norm": 3.056795429652102, + "learning_rate": 4.951762531840325e-06, + "loss": 0.6661, + "step": 1118 + }, + { + "epoch": 0.09087955819053034, + "grad_norm": 5.721632966434159, + "learning_rate": 4.951633886208057e-06, + "loss": 0.591, + "step": 1119 + }, + { + "epoch": 0.09096077316657192, + "grad_norm": 8.349255186085697, + "learning_rate": 4.951505070935699e-06, + "loss": 0.6245, + "step": 1120 + }, + { + "epoch": 0.0910419881426135, + "grad_norm": 4.572535564011483, + "learning_rate": 4.951376086032166e-06, + "loss": 0.7326, + "step": 1121 + }, + { + "epoch": 0.09112320311865509, + "grad_norm": 4.793988248982188, + "learning_rate": 4.95124693150638e-06, + "loss": 0.6501, + "step": 1122 + }, + { + "epoch": 0.09120441809469666, + "grad_norm": 4.303548964129903, + "learning_rate": 4.951117607367281e-06, + "loss": 0.507, + "step": 1123 + }, + { + "epoch": 0.09128563307073824, + "grad_norm": 3.9997657093256613, + "learning_rate": 4.9509881136238144e-06, + "loss": 0.647, + "step": 1124 + }, + { + "epoch": 0.09136684804677983, + "grad_norm": 6.458649539085403, + "learning_rate": 4.950858450284943e-06, + "loss": 0.7264, + "step": 1125 + }, + { + "epoch": 0.09144806302282141, + "grad_norm": 8.005259138006465, + "learning_rate": 4.950728617359637e-06, + "loss": 0.7207, + "step": 1126 + }, + { + "epoch": 0.09152927799886298, + "grad_norm": 10.787406113255107, + "learning_rate": 4.950598614856882e-06, + "loss": 0.7295, + "step": 1127 + }, + { + "epoch": 0.09161049297490458, + "grad_norm": 7.384428417264019, + "learning_rate": 4.950468442785672e-06, + "loss": 0.7061, + "step": 1128 + }, + { + "epoch": 0.09169170795094615, + "grad_norm": 6.688743726208685, + "learning_rate": 4.9503381011550145e-06, + "loss": 0.5785, + "step": 1129 + }, + { + "epoch": 0.09177292292698774, + "grad_norm": 7.409666261351277, + "learning_rate": 4.950207589973929e-06, + "loss": 0.6196, + "step": 1130 + }, + { + "epoch": 0.09185413790302932, + "grad_norm": 4.151052125921888, + "learning_rate": 4.950076909251445e-06, + "loss": 0.6793, + "step": 1131 + }, + { + "epoch": 0.0919353528790709, + "grad_norm": 6.945429669541111, + "learning_rate": 4.949946058996606e-06, + "loss": 0.7033, + "step": 1132 + }, + { + "epoch": 0.09201656785511249, + "grad_norm": 5.827624293232605, + "learning_rate": 4.949815039218467e-06, + "loss": 0.673, + "step": 1133 + }, + { + "epoch": 0.09209778283115407, + "grad_norm": 7.170155218321833, + "learning_rate": 4.949683849926092e-06, + "loss": 0.5155, + "step": 1134 + }, + { + "epoch": 0.09217899780719564, + "grad_norm": 4.143518961770841, + "learning_rate": 4.949552491128559e-06, + "loss": 0.4783, + "step": 1135 + }, + { + "epoch": 0.09226021278323723, + "grad_norm": 4.759778607114406, + "learning_rate": 4.9494209628349585e-06, + "loss": 0.7102, + "step": 1136 + }, + { + "epoch": 0.09234142775927881, + "grad_norm": 4.218386626424805, + "learning_rate": 4.94928926505439e-06, + "loss": 0.7196, + "step": 1137 + }, + { + "epoch": 0.09242264273532039, + "grad_norm": 3.168695853330223, + "learning_rate": 4.949157397795967e-06, + "loss": 0.5415, + "step": 1138 + }, + { + "epoch": 0.09250385771136198, + "grad_norm": 3.826938431817392, + "learning_rate": 4.949025361068814e-06, + "loss": 0.7162, + "step": 1139 + }, + { + "epoch": 0.09258507268740356, + "grad_norm": 6.290926748077408, + "learning_rate": 4.9488931548820685e-06, + "loss": 0.5282, + "step": 1140 + }, + { + "epoch": 0.09266628766344515, + "grad_norm": 7.032750557333736, + "learning_rate": 4.9487607792448765e-06, + "loss": 0.5308, + "step": 1141 + }, + { + "epoch": 0.09274750263948672, + "grad_norm": 5.947078730213946, + "learning_rate": 4.948628234166398e-06, + "loss": 0.5358, + "step": 1142 + }, + { + "epoch": 0.0928287176155283, + "grad_norm": 5.843171180813921, + "learning_rate": 4.948495519655805e-06, + "loss": 0.5523, + "step": 1143 + }, + { + "epoch": 0.09290993259156989, + "grad_norm": 4.227961968923302, + "learning_rate": 4.948362635722281e-06, + "loss": 0.6363, + "step": 1144 + }, + { + "epoch": 0.09299114756761147, + "grad_norm": 7.599291580837134, + "learning_rate": 4.948229582375021e-06, + "loss": 0.5243, + "step": 1145 + }, + { + "epoch": 0.09307236254365304, + "grad_norm": 6.482980832600019, + "learning_rate": 4.948096359623229e-06, + "loss": 0.5535, + "step": 1146 + }, + { + "epoch": 0.09315357751969464, + "grad_norm": 8.221446398590253, + "learning_rate": 4.9479629674761265e-06, + "loss": 0.603, + "step": 1147 + }, + { + "epoch": 0.09323479249573621, + "grad_norm": 3.448295224589768, + "learning_rate": 4.947829405942942e-06, + "loss": 0.667, + "step": 1148 + }, + { + "epoch": 0.09331600747177779, + "grad_norm": 3.719043167127091, + "learning_rate": 4.947695675032919e-06, + "loss": 0.5841, + "step": 1149 + }, + { + "epoch": 0.09339722244781938, + "grad_norm": 4.156557050698605, + "learning_rate": 4.947561774755307e-06, + "loss": 0.6107, + "step": 1150 + }, + { + "epoch": 0.09347843742386096, + "grad_norm": 4.192630167978793, + "learning_rate": 4.947427705119375e-06, + "loss": 0.4772, + "step": 1151 + }, + { + "epoch": 0.09355965239990255, + "grad_norm": 5.508607665282113, + "learning_rate": 4.947293466134399e-06, + "loss": 0.6472, + "step": 1152 + }, + { + "epoch": 0.09364086737594413, + "grad_norm": 6.1116074527140185, + "learning_rate": 4.947159057809668e-06, + "loss": 0.5252, + "step": 1153 + }, + { + "epoch": 0.0937220823519857, + "grad_norm": 4.348443564825216, + "learning_rate": 4.9470244801544794e-06, + "loss": 0.6287, + "step": 1154 + }, + { + "epoch": 0.0938032973280273, + "grad_norm": 4.398778822812722, + "learning_rate": 4.94688973317815e-06, + "loss": 0.6383, + "step": 1155 + }, + { + "epoch": 0.09388451230406887, + "grad_norm": 8.481255729214475, + "learning_rate": 4.946754816889999e-06, + "loss": 0.6607, + "step": 1156 + }, + { + "epoch": 0.09396572728011045, + "grad_norm": 8.572467423080703, + "learning_rate": 4.946619731299365e-06, + "loss": 0.5745, + "step": 1157 + }, + { + "epoch": 0.09404694225615204, + "grad_norm": 7.588248988044493, + "learning_rate": 4.946484476415593e-06, + "loss": 0.5236, + "step": 1158 + }, + { + "epoch": 0.09412815723219362, + "grad_norm": 6.803366010970415, + "learning_rate": 4.946349052248044e-06, + "loss": 0.4852, + "step": 1159 + }, + { + "epoch": 0.09420937220823519, + "grad_norm": 3.614463840238927, + "learning_rate": 4.946213458806088e-06, + "loss": 0.4897, + "step": 1160 + }, + { + "epoch": 0.09429058718427678, + "grad_norm": 4.806863459042568, + "learning_rate": 4.946077696099107e-06, + "loss": 0.6462, + "step": 1161 + }, + { + "epoch": 0.09437180216031836, + "grad_norm": 5.367629362476872, + "learning_rate": 4.945941764136494e-06, + "loss": 0.5871, + "step": 1162 + }, + { + "epoch": 0.09445301713635995, + "grad_norm": 3.4996137215165826, + "learning_rate": 4.945805662927657e-06, + "loss": 0.5799, + "step": 1163 + }, + { + "epoch": 0.09453423211240153, + "grad_norm": 5.909531771722533, + "learning_rate": 4.9456693924820124e-06, + "loss": 0.672, + "step": 1164 + }, + { + "epoch": 0.0946154470884431, + "grad_norm": 11.436424144343768, + "learning_rate": 4.945532952808989e-06, + "loss": 0.5327, + "step": 1165 + }, + { + "epoch": 0.0946966620644847, + "grad_norm": 3.885963408729381, + "learning_rate": 4.945396343918027e-06, + "loss": 0.6624, + "step": 1166 + }, + { + "epoch": 0.09477787704052627, + "grad_norm": 6.229973134745262, + "learning_rate": 4.945259565818582e-06, + "loss": 0.5427, + "step": 1167 + }, + { + "epoch": 0.09485909201656785, + "grad_norm": 4.17803636708481, + "learning_rate": 4.9451226185201155e-06, + "loss": 0.5436, + "step": 1168 + }, + { + "epoch": 0.09494030699260944, + "grad_norm": 3.7870816927723228, + "learning_rate": 4.9449855020321045e-06, + "loss": 0.6648, + "step": 1169 + }, + { + "epoch": 0.09502152196865102, + "grad_norm": 5.063697581248885, + "learning_rate": 4.944848216364036e-06, + "loss": 0.6312, + "step": 1170 + }, + { + "epoch": 0.0951027369446926, + "grad_norm": 7.690605712577253, + "learning_rate": 4.944710761525411e-06, + "loss": 0.62, + "step": 1171 + }, + { + "epoch": 0.09518395192073419, + "grad_norm": 5.020085939206364, + "learning_rate": 4.94457313752574e-06, + "loss": 0.5703, + "step": 1172 + }, + { + "epoch": 0.09526516689677576, + "grad_norm": 4.5490891920736525, + "learning_rate": 4.944435344374544e-06, + "loss": 0.7024, + "step": 1173 + }, + { + "epoch": 0.09534638187281735, + "grad_norm": 5.913707485877068, + "learning_rate": 4.944297382081361e-06, + "loss": 0.5042, + "step": 1174 + }, + { + "epoch": 0.09542759684885893, + "grad_norm": 3.759445496054164, + "learning_rate": 4.944159250655734e-06, + "loss": 0.7242, + "step": 1175 + }, + { + "epoch": 0.09550881182490051, + "grad_norm": 4.721355611086034, + "learning_rate": 4.944020950107224e-06, + "loss": 0.6527, + "step": 1176 + }, + { + "epoch": 0.0955900268009421, + "grad_norm": 4.718530932065967, + "learning_rate": 4.943882480445398e-06, + "loss": 0.6651, + "step": 1177 + }, + { + "epoch": 0.09567124177698368, + "grad_norm": 3.7323028514817764, + "learning_rate": 4.943743841679839e-06, + "loss": 0.4774, + "step": 1178 + }, + { + "epoch": 0.09575245675302525, + "grad_norm": 7.764398624185714, + "learning_rate": 4.943605033820138e-06, + "loss": 0.5687, + "step": 1179 + }, + { + "epoch": 0.09583367172906684, + "grad_norm": 5.7460830112551715, + "learning_rate": 4.943466056875903e-06, + "loss": 0.6266, + "step": 1180 + }, + { + "epoch": 0.09591488670510842, + "grad_norm": 6.12456544392407, + "learning_rate": 4.943326910856749e-06, + "loss": 0.9285, + "step": 1181 + }, + { + "epoch": 0.09599610168115, + "grad_norm": 5.9339500045832505, + "learning_rate": 4.943187595772302e-06, + "loss": 0.572, + "step": 1182 + }, + { + "epoch": 0.09607731665719159, + "grad_norm": 5.327929545992472, + "learning_rate": 4.943048111632205e-06, + "loss": 0.7426, + "step": 1183 + }, + { + "epoch": 0.09615853163323317, + "grad_norm": 4.422671758582798, + "learning_rate": 4.942908458446107e-06, + "loss": 0.5256, + "step": 1184 + }, + { + "epoch": 0.09623974660927476, + "grad_norm": 4.210710659594965, + "learning_rate": 4.942768636223674e-06, + "loss": 0.6544, + "step": 1185 + }, + { + "epoch": 0.09632096158531633, + "grad_norm": 5.86594483722807, + "learning_rate": 4.94262864497458e-06, + "loss": 0.6285, + "step": 1186 + }, + { + "epoch": 0.09640217656135791, + "grad_norm": 11.95231315351679, + "learning_rate": 4.94248848470851e-06, + "loss": 0.5907, + "step": 1187 + }, + { + "epoch": 0.0964833915373995, + "grad_norm": 4.610476439616043, + "learning_rate": 4.9423481554351636e-06, + "loss": 0.5891, + "step": 1188 + }, + { + "epoch": 0.09656460651344108, + "grad_norm": 15.02936414694867, + "learning_rate": 4.9422076571642516e-06, + "loss": 0.6007, + "step": 1189 + }, + { + "epoch": 0.09664582148948266, + "grad_norm": 7.408105632703543, + "learning_rate": 4.942066989905494e-06, + "loss": 0.5263, + "step": 1190 + }, + { + "epoch": 0.09672703646552425, + "grad_norm": 6.6983744546854105, + "learning_rate": 4.941926153668626e-06, + "loss": 0.6209, + "step": 1191 + }, + { + "epoch": 0.09680825144156582, + "grad_norm": 12.37964028499769, + "learning_rate": 4.941785148463391e-06, + "loss": 0.5479, + "step": 1192 + }, + { + "epoch": 0.0968894664176074, + "grad_norm": 7.949353882372964, + "learning_rate": 4.941643974299547e-06, + "loss": 0.6356, + "step": 1193 + }, + { + "epoch": 0.09697068139364899, + "grad_norm": 12.259554197141354, + "learning_rate": 4.941502631186863e-06, + "loss": 0.5876, + "step": 1194 + }, + { + "epoch": 0.09705189636969057, + "grad_norm": 4.5151134917162254, + "learning_rate": 4.941361119135118e-06, + "loss": 0.6081, + "step": 1195 + }, + { + "epoch": 0.09713311134573216, + "grad_norm": 5.027871122019048, + "learning_rate": 4.941219438154103e-06, + "loss": 0.7848, + "step": 1196 + }, + { + "epoch": 0.09721432632177374, + "grad_norm": 6.044100876833535, + "learning_rate": 4.941077588253624e-06, + "loss": 0.6873, + "step": 1197 + }, + { + "epoch": 0.09729554129781531, + "grad_norm": 5.889644758237491, + "learning_rate": 4.940935569443496e-06, + "loss": 0.5557, + "step": 1198 + }, + { + "epoch": 0.0973767562738569, + "grad_norm": 15.297541275971485, + "learning_rate": 4.940793381733544e-06, + "loss": 0.547, + "step": 1199 + }, + { + "epoch": 0.09745797124989848, + "grad_norm": 12.207281968202853, + "learning_rate": 4.940651025133607e-06, + "loss": 0.7011, + "step": 1200 + }, + { + "epoch": 0.09753918622594006, + "grad_norm": 5.195830064818431, + "learning_rate": 4.9405084996535376e-06, + "loss": 0.7094, + "step": 1201 + }, + { + "epoch": 0.09762040120198165, + "grad_norm": 4.519169304548104, + "learning_rate": 4.940365805303195e-06, + "loss": 0.5373, + "step": 1202 + }, + { + "epoch": 0.09770161617802323, + "grad_norm": 4.18578492619036, + "learning_rate": 4.940222942092455e-06, + "loss": 0.5373, + "step": 1203 + }, + { + "epoch": 0.0977828311540648, + "grad_norm": 5.871433014218688, + "learning_rate": 4.940079910031201e-06, + "loss": 0.6525, + "step": 1204 + }, + { + "epoch": 0.0978640461301064, + "grad_norm": 7.770798201538124, + "learning_rate": 4.939936709129333e-06, + "loss": 0.6541, + "step": 1205 + }, + { + "epoch": 0.09794526110614797, + "grad_norm": 10.00163900865208, + "learning_rate": 4.939793339396756e-06, + "loss": 0.6166, + "step": 1206 + }, + { + "epoch": 0.09802647608218956, + "grad_norm": 6.793365007116543, + "learning_rate": 4.939649800843394e-06, + "loss": 0.5834, + "step": 1207 + }, + { + "epoch": 0.09810769105823114, + "grad_norm": 3.8998039709967927, + "learning_rate": 4.939506093479176e-06, + "loss": 0.654, + "step": 1208 + }, + { + "epoch": 0.09818890603427272, + "grad_norm": 3.5977727046620473, + "learning_rate": 4.939362217314048e-06, + "loss": 0.5947, + "step": 1209 + }, + { + "epoch": 0.0982701210103143, + "grad_norm": 3.430672786433987, + "learning_rate": 4.939218172357965e-06, + "loss": 0.4971, + "step": 1210 + }, + { + "epoch": 0.09835133598635588, + "grad_norm": 4.703232976940314, + "learning_rate": 4.9390739586208926e-06, + "loss": 0.6256, + "step": 1211 + }, + { + "epoch": 0.09843255096239746, + "grad_norm": 8.889711877291688, + "learning_rate": 4.938929576112812e-06, + "loss": 0.6425, + "step": 1212 + }, + { + "epoch": 0.09851376593843905, + "grad_norm": 5.215270092203213, + "learning_rate": 4.938785024843712e-06, + "loss": 0.5402, + "step": 1213 + }, + { + "epoch": 0.09859498091448063, + "grad_norm": 4.480490370978996, + "learning_rate": 4.938640304823596e-06, + "loss": 0.4592, + "step": 1214 + }, + { + "epoch": 0.0986761958905222, + "grad_norm": 4.5775356603291995, + "learning_rate": 4.938495416062477e-06, + "loss": 0.5733, + "step": 1215 + }, + { + "epoch": 0.0987574108665638, + "grad_norm": 5.374607118889353, + "learning_rate": 4.93835035857038e-06, + "loss": 0.4699, + "step": 1216 + }, + { + "epoch": 0.09883862584260537, + "grad_norm": 8.062779666258928, + "learning_rate": 4.938205132357344e-06, + "loss": 0.6582, + "step": 1217 + }, + { + "epoch": 0.09891984081864696, + "grad_norm": 5.276936843727084, + "learning_rate": 4.938059737433416e-06, + "loss": 0.4957, + "step": 1218 + }, + { + "epoch": 0.09900105579468854, + "grad_norm": 5.321609718799565, + "learning_rate": 4.9379141738086575e-06, + "loss": 0.5664, + "step": 1219 + }, + { + "epoch": 0.09908227077073012, + "grad_norm": 4.479754054431104, + "learning_rate": 4.9377684414931415e-06, + "loss": 0.7467, + "step": 1220 + }, + { + "epoch": 0.09916348574677171, + "grad_norm": 4.74529856971756, + "learning_rate": 4.937622540496951e-06, + "loss": 0.5793, + "step": 1221 + }, + { + "epoch": 0.09924470072281329, + "grad_norm": 3.911535284660022, + "learning_rate": 4.937476470830181e-06, + "loss": 0.6115, + "step": 1222 + }, + { + "epoch": 0.09932591569885486, + "grad_norm": 4.166632824497643, + "learning_rate": 4.937330232502939e-06, + "loss": 0.5927, + "step": 1223 + }, + { + "epoch": 0.09940713067489645, + "grad_norm": 5.587700034937294, + "learning_rate": 4.937183825525346e-06, + "loss": 0.9838, + "step": 1224 + }, + { + "epoch": 0.09948834565093803, + "grad_norm": 4.01441412316951, + "learning_rate": 4.937037249907529e-06, + "loss": 0.6088, + "step": 1225 + }, + { + "epoch": 0.09956956062697961, + "grad_norm": 5.422702256253332, + "learning_rate": 4.9368905056596336e-06, + "loss": 0.8051, + "step": 1226 + }, + { + "epoch": 0.0996507756030212, + "grad_norm": 4.730421753529755, + "learning_rate": 4.936743592791812e-06, + "loss": 0.8022, + "step": 1227 + }, + { + "epoch": 0.09973199057906278, + "grad_norm": 6.417481219598529, + "learning_rate": 4.936596511314229e-06, + "loss": 0.7359, + "step": 1228 + }, + { + "epoch": 0.09981320555510437, + "grad_norm": 5.936442260399057, + "learning_rate": 4.936449261237064e-06, + "loss": 0.4835, + "step": 1229 + }, + { + "epoch": 0.09989442053114594, + "grad_norm": 5.161367234189236, + "learning_rate": 4.936301842570505e-06, + "loss": 0.6098, + "step": 1230 + }, + { + "epoch": 0.09997563550718752, + "grad_norm": 6.171812579141409, + "learning_rate": 4.936154255324751e-06, + "loss": 0.5208, + "step": 1231 + }, + { + "epoch": 0.10005685048322911, + "grad_norm": 6.898043493723934, + "learning_rate": 4.936006499510016e-06, + "loss": 0.6272, + "step": 1232 + }, + { + "epoch": 0.10013806545927069, + "grad_norm": 5.279799808361505, + "learning_rate": 4.935858575136525e-06, + "loss": 0.6761, + "step": 1233 + }, + { + "epoch": 0.10021928043531227, + "grad_norm": 6.1780863372983, + "learning_rate": 4.935710482214512e-06, + "loss": 0.5666, + "step": 1234 + }, + { + "epoch": 0.10030049541135386, + "grad_norm": 4.127005405020764, + "learning_rate": 4.935562220754224e-06, + "loss": 0.7762, + "step": 1235 + }, + { + "epoch": 0.10038171038739543, + "grad_norm": 6.939945712344684, + "learning_rate": 4.935413790765919e-06, + "loss": 0.5601, + "step": 1236 + }, + { + "epoch": 0.10046292536343701, + "grad_norm": 5.4373785364212965, + "learning_rate": 4.935265192259871e-06, + "loss": 0.5489, + "step": 1237 + }, + { + "epoch": 0.1005441403394786, + "grad_norm": 3.19887214936069, + "learning_rate": 4.935116425246359e-06, + "loss": 0.6456, + "step": 1238 + }, + { + "epoch": 0.10062535531552018, + "grad_norm": 7.543707007832977, + "learning_rate": 4.934967489735679e-06, + "loss": 0.5061, + "step": 1239 + }, + { + "epoch": 0.10070657029156177, + "grad_norm": 4.008848618860738, + "learning_rate": 4.934818385738135e-06, + "loss": 0.6719, + "step": 1240 + }, + { + "epoch": 0.10078778526760335, + "grad_norm": 4.006099731057698, + "learning_rate": 4.934669113264044e-06, + "loss": 0.6852, + "step": 1241 + }, + { + "epoch": 0.10086900024364492, + "grad_norm": 8.793769739342025, + "learning_rate": 4.934519672323737e-06, + "loss": 0.5916, + "step": 1242 + }, + { + "epoch": 0.10095021521968651, + "grad_norm": 5.1218716406597835, + "learning_rate": 4.9343700629275525e-06, + "loss": 0.4989, + "step": 1243 + }, + { + "epoch": 0.10103143019572809, + "grad_norm": 4.3513298577507245, + "learning_rate": 4.934220285085843e-06, + "loss": 0.5374, + "step": 1244 + }, + { + "epoch": 0.10111264517176967, + "grad_norm": 5.858277317975228, + "learning_rate": 4.934070338808974e-06, + "loss": 0.4365, + "step": 1245 + }, + { + "epoch": 0.10119386014781126, + "grad_norm": 4.971141537823225, + "learning_rate": 4.933920224107319e-06, + "loss": 0.6175, + "step": 1246 + }, + { + "epoch": 0.10127507512385284, + "grad_norm": 7.572976492490089, + "learning_rate": 4.933769940991266e-06, + "loss": 0.6484, + "step": 1247 + }, + { + "epoch": 0.10135629009989441, + "grad_norm": 7.466443178328625, + "learning_rate": 4.933619489471213e-06, + "loss": 0.6078, + "step": 1248 + }, + { + "epoch": 0.101437505075936, + "grad_norm": 3.837978031503154, + "learning_rate": 4.933468869557572e-06, + "loss": 0.672, + "step": 1249 + }, + { + "epoch": 0.10151872005197758, + "grad_norm": 4.587774401604302, + "learning_rate": 4.933318081260763e-06, + "loss": 0.6828, + "step": 1250 + }, + { + "epoch": 0.10159993502801917, + "grad_norm": 5.728533677177494, + "learning_rate": 4.933167124591222e-06, + "loss": 0.5304, + "step": 1251 + }, + { + "epoch": 0.10168115000406075, + "grad_norm": 7.693599571430989, + "learning_rate": 4.9330159995593926e-06, + "loss": 0.6125, + "step": 1252 + }, + { + "epoch": 0.10176236498010233, + "grad_norm": 5.284936139752091, + "learning_rate": 4.9328647061757326e-06, + "loss": 0.6306, + "step": 1253 + }, + { + "epoch": 0.10184357995614392, + "grad_norm": 4.502661220996693, + "learning_rate": 4.932713244450712e-06, + "loss": 0.6503, + "step": 1254 + }, + { + "epoch": 0.1019247949321855, + "grad_norm": 30.435010504189158, + "learning_rate": 4.932561614394809e-06, + "loss": 0.4843, + "step": 1255 + }, + { + "epoch": 0.10200600990822707, + "grad_norm": 3.2680414669887226, + "learning_rate": 4.932409816018516e-06, + "loss": 0.5335, + "step": 1256 + }, + { + "epoch": 0.10208722488426866, + "grad_norm": 5.550536415470609, + "learning_rate": 4.932257849332337e-06, + "loss": 0.6274, + "step": 1257 + }, + { + "epoch": 0.10216843986031024, + "grad_norm": 6.490325484791516, + "learning_rate": 4.932105714346788e-06, + "loss": 0.7055, + "step": 1258 + }, + { + "epoch": 0.10224965483635182, + "grad_norm": 4.635391717749872, + "learning_rate": 4.931953411072395e-06, + "loss": 0.7677, + "step": 1259 + }, + { + "epoch": 0.10233086981239341, + "grad_norm": 5.213698736457178, + "learning_rate": 4.931800939519697e-06, + "loss": 0.6283, + "step": 1260 + }, + { + "epoch": 0.10241208478843498, + "grad_norm": 12.81670585201401, + "learning_rate": 4.931648299699245e-06, + "loss": 0.5411, + "step": 1261 + }, + { + "epoch": 0.10249329976447658, + "grad_norm": 4.7979729375829425, + "learning_rate": 4.931495491621598e-06, + "loss": 0.6, + "step": 1262 + }, + { + "epoch": 0.10257451474051815, + "grad_norm": 5.738885766711591, + "learning_rate": 4.931342515297333e-06, + "loss": 0.6029, + "step": 1263 + }, + { + "epoch": 0.10265572971655973, + "grad_norm": 4.5708865991196586, + "learning_rate": 4.931189370737033e-06, + "loss": 0.6877, + "step": 1264 + }, + { + "epoch": 0.10273694469260132, + "grad_norm": 4.292203823138975, + "learning_rate": 4.931036057951295e-06, + "loss": 0.6054, + "step": 1265 + }, + { + "epoch": 0.1028181596686429, + "grad_norm": 5.094248884490348, + "learning_rate": 4.930882576950728e-06, + "loss": 1.016, + "step": 1266 + }, + { + "epoch": 0.10289937464468447, + "grad_norm": 7.74744023190108, + "learning_rate": 4.930728927745954e-06, + "loss": 0.6266, + "step": 1267 + }, + { + "epoch": 0.10298058962072607, + "grad_norm": 7.178060412399686, + "learning_rate": 4.930575110347601e-06, + "loss": 0.5436, + "step": 1268 + }, + { + "epoch": 0.10306180459676764, + "grad_norm": 4.8955814375005, + "learning_rate": 4.9304211247663135e-06, + "loss": 0.6069, + "step": 1269 + }, + { + "epoch": 0.10314301957280922, + "grad_norm": 6.4873010558747675, + "learning_rate": 4.930266971012748e-06, + "loss": 0.5739, + "step": 1270 + }, + { + "epoch": 0.10322423454885081, + "grad_norm": 5.62659346934576, + "learning_rate": 4.930112649097569e-06, + "loss": 0.7317, + "step": 1271 + }, + { + "epoch": 0.10330544952489239, + "grad_norm": 5.596039021432254, + "learning_rate": 4.929958159031457e-06, + "loss": 0.4922, + "step": 1272 + }, + { + "epoch": 0.10338666450093398, + "grad_norm": 5.488922386676459, + "learning_rate": 4.9298035008251e-06, + "loss": 0.5276, + "step": 1273 + }, + { + "epoch": 0.10346787947697555, + "grad_norm": 5.79593069742807, + "learning_rate": 4.929648674489201e-06, + "loss": 0.5529, + "step": 1274 + }, + { + "epoch": 0.10354909445301713, + "grad_norm": 9.122388238453604, + "learning_rate": 4.929493680034472e-06, + "loss": 0.6647, + "step": 1275 + }, + { + "epoch": 0.10363030942905872, + "grad_norm": 4.654226716442984, + "learning_rate": 4.929338517471638e-06, + "loss": 0.6721, + "step": 1276 + }, + { + "epoch": 0.1037115244051003, + "grad_norm": 4.921506870025976, + "learning_rate": 4.929183186811436e-06, + "loss": 0.4812, + "step": 1277 + }, + { + "epoch": 0.10379273938114188, + "grad_norm": 8.206987541104693, + "learning_rate": 4.9290276880646144e-06, + "loss": 0.6183, + "step": 1278 + }, + { + "epoch": 0.10387395435718347, + "grad_norm": 6.664251212109854, + "learning_rate": 4.928872021241932e-06, + "loss": 0.5676, + "step": 1279 + }, + { + "epoch": 0.10395516933322504, + "grad_norm": 5.059968838894893, + "learning_rate": 4.92871618635416e-06, + "loss": 0.5516, + "step": 1280 + }, + { + "epoch": 0.10403638430926662, + "grad_norm": 4.462582725976913, + "learning_rate": 4.928560183412081e-06, + "loss": 0.637, + "step": 1281 + }, + { + "epoch": 0.10411759928530821, + "grad_norm": 13.013107165252704, + "learning_rate": 4.928404012426491e-06, + "loss": 0.585, + "step": 1282 + }, + { + "epoch": 0.10419881426134979, + "grad_norm": 4.877077629277337, + "learning_rate": 4.9282476734081955e-06, + "loss": 0.4416, + "step": 1283 + }, + { + "epoch": 0.10428002923739138, + "grad_norm": 5.339858862218509, + "learning_rate": 4.928091166368013e-06, + "loss": 0.534, + "step": 1284 + }, + { + "epoch": 0.10436124421343296, + "grad_norm": 4.638726269664911, + "learning_rate": 4.927934491316771e-06, + "loss": 0.5402, + "step": 1285 + }, + { + "epoch": 0.10444245918947453, + "grad_norm": 5.279866806349571, + "learning_rate": 4.927777648265313e-06, + "loss": 0.8809, + "step": 1286 + }, + { + "epoch": 0.10452367416551613, + "grad_norm": 10.172012498738642, + "learning_rate": 4.927620637224489e-06, + "loss": 0.6804, + "step": 1287 + }, + { + "epoch": 0.1046048891415577, + "grad_norm": 4.0125921901692605, + "learning_rate": 4.927463458205167e-06, + "loss": 0.484, + "step": 1288 + }, + { + "epoch": 0.10468610411759928, + "grad_norm": 5.038605450511745, + "learning_rate": 4.9273061112182195e-06, + "loss": 0.4823, + "step": 1289 + }, + { + "epoch": 0.10476731909364087, + "grad_norm": 11.514242344873601, + "learning_rate": 4.9271485962745356e-06, + "loss": 0.5736, + "step": 1290 + }, + { + "epoch": 0.10484853406968245, + "grad_norm": 5.2507131812652705, + "learning_rate": 4.9269909133850146e-06, + "loss": 0.4464, + "step": 1291 + }, + { + "epoch": 0.10492974904572404, + "grad_norm": 3.587842597108573, + "learning_rate": 4.926833062560566e-06, + "loss": 0.5488, + "step": 1292 + }, + { + "epoch": 0.10501096402176562, + "grad_norm": 4.9325898595523405, + "learning_rate": 4.926675043812115e-06, + "loss": 0.7937, + "step": 1293 + }, + { + "epoch": 0.10509217899780719, + "grad_norm": 4.089069618738186, + "learning_rate": 4.926516857150593e-06, + "loss": 0.6927, + "step": 1294 + }, + { + "epoch": 0.10517339397384878, + "grad_norm": 4.157543455041517, + "learning_rate": 4.926358502586948e-06, + "loss": 0.6499, + "step": 1295 + }, + { + "epoch": 0.10525460894989036, + "grad_norm": 4.591475297442821, + "learning_rate": 4.9261999801321345e-06, + "loss": 0.6702, + "step": 1296 + }, + { + "epoch": 0.10533582392593194, + "grad_norm": 5.076099157718857, + "learning_rate": 4.9260412897971225e-06, + "loss": 0.5649, + "step": 1297 + }, + { + "epoch": 0.10541703890197353, + "grad_norm": 5.92574396887693, + "learning_rate": 4.9258824315928935e-06, + "loss": 0.6207, + "step": 1298 + }, + { + "epoch": 0.1054982538780151, + "grad_norm": 4.295889777812483, + "learning_rate": 4.925723405530439e-06, + "loss": 0.6171, + "step": 1299 + }, + { + "epoch": 0.10557946885405668, + "grad_norm": 6.0185008543132055, + "learning_rate": 4.925564211620764e-06, + "loss": 0.6708, + "step": 1300 + }, + { + "epoch": 0.10566068383009827, + "grad_norm": 4.288628260503543, + "learning_rate": 4.9254048498748804e-06, + "loss": 0.5312, + "step": 1301 + }, + { + "epoch": 0.10574189880613985, + "grad_norm": 5.912622580277487, + "learning_rate": 4.925245320303819e-06, + "loss": 0.6256, + "step": 1302 + }, + { + "epoch": 0.10582311378218144, + "grad_norm": 4.036381733012363, + "learning_rate": 4.925085622918618e-06, + "loss": 0.6512, + "step": 1303 + }, + { + "epoch": 0.10590432875822302, + "grad_norm": 11.770792105747262, + "learning_rate": 4.924925757730324e-06, + "loss": 0.6243, + "step": 1304 + }, + { + "epoch": 0.1059855437342646, + "grad_norm": 7.6314249401022085, + "learning_rate": 4.924765724750002e-06, + "loss": 0.5521, + "step": 1305 + }, + { + "epoch": 0.10606675871030619, + "grad_norm": 4.238420473450789, + "learning_rate": 4.9246055239887255e-06, + "loss": 0.626, + "step": 1306 + }, + { + "epoch": 0.10614797368634776, + "grad_norm": 5.023366885034705, + "learning_rate": 4.924445155457578e-06, + "loss": 0.7469, + "step": 1307 + }, + { + "epoch": 0.10622918866238934, + "grad_norm": 5.927044022062121, + "learning_rate": 4.924284619167657e-06, + "loss": 0.6673, + "step": 1308 + }, + { + "epoch": 0.10631040363843093, + "grad_norm": 27.667866009821907, + "learning_rate": 4.924123915130072e-06, + "loss": 0.4591, + "step": 1309 + }, + { + "epoch": 0.10639161861447251, + "grad_norm": 6.067033293424752, + "learning_rate": 4.92396304335594e-06, + "loss": 0.7129, + "step": 1310 + }, + { + "epoch": 0.10647283359051408, + "grad_norm": 4.48662582291358, + "learning_rate": 4.923802003856395e-06, + "loss": 0.7251, + "step": 1311 + }, + { + "epoch": 0.10655404856655568, + "grad_norm": 8.910396684118473, + "learning_rate": 4.923640796642578e-06, + "loss": 0.5621, + "step": 1312 + }, + { + "epoch": 0.10663526354259725, + "grad_norm": 10.709244615053429, + "learning_rate": 4.923479421725646e-06, + "loss": 0.7027, + "step": 1313 + }, + { + "epoch": 0.10671647851863884, + "grad_norm": 6.558186461825374, + "learning_rate": 4.923317879116764e-06, + "loss": 0.6991, + "step": 1314 + }, + { + "epoch": 0.10679769349468042, + "grad_norm": 4.402808201885828, + "learning_rate": 4.923156168827109e-06, + "loss": 0.5193, + "step": 1315 + }, + { + "epoch": 0.106878908470722, + "grad_norm": 3.9531074248473677, + "learning_rate": 4.922994290867872e-06, + "loss": 0.5984, + "step": 1316 + }, + { + "epoch": 0.10696012344676359, + "grad_norm": 5.634957726444024, + "learning_rate": 4.922832245250254e-06, + "loss": 0.5759, + "step": 1317 + }, + { + "epoch": 0.10704133842280517, + "grad_norm": 4.687012040425469, + "learning_rate": 4.922670031985467e-06, + "loss": 0.5291, + "step": 1318 + }, + { + "epoch": 0.10712255339884674, + "grad_norm": 7.331234418642, + "learning_rate": 4.922507651084736e-06, + "loss": 0.631, + "step": 1319 + }, + { + "epoch": 0.10720376837488833, + "grad_norm": 6.179571341056749, + "learning_rate": 4.9223451025592965e-06, + "loss": 0.5047, + "step": 1320 + }, + { + "epoch": 0.10728498335092991, + "grad_norm": 4.797755420619416, + "learning_rate": 4.9221823864203955e-06, + "loss": 0.5616, + "step": 1321 + }, + { + "epoch": 0.10736619832697149, + "grad_norm": 4.7796380411767885, + "learning_rate": 4.922019502679292e-06, + "loss": 0.7038, + "step": 1322 + }, + { + "epoch": 0.10744741330301308, + "grad_norm": 4.880297830435732, + "learning_rate": 4.921856451347258e-06, + "loss": 0.8187, + "step": 1323 + }, + { + "epoch": 0.10752862827905466, + "grad_norm": 4.4975518069828295, + "learning_rate": 4.9216932324355755e-06, + "loss": 0.6288, + "step": 1324 + }, + { + "epoch": 0.10760984325509625, + "grad_norm": 5.693686439938185, + "learning_rate": 4.921529845955537e-06, + "loss": 0.5948, + "step": 1325 + }, + { + "epoch": 0.10769105823113782, + "grad_norm": 9.014157893199854, + "learning_rate": 4.9213662919184495e-06, + "loss": 0.7244, + "step": 1326 + }, + { + "epoch": 0.1077722732071794, + "grad_norm": 17.656772040857238, + "learning_rate": 4.921202570335629e-06, + "loss": 0.5731, + "step": 1327 + }, + { + "epoch": 0.10785348818322099, + "grad_norm": 7.338056929014862, + "learning_rate": 4.921038681218405e-06, + "loss": 0.5082, + "step": 1328 + }, + { + "epoch": 0.10793470315926257, + "grad_norm": 4.851712422566327, + "learning_rate": 4.920874624578118e-06, + "loss": 0.7288, + "step": 1329 + }, + { + "epoch": 0.10801591813530415, + "grad_norm": 5.876176027514485, + "learning_rate": 4.920710400426118e-06, + "loss": 0.4265, + "step": 1330 + }, + { + "epoch": 0.10809713311134574, + "grad_norm": 5.880882787741879, + "learning_rate": 4.920546008773771e-06, + "loss": 0.4902, + "step": 1331 + }, + { + "epoch": 0.10817834808738731, + "grad_norm": 5.58193279698704, + "learning_rate": 4.920381449632451e-06, + "loss": 0.7027, + "step": 1332 + }, + { + "epoch": 0.10825956306342889, + "grad_norm": 5.173776451487993, + "learning_rate": 4.920216723013544e-06, + "loss": 0.646, + "step": 1333 + }, + { + "epoch": 0.10834077803947048, + "grad_norm": 4.388587188343882, + "learning_rate": 4.920051828928448e-06, + "loss": 0.4977, + "step": 1334 + }, + { + "epoch": 0.10842199301551206, + "grad_norm": 4.776839443402457, + "learning_rate": 4.919886767388573e-06, + "loss": 0.5767, + "step": 1335 + }, + { + "epoch": 0.10850320799155365, + "grad_norm": 3.4824258344375725, + "learning_rate": 4.919721538405341e-06, + "loss": 0.533, + "step": 1336 + }, + { + "epoch": 0.10858442296759523, + "grad_norm": 4.742639309949176, + "learning_rate": 4.919556141990186e-06, + "loss": 0.6688, + "step": 1337 + }, + { + "epoch": 0.1086656379436368, + "grad_norm": 4.653988617546264, + "learning_rate": 4.919390578154551e-06, + "loss": 0.631, + "step": 1338 + }, + { + "epoch": 0.1087468529196784, + "grad_norm": 5.238136207553621, + "learning_rate": 4.919224846909891e-06, + "loss": 0.5437, + "step": 1339 + }, + { + "epoch": 0.10882806789571997, + "grad_norm": 6.815351329648775, + "learning_rate": 4.919058948267677e-06, + "loss": 0.8166, + "step": 1340 + }, + { + "epoch": 0.10890928287176155, + "grad_norm": 5.2640299180243355, + "learning_rate": 4.918892882239384e-06, + "loss": 0.6044, + "step": 1341 + }, + { + "epoch": 0.10899049784780314, + "grad_norm": 4.358517148619114, + "learning_rate": 4.918726648836507e-06, + "loss": 0.7873, + "step": 1342 + }, + { + "epoch": 0.10907171282384472, + "grad_norm": 6.052945574950095, + "learning_rate": 4.918560248070547e-06, + "loss": 0.5615, + "step": 1343 + }, + { + "epoch": 0.10915292779988629, + "grad_norm": 4.566253758686044, + "learning_rate": 4.918393679953018e-06, + "loss": 0.5893, + "step": 1344 + }, + { + "epoch": 0.10923414277592788, + "grad_norm": 3.497997250474178, + "learning_rate": 4.918226944495445e-06, + "loss": 0.6582, + "step": 1345 + }, + { + "epoch": 0.10931535775196946, + "grad_norm": 6.920939485791569, + "learning_rate": 4.918060041709366e-06, + "loss": 0.563, + "step": 1346 + }, + { + "epoch": 0.10939657272801105, + "grad_norm": 13.15024709863077, + "learning_rate": 4.917892971606329e-06, + "loss": 0.5954, + "step": 1347 + }, + { + "epoch": 0.10947778770405263, + "grad_norm": 4.208801375695993, + "learning_rate": 4.917725734197896e-06, + "loss": 0.548, + "step": 1348 + }, + { + "epoch": 0.1095590026800942, + "grad_norm": 4.648874348064858, + "learning_rate": 4.917558329495636e-06, + "loss": 0.6602, + "step": 1349 + }, + { + "epoch": 0.1096402176561358, + "grad_norm": 4.308899285098474, + "learning_rate": 4.917390757511136e-06, + "loss": 0.5401, + "step": 1350 + }, + { + "epoch": 0.10972143263217737, + "grad_norm": 5.129516631257819, + "learning_rate": 4.917223018255989e-06, + "loss": 0.5193, + "step": 1351 + }, + { + "epoch": 0.10980264760821895, + "grad_norm": 6.107816197257616, + "learning_rate": 4.917055111741802e-06, + "loss": 0.5366, + "step": 1352 + }, + { + "epoch": 0.10988386258426054, + "grad_norm": 4.707618481744731, + "learning_rate": 4.916887037980193e-06, + "loss": 0.5354, + "step": 1353 + }, + { + "epoch": 0.10996507756030212, + "grad_norm": 4.1837375718082255, + "learning_rate": 4.916718796982793e-06, + "loss": 0.7129, + "step": 1354 + }, + { + "epoch": 0.1100462925363437, + "grad_norm": 3.4672179946418975, + "learning_rate": 4.916550388761242e-06, + "loss": 0.5567, + "step": 1355 + }, + { + "epoch": 0.11012750751238529, + "grad_norm": 4.01249160395805, + "learning_rate": 4.916381813327194e-06, + "loss": 0.5612, + "step": 1356 + }, + { + "epoch": 0.11020872248842686, + "grad_norm": 4.478566912854505, + "learning_rate": 4.916213070692312e-06, + "loss": 0.5274, + "step": 1357 + }, + { + "epoch": 0.11028993746446845, + "grad_norm": 7.752567209797171, + "learning_rate": 4.916044160868273e-06, + "loss": 0.5645, + "step": 1358 + }, + { + "epoch": 0.11037115244051003, + "grad_norm": 4.915949482614922, + "learning_rate": 4.915875083866766e-06, + "loss": 0.5816, + "step": 1359 + }, + { + "epoch": 0.11045236741655161, + "grad_norm": 4.872300208909787, + "learning_rate": 4.915705839699488e-06, + "loss": 0.5693, + "step": 1360 + }, + { + "epoch": 0.1105335823925932, + "grad_norm": 6.8669686866426645, + "learning_rate": 4.915536428378152e-06, + "loss": 0.6373, + "step": 1361 + }, + { + "epoch": 0.11061479736863478, + "grad_norm": 9.217173762118762, + "learning_rate": 4.915366849914479e-06, + "loss": 0.5895, + "step": 1362 + }, + { + "epoch": 0.11069601234467635, + "grad_norm": 5.851967895961872, + "learning_rate": 4.915197104320203e-06, + "loss": 0.6099, + "step": 1363 + }, + { + "epoch": 0.11077722732071794, + "grad_norm": 6.051580089476815, + "learning_rate": 4.915027191607069e-06, + "loss": 0.6278, + "step": 1364 + }, + { + "epoch": 0.11085844229675952, + "grad_norm": 4.182706621669257, + "learning_rate": 4.914857111786835e-06, + "loss": 0.7995, + "step": 1365 + }, + { + "epoch": 0.1109396572728011, + "grad_norm": 6.4377665961998, + "learning_rate": 4.9146868648712694e-06, + "loss": 0.5338, + "step": 1366 + }, + { + "epoch": 0.11102087224884269, + "grad_norm": 4.1795950899223255, + "learning_rate": 4.914516450872152e-06, + "loss": 0.4989, + "step": 1367 + }, + { + "epoch": 0.11110208722488427, + "grad_norm": 5.590546828289337, + "learning_rate": 4.914345869801276e-06, + "loss": 0.5973, + "step": 1368 + }, + { + "epoch": 0.11118330220092586, + "grad_norm": 6.538514315603108, + "learning_rate": 4.914175121670443e-06, + "loss": 0.5177, + "step": 1369 + }, + { + "epoch": 0.11126451717696743, + "grad_norm": 3.691433117466898, + "learning_rate": 4.914004206491467e-06, + "loss": 0.5844, + "step": 1370 + }, + { + "epoch": 0.11134573215300901, + "grad_norm": 4.522420652396943, + "learning_rate": 4.913833124276177e-06, + "loss": 0.5717, + "step": 1371 + }, + { + "epoch": 0.1114269471290506, + "grad_norm": 5.582588983452335, + "learning_rate": 4.9136618750364105e-06, + "loss": 0.5929, + "step": 1372 + }, + { + "epoch": 0.11150816210509218, + "grad_norm": 3.5822825901443145, + "learning_rate": 4.913490458784016e-06, + "loss": 0.6584, + "step": 1373 + }, + { + "epoch": 0.11158937708113376, + "grad_norm": 6.955323322713761, + "learning_rate": 4.913318875530855e-06, + "loss": 0.7448, + "step": 1374 + }, + { + "epoch": 0.11167059205717535, + "grad_norm": 7.999732286808689, + "learning_rate": 4.9131471252887995e-06, + "loss": 0.5943, + "step": 1375 + }, + { + "epoch": 0.11175180703321692, + "grad_norm": 5.699553884482589, + "learning_rate": 4.912975208069735e-06, + "loss": 0.5888, + "step": 1376 + }, + { + "epoch": 0.1118330220092585, + "grad_norm": 9.937720162090995, + "learning_rate": 4.912803123885555e-06, + "loss": 0.579, + "step": 1377 + }, + { + "epoch": 0.11191423698530009, + "grad_norm": 6.426480257301145, + "learning_rate": 4.912630872748171e-06, + "loss": 0.4428, + "step": 1378 + }, + { + "epoch": 0.11199545196134167, + "grad_norm": 4.6331109760310625, + "learning_rate": 4.912458454669498e-06, + "loss": 0.5904, + "step": 1379 + }, + { + "epoch": 0.11207666693738326, + "grad_norm": 5.28940520266137, + "learning_rate": 4.912285869661467e-06, + "loss": 0.5371, + "step": 1380 + }, + { + "epoch": 0.11215788191342484, + "grad_norm": 3.8233787152045418, + "learning_rate": 4.912113117736022e-06, + "loss": 0.7556, + "step": 1381 + }, + { + "epoch": 0.11223909688946641, + "grad_norm": 5.065689586582994, + "learning_rate": 4.911940198905114e-06, + "loss": 0.6213, + "step": 1382 + }, + { + "epoch": 0.112320311865508, + "grad_norm": 6.1114483413633245, + "learning_rate": 4.91176711318071e-06, + "loss": 0.6723, + "step": 1383 + }, + { + "epoch": 0.11240152684154958, + "grad_norm": 7.783537136958568, + "learning_rate": 4.911593860574785e-06, + "loss": 0.5786, + "step": 1384 + }, + { + "epoch": 0.11248274181759116, + "grad_norm": 8.312210011930208, + "learning_rate": 4.911420441099329e-06, + "loss": 0.6476, + "step": 1385 + }, + { + "epoch": 0.11256395679363275, + "grad_norm": 4.230328963619588, + "learning_rate": 4.911246854766341e-06, + "loss": 0.686, + "step": 1386 + }, + { + "epoch": 0.11264517176967433, + "grad_norm": 12.499607763263779, + "learning_rate": 4.911073101587831e-06, + "loss": 0.4509, + "step": 1387 + }, + { + "epoch": 0.1127263867457159, + "grad_norm": 4.97136722337538, + "learning_rate": 4.9108991815758225e-06, + "loss": 0.8017, + "step": 1388 + }, + { + "epoch": 0.1128076017217575, + "grad_norm": 4.68367623760023, + "learning_rate": 4.9107250947423516e-06, + "loss": 0.574, + "step": 1389 + }, + { + "epoch": 0.11288881669779907, + "grad_norm": 4.5482183003098, + "learning_rate": 4.910550841099462e-06, + "loss": 0.6234, + "step": 1390 + }, + { + "epoch": 0.11297003167384066, + "grad_norm": 3.734134582087491, + "learning_rate": 4.910376420659211e-06, + "loss": 0.7674, + "step": 1391 + }, + { + "epoch": 0.11305124664988224, + "grad_norm": 9.222540748041622, + "learning_rate": 4.91020183343367e-06, + "loss": 0.6176, + "step": 1392 + }, + { + "epoch": 0.11313246162592382, + "grad_norm": 7.2087300631114095, + "learning_rate": 4.910027079434917e-06, + "loss": 0.7806, + "step": 1393 + }, + { + "epoch": 0.11321367660196541, + "grad_norm": 4.209780015944576, + "learning_rate": 4.909852158675045e-06, + "loss": 0.6478, + "step": 1394 + }, + { + "epoch": 0.11329489157800698, + "grad_norm": 4.672108189183994, + "learning_rate": 4.9096770711661575e-06, + "loss": 0.7015, + "step": 1395 + }, + { + "epoch": 0.11337610655404856, + "grad_norm": 5.391522206407413, + "learning_rate": 4.90950181692037e-06, + "loss": 0.6014, + "step": 1396 + }, + { + "epoch": 0.11345732153009015, + "grad_norm": 6.512018993656574, + "learning_rate": 4.909326395949809e-06, + "loss": 0.6083, + "step": 1397 + }, + { + "epoch": 0.11353853650613173, + "grad_norm": 5.621883708458305, + "learning_rate": 4.909150808266613e-06, + "loss": 0.6495, + "step": 1398 + }, + { + "epoch": 0.1136197514821733, + "grad_norm": 7.16365741465048, + "learning_rate": 4.908975053882931e-06, + "loss": 0.5368, + "step": 1399 + }, + { + "epoch": 0.1137009664582149, + "grad_norm": 5.860792942986273, + "learning_rate": 4.908799132810924e-06, + "loss": 0.7422, + "step": 1400 + }, + { + "epoch": 0.11378218143425647, + "grad_norm": 3.673990330967191, + "learning_rate": 4.9086230450627655e-06, + "loss": 0.61, + "step": 1401 + }, + { + "epoch": 0.11386339641029807, + "grad_norm": 3.927055438430478, + "learning_rate": 4.908446790650641e-06, + "loss": 0.7079, + "step": 1402 + }, + { + "epoch": 0.11394461138633964, + "grad_norm": 7.144929200227564, + "learning_rate": 4.908270369586744e-06, + "loss": 0.5993, + "step": 1403 + }, + { + "epoch": 0.11402582636238122, + "grad_norm": 4.458920511681134, + "learning_rate": 4.908093781883283e-06, + "loss": 0.7028, + "step": 1404 + }, + { + "epoch": 0.11410704133842281, + "grad_norm": 4.699619928678632, + "learning_rate": 4.9079170275524765e-06, + "loss": 0.5911, + "step": 1405 + }, + { + "epoch": 0.11418825631446439, + "grad_norm": 5.941174948945086, + "learning_rate": 4.907740106606557e-06, + "loss": 0.5615, + "step": 1406 + }, + { + "epoch": 0.11426947129050596, + "grad_norm": 6.471908401673319, + "learning_rate": 4.9075630190577634e-06, + "loss": 0.5378, + "step": 1407 + }, + { + "epoch": 0.11435068626654755, + "grad_norm": 3.9643546125770825, + "learning_rate": 4.907385764918351e-06, + "loss": 0.6547, + "step": 1408 + }, + { + "epoch": 0.11443190124258913, + "grad_norm": 4.878410036784967, + "learning_rate": 4.907208344200585e-06, + "loss": 0.6994, + "step": 1409 + }, + { + "epoch": 0.11451311621863071, + "grad_norm": 5.156488131990337, + "learning_rate": 4.907030756916741e-06, + "loss": 0.5712, + "step": 1410 + }, + { + "epoch": 0.1145943311946723, + "grad_norm": 4.790335390127153, + "learning_rate": 4.906853003079108e-06, + "loss": 0.6316, + "step": 1411 + }, + { + "epoch": 0.11467554617071388, + "grad_norm": 4.153017717675372, + "learning_rate": 4.9066750826999855e-06, + "loss": 0.6906, + "step": 1412 + }, + { + "epoch": 0.11475676114675547, + "grad_norm": 3.87389513168029, + "learning_rate": 4.906496995791684e-06, + "loss": 0.512, + "step": 1413 + }, + { + "epoch": 0.11483797612279704, + "grad_norm": 4.564186801844, + "learning_rate": 4.906318742366527e-06, + "loss": 0.5193, + "step": 1414 + }, + { + "epoch": 0.11491919109883862, + "grad_norm": 5.555381562592889, + "learning_rate": 4.906140322436849e-06, + "loss": 0.7675, + "step": 1415 + }, + { + "epoch": 0.11500040607488021, + "grad_norm": 8.412815182964685, + "learning_rate": 4.9059617360149936e-06, + "loss": 0.4897, + "step": 1416 + }, + { + "epoch": 0.11508162105092179, + "grad_norm": 5.9497894212896325, + "learning_rate": 4.905782983113321e-06, + "loss": 0.8408, + "step": 1417 + }, + { + "epoch": 0.11516283602696337, + "grad_norm": 8.78483062159324, + "learning_rate": 4.905604063744197e-06, + "loss": 0.6732, + "step": 1418 + }, + { + "epoch": 0.11524405100300496, + "grad_norm": 9.970264715659578, + "learning_rate": 4.905424977920004e-06, + "loss": 0.532, + "step": 1419 + }, + { + "epoch": 0.11532526597904653, + "grad_norm": 5.671192067715544, + "learning_rate": 4.9052457256531325e-06, + "loss": 0.5852, + "step": 1420 + }, + { + "epoch": 0.11540648095508811, + "grad_norm": 7.150271447974999, + "learning_rate": 4.905066306955986e-06, + "loss": 0.6486, + "step": 1421 + }, + { + "epoch": 0.1154876959311297, + "grad_norm": 6.958143840228272, + "learning_rate": 4.904886721840981e-06, + "loss": 0.6012, + "step": 1422 + }, + { + "epoch": 0.11556891090717128, + "grad_norm": 4.760774004700414, + "learning_rate": 4.904706970320542e-06, + "loss": 0.6706, + "step": 1423 + }, + { + "epoch": 0.11565012588321287, + "grad_norm": 4.299376204301504, + "learning_rate": 4.904527052407107e-06, + "loss": 0.633, + "step": 1424 + }, + { + "epoch": 0.11573134085925445, + "grad_norm": 4.095629515769702, + "learning_rate": 4.904346968113126e-06, + "loss": 0.8379, + "step": 1425 + }, + { + "epoch": 0.11581255583529602, + "grad_norm": 6.181276965379973, + "learning_rate": 4.904166717451059e-06, + "loss": 0.7148, + "step": 1426 + }, + { + "epoch": 0.11589377081133762, + "grad_norm": 3.8573979500957676, + "learning_rate": 4.90398630043338e-06, + "loss": 0.6601, + "step": 1427 + }, + { + "epoch": 0.11597498578737919, + "grad_norm": 5.572617436631759, + "learning_rate": 4.903805717072572e-06, + "loss": 0.8698, + "step": 1428 + }, + { + "epoch": 0.11605620076342077, + "grad_norm": 6.9707268541736775, + "learning_rate": 4.90362496738113e-06, + "loss": 0.5288, + "step": 1429 + }, + { + "epoch": 0.11613741573946236, + "grad_norm": 4.5564529770959945, + "learning_rate": 4.9034440513715605e-06, + "loss": 0.4828, + "step": 1430 + }, + { + "epoch": 0.11621863071550394, + "grad_norm": 4.109199531053192, + "learning_rate": 4.9032629690563835e-06, + "loss": 0.7356, + "step": 1431 + }, + { + "epoch": 0.11629984569154551, + "grad_norm": 4.437378918259062, + "learning_rate": 4.903081720448128e-06, + "loss": 0.5222, + "step": 1432 + }, + { + "epoch": 0.1163810606675871, + "grad_norm": 6.399358574915106, + "learning_rate": 4.902900305559336e-06, + "loss": 0.5954, + "step": 1433 + }, + { + "epoch": 0.11646227564362868, + "grad_norm": 4.343118949951745, + "learning_rate": 4.9027187244025594e-06, + "loss": 0.5096, + "step": 1434 + }, + { + "epoch": 0.11654349061967027, + "grad_norm": 4.3409440129123515, + "learning_rate": 4.902536976990364e-06, + "loss": 0.5396, + "step": 1435 + }, + { + "epoch": 0.11662470559571185, + "grad_norm": 6.602474988405446, + "learning_rate": 4.902355063335324e-06, + "loss": 0.5902, + "step": 1436 + }, + { + "epoch": 0.11670592057175343, + "grad_norm": 5.5514520331211425, + "learning_rate": 4.902172983450029e-06, + "loss": 0.8389, + "step": 1437 + }, + { + "epoch": 0.11678713554779502, + "grad_norm": 6.703176030649621, + "learning_rate": 4.901990737347076e-06, + "loss": 0.8458, + "step": 1438 + }, + { + "epoch": 0.1168683505238366, + "grad_norm": 4.454570008772595, + "learning_rate": 4.901808325039077e-06, + "loss": 0.6968, + "step": 1439 + }, + { + "epoch": 0.11694956549987817, + "grad_norm": 4.842565250482216, + "learning_rate": 4.901625746538653e-06, + "loss": 0.5741, + "step": 1440 + }, + { + "epoch": 0.11703078047591976, + "grad_norm": 8.75110385587914, + "learning_rate": 4.901443001858438e-06, + "loss": 0.5164, + "step": 1441 + }, + { + "epoch": 0.11711199545196134, + "grad_norm": 4.468121329238175, + "learning_rate": 4.901260091011076e-06, + "loss": 0.6103, + "step": 1442 + }, + { + "epoch": 0.11719321042800292, + "grad_norm": 7.4646000814738045, + "learning_rate": 4.901077014009225e-06, + "loss": 0.4323, + "step": 1443 + }, + { + "epoch": 0.11727442540404451, + "grad_norm": 3.3258631715598055, + "learning_rate": 4.900893770865552e-06, + "loss": 0.5174, + "step": 1444 + }, + { + "epoch": 0.11735564038008608, + "grad_norm": 8.792100552657674, + "learning_rate": 4.900710361592737e-06, + "loss": 0.6525, + "step": 1445 + }, + { + "epoch": 0.11743685535612768, + "grad_norm": 4.961731173202635, + "learning_rate": 4.9005267862034695e-06, + "loss": 0.6754, + "step": 1446 + }, + { + "epoch": 0.11751807033216925, + "grad_norm": 3.814651002868712, + "learning_rate": 4.900343044710453e-06, + "loss": 0.6968, + "step": 1447 + }, + { + "epoch": 0.11759928530821083, + "grad_norm": 4.911489896369547, + "learning_rate": 4.900159137126402e-06, + "loss": 0.5924, + "step": 1448 + }, + { + "epoch": 0.11768050028425242, + "grad_norm": 7.2633083972944945, + "learning_rate": 4.899975063464042e-06, + "loss": 0.5496, + "step": 1449 + }, + { + "epoch": 0.117761715260294, + "grad_norm": 5.005595908510799, + "learning_rate": 4.899790823736108e-06, + "loss": 0.6644, + "step": 1450 + }, + { + "epoch": 0.11784293023633557, + "grad_norm": 4.702266439098655, + "learning_rate": 4.89960641795535e-06, + "loss": 0.8283, + "step": 1451 + }, + { + "epoch": 0.11792414521237717, + "grad_norm": 6.422288029967876, + "learning_rate": 4.899421846134529e-06, + "loss": 0.5909, + "step": 1452 + }, + { + "epoch": 0.11800536018841874, + "grad_norm": 6.425496839266026, + "learning_rate": 4.899237108286414e-06, + "loss": 0.499, + "step": 1453 + }, + { + "epoch": 0.11808657516446033, + "grad_norm": 3.4591295878059247, + "learning_rate": 4.8990522044237884e-06, + "loss": 0.5702, + "step": 1454 + }, + { + "epoch": 0.11816779014050191, + "grad_norm": 4.321892903241153, + "learning_rate": 4.898867134559448e-06, + "loss": 0.4269, + "step": 1455 + }, + { + "epoch": 0.11824900511654349, + "grad_norm": 6.352156069028349, + "learning_rate": 4.898681898706197e-06, + "loss": 0.5002, + "step": 1456 + }, + { + "epoch": 0.11833022009258508, + "grad_norm": 4.739299344449501, + "learning_rate": 4.898496496876854e-06, + "loss": 0.7199, + "step": 1457 + }, + { + "epoch": 0.11841143506862666, + "grad_norm": 5.475577877089898, + "learning_rate": 4.898310929084247e-06, + "loss": 0.6848, + "step": 1458 + }, + { + "epoch": 0.11849265004466823, + "grad_norm": 4.312752315143542, + "learning_rate": 4.898125195341217e-06, + "loss": 0.6698, + "step": 1459 + }, + { + "epoch": 0.11857386502070982, + "grad_norm": 6.101152761282113, + "learning_rate": 4.897939295660615e-06, + "loss": 0.5399, + "step": 1460 + }, + { + "epoch": 0.1186550799967514, + "grad_norm": 5.730964159782436, + "learning_rate": 4.897753230055304e-06, + "loss": 0.5391, + "step": 1461 + }, + { + "epoch": 0.11873629497279298, + "grad_norm": 4.602835724556902, + "learning_rate": 4.89756699853816e-06, + "loss": 0.6377, + "step": 1462 + }, + { + "epoch": 0.11881750994883457, + "grad_norm": 5.577323364531521, + "learning_rate": 4.8973806011220695e-06, + "loss": 0.6413, + "step": 1463 + }, + { + "epoch": 0.11889872492487615, + "grad_norm": 6.109634102169282, + "learning_rate": 4.897194037819928e-06, + "loss": 0.5454, + "step": 1464 + }, + { + "epoch": 0.11897993990091774, + "grad_norm": 4.473657871957378, + "learning_rate": 4.897007308644647e-06, + "loss": 0.5516, + "step": 1465 + }, + { + "epoch": 0.11906115487695931, + "grad_norm": 4.6381132522690995, + "learning_rate": 4.896820413609146e-06, + "loss": 0.6565, + "step": 1466 + }, + { + "epoch": 0.11914236985300089, + "grad_norm": 5.386818421013147, + "learning_rate": 4.896633352726357e-06, + "loss": 0.7063, + "step": 1467 + }, + { + "epoch": 0.11922358482904248, + "grad_norm": 3.1581048679010646, + "learning_rate": 4.896446126009224e-06, + "loss": 0.7565, + "step": 1468 + }, + { + "epoch": 0.11930479980508406, + "grad_norm": 6.401004722074878, + "learning_rate": 4.896258733470702e-06, + "loss": 0.5026, + "step": 1469 + }, + { + "epoch": 0.11938601478112564, + "grad_norm": 2.7519458408541797, + "learning_rate": 4.896071175123758e-06, + "loss": 0.6255, + "step": 1470 + }, + { + "epoch": 0.11946722975716723, + "grad_norm": 4.404610542747091, + "learning_rate": 4.8958834509813706e-06, + "loss": 0.5504, + "step": 1471 + }, + { + "epoch": 0.1195484447332088, + "grad_norm": 6.077687424149937, + "learning_rate": 4.8956955610565275e-06, + "loss": 0.6899, + "step": 1472 + }, + { + "epoch": 0.11962965970925038, + "grad_norm": 5.523332768156323, + "learning_rate": 4.895507505362231e-06, + "loss": 0.5743, + "step": 1473 + }, + { + "epoch": 0.11971087468529197, + "grad_norm": 5.840235758929547, + "learning_rate": 4.895319283911492e-06, + "loss": 0.4483, + "step": 1474 + }, + { + "epoch": 0.11979208966133355, + "grad_norm": 3.9853018352679177, + "learning_rate": 4.895130896717336e-06, + "loss": 0.6511, + "step": 1475 + }, + { + "epoch": 0.11987330463737514, + "grad_norm": 4.087084936629081, + "learning_rate": 4.894942343792799e-06, + "loss": 0.6048, + "step": 1476 + }, + { + "epoch": 0.11995451961341672, + "grad_norm": 4.9948060996575325, + "learning_rate": 4.894753625150927e-06, + "loss": 0.5193, + "step": 1477 + }, + { + "epoch": 0.12003573458945829, + "grad_norm": 5.148303769586792, + "learning_rate": 4.894564740804777e-06, + "loss": 0.4947, + "step": 1478 + }, + { + "epoch": 0.12011694956549988, + "grad_norm": 4.926289780739354, + "learning_rate": 4.89437569076742e-06, + "loss": 0.641, + "step": 1479 + }, + { + "epoch": 0.12019816454154146, + "grad_norm": 4.420640360809511, + "learning_rate": 4.894186475051938e-06, + "loss": 0.6427, + "step": 1480 + }, + { + "epoch": 0.12027937951758304, + "grad_norm": 4.552192514027042, + "learning_rate": 4.893997093671422e-06, + "loss": 0.6684, + "step": 1481 + }, + { + "epoch": 0.12036059449362463, + "grad_norm": 3.6702893426104306, + "learning_rate": 4.893807546638979e-06, + "loss": 0.6778, + "step": 1482 + }, + { + "epoch": 0.1204418094696662, + "grad_norm": 7.860150046179448, + "learning_rate": 4.893617833967721e-06, + "loss": 0.6479, + "step": 1483 + }, + { + "epoch": 0.12052302444570778, + "grad_norm": 3.6225218661319083, + "learning_rate": 4.893427955670778e-06, + "loss": 0.5258, + "step": 1484 + }, + { + "epoch": 0.12060423942174937, + "grad_norm": 7.286179190372548, + "learning_rate": 4.893237911761287e-06, + "loss": 0.5499, + "step": 1485 + }, + { + "epoch": 0.12068545439779095, + "grad_norm": 5.426351678323337, + "learning_rate": 4.893047702252399e-06, + "loss": 0.4595, + "step": 1486 + }, + { + "epoch": 0.12076666937383254, + "grad_norm": 4.641027529381159, + "learning_rate": 4.892857327157275e-06, + "loss": 0.7437, + "step": 1487 + }, + { + "epoch": 0.12084788434987412, + "grad_norm": 5.910005928653577, + "learning_rate": 4.892666786489087e-06, + "loss": 0.5187, + "step": 1488 + }, + { + "epoch": 0.1209290993259157, + "grad_norm": 4.433359476670774, + "learning_rate": 4.8924760802610215e-06, + "loss": 0.6245, + "step": 1489 + }, + { + "epoch": 0.12101031430195729, + "grad_norm": 16.649957190107745, + "learning_rate": 4.8922852084862734e-06, + "loss": 0.6885, + "step": 1490 + }, + { + "epoch": 0.12109152927799886, + "grad_norm": 5.101531362441216, + "learning_rate": 4.892094171178049e-06, + "loss": 0.4662, + "step": 1491 + }, + { + "epoch": 0.12117274425404044, + "grad_norm": 5.1657125457609725, + "learning_rate": 4.891902968349568e-06, + "loss": 0.5948, + "step": 1492 + }, + { + "epoch": 0.12125395923008203, + "grad_norm": 4.342894827579331, + "learning_rate": 4.8917116000140614e-06, + "loss": 0.5409, + "step": 1493 + }, + { + "epoch": 0.12133517420612361, + "grad_norm": 3.8330951289725124, + "learning_rate": 4.8915200661847695e-06, + "loss": 0.52, + "step": 1494 + }, + { + "epoch": 0.12141638918216519, + "grad_norm": 5.179925909022087, + "learning_rate": 4.891328366874946e-06, + "loss": 0.5504, + "step": 1495 + }, + { + "epoch": 0.12149760415820678, + "grad_norm": 5.330823987896905, + "learning_rate": 4.891136502097855e-06, + "loss": 0.6273, + "step": 1496 + }, + { + "epoch": 0.12157881913424835, + "grad_norm": 5.585945845508732, + "learning_rate": 4.890944471866774e-06, + "loss": 0.6236, + "step": 1497 + }, + { + "epoch": 0.12166003411028994, + "grad_norm": 3.83199043102917, + "learning_rate": 4.890752276194989e-06, + "loss": 0.7202, + "step": 1498 + }, + { + "epoch": 0.12174124908633152, + "grad_norm": 3.6411956112594677, + "learning_rate": 4.890559915095798e-06, + "loss": 0.5213, + "step": 1499 + }, + { + "epoch": 0.1218224640623731, + "grad_norm": 4.786252322922742, + "learning_rate": 4.890367388582514e-06, + "loss": 0.9004, + "step": 1500 + }, + { + "epoch": 0.12190367903841469, + "grad_norm": 4.979342968709159, + "learning_rate": 4.890174696668458e-06, + "loss": 0.6452, + "step": 1501 + }, + { + "epoch": 0.12198489401445627, + "grad_norm": 4.11868570254399, + "learning_rate": 4.889981839366962e-06, + "loss": 0.4785, + "step": 1502 + }, + { + "epoch": 0.12206610899049784, + "grad_norm": 4.908815740621781, + "learning_rate": 4.889788816691372e-06, + "loss": 0.6467, + "step": 1503 + }, + { + "epoch": 0.12214732396653943, + "grad_norm": 6.3622155832910625, + "learning_rate": 4.889595628655044e-06, + "loss": 0.61, + "step": 1504 + }, + { + "epoch": 0.12222853894258101, + "grad_norm": 3.6609655168066064, + "learning_rate": 4.8894022752713445e-06, + "loss": 0.7134, + "step": 1505 + }, + { + "epoch": 0.12230975391862259, + "grad_norm": 4.846573857319596, + "learning_rate": 4.8892087565536535e-06, + "loss": 0.6646, + "step": 1506 + }, + { + "epoch": 0.12239096889466418, + "grad_norm": 5.788899246359694, + "learning_rate": 4.889015072515361e-06, + "loss": 0.7547, + "step": 1507 + }, + { + "epoch": 0.12247218387070576, + "grad_norm": 4.583902093938661, + "learning_rate": 4.888821223169869e-06, + "loss": 0.5206, + "step": 1508 + }, + { + "epoch": 0.12255339884674735, + "grad_norm": 3.3647325384813915, + "learning_rate": 4.888627208530592e-06, + "loss": 0.5928, + "step": 1509 + }, + { + "epoch": 0.12263461382278892, + "grad_norm": 3.8539584897978654, + "learning_rate": 4.8884330286109535e-06, + "loss": 0.5397, + "step": 1510 + }, + { + "epoch": 0.1227158287988305, + "grad_norm": 15.266199610075871, + "learning_rate": 4.88823868342439e-06, + "loss": 0.6754, + "step": 1511 + }, + { + "epoch": 0.12279704377487209, + "grad_norm": 5.095675930247071, + "learning_rate": 4.888044172984349e-06, + "loss": 0.7183, + "step": 1512 + }, + { + "epoch": 0.12287825875091367, + "grad_norm": 5.589937550753531, + "learning_rate": 4.887849497304289e-06, + "loss": 0.7005, + "step": 1513 + }, + { + "epoch": 0.12295947372695525, + "grad_norm": 6.394170257036341, + "learning_rate": 4.8876546563976825e-06, + "loss": 0.6316, + "step": 1514 + }, + { + "epoch": 0.12304068870299684, + "grad_norm": 5.521671875850051, + "learning_rate": 4.88745965027801e-06, + "loss": 0.6913, + "step": 1515 + }, + { + "epoch": 0.12312190367903841, + "grad_norm": 4.969659762897969, + "learning_rate": 4.887264478958765e-06, + "loss": 0.5295, + "step": 1516 + }, + { + "epoch": 0.12320311865507999, + "grad_norm": 6.2253045417503845, + "learning_rate": 4.887069142453453e-06, + "loss": 0.6538, + "step": 1517 + }, + { + "epoch": 0.12328433363112158, + "grad_norm": 9.570082889106901, + "learning_rate": 4.886873640775588e-06, + "loss": 0.5829, + "step": 1518 + }, + { + "epoch": 0.12336554860716316, + "grad_norm": 5.609163036466005, + "learning_rate": 4.886677973938701e-06, + "loss": 0.5647, + "step": 1519 + }, + { + "epoch": 0.12344676358320475, + "grad_norm": 7.818406834616618, + "learning_rate": 4.886482141956329e-06, + "loss": 0.6357, + "step": 1520 + }, + { + "epoch": 0.12352797855924633, + "grad_norm": 3.1695857831672067, + "learning_rate": 4.8862861448420234e-06, + "loss": 0.5878, + "step": 1521 + }, + { + "epoch": 0.1236091935352879, + "grad_norm": 4.384307163664372, + "learning_rate": 4.886089982609345e-06, + "loss": 0.5771, + "step": 1522 + }, + { + "epoch": 0.1236904085113295, + "grad_norm": 4.469922202011447, + "learning_rate": 4.885893655271869e-06, + "loss": 0.5124, + "step": 1523 + }, + { + "epoch": 0.12377162348737107, + "grad_norm": 4.594952235845705, + "learning_rate": 4.885697162843179e-06, + "loss": 0.6882, + "step": 1524 + }, + { + "epoch": 0.12385283846341265, + "grad_norm": 11.952868311490398, + "learning_rate": 4.8855005053368715e-06, + "loss": 0.5141, + "step": 1525 + }, + { + "epoch": 0.12393405343945424, + "grad_norm": 5.107326813624944, + "learning_rate": 4.885303682766554e-06, + "loss": 0.6077, + "step": 1526 + }, + { + "epoch": 0.12401526841549582, + "grad_norm": 3.90801909322924, + "learning_rate": 4.885106695145846e-06, + "loss": 0.739, + "step": 1527 + }, + { + "epoch": 0.1240964833915374, + "grad_norm": 4.458856274877279, + "learning_rate": 4.884909542488377e-06, + "loss": 0.4853, + "step": 1528 + }, + { + "epoch": 0.12417769836757898, + "grad_norm": 7.488057909444421, + "learning_rate": 4.88471222480779e-06, + "loss": 0.5267, + "step": 1529 + }, + { + "epoch": 0.12425891334362056, + "grad_norm": 10.709236752099061, + "learning_rate": 4.8845147421177375e-06, + "loss": 0.5767, + "step": 1530 + }, + { + "epoch": 0.12434012831966215, + "grad_norm": 4.227280902938617, + "learning_rate": 4.8843170944318855e-06, + "loss": 0.6582, + "step": 1531 + }, + { + "epoch": 0.12442134329570373, + "grad_norm": 5.304472983158379, + "learning_rate": 4.88411928176391e-06, + "loss": 0.4994, + "step": 1532 + }, + { + "epoch": 0.1245025582717453, + "grad_norm": 6.083933660892841, + "learning_rate": 4.8839213041274955e-06, + "loss": 0.454, + "step": 1533 + }, + { + "epoch": 0.1245837732477869, + "grad_norm": 7.150062978858339, + "learning_rate": 4.8837231615363455e-06, + "loss": 0.7561, + "step": 1534 + }, + { + "epoch": 0.12466498822382847, + "grad_norm": 6.602096765289932, + "learning_rate": 4.883524854004168e-06, + "loss": 0.6597, + "step": 1535 + }, + { + "epoch": 0.12474620319987005, + "grad_norm": 4.853497195307292, + "learning_rate": 4.883326381544686e-06, + "loss": 0.5989, + "step": 1536 + }, + { + "epoch": 0.12482741817591164, + "grad_norm": 5.57662629799743, + "learning_rate": 4.88312774417163e-06, + "loss": 0.555, + "step": 1537 + }, + { + "epoch": 0.12490863315195322, + "grad_norm": 4.64783655989287, + "learning_rate": 4.882928941898748e-06, + "loss": 0.5814, + "step": 1538 + }, + { + "epoch": 0.1249898481279948, + "grad_norm": 8.657184018704008, + "learning_rate": 4.882729974739794e-06, + "loss": 0.5567, + "step": 1539 + }, + { + "epoch": 0.12507106310403637, + "grad_norm": 9.124561847833686, + "learning_rate": 4.882530842708537e-06, + "loss": 0.5428, + "step": 1540 + }, + { + "epoch": 0.12515227808007798, + "grad_norm": 5.93162727377288, + "learning_rate": 4.882331545818755e-06, + "loss": 0.5641, + "step": 1541 + }, + { + "epoch": 0.12523349305611955, + "grad_norm": 4.417596995333459, + "learning_rate": 4.882132084084238e-06, + "loss": 0.6554, + "step": 1542 + }, + { + "epoch": 0.12531470803216113, + "grad_norm": 5.238384789515402, + "learning_rate": 4.8819324575187875e-06, + "loss": 0.8369, + "step": 1543 + }, + { + "epoch": 0.1253959230082027, + "grad_norm": 5.490158000040072, + "learning_rate": 4.881732666136217e-06, + "loss": 0.7737, + "step": 1544 + }, + { + "epoch": 0.12547713798424429, + "grad_norm": 18.610112814184124, + "learning_rate": 4.881532709950352e-06, + "loss": 0.5733, + "step": 1545 + }, + { + "epoch": 0.1255583529602859, + "grad_norm": 4.3346186230095585, + "learning_rate": 4.8813325889750275e-06, + "loss": 0.518, + "step": 1546 + }, + { + "epoch": 0.12563956793632747, + "grad_norm": 5.394377171648292, + "learning_rate": 4.881132303224091e-06, + "loss": 0.4973, + "step": 1547 + }, + { + "epoch": 0.12572078291236904, + "grad_norm": 6.281282652469185, + "learning_rate": 4.880931852711401e-06, + "loss": 0.53, + "step": 1548 + }, + { + "epoch": 0.12580199788841062, + "grad_norm": 5.063604963463662, + "learning_rate": 4.880731237450828e-06, + "loss": 0.543, + "step": 1549 + }, + { + "epoch": 0.1258832128644522, + "grad_norm": 5.589066320656067, + "learning_rate": 4.880530457456252e-06, + "loss": 0.5307, + "step": 1550 + }, + { + "epoch": 0.12596442784049378, + "grad_norm": 5.313990606526498, + "learning_rate": 4.880329512741568e-06, + "loss": 0.6096, + "step": 1551 + }, + { + "epoch": 0.12604564281653538, + "grad_norm": 5.093580296509499, + "learning_rate": 4.88012840332068e-06, + "loss": 0.6667, + "step": 1552 + }, + { + "epoch": 0.12612685779257696, + "grad_norm": 3.074951611991778, + "learning_rate": 4.879927129207502e-06, + "loss": 0.669, + "step": 1553 + }, + { + "epoch": 0.12620807276861853, + "grad_norm": 9.209748336487731, + "learning_rate": 4.8797256904159625e-06, + "loss": 0.7399, + "step": 1554 + }, + { + "epoch": 0.1262892877446601, + "grad_norm": 5.6697335990100965, + "learning_rate": 4.87952408696e-06, + "loss": 0.6407, + "step": 1555 + }, + { + "epoch": 0.1263705027207017, + "grad_norm": 8.545213357048508, + "learning_rate": 4.879322318853564e-06, + "loss": 0.5582, + "step": 1556 + }, + { + "epoch": 0.1264517176967433, + "grad_norm": 7.569125385036642, + "learning_rate": 4.879120386110616e-06, + "loss": 0.6649, + "step": 1557 + }, + { + "epoch": 0.12653293267278487, + "grad_norm": 16.126868678635603, + "learning_rate": 4.878918288745128e-06, + "loss": 0.6249, + "step": 1558 + }, + { + "epoch": 0.12661414764882645, + "grad_norm": 5.161708427963795, + "learning_rate": 4.878716026771086e-06, + "loss": 0.5061, + "step": 1559 + }, + { + "epoch": 0.12669536262486802, + "grad_norm": 4.94640934937331, + "learning_rate": 4.878513600202483e-06, + "loss": 0.6149, + "step": 1560 + }, + { + "epoch": 0.1267765776009096, + "grad_norm": 5.628454265223888, + "learning_rate": 4.878311009053328e-06, + "loss": 0.5789, + "step": 1561 + }, + { + "epoch": 0.12685779257695118, + "grad_norm": 5.099342067120917, + "learning_rate": 4.878108253337638e-06, + "loss": 0.6344, + "step": 1562 + }, + { + "epoch": 0.12693900755299278, + "grad_norm": 5.258000831985147, + "learning_rate": 4.877905333069442e-06, + "loss": 0.5775, + "step": 1563 + }, + { + "epoch": 0.12702022252903436, + "grad_norm": 6.236383483926228, + "learning_rate": 4.877702248262782e-06, + "loss": 0.5334, + "step": 1564 + }, + { + "epoch": 0.12710143750507594, + "grad_norm": 4.048534369160581, + "learning_rate": 4.87749899893171e-06, + "loss": 0.7184, + "step": 1565 + }, + { + "epoch": 0.12718265248111751, + "grad_norm": 4.398427223579315, + "learning_rate": 4.8772955850902914e-06, + "loss": 0.6671, + "step": 1566 + }, + { + "epoch": 0.1272638674571591, + "grad_norm": 5.637044630053773, + "learning_rate": 4.877092006752599e-06, + "loss": 0.621, + "step": 1567 + }, + { + "epoch": 0.1273450824332007, + "grad_norm": 6.7178341766236125, + "learning_rate": 4.876888263932721e-06, + "loss": 0.5828, + "step": 1568 + }, + { + "epoch": 0.12742629740924227, + "grad_norm": 5.624169626907889, + "learning_rate": 4.876684356644754e-06, + "loss": 0.6779, + "step": 1569 + }, + { + "epoch": 0.12750751238528385, + "grad_norm": 4.476842415269882, + "learning_rate": 4.876480284902807e-06, + "loss": 0.4934, + "step": 1570 + }, + { + "epoch": 0.12758872736132543, + "grad_norm": 3.7336585247616885, + "learning_rate": 4.8762760487210035e-06, + "loss": 0.5301, + "step": 1571 + }, + { + "epoch": 0.127669942337367, + "grad_norm": 8.61154812995492, + "learning_rate": 4.876071648113473e-06, + "loss": 0.6757, + "step": 1572 + }, + { + "epoch": 0.12775115731340858, + "grad_norm": 6.969888194514512, + "learning_rate": 4.875867083094359e-06, + "loss": 0.7971, + "step": 1573 + }, + { + "epoch": 0.12783237228945019, + "grad_norm": 7.7890484350762765, + "learning_rate": 4.875662353677818e-06, + "loss": 0.4918, + "step": 1574 + }, + { + "epoch": 0.12791358726549176, + "grad_norm": 4.545675942015323, + "learning_rate": 4.875457459878014e-06, + "loss": 0.6069, + "step": 1575 + }, + { + "epoch": 0.12799480224153334, + "grad_norm": 7.615402665241847, + "learning_rate": 4.875252401709126e-06, + "loss": 0.5996, + "step": 1576 + }, + { + "epoch": 0.12807601721757492, + "grad_norm": 5.198690570704765, + "learning_rate": 4.8750471791853436e-06, + "loss": 0.6422, + "step": 1577 + }, + { + "epoch": 0.1281572321936165, + "grad_norm": 5.875388513113994, + "learning_rate": 4.874841792320865e-06, + "loss": 0.4701, + "step": 1578 + }, + { + "epoch": 0.1282384471696581, + "grad_norm": 6.491950200516407, + "learning_rate": 4.874636241129904e-06, + "loss": 0.5815, + "step": 1579 + }, + { + "epoch": 0.12831966214569968, + "grad_norm": 4.293195229926921, + "learning_rate": 4.874430525626682e-06, + "loss": 0.6365, + "step": 1580 + }, + { + "epoch": 0.12840087712174125, + "grad_norm": 8.402150286140357, + "learning_rate": 4.874224645825435e-06, + "loss": 0.4979, + "step": 1581 + }, + { + "epoch": 0.12848209209778283, + "grad_norm": 6.3159806141726, + "learning_rate": 4.874018601740407e-06, + "loss": 0.504, + "step": 1582 + }, + { + "epoch": 0.1285633070738244, + "grad_norm": 5.865695583703645, + "learning_rate": 4.873812393385856e-06, + "loss": 0.5678, + "step": 1583 + }, + { + "epoch": 0.12864452204986598, + "grad_norm": 4.259448849973863, + "learning_rate": 4.873606020776051e-06, + "loss": 0.637, + "step": 1584 + }, + { + "epoch": 0.1287257370259076, + "grad_norm": 4.795031800681299, + "learning_rate": 4.873399483925272e-06, + "loss": 0.6518, + "step": 1585 + }, + { + "epoch": 0.12880695200194917, + "grad_norm": 6.982468505996596, + "learning_rate": 4.8731927828478085e-06, + "loss": 0.5015, + "step": 1586 + }, + { + "epoch": 0.12888816697799074, + "grad_norm": 4.194815922907224, + "learning_rate": 4.872985917557965e-06, + "loss": 0.6631, + "step": 1587 + }, + { + "epoch": 0.12896938195403232, + "grad_norm": 6.979913270961702, + "learning_rate": 4.872778888070055e-06, + "loss": 0.477, + "step": 1588 + }, + { + "epoch": 0.1290505969300739, + "grad_norm": 6.539545690632459, + "learning_rate": 4.872571694398403e-06, + "loss": 0.5583, + "step": 1589 + }, + { + "epoch": 0.1291318119061155, + "grad_norm": 7.207244340353426, + "learning_rate": 4.872364336557348e-06, + "loss": 0.6374, + "step": 1590 + }, + { + "epoch": 0.12921302688215708, + "grad_norm": 5.271822346774599, + "learning_rate": 4.8721568145612355e-06, + "loss": 0.5826, + "step": 1591 + }, + { + "epoch": 0.12929424185819866, + "grad_norm": 6.699431722524916, + "learning_rate": 4.8719491284244256e-06, + "loss": 0.5468, + "step": 1592 + }, + { + "epoch": 0.12937545683424023, + "grad_norm": 4.303099582925413, + "learning_rate": 4.871741278161291e-06, + "loss": 0.5465, + "step": 1593 + }, + { + "epoch": 0.1294566718102818, + "grad_norm": 3.318090123286297, + "learning_rate": 4.87153326378621e-06, + "loss": 0.5562, + "step": 1594 + }, + { + "epoch": 0.1295378867863234, + "grad_norm": 3.8336332872072507, + "learning_rate": 4.87132508531358e-06, + "loss": 0.6022, + "step": 1595 + }, + { + "epoch": 0.129619101762365, + "grad_norm": 6.539483107346492, + "learning_rate": 4.871116742757803e-06, + "loss": 0.4905, + "step": 1596 + }, + { + "epoch": 0.12970031673840657, + "grad_norm": 3.2101449234700072, + "learning_rate": 4.870908236133297e-06, + "loss": 0.6639, + "step": 1597 + }, + { + "epoch": 0.12978153171444815, + "grad_norm": 11.166157350084525, + "learning_rate": 4.870699565454489e-06, + "loss": 0.5098, + "step": 1598 + }, + { + "epoch": 0.12986274669048972, + "grad_norm": 4.137968227782281, + "learning_rate": 4.870490730735818e-06, + "loss": 0.4853, + "step": 1599 + }, + { + "epoch": 0.1299439616665313, + "grad_norm": 3.555952898094085, + "learning_rate": 4.870281731991733e-06, + "loss": 0.5433, + "step": 1600 + }, + { + "epoch": 0.1300251766425729, + "grad_norm": 6.292559801998863, + "learning_rate": 4.870072569236697e-06, + "loss": 0.5833, + "step": 1601 + }, + { + "epoch": 0.13010639161861448, + "grad_norm": 4.245679851338393, + "learning_rate": 4.869863242485183e-06, + "loss": 0.5839, + "step": 1602 + }, + { + "epoch": 0.13018760659465606, + "grad_norm": 10.695591016769804, + "learning_rate": 4.8696537517516754e-06, + "loss": 0.5284, + "step": 1603 + }, + { + "epoch": 0.13026882157069763, + "grad_norm": 5.501068173317865, + "learning_rate": 4.869444097050668e-06, + "loss": 0.5927, + "step": 1604 + }, + { + "epoch": 0.1303500365467392, + "grad_norm": 5.717664746661781, + "learning_rate": 4.8692342783966706e-06, + "loss": 0.5258, + "step": 1605 + }, + { + "epoch": 0.1304312515227808, + "grad_norm": 5.029968109519692, + "learning_rate": 4.869024295804199e-06, + "loss": 0.6064, + "step": 1606 + }, + { + "epoch": 0.1305124664988224, + "grad_norm": 3.7931205356074575, + "learning_rate": 4.868814149287785e-06, + "loss": 0.5764, + "step": 1607 + }, + { + "epoch": 0.13059368147486397, + "grad_norm": 4.944616527342133, + "learning_rate": 4.868603838861969e-06, + "loss": 0.5991, + "step": 1608 + }, + { + "epoch": 0.13067489645090555, + "grad_norm": 5.178859852231613, + "learning_rate": 4.868393364541301e-06, + "loss": 0.6465, + "step": 1609 + }, + { + "epoch": 0.13075611142694712, + "grad_norm": 4.567090745940825, + "learning_rate": 4.868182726340349e-06, + "loss": 0.6649, + "step": 1610 + }, + { + "epoch": 0.1308373264029887, + "grad_norm": 5.802293496504625, + "learning_rate": 4.867971924273685e-06, + "loss": 0.5043, + "step": 1611 + }, + { + "epoch": 0.1309185413790303, + "grad_norm": 4.914755653217928, + "learning_rate": 4.8677609583558956e-06, + "loss": 0.5295, + "step": 1612 + }, + { + "epoch": 0.13099975635507188, + "grad_norm": 7.208070575519608, + "learning_rate": 4.867549828601579e-06, + "loss": 0.6164, + "step": 1613 + }, + { + "epoch": 0.13108097133111346, + "grad_norm": 7.77772857949289, + "learning_rate": 4.8673385350253454e-06, + "loss": 0.5208, + "step": 1614 + }, + { + "epoch": 0.13116218630715504, + "grad_norm": 8.264639243565528, + "learning_rate": 4.867127077641813e-06, + "loss": 0.6235, + "step": 1615 + }, + { + "epoch": 0.13124340128319661, + "grad_norm": 5.062219960669989, + "learning_rate": 4.866915456465615e-06, + "loss": 0.57, + "step": 1616 + }, + { + "epoch": 0.1313246162592382, + "grad_norm": 16.478111147319943, + "learning_rate": 4.866703671511395e-06, + "loss": 0.5729, + "step": 1617 + }, + { + "epoch": 0.1314058312352798, + "grad_norm": 4.4225512989681475, + "learning_rate": 4.8664917227938056e-06, + "loss": 0.5722, + "step": 1618 + }, + { + "epoch": 0.13148704621132137, + "grad_norm": 5.041669688436434, + "learning_rate": 4.866279610327514e-06, + "loss": 0.7651, + "step": 1619 + }, + { + "epoch": 0.13156826118736295, + "grad_norm": 5.8825838835020505, + "learning_rate": 4.8660673341271966e-06, + "loss": 0.6381, + "step": 1620 + }, + { + "epoch": 0.13164947616340453, + "grad_norm": 4.034188407645285, + "learning_rate": 4.865854894207541e-06, + "loss": 0.6949, + "step": 1621 + }, + { + "epoch": 0.1317306911394461, + "grad_norm": 3.6403419129784864, + "learning_rate": 4.865642290583249e-06, + "loss": 0.5172, + "step": 1622 + }, + { + "epoch": 0.1318119061154877, + "grad_norm": 4.836824286973765, + "learning_rate": 4.86542952326903e-06, + "loss": 0.6615, + "step": 1623 + }, + { + "epoch": 0.1318931210915293, + "grad_norm": 4.388993168301412, + "learning_rate": 4.865216592279607e-06, + "loss": 0.6131, + "step": 1624 + }, + { + "epoch": 0.13197433606757086, + "grad_norm": 6.591216319051455, + "learning_rate": 4.865003497629713e-06, + "loss": 0.6583, + "step": 1625 + }, + { + "epoch": 0.13205555104361244, + "grad_norm": 4.389020157372549, + "learning_rate": 4.8647902393340955e-06, + "loss": 0.6026, + "step": 1626 + }, + { + "epoch": 0.13213676601965402, + "grad_norm": 4.250808042920454, + "learning_rate": 4.864576817407507e-06, + "loss": 0.6323, + "step": 1627 + }, + { + "epoch": 0.1322179809956956, + "grad_norm": 6.881832060317491, + "learning_rate": 4.864363231864717e-06, + "loss": 0.6207, + "step": 1628 + }, + { + "epoch": 0.1322991959717372, + "grad_norm": 10.412011238660916, + "learning_rate": 4.864149482720505e-06, + "loss": 0.5453, + "step": 1629 + }, + { + "epoch": 0.13238041094777878, + "grad_norm": 5.774868373588398, + "learning_rate": 4.863935569989662e-06, + "loss": 0.5582, + "step": 1630 + }, + { + "epoch": 0.13246162592382035, + "grad_norm": 5.245799631843284, + "learning_rate": 4.863721493686987e-06, + "loss": 0.6431, + "step": 1631 + }, + { + "epoch": 0.13254284089986193, + "grad_norm": 4.177337851592638, + "learning_rate": 4.8635072538272954e-06, + "loss": 0.5681, + "step": 1632 + }, + { + "epoch": 0.1326240558759035, + "grad_norm": 3.517896942856102, + "learning_rate": 4.863292850425409e-06, + "loss": 0.6479, + "step": 1633 + }, + { + "epoch": 0.1327052708519451, + "grad_norm": 5.350168873979354, + "learning_rate": 4.863078283496167e-06, + "loss": 0.6847, + "step": 1634 + }, + { + "epoch": 0.1327864858279867, + "grad_norm": 6.986629354106734, + "learning_rate": 4.862863553054413e-06, + "loss": 0.6294, + "step": 1635 + }, + { + "epoch": 0.13286770080402827, + "grad_norm": 4.997326232828624, + "learning_rate": 4.862648659115007e-06, + "loss": 0.7014, + "step": 1636 + }, + { + "epoch": 0.13294891578006984, + "grad_norm": 4.2346207052457485, + "learning_rate": 4.8624336016928175e-06, + "loss": 0.5626, + "step": 1637 + }, + { + "epoch": 0.13303013075611142, + "grad_norm": 4.331783623394118, + "learning_rate": 4.8622183808027255e-06, + "loss": 0.6618, + "step": 1638 + }, + { + "epoch": 0.133111345732153, + "grad_norm": 5.008484418976238, + "learning_rate": 4.8620029964596234e-06, + "loss": 0.6353, + "step": 1639 + }, + { + "epoch": 0.1331925607081946, + "grad_norm": 6.50139784970913, + "learning_rate": 4.861787448678416e-06, + "loss": 0.486, + "step": 1640 + }, + { + "epoch": 0.13327377568423618, + "grad_norm": 6.369833370069241, + "learning_rate": 4.861571737474015e-06, + "loss": 0.5904, + "step": 1641 + }, + { + "epoch": 0.13335499066027776, + "grad_norm": 3.8537585558162317, + "learning_rate": 4.8613558628613494e-06, + "loss": 0.8424, + "step": 1642 + }, + { + "epoch": 0.13343620563631933, + "grad_norm": 6.904987154180493, + "learning_rate": 4.8611398248553554e-06, + "loss": 0.5671, + "step": 1643 + }, + { + "epoch": 0.1335174206123609, + "grad_norm": 3.1433510533347415, + "learning_rate": 4.860923623470981e-06, + "loss": 0.4704, + "step": 1644 + }, + { + "epoch": 0.13359863558840251, + "grad_norm": 5.418253266261615, + "learning_rate": 4.860707258723187e-06, + "loss": 0.5313, + "step": 1645 + }, + { + "epoch": 0.1336798505644441, + "grad_norm": 5.2359535500265615, + "learning_rate": 4.860490730626945e-06, + "loss": 0.5742, + "step": 1646 + }, + { + "epoch": 0.13376106554048567, + "grad_norm": 11.436418505055565, + "learning_rate": 4.860274039197237e-06, + "loss": 0.5654, + "step": 1647 + }, + { + "epoch": 0.13384228051652725, + "grad_norm": 6.262589762855426, + "learning_rate": 4.860057184449057e-06, + "loss": 0.4724, + "step": 1648 + }, + { + "epoch": 0.13392349549256882, + "grad_norm": 5.518995358402323, + "learning_rate": 4.85984016639741e-06, + "loss": 0.5537, + "step": 1649 + }, + { + "epoch": 0.1340047104686104, + "grad_norm": 5.088469039450566, + "learning_rate": 4.859622985057313e-06, + "loss": 0.5638, + "step": 1650 + }, + { + "epoch": 0.134085925444652, + "grad_norm": 4.614369100716233, + "learning_rate": 4.859405640443793e-06, + "loss": 0.6239, + "step": 1651 + }, + { + "epoch": 0.13416714042069358, + "grad_norm": 6.877855199561925, + "learning_rate": 4.85918813257189e-06, + "loss": 0.5406, + "step": 1652 + }, + { + "epoch": 0.13424835539673516, + "grad_norm": 6.336440018776646, + "learning_rate": 4.858970461456655e-06, + "loss": 0.5815, + "step": 1653 + }, + { + "epoch": 0.13432957037277674, + "grad_norm": 6.858031875975209, + "learning_rate": 4.858752627113148e-06, + "loss": 0.6946, + "step": 1654 + }, + { + "epoch": 0.1344107853488183, + "grad_norm": 3.403378239632881, + "learning_rate": 4.8585346295564425e-06, + "loss": 0.7919, + "step": 1655 + }, + { + "epoch": 0.13449200032485992, + "grad_norm": 6.487088297683663, + "learning_rate": 4.858316468801624e-06, + "loss": 0.4289, + "step": 1656 + }, + { + "epoch": 0.1345732153009015, + "grad_norm": 9.756045801760866, + "learning_rate": 4.858098144863786e-06, + "loss": 0.5673, + "step": 1657 + }, + { + "epoch": 0.13465443027694307, + "grad_norm": 4.769994844405226, + "learning_rate": 4.857879657758037e-06, + "loss": 0.6929, + "step": 1658 + }, + { + "epoch": 0.13473564525298465, + "grad_norm": 4.567963041913676, + "learning_rate": 4.857661007499493e-06, + "loss": 0.6402, + "step": 1659 + }, + { + "epoch": 0.13481686022902623, + "grad_norm": 7.539959586550248, + "learning_rate": 4.857442194103287e-06, + "loss": 0.6195, + "step": 1660 + }, + { + "epoch": 0.1348980752050678, + "grad_norm": 3.320972169580362, + "learning_rate": 4.8572232175845574e-06, + "loss": 0.5644, + "step": 1661 + }, + { + "epoch": 0.1349792901811094, + "grad_norm": 6.466693615210596, + "learning_rate": 4.857004077958456e-06, + "loss": 0.6244, + "step": 1662 + }, + { + "epoch": 0.13506050515715098, + "grad_norm": 5.503111817650875, + "learning_rate": 4.8567847752401476e-06, + "loss": 0.5756, + "step": 1663 + }, + { + "epoch": 0.13514172013319256, + "grad_norm": 5.858104142956886, + "learning_rate": 4.8565653094448054e-06, + "loss": 0.6822, + "step": 1664 + }, + { + "epoch": 0.13522293510923414, + "grad_norm": 4.782981817213062, + "learning_rate": 4.856345680587616e-06, + "loss": 0.7815, + "step": 1665 + }, + { + "epoch": 0.13530415008527572, + "grad_norm": 15.4570240471019, + "learning_rate": 4.856125888683775e-06, + "loss": 0.4739, + "step": 1666 + }, + { + "epoch": 0.13538536506131732, + "grad_norm": 4.282142809688187, + "learning_rate": 4.855905933748492e-06, + "loss": 0.5324, + "step": 1667 + }, + { + "epoch": 0.1354665800373589, + "grad_norm": 5.095680068440331, + "learning_rate": 4.855685815796989e-06, + "loss": 0.5745, + "step": 1668 + }, + { + "epoch": 0.13554779501340047, + "grad_norm": 4.141977804675802, + "learning_rate": 4.855465534844494e-06, + "loss": 0.7417, + "step": 1669 + }, + { + "epoch": 0.13562900998944205, + "grad_norm": 7.6969307870140895, + "learning_rate": 4.8552450909062494e-06, + "loss": 0.5206, + "step": 1670 + }, + { + "epoch": 0.13571022496548363, + "grad_norm": 6.450650416903025, + "learning_rate": 4.855024483997509e-06, + "loss": 0.655, + "step": 1671 + }, + { + "epoch": 0.1357914399415252, + "grad_norm": 4.810253355009561, + "learning_rate": 4.85480371413354e-06, + "loss": 0.5323, + "step": 1672 + }, + { + "epoch": 0.1358726549175668, + "grad_norm": 3.3253261110937116, + "learning_rate": 4.8545827813296154e-06, + "loss": 0.5753, + "step": 1673 + }, + { + "epoch": 0.1359538698936084, + "grad_norm": 3.8466882509432496, + "learning_rate": 4.8543616856010235e-06, + "loss": 0.7926, + "step": 1674 + }, + { + "epoch": 0.13603508486964996, + "grad_norm": 4.8978637912258, + "learning_rate": 4.854140426963064e-06, + "loss": 0.5541, + "step": 1675 + }, + { + "epoch": 0.13611629984569154, + "grad_norm": 5.229352786538515, + "learning_rate": 4.853919005431046e-06, + "loss": 0.4727, + "step": 1676 + }, + { + "epoch": 0.13619751482173312, + "grad_norm": 2.9378106286602095, + "learning_rate": 4.85369742102029e-06, + "loss": 0.7667, + "step": 1677 + }, + { + "epoch": 0.13627872979777472, + "grad_norm": 9.698335327128627, + "learning_rate": 4.8534756737461305e-06, + "loss": 0.7053, + "step": 1678 + }, + { + "epoch": 0.1363599447738163, + "grad_norm": 4.258566186707566, + "learning_rate": 4.853253763623909e-06, + "loss": 0.7551, + "step": 1679 + }, + { + "epoch": 0.13644115974985788, + "grad_norm": 7.742141939352821, + "learning_rate": 4.853031690668982e-06, + "loss": 0.6012, + "step": 1680 + }, + { + "epoch": 0.13652237472589945, + "grad_norm": 7.324297389610232, + "learning_rate": 4.852809454896715e-06, + "loss": 0.8128, + "step": 1681 + }, + { + "epoch": 0.13660358970194103, + "grad_norm": 6.79191875565859, + "learning_rate": 4.852587056322485e-06, + "loss": 0.4649, + "step": 1682 + }, + { + "epoch": 0.1366848046779826, + "grad_norm": 4.346976001633204, + "learning_rate": 4.852364494961684e-06, + "loss": 0.5741, + "step": 1683 + }, + { + "epoch": 0.1367660196540242, + "grad_norm": 5.599252491473373, + "learning_rate": 4.852141770829707e-06, + "loss": 0.5591, + "step": 1684 + }, + { + "epoch": 0.1368472346300658, + "grad_norm": 7.657958990969802, + "learning_rate": 4.851918883941969e-06, + "loss": 0.5636, + "step": 1685 + }, + { + "epoch": 0.13692844960610737, + "grad_norm": 3.4450921267982126, + "learning_rate": 4.851695834313892e-06, + "loss": 0.7366, + "step": 1686 + }, + { + "epoch": 0.13700966458214894, + "grad_norm": 5.273690756756951, + "learning_rate": 4.851472621960909e-06, + "loss": 0.5808, + "step": 1687 + }, + { + "epoch": 0.13709087955819052, + "grad_norm": 4.952280541512243, + "learning_rate": 4.851249246898465e-06, + "loss": 0.6648, + "step": 1688 + }, + { + "epoch": 0.13717209453423213, + "grad_norm": 7.707004969920144, + "learning_rate": 4.851025709142018e-06, + "loss": 0.6268, + "step": 1689 + }, + { + "epoch": 0.1372533095102737, + "grad_norm": 4.430949277747921, + "learning_rate": 4.850802008707034e-06, + "loss": 0.4935, + "step": 1690 + }, + { + "epoch": 0.13733452448631528, + "grad_norm": 4.0862917668071175, + "learning_rate": 4.8505781456089926e-06, + "loss": 0.7303, + "step": 1691 + }, + { + "epoch": 0.13741573946235686, + "grad_norm": 8.15399495830509, + "learning_rate": 4.850354119863384e-06, + "loss": 0.573, + "step": 1692 + }, + { + "epoch": 0.13749695443839843, + "grad_norm": 4.4854550799183, + "learning_rate": 4.850129931485709e-06, + "loss": 0.6696, + "step": 1693 + }, + { + "epoch": 0.13757816941444, + "grad_norm": 6.062165358428213, + "learning_rate": 4.849905580491481e-06, + "loss": 0.506, + "step": 1694 + }, + { + "epoch": 0.13765938439048162, + "grad_norm": 4.30707595006544, + "learning_rate": 4.849681066896224e-06, + "loss": 0.5298, + "step": 1695 + }, + { + "epoch": 0.1377405993665232, + "grad_norm": 4.645280994473025, + "learning_rate": 4.849456390715471e-06, + "loss": 0.5178, + "step": 1696 + }, + { + "epoch": 0.13782181434256477, + "grad_norm": 4.408267231508574, + "learning_rate": 4.849231551964771e-06, + "loss": 0.5725, + "step": 1697 + }, + { + "epoch": 0.13790302931860635, + "grad_norm": 9.45812967426135, + "learning_rate": 4.849006550659681e-06, + "loss": 0.6328, + "step": 1698 + }, + { + "epoch": 0.13798424429464792, + "grad_norm": 4.491014201207106, + "learning_rate": 4.84878138681577e-06, + "loss": 0.5886, + "step": 1699 + }, + { + "epoch": 0.13806545927068953, + "grad_norm": 6.727338627008777, + "learning_rate": 4.848556060448617e-06, + "loss": 0.5413, + "step": 1700 + }, + { + "epoch": 0.1381466742467311, + "grad_norm": 6.965832541226805, + "learning_rate": 4.848330571573815e-06, + "loss": 0.582, + "step": 1701 + }, + { + "epoch": 0.13822788922277268, + "grad_norm": 5.263090461409852, + "learning_rate": 4.848104920206964e-06, + "loss": 0.5413, + "step": 1702 + }, + { + "epoch": 0.13830910419881426, + "grad_norm": 4.916463781199789, + "learning_rate": 4.847879106363681e-06, + "loss": 0.5164, + "step": 1703 + }, + { + "epoch": 0.13839031917485584, + "grad_norm": 4.020484866288476, + "learning_rate": 4.847653130059591e-06, + "loss": 0.5987, + "step": 1704 + }, + { + "epoch": 0.1384715341508974, + "grad_norm": 7.925299595269813, + "learning_rate": 4.847426991310327e-06, + "loss": 0.5019, + "step": 1705 + }, + { + "epoch": 0.13855274912693902, + "grad_norm": 4.765085041505615, + "learning_rate": 4.84720069013154e-06, + "loss": 0.6024, + "step": 1706 + }, + { + "epoch": 0.1386339641029806, + "grad_norm": 3.5389423477868704, + "learning_rate": 4.846974226538887e-06, + "loss": 0.5936, + "step": 1707 + }, + { + "epoch": 0.13871517907902217, + "grad_norm": 4.437272587336693, + "learning_rate": 4.846747600548039e-06, + "loss": 0.6592, + "step": 1708 + }, + { + "epoch": 0.13879639405506375, + "grad_norm": 6.923436463835136, + "learning_rate": 4.8465208121746775e-06, + "loss": 0.7827, + "step": 1709 + }, + { + "epoch": 0.13887760903110533, + "grad_norm": 4.075078776605313, + "learning_rate": 4.846293861434494e-06, + "loss": 0.6561, + "step": 1710 + }, + { + "epoch": 0.13895882400714693, + "grad_norm": 4.965543277677793, + "learning_rate": 4.846066748343193e-06, + "loss": 0.6394, + "step": 1711 + }, + { + "epoch": 0.1390400389831885, + "grad_norm": 4.952847589325617, + "learning_rate": 4.84583947291649e-06, + "loss": 0.5429, + "step": 1712 + }, + { + "epoch": 0.13912125395923008, + "grad_norm": 6.231762044145288, + "learning_rate": 4.84561203517011e-06, + "loss": 0.6042, + "step": 1713 + }, + { + "epoch": 0.13920246893527166, + "grad_norm": 7.90195822251666, + "learning_rate": 4.8453844351197906e-06, + "loss": 0.6262, + "step": 1714 + }, + { + "epoch": 0.13928368391131324, + "grad_norm": 5.163740585708867, + "learning_rate": 4.845156672781283e-06, + "loss": 0.4433, + "step": 1715 + }, + { + "epoch": 0.13936489888735482, + "grad_norm": 6.772106592052472, + "learning_rate": 4.844928748170343e-06, + "loss": 0.5625, + "step": 1716 + }, + { + "epoch": 0.13944611386339642, + "grad_norm": 7.967404236571118, + "learning_rate": 4.844700661302745e-06, + "loss": 0.5634, + "step": 1717 + }, + { + "epoch": 0.139527328839438, + "grad_norm": 4.952354575898061, + "learning_rate": 4.844472412194271e-06, + "loss": 0.4938, + "step": 1718 + }, + { + "epoch": 0.13960854381547957, + "grad_norm": 4.882182783614095, + "learning_rate": 4.844244000860713e-06, + "loss": 0.514, + "step": 1719 + }, + { + "epoch": 0.13968975879152115, + "grad_norm": 4.517018593698755, + "learning_rate": 4.844015427317878e-06, + "loss": 0.5567, + "step": 1720 + }, + { + "epoch": 0.13977097376756273, + "grad_norm": 5.353241935249897, + "learning_rate": 4.84378669158158e-06, + "loss": 0.6774, + "step": 1721 + }, + { + "epoch": 0.13985218874360433, + "grad_norm": 4.71686990816346, + "learning_rate": 4.843557793667647e-06, + "loss": 0.5591, + "step": 1722 + }, + { + "epoch": 0.1399334037196459, + "grad_norm": 4.136924741202888, + "learning_rate": 4.843328733591918e-06, + "loss": 0.711, + "step": 1723 + }, + { + "epoch": 0.1400146186956875, + "grad_norm": 5.450901778292648, + "learning_rate": 4.843099511370243e-06, + "loss": 0.6455, + "step": 1724 + }, + { + "epoch": 0.14009583367172906, + "grad_norm": 5.05521301047581, + "learning_rate": 4.842870127018482e-06, + "loss": 0.5929, + "step": 1725 + }, + { + "epoch": 0.14017704864777064, + "grad_norm": 5.278566981624996, + "learning_rate": 4.842640580552508e-06, + "loss": 0.5624, + "step": 1726 + }, + { + "epoch": 0.14025826362381222, + "grad_norm": 3.187653564030086, + "learning_rate": 4.842410871988204e-06, + "loss": 0.4295, + "step": 1727 + }, + { + "epoch": 0.14033947859985382, + "grad_norm": 4.96925386948935, + "learning_rate": 4.842181001341465e-06, + "loss": 0.6622, + "step": 1728 + }, + { + "epoch": 0.1404206935758954, + "grad_norm": 4.620002867331911, + "learning_rate": 4.8419509686281965e-06, + "loss": 0.6541, + "step": 1729 + }, + { + "epoch": 0.14050190855193698, + "grad_norm": 4.351427980680414, + "learning_rate": 4.841720773864315e-06, + "loss": 0.5794, + "step": 1730 + }, + { + "epoch": 0.14058312352797855, + "grad_norm": 6.055475530739616, + "learning_rate": 4.84149041706575e-06, + "loss": 0.6507, + "step": 1731 + }, + { + "epoch": 0.14066433850402013, + "grad_norm": 4.39137607633036, + "learning_rate": 4.8412598982484396e-06, + "loss": 0.5077, + "step": 1732 + }, + { + "epoch": 0.14074555348006174, + "grad_norm": 5.347025072258548, + "learning_rate": 4.8410292174283356e-06, + "loss": 0.5427, + "step": 1733 + }, + { + "epoch": 0.1408267684561033, + "grad_norm": 5.982794959612033, + "learning_rate": 4.840798374621399e-06, + "loss": 0.6222, + "step": 1734 + }, + { + "epoch": 0.1409079834321449, + "grad_norm": 4.786923948878069, + "learning_rate": 4.8405673698436046e-06, + "loss": 0.5887, + "step": 1735 + }, + { + "epoch": 0.14098919840818647, + "grad_norm": 4.34136443925787, + "learning_rate": 4.840336203110934e-06, + "loss": 0.8337, + "step": 1736 + }, + { + "epoch": 0.14107041338422804, + "grad_norm": 6.40416263003894, + "learning_rate": 4.840104874439385e-06, + "loss": 0.4646, + "step": 1737 + }, + { + "epoch": 0.14115162836026962, + "grad_norm": 4.993914992295412, + "learning_rate": 4.839873383844964e-06, + "loss": 0.6091, + "step": 1738 + }, + { + "epoch": 0.14123284333631123, + "grad_norm": 4.072501067349767, + "learning_rate": 4.839641731343688e-06, + "loss": 0.5803, + "step": 1739 + }, + { + "epoch": 0.1413140583123528, + "grad_norm": 4.916864007591204, + "learning_rate": 4.839409916951586e-06, + "loss": 0.6042, + "step": 1740 + }, + { + "epoch": 0.14139527328839438, + "grad_norm": 7.127573823763706, + "learning_rate": 4.839177940684699e-06, + "loss": 0.5263, + "step": 1741 + }, + { + "epoch": 0.14147648826443596, + "grad_norm": 4.634413084773449, + "learning_rate": 4.838945802559079e-06, + "loss": 0.6171, + "step": 1742 + }, + { + "epoch": 0.14155770324047753, + "grad_norm": 4.578421538253424, + "learning_rate": 4.8387135025907885e-06, + "loss": 0.5441, + "step": 1743 + }, + { + "epoch": 0.14163891821651914, + "grad_norm": 4.586524758850179, + "learning_rate": 4.8384810407959e-06, + "loss": 0.5399, + "step": 1744 + }, + { + "epoch": 0.14172013319256072, + "grad_norm": 5.918670545030279, + "learning_rate": 4.8382484171905006e-06, + "loss": 0.4686, + "step": 1745 + }, + { + "epoch": 0.1418013481686023, + "grad_norm": 5.704207192778562, + "learning_rate": 4.8380156317906855e-06, + "loss": 0.6141, + "step": 1746 + }, + { + "epoch": 0.14188256314464387, + "grad_norm": 4.921937978250558, + "learning_rate": 4.837782684612562e-06, + "loss": 0.5936, + "step": 1747 + }, + { + "epoch": 0.14196377812068545, + "grad_norm": 8.366604146884352, + "learning_rate": 4.83754957567225e-06, + "loss": 0.5511, + "step": 1748 + }, + { + "epoch": 0.14204499309672702, + "grad_norm": 3.91804902399355, + "learning_rate": 4.837316304985879e-06, + "loss": 0.6865, + "step": 1749 + }, + { + "epoch": 0.14212620807276863, + "grad_norm": 7.888393422464691, + "learning_rate": 4.8370828725695885e-06, + "loss": 0.5439, + "step": 1750 + }, + { + "epoch": 0.1422074230488102, + "grad_norm": 4.847280666549644, + "learning_rate": 4.836849278439532e-06, + "loss": 0.6774, + "step": 1751 + }, + { + "epoch": 0.14228863802485178, + "grad_norm": 5.563719824565103, + "learning_rate": 4.836615522611874e-06, + "loss": 0.5419, + "step": 1752 + }, + { + "epoch": 0.14236985300089336, + "grad_norm": 9.244694159995396, + "learning_rate": 4.8363816051027875e-06, + "loss": 0.5761, + "step": 1753 + }, + { + "epoch": 0.14245106797693494, + "grad_norm": 4.37956947048255, + "learning_rate": 4.8361475259284604e-06, + "loss": 0.6103, + "step": 1754 + }, + { + "epoch": 0.14253228295297654, + "grad_norm": 11.159158566456547, + "learning_rate": 4.8359132851050875e-06, + "loss": 0.4938, + "step": 1755 + }, + { + "epoch": 0.14261349792901812, + "grad_norm": 5.121940842643738, + "learning_rate": 4.835678882648878e-06, + "loss": 0.7047, + "step": 1756 + }, + { + "epoch": 0.1426947129050597, + "grad_norm": 4.634689837382904, + "learning_rate": 4.8354443185760505e-06, + "loss": 0.5441, + "step": 1757 + }, + { + "epoch": 0.14277592788110127, + "grad_norm": 4.696643113982221, + "learning_rate": 4.835209592902837e-06, + "loss": 0.6011, + "step": 1758 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 33.651614779388595, + "learning_rate": 4.834974705645478e-06, + "loss": 0.5516, + "step": 1759 + }, + { + "epoch": 0.14293835783318443, + "grad_norm": 7.739691900140675, + "learning_rate": 4.834739656820228e-06, + "loss": 0.6453, + "step": 1760 + }, + { + "epoch": 0.14301957280922603, + "grad_norm": 6.791535657044752, + "learning_rate": 4.83450444644335e-06, + "loss": 0.7097, + "step": 1761 + }, + { + "epoch": 0.1431007877852676, + "grad_norm": 5.503204544875674, + "learning_rate": 4.834269074531119e-06, + "loss": 0.5335, + "step": 1762 + }, + { + "epoch": 0.14318200276130919, + "grad_norm": 4.4158123615058935, + "learning_rate": 4.834033541099822e-06, + "loss": 0.5697, + "step": 1763 + }, + { + "epoch": 0.14326321773735076, + "grad_norm": 8.45132186133372, + "learning_rate": 4.833797846165758e-06, + "loss": 0.575, + "step": 1764 + }, + { + "epoch": 0.14334443271339234, + "grad_norm": 4.117636505890885, + "learning_rate": 4.833561989745232e-06, + "loss": 0.7003, + "step": 1765 + }, + { + "epoch": 0.14342564768943394, + "grad_norm": 5.425663068958977, + "learning_rate": 4.833325971854568e-06, + "loss": 0.5898, + "step": 1766 + }, + { + "epoch": 0.14350686266547552, + "grad_norm": 4.870653806817092, + "learning_rate": 4.8330897925100966e-06, + "loss": 0.6641, + "step": 1767 + }, + { + "epoch": 0.1435880776415171, + "grad_norm": 4.047961871711326, + "learning_rate": 4.8328534517281575e-06, + "loss": 0.6453, + "step": 1768 + }, + { + "epoch": 0.14366929261755867, + "grad_norm": 4.458252548214351, + "learning_rate": 4.832616949525107e-06, + "loss": 0.4695, + "step": 1769 + }, + { + "epoch": 0.14375050759360025, + "grad_norm": 7.708237827828306, + "learning_rate": 4.832380285917309e-06, + "loss": 0.5208, + "step": 1770 + }, + { + "epoch": 0.14383172256964183, + "grad_norm": 6.499621307199643, + "learning_rate": 4.8321434609211386e-06, + "loss": 0.6735, + "step": 1771 + }, + { + "epoch": 0.14391293754568343, + "grad_norm": 5.872483962693705, + "learning_rate": 4.831906474552983e-06, + "loss": 0.467, + "step": 1772 + }, + { + "epoch": 0.143994152521725, + "grad_norm": 3.669849200249159, + "learning_rate": 4.831669326829242e-06, + "loss": 0.6378, + "step": 1773 + }, + { + "epoch": 0.1440753674977666, + "grad_norm": 4.3559205214849674, + "learning_rate": 4.831432017766323e-06, + "loss": 0.652, + "step": 1774 + }, + { + "epoch": 0.14415658247380816, + "grad_norm": 4.444082580638693, + "learning_rate": 4.831194547380647e-06, + "loss": 0.6826, + "step": 1775 + }, + { + "epoch": 0.14423779744984974, + "grad_norm": 7.791669775176109, + "learning_rate": 4.830956915688647e-06, + "loss": 0.5328, + "step": 1776 + }, + { + "epoch": 0.14431901242589135, + "grad_norm": 3.828853400773109, + "learning_rate": 4.830719122706764e-06, + "loss": 0.7301, + "step": 1777 + }, + { + "epoch": 0.14440022740193292, + "grad_norm": 4.69526829505865, + "learning_rate": 4.830481168451453e-06, + "loss": 0.6871, + "step": 1778 + }, + { + "epoch": 0.1444814423779745, + "grad_norm": 3.7376839261484927, + "learning_rate": 4.830243052939179e-06, + "loss": 0.6172, + "step": 1779 + }, + { + "epoch": 0.14456265735401608, + "grad_norm": 5.5007109017935365, + "learning_rate": 4.830004776186419e-06, + "loss": 0.6784, + "step": 1780 + }, + { + "epoch": 0.14464387233005765, + "grad_norm": 9.156018680302578, + "learning_rate": 4.82976633820966e-06, + "loss": 0.5217, + "step": 1781 + }, + { + "epoch": 0.14472508730609923, + "grad_norm": 3.661157732313108, + "learning_rate": 4.829527739025399e-06, + "loss": 0.7003, + "step": 1782 + }, + { + "epoch": 0.14480630228214084, + "grad_norm": 6.275804371905447, + "learning_rate": 4.829288978650149e-06, + "loss": 0.5237, + "step": 1783 + }, + { + "epoch": 0.1448875172581824, + "grad_norm": 6.549033525463337, + "learning_rate": 4.829050057100428e-06, + "loss": 0.5854, + "step": 1784 + }, + { + "epoch": 0.144968732234224, + "grad_norm": 3.236921347753896, + "learning_rate": 4.82881097439277e-06, + "loss": 0.6303, + "step": 1785 + }, + { + "epoch": 0.14504994721026557, + "grad_norm": 5.188648638663512, + "learning_rate": 4.828571730543718e-06, + "loss": 0.5043, + "step": 1786 + }, + { + "epoch": 0.14513116218630714, + "grad_norm": 4.430201943407788, + "learning_rate": 4.828332325569825e-06, + "loss": 0.6505, + "step": 1787 + }, + { + "epoch": 0.14521237716234875, + "grad_norm": 4.8572855854705645, + "learning_rate": 4.828092759487658e-06, + "loss": 0.6374, + "step": 1788 + }, + { + "epoch": 0.14529359213839033, + "grad_norm": 18.826306527694214, + "learning_rate": 4.827853032313793e-06, + "loss": 0.4907, + "step": 1789 + }, + { + "epoch": 0.1453748071144319, + "grad_norm": 7.143933819169128, + "learning_rate": 4.827613144064819e-06, + "loss": 0.5695, + "step": 1790 + }, + { + "epoch": 0.14545602209047348, + "grad_norm": 5.6480765506427755, + "learning_rate": 4.827373094757334e-06, + "loss": 0.6081, + "step": 1791 + }, + { + "epoch": 0.14553723706651506, + "grad_norm": 3.6245079382385987, + "learning_rate": 4.827132884407948e-06, + "loss": 0.6509, + "step": 1792 + }, + { + "epoch": 0.14561845204255663, + "grad_norm": 4.493541124449762, + "learning_rate": 4.826892513033283e-06, + "loss": 0.6714, + "step": 1793 + }, + { + "epoch": 0.14569966701859824, + "grad_norm": 4.222014821722098, + "learning_rate": 4.8266519806499705e-06, + "loss": 0.6869, + "step": 1794 + }, + { + "epoch": 0.14578088199463982, + "grad_norm": 5.40579336748145, + "learning_rate": 4.826411287274655e-06, + "loss": 0.5754, + "step": 1795 + }, + { + "epoch": 0.1458620969706814, + "grad_norm": 7.803642190908184, + "learning_rate": 4.82617043292399e-06, + "loss": 0.5751, + "step": 1796 + }, + { + "epoch": 0.14594331194672297, + "grad_norm": 5.10891502640099, + "learning_rate": 4.825929417614643e-06, + "loss": 0.5584, + "step": 1797 + }, + { + "epoch": 0.14602452692276455, + "grad_norm": 5.342228163843412, + "learning_rate": 4.825688241363289e-06, + "loss": 0.6378, + "step": 1798 + }, + { + "epoch": 0.14610574189880615, + "grad_norm": 3.4939671214065506, + "learning_rate": 4.825446904186619e-06, + "loss": 0.7725, + "step": 1799 + }, + { + "epoch": 0.14618695687484773, + "grad_norm": 7.740693551879523, + "learning_rate": 4.825205406101328e-06, + "loss": 0.5815, + "step": 1800 + }, + { + "epoch": 0.1462681718508893, + "grad_norm": 3.8127593882048676, + "learning_rate": 4.824963747124132e-06, + "loss": 0.6857, + "step": 1801 + }, + { + "epoch": 0.14634938682693088, + "grad_norm": 7.341843917850864, + "learning_rate": 4.824721927271747e-06, + "loss": 0.5974, + "step": 1802 + }, + { + "epoch": 0.14643060180297246, + "grad_norm": 5.649042550435367, + "learning_rate": 4.8244799465609095e-06, + "loss": 0.6106, + "step": 1803 + }, + { + "epoch": 0.14651181677901404, + "grad_norm": 6.644722885021702, + "learning_rate": 4.82423780500836e-06, + "loss": 0.5848, + "step": 1804 + }, + { + "epoch": 0.14659303175505564, + "grad_norm": 3.4917280474695853, + "learning_rate": 4.823995502630857e-06, + "loss": 0.5819, + "step": 1805 + }, + { + "epoch": 0.14667424673109722, + "grad_norm": 4.938957664863549, + "learning_rate": 4.823753039445164e-06, + "loss": 0.6797, + "step": 1806 + }, + { + "epoch": 0.1467554617071388, + "grad_norm": 3.8798922539217955, + "learning_rate": 4.823510415468059e-06, + "loss": 0.6352, + "step": 1807 + }, + { + "epoch": 0.14683667668318037, + "grad_norm": 4.629237568082465, + "learning_rate": 4.82326763071633e-06, + "loss": 0.6874, + "step": 1808 + }, + { + "epoch": 0.14691789165922195, + "grad_norm": 4.720441802830114, + "learning_rate": 4.8230246852067784e-06, + "loss": 0.587, + "step": 1809 + }, + { + "epoch": 0.14699910663526355, + "grad_norm": 5.910430878822207, + "learning_rate": 4.822781578956212e-06, + "loss": 0.4856, + "step": 1810 + }, + { + "epoch": 0.14708032161130513, + "grad_norm": 4.138152132805585, + "learning_rate": 4.8225383119814526e-06, + "loss": 0.6528, + "step": 1811 + }, + { + "epoch": 0.1471615365873467, + "grad_norm": 4.151700339228477, + "learning_rate": 4.822294884299335e-06, + "loss": 0.552, + "step": 1812 + }, + { + "epoch": 0.14724275156338829, + "grad_norm": 4.1614483287452915, + "learning_rate": 4.822051295926701e-06, + "loss": 0.7129, + "step": 1813 + }, + { + "epoch": 0.14732396653942986, + "grad_norm": 6.707224091798943, + "learning_rate": 4.821807546880407e-06, + "loss": 0.6328, + "step": 1814 + }, + { + "epoch": 0.14740518151547144, + "grad_norm": 6.368579491491943, + "learning_rate": 4.8215636371773186e-06, + "loss": 0.7908, + "step": 1815 + }, + { + "epoch": 0.14748639649151304, + "grad_norm": 4.199767598115914, + "learning_rate": 4.821319566834314e-06, + "loss": 0.5206, + "step": 1816 + }, + { + "epoch": 0.14756761146755462, + "grad_norm": 4.709792455873804, + "learning_rate": 4.82107533586828e-06, + "loss": 0.6195, + "step": 1817 + }, + { + "epoch": 0.1476488264435962, + "grad_norm": 4.319113558733222, + "learning_rate": 4.820830944296117e-06, + "loss": 0.6602, + "step": 1818 + }, + { + "epoch": 0.14773004141963778, + "grad_norm": 5.544772853522585, + "learning_rate": 4.820586392134735e-06, + "loss": 0.5952, + "step": 1819 + }, + { + "epoch": 0.14781125639567935, + "grad_norm": 4.015521528170778, + "learning_rate": 4.820341679401057e-06, + "loss": 0.5967, + "step": 1820 + }, + { + "epoch": 0.14789247137172096, + "grad_norm": 3.9474000516118215, + "learning_rate": 4.820096806112015e-06, + "loss": 0.6292, + "step": 1821 + }, + { + "epoch": 0.14797368634776253, + "grad_norm": 4.8859410343320615, + "learning_rate": 4.8198517722845524e-06, + "loss": 0.4472, + "step": 1822 + }, + { + "epoch": 0.1480549013238041, + "grad_norm": 4.277013788450521, + "learning_rate": 4.819606577935626e-06, + "loss": 0.6099, + "step": 1823 + }, + { + "epoch": 0.1481361162998457, + "grad_norm": 4.8797029682772095, + "learning_rate": 4.8193612230822e-06, + "loss": 0.598, + "step": 1824 + }, + { + "epoch": 0.14821733127588727, + "grad_norm": 7.742169376350744, + "learning_rate": 4.819115707741252e-06, + "loss": 0.6474, + "step": 1825 + }, + { + "epoch": 0.14829854625192884, + "grad_norm": 5.12268577416012, + "learning_rate": 4.818870031929771e-06, + "loss": 0.4876, + "step": 1826 + }, + { + "epoch": 0.14837976122797045, + "grad_norm": 3.7709678599136605, + "learning_rate": 4.818624195664756e-06, + "loss": 0.727, + "step": 1827 + }, + { + "epoch": 0.14846097620401202, + "grad_norm": 2.5702762627716833, + "learning_rate": 4.818378198963218e-06, + "loss": 0.7224, + "step": 1828 + }, + { + "epoch": 0.1485421911800536, + "grad_norm": 3.097076580981341, + "learning_rate": 4.81813204184218e-06, + "loss": 0.6025, + "step": 1829 + }, + { + "epoch": 0.14862340615609518, + "grad_norm": 7.135529520333667, + "learning_rate": 4.817885724318671e-06, + "loss": 0.6685, + "step": 1830 + }, + { + "epoch": 0.14870462113213676, + "grad_norm": 3.4538569552830003, + "learning_rate": 4.817639246409738e-06, + "loss": 0.6588, + "step": 1831 + }, + { + "epoch": 0.14878583610817836, + "grad_norm": 4.3881359378371965, + "learning_rate": 4.817392608132435e-06, + "loss": 0.4665, + "step": 1832 + }, + { + "epoch": 0.14886705108421994, + "grad_norm": 3.7661484270198304, + "learning_rate": 4.817145809503828e-06, + "loss": 0.5136, + "step": 1833 + }, + { + "epoch": 0.1489482660602615, + "grad_norm": 5.494631448466639, + "learning_rate": 4.816898850540995e-06, + "loss": 0.5539, + "step": 1834 + }, + { + "epoch": 0.1490294810363031, + "grad_norm": 10.552148803818461, + "learning_rate": 4.816651731261023e-06, + "loss": 0.6492, + "step": 1835 + }, + { + "epoch": 0.14911069601234467, + "grad_norm": 4.963897986383459, + "learning_rate": 4.816404451681012e-06, + "loss": 0.6898, + "step": 1836 + }, + { + "epoch": 0.14919191098838624, + "grad_norm": 4.656896578348469, + "learning_rate": 4.816157011818073e-06, + "loss": 0.6373, + "step": 1837 + }, + { + "epoch": 0.14927312596442785, + "grad_norm": 6.1635228629484935, + "learning_rate": 4.815909411689326e-06, + "loss": 0.61, + "step": 1838 + }, + { + "epoch": 0.14935434094046943, + "grad_norm": 5.077711445021842, + "learning_rate": 4.815661651311905e-06, + "loss": 0.5255, + "step": 1839 + }, + { + "epoch": 0.149435555916511, + "grad_norm": 3.5697847643358926, + "learning_rate": 4.815413730702953e-06, + "loss": 0.5737, + "step": 1840 + }, + { + "epoch": 0.14951677089255258, + "grad_norm": 4.3625630887558025, + "learning_rate": 4.8151656498796245e-06, + "loss": 0.5621, + "step": 1841 + }, + { + "epoch": 0.14959798586859416, + "grad_norm": 3.632848050826748, + "learning_rate": 4.814917408859087e-06, + "loss": 0.6579, + "step": 1842 + }, + { + "epoch": 0.14967920084463576, + "grad_norm": 5.537325255170911, + "learning_rate": 4.8146690076585145e-06, + "loss": 0.6326, + "step": 1843 + }, + { + "epoch": 0.14976041582067734, + "grad_norm": 4.502954580803916, + "learning_rate": 4.8144204462950985e-06, + "loss": 0.5227, + "step": 1844 + }, + { + "epoch": 0.14984163079671892, + "grad_norm": 9.989837223985806, + "learning_rate": 4.8141717247860355e-06, + "loss": 0.5365, + "step": 1845 + }, + { + "epoch": 0.1499228457727605, + "grad_norm": 9.268156392807445, + "learning_rate": 4.813922843148537e-06, + "loss": 0.7256, + "step": 1846 + }, + { + "epoch": 0.15000406074880207, + "grad_norm": 4.723473199759222, + "learning_rate": 4.813673801399825e-06, + "loss": 0.6814, + "step": 1847 + }, + { + "epoch": 0.15008527572484365, + "grad_norm": 5.682840300388035, + "learning_rate": 4.81342459955713e-06, + "loss": 0.6083, + "step": 1848 + }, + { + "epoch": 0.15016649070088525, + "grad_norm": 3.7968600757264417, + "learning_rate": 4.813175237637697e-06, + "loss": 0.5635, + "step": 1849 + }, + { + "epoch": 0.15024770567692683, + "grad_norm": 4.69369222204755, + "learning_rate": 4.812925715658779e-06, + "loss": 0.5237, + "step": 1850 + }, + { + "epoch": 0.1503289206529684, + "grad_norm": 4.060309446961183, + "learning_rate": 4.812676033637643e-06, + "loss": 0.6771, + "step": 1851 + }, + { + "epoch": 0.15041013562900998, + "grad_norm": 4.068231340082502, + "learning_rate": 4.812426191591565e-06, + "loss": 0.4976, + "step": 1852 + }, + { + "epoch": 0.15049135060505156, + "grad_norm": 2.940489927194899, + "learning_rate": 4.812176189537833e-06, + "loss": 0.6372, + "step": 1853 + }, + { + "epoch": 0.15057256558109317, + "grad_norm": 18.577063761435014, + "learning_rate": 4.811926027493745e-06, + "loss": 0.5688, + "step": 1854 + }, + { + "epoch": 0.15065378055713474, + "grad_norm": 9.15886334831888, + "learning_rate": 4.811675705476613e-06, + "loss": 0.5274, + "step": 1855 + }, + { + "epoch": 0.15073499553317632, + "grad_norm": 8.116139489468573, + "learning_rate": 4.811425223503755e-06, + "loss": 0.6006, + "step": 1856 + }, + { + "epoch": 0.1508162105092179, + "grad_norm": 5.340837408995612, + "learning_rate": 4.811174581592506e-06, + "loss": 0.4627, + "step": 1857 + }, + { + "epoch": 0.15089742548525947, + "grad_norm": 7.861074956324514, + "learning_rate": 4.810923779760207e-06, + "loss": 0.6517, + "step": 1858 + }, + { + "epoch": 0.15097864046130105, + "grad_norm": 5.046613794107396, + "learning_rate": 4.810672818024212e-06, + "loss": 0.8007, + "step": 1859 + }, + { + "epoch": 0.15105985543734265, + "grad_norm": 3.78641177578102, + "learning_rate": 4.810421696401889e-06, + "loss": 0.4462, + "step": 1860 + }, + { + "epoch": 0.15114107041338423, + "grad_norm": 5.953501964167005, + "learning_rate": 4.810170414910611e-06, + "loss": 0.6483, + "step": 1861 + }, + { + "epoch": 0.1512222853894258, + "grad_norm": 4.508454868931279, + "learning_rate": 4.809918973567767e-06, + "loss": 0.4402, + "step": 1862 + }, + { + "epoch": 0.15130350036546739, + "grad_norm": 6.1091026815566645, + "learning_rate": 4.809667372390755e-06, + "loss": 0.5401, + "step": 1863 + }, + { + "epoch": 0.15138471534150896, + "grad_norm": 4.269870818997486, + "learning_rate": 4.809415611396984e-06, + "loss": 0.6984, + "step": 1864 + }, + { + "epoch": 0.15146593031755057, + "grad_norm": 8.488877241499456, + "learning_rate": 4.809163690603877e-06, + "loss": 0.6111, + "step": 1865 + }, + { + "epoch": 0.15154714529359214, + "grad_norm": 5.571115307591249, + "learning_rate": 4.808911610028861e-06, + "loss": 0.5755, + "step": 1866 + }, + { + "epoch": 0.15162836026963372, + "grad_norm": 5.592522309071258, + "learning_rate": 4.808659369689384e-06, + "loss": 0.7285, + "step": 1867 + }, + { + "epoch": 0.1517095752456753, + "grad_norm": 3.3631906733071535, + "learning_rate": 4.808406969602895e-06, + "loss": 0.6066, + "step": 1868 + }, + { + "epoch": 0.15179079022171688, + "grad_norm": 5.558011990131933, + "learning_rate": 4.8081544097868615e-06, + "loss": 0.599, + "step": 1869 + }, + { + "epoch": 0.15187200519775845, + "grad_norm": 4.196854563685466, + "learning_rate": 4.8079016902587586e-06, + "loss": 0.6429, + "step": 1870 + }, + { + "epoch": 0.15195322017380006, + "grad_norm": 5.663489456766142, + "learning_rate": 4.807648811036073e-06, + "loss": 0.4956, + "step": 1871 + }, + { + "epoch": 0.15203443514984163, + "grad_norm": 4.348733009474466, + "learning_rate": 4.807395772136303e-06, + "loss": 0.5331, + "step": 1872 + }, + { + "epoch": 0.1521156501258832, + "grad_norm": 5.237186173256271, + "learning_rate": 4.807142573576958e-06, + "loss": 0.7043, + "step": 1873 + }, + { + "epoch": 0.1521968651019248, + "grad_norm": 5.71550907696815, + "learning_rate": 4.806889215375556e-06, + "loss": 0.491, + "step": 1874 + }, + { + "epoch": 0.15227808007796637, + "grad_norm": 5.112913674384685, + "learning_rate": 4.80663569754963e-06, + "loss": 0.5455, + "step": 1875 + }, + { + "epoch": 0.15235929505400797, + "grad_norm": 4.422551589450753, + "learning_rate": 4.806382020116721e-06, + "loss": 0.6936, + "step": 1876 + }, + { + "epoch": 0.15244051003004955, + "grad_norm": 4.418250640226396, + "learning_rate": 4.806128183094383e-06, + "loss": 0.6371, + "step": 1877 + }, + { + "epoch": 0.15252172500609112, + "grad_norm": 4.611528104096768, + "learning_rate": 4.805874186500179e-06, + "loss": 0.5783, + "step": 1878 + }, + { + "epoch": 0.1526029399821327, + "grad_norm": 4.3459558003142345, + "learning_rate": 4.805620030351686e-06, + "loss": 0.6227, + "step": 1879 + }, + { + "epoch": 0.15268415495817428, + "grad_norm": 6.170103993151758, + "learning_rate": 4.805365714666489e-06, + "loss": 0.5419, + "step": 1880 + }, + { + "epoch": 0.15276536993421586, + "grad_norm": 2.958664035946074, + "learning_rate": 4.805111239462185e-06, + "loss": 0.7513, + "step": 1881 + }, + { + "epoch": 0.15284658491025746, + "grad_norm": 13.416473280689857, + "learning_rate": 4.8048566047563835e-06, + "loss": 0.4584, + "step": 1882 + }, + { + "epoch": 0.15292779988629904, + "grad_norm": 4.99824146528109, + "learning_rate": 4.8046018105667024e-06, + "loss": 0.6614, + "step": 1883 + }, + { + "epoch": 0.15300901486234061, + "grad_norm": 9.102398270856987, + "learning_rate": 4.8043468569107735e-06, + "loss": 0.7377, + "step": 1884 + }, + { + "epoch": 0.1530902298383822, + "grad_norm": 3.746424579208254, + "learning_rate": 4.804091743806237e-06, + "loss": 0.4525, + "step": 1885 + }, + { + "epoch": 0.15317144481442377, + "grad_norm": 6.368130414333825, + "learning_rate": 4.803836471270748e-06, + "loss": 0.5629, + "step": 1886 + }, + { + "epoch": 0.15325265979046537, + "grad_norm": 6.446103551439812, + "learning_rate": 4.803581039321966e-06, + "loss": 0.5416, + "step": 1887 + }, + { + "epoch": 0.15333387476650695, + "grad_norm": 5.229434076566928, + "learning_rate": 4.803325447977568e-06, + "loss": 0.5842, + "step": 1888 + }, + { + "epoch": 0.15341508974254853, + "grad_norm": 5.954690104051001, + "learning_rate": 4.80306969725524e-06, + "loss": 0.5898, + "step": 1889 + }, + { + "epoch": 0.1534963047185901, + "grad_norm": 6.150359963901614, + "learning_rate": 4.802813787172678e-06, + "loss": 0.5839, + "step": 1890 + }, + { + "epoch": 0.15357751969463168, + "grad_norm": 5.125992935197739, + "learning_rate": 4.802557717747588e-06, + "loss": 0.5668, + "step": 1891 + }, + { + "epoch": 0.15365873467067329, + "grad_norm": 4.364632434646274, + "learning_rate": 4.802301488997691e-06, + "loss": 0.4172, + "step": 1892 + }, + { + "epoch": 0.15373994964671486, + "grad_norm": 4.882188374905198, + "learning_rate": 4.802045100940715e-06, + "loss": 0.6242, + "step": 1893 + }, + { + "epoch": 0.15382116462275644, + "grad_norm": 3.3401070060949403, + "learning_rate": 4.801788553594403e-06, + "loss": 0.5872, + "step": 1894 + }, + { + "epoch": 0.15390237959879802, + "grad_norm": 4.868516648657934, + "learning_rate": 4.801531846976504e-06, + "loss": 0.5688, + "step": 1895 + }, + { + "epoch": 0.1539835945748396, + "grad_norm": 5.105870944801884, + "learning_rate": 4.801274981104781e-06, + "loss": 0.6434, + "step": 1896 + }, + { + "epoch": 0.15406480955088117, + "grad_norm": 3.8171439187123903, + "learning_rate": 4.80101795599701e-06, + "loss": 0.5289, + "step": 1897 + }, + { + "epoch": 0.15414602452692278, + "grad_norm": 16.280448607344095, + "learning_rate": 4.800760771670974e-06, + "loss": 0.4721, + "step": 1898 + }, + { + "epoch": 0.15422723950296435, + "grad_norm": 3.7231117337427055, + "learning_rate": 4.800503428144469e-06, + "loss": 0.6052, + "step": 1899 + }, + { + "epoch": 0.15430845447900593, + "grad_norm": 3.648455745421052, + "learning_rate": 4.800245925435302e-06, + "loss": 0.6106, + "step": 1900 + }, + { + "epoch": 0.1543896694550475, + "grad_norm": 6.0421687476954995, + "learning_rate": 4.7999882635612916e-06, + "loss": 0.5272, + "step": 1901 + }, + { + "epoch": 0.15447088443108908, + "grad_norm": 4.462318149762371, + "learning_rate": 4.799730442540265e-06, + "loss": 0.4801, + "step": 1902 + }, + { + "epoch": 0.1545520994071307, + "grad_norm": 5.097311868178371, + "learning_rate": 4.7994724623900636e-06, + "loss": 0.6126, + "step": 1903 + }, + { + "epoch": 0.15463331438317227, + "grad_norm": 10.491382918494583, + "learning_rate": 4.799214323128537e-06, + "loss": 0.7263, + "step": 1904 + }, + { + "epoch": 0.15471452935921384, + "grad_norm": 5.632700323452676, + "learning_rate": 4.798956024773548e-06, + "loss": 0.427, + "step": 1905 + }, + { + "epoch": 0.15479574433525542, + "grad_norm": 4.125424526878242, + "learning_rate": 4.798697567342969e-06, + "loss": 0.8174, + "step": 1906 + }, + { + "epoch": 0.154876959311297, + "grad_norm": 3.2529153125095682, + "learning_rate": 4.798438950854685e-06, + "loss": 0.5607, + "step": 1907 + }, + { + "epoch": 0.15495817428733857, + "grad_norm": 6.639548071036903, + "learning_rate": 4.798180175326589e-06, + "loss": 0.6207, + "step": 1908 + }, + { + "epoch": 0.15503938926338018, + "grad_norm": 3.419026777929521, + "learning_rate": 4.797921240776587e-06, + "loss": 0.6548, + "step": 1909 + }, + { + "epoch": 0.15512060423942176, + "grad_norm": 6.3759242855440315, + "learning_rate": 4.797662147222598e-06, + "loss": 0.722, + "step": 1910 + }, + { + "epoch": 0.15520181921546333, + "grad_norm": 4.2074330009622525, + "learning_rate": 4.797402894682548e-06, + "loss": 0.7711, + "step": 1911 + }, + { + "epoch": 0.1552830341915049, + "grad_norm": 5.86918562130916, + "learning_rate": 4.797143483174377e-06, + "loss": 0.5241, + "step": 1912 + }, + { + "epoch": 0.1553642491675465, + "grad_norm": 4.110856189622478, + "learning_rate": 4.796883912716034e-06, + "loss": 0.6855, + "step": 1913 + }, + { + "epoch": 0.1554454641435881, + "grad_norm": 4.190357484842863, + "learning_rate": 4.79662418332548e-06, + "loss": 0.616, + "step": 1914 + }, + { + "epoch": 0.15552667911962967, + "grad_norm": 6.818021899205355, + "learning_rate": 4.796364295020688e-06, + "loss": 0.554, + "step": 1915 + }, + { + "epoch": 0.15560789409567125, + "grad_norm": 4.400441609364029, + "learning_rate": 4.7961042478196394e-06, + "loss": 0.5425, + "step": 1916 + }, + { + "epoch": 0.15568910907171282, + "grad_norm": 3.755867976669089, + "learning_rate": 4.7958440417403295e-06, + "loss": 0.5295, + "step": 1917 + }, + { + "epoch": 0.1557703240477544, + "grad_norm": 3.799331044287829, + "learning_rate": 4.795583676800762e-06, + "loss": 0.4355, + "step": 1918 + }, + { + "epoch": 0.15585153902379598, + "grad_norm": 5.47739825987387, + "learning_rate": 4.795323153018953e-06, + "loss": 0.6229, + "step": 1919 + }, + { + "epoch": 0.15593275399983758, + "grad_norm": 4.656969062795371, + "learning_rate": 4.795062470412931e-06, + "loss": 0.6301, + "step": 1920 + }, + { + "epoch": 0.15601396897587916, + "grad_norm": 4.914269261314513, + "learning_rate": 4.794801629000732e-06, + "loss": 0.6974, + "step": 1921 + }, + { + "epoch": 0.15609518395192074, + "grad_norm": 6.004759949716761, + "learning_rate": 4.794540628800405e-06, + "loss": 0.6272, + "step": 1922 + }, + { + "epoch": 0.1561763989279623, + "grad_norm": 5.879866804838136, + "learning_rate": 4.79427946983001e-06, + "loss": 0.624, + "step": 1923 + }, + { + "epoch": 0.1562576139040039, + "grad_norm": 4.9010570268519835, + "learning_rate": 4.794018152107618e-06, + "loss": 0.5972, + "step": 1924 + }, + { + "epoch": 0.1563388288800455, + "grad_norm": 6.1005563812014625, + "learning_rate": 4.793756675651311e-06, + "loss": 0.5372, + "step": 1925 + }, + { + "epoch": 0.15642004385608707, + "grad_norm": 7.657069975595423, + "learning_rate": 4.7934950404791815e-06, + "loss": 0.5059, + "step": 1926 + }, + { + "epoch": 0.15650125883212865, + "grad_norm": 3.7164386302842325, + "learning_rate": 4.793233246609333e-06, + "loss": 0.5145, + "step": 1927 + }, + { + "epoch": 0.15658247380817022, + "grad_norm": 6.7098122658626025, + "learning_rate": 4.792971294059882e-06, + "loss": 0.5909, + "step": 1928 + }, + { + "epoch": 0.1566636887842118, + "grad_norm": 4.5647090456408765, + "learning_rate": 4.792709182848951e-06, + "loss": 0.5851, + "step": 1929 + }, + { + "epoch": 0.15674490376025338, + "grad_norm": 5.659279097889884, + "learning_rate": 4.792446912994679e-06, + "loss": 0.509, + "step": 1930 + }, + { + "epoch": 0.15682611873629498, + "grad_norm": 4.660823647963301, + "learning_rate": 4.792184484515214e-06, + "loss": 0.6214, + "step": 1931 + }, + { + "epoch": 0.15690733371233656, + "grad_norm": 5.379888896939118, + "learning_rate": 4.791921897428714e-06, + "loss": 0.6083, + "step": 1932 + }, + { + "epoch": 0.15698854868837814, + "grad_norm": 3.7091543807048555, + "learning_rate": 4.791659151753348e-06, + "loss": 0.5366, + "step": 1933 + }, + { + "epoch": 0.15706976366441971, + "grad_norm": 4.46536923401703, + "learning_rate": 4.791396247507297e-06, + "loss": 0.5548, + "step": 1934 + }, + { + "epoch": 0.1571509786404613, + "grad_norm": 3.0478693770891, + "learning_rate": 4.791133184708753e-06, + "loss": 0.6562, + "step": 1935 + }, + { + "epoch": 0.1572321936165029, + "grad_norm": 5.052248542138617, + "learning_rate": 4.790869963375918e-06, + "loss": 0.6266, + "step": 1936 + }, + { + "epoch": 0.15731340859254447, + "grad_norm": 6.299780666458338, + "learning_rate": 4.790606583527006e-06, + "loss": 0.6602, + "step": 1937 + }, + { + "epoch": 0.15739462356858605, + "grad_norm": 7.111506262099773, + "learning_rate": 4.790343045180242e-06, + "loss": 0.6822, + "step": 1938 + }, + { + "epoch": 0.15747583854462763, + "grad_norm": 3.5717735290621015, + "learning_rate": 4.790079348353859e-06, + "loss": 0.5579, + "step": 1939 + }, + { + "epoch": 0.1575570535206692, + "grad_norm": 4.539252075565922, + "learning_rate": 4.789815493066106e-06, + "loss": 0.5513, + "step": 1940 + }, + { + "epoch": 0.15763826849671078, + "grad_norm": 3.19405646131031, + "learning_rate": 4.78955147933524e-06, + "loss": 0.6305, + "step": 1941 + }, + { + "epoch": 0.1577194834727524, + "grad_norm": 3.9600914109632805, + "learning_rate": 4.7892873071795285e-06, + "loss": 0.5575, + "step": 1942 + }, + { + "epoch": 0.15780069844879396, + "grad_norm": 5.033361013107082, + "learning_rate": 4.789022976617251e-06, + "loss": 0.6199, + "step": 1943 + }, + { + "epoch": 0.15788191342483554, + "grad_norm": 15.306333789764619, + "learning_rate": 4.7887584876666984e-06, + "loss": 0.458, + "step": 1944 + }, + { + "epoch": 0.15796312840087712, + "grad_norm": 4.337274209547185, + "learning_rate": 4.788493840346172e-06, + "loss": 0.691, + "step": 1945 + }, + { + "epoch": 0.1580443433769187, + "grad_norm": 4.2820007012074415, + "learning_rate": 4.788229034673983e-06, + "loss": 0.7317, + "step": 1946 + }, + { + "epoch": 0.1581255583529603, + "grad_norm": 4.361412328827347, + "learning_rate": 4.787964070668455e-06, + "loss": 0.5208, + "step": 1947 + }, + { + "epoch": 0.15820677332900188, + "grad_norm": 4.543017266735986, + "learning_rate": 4.787698948347922e-06, + "loss": 0.7488, + "step": 1948 + }, + { + "epoch": 0.15828798830504345, + "grad_norm": 6.9360624754566444, + "learning_rate": 4.78743366773073e-06, + "loss": 0.478, + "step": 1949 + }, + { + "epoch": 0.15836920328108503, + "grad_norm": 4.321466474176901, + "learning_rate": 4.787168228835234e-06, + "loss": 0.8373, + "step": 1950 + }, + { + "epoch": 0.1584504182571266, + "grad_norm": 10.29187836219349, + "learning_rate": 4.7869026316798005e-06, + "loss": 0.5927, + "step": 1951 + }, + { + "epoch": 0.15853163323316818, + "grad_norm": 5.206315709924438, + "learning_rate": 4.7866368762828095e-06, + "loss": 0.7284, + "step": 1952 + }, + { + "epoch": 0.1586128482092098, + "grad_norm": 4.695971985775556, + "learning_rate": 4.786370962662647e-06, + "loss": 0.6129, + "step": 1953 + }, + { + "epoch": 0.15869406318525137, + "grad_norm": 3.810668859896304, + "learning_rate": 4.786104890837715e-06, + "loss": 0.7817, + "step": 1954 + }, + { + "epoch": 0.15877527816129294, + "grad_norm": 5.945278033090391, + "learning_rate": 4.785838660826424e-06, + "loss": 0.4929, + "step": 1955 + }, + { + "epoch": 0.15885649313733452, + "grad_norm": 5.613790832997751, + "learning_rate": 4.785572272647196e-06, + "loss": 0.5365, + "step": 1956 + }, + { + "epoch": 0.1589377081133761, + "grad_norm": 12.194006996853599, + "learning_rate": 4.785305726318461e-06, + "loss": 0.5274, + "step": 1957 + }, + { + "epoch": 0.1590189230894177, + "grad_norm": 6.67246997329038, + "learning_rate": 4.785039021858665e-06, + "loss": 0.5129, + "step": 1958 + }, + { + "epoch": 0.15910013806545928, + "grad_norm": 4.361772829570909, + "learning_rate": 4.784772159286263e-06, + "loss": 0.5762, + "step": 1959 + }, + { + "epoch": 0.15918135304150086, + "grad_norm": 3.6171339798903115, + "learning_rate": 4.784505138619719e-06, + "loss": 0.5687, + "step": 1960 + }, + { + "epoch": 0.15926256801754243, + "grad_norm": 3.3498401020879696, + "learning_rate": 4.78423795987751e-06, + "loss": 0.6731, + "step": 1961 + }, + { + "epoch": 0.159343782993584, + "grad_norm": 4.218304560758474, + "learning_rate": 4.783970623078124e-06, + "loss": 0.5832, + "step": 1962 + }, + { + "epoch": 0.1594249979696256, + "grad_norm": 3.2954022912979473, + "learning_rate": 4.783703128240058e-06, + "loss": 0.626, + "step": 1963 + }, + { + "epoch": 0.1595062129456672, + "grad_norm": 5.637076756015832, + "learning_rate": 4.783435475381822e-06, + "loss": 0.549, + "step": 1964 + }, + { + "epoch": 0.15958742792170877, + "grad_norm": 4.690580073274557, + "learning_rate": 4.7831676645219364e-06, + "loss": 0.6577, + "step": 1965 + }, + { + "epoch": 0.15966864289775035, + "grad_norm": 5.3749569715672125, + "learning_rate": 4.782899695678931e-06, + "loss": 0.5685, + "step": 1966 + }, + { + "epoch": 0.15974985787379192, + "grad_norm": 6.809233492187388, + "learning_rate": 4.782631568871349e-06, + "loss": 0.6891, + "step": 1967 + }, + { + "epoch": 0.1598310728498335, + "grad_norm": 4.530421811396376, + "learning_rate": 4.782363284117744e-06, + "loss": 0.3833, + "step": 1968 + }, + { + "epoch": 0.1599122878258751, + "grad_norm": 4.835132250782642, + "learning_rate": 4.782094841436677e-06, + "loss": 0.4926, + "step": 1969 + }, + { + "epoch": 0.15999350280191668, + "grad_norm": 3.8849675683937184, + "learning_rate": 4.781826240846726e-06, + "loss": 0.7052, + "step": 1970 + }, + { + "epoch": 0.16007471777795826, + "grad_norm": 4.260063677720038, + "learning_rate": 4.781557482366477e-06, + "loss": 0.6791, + "step": 1971 + }, + { + "epoch": 0.16015593275399984, + "grad_norm": 9.298148375842267, + "learning_rate": 4.781288566014524e-06, + "loss": 0.6356, + "step": 1972 + }, + { + "epoch": 0.1602371477300414, + "grad_norm": 4.484161226398304, + "learning_rate": 4.781019491809475e-06, + "loss": 0.5682, + "step": 1973 + }, + { + "epoch": 0.160318362706083, + "grad_norm": 3.7283315441063984, + "learning_rate": 4.78075025976995e-06, + "loss": 0.6072, + "step": 1974 + }, + { + "epoch": 0.1603995776821246, + "grad_norm": 6.059353678550269, + "learning_rate": 4.780480869914578e-06, + "loss": 0.5845, + "step": 1975 + }, + { + "epoch": 0.16048079265816617, + "grad_norm": 4.554935916876435, + "learning_rate": 4.780211322261998e-06, + "loss": 0.5095, + "step": 1976 + }, + { + "epoch": 0.16056200763420775, + "grad_norm": 6.981060106812001, + "learning_rate": 4.779941616830863e-06, + "loss": 0.5098, + "step": 1977 + }, + { + "epoch": 0.16064322261024933, + "grad_norm": 4.9440805505487, + "learning_rate": 4.779671753639835e-06, + "loss": 0.7675, + "step": 1978 + }, + { + "epoch": 0.1607244375862909, + "grad_norm": 8.280400518698295, + "learning_rate": 4.779401732707586e-06, + "loss": 0.5639, + "step": 1979 + }, + { + "epoch": 0.1608056525623325, + "grad_norm": 6.056327257248986, + "learning_rate": 4.779131554052801e-06, + "loss": 0.5981, + "step": 1980 + }, + { + "epoch": 0.16088686753837408, + "grad_norm": 5.385241894979969, + "learning_rate": 4.778861217694174e-06, + "loss": 0.6512, + "step": 1981 + }, + { + "epoch": 0.16096808251441566, + "grad_norm": 4.550144423102369, + "learning_rate": 4.778590723650413e-06, + "loss": 0.5836, + "step": 1982 + }, + { + "epoch": 0.16104929749045724, + "grad_norm": 4.3719124593201055, + "learning_rate": 4.778320071940231e-06, + "loss": 0.7309, + "step": 1983 + }, + { + "epoch": 0.16113051246649882, + "grad_norm": 4.283507192707898, + "learning_rate": 4.77804926258236e-06, + "loss": 0.5897, + "step": 1984 + }, + { + "epoch": 0.1612117274425404, + "grad_norm": 3.9610170071985524, + "learning_rate": 4.777778295595535e-06, + "loss": 0.6077, + "step": 1985 + }, + { + "epoch": 0.161292942418582, + "grad_norm": 6.219189683636367, + "learning_rate": 4.777507170998508e-06, + "loss": 0.7406, + "step": 1986 + }, + { + "epoch": 0.16137415739462357, + "grad_norm": 4.465198885140077, + "learning_rate": 4.777235888810037e-06, + "loss": 0.5908, + "step": 1987 + }, + { + "epoch": 0.16145537237066515, + "grad_norm": 6.450650746265666, + "learning_rate": 4.776964449048895e-06, + "loss": 0.5436, + "step": 1988 + }, + { + "epoch": 0.16153658734670673, + "grad_norm": 9.863744897399046, + "learning_rate": 4.776692851733864e-06, + "loss": 0.4796, + "step": 1989 + }, + { + "epoch": 0.1616178023227483, + "grad_norm": 3.703087572235551, + "learning_rate": 4.776421096883737e-06, + "loss": 0.737, + "step": 1990 + }, + { + "epoch": 0.1616990172987899, + "grad_norm": 5.885859853270686, + "learning_rate": 4.776149184517318e-06, + "loss": 0.7037, + "step": 1991 + }, + { + "epoch": 0.1617802322748315, + "grad_norm": 4.7082939980756136, + "learning_rate": 4.775877114653422e-06, + "loss": 0.5807, + "step": 1992 + }, + { + "epoch": 0.16186144725087306, + "grad_norm": 4.719665547064216, + "learning_rate": 4.775604887310874e-06, + "loss": 0.5684, + "step": 1993 + }, + { + "epoch": 0.16194266222691464, + "grad_norm": 4.372013687616748, + "learning_rate": 4.775332502508511e-06, + "loss": 0.5176, + "step": 1994 + }, + { + "epoch": 0.16202387720295622, + "grad_norm": 3.574773949292872, + "learning_rate": 4.775059960265181e-06, + "loss": 0.6306, + "step": 1995 + }, + { + "epoch": 0.1621050921789978, + "grad_norm": 3.0419771807588134, + "learning_rate": 4.774787260599744e-06, + "loss": 0.5673, + "step": 1996 + }, + { + "epoch": 0.1621863071550394, + "grad_norm": 4.11991510271793, + "learning_rate": 4.7745144035310656e-06, + "loss": 0.5221, + "step": 1997 + }, + { + "epoch": 0.16226752213108098, + "grad_norm": 8.878399802384537, + "learning_rate": 4.77424138907803e-06, + "loss": 0.5639, + "step": 1998 + }, + { + "epoch": 0.16234873710712255, + "grad_norm": 5.661428317678857, + "learning_rate": 4.773968217259525e-06, + "loss": 0.6172, + "step": 1999 + }, + { + "epoch": 0.16242995208316413, + "grad_norm": 4.587434812251549, + "learning_rate": 4.773694888094454e-06, + "loss": 0.5641, + "step": 2000 + }, + { + "epoch": 0.1625111670592057, + "grad_norm": 6.782537400712025, + "learning_rate": 4.773421401601731e-06, + "loss": 0.6165, + "step": 2001 + }, + { + "epoch": 0.1625923820352473, + "grad_norm": 5.248837296847351, + "learning_rate": 4.773147757800279e-06, + "loss": 0.5165, + "step": 2002 + }, + { + "epoch": 0.1626735970112889, + "grad_norm": 6.771172761964919, + "learning_rate": 4.772873956709032e-06, + "loss": 0.5348, + "step": 2003 + }, + { + "epoch": 0.16275481198733047, + "grad_norm": 3.7091479668688407, + "learning_rate": 4.772599998346937e-06, + "loss": 0.7519, + "step": 2004 + }, + { + "epoch": 0.16283602696337204, + "grad_norm": 4.478594715066707, + "learning_rate": 4.772325882732949e-06, + "loss": 0.5111, + "step": 2005 + }, + { + "epoch": 0.16291724193941362, + "grad_norm": 5.771923023591905, + "learning_rate": 4.772051609886036e-06, + "loss": 0.4556, + "step": 2006 + }, + { + "epoch": 0.1629984569154552, + "grad_norm": 4.808968764339891, + "learning_rate": 4.771777179825176e-06, + "loss": 0.5635, + "step": 2007 + }, + { + "epoch": 0.1630796718914968, + "grad_norm": 6.317225738730894, + "learning_rate": 4.7715025925693595e-06, + "loss": 0.8752, + "step": 2008 + }, + { + "epoch": 0.16316088686753838, + "grad_norm": 4.920049883294183, + "learning_rate": 4.771227848137585e-06, + "loss": 0.6393, + "step": 2009 + }, + { + "epoch": 0.16324210184357996, + "grad_norm": 4.256612076091997, + "learning_rate": 4.770952946548864e-06, + "loss": 0.6003, + "step": 2010 + }, + { + "epoch": 0.16332331681962153, + "grad_norm": 4.5734192613631315, + "learning_rate": 4.770677887822217e-06, + "loss": 0.4413, + "step": 2011 + }, + { + "epoch": 0.1634045317956631, + "grad_norm": 4.163070055535816, + "learning_rate": 4.770402671976677e-06, + "loss": 0.6597, + "step": 2012 + }, + { + "epoch": 0.16348574677170472, + "grad_norm": 3.0742054711235887, + "learning_rate": 4.77012729903129e-06, + "loss": 0.4076, + "step": 2013 + }, + { + "epoch": 0.1635669617477463, + "grad_norm": 5.55441667151701, + "learning_rate": 4.769851769005107e-06, + "loss": 0.5314, + "step": 2014 + }, + { + "epoch": 0.16364817672378787, + "grad_norm": 4.576121873555356, + "learning_rate": 4.769576081917195e-06, + "loss": 0.6124, + "step": 2015 + }, + { + "epoch": 0.16372939169982945, + "grad_norm": 6.382821144731639, + "learning_rate": 4.7693002377866295e-06, + "loss": 0.6729, + "step": 2016 + }, + { + "epoch": 0.16381060667587102, + "grad_norm": 9.489241945586448, + "learning_rate": 4.769024236632498e-06, + "loss": 0.4966, + "step": 2017 + }, + { + "epoch": 0.1638918216519126, + "grad_norm": 10.574632939063292, + "learning_rate": 4.768748078473898e-06, + "loss": 0.6579, + "step": 2018 + }, + { + "epoch": 0.1639730366279542, + "grad_norm": 3.323125086340652, + "learning_rate": 4.768471763329938e-06, + "loss": 0.9138, + "step": 2019 + }, + { + "epoch": 0.16405425160399578, + "grad_norm": 14.315467246083593, + "learning_rate": 4.768195291219738e-06, + "loss": 0.4129, + "step": 2020 + }, + { + "epoch": 0.16413546658003736, + "grad_norm": 3.5370073104621613, + "learning_rate": 4.767918662162428e-06, + "loss": 0.6045, + "step": 2021 + }, + { + "epoch": 0.16421668155607894, + "grad_norm": 4.952315377512936, + "learning_rate": 4.767641876177149e-06, + "loss": 0.5661, + "step": 2022 + }, + { + "epoch": 0.1642978965321205, + "grad_norm": 2.9834860448273273, + "learning_rate": 4.767364933283053e-06, + "loss": 0.5964, + "step": 2023 + }, + { + "epoch": 0.16437911150816212, + "grad_norm": 38.67374390447997, + "learning_rate": 4.767087833499305e-06, + "loss": 0.545, + "step": 2024 + }, + { + "epoch": 0.1644603264842037, + "grad_norm": 8.792176852895958, + "learning_rate": 4.7668105768450755e-06, + "loss": 0.6039, + "step": 2025 + }, + { + "epoch": 0.16454154146024527, + "grad_norm": 8.176394720323723, + "learning_rate": 4.766533163339553e-06, + "loss": 0.5289, + "step": 2026 + }, + { + "epoch": 0.16462275643628685, + "grad_norm": 6.977860465026282, + "learning_rate": 4.766255593001929e-06, + "loss": 0.4951, + "step": 2027 + }, + { + "epoch": 0.16470397141232843, + "grad_norm": 5.399038234082669, + "learning_rate": 4.765977865851413e-06, + "loss": 0.4714, + "step": 2028 + }, + { + "epoch": 0.16478518638837, + "grad_norm": 6.62984701521281, + "learning_rate": 4.765699981907221e-06, + "loss": 0.5556, + "step": 2029 + }, + { + "epoch": 0.1648664013644116, + "grad_norm": 6.225819915723398, + "learning_rate": 4.765421941188582e-06, + "loss": 0.8441, + "step": 2030 + }, + { + "epoch": 0.16494761634045318, + "grad_norm": 5.344504032565665, + "learning_rate": 4.765143743714734e-06, + "loss": 0.411, + "step": 2031 + }, + { + "epoch": 0.16502883131649476, + "grad_norm": 6.407678819914006, + "learning_rate": 4.764865389504927e-06, + "loss": 0.5036, + "step": 2032 + }, + { + "epoch": 0.16511004629253634, + "grad_norm": 4.502779183775894, + "learning_rate": 4.764586878578421e-06, + "loss": 0.5595, + "step": 2033 + }, + { + "epoch": 0.16519126126857792, + "grad_norm": 5.385105120731812, + "learning_rate": 4.7643082109544894e-06, + "loss": 0.5466, + "step": 2034 + }, + { + "epoch": 0.16527247624461952, + "grad_norm": 5.3322559203413435, + "learning_rate": 4.764029386652412e-06, + "loss": 0.5369, + "step": 2035 + }, + { + "epoch": 0.1653536912206611, + "grad_norm": 5.983847101773067, + "learning_rate": 4.763750405691483e-06, + "loss": 0.5569, + "step": 2036 + }, + { + "epoch": 0.16543490619670267, + "grad_norm": 4.212501542827999, + "learning_rate": 4.7634712680910075e-06, + "loss": 0.6493, + "step": 2037 + }, + { + "epoch": 0.16551612117274425, + "grad_norm": 5.116892931497582, + "learning_rate": 4.7631919738703e-06, + "loss": 0.511, + "step": 2038 + }, + { + "epoch": 0.16559733614878583, + "grad_norm": 6.147176716463638, + "learning_rate": 4.762912523048685e-06, + "loss": 0.5332, + "step": 2039 + }, + { + "epoch": 0.1656785511248274, + "grad_norm": 4.334801382262547, + "learning_rate": 4.7626329156455e-06, + "loss": 0.843, + "step": 2040 + }, + { + "epoch": 0.165759766100869, + "grad_norm": 6.491344666003298, + "learning_rate": 4.7623531516800916e-06, + "loss": 0.5301, + "step": 2041 + }, + { + "epoch": 0.1658409810769106, + "grad_norm": 4.47643734321392, + "learning_rate": 4.762073231171819e-06, + "loss": 0.7063, + "step": 2042 + }, + { + "epoch": 0.16592219605295216, + "grad_norm": 4.934657642144628, + "learning_rate": 4.76179315414005e-06, + "loss": 0.6217, + "step": 2043 + }, + { + "epoch": 0.16600341102899374, + "grad_norm": 6.800502146070916, + "learning_rate": 4.761512920604165e-06, + "loss": 0.5184, + "step": 2044 + }, + { + "epoch": 0.16608462600503532, + "grad_norm": 4.106268196492193, + "learning_rate": 4.761232530583556e-06, + "loss": 0.4322, + "step": 2045 + }, + { + "epoch": 0.16616584098107692, + "grad_norm": 4.860918988738907, + "learning_rate": 4.760951984097622e-06, + "loss": 0.541, + "step": 2046 + }, + { + "epoch": 0.1662470559571185, + "grad_norm": 3.6112840346969564, + "learning_rate": 4.760671281165777e-06, + "loss": 0.7025, + "step": 2047 + }, + { + "epoch": 0.16632827093316008, + "grad_norm": 3.363415520275136, + "learning_rate": 4.760390421807445e-06, + "loss": 0.5791, + "step": 2048 + }, + { + "epoch": 0.16640948590920165, + "grad_norm": 5.52653404100446, + "learning_rate": 4.760109406042057e-06, + "loss": 0.545, + "step": 2049 + }, + { + "epoch": 0.16649070088524323, + "grad_norm": 6.546707591746984, + "learning_rate": 4.759828233889061e-06, + "loss": 0.5705, + "step": 2050 + }, + { + "epoch": 0.1665719158612848, + "grad_norm": 5.661580374415145, + "learning_rate": 4.75954690536791e-06, + "loss": 0.5044, + "step": 2051 + }, + { + "epoch": 0.1666531308373264, + "grad_norm": 4.883386412845315, + "learning_rate": 4.759265420498073e-06, + "loss": 0.5467, + "step": 2052 + }, + { + "epoch": 0.166734345813368, + "grad_norm": 9.411178068543851, + "learning_rate": 4.758983779299025e-06, + "loss": 0.5892, + "step": 2053 + }, + { + "epoch": 0.16681556078940957, + "grad_norm": 4.132413410970519, + "learning_rate": 4.758701981790255e-06, + "loss": 0.986, + "step": 2054 + }, + { + "epoch": 0.16689677576545114, + "grad_norm": 34.42736555216525, + "learning_rate": 4.7584200279912614e-06, + "loss": 0.6445, + "step": 2055 + }, + { + "epoch": 0.16697799074149272, + "grad_norm": 4.276245246172677, + "learning_rate": 4.7581379179215545e-06, + "loss": 0.7808, + "step": 2056 + }, + { + "epoch": 0.16705920571753433, + "grad_norm": 3.964870477401249, + "learning_rate": 4.757855651600656e-06, + "loss": 0.7379, + "step": 2057 + }, + { + "epoch": 0.1671404206935759, + "grad_norm": 3.606188650388298, + "learning_rate": 4.757573229048095e-06, + "loss": 0.7927, + "step": 2058 + }, + { + "epoch": 0.16722163566961748, + "grad_norm": 4.909495914927784, + "learning_rate": 4.757290650283414e-06, + "loss": 0.5828, + "step": 2059 + }, + { + "epoch": 0.16730285064565906, + "grad_norm": 4.647983652975215, + "learning_rate": 4.757007915326167e-06, + "loss": 0.5334, + "step": 2060 + }, + { + "epoch": 0.16738406562170063, + "grad_norm": 6.091135267313866, + "learning_rate": 4.756725024195918e-06, + "loss": 0.7908, + "step": 2061 + }, + { + "epoch": 0.1674652805977422, + "grad_norm": 4.236635859135639, + "learning_rate": 4.75644197691224e-06, + "loss": 0.4387, + "step": 2062 + }, + { + "epoch": 0.16754649557378382, + "grad_norm": 5.254900085055447, + "learning_rate": 4.7561587734947195e-06, + "loss": 0.5377, + "step": 2063 + }, + { + "epoch": 0.1676277105498254, + "grad_norm": 6.08722014972973, + "learning_rate": 4.755875413962953e-06, + "loss": 0.4746, + "step": 2064 + }, + { + "epoch": 0.16770892552586697, + "grad_norm": 6.117784644667623, + "learning_rate": 4.7555918983365456e-06, + "loss": 0.4134, + "step": 2065 + }, + { + "epoch": 0.16779014050190855, + "grad_norm": 3.931238911985374, + "learning_rate": 4.755308226635117e-06, + "loss": 0.6176, + "step": 2066 + }, + { + "epoch": 0.16787135547795012, + "grad_norm": 5.993875492451563, + "learning_rate": 4.755024398878296e-06, + "loss": 0.5139, + "step": 2067 + }, + { + "epoch": 0.16795257045399173, + "grad_norm": 4.120681576998816, + "learning_rate": 4.75474041508572e-06, + "loss": 0.7734, + "step": 2068 + }, + { + "epoch": 0.1680337854300333, + "grad_norm": 4.2419734083458716, + "learning_rate": 4.7544562752770415e-06, + "loss": 0.6459, + "step": 2069 + }, + { + "epoch": 0.16811500040607488, + "grad_norm": 8.463809226446845, + "learning_rate": 4.75417197947192e-06, + "loss": 0.4706, + "step": 2070 + }, + { + "epoch": 0.16819621538211646, + "grad_norm": 3.8343323335367128, + "learning_rate": 4.753887527690027e-06, + "loss": 0.4932, + "step": 2071 + }, + { + "epoch": 0.16827743035815804, + "grad_norm": 8.218415227607817, + "learning_rate": 4.753602919951046e-06, + "loss": 0.5316, + "step": 2072 + }, + { + "epoch": 0.1683586453341996, + "grad_norm": 3.6050617955271216, + "learning_rate": 4.753318156274669e-06, + "loss": 0.5167, + "step": 2073 + }, + { + "epoch": 0.16843986031024122, + "grad_norm": 4.841750253634242, + "learning_rate": 4.753033236680602e-06, + "loss": 0.47, + "step": 2074 + }, + { + "epoch": 0.1685210752862828, + "grad_norm": 5.173184919410824, + "learning_rate": 4.75274816118856e-06, + "loss": 0.7099, + "step": 2075 + }, + { + "epoch": 0.16860229026232437, + "grad_norm": 3.9701269933467622, + "learning_rate": 4.7524629298182655e-06, + "loss": 0.7438, + "step": 2076 + }, + { + "epoch": 0.16868350523836595, + "grad_norm": 5.896346024335442, + "learning_rate": 4.752177542589459e-06, + "loss": 0.5601, + "step": 2077 + }, + { + "epoch": 0.16876472021440753, + "grad_norm": 4.249527053571267, + "learning_rate": 4.7518919995218854e-06, + "loss": 0.6669, + "step": 2078 + }, + { + "epoch": 0.16884593519044913, + "grad_norm": 6.037758485869208, + "learning_rate": 4.7516063006353035e-06, + "loss": 0.5073, + "step": 2079 + }, + { + "epoch": 0.1689271501664907, + "grad_norm": 3.7856446494128266, + "learning_rate": 4.7513204459494825e-06, + "loss": 0.5701, + "step": 2080 + }, + { + "epoch": 0.16900836514253229, + "grad_norm": 7.188036974920219, + "learning_rate": 4.751034435484201e-06, + "loss": 0.5066, + "step": 2081 + }, + { + "epoch": 0.16908958011857386, + "grad_norm": 4.317432007080439, + "learning_rate": 4.75074826925925e-06, + "loss": 0.7534, + "step": 2082 + }, + { + "epoch": 0.16917079509461544, + "grad_norm": 5.902119726898797, + "learning_rate": 4.750461947294431e-06, + "loss": 0.6171, + "step": 2083 + }, + { + "epoch": 0.16925201007065702, + "grad_norm": 3.7384712420136523, + "learning_rate": 4.750175469609555e-06, + "loss": 0.6519, + "step": 2084 + }, + { + "epoch": 0.16933322504669862, + "grad_norm": 3.71902625991903, + "learning_rate": 4.749888836224446e-06, + "loss": 0.6105, + "step": 2085 + }, + { + "epoch": 0.1694144400227402, + "grad_norm": 4.570880922255994, + "learning_rate": 4.749602047158937e-06, + "loss": 0.8081, + "step": 2086 + }, + { + "epoch": 0.16949565499878178, + "grad_norm": 5.288276551549225, + "learning_rate": 4.749315102432872e-06, + "loss": 0.6383, + "step": 2087 + }, + { + "epoch": 0.16957686997482335, + "grad_norm": 11.77165556632558, + "learning_rate": 4.749028002066106e-06, + "loss": 0.5472, + "step": 2088 + }, + { + "epoch": 0.16965808495086493, + "grad_norm": 4.971856688210904, + "learning_rate": 4.748740746078505e-06, + "loss": 0.3578, + "step": 2089 + }, + { + "epoch": 0.16973929992690653, + "grad_norm": 3.215058873377653, + "learning_rate": 4.748453334489947e-06, + "loss": 0.6725, + "step": 2090 + }, + { + "epoch": 0.1698205149029481, + "grad_norm": 5.921520125153694, + "learning_rate": 4.748165767320316e-06, + "loss": 0.604, + "step": 2091 + }, + { + "epoch": 0.1699017298789897, + "grad_norm": 16.542194531090658, + "learning_rate": 4.747878044589513e-06, + "loss": 0.527, + "step": 2092 + }, + { + "epoch": 0.16998294485503126, + "grad_norm": 6.697998722104277, + "learning_rate": 4.747590166317447e-06, + "loss": 0.6809, + "step": 2093 + }, + { + "epoch": 0.17006415983107284, + "grad_norm": 4.365123433682997, + "learning_rate": 4.7473021325240355e-06, + "loss": 0.6052, + "step": 2094 + }, + { + "epoch": 0.17014537480711442, + "grad_norm": 4.341123013344583, + "learning_rate": 4.74701394322921e-06, + "loss": 0.7157, + "step": 2095 + }, + { + "epoch": 0.17022658978315602, + "grad_norm": 7.815755600154752, + "learning_rate": 4.7467255984529124e-06, + "loss": 0.4511, + "step": 2096 + }, + { + "epoch": 0.1703078047591976, + "grad_norm": 3.5475344534604276, + "learning_rate": 4.746437098215094e-06, + "loss": 0.5972, + "step": 2097 + }, + { + "epoch": 0.17038901973523918, + "grad_norm": 5.6422926548593635, + "learning_rate": 4.746148442535717e-06, + "loss": 0.6985, + "step": 2098 + }, + { + "epoch": 0.17047023471128075, + "grad_norm": 3.7587908987149032, + "learning_rate": 4.745859631434757e-06, + "loss": 0.7605, + "step": 2099 + }, + { + "epoch": 0.17055144968732233, + "grad_norm": 6.674895969578671, + "learning_rate": 4.745570664932195e-06, + "loss": 0.7068, + "step": 2100 + }, + { + "epoch": 0.17063266466336394, + "grad_norm": 5.123301796152971, + "learning_rate": 4.745281543048027e-06, + "loss": 0.4672, + "step": 2101 + }, + { + "epoch": 0.1707138796394055, + "grad_norm": 5.4550745050456655, + "learning_rate": 4.744992265802261e-06, + "loss": 0.4934, + "step": 2102 + }, + { + "epoch": 0.1707950946154471, + "grad_norm": 5.148605782096688, + "learning_rate": 4.74470283321491e-06, + "loss": 0.5513, + "step": 2103 + }, + { + "epoch": 0.17087630959148867, + "grad_norm": 4.960917833837893, + "learning_rate": 4.7444132453060046e-06, + "loss": 0.5779, + "step": 2104 + }, + { + "epoch": 0.17095752456753024, + "grad_norm": 5.30615437756679, + "learning_rate": 4.744123502095579e-06, + "loss": 0.4572, + "step": 2105 + }, + { + "epoch": 0.17103873954357182, + "grad_norm": 12.9603922902617, + "learning_rate": 4.743833603603685e-06, + "loss": 0.6257, + "step": 2106 + }, + { + "epoch": 0.17111995451961343, + "grad_norm": 4.802287043719077, + "learning_rate": 4.743543549850381e-06, + "loss": 0.695, + "step": 2107 + }, + { + "epoch": 0.171201169495655, + "grad_norm": 10.403228449617778, + "learning_rate": 4.743253340855737e-06, + "loss": 0.6116, + "step": 2108 + }, + { + "epoch": 0.17128238447169658, + "grad_norm": 6.756935813007018, + "learning_rate": 4.742962976639835e-06, + "loss": 0.7586, + "step": 2109 + }, + { + "epoch": 0.17136359944773816, + "grad_norm": 10.07886788901266, + "learning_rate": 4.742672457222764e-06, + "loss": 0.4777, + "step": 2110 + }, + { + "epoch": 0.17144481442377973, + "grad_norm": 5.62161561654736, + "learning_rate": 4.742381782624629e-06, + "loss": 0.4269, + "step": 2111 + }, + { + "epoch": 0.17152602939982134, + "grad_norm": 5.949206043889717, + "learning_rate": 4.7420909528655416e-06, + "loss": 0.7793, + "step": 2112 + }, + { + "epoch": 0.17160724437586292, + "grad_norm": 4.263095903085148, + "learning_rate": 4.741799967965627e-06, + "loss": 0.5637, + "step": 2113 + }, + { + "epoch": 0.1716884593519045, + "grad_norm": 10.633709854598653, + "learning_rate": 4.74150882794502e-06, + "loss": 0.82, + "step": 2114 + }, + { + "epoch": 0.17176967432794607, + "grad_norm": 3.032916783451706, + "learning_rate": 4.741217532823864e-06, + "loss": 0.72, + "step": 2115 + }, + { + "epoch": 0.17185088930398765, + "grad_norm": 3.8608167614052484, + "learning_rate": 4.740926082622316e-06, + "loss": 0.7894, + "step": 2116 + }, + { + "epoch": 0.17193210428002922, + "grad_norm": 3.9765186741541436, + "learning_rate": 4.740634477360544e-06, + "loss": 0.5949, + "step": 2117 + }, + { + "epoch": 0.17201331925607083, + "grad_norm": 3.6048556532254246, + "learning_rate": 4.740342717058723e-06, + "loss": 0.6829, + "step": 2118 + }, + { + "epoch": 0.1720945342321124, + "grad_norm": 3.612675372859013, + "learning_rate": 4.740050801737045e-06, + "loss": 0.4803, + "step": 2119 + }, + { + "epoch": 0.17217574920815398, + "grad_norm": 4.645131781522265, + "learning_rate": 4.739758731415705e-06, + "loss": 0.7209, + "step": 2120 + }, + { + "epoch": 0.17225696418419556, + "grad_norm": 5.631469115723518, + "learning_rate": 4.739466506114916e-06, + "loss": 0.6008, + "step": 2121 + }, + { + "epoch": 0.17233817916023714, + "grad_norm": 4.1088983111658735, + "learning_rate": 4.739174125854896e-06, + "loss": 0.6917, + "step": 2122 + }, + { + "epoch": 0.17241939413627874, + "grad_norm": 5.169477468086094, + "learning_rate": 4.738881590655877e-06, + "loss": 0.5403, + "step": 2123 + }, + { + "epoch": 0.17250060911232032, + "grad_norm": 4.946107948001271, + "learning_rate": 4.738588900538102e-06, + "loss": 0.608, + "step": 2124 + }, + { + "epoch": 0.1725818240883619, + "grad_norm": 3.559651178268444, + "learning_rate": 4.738296055521821e-06, + "loss": 0.6463, + "step": 2125 + }, + { + "epoch": 0.17266303906440347, + "grad_norm": 6.129364916078834, + "learning_rate": 4.738003055627301e-06, + "loss": 0.4888, + "step": 2126 + }, + { + "epoch": 0.17274425404044505, + "grad_norm": 4.278147231585998, + "learning_rate": 4.7377099008748125e-06, + "loss": 0.8504, + "step": 2127 + }, + { + "epoch": 0.17282546901648663, + "grad_norm": 8.024624331958586, + "learning_rate": 4.737416591284643e-06, + "loss": 0.6727, + "step": 2128 + }, + { + "epoch": 0.17290668399252823, + "grad_norm": 29.45093767091559, + "learning_rate": 4.737123126877086e-06, + "loss": 0.7422, + "step": 2129 + }, + { + "epoch": 0.1729878989685698, + "grad_norm": 3.13136259957198, + "learning_rate": 4.736829507672449e-06, + "loss": 0.5307, + "step": 2130 + }, + { + "epoch": 0.17306911394461139, + "grad_norm": 7.567136439537917, + "learning_rate": 4.736535733691048e-06, + "loss": 0.6303, + "step": 2131 + }, + { + "epoch": 0.17315032892065296, + "grad_norm": 5.220061316823007, + "learning_rate": 4.73624180495321e-06, + "loss": 0.5315, + "step": 2132 + }, + { + "epoch": 0.17323154389669454, + "grad_norm": 7.195772361468456, + "learning_rate": 4.7359477214792754e-06, + "loss": 0.5764, + "step": 2133 + }, + { + "epoch": 0.17331275887273614, + "grad_norm": 7.3854852778069615, + "learning_rate": 4.735653483289591e-06, + "loss": 0.5438, + "step": 2134 + }, + { + "epoch": 0.17339397384877772, + "grad_norm": 6.686734429776475, + "learning_rate": 4.7353590904045184e-06, + "loss": 0.6625, + "step": 2135 + }, + { + "epoch": 0.1734751888248193, + "grad_norm": 3.9164463638901514, + "learning_rate": 4.735064542844428e-06, + "loss": 0.5811, + "step": 2136 + }, + { + "epoch": 0.17355640380086088, + "grad_norm": 4.957310787853842, + "learning_rate": 4.734769840629699e-06, + "loss": 0.4821, + "step": 2137 + }, + { + "epoch": 0.17363761877690245, + "grad_norm": 8.419842894201476, + "learning_rate": 4.734474983780724e-06, + "loss": 0.5227, + "step": 2138 + }, + { + "epoch": 0.17371883375294403, + "grad_norm": 8.041916137901524, + "learning_rate": 4.734179972317907e-06, + "loss": 0.6015, + "step": 2139 + }, + { + "epoch": 0.17380004872898563, + "grad_norm": 4.1188752273792195, + "learning_rate": 4.73388480626166e-06, + "loss": 0.6766, + "step": 2140 + }, + { + "epoch": 0.1738812637050272, + "grad_norm": 5.065614073087699, + "learning_rate": 4.733589485632407e-06, + "loss": 0.5939, + "step": 2141 + }, + { + "epoch": 0.1739624786810688, + "grad_norm": 3.8622027019274077, + "learning_rate": 4.733294010450583e-06, + "loss": 0.5127, + "step": 2142 + }, + { + "epoch": 0.17404369365711037, + "grad_norm": 5.31622281327025, + "learning_rate": 4.732998380736632e-06, + "loss": 0.6016, + "step": 2143 + }, + { + "epoch": 0.17412490863315194, + "grad_norm": 3.832733531671153, + "learning_rate": 4.732702596511012e-06, + "loss": 0.4538, + "step": 2144 + }, + { + "epoch": 0.17420612360919355, + "grad_norm": 4.720524116716847, + "learning_rate": 4.732406657794188e-06, + "loss": 0.5959, + "step": 2145 + }, + { + "epoch": 0.17428733858523512, + "grad_norm": 3.3574370172144734, + "learning_rate": 4.732110564606639e-06, + "loss": 0.6259, + "step": 2146 + }, + { + "epoch": 0.1743685535612767, + "grad_norm": 3.845949435304935, + "learning_rate": 4.7318143169688515e-06, + "loss": 0.5966, + "step": 2147 + }, + { + "epoch": 0.17444976853731828, + "grad_norm": 3.7946357565371973, + "learning_rate": 4.731517914901324e-06, + "loss": 0.6552, + "step": 2148 + }, + { + "epoch": 0.17453098351335986, + "grad_norm": 8.89191739354207, + "learning_rate": 4.731221358424569e-06, + "loss": 0.7327, + "step": 2149 + }, + { + "epoch": 0.17461219848940143, + "grad_norm": 6.740032168112452, + "learning_rate": 4.730924647559103e-06, + "loss": 0.6989, + "step": 2150 + }, + { + "epoch": 0.17469341346544304, + "grad_norm": 4.6320616184970875, + "learning_rate": 4.730627782325459e-06, + "loss": 0.5567, + "step": 2151 + }, + { + "epoch": 0.17477462844148461, + "grad_norm": 4.627022975502933, + "learning_rate": 4.730330762744178e-06, + "loss": 0.5177, + "step": 2152 + }, + { + "epoch": 0.1748558434175262, + "grad_norm": 5.2547160244571876, + "learning_rate": 4.730033588835812e-06, + "loss": 0.578, + "step": 2153 + }, + { + "epoch": 0.17493705839356777, + "grad_norm": 4.545741644009064, + "learning_rate": 4.729736260620924e-06, + "loss": 0.5025, + "step": 2154 + }, + { + "epoch": 0.17501827336960935, + "grad_norm": 3.1745271529048074, + "learning_rate": 4.729438778120088e-06, + "loss": 0.6065, + "step": 2155 + }, + { + "epoch": 0.17509948834565095, + "grad_norm": 3.614800398396882, + "learning_rate": 4.729141141353887e-06, + "loss": 0.5726, + "step": 2156 + }, + { + "epoch": 0.17518070332169253, + "grad_norm": 4.093940125645144, + "learning_rate": 4.7288433503429165e-06, + "loss": 0.7513, + "step": 2157 + }, + { + "epoch": 0.1752619182977341, + "grad_norm": 6.4087498177915885, + "learning_rate": 4.728545405107782e-06, + "loss": 0.5511, + "step": 2158 + }, + { + "epoch": 0.17534313327377568, + "grad_norm": 6.964419687177815, + "learning_rate": 4.7282473056691e-06, + "loss": 0.5388, + "step": 2159 + }, + { + "epoch": 0.17542434824981726, + "grad_norm": 7.381195247577337, + "learning_rate": 4.727949052047498e-06, + "loss": 0.5553, + "step": 2160 + }, + { + "epoch": 0.17550556322585883, + "grad_norm": 3.447447501574935, + "learning_rate": 4.7276506442636125e-06, + "loss": 0.6407, + "step": 2161 + }, + { + "epoch": 0.17558677820190044, + "grad_norm": 3.624265660451841, + "learning_rate": 4.727352082338092e-06, + "loss": 0.5699, + "step": 2162 + }, + { + "epoch": 0.17566799317794202, + "grad_norm": 4.736948616378692, + "learning_rate": 4.727053366291595e-06, + "loss": 0.5561, + "step": 2163 + }, + { + "epoch": 0.1757492081539836, + "grad_norm": 5.557189925086155, + "learning_rate": 4.726754496144792e-06, + "loss": 0.6388, + "step": 2164 + }, + { + "epoch": 0.17583042313002517, + "grad_norm": 5.654257813296, + "learning_rate": 4.726455471918363e-06, + "loss": 0.5798, + "step": 2165 + }, + { + "epoch": 0.17591163810606675, + "grad_norm": 4.017186015236763, + "learning_rate": 4.726156293632998e-06, + "loss": 0.6494, + "step": 2166 + }, + { + "epoch": 0.17599285308210835, + "grad_norm": 3.2976783263541036, + "learning_rate": 4.725856961309401e-06, + "loss": 0.6894, + "step": 2167 + }, + { + "epoch": 0.17607406805814993, + "grad_norm": 9.505301533813855, + "learning_rate": 4.725557474968281e-06, + "loss": 0.5775, + "step": 2168 + }, + { + "epoch": 0.1761552830341915, + "grad_norm": 5.237101724122944, + "learning_rate": 4.725257834630362e-06, + "loss": 0.5132, + "step": 2169 + }, + { + "epoch": 0.17623649801023308, + "grad_norm": 4.666530981374312, + "learning_rate": 4.7249580403163786e-06, + "loss": 0.4708, + "step": 2170 + }, + { + "epoch": 0.17631771298627466, + "grad_norm": 3.82098458375026, + "learning_rate": 4.7246580920470746e-06, + "loss": 0.6887, + "step": 2171 + }, + { + "epoch": 0.17639892796231624, + "grad_norm": 4.794596840480159, + "learning_rate": 4.7243579898432035e-06, + "loss": 0.5326, + "step": 2172 + }, + { + "epoch": 0.17648014293835784, + "grad_norm": 5.163380467947997, + "learning_rate": 4.724057733725532e-06, + "loss": 0.4342, + "step": 2173 + }, + { + "epoch": 0.17656135791439942, + "grad_norm": 6.130524877364227, + "learning_rate": 4.723757323714836e-06, + "loss": 0.5504, + "step": 2174 + }, + { + "epoch": 0.176642572890441, + "grad_norm": 3.6388040943311566, + "learning_rate": 4.723456759831903e-06, + "loss": 0.519, + "step": 2175 + }, + { + "epoch": 0.17672378786648257, + "grad_norm": 5.217405099472009, + "learning_rate": 4.7231560420975294e-06, + "loss": 0.4867, + "step": 2176 + }, + { + "epoch": 0.17680500284252415, + "grad_norm": 8.145512643236385, + "learning_rate": 4.722855170532523e-06, + "loss": 0.532, + "step": 2177 + }, + { + "epoch": 0.17688621781856576, + "grad_norm": 6.513554956269954, + "learning_rate": 4.7225541451577035e-06, + "loss": 0.7263, + "step": 2178 + }, + { + "epoch": 0.17696743279460733, + "grad_norm": 5.647381362856042, + "learning_rate": 4.7222529659939e-06, + "loss": 0.5248, + "step": 2179 + }, + { + "epoch": 0.1770486477706489, + "grad_norm": 8.634842181250088, + "learning_rate": 4.721951633061952e-06, + "loss": 0.5163, + "step": 2180 + }, + { + "epoch": 0.1771298627466905, + "grad_norm": 5.804232556564542, + "learning_rate": 4.721650146382711e-06, + "loss": 0.5169, + "step": 2181 + }, + { + "epoch": 0.17721107772273206, + "grad_norm": 5.170134284377379, + "learning_rate": 4.721348505977037e-06, + "loss": 0.569, + "step": 2182 + }, + { + "epoch": 0.17729229269877364, + "grad_norm": 6.347534050197857, + "learning_rate": 4.721046711865803e-06, + "loss": 0.7412, + "step": 2183 + }, + { + "epoch": 0.17737350767481525, + "grad_norm": 3.947255210170048, + "learning_rate": 4.720744764069892e-06, + "loss": 0.6272, + "step": 2184 + }, + { + "epoch": 0.17745472265085682, + "grad_norm": 8.963510036415032, + "learning_rate": 4.7204426626101955e-06, + "loss": 0.5039, + "step": 2185 + }, + { + "epoch": 0.1775359376268984, + "grad_norm": 5.0045242805343255, + "learning_rate": 4.720140407507619e-06, + "loss": 0.5961, + "step": 2186 + }, + { + "epoch": 0.17761715260293998, + "grad_norm": 80.66410418515012, + "learning_rate": 4.719837998783075e-06, + "loss": 0.6292, + "step": 2187 + }, + { + "epoch": 0.17769836757898155, + "grad_norm": 3.8541304256097804, + "learning_rate": 4.7195354364574915e-06, + "loss": 0.5465, + "step": 2188 + }, + { + "epoch": 0.17777958255502316, + "grad_norm": 3.524866314545861, + "learning_rate": 4.719232720551802e-06, + "loss": 0.6409, + "step": 2189 + }, + { + "epoch": 0.17786079753106473, + "grad_norm": 3.929607398758829, + "learning_rate": 4.718929851086953e-06, + "loss": 0.5239, + "step": 2190 + }, + { + "epoch": 0.1779420125071063, + "grad_norm": 6.4534562177957575, + "learning_rate": 4.718626828083902e-06, + "loss": 0.515, + "step": 2191 + }, + { + "epoch": 0.1780232274831479, + "grad_norm": 4.416334886882551, + "learning_rate": 4.718323651563616e-06, + "loss": 0.4572, + "step": 2192 + }, + { + "epoch": 0.17810444245918947, + "grad_norm": 4.2150922659750085, + "learning_rate": 4.718020321547075e-06, + "loss": 0.6128, + "step": 2193 + }, + { + "epoch": 0.17818565743523104, + "grad_norm": 3.3177339549952705, + "learning_rate": 4.717716838055265e-06, + "loss": 0.642, + "step": 2194 + }, + { + "epoch": 0.17826687241127265, + "grad_norm": 4.248446599343683, + "learning_rate": 4.717413201109187e-06, + "loss": 0.8113, + "step": 2195 + }, + { + "epoch": 0.17834808738731422, + "grad_norm": 4.561435467958204, + "learning_rate": 4.717109410729851e-06, + "loss": 0.6368, + "step": 2196 + }, + { + "epoch": 0.1784293023633558, + "grad_norm": 5.104529981932405, + "learning_rate": 4.716805466938278e-06, + "loss": 0.6438, + "step": 2197 + }, + { + "epoch": 0.17851051733939738, + "grad_norm": 3.5323774996376374, + "learning_rate": 4.7165013697555e-06, + "loss": 0.6165, + "step": 2198 + }, + { + "epoch": 0.17859173231543896, + "grad_norm": 6.640997619244458, + "learning_rate": 4.716197119202556e-06, + "loss": 0.6347, + "step": 2199 + }, + { + "epoch": 0.17867294729148056, + "grad_norm": 11.632083395111554, + "learning_rate": 4.715892715300501e-06, + "loss": 0.7224, + "step": 2200 + }, + { + "epoch": 0.17875416226752214, + "grad_norm": 4.224735988915179, + "learning_rate": 4.7155881580703984e-06, + "loss": 0.5769, + "step": 2201 + }, + { + "epoch": 0.17883537724356371, + "grad_norm": 7.9851742695130685, + "learning_rate": 4.71528344753332e-06, + "loss": 0.4985, + "step": 2202 + }, + { + "epoch": 0.1789165922196053, + "grad_norm": 6.25379024192091, + "learning_rate": 4.714978583710352e-06, + "loss": 0.5391, + "step": 2203 + }, + { + "epoch": 0.17899780719564687, + "grad_norm": 3.486820916489094, + "learning_rate": 4.714673566622589e-06, + "loss": 0.567, + "step": 2204 + }, + { + "epoch": 0.17907902217168845, + "grad_norm": 3.4864795044696617, + "learning_rate": 4.714368396291135e-06, + "loss": 0.7785, + "step": 2205 + }, + { + "epoch": 0.17916023714773005, + "grad_norm": 3.6141664132762172, + "learning_rate": 4.714063072737108e-06, + "loss": 0.4835, + "step": 2206 + }, + { + "epoch": 0.17924145212377163, + "grad_norm": 5.839740334780792, + "learning_rate": 4.713757595981634e-06, + "loss": 0.5031, + "step": 2207 + }, + { + "epoch": 0.1793226670998132, + "grad_norm": 8.510815914171106, + "learning_rate": 4.713451966045851e-06, + "loss": 0.6997, + "step": 2208 + }, + { + "epoch": 0.17940388207585478, + "grad_norm": 4.803667331216526, + "learning_rate": 4.713146182950905e-06, + "loss": 0.5733, + "step": 2209 + }, + { + "epoch": 0.17948509705189636, + "grad_norm": 3.5388152829208015, + "learning_rate": 4.7128402467179575e-06, + "loss": 0.5907, + "step": 2210 + }, + { + "epoch": 0.17956631202793796, + "grad_norm": 3.03616427792538, + "learning_rate": 4.712534157368176e-06, + "loss": 0.5774, + "step": 2211 + }, + { + "epoch": 0.17964752700397954, + "grad_norm": 9.760631640572191, + "learning_rate": 4.7122279149227405e-06, + "loss": 0.5262, + "step": 2212 + }, + { + "epoch": 0.17972874198002112, + "grad_norm": 4.386181821659895, + "learning_rate": 4.711921519402841e-06, + "loss": 0.6952, + "step": 2213 + }, + { + "epoch": 0.1798099569560627, + "grad_norm": 3.3351010256653573, + "learning_rate": 4.711614970829679e-06, + "loss": 0.5091, + "step": 2214 + }, + { + "epoch": 0.17989117193210427, + "grad_norm": 6.9608846044796024, + "learning_rate": 4.711308269224466e-06, + "loss": 0.4963, + "step": 2215 + }, + { + "epoch": 0.17997238690814585, + "grad_norm": 5.215289754630284, + "learning_rate": 4.7110014146084235e-06, + "loss": 0.5736, + "step": 2216 + }, + { + "epoch": 0.18005360188418745, + "grad_norm": 11.93663447822261, + "learning_rate": 4.710694407002785e-06, + "loss": 0.7406, + "step": 2217 + }, + { + "epoch": 0.18013481686022903, + "grad_norm": 5.163645505705543, + "learning_rate": 4.710387246428794e-06, + "loss": 0.6448, + "step": 2218 + }, + { + "epoch": 0.1802160318362706, + "grad_norm": 4.661698490114284, + "learning_rate": 4.710079932907703e-06, + "loss": 0.7224, + "step": 2219 + }, + { + "epoch": 0.18029724681231218, + "grad_norm": 3.401258624128954, + "learning_rate": 4.7097724664607775e-06, + "loss": 0.5251, + "step": 2220 + }, + { + "epoch": 0.18037846178835376, + "grad_norm": 5.125047047447445, + "learning_rate": 4.709464847109292e-06, + "loss": 0.6384, + "step": 2221 + }, + { + "epoch": 0.18045967676439537, + "grad_norm": 4.023853533344583, + "learning_rate": 4.709157074874533e-06, + "loss": 0.5047, + "step": 2222 + }, + { + "epoch": 0.18054089174043694, + "grad_norm": 6.949692493391, + "learning_rate": 4.7088491497777965e-06, + "loss": 0.5204, + "step": 2223 + }, + { + "epoch": 0.18062210671647852, + "grad_norm": 4.10788473501353, + "learning_rate": 4.708541071840388e-06, + "loss": 0.5865, + "step": 2224 + }, + { + "epoch": 0.1807033216925201, + "grad_norm": 4.694036134420709, + "learning_rate": 4.708232841083628e-06, + "loss": 0.5692, + "step": 2225 + }, + { + "epoch": 0.18078453666856167, + "grad_norm": 5.994257706825592, + "learning_rate": 4.70792445752884e-06, + "loss": 0.5018, + "step": 2226 + }, + { + "epoch": 0.18086575164460328, + "grad_norm": 8.767129488205637, + "learning_rate": 4.707615921197366e-06, + "loss": 0.5381, + "step": 2227 + }, + { + "epoch": 0.18094696662064486, + "grad_norm": 4.847865205503708, + "learning_rate": 4.707307232110554e-06, + "loss": 0.7474, + "step": 2228 + }, + { + "epoch": 0.18102818159668643, + "grad_norm": 5.668800604259548, + "learning_rate": 4.706998390289763e-06, + "loss": 0.6027, + "step": 2229 + }, + { + "epoch": 0.181109396572728, + "grad_norm": 6.745634612012881, + "learning_rate": 4.706689395756363e-06, + "loss": 0.5229, + "step": 2230 + }, + { + "epoch": 0.1811906115487696, + "grad_norm": 4.895045129217889, + "learning_rate": 4.706380248531737e-06, + "loss": 0.5253, + "step": 2231 + }, + { + "epoch": 0.18127182652481116, + "grad_norm": 4.810666628926101, + "learning_rate": 4.706070948637274e-06, + "loss": 0.6069, + "step": 2232 + }, + { + "epoch": 0.18135304150085277, + "grad_norm": 7.912604183646548, + "learning_rate": 4.705761496094377e-06, + "loss": 0.5576, + "step": 2233 + }, + { + "epoch": 0.18143425647689435, + "grad_norm": 4.074525434845986, + "learning_rate": 4.705451890924459e-06, + "loss": 0.6926, + "step": 2234 + }, + { + "epoch": 0.18151547145293592, + "grad_norm": 9.471081474256104, + "learning_rate": 4.705142133148943e-06, + "loss": 0.5167, + "step": 2235 + }, + { + "epoch": 0.1815966864289775, + "grad_norm": 4.8687622437884555, + "learning_rate": 4.70483222278926e-06, + "loss": 0.6418, + "step": 2236 + }, + { + "epoch": 0.18167790140501908, + "grad_norm": 4.3746758828868995, + "learning_rate": 4.704522159866857e-06, + "loss": 0.5135, + "step": 2237 + }, + { + "epoch": 0.18175911638106068, + "grad_norm": 4.148684813653182, + "learning_rate": 4.704211944403188e-06, + "loss": 0.6289, + "step": 2238 + }, + { + "epoch": 0.18184033135710226, + "grad_norm": 4.101220246406701, + "learning_rate": 4.703901576419717e-06, + "loss": 0.473, + "step": 2239 + }, + { + "epoch": 0.18192154633314384, + "grad_norm": 4.884276965609116, + "learning_rate": 4.703591055937922e-06, + "loss": 0.5761, + "step": 2240 + }, + { + "epoch": 0.1820027613091854, + "grad_norm": 9.506027279588086, + "learning_rate": 4.7032803829792875e-06, + "loss": 0.5745, + "step": 2241 + }, + { + "epoch": 0.182083976285227, + "grad_norm": 4.5640936040307825, + "learning_rate": 4.702969557565312e-06, + "loss": 0.6804, + "step": 2242 + }, + { + "epoch": 0.18216519126126857, + "grad_norm": 5.16553253038635, + "learning_rate": 4.702658579717502e-06, + "loss": 0.4686, + "step": 2243 + }, + { + "epoch": 0.18224640623731017, + "grad_norm": 5.1964649031938395, + "learning_rate": 4.702347449457375e-06, + "loss": 0.5184, + "step": 2244 + }, + { + "epoch": 0.18232762121335175, + "grad_norm": 8.556515016420585, + "learning_rate": 4.702036166806461e-06, + "loss": 0.3698, + "step": 2245 + }, + { + "epoch": 0.18240883618939333, + "grad_norm": 7.054847431413247, + "learning_rate": 4.7017247317862976e-06, + "loss": 0.5364, + "step": 2246 + }, + { + "epoch": 0.1824900511654349, + "grad_norm": 5.019891726347235, + "learning_rate": 4.701413144418437e-06, + "loss": 0.4901, + "step": 2247 + }, + { + "epoch": 0.18257126614147648, + "grad_norm": 4.195016853555098, + "learning_rate": 4.701101404724435e-06, + "loss": 0.4601, + "step": 2248 + }, + { + "epoch": 0.18265248111751808, + "grad_norm": 5.378788487799258, + "learning_rate": 4.700789512725867e-06, + "loss": 0.6267, + "step": 2249 + }, + { + "epoch": 0.18273369609355966, + "grad_norm": 3.793906030224367, + "learning_rate": 4.700477468444311e-06, + "loss": 0.6438, + "step": 2250 + }, + { + "epoch": 0.18281491106960124, + "grad_norm": 4.868151253449179, + "learning_rate": 4.700165271901361e-06, + "loss": 0.6933, + "step": 2251 + }, + { + "epoch": 0.18289612604564282, + "grad_norm": 7.777361957555963, + "learning_rate": 4.699852923118618e-06, + "loss": 0.4894, + "step": 2252 + }, + { + "epoch": 0.1829773410216844, + "grad_norm": 4.106355721520342, + "learning_rate": 4.699540422117695e-06, + "loss": 0.4232, + "step": 2253 + }, + { + "epoch": 0.18305855599772597, + "grad_norm": 5.430574051132866, + "learning_rate": 4.699227768920216e-06, + "loss": 0.5387, + "step": 2254 + }, + { + "epoch": 0.18313977097376757, + "grad_norm": 6.575598696133923, + "learning_rate": 4.6989149635478145e-06, + "loss": 0.5371, + "step": 2255 + }, + { + "epoch": 0.18322098594980915, + "grad_norm": 5.4676777555816765, + "learning_rate": 4.698602006022136e-06, + "loss": 0.5012, + "step": 2256 + }, + { + "epoch": 0.18330220092585073, + "grad_norm": 4.576711281154184, + "learning_rate": 4.698288896364834e-06, + "loss": 0.6324, + "step": 2257 + }, + { + "epoch": 0.1833834159018923, + "grad_norm": 11.976819517596883, + "learning_rate": 4.697975634597574e-06, + "loss": 0.5405, + "step": 2258 + }, + { + "epoch": 0.18346463087793388, + "grad_norm": 6.301644428134142, + "learning_rate": 4.697662220742033e-06, + "loss": 0.5355, + "step": 2259 + }, + { + "epoch": 0.1835458458539755, + "grad_norm": 16.567162364593425, + "learning_rate": 4.697348654819898e-06, + "loss": 0.5812, + "step": 2260 + }, + { + "epoch": 0.18362706083001706, + "grad_norm": 5.845436104952721, + "learning_rate": 4.697034936852865e-06, + "loss": 0.6177, + "step": 2261 + }, + { + "epoch": 0.18370827580605864, + "grad_norm": 3.4478757836782608, + "learning_rate": 4.6967210668626415e-06, + "loss": 0.6487, + "step": 2262 + }, + { + "epoch": 0.18378949078210022, + "grad_norm": 8.26455582424995, + "learning_rate": 4.696407044870947e-06, + "loss": 0.6865, + "step": 2263 + }, + { + "epoch": 0.1838707057581418, + "grad_norm": 3.6489257014011924, + "learning_rate": 4.696092870899509e-06, + "loss": 0.4881, + "step": 2264 + }, + { + "epoch": 0.18395192073418337, + "grad_norm": 7.476743860852032, + "learning_rate": 4.695778544970066e-06, + "loss": 0.5365, + "step": 2265 + }, + { + "epoch": 0.18403313571022498, + "grad_norm": 5.335228274279977, + "learning_rate": 4.695464067104371e-06, + "loss": 0.5978, + "step": 2266 + }, + { + "epoch": 0.18411435068626655, + "grad_norm": 7.0816544622030175, + "learning_rate": 4.6951494373241805e-06, + "loss": 0.769, + "step": 2267 + }, + { + "epoch": 0.18419556566230813, + "grad_norm": 3.851216245811756, + "learning_rate": 4.694834655651266e-06, + "loss": 0.627, + "step": 2268 + }, + { + "epoch": 0.1842767806383497, + "grad_norm": 4.805289628209942, + "learning_rate": 4.6945197221074104e-06, + "loss": 0.8659, + "step": 2269 + }, + { + "epoch": 0.18435799561439128, + "grad_norm": 7.794985897939018, + "learning_rate": 4.694204636714403e-06, + "loss": 0.5485, + "step": 2270 + }, + { + "epoch": 0.1844392105904329, + "grad_norm": 20.84411786544806, + "learning_rate": 4.693889399494049e-06, + "loss": 0.5583, + "step": 2271 + }, + { + "epoch": 0.18452042556647447, + "grad_norm": 4.8093342027022326, + "learning_rate": 4.693574010468159e-06, + "loss": 0.7422, + "step": 2272 + }, + { + "epoch": 0.18460164054251604, + "grad_norm": 4.897421838824223, + "learning_rate": 4.693258469658557e-06, + "loss": 0.6693, + "step": 2273 + }, + { + "epoch": 0.18468285551855762, + "grad_norm": 5.418227314133965, + "learning_rate": 4.692942777087076e-06, + "loss": 0.5361, + "step": 2274 + }, + { + "epoch": 0.1847640704945992, + "grad_norm": 4.1871350066526745, + "learning_rate": 4.692626932775561e-06, + "loss": 0.6755, + "step": 2275 + }, + { + "epoch": 0.18484528547064077, + "grad_norm": 10.739798887484266, + "learning_rate": 4.6923109367458665e-06, + "loss": 0.7373, + "step": 2276 + }, + { + "epoch": 0.18492650044668238, + "grad_norm": 4.134435714566178, + "learning_rate": 4.6919947890198585e-06, + "loss": 0.5737, + "step": 2277 + }, + { + "epoch": 0.18500771542272396, + "grad_norm": 4.222974710982374, + "learning_rate": 4.691678489619411e-06, + "loss": 0.7376, + "step": 2278 + }, + { + "epoch": 0.18508893039876553, + "grad_norm": 4.6176435172773065, + "learning_rate": 4.691362038566411e-06, + "loss": 0.6068, + "step": 2279 + }, + { + "epoch": 0.1851701453748071, + "grad_norm": 3.8516607452654577, + "learning_rate": 4.691045435882758e-06, + "loss": 0.5776, + "step": 2280 + }, + { + "epoch": 0.1852513603508487, + "grad_norm": 4.380266220174846, + "learning_rate": 4.690728681590354e-06, + "loss": 0.4589, + "step": 2281 + }, + { + "epoch": 0.1853325753268903, + "grad_norm": 20.10704260314573, + "learning_rate": 4.6904117757111215e-06, + "loss": 0.4806, + "step": 2282 + }, + { + "epoch": 0.18541379030293187, + "grad_norm": 4.622570996308856, + "learning_rate": 4.6900947182669855e-06, + "loss": 0.5437, + "step": 2283 + }, + { + "epoch": 0.18549500527897345, + "grad_norm": 3.4999814335004125, + "learning_rate": 4.689777509279886e-06, + "loss": 0.6253, + "step": 2284 + }, + { + "epoch": 0.18557622025501502, + "grad_norm": 8.159646406598347, + "learning_rate": 4.689460148771773e-06, + "loss": 0.6941, + "step": 2285 + }, + { + "epoch": 0.1856574352310566, + "grad_norm": 4.119601877397449, + "learning_rate": 4.6891426367646046e-06, + "loss": 0.5396, + "step": 2286 + }, + { + "epoch": 0.18573865020709818, + "grad_norm": 5.783579437576225, + "learning_rate": 4.6888249732803516e-06, + "loss": 0.3815, + "step": 2287 + }, + { + "epoch": 0.18581986518313978, + "grad_norm": 4.479086903423486, + "learning_rate": 4.688507158340994e-06, + "loss": 0.5961, + "step": 2288 + }, + { + "epoch": 0.18590108015918136, + "grad_norm": 5.521280967881707, + "learning_rate": 4.688189191968524e-06, + "loss": 0.5213, + "step": 2289 + }, + { + "epoch": 0.18598229513522294, + "grad_norm": 5.560685189930327, + "learning_rate": 4.687871074184944e-06, + "loss": 0.5069, + "step": 2290 + }, + { + "epoch": 0.1860635101112645, + "grad_norm": 8.713540857848809, + "learning_rate": 4.687552805012263e-06, + "loss": 0.4698, + "step": 2291 + }, + { + "epoch": 0.1861447250873061, + "grad_norm": 7.120678804920857, + "learning_rate": 4.687234384472506e-06, + "loss": 0.5941, + "step": 2292 + }, + { + "epoch": 0.1862259400633477, + "grad_norm": 3.7747766229799105, + "learning_rate": 4.686915812587706e-06, + "loss": 0.532, + "step": 2293 + }, + { + "epoch": 0.18630715503938927, + "grad_norm": 5.78663404222734, + "learning_rate": 4.686597089379905e-06, + "loss": 0.6332, + "step": 2294 + }, + { + "epoch": 0.18638837001543085, + "grad_norm": 5.344791467556041, + "learning_rate": 4.6862782148711584e-06, + "loss": 0.551, + "step": 2295 + }, + { + "epoch": 0.18646958499147243, + "grad_norm": 3.2548480768417183, + "learning_rate": 4.685959189083531e-06, + "loss": 0.7081, + "step": 2296 + }, + { + "epoch": 0.186550799967514, + "grad_norm": 4.522340647304046, + "learning_rate": 4.685640012039095e-06, + "loss": 0.6577, + "step": 2297 + }, + { + "epoch": 0.18663201494355558, + "grad_norm": 4.317215335517002, + "learning_rate": 4.685320683759939e-06, + "loss": 0.5544, + "step": 2298 + }, + { + "epoch": 0.18671322991959718, + "grad_norm": 5.363458185770196, + "learning_rate": 4.685001204268156e-06, + "loss": 0.4849, + "step": 2299 + }, + { + "epoch": 0.18679444489563876, + "grad_norm": 25.593738509492916, + "learning_rate": 4.684681573585854e-06, + "loss": 0.5735, + "step": 2300 + }, + { + "epoch": 0.18687565987168034, + "grad_norm": 6.172329696323195, + "learning_rate": 4.684361791735149e-06, + "loss": 0.5465, + "step": 2301 + }, + { + "epoch": 0.18695687484772192, + "grad_norm": 4.8669050067958874, + "learning_rate": 4.684041858738169e-06, + "loss": 0.6078, + "step": 2302 + }, + { + "epoch": 0.1870380898237635, + "grad_norm": 12.729610338065322, + "learning_rate": 4.683721774617052e-06, + "loss": 0.5647, + "step": 2303 + }, + { + "epoch": 0.1871193047998051, + "grad_norm": 5.8134799742582475, + "learning_rate": 4.6834015393939445e-06, + "loss": 0.5026, + "step": 2304 + }, + { + "epoch": 0.18720051977584667, + "grad_norm": 3.5817996920030257, + "learning_rate": 4.683081153091006e-06, + "loss": 0.4811, + "step": 2305 + }, + { + "epoch": 0.18728173475188825, + "grad_norm": 4.114121320477102, + "learning_rate": 4.682760615730405e-06, + "loss": 0.6439, + "step": 2306 + }, + { + "epoch": 0.18736294972792983, + "grad_norm": 7.202440009743923, + "learning_rate": 4.682439927334323e-06, + "loss": 0.6019, + "step": 2307 + }, + { + "epoch": 0.1874441647039714, + "grad_norm": 5.323112544773871, + "learning_rate": 4.682119087924948e-06, + "loss": 0.7116, + "step": 2308 + }, + { + "epoch": 0.18752537968001298, + "grad_norm": 3.6253125361558967, + "learning_rate": 4.681798097524479e-06, + "loss": 0.6427, + "step": 2309 + }, + { + "epoch": 0.1876065946560546, + "grad_norm": 4.355698784301835, + "learning_rate": 4.681476956155131e-06, + "loss": 0.4505, + "step": 2310 + }, + { + "epoch": 0.18768780963209616, + "grad_norm": 3.395486667581571, + "learning_rate": 4.681155663839122e-06, + "loss": 0.5654, + "step": 2311 + }, + { + "epoch": 0.18776902460813774, + "grad_norm": 5.363908094574731, + "learning_rate": 4.680834220598685e-06, + "loss": 0.5545, + "step": 2312 + }, + { + "epoch": 0.18785023958417932, + "grad_norm": 4.304796229403271, + "learning_rate": 4.6805126264560605e-06, + "loss": 0.6393, + "step": 2313 + }, + { + "epoch": 0.1879314545602209, + "grad_norm": 4.291548258367333, + "learning_rate": 4.680190881433504e-06, + "loss": 0.6884, + "step": 2314 + }, + { + "epoch": 0.1880126695362625, + "grad_norm": 9.784318271093905, + "learning_rate": 4.679868985553276e-06, + "loss": 0.5777, + "step": 2315 + }, + { + "epoch": 0.18809388451230408, + "grad_norm": 7.761537889560317, + "learning_rate": 4.6795469388376525e-06, + "loss": 0.47, + "step": 2316 + }, + { + "epoch": 0.18817509948834565, + "grad_norm": 6.995156273431482, + "learning_rate": 4.6792247413089145e-06, + "loss": 0.6137, + "step": 2317 + }, + { + "epoch": 0.18825631446438723, + "grad_norm": 4.313226522351422, + "learning_rate": 4.678902392989359e-06, + "loss": 0.6229, + "step": 2318 + }, + { + "epoch": 0.1883375294404288, + "grad_norm": 5.081068946857541, + "learning_rate": 4.678579893901288e-06, + "loss": 0.5712, + "step": 2319 + }, + { + "epoch": 0.18841874441647039, + "grad_norm": 4.809789843673943, + "learning_rate": 4.678257244067019e-06, + "loss": 0.5531, + "step": 2320 + }, + { + "epoch": 0.188499959392512, + "grad_norm": 3.8452865625471255, + "learning_rate": 4.677934443508877e-06, + "loss": 0.5659, + "step": 2321 + }, + { + "epoch": 0.18858117436855357, + "grad_norm": 4.917273325549435, + "learning_rate": 4.6776114922491985e-06, + "loss": 0.5518, + "step": 2322 + }, + { + "epoch": 0.18866238934459514, + "grad_norm": 4.459277191818069, + "learning_rate": 4.67728839031033e-06, + "loss": 0.5119, + "step": 2323 + }, + { + "epoch": 0.18874360432063672, + "grad_norm": 8.098290875630656, + "learning_rate": 4.676965137714626e-06, + "loss": 0.5824, + "step": 2324 + }, + { + "epoch": 0.1888248192966783, + "grad_norm": 5.792865775052247, + "learning_rate": 4.676641734484457e-06, + "loss": 0.5991, + "step": 2325 + }, + { + "epoch": 0.1889060342727199, + "grad_norm": 6.158616589247779, + "learning_rate": 4.6763181806422e-06, + "loss": 0.6869, + "step": 2326 + }, + { + "epoch": 0.18898724924876148, + "grad_norm": 6.1672909888851395, + "learning_rate": 4.675994476210243e-06, + "loss": 0.4876, + "step": 2327 + }, + { + "epoch": 0.18906846422480306, + "grad_norm": 4.374487244986078, + "learning_rate": 4.675670621210985e-06, + "loss": 0.4851, + "step": 2328 + }, + { + "epoch": 0.18914967920084463, + "grad_norm": 4.378802636321053, + "learning_rate": 4.675346615666834e-06, + "loss": 0.5938, + "step": 2329 + }, + { + "epoch": 0.1892308941768862, + "grad_norm": 6.805865659937306, + "learning_rate": 4.675022459600209e-06, + "loss": 0.5517, + "step": 2330 + }, + { + "epoch": 0.1893121091529278, + "grad_norm": 10.069257270167085, + "learning_rate": 4.674698153033542e-06, + "loss": 0.5793, + "step": 2331 + }, + { + "epoch": 0.1893933241289694, + "grad_norm": 5.002867430656955, + "learning_rate": 4.674373695989272e-06, + "loss": 0.4501, + "step": 2332 + }, + { + "epoch": 0.18947453910501097, + "grad_norm": 6.5558875829671726, + "learning_rate": 4.67404908848985e-06, + "loss": 0.6941, + "step": 2333 + }, + { + "epoch": 0.18955575408105255, + "grad_norm": 3.310181376592309, + "learning_rate": 4.673724330557737e-06, + "loss": 0.5876, + "step": 2334 + }, + { + "epoch": 0.18963696905709412, + "grad_norm": 4.3922451080442615, + "learning_rate": 4.673399422215405e-06, + "loss": 0.6729, + "step": 2335 + }, + { + "epoch": 0.1897181840331357, + "grad_norm": 3.2619584312724124, + "learning_rate": 4.673074363485336e-06, + "loss": 0.6935, + "step": 2336 + }, + { + "epoch": 0.1897993990091773, + "grad_norm": 4.553167577761572, + "learning_rate": 4.672749154390021e-06, + "loss": 0.4633, + "step": 2337 + }, + { + "epoch": 0.18988061398521888, + "grad_norm": 3.149250992164633, + "learning_rate": 4.6724237949519635e-06, + "loss": 0.6547, + "step": 2338 + }, + { + "epoch": 0.18996182896126046, + "grad_norm": 3.8540236289067646, + "learning_rate": 4.672098285193677e-06, + "loss": 0.5031, + "step": 2339 + }, + { + "epoch": 0.19004304393730204, + "grad_norm": 5.029855666317853, + "learning_rate": 4.671772625137685e-06, + "loss": 0.5433, + "step": 2340 + }, + { + "epoch": 0.1901242589133436, + "grad_norm": 4.4089431032158, + "learning_rate": 4.6714468148065215e-06, + "loss": 0.6162, + "step": 2341 + }, + { + "epoch": 0.1902054738893852, + "grad_norm": 4.951136934389378, + "learning_rate": 4.67112085422273e-06, + "loss": 0.5549, + "step": 2342 + }, + { + "epoch": 0.1902866888654268, + "grad_norm": 4.969440488977865, + "learning_rate": 4.6707947434088665e-06, + "loss": 0.5311, + "step": 2343 + }, + { + "epoch": 0.19036790384146837, + "grad_norm": 7.467675141875417, + "learning_rate": 4.670468482387495e-06, + "loss": 0.5161, + "step": 2344 + }, + { + "epoch": 0.19044911881750995, + "grad_norm": 5.817463721744332, + "learning_rate": 4.670142071181192e-06, + "loss": 0.5573, + "step": 2345 + }, + { + "epoch": 0.19053033379355153, + "grad_norm": 3.588261641157244, + "learning_rate": 4.6698155098125435e-06, + "loss": 0.4921, + "step": 2346 + }, + { + "epoch": 0.1906115487695931, + "grad_norm": 5.247075946408668, + "learning_rate": 4.6694887983041434e-06, + "loss": 0.6335, + "step": 2347 + }, + { + "epoch": 0.1906927637456347, + "grad_norm": 5.98092813691066, + "learning_rate": 4.669161936678602e-06, + "loss": 0.6301, + "step": 2348 + }, + { + "epoch": 0.19077397872167629, + "grad_norm": 4.882612458777484, + "learning_rate": 4.668834924958534e-06, + "loss": 0.605, + "step": 2349 + }, + { + "epoch": 0.19085519369771786, + "grad_norm": 4.135928802898827, + "learning_rate": 4.668507763166568e-06, + "loss": 0.7428, + "step": 2350 + }, + { + "epoch": 0.19093640867375944, + "grad_norm": 6.436181992978232, + "learning_rate": 4.668180451325341e-06, + "loss": 0.6098, + "step": 2351 + }, + { + "epoch": 0.19101762364980102, + "grad_norm": 8.974548488637932, + "learning_rate": 4.667852989457502e-06, + "loss": 0.5165, + "step": 2352 + }, + { + "epoch": 0.1910988386258426, + "grad_norm": 4.064278950992003, + "learning_rate": 4.6675253775857096e-06, + "loss": 0.5658, + "step": 2353 + }, + { + "epoch": 0.1911800536018842, + "grad_norm": 4.497922853159247, + "learning_rate": 4.667197615732633e-06, + "loss": 0.6748, + "step": 2354 + }, + { + "epoch": 0.19126126857792577, + "grad_norm": 4.494597527974153, + "learning_rate": 4.66686970392095e-06, + "loss": 0.5488, + "step": 2355 + }, + { + "epoch": 0.19134248355396735, + "grad_norm": 5.505569141881063, + "learning_rate": 4.666541642173352e-06, + "loss": 0.5822, + "step": 2356 + }, + { + "epoch": 0.19142369853000893, + "grad_norm": 3.730344521450571, + "learning_rate": 4.666213430512538e-06, + "loss": 0.634, + "step": 2357 + }, + { + "epoch": 0.1915049135060505, + "grad_norm": 5.750851724066535, + "learning_rate": 4.66588506896122e-06, + "loss": 0.5967, + "step": 2358 + }, + { + "epoch": 0.1915861284820921, + "grad_norm": 5.307685486956423, + "learning_rate": 4.665556557542118e-06, + "loss": 0.7144, + "step": 2359 + }, + { + "epoch": 0.1916673434581337, + "grad_norm": 5.143047388945135, + "learning_rate": 4.6652278962779625e-06, + "loss": 0.6383, + "step": 2360 + }, + { + "epoch": 0.19174855843417526, + "grad_norm": 4.999760195133255, + "learning_rate": 4.664899085191496e-06, + "loss": 0.6477, + "step": 2361 + }, + { + "epoch": 0.19182977341021684, + "grad_norm": 7.787092748476295, + "learning_rate": 4.664570124305472e-06, + "loss": 0.5304, + "step": 2362 + }, + { + "epoch": 0.19191098838625842, + "grad_norm": 4.318743886643792, + "learning_rate": 4.66424101364265e-06, + "loss": 0.4677, + "step": 2363 + }, + { + "epoch": 0.1919922033623, + "grad_norm": 4.9984105511208545, + "learning_rate": 4.663911753225803e-06, + "loss": 0.5786, + "step": 2364 + }, + { + "epoch": 0.1920734183383416, + "grad_norm": 3.7096978585490556, + "learning_rate": 4.663582343077716e-06, + "loss": 0.4654, + "step": 2365 + }, + { + "epoch": 0.19215463331438318, + "grad_norm": 5.0966976399052815, + "learning_rate": 4.663252783221182e-06, + "loss": 0.4797, + "step": 2366 + }, + { + "epoch": 0.19223584829042475, + "grad_norm": 11.273482637934793, + "learning_rate": 4.662923073679003e-06, + "loss": 0.5612, + "step": 2367 + }, + { + "epoch": 0.19231706326646633, + "grad_norm": 5.498434540946209, + "learning_rate": 4.662593214473995e-06, + "loss": 0.6027, + "step": 2368 + }, + { + "epoch": 0.1923982782425079, + "grad_norm": 3.859479042229787, + "learning_rate": 4.662263205628983e-06, + "loss": 0.7433, + "step": 2369 + }, + { + "epoch": 0.1924794932185495, + "grad_norm": 4.418822540532742, + "learning_rate": 4.661933047166799e-06, + "loss": 0.4785, + "step": 2370 + }, + { + "epoch": 0.1925607081945911, + "grad_norm": 5.051803928696753, + "learning_rate": 4.661602739110291e-06, + "loss": 0.5654, + "step": 2371 + }, + { + "epoch": 0.19264192317063267, + "grad_norm": 4.0841298739845895, + "learning_rate": 4.661272281482313e-06, + "loss": 0.7034, + "step": 2372 + }, + { + "epoch": 0.19272313814667424, + "grad_norm": 7.928386324620138, + "learning_rate": 4.660941674305732e-06, + "loss": 0.57, + "step": 2373 + }, + { + "epoch": 0.19280435312271582, + "grad_norm": 6.641168636182495, + "learning_rate": 4.660610917603423e-06, + "loss": 0.4108, + "step": 2374 + }, + { + "epoch": 0.1928855680987574, + "grad_norm": 7.233453638420033, + "learning_rate": 4.6602800113982746e-06, + "loss": 0.5886, + "step": 2375 + }, + { + "epoch": 0.192966783074799, + "grad_norm": 5.3241909788384385, + "learning_rate": 4.659948955713181e-06, + "loss": 0.6516, + "step": 2376 + }, + { + "epoch": 0.19304799805084058, + "grad_norm": 6.517892337897202, + "learning_rate": 4.659617750571052e-06, + "loss": 0.5341, + "step": 2377 + }, + { + "epoch": 0.19312921302688216, + "grad_norm": 8.78691467540056, + "learning_rate": 4.659286395994806e-06, + "loss": 0.7056, + "step": 2378 + }, + { + "epoch": 0.19321042800292373, + "grad_norm": 5.510965559445886, + "learning_rate": 4.658954892007367e-06, + "loss": 0.4839, + "step": 2379 + }, + { + "epoch": 0.1932916429789653, + "grad_norm": 5.065066805659556, + "learning_rate": 4.658623238631675e-06, + "loss": 0.5688, + "step": 2380 + }, + { + "epoch": 0.19337285795500692, + "grad_norm": 4.049904522221411, + "learning_rate": 4.658291435890681e-06, + "loss": 0.5662, + "step": 2381 + }, + { + "epoch": 0.1934540729310485, + "grad_norm": 5.778500245146091, + "learning_rate": 4.657959483807342e-06, + "loss": 0.52, + "step": 2382 + }, + { + "epoch": 0.19353528790709007, + "grad_norm": 4.275606455147308, + "learning_rate": 4.657627382404627e-06, + "loss": 0.4357, + "step": 2383 + }, + { + "epoch": 0.19361650288313165, + "grad_norm": 4.9185646028031345, + "learning_rate": 4.657295131705516e-06, + "loss": 0.6067, + "step": 2384 + }, + { + "epoch": 0.19369771785917322, + "grad_norm": 4.858322104090867, + "learning_rate": 4.6569627317329995e-06, + "loss": 0.58, + "step": 2385 + }, + { + "epoch": 0.1937789328352148, + "grad_norm": 5.010825230384572, + "learning_rate": 4.656630182510078e-06, + "loss": 0.5096, + "step": 2386 + }, + { + "epoch": 0.1938601478112564, + "grad_norm": 4.971344756018251, + "learning_rate": 4.656297484059761e-06, + "loss": 0.5781, + "step": 2387 + }, + { + "epoch": 0.19394136278729798, + "grad_norm": 5.7140608376861906, + "learning_rate": 4.655964636405071e-06, + "loss": 0.5301, + "step": 2388 + }, + { + "epoch": 0.19402257776333956, + "grad_norm": 5.25952348586539, + "learning_rate": 4.655631639569037e-06, + "loss": 0.5345, + "step": 2389 + }, + { + "epoch": 0.19410379273938114, + "grad_norm": 4.858225371501829, + "learning_rate": 4.655298493574704e-06, + "loss": 0.5165, + "step": 2390 + }, + { + "epoch": 0.19418500771542271, + "grad_norm": 7.9815798526373145, + "learning_rate": 4.65496519844512e-06, + "loss": 0.5336, + "step": 2391 + }, + { + "epoch": 0.19426622269146432, + "grad_norm": 6.221631927317744, + "learning_rate": 4.654631754203351e-06, + "loss": 0.5233, + "step": 2392 + }, + { + "epoch": 0.1943474376675059, + "grad_norm": 5.199395166051189, + "learning_rate": 4.6542981608724665e-06, + "loss": 0.5399, + "step": 2393 + }, + { + "epoch": 0.19442865264354747, + "grad_norm": 5.85510080538778, + "learning_rate": 4.6539644184755515e-06, + "loss": 0.6658, + "step": 2394 + }, + { + "epoch": 0.19450986761958905, + "grad_norm": 5.187441178217061, + "learning_rate": 4.6536305270356975e-06, + "loss": 0.5119, + "step": 2395 + }, + { + "epoch": 0.19459108259563063, + "grad_norm": 4.41810789985307, + "learning_rate": 4.65329648657601e-06, + "loss": 0.3997, + "step": 2396 + }, + { + "epoch": 0.1946722975716722, + "grad_norm": 5.73516894480618, + "learning_rate": 4.652962297119601e-06, + "loss": 0.7692, + "step": 2397 + }, + { + "epoch": 0.1947535125477138, + "grad_norm": 7.959466481454268, + "learning_rate": 4.652627958689596e-06, + "loss": 0.4979, + "step": 2398 + }, + { + "epoch": 0.19483472752375539, + "grad_norm": 5.346996268227039, + "learning_rate": 4.65229347130913e-06, + "loss": 0.6685, + "step": 2399 + }, + { + "epoch": 0.19491594249979696, + "grad_norm": 14.105527004476242, + "learning_rate": 4.651958835001345e-06, + "loss": 0.5679, + "step": 2400 + }, + { + "epoch": 0.19499715747583854, + "grad_norm": 5.030593581231049, + "learning_rate": 4.651624049789397e-06, + "loss": 0.4162, + "step": 2401 + }, + { + "epoch": 0.19507837245188012, + "grad_norm": 10.896072520758361, + "learning_rate": 4.651289115696454e-06, + "loss": 0.479, + "step": 2402 + }, + { + "epoch": 0.19515958742792172, + "grad_norm": 4.958720750747442, + "learning_rate": 4.650954032745689e-06, + "loss": 0.5053, + "step": 2403 + }, + { + "epoch": 0.1952408024039633, + "grad_norm": 3.974794068624415, + "learning_rate": 4.6506188009602885e-06, + "loss": 0.574, + "step": 2404 + }, + { + "epoch": 0.19532201738000488, + "grad_norm": 8.24891618792213, + "learning_rate": 4.65028342036345e-06, + "loss": 0.514, + "step": 2405 + }, + { + "epoch": 0.19540323235604645, + "grad_norm": 4.335525878369569, + "learning_rate": 4.6499478909783764e-06, + "loss": 0.427, + "step": 2406 + }, + { + "epoch": 0.19548444733208803, + "grad_norm": 6.424849313389938, + "learning_rate": 4.649612212828289e-06, + "loss": 0.4575, + "step": 2407 + }, + { + "epoch": 0.1955656623081296, + "grad_norm": 4.535461924057379, + "learning_rate": 4.6492763859364134e-06, + "loss": 0.6179, + "step": 2408 + }, + { + "epoch": 0.1956468772841712, + "grad_norm": 7.409680541443817, + "learning_rate": 4.648940410325987e-06, + "loss": 0.6637, + "step": 2409 + }, + { + "epoch": 0.1957280922602128, + "grad_norm": 4.6863434003973845, + "learning_rate": 4.648604286020256e-06, + "loss": 0.5738, + "step": 2410 + }, + { + "epoch": 0.19580930723625437, + "grad_norm": 5.544427789484983, + "learning_rate": 4.64826801304248e-06, + "loss": 0.4905, + "step": 2411 + }, + { + "epoch": 0.19589052221229594, + "grad_norm": 6.119211167714467, + "learning_rate": 4.647931591415929e-06, + "loss": 0.7071, + "step": 2412 + }, + { + "epoch": 0.19597173718833752, + "grad_norm": 3.1340827150045714, + "learning_rate": 4.647595021163878e-06, + "loss": 0.6431, + "step": 2413 + }, + { + "epoch": 0.19605295216437912, + "grad_norm": 5.376283662779039, + "learning_rate": 4.647258302309618e-06, + "loss": 0.6288, + "step": 2414 + }, + { + "epoch": 0.1961341671404207, + "grad_norm": 6.177768802745785, + "learning_rate": 4.646921434876447e-06, + "loss": 0.4535, + "step": 2415 + }, + { + "epoch": 0.19621538211646228, + "grad_norm": 12.088856729370683, + "learning_rate": 4.646584418887675e-06, + "loss": 0.6907, + "step": 2416 + }, + { + "epoch": 0.19629659709250386, + "grad_norm": 8.938823511128325, + "learning_rate": 4.646247254366622e-06, + "loss": 0.5471, + "step": 2417 + }, + { + "epoch": 0.19637781206854543, + "grad_norm": 6.009413684644707, + "learning_rate": 4.645909941336619e-06, + "loss": 0.6261, + "step": 2418 + }, + { + "epoch": 0.196459027044587, + "grad_norm": 5.0207649299589034, + "learning_rate": 4.645572479821004e-06, + "loss": 0.5226, + "step": 2419 + }, + { + "epoch": 0.1965402420206286, + "grad_norm": 5.8465718027493025, + "learning_rate": 4.645234869843129e-06, + "loss": 0.7304, + "step": 2420 + }, + { + "epoch": 0.1966214569966702, + "grad_norm": 4.44988664177105, + "learning_rate": 4.644897111426355e-06, + "loss": 0.4947, + "step": 2421 + }, + { + "epoch": 0.19670267197271177, + "grad_norm": 4.080957228552931, + "learning_rate": 4.6445592045940515e-06, + "loss": 0.6638, + "step": 2422 + }, + { + "epoch": 0.19678388694875334, + "grad_norm": 6.67424212054808, + "learning_rate": 4.644221149369602e-06, + "loss": 0.5218, + "step": 2423 + }, + { + "epoch": 0.19686510192479492, + "grad_norm": 4.3515760143355084, + "learning_rate": 4.643882945776397e-06, + "loss": 0.5116, + "step": 2424 + }, + { + "epoch": 0.19694631690083653, + "grad_norm": 4.665310299217762, + "learning_rate": 4.6435445938378375e-06, + "loss": 0.5858, + "step": 2425 + }, + { + "epoch": 0.1970275318768781, + "grad_norm": 3.4118287749102367, + "learning_rate": 4.643206093577338e-06, + "loss": 0.3923, + "step": 2426 + }, + { + "epoch": 0.19710874685291968, + "grad_norm": 4.722707242292216, + "learning_rate": 4.642867445018318e-06, + "loss": 0.5732, + "step": 2427 + }, + { + "epoch": 0.19718996182896126, + "grad_norm": 4.984185032243032, + "learning_rate": 4.642528648184213e-06, + "loss": 0.4799, + "step": 2428 + }, + { + "epoch": 0.19727117680500283, + "grad_norm": 4.175601292098027, + "learning_rate": 4.642189703098466e-06, + "loss": 0.5171, + "step": 2429 + }, + { + "epoch": 0.1973523917810444, + "grad_norm": 8.539390583313239, + "learning_rate": 4.6418506097845264e-06, + "loss": 0.7051, + "step": 2430 + }, + { + "epoch": 0.19743360675708602, + "grad_norm": 6.857246634219236, + "learning_rate": 4.641511368265861e-06, + "loss": 0.491, + "step": 2431 + }, + { + "epoch": 0.1975148217331276, + "grad_norm": 8.537762993370494, + "learning_rate": 4.641171978565943e-06, + "loss": 0.7512, + "step": 2432 + }, + { + "epoch": 0.19759603670916917, + "grad_norm": 4.408249935592557, + "learning_rate": 4.640832440708256e-06, + "loss": 0.6716, + "step": 2433 + }, + { + "epoch": 0.19767725168521075, + "grad_norm": 5.109570036404427, + "learning_rate": 4.640492754716294e-06, + "loss": 0.5133, + "step": 2434 + }, + { + "epoch": 0.19775846666125232, + "grad_norm": 4.736765394511924, + "learning_rate": 4.640152920613562e-06, + "loss": 0.9746, + "step": 2435 + }, + { + "epoch": 0.19783968163729393, + "grad_norm": 5.063603609846527, + "learning_rate": 4.639812938423574e-06, + "loss": 0.4769, + "step": 2436 + }, + { + "epoch": 0.1979208966133355, + "grad_norm": 5.272415199098991, + "learning_rate": 4.639472808169857e-06, + "loss": 0.5295, + "step": 2437 + }, + { + "epoch": 0.19800211158937708, + "grad_norm": 8.257542098149266, + "learning_rate": 4.639132529875943e-06, + "loss": 0.6467, + "step": 2438 + }, + { + "epoch": 0.19808332656541866, + "grad_norm": 5.3139070377916315, + "learning_rate": 4.63879210356538e-06, + "loss": 0.5433, + "step": 2439 + }, + { + "epoch": 0.19816454154146024, + "grad_norm": 6.841621560493213, + "learning_rate": 4.6384515292617226e-06, + "loss": 0.5335, + "step": 2440 + }, + { + "epoch": 0.19824575651750181, + "grad_norm": 8.402046267183342, + "learning_rate": 4.6381108069885376e-06, + "loss": 0.5433, + "step": 2441 + }, + { + "epoch": 0.19832697149354342, + "grad_norm": 3.6536735919997674, + "learning_rate": 4.6377699367694e-06, + "loss": 0.5768, + "step": 2442 + }, + { + "epoch": 0.198408186469585, + "grad_norm": 6.961987557428991, + "learning_rate": 4.637428918627896e-06, + "loss": 0.5184, + "step": 2443 + }, + { + "epoch": 0.19848940144562657, + "grad_norm": 5.119773267513261, + "learning_rate": 4.637087752587624e-06, + "loss": 0.6723, + "step": 2444 + }, + { + "epoch": 0.19857061642166815, + "grad_norm": 5.37579569573017, + "learning_rate": 4.636746438672189e-06, + "loss": 0.4536, + "step": 2445 + }, + { + "epoch": 0.19865183139770973, + "grad_norm": 4.8653070857140825, + "learning_rate": 4.63640497690521e-06, + "loss": 0.7141, + "step": 2446 + }, + { + "epoch": 0.19873304637375133, + "grad_norm": 5.213638271403077, + "learning_rate": 4.636063367310313e-06, + "loss": 0.4026, + "step": 2447 + }, + { + "epoch": 0.1988142613497929, + "grad_norm": 4.022917999876865, + "learning_rate": 4.635721609911137e-06, + "loss": 0.8184, + "step": 2448 + }, + { + "epoch": 0.19889547632583449, + "grad_norm": 5.721442337840194, + "learning_rate": 4.635379704731327e-06, + "loss": 0.5173, + "step": 2449 + }, + { + "epoch": 0.19897669130187606, + "grad_norm": 4.746208323114987, + "learning_rate": 4.635037651794544e-06, + "loss": 0.6091, + "step": 2450 + }, + { + "epoch": 0.19905790627791764, + "grad_norm": 3.7466714374073393, + "learning_rate": 4.634695451124454e-06, + "loss": 0.5977, + "step": 2451 + }, + { + "epoch": 0.19913912125395922, + "grad_norm": 5.06981488833001, + "learning_rate": 4.634353102744737e-06, + "loss": 0.4463, + "step": 2452 + }, + { + "epoch": 0.19922033623000082, + "grad_norm": 4.625187502181156, + "learning_rate": 4.634010606679081e-06, + "loss": 0.5105, + "step": 2453 + }, + { + "epoch": 0.1993015512060424, + "grad_norm": 4.590440891315889, + "learning_rate": 4.633667962951186e-06, + "loss": 0.5703, + "step": 2454 + }, + { + "epoch": 0.19938276618208398, + "grad_norm": 8.282884396118154, + "learning_rate": 4.6333251715847595e-06, + "loss": 0.5573, + "step": 2455 + }, + { + "epoch": 0.19946398115812555, + "grad_norm": 5.99824415434856, + "learning_rate": 4.6329822326035214e-06, + "loss": 0.6637, + "step": 2456 + }, + { + "epoch": 0.19954519613416713, + "grad_norm": 4.69181027886392, + "learning_rate": 4.632639146031201e-06, + "loss": 0.548, + "step": 2457 + }, + { + "epoch": 0.19962641111020873, + "grad_norm": 4.774146261889797, + "learning_rate": 4.63229591189154e-06, + "loss": 0.5267, + "step": 2458 + }, + { + "epoch": 0.1997076260862503, + "grad_norm": 4.0388362634362505, + "learning_rate": 4.631952530208286e-06, + "loss": 0.6004, + "step": 2459 + }, + { + "epoch": 0.1997888410622919, + "grad_norm": 11.372135655939333, + "learning_rate": 4.6316090010052006e-06, + "loss": 0.5377, + "step": 2460 + }, + { + "epoch": 0.19987005603833347, + "grad_norm": 8.258738666338818, + "learning_rate": 4.631265324306053e-06, + "loss": 0.6255, + "step": 2461 + }, + { + "epoch": 0.19995127101437504, + "grad_norm": 4.838265403873367, + "learning_rate": 4.630921500134625e-06, + "loss": 0.513, + "step": 2462 + }, + { + "epoch": 0.20003248599041662, + "grad_norm": 6.210942277246636, + "learning_rate": 4.630577528514707e-06, + "loss": 0.5705, + "step": 2463 + }, + { + "epoch": 0.20011370096645822, + "grad_norm": 4.867014588828138, + "learning_rate": 4.6302334094701e-06, + "loss": 0.7147, + "step": 2464 + }, + { + "epoch": 0.2001949159424998, + "grad_norm": 3.4107828105330116, + "learning_rate": 4.629889143024615e-06, + "loss": 0.4722, + "step": 2465 + }, + { + "epoch": 0.20027613091854138, + "grad_norm": 6.2252845176976574, + "learning_rate": 4.6295447292020735e-06, + "loss": 0.5165, + "step": 2466 + }, + { + "epoch": 0.20035734589458296, + "grad_norm": 5.732379914152118, + "learning_rate": 4.629200168026307e-06, + "loss": 0.4561, + "step": 2467 + }, + { + "epoch": 0.20043856087062453, + "grad_norm": 6.463671611218234, + "learning_rate": 4.6288554595211575e-06, + "loss": 0.5683, + "step": 2468 + }, + { + "epoch": 0.20051977584666614, + "grad_norm": 6.359868474828741, + "learning_rate": 4.628510603710478e-06, + "loss": 0.5942, + "step": 2469 + }, + { + "epoch": 0.20060099082270771, + "grad_norm": 5.920271897280425, + "learning_rate": 4.628165600618129e-06, + "loss": 0.5336, + "step": 2470 + }, + { + "epoch": 0.2006822057987493, + "grad_norm": 4.578491403510653, + "learning_rate": 4.627820450267984e-06, + "loss": 0.4962, + "step": 2471 + }, + { + "epoch": 0.20076342077479087, + "grad_norm": 3.829851338031233, + "learning_rate": 4.627475152683924e-06, + "loss": 0.554, + "step": 2472 + }, + { + "epoch": 0.20084463575083245, + "grad_norm": 5.016784542180406, + "learning_rate": 4.627129707889843e-06, + "loss": 0.7465, + "step": 2473 + }, + { + "epoch": 0.20092585072687402, + "grad_norm": 7.808176500440273, + "learning_rate": 4.626784115909645e-06, + "loss": 0.5234, + "step": 2474 + }, + { + "epoch": 0.20100706570291563, + "grad_norm": 3.777859086820638, + "learning_rate": 4.626438376767241e-06, + "loss": 0.7113, + "step": 2475 + }, + { + "epoch": 0.2010882806789572, + "grad_norm": 6.167573513223816, + "learning_rate": 4.626092490486557e-06, + "loss": 0.5625, + "step": 2476 + }, + { + "epoch": 0.20116949565499878, + "grad_norm": 4.609400429566531, + "learning_rate": 4.6257464570915235e-06, + "loss": 0.39, + "step": 2477 + }, + { + "epoch": 0.20125071063104036, + "grad_norm": 4.207144293674372, + "learning_rate": 4.625400276606086e-06, + "loss": 0.4698, + "step": 2478 + }, + { + "epoch": 0.20133192560708194, + "grad_norm": 4.988407235613174, + "learning_rate": 4.625053949054198e-06, + "loss": 0.6785, + "step": 2479 + }, + { + "epoch": 0.20141314058312354, + "grad_norm": 3.9367620435534314, + "learning_rate": 4.6247074744598234e-06, + "loss": 0.7332, + "step": 2480 + }, + { + "epoch": 0.20149435555916512, + "grad_norm": 3.9271827060994564, + "learning_rate": 4.6243608528469356e-06, + "loss": 0.489, + "step": 2481 + }, + { + "epoch": 0.2015755705352067, + "grad_norm": 5.02867311978453, + "learning_rate": 4.6240140842395205e-06, + "loss": 0.5228, + "step": 2482 + }, + { + "epoch": 0.20165678551124827, + "grad_norm": 3.9679711814084406, + "learning_rate": 4.623667168661572e-06, + "loss": 0.8265, + "step": 2483 + }, + { + "epoch": 0.20173800048728985, + "grad_norm": 4.395768395942478, + "learning_rate": 4.623320106137095e-06, + "loss": 0.5777, + "step": 2484 + }, + { + "epoch": 0.20181921546333143, + "grad_norm": 4.203898879590677, + "learning_rate": 4.6229728966901036e-06, + "loss": 0.7345, + "step": 2485 + }, + { + "epoch": 0.20190043043937303, + "grad_norm": 3.5440873158516673, + "learning_rate": 4.622625540344623e-06, + "loss": 0.6419, + "step": 2486 + }, + { + "epoch": 0.2019816454154146, + "grad_norm": 2.545901550496782, + "learning_rate": 4.62227803712469e-06, + "loss": 0.6928, + "step": 2487 + }, + { + "epoch": 0.20206286039145618, + "grad_norm": 4.89784077750463, + "learning_rate": 4.621930387054349e-06, + "loss": 0.5345, + "step": 2488 + }, + { + "epoch": 0.20214407536749776, + "grad_norm": 5.15304789976238, + "learning_rate": 4.621582590157654e-06, + "loss": 0.5168, + "step": 2489 + }, + { + "epoch": 0.20222529034353934, + "grad_norm": 3.6944859365300062, + "learning_rate": 4.621234646458673e-06, + "loss": 0.6469, + "step": 2490 + }, + { + "epoch": 0.20230650531958094, + "grad_norm": 4.253884701651634, + "learning_rate": 4.6208865559814805e-06, + "loss": 0.6652, + "step": 2491 + }, + { + "epoch": 0.20238772029562252, + "grad_norm": 5.40192379256261, + "learning_rate": 4.620538318750163e-06, + "loss": 0.5256, + "step": 2492 + }, + { + "epoch": 0.2024689352716641, + "grad_norm": 3.4100834853157704, + "learning_rate": 4.620189934788817e-06, + "loss": 0.6897, + "step": 2493 + }, + { + "epoch": 0.20255015024770567, + "grad_norm": 3.8199295718971102, + "learning_rate": 4.6198414041215484e-06, + "loss": 0.5001, + "step": 2494 + }, + { + "epoch": 0.20263136522374725, + "grad_norm": 3.145895750195355, + "learning_rate": 4.619492726772473e-06, + "loss": 0.6773, + "step": 2495 + }, + { + "epoch": 0.20271258019978883, + "grad_norm": 4.373203705255476, + "learning_rate": 4.619143902765719e-06, + "loss": 0.6648, + "step": 2496 + }, + { + "epoch": 0.20279379517583043, + "grad_norm": 6.177848983200393, + "learning_rate": 4.618794932125422e-06, + "loss": 0.4791, + "step": 2497 + }, + { + "epoch": 0.202875010151872, + "grad_norm": 3.952402300031322, + "learning_rate": 4.61844581487573e-06, + "loss": 0.8067, + "step": 2498 + }, + { + "epoch": 0.2029562251279136, + "grad_norm": 3.670343152102567, + "learning_rate": 4.618096551040798e-06, + "loss": 0.502, + "step": 2499 + }, + { + "epoch": 0.20303744010395516, + "grad_norm": 4.359414649768362, + "learning_rate": 4.617747140644796e-06, + "loss": 0.5255, + "step": 2500 + }, + { + "epoch": 0.20311865507999674, + "grad_norm": 6.653021158021161, + "learning_rate": 4.617397583711899e-06, + "loss": 0.6263, + "step": 2501 + }, + { + "epoch": 0.20319987005603835, + "grad_norm": 4.748016005088914, + "learning_rate": 4.617047880266295e-06, + "loss": 0.6699, + "step": 2502 + }, + { + "epoch": 0.20328108503207992, + "grad_norm": 3.35437246059798, + "learning_rate": 4.616698030332183e-06, + "loss": 0.4972, + "step": 2503 + }, + { + "epoch": 0.2033623000081215, + "grad_norm": 14.870459669223077, + "learning_rate": 4.616348033933769e-06, + "loss": 0.5515, + "step": 2504 + }, + { + "epoch": 0.20344351498416308, + "grad_norm": 5.088873364458127, + "learning_rate": 4.615997891095272e-06, + "loss": 0.6022, + "step": 2505 + }, + { + "epoch": 0.20352472996020465, + "grad_norm": 3.7990841422951447, + "learning_rate": 4.6156476018409204e-06, + "loss": 0.6326, + "step": 2506 + }, + { + "epoch": 0.20360594493624623, + "grad_norm": 4.333396061155222, + "learning_rate": 4.61529716619495e-06, + "loss": 0.6267, + "step": 2507 + }, + { + "epoch": 0.20368715991228784, + "grad_norm": 4.685302979204798, + "learning_rate": 4.614946584181612e-06, + "loss": 0.6191, + "step": 2508 + }, + { + "epoch": 0.2037683748883294, + "grad_norm": 4.124779647332827, + "learning_rate": 4.614595855825164e-06, + "loss": 0.4998, + "step": 2509 + }, + { + "epoch": 0.203849589864371, + "grad_norm": 4.501692024239236, + "learning_rate": 4.6142449811498725e-06, + "loss": 0.5782, + "step": 2510 + }, + { + "epoch": 0.20393080484041257, + "grad_norm": 4.707923879718902, + "learning_rate": 4.613893960180018e-06, + "loss": 0.5297, + "step": 2511 + }, + { + "epoch": 0.20401201981645414, + "grad_norm": 7.3215672264551035, + "learning_rate": 4.613542792939891e-06, + "loss": 0.7369, + "step": 2512 + }, + { + "epoch": 0.20409323479249575, + "grad_norm": 5.2423338527201615, + "learning_rate": 4.613191479453787e-06, + "loss": 0.8081, + "step": 2513 + }, + { + "epoch": 0.20417444976853732, + "grad_norm": 5.115816586092132, + "learning_rate": 4.612840019746016e-06, + "loss": 0.67, + "step": 2514 + }, + { + "epoch": 0.2042556647445789, + "grad_norm": 3.4887642989488348, + "learning_rate": 4.612488413840899e-06, + "loss": 0.8183, + "step": 2515 + }, + { + "epoch": 0.20433687972062048, + "grad_norm": 5.202669123420781, + "learning_rate": 4.6121366617627635e-06, + "loss": 0.4956, + "step": 2516 + }, + { + "epoch": 0.20441809469666206, + "grad_norm": 4.783952508433326, + "learning_rate": 4.6117847635359494e-06, + "loss": 0.5427, + "step": 2517 + }, + { + "epoch": 0.20449930967270363, + "grad_norm": 3.8836746569798, + "learning_rate": 4.611432719184806e-06, + "loss": 0.7236, + "step": 2518 + }, + { + "epoch": 0.20458052464874524, + "grad_norm": 4.277705470777233, + "learning_rate": 4.611080528733693e-06, + "loss": 0.5379, + "step": 2519 + }, + { + "epoch": 0.20466173962478681, + "grad_norm": 4.597920469594989, + "learning_rate": 4.6107281922069805e-06, + "loss": 0.6815, + "step": 2520 + }, + { + "epoch": 0.2047429546008284, + "grad_norm": 5.510027910722754, + "learning_rate": 4.610375709629047e-06, + "loss": 0.5213, + "step": 2521 + }, + { + "epoch": 0.20482416957686997, + "grad_norm": 26.571846192694977, + "learning_rate": 4.610023081024284e-06, + "loss": 0.6494, + "step": 2522 + }, + { + "epoch": 0.20490538455291155, + "grad_norm": 7.215116215564211, + "learning_rate": 4.6096703064170915e-06, + "loss": 0.5739, + "step": 2523 + }, + { + "epoch": 0.20498659952895315, + "grad_norm": 5.568682872058116, + "learning_rate": 4.609317385831879e-06, + "loss": 0.7281, + "step": 2524 + }, + { + "epoch": 0.20506781450499473, + "grad_norm": 7.457927768406811, + "learning_rate": 4.608964319293066e-06, + "loss": 0.6219, + "step": 2525 + }, + { + "epoch": 0.2051490294810363, + "grad_norm": 4.3162604433484155, + "learning_rate": 4.6086111068250834e-06, + "loss": 0.5108, + "step": 2526 + }, + { + "epoch": 0.20523024445707788, + "grad_norm": 5.339053755707136, + "learning_rate": 4.608257748452372e-06, + "loss": 0.4834, + "step": 2527 + }, + { + "epoch": 0.20531145943311946, + "grad_norm": 3.319564859771241, + "learning_rate": 4.607904244199384e-06, + "loss": 0.6459, + "step": 2528 + }, + { + "epoch": 0.20539267440916104, + "grad_norm": 3.3949377280505657, + "learning_rate": 4.6075505940905765e-06, + "loss": 0.8157, + "step": 2529 + }, + { + "epoch": 0.20547388938520264, + "grad_norm": 4.846601443639954, + "learning_rate": 4.607196798150423e-06, + "loss": 0.6862, + "step": 2530 + }, + { + "epoch": 0.20555510436124422, + "grad_norm": 3.634653145365433, + "learning_rate": 4.606842856403402e-06, + "loss": 0.4921, + "step": 2531 + }, + { + "epoch": 0.2056363193372858, + "grad_norm": 3.453187944742966, + "learning_rate": 4.6064887688740065e-06, + "loss": 0.7135, + "step": 2532 + }, + { + "epoch": 0.20571753431332737, + "grad_norm": 6.8028434368171, + "learning_rate": 4.606134535586737e-06, + "loss": 0.7577, + "step": 2533 + }, + { + "epoch": 0.20579874928936895, + "grad_norm": 5.81957789301719, + "learning_rate": 4.605780156566103e-06, + "loss": 0.6454, + "step": 2534 + }, + { + "epoch": 0.20587996426541055, + "grad_norm": 3.726486607180334, + "learning_rate": 4.6054256318366275e-06, + "loss": 0.5258, + "step": 2535 + }, + { + "epoch": 0.20596117924145213, + "grad_norm": 6.553307065329235, + "learning_rate": 4.6050709614228416e-06, + "loss": 0.6497, + "step": 2536 + }, + { + "epoch": 0.2060423942174937, + "grad_norm": 6.182500794690838, + "learning_rate": 4.604716145349285e-06, + "loss": 0.5451, + "step": 2537 + }, + { + "epoch": 0.20612360919353528, + "grad_norm": 3.621812427485916, + "learning_rate": 4.604361183640511e-06, + "loss": 0.5455, + "step": 2538 + }, + { + "epoch": 0.20620482416957686, + "grad_norm": 6.106652776271362, + "learning_rate": 4.60400607632108e-06, + "loss": 0.5335, + "step": 2539 + }, + { + "epoch": 0.20628603914561844, + "grad_norm": 8.175917425138248, + "learning_rate": 4.603650823415563e-06, + "loss": 0.5764, + "step": 2540 + }, + { + "epoch": 0.20636725412166004, + "grad_norm": 4.015447399797649, + "learning_rate": 4.603295424948544e-06, + "loss": 0.514, + "step": 2541 + }, + { + "epoch": 0.20644846909770162, + "grad_norm": 4.883808409792754, + "learning_rate": 4.602939880944612e-06, + "loss": 0.5623, + "step": 2542 + }, + { + "epoch": 0.2065296840737432, + "grad_norm": 3.0577448372785745, + "learning_rate": 4.6025841914283705e-06, + "loss": 0.5176, + "step": 2543 + }, + { + "epoch": 0.20661089904978477, + "grad_norm": 3.73178717833002, + "learning_rate": 4.602228356424431e-06, + "loss": 0.5816, + "step": 2544 + }, + { + "epoch": 0.20669211402582635, + "grad_norm": 3.105000510085083, + "learning_rate": 4.601872375957414e-06, + "loss": 0.7313, + "step": 2545 + }, + { + "epoch": 0.20677332900186796, + "grad_norm": 5.053370814100823, + "learning_rate": 4.601516250051954e-06, + "loss": 0.4578, + "step": 2546 + }, + { + "epoch": 0.20685454397790953, + "grad_norm": 3.758806458666759, + "learning_rate": 4.601159978732691e-06, + "loss": 0.5377, + "step": 2547 + }, + { + "epoch": 0.2069357589539511, + "grad_norm": 3.486408365033022, + "learning_rate": 4.600803562024277e-06, + "loss": 0.6031, + "step": 2548 + }, + { + "epoch": 0.2070169739299927, + "grad_norm": 4.0286968600378925, + "learning_rate": 4.6004469999513755e-06, + "loss": 0.6561, + "step": 2549 + }, + { + "epoch": 0.20709818890603426, + "grad_norm": 4.653795014118944, + "learning_rate": 4.600090292538658e-06, + "loss": 0.5098, + "step": 2550 + }, + { + "epoch": 0.20717940388207584, + "grad_norm": 4.008738162793329, + "learning_rate": 4.599733439810807e-06, + "loss": 0.596, + "step": 2551 + }, + { + "epoch": 0.20726061885811745, + "grad_norm": 6.379187521913423, + "learning_rate": 4.5993764417925145e-06, + "loss": 0.4166, + "step": 2552 + }, + { + "epoch": 0.20734183383415902, + "grad_norm": 5.162913747178339, + "learning_rate": 4.599019298508482e-06, + "loss": 0.6534, + "step": 2553 + }, + { + "epoch": 0.2074230488102006, + "grad_norm": 4.626210052826402, + "learning_rate": 4.598662009983424e-06, + "loss": 0.459, + "step": 2554 + }, + { + "epoch": 0.20750426378624218, + "grad_norm": 4.343058752050401, + "learning_rate": 4.598304576242063e-06, + "loss": 0.5086, + "step": 2555 + }, + { + "epoch": 0.20758547876228375, + "grad_norm": 7.300206881482361, + "learning_rate": 4.597946997309129e-06, + "loss": 0.5035, + "step": 2556 + }, + { + "epoch": 0.20766669373832536, + "grad_norm": 3.0899015840363075, + "learning_rate": 4.597589273209366e-06, + "loss": 0.5187, + "step": 2557 + }, + { + "epoch": 0.20774790871436694, + "grad_norm": 6.052813956883026, + "learning_rate": 4.597231403967527e-06, + "loss": 0.5379, + "step": 2558 + }, + { + "epoch": 0.2078291236904085, + "grad_norm": 5.517294439794208, + "learning_rate": 4.5968733896083745e-06, + "loss": 0.8438, + "step": 2559 + }, + { + "epoch": 0.2079103386664501, + "grad_norm": 4.128375503613039, + "learning_rate": 4.59651523015668e-06, + "loss": 0.6151, + "step": 2560 + }, + { + "epoch": 0.20799155364249167, + "grad_norm": 6.118250286731413, + "learning_rate": 4.5961569256372285e-06, + "loss": 0.5982, + "step": 2561 + }, + { + "epoch": 0.20807276861853324, + "grad_norm": 6.553871079659764, + "learning_rate": 4.595798476074811e-06, + "loss": 0.4833, + "step": 2562 + }, + { + "epoch": 0.20815398359457485, + "grad_norm": 5.380395025356601, + "learning_rate": 4.59543988149423e-06, + "loss": 0.506, + "step": 2563 + }, + { + "epoch": 0.20823519857061643, + "grad_norm": 5.37381105799694, + "learning_rate": 4.595081141920301e-06, + "loss": 0.4393, + "step": 2564 + }, + { + "epoch": 0.208316413546658, + "grad_norm": 4.617124660591345, + "learning_rate": 4.594722257377844e-06, + "loss": 0.6313, + "step": 2565 + }, + { + "epoch": 0.20839762852269958, + "grad_norm": 3.1785490526390974, + "learning_rate": 4.594363227891693e-06, + "loss": 0.6085, + "step": 2566 + }, + { + "epoch": 0.20847884349874116, + "grad_norm": 4.209871969565939, + "learning_rate": 4.5940040534866905e-06, + "loss": 0.8759, + "step": 2567 + }, + { + "epoch": 0.20856005847478276, + "grad_norm": 5.084178440847334, + "learning_rate": 4.59364473418769e-06, + "loss": 0.4414, + "step": 2568 + }, + { + "epoch": 0.20864127345082434, + "grad_norm": 4.799592885282473, + "learning_rate": 4.593285270019555e-06, + "loss": 0.4927, + "step": 2569 + }, + { + "epoch": 0.20872248842686592, + "grad_norm": 3.748127121013163, + "learning_rate": 4.592925661007157e-06, + "loss": 0.7891, + "step": 2570 + }, + { + "epoch": 0.2088037034029075, + "grad_norm": 7.646211226477787, + "learning_rate": 4.592565907175381e-06, + "loss": 0.4914, + "step": 2571 + }, + { + "epoch": 0.20888491837894907, + "grad_norm": 4.10621409423765, + "learning_rate": 4.592206008549118e-06, + "loss": 0.5923, + "step": 2572 + }, + { + "epoch": 0.20896613335499067, + "grad_norm": 5.917514515415448, + "learning_rate": 4.591845965153272e-06, + "loss": 0.6719, + "step": 2573 + }, + { + "epoch": 0.20904734833103225, + "grad_norm": 4.0238274192183985, + "learning_rate": 4.591485777012757e-06, + "loss": 0.636, + "step": 2574 + }, + { + "epoch": 0.20912856330707383, + "grad_norm": 5.0280430135245116, + "learning_rate": 4.591125444152495e-06, + "loss": 0.7144, + "step": 2575 + }, + { + "epoch": 0.2092097782831154, + "grad_norm": 5.449721642811628, + "learning_rate": 4.590764966597419e-06, + "loss": 0.5855, + "step": 2576 + }, + { + "epoch": 0.20929099325915698, + "grad_norm": 5.452281756916645, + "learning_rate": 4.590404344372472e-06, + "loss": 0.5043, + "step": 2577 + }, + { + "epoch": 0.20937220823519856, + "grad_norm": 7.9224346525989375, + "learning_rate": 4.590043577502609e-06, + "loss": 0.5569, + "step": 2578 + }, + { + "epoch": 0.20945342321124016, + "grad_norm": 5.570197940829358, + "learning_rate": 4.589682666012791e-06, + "loss": 0.5516, + "step": 2579 + }, + { + "epoch": 0.20953463818728174, + "grad_norm": 4.7228930255767425, + "learning_rate": 4.5893216099279925e-06, + "loss": 0.7413, + "step": 2580 + }, + { + "epoch": 0.20961585316332332, + "grad_norm": 4.327696809821123, + "learning_rate": 4.588960409273196e-06, + "loss": 0.6607, + "step": 2581 + }, + { + "epoch": 0.2096970681393649, + "grad_norm": 3.271070383423843, + "learning_rate": 4.588599064073395e-06, + "loss": 0.6295, + "step": 2582 + }, + { + "epoch": 0.20977828311540647, + "grad_norm": 4.928203939520188, + "learning_rate": 4.588237574353592e-06, + "loss": 0.4415, + "step": 2583 + }, + { + "epoch": 0.20985949809144808, + "grad_norm": 4.524773784130613, + "learning_rate": 4.587875940138801e-06, + "loss": 0.4493, + "step": 2584 + }, + { + "epoch": 0.20994071306748965, + "grad_norm": 5.4612869658664005, + "learning_rate": 4.587514161454045e-06, + "loss": 0.6702, + "step": 2585 + }, + { + "epoch": 0.21002192804353123, + "grad_norm": 5.877089399039422, + "learning_rate": 4.587152238324357e-06, + "loss": 0.5599, + "step": 2586 + }, + { + "epoch": 0.2101031430195728, + "grad_norm": 5.615645343923605, + "learning_rate": 4.58679017077478e-06, + "loss": 0.6475, + "step": 2587 + }, + { + "epoch": 0.21018435799561438, + "grad_norm": 4.466358103769594, + "learning_rate": 4.586427958830367e-06, + "loss": 0.7045, + "step": 2588 + }, + { + "epoch": 0.21026557297165596, + "grad_norm": 3.905668647886352, + "learning_rate": 4.586065602516182e-06, + "loss": 0.5548, + "step": 2589 + }, + { + "epoch": 0.21034678794769757, + "grad_norm": 5.834395303055314, + "learning_rate": 4.585703101857298e-06, + "loss": 0.4948, + "step": 2590 + }, + { + "epoch": 0.21042800292373914, + "grad_norm": 5.414482998853449, + "learning_rate": 4.585340456878798e-06, + "loss": 0.4951, + "step": 2591 + }, + { + "epoch": 0.21050921789978072, + "grad_norm": 3.1815206113624535, + "learning_rate": 4.584977667605774e-06, + "loss": 0.5363, + "step": 2592 + }, + { + "epoch": 0.2105904328758223, + "grad_norm": 3.1419631204303915, + "learning_rate": 4.5846147340633305e-06, + "loss": 0.5075, + "step": 2593 + }, + { + "epoch": 0.21067164785186387, + "grad_norm": 3.5141257327029205, + "learning_rate": 4.58425165627658e-06, + "loss": 0.543, + "step": 2594 + }, + { + "epoch": 0.21075286282790548, + "grad_norm": 4.156860647624618, + "learning_rate": 4.583888434270645e-06, + "loss": 0.5039, + "step": 2595 + }, + { + "epoch": 0.21083407780394706, + "grad_norm": 4.750813630585856, + "learning_rate": 4.58352506807066e-06, + "loss": 0.5138, + "step": 2596 + }, + { + "epoch": 0.21091529277998863, + "grad_norm": 9.312381581764441, + "learning_rate": 4.583161557701767e-06, + "loss": 0.5763, + "step": 2597 + }, + { + "epoch": 0.2109965077560302, + "grad_norm": 8.915059751168238, + "learning_rate": 4.582797903189119e-06, + "loss": 0.5809, + "step": 2598 + }, + { + "epoch": 0.2110777227320718, + "grad_norm": 3.648070795784496, + "learning_rate": 4.582434104557879e-06, + "loss": 0.482, + "step": 2599 + }, + { + "epoch": 0.21115893770811336, + "grad_norm": 4.2519549882216845, + "learning_rate": 4.582070161833221e-06, + "loss": 0.5429, + "step": 2600 + }, + { + "epoch": 0.21124015268415497, + "grad_norm": 4.546548576906945, + "learning_rate": 4.581706075040326e-06, + "loss": 0.5398, + "step": 2601 + }, + { + "epoch": 0.21132136766019655, + "grad_norm": 5.001987184534323, + "learning_rate": 4.5813418442043885e-06, + "loss": 0.5318, + "step": 2602 + }, + { + "epoch": 0.21140258263623812, + "grad_norm": 6.362483259653969, + "learning_rate": 4.58097746935061e-06, + "loss": 0.5363, + "step": 2603 + }, + { + "epoch": 0.2114837976122797, + "grad_norm": 4.0989340540279064, + "learning_rate": 4.580612950504204e-06, + "loss": 0.5545, + "step": 2604 + }, + { + "epoch": 0.21156501258832128, + "grad_norm": 5.132444357319185, + "learning_rate": 4.580248287690394e-06, + "loss": 0.4948, + "step": 2605 + }, + { + "epoch": 0.21164622756436288, + "grad_norm": 7.374853871968238, + "learning_rate": 4.579883480934413e-06, + "loss": 0.5281, + "step": 2606 + }, + { + "epoch": 0.21172744254040446, + "grad_norm": 8.014579215103465, + "learning_rate": 4.579518530261501e-06, + "loss": 0.6517, + "step": 2607 + }, + { + "epoch": 0.21180865751644604, + "grad_norm": 4.7524183309310315, + "learning_rate": 4.579153435696913e-06, + "loss": 0.5649, + "step": 2608 + }, + { + "epoch": 0.2118898724924876, + "grad_norm": 3.2946505830547697, + "learning_rate": 4.578788197265911e-06, + "loss": 0.4996, + "step": 2609 + }, + { + "epoch": 0.2119710874685292, + "grad_norm": 4.2400296703901645, + "learning_rate": 4.578422814993768e-06, + "loss": 0.5894, + "step": 2610 + }, + { + "epoch": 0.21205230244457077, + "grad_norm": 5.74324372003261, + "learning_rate": 4.578057288905766e-06, + "loss": 0.6345, + "step": 2611 + }, + { + "epoch": 0.21213351742061237, + "grad_norm": 3.66763262733531, + "learning_rate": 4.577691619027197e-06, + "loss": 0.5196, + "step": 2612 + }, + { + "epoch": 0.21221473239665395, + "grad_norm": 7.157752603621458, + "learning_rate": 4.577325805383364e-06, + "loss": 0.4639, + "step": 2613 + }, + { + "epoch": 0.21229594737269553, + "grad_norm": 6.775920682174733, + "learning_rate": 4.57695984799958e-06, + "loss": 0.5971, + "step": 2614 + }, + { + "epoch": 0.2123771623487371, + "grad_norm": 5.5027350147320275, + "learning_rate": 4.576593746901166e-06, + "loss": 0.6259, + "step": 2615 + }, + { + "epoch": 0.21245837732477868, + "grad_norm": 4.893766528880051, + "learning_rate": 4.576227502113455e-06, + "loss": 0.5943, + "step": 2616 + }, + { + "epoch": 0.21253959230082028, + "grad_norm": 4.639805956669454, + "learning_rate": 4.575861113661791e-06, + "loss": 0.7361, + "step": 2617 + }, + { + "epoch": 0.21262080727686186, + "grad_norm": 4.20860766014754, + "learning_rate": 4.575494581571521e-06, + "loss": 0.3826, + "step": 2618 + }, + { + "epoch": 0.21270202225290344, + "grad_norm": 5.655888239003785, + "learning_rate": 4.575127905868013e-06, + "loss": 0.5891, + "step": 2619 + }, + { + "epoch": 0.21278323722894502, + "grad_norm": 5.032532015247516, + "learning_rate": 4.574761086576635e-06, + "loss": 0.4914, + "step": 2620 + }, + { + "epoch": 0.2128644522049866, + "grad_norm": 11.837098472717457, + "learning_rate": 4.57439412372277e-06, + "loss": 0.4658, + "step": 2621 + }, + { + "epoch": 0.21294566718102817, + "grad_norm": 14.561447419694415, + "learning_rate": 4.574027017331812e-06, + "loss": 0.5611, + "step": 2622 + }, + { + "epoch": 0.21302688215706977, + "grad_norm": 3.633636275262492, + "learning_rate": 4.57365976742916e-06, + "loss": 0.5933, + "step": 2623 + }, + { + "epoch": 0.21310809713311135, + "grad_norm": 6.065413078817048, + "learning_rate": 4.573292374040227e-06, + "loss": 0.5984, + "step": 2624 + }, + { + "epoch": 0.21318931210915293, + "grad_norm": 12.083369823179202, + "learning_rate": 4.572924837190434e-06, + "loss": 0.4037, + "step": 2625 + }, + { + "epoch": 0.2132705270851945, + "grad_norm": 4.512701432450698, + "learning_rate": 4.572557156905213e-06, + "loss": 0.6599, + "step": 2626 + }, + { + "epoch": 0.21335174206123608, + "grad_norm": 4.466963402457013, + "learning_rate": 4.572189333210007e-06, + "loss": 0.6408, + "step": 2627 + }, + { + "epoch": 0.2134329570372777, + "grad_norm": 4.5137793929351115, + "learning_rate": 4.571821366130265e-06, + "loss": 0.4764, + "step": 2628 + }, + { + "epoch": 0.21351417201331926, + "grad_norm": 3.8656056596236184, + "learning_rate": 4.571453255691449e-06, + "loss": 0.4507, + "step": 2629 + }, + { + "epoch": 0.21359538698936084, + "grad_norm": 4.382341801906267, + "learning_rate": 4.571085001919031e-06, + "loss": 0.6934, + "step": 2630 + }, + { + "epoch": 0.21367660196540242, + "grad_norm": 8.589875710292333, + "learning_rate": 4.570716604838492e-06, + "loss": 0.6249, + "step": 2631 + }, + { + "epoch": 0.213757816941444, + "grad_norm": 4.007630827099167, + "learning_rate": 4.570348064475323e-06, + "loss": 0.6744, + "step": 2632 + }, + { + "epoch": 0.21383903191748557, + "grad_norm": 5.484359947965143, + "learning_rate": 4.569979380855025e-06, + "loss": 0.5493, + "step": 2633 + }, + { + "epoch": 0.21392024689352718, + "grad_norm": 4.493010697403236, + "learning_rate": 4.56961055400311e-06, + "loss": 0.5519, + "step": 2634 + }, + { + "epoch": 0.21400146186956875, + "grad_norm": 4.045159603215263, + "learning_rate": 4.5692415839450965e-06, + "loss": 0.59, + "step": 2635 + }, + { + "epoch": 0.21408267684561033, + "grad_norm": 4.821023174942057, + "learning_rate": 4.568872470706518e-06, + "loss": 0.3977, + "step": 2636 + }, + { + "epoch": 0.2141638918216519, + "grad_norm": 5.675194155783661, + "learning_rate": 4.568503214312913e-06, + "loss": 0.5558, + "step": 2637 + }, + { + "epoch": 0.21424510679769349, + "grad_norm": 4.7601824577948015, + "learning_rate": 4.568133814789833e-06, + "loss": 0.5638, + "step": 2638 + }, + { + "epoch": 0.2143263217737351, + "grad_norm": 5.107493849030825, + "learning_rate": 4.567764272162839e-06, + "loss": 0.5493, + "step": 2639 + }, + { + "epoch": 0.21440753674977667, + "grad_norm": 4.8431041110711766, + "learning_rate": 4.567394586457501e-06, + "loss": 0.5961, + "step": 2640 + }, + { + "epoch": 0.21448875172581824, + "grad_norm": 6.24655286207524, + "learning_rate": 4.567024757699399e-06, + "loss": 0.5455, + "step": 2641 + }, + { + "epoch": 0.21456996670185982, + "grad_norm": 4.819548308107245, + "learning_rate": 4.566654785914123e-06, + "loss": 0.5451, + "step": 2642 + }, + { + "epoch": 0.2146511816779014, + "grad_norm": 8.148594207313314, + "learning_rate": 4.566284671127273e-06, + "loss": 0.4268, + "step": 2643 + }, + { + "epoch": 0.21473239665394298, + "grad_norm": 5.200028510658096, + "learning_rate": 4.56591441336446e-06, + "loss": 0.5667, + "step": 2644 + }, + { + "epoch": 0.21481361162998458, + "grad_norm": 3.953005099419873, + "learning_rate": 4.565544012651304e-06, + "loss": 0.9163, + "step": 2645 + }, + { + "epoch": 0.21489482660602616, + "grad_norm": 6.166119347251547, + "learning_rate": 4.565173469013432e-06, + "loss": 0.4929, + "step": 2646 + }, + { + "epoch": 0.21497604158206773, + "grad_norm": 17.196433928853956, + "learning_rate": 4.564802782476487e-06, + "loss": 0.5905, + "step": 2647 + }, + { + "epoch": 0.2150572565581093, + "grad_norm": 6.178896351260531, + "learning_rate": 4.564431953066118e-06, + "loss": 0.7147, + "step": 2648 + }, + { + "epoch": 0.2151384715341509, + "grad_norm": 6.440122516675341, + "learning_rate": 4.564060980807983e-06, + "loss": 0.7101, + "step": 2649 + }, + { + "epoch": 0.2152196865101925, + "grad_norm": 5.2000555006994125, + "learning_rate": 4.563689865727752e-06, + "loss": 0.6096, + "step": 2650 + }, + { + "epoch": 0.21530090148623407, + "grad_norm": 4.6419662478035155, + "learning_rate": 4.563318607851104e-06, + "loss": 0.5928, + "step": 2651 + }, + { + "epoch": 0.21538211646227565, + "grad_norm": 8.141047545711643, + "learning_rate": 4.562947207203728e-06, + "loss": 0.4954, + "step": 2652 + }, + { + "epoch": 0.21546333143831722, + "grad_norm": 3.9486721093318153, + "learning_rate": 4.562575663811324e-06, + "loss": 0.6568, + "step": 2653 + }, + { + "epoch": 0.2155445464143588, + "grad_norm": 5.149290757056804, + "learning_rate": 4.5622039776996006e-06, + "loss": 0.5941, + "step": 2654 + }, + { + "epoch": 0.21562576139040038, + "grad_norm": 6.391781782039198, + "learning_rate": 4.561832148894275e-06, + "loss": 0.4204, + "step": 2655 + }, + { + "epoch": 0.21570697636644198, + "grad_norm": 9.968941914044036, + "learning_rate": 4.561460177421078e-06, + "loss": 0.5599, + "step": 2656 + }, + { + "epoch": 0.21578819134248356, + "grad_norm": 5.522046324373681, + "learning_rate": 4.561088063305745e-06, + "loss": 0.6469, + "step": 2657 + }, + { + "epoch": 0.21586940631852514, + "grad_norm": 3.8464087079545277, + "learning_rate": 4.560715806574028e-06, + "loss": 0.6275, + "step": 2658 + }, + { + "epoch": 0.2159506212945667, + "grad_norm": 5.078649015582283, + "learning_rate": 4.560343407251682e-06, + "loss": 0.5032, + "step": 2659 + }, + { + "epoch": 0.2160318362706083, + "grad_norm": 4.334473048243304, + "learning_rate": 4.559970865364477e-06, + "loss": 0.6853, + "step": 2660 + }, + { + "epoch": 0.2161130512466499, + "grad_norm": 6.419291069877329, + "learning_rate": 4.55959818093819e-06, + "loss": 0.5029, + "step": 2661 + }, + { + "epoch": 0.21619426622269147, + "grad_norm": 5.8025560914497385, + "learning_rate": 4.559225353998609e-06, + "loss": 0.4812, + "step": 2662 + }, + { + "epoch": 0.21627548119873305, + "grad_norm": 4.30038594406105, + "learning_rate": 4.558852384571533e-06, + "loss": 0.5198, + "step": 2663 + }, + { + "epoch": 0.21635669617477463, + "grad_norm": 4.101496826162754, + "learning_rate": 4.558479272682768e-06, + "loss": 0.6267, + "step": 2664 + }, + { + "epoch": 0.2164379111508162, + "grad_norm": 5.736035126104208, + "learning_rate": 4.558106018358131e-06, + "loss": 0.5187, + "step": 2665 + }, + { + "epoch": 0.21651912612685778, + "grad_norm": 5.345843996754261, + "learning_rate": 4.557732621623449e-06, + "loss": 0.5716, + "step": 2666 + }, + { + "epoch": 0.21660034110289939, + "grad_norm": 4.853630853027857, + "learning_rate": 4.557359082504562e-06, + "loss": 0.4583, + "step": 2667 + }, + { + "epoch": 0.21668155607894096, + "grad_norm": 3.473558582520133, + "learning_rate": 4.556985401027314e-06, + "loss": 0.4872, + "step": 2668 + }, + { + "epoch": 0.21676277105498254, + "grad_norm": 6.011752197100141, + "learning_rate": 4.556611577217563e-06, + "loss": 0.5814, + "step": 2669 + }, + { + "epoch": 0.21684398603102412, + "grad_norm": 4.738766968848329, + "learning_rate": 4.5562376111011745e-06, + "loss": 0.4709, + "step": 2670 + }, + { + "epoch": 0.2169252010070657, + "grad_norm": 9.10744576822217, + "learning_rate": 4.5558635027040265e-06, + "loss": 0.542, + "step": 2671 + }, + { + "epoch": 0.2170064159831073, + "grad_norm": 3.811056086790161, + "learning_rate": 4.555489252052005e-06, + "loss": 0.6779, + "step": 2672 + }, + { + "epoch": 0.21708763095914888, + "grad_norm": 5.148351495523538, + "learning_rate": 4.5551148591710045e-06, + "loss": 0.491, + "step": 2673 + }, + { + "epoch": 0.21716884593519045, + "grad_norm": 5.581434436680359, + "learning_rate": 4.5547403240869335e-06, + "loss": 0.6449, + "step": 2674 + }, + { + "epoch": 0.21725006091123203, + "grad_norm": 3.742718625839809, + "learning_rate": 4.554365646825706e-06, + "loss": 0.7014, + "step": 2675 + }, + { + "epoch": 0.2173312758872736, + "grad_norm": 6.612469532237822, + "learning_rate": 4.5539908274132485e-06, + "loss": 0.596, + "step": 2676 + }, + { + "epoch": 0.21741249086331518, + "grad_norm": 4.306520677998056, + "learning_rate": 4.553615865875496e-06, + "loss": 0.7381, + "step": 2677 + }, + { + "epoch": 0.2174937058393568, + "grad_norm": 4.614381185208202, + "learning_rate": 4.553240762238394e-06, + "loss": 0.6964, + "step": 2678 + }, + { + "epoch": 0.21757492081539836, + "grad_norm": 7.406343400623716, + "learning_rate": 4.552865516527899e-06, + "loss": 0.5004, + "step": 2679 + }, + { + "epoch": 0.21765613579143994, + "grad_norm": 3.9606433285981266, + "learning_rate": 4.552490128769975e-06, + "loss": 0.6269, + "step": 2680 + }, + { + "epoch": 0.21773735076748152, + "grad_norm": 4.11495963139638, + "learning_rate": 4.5521145989905955e-06, + "loss": 0.5763, + "step": 2681 + }, + { + "epoch": 0.2178185657435231, + "grad_norm": 4.588404789142679, + "learning_rate": 4.551738927215747e-06, + "loss": 0.4674, + "step": 2682 + }, + { + "epoch": 0.2178997807195647, + "grad_norm": 7.297594706002203, + "learning_rate": 4.5513631134714235e-06, + "loss": 0.5801, + "step": 2683 + }, + { + "epoch": 0.21798099569560628, + "grad_norm": 4.3047090539293045, + "learning_rate": 4.550987157783629e-06, + "loss": 0.4813, + "step": 2684 + }, + { + "epoch": 0.21806221067164785, + "grad_norm": 5.768465577812249, + "learning_rate": 4.550611060178378e-06, + "loss": 0.567, + "step": 2685 + }, + { + "epoch": 0.21814342564768943, + "grad_norm": 5.651861901253761, + "learning_rate": 4.550234820681695e-06, + "loss": 0.6873, + "step": 2686 + }, + { + "epoch": 0.218224640623731, + "grad_norm": 9.183694716367905, + "learning_rate": 4.549858439319612e-06, + "loss": 0.6324, + "step": 2687 + }, + { + "epoch": 0.21830585559977259, + "grad_norm": 7.585915804173281, + "learning_rate": 4.549481916118174e-06, + "loss": 0.5873, + "step": 2688 + }, + { + "epoch": 0.2183870705758142, + "grad_norm": 3.760740144941256, + "learning_rate": 4.5491052511034345e-06, + "loss": 0.6382, + "step": 2689 + }, + { + "epoch": 0.21846828555185577, + "grad_norm": 4.945745347505512, + "learning_rate": 4.548728444301456e-06, + "loss": 0.5818, + "step": 2690 + }, + { + "epoch": 0.21854950052789734, + "grad_norm": 4.9769795576864215, + "learning_rate": 4.548351495738312e-06, + "loss": 0.7123, + "step": 2691 + }, + { + "epoch": 0.21863071550393892, + "grad_norm": 4.481119259930459, + "learning_rate": 4.547974405440085e-06, + "loss": 0.4976, + "step": 2692 + }, + { + "epoch": 0.2187119304799805, + "grad_norm": 5.7374511609884955, + "learning_rate": 4.547597173432869e-06, + "loss": 0.5487, + "step": 2693 + }, + { + "epoch": 0.2187931454560221, + "grad_norm": 5.502760303179846, + "learning_rate": 4.547219799742765e-06, + "loss": 0.7687, + "step": 2694 + }, + { + "epoch": 0.21887436043206368, + "grad_norm": 10.421957536535567, + "learning_rate": 4.5468422843958845e-06, + "loss": 0.5052, + "step": 2695 + }, + { + "epoch": 0.21895557540810526, + "grad_norm": 3.6707221757502917, + "learning_rate": 4.546464627418351e-06, + "loss": 0.5888, + "step": 2696 + }, + { + "epoch": 0.21903679038414683, + "grad_norm": 3.304249792857001, + "learning_rate": 4.546086828836297e-06, + "loss": 0.6277, + "step": 2697 + }, + { + "epoch": 0.2191180053601884, + "grad_norm": 4.76062394973188, + "learning_rate": 4.545708888675862e-06, + "loss": 0.6074, + "step": 2698 + }, + { + "epoch": 0.21919922033623, + "grad_norm": 4.462680720514244, + "learning_rate": 4.5453308069632e-06, + "loss": 0.5367, + "step": 2699 + }, + { + "epoch": 0.2192804353122716, + "grad_norm": 7.372740705732592, + "learning_rate": 4.54495258372447e-06, + "loss": 0.5074, + "step": 2700 + }, + { + "epoch": 0.21936165028831317, + "grad_norm": 7.860054087415267, + "learning_rate": 4.544574218985845e-06, + "loss": 0.4761, + "step": 2701 + }, + { + "epoch": 0.21944286526435475, + "grad_norm": 8.081036919499534, + "learning_rate": 4.544195712773504e-06, + "loss": 0.5659, + "step": 2702 + }, + { + "epoch": 0.21952408024039632, + "grad_norm": 5.8269057668228985, + "learning_rate": 4.543817065113638e-06, + "loss": 0.462, + "step": 2703 + }, + { + "epoch": 0.2196052952164379, + "grad_norm": 4.2668146590202305, + "learning_rate": 4.543438276032448e-06, + "loss": 0.4101, + "step": 2704 + }, + { + "epoch": 0.2196865101924795, + "grad_norm": 5.140738961702395, + "learning_rate": 4.543059345556145e-06, + "loss": 0.48, + "step": 2705 + }, + { + "epoch": 0.21976772516852108, + "grad_norm": 5.170374218467899, + "learning_rate": 4.542680273710947e-06, + "loss": 0.5453, + "step": 2706 + }, + { + "epoch": 0.21984894014456266, + "grad_norm": 5.753753477369303, + "learning_rate": 4.542301060523086e-06, + "loss": 0.5962, + "step": 2707 + }, + { + "epoch": 0.21993015512060424, + "grad_norm": 4.532213136821538, + "learning_rate": 4.541921706018799e-06, + "loss": 0.561, + "step": 2708 + }, + { + "epoch": 0.22001137009664581, + "grad_norm": 5.5635492887810125, + "learning_rate": 4.541542210224337e-06, + "loss": 0.6049, + "step": 2709 + }, + { + "epoch": 0.2200925850726874, + "grad_norm": 4.789661183080903, + "learning_rate": 4.5411625731659595e-06, + "loss": 0.5815, + "step": 2710 + }, + { + "epoch": 0.220173800048729, + "grad_norm": 3.231674262508777, + "learning_rate": 4.540782794869933e-06, + "loss": 0.5184, + "step": 2711 + }, + { + "epoch": 0.22025501502477057, + "grad_norm": 6.341088144720809, + "learning_rate": 4.5404028753625396e-06, + "loss": 0.4839, + "step": 2712 + }, + { + "epoch": 0.22033623000081215, + "grad_norm": 4.816690291787395, + "learning_rate": 4.5400228146700654e-06, + "loss": 0.6509, + "step": 2713 + }, + { + "epoch": 0.22041744497685373, + "grad_norm": 3.6214196801416474, + "learning_rate": 4.539642612818809e-06, + "loss": 0.627, + "step": 2714 + }, + { + "epoch": 0.2204986599528953, + "grad_norm": 7.585870445781305, + "learning_rate": 4.539262269835078e-06, + "loss": 0.4846, + "step": 2715 + }, + { + "epoch": 0.2205798749289369, + "grad_norm": 5.185206149837472, + "learning_rate": 4.538881785745191e-06, + "loss": 0.5169, + "step": 2716 + }, + { + "epoch": 0.22066108990497849, + "grad_norm": 9.120252331770077, + "learning_rate": 4.538501160575475e-06, + "loss": 0.5466, + "step": 2717 + }, + { + "epoch": 0.22074230488102006, + "grad_norm": 4.271633224077144, + "learning_rate": 4.538120394352267e-06, + "loss": 0.5654, + "step": 2718 + }, + { + "epoch": 0.22082351985706164, + "grad_norm": 6.425265204493728, + "learning_rate": 4.5377394871019145e-06, + "loss": 0.5984, + "step": 2719 + }, + { + "epoch": 0.22090473483310322, + "grad_norm": 5.662199724686198, + "learning_rate": 4.5373584388507745e-06, + "loss": 0.5098, + "step": 2720 + }, + { + "epoch": 0.2209859498091448, + "grad_norm": 7.194165344057847, + "learning_rate": 4.536977249625213e-06, + "loss": 0.49, + "step": 2721 + }, + { + "epoch": 0.2210671647851864, + "grad_norm": 6.981685701050804, + "learning_rate": 4.536595919451606e-06, + "loss": 0.6383, + "step": 2722 + }, + { + "epoch": 0.22114837976122798, + "grad_norm": 4.116671598914044, + "learning_rate": 4.53621444835634e-06, + "loss": 0.5659, + "step": 2723 + }, + { + "epoch": 0.22122959473726955, + "grad_norm": 5.134961603692183, + "learning_rate": 4.535832836365811e-06, + "loss": 0.4805, + "step": 2724 + }, + { + "epoch": 0.22131080971331113, + "grad_norm": 4.576284219113115, + "learning_rate": 4.535451083506424e-06, + "loss": 0.7364, + "step": 2725 + }, + { + "epoch": 0.2213920246893527, + "grad_norm": 6.0501880740602365, + "learning_rate": 4.535069189804594e-06, + "loss": 0.5578, + "step": 2726 + }, + { + "epoch": 0.2214732396653943, + "grad_norm": 4.658417802807258, + "learning_rate": 4.534687155286747e-06, + "loss": 0.5017, + "step": 2727 + }, + { + "epoch": 0.2215544546414359, + "grad_norm": 3.5188270873838117, + "learning_rate": 4.534304979979317e-06, + "loss": 0.5166, + "step": 2728 + }, + { + "epoch": 0.22163566961747747, + "grad_norm": 3.6771178180337865, + "learning_rate": 4.53392266390875e-06, + "loss": 0.717, + "step": 2729 + }, + { + "epoch": 0.22171688459351904, + "grad_norm": 4.874076485337167, + "learning_rate": 4.533540207101498e-06, + "loss": 0.676, + "step": 2730 + }, + { + "epoch": 0.22179809956956062, + "grad_norm": 3.3053339197225307, + "learning_rate": 4.533157609584026e-06, + "loss": 0.7047, + "step": 2731 + }, + { + "epoch": 0.2218793145456022, + "grad_norm": 4.0334080892398845, + "learning_rate": 4.532774871382807e-06, + "loss": 0.6683, + "step": 2732 + }, + { + "epoch": 0.2219605295216438, + "grad_norm": 7.9679246058681885, + "learning_rate": 4.532391992524327e-06, + "loss": 0.5633, + "step": 2733 + }, + { + "epoch": 0.22204174449768538, + "grad_norm": 5.020060727741971, + "learning_rate": 4.532008973035076e-06, + "loss": 0.6868, + "step": 2734 + }, + { + "epoch": 0.22212295947372696, + "grad_norm": 3.77106126429378, + "learning_rate": 4.531625812941559e-06, + "loss": 0.5032, + "step": 2735 + }, + { + "epoch": 0.22220417444976853, + "grad_norm": 3.682569403476453, + "learning_rate": 4.531242512270287e-06, + "loss": 0.7004, + "step": 2736 + }, + { + "epoch": 0.2222853894258101, + "grad_norm": 4.487348592439573, + "learning_rate": 4.530859071047785e-06, + "loss": 0.5239, + "step": 2737 + }, + { + "epoch": 0.22236660440185171, + "grad_norm": 4.188000337446031, + "learning_rate": 4.530475489300583e-06, + "loss": 0.4732, + "step": 2738 + }, + { + "epoch": 0.2224478193778933, + "grad_norm": 8.498349813305607, + "learning_rate": 4.530091767055223e-06, + "loss": 0.4986, + "step": 2739 + }, + { + "epoch": 0.22252903435393487, + "grad_norm": 5.52528756591483, + "learning_rate": 4.5297079043382566e-06, + "loss": 0.6785, + "step": 2740 + }, + { + "epoch": 0.22261024932997645, + "grad_norm": 5.752497762609521, + "learning_rate": 4.529323901176245e-06, + "loss": 0.4531, + "step": 2741 + }, + { + "epoch": 0.22269146430601802, + "grad_norm": 3.092790657666851, + "learning_rate": 4.52893975759576e-06, + "loss": 0.7052, + "step": 2742 + }, + { + "epoch": 0.2227726792820596, + "grad_norm": 4.559669547606855, + "learning_rate": 4.528555473623381e-06, + "loss": 0.5464, + "step": 2743 + }, + { + "epoch": 0.2228538942581012, + "grad_norm": 3.5783978557136127, + "learning_rate": 4.5281710492857e-06, + "loss": 0.6876, + "step": 2744 + }, + { + "epoch": 0.22293510923414278, + "grad_norm": 4.328813594256833, + "learning_rate": 4.527786484609316e-06, + "loss": 0.564, + "step": 2745 + }, + { + "epoch": 0.22301632421018436, + "grad_norm": 3.952478916018802, + "learning_rate": 4.52740177962084e-06, + "loss": 0.5783, + "step": 2746 + }, + { + "epoch": 0.22309753918622593, + "grad_norm": 4.924032799834654, + "learning_rate": 4.52701693434689e-06, + "loss": 0.7729, + "step": 2747 + }, + { + "epoch": 0.2231787541622675, + "grad_norm": 5.139762166088582, + "learning_rate": 4.526631948814096e-06, + "loss": 0.5408, + "step": 2748 + }, + { + "epoch": 0.22325996913830912, + "grad_norm": 4.468079919633165, + "learning_rate": 4.5262468230490975e-06, + "loss": 0.6876, + "step": 2749 + }, + { + "epoch": 0.2233411841143507, + "grad_norm": 4.463738118530915, + "learning_rate": 4.525861557078542e-06, + "loss": 0.7465, + "step": 2750 + }, + { + "epoch": 0.22342239909039227, + "grad_norm": 7.852415705301734, + "learning_rate": 4.525476150929089e-06, + "loss": 0.5134, + "step": 2751 + }, + { + "epoch": 0.22350361406643385, + "grad_norm": 4.963966987188751, + "learning_rate": 4.525090604627406e-06, + "loss": 0.5476, + "step": 2752 + }, + { + "epoch": 0.22358482904247542, + "grad_norm": 6.07590271752492, + "learning_rate": 4.52470491820017e-06, + "loss": 0.4523, + "step": 2753 + }, + { + "epoch": 0.223666044018517, + "grad_norm": 4.302538032685536, + "learning_rate": 4.52431909167407e-06, + "loss": 0.6684, + "step": 2754 + }, + { + "epoch": 0.2237472589945586, + "grad_norm": 4.256570317991106, + "learning_rate": 4.5239331250758025e-06, + "loss": 0.5804, + "step": 2755 + }, + { + "epoch": 0.22382847397060018, + "grad_norm": 3.987772037631392, + "learning_rate": 4.523547018432074e-06, + "loss": 0.5361, + "step": 2756 + }, + { + "epoch": 0.22390968894664176, + "grad_norm": 4.25893742048742, + "learning_rate": 4.523160771769602e-06, + "loss": 0.5403, + "step": 2757 + }, + { + "epoch": 0.22399090392268334, + "grad_norm": 4.754148515001833, + "learning_rate": 4.52277438511511e-06, + "loss": 0.6913, + "step": 2758 + }, + { + "epoch": 0.22407211889872491, + "grad_norm": 11.158110474612203, + "learning_rate": 4.522387858495337e-06, + "loss": 0.4877, + "step": 2759 + }, + { + "epoch": 0.22415333387476652, + "grad_norm": 5.122504334003654, + "learning_rate": 4.522001191937028e-06, + "loss": 0.4932, + "step": 2760 + }, + { + "epoch": 0.2242345488508081, + "grad_norm": 4.861511116519465, + "learning_rate": 4.521614385466938e-06, + "loss": 0.5527, + "step": 2761 + }, + { + "epoch": 0.22431576382684967, + "grad_norm": 3.6596485107845003, + "learning_rate": 4.521227439111831e-06, + "loss": 0.7121, + "step": 2762 + }, + { + "epoch": 0.22439697880289125, + "grad_norm": 4.522445266045689, + "learning_rate": 4.520840352898483e-06, + "loss": 0.5672, + "step": 2763 + }, + { + "epoch": 0.22447819377893283, + "grad_norm": 7.737848493782743, + "learning_rate": 4.520453126853677e-06, + "loss": 0.5862, + "step": 2764 + }, + { + "epoch": 0.2245594087549744, + "grad_norm": 6.141302741029408, + "learning_rate": 4.520065761004209e-06, + "loss": 0.5703, + "step": 2765 + }, + { + "epoch": 0.224640623731016, + "grad_norm": 6.569966061095918, + "learning_rate": 4.51967825537688e-06, + "loss": 0.5038, + "step": 2766 + }, + { + "epoch": 0.2247218387070576, + "grad_norm": 5.8858674844088235, + "learning_rate": 4.5192906099985055e-06, + "loss": 0.5216, + "step": 2767 + }, + { + "epoch": 0.22480305368309916, + "grad_norm": 4.116267752952551, + "learning_rate": 4.518902824895908e-06, + "loss": 0.4604, + "step": 2768 + }, + { + "epoch": 0.22488426865914074, + "grad_norm": 6.293436608006783, + "learning_rate": 4.518514900095919e-06, + "loss": 0.466, + "step": 2769 + }, + { + "epoch": 0.22496548363518232, + "grad_norm": 8.339236282987327, + "learning_rate": 4.518126835625382e-06, + "loss": 0.5357, + "step": 2770 + }, + { + "epoch": 0.22504669861122392, + "grad_norm": 4.726236800961924, + "learning_rate": 4.51773863151115e-06, + "loss": 0.6255, + "step": 2771 + }, + { + "epoch": 0.2251279135872655, + "grad_norm": 4.1928620515740365, + "learning_rate": 4.517350287780081e-06, + "loss": 0.553, + "step": 2772 + }, + { + "epoch": 0.22520912856330708, + "grad_norm": 6.445972691759003, + "learning_rate": 4.51696180445905e-06, + "loss": 0.4875, + "step": 2773 + }, + { + "epoch": 0.22529034353934865, + "grad_norm": 14.794833956873413, + "learning_rate": 4.516573181574937e-06, + "loss": 0.5604, + "step": 2774 + }, + { + "epoch": 0.22537155851539023, + "grad_norm": 5.562154003766993, + "learning_rate": 4.516184419154633e-06, + "loss": 0.5572, + "step": 2775 + }, + { + "epoch": 0.2254527734914318, + "grad_norm": 3.647681045658577, + "learning_rate": 4.515795517225037e-06, + "loss": 0.617, + "step": 2776 + }, + { + "epoch": 0.2255339884674734, + "grad_norm": 5.695037511735684, + "learning_rate": 4.51540647581306e-06, + "loss": 0.412, + "step": 2777 + }, + { + "epoch": 0.225615203443515, + "grad_norm": 4.475228081800305, + "learning_rate": 4.51501729494562e-06, + "loss": 0.6703, + "step": 2778 + }, + { + "epoch": 0.22569641841955657, + "grad_norm": 4.611656081080196, + "learning_rate": 4.514627974649649e-06, + "loss": 0.6964, + "step": 2779 + }, + { + "epoch": 0.22577763339559814, + "grad_norm": 3.968385112319059, + "learning_rate": 4.514238514952084e-06, + "loss": 0.486, + "step": 2780 + }, + { + "epoch": 0.22585884837163972, + "grad_norm": 4.069304366874197, + "learning_rate": 4.513848915879874e-06, + "loss": 0.501, + "step": 2781 + }, + { + "epoch": 0.22594006334768132, + "grad_norm": 5.496815740257597, + "learning_rate": 4.513459177459977e-06, + "loss": 0.6377, + "step": 2782 + }, + { + "epoch": 0.2260212783237229, + "grad_norm": 6.857579000068127, + "learning_rate": 4.513069299719361e-06, + "loss": 0.5332, + "step": 2783 + }, + { + "epoch": 0.22610249329976448, + "grad_norm": 3.9422493886680354, + "learning_rate": 4.512679282685003e-06, + "loss": 0.7389, + "step": 2784 + }, + { + "epoch": 0.22618370827580606, + "grad_norm": 3.4503956109068135, + "learning_rate": 4.512289126383892e-06, + "loss": 0.4416, + "step": 2785 + }, + { + "epoch": 0.22626492325184763, + "grad_norm": 4.6592622473563665, + "learning_rate": 4.511898830843022e-06, + "loss": 0.5942, + "step": 2786 + }, + { + "epoch": 0.2263461382278892, + "grad_norm": 5.905176155853469, + "learning_rate": 4.511508396089401e-06, + "loss": 0.5971, + "step": 2787 + }, + { + "epoch": 0.22642735320393081, + "grad_norm": 3.3680130138338744, + "learning_rate": 4.5111178221500455e-06, + "loss": 0.5056, + "step": 2788 + }, + { + "epoch": 0.2265085681799724, + "grad_norm": 3.6474870201796006, + "learning_rate": 4.51072710905198e-06, + "loss": 0.623, + "step": 2789 + }, + { + "epoch": 0.22658978315601397, + "grad_norm": 5.380202578661765, + "learning_rate": 4.5103362568222395e-06, + "loss": 0.5094, + "step": 2790 + }, + { + "epoch": 0.22667099813205555, + "grad_norm": 3.51083847592959, + "learning_rate": 4.509945265487871e-06, + "loss": 0.5929, + "step": 2791 + }, + { + "epoch": 0.22675221310809712, + "grad_norm": 4.91331163188237, + "learning_rate": 4.5095541350759265e-06, + "loss": 0.5545, + "step": 2792 + }, + { + "epoch": 0.22683342808413873, + "grad_norm": 3.869643139237871, + "learning_rate": 4.5091628656134715e-06, + "loss": 0.5104, + "step": 2793 + }, + { + "epoch": 0.2269146430601803, + "grad_norm": 11.289546788075414, + "learning_rate": 4.508771457127579e-06, + "loss": 0.4783, + "step": 2794 + }, + { + "epoch": 0.22699585803622188, + "grad_norm": 4.701834247380548, + "learning_rate": 4.508379909645334e-06, + "loss": 0.6242, + "step": 2795 + }, + { + "epoch": 0.22707707301226346, + "grad_norm": 3.8081500149721608, + "learning_rate": 4.5079882231938274e-06, + "loss": 0.6682, + "step": 2796 + }, + { + "epoch": 0.22715828798830504, + "grad_norm": 8.196015374000737, + "learning_rate": 4.5075963978001634e-06, + "loss": 0.5618, + "step": 2797 + }, + { + "epoch": 0.2272395029643466, + "grad_norm": 3.4819053096272743, + "learning_rate": 4.5072044334914546e-06, + "loss": 0.4528, + "step": 2798 + }, + { + "epoch": 0.22732071794038822, + "grad_norm": 5.156427328355886, + "learning_rate": 4.506812330294821e-06, + "loss": 0.5095, + "step": 2799 + }, + { + "epoch": 0.2274019329164298, + "grad_norm": 3.7634245699967477, + "learning_rate": 4.506420088237395e-06, + "loss": 0.6707, + "step": 2800 + }, + { + "epoch": 0.22748314789247137, + "grad_norm": 3.431587308735926, + "learning_rate": 4.5060277073463174e-06, + "loss": 0.566, + "step": 2801 + }, + { + "epoch": 0.22756436286851295, + "grad_norm": 4.437013694098261, + "learning_rate": 4.50563518764874e-06, + "loss": 0.7123, + "step": 2802 + }, + { + "epoch": 0.22764557784455453, + "grad_norm": 4.567183665178771, + "learning_rate": 4.505242529171822e-06, + "loss": 0.5152, + "step": 2803 + }, + { + "epoch": 0.22772679282059613, + "grad_norm": 3.9017580026940664, + "learning_rate": 4.504849731942734e-06, + "loss": 0.5201, + "step": 2804 + }, + { + "epoch": 0.2278080077966377, + "grad_norm": 4.536121689617816, + "learning_rate": 4.504456795988654e-06, + "loss": 0.6886, + "step": 2805 + }, + { + "epoch": 0.22788922277267928, + "grad_norm": 10.84869509304053, + "learning_rate": 4.504063721336773e-06, + "loss": 0.5154, + "step": 2806 + }, + { + "epoch": 0.22797043774872086, + "grad_norm": 5.404245994389873, + "learning_rate": 4.503670508014289e-06, + "loss": 0.5609, + "step": 2807 + }, + { + "epoch": 0.22805165272476244, + "grad_norm": 6.579796252814421, + "learning_rate": 4.50327715604841e-06, + "loss": 0.523, + "step": 2808 + }, + { + "epoch": 0.22813286770080402, + "grad_norm": 3.973362927458198, + "learning_rate": 4.5028836654663535e-06, + "loss": 0.5637, + "step": 2809 + }, + { + "epoch": 0.22821408267684562, + "grad_norm": 5.793419725047325, + "learning_rate": 4.502490036295348e-06, + "loss": 0.5813, + "step": 2810 + }, + { + "epoch": 0.2282952976528872, + "grad_norm": 4.201946994996218, + "learning_rate": 4.50209626856263e-06, + "loss": 0.3738, + "step": 2811 + }, + { + "epoch": 0.22837651262892877, + "grad_norm": 4.182121020616893, + "learning_rate": 4.501702362295446e-06, + "loss": 0.5465, + "step": 2812 + }, + { + "epoch": 0.22845772760497035, + "grad_norm": 3.5727363113753645, + "learning_rate": 4.501308317521052e-06, + "loss": 0.5189, + "step": 2813 + }, + { + "epoch": 0.22853894258101193, + "grad_norm": 5.356568404800836, + "learning_rate": 4.500914134266715e-06, + "loss": 0.8021, + "step": 2814 + }, + { + "epoch": 0.22862015755705353, + "grad_norm": 4.550631173863838, + "learning_rate": 4.500519812559709e-06, + "loss": 0.583, + "step": 2815 + }, + { + "epoch": 0.2287013725330951, + "grad_norm": 4.328142108387039, + "learning_rate": 4.50012535242732e-06, + "loss": 0.4888, + "step": 2816 + }, + { + "epoch": 0.2287825875091367, + "grad_norm": 4.774451426228126, + "learning_rate": 4.499730753896841e-06, + "loss": 0.4892, + "step": 2817 + }, + { + "epoch": 0.22886380248517826, + "grad_norm": 4.5207191018836355, + "learning_rate": 4.4993360169955784e-06, + "loss": 0.4906, + "step": 2818 + }, + { + "epoch": 0.22894501746121984, + "grad_norm": 3.12349199747274, + "learning_rate": 4.498941141750845e-06, + "loss": 0.6733, + "step": 2819 + }, + { + "epoch": 0.22902623243726142, + "grad_norm": 3.911794379996745, + "learning_rate": 4.498546128189963e-06, + "loss": 0.5263, + "step": 2820 + }, + { + "epoch": 0.22910744741330302, + "grad_norm": 5.139451559263501, + "learning_rate": 4.498150976340266e-06, + "loss": 0.5766, + "step": 2821 + }, + { + "epoch": 0.2291886623893446, + "grad_norm": 4.27868628115041, + "learning_rate": 4.497755686229097e-06, + "loss": 0.529, + "step": 2822 + }, + { + "epoch": 0.22926987736538618, + "grad_norm": 3.9641320918354372, + "learning_rate": 4.497360257883808e-06, + "loss": 0.5311, + "step": 2823 + }, + { + "epoch": 0.22935109234142775, + "grad_norm": 3.5108973371563055, + "learning_rate": 4.496964691331759e-06, + "loss": 0.5227, + "step": 2824 + }, + { + "epoch": 0.22943230731746933, + "grad_norm": 5.403815218671819, + "learning_rate": 4.496568986600323e-06, + "loss": 0.9042, + "step": 2825 + }, + { + "epoch": 0.22951352229351094, + "grad_norm": 4.12742663870129, + "learning_rate": 4.4961731437168795e-06, + "loss": 0.7359, + "step": 2826 + }, + { + "epoch": 0.2295947372695525, + "grad_norm": 3.568114706661449, + "learning_rate": 4.4957771627088185e-06, + "loss": 0.6484, + "step": 2827 + }, + { + "epoch": 0.2296759522455941, + "grad_norm": 5.842995081486275, + "learning_rate": 4.495381043603541e-06, + "loss": 0.6022, + "step": 2828 + }, + { + "epoch": 0.22975716722163567, + "grad_norm": 5.602774830593073, + "learning_rate": 4.494984786428455e-06, + "loss": 0.6084, + "step": 2829 + }, + { + "epoch": 0.22983838219767724, + "grad_norm": 5.779320891496907, + "learning_rate": 4.494588391210981e-06, + "loss": 0.5428, + "step": 2830 + }, + { + "epoch": 0.22991959717371882, + "grad_norm": 5.734619447546111, + "learning_rate": 4.494191857978546e-06, + "loss": 0.5494, + "step": 2831 + }, + { + "epoch": 0.23000081214976043, + "grad_norm": 3.540278577269194, + "learning_rate": 4.493795186758589e-06, + "loss": 0.6195, + "step": 2832 + }, + { + "epoch": 0.230082027125802, + "grad_norm": 5.458025507421001, + "learning_rate": 4.493398377578557e-06, + "loss": 0.5911, + "step": 2833 + }, + { + "epoch": 0.23016324210184358, + "grad_norm": 3.0958255282858596, + "learning_rate": 4.4930014304659066e-06, + "loss": 0.6099, + "step": 2834 + }, + { + "epoch": 0.23024445707788516, + "grad_norm": 4.075827299092992, + "learning_rate": 4.492604345448106e-06, + "loss": 0.5688, + "step": 2835 + }, + { + "epoch": 0.23032567205392673, + "grad_norm": 5.7956480372939065, + "learning_rate": 4.492207122552629e-06, + "loss": 0.6, + "step": 2836 + }, + { + "epoch": 0.23040688702996834, + "grad_norm": 5.13610708333408, + "learning_rate": 4.491809761806964e-06, + "loss": 0.5496, + "step": 2837 + }, + { + "epoch": 0.23048810200600992, + "grad_norm": 5.910698565673017, + "learning_rate": 4.491412263238605e-06, + "loss": 0.4669, + "step": 2838 + }, + { + "epoch": 0.2305693169820515, + "grad_norm": 3.4340630962930305, + "learning_rate": 4.4910146268750555e-06, + "loss": 0.5895, + "step": 2839 + }, + { + "epoch": 0.23065053195809307, + "grad_norm": 4.029802742225141, + "learning_rate": 4.490616852743832e-06, + "loss": 0.5887, + "step": 2840 + }, + { + "epoch": 0.23073174693413465, + "grad_norm": 5.37046614141328, + "learning_rate": 4.490218940872457e-06, + "loss": 0.6715, + "step": 2841 + }, + { + "epoch": 0.23081296191017622, + "grad_norm": 6.574976668750347, + "learning_rate": 4.489820891288466e-06, + "loss": 0.6401, + "step": 2842 + }, + { + "epoch": 0.23089417688621783, + "grad_norm": 4.12936501875235, + "learning_rate": 4.489422704019399e-06, + "loss": 0.6287, + "step": 2843 + }, + { + "epoch": 0.2309753918622594, + "grad_norm": 4.078499752253989, + "learning_rate": 4.489024379092809e-06, + "loss": 0.5193, + "step": 2844 + }, + { + "epoch": 0.23105660683830098, + "grad_norm": 8.139115119863513, + "learning_rate": 4.48862591653626e-06, + "loss": 0.415, + "step": 2845 + }, + { + "epoch": 0.23113782181434256, + "grad_norm": 6.159373760750005, + "learning_rate": 4.488227316377322e-06, + "loss": 0.5214, + "step": 2846 + }, + { + "epoch": 0.23121903679038414, + "grad_norm": 3.4411255569736956, + "learning_rate": 4.487828578643576e-06, + "loss": 0.541, + "step": 2847 + }, + { + "epoch": 0.23130025176642574, + "grad_norm": 4.121631308793716, + "learning_rate": 4.4874297033626126e-06, + "loss": 0.6158, + "step": 2848 + }, + { + "epoch": 0.23138146674246732, + "grad_norm": 3.8683262343429647, + "learning_rate": 4.487030690562032e-06, + "loss": 0.5957, + "step": 2849 + }, + { + "epoch": 0.2314626817185089, + "grad_norm": 4.36066324441673, + "learning_rate": 4.486631540269445e-06, + "loss": 0.5908, + "step": 2850 + }, + { + "epoch": 0.23154389669455047, + "grad_norm": 23.856582803005605, + "learning_rate": 4.486232252512468e-06, + "loss": 0.6196, + "step": 2851 + }, + { + "epoch": 0.23162511167059205, + "grad_norm": 4.565500746372573, + "learning_rate": 4.485832827318733e-06, + "loss": 0.517, + "step": 2852 + }, + { + "epoch": 0.23170632664663363, + "grad_norm": 5.415515197041984, + "learning_rate": 4.485433264715874e-06, + "loss": 0.7045, + "step": 2853 + }, + { + "epoch": 0.23178754162267523, + "grad_norm": 7.611491735007953, + "learning_rate": 4.485033564731542e-06, + "loss": 0.632, + "step": 2854 + }, + { + "epoch": 0.2318687565987168, + "grad_norm": 5.094950046339584, + "learning_rate": 4.484633727393393e-06, + "loss": 0.5768, + "step": 2855 + }, + { + "epoch": 0.23194997157475838, + "grad_norm": 46.65401837853409, + "learning_rate": 4.484233752729093e-06, + "loss": 0.5038, + "step": 2856 + }, + { + "epoch": 0.23203118655079996, + "grad_norm": 25.235913155436222, + "learning_rate": 4.483833640766319e-06, + "loss": 0.6252, + "step": 2857 + }, + { + "epoch": 0.23211240152684154, + "grad_norm": 7.36939234328555, + "learning_rate": 4.4834333915327564e-06, + "loss": 0.6505, + "step": 2858 + }, + { + "epoch": 0.23219361650288314, + "grad_norm": 4.830643690141342, + "learning_rate": 4.483033005056101e-06, + "loss": 0.5895, + "step": 2859 + }, + { + "epoch": 0.23227483147892472, + "grad_norm": 9.183888328065596, + "learning_rate": 4.482632481364055e-06, + "loss": 0.4614, + "step": 2860 + }, + { + "epoch": 0.2323560464549663, + "grad_norm": 4.613246918041221, + "learning_rate": 4.482231820484336e-06, + "loss": 0.6206, + "step": 2861 + }, + { + "epoch": 0.23243726143100787, + "grad_norm": 7.324346436954002, + "learning_rate": 4.4818310224446645e-06, + "loss": 0.4812, + "step": 2862 + }, + { + "epoch": 0.23251847640704945, + "grad_norm": 6.5981856748393755, + "learning_rate": 4.481430087272776e-06, + "loss": 0.6606, + "step": 2863 + }, + { + "epoch": 0.23259969138309103, + "grad_norm": 4.619941208113225, + "learning_rate": 4.481029014996412e-06, + "loss": 0.4754, + "step": 2864 + }, + { + "epoch": 0.23268090635913263, + "grad_norm": 5.7207154634398085, + "learning_rate": 4.480627805643324e-06, + "loss": 0.5482, + "step": 2865 + }, + { + "epoch": 0.2327621213351742, + "grad_norm": 7.298594910122886, + "learning_rate": 4.480226459241275e-06, + "loss": 0.6597, + "step": 2866 + }, + { + "epoch": 0.2328433363112158, + "grad_norm": 9.363398730309251, + "learning_rate": 4.479824975818034e-06, + "loss": 0.6121, + "step": 2867 + }, + { + "epoch": 0.23292455128725736, + "grad_norm": 4.781902048228461, + "learning_rate": 4.4794233554013835e-06, + "loss": 0.5151, + "step": 2868 + }, + { + "epoch": 0.23300576626329894, + "grad_norm": 2.949065575080919, + "learning_rate": 4.479021598019113e-06, + "loss": 0.5063, + "step": 2869 + }, + { + "epoch": 0.23308698123934055, + "grad_norm": 4.021888781896201, + "learning_rate": 4.4786197036990205e-06, + "loss": 0.5932, + "step": 2870 + }, + { + "epoch": 0.23316819621538212, + "grad_norm": 5.096647466230962, + "learning_rate": 4.478217672468918e-06, + "loss": 0.6553, + "step": 2871 + }, + { + "epoch": 0.2332494111914237, + "grad_norm": 3.3544437426357434, + "learning_rate": 4.47781550435662e-06, + "loss": 0.7021, + "step": 2872 + }, + { + "epoch": 0.23333062616746528, + "grad_norm": 3.955029448497719, + "learning_rate": 4.4774131993899585e-06, + "loss": 0.5843, + "step": 2873 + }, + { + "epoch": 0.23341184114350685, + "grad_norm": 5.718278060740205, + "learning_rate": 4.477010757596768e-06, + "loss": 0.5778, + "step": 2874 + }, + { + "epoch": 0.23349305611954843, + "grad_norm": 3.558212336992993, + "learning_rate": 4.4766081790048965e-06, + "loss": 0.4537, + "step": 2875 + }, + { + "epoch": 0.23357427109559004, + "grad_norm": 3.821349848295602, + "learning_rate": 4.4762054636422005e-06, + "loss": 0.5913, + "step": 2876 + }, + { + "epoch": 0.2336554860716316, + "grad_norm": 5.178508133946879, + "learning_rate": 4.475802611536545e-06, + "loss": 0.5516, + "step": 2877 + }, + { + "epoch": 0.2337367010476732, + "grad_norm": 4.64155733262824, + "learning_rate": 4.475399622715805e-06, + "loss": 0.5186, + "step": 2878 + }, + { + "epoch": 0.23381791602371477, + "grad_norm": 5.700083589065183, + "learning_rate": 4.474996497207866e-06, + "loss": 0.6616, + "step": 2879 + }, + { + "epoch": 0.23389913099975634, + "grad_norm": 5.357424859826312, + "learning_rate": 4.4745932350406225e-06, + "loss": 0.6395, + "step": 2880 + }, + { + "epoch": 0.23398034597579795, + "grad_norm": 5.301441690670492, + "learning_rate": 4.474189836241976e-06, + "loss": 0.5116, + "step": 2881 + }, + { + "epoch": 0.23406156095183953, + "grad_norm": 3.7935528708576784, + "learning_rate": 4.473786300839843e-06, + "loss": 0.537, + "step": 2882 + }, + { + "epoch": 0.2341427759278811, + "grad_norm": 4.6354063500549945, + "learning_rate": 4.4733826288621435e-06, + "loss": 0.4027, + "step": 2883 + }, + { + "epoch": 0.23422399090392268, + "grad_norm": 4.258268978553181, + "learning_rate": 4.47297882033681e-06, + "loss": 0.5345, + "step": 2884 + }, + { + "epoch": 0.23430520587996426, + "grad_norm": 4.645513866526441, + "learning_rate": 4.472574875291784e-06, + "loss": 0.3998, + "step": 2885 + }, + { + "epoch": 0.23438642085600583, + "grad_norm": 5.592577886479116, + "learning_rate": 4.472170793755016e-06, + "loss": 0.5404, + "step": 2886 + }, + { + "epoch": 0.23446763583204744, + "grad_norm": 4.864175858056895, + "learning_rate": 4.471766575754467e-06, + "loss": 0.4769, + "step": 2887 + }, + { + "epoch": 0.23454885080808902, + "grad_norm": 5.3559791822991984, + "learning_rate": 4.471362221318106e-06, + "loss": 0.5522, + "step": 2888 + }, + { + "epoch": 0.2346300657841306, + "grad_norm": 3.985841294995745, + "learning_rate": 4.470957730473913e-06, + "loss": 0.5913, + "step": 2889 + }, + { + "epoch": 0.23471128076017217, + "grad_norm": 7.9982149419628765, + "learning_rate": 4.470553103249876e-06, + "loss": 0.5101, + "step": 2890 + }, + { + "epoch": 0.23479249573621375, + "grad_norm": 3.748824385107628, + "learning_rate": 4.470148339673993e-06, + "loss": 0.5213, + "step": 2891 + }, + { + "epoch": 0.23487371071225535, + "grad_norm": 4.816022996866285, + "learning_rate": 4.469743439774272e-06, + "loss": 0.5209, + "step": 2892 + }, + { + "epoch": 0.23495492568829693, + "grad_norm": 5.863963017632638, + "learning_rate": 4.46933840357873e-06, + "loss": 0.5256, + "step": 2893 + }, + { + "epoch": 0.2350361406643385, + "grad_norm": 4.876008648396625, + "learning_rate": 4.468933231115393e-06, + "loss": 0.53, + "step": 2894 + }, + { + "epoch": 0.23511735564038008, + "grad_norm": 6.008481144166018, + "learning_rate": 4.468527922412297e-06, + "loss": 0.5812, + "step": 2895 + }, + { + "epoch": 0.23519857061642166, + "grad_norm": 6.418455057806858, + "learning_rate": 4.468122477497486e-06, + "loss": 0.5318, + "step": 2896 + }, + { + "epoch": 0.23527978559246324, + "grad_norm": 2.5240378044740415, + "learning_rate": 4.467716896399017e-06, + "loss": 0.5948, + "step": 2897 + }, + { + "epoch": 0.23536100056850484, + "grad_norm": 4.361122855517343, + "learning_rate": 4.4673111791449515e-06, + "loss": 0.5233, + "step": 2898 + }, + { + "epoch": 0.23544221554454642, + "grad_norm": 4.528523763290337, + "learning_rate": 4.466905325763365e-06, + "loss": 0.6362, + "step": 2899 + }, + { + "epoch": 0.235523430520588, + "grad_norm": 6.538860668916424, + "learning_rate": 4.4664993362823394e-06, + "loss": 0.5745, + "step": 2900 + }, + { + "epoch": 0.23560464549662957, + "grad_norm": 5.847755183433358, + "learning_rate": 4.466093210729967e-06, + "loss": 0.6584, + "step": 2901 + }, + { + "epoch": 0.23568586047267115, + "grad_norm": 4.179474365670219, + "learning_rate": 4.465686949134351e-06, + "loss": 0.6396, + "step": 2902 + }, + { + "epoch": 0.23576707544871275, + "grad_norm": 5.774986352129162, + "learning_rate": 4.465280551523601e-06, + "loss": 0.5861, + "step": 2903 + }, + { + "epoch": 0.23584829042475433, + "grad_norm": 3.2478970012240325, + "learning_rate": 4.464874017925837e-06, + "loss": 0.6337, + "step": 2904 + }, + { + "epoch": 0.2359295054007959, + "grad_norm": 6.068452434980089, + "learning_rate": 4.46446734836919e-06, + "loss": 0.6407, + "step": 2905 + }, + { + "epoch": 0.23601072037683749, + "grad_norm": 3.2686213630532026, + "learning_rate": 4.4640605428818e-06, + "loss": 0.6955, + "step": 2906 + }, + { + "epoch": 0.23609193535287906, + "grad_norm": 4.07905119935594, + "learning_rate": 4.463653601491815e-06, + "loss": 0.4519, + "step": 2907 + }, + { + "epoch": 0.23617315032892067, + "grad_norm": 6.169592515743247, + "learning_rate": 4.463246524227393e-06, + "loss": 0.523, + "step": 2908 + }, + { + "epoch": 0.23625436530496224, + "grad_norm": 4.321690950129896, + "learning_rate": 4.462839311116702e-06, + "loss": 0.522, + "step": 2909 + }, + { + "epoch": 0.23633558028100382, + "grad_norm": 7.237413918755258, + "learning_rate": 4.462431962187919e-06, + "loss": 0.747, + "step": 2910 + }, + { + "epoch": 0.2364167952570454, + "grad_norm": 3.734173317333828, + "learning_rate": 4.46202447746923e-06, + "loss": 0.6299, + "step": 2911 + }, + { + "epoch": 0.23649801023308697, + "grad_norm": 5.880101867809025, + "learning_rate": 4.461616856988831e-06, + "loss": 0.6253, + "step": 2912 + }, + { + "epoch": 0.23657922520912855, + "grad_norm": 4.370253406800247, + "learning_rate": 4.461209100774928e-06, + "loss": 0.5456, + "step": 2913 + }, + { + "epoch": 0.23666044018517016, + "grad_norm": 4.39926590709237, + "learning_rate": 4.460801208855734e-06, + "loss": 0.427, + "step": 2914 + }, + { + "epoch": 0.23674165516121173, + "grad_norm": 6.280728294974258, + "learning_rate": 4.4603931812594735e-06, + "loss": 0.7197, + "step": 2915 + }, + { + "epoch": 0.2368228701372533, + "grad_norm": 6.982111134101197, + "learning_rate": 4.45998501801438e-06, + "loss": 0.4599, + "step": 2916 + }, + { + "epoch": 0.2369040851132949, + "grad_norm": 7.601832967298451, + "learning_rate": 4.459576719148697e-06, + "loss": 0.4797, + "step": 2917 + }, + { + "epoch": 0.23698530008933646, + "grad_norm": 5.8714956548009924, + "learning_rate": 4.459168284690676e-06, + "loss": 0.5447, + "step": 2918 + }, + { + "epoch": 0.23706651506537807, + "grad_norm": 7.1059975498172685, + "learning_rate": 4.458759714668578e-06, + "loss": 0.6044, + "step": 2919 + }, + { + "epoch": 0.23714773004141965, + "grad_norm": 4.612443635513808, + "learning_rate": 4.458351009110675e-06, + "loss": 0.5054, + "step": 2920 + }, + { + "epoch": 0.23722894501746122, + "grad_norm": 4.117158983616684, + "learning_rate": 4.457942168045246e-06, + "loss": 0.6243, + "step": 2921 + }, + { + "epoch": 0.2373101599935028, + "grad_norm": 4.243517625792392, + "learning_rate": 4.457533191500581e-06, + "loss": 0.6199, + "step": 2922 + }, + { + "epoch": 0.23739137496954438, + "grad_norm": 6.196728526337707, + "learning_rate": 4.45712407950498e-06, + "loss": 0.4745, + "step": 2923 + }, + { + "epoch": 0.23747258994558595, + "grad_norm": 4.089855406887254, + "learning_rate": 4.45671483208675e-06, + "loss": 0.5762, + "step": 2924 + }, + { + "epoch": 0.23755380492162756, + "grad_norm": 8.680736194237051, + "learning_rate": 4.45630544927421e-06, + "loss": 0.6403, + "step": 2925 + }, + { + "epoch": 0.23763501989766914, + "grad_norm": 3.4364871224606452, + "learning_rate": 4.4558959310956865e-06, + "loss": 0.5683, + "step": 2926 + }, + { + "epoch": 0.2377162348737107, + "grad_norm": 4.94476918130175, + "learning_rate": 4.4554862775795146e-06, + "loss": 0.4727, + "step": 2927 + }, + { + "epoch": 0.2377974498497523, + "grad_norm": 3.68514712087398, + "learning_rate": 4.455076488754043e-06, + "loss": 0.5719, + "step": 2928 + }, + { + "epoch": 0.23787866482579387, + "grad_norm": 4.764481615544404, + "learning_rate": 4.4546665646476254e-06, + "loss": 0.575, + "step": 2929 + }, + { + "epoch": 0.23795987980183547, + "grad_norm": 4.8777757605766405, + "learning_rate": 4.4542565052886256e-06, + "loss": 0.6751, + "step": 2930 + }, + { + "epoch": 0.23804109477787705, + "grad_norm": 4.182540781833306, + "learning_rate": 4.45384631070542e-06, + "loss": 0.6563, + "step": 2931 + }, + { + "epoch": 0.23812230975391863, + "grad_norm": 3.5345505370213264, + "learning_rate": 4.453435980926388e-06, + "loss": 0.5779, + "step": 2932 + }, + { + "epoch": 0.2382035247299602, + "grad_norm": 8.12041102050761, + "learning_rate": 4.453025515979926e-06, + "loss": 0.5615, + "step": 2933 + }, + { + "epoch": 0.23828473970600178, + "grad_norm": 6.278197290858452, + "learning_rate": 4.452614915894434e-06, + "loss": 0.4717, + "step": 2934 + }, + { + "epoch": 0.23836595468204336, + "grad_norm": 3.470804618193489, + "learning_rate": 4.452204180698325e-06, + "loss": 0.5222, + "step": 2935 + }, + { + "epoch": 0.23844716965808496, + "grad_norm": 4.736573933269891, + "learning_rate": 4.451793310420017e-06, + "loss": 0.6273, + "step": 2936 + }, + { + "epoch": 0.23852838463412654, + "grad_norm": 6.791656553293532, + "learning_rate": 4.451382305087943e-06, + "loss": 0.6002, + "step": 2937 + }, + { + "epoch": 0.23860959961016812, + "grad_norm": 4.478548259706515, + "learning_rate": 4.450971164730541e-06, + "loss": 0.4814, + "step": 2938 + }, + { + "epoch": 0.2386908145862097, + "grad_norm": 3.681202303319773, + "learning_rate": 4.4505598893762595e-06, + "loss": 0.5895, + "step": 2939 + }, + { + "epoch": 0.23877202956225127, + "grad_norm": 4.1452045492879055, + "learning_rate": 4.4501484790535555e-06, + "loss": 0.6095, + "step": 2940 + }, + { + "epoch": 0.23885324453829287, + "grad_norm": 6.265763596141227, + "learning_rate": 4.449736933790899e-06, + "loss": 0.6445, + "step": 2941 + }, + { + "epoch": 0.23893445951433445, + "grad_norm": 3.195443239329471, + "learning_rate": 4.449325253616765e-06, + "loss": 0.6796, + "step": 2942 + }, + { + "epoch": 0.23901567449037603, + "grad_norm": 4.681861481447409, + "learning_rate": 4.448913438559641e-06, + "loss": 0.5013, + "step": 2943 + }, + { + "epoch": 0.2390968894664176, + "grad_norm": 3.9893399361235873, + "learning_rate": 4.448501488648021e-06, + "loss": 0.4841, + "step": 2944 + }, + { + "epoch": 0.23917810444245918, + "grad_norm": 8.26472685534627, + "learning_rate": 4.448089403910411e-06, + "loss": 0.65, + "step": 2945 + }, + { + "epoch": 0.23925931941850076, + "grad_norm": 7.279481691054026, + "learning_rate": 4.447677184375323e-06, + "loss": 0.6863, + "step": 2946 + }, + { + "epoch": 0.23934053439454236, + "grad_norm": 5.107617453615278, + "learning_rate": 4.447264830071282e-06, + "loss": 0.4314, + "step": 2947 + }, + { + "epoch": 0.23942174937058394, + "grad_norm": 10.45135842568149, + "learning_rate": 4.446852341026822e-06, + "loss": 0.5368, + "step": 2948 + }, + { + "epoch": 0.23950296434662552, + "grad_norm": 3.384814108308006, + "learning_rate": 4.4464397172704825e-06, + "loss": 0.5992, + "step": 2949 + }, + { + "epoch": 0.2395841793226671, + "grad_norm": 3.6038417485455723, + "learning_rate": 4.446026958830816e-06, + "loss": 0.6577, + "step": 2950 + }, + { + "epoch": 0.23966539429870867, + "grad_norm": 5.105585351046193, + "learning_rate": 4.4456140657363824e-06, + "loss": 0.5815, + "step": 2951 + }, + { + "epoch": 0.23974660927475028, + "grad_norm": 6.436128437554316, + "learning_rate": 4.445201038015753e-06, + "loss": 0.6109, + "step": 2952 + }, + { + "epoch": 0.23982782425079185, + "grad_norm": 6.0286114195287, + "learning_rate": 4.4447878756975074e-06, + "loss": 0.5737, + "step": 2953 + }, + { + "epoch": 0.23990903922683343, + "grad_norm": 6.340347602254372, + "learning_rate": 4.444374578810233e-06, + "loss": 0.6333, + "step": 2954 + }, + { + "epoch": 0.239990254202875, + "grad_norm": 3.611889348535864, + "learning_rate": 4.443961147382528e-06, + "loss": 0.5584, + "step": 2955 + }, + { + "epoch": 0.24007146917891659, + "grad_norm": 4.508170768419845, + "learning_rate": 4.4435475814429995e-06, + "loss": 0.5662, + "step": 2956 + }, + { + "epoch": 0.24015268415495816, + "grad_norm": 3.857621418576937, + "learning_rate": 4.4431338810202655e-06, + "loss": 0.5413, + "step": 2957 + }, + { + "epoch": 0.24023389913099977, + "grad_norm": 5.167525653044904, + "learning_rate": 4.4427200461429494e-06, + "loss": 0.5279, + "step": 2958 + }, + { + "epoch": 0.24031511410704134, + "grad_norm": 9.873321863622351, + "learning_rate": 4.442306076839689e-06, + "loss": 0.4993, + "step": 2959 + }, + { + "epoch": 0.24039632908308292, + "grad_norm": 9.73450941091562, + "learning_rate": 4.441891973139127e-06, + "loss": 0.5868, + "step": 2960 + }, + { + "epoch": 0.2404775440591245, + "grad_norm": 3.7053040122979297, + "learning_rate": 4.441477735069918e-06, + "loss": 0.6352, + "step": 2961 + }, + { + "epoch": 0.24055875903516608, + "grad_norm": 5.920625186279193, + "learning_rate": 4.441063362660726e-06, + "loss": 0.6923, + "step": 2962 + }, + { + "epoch": 0.24063997401120768, + "grad_norm": 6.875957645683971, + "learning_rate": 4.44064885594022e-06, + "loss": 0.5829, + "step": 2963 + }, + { + "epoch": 0.24072118898724926, + "grad_norm": 6.3088089208901215, + "learning_rate": 4.440234214937086e-06, + "loss": 0.4949, + "step": 2964 + }, + { + "epoch": 0.24080240396329083, + "grad_norm": 6.283915304694119, + "learning_rate": 4.439819439680012e-06, + "loss": 0.5373, + "step": 2965 + }, + { + "epoch": 0.2408836189393324, + "grad_norm": 5.544344295999567, + "learning_rate": 4.439404530197699e-06, + "loss": 0.5615, + "step": 2966 + }, + { + "epoch": 0.240964833915374, + "grad_norm": 5.522735082552348, + "learning_rate": 4.438989486518856e-06, + "loss": 0.496, + "step": 2967 + }, + { + "epoch": 0.24104604889141557, + "grad_norm": 6.134276115094066, + "learning_rate": 4.438574308672203e-06, + "loss": 0.3989, + "step": 2968 + }, + { + "epoch": 0.24112726386745717, + "grad_norm": 18.34187318276363, + "learning_rate": 4.438158996686468e-06, + "loss": 0.4992, + "step": 2969 + }, + { + "epoch": 0.24120847884349875, + "grad_norm": 5.7407597574290605, + "learning_rate": 4.4377435505903876e-06, + "loss": 0.7617, + "step": 2970 + }, + { + "epoch": 0.24128969381954032, + "grad_norm": 5.129242694583421, + "learning_rate": 4.4373279704127095e-06, + "loss": 0.6338, + "step": 2971 + }, + { + "epoch": 0.2413709087955819, + "grad_norm": 5.956882401490225, + "learning_rate": 4.4369122561821885e-06, + "loss": 0.4831, + "step": 2972 + }, + { + "epoch": 0.24145212377162348, + "grad_norm": 4.390822342695221, + "learning_rate": 4.436496407927591e-06, + "loss": 0.5962, + "step": 2973 + }, + { + "epoch": 0.24153333874766508, + "grad_norm": 8.138342846156773, + "learning_rate": 4.436080425677689e-06, + "loss": 0.5129, + "step": 2974 + }, + { + "epoch": 0.24161455372370666, + "grad_norm": 3.692323205845064, + "learning_rate": 4.43566430946127e-06, + "loss": 0.6631, + "step": 2975 + }, + { + "epoch": 0.24169576869974824, + "grad_norm": 4.598524784324876, + "learning_rate": 4.435248059307124e-06, + "loss": 0.5203, + "step": 2976 + }, + { + "epoch": 0.2417769836757898, + "grad_norm": 36.50302303476803, + "learning_rate": 4.434831675244056e-06, + "loss": 0.5976, + "step": 2977 + }, + { + "epoch": 0.2418581986518314, + "grad_norm": 4.225579732329154, + "learning_rate": 4.434415157300875e-06, + "loss": 0.7187, + "step": 2978 + }, + { + "epoch": 0.24193941362787297, + "grad_norm": 6.144364914404981, + "learning_rate": 4.433998505506402e-06, + "loss": 0.5036, + "step": 2979 + }, + { + "epoch": 0.24202062860391457, + "grad_norm": 5.890869286549283, + "learning_rate": 4.433581719889469e-06, + "loss": 0.5174, + "step": 2980 + }, + { + "epoch": 0.24210184357995615, + "grad_norm": 4.666862090401287, + "learning_rate": 4.433164800478914e-06, + "loss": 0.5758, + "step": 2981 + }, + { + "epoch": 0.24218305855599773, + "grad_norm": 4.488080886068684, + "learning_rate": 4.432747747303586e-06, + "loss": 0.5223, + "step": 2982 + }, + { + "epoch": 0.2422642735320393, + "grad_norm": 4.722733122087884, + "learning_rate": 4.432330560392343e-06, + "loss": 0.5231, + "step": 2983 + }, + { + "epoch": 0.24234548850808088, + "grad_norm": 10.302897259600046, + "learning_rate": 4.431913239774052e-06, + "loss": 0.6269, + "step": 2984 + }, + { + "epoch": 0.24242670348412249, + "grad_norm": 4.986694907260506, + "learning_rate": 4.4314957854775895e-06, + "loss": 0.4772, + "step": 2985 + }, + { + "epoch": 0.24250791846016406, + "grad_norm": 28.644633867544297, + "learning_rate": 4.43107819753184e-06, + "loss": 0.4776, + "step": 2986 + }, + { + "epoch": 0.24258913343620564, + "grad_norm": 4.650389954451338, + "learning_rate": 4.4306604759657e-06, + "loss": 0.6489, + "step": 2987 + }, + { + "epoch": 0.24267034841224722, + "grad_norm": 8.110031601217097, + "learning_rate": 4.430242620808073e-06, + "loss": 0.4797, + "step": 2988 + }, + { + "epoch": 0.2427515633882888, + "grad_norm": 4.435012545474071, + "learning_rate": 4.429824632087873e-06, + "loss": 0.5772, + "step": 2989 + }, + { + "epoch": 0.24283277836433037, + "grad_norm": 6.6461768049947505, + "learning_rate": 4.42940650983402e-06, + "loss": 0.6512, + "step": 2990 + }, + { + "epoch": 0.24291399334037198, + "grad_norm": 4.2488570760327935, + "learning_rate": 4.428988254075449e-06, + "loss": 0.4932, + "step": 2991 + }, + { + "epoch": 0.24299520831641355, + "grad_norm": 3.4583725498967657, + "learning_rate": 4.4285698648411005e-06, + "loss": 0.6507, + "step": 2992 + }, + { + "epoch": 0.24307642329245513, + "grad_norm": 8.442412186418991, + "learning_rate": 4.428151342159923e-06, + "loss": 0.7045, + "step": 2993 + }, + { + "epoch": 0.2431576382684967, + "grad_norm": 4.275956058237837, + "learning_rate": 4.427732686060877e-06, + "loss": 0.9022, + "step": 2994 + }, + { + "epoch": 0.24323885324453828, + "grad_norm": 3.5996383051852328, + "learning_rate": 4.427313896572933e-06, + "loss": 0.5063, + "step": 2995 + }, + { + "epoch": 0.2433200682205799, + "grad_norm": 8.226615556870824, + "learning_rate": 4.426894973725066e-06, + "loss": 0.5054, + "step": 2996 + }, + { + "epoch": 0.24340128319662147, + "grad_norm": 4.428674996775757, + "learning_rate": 4.426475917546266e-06, + "loss": 0.5189, + "step": 2997 + }, + { + "epoch": 0.24348249817266304, + "grad_norm": 3.292995498057493, + "learning_rate": 4.426056728065527e-06, + "loss": 0.5175, + "step": 2998 + }, + { + "epoch": 0.24356371314870462, + "grad_norm": 5.396362723485166, + "learning_rate": 4.425637405311857e-06, + "loss": 0.5239, + "step": 2999 + }, + { + "epoch": 0.2436449281247462, + "grad_norm": 4.409324480057902, + "learning_rate": 4.425217949314269e-06, + "loss": 0.5758, + "step": 3000 + }, + { + "epoch": 0.24372614310078777, + "grad_norm": 4.772191402586126, + "learning_rate": 4.424798360101788e-06, + "loss": 0.5178, + "step": 3001 + }, + { + "epoch": 0.24380735807682938, + "grad_norm": 4.70270174443938, + "learning_rate": 4.424378637703448e-06, + "loss": 0.5634, + "step": 3002 + }, + { + "epoch": 0.24388857305287096, + "grad_norm": 4.384411856516953, + "learning_rate": 4.423958782148291e-06, + "loss": 0.6138, + "step": 3003 + }, + { + "epoch": 0.24396978802891253, + "grad_norm": 4.849333757605469, + "learning_rate": 4.423538793465368e-06, + "loss": 0.5399, + "step": 3004 + }, + { + "epoch": 0.2440510030049541, + "grad_norm": 7.683916564358679, + "learning_rate": 4.423118671683741e-06, + "loss": 0.5882, + "step": 3005 + }, + { + "epoch": 0.24413221798099569, + "grad_norm": 6.414015664183622, + "learning_rate": 4.42269841683248e-06, + "loss": 0.501, + "step": 3006 + }, + { + "epoch": 0.2442134329570373, + "grad_norm": 3.445748481334447, + "learning_rate": 4.422278028940664e-06, + "loss": 0.7989, + "step": 3007 + }, + { + "epoch": 0.24429464793307887, + "grad_norm": 3.446395382611435, + "learning_rate": 4.4218575080373825e-06, + "loss": 0.6564, + "step": 3008 + }, + { + "epoch": 0.24437586290912044, + "grad_norm": 7.075485495123882, + "learning_rate": 4.421436854151731e-06, + "loss": 0.5491, + "step": 3009 + }, + { + "epoch": 0.24445707788516202, + "grad_norm": 13.379409500342488, + "learning_rate": 4.421016067312821e-06, + "loss": 0.5192, + "step": 3010 + }, + { + "epoch": 0.2445382928612036, + "grad_norm": 6.268741236760656, + "learning_rate": 4.420595147549764e-06, + "loss": 0.5197, + "step": 3011 + }, + { + "epoch": 0.24461950783724518, + "grad_norm": 4.324676469837308, + "learning_rate": 4.420174094891688e-06, + "loss": 0.6056, + "step": 3012 + }, + { + "epoch": 0.24470072281328678, + "grad_norm": 4.740074003918016, + "learning_rate": 4.419752909367727e-06, + "loss": 0.4472, + "step": 3013 + }, + { + "epoch": 0.24478193778932836, + "grad_norm": 4.345165270453679, + "learning_rate": 4.419331591007025e-06, + "loss": 0.662, + "step": 3014 + }, + { + "epoch": 0.24486315276536993, + "grad_norm": 3.2912342359960913, + "learning_rate": 4.418910139838734e-06, + "loss": 0.6347, + "step": 3015 + }, + { + "epoch": 0.2449443677414115, + "grad_norm": 4.197613659900217, + "learning_rate": 4.418488555892018e-06, + "loss": 0.709, + "step": 3016 + }, + { + "epoch": 0.2450255827174531, + "grad_norm": 6.165218832788323, + "learning_rate": 4.418066839196047e-06, + "loss": 0.5097, + "step": 3017 + }, + { + "epoch": 0.2451067976934947, + "grad_norm": 5.754235502940853, + "learning_rate": 4.4176449897800025e-06, + "loss": 0.4602, + "step": 3018 + }, + { + "epoch": 0.24518801266953627, + "grad_norm": 6.807261983784002, + "learning_rate": 4.417223007673073e-06, + "loss": 0.6116, + "step": 3019 + }, + { + "epoch": 0.24526922764557785, + "grad_norm": 5.874117891239969, + "learning_rate": 4.4168008929044585e-06, + "loss": 0.4809, + "step": 3020 + }, + { + "epoch": 0.24535044262161942, + "grad_norm": 4.771234322125993, + "learning_rate": 4.416378645503366e-06, + "loss": 0.5298, + "step": 3021 + }, + { + "epoch": 0.245431657597661, + "grad_norm": 5.033152836788516, + "learning_rate": 4.415956265499014e-06, + "loss": 0.5011, + "step": 3022 + }, + { + "epoch": 0.24551287257370258, + "grad_norm": 4.892045552702636, + "learning_rate": 4.415533752920629e-06, + "loss": 0.5847, + "step": 3023 + }, + { + "epoch": 0.24559408754974418, + "grad_norm": 3.158494131947643, + "learning_rate": 4.415111107797445e-06, + "loss": 0.5626, + "step": 3024 + }, + { + "epoch": 0.24567530252578576, + "grad_norm": 19.734315350148133, + "learning_rate": 4.414688330158709e-06, + "loss": 0.6228, + "step": 3025 + }, + { + "epoch": 0.24575651750182734, + "grad_norm": 3.975333547273237, + "learning_rate": 4.4142654200336735e-06, + "loss": 0.6962, + "step": 3026 + }, + { + "epoch": 0.24583773247786891, + "grad_norm": 5.615922654297775, + "learning_rate": 4.413842377451602e-06, + "loss": 0.541, + "step": 3027 + }, + { + "epoch": 0.2459189474539105, + "grad_norm": 9.003336691465453, + "learning_rate": 4.4134192024417674e-06, + "loss": 0.4546, + "step": 3028 + }, + { + "epoch": 0.2460001624299521, + "grad_norm": 3.891965066900615, + "learning_rate": 4.412995895033449e-06, + "loss": 0.4849, + "step": 3029 + }, + { + "epoch": 0.24608137740599367, + "grad_norm": 4.878722949580337, + "learning_rate": 4.412572455255942e-06, + "loss": 0.615, + "step": 3030 + }, + { + "epoch": 0.24616259238203525, + "grad_norm": 7.419504517267929, + "learning_rate": 4.412148883138541e-06, + "loss": 0.5744, + "step": 3031 + }, + { + "epoch": 0.24624380735807683, + "grad_norm": 15.433110249433737, + "learning_rate": 4.4117251787105566e-06, + "loss": 0.4246, + "step": 3032 + }, + { + "epoch": 0.2463250223341184, + "grad_norm": 4.626381274322953, + "learning_rate": 4.411301342001309e-06, + "loss": 0.4779, + "step": 3033 + }, + { + "epoch": 0.24640623731015998, + "grad_norm": 3.140055018404333, + "learning_rate": 4.4108773730401235e-06, + "loss": 0.5733, + "step": 3034 + }, + { + "epoch": 0.24648745228620159, + "grad_norm": 5.288841429027363, + "learning_rate": 4.410453271856337e-06, + "loss": 0.5525, + "step": 3035 + }, + { + "epoch": 0.24656866726224316, + "grad_norm": 5.257436077679411, + "learning_rate": 4.410029038479295e-06, + "loss": 0.5659, + "step": 3036 + }, + { + "epoch": 0.24664988223828474, + "grad_norm": 5.634845248429157, + "learning_rate": 4.409604672938352e-06, + "loss": 0.5136, + "step": 3037 + }, + { + "epoch": 0.24673109721432632, + "grad_norm": 6.3457504654963195, + "learning_rate": 4.409180175262872e-06, + "loss": 0.5139, + "step": 3038 + }, + { + "epoch": 0.2468123121903679, + "grad_norm": 6.705941236376485, + "learning_rate": 4.408755545482229e-06, + "loss": 0.6184, + "step": 3039 + }, + { + "epoch": 0.2468935271664095, + "grad_norm": 4.672344302149794, + "learning_rate": 4.408330783625803e-06, + "loss": 0.6296, + "step": 3040 + }, + { + "epoch": 0.24697474214245108, + "grad_norm": 7.4047101228444685, + "learning_rate": 4.407905889722987e-06, + "loss": 0.3766, + "step": 3041 + }, + { + "epoch": 0.24705595711849265, + "grad_norm": 4.2484417683054545, + "learning_rate": 4.407480863803181e-06, + "loss": 0.5838, + "step": 3042 + }, + { + "epoch": 0.24713717209453423, + "grad_norm": 5.209047094212918, + "learning_rate": 4.407055705895794e-06, + "loss": 0.6218, + "step": 3043 + }, + { + "epoch": 0.2472183870705758, + "grad_norm": 4.0255565337154025, + "learning_rate": 4.4066304160302455e-06, + "loss": 0.5566, + "step": 3044 + }, + { + "epoch": 0.24729960204661738, + "grad_norm": 4.333677201999764, + "learning_rate": 4.4062049942359634e-06, + "loss": 0.5295, + "step": 3045 + }, + { + "epoch": 0.247380817022659, + "grad_norm": 4.229077150168856, + "learning_rate": 4.405779440542383e-06, + "loss": 0.7309, + "step": 3046 + }, + { + "epoch": 0.24746203199870057, + "grad_norm": 7.678488339873706, + "learning_rate": 4.405353754978952e-06, + "loss": 0.5675, + "step": 3047 + }, + { + "epoch": 0.24754324697474214, + "grad_norm": 5.135214318233568, + "learning_rate": 4.404927937575125e-06, + "loss": 0.6678, + "step": 3048 + }, + { + "epoch": 0.24762446195078372, + "grad_norm": 5.25362634341942, + "learning_rate": 4.4045019883603676e-06, + "loss": 0.544, + "step": 3049 + }, + { + "epoch": 0.2477056769268253, + "grad_norm": 5.0585319314475194, + "learning_rate": 4.40407590736415e-06, + "loss": 0.6509, + "step": 3050 + }, + { + "epoch": 0.2477868919028669, + "grad_norm": 6.248691395394384, + "learning_rate": 4.403649694615959e-06, + "loss": 0.5899, + "step": 3051 + }, + { + "epoch": 0.24786810687890848, + "grad_norm": 10.336963003211816, + "learning_rate": 4.403223350145283e-06, + "loss": 0.4696, + "step": 3052 + }, + { + "epoch": 0.24794932185495006, + "grad_norm": 4.541662859187703, + "learning_rate": 4.402796873981623e-06, + "loss": 0.4006, + "step": 3053 + }, + { + "epoch": 0.24803053683099163, + "grad_norm": 6.296840627991974, + "learning_rate": 4.402370266154491e-06, + "loss": 0.5062, + "step": 3054 + }, + { + "epoch": 0.2481117518070332, + "grad_norm": 5.894042406789113, + "learning_rate": 4.401943526693404e-06, + "loss": 0.7281, + "step": 3055 + }, + { + "epoch": 0.2481929667830748, + "grad_norm": 5.656446562333912, + "learning_rate": 4.401516655627891e-06, + "loss": 0.6206, + "step": 3056 + }, + { + "epoch": 0.2482741817591164, + "grad_norm": 5.686281684309384, + "learning_rate": 4.401089652987489e-06, + "loss": 0.4256, + "step": 3057 + }, + { + "epoch": 0.24835539673515797, + "grad_norm": 6.0150133514660284, + "learning_rate": 4.4006625188017445e-06, + "loss": 0.502, + "step": 3058 + }, + { + "epoch": 0.24843661171119955, + "grad_norm": 5.249632475937014, + "learning_rate": 4.400235253100214e-06, + "loss": 0.5524, + "step": 3059 + }, + { + "epoch": 0.24851782668724112, + "grad_norm": 4.835925595719573, + "learning_rate": 4.399807855912459e-06, + "loss": 0.6269, + "step": 3060 + }, + { + "epoch": 0.2485990416632827, + "grad_norm": 4.606327993826141, + "learning_rate": 4.3993803272680555e-06, + "loss": 0.6161, + "step": 3061 + }, + { + "epoch": 0.2486802566393243, + "grad_norm": 7.236306200132777, + "learning_rate": 4.398952667196585e-06, + "loss": 0.479, + "step": 3062 + }, + { + "epoch": 0.24876147161536588, + "grad_norm": 5.858172893023612, + "learning_rate": 4.398524875727641e-06, + "loss": 0.568, + "step": 3063 + }, + { + "epoch": 0.24884268659140746, + "grad_norm": 4.056227046494207, + "learning_rate": 4.398096952890823e-06, + "loss": 0.631, + "step": 3064 + }, + { + "epoch": 0.24892390156744904, + "grad_norm": 8.0717170292666, + "learning_rate": 4.397668898715743e-06, + "loss": 0.5397, + "step": 3065 + }, + { + "epoch": 0.2490051165434906, + "grad_norm": 6.347095752035782, + "learning_rate": 4.397240713232016e-06, + "loss": 0.5774, + "step": 3066 + }, + { + "epoch": 0.2490863315195322, + "grad_norm": 4.659148068120233, + "learning_rate": 4.3968123964692745e-06, + "loss": 0.4825, + "step": 3067 + }, + { + "epoch": 0.2491675464955738, + "grad_norm": 5.987640448722061, + "learning_rate": 4.396383948457153e-06, + "loss": 0.6587, + "step": 3068 + }, + { + "epoch": 0.24924876147161537, + "grad_norm": 3.9258521372891573, + "learning_rate": 4.395955369225299e-06, + "loss": 0.8384, + "step": 3069 + }, + { + "epoch": 0.24932997644765695, + "grad_norm": 5.296577196209221, + "learning_rate": 4.395526658803367e-06, + "loss": 0.5995, + "step": 3070 + }, + { + "epoch": 0.24941119142369853, + "grad_norm": 6.077098107367331, + "learning_rate": 4.395097817221023e-06, + "loss": 0.5141, + "step": 3071 + }, + { + "epoch": 0.2494924063997401, + "grad_norm": 9.074255129254627, + "learning_rate": 4.39466884450794e-06, + "loss": 0.5545, + "step": 3072 + }, + { + "epoch": 0.2495736213757817, + "grad_norm": 3.837360404731743, + "learning_rate": 4.3942397406937996e-06, + "loss": 0.7089, + "step": 3073 + }, + { + "epoch": 0.24965483635182328, + "grad_norm": 24.0705694528878, + "learning_rate": 4.393810505808294e-06, + "loss": 0.685, + "step": 3074 + }, + { + "epoch": 0.24973605132786486, + "grad_norm": 29.345212216374478, + "learning_rate": 4.393381139881125e-06, + "loss": 0.5362, + "step": 3075 + }, + { + "epoch": 0.24981726630390644, + "grad_norm": 2.6160968338209645, + "learning_rate": 4.392951642942001e-06, + "loss": 0.5189, + "step": 3076 + }, + { + "epoch": 0.24989848127994801, + "grad_norm": 3.2671386577254586, + "learning_rate": 4.392522015020643e-06, + "loss": 0.526, + "step": 3077 + }, + { + "epoch": 0.2499796962559896, + "grad_norm": 3.4560985141832905, + "learning_rate": 4.392092256146776e-06, + "loss": 0.484, + "step": 3078 + }, + { + "epoch": 0.25006091123203117, + "grad_norm": 4.1989067824587405, + "learning_rate": 4.391662366350139e-06, + "loss": 0.482, + "step": 3079 + }, + { + "epoch": 0.25014212620807275, + "grad_norm": 5.326290581465668, + "learning_rate": 4.3912323456604785e-06, + "loss": 0.6224, + "step": 3080 + }, + { + "epoch": 0.2502233411841144, + "grad_norm": 7.502364694548779, + "learning_rate": 4.390802194107548e-06, + "loss": 0.5254, + "step": 3081 + }, + { + "epoch": 0.25030455616015596, + "grad_norm": 4.007678084848024, + "learning_rate": 4.390371911721113e-06, + "loss": 0.6956, + "step": 3082 + }, + { + "epoch": 0.25038577113619753, + "grad_norm": 6.194239862786131, + "learning_rate": 4.389941498530946e-06, + "loss": 0.6204, + "step": 3083 + }, + { + "epoch": 0.2504669861122391, + "grad_norm": 5.28161752223026, + "learning_rate": 4.38951095456683e-06, + "loss": 0.6249, + "step": 3084 + }, + { + "epoch": 0.2505482010882807, + "grad_norm": 6.6004264867031415, + "learning_rate": 4.389080279858556e-06, + "loss": 0.6299, + "step": 3085 + }, + { + "epoch": 0.25062941606432226, + "grad_norm": 4.08243439857223, + "learning_rate": 4.388649474435925e-06, + "loss": 0.5395, + "step": 3086 + }, + { + "epoch": 0.25071063104036384, + "grad_norm": 10.991487088043705, + "learning_rate": 4.388218538328746e-06, + "loss": 0.4487, + "step": 3087 + }, + { + "epoch": 0.2507918460164054, + "grad_norm": 5.374906431529036, + "learning_rate": 4.387787471566837e-06, + "loss": 0.6908, + "step": 3088 + }, + { + "epoch": 0.250873060992447, + "grad_norm": 6.4091647653829344, + "learning_rate": 4.387356274180025e-06, + "loss": 0.559, + "step": 3089 + }, + { + "epoch": 0.25095427596848857, + "grad_norm": 3.189931420557015, + "learning_rate": 4.386924946198148e-06, + "loss": 0.5294, + "step": 3090 + }, + { + "epoch": 0.25103549094453015, + "grad_norm": 3.881260853633071, + "learning_rate": 4.386493487651052e-06, + "loss": 0.7014, + "step": 3091 + }, + { + "epoch": 0.2511167059205718, + "grad_norm": 9.67421505771052, + "learning_rate": 4.38606189856859e-06, + "loss": 0.506, + "step": 3092 + }, + { + "epoch": 0.25119792089661336, + "grad_norm": 12.331043817036711, + "learning_rate": 4.385630178980627e-06, + "loss": 0.539, + "step": 3093 + }, + { + "epoch": 0.25127913587265494, + "grad_norm": 4.246731275050449, + "learning_rate": 4.385198328917034e-06, + "loss": 0.5523, + "step": 3094 + }, + { + "epoch": 0.2513603508486965, + "grad_norm": 5.769645319972767, + "learning_rate": 4.384766348407695e-06, + "loss": 0.5281, + "step": 3095 + }, + { + "epoch": 0.2514415658247381, + "grad_norm": 4.286081820286076, + "learning_rate": 4.3843342374825e-06, + "loss": 0.5954, + "step": 3096 + }, + { + "epoch": 0.25152278080077967, + "grad_norm": 3.6620430541424187, + "learning_rate": 4.383901996171348e-06, + "loss": 0.809, + "step": 3097 + }, + { + "epoch": 0.25160399577682124, + "grad_norm": 4.730554157695557, + "learning_rate": 4.383469624504148e-06, + "loss": 0.5542, + "step": 3098 + }, + { + "epoch": 0.2516852107528628, + "grad_norm": 5.354061606302798, + "learning_rate": 4.3830371225108185e-06, + "loss": 0.4754, + "step": 3099 + }, + { + "epoch": 0.2517664257289044, + "grad_norm": 7.122075601960895, + "learning_rate": 4.382604490221286e-06, + "loss": 0.4081, + "step": 3100 + }, + { + "epoch": 0.251847640704946, + "grad_norm": 4.487864560668316, + "learning_rate": 4.382171727665486e-06, + "loss": 0.71, + "step": 3101 + }, + { + "epoch": 0.25192885568098755, + "grad_norm": 5.423045386515985, + "learning_rate": 4.381738834873364e-06, + "loss": 0.518, + "step": 3102 + }, + { + "epoch": 0.2520100706570292, + "grad_norm": 4.117696423281904, + "learning_rate": 4.381305811874873e-06, + "loss": 0.3472, + "step": 3103 + }, + { + "epoch": 0.25209128563307076, + "grad_norm": 4.590924951658724, + "learning_rate": 4.3808726586999766e-06, + "loss": 0.6369, + "step": 3104 + }, + { + "epoch": 0.25217250060911234, + "grad_norm": 6.69015366947571, + "learning_rate": 4.380439375378646e-06, + "loss": 0.5335, + "step": 3105 + }, + { + "epoch": 0.2522537155851539, + "grad_norm": 9.103381660915732, + "learning_rate": 4.380005961940864e-06, + "loss": 0.4577, + "step": 3106 + }, + { + "epoch": 0.2523349305611955, + "grad_norm": 7.434831084210403, + "learning_rate": 4.379572418416619e-06, + "loss": 0.5355, + "step": 3107 + }, + { + "epoch": 0.25241614553723707, + "grad_norm": 8.173889824189096, + "learning_rate": 4.37913874483591e-06, + "loss": 0.4481, + "step": 3108 + }, + { + "epoch": 0.25249736051327865, + "grad_norm": 10.499303258759314, + "learning_rate": 4.378704941228746e-06, + "loss": 0.5386, + "step": 3109 + }, + { + "epoch": 0.2525785754893202, + "grad_norm": 5.028078700952843, + "learning_rate": 4.378271007625141e-06, + "loss": 0.4977, + "step": 3110 + }, + { + "epoch": 0.2526597904653618, + "grad_norm": 4.3897095234161245, + "learning_rate": 4.377836944055124e-06, + "loss": 0.5747, + "step": 3111 + }, + { + "epoch": 0.2527410054414034, + "grad_norm": 5.593416675150363, + "learning_rate": 4.377402750548729e-06, + "loss": 0.6102, + "step": 3112 + }, + { + "epoch": 0.25282222041744495, + "grad_norm": 5.376961891531233, + "learning_rate": 4.376968427135999e-06, + "loss": 0.6942, + "step": 3113 + }, + { + "epoch": 0.2529034353934866, + "grad_norm": 7.126459999216967, + "learning_rate": 4.376533973846988e-06, + "loss": 0.7788, + "step": 3114 + }, + { + "epoch": 0.25298465036952816, + "grad_norm": 4.854425202352291, + "learning_rate": 4.376099390711758e-06, + "loss": 0.4626, + "step": 3115 + }, + { + "epoch": 0.25306586534556974, + "grad_norm": 8.756831124179643, + "learning_rate": 4.375664677760378e-06, + "loss": 0.6323, + "step": 3116 + }, + { + "epoch": 0.2531470803216113, + "grad_norm": 4.267269088423493, + "learning_rate": 4.375229835022929e-06, + "loss": 0.6067, + "step": 3117 + }, + { + "epoch": 0.2532282952976529, + "grad_norm": 5.198815978500407, + "learning_rate": 4.374794862529501e-06, + "loss": 0.5796, + "step": 3118 + }, + { + "epoch": 0.25330951027369447, + "grad_norm": 4.95003494031335, + "learning_rate": 4.374359760310191e-06, + "loss": 0.4924, + "step": 3119 + }, + { + "epoch": 0.25339072524973605, + "grad_norm": 4.604098155034568, + "learning_rate": 4.373924528395105e-06, + "loss": 0.6536, + "step": 3120 + }, + { + "epoch": 0.2534719402257776, + "grad_norm": 5.033107338044671, + "learning_rate": 4.373489166814358e-06, + "loss": 0.6378, + "step": 3121 + }, + { + "epoch": 0.2535531552018192, + "grad_norm": 5.806258612021515, + "learning_rate": 4.3730536755980776e-06, + "loss": 0.4952, + "step": 3122 + }, + { + "epoch": 0.2536343701778608, + "grad_norm": 4.217899119753335, + "learning_rate": 4.372618054776395e-06, + "loss": 0.7578, + "step": 3123 + }, + { + "epoch": 0.25371558515390236, + "grad_norm": 4.5310110282850085, + "learning_rate": 4.372182304379455e-06, + "loss": 0.5417, + "step": 3124 + }, + { + "epoch": 0.253796800129944, + "grad_norm": 5.792695550840339, + "learning_rate": 4.371746424437406e-06, + "loss": 0.5578, + "step": 3125 + }, + { + "epoch": 0.25387801510598557, + "grad_norm": 9.674533837851303, + "learning_rate": 4.371310414980412e-06, + "loss": 0.5975, + "step": 3126 + }, + { + "epoch": 0.25395923008202714, + "grad_norm": 4.234861474964158, + "learning_rate": 4.37087427603864e-06, + "loss": 0.5482, + "step": 3127 + }, + { + "epoch": 0.2540404450580687, + "grad_norm": 5.754506563667726, + "learning_rate": 4.37043800764227e-06, + "loss": 0.5447, + "step": 3128 + }, + { + "epoch": 0.2541216600341103, + "grad_norm": 10.678832172560531, + "learning_rate": 4.37000160982149e-06, + "loss": 0.7462, + "step": 3129 + }, + { + "epoch": 0.2542028750101519, + "grad_norm": 4.948964860269852, + "learning_rate": 4.369565082606495e-06, + "loss": 0.7041, + "step": 3130 + }, + { + "epoch": 0.25428408998619345, + "grad_norm": 3.6881446493621657, + "learning_rate": 4.369128426027489e-06, + "loss": 0.6471, + "step": 3131 + }, + { + "epoch": 0.25436530496223503, + "grad_norm": 4.0215825826282625, + "learning_rate": 4.36869164011469e-06, + "loss": 0.6399, + "step": 3132 + }, + { + "epoch": 0.2544465199382766, + "grad_norm": 5.583956863498479, + "learning_rate": 4.368254724898319e-06, + "loss": 0.7778, + "step": 3133 + }, + { + "epoch": 0.2545277349143182, + "grad_norm": 4.313157219895635, + "learning_rate": 4.367817680408609e-06, + "loss": 0.5685, + "step": 3134 + }, + { + "epoch": 0.25460894989035976, + "grad_norm": 5.121719815553165, + "learning_rate": 4.3673805066758e-06, + "loss": 0.6546, + "step": 3135 + }, + { + "epoch": 0.2546901648664014, + "grad_norm": 6.598529582533699, + "learning_rate": 4.366943203730144e-06, + "loss": 0.5776, + "step": 3136 + }, + { + "epoch": 0.25477137984244297, + "grad_norm": 8.179382375263176, + "learning_rate": 4.366505771601898e-06, + "loss": 0.655, + "step": 3137 + }, + { + "epoch": 0.25485259481848455, + "grad_norm": 3.5740269182727085, + "learning_rate": 4.366068210321331e-06, + "loss": 0.5393, + "step": 3138 + }, + { + "epoch": 0.2549338097945261, + "grad_norm": 9.451730618946357, + "learning_rate": 4.3656305199187195e-06, + "loss": 0.5082, + "step": 3139 + }, + { + "epoch": 0.2550150247705677, + "grad_norm": 6.028800369548608, + "learning_rate": 4.365192700424351e-06, + "loss": 0.6664, + "step": 3140 + }, + { + "epoch": 0.2550962397466093, + "grad_norm": 5.858773502768749, + "learning_rate": 4.364754751868519e-06, + "loss": 0.5164, + "step": 3141 + }, + { + "epoch": 0.25517745472265085, + "grad_norm": 7.052601898082844, + "learning_rate": 4.364316674281526e-06, + "loss": 0.544, + "step": 3142 + }, + { + "epoch": 0.25525866969869243, + "grad_norm": 5.076254392341973, + "learning_rate": 4.363878467693686e-06, + "loss": 0.5765, + "step": 3143 + }, + { + "epoch": 0.255339884674734, + "grad_norm": 4.671237955123445, + "learning_rate": 4.363440132135322e-06, + "loss": 0.6367, + "step": 3144 + }, + { + "epoch": 0.2554210996507756, + "grad_norm": 4.135246305916654, + "learning_rate": 4.363001667636762e-06, + "loss": 0.725, + "step": 3145 + }, + { + "epoch": 0.25550231462681716, + "grad_norm": 5.579014444017926, + "learning_rate": 4.362563074228346e-06, + "loss": 0.5253, + "step": 3146 + }, + { + "epoch": 0.2555835296028588, + "grad_norm": 4.386851280267495, + "learning_rate": 4.3621243519404235e-06, + "loss": 0.5849, + "step": 3147 + }, + { + "epoch": 0.25566474457890037, + "grad_norm": 4.068805050530992, + "learning_rate": 4.36168550080335e-06, + "loss": 0.5252, + "step": 3148 + }, + { + "epoch": 0.25574595955494195, + "grad_norm": 5.079228239353849, + "learning_rate": 4.361246520847493e-06, + "loss": 0.4509, + "step": 3149 + }, + { + "epoch": 0.2558271745309835, + "grad_norm": 4.417633075136854, + "learning_rate": 4.360807412103228e-06, + "loss": 0.5594, + "step": 3150 + }, + { + "epoch": 0.2559083895070251, + "grad_norm": 5.457155176446224, + "learning_rate": 4.3603681746009374e-06, + "loss": 0.5581, + "step": 3151 + }, + { + "epoch": 0.2559896044830667, + "grad_norm": 4.311137196331857, + "learning_rate": 4.3599288083710155e-06, + "loss": 0.5777, + "step": 3152 + }, + { + "epoch": 0.25607081945910826, + "grad_norm": 5.779855459554121, + "learning_rate": 4.359489313443864e-06, + "loss": 0.6154, + "step": 3153 + }, + { + "epoch": 0.25615203443514983, + "grad_norm": 5.203623644425137, + "learning_rate": 4.359049689849893e-06, + "loss": 0.5689, + "step": 3154 + }, + { + "epoch": 0.2562332494111914, + "grad_norm": 6.2014101236501995, + "learning_rate": 4.358609937619522e-06, + "loss": 0.6593, + "step": 3155 + }, + { + "epoch": 0.256314464387233, + "grad_norm": 6.085891873335906, + "learning_rate": 4.358170056783179e-06, + "loss": 0.425, + "step": 3156 + }, + { + "epoch": 0.25639567936327456, + "grad_norm": 3.58366241946543, + "learning_rate": 4.357730047371304e-06, + "loss": 0.538, + "step": 3157 + }, + { + "epoch": 0.2564768943393162, + "grad_norm": 5.44533166154409, + "learning_rate": 4.357289909414341e-06, + "loss": 0.5894, + "step": 3158 + }, + { + "epoch": 0.2565581093153578, + "grad_norm": 3.8173019501608625, + "learning_rate": 4.356849642942746e-06, + "loss": 0.5385, + "step": 3159 + }, + { + "epoch": 0.25663932429139935, + "grad_norm": 6.388033240415221, + "learning_rate": 4.356409247986982e-06, + "loss": 0.5628, + "step": 3160 + }, + { + "epoch": 0.25672053926744093, + "grad_norm": 10.182238415534854, + "learning_rate": 4.355968724577523e-06, + "loss": 0.5962, + "step": 3161 + }, + { + "epoch": 0.2568017542434825, + "grad_norm": 3.7342682561919998, + "learning_rate": 4.355528072744851e-06, + "loss": 0.6925, + "step": 3162 + }, + { + "epoch": 0.2568829692195241, + "grad_norm": 3.771086888643871, + "learning_rate": 4.355087292519458e-06, + "loss": 0.7971, + "step": 3163 + }, + { + "epoch": 0.25696418419556566, + "grad_norm": 4.185200079768148, + "learning_rate": 4.354646383931841e-06, + "loss": 0.6958, + "step": 3164 + }, + { + "epoch": 0.25704539917160724, + "grad_norm": 4.773930727509564, + "learning_rate": 4.3542053470125104e-06, + "loss": 0.5607, + "step": 3165 + }, + { + "epoch": 0.2571266141476488, + "grad_norm": 4.658976115697843, + "learning_rate": 4.353764181791983e-06, + "loss": 0.5175, + "step": 3166 + }, + { + "epoch": 0.2572078291236904, + "grad_norm": 6.33815354108174, + "learning_rate": 4.353322888300785e-06, + "loss": 0.4758, + "step": 3167 + }, + { + "epoch": 0.25728904409973197, + "grad_norm": 3.6435625688001423, + "learning_rate": 4.3528814665694515e-06, + "loss": 0.6159, + "step": 3168 + }, + { + "epoch": 0.2573702590757736, + "grad_norm": 3.901416738242754, + "learning_rate": 4.352439916628527e-06, + "loss": 0.6253, + "step": 3169 + }, + { + "epoch": 0.2574514740518152, + "grad_norm": 5.97609923012503, + "learning_rate": 4.351998238508563e-06, + "loss": 0.4725, + "step": 3170 + }, + { + "epoch": 0.25753268902785675, + "grad_norm": 3.836440857096687, + "learning_rate": 4.351556432240124e-06, + "loss": 0.4923, + "step": 3171 + }, + { + "epoch": 0.25761390400389833, + "grad_norm": 6.0800255854901595, + "learning_rate": 4.351114497853779e-06, + "loss": 0.6401, + "step": 3172 + }, + { + "epoch": 0.2576951189799399, + "grad_norm": 5.622546507114739, + "learning_rate": 4.350672435380107e-06, + "loss": 0.7105, + "step": 3173 + }, + { + "epoch": 0.2577763339559815, + "grad_norm": 5.822121836191669, + "learning_rate": 4.350230244849697e-06, + "loss": 0.6403, + "step": 3174 + }, + { + "epoch": 0.25785754893202306, + "grad_norm": 3.5905578627539123, + "learning_rate": 4.349787926293146e-06, + "loss": 0.6267, + "step": 3175 + }, + { + "epoch": 0.25793876390806464, + "grad_norm": 3.4379879993140126, + "learning_rate": 4.349345479741062e-06, + "loss": 0.7763, + "step": 3176 + }, + { + "epoch": 0.2580199788841062, + "grad_norm": 5.296351841281579, + "learning_rate": 4.348902905224057e-06, + "loss": 0.5434, + "step": 3177 + }, + { + "epoch": 0.2581011938601478, + "grad_norm": 4.502659862665315, + "learning_rate": 4.348460202772756e-06, + "loss": 0.6596, + "step": 3178 + }, + { + "epoch": 0.25818240883618937, + "grad_norm": 5.66077638544368, + "learning_rate": 4.348017372417792e-06, + "loss": 0.4524, + "step": 3179 + }, + { + "epoch": 0.258263623812231, + "grad_norm": 6.837754388761909, + "learning_rate": 4.347574414189807e-06, + "loss": 0.539, + "step": 3180 + }, + { + "epoch": 0.2583448387882726, + "grad_norm": 4.065776898335388, + "learning_rate": 4.347131328119451e-06, + "loss": 0.4425, + "step": 3181 + }, + { + "epoch": 0.25842605376431416, + "grad_norm": 6.317177110076131, + "learning_rate": 4.346688114237381e-06, + "loss": 0.5503, + "step": 3182 + }, + { + "epoch": 0.25850726874035573, + "grad_norm": 3.9597286603343207, + "learning_rate": 4.346244772574268e-06, + "loss": 0.565, + "step": 3183 + }, + { + "epoch": 0.2585884837163973, + "grad_norm": 4.617361547979039, + "learning_rate": 4.345801303160788e-06, + "loss": 0.6364, + "step": 3184 + }, + { + "epoch": 0.2586696986924389, + "grad_norm": 3.869309442994426, + "learning_rate": 4.3453577060276264e-06, + "loss": 0.7814, + "step": 3185 + }, + { + "epoch": 0.25875091366848046, + "grad_norm": 5.268139660706938, + "learning_rate": 4.344913981205479e-06, + "loss": 0.6746, + "step": 3186 + }, + { + "epoch": 0.25883212864452204, + "grad_norm": 8.40326725980448, + "learning_rate": 4.344470128725047e-06, + "loss": 0.5914, + "step": 3187 + }, + { + "epoch": 0.2589133436205636, + "grad_norm": 6.871592559950185, + "learning_rate": 4.344026148617043e-06, + "loss": 0.5718, + "step": 3188 + }, + { + "epoch": 0.2589945585966052, + "grad_norm": 5.20815182160167, + "learning_rate": 4.343582040912191e-06, + "loss": 0.5049, + "step": 3189 + }, + { + "epoch": 0.2590757735726468, + "grad_norm": 5.335316024532678, + "learning_rate": 4.343137805641217e-06, + "loss": 0.6226, + "step": 3190 + }, + { + "epoch": 0.2591569885486884, + "grad_norm": 3.8793130716336335, + "learning_rate": 4.3426934428348624e-06, + "loss": 0.4488, + "step": 3191 + }, + { + "epoch": 0.25923820352473, + "grad_norm": 5.127013360030541, + "learning_rate": 4.342248952523874e-06, + "loss": 0.4591, + "step": 3192 + }, + { + "epoch": 0.25931941850077156, + "grad_norm": 4.532500566508718, + "learning_rate": 4.341804334739008e-06, + "loss": 0.4925, + "step": 3193 + }, + { + "epoch": 0.25940063347681314, + "grad_norm": 5.195487587147855, + "learning_rate": 4.34135958951103e-06, + "loss": 0.5041, + "step": 3194 + }, + { + "epoch": 0.2594818484528547, + "grad_norm": 3.983184952506586, + "learning_rate": 4.3409147168707124e-06, + "loss": 0.5198, + "step": 3195 + }, + { + "epoch": 0.2595630634288963, + "grad_norm": 4.96090598003835, + "learning_rate": 4.34046971684884e-06, + "loss": 0.4985, + "step": 3196 + }, + { + "epoch": 0.25964427840493787, + "grad_norm": 4.583631169515384, + "learning_rate": 4.340024589476204e-06, + "loss": 0.634, + "step": 3197 + }, + { + "epoch": 0.25972549338097944, + "grad_norm": 5.545926672500218, + "learning_rate": 4.3395793347836034e-06, + "loss": 0.4752, + "step": 3198 + }, + { + "epoch": 0.259806708357021, + "grad_norm": 6.711460341815773, + "learning_rate": 4.33913395280185e-06, + "loss": 0.7308, + "step": 3199 + }, + { + "epoch": 0.2598879233330626, + "grad_norm": 3.7919381803697276, + "learning_rate": 4.33868844356176e-06, + "loss": 0.5049, + "step": 3200 + }, + { + "epoch": 0.2599691383091042, + "grad_norm": 6.205008626331162, + "learning_rate": 4.338242807094161e-06, + "loss": 0.6063, + "step": 3201 + }, + { + "epoch": 0.2600503532851458, + "grad_norm": 4.694118634605663, + "learning_rate": 4.3377970434298885e-06, + "loss": 0.6166, + "step": 3202 + }, + { + "epoch": 0.2601315682611874, + "grad_norm": 5.435770883892459, + "learning_rate": 4.337351152599787e-06, + "loss": 0.5616, + "step": 3203 + }, + { + "epoch": 0.26021278323722896, + "grad_norm": 3.8614175476698485, + "learning_rate": 4.33690513463471e-06, + "loss": 0.5997, + "step": 3204 + }, + { + "epoch": 0.26029399821327054, + "grad_norm": 5.804567048793658, + "learning_rate": 4.336458989565519e-06, + "loss": 0.5467, + "step": 3205 + }, + { + "epoch": 0.2603752131893121, + "grad_norm": 6.725056494128386, + "learning_rate": 4.336012717423085e-06, + "loss": 0.4595, + "step": 3206 + }, + { + "epoch": 0.2604564281653537, + "grad_norm": 5.3233685676327065, + "learning_rate": 4.335566318238289e-06, + "loss": 0.5616, + "step": 3207 + }, + { + "epoch": 0.26053764314139527, + "grad_norm": 3.4875685060457218, + "learning_rate": 4.335119792042017e-06, + "loss": 0.5414, + "step": 3208 + }, + { + "epoch": 0.26061885811743685, + "grad_norm": 4.305522238103512, + "learning_rate": 4.334673138865169e-06, + "loss": 0.5734, + "step": 3209 + }, + { + "epoch": 0.2607000730934784, + "grad_norm": 6.18157123882514, + "learning_rate": 4.334226358738649e-06, + "loss": 0.462, + "step": 3210 + }, + { + "epoch": 0.26078128806952, + "grad_norm": 14.713830091090161, + "learning_rate": 4.333779451693373e-06, + "loss": 0.6092, + "step": 3211 + }, + { + "epoch": 0.2608625030455616, + "grad_norm": 11.006487008864882, + "learning_rate": 4.333332417760263e-06, + "loss": 0.4972, + "step": 3212 + }, + { + "epoch": 0.2609437180216032, + "grad_norm": 4.400134883259066, + "learning_rate": 4.332885256970253e-06, + "loss": 0.5909, + "step": 3213 + }, + { + "epoch": 0.2610249329976448, + "grad_norm": 5.443017416295305, + "learning_rate": 4.332437969354284e-06, + "loss": 0.4379, + "step": 3214 + }, + { + "epoch": 0.26110614797368636, + "grad_norm": 4.4738174242506235, + "learning_rate": 4.331990554943305e-06, + "loss": 0.6238, + "step": 3215 + }, + { + "epoch": 0.26118736294972794, + "grad_norm": 5.0416535662280655, + "learning_rate": 4.331543013768276e-06, + "loss": 0.3969, + "step": 3216 + }, + { + "epoch": 0.2612685779257695, + "grad_norm": 4.237641745664451, + "learning_rate": 4.331095345860162e-06, + "loss": 0.4619, + "step": 3217 + }, + { + "epoch": 0.2613497929018111, + "grad_norm": 5.09656551194579, + "learning_rate": 4.330647551249942e-06, + "loss": 0.5224, + "step": 3218 + }, + { + "epoch": 0.2614310078778527, + "grad_norm": 4.113097985628701, + "learning_rate": 4.330199629968601e-06, + "loss": 0.7045, + "step": 3219 + }, + { + "epoch": 0.26151222285389425, + "grad_norm": 4.391696991616136, + "learning_rate": 4.329751582047132e-06, + "loss": 0.4772, + "step": 3220 + }, + { + "epoch": 0.2615934378299358, + "grad_norm": 6.284634192395287, + "learning_rate": 4.3293034075165355e-06, + "loss": 0.7468, + "step": 3221 + }, + { + "epoch": 0.2616746528059774, + "grad_norm": 5.31239683535975, + "learning_rate": 4.328855106407826e-06, + "loss": 0.5164, + "step": 3222 + }, + { + "epoch": 0.261755867782019, + "grad_norm": 4.2612158979297945, + "learning_rate": 4.328406678752022e-06, + "loss": 0.6003, + "step": 3223 + }, + { + "epoch": 0.2618370827580606, + "grad_norm": 5.243856895565067, + "learning_rate": 4.3279581245801515e-06, + "loss": 0.5995, + "step": 3224 + }, + { + "epoch": 0.2619182977341022, + "grad_norm": 5.3541602152619046, + "learning_rate": 4.327509443923254e-06, + "loss": 0.4661, + "step": 3225 + }, + { + "epoch": 0.26199951271014377, + "grad_norm": 5.615300922143262, + "learning_rate": 4.327060636812375e-06, + "loss": 0.634, + "step": 3226 + }, + { + "epoch": 0.26208072768618534, + "grad_norm": 5.152013169697966, + "learning_rate": 4.32661170327857e-06, + "loss": 0.6963, + "step": 3227 + }, + { + "epoch": 0.2621619426622269, + "grad_norm": 5.2073960016961545, + "learning_rate": 4.326162643352901e-06, + "loss": 0.7672, + "step": 3228 + }, + { + "epoch": 0.2622431576382685, + "grad_norm": 5.909318364895728, + "learning_rate": 4.325713457066443e-06, + "loss": 0.6173, + "step": 3229 + }, + { + "epoch": 0.2623243726143101, + "grad_norm": 16.26621162135915, + "learning_rate": 4.325264144450276e-06, + "loss": 0.3936, + "step": 3230 + }, + { + "epoch": 0.26240558759035165, + "grad_norm": 4.238425289228819, + "learning_rate": 4.324814705535491e-06, + "loss": 0.5425, + "step": 3231 + }, + { + "epoch": 0.26248680256639323, + "grad_norm": 7.639214601519878, + "learning_rate": 4.324365140353185e-06, + "loss": 0.6787, + "step": 3232 + }, + { + "epoch": 0.2625680175424348, + "grad_norm": 4.776323919697988, + "learning_rate": 4.323915448934466e-06, + "loss": 0.7179, + "step": 3233 + }, + { + "epoch": 0.2626492325184764, + "grad_norm": 4.829079791949981, + "learning_rate": 4.323465631310452e-06, + "loss": 0.5759, + "step": 3234 + }, + { + "epoch": 0.262730447494518, + "grad_norm": 5.074740707653109, + "learning_rate": 4.323015687512267e-06, + "loss": 0.5894, + "step": 3235 + }, + { + "epoch": 0.2628116624705596, + "grad_norm": 9.615050714429785, + "learning_rate": 4.322565617571044e-06, + "loss": 0.5466, + "step": 3236 + }, + { + "epoch": 0.26289287744660117, + "grad_norm": 5.123025702283434, + "learning_rate": 4.322115421517926e-06, + "loss": 0.5913, + "step": 3237 + }, + { + "epoch": 0.26297409242264275, + "grad_norm": 4.751136267011921, + "learning_rate": 4.321665099384064e-06, + "loss": 0.4789, + "step": 3238 + }, + { + "epoch": 0.2630553073986843, + "grad_norm": 10.947055513952483, + "learning_rate": 4.321214651200619e-06, + "loss": 0.5118, + "step": 3239 + }, + { + "epoch": 0.2631365223747259, + "grad_norm": 5.687786414091623, + "learning_rate": 4.320764076998759e-06, + "loss": 0.6006, + "step": 3240 + }, + { + "epoch": 0.2632177373507675, + "grad_norm": 4.846530548806665, + "learning_rate": 4.32031337680966e-06, + "loss": 0.6819, + "step": 3241 + }, + { + "epoch": 0.26329895232680905, + "grad_norm": 4.29992728461369, + "learning_rate": 4.31986255066451e-06, + "loss": 0.6645, + "step": 3242 + }, + { + "epoch": 0.26338016730285063, + "grad_norm": 4.728540495274627, + "learning_rate": 4.319411598594503e-06, + "loss": 0.65, + "step": 3243 + }, + { + "epoch": 0.2634613822788922, + "grad_norm": 4.044743720033708, + "learning_rate": 4.318960520630842e-06, + "loss": 0.5344, + "step": 3244 + }, + { + "epoch": 0.2635425972549338, + "grad_norm": 4.062414651992789, + "learning_rate": 4.3185093168047395e-06, + "loss": 0.6314, + "step": 3245 + }, + { + "epoch": 0.2636238122309754, + "grad_norm": 5.77814635308223, + "learning_rate": 4.318057987147418e-06, + "loss": 0.5496, + "step": 3246 + }, + { + "epoch": 0.263705027207017, + "grad_norm": 8.868723303386513, + "learning_rate": 4.317606531690104e-06, + "loss": 0.5484, + "step": 3247 + }, + { + "epoch": 0.2637862421830586, + "grad_norm": 5.931221128514847, + "learning_rate": 4.317154950464039e-06, + "loss": 0.5311, + "step": 3248 + }, + { + "epoch": 0.26386745715910015, + "grad_norm": 5.582327237175479, + "learning_rate": 4.316703243500467e-06, + "loss": 0.532, + "step": 3249 + }, + { + "epoch": 0.2639486721351417, + "grad_norm": 3.5277983786023235, + "learning_rate": 4.3162514108306465e-06, + "loss": 0.7615, + "step": 3250 + }, + { + "epoch": 0.2640298871111833, + "grad_norm": 6.701677321017773, + "learning_rate": 4.315799452485841e-06, + "loss": 0.4753, + "step": 3251 + }, + { + "epoch": 0.2641111020872249, + "grad_norm": 4.478895629503352, + "learning_rate": 4.3153473684973226e-06, + "loss": 0.5828, + "step": 3252 + }, + { + "epoch": 0.26419231706326646, + "grad_norm": 6.992879628361261, + "learning_rate": 4.314895158896374e-06, + "loss": 0.5683, + "step": 3253 + }, + { + "epoch": 0.26427353203930803, + "grad_norm": 4.622185359910068, + "learning_rate": 4.314442823714286e-06, + "loss": 0.4762, + "step": 3254 + }, + { + "epoch": 0.2643547470153496, + "grad_norm": 3.3396459737640587, + "learning_rate": 4.313990362982357e-06, + "loss": 0.7585, + "step": 3255 + }, + { + "epoch": 0.2644359619913912, + "grad_norm": 9.71525006135501, + "learning_rate": 4.313537776731895e-06, + "loss": 0.4599, + "step": 3256 + }, + { + "epoch": 0.2645171769674328, + "grad_norm": 5.997259305627276, + "learning_rate": 4.313085064994218e-06, + "loss": 0.59, + "step": 3257 + }, + { + "epoch": 0.2645983919434744, + "grad_norm": 3.4896157532430285, + "learning_rate": 4.3126322278006496e-06, + "loss": 0.6535, + "step": 3258 + }, + { + "epoch": 0.264679606919516, + "grad_norm": 3.163990271513905, + "learning_rate": 4.312179265182523e-06, + "loss": 0.4253, + "step": 3259 + }, + { + "epoch": 0.26476082189555755, + "grad_norm": 4.468139415385732, + "learning_rate": 4.311726177171184e-06, + "loss": 0.5902, + "step": 3260 + }, + { + "epoch": 0.26484203687159913, + "grad_norm": 3.7893680396692053, + "learning_rate": 4.311272963797981e-06, + "loss": 0.5972, + "step": 3261 + }, + { + "epoch": 0.2649232518476407, + "grad_norm": 5.50109202559223, + "learning_rate": 4.3108196250942746e-06, + "loss": 0.4875, + "step": 3262 + }, + { + "epoch": 0.2650044668236823, + "grad_norm": 10.031595831871455, + "learning_rate": 4.310366161091435e-06, + "loss": 0.7109, + "step": 3263 + }, + { + "epoch": 0.26508568179972386, + "grad_norm": 5.5919773594492, + "learning_rate": 4.309912571820837e-06, + "loss": 0.521, + "step": 3264 + }, + { + "epoch": 0.26516689677576544, + "grad_norm": 4.444420349163222, + "learning_rate": 4.309458857313868e-06, + "loss": 0.6694, + "step": 3265 + }, + { + "epoch": 0.265248111751807, + "grad_norm": 3.8949106613348947, + "learning_rate": 4.309005017601924e-06, + "loss": 0.6129, + "step": 3266 + }, + { + "epoch": 0.2653293267278486, + "grad_norm": 3.3059067568093847, + "learning_rate": 4.308551052716406e-06, + "loss": 0.6001, + "step": 3267 + }, + { + "epoch": 0.2654105417038902, + "grad_norm": 5.114398735836412, + "learning_rate": 4.308096962688726e-06, + "loss": 0.5416, + "step": 3268 + }, + { + "epoch": 0.2654917566799318, + "grad_norm": 3.1154319915363855, + "learning_rate": 4.307642747550306e-06, + "loss": 0.5931, + "step": 3269 + }, + { + "epoch": 0.2655729716559734, + "grad_norm": 4.307648831780373, + "learning_rate": 4.307188407332574e-06, + "loss": 0.5852, + "step": 3270 + }, + { + "epoch": 0.26565418663201495, + "grad_norm": 4.424486660650266, + "learning_rate": 4.306733942066969e-06, + "loss": 0.5399, + "step": 3271 + }, + { + "epoch": 0.26573540160805653, + "grad_norm": 6.036748818342054, + "learning_rate": 4.306279351784938e-06, + "loss": 0.4185, + "step": 3272 + }, + { + "epoch": 0.2658166165840981, + "grad_norm": 3.6762274889278195, + "learning_rate": 4.305824636517935e-06, + "loss": 0.4484, + "step": 3273 + }, + { + "epoch": 0.2658978315601397, + "grad_norm": 4.098602260190931, + "learning_rate": 4.305369796297424e-06, + "loss": 0.4614, + "step": 3274 + }, + { + "epoch": 0.26597904653618126, + "grad_norm": 5.298237691361923, + "learning_rate": 4.3049148311548785e-06, + "loss": 0.5971, + "step": 3275 + }, + { + "epoch": 0.26606026151222284, + "grad_norm": 5.05302699787098, + "learning_rate": 4.304459741121778e-06, + "loss": 0.4597, + "step": 3276 + }, + { + "epoch": 0.2661414764882644, + "grad_norm": 4.995834340661036, + "learning_rate": 4.304004526229614e-06, + "loss": 0.5854, + "step": 3277 + }, + { + "epoch": 0.266222691464306, + "grad_norm": 2.5777374640004838, + "learning_rate": 4.303549186509884e-06, + "loss": 0.498, + "step": 3278 + }, + { + "epoch": 0.2663039064403476, + "grad_norm": 3.390125198708796, + "learning_rate": 4.303093721994096e-06, + "loss": 0.4985, + "step": 3279 + }, + { + "epoch": 0.2663851214163892, + "grad_norm": 5.071175646539791, + "learning_rate": 4.302638132713766e-06, + "loss": 0.5427, + "step": 3280 + }, + { + "epoch": 0.2664663363924308, + "grad_norm": 4.587539332384161, + "learning_rate": 4.302182418700415e-06, + "loss": 0.4533, + "step": 3281 + }, + { + "epoch": 0.26654755136847236, + "grad_norm": 10.760547007231233, + "learning_rate": 4.301726579985581e-06, + "loss": 0.5711, + "step": 3282 + }, + { + "epoch": 0.26662876634451393, + "grad_norm": 6.241807773314562, + "learning_rate": 4.301270616600802e-06, + "loss": 0.7594, + "step": 3283 + }, + { + "epoch": 0.2667099813205555, + "grad_norm": 5.23430106311079, + "learning_rate": 4.30081452857763e-06, + "loss": 0.5922, + "step": 3284 + }, + { + "epoch": 0.2667911962965971, + "grad_norm": 5.314419566343465, + "learning_rate": 4.300358315947622e-06, + "loss": 0.6298, + "step": 3285 + }, + { + "epoch": 0.26687241127263867, + "grad_norm": 5.02528385387479, + "learning_rate": 4.299901978742349e-06, + "loss": 0.5465, + "step": 3286 + }, + { + "epoch": 0.26695362624868024, + "grad_norm": 4.18489447765815, + "learning_rate": 4.2994455169933835e-06, + "loss": 0.5514, + "step": 3287 + }, + { + "epoch": 0.2670348412247218, + "grad_norm": 4.050947136506907, + "learning_rate": 4.298988930732312e-06, + "loss": 0.6557, + "step": 3288 + }, + { + "epoch": 0.2671160562007634, + "grad_norm": 4.82281153344797, + "learning_rate": 4.2985322199907275e-06, + "loss": 0.4805, + "step": 3289 + }, + { + "epoch": 0.26719727117680503, + "grad_norm": 3.2786394296234787, + "learning_rate": 4.298075384800232e-06, + "loss": 0.7165, + "step": 3290 + }, + { + "epoch": 0.2672784861528466, + "grad_norm": 4.134763536721899, + "learning_rate": 4.297618425192436e-06, + "loss": 0.6281, + "step": 3291 + }, + { + "epoch": 0.2673597011288882, + "grad_norm": 5.393028330188706, + "learning_rate": 4.297161341198957e-06, + "loss": 0.4522, + "step": 3292 + }, + { + "epoch": 0.26744091610492976, + "grad_norm": 3.3337915433920178, + "learning_rate": 4.296704132851427e-06, + "loss": 0.5315, + "step": 3293 + }, + { + "epoch": 0.26752213108097134, + "grad_norm": 4.494608257605046, + "learning_rate": 4.296246800181479e-06, + "loss": 0.594, + "step": 3294 + }, + { + "epoch": 0.2676033460570129, + "grad_norm": 6.986279435908919, + "learning_rate": 4.29578934322076e-06, + "loss": 0.4113, + "step": 3295 + }, + { + "epoch": 0.2676845610330545, + "grad_norm": 4.754454279458852, + "learning_rate": 4.295331762000921e-06, + "loss": 0.656, + "step": 3296 + }, + { + "epoch": 0.26776577600909607, + "grad_norm": 4.557941560884389, + "learning_rate": 4.294874056553626e-06, + "loss": 0.6003, + "step": 3297 + }, + { + "epoch": 0.26784699098513765, + "grad_norm": 3.5253434081641593, + "learning_rate": 4.294416226910546e-06, + "loss": 0.6373, + "step": 3298 + }, + { + "epoch": 0.2679282059611792, + "grad_norm": 5.390157405133369, + "learning_rate": 4.2939582731033605e-06, + "loss": 0.5665, + "step": 3299 + }, + { + "epoch": 0.2680094209372208, + "grad_norm": 4.444716348064678, + "learning_rate": 4.293500195163756e-06, + "loss": 0.4936, + "step": 3300 + }, + { + "epoch": 0.26809063591326243, + "grad_norm": 4.929185123300809, + "learning_rate": 4.29304199312343e-06, + "loss": 0.6039, + "step": 3301 + }, + { + "epoch": 0.268171850889304, + "grad_norm": 5.005771469258986, + "learning_rate": 4.292583667014087e-06, + "loss": 0.6245, + "step": 3302 + }, + { + "epoch": 0.2682530658653456, + "grad_norm": 6.794918253939256, + "learning_rate": 4.292125216867443e-06, + "loss": 0.4753, + "step": 3303 + }, + { + "epoch": 0.26833428084138716, + "grad_norm": 4.918313944683376, + "learning_rate": 4.2916666427152175e-06, + "loss": 0.6796, + "step": 3304 + }, + { + "epoch": 0.26841549581742874, + "grad_norm": 5.184653793095706, + "learning_rate": 4.291207944589143e-06, + "loss": 0.5299, + "step": 3305 + }, + { + "epoch": 0.2684967107934703, + "grad_norm": 5.158508970573285, + "learning_rate": 4.290749122520959e-06, + "loss": 0.5115, + "step": 3306 + }, + { + "epoch": 0.2685779257695119, + "grad_norm": 5.019559380014884, + "learning_rate": 4.290290176542412e-06, + "loss": 0.6411, + "step": 3307 + }, + { + "epoch": 0.26865914074555347, + "grad_norm": 5.867738157836222, + "learning_rate": 4.289831106685261e-06, + "loss": 0.5529, + "step": 3308 + }, + { + "epoch": 0.26874035572159505, + "grad_norm": 4.019129715439323, + "learning_rate": 4.289371912981268e-06, + "loss": 0.5655, + "step": 3309 + }, + { + "epoch": 0.2688215706976366, + "grad_norm": 4.959547313791584, + "learning_rate": 4.28891259546221e-06, + "loss": 0.7881, + "step": 3310 + }, + { + "epoch": 0.2689027856736782, + "grad_norm": 3.3894537647637466, + "learning_rate": 4.288453154159869e-06, + "loss": 0.6476, + "step": 3311 + }, + { + "epoch": 0.26898400064971983, + "grad_norm": 4.576484804759739, + "learning_rate": 4.287993589106034e-06, + "loss": 0.4195, + "step": 3312 + }, + { + "epoch": 0.2690652156257614, + "grad_norm": 5.5159598677639075, + "learning_rate": 4.287533900332506e-06, + "loss": 0.5731, + "step": 3313 + }, + { + "epoch": 0.269146430601803, + "grad_norm": 3.476369390277886, + "learning_rate": 4.287074087871092e-06, + "loss": 0.7422, + "step": 3314 + }, + { + "epoch": 0.26922764557784457, + "grad_norm": 4.3872226024253305, + "learning_rate": 4.2866141517536085e-06, + "loss": 0.5666, + "step": 3315 + }, + { + "epoch": 0.26930886055388614, + "grad_norm": 4.1888189167131795, + "learning_rate": 4.286154092011882e-06, + "loss": 0.6373, + "step": 3316 + }, + { + "epoch": 0.2693900755299277, + "grad_norm": 6.277380300582101, + "learning_rate": 4.285693908677746e-06, + "loss": 0.5566, + "step": 3317 + }, + { + "epoch": 0.2694712905059693, + "grad_norm": 3.4248947125674802, + "learning_rate": 4.285233601783041e-06, + "loss": 0.7619, + "step": 3318 + }, + { + "epoch": 0.2695525054820109, + "grad_norm": 9.083452534210942, + "learning_rate": 4.28477317135962e-06, + "loss": 0.441, + "step": 3319 + }, + { + "epoch": 0.26963372045805245, + "grad_norm": 5.194205565045371, + "learning_rate": 4.28431261743934e-06, + "loss": 0.5573, + "step": 3320 + }, + { + "epoch": 0.269714935434094, + "grad_norm": 4.678152409953697, + "learning_rate": 4.2838519400540715e-06, + "loss": 0.5867, + "step": 3321 + }, + { + "epoch": 0.2697961504101356, + "grad_norm": 6.079974041201289, + "learning_rate": 4.283391139235688e-06, + "loss": 0.5913, + "step": 3322 + }, + { + "epoch": 0.26987736538617724, + "grad_norm": 4.43636337967898, + "learning_rate": 4.282930215016078e-06, + "loss": 0.4492, + "step": 3323 + }, + { + "epoch": 0.2699585803622188, + "grad_norm": 5.658242363279942, + "learning_rate": 4.282469167427132e-06, + "loss": 0.7522, + "step": 3324 + }, + { + "epoch": 0.2700397953382604, + "grad_norm": 6.996911479616853, + "learning_rate": 4.2820079965007545e-06, + "loss": 0.5871, + "step": 3325 + }, + { + "epoch": 0.27012101031430197, + "grad_norm": 4.53448881154164, + "learning_rate": 4.281546702268853e-06, + "loss": 0.3827, + "step": 3326 + }, + { + "epoch": 0.27020222529034355, + "grad_norm": 6.424034548623486, + "learning_rate": 4.28108528476335e-06, + "loss": 0.3635, + "step": 3327 + }, + { + "epoch": 0.2702834402663851, + "grad_norm": 4.466125493504516, + "learning_rate": 4.280623744016171e-06, + "loss": 0.5539, + "step": 3328 + }, + { + "epoch": 0.2703646552424267, + "grad_norm": 3.693236745360508, + "learning_rate": 4.280162080059252e-06, + "loss": 0.5083, + "step": 3329 + }, + { + "epoch": 0.2704458702184683, + "grad_norm": 7.92497787565039, + "learning_rate": 4.279700292924539e-06, + "loss": 0.6099, + "step": 3330 + }, + { + "epoch": 0.27052708519450985, + "grad_norm": 6.126906391884235, + "learning_rate": 4.279238382643985e-06, + "loss": 0.4806, + "step": 3331 + }, + { + "epoch": 0.27060830017055143, + "grad_norm": 10.147235977729743, + "learning_rate": 4.278776349249551e-06, + "loss": 0.5868, + "step": 3332 + }, + { + "epoch": 0.270689515146593, + "grad_norm": 4.694547184635383, + "learning_rate": 4.278314192773208e-06, + "loss": 0.5579, + "step": 3333 + }, + { + "epoch": 0.27077073012263464, + "grad_norm": 5.790255949685903, + "learning_rate": 4.277851913246934e-06, + "loss": 0.5259, + "step": 3334 + }, + { + "epoch": 0.2708519450986762, + "grad_norm": 4.610604823465653, + "learning_rate": 4.277389510702717e-06, + "loss": 0.5614, + "step": 3335 + }, + { + "epoch": 0.2709331600747178, + "grad_norm": 6.063379795121963, + "learning_rate": 4.276926985172553e-06, + "loss": 0.5133, + "step": 3336 + }, + { + "epoch": 0.27101437505075937, + "grad_norm": 4.576106292327785, + "learning_rate": 4.276464336688445e-06, + "loss": 0.4748, + "step": 3337 + }, + { + "epoch": 0.27109559002680095, + "grad_norm": 5.762207004631984, + "learning_rate": 4.2760015652824074e-06, + "loss": 0.7092, + "step": 3338 + }, + { + "epoch": 0.2711768050028425, + "grad_norm": 4.920850361695753, + "learning_rate": 4.27553867098646e-06, + "loss": 0.4025, + "step": 3339 + }, + { + "epoch": 0.2712580199788841, + "grad_norm": 4.965952642074921, + "learning_rate": 4.275075653832635e-06, + "loss": 0.5994, + "step": 3340 + }, + { + "epoch": 0.2713392349549257, + "grad_norm": 6.661938259420609, + "learning_rate": 4.274612513852968e-06, + "loss": 0.6148, + "step": 3341 + }, + { + "epoch": 0.27142044993096726, + "grad_norm": 4.176847484212596, + "learning_rate": 4.274149251079507e-06, + "loss": 0.4974, + "step": 3342 + }, + { + "epoch": 0.27150166490700883, + "grad_norm": 9.324003992934651, + "learning_rate": 4.273685865544308e-06, + "loss": 0.518, + "step": 3343 + }, + { + "epoch": 0.2715828798830504, + "grad_norm": 10.84005640144426, + "learning_rate": 4.273222357279434e-06, + "loss": 0.5259, + "step": 3344 + }, + { + "epoch": 0.27166409485909204, + "grad_norm": 4.583615144085001, + "learning_rate": 4.272758726316958e-06, + "loss": 0.5688, + "step": 3345 + }, + { + "epoch": 0.2717453098351336, + "grad_norm": 5.704188407998161, + "learning_rate": 4.272294972688959e-06, + "loss": 0.7256, + "step": 3346 + }, + { + "epoch": 0.2718265248111752, + "grad_norm": 5.611046782367686, + "learning_rate": 4.2718310964275285e-06, + "loss": 0.4029, + "step": 3347 + }, + { + "epoch": 0.2719077397872168, + "grad_norm": 3.432494651225952, + "learning_rate": 4.271367097564763e-06, + "loss": 0.5548, + "step": 3348 + }, + { + "epoch": 0.27198895476325835, + "grad_norm": 5.993392392105414, + "learning_rate": 4.27090297613277e-06, + "loss": 0.4336, + "step": 3349 + }, + { + "epoch": 0.2720701697392999, + "grad_norm": 5.973550784862252, + "learning_rate": 4.270438732163663e-06, + "loss": 0.5539, + "step": 3350 + }, + { + "epoch": 0.2721513847153415, + "grad_norm": 7.021609581750342, + "learning_rate": 4.269974365689565e-06, + "loss": 0.5499, + "step": 3351 + }, + { + "epoch": 0.2722325996913831, + "grad_norm": 4.0778999962904106, + "learning_rate": 4.269509876742609e-06, + "loss": 0.5756, + "step": 3352 + }, + { + "epoch": 0.27231381466742466, + "grad_norm": 4.339699329280206, + "learning_rate": 4.269045265354935e-06, + "loss": 0.6475, + "step": 3353 + }, + { + "epoch": 0.27239502964346624, + "grad_norm": 3.557598291542795, + "learning_rate": 4.26858053155869e-06, + "loss": 0.6789, + "step": 3354 + }, + { + "epoch": 0.2724762446195078, + "grad_norm": 5.638513919453793, + "learning_rate": 4.268115675386033e-06, + "loss": 0.5661, + "step": 3355 + }, + { + "epoch": 0.27255745959554945, + "grad_norm": 7.007916663451633, + "learning_rate": 4.267650696869129e-06, + "loss": 0.6213, + "step": 3356 + }, + { + "epoch": 0.272638674571591, + "grad_norm": 6.707656670971426, + "learning_rate": 4.267185596040152e-06, + "loss": 0.6185, + "step": 3357 + }, + { + "epoch": 0.2727198895476326, + "grad_norm": 5.468221363182492, + "learning_rate": 4.266720372931285e-06, + "loss": 0.6918, + "step": 3358 + }, + { + "epoch": 0.2728011045236742, + "grad_norm": 3.4031280276219826, + "learning_rate": 4.2662550275747175e-06, + "loss": 0.4968, + "step": 3359 + }, + { + "epoch": 0.27288231949971575, + "grad_norm": 3.7289743320117914, + "learning_rate": 4.26578956000265e-06, + "loss": 0.5128, + "step": 3360 + }, + { + "epoch": 0.27296353447575733, + "grad_norm": 4.058327224361656, + "learning_rate": 4.26532397024729e-06, + "loss": 0.5662, + "step": 3361 + }, + { + "epoch": 0.2730447494517989, + "grad_norm": 6.483948262408605, + "learning_rate": 4.264858258340854e-06, + "loss": 0.5985, + "step": 3362 + }, + { + "epoch": 0.2731259644278405, + "grad_norm": 6.7970175715192065, + "learning_rate": 4.264392424315568e-06, + "loss": 0.5611, + "step": 3363 + }, + { + "epoch": 0.27320717940388206, + "grad_norm": 4.719015734535453, + "learning_rate": 4.263926468203663e-06, + "loss": 0.5915, + "step": 3364 + }, + { + "epoch": 0.27328839437992364, + "grad_norm": 5.439068319982008, + "learning_rate": 4.2634603900373825e-06, + "loss": 0.6571, + "step": 3365 + }, + { + "epoch": 0.2733696093559652, + "grad_norm": 9.218141495545822, + "learning_rate": 4.262994189848976e-06, + "loss": 0.5393, + "step": 3366 + }, + { + "epoch": 0.27345082433200685, + "grad_norm": 4.307338467347387, + "learning_rate": 4.262527867670702e-06, + "loss": 0.5521, + "step": 3367 + }, + { + "epoch": 0.2735320393080484, + "grad_norm": 4.015516448961165, + "learning_rate": 4.2620614235348265e-06, + "loss": 0.4343, + "step": 3368 + }, + { + "epoch": 0.27361325428409, + "grad_norm": 5.503396831053377, + "learning_rate": 4.261594857473628e-06, + "loss": 0.4884, + "step": 3369 + }, + { + "epoch": 0.2736944692601316, + "grad_norm": 5.761220256833325, + "learning_rate": 4.261128169519388e-06, + "loss": 0.5385, + "step": 3370 + }, + { + "epoch": 0.27377568423617316, + "grad_norm": 7.751355969511051, + "learning_rate": 4.2606613597043975e-06, + "loss": 0.5659, + "step": 3371 + }, + { + "epoch": 0.27385689921221473, + "grad_norm": 4.671040413073545, + "learning_rate": 4.260194428060961e-06, + "loss": 0.4999, + "step": 3372 + }, + { + "epoch": 0.2739381141882563, + "grad_norm": 7.901394405330174, + "learning_rate": 4.2597273746213855e-06, + "loss": 0.6527, + "step": 3373 + }, + { + "epoch": 0.2740193291642979, + "grad_norm": 6.603352470726141, + "learning_rate": 4.259260199417988e-06, + "loss": 0.5473, + "step": 3374 + }, + { + "epoch": 0.27410054414033946, + "grad_norm": 5.40024658636952, + "learning_rate": 4.2587929024830964e-06, + "loss": 0.5654, + "step": 3375 + }, + { + "epoch": 0.27418175911638104, + "grad_norm": 9.759841121522395, + "learning_rate": 4.258325483849044e-06, + "loss": 0.6142, + "step": 3376 + }, + { + "epoch": 0.2742629740924226, + "grad_norm": 5.125699081398096, + "learning_rate": 4.257857943548173e-06, + "loss": 0.5213, + "step": 3377 + }, + { + "epoch": 0.27434418906846425, + "grad_norm": 4.34028312916136, + "learning_rate": 4.257390281612837e-06, + "loss": 0.4747, + "step": 3378 + }, + { + "epoch": 0.2744254040445058, + "grad_norm": 4.263312229046128, + "learning_rate": 4.256922498075394e-06, + "loss": 0.5938, + "step": 3379 + }, + { + "epoch": 0.2745066190205474, + "grad_norm": 4.797086281941267, + "learning_rate": 4.256454592968212e-06, + "loss": 0.4205, + "step": 3380 + }, + { + "epoch": 0.274587833996589, + "grad_norm": 7.033829432880135, + "learning_rate": 4.255986566323668e-06, + "loss": 0.6218, + "step": 3381 + }, + { + "epoch": 0.27466904897263056, + "grad_norm": 7.8902900782976095, + "learning_rate": 4.255518418174148e-06, + "loss": 0.5008, + "step": 3382 + }, + { + "epoch": 0.27475026394867214, + "grad_norm": 3.2994006635611655, + "learning_rate": 4.2550501485520445e-06, + "loss": 0.694, + "step": 3383 + }, + { + "epoch": 0.2748314789247137, + "grad_norm": 5.401884002359012, + "learning_rate": 4.254581757489758e-06, + "loss": 0.5462, + "step": 3384 + }, + { + "epoch": 0.2749126939007553, + "grad_norm": 11.938262897785561, + "learning_rate": 4.254113245019701e-06, + "loss": 0.5148, + "step": 3385 + }, + { + "epoch": 0.27499390887679687, + "grad_norm": 6.8820894517380555, + "learning_rate": 4.25364461117429e-06, + "loss": 0.6357, + "step": 3386 + }, + { + "epoch": 0.27507512385283844, + "grad_norm": 3.919541658984626, + "learning_rate": 4.2531758559859535e-06, + "loss": 0.609, + "step": 3387 + }, + { + "epoch": 0.27515633882888, + "grad_norm": 3.9922271367677475, + "learning_rate": 4.252706979487127e-06, + "loss": 0.5394, + "step": 3388 + }, + { + "epoch": 0.27523755380492165, + "grad_norm": 4.307590756579932, + "learning_rate": 4.2522379817102525e-06, + "loss": 0.4479, + "step": 3389 + }, + { + "epoch": 0.27531876878096323, + "grad_norm": 4.6485314568936715, + "learning_rate": 4.251768862687783e-06, + "loss": 0.5758, + "step": 3390 + }, + { + "epoch": 0.2753999837570048, + "grad_norm": 3.219961667439573, + "learning_rate": 4.25129962245218e-06, + "loss": 0.5229, + "step": 3391 + }, + { + "epoch": 0.2754811987330464, + "grad_norm": 3.6351810295222955, + "learning_rate": 4.250830261035911e-06, + "loss": 0.5625, + "step": 3392 + }, + { + "epoch": 0.27556241370908796, + "grad_norm": 5.369015436096168, + "learning_rate": 4.250360778471455e-06, + "loss": 0.6667, + "step": 3393 + }, + { + "epoch": 0.27564362868512954, + "grad_norm": 4.427072372118147, + "learning_rate": 4.249891174791297e-06, + "loss": 0.6424, + "step": 3394 + }, + { + "epoch": 0.2757248436611711, + "grad_norm": 5.994983906902898, + "learning_rate": 4.249421450027929e-06, + "loss": 0.6288, + "step": 3395 + }, + { + "epoch": 0.2758060586372127, + "grad_norm": 5.740886381814397, + "learning_rate": 4.248951604213858e-06, + "loss": 0.6017, + "step": 3396 + }, + { + "epoch": 0.27588727361325427, + "grad_norm": 6.578063566695588, + "learning_rate": 4.24848163738159e-06, + "loss": 0.5481, + "step": 3397 + }, + { + "epoch": 0.27596848858929585, + "grad_norm": 4.114106520635557, + "learning_rate": 4.248011549563647e-06, + "loss": 0.601, + "step": 3398 + }, + { + "epoch": 0.2760497035653374, + "grad_norm": 5.445436802534245, + "learning_rate": 4.247541340792557e-06, + "loss": 0.5067, + "step": 3399 + }, + { + "epoch": 0.27613091854137906, + "grad_norm": 5.480238969386964, + "learning_rate": 4.247071011100855e-06, + "loss": 0.4474, + "step": 3400 + }, + { + "epoch": 0.27621213351742063, + "grad_norm": 3.9075643825539865, + "learning_rate": 4.246600560521084e-06, + "loss": 0.5349, + "step": 3401 + }, + { + "epoch": 0.2762933484934622, + "grad_norm": 4.585770407486754, + "learning_rate": 4.246129989085798e-06, + "loss": 0.6055, + "step": 3402 + }, + { + "epoch": 0.2763745634695038, + "grad_norm": 6.06730610156998, + "learning_rate": 4.245659296827559e-06, + "loss": 0.5557, + "step": 3403 + }, + { + "epoch": 0.27645577844554536, + "grad_norm": 4.632787296567119, + "learning_rate": 4.245188483778935e-06, + "loss": 0.5214, + "step": 3404 + }, + { + "epoch": 0.27653699342158694, + "grad_norm": 8.317514548088848, + "learning_rate": 4.244717549972504e-06, + "loss": 0.5625, + "step": 3405 + }, + { + "epoch": 0.2766182083976285, + "grad_norm": 3.6101896303329255, + "learning_rate": 4.2442464954408524e-06, + "loss": 0.5844, + "step": 3406 + }, + { + "epoch": 0.2766994233736701, + "grad_norm": 4.099925210417941, + "learning_rate": 4.243775320216575e-06, + "loss": 0.5043, + "step": 3407 + }, + { + "epoch": 0.27678063834971167, + "grad_norm": 5.864792263094429, + "learning_rate": 4.243304024332273e-06, + "loss": 0.5581, + "step": 3408 + }, + { + "epoch": 0.27686185332575325, + "grad_norm": 4.148641967222804, + "learning_rate": 4.24283260782056e-06, + "loss": 0.7032, + "step": 3409 + }, + { + "epoch": 0.2769430683017948, + "grad_norm": 5.951997919915405, + "learning_rate": 4.2423610707140545e-06, + "loss": 0.4561, + "step": 3410 + }, + { + "epoch": 0.27702428327783646, + "grad_norm": 3.9198889092781233, + "learning_rate": 4.241889413045384e-06, + "loss": 0.6652, + "step": 3411 + }, + { + "epoch": 0.27710549825387804, + "grad_norm": 4.4203783957213725, + "learning_rate": 4.2414176348471845e-06, + "loss": 0.7011, + "step": 3412 + }, + { + "epoch": 0.2771867132299196, + "grad_norm": 2.6236418539062303, + "learning_rate": 4.240945736152101e-06, + "loss": 0.5444, + "step": 3413 + }, + { + "epoch": 0.2772679282059612, + "grad_norm": 3.283626852296422, + "learning_rate": 4.240473716992786e-06, + "loss": 0.5089, + "step": 3414 + }, + { + "epoch": 0.27734914318200277, + "grad_norm": 4.388557235644921, + "learning_rate": 4.240001577401903e-06, + "loss": 0.6291, + "step": 3415 + }, + { + "epoch": 0.27743035815804434, + "grad_norm": 5.882384300446532, + "learning_rate": 4.239529317412118e-06, + "loss": 0.3563, + "step": 3416 + }, + { + "epoch": 0.2775115731340859, + "grad_norm": 5.663696285420261, + "learning_rate": 4.239056937056111e-06, + "loss": 0.4948, + "step": 3417 + }, + { + "epoch": 0.2775927881101275, + "grad_norm": 5.913404763781138, + "learning_rate": 4.238584436366568e-06, + "loss": 0.7371, + "step": 3418 + }, + { + "epoch": 0.2776740030861691, + "grad_norm": 2.9595554999598455, + "learning_rate": 4.238111815376182e-06, + "loss": 0.5524, + "step": 3419 + }, + { + "epoch": 0.27775521806221065, + "grad_norm": 6.622001053281284, + "learning_rate": 4.23763907411766e-06, + "loss": 0.436, + "step": 3420 + }, + { + "epoch": 0.27783643303825223, + "grad_norm": 4.194004633417891, + "learning_rate": 4.237166212623708e-06, + "loss": 0.4323, + "step": 3421 + }, + { + "epoch": 0.27791764801429386, + "grad_norm": 5.700219795773494, + "learning_rate": 4.236693230927048e-06, + "loss": 0.496, + "step": 3422 + }, + { + "epoch": 0.27799886299033544, + "grad_norm": 4.15322324920816, + "learning_rate": 4.2362201290604085e-06, + "loss": 0.5907, + "step": 3423 + }, + { + "epoch": 0.278080077966377, + "grad_norm": 5.540030074318849, + "learning_rate": 4.235746907056525e-06, + "loss": 0.652, + "step": 3424 + }, + { + "epoch": 0.2781612929424186, + "grad_norm": 6.39998859259988, + "learning_rate": 4.235273564948142e-06, + "loss": 0.4876, + "step": 3425 + }, + { + "epoch": 0.27824250791846017, + "grad_norm": 3.7525190738960816, + "learning_rate": 4.234800102768012e-06, + "loss": 0.7998, + "step": 3426 + }, + { + "epoch": 0.27832372289450175, + "grad_norm": 4.294740550636778, + "learning_rate": 4.234326520548895e-06, + "loss": 0.6744, + "step": 3427 + }, + { + "epoch": 0.2784049378705433, + "grad_norm": 5.52457863696226, + "learning_rate": 4.233852818323563e-06, + "loss": 0.4104, + "step": 3428 + }, + { + "epoch": 0.2784861528465849, + "grad_norm": 5.182650646362298, + "learning_rate": 4.233378996124792e-06, + "loss": 0.5443, + "step": 3429 + }, + { + "epoch": 0.2785673678226265, + "grad_norm": 8.063914556948795, + "learning_rate": 4.232905053985368e-06, + "loss": 0.6733, + "step": 3430 + }, + { + "epoch": 0.27864858279866805, + "grad_norm": 3.9395695955779186, + "learning_rate": 4.232430991938085e-06, + "loss": 0.5109, + "step": 3431 + }, + { + "epoch": 0.27872979777470963, + "grad_norm": 2.5171709827838993, + "learning_rate": 4.231956810015747e-06, + "loss": 0.672, + "step": 3432 + }, + { + "epoch": 0.27881101275075126, + "grad_norm": 5.452147154881219, + "learning_rate": 4.231482508251164e-06, + "loss": 0.5055, + "step": 3433 + }, + { + "epoch": 0.27889222772679284, + "grad_norm": 10.095151440140139, + "learning_rate": 4.231008086677154e-06, + "loss": 0.4714, + "step": 3434 + }, + { + "epoch": 0.2789734427028344, + "grad_norm": 4.246691334687768, + "learning_rate": 4.230533545326547e-06, + "loss": 0.5819, + "step": 3435 + }, + { + "epoch": 0.279054657678876, + "grad_norm": 7.118049034428091, + "learning_rate": 4.230058884232177e-06, + "loss": 0.7539, + "step": 3436 + }, + { + "epoch": 0.27913587265491757, + "grad_norm": 4.493070103171718, + "learning_rate": 4.229584103426888e-06, + "loss": 0.5447, + "step": 3437 + }, + { + "epoch": 0.27921708763095915, + "grad_norm": 3.9508050807048236, + "learning_rate": 4.229109202943533e-06, + "loss": 0.5914, + "step": 3438 + }, + { + "epoch": 0.2792983026070007, + "grad_norm": 6.3185262215803775, + "learning_rate": 4.228634182814972e-06, + "loss": 0.5831, + "step": 3439 + }, + { + "epoch": 0.2793795175830423, + "grad_norm": 4.369461605720211, + "learning_rate": 4.228159043074075e-06, + "loss": 0.6527, + "step": 3440 + }, + { + "epoch": 0.2794607325590839, + "grad_norm": 6.77973389079385, + "learning_rate": 4.227683783753717e-06, + "loss": 0.5189, + "step": 3441 + }, + { + "epoch": 0.27954194753512546, + "grad_norm": 4.379921845032675, + "learning_rate": 4.227208404886787e-06, + "loss": 0.5284, + "step": 3442 + }, + { + "epoch": 0.27962316251116703, + "grad_norm": 4.507924635404893, + "learning_rate": 4.2267329065061745e-06, + "loss": 0.5291, + "step": 3443 + }, + { + "epoch": 0.27970437748720867, + "grad_norm": 6.075130482555993, + "learning_rate": 4.226257288644784e-06, + "loss": 0.5183, + "step": 3444 + }, + { + "epoch": 0.27978559246325024, + "grad_norm": 4.72530804483612, + "learning_rate": 4.225781551335526e-06, + "loss": 0.5276, + "step": 3445 + }, + { + "epoch": 0.2798668074392918, + "grad_norm": 5.2670855376913535, + "learning_rate": 4.225305694611318e-06, + "loss": 0.5282, + "step": 3446 + }, + { + "epoch": 0.2799480224153334, + "grad_norm": 4.900878051447264, + "learning_rate": 4.224829718505087e-06, + "loss": 0.5453, + "step": 3447 + }, + { + "epoch": 0.280029237391375, + "grad_norm": 7.178013534818383, + "learning_rate": 4.224353623049767e-06, + "loss": 0.6766, + "step": 3448 + }, + { + "epoch": 0.28011045236741655, + "grad_norm": 4.664935512001858, + "learning_rate": 4.2238774082783025e-06, + "loss": 0.6089, + "step": 3449 + }, + { + "epoch": 0.28019166734345813, + "grad_norm": 4.852476116024416, + "learning_rate": 4.223401074223646e-06, + "loss": 0.6572, + "step": 3450 + }, + { + "epoch": 0.2802728823194997, + "grad_norm": 8.563098982741186, + "learning_rate": 4.222924620918755e-06, + "loss": 0.5789, + "step": 3451 + }, + { + "epoch": 0.2803540972955413, + "grad_norm": 4.014234539171271, + "learning_rate": 4.222448048396599e-06, + "loss": 0.6374, + "step": 3452 + }, + { + "epoch": 0.28043531227158286, + "grad_norm": 4.822310015531145, + "learning_rate": 4.221971356690154e-06, + "loss": 0.5382, + "step": 3453 + }, + { + "epoch": 0.28051652724762444, + "grad_norm": 4.267417151682677, + "learning_rate": 4.221494545832405e-06, + "loss": 0.4697, + "step": 3454 + }, + { + "epoch": 0.28059774222366607, + "grad_norm": 8.796169733126867, + "learning_rate": 4.221017615856344e-06, + "loss": 0.467, + "step": 3455 + }, + { + "epoch": 0.28067895719970765, + "grad_norm": 5.742915475497312, + "learning_rate": 4.220540566794972e-06, + "loss": 0.5189, + "step": 3456 + }, + { + "epoch": 0.2807601721757492, + "grad_norm": 4.702000191158769, + "learning_rate": 4.220063398681299e-06, + "loss": 0.6394, + "step": 3457 + }, + { + "epoch": 0.2808413871517908, + "grad_norm": 7.827322462489466, + "learning_rate": 4.219586111548342e-06, + "loss": 0.427, + "step": 3458 + }, + { + "epoch": 0.2809226021278324, + "grad_norm": 4.5952790790019495, + "learning_rate": 4.219108705429127e-06, + "loss": 0.5275, + "step": 3459 + }, + { + "epoch": 0.28100381710387395, + "grad_norm": 4.260809941886097, + "learning_rate": 4.218631180356688e-06, + "loss": 0.6774, + "step": 3460 + }, + { + "epoch": 0.28108503207991553, + "grad_norm": 5.933145706552524, + "learning_rate": 4.218153536364067e-06, + "loss": 0.5556, + "step": 3461 + }, + { + "epoch": 0.2811662470559571, + "grad_norm": 5.5215075629008545, + "learning_rate": 4.217675773484314e-06, + "loss": 0.6402, + "step": 3462 + }, + { + "epoch": 0.2812474620319987, + "grad_norm": 4.367099875870321, + "learning_rate": 4.217197891750488e-06, + "loss": 0.553, + "step": 3463 + }, + { + "epoch": 0.28132867700804026, + "grad_norm": 8.120031592132388, + "learning_rate": 4.216719891195657e-06, + "loss": 0.7601, + "step": 3464 + }, + { + "epoch": 0.28140989198408184, + "grad_norm": 13.915399996993806, + "learning_rate": 4.216241771852895e-06, + "loss": 0.3699, + "step": 3465 + }, + { + "epoch": 0.28149110696012347, + "grad_norm": 4.475259050725444, + "learning_rate": 4.215763533755285e-06, + "loss": 0.6308, + "step": 3466 + }, + { + "epoch": 0.28157232193616505, + "grad_norm": 4.246073015911453, + "learning_rate": 4.215285176935919e-06, + "loss": 0.5941, + "step": 3467 + }, + { + "epoch": 0.2816535369122066, + "grad_norm": 4.463656799978896, + "learning_rate": 4.214806701427896e-06, + "loss": 0.5504, + "step": 3468 + }, + { + "epoch": 0.2817347518882482, + "grad_norm": 2.9321912614363517, + "learning_rate": 4.214328107264326e-06, + "loss": 0.5993, + "step": 3469 + }, + { + "epoch": 0.2818159668642898, + "grad_norm": 5.912480103116702, + "learning_rate": 4.213849394478323e-06, + "loss": 0.8218, + "step": 3470 + }, + { + "epoch": 0.28189718184033136, + "grad_norm": 3.0648081596549996, + "learning_rate": 4.213370563103013e-06, + "loss": 0.5549, + "step": 3471 + }, + { + "epoch": 0.28197839681637293, + "grad_norm": 7.610704699641839, + "learning_rate": 4.212891613171528e-06, + "loss": 0.539, + "step": 3472 + }, + { + "epoch": 0.2820596117924145, + "grad_norm": 4.821090407393734, + "learning_rate": 4.212412544717009e-06, + "loss": 0.5433, + "step": 3473 + }, + { + "epoch": 0.2821408267684561, + "grad_norm": 4.480177003679071, + "learning_rate": 4.211933357772604e-06, + "loss": 0.5649, + "step": 3474 + }, + { + "epoch": 0.28222204174449766, + "grad_norm": 6.0539202446851546, + "learning_rate": 4.211454052371471e-06, + "loss": 0.5074, + "step": 3475 + }, + { + "epoch": 0.28230325672053924, + "grad_norm": 3.2128791712651066, + "learning_rate": 4.210974628546776e-06, + "loss": 0.6066, + "step": 3476 + }, + { + "epoch": 0.2823844716965809, + "grad_norm": 5.748988331106903, + "learning_rate": 4.210495086331691e-06, + "loss": 0.5114, + "step": 3477 + }, + { + "epoch": 0.28246568667262245, + "grad_norm": 2.985462121611765, + "learning_rate": 4.2100154257594e-06, + "loss": 0.6491, + "step": 3478 + }, + { + "epoch": 0.28254690164866403, + "grad_norm": 4.631989287533722, + "learning_rate": 4.20953564686309e-06, + "loss": 0.5046, + "step": 3479 + }, + { + "epoch": 0.2826281166247056, + "grad_norm": 7.645616919092413, + "learning_rate": 4.2090557496759615e-06, + "loss": 0.5868, + "step": 3480 + }, + { + "epoch": 0.2827093316007472, + "grad_norm": 13.216352857254574, + "learning_rate": 4.208575734231221e-06, + "loss": 0.539, + "step": 3481 + }, + { + "epoch": 0.28279054657678876, + "grad_norm": 4.282296217606311, + "learning_rate": 4.208095600562081e-06, + "loss": 0.6534, + "step": 3482 + }, + { + "epoch": 0.28287176155283034, + "grad_norm": 3.613475303783212, + "learning_rate": 4.2076153487017655e-06, + "loss": 0.5671, + "step": 3483 + }, + { + "epoch": 0.2829529765288719, + "grad_norm": 4.958757945118231, + "learning_rate": 4.207134978683506e-06, + "loss": 0.5416, + "step": 3484 + }, + { + "epoch": 0.2830341915049135, + "grad_norm": 4.199726726733855, + "learning_rate": 4.206654490540541e-06, + "loss": 0.6141, + "step": 3485 + }, + { + "epoch": 0.28311540648095507, + "grad_norm": 4.600582479141581, + "learning_rate": 4.206173884306116e-06, + "loss": 0.6657, + "step": 3486 + }, + { + "epoch": 0.28319662145699664, + "grad_norm": 7.6947988469297375, + "learning_rate": 4.20569316001349e-06, + "loss": 0.5541, + "step": 3487 + }, + { + "epoch": 0.2832778364330383, + "grad_norm": 5.367699544702637, + "learning_rate": 4.205212317695924e-06, + "loss": 0.6524, + "step": 3488 + }, + { + "epoch": 0.28335905140907985, + "grad_norm": 6.029688715188512, + "learning_rate": 4.204731357386689e-06, + "loss": 0.5518, + "step": 3489 + }, + { + "epoch": 0.28344026638512143, + "grad_norm": 4.352090436645465, + "learning_rate": 4.204250279119068e-06, + "loss": 0.5141, + "step": 3490 + }, + { + "epoch": 0.283521481361163, + "grad_norm": 4.073146574157928, + "learning_rate": 4.203769082926346e-06, + "loss": 0.5047, + "step": 3491 + }, + { + "epoch": 0.2836026963372046, + "grad_norm": 4.576078022715138, + "learning_rate": 4.203287768841822e-06, + "loss": 0.5063, + "step": 3492 + }, + { + "epoch": 0.28368391131324616, + "grad_norm": 5.141508394238785, + "learning_rate": 4.202806336898798e-06, + "loss": 0.5, + "step": 3493 + }, + { + "epoch": 0.28376512628928774, + "grad_norm": 4.33832646184562, + "learning_rate": 4.202324787130587e-06, + "loss": 0.5375, + "step": 3494 + }, + { + "epoch": 0.2838463412653293, + "grad_norm": 7.671372488757828, + "learning_rate": 4.201843119570511e-06, + "loss": 0.6192, + "step": 3495 + }, + { + "epoch": 0.2839275562413709, + "grad_norm": 6.000625361865301, + "learning_rate": 4.201361334251898e-06, + "loss": 0.5039, + "step": 3496 + }, + { + "epoch": 0.28400877121741247, + "grad_norm": 10.56462611054689, + "learning_rate": 4.200879431208084e-06, + "loss": 0.4379, + "step": 3497 + }, + { + "epoch": 0.28408998619345405, + "grad_norm": 5.2118113489010325, + "learning_rate": 4.200397410472416e-06, + "loss": 0.4859, + "step": 3498 + }, + { + "epoch": 0.2841712011694957, + "grad_norm": 3.7680592065666447, + "learning_rate": 4.199915272078247e-06, + "loss": 0.5509, + "step": 3499 + }, + { + "epoch": 0.28425241614553726, + "grad_norm": 4.998842782205934, + "learning_rate": 4.199433016058936e-06, + "loss": 0.5495, + "step": 3500 + }, + { + "epoch": 0.28433363112157883, + "grad_norm": 3.6541213589146166, + "learning_rate": 4.198950642447856e-06, + "loss": 0.7963, + "step": 3501 + }, + { + "epoch": 0.2844148460976204, + "grad_norm": 4.6320107487602655, + "learning_rate": 4.198468151278382e-06, + "loss": 0.5354, + "step": 3502 + }, + { + "epoch": 0.284496061073662, + "grad_norm": 16.856109294832514, + "learning_rate": 4.197985542583902e-06, + "loss": 0.5639, + "step": 3503 + }, + { + "epoch": 0.28457727604970356, + "grad_norm": 4.341047779732985, + "learning_rate": 4.197502816397809e-06, + "loss": 0.5327, + "step": 3504 + }, + { + "epoch": 0.28465849102574514, + "grad_norm": 6.54679516358405, + "learning_rate": 4.197019972753504e-06, + "loss": 0.5685, + "step": 3505 + }, + { + "epoch": 0.2847397060017867, + "grad_norm": 8.538628468801688, + "learning_rate": 4.1965370116843985e-06, + "loss": 0.6608, + "step": 3506 + }, + { + "epoch": 0.2848209209778283, + "grad_norm": 6.781146736308602, + "learning_rate": 4.1960539332239115e-06, + "loss": 0.5363, + "step": 3507 + }, + { + "epoch": 0.2849021359538699, + "grad_norm": 6.568218511710609, + "learning_rate": 4.195570737405468e-06, + "loss": 0.4654, + "step": 3508 + }, + { + "epoch": 0.28498335092991145, + "grad_norm": 6.14037981907578, + "learning_rate": 4.195087424262503e-06, + "loss": 0.6075, + "step": 3509 + }, + { + "epoch": 0.2850645659059531, + "grad_norm": 3.9057111334726016, + "learning_rate": 4.194603993828459e-06, + "loss": 0.4975, + "step": 3510 + }, + { + "epoch": 0.28514578088199466, + "grad_norm": 6.56859758615991, + "learning_rate": 4.194120446136788e-06, + "loss": 0.6143, + "step": 3511 + }, + { + "epoch": 0.28522699585803624, + "grad_norm": 7.248281271806781, + "learning_rate": 4.193636781220948e-06, + "loss": 0.6135, + "step": 3512 + }, + { + "epoch": 0.2853082108340778, + "grad_norm": 3.6814056969521265, + "learning_rate": 4.1931529991144056e-06, + "loss": 0.644, + "step": 3513 + }, + { + "epoch": 0.2853894258101194, + "grad_norm": 4.124004556862366, + "learning_rate": 4.192669099850637e-06, + "loss": 0.4091, + "step": 3514 + }, + { + "epoch": 0.28547064078616097, + "grad_norm": 9.237020510741628, + "learning_rate": 4.192185083463125e-06, + "loss": 0.6916, + "step": 3515 + }, + { + "epoch": 0.28555185576220254, + "grad_norm": 4.325518645446921, + "learning_rate": 4.19170094998536e-06, + "loss": 0.641, + "step": 3516 + }, + { + "epoch": 0.2856330707382441, + "grad_norm": 3.772328467417333, + "learning_rate": 4.191216699450844e-06, + "loss": 0.5248, + "step": 3517 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 4.5828680511801, + "learning_rate": 4.190732331893083e-06, + "loss": 0.5488, + "step": 3518 + }, + { + "epoch": 0.2857955006903273, + "grad_norm": 6.7144709613110045, + "learning_rate": 4.190247847345591e-06, + "loss": 0.6085, + "step": 3519 + }, + { + "epoch": 0.28587671566636885, + "grad_norm": 6.61578411126398, + "learning_rate": 4.189763245841895e-06, + "loss": 0.4582, + "step": 3520 + }, + { + "epoch": 0.2859579306424105, + "grad_norm": 7.697721977409483, + "learning_rate": 4.189278527415524e-06, + "loss": 0.4666, + "step": 3521 + }, + { + "epoch": 0.28603914561845206, + "grad_norm": 9.805505848445298, + "learning_rate": 4.188793692100021e-06, + "loss": 0.5197, + "step": 3522 + }, + { + "epoch": 0.28612036059449364, + "grad_norm": 4.38159024051503, + "learning_rate": 4.1883087399289315e-06, + "loss": 0.6191, + "step": 3523 + }, + { + "epoch": 0.2862015755705352, + "grad_norm": 5.443162368213296, + "learning_rate": 4.187823670935812e-06, + "loss": 0.4839, + "step": 3524 + }, + { + "epoch": 0.2862827905465768, + "grad_norm": 9.060902521439406, + "learning_rate": 4.187338485154228e-06, + "loss": 0.5212, + "step": 3525 + }, + { + "epoch": 0.28636400552261837, + "grad_norm": 4.5276722061450405, + "learning_rate": 4.186853182617751e-06, + "loss": 0.5874, + "step": 3526 + }, + { + "epoch": 0.28644522049865995, + "grad_norm": 4.787775939104659, + "learning_rate": 4.1863677633599605e-06, + "loss": 0.6216, + "step": 3527 + }, + { + "epoch": 0.2865264354747015, + "grad_norm": 4.032333523480888, + "learning_rate": 4.1858822274144465e-06, + "loss": 0.8107, + "step": 3528 + }, + { + "epoch": 0.2866076504507431, + "grad_norm": 4.164487711997063, + "learning_rate": 4.185396574814804e-06, + "loss": 0.5834, + "step": 3529 + }, + { + "epoch": 0.2866888654267847, + "grad_norm": 4.736119308914805, + "learning_rate": 4.184910805594639e-06, + "loss": 0.5395, + "step": 3530 + }, + { + "epoch": 0.28677008040282626, + "grad_norm": 4.206365573240851, + "learning_rate": 4.184424919787563e-06, + "loss": 0.634, + "step": 3531 + }, + { + "epoch": 0.2868512953788679, + "grad_norm": 3.609471547339424, + "learning_rate": 4.183938917427198e-06, + "loss": 0.512, + "step": 3532 + }, + { + "epoch": 0.28693251035490946, + "grad_norm": 3.3756577667802303, + "learning_rate": 4.183452798547171e-06, + "loss": 0.6594, + "step": 3533 + }, + { + "epoch": 0.28701372533095104, + "grad_norm": 4.577782600286523, + "learning_rate": 4.1829665631811214e-06, + "loss": 0.5162, + "step": 3534 + }, + { + "epoch": 0.2870949403069926, + "grad_norm": 5.423760632233463, + "learning_rate": 4.182480211362691e-06, + "loss": 0.6164, + "step": 3535 + }, + { + "epoch": 0.2871761552830342, + "grad_norm": 8.37175609091639, + "learning_rate": 4.181993743125535e-06, + "loss": 0.5202, + "step": 3536 + }, + { + "epoch": 0.2872573702590758, + "grad_norm": 4.332707609863406, + "learning_rate": 4.181507158503314e-06, + "loss": 0.7249, + "step": 3537 + }, + { + "epoch": 0.28733858523511735, + "grad_norm": 4.538954914725168, + "learning_rate": 4.1810204575296966e-06, + "loss": 0.5169, + "step": 3538 + }, + { + "epoch": 0.2874198002111589, + "grad_norm": 4.191746753570993, + "learning_rate": 4.180533640238361e-06, + "loss": 0.6053, + "step": 3539 + }, + { + "epoch": 0.2875010151872005, + "grad_norm": 4.85627080590084, + "learning_rate": 4.180046706662991e-06, + "loss": 0.5235, + "step": 3540 + }, + { + "epoch": 0.2875822301632421, + "grad_norm": 6.5813372383035915, + "learning_rate": 4.17955965683728e-06, + "loss": 0.5635, + "step": 3541 + }, + { + "epoch": 0.28766344513928366, + "grad_norm": 6.353502644005653, + "learning_rate": 4.17907249079493e-06, + "loss": 0.5271, + "step": 3542 + }, + { + "epoch": 0.2877446601153253, + "grad_norm": 3.316222423088927, + "learning_rate": 4.17858520856965e-06, + "loss": 0.5198, + "step": 3543 + }, + { + "epoch": 0.28782587509136687, + "grad_norm": 3.6027797950419846, + "learning_rate": 4.178097810195157e-06, + "loss": 0.5364, + "step": 3544 + }, + { + "epoch": 0.28790709006740844, + "grad_norm": 6.196439592693107, + "learning_rate": 4.177610295705178e-06, + "loss": 0.5973, + "step": 3545 + }, + { + "epoch": 0.28798830504345, + "grad_norm": 3.281604318639601, + "learning_rate": 4.177122665133444e-06, + "loss": 0.6097, + "step": 3546 + }, + { + "epoch": 0.2880695200194916, + "grad_norm": 5.0788056380548525, + "learning_rate": 4.176634918513698e-06, + "loss": 0.479, + "step": 3547 + }, + { + "epoch": 0.2881507349955332, + "grad_norm": 10.492165661109485, + "learning_rate": 4.176147055879689e-06, + "loss": 0.6935, + "step": 3548 + }, + { + "epoch": 0.28823194997157475, + "grad_norm": 5.566980762163421, + "learning_rate": 4.175659077265175e-06, + "loss": 0.5723, + "step": 3549 + }, + { + "epoch": 0.28831316494761633, + "grad_norm": 4.510747031993934, + "learning_rate": 4.175170982703921e-06, + "loss": 0.4667, + "step": 3550 + }, + { + "epoch": 0.2883943799236579, + "grad_norm": 5.795017468211081, + "learning_rate": 4.1746827722297e-06, + "loss": 0.6429, + "step": 3551 + }, + { + "epoch": 0.2884755948996995, + "grad_norm": 4.90772281384199, + "learning_rate": 4.174194445876295e-06, + "loss": 0.6138, + "step": 3552 + }, + { + "epoch": 0.28855680987574106, + "grad_norm": 9.38458036391076, + "learning_rate": 4.1737060036774945e-06, + "loss": 0.5942, + "step": 3553 + }, + { + "epoch": 0.2886380248517827, + "grad_norm": 3.744621239251605, + "learning_rate": 4.173217445667097e-06, + "loss": 0.4725, + "step": 3554 + }, + { + "epoch": 0.28871923982782427, + "grad_norm": 3.5760812252375858, + "learning_rate": 4.172728771878908e-06, + "loss": 0.4955, + "step": 3555 + }, + { + "epoch": 0.28880045480386585, + "grad_norm": 5.300448357602322, + "learning_rate": 4.17223998234674e-06, + "loss": 0.5496, + "step": 3556 + }, + { + "epoch": 0.2888816697799074, + "grad_norm": 4.301151535524972, + "learning_rate": 4.171751077104415e-06, + "loss": 0.6269, + "step": 3557 + }, + { + "epoch": 0.288962884755949, + "grad_norm": 4.150238159594849, + "learning_rate": 4.171262056185764e-06, + "loss": 0.6023, + "step": 3558 + }, + { + "epoch": 0.2890440997319906, + "grad_norm": 6.588110479356063, + "learning_rate": 4.170772919624624e-06, + "loss": 0.5044, + "step": 3559 + }, + { + "epoch": 0.28912531470803216, + "grad_norm": 9.52780556407402, + "learning_rate": 4.170283667454839e-06, + "loss": 0.4627, + "step": 3560 + }, + { + "epoch": 0.28920652968407373, + "grad_norm": 3.112608140290074, + "learning_rate": 4.169794299710266e-06, + "loss": 0.5403, + "step": 3561 + }, + { + "epoch": 0.2892877446601153, + "grad_norm": 5.782425679496101, + "learning_rate": 4.169304816424763e-06, + "loss": 0.6422, + "step": 3562 + }, + { + "epoch": 0.2893689596361569, + "grad_norm": 6.327129043676947, + "learning_rate": 4.168815217632202e-06, + "loss": 0.5983, + "step": 3563 + }, + { + "epoch": 0.28945017461219846, + "grad_norm": 5.64892446517355, + "learning_rate": 4.168325503366461e-06, + "loss": 0.6639, + "step": 3564 + }, + { + "epoch": 0.2895313895882401, + "grad_norm": 4.212091644362981, + "learning_rate": 4.167835673661422e-06, + "loss": 0.5173, + "step": 3565 + }, + { + "epoch": 0.2896126045642817, + "grad_norm": 4.627484710444349, + "learning_rate": 4.167345728550984e-06, + "loss": 0.5776, + "step": 3566 + }, + { + "epoch": 0.28969381954032325, + "grad_norm": 6.431815623240765, + "learning_rate": 4.166855668069045e-06, + "loss": 0.5357, + "step": 3567 + }, + { + "epoch": 0.2897750345163648, + "grad_norm": 5.104463737375422, + "learning_rate": 4.166365492249514e-06, + "loss": 0.4888, + "step": 3568 + }, + { + "epoch": 0.2898562494924064, + "grad_norm": 2.6553169339995755, + "learning_rate": 4.1658752011263125e-06, + "loss": 0.4652, + "step": 3569 + }, + { + "epoch": 0.289937464468448, + "grad_norm": 4.522379342995965, + "learning_rate": 4.1653847947333625e-06, + "loss": 0.6268, + "step": 3570 + }, + { + "epoch": 0.29001867944448956, + "grad_norm": 4.206423587982746, + "learning_rate": 4.164894273104599e-06, + "loss": 0.7023, + "step": 3571 + }, + { + "epoch": 0.29009989442053113, + "grad_norm": 3.550921658943635, + "learning_rate": 4.164403636273963e-06, + "loss": 0.6467, + "step": 3572 + }, + { + "epoch": 0.2901811093965727, + "grad_norm": 4.41960430630749, + "learning_rate": 4.163912884275403e-06, + "loss": 0.6449, + "step": 3573 + }, + { + "epoch": 0.2902623243726143, + "grad_norm": 5.335473970114484, + "learning_rate": 4.163422017142879e-06, + "loss": 0.532, + "step": 3574 + }, + { + "epoch": 0.29034353934865587, + "grad_norm": 9.485322332094816, + "learning_rate": 4.162931034910354e-06, + "loss": 0.5057, + "step": 3575 + }, + { + "epoch": 0.2904247543246975, + "grad_norm": 5.472990906211878, + "learning_rate": 4.162439937611803e-06, + "loss": 0.5523, + "step": 3576 + }, + { + "epoch": 0.2905059693007391, + "grad_norm": 4.626244114541653, + "learning_rate": 4.161948725281206e-06, + "loss": 0.5859, + "step": 3577 + }, + { + "epoch": 0.29058718427678065, + "grad_norm": 4.102620389372604, + "learning_rate": 4.161457397952553e-06, + "loss": 0.5745, + "step": 3578 + }, + { + "epoch": 0.29066839925282223, + "grad_norm": 5.532218377747797, + "learning_rate": 4.160965955659843e-06, + "loss": 0.4707, + "step": 3579 + }, + { + "epoch": 0.2907496142288638, + "grad_norm": 4.2544863217335545, + "learning_rate": 4.160474398437077e-06, + "loss": 0.5238, + "step": 3580 + }, + { + "epoch": 0.2908308292049054, + "grad_norm": 5.496605956094565, + "learning_rate": 4.159982726318271e-06, + "loss": 0.8256, + "step": 3581 + }, + { + "epoch": 0.29091204418094696, + "grad_norm": 4.300837180788803, + "learning_rate": 4.159490939337447e-06, + "loss": 0.6179, + "step": 3582 + }, + { + "epoch": 0.29099325915698854, + "grad_norm": 3.957326025756882, + "learning_rate": 4.158999037528632e-06, + "loss": 0.5216, + "step": 3583 + }, + { + "epoch": 0.2910744741330301, + "grad_norm": 5.327052291363593, + "learning_rate": 4.1585070209258635e-06, + "loss": 0.859, + "step": 3584 + }, + { + "epoch": 0.2911556891090717, + "grad_norm": 6.708637795707786, + "learning_rate": 4.158014889563187e-06, + "loss": 0.4936, + "step": 3585 + }, + { + "epoch": 0.29123690408511327, + "grad_norm": 3.6081169619085487, + "learning_rate": 4.157522643474654e-06, + "loss": 0.4951, + "step": 3586 + }, + { + "epoch": 0.2913181190611549, + "grad_norm": 5.692407639431199, + "learning_rate": 4.157030282694328e-06, + "loss": 0.4514, + "step": 3587 + }, + { + "epoch": 0.2913993340371965, + "grad_norm": 5.96319840346763, + "learning_rate": 4.156537807256275e-06, + "loss": 0.6435, + "step": 3588 + }, + { + "epoch": 0.29148054901323805, + "grad_norm": 4.057628153052015, + "learning_rate": 4.156045217194573e-06, + "loss": 0.5785, + "step": 3589 + }, + { + "epoch": 0.29156176398927963, + "grad_norm": 4.725549294999352, + "learning_rate": 4.1555525125433074e-06, + "loss": 0.561, + "step": 3590 + }, + { + "epoch": 0.2916429789653212, + "grad_norm": 4.897497425355875, + "learning_rate": 4.155059693336569e-06, + "loss": 0.4877, + "step": 3591 + }, + { + "epoch": 0.2917241939413628, + "grad_norm": 6.7306605891617, + "learning_rate": 4.1545667596084596e-06, + "loss": 0.6536, + "step": 3592 + }, + { + "epoch": 0.29180540891740436, + "grad_norm": 4.458589616639652, + "learning_rate": 4.154073711393087e-06, + "loss": 0.6075, + "step": 3593 + }, + { + "epoch": 0.29188662389344594, + "grad_norm": 16.172763248388275, + "learning_rate": 4.153580548724567e-06, + "loss": 0.5503, + "step": 3594 + }, + { + "epoch": 0.2919678388694875, + "grad_norm": 5.597703145374191, + "learning_rate": 4.153087271637025e-06, + "loss": 0.5837, + "step": 3595 + }, + { + "epoch": 0.2920490538455291, + "grad_norm": 11.754470585418524, + "learning_rate": 4.1525938801645926e-06, + "loss": 0.5668, + "step": 3596 + }, + { + "epoch": 0.29213026882157067, + "grad_norm": 6.254803456678843, + "learning_rate": 4.152100374341409e-06, + "loss": 0.5732, + "step": 3597 + }, + { + "epoch": 0.2922114837976123, + "grad_norm": 5.190101012718226, + "learning_rate": 4.151606754201625e-06, + "loss": 0.6051, + "step": 3598 + }, + { + "epoch": 0.2922926987736539, + "grad_norm": 10.669928997016616, + "learning_rate": 4.151113019779393e-06, + "loss": 0.639, + "step": 3599 + }, + { + "epoch": 0.29237391374969546, + "grad_norm": 5.619799524172815, + "learning_rate": 4.150619171108879e-06, + "loss": 0.5745, + "step": 3600 + }, + { + "epoch": 0.29245512872573703, + "grad_norm": 4.562644028883115, + "learning_rate": 4.150125208224255e-06, + "loss": 0.6914, + "step": 3601 + }, + { + "epoch": 0.2925363437017786, + "grad_norm": 4.491512432892291, + "learning_rate": 4.149631131159698e-06, + "loss": 0.4882, + "step": 3602 + }, + { + "epoch": 0.2926175586778202, + "grad_norm": 4.214753071853409, + "learning_rate": 4.149136939949399e-06, + "loss": 0.5967, + "step": 3603 + }, + { + "epoch": 0.29269877365386177, + "grad_norm": 6.53315802645859, + "learning_rate": 4.14864263462755e-06, + "loss": 0.5239, + "step": 3604 + }, + { + "epoch": 0.29277998862990334, + "grad_norm": 5.806026900388708, + "learning_rate": 4.148148215228357e-06, + "loss": 0.6479, + "step": 3605 + }, + { + "epoch": 0.2928612036059449, + "grad_norm": 4.392767448225316, + "learning_rate": 4.147653681786031e-06, + "loss": 0.5045, + "step": 3606 + }, + { + "epoch": 0.2929424185819865, + "grad_norm": 7.680285801817868, + "learning_rate": 4.147159034334789e-06, + "loss": 0.5433, + "step": 3607 + }, + { + "epoch": 0.2930236335580281, + "grad_norm": 5.275584522954317, + "learning_rate": 4.146664272908859e-06, + "loss": 0.5867, + "step": 3608 + }, + { + "epoch": 0.2931048485340697, + "grad_norm": 5.37369190885388, + "learning_rate": 4.146169397542478e-06, + "loss": 0.5683, + "step": 3609 + }, + { + "epoch": 0.2931860635101113, + "grad_norm": 3.8049589428305413, + "learning_rate": 4.145674408269885e-06, + "loss": 0.6545, + "step": 3610 + }, + { + "epoch": 0.29326727848615286, + "grad_norm": 5.091756495748853, + "learning_rate": 4.145179305125333e-06, + "loss": 0.5765, + "step": 3611 + }, + { + "epoch": 0.29334849346219444, + "grad_norm": 13.084271350497739, + "learning_rate": 4.14468408814308e-06, + "loss": 0.5044, + "step": 3612 + }, + { + "epoch": 0.293429708438236, + "grad_norm": 5.905329389388773, + "learning_rate": 4.1441887573573935e-06, + "loss": 0.5513, + "step": 3613 + }, + { + "epoch": 0.2935109234142776, + "grad_norm": 4.255974685046024, + "learning_rate": 4.143693312802546e-06, + "loss": 0.4885, + "step": 3614 + }, + { + "epoch": 0.29359213839031917, + "grad_norm": 7.189955017278807, + "learning_rate": 4.143197754512821e-06, + "loss": 0.5293, + "step": 3615 + }, + { + "epoch": 0.29367335336636075, + "grad_norm": 4.434545028745304, + "learning_rate": 4.142702082522507e-06, + "loss": 0.4807, + "step": 3616 + }, + { + "epoch": 0.2937545683424023, + "grad_norm": 5.613891388134908, + "learning_rate": 4.142206296865904e-06, + "loss": 0.5229, + "step": 3617 + }, + { + "epoch": 0.2938357833184439, + "grad_norm": 3.159086352727405, + "learning_rate": 4.141710397577315e-06, + "loss": 0.6718, + "step": 3618 + }, + { + "epoch": 0.2939169982944855, + "grad_norm": 3.5367459467799858, + "learning_rate": 4.141214384691056e-06, + "loss": 0.5547, + "step": 3619 + }, + { + "epoch": 0.2939982132705271, + "grad_norm": 7.18465729456046, + "learning_rate": 4.1407182582414476e-06, + "loss": 0.5301, + "step": 3620 + }, + { + "epoch": 0.2940794282465687, + "grad_norm": 5.6826492806068405, + "learning_rate": 4.140222018262818e-06, + "loss": 0.6391, + "step": 3621 + }, + { + "epoch": 0.29416064322261026, + "grad_norm": 3.357835141229998, + "learning_rate": 4.139725664789507e-06, + "loss": 0.7172, + "step": 3622 + }, + { + "epoch": 0.29424185819865184, + "grad_norm": 17.104412542873206, + "learning_rate": 4.139229197855857e-06, + "loss": 0.441, + "step": 3623 + }, + { + "epoch": 0.2943230731746934, + "grad_norm": 4.059380496052101, + "learning_rate": 4.138732617496223e-06, + "loss": 0.5264, + "step": 3624 + }, + { + "epoch": 0.294404288150735, + "grad_norm": 4.907343820967712, + "learning_rate": 4.138235923744964e-06, + "loss": 0.4865, + "step": 3625 + }, + { + "epoch": 0.29448550312677657, + "grad_norm": 4.767491409739287, + "learning_rate": 4.13773911663645e-06, + "loss": 0.5446, + "step": 3626 + }, + { + "epoch": 0.29456671810281815, + "grad_norm": 11.043074971749009, + "learning_rate": 4.137242196205056e-06, + "loss": 0.514, + "step": 3627 + }, + { + "epoch": 0.2946479330788597, + "grad_norm": 8.751284247365856, + "learning_rate": 4.136745162485168e-06, + "loss": 0.5782, + "step": 3628 + }, + { + "epoch": 0.2947291480549013, + "grad_norm": 4.259342421928949, + "learning_rate": 4.1362480155111764e-06, + "loss": 0.4735, + "step": 3629 + }, + { + "epoch": 0.2948103630309429, + "grad_norm": 5.982588574242516, + "learning_rate": 4.135750755317481e-06, + "loss": 0.6233, + "step": 3630 + }, + { + "epoch": 0.2948915780069845, + "grad_norm": 4.717722006131577, + "learning_rate": 4.135253381938492e-06, + "loss": 0.6496, + "step": 3631 + }, + { + "epoch": 0.2949727929830261, + "grad_norm": 3.9506979794735337, + "learning_rate": 4.134755895408623e-06, + "loss": 0.6076, + "step": 3632 + }, + { + "epoch": 0.29505400795906767, + "grad_norm": 6.244712141178371, + "learning_rate": 4.134258295762297e-06, + "loss": 0.5481, + "step": 3633 + }, + { + "epoch": 0.29513522293510924, + "grad_norm": 5.110391491927889, + "learning_rate": 4.1337605830339465e-06, + "loss": 0.5417, + "step": 3634 + }, + { + "epoch": 0.2952164379111508, + "grad_norm": 4.8675201483594615, + "learning_rate": 4.133262757258011e-06, + "loss": 0.6065, + "step": 3635 + }, + { + "epoch": 0.2952976528871924, + "grad_norm": 11.498154785191828, + "learning_rate": 4.132764818468936e-06, + "loss": 0.5392, + "step": 3636 + }, + { + "epoch": 0.295378867863234, + "grad_norm": 6.065167654609438, + "learning_rate": 4.1322667667011774e-06, + "loss": 0.6767, + "step": 3637 + }, + { + "epoch": 0.29546008283927555, + "grad_norm": 9.853858235216926, + "learning_rate": 4.131768601989196e-06, + "loss": 0.5793, + "step": 3638 + }, + { + "epoch": 0.2955412978153171, + "grad_norm": 5.19821566952064, + "learning_rate": 4.131270324367464e-06, + "loss": 0.7266, + "step": 3639 + }, + { + "epoch": 0.2956225127913587, + "grad_norm": 6.474598040312163, + "learning_rate": 4.130771933870459e-06, + "loss": 0.6649, + "step": 3640 + }, + { + "epoch": 0.2957037277674003, + "grad_norm": 6.622824718236689, + "learning_rate": 4.130273430532667e-06, + "loss": 0.4317, + "step": 3641 + }, + { + "epoch": 0.2957849427434419, + "grad_norm": 8.781051249556342, + "learning_rate": 4.129774814388582e-06, + "loss": 0.4864, + "step": 3642 + }, + { + "epoch": 0.2958661577194835, + "grad_norm": 4.776911195727594, + "learning_rate": 4.1292760854727045e-06, + "loss": 0.5531, + "step": 3643 + }, + { + "epoch": 0.29594737269552507, + "grad_norm": 4.828770110545079, + "learning_rate": 4.128777243819546e-06, + "loss": 0.5435, + "step": 3644 + }, + { + "epoch": 0.29602858767156665, + "grad_norm": 4.550741300328438, + "learning_rate": 4.128278289463621e-06, + "loss": 0.4474, + "step": 3645 + }, + { + "epoch": 0.2961098026476082, + "grad_norm": 3.03442154965047, + "learning_rate": 4.127779222439457e-06, + "loss": 0.5896, + "step": 3646 + }, + { + "epoch": 0.2961910176236498, + "grad_norm": 4.7725701865486085, + "learning_rate": 4.127280042781585e-06, + "loss": 0.6183, + "step": 3647 + }, + { + "epoch": 0.2962722325996914, + "grad_norm": 5.751607562129402, + "learning_rate": 4.126780750524546e-06, + "loss": 0.4919, + "step": 3648 + }, + { + "epoch": 0.29635344757573295, + "grad_norm": 3.5887135989563057, + "learning_rate": 4.126281345702889e-06, + "loss": 0.5275, + "step": 3649 + }, + { + "epoch": 0.29643466255177453, + "grad_norm": 6.464495222017677, + "learning_rate": 4.125781828351171e-06, + "loss": 0.8401, + "step": 3650 + }, + { + "epoch": 0.2965158775278161, + "grad_norm": 5.980573147403461, + "learning_rate": 4.125282198503953e-06, + "loss": 0.5954, + "step": 3651 + }, + { + "epoch": 0.2965970925038577, + "grad_norm": 5.228111544471104, + "learning_rate": 4.124782456195809e-06, + "loss": 0.5105, + "step": 3652 + }, + { + "epoch": 0.2966783074798993, + "grad_norm": 7.53149544733575, + "learning_rate": 4.124282601461319e-06, + "loss": 0.4924, + "step": 3653 + }, + { + "epoch": 0.2967595224559409, + "grad_norm": 5.556355856990694, + "learning_rate": 4.123782634335068e-06, + "loss": 0.4124, + "step": 3654 + }, + { + "epoch": 0.29684073743198247, + "grad_norm": 4.358048092574572, + "learning_rate": 4.123282554851654e-06, + "loss": 0.5824, + "step": 3655 + }, + { + "epoch": 0.29692195240802405, + "grad_norm": 3.5786979265887187, + "learning_rate": 4.122782363045677e-06, + "loss": 0.4748, + "step": 3656 + }, + { + "epoch": 0.2970031673840656, + "grad_norm": 4.524617536184045, + "learning_rate": 4.12228205895175e-06, + "loss": 0.4482, + "step": 3657 + }, + { + "epoch": 0.2970843823601072, + "grad_norm": 6.2371920872247735, + "learning_rate": 4.12178164260449e-06, + "loss": 0.565, + "step": 3658 + }, + { + "epoch": 0.2971655973361488, + "grad_norm": 3.54005929501434, + "learning_rate": 4.121281114038524e-06, + "loss": 0.4706, + "step": 3659 + }, + { + "epoch": 0.29724681231219036, + "grad_norm": 6.2010580244373905, + "learning_rate": 4.120780473288485e-06, + "loss": 0.4807, + "step": 3660 + }, + { + "epoch": 0.29732802728823193, + "grad_norm": 3.39391672989296, + "learning_rate": 4.120279720389015e-06, + "loss": 0.5279, + "step": 3661 + }, + { + "epoch": 0.2974092422642735, + "grad_norm": 7.485601794086229, + "learning_rate": 4.119778855374763e-06, + "loss": 0.6333, + "step": 3662 + }, + { + "epoch": 0.2974904572403151, + "grad_norm": 4.283931201224972, + "learning_rate": 4.1192778782803875e-06, + "loss": 0.5886, + "step": 3663 + }, + { + "epoch": 0.2975716722163567, + "grad_norm": 6.09549459673354, + "learning_rate": 4.118776789140551e-06, + "loss": 0.4158, + "step": 3664 + }, + { + "epoch": 0.2976528871923983, + "grad_norm": 3.9183720909856645, + "learning_rate": 4.1182755879899305e-06, + "loss": 0.5857, + "step": 3665 + }, + { + "epoch": 0.2977341021684399, + "grad_norm": 4.554155011331954, + "learning_rate": 4.117774274863203e-06, + "loss": 0.5828, + "step": 3666 + }, + { + "epoch": 0.29781531714448145, + "grad_norm": 6.026037573474456, + "learning_rate": 4.117272849795057e-06, + "loss": 0.7132, + "step": 3667 + }, + { + "epoch": 0.297896532120523, + "grad_norm": 5.369767546922633, + "learning_rate": 4.116771312820189e-06, + "loss": 0.6505, + "step": 3668 + }, + { + "epoch": 0.2979777470965646, + "grad_norm": 3.49258490401417, + "learning_rate": 4.116269663973304e-06, + "loss": 0.537, + "step": 3669 + }, + { + "epoch": 0.2980589620726062, + "grad_norm": 8.038084077662122, + "learning_rate": 4.115767903289112e-06, + "loss": 0.6225, + "step": 3670 + }, + { + "epoch": 0.29814017704864776, + "grad_norm": 10.562283989123358, + "learning_rate": 4.115266030802332e-06, + "loss": 0.4825, + "step": 3671 + }, + { + "epoch": 0.29822139202468934, + "grad_norm": 5.676004986730943, + "learning_rate": 4.114764046547691e-06, + "loss": 0.59, + "step": 3672 + }, + { + "epoch": 0.2983026070007309, + "grad_norm": 8.432493876901468, + "learning_rate": 4.114261950559924e-06, + "loss": 0.5298, + "step": 3673 + }, + { + "epoch": 0.2983838219767725, + "grad_norm": 4.1760389034352015, + "learning_rate": 4.113759742873774e-06, + "loss": 0.6273, + "step": 3674 + }, + { + "epoch": 0.2984650369528141, + "grad_norm": 6.694938489409981, + "learning_rate": 4.11325742352399e-06, + "loss": 0.4842, + "step": 3675 + }, + { + "epoch": 0.2985462519288557, + "grad_norm": 4.346992428927781, + "learning_rate": 4.112754992545331e-06, + "loss": 0.5768, + "step": 3676 + }, + { + "epoch": 0.2986274669048973, + "grad_norm": 4.528058186506179, + "learning_rate": 4.112252449972562e-06, + "loss": 0.5956, + "step": 3677 + }, + { + "epoch": 0.29870868188093885, + "grad_norm": 4.764966099793393, + "learning_rate": 4.111749795840455e-06, + "loss": 0.5005, + "step": 3678 + }, + { + "epoch": 0.29878989685698043, + "grad_norm": 10.20025692058819, + "learning_rate": 4.111247030183793e-06, + "loss": 0.5522, + "step": 3679 + }, + { + "epoch": 0.298871111833022, + "grad_norm": 31.38171024066769, + "learning_rate": 4.110744153037363e-06, + "loss": 0.505, + "step": 3680 + }, + { + "epoch": 0.2989523268090636, + "grad_norm": 4.523115084329832, + "learning_rate": 4.110241164435964e-06, + "loss": 0.4673, + "step": 3681 + }, + { + "epoch": 0.29903354178510516, + "grad_norm": 3.7531700099140393, + "learning_rate": 4.109738064414397e-06, + "loss": 0.4812, + "step": 3682 + }, + { + "epoch": 0.29911475676114674, + "grad_norm": 5.06537300558853, + "learning_rate": 4.1092348530074764e-06, + "loss": 0.5328, + "step": 3683 + }, + { + "epoch": 0.2991959717371883, + "grad_norm": 6.176828372115656, + "learning_rate": 4.10873153025002e-06, + "loss": 0.4383, + "step": 3684 + }, + { + "epoch": 0.2992771867132299, + "grad_norm": 5.037654343199056, + "learning_rate": 4.108228096176856e-06, + "loss": 0.6148, + "step": 3685 + }, + { + "epoch": 0.2993584016892715, + "grad_norm": 6.942767647531231, + "learning_rate": 4.10772455082282e-06, + "loss": 0.558, + "step": 3686 + }, + { + "epoch": 0.2994396166653131, + "grad_norm": 4.277595045669092, + "learning_rate": 4.107220894222753e-06, + "loss": 0.7493, + "step": 3687 + }, + { + "epoch": 0.2995208316413547, + "grad_norm": 6.128933496939369, + "learning_rate": 4.106717126411506e-06, + "loss": 0.4093, + "step": 3688 + }, + { + "epoch": 0.29960204661739626, + "grad_norm": 3.4094618806546713, + "learning_rate": 4.106213247423938e-06, + "loss": 0.5386, + "step": 3689 + }, + { + "epoch": 0.29968326159343783, + "grad_norm": 5.429499060264869, + "learning_rate": 4.105709257294914e-06, + "loss": 0.5606, + "step": 3690 + }, + { + "epoch": 0.2997644765694794, + "grad_norm": 8.126911051862221, + "learning_rate": 4.105205156059307e-06, + "loss": 0.4964, + "step": 3691 + }, + { + "epoch": 0.299845691545521, + "grad_norm": 5.576298258657682, + "learning_rate": 4.104700943751999e-06, + "loss": 0.5032, + "step": 3692 + }, + { + "epoch": 0.29992690652156256, + "grad_norm": 5.428256954230044, + "learning_rate": 4.104196620407878e-06, + "loss": 0.5164, + "step": 3693 + }, + { + "epoch": 0.30000812149760414, + "grad_norm": 5.490233849085779, + "learning_rate": 4.1036921860618415e-06, + "loss": 0.4619, + "step": 3694 + }, + { + "epoch": 0.3000893364736457, + "grad_norm": 3.6578438271498683, + "learning_rate": 4.103187640748792e-06, + "loss": 0.4709, + "step": 3695 + }, + { + "epoch": 0.3001705514496873, + "grad_norm": 4.119105065976182, + "learning_rate": 4.102682984503644e-06, + "loss": 0.5943, + "step": 3696 + }, + { + "epoch": 0.3002517664257289, + "grad_norm": 4.816227234767905, + "learning_rate": 4.102178217361315e-06, + "loss": 0.5706, + "step": 3697 + }, + { + "epoch": 0.3003329814017705, + "grad_norm": 3.586911969028518, + "learning_rate": 4.101673339356733e-06, + "loss": 0.4774, + "step": 3698 + }, + { + "epoch": 0.3004141963778121, + "grad_norm": 9.66099144536593, + "learning_rate": 4.101168350524832e-06, + "loss": 0.4695, + "step": 3699 + }, + { + "epoch": 0.30049541135385366, + "grad_norm": 4.619743032013704, + "learning_rate": 4.100663250900556e-06, + "loss": 0.4937, + "step": 3700 + }, + { + "epoch": 0.30057662632989524, + "grad_norm": 5.0528198312806705, + "learning_rate": 4.100158040518854e-06, + "loss": 0.4896, + "step": 3701 + }, + { + "epoch": 0.3006578413059368, + "grad_norm": 6.192798320269057, + "learning_rate": 4.099652719414684e-06, + "loss": 0.4393, + "step": 3702 + }, + { + "epoch": 0.3007390562819784, + "grad_norm": 4.544818358602807, + "learning_rate": 4.099147287623012e-06, + "loss": 0.4435, + "step": 3703 + }, + { + "epoch": 0.30082027125801997, + "grad_norm": 3.843568388195506, + "learning_rate": 4.098641745178812e-06, + "loss": 0.5758, + "step": 3704 + }, + { + "epoch": 0.30090148623406154, + "grad_norm": 7.032873685349521, + "learning_rate": 4.098136092117063e-06, + "loss": 0.5837, + "step": 3705 + }, + { + "epoch": 0.3009827012101031, + "grad_norm": 4.039586791406998, + "learning_rate": 4.097630328472755e-06, + "loss": 0.4516, + "step": 3706 + }, + { + "epoch": 0.3010639161861447, + "grad_norm": 5.893186542446594, + "learning_rate": 4.097124454280883e-06, + "loss": 0.4981, + "step": 3707 + }, + { + "epoch": 0.30114513116218633, + "grad_norm": 4.234572464495813, + "learning_rate": 4.096618469576451e-06, + "loss": 0.721, + "step": 3708 + }, + { + "epoch": 0.3012263461382279, + "grad_norm": 4.138584167113376, + "learning_rate": 4.0961123743944715e-06, + "loss": 0.4969, + "step": 3709 + }, + { + "epoch": 0.3013075611142695, + "grad_norm": 4.733715325450099, + "learning_rate": 4.095606168769964e-06, + "loss": 0.65, + "step": 3710 + }, + { + "epoch": 0.30138877609031106, + "grad_norm": 5.217622232509772, + "learning_rate": 4.095099852737953e-06, + "loss": 0.6026, + "step": 3711 + }, + { + "epoch": 0.30146999106635264, + "grad_norm": 6.573735753701009, + "learning_rate": 4.094593426333474e-06, + "loss": 0.6201, + "step": 3712 + }, + { + "epoch": 0.3015512060423942, + "grad_norm": 7.983639715927653, + "learning_rate": 4.09408688959157e-06, + "loss": 0.6581, + "step": 3713 + }, + { + "epoch": 0.3016324210184358, + "grad_norm": 5.652732617432702, + "learning_rate": 4.093580242547289e-06, + "loss": 0.7463, + "step": 3714 + }, + { + "epoch": 0.30171363599447737, + "grad_norm": 5.775875513388475, + "learning_rate": 4.09307348523569e-06, + "loss": 0.4833, + "step": 3715 + }, + { + "epoch": 0.30179485097051895, + "grad_norm": 5.373772908608717, + "learning_rate": 4.092566617691837e-06, + "loss": 0.4648, + "step": 3716 + }, + { + "epoch": 0.3018760659465605, + "grad_norm": 4.681484734835625, + "learning_rate": 4.092059639950802e-06, + "loss": 0.543, + "step": 3717 + }, + { + "epoch": 0.3019572809226021, + "grad_norm": 3.293957563526287, + "learning_rate": 4.0915525520476665e-06, + "loss": 0.7871, + "step": 3718 + }, + { + "epoch": 0.30203849589864373, + "grad_norm": 9.736689094555402, + "learning_rate": 4.091045354017517e-06, + "loss": 0.54, + "step": 3719 + }, + { + "epoch": 0.3021197108746853, + "grad_norm": 7.004070985864832, + "learning_rate": 4.090538045895449e-06, + "loss": 0.4686, + "step": 3720 + }, + { + "epoch": 0.3022009258507269, + "grad_norm": 6.304287102750301, + "learning_rate": 4.090030627716567e-06, + "loss": 0.6621, + "step": 3721 + }, + { + "epoch": 0.30228214082676846, + "grad_norm": 5.678988794221106, + "learning_rate": 4.08952309951598e-06, + "loss": 0.6359, + "step": 3722 + }, + { + "epoch": 0.30236335580281004, + "grad_norm": 7.331654562862624, + "learning_rate": 4.0890154613288066e-06, + "loss": 0.6547, + "step": 3723 + }, + { + "epoch": 0.3024445707788516, + "grad_norm": 3.7966525875405277, + "learning_rate": 4.088507713190174e-06, + "loss": 0.5675, + "step": 3724 + }, + { + "epoch": 0.3025257857548932, + "grad_norm": 4.688025131548344, + "learning_rate": 4.087999855135215e-06, + "loss": 0.6325, + "step": 3725 + }, + { + "epoch": 0.30260700073093477, + "grad_norm": 4.823758976025031, + "learning_rate": 4.087491887199069e-06, + "loss": 0.5405, + "step": 3726 + }, + { + "epoch": 0.30268821570697635, + "grad_norm": 3.9620946246180884, + "learning_rate": 4.086983809416887e-06, + "loss": 0.4993, + "step": 3727 + }, + { + "epoch": 0.3027694306830179, + "grad_norm": 4.5032403641906305, + "learning_rate": 4.086475621823824e-06, + "loss": 0.4079, + "step": 3728 + }, + { + "epoch": 0.3028506456590595, + "grad_norm": 20.477342042531873, + "learning_rate": 4.085967324455045e-06, + "loss": 0.5201, + "step": 3729 + }, + { + "epoch": 0.30293186063510114, + "grad_norm": 6.231510482092364, + "learning_rate": 4.085458917345721e-06, + "loss": 0.7878, + "step": 3730 + }, + { + "epoch": 0.3030130756111427, + "grad_norm": 4.154775199567728, + "learning_rate": 4.084950400531029e-06, + "loss": 0.5936, + "step": 3731 + }, + { + "epoch": 0.3030942905871843, + "grad_norm": 4.694151372288833, + "learning_rate": 4.0844417740461586e-06, + "loss": 0.4481, + "step": 3732 + }, + { + "epoch": 0.30317550556322587, + "grad_norm": 4.709650246764357, + "learning_rate": 4.083933037926303e-06, + "loss": 0.6051, + "step": 3733 + }, + { + "epoch": 0.30325672053926744, + "grad_norm": 5.029331738383321, + "learning_rate": 4.0834241922066644e-06, + "loss": 0.5312, + "step": 3734 + }, + { + "epoch": 0.303337935515309, + "grad_norm": 4.293546250505703, + "learning_rate": 4.082915236922451e-06, + "loss": 0.5698, + "step": 3735 + }, + { + "epoch": 0.3034191504913506, + "grad_norm": 3.9815078810069218, + "learning_rate": 4.082406172108882e-06, + "loss": 0.6138, + "step": 3736 + }, + { + "epoch": 0.3035003654673922, + "grad_norm": 8.25343748834109, + "learning_rate": 4.0818969978011795e-06, + "loss": 0.5962, + "step": 3737 + }, + { + "epoch": 0.30358158044343375, + "grad_norm": 8.293232016506174, + "learning_rate": 4.081387714034577e-06, + "loss": 0.5964, + "step": 3738 + }, + { + "epoch": 0.30366279541947533, + "grad_norm": 9.876486919775553, + "learning_rate": 4.080878320844315e-06, + "loss": 0.5288, + "step": 3739 + }, + { + "epoch": 0.3037440103955169, + "grad_norm": 4.90237518330183, + "learning_rate": 4.080368818265639e-06, + "loss": 0.542, + "step": 3740 + }, + { + "epoch": 0.30382522537155854, + "grad_norm": 5.433619699450851, + "learning_rate": 4.079859206333805e-06, + "loss": 0.5346, + "step": 3741 + }, + { + "epoch": 0.3039064403476001, + "grad_norm": 6.417127497404236, + "learning_rate": 4.079349485084074e-06, + "loss": 0.5068, + "step": 3742 + }, + { + "epoch": 0.3039876553236417, + "grad_norm": 9.395212073819222, + "learning_rate": 4.078839654551718e-06, + "loss": 0.5981, + "step": 3743 + }, + { + "epoch": 0.30406887029968327, + "grad_norm": 6.080896733160445, + "learning_rate": 4.078329714772015e-06, + "loss": 0.5716, + "step": 3744 + }, + { + "epoch": 0.30415008527572485, + "grad_norm": 3.9795202208766463, + "learning_rate": 4.0778196657802484e-06, + "loss": 0.589, + "step": 3745 + }, + { + "epoch": 0.3042313002517664, + "grad_norm": 5.978058004337624, + "learning_rate": 4.077309507611711e-06, + "loss": 0.559, + "step": 3746 + }, + { + "epoch": 0.304312515227808, + "grad_norm": 10.73728162343595, + "learning_rate": 4.076799240301703e-06, + "loss": 0.5419, + "step": 3747 + }, + { + "epoch": 0.3043937302038496, + "grad_norm": 5.705981837585083, + "learning_rate": 4.076288863885533e-06, + "loss": 0.5277, + "step": 3748 + }, + { + "epoch": 0.30447494517989115, + "grad_norm": 4.148490626037343, + "learning_rate": 4.0757783783985164e-06, + "loss": 0.6104, + "step": 3749 + }, + { + "epoch": 0.30455616015593273, + "grad_norm": 7.3071974055301006, + "learning_rate": 4.0752677838759755e-06, + "loss": 0.6195, + "step": 3750 + }, + { + "epoch": 0.3046373751319743, + "grad_norm": 3.536531811654719, + "learning_rate": 4.074757080353241e-06, + "loss": 0.6436, + "step": 3751 + }, + { + "epoch": 0.30471859010801594, + "grad_norm": 4.25870092662668, + "learning_rate": 4.074246267865652e-06, + "loss": 0.6274, + "step": 3752 + }, + { + "epoch": 0.3047998050840575, + "grad_norm": 3.6079741565893895, + "learning_rate": 4.073735346448551e-06, + "loss": 0.6034, + "step": 3753 + }, + { + "epoch": 0.3048810200600991, + "grad_norm": 7.935117748318247, + "learning_rate": 4.073224316137293e-06, + "loss": 0.5586, + "step": 3754 + }, + { + "epoch": 0.30496223503614067, + "grad_norm": 6.813447251757785, + "learning_rate": 4.072713176967239e-06, + "loss": 0.5641, + "step": 3755 + }, + { + "epoch": 0.30504345001218225, + "grad_norm": 6.715619033709069, + "learning_rate": 4.072201928973757e-06, + "loss": 0.5836, + "step": 3756 + }, + { + "epoch": 0.3051246649882238, + "grad_norm": 3.5300282565737944, + "learning_rate": 4.071690572192222e-06, + "loss": 0.6304, + "step": 3757 + }, + { + "epoch": 0.3052058799642654, + "grad_norm": 9.284044398768938, + "learning_rate": 4.071179106658017e-06, + "loss": 0.5776, + "step": 3758 + }, + { + "epoch": 0.305287094940307, + "grad_norm": 6.469344755338421, + "learning_rate": 4.070667532406534e-06, + "loss": 0.518, + "step": 3759 + }, + { + "epoch": 0.30536830991634856, + "grad_norm": 9.204050268328828, + "learning_rate": 4.070155849473169e-06, + "loss": 0.4228, + "step": 3760 + }, + { + "epoch": 0.30544952489239013, + "grad_norm": 7.33932480094826, + "learning_rate": 4.06964405789333e-06, + "loss": 0.5276, + "step": 3761 + }, + { + "epoch": 0.3055307398684317, + "grad_norm": 4.5551115854633695, + "learning_rate": 4.06913215770243e-06, + "loss": 0.5275, + "step": 3762 + }, + { + "epoch": 0.30561195484447334, + "grad_norm": 7.628505640253093, + "learning_rate": 4.068620148935889e-06, + "loss": 0.5706, + "step": 3763 + }, + { + "epoch": 0.3056931698205149, + "grad_norm": 7.296776756806712, + "learning_rate": 4.0681080316291355e-06, + "loss": 0.4332, + "step": 3764 + }, + { + "epoch": 0.3057743847965565, + "grad_norm": 6.211933336592377, + "learning_rate": 4.067595805817604e-06, + "loss": 0.5385, + "step": 3765 + }, + { + "epoch": 0.3058555997725981, + "grad_norm": 5.2682390368881125, + "learning_rate": 4.0670834715367405e-06, + "loss": 0.6172, + "step": 3766 + }, + { + "epoch": 0.30593681474863965, + "grad_norm": 5.7266015459873465, + "learning_rate": 4.066571028821994e-06, + "loss": 0.663, + "step": 3767 + }, + { + "epoch": 0.30601802972468123, + "grad_norm": 4.566744030336992, + "learning_rate": 4.066058477708824e-06, + "loss": 0.6204, + "step": 3768 + }, + { + "epoch": 0.3060992447007228, + "grad_norm": 5.241528003933075, + "learning_rate": 4.065545818232695e-06, + "loss": 0.5374, + "step": 3769 + }, + { + "epoch": 0.3061804596767644, + "grad_norm": 4.210043089768735, + "learning_rate": 4.06503305042908e-06, + "loss": 0.5627, + "step": 3770 + }, + { + "epoch": 0.30626167465280596, + "grad_norm": 24.358962625710497, + "learning_rate": 4.064520174333462e-06, + "loss": 0.4803, + "step": 3771 + }, + { + "epoch": 0.30634288962884754, + "grad_norm": 5.001904028877196, + "learning_rate": 4.0640071899813284e-06, + "loss": 0.5884, + "step": 3772 + }, + { + "epoch": 0.30642410460488917, + "grad_norm": 3.685895651696086, + "learning_rate": 4.0634940974081735e-06, + "loss": 0.5976, + "step": 3773 + }, + { + "epoch": 0.30650531958093075, + "grad_norm": 4.690707174624639, + "learning_rate": 4.062980896649502e-06, + "loss": 0.4432, + "step": 3774 + }, + { + "epoch": 0.3065865345569723, + "grad_norm": 7.66057017392417, + "learning_rate": 4.062467587740825e-06, + "loss": 0.5297, + "step": 3775 + }, + { + "epoch": 0.3066677495330139, + "grad_norm": 4.80151641086556, + "learning_rate": 4.0619541707176595e-06, + "loss": 0.5717, + "step": 3776 + }, + { + "epoch": 0.3067489645090555, + "grad_norm": 5.073279099762993, + "learning_rate": 4.061440645615532e-06, + "loss": 0.4853, + "step": 3777 + }, + { + "epoch": 0.30683017948509705, + "grad_norm": 6.406852301630108, + "learning_rate": 4.060927012469976e-06, + "loss": 0.5093, + "step": 3778 + }, + { + "epoch": 0.30691139446113863, + "grad_norm": 3.919683009512748, + "learning_rate": 4.060413271316531e-06, + "loss": 0.4647, + "step": 3779 + }, + { + "epoch": 0.3069926094371802, + "grad_norm": 4.810874854512679, + "learning_rate": 4.059899422190747e-06, + "loss": 0.3816, + "step": 3780 + }, + { + "epoch": 0.3070738244132218, + "grad_norm": 7.470318516960077, + "learning_rate": 4.059385465128179e-06, + "loss": 0.4818, + "step": 3781 + }, + { + "epoch": 0.30715503938926336, + "grad_norm": 4.142497626949441, + "learning_rate": 4.058871400164388e-06, + "loss": 0.6359, + "step": 3782 + }, + { + "epoch": 0.30723625436530494, + "grad_norm": 4.107423918701493, + "learning_rate": 4.058357227334947e-06, + "loss": 0.5755, + "step": 3783 + }, + { + "epoch": 0.30731746934134657, + "grad_norm": 7.190290906025693, + "learning_rate": 4.057842946675434e-06, + "loss": 0.4618, + "step": 3784 + }, + { + "epoch": 0.30739868431738815, + "grad_norm": 6.779887515845196, + "learning_rate": 4.057328558221434e-06, + "loss": 0.4605, + "step": 3785 + }, + { + "epoch": 0.3074798992934297, + "grad_norm": 6.8626494300279495, + "learning_rate": 4.056814062008539e-06, + "loss": 0.4735, + "step": 3786 + }, + { + "epoch": 0.3075611142694713, + "grad_norm": 4.43663040407582, + "learning_rate": 4.056299458072351e-06, + "loss": 0.5549, + "step": 3787 + }, + { + "epoch": 0.3076423292455129, + "grad_norm": 9.927091358251289, + "learning_rate": 4.0557847464484766e-06, + "loss": 0.6012, + "step": 3788 + }, + { + "epoch": 0.30772354422155446, + "grad_norm": 3.3047332458402874, + "learning_rate": 4.055269927172532e-06, + "loss": 0.5754, + "step": 3789 + }, + { + "epoch": 0.30780475919759603, + "grad_norm": 4.086984576275844, + "learning_rate": 4.054755000280139e-06, + "loss": 0.6482, + "step": 3790 + }, + { + "epoch": 0.3078859741736376, + "grad_norm": 10.124343594059361, + "learning_rate": 4.054239965806929e-06, + "loss": 0.6541, + "step": 3791 + }, + { + "epoch": 0.3079671891496792, + "grad_norm": 9.088285597560201, + "learning_rate": 4.053724823788538e-06, + "loss": 0.5313, + "step": 3792 + }, + { + "epoch": 0.30804840412572077, + "grad_norm": 10.64301926225871, + "learning_rate": 4.053209574260614e-06, + "loss": 0.5672, + "step": 3793 + }, + { + "epoch": 0.30812961910176234, + "grad_norm": 4.299438598485095, + "learning_rate": 4.052694217258806e-06, + "loss": 0.4908, + "step": 3794 + }, + { + "epoch": 0.308210834077804, + "grad_norm": 50.64834820262253, + "learning_rate": 4.052178752818776e-06, + "loss": 0.4808, + "step": 3795 + }, + { + "epoch": 0.30829204905384555, + "grad_norm": 3.318247558859262, + "learning_rate": 4.051663180976192e-06, + "loss": 0.6804, + "step": 3796 + }, + { + "epoch": 0.30837326402988713, + "grad_norm": 15.92488834517386, + "learning_rate": 4.051147501766727e-06, + "loss": 0.5227, + "step": 3797 + }, + { + "epoch": 0.3084544790059287, + "grad_norm": 3.980666622212895, + "learning_rate": 4.050631715226064e-06, + "loss": 0.5733, + "step": 3798 + }, + { + "epoch": 0.3085356939819703, + "grad_norm": 5.924342358250464, + "learning_rate": 4.050115821389894e-06, + "loss": 0.4722, + "step": 3799 + }, + { + "epoch": 0.30861690895801186, + "grad_norm": 6.826343924107241, + "learning_rate": 4.049599820293913e-06, + "loss": 0.5608, + "step": 3800 + }, + { + "epoch": 0.30869812393405344, + "grad_norm": 5.798254498370531, + "learning_rate": 4.049083711973824e-06, + "loss": 0.482, + "step": 3801 + }, + { + "epoch": 0.308779338910095, + "grad_norm": 4.3433768010776905, + "learning_rate": 4.0485674964653424e-06, + "loss": 0.4834, + "step": 3802 + }, + { + "epoch": 0.3088605538861366, + "grad_norm": 4.326553027616668, + "learning_rate": 4.048051173804185e-06, + "loss": 0.5556, + "step": 3803 + }, + { + "epoch": 0.30894176886217817, + "grad_norm": 6.800606044283711, + "learning_rate": 4.047534744026079e-06, + "loss": 0.6, + "step": 3804 + }, + { + "epoch": 0.30902298383821974, + "grad_norm": 10.590880966190726, + "learning_rate": 4.04701820716676e-06, + "loss": 0.4796, + "step": 3805 + }, + { + "epoch": 0.3091041988142614, + "grad_norm": 5.6795091144428635, + "learning_rate": 4.046501563261968e-06, + "loss": 0.5129, + "step": 3806 + }, + { + "epoch": 0.30918541379030295, + "grad_norm": 3.210645271484872, + "learning_rate": 4.045984812347452e-06, + "loss": 0.5332, + "step": 3807 + }, + { + "epoch": 0.30926662876634453, + "grad_norm": 6.294410231720947, + "learning_rate": 4.045467954458969e-06, + "loss": 0.6181, + "step": 3808 + }, + { + "epoch": 0.3093478437423861, + "grad_norm": 4.474575703083307, + "learning_rate": 4.044950989632283e-06, + "loss": 0.4792, + "step": 3809 + }, + { + "epoch": 0.3094290587184277, + "grad_norm": 4.779056570525405, + "learning_rate": 4.044433917903166e-06, + "loss": 0.5667, + "step": 3810 + }, + { + "epoch": 0.30951027369446926, + "grad_norm": 4.782139569666095, + "learning_rate": 4.043916739307394e-06, + "loss": 0.7153, + "step": 3811 + }, + { + "epoch": 0.30959148867051084, + "grad_norm": 6.550010939029318, + "learning_rate": 4.0433994538807564e-06, + "loss": 0.5078, + "step": 3812 + }, + { + "epoch": 0.3096727036465524, + "grad_norm": 7.70154315773568, + "learning_rate": 4.042882061659043e-06, + "loss": 0.5411, + "step": 3813 + }, + { + "epoch": 0.309753918622594, + "grad_norm": 5.468435592136346, + "learning_rate": 4.042364562678059e-06, + "loss": 0.6321, + "step": 3814 + }, + { + "epoch": 0.30983513359863557, + "grad_norm": 4.143944075441312, + "learning_rate": 4.041846956973608e-06, + "loss": 0.46, + "step": 3815 + }, + { + "epoch": 0.30991634857467715, + "grad_norm": 4.798354632196031, + "learning_rate": 4.041329244581509e-06, + "loss": 0.4921, + "step": 3816 + }, + { + "epoch": 0.3099975635507188, + "grad_norm": 5.155312927681158, + "learning_rate": 4.040811425537583e-06, + "loss": 0.6239, + "step": 3817 + }, + { + "epoch": 0.31007877852676036, + "grad_norm": 3.701165442814568, + "learning_rate": 4.040293499877661e-06, + "loss": 0.6161, + "step": 3818 + }, + { + "epoch": 0.31015999350280193, + "grad_norm": 5.396166406448343, + "learning_rate": 4.039775467637581e-06, + "loss": 0.4338, + "step": 3819 + }, + { + "epoch": 0.3102412084788435, + "grad_norm": 4.72888620508123, + "learning_rate": 4.039257328853188e-06, + "loss": 0.7018, + "step": 3820 + }, + { + "epoch": 0.3103224234548851, + "grad_norm": 4.775493593228451, + "learning_rate": 4.038739083560334e-06, + "loss": 0.526, + "step": 3821 + }, + { + "epoch": 0.31040363843092666, + "grad_norm": 4.103968235635027, + "learning_rate": 4.038220731794878e-06, + "loss": 0.6318, + "step": 3822 + }, + { + "epoch": 0.31048485340696824, + "grad_norm": 8.31477225841899, + "learning_rate": 4.03770227359269e-06, + "loss": 0.5303, + "step": 3823 + }, + { + "epoch": 0.3105660683830098, + "grad_norm": 3.8725355108338766, + "learning_rate": 4.037183708989642e-06, + "loss": 0.4775, + "step": 3824 + }, + { + "epoch": 0.3106472833590514, + "grad_norm": 7.614229033887325, + "learning_rate": 4.0366650380216165e-06, + "loss": 0.4905, + "step": 3825 + }, + { + "epoch": 0.310728498335093, + "grad_norm": 5.153358446059254, + "learning_rate": 4.036146260724503e-06, + "loss": 0.5249, + "step": 3826 + }, + { + "epoch": 0.31080971331113455, + "grad_norm": 8.578326851480215, + "learning_rate": 4.0356273771341984e-06, + "loss": 0.6979, + "step": 3827 + }, + { + "epoch": 0.3108909282871762, + "grad_norm": 5.785574362196001, + "learning_rate": 4.035108387286607e-06, + "loss": 0.4557, + "step": 3828 + }, + { + "epoch": 0.31097214326321776, + "grad_norm": 3.55379778886365, + "learning_rate": 4.03458929121764e-06, + "loss": 0.6176, + "step": 3829 + }, + { + "epoch": 0.31105335823925934, + "grad_norm": 7.612980594268243, + "learning_rate": 4.0340700889632145e-06, + "loss": 0.7673, + "step": 3830 + }, + { + "epoch": 0.3111345732153009, + "grad_norm": 9.252507294094109, + "learning_rate": 4.033550780559258e-06, + "loss": 0.5036, + "step": 3831 + }, + { + "epoch": 0.3112157881913425, + "grad_norm": 3.594686390622405, + "learning_rate": 4.033031366041704e-06, + "loss": 0.4821, + "step": 3832 + }, + { + "epoch": 0.31129700316738407, + "grad_norm": 6.285694736961024, + "learning_rate": 4.0325118454464935e-06, + "loss": 0.5007, + "step": 3833 + }, + { + "epoch": 0.31137821814342564, + "grad_norm": 3.160762643660874, + "learning_rate": 4.031992218809573e-06, + "loss": 0.5703, + "step": 3834 + }, + { + "epoch": 0.3114594331194672, + "grad_norm": 8.890289793696613, + "learning_rate": 4.0314724861669e-06, + "loss": 0.5602, + "step": 3835 + }, + { + "epoch": 0.3115406480955088, + "grad_norm": 4.321571181434268, + "learning_rate": 4.0309526475544355e-06, + "loss": 0.6061, + "step": 3836 + }, + { + "epoch": 0.3116218630715504, + "grad_norm": 7.365273193938057, + "learning_rate": 4.03043270300815e-06, + "loss": 0.5242, + "step": 3837 + }, + { + "epoch": 0.31170307804759195, + "grad_norm": 6.152202896973093, + "learning_rate": 4.029912652564022e-06, + "loss": 0.527, + "step": 3838 + }, + { + "epoch": 0.3117842930236336, + "grad_norm": 9.939281394272458, + "learning_rate": 4.029392496258035e-06, + "loss": 0.6952, + "step": 3839 + }, + { + "epoch": 0.31186550799967516, + "grad_norm": 7.9876078180108685, + "learning_rate": 4.028872234126181e-06, + "loss": 0.4901, + "step": 3840 + }, + { + "epoch": 0.31194672297571674, + "grad_norm": 4.522557141556952, + "learning_rate": 4.02835186620446e-06, + "loss": 0.4127, + "step": 3841 + }, + { + "epoch": 0.3120279379517583, + "grad_norm": 5.5314558917789505, + "learning_rate": 4.027831392528879e-06, + "loss": 0.5045, + "step": 3842 + }, + { + "epoch": 0.3121091529277999, + "grad_norm": 6.372253503387104, + "learning_rate": 4.027310813135451e-06, + "loss": 0.5654, + "step": 3843 + }, + { + "epoch": 0.31219036790384147, + "grad_norm": 3.5189797244740446, + "learning_rate": 4.0267901280601985e-06, + "loss": 0.4954, + "step": 3844 + }, + { + "epoch": 0.31227158287988305, + "grad_norm": 6.001913797204444, + "learning_rate": 4.026269337339149e-06, + "loss": 0.5535, + "step": 3845 + }, + { + "epoch": 0.3123527978559246, + "grad_norm": 6.805227975140487, + "learning_rate": 4.025748441008339e-06, + "loss": 0.5175, + "step": 3846 + }, + { + "epoch": 0.3124340128319662, + "grad_norm": 3.6933697953285423, + "learning_rate": 4.0252274391038125e-06, + "loss": 0.6538, + "step": 3847 + }, + { + "epoch": 0.3125152278080078, + "grad_norm": 4.975606249432986, + "learning_rate": 4.024706331661618e-06, + "loss": 0.6194, + "step": 3848 + }, + { + "epoch": 0.31259644278404936, + "grad_norm": 21.413001819693548, + "learning_rate": 4.024185118717816e-06, + "loss": 0.5108, + "step": 3849 + }, + { + "epoch": 0.312677657760091, + "grad_norm": 4.688146949121658, + "learning_rate": 4.023663800308471e-06, + "loss": 0.5959, + "step": 3850 + }, + { + "epoch": 0.31275887273613256, + "grad_norm": 4.929413723785714, + "learning_rate": 4.023142376469653e-06, + "loss": 0.4974, + "step": 3851 + }, + { + "epoch": 0.31284008771217414, + "grad_norm": 7.165568666050521, + "learning_rate": 4.022620847237445e-06, + "loss": 0.5106, + "step": 3852 + }, + { + "epoch": 0.3129213026882157, + "grad_norm": 5.314358756089364, + "learning_rate": 4.022099212647933e-06, + "loss": 0.6104, + "step": 3853 + }, + { + "epoch": 0.3130025176642573, + "grad_norm": 4.330125714880257, + "learning_rate": 4.021577472737209e-06, + "loss": 0.587, + "step": 3854 + }, + { + "epoch": 0.3130837326402989, + "grad_norm": 4.654860553507325, + "learning_rate": 4.021055627541379e-06, + "loss": 0.4938, + "step": 3855 + }, + { + "epoch": 0.31316494761634045, + "grad_norm": 5.006258634790825, + "learning_rate": 4.020533677096549e-06, + "loss": 0.5747, + "step": 3856 + }, + { + "epoch": 0.313246162592382, + "grad_norm": 6.0280062807614545, + "learning_rate": 4.020011621438836e-06, + "loss": 0.5764, + "step": 3857 + }, + { + "epoch": 0.3133273775684236, + "grad_norm": 6.574325175703227, + "learning_rate": 4.019489460604364e-06, + "loss": 0.5299, + "step": 3858 + }, + { + "epoch": 0.3134085925444652, + "grad_norm": 4.080793981103117, + "learning_rate": 4.018967194629261e-06, + "loss": 0.7868, + "step": 3859 + }, + { + "epoch": 0.31348980752050676, + "grad_norm": 3.847772634998914, + "learning_rate": 4.0184448235496685e-06, + "loss": 0.6058, + "step": 3860 + }, + { + "epoch": 0.3135710224965484, + "grad_norm": 4.404444873697644, + "learning_rate": 4.017922347401731e-06, + "loss": 0.523, + "step": 3861 + }, + { + "epoch": 0.31365223747258997, + "grad_norm": 9.45105154996585, + "learning_rate": 4.017399766221599e-06, + "loss": 0.4319, + "step": 3862 + }, + { + "epoch": 0.31373345244863154, + "grad_norm": 4.265430944389896, + "learning_rate": 4.016877080045435e-06, + "loss": 0.4388, + "step": 3863 + }, + { + "epoch": 0.3138146674246731, + "grad_norm": 4.626292776222082, + "learning_rate": 4.016354288909405e-06, + "loss": 0.4834, + "step": 3864 + }, + { + "epoch": 0.3138958824007147, + "grad_norm": 3.893386508353787, + "learning_rate": 4.0158313928496826e-06, + "loss": 0.5888, + "step": 3865 + }, + { + "epoch": 0.3139770973767563, + "grad_norm": 5.310652622885137, + "learning_rate": 4.015308391902452e-06, + "loss": 0.5323, + "step": 3866 + }, + { + "epoch": 0.31405831235279785, + "grad_norm": 6.885983808946358, + "learning_rate": 4.014785286103898e-06, + "loss": 0.5397, + "step": 3867 + }, + { + "epoch": 0.31413952732883943, + "grad_norm": 7.793035367661995, + "learning_rate": 4.014262075490221e-06, + "loss": 0.4684, + "step": 3868 + }, + { + "epoch": 0.314220742304881, + "grad_norm": 14.532888308919345, + "learning_rate": 4.013738760097622e-06, + "loss": 0.5751, + "step": 3869 + }, + { + "epoch": 0.3143019572809226, + "grad_norm": 6.316239281714061, + "learning_rate": 4.0132153399623106e-06, + "loss": 0.5754, + "step": 3870 + }, + { + "epoch": 0.31438317225696416, + "grad_norm": 5.218130390396817, + "learning_rate": 4.012691815120508e-06, + "loss": 0.5718, + "step": 3871 + }, + { + "epoch": 0.3144643872330058, + "grad_norm": 10.486211867182627, + "learning_rate": 4.012168185608437e-06, + "loss": 0.4341, + "step": 3872 + }, + { + "epoch": 0.31454560220904737, + "grad_norm": 13.04476716032136, + "learning_rate": 4.011644451462331e-06, + "loss": 0.6967, + "step": 3873 + }, + { + "epoch": 0.31462681718508895, + "grad_norm": 4.374789125818755, + "learning_rate": 4.011120612718429e-06, + "loss": 0.5778, + "step": 3874 + }, + { + "epoch": 0.3147080321611305, + "grad_norm": 4.427126765436375, + "learning_rate": 4.010596669412978e-06, + "loss": 0.5147, + "step": 3875 + }, + { + "epoch": 0.3147892471371721, + "grad_norm": 6.121898739230059, + "learning_rate": 4.010072621582233e-06, + "loss": 0.4541, + "step": 3876 + }, + { + "epoch": 0.3148704621132137, + "grad_norm": 8.191495131008242, + "learning_rate": 4.009548469262453e-06, + "loss": 0.7081, + "step": 3877 + }, + { + "epoch": 0.31495167708925526, + "grad_norm": 5.939103893735796, + "learning_rate": 4.009024212489909e-06, + "loss": 0.588, + "step": 3878 + }, + { + "epoch": 0.31503289206529683, + "grad_norm": 5.468809556850152, + "learning_rate": 4.0084998513008765e-06, + "loss": 0.5442, + "step": 3879 + }, + { + "epoch": 0.3151141070413384, + "grad_norm": 3.760788030695487, + "learning_rate": 4.007975385731637e-06, + "loss": 0.771, + "step": 3880 + }, + { + "epoch": 0.31519532201738, + "grad_norm": 4.150500328342237, + "learning_rate": 4.007450815818481e-06, + "loss": 0.5091, + "step": 3881 + }, + { + "epoch": 0.31527653699342156, + "grad_norm": 4.7154167246286285, + "learning_rate": 4.0069261415977075e-06, + "loss": 0.5819, + "step": 3882 + }, + { + "epoch": 0.3153577519694632, + "grad_norm": 4.032822924231925, + "learning_rate": 4.006401363105621e-06, + "loss": 0.5721, + "step": 3883 + }, + { + "epoch": 0.3154389669455048, + "grad_norm": 4.862866759718993, + "learning_rate": 4.0058764803785325e-06, + "loss": 0.6929, + "step": 3884 + }, + { + "epoch": 0.31552018192154635, + "grad_norm": 3.918915992085951, + "learning_rate": 4.00535149345276e-06, + "loss": 0.5521, + "step": 3885 + }, + { + "epoch": 0.3156013968975879, + "grad_norm": 5.105255048171445, + "learning_rate": 4.0048264023646325e-06, + "loss": 0.5472, + "step": 3886 + }, + { + "epoch": 0.3156826118736295, + "grad_norm": 8.077151245590299, + "learning_rate": 4.004301207150482e-06, + "loss": 0.4525, + "step": 3887 + }, + { + "epoch": 0.3157638268496711, + "grad_norm": 8.364488519427972, + "learning_rate": 4.003775907846648e-06, + "loss": 0.566, + "step": 3888 + }, + { + "epoch": 0.31584504182571266, + "grad_norm": 4.65775942487372, + "learning_rate": 4.003250504489481e-06, + "loss": 0.4639, + "step": 3889 + }, + { + "epoch": 0.31592625680175423, + "grad_norm": 10.681842547013567, + "learning_rate": 4.002724997115335e-06, + "loss": 0.3836, + "step": 3890 + }, + { + "epoch": 0.3160074717777958, + "grad_norm": 4.910721299022231, + "learning_rate": 4.002199385760571e-06, + "loss": 0.5612, + "step": 3891 + }, + { + "epoch": 0.3160886867538374, + "grad_norm": 5.963214780659344, + "learning_rate": 4.001673670461561e-06, + "loss": 0.6232, + "step": 3892 + }, + { + "epoch": 0.31616990172987897, + "grad_norm": 6.652157701535889, + "learning_rate": 4.0011478512546805e-06, + "loss": 0.5664, + "step": 3893 + }, + { + "epoch": 0.3162511167059206, + "grad_norm": 3.8712273283479233, + "learning_rate": 4.000621928176313e-06, + "loss": 0.5419, + "step": 3894 + }, + { + "epoch": 0.3163323316819622, + "grad_norm": 5.721147211228416, + "learning_rate": 4.000095901262851e-06, + "loss": 0.5002, + "step": 3895 + }, + { + "epoch": 0.31641354665800375, + "grad_norm": 5.522015478399547, + "learning_rate": 3.99956977055069e-06, + "loss": 0.5785, + "step": 3896 + }, + { + "epoch": 0.31649476163404533, + "grad_norm": 12.710524189514066, + "learning_rate": 3.999043536076238e-06, + "loss": 0.5987, + "step": 3897 + }, + { + "epoch": 0.3165759766100869, + "grad_norm": 15.432871133292979, + "learning_rate": 3.998517197875908e-06, + "loss": 0.5758, + "step": 3898 + }, + { + "epoch": 0.3166571915861285, + "grad_norm": 3.2538000933185622, + "learning_rate": 3.997990755986117e-06, + "loss": 0.3758, + "step": 3899 + }, + { + "epoch": 0.31673840656217006, + "grad_norm": 6.504572502780165, + "learning_rate": 3.9974642104432945e-06, + "loss": 0.6192, + "step": 3900 + }, + { + "epoch": 0.31681962153821164, + "grad_norm": 5.380343912168226, + "learning_rate": 3.996937561283874e-06, + "loss": 0.5603, + "step": 3901 + }, + { + "epoch": 0.3169008365142532, + "grad_norm": 8.424493108304915, + "learning_rate": 3.996410808544296e-06, + "loss": 0.5061, + "step": 3902 + }, + { + "epoch": 0.3169820514902948, + "grad_norm": 5.3477146060500225, + "learning_rate": 3.99588395226101e-06, + "loss": 0.5234, + "step": 3903 + }, + { + "epoch": 0.31706326646633637, + "grad_norm": 12.609875677912806, + "learning_rate": 3.9953569924704715e-06, + "loss": 0.5047, + "step": 3904 + }, + { + "epoch": 0.317144481442378, + "grad_norm": 6.884048976659566, + "learning_rate": 3.994829929209143e-06, + "loss": 0.5393, + "step": 3905 + }, + { + "epoch": 0.3172256964184196, + "grad_norm": 5.4396250935337465, + "learning_rate": 3.994302762513496e-06, + "loss": 0.5701, + "step": 3906 + }, + { + "epoch": 0.31730691139446116, + "grad_norm": 3.9789051349804905, + "learning_rate": 3.993775492420005e-06, + "loss": 0.8038, + "step": 3907 + }, + { + "epoch": 0.31738812637050273, + "grad_norm": 6.0730946208938335, + "learning_rate": 3.993248118965155e-06, + "loss": 0.5355, + "step": 3908 + }, + { + "epoch": 0.3174693413465443, + "grad_norm": 7.073767654590764, + "learning_rate": 3.992720642185439e-06, + "loss": 0.4949, + "step": 3909 + }, + { + "epoch": 0.3175505563225859, + "grad_norm": 6.028367836015118, + "learning_rate": 3.992193062117354e-06, + "loss": 0.5065, + "step": 3910 + }, + { + "epoch": 0.31763177129862746, + "grad_norm": 5.478458346134022, + "learning_rate": 3.991665378797408e-06, + "loss": 0.6328, + "step": 3911 + }, + { + "epoch": 0.31771298627466904, + "grad_norm": 5.80032080288049, + "learning_rate": 3.991137592262111e-06, + "loss": 0.3775, + "step": 3912 + }, + { + "epoch": 0.3177942012507106, + "grad_norm": 5.364477054479598, + "learning_rate": 3.990609702547985e-06, + "loss": 0.6169, + "step": 3913 + }, + { + "epoch": 0.3178754162267522, + "grad_norm": 6.357814340494215, + "learning_rate": 3.990081709691556e-06, + "loss": 0.5666, + "step": 3914 + }, + { + "epoch": 0.31795663120279377, + "grad_norm": 3.0964122690765055, + "learning_rate": 3.989553613729359e-06, + "loss": 0.5291, + "step": 3915 + }, + { + "epoch": 0.3180378461788354, + "grad_norm": 6.1633146191090615, + "learning_rate": 3.989025414697935e-06, + "loss": 0.4764, + "step": 3916 + }, + { + "epoch": 0.318119061154877, + "grad_norm": 5.380367903861603, + "learning_rate": 3.988497112633834e-06, + "loss": 0.4923, + "step": 3917 + }, + { + "epoch": 0.31820027613091856, + "grad_norm": 4.635335577517618, + "learning_rate": 3.98796870757361e-06, + "loss": 0.5211, + "step": 3918 + }, + { + "epoch": 0.31828149110696013, + "grad_norm": 4.837892560831567, + "learning_rate": 3.987440199553826e-06, + "loss": 0.5608, + "step": 3919 + }, + { + "epoch": 0.3183627060830017, + "grad_norm": 5.412511747936819, + "learning_rate": 3.986911588611052e-06, + "loss": 0.5652, + "step": 3920 + }, + { + "epoch": 0.3184439210590433, + "grad_norm": 6.754512301450377, + "learning_rate": 3.986382874781866e-06, + "loss": 0.6123, + "step": 3921 + }, + { + "epoch": 0.31852513603508487, + "grad_norm": 3.851560698751757, + "learning_rate": 3.985854058102851e-06, + "loss": 0.4627, + "step": 3922 + }, + { + "epoch": 0.31860635101112644, + "grad_norm": 9.180563910365933, + "learning_rate": 3.9853251386106e-06, + "loss": 0.5239, + "step": 3923 + }, + { + "epoch": 0.318687565987168, + "grad_norm": 6.398627354244881, + "learning_rate": 3.9847961163417094e-06, + "loss": 0.4989, + "step": 3924 + }, + { + "epoch": 0.3187687809632096, + "grad_norm": 4.516597706332039, + "learning_rate": 3.984266991332787e-06, + "loss": 0.5573, + "step": 3925 + }, + { + "epoch": 0.3188499959392512, + "grad_norm": 6.97958059945626, + "learning_rate": 3.9837377636204435e-06, + "loss": 0.3659, + "step": 3926 + }, + { + "epoch": 0.3189312109152928, + "grad_norm": 9.380173505814613, + "learning_rate": 3.983208433241298e-06, + "loss": 0.5665, + "step": 3927 + }, + { + "epoch": 0.3190124258913344, + "grad_norm": 5.179839950219316, + "learning_rate": 3.98267900023198e-06, + "loss": 0.6098, + "step": 3928 + }, + { + "epoch": 0.31909364086737596, + "grad_norm": 11.243873982204496, + "learning_rate": 3.982149464629123e-06, + "loss": 0.5793, + "step": 3929 + }, + { + "epoch": 0.31917485584341754, + "grad_norm": 4.1291560709585164, + "learning_rate": 3.981619826469366e-06, + "loss": 0.5195, + "step": 3930 + }, + { + "epoch": 0.3192560708194591, + "grad_norm": 4.373741253760804, + "learning_rate": 3.981090085789359e-06, + "loss": 0.5506, + "step": 3931 + }, + { + "epoch": 0.3193372857955007, + "grad_norm": 6.318431118742852, + "learning_rate": 3.980560242625756e-06, + "loss": 0.6606, + "step": 3932 + }, + { + "epoch": 0.31941850077154227, + "grad_norm": 4.240677812054353, + "learning_rate": 3.9800302970152205e-06, + "loss": 0.6272, + "step": 3933 + }, + { + "epoch": 0.31949971574758385, + "grad_norm": 5.672962140852729, + "learning_rate": 3.9795002489944216e-06, + "loss": 0.6, + "step": 3934 + }, + { + "epoch": 0.3195809307236254, + "grad_norm": 6.714289970142845, + "learning_rate": 3.978970098600035e-06, + "loss": 0.5626, + "step": 3935 + }, + { + "epoch": 0.319662145699667, + "grad_norm": 4.761339278563214, + "learning_rate": 3.978439845868745e-06, + "loss": 0.4723, + "step": 3936 + }, + { + "epoch": 0.3197433606757086, + "grad_norm": 13.677606899149971, + "learning_rate": 3.977909490837242e-06, + "loss": 0.5259, + "step": 3937 + }, + { + "epoch": 0.3198245756517502, + "grad_norm": 6.128118023978932, + "learning_rate": 3.977379033542225e-06, + "loss": 0.438, + "step": 3938 + }, + { + "epoch": 0.3199057906277918, + "grad_norm": 6.303253700795032, + "learning_rate": 3.976848474020397e-06, + "loss": 0.4292, + "step": 3939 + }, + { + "epoch": 0.31998700560383336, + "grad_norm": 10.396013099121385, + "learning_rate": 3.97631781230847e-06, + "loss": 0.635, + "step": 3940 + }, + { + "epoch": 0.32006822057987494, + "grad_norm": 3.2724038031097575, + "learning_rate": 3.975787048443165e-06, + "loss": 0.7294, + "step": 3941 + }, + { + "epoch": 0.3201494355559165, + "grad_norm": 5.0292671115469645, + "learning_rate": 3.975256182461206e-06, + "loss": 0.5492, + "step": 3942 + }, + { + "epoch": 0.3202306505319581, + "grad_norm": 15.222409035535888, + "learning_rate": 3.9747252143993265e-06, + "loss": 0.5846, + "step": 3943 + }, + { + "epoch": 0.32031186550799967, + "grad_norm": 3.721523920711308, + "learning_rate": 3.9741941442942685e-06, + "loss": 0.7537, + "step": 3944 + }, + { + "epoch": 0.32039308048404125, + "grad_norm": 9.490205761589818, + "learning_rate": 3.973662972182777e-06, + "loss": 0.4823, + "step": 3945 + }, + { + "epoch": 0.3204742954600828, + "grad_norm": 5.92729992222592, + "learning_rate": 3.973131698101606e-06, + "loss": 0.6342, + "step": 3946 + }, + { + "epoch": 0.3205555104361244, + "grad_norm": 3.889265293501103, + "learning_rate": 3.97260032208752e-06, + "loss": 0.6895, + "step": 3947 + }, + { + "epoch": 0.320636725412166, + "grad_norm": 5.718930345280851, + "learning_rate": 3.972068844177284e-06, + "loss": 0.5711, + "step": 3948 + }, + { + "epoch": 0.3207179403882076, + "grad_norm": 5.665338399507542, + "learning_rate": 3.971537264407674e-06, + "loss": 0.4434, + "step": 3949 + }, + { + "epoch": 0.3207991553642492, + "grad_norm": 5.195191497507738, + "learning_rate": 3.971005582815475e-06, + "loss": 0.5603, + "step": 3950 + }, + { + "epoch": 0.32088037034029077, + "grad_norm": 4.595813111552932, + "learning_rate": 3.970473799437475e-06, + "loss": 0.5229, + "step": 3951 + }, + { + "epoch": 0.32096158531633234, + "grad_norm": 7.011551829999244, + "learning_rate": 3.969941914310469e-06, + "loss": 0.5054, + "step": 3952 + }, + { + "epoch": 0.3210428002923739, + "grad_norm": 5.133064719413326, + "learning_rate": 3.969409927471263e-06, + "loss": 0.4704, + "step": 3953 + }, + { + "epoch": 0.3211240152684155, + "grad_norm": 4.606505759316552, + "learning_rate": 3.968877838956667e-06, + "loss": 0.525, + "step": 3954 + }, + { + "epoch": 0.3212052302444571, + "grad_norm": 5.298403703345239, + "learning_rate": 3.968345648803497e-06, + "loss": 0.4454, + "step": 3955 + }, + { + "epoch": 0.32128644522049865, + "grad_norm": 3.561393835187397, + "learning_rate": 3.96781335704858e-06, + "loss": 0.5577, + "step": 3956 + }, + { + "epoch": 0.32136766019654023, + "grad_norm": 4.307683957878129, + "learning_rate": 3.967280963728748e-06, + "loss": 0.4494, + "step": 3957 + }, + { + "epoch": 0.3214488751725818, + "grad_norm": 6.747635355041429, + "learning_rate": 3.966748468880838e-06, + "loss": 0.6197, + "step": 3958 + }, + { + "epoch": 0.3215300901486234, + "grad_norm": 7.367076215445246, + "learning_rate": 3.9662158725416964e-06, + "loss": 0.5736, + "step": 3959 + }, + { + "epoch": 0.321611305124665, + "grad_norm": 4.986542323666267, + "learning_rate": 3.965683174748176e-06, + "loss": 0.6222, + "step": 3960 + }, + { + "epoch": 0.3216925201007066, + "grad_norm": 4.343391538341711, + "learning_rate": 3.965150375537137e-06, + "loss": 0.4032, + "step": 3961 + }, + { + "epoch": 0.32177373507674817, + "grad_norm": 10.317677958429377, + "learning_rate": 3.964617474945447e-06, + "loss": 0.5128, + "step": 3962 + }, + { + "epoch": 0.32185495005278975, + "grad_norm": 5.753574432660448, + "learning_rate": 3.9640844730099795e-06, + "loss": 0.5081, + "step": 3963 + }, + { + "epoch": 0.3219361650288313, + "grad_norm": 3.9575067307483045, + "learning_rate": 3.963551369767613e-06, + "loss": 0.5913, + "step": 3964 + }, + { + "epoch": 0.3220173800048729, + "grad_norm": 4.7333217124955365, + "learning_rate": 3.963018165255239e-06, + "loss": 0.5454, + "step": 3965 + }, + { + "epoch": 0.3220985949809145, + "grad_norm": 4.107636292267094, + "learning_rate": 3.962484859509751e-06, + "loss": 0.4283, + "step": 3966 + }, + { + "epoch": 0.32217980995695605, + "grad_norm": 3.710054258246495, + "learning_rate": 3.96195145256805e-06, + "loss": 0.5421, + "step": 3967 + }, + { + "epoch": 0.32226102493299763, + "grad_norm": 4.914093323605704, + "learning_rate": 3.961417944467046e-06, + "loss": 0.624, + "step": 3968 + }, + { + "epoch": 0.3223422399090392, + "grad_norm": 6.367334808510856, + "learning_rate": 3.960884335243655e-06, + "loss": 0.557, + "step": 3969 + }, + { + "epoch": 0.3224234548850808, + "grad_norm": 7.4083421441872455, + "learning_rate": 3.9603506249348e-06, + "loss": 0.7381, + "step": 3970 + }, + { + "epoch": 0.3225046698611224, + "grad_norm": 5.365590371657556, + "learning_rate": 3.959816813577409e-06, + "loss": 0.4419, + "step": 3971 + }, + { + "epoch": 0.322585884837164, + "grad_norm": 4.032690708340649, + "learning_rate": 3.959282901208422e-06, + "loss": 0.5859, + "step": 3972 + }, + { + "epoch": 0.32266709981320557, + "grad_norm": 5.285352909806383, + "learning_rate": 3.9587488878647816e-06, + "loss": 0.5464, + "step": 3973 + }, + { + "epoch": 0.32274831478924715, + "grad_norm": 6.70864234598179, + "learning_rate": 3.958214773583437e-06, + "loss": 0.5481, + "step": 3974 + }, + { + "epoch": 0.3228295297652887, + "grad_norm": 8.065418291262025, + "learning_rate": 3.957680558401348e-06, + "loss": 0.5129, + "step": 3975 + }, + { + "epoch": 0.3229107447413303, + "grad_norm": 5.8858306697747755, + "learning_rate": 3.95714624235548e-06, + "loss": 0.5457, + "step": 3976 + }, + { + "epoch": 0.3229919597173719, + "grad_norm": 4.006111281727507, + "learning_rate": 3.956611825482803e-06, + "loss": 0.5355, + "step": 3977 + }, + { + "epoch": 0.32307317469341346, + "grad_norm": 5.150201779401238, + "learning_rate": 3.956077307820296e-06, + "loss": 0.4682, + "step": 3978 + }, + { + "epoch": 0.32315438966945503, + "grad_norm": 5.941131977925157, + "learning_rate": 3.955542689404948e-06, + "loss": 0.5036, + "step": 3979 + }, + { + "epoch": 0.3232356046454966, + "grad_norm": 5.8735315327446, + "learning_rate": 3.955007970273747e-06, + "loss": 0.7358, + "step": 3980 + }, + { + "epoch": 0.3233168196215382, + "grad_norm": 6.004080782992699, + "learning_rate": 3.954473150463696e-06, + "loss": 0.4277, + "step": 3981 + }, + { + "epoch": 0.3233980345975798, + "grad_norm": 3.969562702014225, + "learning_rate": 3.9539382300117995e-06, + "loss": 0.6674, + "step": 3982 + }, + { + "epoch": 0.3234792495736214, + "grad_norm": 5.552544120013278, + "learning_rate": 3.953403208955074e-06, + "loss": 0.5466, + "step": 3983 + }, + { + "epoch": 0.323560464549663, + "grad_norm": 6.198037037595219, + "learning_rate": 3.952868087330537e-06, + "loss": 0.5557, + "step": 3984 + }, + { + "epoch": 0.32364167952570455, + "grad_norm": 3.7865591645101278, + "learning_rate": 3.952332865175218e-06, + "loss": 0.622, + "step": 3985 + }, + { + "epoch": 0.32372289450174613, + "grad_norm": 4.933123244556156, + "learning_rate": 3.951797542526151e-06, + "loss": 0.5681, + "step": 3986 + }, + { + "epoch": 0.3238041094777877, + "grad_norm": 5.397627483042807, + "learning_rate": 3.951262119420378e-06, + "loss": 0.4988, + "step": 3987 + }, + { + "epoch": 0.3238853244538293, + "grad_norm": 4.851399386478982, + "learning_rate": 3.950726595894947e-06, + "loss": 0.4111, + "step": 3988 + }, + { + "epoch": 0.32396653942987086, + "grad_norm": 3.1067739446054636, + "learning_rate": 3.950190971986913e-06, + "loss": 0.6122, + "step": 3989 + }, + { + "epoch": 0.32404775440591244, + "grad_norm": 5.367173504535532, + "learning_rate": 3.9496552477333396e-06, + "loss": 0.5503, + "step": 3990 + }, + { + "epoch": 0.324128969381954, + "grad_norm": 6.546774201268415, + "learning_rate": 3.9491194231712945e-06, + "loss": 0.4951, + "step": 3991 + }, + { + "epoch": 0.3242101843579956, + "grad_norm": 3.3008359196847876, + "learning_rate": 3.948583498337854e-06, + "loss": 0.4695, + "step": 3992 + }, + { + "epoch": 0.3242913993340372, + "grad_norm": 6.2106989027955315, + "learning_rate": 3.9480474732701034e-06, + "loss": 0.6426, + "step": 3993 + }, + { + "epoch": 0.3243726143100788, + "grad_norm": 7.437097992706253, + "learning_rate": 3.9475113480051305e-06, + "loss": 0.5088, + "step": 3994 + }, + { + "epoch": 0.3244538292861204, + "grad_norm": 4.649897071407217, + "learning_rate": 3.9469751225800344e-06, + "loss": 0.3348, + "step": 3995 + }, + { + "epoch": 0.32453504426216195, + "grad_norm": 7.807412990752628, + "learning_rate": 3.946438797031916e-06, + "loss": 0.5809, + "step": 3996 + }, + { + "epoch": 0.32461625923820353, + "grad_norm": 10.891441494223873, + "learning_rate": 3.9459023713978895e-06, + "loss": 0.4846, + "step": 3997 + }, + { + "epoch": 0.3246974742142451, + "grad_norm": 8.904826186725101, + "learning_rate": 3.94536584571507e-06, + "loss": 0.6143, + "step": 3998 + }, + { + "epoch": 0.3247786891902867, + "grad_norm": 5.6954977350735785, + "learning_rate": 3.944829220020584e-06, + "loss": 0.5203, + "step": 3999 + }, + { + "epoch": 0.32485990416632826, + "grad_norm": 4.93125171789273, + "learning_rate": 3.944292494351563e-06, + "loss": 0.6325, + "step": 4000 + }, + { + "epoch": 0.32494111914236984, + "grad_norm": 4.965862139362356, + "learning_rate": 3.943755668745145e-06, + "loss": 0.5805, + "step": 4001 + }, + { + "epoch": 0.3250223341184114, + "grad_norm": 5.16787159248045, + "learning_rate": 3.943218743238476e-06, + "loss": 0.5562, + "step": 4002 + }, + { + "epoch": 0.325103549094453, + "grad_norm": 7.5362717832466055, + "learning_rate": 3.942681717868707e-06, + "loss": 0.5688, + "step": 4003 + }, + { + "epoch": 0.3251847640704946, + "grad_norm": 4.981243847271942, + "learning_rate": 3.942144592673e-06, + "loss": 0.5032, + "step": 4004 + }, + { + "epoch": 0.3252659790465362, + "grad_norm": 5.040020101083127, + "learning_rate": 3.941607367688518e-06, + "loss": 0.6819, + "step": 4005 + }, + { + "epoch": 0.3253471940225778, + "grad_norm": 5.368844738609336, + "learning_rate": 3.941070042952437e-06, + "loss": 0.602, + "step": 4006 + }, + { + "epoch": 0.32542840899861936, + "grad_norm": 6.620173821128225, + "learning_rate": 3.940532618501935e-06, + "loss": 0.4266, + "step": 4007 + }, + { + "epoch": 0.32550962397466093, + "grad_norm": 3.763790781156802, + "learning_rate": 3.9399950943742e-06, + "loss": 0.6278, + "step": 4008 + }, + { + "epoch": 0.3255908389507025, + "grad_norm": 5.280317791344952, + "learning_rate": 3.939457470606426e-06, + "loss": 0.5235, + "step": 4009 + }, + { + "epoch": 0.3256720539267441, + "grad_norm": 6.207070016608509, + "learning_rate": 3.938919747235812e-06, + "loss": 0.498, + "step": 4010 + }, + { + "epoch": 0.32575326890278566, + "grad_norm": 3.9148661505895923, + "learning_rate": 3.938381924299568e-06, + "loss": 0.6105, + "step": 4011 + }, + { + "epoch": 0.32583448387882724, + "grad_norm": 11.982123261840117, + "learning_rate": 3.937844001834907e-06, + "loss": 0.4771, + "step": 4012 + }, + { + "epoch": 0.3259156988548688, + "grad_norm": 8.177076369022492, + "learning_rate": 3.93730597987905e-06, + "loss": 0.5762, + "step": 4013 + }, + { + "epoch": 0.3259969138309104, + "grad_norm": 4.802261707731817, + "learning_rate": 3.936767858469228e-06, + "loss": 0.4574, + "step": 4014 + }, + { + "epoch": 0.32607812880695203, + "grad_norm": 4.140558850278409, + "learning_rate": 3.936229637642672e-06, + "loss": 0.4855, + "step": 4015 + }, + { + "epoch": 0.3261593437829936, + "grad_norm": 3.5682197931038644, + "learning_rate": 3.935691317436628e-06, + "loss": 0.6508, + "step": 4016 + }, + { + "epoch": 0.3262405587590352, + "grad_norm": 4.124877825005949, + "learning_rate": 3.9351528978883425e-06, + "loss": 0.6438, + "step": 4017 + }, + { + "epoch": 0.32632177373507676, + "grad_norm": 5.830213074804944, + "learning_rate": 3.934614379035071e-06, + "loss": 0.5822, + "step": 4018 + }, + { + "epoch": 0.32640298871111834, + "grad_norm": 8.315171390345546, + "learning_rate": 3.9340757609140785e-06, + "loss": 0.5308, + "step": 4019 + }, + { + "epoch": 0.3264842036871599, + "grad_norm": 3.4244913994169313, + "learning_rate": 3.933537043562632e-06, + "loss": 0.4855, + "step": 4020 + }, + { + "epoch": 0.3265654186632015, + "grad_norm": 4.156835180938438, + "learning_rate": 3.932998227018009e-06, + "loss": 0.6688, + "step": 4021 + }, + { + "epoch": 0.32664663363924307, + "grad_norm": 3.4822522397229956, + "learning_rate": 3.932459311317494e-06, + "loss": 0.5877, + "step": 4022 + }, + { + "epoch": 0.32672784861528464, + "grad_norm": 4.707979946257839, + "learning_rate": 3.931920296498374e-06, + "loss": 0.5496, + "step": 4023 + }, + { + "epoch": 0.3268090635913262, + "grad_norm": 5.369501192774783, + "learning_rate": 3.931381182597949e-06, + "loss": 0.4053, + "step": 4024 + }, + { + "epoch": 0.3268902785673678, + "grad_norm": 7.0165291185775, + "learning_rate": 3.930841969653521e-06, + "loss": 0.5032, + "step": 4025 + }, + { + "epoch": 0.32697149354340943, + "grad_norm": 3.7285656291064844, + "learning_rate": 3.930302657702402e-06, + "loss": 0.5277, + "step": 4026 + }, + { + "epoch": 0.327052708519451, + "grad_norm": 4.479547869133663, + "learning_rate": 3.929763246781909e-06, + "loss": 0.5248, + "step": 4027 + }, + { + "epoch": 0.3271339234954926, + "grad_norm": 5.115409985178063, + "learning_rate": 3.929223736929366e-06, + "loss": 0.4248, + "step": 4028 + }, + { + "epoch": 0.32721513847153416, + "grad_norm": 6.753042464948224, + "learning_rate": 3.928684128182104e-06, + "loss": 0.5976, + "step": 4029 + }, + { + "epoch": 0.32729635344757574, + "grad_norm": 6.503506149392067, + "learning_rate": 3.9281444205774625e-06, + "loss": 0.5214, + "step": 4030 + }, + { + "epoch": 0.3273775684236173, + "grad_norm": 3.8958786789276787, + "learning_rate": 3.927604614152784e-06, + "loss": 0.6028, + "step": 4031 + }, + { + "epoch": 0.3274587833996589, + "grad_norm": 4.602684285068746, + "learning_rate": 3.927064708945423e-06, + "loss": 0.5836, + "step": 4032 + }, + { + "epoch": 0.32753999837570047, + "grad_norm": 5.4356792062574515, + "learning_rate": 3.926524704992736e-06, + "loss": 0.4976, + "step": 4033 + }, + { + "epoch": 0.32762121335174205, + "grad_norm": 5.1282337398644, + "learning_rate": 3.9259846023320895e-06, + "loss": 0.6328, + "step": 4034 + }, + { + "epoch": 0.3277024283277836, + "grad_norm": 5.574289088333828, + "learning_rate": 3.925444401000855e-06, + "loss": 0.5181, + "step": 4035 + }, + { + "epoch": 0.3277836433038252, + "grad_norm": 4.638010244640014, + "learning_rate": 3.924904101036413e-06, + "loss": 0.8373, + "step": 4036 + }, + { + "epoch": 0.32786485827986683, + "grad_norm": 3.1082588154104225, + "learning_rate": 3.924363702476147e-06, + "loss": 0.5468, + "step": 4037 + }, + { + "epoch": 0.3279460732559084, + "grad_norm": 3.8088490874389427, + "learning_rate": 3.923823205357453e-06, + "loss": 0.555, + "step": 4038 + }, + { + "epoch": 0.32802728823195, + "grad_norm": 3.64966498327372, + "learning_rate": 3.923282609717727e-06, + "loss": 0.4554, + "step": 4039 + }, + { + "epoch": 0.32810850320799156, + "grad_norm": 5.086174759269899, + "learning_rate": 3.922741915594378e-06, + "loss": 0.5311, + "step": 4040 + }, + { + "epoch": 0.32818971818403314, + "grad_norm": 3.596353017219979, + "learning_rate": 3.9222011230248175e-06, + "loss": 0.6889, + "step": 4041 + }, + { + "epoch": 0.3282709331600747, + "grad_norm": 4.391572461946081, + "learning_rate": 3.9216602320464655e-06, + "loss": 0.6185, + "step": 4042 + }, + { + "epoch": 0.3283521481361163, + "grad_norm": 3.615775625258503, + "learning_rate": 3.921119242696751e-06, + "loss": 0.5438, + "step": 4043 + }, + { + "epoch": 0.3284333631121579, + "grad_norm": 5.820794815401509, + "learning_rate": 3.920578155013106e-06, + "loss": 0.6468, + "step": 4044 + }, + { + "epoch": 0.32851457808819945, + "grad_norm": 5.467081465670503, + "learning_rate": 3.92003696903297e-06, + "loss": 0.4855, + "step": 4045 + }, + { + "epoch": 0.328595793064241, + "grad_norm": 5.561814863652521, + "learning_rate": 3.919495684793792e-06, + "loss": 0.6468, + "step": 4046 + }, + { + "epoch": 0.3286770080402826, + "grad_norm": 3.807058673477291, + "learning_rate": 3.918954302333025e-06, + "loss": 0.5918, + "step": 4047 + }, + { + "epoch": 0.32875822301632424, + "grad_norm": 13.31883734675245, + "learning_rate": 3.91841282168813e-06, + "loss": 0.6432, + "step": 4048 + }, + { + "epoch": 0.3288394379923658, + "grad_norm": 7.056921038692695, + "learning_rate": 3.917871242896575e-06, + "loss": 0.4209, + "step": 4049 + }, + { + "epoch": 0.3289206529684074, + "grad_norm": 3.9195936437606025, + "learning_rate": 3.917329565995833e-06, + "loss": 0.5187, + "step": 4050 + }, + { + "epoch": 0.32900186794444897, + "grad_norm": 4.550380167725966, + "learning_rate": 3.916787791023386e-06, + "loss": 0.5355, + "step": 4051 + }, + { + "epoch": 0.32908308292049054, + "grad_norm": 3.6800755148359148, + "learning_rate": 3.916245918016724e-06, + "loss": 0.5518, + "step": 4052 + }, + { + "epoch": 0.3291642978965321, + "grad_norm": 3.478748983074201, + "learning_rate": 3.915703947013338e-06, + "loss": 0.7117, + "step": 4053 + }, + { + "epoch": 0.3292455128725737, + "grad_norm": 3.7123489856745957, + "learning_rate": 3.9151618780507316e-06, + "loss": 0.5708, + "step": 4054 + }, + { + "epoch": 0.3293267278486153, + "grad_norm": 5.643512507779105, + "learning_rate": 3.914619711166413e-06, + "loss": 0.5249, + "step": 4055 + }, + { + "epoch": 0.32940794282465685, + "grad_norm": 3.920365869559619, + "learning_rate": 3.914077446397897e-06, + "loss": 0.493, + "step": 4056 + }, + { + "epoch": 0.32948915780069843, + "grad_norm": 5.253760200255425, + "learning_rate": 3.913535083782707e-06, + "loss": 0.4857, + "step": 4057 + }, + { + "epoch": 0.32957037277674, + "grad_norm": 5.037318300624211, + "learning_rate": 3.912992623358368e-06, + "loss": 0.5694, + "step": 4058 + }, + { + "epoch": 0.32965158775278164, + "grad_norm": 16.068629096649826, + "learning_rate": 3.91245006516242e-06, + "loss": 0.407, + "step": 4059 + }, + { + "epoch": 0.3297328027288232, + "grad_norm": 8.438998004731252, + "learning_rate": 3.911907409232402e-06, + "loss": 0.5709, + "step": 4060 + }, + { + "epoch": 0.3298140177048648, + "grad_norm": 4.031413575021964, + "learning_rate": 3.911364655605863e-06, + "loss": 0.5698, + "step": 4061 + }, + { + "epoch": 0.32989523268090637, + "grad_norm": 6.129803113623354, + "learning_rate": 3.9108218043203595e-06, + "loss": 0.5103, + "step": 4062 + }, + { + "epoch": 0.32997644765694795, + "grad_norm": 5.08608452932562, + "learning_rate": 3.910278855413454e-06, + "loss": 0.6426, + "step": 4063 + }, + { + "epoch": 0.3300576626329895, + "grad_norm": 4.487322847861987, + "learning_rate": 3.909735808922716e-06, + "loss": 0.5326, + "step": 4064 + }, + { + "epoch": 0.3301388776090311, + "grad_norm": 5.951925940505768, + "learning_rate": 3.90919266488572e-06, + "loss": 0.481, + "step": 4065 + }, + { + "epoch": 0.3302200925850727, + "grad_norm": 5.479126876701789, + "learning_rate": 3.908649423340049e-06, + "loss": 0.5089, + "step": 4066 + }, + { + "epoch": 0.33030130756111425, + "grad_norm": 4.922261403979778, + "learning_rate": 3.908106084323295e-06, + "loss": 0.3857, + "step": 4067 + }, + { + "epoch": 0.33038252253715583, + "grad_norm": 5.574237760697828, + "learning_rate": 3.9075626478730515e-06, + "loss": 0.6416, + "step": 4068 + }, + { + "epoch": 0.3304637375131974, + "grad_norm": 3.790844587172487, + "learning_rate": 3.907019114026922e-06, + "loss": 0.5985, + "step": 4069 + }, + { + "epoch": 0.33054495248923904, + "grad_norm": 6.112434755772752, + "learning_rate": 3.906475482822517e-06, + "loss": 0.5749, + "step": 4070 + }, + { + "epoch": 0.3306261674652806, + "grad_norm": 3.6083314382705836, + "learning_rate": 3.905931754297451e-06, + "loss": 0.5349, + "step": 4071 + }, + { + "epoch": 0.3307073824413222, + "grad_norm": 7.284570880294314, + "learning_rate": 3.905387928489349e-06, + "loss": 1.0363, + "step": 4072 + }, + { + "epoch": 0.33078859741736377, + "grad_norm": 4.232160296762864, + "learning_rate": 3.904844005435841e-06, + "loss": 0.5585, + "step": 4073 + }, + { + "epoch": 0.33086981239340535, + "grad_norm": 6.814310707158021, + "learning_rate": 3.904299985174562e-06, + "loss": 0.6862, + "step": 4074 + }, + { + "epoch": 0.3309510273694469, + "grad_norm": 4.171877579913468, + "learning_rate": 3.903755867743156e-06, + "loss": 0.5983, + "step": 4075 + }, + { + "epoch": 0.3310322423454885, + "grad_norm": 7.101609370383942, + "learning_rate": 3.9032116531792745e-06, + "loss": 0.5303, + "step": 4076 + }, + { + "epoch": 0.3311134573215301, + "grad_norm": 5.802670937749591, + "learning_rate": 3.902667341520572e-06, + "loss": 0.5949, + "step": 4077 + }, + { + "epoch": 0.33119467229757166, + "grad_norm": 4.414654939089757, + "learning_rate": 3.902122932804713e-06, + "loss": 0.5337, + "step": 4078 + }, + { + "epoch": 0.33127588727361323, + "grad_norm": 5.085650564698827, + "learning_rate": 3.901578427069368e-06, + "loss": 0.5897, + "step": 4079 + }, + { + "epoch": 0.3313571022496548, + "grad_norm": 3.7722391508616036, + "learning_rate": 3.901033824352213e-06, + "loss": 0.5486, + "step": 4080 + }, + { + "epoch": 0.33143831722569644, + "grad_norm": 6.237526026700791, + "learning_rate": 3.9004891246909325e-06, + "loss": 0.5186, + "step": 4081 + }, + { + "epoch": 0.331519532201738, + "grad_norm": 4.2314785310516365, + "learning_rate": 3.8999443281232175e-06, + "loss": 0.727, + "step": 4082 + }, + { + "epoch": 0.3316007471777796, + "grad_norm": 4.575453746497402, + "learning_rate": 3.899399434686762e-06, + "loss": 0.4363, + "step": 4083 + }, + { + "epoch": 0.3316819621538212, + "grad_norm": 5.329498253898677, + "learning_rate": 3.898854444419274e-06, + "loss": 0.4231, + "step": 4084 + }, + { + "epoch": 0.33176317712986275, + "grad_norm": 7.175459151315008, + "learning_rate": 3.8983093573584605e-06, + "loss": 0.4597, + "step": 4085 + }, + { + "epoch": 0.33184439210590433, + "grad_norm": 4.3131878789803855, + "learning_rate": 3.89776417354204e-06, + "loss": 0.4702, + "step": 4086 + }, + { + "epoch": 0.3319256070819459, + "grad_norm": 4.921638293579504, + "learning_rate": 3.897218893007737e-06, + "loss": 0.5382, + "step": 4087 + }, + { + "epoch": 0.3320068220579875, + "grad_norm": 5.520197777335041, + "learning_rate": 3.896673515793281e-06, + "loss": 0.5162, + "step": 4088 + }, + { + "epoch": 0.33208803703402906, + "grad_norm": 4.2991891020242265, + "learning_rate": 3.89612804193641e-06, + "loss": 0.4928, + "step": 4089 + }, + { + "epoch": 0.33216925201007064, + "grad_norm": 5.160012794460195, + "learning_rate": 3.895582471474866e-06, + "loss": 0.5771, + "step": 4090 + }, + { + "epoch": 0.3322504669861122, + "grad_norm": 4.051259036428905, + "learning_rate": 3.895036804446402e-06, + "loss": 0.4006, + "step": 4091 + }, + { + "epoch": 0.33233168196215385, + "grad_norm": 8.065546594512844, + "learning_rate": 3.894491040888774e-06, + "loss": 0.7044, + "step": 4092 + }, + { + "epoch": 0.3324128969381954, + "grad_norm": 5.703512084951918, + "learning_rate": 3.893945180839747e-06, + "loss": 0.6109, + "step": 4093 + }, + { + "epoch": 0.332494111914237, + "grad_norm": 6.178693238141215, + "learning_rate": 3.893399224337089e-06, + "loss": 0.5347, + "step": 4094 + }, + { + "epoch": 0.3325753268902786, + "grad_norm": 3.9476533225077293, + "learning_rate": 3.892853171418581e-06, + "loss": 0.7827, + "step": 4095 + }, + { + "epoch": 0.33265654186632015, + "grad_norm": 6.5472391710424755, + "learning_rate": 3.8923070221220035e-06, + "loss": 0.5795, + "step": 4096 + }, + { + "epoch": 0.33273775684236173, + "grad_norm": 6.934804355699152, + "learning_rate": 3.891760776485151e-06, + "loss": 0.4096, + "step": 4097 + }, + { + "epoch": 0.3328189718184033, + "grad_norm": 4.704286066411406, + "learning_rate": 3.891214434545817e-06, + "loss": 0.5433, + "step": 4098 + }, + { + "epoch": 0.3329001867944449, + "grad_norm": 6.41984014774679, + "learning_rate": 3.890667996341806e-06, + "loss": 0.4911, + "step": 4099 + }, + { + "epoch": 0.33298140177048646, + "grad_norm": 5.021861875613623, + "learning_rate": 3.8901214619109315e-06, + "loss": 0.6617, + "step": 4100 + }, + { + "epoch": 0.33306261674652804, + "grad_norm": 5.862846079718947, + "learning_rate": 3.889574831291008e-06, + "loss": 0.5158, + "step": 4101 + }, + { + "epoch": 0.3331438317225696, + "grad_norm": 3.942541898863967, + "learning_rate": 3.88902810451986e-06, + "loss": 0.6377, + "step": 4102 + }, + { + "epoch": 0.33322504669861125, + "grad_norm": 4.915257424154396, + "learning_rate": 3.88848128163532e-06, + "loss": 0.516, + "step": 4103 + }, + { + "epoch": 0.3333062616746528, + "grad_norm": 4.848991134824205, + "learning_rate": 3.887934362675223e-06, + "loss": 0.427, + "step": 4104 + }, + { + "epoch": 0.3333874766506944, + "grad_norm": 5.880446166851823, + "learning_rate": 3.887387347677413e-06, + "loss": 0.4525, + "step": 4105 + }, + { + "epoch": 0.333468691626736, + "grad_norm": 7.1524682868349405, + "learning_rate": 3.886840236679742e-06, + "loss": 0.4618, + "step": 4106 + }, + { + "epoch": 0.33354990660277756, + "grad_norm": 4.096340585575771, + "learning_rate": 3.8862930297200665e-06, + "loss": 0.4553, + "step": 4107 + }, + { + "epoch": 0.33363112157881913, + "grad_norm": 3.963306450481558, + "learning_rate": 3.885745726836249e-06, + "loss": 0.5802, + "step": 4108 + }, + { + "epoch": 0.3337123365548607, + "grad_norm": 7.460142742604042, + "learning_rate": 3.885198328066163e-06, + "loss": 0.4519, + "step": 4109 + }, + { + "epoch": 0.3337935515309023, + "grad_norm": 4.260196594904515, + "learning_rate": 3.8846508334476824e-06, + "loss": 0.5484, + "step": 4110 + }, + { + "epoch": 0.33387476650694387, + "grad_norm": 3.7783084323283074, + "learning_rate": 3.884103243018693e-06, + "loss": 0.5141, + "step": 4111 + }, + { + "epoch": 0.33395598148298544, + "grad_norm": 5.894477787381174, + "learning_rate": 3.883555556817083e-06, + "loss": 0.6465, + "step": 4112 + }, + { + "epoch": 0.334037196459027, + "grad_norm": 3.0063514235175917, + "learning_rate": 3.883007774880753e-06, + "loss": 0.4949, + "step": 4113 + }, + { + "epoch": 0.33411841143506865, + "grad_norm": 5.033096732224493, + "learning_rate": 3.882459897247603e-06, + "loss": 0.3705, + "step": 4114 + }, + { + "epoch": 0.33419962641111023, + "grad_norm": 5.991095754571292, + "learning_rate": 3.881911923955545e-06, + "loss": 0.4903, + "step": 4115 + }, + { + "epoch": 0.3342808413871518, + "grad_norm": 6.237218731747614, + "learning_rate": 3.881363855042496e-06, + "loss": 0.5439, + "step": 4116 + }, + { + "epoch": 0.3343620563631934, + "grad_norm": 4.522440392622378, + "learning_rate": 3.880815690546378e-06, + "loss": 0.4756, + "step": 4117 + }, + { + "epoch": 0.33444327133923496, + "grad_norm": 12.342772678402822, + "learning_rate": 3.880267430505123e-06, + "loss": 0.5839, + "step": 4118 + }, + { + "epoch": 0.33452448631527654, + "grad_norm": 4.148599127985545, + "learning_rate": 3.879719074956667e-06, + "loss": 0.5127, + "step": 4119 + }, + { + "epoch": 0.3346057012913181, + "grad_norm": 4.58831638518393, + "learning_rate": 3.879170623938951e-06, + "loss": 0.6514, + "step": 4120 + }, + { + "epoch": 0.3346869162673597, + "grad_norm": 4.795703097401847, + "learning_rate": 3.878622077489929e-06, + "loss": 0.5412, + "step": 4121 + }, + { + "epoch": 0.33476813124340127, + "grad_norm": 2.8973056633483325, + "learning_rate": 3.8780734356475555e-06, + "loss": 0.5741, + "step": 4122 + }, + { + "epoch": 0.33484934621944284, + "grad_norm": 3.5866605917438474, + "learning_rate": 3.8775246984497924e-06, + "loss": 0.5285, + "step": 4123 + }, + { + "epoch": 0.3349305611954844, + "grad_norm": 4.6085635915185, + "learning_rate": 3.876975865934612e-06, + "loss": 0.6005, + "step": 4124 + }, + { + "epoch": 0.33501177617152605, + "grad_norm": 8.852381490643886, + "learning_rate": 3.876426938139988e-06, + "loss": 0.3847, + "step": 4125 + }, + { + "epoch": 0.33509299114756763, + "grad_norm": 4.677167179306538, + "learning_rate": 3.875877915103905e-06, + "loss": 0.5997, + "step": 4126 + }, + { + "epoch": 0.3351742061236092, + "grad_norm": 5.192252334603382, + "learning_rate": 3.875328796864353e-06, + "loss": 0.4351, + "step": 4127 + }, + { + "epoch": 0.3352554210996508, + "grad_norm": 5.52994136456474, + "learning_rate": 3.8747795834593255e-06, + "loss": 0.5296, + "step": 4128 + }, + { + "epoch": 0.33533663607569236, + "grad_norm": 9.716260977214207, + "learning_rate": 3.8742302749268264e-06, + "loss": 0.5424, + "step": 4129 + }, + { + "epoch": 0.33541785105173394, + "grad_norm": 5.297918507808388, + "learning_rate": 3.873680871304867e-06, + "loss": 0.6156, + "step": 4130 + }, + { + "epoch": 0.3354990660277755, + "grad_norm": 6.873079350204181, + "learning_rate": 3.8731313726314615e-06, + "loss": 0.5344, + "step": 4131 + }, + { + "epoch": 0.3355802810038171, + "grad_norm": 7.36681510636934, + "learning_rate": 3.87258177894463e-06, + "loss": 0.4929, + "step": 4132 + }, + { + "epoch": 0.33566149597985867, + "grad_norm": 5.602356345828153, + "learning_rate": 3.872032090282406e-06, + "loss": 0.5442, + "step": 4133 + }, + { + "epoch": 0.33574271095590025, + "grad_norm": 7.3672095285990435, + "learning_rate": 3.871482306682821e-06, + "loss": 0.492, + "step": 4134 + }, + { + "epoch": 0.3358239259319418, + "grad_norm": 3.5366167997788427, + "learning_rate": 3.8709324281839205e-06, + "loss": 0.5198, + "step": 4135 + }, + { + "epoch": 0.33590514090798346, + "grad_norm": 6.858040921971297, + "learning_rate": 3.87038245482375e-06, + "loss": 0.5951, + "step": 4136 + }, + { + "epoch": 0.33598635588402503, + "grad_norm": 6.617980225631782, + "learning_rate": 3.869832386640367e-06, + "loss": 0.5853, + "step": 4137 + }, + { + "epoch": 0.3360675708600666, + "grad_norm": 4.219229609437151, + "learning_rate": 3.8692822236718334e-06, + "loss": 0.5662, + "step": 4138 + }, + { + "epoch": 0.3361487858361082, + "grad_norm": 7.439696385730913, + "learning_rate": 3.868731965956215e-06, + "loss": 0.4658, + "step": 4139 + }, + { + "epoch": 0.33623000081214977, + "grad_norm": 5.487826086727248, + "learning_rate": 3.86818161353159e-06, + "loss": 0.4537, + "step": 4140 + }, + { + "epoch": 0.33631121578819134, + "grad_norm": 4.200360102230179, + "learning_rate": 3.867631166436038e-06, + "loss": 0.5663, + "step": 4141 + }, + { + "epoch": 0.3363924307642329, + "grad_norm": 6.128604530592701, + "learning_rate": 3.867080624707647e-06, + "loss": 0.6134, + "step": 4142 + }, + { + "epoch": 0.3364736457402745, + "grad_norm": 8.81104997626726, + "learning_rate": 3.866529988384512e-06, + "loss": 0.5773, + "step": 4143 + }, + { + "epoch": 0.3365548607163161, + "grad_norm": 5.16122326363201, + "learning_rate": 3.865979257504734e-06, + "loss": 0.6132, + "step": 4144 + }, + { + "epoch": 0.33663607569235765, + "grad_norm": 5.870897322328888, + "learning_rate": 3.8654284321064205e-06, + "loss": 0.6016, + "step": 4145 + }, + { + "epoch": 0.3367172906683992, + "grad_norm": 5.314926965529579, + "learning_rate": 3.864877512227686e-06, + "loss": 0.5678, + "step": 4146 + }, + { + "epoch": 0.33679850564444086, + "grad_norm": 4.818645386450623, + "learning_rate": 3.864326497906652e-06, + "loss": 0.5558, + "step": 4147 + }, + { + "epoch": 0.33687972062048244, + "grad_norm": 14.41001497081986, + "learning_rate": 3.8637753891814435e-06, + "loss": 0.5669, + "step": 4148 + }, + { + "epoch": 0.336960935596524, + "grad_norm": 10.608448522114792, + "learning_rate": 3.863224186090197e-06, + "loss": 0.5596, + "step": 4149 + }, + { + "epoch": 0.3370421505725656, + "grad_norm": 6.7059941990937775, + "learning_rate": 3.862672888671051e-06, + "loss": 0.5213, + "step": 4150 + }, + { + "epoch": 0.33712336554860717, + "grad_norm": 3.7804743609104237, + "learning_rate": 3.862121496962153e-06, + "loss": 0.7185, + "step": 4151 + }, + { + "epoch": 0.33720458052464874, + "grad_norm": 5.24213546954736, + "learning_rate": 3.861570011001658e-06, + "loss": 0.5231, + "step": 4152 + }, + { + "epoch": 0.3372857955006903, + "grad_norm": 7.547577006376037, + "learning_rate": 3.8610184308277216e-06, + "loss": 0.4785, + "step": 4153 + }, + { + "epoch": 0.3373670104767319, + "grad_norm": 2.5300512242174387, + "learning_rate": 3.860466756478514e-06, + "loss": 0.6946, + "step": 4154 + }, + { + "epoch": 0.3374482254527735, + "grad_norm": 4.653767821488039, + "learning_rate": 3.859914987992207e-06, + "loss": 0.7683, + "step": 4155 + }, + { + "epoch": 0.33752944042881505, + "grad_norm": 4.930191892040406, + "learning_rate": 3.85936312540698e-06, + "loss": 0.5301, + "step": 4156 + }, + { + "epoch": 0.33761065540485663, + "grad_norm": 5.429582397608964, + "learning_rate": 3.858811168761019e-06, + "loss": 0.4413, + "step": 4157 + }, + { + "epoch": 0.33769187038089826, + "grad_norm": 4.863739044933661, + "learning_rate": 3.8582591180925164e-06, + "loss": 0.4454, + "step": 4158 + }, + { + "epoch": 0.33777308535693984, + "grad_norm": 5.522590385270159, + "learning_rate": 3.857706973439672e-06, + "loss": 0.507, + "step": 4159 + }, + { + "epoch": 0.3378543003329814, + "grad_norm": 5.8411296020442, + "learning_rate": 3.85715473484069e-06, + "loss": 0.5373, + "step": 4160 + }, + { + "epoch": 0.337935515309023, + "grad_norm": 6.215435578421796, + "learning_rate": 3.856602402333783e-06, + "loss": 0.5099, + "step": 4161 + }, + { + "epoch": 0.33801673028506457, + "grad_norm": 3.6082614721689215, + "learning_rate": 3.85604997595717e-06, + "loss": 0.5364, + "step": 4162 + }, + { + "epoch": 0.33809794526110615, + "grad_norm": 6.122674136049005, + "learning_rate": 3.855497455749076e-06, + "loss": 0.5525, + "step": 4163 + }, + { + "epoch": 0.3381791602371477, + "grad_norm": 7.470369967210209, + "learning_rate": 3.854944841747731e-06, + "loss": 0.4537, + "step": 4164 + }, + { + "epoch": 0.3382603752131893, + "grad_norm": 3.13970191041356, + "learning_rate": 3.854392133991373e-06, + "loss": 0.4119, + "step": 4165 + }, + { + "epoch": 0.3383415901892309, + "grad_norm": 5.817426875860977, + "learning_rate": 3.853839332518249e-06, + "loss": 0.6385, + "step": 4166 + }, + { + "epoch": 0.33842280516527246, + "grad_norm": 6.211091908912435, + "learning_rate": 3.8532864373666076e-06, + "loss": 0.5794, + "step": 4167 + }, + { + "epoch": 0.33850402014131403, + "grad_norm": 5.831264510593337, + "learning_rate": 3.852733448574707e-06, + "loss": 0.53, + "step": 4168 + }, + { + "epoch": 0.33858523511735567, + "grad_norm": 6.324531527655222, + "learning_rate": 3.8521803661808105e-06, + "loss": 0.4364, + "step": 4169 + }, + { + "epoch": 0.33866645009339724, + "grad_norm": 4.057887664623083, + "learning_rate": 3.851627190223189e-06, + "loss": 0.6626, + "step": 4170 + }, + { + "epoch": 0.3387476650694388, + "grad_norm": 6.855870724748997, + "learning_rate": 3.85107392074012e-06, + "loss": 0.5495, + "step": 4171 + }, + { + "epoch": 0.3388288800454804, + "grad_norm": 4.761555193030634, + "learning_rate": 3.850520557769886e-06, + "loss": 0.5127, + "step": 4172 + }, + { + "epoch": 0.338910095021522, + "grad_norm": 4.705148709464969, + "learning_rate": 3.849967101350777e-06, + "loss": 0.4324, + "step": 4173 + }, + { + "epoch": 0.33899130999756355, + "grad_norm": 6.188404278264953, + "learning_rate": 3.849413551521089e-06, + "loss": 0.4936, + "step": 4174 + }, + { + "epoch": 0.3390725249736051, + "grad_norm": 4.011266087707078, + "learning_rate": 3.848859908319124e-06, + "loss": 0.4387, + "step": 4175 + }, + { + "epoch": 0.3391537399496467, + "grad_norm": 3.339221489587599, + "learning_rate": 3.8483061717831935e-06, + "loss": 0.6163, + "step": 4176 + }, + { + "epoch": 0.3392349549256883, + "grad_norm": 5.751859436326058, + "learning_rate": 3.8477523419516115e-06, + "loss": 0.5683, + "step": 4177 + }, + { + "epoch": 0.33931616990172986, + "grad_norm": 5.088627831398807, + "learning_rate": 3.8471984188627e-06, + "loss": 0.5431, + "step": 4178 + }, + { + "epoch": 0.33939738487777144, + "grad_norm": 5.37045736248948, + "learning_rate": 3.846644402554788e-06, + "loss": 0.7555, + "step": 4179 + }, + { + "epoch": 0.33947859985381307, + "grad_norm": 5.257725496690353, + "learning_rate": 3.84609029306621e-06, + "loss": 0.4728, + "step": 4180 + }, + { + "epoch": 0.33955981482985464, + "grad_norm": 4.575264421254666, + "learning_rate": 3.845536090435308e-06, + "loss": 0.5399, + "step": 4181 + }, + { + "epoch": 0.3396410298058962, + "grad_norm": 8.153018797116616, + "learning_rate": 3.84498179470043e-06, + "loss": 0.6235, + "step": 4182 + }, + { + "epoch": 0.3397222447819378, + "grad_norm": 6.682613939663991, + "learning_rate": 3.8444274058999295e-06, + "loss": 0.4972, + "step": 4183 + }, + { + "epoch": 0.3398034597579794, + "grad_norm": 4.184180026129862, + "learning_rate": 3.843872924072168e-06, + "loss": 0.4771, + "step": 4184 + }, + { + "epoch": 0.33988467473402095, + "grad_norm": 5.132706408054481, + "learning_rate": 3.843318349255512e-06, + "loss": 0.4656, + "step": 4185 + }, + { + "epoch": 0.33996588971006253, + "grad_norm": 4.503889789040779, + "learning_rate": 3.842763681488337e-06, + "loss": 0.5627, + "step": 4186 + }, + { + "epoch": 0.3400471046861041, + "grad_norm": 8.418497213540109, + "learning_rate": 3.84220892080902e-06, + "loss": 0.4399, + "step": 4187 + }, + { + "epoch": 0.3401283196621457, + "grad_norm": 5.704270836674129, + "learning_rate": 3.841654067255951e-06, + "loss": 0.4365, + "step": 4188 + }, + { + "epoch": 0.34020953463818726, + "grad_norm": 3.7971463715319813, + "learning_rate": 3.84109912086752e-06, + "loss": 0.6448, + "step": 4189 + }, + { + "epoch": 0.34029074961422884, + "grad_norm": 5.144040812396076, + "learning_rate": 3.840544081682128e-06, + "loss": 0.7299, + "step": 4190 + }, + { + "epoch": 0.34037196459027047, + "grad_norm": 4.053682645967981, + "learning_rate": 3.839988949738179e-06, + "loss": 0.4847, + "step": 4191 + }, + { + "epoch": 0.34045317956631205, + "grad_norm": 4.2685472291584245, + "learning_rate": 3.8394337250740886e-06, + "loss": 0.4542, + "step": 4192 + }, + { + "epoch": 0.3405343945423536, + "grad_norm": 5.165758993343546, + "learning_rate": 3.838878407728272e-06, + "loss": 0.5573, + "step": 4193 + }, + { + "epoch": 0.3406156095183952, + "grad_norm": 4.470607814383757, + "learning_rate": 3.838322997739155e-06, + "loss": 0.6386, + "step": 4194 + }, + { + "epoch": 0.3406968244944368, + "grad_norm": 6.678206627035974, + "learning_rate": 3.837767495145171e-06, + "loss": 0.6893, + "step": 4195 + }, + { + "epoch": 0.34077803947047836, + "grad_norm": 3.54047842198611, + "learning_rate": 3.837211899984756e-06, + "loss": 0.6608, + "step": 4196 + }, + { + "epoch": 0.34085925444651993, + "grad_norm": 2.4752964852325023, + "learning_rate": 3.836656212296353e-06, + "loss": 0.752, + "step": 4197 + }, + { + "epoch": 0.3409404694225615, + "grad_norm": 5.131230574367217, + "learning_rate": 3.836100432118416e-06, + "loss": 0.5224, + "step": 4198 + }, + { + "epoch": 0.3410216843986031, + "grad_norm": 7.654106670047192, + "learning_rate": 3.8355445594894e-06, + "loss": 0.5236, + "step": 4199 + }, + { + "epoch": 0.34110289937464466, + "grad_norm": 5.475141337183228, + "learning_rate": 3.834988594447768e-06, + "loss": 0.3241, + "step": 4200 + }, + { + "epoch": 0.34118411435068624, + "grad_norm": 4.9511971467274964, + "learning_rate": 3.8344325370319914e-06, + "loss": 0.5689, + "step": 4201 + }, + { + "epoch": 0.3412653293267279, + "grad_norm": 4.8632486482468, + "learning_rate": 3.833876387280546e-06, + "loss": 0.6184, + "step": 4202 + }, + { + "epoch": 0.34134654430276945, + "grad_norm": 8.274806032942742, + "learning_rate": 3.833320145231913e-06, + "loss": 0.6777, + "step": 4203 + }, + { + "epoch": 0.341427759278811, + "grad_norm": 3.411012348367635, + "learning_rate": 3.832763810924583e-06, + "loss": 0.5455, + "step": 4204 + }, + { + "epoch": 0.3415089742548526, + "grad_norm": 4.294223710510589, + "learning_rate": 3.832207384397051e-06, + "loss": 0.5742, + "step": 4205 + }, + { + "epoch": 0.3415901892308942, + "grad_norm": 7.056992590176889, + "learning_rate": 3.831650865687818e-06, + "loss": 0.8163, + "step": 4206 + }, + { + "epoch": 0.34167140420693576, + "grad_norm": 4.513215530320878, + "learning_rate": 3.831094254835393e-06, + "loss": 0.4881, + "step": 4207 + }, + { + "epoch": 0.34175261918297734, + "grad_norm": 7.046767735752562, + "learning_rate": 3.8305375518782905e-06, + "loss": 0.5084, + "step": 4208 + }, + { + "epoch": 0.3418338341590189, + "grad_norm": 4.65613704507043, + "learning_rate": 3.829980756855032e-06, + "loss": 0.4564, + "step": 4209 + }, + { + "epoch": 0.3419150491350605, + "grad_norm": 5.501379878695886, + "learning_rate": 3.829423869804143e-06, + "loss": 0.5426, + "step": 4210 + }, + { + "epoch": 0.34199626411110207, + "grad_norm": 7.787760682762794, + "learning_rate": 3.828866890764157e-06, + "loss": 0.4953, + "step": 4211 + }, + { + "epoch": 0.34207747908714364, + "grad_norm": 4.761244297351993, + "learning_rate": 3.828309819773617e-06, + "loss": 0.7606, + "step": 4212 + }, + { + "epoch": 0.3421586940631853, + "grad_norm": 6.816779994137413, + "learning_rate": 3.827752656871067e-06, + "loss": 0.4022, + "step": 4213 + }, + { + "epoch": 0.34223990903922685, + "grad_norm": 5.030056000377152, + "learning_rate": 3.827195402095059e-06, + "loss": 0.5904, + "step": 4214 + }, + { + "epoch": 0.34232112401526843, + "grad_norm": 5.852140493007803, + "learning_rate": 3.826638055484154e-06, + "loss": 0.423, + "step": 4215 + }, + { + "epoch": 0.34240233899131, + "grad_norm": 4.727427200553123, + "learning_rate": 3.826080617076917e-06, + "loss": 0.6244, + "step": 4216 + }, + { + "epoch": 0.3424835539673516, + "grad_norm": 8.080683087721372, + "learning_rate": 3.825523086911919e-06, + "loss": 0.4194, + "step": 4217 + }, + { + "epoch": 0.34256476894339316, + "grad_norm": 3.4664753816902834, + "learning_rate": 3.824965465027739e-06, + "loss": 0.5493, + "step": 4218 + }, + { + "epoch": 0.34264598391943474, + "grad_norm": 8.5527804211556, + "learning_rate": 3.824407751462962e-06, + "loss": 0.5962, + "step": 4219 + }, + { + "epoch": 0.3427271988954763, + "grad_norm": 9.91272395777568, + "learning_rate": 3.823849946256176e-06, + "loss": 0.5413, + "step": 4220 + }, + { + "epoch": 0.3428084138715179, + "grad_norm": 5.49826382214047, + "learning_rate": 3.82329204944598e-06, + "loss": 0.4414, + "step": 4221 + }, + { + "epoch": 0.34288962884755947, + "grad_norm": 5.466358398271932, + "learning_rate": 3.822734061070979e-06, + "loss": 0.653, + "step": 4222 + }, + { + "epoch": 0.34297084382360105, + "grad_norm": 11.56485219391147, + "learning_rate": 3.8221759811697814e-06, + "loss": 0.6647, + "step": 4223 + }, + { + "epoch": 0.3430520587996427, + "grad_norm": 17.302847263409163, + "learning_rate": 3.821617809781004e-06, + "loss": 0.5001, + "step": 4224 + }, + { + "epoch": 0.34313327377568426, + "grad_norm": 4.9393254820446, + "learning_rate": 3.821059546943268e-06, + "loss": 0.4429, + "step": 4225 + }, + { + "epoch": 0.34321448875172583, + "grad_norm": 4.737433746889213, + "learning_rate": 3.820501192695202e-06, + "loss": 0.6441, + "step": 4226 + }, + { + "epoch": 0.3432957037277674, + "grad_norm": 5.321040703740516, + "learning_rate": 3.819942747075443e-06, + "loss": 0.4669, + "step": 4227 + }, + { + "epoch": 0.343376918703809, + "grad_norm": 5.28796113251718, + "learning_rate": 3.819384210122631e-06, + "loss": 0.682, + "step": 4228 + }, + { + "epoch": 0.34345813367985056, + "grad_norm": 4.339078198914844, + "learning_rate": 3.818825581875415e-06, + "loss": 0.5883, + "step": 4229 + }, + { + "epoch": 0.34353934865589214, + "grad_norm": 4.128567458915762, + "learning_rate": 3.818266862372449e-06, + "loss": 0.5184, + "step": 4230 + }, + { + "epoch": 0.3436205636319337, + "grad_norm": 3.9206753012437146, + "learning_rate": 3.817708051652392e-06, + "loss": 0.6334, + "step": 4231 + }, + { + "epoch": 0.3437017786079753, + "grad_norm": 4.303153750803405, + "learning_rate": 3.817149149753912e-06, + "loss": 0.5891, + "step": 4232 + }, + { + "epoch": 0.34378299358401687, + "grad_norm": 4.474909831406013, + "learning_rate": 3.816590156715682e-06, + "loss": 0.5449, + "step": 4233 + }, + { + "epoch": 0.34386420856005845, + "grad_norm": 4.504556577904222, + "learning_rate": 3.81603107257638e-06, + "loss": 0.4061, + "step": 4234 + }, + { + "epoch": 0.3439454235361001, + "grad_norm": 2.939069265586592, + "learning_rate": 3.815471897374695e-06, + "loss": 0.5448, + "step": 4235 + }, + { + "epoch": 0.34402663851214166, + "grad_norm": 4.384216019391756, + "learning_rate": 3.814912631149315e-06, + "loss": 0.4506, + "step": 4236 + }, + { + "epoch": 0.34410785348818324, + "grad_norm": 5.19610868271772, + "learning_rate": 3.8143532739389403e-06, + "loss": 0.5923, + "step": 4237 + }, + { + "epoch": 0.3441890684642248, + "grad_norm": 4.2187938416631185, + "learning_rate": 3.813793825782276e-06, + "loss": 0.6411, + "step": 4238 + }, + { + "epoch": 0.3442702834402664, + "grad_norm": 5.412344609770045, + "learning_rate": 3.8132342867180318e-06, + "loss": 0.6972, + "step": 4239 + }, + { + "epoch": 0.34435149841630797, + "grad_norm": 3.8719949393667887, + "learning_rate": 3.812674656784924e-06, + "loss": 0.6241, + "step": 4240 + }, + { + "epoch": 0.34443271339234954, + "grad_norm": 6.272686879652281, + "learning_rate": 3.812114936021678e-06, + "loss": 0.4416, + "step": 4241 + }, + { + "epoch": 0.3445139283683911, + "grad_norm": 4.522533410945012, + "learning_rate": 3.811555124467023e-06, + "loss": 0.6361, + "step": 4242 + }, + { + "epoch": 0.3445951433444327, + "grad_norm": 3.7359506733942554, + "learning_rate": 3.8109952221596948e-06, + "loss": 0.5521, + "step": 4243 + }, + { + "epoch": 0.3446763583204743, + "grad_norm": 5.599008701288348, + "learning_rate": 3.810435229138435e-06, + "loss": 0.6105, + "step": 4244 + }, + { + "epoch": 0.34475757329651585, + "grad_norm": 4.77660925406526, + "learning_rate": 3.8098751454419925e-06, + "loss": 0.6491, + "step": 4245 + }, + { + "epoch": 0.3448387882725575, + "grad_norm": 5.070740770359748, + "learning_rate": 3.8093149711091227e-06, + "loss": 0.4824, + "step": 4246 + }, + { + "epoch": 0.34492000324859906, + "grad_norm": 5.29618525895374, + "learning_rate": 3.8087547061785864e-06, + "loss": 0.591, + "step": 4247 + }, + { + "epoch": 0.34500121822464064, + "grad_norm": 4.011238439114341, + "learning_rate": 3.8081943506891505e-06, + "loss": 0.6667, + "step": 4248 + }, + { + "epoch": 0.3450824332006822, + "grad_norm": 4.640224694054099, + "learning_rate": 3.8076339046795897e-06, + "loss": 0.5231, + "step": 4249 + }, + { + "epoch": 0.3451636481767238, + "grad_norm": 9.930057551543705, + "learning_rate": 3.807073368188683e-06, + "loss": 0.7279, + "step": 4250 + }, + { + "epoch": 0.34524486315276537, + "grad_norm": 10.280051980432813, + "learning_rate": 3.8065127412552172e-06, + "loss": 0.7463, + "step": 4251 + }, + { + "epoch": 0.34532607812880695, + "grad_norm": 4.260596124125055, + "learning_rate": 3.8059520239179836e-06, + "loss": 0.7628, + "step": 4252 + }, + { + "epoch": 0.3454072931048485, + "grad_norm": 6.511349904040633, + "learning_rate": 3.805391216215782e-06, + "loss": 0.4457, + "step": 4253 + }, + { + "epoch": 0.3454885080808901, + "grad_norm": 7.176157624874702, + "learning_rate": 3.8048303181874167e-06, + "loss": 0.4474, + "step": 4254 + }, + { + "epoch": 0.3455697230569317, + "grad_norm": 5.9798967177829345, + "learning_rate": 3.8042693298717e-06, + "loss": 0.7248, + "step": 4255 + }, + { + "epoch": 0.34565093803297325, + "grad_norm": 5.3870467284981, + "learning_rate": 3.8037082513074468e-06, + "loss": 0.4971, + "step": 4256 + }, + { + "epoch": 0.3457321530090149, + "grad_norm": 6.0361440421296, + "learning_rate": 3.8031470825334838e-06, + "loss": 0.5185, + "step": 4257 + }, + { + "epoch": 0.34581336798505646, + "grad_norm": 3.7390306125476243, + "learning_rate": 3.8025858235886394e-06, + "loss": 0.7618, + "step": 4258 + }, + { + "epoch": 0.34589458296109804, + "grad_norm": 6.423197360909206, + "learning_rate": 3.802024474511749e-06, + "loss": 0.5033, + "step": 4259 + }, + { + "epoch": 0.3459757979371396, + "grad_norm": 4.523265015480748, + "learning_rate": 3.801463035341656e-06, + "loss": 0.5696, + "step": 4260 + }, + { + "epoch": 0.3460570129131812, + "grad_norm": 6.030313585209124, + "learning_rate": 3.8009015061172095e-06, + "loss": 0.5074, + "step": 4261 + }, + { + "epoch": 0.34613822788922277, + "grad_norm": 9.4302885564273, + "learning_rate": 3.8003398868772635e-06, + "loss": 0.4557, + "step": 4262 + }, + { + "epoch": 0.34621944286526435, + "grad_norm": 7.919711884223888, + "learning_rate": 3.799778177660679e-06, + "loss": 0.5377, + "step": 4263 + }, + { + "epoch": 0.3463006578413059, + "grad_norm": 4.176073302897445, + "learning_rate": 3.7992163785063236e-06, + "loss": 0.5704, + "step": 4264 + }, + { + "epoch": 0.3463818728173475, + "grad_norm": 5.09213708548835, + "learning_rate": 3.798654489453071e-06, + "loss": 0.554, + "step": 4265 + }, + { + "epoch": 0.3464630877933891, + "grad_norm": 6.0436953163637295, + "learning_rate": 3.7980925105398004e-06, + "loss": 0.526, + "step": 4266 + }, + { + "epoch": 0.34654430276943066, + "grad_norm": 6.156030657781099, + "learning_rate": 3.7975304418053986e-06, + "loss": 0.4978, + "step": 4267 + }, + { + "epoch": 0.3466255177454723, + "grad_norm": 8.589190270382112, + "learning_rate": 3.796968283288758e-06, + "loss": 0.6419, + "step": 4268 + }, + { + "epoch": 0.34670673272151387, + "grad_norm": 3.828448877259962, + "learning_rate": 3.7964060350287747e-06, + "loss": 0.5272, + "step": 4269 + }, + { + "epoch": 0.34678794769755544, + "grad_norm": 4.335800260568463, + "learning_rate": 3.795843697064355e-06, + "loss": 0.5072, + "step": 4270 + }, + { + "epoch": 0.346869162673597, + "grad_norm": 4.112493113260704, + "learning_rate": 3.795281269434411e-06, + "loss": 0.5839, + "step": 4271 + }, + { + "epoch": 0.3469503776496386, + "grad_norm": 7.2088319680381225, + "learning_rate": 3.794718752177857e-06, + "loss": 0.7074, + "step": 4272 + }, + { + "epoch": 0.3470315926256802, + "grad_norm": 3.766153162278056, + "learning_rate": 3.7941561453336184e-06, + "loss": 0.5309, + "step": 4273 + }, + { + "epoch": 0.34711280760172175, + "grad_norm": 5.845126706777332, + "learning_rate": 3.7935934489406232e-06, + "loss": 0.3967, + "step": 4274 + }, + { + "epoch": 0.34719402257776333, + "grad_norm": 6.9520337995936785, + "learning_rate": 3.7930306630378085e-06, + "loss": 0.7975, + "step": 4275 + }, + { + "epoch": 0.3472752375538049, + "grad_norm": 4.890242996197673, + "learning_rate": 3.7924677876641147e-06, + "loss": 0.4203, + "step": 4276 + }, + { + "epoch": 0.3473564525298465, + "grad_norm": 3.8913871864084255, + "learning_rate": 3.79190482285849e-06, + "loss": 0.7376, + "step": 4277 + }, + { + "epoch": 0.34743766750588806, + "grad_norm": 3.544505934601897, + "learning_rate": 3.7913417686598886e-06, + "loss": 0.6519, + "step": 4278 + }, + { + "epoch": 0.3475188824819297, + "grad_norm": 9.771874636578227, + "learning_rate": 3.790778625107272e-06, + "loss": 0.5073, + "step": 4279 + }, + { + "epoch": 0.34760009745797127, + "grad_norm": 3.505229136041032, + "learning_rate": 3.790215392239606e-06, + "loss": 0.5759, + "step": 4280 + }, + { + "epoch": 0.34768131243401285, + "grad_norm": 5.243230166877009, + "learning_rate": 3.7896520700958616e-06, + "loss": 0.5125, + "step": 4281 + }, + { + "epoch": 0.3477625274100544, + "grad_norm": 5.947817581890729, + "learning_rate": 3.789088658715021e-06, + "loss": 0.5668, + "step": 4282 + }, + { + "epoch": 0.347843742386096, + "grad_norm": 4.83590576883994, + "learning_rate": 3.788525158136067e-06, + "loss": 0.5129, + "step": 4283 + }, + { + "epoch": 0.3479249573621376, + "grad_norm": 4.089901831736869, + "learning_rate": 3.787961568397992e-06, + "loss": 0.45, + "step": 4284 + }, + { + "epoch": 0.34800617233817915, + "grad_norm": 4.292363577189182, + "learning_rate": 3.787397889539792e-06, + "loss": 0.699, + "step": 4285 + }, + { + "epoch": 0.34808738731422073, + "grad_norm": 3.731168656658335, + "learning_rate": 3.786834121600472e-06, + "loss": 0.4771, + "step": 4286 + }, + { + "epoch": 0.3481686022902623, + "grad_norm": 6.663689172947077, + "learning_rate": 3.7862702646190415e-06, + "loss": 0.6777, + "step": 4287 + }, + { + "epoch": 0.3482498172663039, + "grad_norm": 7.125149701358761, + "learning_rate": 3.7857063186345156e-06, + "loss": 0.62, + "step": 4288 + }, + { + "epoch": 0.34833103224234546, + "grad_norm": 3.677264775633149, + "learning_rate": 3.7851422836859177e-06, + "loss": 0.541, + "step": 4289 + }, + { + "epoch": 0.3484122472183871, + "grad_norm": 7.094843934887101, + "learning_rate": 3.7845781598122743e-06, + "loss": 0.4561, + "step": 4290 + }, + { + "epoch": 0.34849346219442867, + "grad_norm": 4.886642342030592, + "learning_rate": 3.7840139470526215e-06, + "loss": 0.4937, + "step": 4291 + }, + { + "epoch": 0.34857467717047025, + "grad_norm": 5.62838070355612, + "learning_rate": 3.783449645445999e-06, + "loss": 0.513, + "step": 4292 + }, + { + "epoch": 0.3486558921465118, + "grad_norm": 4.6812249317048025, + "learning_rate": 3.782885255031453e-06, + "loss": 0.5147, + "step": 4293 + }, + { + "epoch": 0.3487371071225534, + "grad_norm": 6.938556921862774, + "learning_rate": 3.782320775848038e-06, + "loss": 0.3674, + "step": 4294 + }, + { + "epoch": 0.348818322098595, + "grad_norm": 6.723176214680427, + "learning_rate": 3.7817562079348114e-06, + "loss": 0.4626, + "step": 4295 + }, + { + "epoch": 0.34889953707463656, + "grad_norm": 5.553775061400944, + "learning_rate": 3.7811915513308382e-06, + "loss": 0.4768, + "step": 4296 + }, + { + "epoch": 0.34898075205067813, + "grad_norm": 5.884939622386394, + "learning_rate": 3.7806268060751914e-06, + "loss": 0.4368, + "step": 4297 + }, + { + "epoch": 0.3490619670267197, + "grad_norm": 6.5667330861720234, + "learning_rate": 3.7800619722069464e-06, + "loss": 0.444, + "step": 4298 + }, + { + "epoch": 0.3491431820027613, + "grad_norm": 6.393075158086913, + "learning_rate": 3.7794970497651877e-06, + "loss": 0.4569, + "step": 4299 + }, + { + "epoch": 0.34922439697880286, + "grad_norm": 7.173171177630671, + "learning_rate": 3.7789320387890056e-06, + "loss": 0.6791, + "step": 4300 + }, + { + "epoch": 0.3493056119548445, + "grad_norm": 4.165736108586291, + "learning_rate": 3.778366939317494e-06, + "loss": 0.5986, + "step": 4301 + }, + { + "epoch": 0.3493868269308861, + "grad_norm": 3.349799899482817, + "learning_rate": 3.777801751389757e-06, + "loss": 0.5417, + "step": 4302 + }, + { + "epoch": 0.34946804190692765, + "grad_norm": 5.017370500577692, + "learning_rate": 3.7772364750449002e-06, + "loss": 0.7214, + "step": 4303 + }, + { + "epoch": 0.34954925688296923, + "grad_norm": 6.384615890109909, + "learning_rate": 3.77667111032204e-06, + "loss": 0.4845, + "step": 4304 + }, + { + "epoch": 0.3496304718590108, + "grad_norm": 5.139067733395603, + "learning_rate": 3.776105657260295e-06, + "loss": 0.5029, + "step": 4305 + }, + { + "epoch": 0.3497116868350524, + "grad_norm": 30.243450603041254, + "learning_rate": 3.7755401158987926e-06, + "loss": 0.6012, + "step": 4306 + }, + { + "epoch": 0.34979290181109396, + "grad_norm": 8.484420160765751, + "learning_rate": 3.774974486276664e-06, + "loss": 0.4776, + "step": 4307 + }, + { + "epoch": 0.34987411678713554, + "grad_norm": 6.935408912143071, + "learning_rate": 3.77440876843305e-06, + "loss": 0.4172, + "step": 4308 + }, + { + "epoch": 0.3499553317631771, + "grad_norm": 4.024333397155922, + "learning_rate": 3.773842962407093e-06, + "loss": 0.7109, + "step": 4309 + }, + { + "epoch": 0.3500365467392187, + "grad_norm": 4.11100179002336, + "learning_rate": 3.773277068237945e-06, + "loss": 0.5926, + "step": 4310 + }, + { + "epoch": 0.35011776171526027, + "grad_norm": 4.947318220787382, + "learning_rate": 3.7727110859647627e-06, + "loss": 0.6069, + "step": 4311 + }, + { + "epoch": 0.3501989766913019, + "grad_norm": 3.6871411628254696, + "learning_rate": 3.772145015626709e-06, + "loss": 0.5277, + "step": 4312 + }, + { + "epoch": 0.3502801916673435, + "grad_norm": 10.305081134224809, + "learning_rate": 3.771578857262953e-06, + "loss": 0.4547, + "step": 4313 + }, + { + "epoch": 0.35036140664338505, + "grad_norm": 4.429330825416072, + "learning_rate": 3.771012610912669e-06, + "loss": 0.6503, + "step": 4314 + }, + { + "epoch": 0.35044262161942663, + "grad_norm": 8.456969721413778, + "learning_rate": 3.7704462766150396e-06, + "loss": 0.5715, + "step": 4315 + }, + { + "epoch": 0.3505238365954682, + "grad_norm": 5.543349480716538, + "learning_rate": 3.7698798544092525e-06, + "loss": 0.3989, + "step": 4316 + }, + { + "epoch": 0.3506050515715098, + "grad_norm": 4.0226607503884315, + "learning_rate": 3.7693133443344986e-06, + "loss": 0.7712, + "step": 4317 + }, + { + "epoch": 0.35068626654755136, + "grad_norm": 3.3414221408894798, + "learning_rate": 3.7687467464299797e-06, + "loss": 0.5677, + "step": 4318 + }, + { + "epoch": 0.35076748152359294, + "grad_norm": 3.3587069368493263, + "learning_rate": 3.7681800607349017e-06, + "loss": 0.5779, + "step": 4319 + }, + { + "epoch": 0.3508486964996345, + "grad_norm": 7.359687891817828, + "learning_rate": 3.767613287288474e-06, + "loss": 0.5286, + "step": 4320 + }, + { + "epoch": 0.3509299114756761, + "grad_norm": 4.5580897772433095, + "learning_rate": 3.767046426129917e-06, + "loss": 0.6514, + "step": 4321 + }, + { + "epoch": 0.35101112645171767, + "grad_norm": 6.868236388509833, + "learning_rate": 3.7664794772984515e-06, + "loss": 0.4803, + "step": 4322 + }, + { + "epoch": 0.3510923414277593, + "grad_norm": 4.0864068693665985, + "learning_rate": 3.7659124408333094e-06, + "loss": 0.4627, + "step": 4323 + }, + { + "epoch": 0.3511735564038009, + "grad_norm": 4.220792384857057, + "learning_rate": 3.7653453167737263e-06, + "loss": 0.6832, + "step": 4324 + }, + { + "epoch": 0.35125477137984246, + "grad_norm": 6.391965671992134, + "learning_rate": 3.7647781051589436e-06, + "loss": 0.7653, + "step": 4325 + }, + { + "epoch": 0.35133598635588403, + "grad_norm": 6.3932460863379905, + "learning_rate": 3.76421080602821e-06, + "loss": 0.4652, + "step": 4326 + }, + { + "epoch": 0.3514172013319256, + "grad_norm": 5.223581522055953, + "learning_rate": 3.76364341942078e-06, + "loss": 0.6826, + "step": 4327 + }, + { + "epoch": 0.3514984163079672, + "grad_norm": 6.233483510000385, + "learning_rate": 3.7630759453759123e-06, + "loss": 0.4624, + "step": 4328 + }, + { + "epoch": 0.35157963128400876, + "grad_norm": 4.989931764108342, + "learning_rate": 3.7625083839328747e-06, + "loss": 0.4736, + "step": 4329 + }, + { + "epoch": 0.35166084626005034, + "grad_norm": 3.2608228815714755, + "learning_rate": 3.7619407351309377e-06, + "loss": 0.4673, + "step": 4330 + }, + { + "epoch": 0.3517420612360919, + "grad_norm": 4.925326222217201, + "learning_rate": 3.761372999009381e-06, + "loss": 0.5807, + "step": 4331 + }, + { + "epoch": 0.3518232762121335, + "grad_norm": 3.6058579203829844, + "learning_rate": 3.7608051756074894e-06, + "loss": 0.4816, + "step": 4332 + }, + { + "epoch": 0.3519044911881751, + "grad_norm": 5.412076104516127, + "learning_rate": 3.7602372649645512e-06, + "loss": 0.6296, + "step": 4333 + }, + { + "epoch": 0.3519857061642167, + "grad_norm": 4.984953620322977, + "learning_rate": 3.759669267119864e-06, + "loss": 0.5238, + "step": 4334 + }, + { + "epoch": 0.3520669211402583, + "grad_norm": 3.348233179207022, + "learning_rate": 3.759101182112731e-06, + "loss": 0.6843, + "step": 4335 + }, + { + "epoch": 0.35214813611629986, + "grad_norm": 3.9680570346308204, + "learning_rate": 3.758533009982459e-06, + "loss": 0.4943, + "step": 4336 + }, + { + "epoch": 0.35222935109234144, + "grad_norm": 7.538460902230132, + "learning_rate": 3.7579647507683636e-06, + "loss": 0.4964, + "step": 4337 + }, + { + "epoch": 0.352310566068383, + "grad_norm": 4.840273083850319, + "learning_rate": 3.7573964045097655e-06, + "loss": 0.6186, + "step": 4338 + }, + { + "epoch": 0.3523917810444246, + "grad_norm": 4.652079301299005, + "learning_rate": 3.7568279712459908e-06, + "loss": 0.4687, + "step": 4339 + }, + { + "epoch": 0.35247299602046617, + "grad_norm": 4.542830944823784, + "learning_rate": 3.7562594510163718e-06, + "loss": 0.6544, + "step": 4340 + }, + { + "epoch": 0.35255421099650774, + "grad_norm": 5.562323687848427, + "learning_rate": 3.755690843860248e-06, + "loss": 0.598, + "step": 4341 + }, + { + "epoch": 0.3526354259725493, + "grad_norm": 3.056102850388366, + "learning_rate": 3.7551221498169633e-06, + "loss": 0.46, + "step": 4342 + }, + { + "epoch": 0.3527166409485909, + "grad_norm": 6.370494566318182, + "learning_rate": 3.7545533689258683e-06, + "loss": 0.5024, + "step": 4343 + }, + { + "epoch": 0.3527978559246325, + "grad_norm": 3.674196156661001, + "learning_rate": 3.75398450122632e-06, + "loss": 0.4952, + "step": 4344 + }, + { + "epoch": 0.3528790709006741, + "grad_norm": 5.618292271167161, + "learning_rate": 3.7534155467576805e-06, + "loss": 0.545, + "step": 4345 + }, + { + "epoch": 0.3529602858767157, + "grad_norm": 7.155057050102085, + "learning_rate": 3.7528465055593186e-06, + "loss": 0.7136, + "step": 4346 + }, + { + "epoch": 0.35304150085275726, + "grad_norm": 4.835019495981251, + "learning_rate": 3.75227737767061e-06, + "loss": 0.5152, + "step": 4347 + }, + { + "epoch": 0.35312271582879884, + "grad_norm": 6.80042426573597, + "learning_rate": 3.7517081631309336e-06, + "loss": 0.5799, + "step": 4348 + }, + { + "epoch": 0.3532039308048404, + "grad_norm": 7.706095450683844, + "learning_rate": 3.751138861979678e-06, + "loss": 0.5641, + "step": 4349 + }, + { + "epoch": 0.353285145780882, + "grad_norm": 8.667289733551144, + "learning_rate": 3.750569474256233e-06, + "loss": 0.5249, + "step": 4350 + }, + { + "epoch": 0.35336636075692357, + "grad_norm": 4.311680279675531, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.4826, + "step": 4351 + }, + { + "epoch": 0.35344757573296515, + "grad_norm": 5.148268090014955, + "learning_rate": 3.7494304392503826e-06, + "loss": 0.5173, + "step": 4352 + }, + { + "epoch": 0.3535287907090067, + "grad_norm": 5.2133873799522705, + "learning_rate": 3.7488607920467912e-06, + "loss": 0.7347, + "step": 4353 + }, + { + "epoch": 0.3536100056850483, + "grad_norm": 5.251842412363714, + "learning_rate": 3.7482910584286424e-06, + "loss": 0.4798, + "step": 4354 + }, + { + "epoch": 0.3536912206610899, + "grad_norm": 4.4703093132843, + "learning_rate": 3.747721238435359e-06, + "loss": 0.6538, + "step": 4355 + }, + { + "epoch": 0.3537724356371315, + "grad_norm": 4.814603055251707, + "learning_rate": 3.747151332106369e-06, + "loss": 0.487, + "step": 4356 + }, + { + "epoch": 0.3538536506131731, + "grad_norm": 5.542109813139826, + "learning_rate": 3.746581339481108e-06, + "loss": 0.5932, + "step": 4357 + }, + { + "epoch": 0.35393486558921466, + "grad_norm": 5.541416801300144, + "learning_rate": 3.746011260599015e-06, + "loss": 0.605, + "step": 4358 + }, + { + "epoch": 0.35401608056525624, + "grad_norm": 4.197242421152716, + "learning_rate": 3.7454410954995375e-06, + "loss": 0.4671, + "step": 4359 + }, + { + "epoch": 0.3540972955412978, + "grad_norm": 6.220911708551648, + "learning_rate": 3.7448708442221277e-06, + "loss": 0.5889, + "step": 4360 + }, + { + "epoch": 0.3541785105173394, + "grad_norm": 4.456428745711773, + "learning_rate": 3.744300506806243e-06, + "loss": 0.5731, + "step": 4361 + }, + { + "epoch": 0.354259725493381, + "grad_norm": 4.103784001652695, + "learning_rate": 3.7437300832913503e-06, + "loss": 0.4561, + "step": 4362 + }, + { + "epoch": 0.35434094046942255, + "grad_norm": 16.445100012740834, + "learning_rate": 3.743159573716917e-06, + "loss": 0.6277, + "step": 4363 + }, + { + "epoch": 0.3544221554454641, + "grad_norm": 4.616837308526195, + "learning_rate": 3.7425889781224204e-06, + "loss": 0.5266, + "step": 4364 + }, + { + "epoch": 0.3545033704215057, + "grad_norm": 8.670573299062688, + "learning_rate": 3.742018296547344e-06, + "loss": 0.5105, + "step": 4365 + }, + { + "epoch": 0.3545845853975473, + "grad_norm": 6.449318189960221, + "learning_rate": 3.741447529031173e-06, + "loss": 0.7134, + "step": 4366 + }, + { + "epoch": 0.3546658003735889, + "grad_norm": 4.211623063096266, + "learning_rate": 3.7408766756134046e-06, + "loss": 0.5255, + "step": 4367 + }, + { + "epoch": 0.3547470153496305, + "grad_norm": 4.907991529043284, + "learning_rate": 3.740305736333537e-06, + "loss": 0.6893, + "step": 4368 + }, + { + "epoch": 0.35482823032567207, + "grad_norm": 4.845747717269982, + "learning_rate": 3.7397347112310767e-06, + "loss": 0.6383, + "step": 4369 + }, + { + "epoch": 0.35490944530171364, + "grad_norm": 4.148877926596722, + "learning_rate": 3.7391636003455355e-06, + "loss": 0.4795, + "step": 4370 + }, + { + "epoch": 0.3549906602777552, + "grad_norm": 5.568518592335486, + "learning_rate": 3.7385924037164316e-06, + "loss": 0.5019, + "step": 4371 + }, + { + "epoch": 0.3550718752537968, + "grad_norm": 11.380910354185877, + "learning_rate": 3.7380211213832882e-06, + "loss": 0.4622, + "step": 4372 + }, + { + "epoch": 0.3551530902298384, + "grad_norm": 4.898875896209035, + "learning_rate": 3.737449753385636e-06, + "loss": 0.5913, + "step": 4373 + }, + { + "epoch": 0.35523430520587995, + "grad_norm": 3.7018730566328433, + "learning_rate": 3.7368782997630093e-06, + "loss": 0.7925, + "step": 4374 + }, + { + "epoch": 0.35531552018192153, + "grad_norm": 3.8066706472807565, + "learning_rate": 3.7363067605549515e-06, + "loss": 0.5056, + "step": 4375 + }, + { + "epoch": 0.3553967351579631, + "grad_norm": 2.709504723389777, + "learning_rate": 3.7357351358010075e-06, + "loss": 0.5972, + "step": 4376 + }, + { + "epoch": 0.3554779501340047, + "grad_norm": 4.824489245992463, + "learning_rate": 3.735163425540732e-06, + "loss": 0.6907, + "step": 4377 + }, + { + "epoch": 0.3555591651100463, + "grad_norm": 7.943391610774761, + "learning_rate": 3.734591629813686e-06, + "loss": 0.5539, + "step": 4378 + }, + { + "epoch": 0.3556403800860879, + "grad_norm": 4.381215594871206, + "learning_rate": 3.7340197486594315e-06, + "loss": 0.5431, + "step": 4379 + }, + { + "epoch": 0.35572159506212947, + "grad_norm": 4.946575395839248, + "learning_rate": 3.7334477821175424e-06, + "loss": 0.6081, + "step": 4380 + }, + { + "epoch": 0.35580281003817105, + "grad_norm": 4.52740218848989, + "learning_rate": 3.732875730227595e-06, + "loss": 0.4524, + "step": 4381 + }, + { + "epoch": 0.3558840250142126, + "grad_norm": 6.069550060126085, + "learning_rate": 3.7323035930291706e-06, + "loss": 0.5258, + "step": 4382 + }, + { + "epoch": 0.3559652399902542, + "grad_norm": 5.631666918507212, + "learning_rate": 3.731731370561861e-06, + "loss": 0.8438, + "step": 4383 + }, + { + "epoch": 0.3560464549662958, + "grad_norm": 8.095060386123581, + "learning_rate": 3.7311590628652584e-06, + "loss": 0.6436, + "step": 4384 + }, + { + "epoch": 0.35612766994233735, + "grad_norm": 5.278732076516886, + "learning_rate": 3.730586669978965e-06, + "loss": 0.5972, + "step": 4385 + }, + { + "epoch": 0.35620888491837893, + "grad_norm": 6.566634009856689, + "learning_rate": 3.7300141919425865e-06, + "loss": 0.4352, + "step": 4386 + }, + { + "epoch": 0.3562900998944205, + "grad_norm": 4.301691821909096, + "learning_rate": 3.729441628795736e-06, + "loss": 0.4451, + "step": 4387 + }, + { + "epoch": 0.3563713148704621, + "grad_norm": 3.538764460520504, + "learning_rate": 3.728868980578031e-06, + "loss": 0.4408, + "step": 4388 + }, + { + "epoch": 0.3564525298465037, + "grad_norm": 8.009167633892602, + "learning_rate": 3.7282962473290964e-06, + "loss": 0.6824, + "step": 4389 + }, + { + "epoch": 0.3565337448225453, + "grad_norm": 3.6954739187392414, + "learning_rate": 3.727723429088562e-06, + "loss": 0.6465, + "step": 4390 + }, + { + "epoch": 0.3566149597985869, + "grad_norm": 5.835592705497657, + "learning_rate": 3.7271505258960644e-06, + "loss": 0.3879, + "step": 4391 + }, + { + "epoch": 0.35669617477462845, + "grad_norm": 4.621090146943028, + "learning_rate": 3.726577537791245e-06, + "loss": 0.6456, + "step": 4392 + }, + { + "epoch": 0.35677738975067, + "grad_norm": 7.345583344323817, + "learning_rate": 3.726004464813752e-06, + "loss": 0.5299, + "step": 4393 + }, + { + "epoch": 0.3568586047267116, + "grad_norm": 5.887510819023688, + "learning_rate": 3.725431307003238e-06, + "loss": 0.4683, + "step": 4394 + }, + { + "epoch": 0.3569398197027532, + "grad_norm": 4.4086817353864785, + "learning_rate": 3.7248580643993625e-06, + "loss": 0.5324, + "step": 4395 + }, + { + "epoch": 0.35702103467879476, + "grad_norm": 5.875362001143908, + "learning_rate": 3.724284737041792e-06, + "loss": 0.5235, + "step": 4396 + }, + { + "epoch": 0.35710224965483633, + "grad_norm": 4.84990196629755, + "learning_rate": 3.723711324970197e-06, + "loss": 0.4776, + "step": 4397 + }, + { + "epoch": 0.3571834646308779, + "grad_norm": 4.446080183771416, + "learning_rate": 3.723137828224255e-06, + "loss": 0.5653, + "step": 4398 + }, + { + "epoch": 0.3572646796069195, + "grad_norm": 11.963154592877663, + "learning_rate": 3.722564246843648e-06, + "loss": 0.6452, + "step": 4399 + }, + { + "epoch": 0.3573458945829611, + "grad_norm": 5.038523322391007, + "learning_rate": 3.7219905808680663e-06, + "loss": 0.5563, + "step": 4400 + }, + { + "epoch": 0.3574271095590027, + "grad_norm": 5.775055041451304, + "learning_rate": 3.7214168303372033e-06, + "loss": 0.4416, + "step": 4401 + }, + { + "epoch": 0.3575083245350443, + "grad_norm": 13.159177334829804, + "learning_rate": 3.72084299529076e-06, + "loss": 0.6903, + "step": 4402 + }, + { + "epoch": 0.35758953951108585, + "grad_norm": 6.4873380102452325, + "learning_rate": 3.720269075768442e-06, + "loss": 0.5353, + "step": 4403 + }, + { + "epoch": 0.35767075448712743, + "grad_norm": 6.187953414055915, + "learning_rate": 3.7196950718099636e-06, + "loss": 0.5224, + "step": 4404 + }, + { + "epoch": 0.357751969463169, + "grad_norm": 6.813848598601859, + "learning_rate": 3.71912098345504e-06, + "loss": 0.4981, + "step": 4405 + }, + { + "epoch": 0.3578331844392106, + "grad_norm": 5.500592742588713, + "learning_rate": 3.7185468107433966e-06, + "loss": 0.6389, + "step": 4406 + }, + { + "epoch": 0.35791439941525216, + "grad_norm": 4.764003153750818, + "learning_rate": 3.7179725537147638e-06, + "loss": 0.6802, + "step": 4407 + }, + { + "epoch": 0.35799561439129374, + "grad_norm": 23.629219188415377, + "learning_rate": 3.717398212408875e-06, + "loss": 0.6526, + "step": 4408 + }, + { + "epoch": 0.3580768293673353, + "grad_norm": 6.635582406761952, + "learning_rate": 3.716823786865474e-06, + "loss": 0.4644, + "step": 4409 + }, + { + "epoch": 0.3581580443433769, + "grad_norm": 5.158135221215257, + "learning_rate": 3.7162492771243068e-06, + "loss": 0.5585, + "step": 4410 + }, + { + "epoch": 0.3582392593194185, + "grad_norm": 3.8038224782647925, + "learning_rate": 3.7156746832251266e-06, + "loss": 0.6006, + "step": 4411 + }, + { + "epoch": 0.3583204742954601, + "grad_norm": 3.935552912541937, + "learning_rate": 3.7151000052076913e-06, + "loss": 0.5972, + "step": 4412 + }, + { + "epoch": 0.3584016892715017, + "grad_norm": 5.2873448761373885, + "learning_rate": 3.7145252431117672e-06, + "loss": 0.5808, + "step": 4413 + }, + { + "epoch": 0.35848290424754325, + "grad_norm": 7.369629869130491, + "learning_rate": 3.713950396977124e-06, + "loss": 0.7058, + "step": 4414 + }, + { + "epoch": 0.35856411922358483, + "grad_norm": 5.201267760709762, + "learning_rate": 3.7133754668435377e-06, + "loss": 0.8023, + "step": 4415 + }, + { + "epoch": 0.3586453341996264, + "grad_norm": 3.1998164797424598, + "learning_rate": 3.7128004527507916e-06, + "loss": 0.596, + "step": 4416 + }, + { + "epoch": 0.358726549175668, + "grad_norm": 5.052174245450653, + "learning_rate": 3.712225354738672e-06, + "loss": 0.6137, + "step": 4417 + }, + { + "epoch": 0.35880776415170956, + "grad_norm": 4.804582850754989, + "learning_rate": 3.7116501728469746e-06, + "loss": 0.5863, + "step": 4418 + }, + { + "epoch": 0.35888897912775114, + "grad_norm": 4.754942942838345, + "learning_rate": 3.711074907115497e-06, + "loss": 0.5453, + "step": 4419 + }, + { + "epoch": 0.3589701941037927, + "grad_norm": 4.182528054308997, + "learning_rate": 3.710499557584045e-06, + "loss": 0.6604, + "step": 4420 + }, + { + "epoch": 0.3590514090798343, + "grad_norm": 5.430420823976959, + "learning_rate": 3.7099241242924306e-06, + "loss": 0.6099, + "step": 4421 + }, + { + "epoch": 0.3591326240558759, + "grad_norm": 6.298870190762428, + "learning_rate": 3.7093486072804696e-06, + "loss": 0.5275, + "step": 4422 + }, + { + "epoch": 0.3592138390319175, + "grad_norm": 4.826705123382431, + "learning_rate": 3.7087730065879862e-06, + "loss": 0.7317, + "step": 4423 + }, + { + "epoch": 0.3592950540079591, + "grad_norm": 13.171633048916624, + "learning_rate": 3.708197322254807e-06, + "loss": 0.6044, + "step": 4424 + }, + { + "epoch": 0.35937626898400066, + "grad_norm": 4.774839704277342, + "learning_rate": 3.7076215543207688e-06, + "loss": 0.6298, + "step": 4425 + }, + { + "epoch": 0.35945748396004223, + "grad_norm": 7.708019365223568, + "learning_rate": 3.7070457028257095e-06, + "loss": 0.4906, + "step": 4426 + }, + { + "epoch": 0.3595386989360838, + "grad_norm": 6.0983843136049805, + "learning_rate": 3.7064697678094765e-06, + "loss": 0.459, + "step": 4427 + }, + { + "epoch": 0.3596199139121254, + "grad_norm": 4.100219190551162, + "learning_rate": 3.7058937493119195e-06, + "loss": 0.553, + "step": 4428 + }, + { + "epoch": 0.35970112888816697, + "grad_norm": 5.689077880893839, + "learning_rate": 3.705317647372898e-06, + "loss": 0.4724, + "step": 4429 + }, + { + "epoch": 0.35978234386420854, + "grad_norm": 3.858145204321467, + "learning_rate": 3.704741462032274e-06, + "loss": 0.4586, + "step": 4430 + }, + { + "epoch": 0.3598635588402501, + "grad_norm": 11.983671864011523, + "learning_rate": 3.7041651933299167e-06, + "loss": 0.5293, + "step": 4431 + }, + { + "epoch": 0.3599447738162917, + "grad_norm": 5.870016078637099, + "learning_rate": 3.703588841305702e-06, + "loss": 0.6165, + "step": 4432 + }, + { + "epoch": 0.36002598879233333, + "grad_norm": 11.720283579984393, + "learning_rate": 3.7030124059995086e-06, + "loss": 0.5869, + "step": 4433 + }, + { + "epoch": 0.3601072037683749, + "grad_norm": 3.3466945140934072, + "learning_rate": 3.7024358874512235e-06, + "loss": 0.5978, + "step": 4434 + }, + { + "epoch": 0.3601884187444165, + "grad_norm": 6.264649818329499, + "learning_rate": 3.7018592857007386e-06, + "loss": 0.5366, + "step": 4435 + }, + { + "epoch": 0.36026963372045806, + "grad_norm": 5.505223118383528, + "learning_rate": 3.701282600787952e-06, + "loss": 0.5043, + "step": 4436 + }, + { + "epoch": 0.36035084869649964, + "grad_norm": 3.9050434398042793, + "learning_rate": 3.700705832752768e-06, + "loss": 0.4023, + "step": 4437 + }, + { + "epoch": 0.3604320636725412, + "grad_norm": 3.8486979131156818, + "learning_rate": 3.700128981635094e-06, + "loss": 0.7087, + "step": 4438 + }, + { + "epoch": 0.3605132786485828, + "grad_norm": 4.97236792052597, + "learning_rate": 3.6995520474748457e-06, + "loss": 0.676, + "step": 4439 + }, + { + "epoch": 0.36059449362462437, + "grad_norm": 5.54885373013609, + "learning_rate": 3.698975030311946e-06, + "loss": 0.4853, + "step": 4440 + }, + { + "epoch": 0.36067570860066595, + "grad_norm": 4.00154383333082, + "learning_rate": 3.6983979301863184e-06, + "loss": 0.6239, + "step": 4441 + }, + { + "epoch": 0.3607569235767075, + "grad_norm": 8.644737774833692, + "learning_rate": 3.6978207471378965e-06, + "loss": 0.648, + "step": 4442 + }, + { + "epoch": 0.3608381385527491, + "grad_norm": 4.192845183053263, + "learning_rate": 3.697243481206619e-06, + "loss": 0.5129, + "step": 4443 + }, + { + "epoch": 0.36091935352879073, + "grad_norm": 4.71407305163305, + "learning_rate": 3.6966661324324278e-06, + "loss": 0.6095, + "step": 4444 + }, + { + "epoch": 0.3610005685048323, + "grad_norm": 7.220930209286327, + "learning_rate": 3.6960887008552743e-06, + "loss": 0.4677, + "step": 4445 + }, + { + "epoch": 0.3610817834808739, + "grad_norm": 4.962625757719452, + "learning_rate": 3.6955111865151127e-06, + "loss": 0.5154, + "step": 4446 + }, + { + "epoch": 0.36116299845691546, + "grad_norm": 10.29073366584436, + "learning_rate": 3.6949335894519033e-06, + "loss": 0.5977, + "step": 4447 + }, + { + "epoch": 0.36124421343295704, + "grad_norm": 4.688756562689801, + "learning_rate": 3.6943559097056155e-06, + "loss": 0.4716, + "step": 4448 + }, + { + "epoch": 0.3613254284089986, + "grad_norm": 5.833182973807545, + "learning_rate": 3.6937781473162183e-06, + "loss": 0.6092, + "step": 4449 + }, + { + "epoch": 0.3614066433850402, + "grad_norm": 7.160318638424944, + "learning_rate": 3.6932003023236916e-06, + "loss": 0.6076, + "step": 4450 + }, + { + "epoch": 0.36148785836108177, + "grad_norm": 9.311074485235293, + "learning_rate": 3.692622374768019e-06, + "loss": 0.5193, + "step": 4451 + }, + { + "epoch": 0.36156907333712335, + "grad_norm": 12.351430012812038, + "learning_rate": 3.69204436468919e-06, + "loss": 0.4899, + "step": 4452 + }, + { + "epoch": 0.3616502883131649, + "grad_norm": 5.254888121313631, + "learning_rate": 3.6914662721272e-06, + "loss": 0.5991, + "step": 4453 + }, + { + "epoch": 0.36173150328920656, + "grad_norm": 4.86844411981734, + "learning_rate": 3.6908880971220494e-06, + "loss": 0.5256, + "step": 4454 + }, + { + "epoch": 0.36181271826524813, + "grad_norm": 11.036443384726775, + "learning_rate": 3.690309839713745e-06, + "loss": 0.504, + "step": 4455 + }, + { + "epoch": 0.3618939332412897, + "grad_norm": 3.81687688264698, + "learning_rate": 3.6897314999423e-06, + "loss": 0.4975, + "step": 4456 + }, + { + "epoch": 0.3619751482173313, + "grad_norm": 3.12299941091288, + "learning_rate": 3.6891530778477306e-06, + "loss": 0.5774, + "step": 4457 + }, + { + "epoch": 0.36205636319337287, + "grad_norm": 5.036876057872486, + "learning_rate": 3.6885745734700628e-06, + "loss": 0.4885, + "step": 4458 + }, + { + "epoch": 0.36213757816941444, + "grad_norm": 8.62327137178916, + "learning_rate": 3.687995986849325e-06, + "loss": 0.6869, + "step": 4459 + }, + { + "epoch": 0.362218793145456, + "grad_norm": 3.4708246203276443, + "learning_rate": 3.687417318025551e-06, + "loss": 0.5733, + "step": 4460 + }, + { + "epoch": 0.3623000081214976, + "grad_norm": 4.973444884464531, + "learning_rate": 3.686838567038784e-06, + "loss": 0.6222, + "step": 4461 + }, + { + "epoch": 0.3623812230975392, + "grad_norm": 5.667375423236751, + "learning_rate": 3.68625973392907e-06, + "loss": 0.5412, + "step": 4462 + }, + { + "epoch": 0.36246243807358075, + "grad_norm": 6.103470863477208, + "learning_rate": 3.6856808187364594e-06, + "loss": 0.5223, + "step": 4463 + }, + { + "epoch": 0.3625436530496223, + "grad_norm": 7.865435634910888, + "learning_rate": 3.685101821501012e-06, + "loss": 0.4664, + "step": 4464 + }, + { + "epoch": 0.36262486802566396, + "grad_norm": 4.7253648152592715, + "learning_rate": 3.6845227422627904e-06, + "loss": 0.3839, + "step": 4465 + }, + { + "epoch": 0.36270608300170554, + "grad_norm": 5.338042758405891, + "learning_rate": 3.683943581061864e-06, + "loss": 0.587, + "step": 4466 + }, + { + "epoch": 0.3627872979777471, + "grad_norm": 5.179836245143301, + "learning_rate": 3.683364337938308e-06, + "loss": 0.5633, + "step": 4467 + }, + { + "epoch": 0.3628685129537887, + "grad_norm": 6.353705665638407, + "learning_rate": 3.6827850129322017e-06, + "loss": 0.6154, + "step": 4468 + }, + { + "epoch": 0.36294972792983027, + "grad_norm": 4.584366349679948, + "learning_rate": 3.682205606083633e-06, + "loss": 0.4864, + "step": 4469 + }, + { + "epoch": 0.36303094290587185, + "grad_norm": 6.208825994136881, + "learning_rate": 3.681626117432693e-06, + "loss": 0.4577, + "step": 4470 + }, + { + "epoch": 0.3631121578819134, + "grad_norm": 6.366010507681265, + "learning_rate": 3.6810465470194796e-06, + "loss": 0.4515, + "step": 4471 + }, + { + "epoch": 0.363193372857955, + "grad_norm": 4.554722007028088, + "learning_rate": 3.680466894884096e-06, + "loss": 0.5824, + "step": 4472 + }, + { + "epoch": 0.3632745878339966, + "grad_norm": 14.16578555542569, + "learning_rate": 3.6798871610666497e-06, + "loss": 0.5687, + "step": 4473 + }, + { + "epoch": 0.36335580281003815, + "grad_norm": 6.940126197441687, + "learning_rate": 3.679307345607257e-06, + "loss": 0.5116, + "step": 4474 + }, + { + "epoch": 0.36343701778607973, + "grad_norm": 11.501609688473003, + "learning_rate": 3.6787274485460377e-06, + "loss": 0.5931, + "step": 4475 + }, + { + "epoch": 0.36351823276212136, + "grad_norm": 9.174827684495176, + "learning_rate": 3.678147469923117e-06, + "loss": 0.5515, + "step": 4476 + }, + { + "epoch": 0.36359944773816294, + "grad_norm": 4.650571061924677, + "learning_rate": 3.677567409778626e-06, + "loss": 0.4882, + "step": 4477 + }, + { + "epoch": 0.3636806627142045, + "grad_norm": 4.716688957959236, + "learning_rate": 3.6769872681527036e-06, + "loss": 0.5081, + "step": 4478 + }, + { + "epoch": 0.3637618776902461, + "grad_norm": 6.347553656163701, + "learning_rate": 3.6764070450854907e-06, + "loss": 0.4636, + "step": 4479 + }, + { + "epoch": 0.36384309266628767, + "grad_norm": 10.156227178827374, + "learning_rate": 3.675826740617136e-06, + "loss": 0.4511, + "step": 4480 + }, + { + "epoch": 0.36392430764232925, + "grad_norm": 3.559954335727195, + "learning_rate": 3.6752463547877946e-06, + "loss": 0.6027, + "step": 4481 + }, + { + "epoch": 0.3640055226183708, + "grad_norm": 4.72021021192598, + "learning_rate": 3.674665887637625e-06, + "loss": 0.5531, + "step": 4482 + }, + { + "epoch": 0.3640867375944124, + "grad_norm": 11.263712709683006, + "learning_rate": 3.6740853392067925e-06, + "loss": 0.609, + "step": 4483 + }, + { + "epoch": 0.364167952570454, + "grad_norm": 5.123832459796128, + "learning_rate": 3.6735047095354693e-06, + "loss": 0.458, + "step": 4484 + }, + { + "epoch": 0.36424916754649556, + "grad_norm": 4.265471041437096, + "learning_rate": 3.67292399866383e-06, + "loss": 0.8243, + "step": 4485 + }, + { + "epoch": 0.36433038252253713, + "grad_norm": 7.3136697189021405, + "learning_rate": 3.6723432066320575e-06, + "loss": 0.5279, + "step": 4486 + }, + { + "epoch": 0.36441159749857877, + "grad_norm": 3.4708908613272618, + "learning_rate": 3.67176233348034e-06, + "loss": 0.5274, + "step": 4487 + }, + { + "epoch": 0.36449281247462034, + "grad_norm": 3.2379003246205205, + "learning_rate": 3.6711813792488706e-06, + "loss": 0.4907, + "step": 4488 + }, + { + "epoch": 0.3645740274506619, + "grad_norm": 4.61818923987606, + "learning_rate": 3.6706003439778476e-06, + "loss": 0.5109, + "step": 4489 + }, + { + "epoch": 0.3646552424267035, + "grad_norm": 6.1238028357576555, + "learning_rate": 3.6700192277074766e-06, + "loss": 0.4814, + "step": 4490 + }, + { + "epoch": 0.3647364574027451, + "grad_norm": 5.108669074454857, + "learning_rate": 3.6694380304779676e-06, + "loss": 0.4514, + "step": 4491 + }, + { + "epoch": 0.36481767237878665, + "grad_norm": 4.555539206226715, + "learning_rate": 3.6688567523295356e-06, + "loss": 0.6225, + "step": 4492 + }, + { + "epoch": 0.3648988873548282, + "grad_norm": 4.947835075771073, + "learning_rate": 3.668275393302402e-06, + "loss": 0.526, + "step": 4493 + }, + { + "epoch": 0.3649801023308698, + "grad_norm": 4.987280419667079, + "learning_rate": 3.667693953436795e-06, + "loss": 0.5885, + "step": 4494 + }, + { + "epoch": 0.3650613173069114, + "grad_norm": 5.33019974406346, + "learning_rate": 3.6671124327729457e-06, + "loss": 0.5906, + "step": 4495 + }, + { + "epoch": 0.36514253228295296, + "grad_norm": 3.911550454532029, + "learning_rate": 3.6665308313510927e-06, + "loss": 0.4641, + "step": 4496 + }, + { + "epoch": 0.36522374725899454, + "grad_norm": 7.327451479070638, + "learning_rate": 3.665949149211481e-06, + "loss": 0.5346, + "step": 4497 + }, + { + "epoch": 0.36530496223503617, + "grad_norm": 6.679619506230374, + "learning_rate": 3.6653673863943584e-06, + "loss": 0.564, + "step": 4498 + }, + { + "epoch": 0.36538617721107775, + "grad_norm": 7.979525255569825, + "learning_rate": 3.6647855429399803e-06, + "loss": 0.5412, + "step": 4499 + }, + { + "epoch": 0.3654673921871193, + "grad_norm": 5.930675497170029, + "learning_rate": 3.6642036188886072e-06, + "loss": 0.5653, + "step": 4500 + }, + { + "epoch": 0.3655486071631609, + "grad_norm": 4.872227933745589, + "learning_rate": 3.663621614280505e-06, + "loss": 0.6143, + "step": 4501 + }, + { + "epoch": 0.3656298221392025, + "grad_norm": 4.942347208045127, + "learning_rate": 3.663039529155945e-06, + "loss": 0.5996, + "step": 4502 + }, + { + "epoch": 0.36571103711524405, + "grad_norm": 3.9093013103451084, + "learning_rate": 3.6624573635552056e-06, + "loss": 0.6432, + "step": 4503 + }, + { + "epoch": 0.36579225209128563, + "grad_norm": 6.258360673482065, + "learning_rate": 3.6618751175185687e-06, + "loss": 0.5461, + "step": 4504 + }, + { + "epoch": 0.3658734670673272, + "grad_norm": 6.512518797476682, + "learning_rate": 3.6612927910863235e-06, + "loss": 0.4864, + "step": 4505 + }, + { + "epoch": 0.3659546820433688, + "grad_norm": 4.622906130526112, + "learning_rate": 3.660710384298762e-06, + "loss": 0.6799, + "step": 4506 + }, + { + "epoch": 0.36603589701941036, + "grad_norm": 3.157312017939539, + "learning_rate": 3.6601278971961853e-06, + "loss": 0.6479, + "step": 4507 + }, + { + "epoch": 0.36611711199545194, + "grad_norm": 5.831757782243094, + "learning_rate": 3.659545329818898e-06, + "loss": 0.3765, + "step": 4508 + }, + { + "epoch": 0.36619832697149357, + "grad_norm": 4.871188343403577, + "learning_rate": 3.6589626822072105e-06, + "loss": 0.5067, + "step": 4509 + }, + { + "epoch": 0.36627954194753515, + "grad_norm": 3.3143048064685825, + "learning_rate": 3.6583799544014397e-06, + "loss": 0.7395, + "step": 4510 + }, + { + "epoch": 0.3663607569235767, + "grad_norm": 6.5927342002006535, + "learning_rate": 3.6577971464419064e-06, + "loss": 0.4955, + "step": 4511 + }, + { + "epoch": 0.3664419718996183, + "grad_norm": 5.678543552265716, + "learning_rate": 3.6572142583689372e-06, + "loss": 0.5946, + "step": 4512 + }, + { + "epoch": 0.3665231868756599, + "grad_norm": 9.978179156767617, + "learning_rate": 3.656631290222867e-06, + "loss": 0.6531, + "step": 4513 + }, + { + "epoch": 0.36660440185170146, + "grad_norm": 4.4736754108351215, + "learning_rate": 3.656048242044033e-06, + "loss": 0.545, + "step": 4514 + }, + { + "epoch": 0.36668561682774303, + "grad_norm": 3.069124245403996, + "learning_rate": 3.655465113872779e-06, + "loss": 0.4614, + "step": 4515 + }, + { + "epoch": 0.3667668318037846, + "grad_norm": 5.78003187791633, + "learning_rate": 3.6548819057494533e-06, + "loss": 0.5642, + "step": 4516 + }, + { + "epoch": 0.3668480467798262, + "grad_norm": 6.651983645474769, + "learning_rate": 3.6542986177144124e-06, + "loss": 0.726, + "step": 4517 + }, + { + "epoch": 0.36692926175586776, + "grad_norm": 5.61902839950427, + "learning_rate": 3.6537152498080165e-06, + "loss": 0.5679, + "step": 4518 + }, + { + "epoch": 0.36701047673190934, + "grad_norm": 3.6610484670477597, + "learning_rate": 3.653131802070631e-06, + "loss": 0.6392, + "step": 4519 + }, + { + "epoch": 0.367091691707951, + "grad_norm": 4.744064286312601, + "learning_rate": 3.6525482745426277e-06, + "loss": 0.5744, + "step": 4520 + }, + { + "epoch": 0.36717290668399255, + "grad_norm": 5.713260181456036, + "learning_rate": 3.6519646672643837e-06, + "loss": 0.647, + "step": 4521 + }, + { + "epoch": 0.3672541216600341, + "grad_norm": 4.789967162852778, + "learning_rate": 3.6513809802762805e-06, + "loss": 0.5129, + "step": 4522 + }, + { + "epoch": 0.3673353366360757, + "grad_norm": 5.3194037700176215, + "learning_rate": 3.6507972136187082e-06, + "loss": 0.53, + "step": 4523 + }, + { + "epoch": 0.3674165516121173, + "grad_norm": 4.077739067704118, + "learning_rate": 3.650213367332059e-06, + "loss": 0.4182, + "step": 4524 + }, + { + "epoch": 0.36749776658815886, + "grad_norm": 5.963767765697606, + "learning_rate": 3.6496294414567313e-06, + "loss": 0.5525, + "step": 4525 + }, + { + "epoch": 0.36757898156420044, + "grad_norm": 16.366327402411947, + "learning_rate": 3.649045436033132e-06, + "loss": 0.6207, + "step": 4526 + }, + { + "epoch": 0.367660196540242, + "grad_norm": 4.600677095064222, + "learning_rate": 3.6484613511016693e-06, + "loss": 0.5597, + "step": 4527 + }, + { + "epoch": 0.3677414115162836, + "grad_norm": 4.635569455138606, + "learning_rate": 3.6478771867027585e-06, + "loss": 0.5274, + "step": 4528 + }, + { + "epoch": 0.36782262649232517, + "grad_norm": 13.8186155722183, + "learning_rate": 3.647292942876822e-06, + "loss": 0.3283, + "step": 4529 + }, + { + "epoch": 0.36790384146836674, + "grad_norm": 4.921988696473463, + "learning_rate": 3.646708619664286e-06, + "loss": 0.666, + "step": 4530 + }, + { + "epoch": 0.3679850564444084, + "grad_norm": 5.125112781896423, + "learning_rate": 3.646124217105582e-06, + "loss": 0.504, + "step": 4531 + }, + { + "epoch": 0.36806627142044995, + "grad_norm": 4.160106735681602, + "learning_rate": 3.645539735241148e-06, + "loss": 0.4889, + "step": 4532 + }, + { + "epoch": 0.36814748639649153, + "grad_norm": 11.018187691856639, + "learning_rate": 3.6449551741114277e-06, + "loss": 0.5853, + "step": 4533 + }, + { + "epoch": 0.3682287013725331, + "grad_norm": 6.301041751261872, + "learning_rate": 3.6443705337568683e-06, + "loss": 0.5506, + "step": 4534 + }, + { + "epoch": 0.3683099163485747, + "grad_norm": 2.9536104737688635, + "learning_rate": 3.643785814217924e-06, + "loss": 0.4832, + "step": 4535 + }, + { + "epoch": 0.36839113132461626, + "grad_norm": 6.237830789303934, + "learning_rate": 3.6432010155350556e-06, + "loss": 0.5408, + "step": 4536 + }, + { + "epoch": 0.36847234630065784, + "grad_norm": 5.326352917032812, + "learning_rate": 3.642616137748727e-06, + "loss": 0.5208, + "step": 4537 + }, + { + "epoch": 0.3685535612766994, + "grad_norm": 6.248306452774462, + "learning_rate": 3.6420311808994084e-06, + "loss": 0.5739, + "step": 4538 + }, + { + "epoch": 0.368634776252741, + "grad_norm": 4.147693647179564, + "learning_rate": 3.641446145027577e-06, + "loss": 0.6192, + "step": 4539 + }, + { + "epoch": 0.36871599122878257, + "grad_norm": 6.469141494304793, + "learning_rate": 3.640861030173713e-06, + "loss": 0.5873, + "step": 4540 + }, + { + "epoch": 0.36879720620482415, + "grad_norm": 7.33657703163369, + "learning_rate": 3.6402758363783037e-06, + "loss": 0.6911, + "step": 4541 + }, + { + "epoch": 0.3688784211808658, + "grad_norm": 5.919960884683257, + "learning_rate": 3.639690563681841e-06, + "loss": 0.5873, + "step": 4542 + }, + { + "epoch": 0.36895963615690736, + "grad_norm": 18.615597531360145, + "learning_rate": 3.6391052121248233e-06, + "loss": 0.5995, + "step": 4543 + }, + { + "epoch": 0.36904085113294893, + "grad_norm": 7.146041859144082, + "learning_rate": 3.6385197817477535e-06, + "loss": 0.4588, + "step": 4544 + }, + { + "epoch": 0.3691220661089905, + "grad_norm": 4.196856311260542, + "learning_rate": 3.6379342725911402e-06, + "loss": 0.4738, + "step": 4545 + }, + { + "epoch": 0.3692032810850321, + "grad_norm": 4.917160222161063, + "learning_rate": 3.637348684695498e-06, + "loss": 0.6132, + "step": 4546 + }, + { + "epoch": 0.36928449606107366, + "grad_norm": 3.5260428355619218, + "learning_rate": 3.6367630181013457e-06, + "loss": 0.3245, + "step": 4547 + }, + { + "epoch": 0.36936571103711524, + "grad_norm": 5.486577536708969, + "learning_rate": 3.6361772728492096e-06, + "loss": 0.6407, + "step": 4548 + }, + { + "epoch": 0.3694469260131568, + "grad_norm": 5.527101563539024, + "learning_rate": 3.6355914489796185e-06, + "loss": 0.645, + "step": 4549 + }, + { + "epoch": 0.3695281409891984, + "grad_norm": 5.8535848869679405, + "learning_rate": 3.6350055465331098e-06, + "loss": 0.5124, + "step": 4550 + }, + { + "epoch": 0.36960935596523997, + "grad_norm": 3.0247220854088592, + "learning_rate": 3.6344195655502233e-06, + "loss": 0.5123, + "step": 4551 + }, + { + "epoch": 0.36969057094128155, + "grad_norm": 5.325222924482676, + "learning_rate": 3.633833506071508e-06, + "loss": 0.3802, + "step": 4552 + }, + { + "epoch": 0.3697717859173232, + "grad_norm": 6.707191666111479, + "learning_rate": 3.6332473681375146e-06, + "loss": 0.4215, + "step": 4553 + }, + { + "epoch": 0.36985300089336476, + "grad_norm": 6.3473511974694885, + "learning_rate": 3.6326611517888e-06, + "loss": 0.5247, + "step": 4554 + }, + { + "epoch": 0.36993421586940634, + "grad_norm": 4.62624452294825, + "learning_rate": 3.632074857065928e-06, + "loss": 0.4702, + "step": 4555 + }, + { + "epoch": 0.3700154308454479, + "grad_norm": 3.421641642355826, + "learning_rate": 3.631488484009469e-06, + "loss": 0.5228, + "step": 4556 + }, + { + "epoch": 0.3700966458214895, + "grad_norm": 6.538600958475513, + "learning_rate": 3.630902032659994e-06, + "loss": 0.5604, + "step": 4557 + }, + { + "epoch": 0.37017786079753107, + "grad_norm": 6.779583853613752, + "learning_rate": 3.6303155030580834e-06, + "loss": 0.5324, + "step": 4558 + }, + { + "epoch": 0.37025907577357264, + "grad_norm": 4.786962125388907, + "learning_rate": 3.629728895244323e-06, + "loss": 0.4619, + "step": 4559 + }, + { + "epoch": 0.3703402907496142, + "grad_norm": 5.970025695408968, + "learning_rate": 3.6291422092593016e-06, + "loss": 0.5009, + "step": 4560 + }, + { + "epoch": 0.3704215057256558, + "grad_norm": 5.054283857837389, + "learning_rate": 3.628555445143615e-06, + "loss": 0.5614, + "step": 4561 + }, + { + "epoch": 0.3705027207016974, + "grad_norm": 4.038161358633893, + "learning_rate": 3.6279686029378646e-06, + "loss": 0.5086, + "step": 4562 + }, + { + "epoch": 0.37058393567773895, + "grad_norm": 4.169489051463636, + "learning_rate": 3.6273816826826565e-06, + "loss": 0.4921, + "step": 4563 + }, + { + "epoch": 0.3706651506537806, + "grad_norm": 6.938462515442908, + "learning_rate": 3.6267946844186023e-06, + "loss": 0.5351, + "step": 4564 + }, + { + "epoch": 0.37074636562982216, + "grad_norm": 4.177658193519644, + "learning_rate": 3.6262076081863195e-06, + "loss": 0.4798, + "step": 4565 + }, + { + "epoch": 0.37082758060586374, + "grad_norm": 3.350413459828565, + "learning_rate": 3.625620454026431e-06, + "loss": 0.468, + "step": 4566 + }, + { + "epoch": 0.3709087955819053, + "grad_norm": 5.601772436876186, + "learning_rate": 3.625033221979564e-06, + "loss": 0.5411, + "step": 4567 + }, + { + "epoch": 0.3709900105579469, + "grad_norm": 5.46442194527407, + "learning_rate": 3.624445912086352e-06, + "loss": 0.6332, + "step": 4568 + }, + { + "epoch": 0.37107122553398847, + "grad_norm": 5.8554252343208, + "learning_rate": 3.6238585243874346e-06, + "loss": 0.6209, + "step": 4569 + }, + { + "epoch": 0.37115244051003005, + "grad_norm": 6.38111885179741, + "learning_rate": 3.6232710589234556e-06, + "loss": 0.5771, + "step": 4570 + }, + { + "epoch": 0.3712336554860716, + "grad_norm": 4.123032307940846, + "learning_rate": 3.6226835157350625e-06, + "loss": 0.6513, + "step": 4571 + }, + { + "epoch": 0.3713148704621132, + "grad_norm": 5.925335677806806, + "learning_rate": 3.6220958948629137e-06, + "loss": 0.6454, + "step": 4572 + }, + { + "epoch": 0.3713960854381548, + "grad_norm": 4.123001819171552, + "learning_rate": 3.621508196347667e-06, + "loss": 0.5576, + "step": 4573 + }, + { + "epoch": 0.37147730041419635, + "grad_norm": 6.967825569297834, + "learning_rate": 3.6209204202299875e-06, + "loss": 0.6256, + "step": 4574 + }, + { + "epoch": 0.371558515390238, + "grad_norm": 6.347779650753475, + "learning_rate": 3.6203325665505486e-06, + "loss": 0.632, + "step": 4575 + }, + { + "epoch": 0.37163973036627956, + "grad_norm": 3.8118384188679943, + "learning_rate": 3.619744635350025e-06, + "loss": 0.5919, + "step": 4576 + }, + { + "epoch": 0.37172094534232114, + "grad_norm": 7.993997703219614, + "learning_rate": 3.619156626669098e-06, + "loss": 0.6338, + "step": 4577 + }, + { + "epoch": 0.3718021603183627, + "grad_norm": 4.85057042384623, + "learning_rate": 3.6185685405484566e-06, + "loss": 0.5769, + "step": 4578 + }, + { + "epoch": 0.3718833752944043, + "grad_norm": 4.65594156616089, + "learning_rate": 3.6179803770287913e-06, + "loss": 0.4109, + "step": 4579 + }, + { + "epoch": 0.37196459027044587, + "grad_norm": 4.986423129791739, + "learning_rate": 3.6173921361508012e-06, + "loss": 0.3943, + "step": 4580 + }, + { + "epoch": 0.37204580524648745, + "grad_norm": 6.284947322407295, + "learning_rate": 3.616803817955189e-06, + "loss": 0.5428, + "step": 4581 + }, + { + "epoch": 0.372127020222529, + "grad_norm": 5.691836227913848, + "learning_rate": 3.6162154224826627e-06, + "loss": 0.5465, + "step": 4582 + }, + { + "epoch": 0.3722082351985706, + "grad_norm": 8.95784788971493, + "learning_rate": 3.615626949773937e-06, + "loss": 0.4766, + "step": 4583 + }, + { + "epoch": 0.3722894501746122, + "grad_norm": 4.7473375463729495, + "learning_rate": 3.6150383998697315e-06, + "loss": 0.5862, + "step": 4584 + }, + { + "epoch": 0.37237066515065376, + "grad_norm": 5.4505240290945585, + "learning_rate": 3.614449772810769e-06, + "loss": 0.4044, + "step": 4585 + }, + { + "epoch": 0.3724518801266954, + "grad_norm": 6.977930137629381, + "learning_rate": 3.613861068637781e-06, + "loss": 0.4798, + "step": 4586 + }, + { + "epoch": 0.37253309510273697, + "grad_norm": 6.5403690266388335, + "learning_rate": 3.6132722873915017e-06, + "loss": 0.6262, + "step": 4587 + }, + { + "epoch": 0.37261431007877854, + "grad_norm": 6.834395602891912, + "learning_rate": 3.6126834291126724e-06, + "loss": 0.4946, + "step": 4588 + }, + { + "epoch": 0.3726955250548201, + "grad_norm": 3.6009773174215054, + "learning_rate": 3.6120944938420384e-06, + "loss": 0.552, + "step": 4589 + }, + { + "epoch": 0.3727767400308617, + "grad_norm": 6.184936256815004, + "learning_rate": 3.6115054816203504e-06, + "loss": 0.4826, + "step": 4590 + }, + { + "epoch": 0.3728579550069033, + "grad_norm": 4.521800781478745, + "learning_rate": 3.6109163924883668e-06, + "loss": 0.595, + "step": 4591 + }, + { + "epoch": 0.37293916998294485, + "grad_norm": 5.129658876888231, + "learning_rate": 3.6103272264868473e-06, + "loss": 0.5459, + "step": 4592 + }, + { + "epoch": 0.37302038495898643, + "grad_norm": 3.955425189400771, + "learning_rate": 3.6097379836565604e-06, + "loss": 0.7445, + "step": 4593 + }, + { + "epoch": 0.373101599935028, + "grad_norm": 4.651648963857242, + "learning_rate": 3.6091486640382785e-06, + "loss": 0.5907, + "step": 4594 + }, + { + "epoch": 0.3731828149110696, + "grad_norm": 3.3474693180043045, + "learning_rate": 3.6085592676727786e-06, + "loss": 0.332, + "step": 4595 + }, + { + "epoch": 0.37326402988711116, + "grad_norm": 6.634033909104456, + "learning_rate": 3.6079697946008453e-06, + "loss": 0.4753, + "step": 4596 + }, + { + "epoch": 0.3733452448631528, + "grad_norm": 5.83533093603698, + "learning_rate": 3.607380244863265e-06, + "loss": 0.5514, + "step": 4597 + }, + { + "epoch": 0.37342645983919437, + "grad_norm": 4.575836973417688, + "learning_rate": 3.6067906185008328e-06, + "loss": 0.3765, + "step": 4598 + }, + { + "epoch": 0.37350767481523595, + "grad_norm": 4.848076544705409, + "learning_rate": 3.6062009155543483e-06, + "loss": 0.5876, + "step": 4599 + }, + { + "epoch": 0.3735888897912775, + "grad_norm": 5.836658897942606, + "learning_rate": 3.6056111360646134e-06, + "loss": 0.4459, + "step": 4600 + }, + { + "epoch": 0.3736701047673191, + "grad_norm": 3.9899151337710306, + "learning_rate": 3.6050212800724403e-06, + "loss": 0.4275, + "step": 4601 + }, + { + "epoch": 0.3737513197433607, + "grad_norm": 4.289228152113736, + "learning_rate": 3.6044313476186433e-06, + "loss": 0.5786, + "step": 4602 + }, + { + "epoch": 0.37383253471940225, + "grad_norm": 5.434462312775084, + "learning_rate": 3.603841338744041e-06, + "loss": 0.4954, + "step": 4603 + }, + { + "epoch": 0.37391374969544383, + "grad_norm": 3.5351327522854423, + "learning_rate": 3.6032512534894597e-06, + "loss": 0.6879, + "step": 4604 + }, + { + "epoch": 0.3739949646714854, + "grad_norm": 5.549918211724535, + "learning_rate": 3.602661091895732e-06, + "loss": 0.43, + "step": 4605 + }, + { + "epoch": 0.374076179647527, + "grad_norm": 4.083602872009912, + "learning_rate": 3.602070854003692e-06, + "loss": 0.5157, + "step": 4606 + }, + { + "epoch": 0.37415739462356856, + "grad_norm": 7.5939379592765786, + "learning_rate": 3.6014805398541815e-06, + "loss": 0.5669, + "step": 4607 + }, + { + "epoch": 0.3742386095996102, + "grad_norm": 3.8544939676204355, + "learning_rate": 3.6008901494880467e-06, + "loss": 0.4815, + "step": 4608 + }, + { + "epoch": 0.37431982457565177, + "grad_norm": 6.619794981662382, + "learning_rate": 3.60029968294614e-06, + "loss": 0.8072, + "step": 4609 + }, + { + "epoch": 0.37440103955169335, + "grad_norm": 9.067352524497199, + "learning_rate": 3.599709140269319e-06, + "loss": 0.409, + "step": 4610 + }, + { + "epoch": 0.3744822545277349, + "grad_norm": 4.980681642925656, + "learning_rate": 3.599118521498445e-06, + "loss": 0.6207, + "step": 4611 + }, + { + "epoch": 0.3745634695037765, + "grad_norm": 5.8795928027879585, + "learning_rate": 3.598527826674387e-06, + "loss": 0.4077, + "step": 4612 + }, + { + "epoch": 0.3746446844798181, + "grad_norm": 6.664130906786667, + "learning_rate": 3.597937055838017e-06, + "loss": 0.4889, + "step": 4613 + }, + { + "epoch": 0.37472589945585966, + "grad_norm": 3.512866216429364, + "learning_rate": 3.5973462090302137e-06, + "loss": 0.556, + "step": 4614 + }, + { + "epoch": 0.37480711443190123, + "grad_norm": 7.915343003804626, + "learning_rate": 3.5967552862918603e-06, + "loss": 0.5955, + "step": 4615 + }, + { + "epoch": 0.3748883294079428, + "grad_norm": 5.142753619848735, + "learning_rate": 3.596164287663845e-06, + "loss": 0.4933, + "step": 4616 + }, + { + "epoch": 0.3749695443839844, + "grad_norm": 5.185440275795067, + "learning_rate": 3.5955732131870626e-06, + "loss": 0.9477, + "step": 4617 + }, + { + "epoch": 0.37505075936002596, + "grad_norm": 12.990297623448031, + "learning_rate": 3.594982062902412e-06, + "loss": 0.6607, + "step": 4618 + }, + { + "epoch": 0.3751319743360676, + "grad_norm": 4.182193438396625, + "learning_rate": 3.5943908368507985e-06, + "loss": 0.6018, + "step": 4619 + }, + { + "epoch": 0.3752131893121092, + "grad_norm": 3.8071948876468733, + "learning_rate": 3.59379953507313e-06, + "loss": 0.5165, + "step": 4620 + }, + { + "epoch": 0.37529440428815075, + "grad_norm": 4.47056756713465, + "learning_rate": 3.593208157610324e-06, + "loss": 0.4829, + "step": 4621 + }, + { + "epoch": 0.37537561926419233, + "grad_norm": 4.861658029251169, + "learning_rate": 3.592616704503298e-06, + "loss": 0.5083, + "step": 4622 + }, + { + "epoch": 0.3754568342402339, + "grad_norm": 4.194610134145657, + "learning_rate": 3.5920251757929787e-06, + "loss": 0.5095, + "step": 4623 + }, + { + "epoch": 0.3755380492162755, + "grad_norm": 6.791531269079618, + "learning_rate": 3.5914335715202976e-06, + "loss": 0.4922, + "step": 4624 + }, + { + "epoch": 0.37561926419231706, + "grad_norm": 5.2361522290402185, + "learning_rate": 3.590841891726189e-06, + "loss": 0.6261, + "step": 4625 + }, + { + "epoch": 0.37570047916835864, + "grad_norm": 10.082840154908705, + "learning_rate": 3.5902501364515945e-06, + "loss": 0.5765, + "step": 4626 + }, + { + "epoch": 0.3757816941444002, + "grad_norm": 6.544537456866241, + "learning_rate": 3.5896583057374607e-06, + "loss": 0.3993, + "step": 4627 + }, + { + "epoch": 0.3758629091204418, + "grad_norm": 4.601861451010068, + "learning_rate": 3.589066399624739e-06, + "loss": 0.6202, + "step": 4628 + }, + { + "epoch": 0.37594412409648337, + "grad_norm": 4.584192839631751, + "learning_rate": 3.5884744181543868e-06, + "loss": 0.5738, + "step": 4629 + }, + { + "epoch": 0.376025339072525, + "grad_norm": 3.856688627858152, + "learning_rate": 3.5878823613673652e-06, + "loss": 0.4293, + "step": 4630 + }, + { + "epoch": 0.3761065540485666, + "grad_norm": 3.3977632115418106, + "learning_rate": 3.5872902293046417e-06, + "loss": 0.4848, + "step": 4631 + }, + { + "epoch": 0.37618776902460815, + "grad_norm": 5.9959616576129395, + "learning_rate": 3.586698022007189e-06, + "loss": 0.554, + "step": 4632 + }, + { + "epoch": 0.37626898400064973, + "grad_norm": 4.593599550308795, + "learning_rate": 3.5861057395159837e-06, + "loss": 0.494, + "step": 4633 + }, + { + "epoch": 0.3763501989766913, + "grad_norm": 3.9971449166055644, + "learning_rate": 3.5855133818720106e-06, + "loss": 0.3877, + "step": 4634 + }, + { + "epoch": 0.3764314139527329, + "grad_norm": 10.899762207965088, + "learning_rate": 3.5849209491162555e-06, + "loss": 0.4489, + "step": 4635 + }, + { + "epoch": 0.37651262892877446, + "grad_norm": 3.0450268530430598, + "learning_rate": 3.5843284412897127e-06, + "loss": 0.6891, + "step": 4636 + }, + { + "epoch": 0.37659384390481604, + "grad_norm": 5.435757415343299, + "learning_rate": 3.5837358584333814e-06, + "loss": 0.6111, + "step": 4637 + }, + { + "epoch": 0.3766750588808576, + "grad_norm": 4.283255422002926, + "learning_rate": 3.583143200588263e-06, + "loss": 0.4667, + "step": 4638 + }, + { + "epoch": 0.3767562738568992, + "grad_norm": 2.6058528884262553, + "learning_rate": 3.5825504677953684e-06, + "loss": 0.5796, + "step": 4639 + }, + { + "epoch": 0.37683748883294077, + "grad_norm": 5.535677617104462, + "learning_rate": 3.581957660095711e-06, + "loss": 0.579, + "step": 4640 + }, + { + "epoch": 0.3769187038089824, + "grad_norm": 7.993394293530673, + "learning_rate": 3.5813647775303084e-06, + "loss": 0.531, + "step": 4641 + }, + { + "epoch": 0.376999918785024, + "grad_norm": 6.421520761195438, + "learning_rate": 3.580771820140187e-06, + "loss": 0.5836, + "step": 4642 + }, + { + "epoch": 0.37708113376106556, + "grad_norm": 4.4706924859775174, + "learning_rate": 3.580178787966376e-06, + "loss": 0.6717, + "step": 4643 + }, + { + "epoch": 0.37716234873710713, + "grad_norm": 4.900856116172455, + "learning_rate": 3.5795856810499085e-06, + "loss": 0.5729, + "step": 4644 + }, + { + "epoch": 0.3772435637131487, + "grad_norm": 3.182583890174541, + "learning_rate": 3.5789924994318267e-06, + "loss": 0.6078, + "step": 4645 + }, + { + "epoch": 0.3773247786891903, + "grad_norm": 3.865947434731064, + "learning_rate": 3.578399243153174e-06, + "loss": 0.6606, + "step": 4646 + }, + { + "epoch": 0.37740599366523186, + "grad_norm": 3.349782633519335, + "learning_rate": 3.5778059122550007e-06, + "loss": 0.544, + "step": 4647 + }, + { + "epoch": 0.37748720864127344, + "grad_norm": 4.885217069177799, + "learning_rate": 3.5772125067783624e-06, + "loss": 0.5011, + "step": 4648 + }, + { + "epoch": 0.377568423617315, + "grad_norm": 4.590378597766298, + "learning_rate": 3.57661902676432e-06, + "loss": 0.5884, + "step": 4649 + }, + { + "epoch": 0.3776496385933566, + "grad_norm": 4.527590654884785, + "learning_rate": 3.576025472253939e-06, + "loss": 0.4238, + "step": 4650 + }, + { + "epoch": 0.3777308535693982, + "grad_norm": 4.451704643586669, + "learning_rate": 3.5754318432882907e-06, + "loss": 0.4485, + "step": 4651 + }, + { + "epoch": 0.3778120685454398, + "grad_norm": 5.927309052944885, + "learning_rate": 3.5748381399084492e-06, + "loss": 0.5697, + "step": 4652 + }, + { + "epoch": 0.3778932835214814, + "grad_norm": 5.139872795558716, + "learning_rate": 3.5742443621554977e-06, + "loss": 0.5761, + "step": 4653 + }, + { + "epoch": 0.37797449849752296, + "grad_norm": 6.387127931906058, + "learning_rate": 3.5736505100705223e-06, + "loss": 0.6974, + "step": 4654 + }, + { + "epoch": 0.37805571347356454, + "grad_norm": 5.810815572808849, + "learning_rate": 3.573056583694612e-06, + "loss": 0.6572, + "step": 4655 + }, + { + "epoch": 0.3781369284496061, + "grad_norm": 4.719558057872907, + "learning_rate": 3.5724625830688667e-06, + "loss": 0.5495, + "step": 4656 + }, + { + "epoch": 0.3782181434256477, + "grad_norm": 6.208002944055565, + "learning_rate": 3.571868508234386e-06, + "loss": 0.4823, + "step": 4657 + }, + { + "epoch": 0.37829935840168927, + "grad_norm": 3.68160438978963, + "learning_rate": 3.5712743592322775e-06, + "loss": 0.504, + "step": 4658 + }, + { + "epoch": 0.37838057337773084, + "grad_norm": 4.064268236543746, + "learning_rate": 3.570680136103653e-06, + "loss": 0.571, + "step": 4659 + }, + { + "epoch": 0.3784617883537724, + "grad_norm": 7.721913069819413, + "learning_rate": 3.57008583888963e-06, + "loss": 0.4338, + "step": 4660 + }, + { + "epoch": 0.378543003329814, + "grad_norm": 8.025619823408007, + "learning_rate": 3.569491467631329e-06, + "loss": 0.4907, + "step": 4661 + }, + { + "epoch": 0.3786242183058556, + "grad_norm": 4.4108267729746204, + "learning_rate": 3.568897022369879e-06, + "loss": 0.6222, + "step": 4662 + }, + { + "epoch": 0.3787054332818972, + "grad_norm": 15.163395858330826, + "learning_rate": 3.568302503146413e-06, + "loss": 0.5193, + "step": 4663 + }, + { + "epoch": 0.3787866482579388, + "grad_norm": 3.2649054015739525, + "learning_rate": 3.567707910002068e-06, + "loss": 0.5473, + "step": 4664 + }, + { + "epoch": 0.37886786323398036, + "grad_norm": 5.082393091909739, + "learning_rate": 3.5671132429779847e-06, + "loss": 0.4679, + "step": 4665 + }, + { + "epoch": 0.37894907821002194, + "grad_norm": 7.079599784559318, + "learning_rate": 3.566518502115314e-06, + "loss": 0.501, + "step": 4666 + }, + { + "epoch": 0.3790302931860635, + "grad_norm": 6.472514959171711, + "learning_rate": 3.565923687455207e-06, + "loss": 0.6414, + "step": 4667 + }, + { + "epoch": 0.3791115081621051, + "grad_norm": 6.442050896136885, + "learning_rate": 3.565328799038822e-06, + "loss": 0.4772, + "step": 4668 + }, + { + "epoch": 0.37919272313814667, + "grad_norm": 6.208659634243696, + "learning_rate": 3.5647338369073225e-06, + "loss": 0.6315, + "step": 4669 + }, + { + "epoch": 0.37927393811418825, + "grad_norm": 4.322414723790994, + "learning_rate": 3.5641388011018764e-06, + "loss": 0.4861, + "step": 4670 + }, + { + "epoch": 0.3793551530902298, + "grad_norm": 6.68918224709947, + "learning_rate": 3.563543691663657e-06, + "loss": 0.6495, + "step": 4671 + }, + { + "epoch": 0.3794363680662714, + "grad_norm": 6.412987095133581, + "learning_rate": 3.5629485086338432e-06, + "loss": 0.7778, + "step": 4672 + }, + { + "epoch": 0.379517583042313, + "grad_norm": 4.2253670535048204, + "learning_rate": 3.562353252053618e-06, + "loss": 0.4884, + "step": 4673 + }, + { + "epoch": 0.3795987980183546, + "grad_norm": 5.325608575778679, + "learning_rate": 3.56175792196417e-06, + "loss": 0.4152, + "step": 4674 + }, + { + "epoch": 0.3796800129943962, + "grad_norm": 5.859119696052518, + "learning_rate": 3.561162518406693e-06, + "loss": 0.5208, + "step": 4675 + }, + { + "epoch": 0.37976122797043776, + "grad_norm": 3.929599806942015, + "learning_rate": 3.5605670414223866e-06, + "loss": 0.6021, + "step": 4676 + }, + { + "epoch": 0.37984244294647934, + "grad_norm": 10.494504954752514, + "learning_rate": 3.559971491052453e-06, + "loss": 0.6292, + "step": 4677 + }, + { + "epoch": 0.3799236579225209, + "grad_norm": 4.64718386046636, + "learning_rate": 3.559375867338103e-06, + "loss": 0.4315, + "step": 4678 + }, + { + "epoch": 0.3800048728985625, + "grad_norm": 4.74521085436876, + "learning_rate": 3.5587801703205486e-06, + "loss": 0.4433, + "step": 4679 + }, + { + "epoch": 0.3800860878746041, + "grad_norm": 9.15577004594742, + "learning_rate": 3.558184400041011e-06, + "loss": 0.5353, + "step": 4680 + }, + { + "epoch": 0.38016730285064565, + "grad_norm": 4.324557986539743, + "learning_rate": 3.557588556540712e-06, + "loss": 0.6834, + "step": 4681 + }, + { + "epoch": 0.3802485178266872, + "grad_norm": 4.785865260255061, + "learning_rate": 3.556992639860883e-06, + "loss": 0.487, + "step": 4682 + }, + { + "epoch": 0.3803297328027288, + "grad_norm": 7.647172576935408, + "learning_rate": 3.5563966500427577e-06, + "loss": 0.4949, + "step": 4683 + }, + { + "epoch": 0.3804109477787704, + "grad_norm": 6.176240715797557, + "learning_rate": 3.555800587127574e-06, + "loss": 0.6732, + "step": 4684 + }, + { + "epoch": 0.380492162754812, + "grad_norm": 2.9442581188680306, + "learning_rate": 3.5552044511565783e-06, + "loss": 0.6017, + "step": 4685 + }, + { + "epoch": 0.3805733777308536, + "grad_norm": 3.6958273548877916, + "learning_rate": 3.554608242171019e-06, + "loss": 0.5588, + "step": 4686 + }, + { + "epoch": 0.38065459270689517, + "grad_norm": 4.174711443180175, + "learning_rate": 3.554011960212151e-06, + "loss": 0.4675, + "step": 4687 + }, + { + "epoch": 0.38073580768293674, + "grad_norm": 4.393276038211761, + "learning_rate": 3.5534156053212333e-06, + "loss": 0.5403, + "step": 4688 + }, + { + "epoch": 0.3808170226589783, + "grad_norm": 4.053882314908021, + "learning_rate": 3.5528191775395304e-06, + "loss": 0.5421, + "step": 4689 + }, + { + "epoch": 0.3808982376350199, + "grad_norm": 2.5408049268953437, + "learning_rate": 3.552222676908313e-06, + "loss": 0.4942, + "step": 4690 + }, + { + "epoch": 0.3809794526110615, + "grad_norm": 10.13235114058035, + "learning_rate": 3.5516261034688547e-06, + "loss": 0.5421, + "step": 4691 + }, + { + "epoch": 0.38106066758710305, + "grad_norm": 7.384145049649841, + "learning_rate": 3.5510294572624358e-06, + "loss": 0.4957, + "step": 4692 + }, + { + "epoch": 0.38114188256314463, + "grad_norm": 11.958189835069302, + "learning_rate": 3.5504327383303415e-06, + "loss": 0.5927, + "step": 4693 + }, + { + "epoch": 0.3812230975391862, + "grad_norm": 3.403027900822607, + "learning_rate": 3.549835946713861e-06, + "loss": 0.5608, + "step": 4694 + }, + { + "epoch": 0.3813043125152278, + "grad_norm": 3.057095877975725, + "learning_rate": 3.5492390824542887e-06, + "loss": 0.5047, + "step": 4695 + }, + { + "epoch": 0.3813855274912694, + "grad_norm": 6.33274087310348, + "learning_rate": 3.5486421455929253e-06, + "loss": 0.4971, + "step": 4696 + }, + { + "epoch": 0.381466742467311, + "grad_norm": 4.1556831093555555, + "learning_rate": 3.5480451361710744e-06, + "loss": 0.6343, + "step": 4697 + }, + { + "epoch": 0.38154795744335257, + "grad_norm": 3.1197068090047786, + "learning_rate": 3.5474480542300475e-06, + "loss": 0.6561, + "step": 4698 + }, + { + "epoch": 0.38162917241939415, + "grad_norm": 4.160059988171047, + "learning_rate": 3.5468508998111596e-06, + "loss": 0.6627, + "step": 4699 + }, + { + "epoch": 0.3817103873954357, + "grad_norm": 5.478519363239558, + "learning_rate": 3.5462536729557284e-06, + "loss": 0.6216, + "step": 4700 + }, + { + "epoch": 0.3817916023714773, + "grad_norm": 6.8557884315946795, + "learning_rate": 3.545656373705081e-06, + "loss": 0.6758, + "step": 4701 + }, + { + "epoch": 0.3818728173475189, + "grad_norm": 4.172187135110137, + "learning_rate": 3.5450590021005465e-06, + "loss": 0.4609, + "step": 4702 + }, + { + "epoch": 0.38195403232356046, + "grad_norm": 7.593654425254116, + "learning_rate": 3.5444615581834595e-06, + "loss": 0.5303, + "step": 4703 + }, + { + "epoch": 0.38203524729960203, + "grad_norm": 2.9965381248103466, + "learning_rate": 3.5438640419951608e-06, + "loss": 0.6348, + "step": 4704 + }, + { + "epoch": 0.3821164622756436, + "grad_norm": 3.965887732610272, + "learning_rate": 3.5432664535769952e-06, + "loss": 0.7041, + "step": 4705 + }, + { + "epoch": 0.3821976772516852, + "grad_norm": 10.865736295673367, + "learning_rate": 3.5426687929703117e-06, + "loss": 0.5335, + "step": 4706 + }, + { + "epoch": 0.3822788922277268, + "grad_norm": 6.090855516494487, + "learning_rate": 3.5420710602164665e-06, + "loss": 0.6845, + "step": 4707 + }, + { + "epoch": 0.3823601072037684, + "grad_norm": 3.356332999735362, + "learning_rate": 3.5414732553568194e-06, + "loss": 0.5373, + "step": 4708 + }, + { + "epoch": 0.38244132217981, + "grad_norm": 6.469401787137639, + "learning_rate": 3.5408753784327344e-06, + "loss": 0.5448, + "step": 4709 + }, + { + "epoch": 0.38252253715585155, + "grad_norm": 5.575865042285411, + "learning_rate": 3.540277429485582e-06, + "loss": 0.474, + "step": 4710 + }, + { + "epoch": 0.3826037521318931, + "grad_norm": 5.15216848497321, + "learning_rate": 3.539679408556737e-06, + "loss": 0.4949, + "step": 4711 + }, + { + "epoch": 0.3826849671079347, + "grad_norm": 4.447503975145806, + "learning_rate": 3.5390813156875792e-06, + "loss": 0.4684, + "step": 4712 + }, + { + "epoch": 0.3827661820839763, + "grad_norm": 6.602421808698497, + "learning_rate": 3.538483150919494e-06, + "loss": 0.4992, + "step": 4713 + }, + { + "epoch": 0.38284739706001786, + "grad_norm": 9.959981744936794, + "learning_rate": 3.537884914293871e-06, + "loss": 0.5644, + "step": 4714 + }, + { + "epoch": 0.38292861203605943, + "grad_norm": 4.773698731849422, + "learning_rate": 3.537286605852105e-06, + "loss": 0.5316, + "step": 4715 + }, + { + "epoch": 0.383009827012101, + "grad_norm": 5.852883004242007, + "learning_rate": 3.536688225635595e-06, + "loss": 0.4393, + "step": 4716 + }, + { + "epoch": 0.3830910419881426, + "grad_norm": 3.5878863623629442, + "learning_rate": 3.5360897736857464e-06, + "loss": 0.7132, + "step": 4717 + }, + { + "epoch": 0.3831722569641842, + "grad_norm": 5.620156517899439, + "learning_rate": 3.5354912500439696e-06, + "loss": 0.5248, + "step": 4718 + }, + { + "epoch": 0.3832534719402258, + "grad_norm": 7.422807956148856, + "learning_rate": 3.5348926547516783e-06, + "loss": 0.5087, + "step": 4719 + }, + { + "epoch": 0.3833346869162674, + "grad_norm": 5.38906725850609, + "learning_rate": 3.534293987850291e-06, + "loss": 0.7483, + "step": 4720 + }, + { + "epoch": 0.38341590189230895, + "grad_norm": 4.767084586004944, + "learning_rate": 3.5336952493812353e-06, + "loss": 0.6177, + "step": 4721 + }, + { + "epoch": 0.38349711686835053, + "grad_norm": 4.383751010173588, + "learning_rate": 3.533096439385939e-06, + "loss": 0.5982, + "step": 4722 + }, + { + "epoch": 0.3835783318443921, + "grad_norm": 18.60580065511833, + "learning_rate": 3.532497557905836e-06, + "loss": 0.6254, + "step": 4723 + }, + { + "epoch": 0.3836595468204337, + "grad_norm": 4.724647861890121, + "learning_rate": 3.531898604982367e-06, + "loss": 0.4813, + "step": 4724 + }, + { + "epoch": 0.38374076179647526, + "grad_norm": 5.525218151959553, + "learning_rate": 3.5312995806569754e-06, + "loss": 0.604, + "step": 4725 + }, + { + "epoch": 0.38382197677251684, + "grad_norm": 4.419767437379395, + "learning_rate": 3.5307004849711114e-06, + "loss": 0.6971, + "step": 4726 + }, + { + "epoch": 0.3839031917485584, + "grad_norm": 4.847805495392348, + "learning_rate": 3.530101317966228e-06, + "loss": 0.4638, + "step": 4727 + }, + { + "epoch": 0.3839844067246, + "grad_norm": 4.89939462212519, + "learning_rate": 3.5295020796837854e-06, + "loss": 0.503, + "step": 4728 + }, + { + "epoch": 0.3840656217006416, + "grad_norm": 4.244557296775247, + "learning_rate": 3.528902770165248e-06, + "loss": 0.5719, + "step": 4729 + }, + { + "epoch": 0.3841468366766832, + "grad_norm": 8.538711454427421, + "learning_rate": 3.5283033894520836e-06, + "loss": 0.4718, + "step": 4730 + }, + { + "epoch": 0.3842280516527248, + "grad_norm": 3.6208666339316995, + "learning_rate": 3.5277039375857677e-06, + "loss": 0.5757, + "step": 4731 + }, + { + "epoch": 0.38430926662876636, + "grad_norm": 6.751106985384788, + "learning_rate": 3.5271044146077773e-06, + "loss": 0.5931, + "step": 4732 + }, + { + "epoch": 0.38439048160480793, + "grad_norm": 7.411379001602781, + "learning_rate": 3.5265048205595976e-06, + "loss": 0.4918, + "step": 4733 + }, + { + "epoch": 0.3844716965808495, + "grad_norm": 4.0826091607694455, + "learning_rate": 3.5259051554827175e-06, + "loss": 0.4503, + "step": 4734 + }, + { + "epoch": 0.3845529115568911, + "grad_norm": 4.479671138778023, + "learning_rate": 3.5253054194186297e-06, + "loss": 0.5551, + "step": 4735 + }, + { + "epoch": 0.38463412653293266, + "grad_norm": 6.005050130154772, + "learning_rate": 3.524705612408833e-06, + "loss": 0.6683, + "step": 4736 + }, + { + "epoch": 0.38471534150897424, + "grad_norm": 5.171375305373519, + "learning_rate": 3.5241057344948317e-06, + "loss": 0.5399, + "step": 4737 + }, + { + "epoch": 0.3847965564850158, + "grad_norm": 10.084203556597553, + "learning_rate": 3.523505785718133e-06, + "loss": 0.6731, + "step": 4738 + }, + { + "epoch": 0.3848777714610574, + "grad_norm": 6.878409019634097, + "learning_rate": 3.5229057661202513e-06, + "loss": 0.4689, + "step": 4739 + }, + { + "epoch": 0.384958986437099, + "grad_norm": 5.680734133112059, + "learning_rate": 3.5223056757427044e-06, + "loss": 0.5154, + "step": 4740 + }, + { + "epoch": 0.3850402014131406, + "grad_norm": 2.861081172022858, + "learning_rate": 3.5217055146270144e-06, + "loss": 0.5012, + "step": 4741 + }, + { + "epoch": 0.3851214163891822, + "grad_norm": 7.512509192657636, + "learning_rate": 3.5211052828147114e-06, + "loss": 0.5743, + "step": 4742 + }, + { + "epoch": 0.38520263136522376, + "grad_norm": 7.512493844062562, + "learning_rate": 3.5205049803473257e-06, + "loss": 0.5381, + "step": 4743 + }, + { + "epoch": 0.38528384634126533, + "grad_norm": 4.916065054864969, + "learning_rate": 3.5199046072663968e-06, + "loss": 0.5969, + "step": 4744 + }, + { + "epoch": 0.3853650613173069, + "grad_norm": 5.767505645700775, + "learning_rate": 3.5193041636134673e-06, + "loss": 0.896, + "step": 4745 + }, + { + "epoch": 0.3854462762933485, + "grad_norm": 3.7412254239286566, + "learning_rate": 3.518703649430083e-06, + "loss": 0.6188, + "step": 4746 + }, + { + "epoch": 0.38552749126939007, + "grad_norm": 5.626427073356312, + "learning_rate": 3.518103064757798e-06, + "loss": 0.4346, + "step": 4747 + }, + { + "epoch": 0.38560870624543164, + "grad_norm": 5.8362139577169, + "learning_rate": 3.51750240963817e-06, + "loss": 0.5342, + "step": 4748 + }, + { + "epoch": 0.3856899212214732, + "grad_norm": 4.982453547043835, + "learning_rate": 3.516901684112759e-06, + "loss": 0.6567, + "step": 4749 + }, + { + "epoch": 0.3857711361975148, + "grad_norm": 4.093696003385388, + "learning_rate": 3.5163008882231347e-06, + "loss": 0.5791, + "step": 4750 + }, + { + "epoch": 0.38585235117355643, + "grad_norm": 5.642536053815455, + "learning_rate": 3.5157000220108674e-06, + "loss": 0.4339, + "step": 4751 + }, + { + "epoch": 0.385933566149598, + "grad_norm": 4.2134737972824, + "learning_rate": 3.5150990855175337e-06, + "loss": 0.6173, + "step": 4752 + }, + { + "epoch": 0.3860147811256396, + "grad_norm": 7.7835339423428405, + "learning_rate": 3.5144980787847155e-06, + "loss": 0.7697, + "step": 4753 + }, + { + "epoch": 0.38609599610168116, + "grad_norm": 4.445884072963403, + "learning_rate": 3.5138970018539998e-06, + "loss": 0.4681, + "step": 4754 + }, + { + "epoch": 0.38617721107772274, + "grad_norm": 7.397822019738059, + "learning_rate": 3.513295854766977e-06, + "loss": 0.4439, + "step": 4755 + }, + { + "epoch": 0.3862584260537643, + "grad_norm": 5.83757001159602, + "learning_rate": 3.5126946375652443e-06, + "loss": 0.5881, + "step": 4756 + }, + { + "epoch": 0.3863396410298059, + "grad_norm": 5.649456450527899, + "learning_rate": 3.512093350290402e-06, + "loss": 0.4453, + "step": 4757 + }, + { + "epoch": 0.38642085600584747, + "grad_norm": 4.735284890427706, + "learning_rate": 3.511491992984057e-06, + "loss": 0.6149, + "step": 4758 + }, + { + "epoch": 0.38650207098188905, + "grad_norm": 9.984154759576063, + "learning_rate": 3.510890565687818e-06, + "loss": 0.4296, + "step": 4759 + }, + { + "epoch": 0.3865832859579306, + "grad_norm": 5.043149999950322, + "learning_rate": 3.5102890684433026e-06, + "loss": 0.5894, + "step": 4760 + }, + { + "epoch": 0.3866645009339722, + "grad_norm": 4.065716569234592, + "learning_rate": 3.509687501292132e-06, + "loss": 0.3679, + "step": 4761 + }, + { + "epoch": 0.38674571591001383, + "grad_norm": 5.553031924098756, + "learning_rate": 3.5090858642759273e-06, + "loss": 0.4891, + "step": 4762 + }, + { + "epoch": 0.3868269308860554, + "grad_norm": 9.139353955383072, + "learning_rate": 3.5084841574363227e-06, + "loss": 0.5457, + "step": 4763 + }, + { + "epoch": 0.386908145862097, + "grad_norm": 5.211077508553389, + "learning_rate": 3.507882380814952e-06, + "loss": 0.7012, + "step": 4764 + }, + { + "epoch": 0.38698936083813856, + "grad_norm": 3.9781818293321027, + "learning_rate": 3.507280534453454e-06, + "loss": 0.5159, + "step": 4765 + }, + { + "epoch": 0.38707057581418014, + "grad_norm": 3.7855417874921455, + "learning_rate": 3.5066786183934743e-06, + "loss": 0.409, + "step": 4766 + }, + { + "epoch": 0.3871517907902217, + "grad_norm": 5.51441031251894, + "learning_rate": 3.5060766326766626e-06, + "loss": 0.5835, + "step": 4767 + }, + { + "epoch": 0.3872330057662633, + "grad_norm": 5.729949936654051, + "learning_rate": 3.505474577344672e-06, + "loss": 0.4933, + "step": 4768 + }, + { + "epoch": 0.38731422074230487, + "grad_norm": 4.617510877977197, + "learning_rate": 3.504872452439162e-06, + "loss": 0.6594, + "step": 4769 + }, + { + "epoch": 0.38739543571834645, + "grad_norm": 4.618436696346881, + "learning_rate": 3.504270258001796e-06, + "loss": 0.6615, + "step": 4770 + }, + { + "epoch": 0.387476650694388, + "grad_norm": 3.771460379690749, + "learning_rate": 3.503667994074244e-06, + "loss": 0.2883, + "step": 4771 + }, + { + "epoch": 0.3875578656704296, + "grad_norm": 6.252805853476455, + "learning_rate": 3.5030656606981783e-06, + "loss": 0.4846, + "step": 4772 + }, + { + "epoch": 0.38763908064647123, + "grad_norm": 6.604659609809073, + "learning_rate": 3.5024632579152775e-06, + "loss": 0.4682, + "step": 4773 + }, + { + "epoch": 0.3877202956225128, + "grad_norm": 8.26513746625276, + "learning_rate": 3.501860785767225e-06, + "loss": 0.5166, + "step": 4774 + }, + { + "epoch": 0.3878015105985544, + "grad_norm": 4.4406417232924085, + "learning_rate": 3.5012582442957077e-06, + "loss": 0.7179, + "step": 4775 + }, + { + "epoch": 0.38788272557459597, + "grad_norm": 13.64277952343109, + "learning_rate": 3.5006556335424197e-06, + "loss": 0.3686, + "step": 4776 + }, + { + "epoch": 0.38796394055063754, + "grad_norm": 4.226503480824964, + "learning_rate": 3.500052953549058e-06, + "loss": 0.6139, + "step": 4777 + }, + { + "epoch": 0.3880451555266791, + "grad_norm": 4.974191546797816, + "learning_rate": 3.4994502043573237e-06, + "loss": 0.7821, + "step": 4778 + }, + { + "epoch": 0.3881263705027207, + "grad_norm": 6.7453632874753815, + "learning_rate": 3.498847386008925e-06, + "loss": 0.5492, + "step": 4779 + }, + { + "epoch": 0.3882075854787623, + "grad_norm": 4.381360711445991, + "learning_rate": 3.4982444985455744e-06, + "loss": 0.5192, + "step": 4780 + }, + { + "epoch": 0.38828880045480385, + "grad_norm": 4.91701938364453, + "learning_rate": 3.4976415420089865e-06, + "loss": 0.5407, + "step": 4781 + }, + { + "epoch": 0.38837001543084543, + "grad_norm": 5.048580518545317, + "learning_rate": 3.4970385164408837e-06, + "loss": 0.466, + "step": 4782 + }, + { + "epoch": 0.388451230406887, + "grad_norm": 4.973082626749845, + "learning_rate": 3.496435421882994e-06, + "loss": 0.7516, + "step": 4783 + }, + { + "epoch": 0.38853244538292864, + "grad_norm": 5.743564392198817, + "learning_rate": 3.4958322583770453e-06, + "loss": 0.593, + "step": 4784 + }, + { + "epoch": 0.3886136603589702, + "grad_norm": 4.979299976934781, + "learning_rate": 3.495229025964775e-06, + "loss": 0.4979, + "step": 4785 + }, + { + "epoch": 0.3886948753350118, + "grad_norm": 12.613122878358682, + "learning_rate": 3.494625724687923e-06, + "loss": 0.6043, + "step": 4786 + }, + { + "epoch": 0.38877609031105337, + "grad_norm": 3.85614729618497, + "learning_rate": 3.494022354588235e-06, + "loss": 0.7142, + "step": 4787 + }, + { + "epoch": 0.38885730528709495, + "grad_norm": 6.046549551329485, + "learning_rate": 3.493418915707461e-06, + "loss": 0.4985, + "step": 4788 + }, + { + "epoch": 0.3889385202631365, + "grad_norm": 5.004952219579293, + "learning_rate": 3.4928154080873556e-06, + "loss": 0.7017, + "step": 4789 + }, + { + "epoch": 0.3890197352391781, + "grad_norm": 6.147470697065457, + "learning_rate": 3.4922118317696785e-06, + "loss": 0.5423, + "step": 4790 + }, + { + "epoch": 0.3891009502152197, + "grad_norm": 6.876508550025277, + "learning_rate": 3.491608186796193e-06, + "loss": 0.6039, + "step": 4791 + }, + { + "epoch": 0.38918216519126125, + "grad_norm": 10.41135603101845, + "learning_rate": 3.49100447320867e-06, + "loss": 0.5495, + "step": 4792 + }, + { + "epoch": 0.38926338016730283, + "grad_norm": 3.362847757169651, + "learning_rate": 3.4904006910488824e-06, + "loss": 0.5267, + "step": 4793 + }, + { + "epoch": 0.3893445951433444, + "grad_norm": 8.868539359623025, + "learning_rate": 3.489796840358608e-06, + "loss": 0.5027, + "step": 4794 + }, + { + "epoch": 0.38942581011938604, + "grad_norm": 3.869570039313511, + "learning_rate": 3.4891929211796303e-06, + "loss": 0.6388, + "step": 4795 + }, + { + "epoch": 0.3895070250954276, + "grad_norm": 18.46976613986865, + "learning_rate": 3.488588933553739e-06, + "loss": 0.7654, + "step": 4796 + }, + { + "epoch": 0.3895882400714692, + "grad_norm": 4.462040282578044, + "learning_rate": 3.4879848775227243e-06, + "loss": 0.5233, + "step": 4797 + }, + { + "epoch": 0.38966945504751077, + "grad_norm": 16.476898959715022, + "learning_rate": 3.487380753128385e-06, + "loss": 0.463, + "step": 4798 + }, + { + "epoch": 0.38975067002355235, + "grad_norm": 4.641132700498576, + "learning_rate": 3.4867765604125236e-06, + "loss": 0.3472, + "step": 4799 + }, + { + "epoch": 0.3898318849995939, + "grad_norm": 4.809948864229084, + "learning_rate": 3.4861722994169466e-06, + "loss": 0.4521, + "step": 4800 + }, + { + "epoch": 0.3899130999756355, + "grad_norm": 4.6897556369620625, + "learning_rate": 3.485567970183466e-06, + "loss": 0.5308, + "step": 4801 + }, + { + "epoch": 0.3899943149516771, + "grad_norm": 4.712889214741922, + "learning_rate": 3.484963572753898e-06, + "loss": 0.4595, + "step": 4802 + }, + { + "epoch": 0.39007552992771866, + "grad_norm": 3.914385123227235, + "learning_rate": 3.4843591071700627e-06, + "loss": 0.6773, + "step": 4803 + }, + { + "epoch": 0.39015674490376023, + "grad_norm": 3.7046943355774147, + "learning_rate": 3.4837545734737877e-06, + "loss": 0.4701, + "step": 4804 + }, + { + "epoch": 0.3902379598798018, + "grad_norm": 5.273455209488689, + "learning_rate": 3.483149971706902e-06, + "loss": 0.5245, + "step": 4805 + }, + { + "epoch": 0.39031917485584344, + "grad_norm": 5.259747071201792, + "learning_rate": 3.482545301911242e-06, + "loss": 0.58, + "step": 4806 + }, + { + "epoch": 0.390400389831885, + "grad_norm": 3.865781563186537, + "learning_rate": 3.4819405641286476e-06, + "loss": 0.5728, + "step": 4807 + }, + { + "epoch": 0.3904816048079266, + "grad_norm": 4.430734119933856, + "learning_rate": 3.481335758400962e-06, + "loss": 0.406, + "step": 4808 + }, + { + "epoch": 0.3905628197839682, + "grad_norm": 3.1468197398544295, + "learning_rate": 3.480730884770036e-06, + "loss": 0.5959, + "step": 4809 + }, + { + "epoch": 0.39064403476000975, + "grad_norm": 4.928724212153909, + "learning_rate": 3.4801259432777236e-06, + "loss": 0.534, + "step": 4810 + }, + { + "epoch": 0.3907252497360513, + "grad_norm": 5.338771481365462, + "learning_rate": 3.479520933965882e-06, + "loss": 0.6451, + "step": 4811 + }, + { + "epoch": 0.3908064647120929, + "grad_norm": 4.865233384961484, + "learning_rate": 3.4789158568763777e-06, + "loss": 0.6479, + "step": 4812 + }, + { + "epoch": 0.3908876796881345, + "grad_norm": 6.708639239866096, + "learning_rate": 3.4783107120510758e-06, + "loss": 0.4542, + "step": 4813 + }, + { + "epoch": 0.39096889466417606, + "grad_norm": 5.797570016494007, + "learning_rate": 3.4777054995318493e-06, + "loss": 0.6198, + "step": 4814 + }, + { + "epoch": 0.39105010964021764, + "grad_norm": 3.723719927731895, + "learning_rate": 3.4771002193605783e-06, + "loss": 0.5544, + "step": 4815 + }, + { + "epoch": 0.3911313246162592, + "grad_norm": 4.734496419671341, + "learning_rate": 3.4764948715791425e-06, + "loss": 0.5319, + "step": 4816 + }, + { + "epoch": 0.39121253959230085, + "grad_norm": 3.483732371245711, + "learning_rate": 3.47588945622943e-06, + "loss": 0.6067, + "step": 4817 + }, + { + "epoch": 0.3912937545683424, + "grad_norm": 3.7596387481376143, + "learning_rate": 3.4752839733533315e-06, + "loss": 0.7024, + "step": 4818 + }, + { + "epoch": 0.391374969544384, + "grad_norm": 6.235595740178511, + "learning_rate": 3.4746784229927445e-06, + "loss": 0.5705, + "step": 4819 + }, + { + "epoch": 0.3914561845204256, + "grad_norm": 4.914796686357341, + "learning_rate": 3.4740728051895683e-06, + "loss": 0.4127, + "step": 4820 + }, + { + "epoch": 0.39153739949646715, + "grad_norm": 6.037035106034122, + "learning_rate": 3.4734671199857093e-06, + "loss": 0.461, + "step": 4821 + }, + { + "epoch": 0.39161861447250873, + "grad_norm": 4.084676397060007, + "learning_rate": 3.4728613674230777e-06, + "loss": 0.4533, + "step": 4822 + }, + { + "epoch": 0.3916998294485503, + "grad_norm": 5.820986362019269, + "learning_rate": 3.472255547543589e-06, + "loss": 0.4948, + "step": 4823 + }, + { + "epoch": 0.3917810444245919, + "grad_norm": 6.625284347900165, + "learning_rate": 3.4716496603891605e-06, + "loss": 0.6856, + "step": 4824 + }, + { + "epoch": 0.39186225940063346, + "grad_norm": 4.751431998722198, + "learning_rate": 3.471043706001719e-06, + "loss": 0.5442, + "step": 4825 + }, + { + "epoch": 0.39194347437667504, + "grad_norm": 3.789894887233057, + "learning_rate": 3.4704376844231922e-06, + "loss": 0.5568, + "step": 4826 + }, + { + "epoch": 0.3920246893527166, + "grad_norm": 3.666796443871811, + "learning_rate": 3.4698315956955125e-06, + "loss": 0.6599, + "step": 4827 + }, + { + "epoch": 0.39210590432875825, + "grad_norm": 3.6434242436779973, + "learning_rate": 3.46922543986062e-06, + "loss": 0.4903, + "step": 4828 + }, + { + "epoch": 0.3921871193047998, + "grad_norm": 7.08366419413291, + "learning_rate": 3.468619216960457e-06, + "loss": 0.6005, + "step": 4829 + }, + { + "epoch": 0.3922683342808414, + "grad_norm": 4.495677796038025, + "learning_rate": 3.46801292703697e-06, + "loss": 0.5686, + "step": 4830 + }, + { + "epoch": 0.392349549256883, + "grad_norm": 6.22783462766733, + "learning_rate": 3.467406570132112e-06, + "loss": 0.402, + "step": 4831 + }, + { + "epoch": 0.39243076423292456, + "grad_norm": 6.8479413586138165, + "learning_rate": 3.4668001462878386e-06, + "loss": 0.4031, + "step": 4832 + }, + { + "epoch": 0.39251197920896613, + "grad_norm": 11.848480365877366, + "learning_rate": 3.466193655546112e-06, + "loss": 0.4203, + "step": 4833 + }, + { + "epoch": 0.3925931941850077, + "grad_norm": 4.1344833900535205, + "learning_rate": 3.465587097948898e-06, + "loss": 0.4155, + "step": 4834 + }, + { + "epoch": 0.3926744091610493, + "grad_norm": 4.430449430971486, + "learning_rate": 3.4649804735381675e-06, + "loss": 0.6314, + "step": 4835 + }, + { + "epoch": 0.39275562413709086, + "grad_norm": 5.347837303662017, + "learning_rate": 3.4643737823558947e-06, + "loss": 0.5447, + "step": 4836 + }, + { + "epoch": 0.39283683911313244, + "grad_norm": 6.546391191124815, + "learning_rate": 3.463767024444061e-06, + "loss": 0.5371, + "step": 4837 + }, + { + "epoch": 0.392918054089174, + "grad_norm": 4.80456942652959, + "learning_rate": 3.4631601998446484e-06, + "loss": 0.638, + "step": 4838 + }, + { + "epoch": 0.39299926906521565, + "grad_norm": 5.280861827964533, + "learning_rate": 3.4625533085996495e-06, + "loss": 0.4545, + "step": 4839 + }, + { + "epoch": 0.3930804840412572, + "grad_norm": 6.5672307967606995, + "learning_rate": 3.4619463507510536e-06, + "loss": 0.4733, + "step": 4840 + }, + { + "epoch": 0.3931616990172988, + "grad_norm": 4.308991992577792, + "learning_rate": 3.4613393263408625e-06, + "loss": 0.5039, + "step": 4841 + }, + { + "epoch": 0.3932429139933404, + "grad_norm": 5.784233850167831, + "learning_rate": 3.4607322354110785e-06, + "loss": 0.5651, + "step": 4842 + }, + { + "epoch": 0.39332412896938196, + "grad_norm": 5.463637822033822, + "learning_rate": 3.4601250780037064e-06, + "loss": 0.6328, + "step": 4843 + }, + { + "epoch": 0.39340534394542354, + "grad_norm": 6.116722144553743, + "learning_rate": 3.4595178541607616e-06, + "loss": 0.63, + "step": 4844 + }, + { + "epoch": 0.3934865589214651, + "grad_norm": 5.568944318775315, + "learning_rate": 3.45891056392426e-06, + "loss": 0.4404, + "step": 4845 + }, + { + "epoch": 0.3935677738975067, + "grad_norm": 5.00032757592686, + "learning_rate": 3.4583032073362216e-06, + "loss": 0.4921, + "step": 4846 + }, + { + "epoch": 0.39364898887354827, + "grad_norm": 6.833348088070499, + "learning_rate": 3.4576957844386728e-06, + "loss": 0.5459, + "step": 4847 + }, + { + "epoch": 0.39373020384958984, + "grad_norm": 3.622191745550694, + "learning_rate": 3.4570882952736445e-06, + "loss": 0.5338, + "step": 4848 + }, + { + "epoch": 0.3938114188256314, + "grad_norm": 3.7561292429825617, + "learning_rate": 3.4564807398831716e-06, + "loss": 0.5432, + "step": 4849 + }, + { + "epoch": 0.39389263380167305, + "grad_norm": 6.7697920687749145, + "learning_rate": 3.4558731183092936e-06, + "loss": 0.4724, + "step": 4850 + }, + { + "epoch": 0.39397384877771463, + "grad_norm": 17.901118220763788, + "learning_rate": 3.4552654305940546e-06, + "loss": 0.4965, + "step": 4851 + }, + { + "epoch": 0.3940550637537562, + "grad_norm": 5.591762564408774, + "learning_rate": 3.4546576767795036e-06, + "loss": 0.4596, + "step": 4852 + }, + { + "epoch": 0.3941362787297978, + "grad_norm": 4.5991945379468975, + "learning_rate": 3.4540498569076935e-06, + "loss": 0.6449, + "step": 4853 + }, + { + "epoch": 0.39421749370583936, + "grad_norm": 4.012140667999568, + "learning_rate": 3.453441971020682e-06, + "loss": 0.5522, + "step": 4854 + }, + { + "epoch": 0.39429870868188094, + "grad_norm": 4.026685263806628, + "learning_rate": 3.4528340191605336e-06, + "loss": 0.5232, + "step": 4855 + }, + { + "epoch": 0.3943799236579225, + "grad_norm": 3.9644370555022226, + "learning_rate": 3.452226001369313e-06, + "loss": 0.4068, + "step": 4856 + }, + { + "epoch": 0.3944611386339641, + "grad_norm": 4.955057245622184, + "learning_rate": 3.451617917689093e-06, + "loss": 0.5034, + "step": 4857 + }, + { + "epoch": 0.39454235361000567, + "grad_norm": 6.121308963517397, + "learning_rate": 3.4510097681619497e-06, + "loss": 0.5604, + "step": 4858 + }, + { + "epoch": 0.39462356858604725, + "grad_norm": 3.955911011778945, + "learning_rate": 3.4504015528299633e-06, + "loss": 0.742, + "step": 4859 + }, + { + "epoch": 0.3947047835620888, + "grad_norm": 3.416220952136702, + "learning_rate": 3.449793271735219e-06, + "loss": 0.4727, + "step": 4860 + }, + { + "epoch": 0.39478599853813046, + "grad_norm": 4.42715063532882, + "learning_rate": 3.4491849249198074e-06, + "loss": 0.6083, + "step": 4861 + }, + { + "epoch": 0.39486721351417203, + "grad_norm": 5.7756208575555466, + "learning_rate": 3.4485765124258223e-06, + "loss": 0.537, + "step": 4862 + }, + { + "epoch": 0.3949484284902136, + "grad_norm": 5.659261574392475, + "learning_rate": 3.4479680342953627e-06, + "loss": 0.4879, + "step": 4863 + }, + { + "epoch": 0.3950296434662552, + "grad_norm": 3.9294665757315625, + "learning_rate": 3.4473594905705326e-06, + "loss": 0.5727, + "step": 4864 + }, + { + "epoch": 0.39511085844229676, + "grad_norm": 4.426724017012553, + "learning_rate": 3.446750881293439e-06, + "loss": 0.5823, + "step": 4865 + }, + { + "epoch": 0.39519207341833834, + "grad_norm": 4.23549319138764, + "learning_rate": 3.4461422065061957e-06, + "loss": 0.594, + "step": 4866 + }, + { + "epoch": 0.3952732883943799, + "grad_norm": 4.069048549480606, + "learning_rate": 3.4455334662509186e-06, + "loss": 0.5666, + "step": 4867 + }, + { + "epoch": 0.3953545033704215, + "grad_norm": 9.920612867004065, + "learning_rate": 3.44492466056973e-06, + "loss": 0.5656, + "step": 4868 + }, + { + "epoch": 0.39543571834646307, + "grad_norm": 7.365805382978001, + "learning_rate": 3.4443157895047556e-06, + "loss": 0.5565, + "step": 4869 + }, + { + "epoch": 0.39551693332250465, + "grad_norm": 4.079639656340487, + "learning_rate": 3.4437068530981266e-06, + "loss": 0.5486, + "step": 4870 + }, + { + "epoch": 0.3955981482985462, + "grad_norm": 5.1014095863826805, + "learning_rate": 3.4430978513919777e-06, + "loss": 0.4873, + "step": 4871 + }, + { + "epoch": 0.39567936327458786, + "grad_norm": 6.57666055283231, + "learning_rate": 3.4424887844284492e-06, + "loss": 0.5425, + "step": 4872 + }, + { + "epoch": 0.39576057825062944, + "grad_norm": 4.768382603290578, + "learning_rate": 3.4418796522496845e-06, + "loss": 0.3941, + "step": 4873 + }, + { + "epoch": 0.395841793226671, + "grad_norm": 6.114180863563064, + "learning_rate": 3.4412704548978326e-06, + "loss": 0.7732, + "step": 4874 + }, + { + "epoch": 0.3959230082027126, + "grad_norm": 3.93227698865023, + "learning_rate": 3.4406611924150468e-06, + "loss": 0.5043, + "step": 4875 + }, + { + "epoch": 0.39600422317875417, + "grad_norm": 5.0863186506106794, + "learning_rate": 3.440051864843485e-06, + "loss": 0.4199, + "step": 4876 + }, + { + "epoch": 0.39608543815479574, + "grad_norm": 8.186606675125352, + "learning_rate": 3.4394424722253095e-06, + "loss": 0.5594, + "step": 4877 + }, + { + "epoch": 0.3961666531308373, + "grad_norm": 3.1309990632230185, + "learning_rate": 3.4388330146026865e-06, + "loss": 0.515, + "step": 4878 + }, + { + "epoch": 0.3962478681068789, + "grad_norm": 3.0856049239044676, + "learning_rate": 3.438223492017787e-06, + "loss": 0.65, + "step": 4879 + }, + { + "epoch": 0.3963290830829205, + "grad_norm": 5.758345655080891, + "learning_rate": 3.4376139045127886e-06, + "loss": 0.5401, + "step": 4880 + }, + { + "epoch": 0.39641029805896205, + "grad_norm": 6.386676934781451, + "learning_rate": 3.4370042521298697e-06, + "loss": 0.63, + "step": 4881 + }, + { + "epoch": 0.39649151303500363, + "grad_norm": 4.289237032932299, + "learning_rate": 3.436394534911216e-06, + "loss": 0.4628, + "step": 4882 + }, + { + "epoch": 0.39657272801104526, + "grad_norm": 4.493221527603865, + "learning_rate": 3.4357847528990157e-06, + "loss": 0.4507, + "step": 4883 + }, + { + "epoch": 0.39665394298708684, + "grad_norm": 5.931364030127042, + "learning_rate": 3.4351749061354634e-06, + "loss": 0.4214, + "step": 4884 + }, + { + "epoch": 0.3967351579631284, + "grad_norm": 5.201076643919345, + "learning_rate": 3.4345649946627567e-06, + "loss": 0.4338, + "step": 4885 + }, + { + "epoch": 0.39681637293917, + "grad_norm": 6.755107310201019, + "learning_rate": 3.4339550185230985e-06, + "loss": 0.633, + "step": 4886 + }, + { + "epoch": 0.39689758791521157, + "grad_norm": 6.012302549365938, + "learning_rate": 3.4333449777586957e-06, + "loss": 0.4477, + "step": 4887 + }, + { + "epoch": 0.39697880289125315, + "grad_norm": 5.404553280600934, + "learning_rate": 3.432734872411761e-06, + "loss": 0.555, + "step": 4888 + }, + { + "epoch": 0.3970600178672947, + "grad_norm": 6.802478190345189, + "learning_rate": 3.4321247025245084e-06, + "loss": 0.56, + "step": 4889 + }, + { + "epoch": 0.3971412328433363, + "grad_norm": 4.629306992412694, + "learning_rate": 3.4315144681391604e-06, + "loss": 0.5331, + "step": 4890 + }, + { + "epoch": 0.3972224478193779, + "grad_norm": 4.785707649092083, + "learning_rate": 3.430904169297941e-06, + "loss": 0.9165, + "step": 4891 + }, + { + "epoch": 0.39730366279541945, + "grad_norm": 7.873306295295412, + "learning_rate": 3.4302938060430794e-06, + "loss": 0.6152, + "step": 4892 + }, + { + "epoch": 0.39738487777146103, + "grad_norm": 3.4652132108801195, + "learning_rate": 3.429683378416811e-06, + "loss": 0.4936, + "step": 4893 + }, + { + "epoch": 0.39746609274750266, + "grad_norm": 4.301920992179896, + "learning_rate": 3.429072886461372e-06, + "loss": 0.5562, + "step": 4894 + }, + { + "epoch": 0.39754730772354424, + "grad_norm": 5.010865178574669, + "learning_rate": 3.428462330219007e-06, + "loss": 0.5153, + "step": 4895 + }, + { + "epoch": 0.3976285226995858, + "grad_norm": 6.077200433598698, + "learning_rate": 3.4278517097319617e-06, + "loss": 0.4894, + "step": 4896 + }, + { + "epoch": 0.3977097376756274, + "grad_norm": 4.201260945415446, + "learning_rate": 3.4272410250424893e-06, + "loss": 0.5466, + "step": 4897 + }, + { + "epoch": 0.39779095265166897, + "grad_norm": 4.064316394961935, + "learning_rate": 3.4266302761928453e-06, + "loss": 0.499, + "step": 4898 + }, + { + "epoch": 0.39787216762771055, + "grad_norm": 10.729804958408987, + "learning_rate": 3.4260194632252903e-06, + "loss": 0.6487, + "step": 4899 + }, + { + "epoch": 0.3979533826037521, + "grad_norm": 5.44190759303068, + "learning_rate": 3.4254085861820895e-06, + "loss": 0.4588, + "step": 4900 + }, + { + "epoch": 0.3980345975797937, + "grad_norm": 6.394017389790841, + "learning_rate": 3.424797645105512e-06, + "loss": 0.5658, + "step": 4901 + }, + { + "epoch": 0.3981158125558353, + "grad_norm": 5.684350734119721, + "learning_rate": 3.4241866400378315e-06, + "loss": 0.5158, + "step": 4902 + }, + { + "epoch": 0.39819702753187686, + "grad_norm": 3.9864422041154275, + "learning_rate": 3.423575571021327e-06, + "loss": 0.4298, + "step": 4903 + }, + { + "epoch": 0.39827824250791843, + "grad_norm": 5.696110565863003, + "learning_rate": 3.4229644380982817e-06, + "loss": 0.6485, + "step": 4904 + }, + { + "epoch": 0.39835945748396007, + "grad_norm": 8.085273365710513, + "learning_rate": 3.4223532413109807e-06, + "loss": 0.5311, + "step": 4905 + }, + { + "epoch": 0.39844067246000164, + "grad_norm": 21.38684299111223, + "learning_rate": 3.4217419807017177e-06, + "loss": 0.3467, + "step": 4906 + }, + { + "epoch": 0.3985218874360432, + "grad_norm": 6.879258693117489, + "learning_rate": 3.4211306563127876e-06, + "loss": 0.3502, + "step": 4907 + }, + { + "epoch": 0.3986031024120848, + "grad_norm": 5.751298879545744, + "learning_rate": 3.4205192681864905e-06, + "loss": 0.5344, + "step": 4908 + }, + { + "epoch": 0.3986843173881264, + "grad_norm": 9.337940570841411, + "learning_rate": 3.4199078163651335e-06, + "loss": 0.5033, + "step": 4909 + }, + { + "epoch": 0.39876553236416795, + "grad_norm": 4.8306108732275925, + "learning_rate": 3.419296300891023e-06, + "loss": 0.518, + "step": 4910 + }, + { + "epoch": 0.39884674734020953, + "grad_norm": 7.933167629628174, + "learning_rate": 3.418684721806474e-06, + "loss": 0.4499, + "step": 4911 + }, + { + "epoch": 0.3989279623162511, + "grad_norm": 6.472677681508684, + "learning_rate": 3.418073079153804e-06, + "loss": 0.534, + "step": 4912 + }, + { + "epoch": 0.3990091772922927, + "grad_norm": 6.504622980241599, + "learning_rate": 3.4174613729753364e-06, + "loss": 0.569, + "step": 4913 + }, + { + "epoch": 0.39909039226833426, + "grad_norm": 7.727056330599826, + "learning_rate": 3.4168496033133968e-06, + "loss": 0.5061, + "step": 4914 + }, + { + "epoch": 0.39917160724437584, + "grad_norm": 4.697545157125827, + "learning_rate": 3.416237770210317e-06, + "loss": 0.4645, + "step": 4915 + }, + { + "epoch": 0.39925282222041747, + "grad_norm": 4.809263186617258, + "learning_rate": 3.415625873708433e-06, + "loss": 0.4595, + "step": 4916 + }, + { + "epoch": 0.39933403719645905, + "grad_norm": 10.22371082626964, + "learning_rate": 3.4150139138500843e-06, + "loss": 0.4436, + "step": 4917 + }, + { + "epoch": 0.3994152521725006, + "grad_norm": 5.9377177585980245, + "learning_rate": 3.4144018906776155e-06, + "loss": 0.5012, + "step": 4918 + }, + { + "epoch": 0.3994964671485422, + "grad_norm": 5.523225188806799, + "learning_rate": 3.413789804233375e-06, + "loss": 0.6356, + "step": 4919 + }, + { + "epoch": 0.3995776821245838, + "grad_norm": 4.922432155658762, + "learning_rate": 3.413177654559717e-06, + "loss": 0.4785, + "step": 4920 + }, + { + "epoch": 0.39965889710062535, + "grad_norm": 8.522209977514363, + "learning_rate": 3.4125654416989975e-06, + "loss": 0.4049, + "step": 4921 + }, + { + "epoch": 0.39974011207666693, + "grad_norm": 4.166534006928631, + "learning_rate": 3.411953165693579e-06, + "loss": 0.5051, + "step": 4922 + }, + { + "epoch": 0.3998213270527085, + "grad_norm": 7.556966122206325, + "learning_rate": 3.4113408265858282e-06, + "loss": 0.5134, + "step": 4923 + }, + { + "epoch": 0.3999025420287501, + "grad_norm": 4.46432321290014, + "learning_rate": 3.4107284244181154e-06, + "loss": 0.5773, + "step": 4924 + }, + { + "epoch": 0.39998375700479166, + "grad_norm": 3.3296185748575837, + "learning_rate": 3.4101159592328148e-06, + "loss": 0.492, + "step": 4925 + }, + { + "epoch": 0.40006497198083324, + "grad_norm": 4.808058470791979, + "learning_rate": 3.409503431072308e-06, + "loss": 0.4053, + "step": 4926 + }, + { + "epoch": 0.40014618695687487, + "grad_norm": 5.658421289602297, + "learning_rate": 3.408890839978976e-06, + "loss": 0.515, + "step": 4927 + }, + { + "epoch": 0.40022740193291645, + "grad_norm": 6.896229712559321, + "learning_rate": 3.4082781859952087e-06, + "loss": 0.5547, + "step": 4928 + }, + { + "epoch": 0.400308616908958, + "grad_norm": 5.824806238919424, + "learning_rate": 3.407665469163398e-06, + "loss": 0.6033, + "step": 4929 + }, + { + "epoch": 0.4003898318849996, + "grad_norm": 4.135425915903601, + "learning_rate": 3.4070526895259403e-06, + "loss": 0.5083, + "step": 4930 + }, + { + "epoch": 0.4004710468610412, + "grad_norm": 5.133129422170843, + "learning_rate": 3.4064398471252367e-06, + "loss": 0.5962, + "step": 4931 + }, + { + "epoch": 0.40055226183708276, + "grad_norm": 5.46292327134982, + "learning_rate": 3.4058269420036937e-06, + "loss": 0.4848, + "step": 4932 + }, + { + "epoch": 0.40063347681312433, + "grad_norm": 5.055147156646343, + "learning_rate": 3.40521397420372e-06, + "loss": 0.5223, + "step": 4933 + }, + { + "epoch": 0.4007146917891659, + "grad_norm": 6.056907838015249, + "learning_rate": 3.4046009437677296e-06, + "loss": 0.658, + "step": 4934 + }, + { + "epoch": 0.4007959067652075, + "grad_norm": 5.370500579188346, + "learning_rate": 3.403987850738142e-06, + "loss": 0.4064, + "step": 4935 + }, + { + "epoch": 0.40087712174124907, + "grad_norm": 5.583756706162014, + "learning_rate": 3.4033746951573797e-06, + "loss": 0.4991, + "step": 4936 + }, + { + "epoch": 0.40095833671729064, + "grad_norm": 7.016728588738892, + "learning_rate": 3.4027614770678695e-06, + "loss": 0.6236, + "step": 4937 + }, + { + "epoch": 0.4010395516933323, + "grad_norm": 4.595088770001303, + "learning_rate": 3.402148196512042e-06, + "loss": 0.4602, + "step": 4938 + }, + { + "epoch": 0.40112076666937385, + "grad_norm": 8.586997426840448, + "learning_rate": 3.4015348535323344e-06, + "loss": 0.582, + "step": 4939 + }, + { + "epoch": 0.40120198164541543, + "grad_norm": 6.420810576653918, + "learning_rate": 3.400921448171187e-06, + "loss": 0.456, + "step": 4940 + }, + { + "epoch": 0.401283196621457, + "grad_norm": 3.598889910541628, + "learning_rate": 3.4003079804710414e-06, + "loss": 0.5049, + "step": 4941 + }, + { + "epoch": 0.4013644115974986, + "grad_norm": 4.85229866007985, + "learning_rate": 3.39969445047435e-06, + "loss": 0.5795, + "step": 4942 + }, + { + "epoch": 0.40144562657354016, + "grad_norm": 5.046704044838436, + "learning_rate": 3.399080858223564e-06, + "loss": 0.4898, + "step": 4943 + }, + { + "epoch": 0.40152684154958174, + "grad_norm": 4.201283039118606, + "learning_rate": 3.3984672037611403e-06, + "loss": 0.4583, + "step": 4944 + }, + { + "epoch": 0.4016080565256233, + "grad_norm": 3.6245756239233855, + "learning_rate": 3.3978534871295423e-06, + "loss": 0.569, + "step": 4945 + }, + { + "epoch": 0.4016892715016649, + "grad_norm": 5.943049720731405, + "learning_rate": 3.3972397083712337e-06, + "loss": 0.4635, + "step": 4946 + }, + { + "epoch": 0.40177048647770647, + "grad_norm": 4.4825133089550615, + "learning_rate": 3.3966258675286868e-06, + "loss": 0.5545, + "step": 4947 + }, + { + "epoch": 0.40185170145374804, + "grad_norm": 7.159533755424081, + "learning_rate": 3.3960119646443743e-06, + "loss": 0.5132, + "step": 4948 + }, + { + "epoch": 0.4019329164297897, + "grad_norm": 5.668432680762583, + "learning_rate": 3.395397999760777e-06, + "loss": 0.4824, + "step": 4949 + }, + { + "epoch": 0.40201413140583125, + "grad_norm": 7.74454321710248, + "learning_rate": 3.394783972920376e-06, + "loss": 0.5292, + "step": 4950 + }, + { + "epoch": 0.40209534638187283, + "grad_norm": 3.9688138836277864, + "learning_rate": 3.3941698841656594e-06, + "loss": 0.5123, + "step": 4951 + }, + { + "epoch": 0.4021765613579144, + "grad_norm": 3.6092591329939196, + "learning_rate": 3.3935557335391194e-06, + "loss": 0.5618, + "step": 4952 + }, + { + "epoch": 0.402257776333956, + "grad_norm": 5.268354030573778, + "learning_rate": 3.3929415210832526e-06, + "loss": 0.4638, + "step": 4953 + }, + { + "epoch": 0.40233899130999756, + "grad_norm": 4.725073559638671, + "learning_rate": 3.392327246840558e-06, + "loss": 0.5442, + "step": 4954 + }, + { + "epoch": 0.40242020628603914, + "grad_norm": 5.0239266752483225, + "learning_rate": 3.39171291085354e-06, + "loss": 0.5132, + "step": 4955 + }, + { + "epoch": 0.4025014212620807, + "grad_norm": 4.688205783968622, + "learning_rate": 3.3910985131647077e-06, + "loss": 0.6074, + "step": 4956 + }, + { + "epoch": 0.4025826362381223, + "grad_norm": 4.306614565733134, + "learning_rate": 3.3904840538165745e-06, + "loss": 0.4564, + "step": 4957 + }, + { + "epoch": 0.40266385121416387, + "grad_norm": 13.862109865488437, + "learning_rate": 3.3898695328516585e-06, + "loss": 0.536, + "step": 4958 + }, + { + "epoch": 0.40274506619020545, + "grad_norm": 4.512610902634363, + "learning_rate": 3.38925495031248e-06, + "loss": 0.6559, + "step": 4959 + }, + { + "epoch": 0.4028262811662471, + "grad_norm": 4.1778178120621225, + "learning_rate": 3.3886403062415653e-06, + "loss": 0.5741, + "step": 4960 + }, + { + "epoch": 0.40290749614228866, + "grad_norm": 5.5923461249681194, + "learning_rate": 3.3880256006814436e-06, + "loss": 0.7565, + "step": 4961 + }, + { + "epoch": 0.40298871111833023, + "grad_norm": 10.781234356503536, + "learning_rate": 3.387410833674651e-06, + "loss": 0.817, + "step": 4962 + }, + { + "epoch": 0.4030699260943718, + "grad_norm": 7.255219164441723, + "learning_rate": 3.386796005263725e-06, + "loss": 0.4918, + "step": 4963 + }, + { + "epoch": 0.4031511410704134, + "grad_norm": 3.7214218539504094, + "learning_rate": 3.3861811154912085e-06, + "loss": 0.5502, + "step": 4964 + }, + { + "epoch": 0.40323235604645496, + "grad_norm": 3.7233458707878246, + "learning_rate": 3.385566164399649e-06, + "loss": 0.6731, + "step": 4965 + }, + { + "epoch": 0.40331357102249654, + "grad_norm": 6.244845838241695, + "learning_rate": 3.3849511520315986e-06, + "loss": 0.426, + "step": 4966 + }, + { + "epoch": 0.4033947859985381, + "grad_norm": 5.384785695599885, + "learning_rate": 3.384336078429611e-06, + "loss": 0.7065, + "step": 4967 + }, + { + "epoch": 0.4034760009745797, + "grad_norm": 4.49252865805685, + "learning_rate": 3.3837209436362473e-06, + "loss": 0.477, + "step": 4968 + }, + { + "epoch": 0.4035572159506213, + "grad_norm": 3.4933203948485643, + "learning_rate": 3.3831057476940716e-06, + "loss": 0.5459, + "step": 4969 + }, + { + "epoch": 0.40363843092666285, + "grad_norm": 7.33852450348436, + "learning_rate": 3.382490490645651e-06, + "loss": 0.559, + "step": 4970 + }, + { + "epoch": 0.4037196459027045, + "grad_norm": 5.565835641557815, + "learning_rate": 3.3818751725335595e-06, + "loss": 0.4383, + "step": 4971 + }, + { + "epoch": 0.40380086087874606, + "grad_norm": 6.069976549417349, + "learning_rate": 3.3812597934003746e-06, + "loss": 0.6346, + "step": 4972 + }, + { + "epoch": 0.40388207585478764, + "grad_norm": 5.41297727878491, + "learning_rate": 3.3806443532886736e-06, + "loss": 0.4902, + "step": 4973 + }, + { + "epoch": 0.4039632908308292, + "grad_norm": 3.474757683944031, + "learning_rate": 3.3800288522410464e-06, + "loss": 0.6833, + "step": 4974 + }, + { + "epoch": 0.4040445058068708, + "grad_norm": 18.494129306251725, + "learning_rate": 3.3794132903000787e-06, + "loss": 0.4717, + "step": 4975 + }, + { + "epoch": 0.40412572078291237, + "grad_norm": 5.201629259993079, + "learning_rate": 3.3787976675083657e-06, + "loss": 0.4586, + "step": 4976 + }, + { + "epoch": 0.40420693575895394, + "grad_norm": 3.067579639195292, + "learning_rate": 3.3781819839085056e-06, + "loss": 0.6093, + "step": 4977 + }, + { + "epoch": 0.4042881507349955, + "grad_norm": 6.51098650482296, + "learning_rate": 3.3775662395431e-06, + "loss": 0.4642, + "step": 4978 + }, + { + "epoch": 0.4043693657110371, + "grad_norm": 3.2528809931282057, + "learning_rate": 3.376950434454754e-06, + "loss": 0.6634, + "step": 4979 + }, + { + "epoch": 0.4044505806870787, + "grad_norm": 6.255509398772904, + "learning_rate": 3.37633456868608e-06, + "loss": 0.5264, + "step": 4980 + }, + { + "epoch": 0.40453179566312025, + "grad_norm": 6.628156482525939, + "learning_rate": 3.3757186422796918e-06, + "loss": 0.4249, + "step": 4981 + }, + { + "epoch": 0.4046130106391619, + "grad_norm": 5.667448673111919, + "learning_rate": 3.3751026552782085e-06, + "loss": 0.5736, + "step": 4982 + }, + { + "epoch": 0.40469422561520346, + "grad_norm": 4.987055006193434, + "learning_rate": 3.3744866077242516e-06, + "loss": 0.585, + "step": 4983 + }, + { + "epoch": 0.40477544059124504, + "grad_norm": 5.82552976044451, + "learning_rate": 3.3738704996604505e-06, + "loss": 0.6341, + "step": 4984 + }, + { + "epoch": 0.4048566555672866, + "grad_norm": 4.093717103910503, + "learning_rate": 3.373254331129436e-06, + "loss": 0.4046, + "step": 4985 + }, + { + "epoch": 0.4049378705433282, + "grad_norm": 7.070638776876593, + "learning_rate": 3.3726381021738426e-06, + "loss": 0.4569, + "step": 4986 + }, + { + "epoch": 0.40501908551936977, + "grad_norm": 4.327438597298289, + "learning_rate": 3.372021812836311e-06, + "loss": 0.522, + "step": 4987 + }, + { + "epoch": 0.40510030049541135, + "grad_norm": 7.802424985063589, + "learning_rate": 3.371405463159486e-06, + "loss": 0.5055, + "step": 4988 + }, + { + "epoch": 0.4051815154714529, + "grad_norm": 5.2484985718748405, + "learning_rate": 3.3707890531860143e-06, + "loss": 0.5691, + "step": 4989 + }, + { + "epoch": 0.4052627304474945, + "grad_norm": 3.961595770162228, + "learning_rate": 3.3701725829585484e-06, + "loss": 0.5087, + "step": 4990 + }, + { + "epoch": 0.4053439454235361, + "grad_norm": 5.115368545445615, + "learning_rate": 3.369556052519746e-06, + "loss": 0.5014, + "step": 4991 + }, + { + "epoch": 0.40542516039957766, + "grad_norm": 9.264859588158524, + "learning_rate": 3.3689394619122654e-06, + "loss": 0.5217, + "step": 4992 + }, + { + "epoch": 0.4055063753756193, + "grad_norm": 5.809302524376221, + "learning_rate": 3.3683228111787738e-06, + "loss": 0.5209, + "step": 4993 + }, + { + "epoch": 0.40558759035166086, + "grad_norm": 6.347688148270995, + "learning_rate": 3.367706100361939e-06, + "loss": 0.5284, + "step": 4994 + }, + { + "epoch": 0.40566880532770244, + "grad_norm": 5.2075723248409185, + "learning_rate": 3.3670893295044344e-06, + "loss": 0.6141, + "step": 4995 + }, + { + "epoch": 0.405750020303744, + "grad_norm": 6.241751717955846, + "learning_rate": 3.3664724986489368e-06, + "loss": 0.539, + "step": 4996 + }, + { + "epoch": 0.4058312352797856, + "grad_norm": 5.913411596718994, + "learning_rate": 3.3658556078381283e-06, + "loss": 0.4779, + "step": 4997 + }, + { + "epoch": 0.4059124502558272, + "grad_norm": 5.987603839595508, + "learning_rate": 3.3652386571146945e-06, + "loss": 0.4415, + "step": 4998 + }, + { + "epoch": 0.40599366523186875, + "grad_norm": 5.342255084971644, + "learning_rate": 3.3646216465213245e-06, + "loss": 0.5426, + "step": 4999 + }, + { + "epoch": 0.4060748802079103, + "grad_norm": 5.881978781645484, + "learning_rate": 3.364004576100712e-06, + "loss": 0.5235, + "step": 5000 + }, + { + "epoch": 0.4061560951839519, + "grad_norm": 5.907061959419017, + "learning_rate": 3.3633874458955573e-06, + "loss": 0.5061, + "step": 5001 + }, + { + "epoch": 0.4062373101599935, + "grad_norm": 4.640005094027489, + "learning_rate": 3.362770255948559e-06, + "loss": 0.5892, + "step": 5002 + }, + { + "epoch": 0.40631852513603506, + "grad_norm": 5.450649196042516, + "learning_rate": 3.3621530063024257e-06, + "loss": 0.6451, + "step": 5003 + }, + { + "epoch": 0.4063997401120767, + "grad_norm": 12.30013823840977, + "learning_rate": 3.3615356969998676e-06, + "loss": 0.5278, + "step": 5004 + }, + { + "epoch": 0.40648095508811827, + "grad_norm": 3.2285712876853156, + "learning_rate": 3.360918328083598e-06, + "loss": 0.6247, + "step": 5005 + }, + { + "epoch": 0.40656217006415984, + "grad_norm": 4.8576215977033135, + "learning_rate": 3.3603008995963373e-06, + "loss": 0.527, + "step": 5006 + }, + { + "epoch": 0.4066433850402014, + "grad_norm": 6.983405839371138, + "learning_rate": 3.3596834115808074e-06, + "loss": 0.5264, + "step": 5007 + }, + { + "epoch": 0.406724600016243, + "grad_norm": 4.393414313571411, + "learning_rate": 3.3590658640797346e-06, + "loss": 0.6397, + "step": 5008 + }, + { + "epoch": 0.4068058149922846, + "grad_norm": 5.416974817120511, + "learning_rate": 3.3584482571358513e-06, + "loss": 0.4797, + "step": 5009 + }, + { + "epoch": 0.40688702996832615, + "grad_norm": 3.780442832500498, + "learning_rate": 3.357830590791891e-06, + "loss": 0.452, + "step": 5010 + }, + { + "epoch": 0.40696824494436773, + "grad_norm": 4.203958087565441, + "learning_rate": 3.3572128650905946e-06, + "loss": 0.4962, + "step": 5011 + }, + { + "epoch": 0.4070494599204093, + "grad_norm": 7.016929878457646, + "learning_rate": 3.3565950800747038e-06, + "loss": 0.5625, + "step": 5012 + }, + { + "epoch": 0.4071306748964509, + "grad_norm": 5.779316536373502, + "learning_rate": 3.355977235786968e-06, + "loss": 0.544, + "step": 5013 + }, + { + "epoch": 0.40721188987249246, + "grad_norm": 4.419872849655686, + "learning_rate": 3.3553593322701374e-06, + "loss": 0.5445, + "step": 5014 + }, + { + "epoch": 0.4072931048485341, + "grad_norm": 19.583565435812158, + "learning_rate": 3.3547413695669673e-06, + "loss": 0.4479, + "step": 5015 + }, + { + "epoch": 0.40737431982457567, + "grad_norm": 6.737000577753693, + "learning_rate": 3.3541233477202184e-06, + "loss": 0.6231, + "step": 5016 + }, + { + "epoch": 0.40745553480061725, + "grad_norm": 5.651619729539659, + "learning_rate": 3.3535052667726546e-06, + "loss": 0.4791, + "step": 5017 + }, + { + "epoch": 0.4075367497766588, + "grad_norm": 13.01736060108638, + "learning_rate": 3.352887126767043e-06, + "loss": 0.618, + "step": 5018 + }, + { + "epoch": 0.4076179647527004, + "grad_norm": 7.913220552107575, + "learning_rate": 3.352268927746156e-06, + "loss": 0.5014, + "step": 5019 + }, + { + "epoch": 0.407699179728742, + "grad_norm": 5.00487843260526, + "learning_rate": 3.3516506697527706e-06, + "loss": 0.7005, + "step": 5020 + }, + { + "epoch": 0.40778039470478356, + "grad_norm": 4.0699036925411045, + "learning_rate": 3.3510323528296656e-06, + "loss": 0.5042, + "step": 5021 + }, + { + "epoch": 0.40786160968082513, + "grad_norm": 3.9550635463784944, + "learning_rate": 3.3504139770196252e-06, + "loss": 0.5318, + "step": 5022 + }, + { + "epoch": 0.4079428246568667, + "grad_norm": 3.9690809902318605, + "learning_rate": 3.3497955423654395e-06, + "loss": 0.5501, + "step": 5023 + }, + { + "epoch": 0.4080240396329083, + "grad_norm": 5.238253417658213, + "learning_rate": 3.349177048909899e-06, + "loss": 0.4221, + "step": 5024 + }, + { + "epoch": 0.40810525460894986, + "grad_norm": 6.31641058902131, + "learning_rate": 3.3485584966958005e-06, + "loss": 0.5599, + "step": 5025 + }, + { + "epoch": 0.4081864695849915, + "grad_norm": 4.773613373392485, + "learning_rate": 3.3479398857659464e-06, + "loss": 0.672, + "step": 5026 + }, + { + "epoch": 0.4082676845610331, + "grad_norm": 6.145123089078632, + "learning_rate": 3.3473212161631385e-06, + "loss": 0.4964, + "step": 5027 + }, + { + "epoch": 0.40834889953707465, + "grad_norm": 4.502728069285821, + "learning_rate": 3.3467024879301873e-06, + "loss": 0.633, + "step": 5028 + }, + { + "epoch": 0.4084301145131162, + "grad_norm": 4.232356275831273, + "learning_rate": 3.346083701109905e-06, + "loss": 0.7541, + "step": 5029 + }, + { + "epoch": 0.4085113294891578, + "grad_norm": 6.5669331099437045, + "learning_rate": 3.3454648557451087e-06, + "loss": 0.6154, + "step": 5030 + }, + { + "epoch": 0.4085925444651994, + "grad_norm": 3.585238962477474, + "learning_rate": 3.3448459518786193e-06, + "loss": 0.563, + "step": 5031 + }, + { + "epoch": 0.40867375944124096, + "grad_norm": 7.113560531413456, + "learning_rate": 3.3442269895532604e-06, + "loss": 0.4665, + "step": 5032 + }, + { + "epoch": 0.40875497441728254, + "grad_norm": 4.184427215416845, + "learning_rate": 3.3436079688118618e-06, + "loss": 0.4255, + "step": 5033 + }, + { + "epoch": 0.4088361893933241, + "grad_norm": 3.690634979166768, + "learning_rate": 3.3429888896972575e-06, + "loss": 0.4262, + "step": 5034 + }, + { + "epoch": 0.4089174043693657, + "grad_norm": 3.7301420071620424, + "learning_rate": 3.3423697522522823e-06, + "loss": 0.5172, + "step": 5035 + }, + { + "epoch": 0.40899861934540727, + "grad_norm": 4.87720646565865, + "learning_rate": 3.3417505565197794e-06, + "loss": 0.6716, + "step": 5036 + }, + { + "epoch": 0.4090798343214489, + "grad_norm": 3.6748482560950615, + "learning_rate": 3.3411313025425927e-06, + "loss": 0.5472, + "step": 5037 + }, + { + "epoch": 0.4091610492974905, + "grad_norm": 5.7048262277305275, + "learning_rate": 3.340511990363571e-06, + "loss": 0.3746, + "step": 5038 + }, + { + "epoch": 0.40924226427353205, + "grad_norm": 5.363584407358412, + "learning_rate": 3.3398926200255684e-06, + "loss": 0.3669, + "step": 5039 + }, + { + "epoch": 0.40932347924957363, + "grad_norm": 4.185826585370702, + "learning_rate": 3.3392731915714417e-06, + "loss": 0.5765, + "step": 5040 + }, + { + "epoch": 0.4094046942256152, + "grad_norm": 5.9673921212077286, + "learning_rate": 3.338653705044051e-06, + "loss": 0.574, + "step": 5041 + }, + { + "epoch": 0.4094859092016568, + "grad_norm": 13.697669906869592, + "learning_rate": 3.3380341604862633e-06, + "loss": 0.4787, + "step": 5042 + }, + { + "epoch": 0.40956712417769836, + "grad_norm": 18.101167067110552, + "learning_rate": 3.3374145579409467e-06, + "loss": 0.5793, + "step": 5043 + }, + { + "epoch": 0.40964833915373994, + "grad_norm": 33.302750260887564, + "learning_rate": 3.3367948974509743e-06, + "loss": 0.4709, + "step": 5044 + }, + { + "epoch": 0.4097295541297815, + "grad_norm": 5.565821515126126, + "learning_rate": 3.336175179059224e-06, + "loss": 0.4976, + "step": 5045 + }, + { + "epoch": 0.4098107691058231, + "grad_norm": 4.92473685068305, + "learning_rate": 3.335555402808577e-06, + "loss": 0.5237, + "step": 5046 + }, + { + "epoch": 0.40989198408186467, + "grad_norm": 3.6230723915327343, + "learning_rate": 3.334935568741918e-06, + "loss": 0.505, + "step": 5047 + }, + { + "epoch": 0.4099731990579063, + "grad_norm": 4.073098074480497, + "learning_rate": 3.3343156769021355e-06, + "loss": 0.5275, + "step": 5048 + }, + { + "epoch": 0.4100544140339479, + "grad_norm": 3.908868759278294, + "learning_rate": 3.333695727332125e-06, + "loss": 0.635, + "step": 5049 + }, + { + "epoch": 0.41013562900998946, + "grad_norm": 4.898135196202766, + "learning_rate": 3.3330757200747828e-06, + "loss": 0.4958, + "step": 5050 + }, + { + "epoch": 0.41021684398603103, + "grad_norm": 11.463379753224626, + "learning_rate": 3.332455655173008e-06, + "loss": 0.4647, + "step": 5051 + }, + { + "epoch": 0.4102980589620726, + "grad_norm": 4.243407156064371, + "learning_rate": 3.3318355326697093e-06, + "loss": 0.6904, + "step": 5052 + }, + { + "epoch": 0.4103792739381142, + "grad_norm": 3.5903815519776585, + "learning_rate": 3.3312153526077933e-06, + "loss": 0.5248, + "step": 5053 + }, + { + "epoch": 0.41046048891415576, + "grad_norm": 5.5607811919006345, + "learning_rate": 3.330595115030174e-06, + "loss": 0.4837, + "step": 5054 + }, + { + "epoch": 0.41054170389019734, + "grad_norm": 4.396444235590735, + "learning_rate": 3.3299748199797686e-06, + "loss": 0.4331, + "step": 5055 + }, + { + "epoch": 0.4106229188662389, + "grad_norm": 5.6326755354518845, + "learning_rate": 3.3293544674994987e-06, + "loss": 0.5109, + "step": 5056 + }, + { + "epoch": 0.4107041338422805, + "grad_norm": 6.458559629689074, + "learning_rate": 3.328734057632289e-06, + "loss": 0.5488, + "step": 5057 + }, + { + "epoch": 0.41078534881832207, + "grad_norm": 6.259903972340533, + "learning_rate": 3.328113590421068e-06, + "loss": 0.787, + "step": 5058 + }, + { + "epoch": 0.4108665637943637, + "grad_norm": 5.031847327165066, + "learning_rate": 3.3274930659087694e-06, + "loss": 0.6045, + "step": 5059 + }, + { + "epoch": 0.4109477787704053, + "grad_norm": 12.757281627069613, + "learning_rate": 3.3268724841383302e-06, + "loss": 0.5007, + "step": 5060 + }, + { + "epoch": 0.41102899374644686, + "grad_norm": 8.972711433026783, + "learning_rate": 3.3262518451526916e-06, + "loss": 0.6009, + "step": 5061 + }, + { + "epoch": 0.41111020872248843, + "grad_norm": 3.924314034573167, + "learning_rate": 3.3256311489947973e-06, + "loss": 0.4718, + "step": 5062 + }, + { + "epoch": 0.41119142369853, + "grad_norm": 3.8326582602133796, + "learning_rate": 3.3250103957075987e-06, + "loss": 0.6721, + "step": 5063 + }, + { + "epoch": 0.4112726386745716, + "grad_norm": 5.858057317848575, + "learning_rate": 3.3243895853340445e-06, + "loss": 0.3982, + "step": 5064 + }, + { + "epoch": 0.41135385365061317, + "grad_norm": 4.79530936974041, + "learning_rate": 3.323768717917096e-06, + "loss": 0.6906, + "step": 5065 + }, + { + "epoch": 0.41143506862665474, + "grad_norm": 3.761887070675124, + "learning_rate": 3.323147793499712e-06, + "loss": 0.5835, + "step": 5066 + }, + { + "epoch": 0.4115162836026963, + "grad_norm": 4.8103574968475735, + "learning_rate": 3.3225268121248567e-06, + "loss": 0.5361, + "step": 5067 + }, + { + "epoch": 0.4115974985787379, + "grad_norm": 3.859720909374451, + "learning_rate": 3.321905773835498e-06, + "loss": 0.6076, + "step": 5068 + }, + { + "epoch": 0.4116787135547795, + "grad_norm": 3.758573255947487, + "learning_rate": 3.3212846786746113e-06, + "loss": 0.4216, + "step": 5069 + }, + { + "epoch": 0.4117599285308211, + "grad_norm": 7.220042667589109, + "learning_rate": 3.3206635266851707e-06, + "loss": 0.5195, + "step": 5070 + }, + { + "epoch": 0.4118411435068627, + "grad_norm": 5.446095850430871, + "learning_rate": 3.320042317910157e-06, + "loss": 0.5241, + "step": 5071 + }, + { + "epoch": 0.41192235848290426, + "grad_norm": 4.589008245862078, + "learning_rate": 3.319421052392556e-06, + "loss": 0.6736, + "step": 5072 + }, + { + "epoch": 0.41200357345894584, + "grad_norm": 9.586999090184612, + "learning_rate": 3.318799730175354e-06, + "loss": 0.5927, + "step": 5073 + }, + { + "epoch": 0.4120847884349874, + "grad_norm": 3.790231648461632, + "learning_rate": 3.3181783513015443e-06, + "loss": 0.5262, + "step": 5074 + }, + { + "epoch": 0.412166003411029, + "grad_norm": 7.8339239381427435, + "learning_rate": 3.317556915814123e-06, + "loss": 0.5578, + "step": 5075 + }, + { + "epoch": 0.41224721838707057, + "grad_norm": 4.189621669708876, + "learning_rate": 3.31693542375609e-06, + "loss": 0.4758, + "step": 5076 + }, + { + "epoch": 0.41232843336311215, + "grad_norm": 3.23569345024193, + "learning_rate": 3.316313875170449e-06, + "loss": 0.5989, + "step": 5077 + }, + { + "epoch": 0.4124096483391537, + "grad_norm": 5.092573629488183, + "learning_rate": 3.3156922701002082e-06, + "loss": 0.4451, + "step": 5078 + }, + { + "epoch": 0.4124908633151953, + "grad_norm": 3.7634789724489774, + "learning_rate": 3.3150706085883795e-06, + "loss": 0.6206, + "step": 5079 + }, + { + "epoch": 0.4125720782912369, + "grad_norm": 9.494128622068938, + "learning_rate": 3.3144488906779775e-06, + "loss": 0.5531, + "step": 5080 + }, + { + "epoch": 0.4126532932672785, + "grad_norm": 5.01807369882204, + "learning_rate": 3.3138271164120235e-06, + "loss": 0.4668, + "step": 5081 + }, + { + "epoch": 0.4127345082433201, + "grad_norm": 4.224562759189403, + "learning_rate": 3.3132052858335405e-06, + "loss": 0.471, + "step": 5082 + }, + { + "epoch": 0.41281572321936166, + "grad_norm": 4.452581770491441, + "learning_rate": 3.312583398985555e-06, + "loss": 0.478, + "step": 5083 + }, + { + "epoch": 0.41289693819540324, + "grad_norm": 6.547729750220411, + "learning_rate": 3.3119614559110986e-06, + "loss": 0.7315, + "step": 5084 + }, + { + "epoch": 0.4129781531714448, + "grad_norm": 3.7189334139526307, + "learning_rate": 3.3113394566532076e-06, + "loss": 0.4778, + "step": 5085 + }, + { + "epoch": 0.4130593681474864, + "grad_norm": 5.682620271914815, + "learning_rate": 3.310717401254919e-06, + "loss": 0.6956, + "step": 5086 + }, + { + "epoch": 0.41314058312352797, + "grad_norm": 5.539361693130007, + "learning_rate": 3.3100952897592774e-06, + "loss": 0.4507, + "step": 5087 + }, + { + "epoch": 0.41322179809956955, + "grad_norm": 5.822291658506241, + "learning_rate": 3.3094731222093297e-06, + "loss": 0.5914, + "step": 5088 + }, + { + "epoch": 0.4133030130756111, + "grad_norm": 4.73471914454109, + "learning_rate": 3.3088508986481256e-06, + "loss": 0.4881, + "step": 5089 + }, + { + "epoch": 0.4133842280516527, + "grad_norm": 6.22074010943859, + "learning_rate": 3.30822861911872e-06, + "loss": 0.603, + "step": 5090 + }, + { + "epoch": 0.4134654430276943, + "grad_norm": 6.1051785076416385, + "learning_rate": 3.3076062836641716e-06, + "loss": 0.43, + "step": 5091 + }, + { + "epoch": 0.4135466580037359, + "grad_norm": 4.391606365906377, + "learning_rate": 3.306983892327542e-06, + "loss": 0.4868, + "step": 5092 + }, + { + "epoch": 0.4136278729797775, + "grad_norm": 3.796872062788742, + "learning_rate": 3.306361445151899e-06, + "loss": 0.5556, + "step": 5093 + }, + { + "epoch": 0.41370908795581907, + "grad_norm": 5.3352051929174165, + "learning_rate": 3.3057389421803104e-06, + "loss": 0.5305, + "step": 5094 + }, + { + "epoch": 0.41379030293186064, + "grad_norm": 5.507108563864587, + "learning_rate": 3.305116383455852e-06, + "loss": 0.8252, + "step": 5095 + }, + { + "epoch": 0.4138715179079022, + "grad_norm": 3.239020837952215, + "learning_rate": 3.304493769021601e-06, + "loss": 0.5663, + "step": 5096 + }, + { + "epoch": 0.4139527328839438, + "grad_norm": 5.771163705049402, + "learning_rate": 3.3038710989206386e-06, + "loss": 0.5291, + "step": 5097 + }, + { + "epoch": 0.4140339478599854, + "grad_norm": 7.098970415020295, + "learning_rate": 3.303248373196051e-06, + "loss": 0.5858, + "step": 5098 + }, + { + "epoch": 0.41411516283602695, + "grad_norm": 6.2763287339806535, + "learning_rate": 3.3026255918909267e-06, + "loss": 0.4381, + "step": 5099 + }, + { + "epoch": 0.41419637781206853, + "grad_norm": 3.6575950544234708, + "learning_rate": 3.302002755048359e-06, + "loss": 0.7504, + "step": 5100 + }, + { + "epoch": 0.4142775927881101, + "grad_norm": 4.90381745333394, + "learning_rate": 3.3013798627114457e-06, + "loss": 0.3803, + "step": 5101 + }, + { + "epoch": 0.4143588077641517, + "grad_norm": 4.885461667100054, + "learning_rate": 3.300756914923287e-06, + "loss": 0.4982, + "step": 5102 + }, + { + "epoch": 0.4144400227401933, + "grad_norm": 4.517776876693077, + "learning_rate": 3.3001339117269883e-06, + "loss": 0.6321, + "step": 5103 + }, + { + "epoch": 0.4145212377162349, + "grad_norm": 4.594992279285769, + "learning_rate": 3.2995108531656566e-06, + "loss": 0.4652, + "step": 5104 + }, + { + "epoch": 0.41460245269227647, + "grad_norm": 4.353302582897617, + "learning_rate": 3.298887739282406e-06, + "loss": 0.5871, + "step": 5105 + }, + { + "epoch": 0.41468366766831805, + "grad_norm": 8.458802751943896, + "learning_rate": 3.298264570120351e-06, + "loss": 0.7833, + "step": 5106 + }, + { + "epoch": 0.4147648826443596, + "grad_norm": 3.170384706895752, + "learning_rate": 3.297641345722613e-06, + "loss": 0.5037, + "step": 5107 + }, + { + "epoch": 0.4148460976204012, + "grad_norm": 2.9676348563178094, + "learning_rate": 3.2970180661323155e-06, + "loss": 0.6279, + "step": 5108 + }, + { + "epoch": 0.4149273125964428, + "grad_norm": 4.616218599976382, + "learning_rate": 3.2963947313925857e-06, + "loss": 0.559, + "step": 5109 + }, + { + "epoch": 0.41500852757248435, + "grad_norm": 6.559911673941787, + "learning_rate": 3.295771341546555e-06, + "loss": 0.5066, + "step": 5110 + }, + { + "epoch": 0.41508974254852593, + "grad_norm": 4.702456652104927, + "learning_rate": 3.2951478966373602e-06, + "loss": 0.4724, + "step": 5111 + }, + { + "epoch": 0.4151709575245675, + "grad_norm": 6.625773908660462, + "learning_rate": 3.2945243967081386e-06, + "loss": 0.5304, + "step": 5112 + }, + { + "epoch": 0.4152521725006091, + "grad_norm": 4.794677100016037, + "learning_rate": 3.2939008418020334e-06, + "loss": 0.5455, + "step": 5113 + }, + { + "epoch": 0.4153333874766507, + "grad_norm": 4.228847169637584, + "learning_rate": 3.293277231962192e-06, + "loss": 0.4838, + "step": 5114 + }, + { + "epoch": 0.4154146024526923, + "grad_norm": 5.780889261663269, + "learning_rate": 3.292653567231765e-06, + "loss": 0.5851, + "step": 5115 + }, + { + "epoch": 0.41549581742873387, + "grad_norm": 3.5767019478443514, + "learning_rate": 3.2920298476539047e-06, + "loss": 0.4035, + "step": 5116 + }, + { + "epoch": 0.41557703240477545, + "grad_norm": 3.173589475313961, + "learning_rate": 3.2914060732717725e-06, + "loss": 0.4834, + "step": 5117 + }, + { + "epoch": 0.415658247380817, + "grad_norm": 5.702832664803554, + "learning_rate": 3.290782244128527e-06, + "loss": 0.4122, + "step": 5118 + }, + { + "epoch": 0.4157394623568586, + "grad_norm": 4.350245123381758, + "learning_rate": 3.290158360267336e-06, + "loss": 0.7049, + "step": 5119 + }, + { + "epoch": 0.4158206773329002, + "grad_norm": 3.6272992204161887, + "learning_rate": 3.2895344217313683e-06, + "loss": 0.5299, + "step": 5120 + }, + { + "epoch": 0.41590189230894176, + "grad_norm": 3.9213077791760966, + "learning_rate": 3.2889104285637967e-06, + "loss": 0.5795, + "step": 5121 + }, + { + "epoch": 0.41598310728498333, + "grad_norm": 4.304699561537021, + "learning_rate": 3.2882863808077993e-06, + "loss": 0.5066, + "step": 5122 + }, + { + "epoch": 0.4160643222610249, + "grad_norm": 13.4372391747002, + "learning_rate": 3.287662278506556e-06, + "loss": 0.4601, + "step": 5123 + }, + { + "epoch": 0.4161455372370665, + "grad_norm": 8.076930536538704, + "learning_rate": 3.2870381217032522e-06, + "loss": 0.4771, + "step": 5124 + }, + { + "epoch": 0.4162267522131081, + "grad_norm": 5.712865061327913, + "learning_rate": 3.2864139104410753e-06, + "loss": 0.3819, + "step": 5125 + }, + { + "epoch": 0.4163079671891497, + "grad_norm": 8.244135425876427, + "learning_rate": 3.2857896447632174e-06, + "loss": 0.5816, + "step": 5126 + }, + { + "epoch": 0.4163891821651913, + "grad_norm": 3.9304755927131785, + "learning_rate": 3.2851653247128755e-06, + "loss": 0.5813, + "step": 5127 + }, + { + "epoch": 0.41647039714123285, + "grad_norm": 3.476794981739987, + "learning_rate": 3.2845409503332488e-06, + "loss": 0.7026, + "step": 5128 + }, + { + "epoch": 0.41655161211727443, + "grad_norm": 6.988919037400744, + "learning_rate": 3.2839165216675396e-06, + "loss": 0.3883, + "step": 5129 + }, + { + "epoch": 0.416632827093316, + "grad_norm": 9.043393482730245, + "learning_rate": 3.283292038758956e-06, + "loss": 0.6782, + "step": 5130 + }, + { + "epoch": 0.4167140420693576, + "grad_norm": 9.047180698274353, + "learning_rate": 3.2826675016507094e-06, + "loss": 0.3717, + "step": 5131 + }, + { + "epoch": 0.41679525704539916, + "grad_norm": 4.352324808412342, + "learning_rate": 3.2820429103860133e-06, + "loss": 0.5285, + "step": 5132 + }, + { + "epoch": 0.41687647202144074, + "grad_norm": 4.3428722591294155, + "learning_rate": 3.281418265008087e-06, + "loss": 0.571, + "step": 5133 + }, + { + "epoch": 0.4169576869974823, + "grad_norm": 4.529875570698177, + "learning_rate": 3.280793565560153e-06, + "loss": 0.6332, + "step": 5134 + }, + { + "epoch": 0.41703890197352395, + "grad_norm": 10.119978089195198, + "learning_rate": 3.280168812085436e-06, + "loss": 0.4528, + "step": 5135 + }, + { + "epoch": 0.4171201169495655, + "grad_norm": 3.973556997891654, + "learning_rate": 3.279544004627166e-06, + "loss": 0.6027, + "step": 5136 + }, + { + "epoch": 0.4172013319256071, + "grad_norm": 3.7856125466570814, + "learning_rate": 3.2789191432285767e-06, + "loss": 0.5312, + "step": 5137 + }, + { + "epoch": 0.4172825469016487, + "grad_norm": 3.897046711276253, + "learning_rate": 3.278294227932905e-06, + "loss": 0.4928, + "step": 5138 + }, + { + "epoch": 0.41736376187769025, + "grad_norm": 3.9807540904239715, + "learning_rate": 3.277669258783391e-06, + "loss": 0.4981, + "step": 5139 + }, + { + "epoch": 0.41744497685373183, + "grad_norm": 4.193176888500685, + "learning_rate": 3.277044235823281e-06, + "loss": 0.5247, + "step": 5140 + }, + { + "epoch": 0.4175261918297734, + "grad_norm": 4.102295880640859, + "learning_rate": 3.2764191590958234e-06, + "loss": 0.5928, + "step": 5141 + }, + { + "epoch": 0.417607406805815, + "grad_norm": 4.4166185861935, + "learning_rate": 3.2757940286442676e-06, + "loss": 0.4061, + "step": 5142 + }, + { + "epoch": 0.41768862178185656, + "grad_norm": 6.205963915441601, + "learning_rate": 3.2751688445118705e-06, + "loss": 0.5939, + "step": 5143 + }, + { + "epoch": 0.41776983675789814, + "grad_norm": 9.375588558168431, + "learning_rate": 3.2745436067418934e-06, + "loss": 0.5445, + "step": 5144 + }, + { + "epoch": 0.4178510517339397, + "grad_norm": 7.231124759113749, + "learning_rate": 3.2739183153775964e-06, + "loss": 0.3809, + "step": 5145 + }, + { + "epoch": 0.41793226670998135, + "grad_norm": 5.36109001138212, + "learning_rate": 3.2732929704622485e-06, + "loss": 0.4421, + "step": 5146 + }, + { + "epoch": 0.4180134816860229, + "grad_norm": 5.1614659053586545, + "learning_rate": 3.2726675720391203e-06, + "loss": 0.4925, + "step": 5147 + }, + { + "epoch": 0.4180946966620645, + "grad_norm": 5.095668465921769, + "learning_rate": 3.272042120151485e-06, + "loss": 0.7233, + "step": 5148 + }, + { + "epoch": 0.4181759116381061, + "grad_norm": 6.655912818912801, + "learning_rate": 3.2714166148426204e-06, + "loss": 0.4635, + "step": 5149 + }, + { + "epoch": 0.41825712661414766, + "grad_norm": 5.621140760979749, + "learning_rate": 3.27079105615581e-06, + "loss": 0.5281, + "step": 5150 + }, + { + "epoch": 0.41833834159018923, + "grad_norm": 5.0469957881501255, + "learning_rate": 3.2701654441343365e-06, + "loss": 0.5175, + "step": 5151 + }, + { + "epoch": 0.4184195565662308, + "grad_norm": 4.710792152616251, + "learning_rate": 3.269539778821491e-06, + "loss": 0.4824, + "step": 5152 + }, + { + "epoch": 0.4185007715422724, + "grad_norm": 5.248614874618813, + "learning_rate": 3.268914060260565e-06, + "loss": 0.6031, + "step": 5153 + }, + { + "epoch": 0.41858198651831396, + "grad_norm": 3.7091517078378096, + "learning_rate": 3.2682882884948557e-06, + "loss": 0.445, + "step": 5154 + }, + { + "epoch": 0.41866320149435554, + "grad_norm": 7.473085780555477, + "learning_rate": 3.2676624635676637e-06, + "loss": 0.4225, + "step": 5155 + }, + { + "epoch": 0.4187444164703971, + "grad_norm": 5.984937157684743, + "learning_rate": 3.267036585522291e-06, + "loss": 0.3815, + "step": 5156 + }, + { + "epoch": 0.41882563144643875, + "grad_norm": 7.910654251052096, + "learning_rate": 3.2664106544020464e-06, + "loss": 0.4595, + "step": 5157 + }, + { + "epoch": 0.41890684642248033, + "grad_norm": 5.447690662144098, + "learning_rate": 3.2657846702502404e-06, + "loss": 0.5105, + "step": 5158 + }, + { + "epoch": 0.4189880613985219, + "grad_norm": 6.61183045497076, + "learning_rate": 3.2651586331101887e-06, + "loss": 0.4271, + "step": 5159 + }, + { + "epoch": 0.4190692763745635, + "grad_norm": 6.227375988300761, + "learning_rate": 3.2645325430252096e-06, + "loss": 0.5066, + "step": 5160 + }, + { + "epoch": 0.41915049135060506, + "grad_norm": 3.5873779321139714, + "learning_rate": 3.2639064000386236e-06, + "loss": 0.5637, + "step": 5161 + }, + { + "epoch": 0.41923170632664664, + "grad_norm": 3.2262674778585376, + "learning_rate": 3.2632802041937574e-06, + "loss": 0.4754, + "step": 5162 + }, + { + "epoch": 0.4193129213026882, + "grad_norm": 4.976278018847007, + "learning_rate": 3.262653955533942e-06, + "loss": 0.4554, + "step": 5163 + }, + { + "epoch": 0.4193941362787298, + "grad_norm": 7.7409127238618245, + "learning_rate": 3.262027654102508e-06, + "loss": 0.5582, + "step": 5164 + }, + { + "epoch": 0.41947535125477137, + "grad_norm": 5.01210383268172, + "learning_rate": 3.2614012999427934e-06, + "loss": 0.4712, + "step": 5165 + }, + { + "epoch": 0.41955656623081294, + "grad_norm": 4.527219081383319, + "learning_rate": 3.26077489309814e-06, + "loss": 0.7056, + "step": 5166 + }, + { + "epoch": 0.4196377812068545, + "grad_norm": 4.790861705461241, + "learning_rate": 3.2601484336118887e-06, + "loss": 0.6324, + "step": 5167 + }, + { + "epoch": 0.41971899618289615, + "grad_norm": 5.6189045494809875, + "learning_rate": 3.2595219215273895e-06, + "loss": 0.4605, + "step": 5168 + }, + { + "epoch": 0.41980021115893773, + "grad_norm": 3.3926840995979206, + "learning_rate": 3.258895356887993e-06, + "loss": 0.4653, + "step": 5169 + }, + { + "epoch": 0.4198814261349793, + "grad_norm": 4.7133012677803405, + "learning_rate": 3.2582687397370538e-06, + "loss": 0.3422, + "step": 5170 + }, + { + "epoch": 0.4199626411110209, + "grad_norm": 6.7508413558662355, + "learning_rate": 3.257642070117931e-06, + "loss": 0.4689, + "step": 5171 + }, + { + "epoch": 0.42004385608706246, + "grad_norm": 7.457776695287912, + "learning_rate": 3.2570153480739867e-06, + "loss": 0.5847, + "step": 5172 + }, + { + "epoch": 0.42012507106310404, + "grad_norm": 5.26972034775471, + "learning_rate": 3.2563885736485873e-06, + "loss": 0.5858, + "step": 5173 + }, + { + "epoch": 0.4202062860391456, + "grad_norm": 2.8327813340648325, + "learning_rate": 3.255761746885101e-06, + "loss": 0.437, + "step": 5174 + }, + { + "epoch": 0.4202875010151872, + "grad_norm": 4.515606181218866, + "learning_rate": 3.2551348678269023e-06, + "loss": 0.6866, + "step": 5175 + }, + { + "epoch": 0.42036871599122877, + "grad_norm": 4.52820690426378, + "learning_rate": 3.2545079365173672e-06, + "loss": 0.4457, + "step": 5176 + }, + { + "epoch": 0.42044993096727035, + "grad_norm": 5.8094460720614505, + "learning_rate": 3.253880952999876e-06, + "loss": 0.501, + "step": 5177 + }, + { + "epoch": 0.4205311459433119, + "grad_norm": 9.872169395018048, + "learning_rate": 3.2532539173178125e-06, + "loss": 0.6308, + "step": 5178 + }, + { + "epoch": 0.42061236091935356, + "grad_norm": 8.660617050371519, + "learning_rate": 3.2526268295145647e-06, + "loss": 0.5709, + "step": 5179 + }, + { + "epoch": 0.42069357589539513, + "grad_norm": 4.736443287957975, + "learning_rate": 3.251999689633523e-06, + "loss": 0.4353, + "step": 5180 + }, + { + "epoch": 0.4207747908714367, + "grad_norm": 10.700216416621961, + "learning_rate": 3.2513724977180828e-06, + "loss": 0.623, + "step": 5181 + }, + { + "epoch": 0.4208560058474783, + "grad_norm": 11.331063237198684, + "learning_rate": 3.250745253811643e-06, + "loss": 0.4766, + "step": 5182 + }, + { + "epoch": 0.42093722082351986, + "grad_norm": 4.634568286716608, + "learning_rate": 3.250117957957604e-06, + "loss": 0.5879, + "step": 5183 + }, + { + "epoch": 0.42101843579956144, + "grad_norm": 4.031580348842282, + "learning_rate": 3.249490610199373e-06, + "loss": 0.4788, + "step": 5184 + }, + { + "epoch": 0.421099650775603, + "grad_norm": 5.071428970099418, + "learning_rate": 3.248863210580358e-06, + "loss": 0.4319, + "step": 5185 + }, + { + "epoch": 0.4211808657516446, + "grad_norm": 4.271403990802303, + "learning_rate": 3.248235759143972e-06, + "loss": 0.4285, + "step": 5186 + }, + { + "epoch": 0.4212620807276862, + "grad_norm": 5.620600198973997, + "learning_rate": 3.247608255933632e-06, + "loss": 0.5508, + "step": 5187 + }, + { + "epoch": 0.42134329570372775, + "grad_norm": 3.5656379095963366, + "learning_rate": 3.2469807009927568e-06, + "loss": 0.5465, + "step": 5188 + }, + { + "epoch": 0.4214245106797693, + "grad_norm": 4.267563683154685, + "learning_rate": 3.2463530943647708e-06, + "loss": 0.6901, + "step": 5189 + }, + { + "epoch": 0.42150572565581096, + "grad_norm": 6.968445424375631, + "learning_rate": 3.2457254360931013e-06, + "loss": 0.6524, + "step": 5190 + }, + { + "epoch": 0.42158694063185254, + "grad_norm": 6.404940362152012, + "learning_rate": 3.245097726221177e-06, + "loss": 0.4577, + "step": 5191 + }, + { + "epoch": 0.4216681556078941, + "grad_norm": 10.422891227755217, + "learning_rate": 3.244469964792434e-06, + "loss": 0.487, + "step": 5192 + }, + { + "epoch": 0.4217493705839357, + "grad_norm": 4.235778167208473, + "learning_rate": 3.24384215185031e-06, + "loss": 0.5221, + "step": 5193 + }, + { + "epoch": 0.42183058555997727, + "grad_norm": 5.314122103787476, + "learning_rate": 3.2432142874382442e-06, + "loss": 0.5772, + "step": 5194 + }, + { + "epoch": 0.42191180053601884, + "grad_norm": 34.115592182605965, + "learning_rate": 3.2425863715996852e-06, + "loss": 0.6579, + "step": 5195 + }, + { + "epoch": 0.4219930155120604, + "grad_norm": 5.189198561105788, + "learning_rate": 3.241958404378078e-06, + "loss": 0.5213, + "step": 5196 + }, + { + "epoch": 0.422074230488102, + "grad_norm": 4.933122374718706, + "learning_rate": 3.2413303858168767e-06, + "loss": 0.6707, + "step": 5197 + }, + { + "epoch": 0.4221554454641436, + "grad_norm": 4.33962971377441, + "learning_rate": 3.2407023159595356e-06, + "loss": 0.4882, + "step": 5198 + }, + { + "epoch": 0.42223666044018515, + "grad_norm": 6.399904760111607, + "learning_rate": 3.2400741948495146e-06, + "loss": 0.383, + "step": 5199 + }, + { + "epoch": 0.42231787541622673, + "grad_norm": 11.60358668566535, + "learning_rate": 3.239446022530276e-06, + "loss": 0.6851, + "step": 5200 + }, + { + "epoch": 0.42239909039226836, + "grad_norm": 10.376518506368765, + "learning_rate": 3.2388177990452863e-06, + "loss": 0.5153, + "step": 5201 + }, + { + "epoch": 0.42248030536830994, + "grad_norm": 5.087706195580212, + "learning_rate": 3.2381895244380146e-06, + "loss": 0.4698, + "step": 5202 + }, + { + "epoch": 0.4225615203443515, + "grad_norm": 3.447914836981935, + "learning_rate": 3.237561198751935e-06, + "loss": 0.5198, + "step": 5203 + }, + { + "epoch": 0.4226427353203931, + "grad_norm": 7.767514821519397, + "learning_rate": 3.2369328220305242e-06, + "loss": 0.5854, + "step": 5204 + }, + { + "epoch": 0.42272395029643467, + "grad_norm": 7.5527718662260295, + "learning_rate": 3.2363043943172616e-06, + "loss": 0.4495, + "step": 5205 + }, + { + "epoch": 0.42280516527247625, + "grad_norm": 6.408787873539771, + "learning_rate": 3.235675915655633e-06, + "loss": 0.4103, + "step": 5206 + }, + { + "epoch": 0.4228863802485178, + "grad_norm": 8.10134326274207, + "learning_rate": 3.235047386089123e-06, + "loss": 0.4829, + "step": 5207 + }, + { + "epoch": 0.4229675952245594, + "grad_norm": 4.1969050205146425, + "learning_rate": 3.2344188056612247e-06, + "loss": 0.5541, + "step": 5208 + }, + { + "epoch": 0.423048810200601, + "grad_norm": 4.954025585754433, + "learning_rate": 3.233790174415432e-06, + "loss": 0.4104, + "step": 5209 + }, + { + "epoch": 0.42313002517664255, + "grad_norm": 5.191981721617316, + "learning_rate": 3.2331614923952424e-06, + "loss": 0.6211, + "step": 5210 + }, + { + "epoch": 0.42321124015268413, + "grad_norm": 10.46456809201404, + "learning_rate": 3.232532759644158e-06, + "loss": 0.5638, + "step": 5211 + }, + { + "epoch": 0.42329245512872576, + "grad_norm": 4.916424280031669, + "learning_rate": 3.231903976205684e-06, + "loss": 0.4139, + "step": 5212 + }, + { + "epoch": 0.42337367010476734, + "grad_norm": 4.247004575276243, + "learning_rate": 3.231275142123328e-06, + "loss": 0.3899, + "step": 5213 + }, + { + "epoch": 0.4234548850808089, + "grad_norm": 4.899674792955789, + "learning_rate": 3.2306462574406024e-06, + "loss": 0.697, + "step": 5214 + }, + { + "epoch": 0.4235361000568505, + "grad_norm": 4.934275124900704, + "learning_rate": 3.2300173222010225e-06, + "loss": 0.6492, + "step": 5215 + }, + { + "epoch": 0.42361731503289207, + "grad_norm": 4.326952640941383, + "learning_rate": 3.229388336448107e-06, + "loss": 0.5192, + "step": 5216 + }, + { + "epoch": 0.42369853000893365, + "grad_norm": 6.478214535298061, + "learning_rate": 3.22875930022538e-06, + "loss": 0.5203, + "step": 5217 + }, + { + "epoch": 0.4237797449849752, + "grad_norm": 3.7343533496893393, + "learning_rate": 3.2281302135763655e-06, + "loss": 0.5004, + "step": 5218 + }, + { + "epoch": 0.4238609599610168, + "grad_norm": 4.189623891419984, + "learning_rate": 3.227501076544594e-06, + "loss": 0.5008, + "step": 5219 + }, + { + "epoch": 0.4239421749370584, + "grad_norm": 9.421806026431668, + "learning_rate": 3.2268718891735985e-06, + "loss": 0.4428, + "step": 5220 + }, + { + "epoch": 0.42402338991309996, + "grad_norm": 8.411484589237242, + "learning_rate": 3.2262426515069144e-06, + "loss": 0.5658, + "step": 5221 + }, + { + "epoch": 0.42410460488914153, + "grad_norm": 5.809866223556941, + "learning_rate": 3.225613363588084e-06, + "loss": 0.5688, + "step": 5222 + }, + { + "epoch": 0.42418581986518317, + "grad_norm": 4.3592044879636465, + "learning_rate": 3.2249840254606474e-06, + "loss": 0.5421, + "step": 5223 + }, + { + "epoch": 0.42426703484122474, + "grad_norm": 2.8417643428931147, + "learning_rate": 3.2243546371681535e-06, + "loss": 0.5438, + "step": 5224 + }, + { + "epoch": 0.4243482498172663, + "grad_norm": 4.321132502798912, + "learning_rate": 3.2237251987541535e-06, + "loss": 0.669, + "step": 5225 + }, + { + "epoch": 0.4244294647933079, + "grad_norm": 17.616950980213982, + "learning_rate": 3.223095710262199e-06, + "loss": 0.5083, + "step": 5226 + }, + { + "epoch": 0.4245106797693495, + "grad_norm": 6.679913112655923, + "learning_rate": 3.2224661717358474e-06, + "loss": 0.5919, + "step": 5227 + }, + { + "epoch": 0.42459189474539105, + "grad_norm": 4.47571261524808, + "learning_rate": 3.221836583218662e-06, + "loss": 0.376, + "step": 5228 + }, + { + "epoch": 0.42467310972143263, + "grad_norm": 8.501783709365114, + "learning_rate": 3.221206944754205e-06, + "loss": 0.4275, + "step": 5229 + }, + { + "epoch": 0.4247543246974742, + "grad_norm": 4.892245082709042, + "learning_rate": 3.220577256386043e-06, + "loss": 0.6864, + "step": 5230 + }, + { + "epoch": 0.4248355396735158, + "grad_norm": 5.054086864480611, + "learning_rate": 3.21994751815775e-06, + "loss": 0.5241, + "step": 5231 + }, + { + "epoch": 0.42491675464955736, + "grad_norm": 3.2768063978983095, + "learning_rate": 3.2193177301128985e-06, + "loss": 0.5298, + "step": 5232 + }, + { + "epoch": 0.42499796962559894, + "grad_norm": 4.649050909544704, + "learning_rate": 3.2186878922950672e-06, + "loss": 0.6465, + "step": 5233 + }, + { + "epoch": 0.42507918460164057, + "grad_norm": 4.194611280947634, + "learning_rate": 3.218058004747837e-06, + "loss": 0.5813, + "step": 5234 + }, + { + "epoch": 0.42516039957768215, + "grad_norm": 12.035010001320948, + "learning_rate": 3.2174280675147933e-06, + "loss": 0.444, + "step": 5235 + }, + { + "epoch": 0.4252416145537237, + "grad_norm": 3.9279173401772285, + "learning_rate": 3.2167980806395244e-06, + "loss": 0.4574, + "step": 5236 + }, + { + "epoch": 0.4253228295297653, + "grad_norm": 3.760359692561372, + "learning_rate": 3.216168044165622e-06, + "loss": 0.4997, + "step": 5237 + }, + { + "epoch": 0.4254040445058069, + "grad_norm": 6.915480718748514, + "learning_rate": 3.215537958136681e-06, + "loss": 0.5207, + "step": 5238 + }, + { + "epoch": 0.42548525948184845, + "grad_norm": 9.393926456415134, + "learning_rate": 3.2149078225963e-06, + "loss": 0.4996, + "step": 5239 + }, + { + "epoch": 0.42556647445789003, + "grad_norm": 5.607193609498181, + "learning_rate": 3.2142776375880814e-06, + "loss": 0.6179, + "step": 5240 + }, + { + "epoch": 0.4256476894339316, + "grad_norm": 7.777688950913407, + "learning_rate": 3.213647403155631e-06, + "loss": 0.4407, + "step": 5241 + }, + { + "epoch": 0.4257289044099732, + "grad_norm": 5.406198429944911, + "learning_rate": 3.213017119342557e-06, + "loss": 0.4342, + "step": 5242 + }, + { + "epoch": 0.42581011938601476, + "grad_norm": 4.332500644815715, + "learning_rate": 3.2123867861924705e-06, + "loss": 0.547, + "step": 5243 + }, + { + "epoch": 0.42589133436205634, + "grad_norm": 4.158064015085327, + "learning_rate": 3.211756403748991e-06, + "loss": 0.6803, + "step": 5244 + }, + { + "epoch": 0.42597254933809797, + "grad_norm": 4.670204213436716, + "learning_rate": 3.211125972055734e-06, + "loss": 0.3712, + "step": 5245 + }, + { + "epoch": 0.42605376431413955, + "grad_norm": 3.1072025340370835, + "learning_rate": 3.210495491156323e-06, + "loss": 0.6592, + "step": 5246 + }, + { + "epoch": 0.4261349792901811, + "grad_norm": 3.845276286103103, + "learning_rate": 3.2098649610943855e-06, + "loss": 0.3973, + "step": 5247 + }, + { + "epoch": 0.4262161942662227, + "grad_norm": 5.263686826495678, + "learning_rate": 3.2092343819135485e-06, + "loss": 0.4361, + "step": 5248 + }, + { + "epoch": 0.4262974092422643, + "grad_norm": 4.706614514686621, + "learning_rate": 3.2086037536574467e-06, + "loss": 0.5642, + "step": 5249 + }, + { + "epoch": 0.42637862421830586, + "grad_norm": 7.9955278197924455, + "learning_rate": 3.207973076369715e-06, + "loss": 0.4473, + "step": 5250 + }, + { + "epoch": 0.42645983919434743, + "grad_norm": 5.6591134035265505, + "learning_rate": 3.2073423500939926e-06, + "loss": 0.4853, + "step": 5251 + }, + { + "epoch": 0.426541054170389, + "grad_norm": 8.612516192884062, + "learning_rate": 3.206711574873924e-06, + "loss": 0.462, + "step": 5252 + }, + { + "epoch": 0.4266222691464306, + "grad_norm": 5.102810963488392, + "learning_rate": 3.2060807507531545e-06, + "loss": 0.5559, + "step": 5253 + }, + { + "epoch": 0.42670348412247217, + "grad_norm": 7.341129074246249, + "learning_rate": 3.2054498777753335e-06, + "loss": 0.5183, + "step": 5254 + }, + { + "epoch": 0.42678469909851374, + "grad_norm": 5.00922975998508, + "learning_rate": 3.204818955984115e-06, + "loss": 0.4557, + "step": 5255 + }, + { + "epoch": 0.4268659140745554, + "grad_norm": 3.0225137055666087, + "learning_rate": 3.2041879854231545e-06, + "loss": 0.6202, + "step": 5256 + }, + { + "epoch": 0.42694712905059695, + "grad_norm": 5.862930940720101, + "learning_rate": 3.203556966136113e-06, + "loss": 0.5674, + "step": 5257 + }, + { + "epoch": 0.42702834402663853, + "grad_norm": 8.760401643981206, + "learning_rate": 3.202925898166652e-06, + "loss": 0.4603, + "step": 5258 + }, + { + "epoch": 0.4271095590026801, + "grad_norm": 7.198009661677629, + "learning_rate": 3.2022947815584393e-06, + "loss": 0.4481, + "step": 5259 + }, + { + "epoch": 0.4271907739787217, + "grad_norm": 4.513288458419195, + "learning_rate": 3.2016636163551456e-06, + "loss": 0.4721, + "step": 5260 + }, + { + "epoch": 0.42727198895476326, + "grad_norm": 4.556711076123282, + "learning_rate": 3.2010324026004425e-06, + "loss": 0.5153, + "step": 5261 + }, + { + "epoch": 0.42735320393080484, + "grad_norm": 4.403874321167366, + "learning_rate": 3.200401140338007e-06, + "loss": 0.5405, + "step": 5262 + }, + { + "epoch": 0.4274344189068464, + "grad_norm": 15.464524857508628, + "learning_rate": 3.1997698296115192e-06, + "loss": 0.5812, + "step": 5263 + }, + { + "epoch": 0.427515633882888, + "grad_norm": 6.763078104210325, + "learning_rate": 3.1991384704646632e-06, + "loss": 0.4366, + "step": 5264 + }, + { + "epoch": 0.42759684885892957, + "grad_norm": 3.8083736623019377, + "learning_rate": 3.198507062941125e-06, + "loss": 0.7049, + "step": 5265 + }, + { + "epoch": 0.42767806383497114, + "grad_norm": 4.411432581791518, + "learning_rate": 3.197875607084595e-06, + "loss": 0.5536, + "step": 5266 + }, + { + "epoch": 0.4277592788110128, + "grad_norm": 5.209356808212503, + "learning_rate": 3.1972441029387664e-06, + "loss": 0.7173, + "step": 5267 + }, + { + "epoch": 0.42784049378705435, + "grad_norm": 4.422812878298527, + "learning_rate": 3.196612550547336e-06, + "loss": 0.5126, + "step": 5268 + }, + { + "epoch": 0.42792170876309593, + "grad_norm": 4.063375030795528, + "learning_rate": 3.1959809499540033e-06, + "loss": 0.4163, + "step": 5269 + }, + { + "epoch": 0.4280029237391375, + "grad_norm": 6.439580078200473, + "learning_rate": 3.1953493012024728e-06, + "loss": 0.46, + "step": 5270 + }, + { + "epoch": 0.4280841387151791, + "grad_norm": 6.644696648989759, + "learning_rate": 3.1947176043364512e-06, + "loss": 0.5559, + "step": 5271 + }, + { + "epoch": 0.42816535369122066, + "grad_norm": 9.625730715608832, + "learning_rate": 3.194085859399647e-06, + "loss": 0.4542, + "step": 5272 + }, + { + "epoch": 0.42824656866726224, + "grad_norm": 6.80723990351529, + "learning_rate": 3.1934540664357756e-06, + "loss": 0.3875, + "step": 5273 + }, + { + "epoch": 0.4283277836433038, + "grad_norm": 4.063505294236836, + "learning_rate": 3.1928222254885527e-06, + "loss": 0.4263, + "step": 5274 + }, + { + "epoch": 0.4284089986193454, + "grad_norm": 5.254426921833707, + "learning_rate": 3.192190336601698e-06, + "loss": 0.601, + "step": 5275 + }, + { + "epoch": 0.42849021359538697, + "grad_norm": 3.3439818153250274, + "learning_rate": 3.1915583998189365e-06, + "loss": 0.8393, + "step": 5276 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 4.423002542739268, + "learning_rate": 3.190926415183993e-06, + "loss": 0.4905, + "step": 5277 + }, + { + "epoch": 0.4286526435474702, + "grad_norm": 4.969141698478439, + "learning_rate": 3.190294382740598e-06, + "loss": 0.4598, + "step": 5278 + }, + { + "epoch": 0.42873385852351176, + "grad_norm": 5.329263080541811, + "learning_rate": 3.189662302532486e-06, + "loss": 0.5027, + "step": 5279 + }, + { + "epoch": 0.42881507349955333, + "grad_norm": 8.465008482689347, + "learning_rate": 3.1890301746033914e-06, + "loss": 0.4451, + "step": 5280 + }, + { + "epoch": 0.4288962884755949, + "grad_norm": 4.288959841432594, + "learning_rate": 3.188397998997056e-06, + "loss": 0.5379, + "step": 5281 + }, + { + "epoch": 0.4289775034516365, + "grad_norm": 4.307272162236529, + "learning_rate": 3.1877657757572223e-06, + "loss": 0.3864, + "step": 5282 + }, + { + "epoch": 0.42905871842767807, + "grad_norm": 4.853723061541425, + "learning_rate": 3.187133504927637e-06, + "loss": 0.5744, + "step": 5283 + }, + { + "epoch": 0.42913993340371964, + "grad_norm": 9.888573881930817, + "learning_rate": 3.18650118655205e-06, + "loss": 0.4017, + "step": 5284 + }, + { + "epoch": 0.4292211483797612, + "grad_norm": 6.195263958719191, + "learning_rate": 3.1858688206742135e-06, + "loss": 0.4613, + "step": 5285 + }, + { + "epoch": 0.4293023633558028, + "grad_norm": 5.993567524504076, + "learning_rate": 3.1852364073378845e-06, + "loss": 0.5304, + "step": 5286 + }, + { + "epoch": 0.4293835783318444, + "grad_norm": 6.246122494535498, + "learning_rate": 3.1846039465868233e-06, + "loss": 0.4809, + "step": 5287 + }, + { + "epoch": 0.42946479330788595, + "grad_norm": 11.344295485158227, + "learning_rate": 3.1839714384647914e-06, + "loss": 0.416, + "step": 5288 + }, + { + "epoch": 0.4295460082839276, + "grad_norm": 6.789926972632632, + "learning_rate": 3.1833388830155564e-06, + "loss": 0.5286, + "step": 5289 + }, + { + "epoch": 0.42962722325996916, + "grad_norm": 4.731313984743315, + "learning_rate": 3.1827062802828878e-06, + "loss": 0.6368, + "step": 5290 + }, + { + "epoch": 0.42970843823601074, + "grad_norm": 8.422362176653303, + "learning_rate": 3.182073630310557e-06, + "loss": 0.4566, + "step": 5291 + }, + { + "epoch": 0.4297896532120523, + "grad_norm": 5.811200383485644, + "learning_rate": 3.18144093314234e-06, + "loss": 0.7081, + "step": 5292 + }, + { + "epoch": 0.4298708681880939, + "grad_norm": 3.464910361407899, + "learning_rate": 3.180808188822019e-06, + "loss": 0.6889, + "step": 5293 + }, + { + "epoch": 0.42995208316413547, + "grad_norm": 3.8106177663295115, + "learning_rate": 3.180175397393373e-06, + "loss": 0.6863, + "step": 5294 + }, + { + "epoch": 0.43003329814017704, + "grad_norm": 8.071444985141785, + "learning_rate": 3.1795425589001896e-06, + "loss": 0.5488, + "step": 5295 + }, + { + "epoch": 0.4301145131162186, + "grad_norm": 4.430290323012749, + "learning_rate": 3.178909673386257e-06, + "loss": 0.5592, + "step": 5296 + }, + { + "epoch": 0.4301957280922602, + "grad_norm": 4.5330502482145505, + "learning_rate": 3.178276740895369e-06, + "loss": 0.5002, + "step": 5297 + }, + { + "epoch": 0.4302769430683018, + "grad_norm": 7.279026055668376, + "learning_rate": 3.1776437614713197e-06, + "loss": 0.547, + "step": 5298 + }, + { + "epoch": 0.43035815804434335, + "grad_norm": 5.6210852302334855, + "learning_rate": 3.177010735157909e-06, + "loss": 0.6157, + "step": 5299 + }, + { + "epoch": 0.430439373020385, + "grad_norm": 6.112959552581171, + "learning_rate": 3.1763776619989377e-06, + "loss": 0.5323, + "step": 5300 + }, + { + "epoch": 0.43052058799642656, + "grad_norm": 4.4197983016582745, + "learning_rate": 3.175744542038212e-06, + "loss": 0.525, + "step": 5301 + }, + { + "epoch": 0.43060180297246814, + "grad_norm": 5.4564596091826, + "learning_rate": 3.175111375319541e-06, + "loss": 0.5436, + "step": 5302 + }, + { + "epoch": 0.4306830179485097, + "grad_norm": 4.9689391465007615, + "learning_rate": 3.174478161886736e-06, + "loss": 0.4957, + "step": 5303 + }, + { + "epoch": 0.4307642329245513, + "grad_norm": 9.04638437397121, + "learning_rate": 3.1738449017836102e-06, + "loss": 0.4726, + "step": 5304 + }, + { + "epoch": 0.43084544790059287, + "grad_norm": 6.518271460980143, + "learning_rate": 3.173211595053985e-06, + "loss": 0.4154, + "step": 5305 + }, + { + "epoch": 0.43092666287663445, + "grad_norm": 9.139018032286032, + "learning_rate": 3.17257824174168e-06, + "loss": 0.6605, + "step": 5306 + }, + { + "epoch": 0.431007877852676, + "grad_norm": 5.893703080064373, + "learning_rate": 3.17194484189052e-06, + "loss": 0.5479, + "step": 5307 + }, + { + "epoch": 0.4310890928287176, + "grad_norm": 6.561881133428007, + "learning_rate": 3.171311395544333e-06, + "loss": 0.5071, + "step": 5308 + }, + { + "epoch": 0.4311703078047592, + "grad_norm": 5.394616296616023, + "learning_rate": 3.170677902746951e-06, + "loss": 0.4559, + "step": 5309 + }, + { + "epoch": 0.43125152278080076, + "grad_norm": 6.001462406111083, + "learning_rate": 3.170044363542207e-06, + "loss": 0.4429, + "step": 5310 + }, + { + "epoch": 0.4313327377568424, + "grad_norm": 5.400528463179634, + "learning_rate": 3.1694107779739394e-06, + "loss": 0.4939, + "step": 5311 + }, + { + "epoch": 0.43141395273288397, + "grad_norm": 3.4971169628242347, + "learning_rate": 3.1687771460859886e-06, + "loss": 0.5207, + "step": 5312 + }, + { + "epoch": 0.43149516770892554, + "grad_norm": 5.801443009456409, + "learning_rate": 3.168143467922199e-06, + "loss": 0.537, + "step": 5313 + }, + { + "epoch": 0.4315763826849671, + "grad_norm": 3.877634512463603, + "learning_rate": 3.1675097435264175e-06, + "loss": 0.6002, + "step": 5314 + }, + { + "epoch": 0.4316575976610087, + "grad_norm": 4.436235293442774, + "learning_rate": 3.166875972942494e-06, + "loss": 0.5651, + "step": 5315 + }, + { + "epoch": 0.4317388126370503, + "grad_norm": 3.8720642376323857, + "learning_rate": 3.166242156214283e-06, + "loss": 0.5255, + "step": 5316 + }, + { + "epoch": 0.43182002761309185, + "grad_norm": 5.096619815050037, + "learning_rate": 3.1656082933856415e-06, + "loss": 0.4563, + "step": 5317 + }, + { + "epoch": 0.4319012425891334, + "grad_norm": 4.129271114889403, + "learning_rate": 3.1649743845004275e-06, + "loss": 0.4742, + "step": 5318 + }, + { + "epoch": 0.431982457565175, + "grad_norm": 5.369554351025192, + "learning_rate": 3.164340429602506e-06, + "loss": 0.5313, + "step": 5319 + }, + { + "epoch": 0.4320636725412166, + "grad_norm": 5.6537665065078615, + "learning_rate": 3.1637064287357433e-06, + "loss": 0.427, + "step": 5320 + }, + { + "epoch": 0.43214488751725816, + "grad_norm": 5.909754479859293, + "learning_rate": 3.1630723819440075e-06, + "loss": 0.4343, + "step": 5321 + }, + { + "epoch": 0.4322261024932998, + "grad_norm": 9.806257811882773, + "learning_rate": 3.1624382892711724e-06, + "loss": 0.4722, + "step": 5322 + }, + { + "epoch": 0.43230731746934137, + "grad_norm": 9.73741516646952, + "learning_rate": 3.161804150761114e-06, + "loss": 0.5959, + "step": 5323 + }, + { + "epoch": 0.43238853244538294, + "grad_norm": 4.940223216087869, + "learning_rate": 3.16116996645771e-06, + "loss": 0.5979, + "step": 5324 + }, + { + "epoch": 0.4324697474214245, + "grad_norm": 16.649964044548284, + "learning_rate": 3.1605357364048446e-06, + "loss": 0.4802, + "step": 5325 + }, + { + "epoch": 0.4325509623974661, + "grad_norm": 7.719840433898037, + "learning_rate": 3.159901460646401e-06, + "loss": 0.5291, + "step": 5326 + }, + { + "epoch": 0.4326321773735077, + "grad_norm": 5.612098859952399, + "learning_rate": 3.15926713922627e-06, + "loss": 0.4709, + "step": 5327 + }, + { + "epoch": 0.43271339234954925, + "grad_norm": 15.165120351544296, + "learning_rate": 3.1586327721883416e-06, + "loss": 0.4699, + "step": 5328 + }, + { + "epoch": 0.43279460732559083, + "grad_norm": 7.373093805345169, + "learning_rate": 3.1579983595765107e-06, + "loss": 0.5246, + "step": 5329 + }, + { + "epoch": 0.4328758223016324, + "grad_norm": 4.232276270918911, + "learning_rate": 3.1573639014346756e-06, + "loss": 0.413, + "step": 5330 + }, + { + "epoch": 0.432957037277674, + "grad_norm": 5.475928131532231, + "learning_rate": 3.1567293978067383e-06, + "loss": 0.4798, + "step": 5331 + }, + { + "epoch": 0.43303825225371556, + "grad_norm": 5.319170320864578, + "learning_rate": 3.1560948487366016e-06, + "loss": 0.5221, + "step": 5332 + }, + { + "epoch": 0.4331194672297572, + "grad_norm": 4.7595654317237654, + "learning_rate": 3.1554602542681746e-06, + "loss": 0.5395, + "step": 5333 + }, + { + "epoch": 0.43320068220579877, + "grad_norm": 5.163449251799242, + "learning_rate": 3.154825614445366e-06, + "loss": 0.5755, + "step": 5334 + }, + { + "epoch": 0.43328189718184035, + "grad_norm": 4.2774589507058085, + "learning_rate": 3.154190929312091e-06, + "loss": 0.4156, + "step": 5335 + }, + { + "epoch": 0.4333631121578819, + "grad_norm": 6.636509210502926, + "learning_rate": 3.1535561989122667e-06, + "loss": 0.3532, + "step": 5336 + }, + { + "epoch": 0.4334443271339235, + "grad_norm": 4.679244393970052, + "learning_rate": 3.152921423289811e-06, + "loss": 0.5302, + "step": 5337 + }, + { + "epoch": 0.4335255421099651, + "grad_norm": 4.600315390689705, + "learning_rate": 3.1522866024886497e-06, + "loss": 0.5156, + "step": 5338 + }, + { + "epoch": 0.43360675708600666, + "grad_norm": 7.387419218755155, + "learning_rate": 3.1516517365527064e-06, + "loss": 0.6254, + "step": 5339 + }, + { + "epoch": 0.43368797206204823, + "grad_norm": 8.038125663875935, + "learning_rate": 3.151016825525912e-06, + "loss": 0.4804, + "step": 5340 + }, + { + "epoch": 0.4337691870380898, + "grad_norm": 5.664820632074068, + "learning_rate": 3.1503818694521993e-06, + "loss": 0.5997, + "step": 5341 + }, + { + "epoch": 0.4338504020141314, + "grad_norm": 5.420884590250101, + "learning_rate": 3.1497468683755027e-06, + "loss": 0.5103, + "step": 5342 + }, + { + "epoch": 0.43393161699017296, + "grad_norm": 4.344293142047334, + "learning_rate": 3.1491118223397622e-06, + "loss": 0.6567, + "step": 5343 + }, + { + "epoch": 0.4340128319662146, + "grad_norm": 3.3509931709068184, + "learning_rate": 3.1484767313889186e-06, + "loss": 0.5217, + "step": 5344 + }, + { + "epoch": 0.4340940469422562, + "grad_norm": 6.46488127239872, + "learning_rate": 3.1478415955669174e-06, + "loss": 0.5403, + "step": 5345 + }, + { + "epoch": 0.43417526191829775, + "grad_norm": 28.243274098376233, + "learning_rate": 3.1472064149177063e-06, + "loss": 0.613, + "step": 5346 + }, + { + "epoch": 0.4342564768943393, + "grad_norm": 6.961851378871304, + "learning_rate": 3.1465711894852364e-06, + "loss": 0.5199, + "step": 5347 + }, + { + "epoch": 0.4343376918703809, + "grad_norm": 4.438081889262291, + "learning_rate": 3.145935919313462e-06, + "loss": 0.6373, + "step": 5348 + }, + { + "epoch": 0.4344189068464225, + "grad_norm": 4.323565064610984, + "learning_rate": 3.1453006044463417e-06, + "loss": 0.5597, + "step": 5349 + }, + { + "epoch": 0.43450012182246406, + "grad_norm": 3.0858340228520826, + "learning_rate": 3.144665244927833e-06, + "loss": 0.5194, + "step": 5350 + }, + { + "epoch": 0.43458133679850564, + "grad_norm": 4.877755892925608, + "learning_rate": 3.144029840801902e-06, + "loss": 0.7601, + "step": 5351 + }, + { + "epoch": 0.4346625517745472, + "grad_norm": 4.213210244527702, + "learning_rate": 3.1433943921125154e-06, + "loss": 0.5054, + "step": 5352 + }, + { + "epoch": 0.4347437667505888, + "grad_norm": 4.009304731887304, + "learning_rate": 3.1427588989036406e-06, + "loss": 0.8653, + "step": 5353 + }, + { + "epoch": 0.43482498172663037, + "grad_norm": 6.671797591935914, + "learning_rate": 3.1421233612192527e-06, + "loss": 0.5943, + "step": 5354 + }, + { + "epoch": 0.434906196702672, + "grad_norm": 6.798919847330098, + "learning_rate": 3.1414877791033267e-06, + "loss": 0.5151, + "step": 5355 + }, + { + "epoch": 0.4349874116787136, + "grad_norm": 5.3539572195583975, + "learning_rate": 3.1408521525998403e-06, + "loss": 0.4387, + "step": 5356 + }, + { + "epoch": 0.43506862665475515, + "grad_norm": 4.194017130183013, + "learning_rate": 3.1402164817527776e-06, + "loss": 0.3866, + "step": 5357 + }, + { + "epoch": 0.43514984163079673, + "grad_norm": 3.6695150698980354, + "learning_rate": 3.1395807666061223e-06, + "loss": 0.6504, + "step": 5358 + }, + { + "epoch": 0.4352310566068383, + "grad_norm": 4.641952406086023, + "learning_rate": 3.138945007203863e-06, + "loss": 0.4588, + "step": 5359 + }, + { + "epoch": 0.4353122715828799, + "grad_norm": 10.931096401774482, + "learning_rate": 3.1383092035899903e-06, + "loss": 0.5878, + "step": 5360 + }, + { + "epoch": 0.43539348655892146, + "grad_norm": 5.762955444790604, + "learning_rate": 3.1376733558084994e-06, + "loss": 0.3661, + "step": 5361 + }, + { + "epoch": 0.43547470153496304, + "grad_norm": 4.461787974097396, + "learning_rate": 3.1370374639033876e-06, + "loss": 0.5696, + "step": 5362 + }, + { + "epoch": 0.4355559165110046, + "grad_norm": 3.0791414260530683, + "learning_rate": 3.1364015279186537e-06, + "loss": 0.6584, + "step": 5363 + }, + { + "epoch": 0.4356371314870462, + "grad_norm": 5.145988507189497, + "learning_rate": 3.1357655478983028e-06, + "loss": 0.4157, + "step": 5364 + }, + { + "epoch": 0.43571834646308777, + "grad_norm": 4.936564393357792, + "learning_rate": 3.135129523886341e-06, + "loss": 0.6098, + "step": 5365 + }, + { + "epoch": 0.4357995614391294, + "grad_norm": 5.83447257075552, + "learning_rate": 3.1344934559267763e-06, + "loss": 0.4316, + "step": 5366 + }, + { + "epoch": 0.435880776415171, + "grad_norm": 4.5475608942051196, + "learning_rate": 3.1338573440636232e-06, + "loss": 0.591, + "step": 5367 + }, + { + "epoch": 0.43596199139121256, + "grad_norm": 6.710109555046167, + "learning_rate": 3.133221188340897e-06, + "loss": 0.5388, + "step": 5368 + }, + { + "epoch": 0.43604320636725413, + "grad_norm": 6.706321579158362, + "learning_rate": 3.132584988802615e-06, + "loss": 0.5046, + "step": 5369 + }, + { + "epoch": 0.4361244213432957, + "grad_norm": 5.443757710169265, + "learning_rate": 3.1319487454928005e-06, + "loss": 0.5206, + "step": 5370 + }, + { + "epoch": 0.4362056363193373, + "grad_norm": 43.84951492925826, + "learning_rate": 3.1313124584554772e-06, + "loss": 0.601, + "step": 5371 + }, + { + "epoch": 0.43628685129537886, + "grad_norm": 2.965181185591869, + "learning_rate": 3.130676127734673e-06, + "loss": 0.5649, + "step": 5372 + }, + { + "epoch": 0.43636806627142044, + "grad_norm": 4.0688538065516955, + "learning_rate": 3.1300397533744176e-06, + "loss": 0.5637, + "step": 5373 + }, + { + "epoch": 0.436449281247462, + "grad_norm": 4.285947029576996, + "learning_rate": 3.129403335418747e-06, + "loss": 0.6039, + "step": 5374 + }, + { + "epoch": 0.4365304962235036, + "grad_norm": 9.05418272699839, + "learning_rate": 3.128766873911696e-06, + "loss": 0.504, + "step": 5375 + }, + { + "epoch": 0.43661171119954517, + "grad_norm": 7.150334825744675, + "learning_rate": 3.1281303688973054e-06, + "loss": 0.437, + "step": 5376 + }, + { + "epoch": 0.4366929261755868, + "grad_norm": 4.999179068937834, + "learning_rate": 3.127493820419617e-06, + "loss": 0.4809, + "step": 5377 + }, + { + "epoch": 0.4367741411516284, + "grad_norm": 4.8473879562154565, + "learning_rate": 3.1268572285226773e-06, + "loss": 0.4894, + "step": 5378 + }, + { + "epoch": 0.43685535612766996, + "grad_norm": 6.929506260339618, + "learning_rate": 3.1262205932505353e-06, + "loss": 0.4363, + "step": 5379 + }, + { + "epoch": 0.43693657110371154, + "grad_norm": 5.5615771450834846, + "learning_rate": 3.125583914647242e-06, + "loss": 0.5205, + "step": 5380 + }, + { + "epoch": 0.4370177860797531, + "grad_norm": 3.3462484960680072, + "learning_rate": 3.124947192756853e-06, + "loss": 0.6615, + "step": 5381 + }, + { + "epoch": 0.4370990010557947, + "grad_norm": 6.634956155249173, + "learning_rate": 3.124310427623426e-06, + "loss": 0.4979, + "step": 5382 + }, + { + "epoch": 0.43718021603183627, + "grad_norm": 5.3106857326366095, + "learning_rate": 3.123673619291021e-06, + "loss": 0.5231, + "step": 5383 + }, + { + "epoch": 0.43726143100787784, + "grad_norm": 4.925146872849586, + "learning_rate": 3.123036767803703e-06, + "loss": 0.6155, + "step": 5384 + }, + { + "epoch": 0.4373426459839194, + "grad_norm": 6.351273194254498, + "learning_rate": 3.122399873205538e-06, + "loss": 0.4055, + "step": 5385 + }, + { + "epoch": 0.437423860959961, + "grad_norm": 7.439635575652295, + "learning_rate": 3.121762935540595e-06, + "loss": 0.4508, + "step": 5386 + }, + { + "epoch": 0.4375050759360026, + "grad_norm": 11.221500508022611, + "learning_rate": 3.121125954852948e-06, + "loss": 0.4207, + "step": 5387 + }, + { + "epoch": 0.4375862909120442, + "grad_norm": 15.28900713021598, + "learning_rate": 3.120488931186672e-06, + "loss": 0.4755, + "step": 5388 + }, + { + "epoch": 0.4376675058880858, + "grad_norm": 3.903889410795801, + "learning_rate": 3.1198518645858455e-06, + "loss": 0.4798, + "step": 5389 + }, + { + "epoch": 0.43774872086412736, + "grad_norm": 6.674485525301077, + "learning_rate": 3.1192147550945517e-06, + "loss": 0.6514, + "step": 5390 + }, + { + "epoch": 0.43782993584016894, + "grad_norm": 14.258332144198288, + "learning_rate": 3.118577602756873e-06, + "loss": 0.4016, + "step": 5391 + }, + { + "epoch": 0.4379111508162105, + "grad_norm": 4.922640216869197, + "learning_rate": 3.1179404076168983e-06, + "loss": 0.5567, + "step": 5392 + }, + { + "epoch": 0.4379923657922521, + "grad_norm": 5.597787362812228, + "learning_rate": 3.1173031697187178e-06, + "loss": 0.6444, + "step": 5393 + }, + { + "epoch": 0.43807358076829367, + "grad_norm": 3.8553056656955143, + "learning_rate": 3.116665889106425e-06, + "loss": 0.3967, + "step": 5394 + }, + { + "epoch": 0.43815479574433525, + "grad_norm": 3.517771835764705, + "learning_rate": 3.1160285658241157e-06, + "loss": 0.4896, + "step": 5395 + }, + { + "epoch": 0.4382360107203768, + "grad_norm": 3.6077248593647715, + "learning_rate": 3.11539119991589e-06, + "loss": 0.6903, + "step": 5396 + }, + { + "epoch": 0.4383172256964184, + "grad_norm": 3.6023017655336056, + "learning_rate": 3.1147537914258513e-06, + "loss": 0.602, + "step": 5397 + }, + { + "epoch": 0.43839844067246, + "grad_norm": 5.836048987707051, + "learning_rate": 3.1141163403981033e-06, + "loss": 0.4444, + "step": 5398 + }, + { + "epoch": 0.4384796556485016, + "grad_norm": 4.215159805640962, + "learning_rate": 3.113478846876754e-06, + "loss": 0.6145, + "step": 5399 + }, + { + "epoch": 0.4385608706245432, + "grad_norm": 7.095922534739042, + "learning_rate": 3.1128413109059164e-06, + "loss": 0.5145, + "step": 5400 + }, + { + "epoch": 0.43864208560058476, + "grad_norm": 3.5538230920359077, + "learning_rate": 3.1122037325297027e-06, + "loss": 0.6229, + "step": 5401 + }, + { + "epoch": 0.43872330057662634, + "grad_norm": 5.0267826629365455, + "learning_rate": 3.1115661117922307e-06, + "loss": 0.5481, + "step": 5402 + }, + { + "epoch": 0.4388045155526679, + "grad_norm": 3.179251364264237, + "learning_rate": 3.1109284487376213e-06, + "loss": 0.5039, + "step": 5403 + }, + { + "epoch": 0.4388857305287095, + "grad_norm": 3.2478475873654626, + "learning_rate": 3.1102907434099962e-06, + "loss": 0.5207, + "step": 5404 + }, + { + "epoch": 0.43896694550475107, + "grad_norm": 7.568604033338787, + "learning_rate": 3.1096529958534805e-06, + "loss": 0.5009, + "step": 5405 + }, + { + "epoch": 0.43904816048079265, + "grad_norm": 5.352814742876498, + "learning_rate": 3.1090152061122053e-06, + "loss": 0.4465, + "step": 5406 + }, + { + "epoch": 0.4391293754568342, + "grad_norm": 6.7191876007979525, + "learning_rate": 3.1083773742303003e-06, + "loss": 0.5651, + "step": 5407 + }, + { + "epoch": 0.4392105904328758, + "grad_norm": 4.940449250528321, + "learning_rate": 3.1077395002519013e-06, + "loss": 0.5716, + "step": 5408 + }, + { + "epoch": 0.4392918054089174, + "grad_norm": 3.648123205660983, + "learning_rate": 3.1071015842211447e-06, + "loss": 0.9121, + "step": 5409 + }, + { + "epoch": 0.439373020384959, + "grad_norm": 4.953572403071827, + "learning_rate": 3.1064636261821716e-06, + "loss": 0.6146, + "step": 5410 + }, + { + "epoch": 0.4394542353610006, + "grad_norm": 6.9246508482882145, + "learning_rate": 3.105825626179126e-06, + "loss": 0.3964, + "step": 5411 + }, + { + "epoch": 0.43953545033704217, + "grad_norm": 4.790951487900791, + "learning_rate": 3.1051875842561523e-06, + "loss": 0.6518, + "step": 5412 + }, + { + "epoch": 0.43961666531308374, + "grad_norm": 4.736239149088806, + "learning_rate": 3.1045495004574017e-06, + "loss": 0.5212, + "step": 5413 + }, + { + "epoch": 0.4396978802891253, + "grad_norm": 7.540621113454675, + "learning_rate": 3.1039113748270248e-06, + "loss": 0.5596, + "step": 5414 + }, + { + "epoch": 0.4397790952651669, + "grad_norm": 3.720380110888026, + "learning_rate": 3.1032732074091765e-06, + "loss": 0.4385, + "step": 5415 + }, + { + "epoch": 0.4398603102412085, + "grad_norm": 4.7658804425871235, + "learning_rate": 3.1026349982480153e-06, + "loss": 0.6445, + "step": 5416 + }, + { + "epoch": 0.43994152521725005, + "grad_norm": 3.8751042373300963, + "learning_rate": 3.101996747387702e-06, + "loss": 0.5791, + "step": 5417 + }, + { + "epoch": 0.44002274019329163, + "grad_norm": 4.852177339352803, + "learning_rate": 3.101358454872399e-06, + "loss": 0.5171, + "step": 5418 + }, + { + "epoch": 0.4401039551693332, + "grad_norm": 4.223969934330754, + "learning_rate": 3.1007201207462745e-06, + "loss": 0.491, + "step": 5419 + }, + { + "epoch": 0.4401851701453748, + "grad_norm": 6.43559751438046, + "learning_rate": 3.1000817450534964e-06, + "loss": 0.4736, + "step": 5420 + }, + { + "epoch": 0.4402663851214164, + "grad_norm": 4.415584725720336, + "learning_rate": 3.0994433278382374e-06, + "loss": 0.5335, + "step": 5421 + }, + { + "epoch": 0.440347600097458, + "grad_norm": 7.044600702022488, + "learning_rate": 3.0988048691446733e-06, + "loss": 0.4042, + "step": 5422 + }, + { + "epoch": 0.44042881507349957, + "grad_norm": 5.958216205084174, + "learning_rate": 3.0981663690169806e-06, + "loss": 0.5884, + "step": 5423 + }, + { + "epoch": 0.44051003004954115, + "grad_norm": 7.972188724833584, + "learning_rate": 3.097527827499341e-06, + "loss": 0.4748, + "step": 5424 + }, + { + "epoch": 0.4405912450255827, + "grad_norm": 5.3560614787912195, + "learning_rate": 3.0968892446359383e-06, + "loss": 0.4807, + "step": 5425 + }, + { + "epoch": 0.4406724600016243, + "grad_norm": 4.339200995867066, + "learning_rate": 3.0962506204709587e-06, + "loss": 0.5928, + "step": 5426 + }, + { + "epoch": 0.4407536749776659, + "grad_norm": 5.4395379188651605, + "learning_rate": 3.0956119550485925e-06, + "loss": 0.4506, + "step": 5427 + }, + { + "epoch": 0.44083488995370745, + "grad_norm": 6.180414632693948, + "learning_rate": 3.09497324841303e-06, + "loss": 0.4891, + "step": 5428 + }, + { + "epoch": 0.44091610492974903, + "grad_norm": 4.088720798041899, + "learning_rate": 3.0943345006084678e-06, + "loss": 0.4157, + "step": 5429 + }, + { + "epoch": 0.4409973199057906, + "grad_norm": 4.834350877095814, + "learning_rate": 3.0936957116791048e-06, + "loss": 0.4818, + "step": 5430 + }, + { + "epoch": 0.4410785348818322, + "grad_norm": 6.729193046534626, + "learning_rate": 3.0930568816691394e-06, + "loss": 0.4463, + "step": 5431 + }, + { + "epoch": 0.4411597498578738, + "grad_norm": 4.907909407785353, + "learning_rate": 3.092418010622777e-06, + "loss": 0.5464, + "step": 5432 + }, + { + "epoch": 0.4412409648339154, + "grad_norm": 3.5985370572492146, + "learning_rate": 3.091779098584224e-06, + "loss": 0.5931, + "step": 5433 + }, + { + "epoch": 0.44132217980995697, + "grad_norm": 3.530928584827468, + "learning_rate": 3.0911401455976882e-06, + "loss": 0.483, + "step": 5434 + }, + { + "epoch": 0.44140339478599855, + "grad_norm": 4.3804647274752035, + "learning_rate": 3.0905011517073834e-06, + "loss": 0.4682, + "step": 5435 + }, + { + "epoch": 0.4414846097620401, + "grad_norm": 8.630610987447541, + "learning_rate": 3.089862116957525e-06, + "loss": 0.5017, + "step": 5436 + }, + { + "epoch": 0.4415658247380817, + "grad_norm": 4.490184934510654, + "learning_rate": 3.089223041392329e-06, + "loss": 0.4949, + "step": 5437 + }, + { + "epoch": 0.4416470397141233, + "grad_norm": 4.853522689003282, + "learning_rate": 3.0885839250560172e-06, + "loss": 0.6344, + "step": 5438 + }, + { + "epoch": 0.44172825469016486, + "grad_norm": 7.016095469900614, + "learning_rate": 3.087944767992813e-06, + "loss": 0.5162, + "step": 5439 + }, + { + "epoch": 0.44180946966620643, + "grad_norm": 5.994420464445846, + "learning_rate": 3.0873055702469416e-06, + "loss": 0.6458, + "step": 5440 + }, + { + "epoch": 0.441890684642248, + "grad_norm": 4.376884747350049, + "learning_rate": 3.086666331862634e-06, + "loss": 0.4059, + "step": 5441 + }, + { + "epoch": 0.4419718996182896, + "grad_norm": 5.692559176862132, + "learning_rate": 3.0860270528841208e-06, + "loss": 0.4641, + "step": 5442 + }, + { + "epoch": 0.4420531145943312, + "grad_norm": 4.8077643907663665, + "learning_rate": 3.085387733355637e-06, + "loss": 0.5179, + "step": 5443 + }, + { + "epoch": 0.4421343295703728, + "grad_norm": 5.021716165337083, + "learning_rate": 3.08474837332142e-06, + "loss": 0.6234, + "step": 5444 + }, + { + "epoch": 0.4422155445464144, + "grad_norm": 5.826683036908708, + "learning_rate": 3.0841089728257108e-06, + "loss": 0.5355, + "step": 5445 + }, + { + "epoch": 0.44229675952245595, + "grad_norm": 4.024099529077247, + "learning_rate": 3.0834695319127516e-06, + "loss": 0.5641, + "step": 5446 + }, + { + "epoch": 0.44237797449849753, + "grad_norm": 6.876907227052113, + "learning_rate": 3.082830050626789e-06, + "loss": 0.4901, + "step": 5447 + }, + { + "epoch": 0.4424591894745391, + "grad_norm": 5.144105316785434, + "learning_rate": 3.0821905290120712e-06, + "loss": 0.3913, + "step": 5448 + }, + { + "epoch": 0.4425404044505807, + "grad_norm": 3.975940531276462, + "learning_rate": 3.0815509671128506e-06, + "loss": 0.4229, + "step": 5449 + }, + { + "epoch": 0.44262161942662226, + "grad_norm": 5.375633378244454, + "learning_rate": 3.0809113649733803e-06, + "loss": 0.5113, + "step": 5450 + }, + { + "epoch": 0.44270283440266384, + "grad_norm": 12.651478454129343, + "learning_rate": 3.0802717226379175e-06, + "loss": 0.5075, + "step": 5451 + }, + { + "epoch": 0.4427840493787054, + "grad_norm": 4.831174765479994, + "learning_rate": 3.079632040150724e-06, + "loss": 0.5238, + "step": 5452 + }, + { + "epoch": 0.442865264354747, + "grad_norm": 5.064193661858532, + "learning_rate": 3.07899231755606e-06, + "loss": 0.5808, + "step": 5453 + }, + { + "epoch": 0.4429464793307886, + "grad_norm": 5.352945242031246, + "learning_rate": 3.0783525548981917e-06, + "loss": 0.4835, + "step": 5454 + }, + { + "epoch": 0.4430276943068302, + "grad_norm": 4.965261328987609, + "learning_rate": 3.077712752221388e-06, + "loss": 0.4251, + "step": 5455 + }, + { + "epoch": 0.4431089092828718, + "grad_norm": 4.9254182519569465, + "learning_rate": 3.0770729095699194e-06, + "loss": 0.6029, + "step": 5456 + }, + { + "epoch": 0.44319012425891335, + "grad_norm": 3.5617209008138992, + "learning_rate": 3.0764330269880593e-06, + "loss": 0.5718, + "step": 5457 + }, + { + "epoch": 0.44327133923495493, + "grad_norm": 4.355025485119006, + "learning_rate": 3.0757931045200844e-06, + "loss": 0.6286, + "step": 5458 + }, + { + "epoch": 0.4433525542109965, + "grad_norm": 6.239139168135333, + "learning_rate": 3.075153142210274e-06, + "loss": 0.5216, + "step": 5459 + }, + { + "epoch": 0.4434337691870381, + "grad_norm": 4.784319073015132, + "learning_rate": 3.0745131401029105e-06, + "loss": 0.5841, + "step": 5460 + }, + { + "epoch": 0.44351498416307966, + "grad_norm": 4.463298659345465, + "learning_rate": 3.073873098242278e-06, + "loss": 0.6327, + "step": 5461 + }, + { + "epoch": 0.44359619913912124, + "grad_norm": 6.8742335301264585, + "learning_rate": 3.0732330166726644e-06, + "loss": 0.5043, + "step": 5462 + }, + { + "epoch": 0.4436774141151628, + "grad_norm": 3.366707977466441, + "learning_rate": 3.07259289543836e-06, + "loss": 0.5506, + "step": 5463 + }, + { + "epoch": 0.4437586290912044, + "grad_norm": 4.220402613518687, + "learning_rate": 3.0719527345836568e-06, + "loss": 0.4197, + "step": 5464 + }, + { + "epoch": 0.443839844067246, + "grad_norm": 3.555095408073302, + "learning_rate": 3.0713125341528527e-06, + "loss": 0.4006, + "step": 5465 + }, + { + "epoch": 0.4439210590432876, + "grad_norm": 5.959143884494173, + "learning_rate": 3.0706722941902438e-06, + "loss": 0.5243, + "step": 5466 + }, + { + "epoch": 0.4440022740193292, + "grad_norm": 3.7731524034106707, + "learning_rate": 3.0700320147401324e-06, + "loss": 0.5004, + "step": 5467 + }, + { + "epoch": 0.44408348899537076, + "grad_norm": 4.782660700197082, + "learning_rate": 3.0693916958468236e-06, + "loss": 0.5797, + "step": 5468 + }, + { + "epoch": 0.44416470397141233, + "grad_norm": 4.37229758317183, + "learning_rate": 3.0687513375546216e-06, + "loss": 0.614, + "step": 5469 + }, + { + "epoch": 0.4442459189474539, + "grad_norm": 2.828225215491486, + "learning_rate": 3.0681109399078375e-06, + "loss": 0.7196, + "step": 5470 + }, + { + "epoch": 0.4443271339234955, + "grad_norm": 6.309177247959491, + "learning_rate": 3.0674705029507833e-06, + "loss": 0.476, + "step": 5471 + }, + { + "epoch": 0.44440834889953706, + "grad_norm": 4.103834477568316, + "learning_rate": 3.0668300267277735e-06, + "loss": 0.6375, + "step": 5472 + }, + { + "epoch": 0.44448956387557864, + "grad_norm": 6.093953734178574, + "learning_rate": 3.066189511283126e-06, + "loss": 0.4653, + "step": 5473 + }, + { + "epoch": 0.4445707788516202, + "grad_norm": 5.225914425304106, + "learning_rate": 3.0655489566611603e-06, + "loss": 0.6781, + "step": 5474 + }, + { + "epoch": 0.4446519938276618, + "grad_norm": 5.8819081687868335, + "learning_rate": 3.0649083629062e-06, + "loss": 0.5596, + "step": 5475 + }, + { + "epoch": 0.44473320880370343, + "grad_norm": 7.739981641220584, + "learning_rate": 3.0642677300625704e-06, + "loss": 0.3823, + "step": 5476 + }, + { + "epoch": 0.444814423779745, + "grad_norm": 5.131722265818073, + "learning_rate": 3.063627058174601e-06, + "loss": 0.4458, + "step": 5477 + }, + { + "epoch": 0.4448956387557866, + "grad_norm": 3.2411651551006364, + "learning_rate": 3.062986347286622e-06, + "loss": 0.7122, + "step": 5478 + }, + { + "epoch": 0.44497685373182816, + "grad_norm": 4.225618329584794, + "learning_rate": 3.0623455974429677e-06, + "loss": 0.4025, + "step": 5479 + }, + { + "epoch": 0.44505806870786974, + "grad_norm": 7.665112766366524, + "learning_rate": 3.061704808687973e-06, + "loss": 0.6129, + "step": 5480 + }, + { + "epoch": 0.4451392836839113, + "grad_norm": 9.07304807101099, + "learning_rate": 3.061063981065979e-06, + "loss": 0.5051, + "step": 5481 + }, + { + "epoch": 0.4452204986599529, + "grad_norm": 4.554173085391561, + "learning_rate": 3.0604231146213276e-06, + "loss": 0.6264, + "step": 5482 + }, + { + "epoch": 0.44530171363599447, + "grad_norm": 3.5414231796303515, + "learning_rate": 3.0597822093983614e-06, + "loss": 0.5022, + "step": 5483 + }, + { + "epoch": 0.44538292861203604, + "grad_norm": 8.230686535869143, + "learning_rate": 3.0591412654414297e-06, + "loss": 0.6486, + "step": 5484 + }, + { + "epoch": 0.4454641435880776, + "grad_norm": 5.579450710250304, + "learning_rate": 3.058500282794882e-06, + "loss": 0.5926, + "step": 5485 + }, + { + "epoch": 0.4455453585641192, + "grad_norm": 6.0408658738191425, + "learning_rate": 3.0578592615030693e-06, + "loss": 0.3853, + "step": 5486 + }, + { + "epoch": 0.44562657354016083, + "grad_norm": 4.852372309573421, + "learning_rate": 3.057218201610349e-06, + "loss": 0.4482, + "step": 5487 + }, + { + "epoch": 0.4457077885162024, + "grad_norm": 5.300082736246172, + "learning_rate": 3.056577103161078e-06, + "loss": 0.5611, + "step": 5488 + }, + { + "epoch": 0.445789003492244, + "grad_norm": 5.314374108815902, + "learning_rate": 3.055935966199617e-06, + "loss": 0.505, + "step": 5489 + }, + { + "epoch": 0.44587021846828556, + "grad_norm": 3.462874288033993, + "learning_rate": 3.0552947907703296e-06, + "loss": 0.5165, + "step": 5490 + }, + { + "epoch": 0.44595143344432714, + "grad_norm": 5.895017788703498, + "learning_rate": 3.054653576917581e-06, + "loss": 0.3758, + "step": 5491 + }, + { + "epoch": 0.4460326484203687, + "grad_norm": 3.720905933238603, + "learning_rate": 3.054012324685742e-06, + "loss": 0.574, + "step": 5492 + }, + { + "epoch": 0.4461138633964103, + "grad_norm": 6.446011438422685, + "learning_rate": 3.05337103411918e-06, + "loss": 0.6723, + "step": 5493 + }, + { + "epoch": 0.44619507837245187, + "grad_norm": 4.27621399758535, + "learning_rate": 3.0527297052622724e-06, + "loss": 0.5515, + "step": 5494 + }, + { + "epoch": 0.44627629334849345, + "grad_norm": 14.21365870200448, + "learning_rate": 3.0520883381593945e-06, + "loss": 0.4223, + "step": 5495 + }, + { + "epoch": 0.446357508324535, + "grad_norm": 3.727959140755334, + "learning_rate": 3.0514469328549244e-06, + "loss": 0.7102, + "step": 5496 + }, + { + "epoch": 0.4464387233005766, + "grad_norm": 4.155096776178402, + "learning_rate": 3.050805489393246e-06, + "loss": 0.6996, + "step": 5497 + }, + { + "epoch": 0.44651993827661823, + "grad_norm": 3.376388149465253, + "learning_rate": 3.0501640078187433e-06, + "loss": 0.6374, + "step": 5498 + }, + { + "epoch": 0.4466011532526598, + "grad_norm": 6.291374977431427, + "learning_rate": 3.049522488175802e-06, + "loss": 0.5527, + "step": 5499 + }, + { + "epoch": 0.4466823682287014, + "grad_norm": 6.926029198647835, + "learning_rate": 3.048880930508813e-06, + "loss": 0.4501, + "step": 5500 + }, + { + "epoch": 0.44676358320474296, + "grad_norm": 8.251812163196625, + "learning_rate": 3.0482393348621686e-06, + "loss": 0.4361, + "step": 5501 + }, + { + "epoch": 0.44684479818078454, + "grad_norm": 4.312548651784504, + "learning_rate": 3.0475977012802636e-06, + "loss": 0.4884, + "step": 5502 + }, + { + "epoch": 0.4469260131568261, + "grad_norm": 8.316308066336365, + "learning_rate": 3.0469560298074963e-06, + "loss": 0.4457, + "step": 5503 + }, + { + "epoch": 0.4470072281328677, + "grad_norm": 5.919440004233395, + "learning_rate": 3.046314320488266e-06, + "loss": 0.5825, + "step": 5504 + }, + { + "epoch": 0.4470884431089093, + "grad_norm": 3.520589949942169, + "learning_rate": 3.045672573366976e-06, + "loss": 0.7033, + "step": 5505 + }, + { + "epoch": 0.44716965808495085, + "grad_norm": 5.061834295396976, + "learning_rate": 3.045030788488032e-06, + "loss": 0.5824, + "step": 5506 + }, + { + "epoch": 0.4472508730609924, + "grad_norm": 5.285749057766182, + "learning_rate": 3.0443889658958425e-06, + "loss": 0.5358, + "step": 5507 + }, + { + "epoch": 0.447332088037034, + "grad_norm": 6.668648467416671, + "learning_rate": 3.043747105634817e-06, + "loss": 0.4972, + "step": 5508 + }, + { + "epoch": 0.44741330301307564, + "grad_norm": 5.315769012630028, + "learning_rate": 3.0431052077493693e-06, + "loss": 0.7238, + "step": 5509 + }, + { + "epoch": 0.4474945179891172, + "grad_norm": 4.114487698288283, + "learning_rate": 3.0424632722839164e-06, + "loss": 0.7628, + "step": 5510 + }, + { + "epoch": 0.4475757329651588, + "grad_norm": 5.636091284907231, + "learning_rate": 3.041821299282876e-06, + "loss": 0.6014, + "step": 5511 + }, + { + "epoch": 0.44765694794120037, + "grad_norm": 5.436008606726467, + "learning_rate": 3.0411792887906684e-06, + "loss": 0.4414, + "step": 5512 + }, + { + "epoch": 0.44773816291724194, + "grad_norm": 5.179891754796302, + "learning_rate": 3.0405372408517187e-06, + "loss": 0.477, + "step": 5513 + }, + { + "epoch": 0.4478193778932835, + "grad_norm": 2.443055617371502, + "learning_rate": 3.0398951555104528e-06, + "loss": 0.6141, + "step": 5514 + }, + { + "epoch": 0.4479005928693251, + "grad_norm": 3.1073744907433745, + "learning_rate": 3.0392530328112997e-06, + "loss": 0.4829, + "step": 5515 + }, + { + "epoch": 0.4479818078453667, + "grad_norm": 10.2090255345531, + "learning_rate": 3.0386108727986903e-06, + "loss": 0.4541, + "step": 5516 + }, + { + "epoch": 0.44806302282140825, + "grad_norm": 5.848825993848949, + "learning_rate": 3.037968675517059e-06, + "loss": 0.4629, + "step": 5517 + }, + { + "epoch": 0.44814423779744983, + "grad_norm": 5.418590209098287, + "learning_rate": 3.0373264410108422e-06, + "loss": 0.512, + "step": 5518 + }, + { + "epoch": 0.4482254527734914, + "grad_norm": 4.498788774198856, + "learning_rate": 3.03668416932448e-06, + "loss": 0.4333, + "step": 5519 + }, + { + "epoch": 0.44830666774953304, + "grad_norm": 7.415061980682369, + "learning_rate": 3.0360418605024134e-06, + "loss": 0.4415, + "step": 5520 + }, + { + "epoch": 0.4483878827255746, + "grad_norm": 4.0616647064171465, + "learning_rate": 3.0353995145890868e-06, + "loss": 0.6796, + "step": 5521 + }, + { + "epoch": 0.4484690977016162, + "grad_norm": 5.786539246564089, + "learning_rate": 3.0347571316289476e-06, + "loss": 0.4574, + "step": 5522 + }, + { + "epoch": 0.44855031267765777, + "grad_norm": 4.2099399908062445, + "learning_rate": 3.0341147116664455e-06, + "loss": 0.5367, + "step": 5523 + }, + { + "epoch": 0.44863152765369935, + "grad_norm": 4.538423749900379, + "learning_rate": 3.0334722547460317e-06, + "loss": 0.374, + "step": 5524 + }, + { + "epoch": 0.4487127426297409, + "grad_norm": 3.5387099780140137, + "learning_rate": 3.032829760912161e-06, + "loss": 0.4963, + "step": 5525 + }, + { + "epoch": 0.4487939576057825, + "grad_norm": 5.798358992841778, + "learning_rate": 3.032187230209291e-06, + "loss": 0.5552, + "step": 5526 + }, + { + "epoch": 0.4488751725818241, + "grad_norm": 4.773816159502634, + "learning_rate": 3.0315446626818816e-06, + "loss": 0.5549, + "step": 5527 + }, + { + "epoch": 0.44895638755786565, + "grad_norm": 4.150570620974484, + "learning_rate": 3.030902058374394e-06, + "loss": 0.5485, + "step": 5528 + }, + { + "epoch": 0.44903760253390723, + "grad_norm": 5.126013922801945, + "learning_rate": 3.0302594173312937e-06, + "loss": 0.4232, + "step": 5529 + }, + { + "epoch": 0.4491188175099488, + "grad_norm": 3.268316271468154, + "learning_rate": 3.0296167395970494e-06, + "loss": 0.5323, + "step": 5530 + }, + { + "epoch": 0.44920003248599044, + "grad_norm": 5.730548961152452, + "learning_rate": 3.0289740252161288e-06, + "loss": 0.6075, + "step": 5531 + }, + { + "epoch": 0.449281247462032, + "grad_norm": 7.380827782342893, + "learning_rate": 3.0283312742330044e-06, + "loss": 0.5684, + "step": 5532 + }, + { + "epoch": 0.4493624624380736, + "grad_norm": 3.0968839249512983, + "learning_rate": 3.027688486692153e-06, + "loss": 0.6726, + "step": 5533 + }, + { + "epoch": 0.4494436774141152, + "grad_norm": 3.139400489756557, + "learning_rate": 3.027045662638051e-06, + "loss": 0.5246, + "step": 5534 + }, + { + "epoch": 0.44952489239015675, + "grad_norm": 5.977877338701136, + "learning_rate": 3.026402802115178e-06, + "loss": 0.4677, + "step": 5535 + }, + { + "epoch": 0.4496061073661983, + "grad_norm": 4.58739972834906, + "learning_rate": 3.0257599051680175e-06, + "loss": 0.4622, + "step": 5536 + }, + { + "epoch": 0.4496873223422399, + "grad_norm": 6.184504437794764, + "learning_rate": 3.025116971841054e-06, + "loss": 0.6765, + "step": 5537 + }, + { + "epoch": 0.4497685373182815, + "grad_norm": 8.256944628820975, + "learning_rate": 3.0244740021787756e-06, + "loss": 0.5273, + "step": 5538 + }, + { + "epoch": 0.44984975229432306, + "grad_norm": 5.127185634205651, + "learning_rate": 3.023830996225671e-06, + "loss": 0.5202, + "step": 5539 + }, + { + "epoch": 0.44993096727036463, + "grad_norm": 5.783243249616182, + "learning_rate": 3.023187954026234e-06, + "loss": 0.588, + "step": 5540 + }, + { + "epoch": 0.4500121822464062, + "grad_norm": 11.669014789188214, + "learning_rate": 3.0225448756249605e-06, + "loss": 0.5103, + "step": 5541 + }, + { + "epoch": 0.45009339722244784, + "grad_norm": 12.31667513372858, + "learning_rate": 3.0219017610663466e-06, + "loss": 0.6313, + "step": 5542 + }, + { + "epoch": 0.4501746121984894, + "grad_norm": 4.705587679569699, + "learning_rate": 3.0212586103948933e-06, + "loss": 0.5984, + "step": 5543 + }, + { + "epoch": 0.450255827174531, + "grad_norm": 7.657296950948716, + "learning_rate": 3.020615423655102e-06, + "loss": 0.4608, + "step": 5544 + }, + { + "epoch": 0.4503370421505726, + "grad_norm": 4.260385063187866, + "learning_rate": 3.0199722008914787e-06, + "loss": 0.6098, + "step": 5545 + }, + { + "epoch": 0.45041825712661415, + "grad_norm": 5.405032331240298, + "learning_rate": 3.0193289421485317e-06, + "loss": 0.3907, + "step": 5546 + }, + { + "epoch": 0.45049947210265573, + "grad_norm": 5.755862088555348, + "learning_rate": 3.0186856474707705e-06, + "loss": 0.4768, + "step": 5547 + }, + { + "epoch": 0.4505806870786973, + "grad_norm": 8.389991068275073, + "learning_rate": 3.0180423169027067e-06, + "loss": 0.4632, + "step": 5548 + }, + { + "epoch": 0.4506619020547389, + "grad_norm": 7.221227795632766, + "learning_rate": 3.0173989504888573e-06, + "loss": 0.4662, + "step": 5549 + }, + { + "epoch": 0.45074311703078046, + "grad_norm": 3.6157966531181467, + "learning_rate": 3.0167555482737384e-06, + "loss": 0.6383, + "step": 5550 + }, + { + "epoch": 0.45082433200682204, + "grad_norm": 5.5152149011394185, + "learning_rate": 3.01611211030187e-06, + "loss": 0.3908, + "step": 5551 + }, + { + "epoch": 0.4509055469828636, + "grad_norm": 4.18620086470904, + "learning_rate": 3.0154686366177753e-06, + "loss": 0.4462, + "step": 5552 + }, + { + "epoch": 0.45098676195890525, + "grad_norm": 5.069886109587496, + "learning_rate": 3.0148251272659795e-06, + "loss": 0.4242, + "step": 5553 + }, + { + "epoch": 0.4510679769349468, + "grad_norm": 4.107158289610819, + "learning_rate": 3.0141815822910094e-06, + "loss": 0.5341, + "step": 5554 + }, + { + "epoch": 0.4511491919109884, + "grad_norm": 4.955579663758407, + "learning_rate": 3.013538001737395e-06, + "loss": 0.561, + "step": 5555 + }, + { + "epoch": 0.45123040688703, + "grad_norm": 4.9599952397922635, + "learning_rate": 3.0128943856496686e-06, + "loss": 0.6114, + "step": 5556 + }, + { + "epoch": 0.45131162186307155, + "grad_norm": 8.882975709525924, + "learning_rate": 3.0122507340723656e-06, + "loss": 0.6247, + "step": 5557 + }, + { + "epoch": 0.45139283683911313, + "grad_norm": 5.7173513452837135, + "learning_rate": 3.011607047050022e-06, + "loss": 0.4825, + "step": 5558 + }, + { + "epoch": 0.4514740518151547, + "grad_norm": 6.378003204165053, + "learning_rate": 3.0109633246271783e-06, + "loss": 0.4608, + "step": 5559 + }, + { + "epoch": 0.4515552667911963, + "grad_norm": 8.215078154867541, + "learning_rate": 3.0103195668483787e-06, + "loss": 0.4853, + "step": 5560 + }, + { + "epoch": 0.45163648176723786, + "grad_norm": 5.6937027402469305, + "learning_rate": 3.009675773758164e-06, + "loss": 0.5338, + "step": 5561 + }, + { + "epoch": 0.45171769674327944, + "grad_norm": 9.181599736624747, + "learning_rate": 3.009031945401084e-06, + "loss": 0.4927, + "step": 5562 + }, + { + "epoch": 0.451798911719321, + "grad_norm": 11.154194461173983, + "learning_rate": 3.008388081821687e-06, + "loss": 0.5629, + "step": 5563 + }, + { + "epoch": 0.45188012669536265, + "grad_norm": 8.09877927664861, + "learning_rate": 3.0077441830645256e-06, + "loss": 0.6261, + "step": 5564 + }, + { + "epoch": 0.4519613416714042, + "grad_norm": 6.075046325765384, + "learning_rate": 3.0071002491741537e-06, + "loss": 0.5261, + "step": 5565 + }, + { + "epoch": 0.4520425566474458, + "grad_norm": 7.918146195582449, + "learning_rate": 3.0064562801951286e-06, + "loss": 0.4545, + "step": 5566 + }, + { + "epoch": 0.4521237716234874, + "grad_norm": 4.644487714982916, + "learning_rate": 3.005812276172009e-06, + "loss": 0.5743, + "step": 5567 + }, + { + "epoch": 0.45220498659952896, + "grad_norm": 3.5759922555457924, + "learning_rate": 3.005168237149357e-06, + "loss": 0.572, + "step": 5568 + }, + { + "epoch": 0.45228620157557053, + "grad_norm": 6.775625726215614, + "learning_rate": 3.0045241631717366e-06, + "loss": 0.5287, + "step": 5569 + }, + { + "epoch": 0.4523674165516121, + "grad_norm": 3.7500419034479457, + "learning_rate": 3.0038800542837137e-06, + "loss": 0.5277, + "step": 5570 + }, + { + "epoch": 0.4524486315276537, + "grad_norm": 4.670720788070081, + "learning_rate": 3.003235910529859e-06, + "loss": 0.4724, + "step": 5571 + }, + { + "epoch": 0.45252984650369527, + "grad_norm": 7.410732606803197, + "learning_rate": 3.0025917319547417e-06, + "loss": 0.6593, + "step": 5572 + }, + { + "epoch": 0.45261106147973684, + "grad_norm": 7.469496813543684, + "learning_rate": 3.001947518602937e-06, + "loss": 0.4562, + "step": 5573 + }, + { + "epoch": 0.4526922764557784, + "grad_norm": 4.482461581886912, + "learning_rate": 3.0013032705190196e-06, + "loss": 0.5644, + "step": 5574 + }, + { + "epoch": 0.45277349143182005, + "grad_norm": 4.5986439128006, + "learning_rate": 3.00065898774757e-06, + "loss": 0.5426, + "step": 5575 + }, + { + "epoch": 0.45285470640786163, + "grad_norm": 4.16197803163674, + "learning_rate": 3.000014670333168e-06, + "loss": 0.5278, + "step": 5576 + }, + { + "epoch": 0.4529359213839032, + "grad_norm": 3.5560084963087646, + "learning_rate": 2.9993703183203963e-06, + "loss": 0.5385, + "step": 5577 + }, + { + "epoch": 0.4530171363599448, + "grad_norm": 4.260872328467781, + "learning_rate": 2.998725931753842e-06, + "loss": 0.449, + "step": 5578 + }, + { + "epoch": 0.45309835133598636, + "grad_norm": 4.8192798552612635, + "learning_rate": 2.9980815106780937e-06, + "loss": 0.605, + "step": 5579 + }, + { + "epoch": 0.45317956631202794, + "grad_norm": 3.949873091272697, + "learning_rate": 2.9974370551377396e-06, + "loss": 0.5863, + "step": 5580 + }, + { + "epoch": 0.4532607812880695, + "grad_norm": 8.728645349811258, + "learning_rate": 2.9967925651773745e-06, + "loss": 0.4891, + "step": 5581 + }, + { + "epoch": 0.4533419962641111, + "grad_norm": 3.623472869472515, + "learning_rate": 2.9961480408415926e-06, + "loss": 0.6374, + "step": 5582 + }, + { + "epoch": 0.45342321124015267, + "grad_norm": 4.651906826052307, + "learning_rate": 2.995503482174993e-06, + "loss": 0.4461, + "step": 5583 + }, + { + "epoch": 0.45350442621619425, + "grad_norm": 4.256628817272692, + "learning_rate": 2.9948588892221744e-06, + "loss": 0.4862, + "step": 5584 + }, + { + "epoch": 0.4535856411922358, + "grad_norm": 3.3916680603373797, + "learning_rate": 2.9942142620277394e-06, + "loss": 0.5934, + "step": 5585 + }, + { + "epoch": 0.45366685616827745, + "grad_norm": 6.635234044978131, + "learning_rate": 2.993569600636293e-06, + "loss": 0.5009, + "step": 5586 + }, + { + "epoch": 0.45374807114431903, + "grad_norm": 5.742703156297073, + "learning_rate": 2.9929249050924424e-06, + "loss": 0.4523, + "step": 5587 + }, + { + "epoch": 0.4538292861203606, + "grad_norm": 4.30344798988703, + "learning_rate": 2.992280175440797e-06, + "loss": 0.5456, + "step": 5588 + }, + { + "epoch": 0.4539105010964022, + "grad_norm": 4.517980897014619, + "learning_rate": 2.99163541172597e-06, + "loss": 0.4736, + "step": 5589 + }, + { + "epoch": 0.45399171607244376, + "grad_norm": 5.224517211850113, + "learning_rate": 2.990990613992573e-06, + "loss": 0.6306, + "step": 5590 + }, + { + "epoch": 0.45407293104848534, + "grad_norm": 4.522124261787193, + "learning_rate": 2.990345782285225e-06, + "loss": 0.5577, + "step": 5591 + }, + { + "epoch": 0.4541541460245269, + "grad_norm": 4.777313229511017, + "learning_rate": 2.989700916648544e-06, + "loss": 0.4167, + "step": 5592 + }, + { + "epoch": 0.4542353610005685, + "grad_norm": 6.094818732275617, + "learning_rate": 2.989056017127151e-06, + "loss": 0.4435, + "step": 5593 + }, + { + "epoch": 0.45431657597661007, + "grad_norm": 7.062295268512343, + "learning_rate": 2.988411083765669e-06, + "loss": 0.5327, + "step": 5594 + }, + { + "epoch": 0.45439779095265165, + "grad_norm": 5.878383502169189, + "learning_rate": 2.9877661166087265e-06, + "loss": 0.7565, + "step": 5595 + }, + { + "epoch": 0.4544790059286932, + "grad_norm": 5.252077496243823, + "learning_rate": 2.9871211157009496e-06, + "loss": 0.3842, + "step": 5596 + }, + { + "epoch": 0.45456022090473486, + "grad_norm": 4.8053252445722014, + "learning_rate": 2.986476081086969e-06, + "loss": 0.5382, + "step": 5597 + }, + { + "epoch": 0.45464143588077643, + "grad_norm": 5.29723639288913, + "learning_rate": 2.9858310128114187e-06, + "loss": 0.5417, + "step": 5598 + }, + { + "epoch": 0.454722650856818, + "grad_norm": 4.021237021928457, + "learning_rate": 2.9851859109189335e-06, + "loss": 0.54, + "step": 5599 + }, + { + "epoch": 0.4548038658328596, + "grad_norm": 26.465640352460827, + "learning_rate": 2.9845407754541513e-06, + "loss": 0.4593, + "step": 5600 + }, + { + "epoch": 0.45488508080890117, + "grad_norm": 4.300973423387741, + "learning_rate": 2.9838956064617108e-06, + "loss": 0.547, + "step": 5601 + }, + { + "epoch": 0.45496629578494274, + "grad_norm": 4.972732875615428, + "learning_rate": 2.9832504039862564e-06, + "loss": 0.5216, + "step": 5602 + }, + { + "epoch": 0.4550475107609843, + "grad_norm": 3.8151254239492074, + "learning_rate": 2.982605168072431e-06, + "loss": 0.5508, + "step": 5603 + }, + { + "epoch": 0.4551287257370259, + "grad_norm": 4.251587340007891, + "learning_rate": 2.981959898764882e-06, + "loss": 0.4318, + "step": 5604 + }, + { + "epoch": 0.4552099407130675, + "grad_norm": 4.436180317192108, + "learning_rate": 2.9813145961082594e-06, + "loss": 0.5097, + "step": 5605 + }, + { + "epoch": 0.45529115568910905, + "grad_norm": 4.5049604397224785, + "learning_rate": 2.9806692601472143e-06, + "loss": 0.6497, + "step": 5606 + }, + { + "epoch": 0.4553723706651506, + "grad_norm": 4.817837180058034, + "learning_rate": 2.9800238909263994e-06, + "loss": 0.6143, + "step": 5607 + }, + { + "epoch": 0.45545358564119226, + "grad_norm": 3.4243930089969785, + "learning_rate": 2.9793784884904733e-06, + "loss": 0.5063, + "step": 5608 + }, + { + "epoch": 0.45553480061723384, + "grad_norm": 5.925953178663676, + "learning_rate": 2.9787330528840915e-06, + "loss": 0.4992, + "step": 5609 + }, + { + "epoch": 0.4556160155932754, + "grad_norm": 5.497205043778295, + "learning_rate": 2.978087584151915e-06, + "loss": 0.5372, + "step": 5610 + }, + { + "epoch": 0.455697230569317, + "grad_norm": 8.33939404348421, + "learning_rate": 2.9774420823386104e-06, + "loss": 0.5619, + "step": 5611 + }, + { + "epoch": 0.45577844554535857, + "grad_norm": 6.195975570900406, + "learning_rate": 2.9767965474888395e-06, + "loss": 0.5683, + "step": 5612 + }, + { + "epoch": 0.45585966052140015, + "grad_norm": 14.703831195583096, + "learning_rate": 2.9761509796472697e-06, + "loss": 0.3429, + "step": 5613 + }, + { + "epoch": 0.4559408754974417, + "grad_norm": 4.631555776984385, + "learning_rate": 2.975505378858574e-06, + "loss": 0.6409, + "step": 5614 + }, + { + "epoch": 0.4560220904734833, + "grad_norm": 3.4809174977075803, + "learning_rate": 2.974859745167422e-06, + "loss": 0.769, + "step": 5615 + }, + { + "epoch": 0.4561033054495249, + "grad_norm": 5.098517264596913, + "learning_rate": 2.9742140786184885e-06, + "loss": 0.5612, + "step": 5616 + }, + { + "epoch": 0.45618452042556645, + "grad_norm": 6.00392072759688, + "learning_rate": 2.9735683792564506e-06, + "loss": 0.5576, + "step": 5617 + }, + { + "epoch": 0.45626573540160803, + "grad_norm": 7.263834884088008, + "learning_rate": 2.9729226471259877e-06, + "loss": 0.6709, + "step": 5618 + }, + { + "epoch": 0.45634695037764966, + "grad_norm": 3.554328673137164, + "learning_rate": 2.9722768822717795e-06, + "loss": 0.6453, + "step": 5619 + }, + { + "epoch": 0.45642816535369124, + "grad_norm": 5.778288795426547, + "learning_rate": 2.971631084738511e-06, + "loss": 0.4381, + "step": 5620 + }, + { + "epoch": 0.4565093803297328, + "grad_norm": 3.465143813973219, + "learning_rate": 2.9709852545708677e-06, + "loss": 0.5124, + "step": 5621 + }, + { + "epoch": 0.4565905953057744, + "grad_norm": 4.670081929190185, + "learning_rate": 2.9703393918135383e-06, + "loss": 0.4459, + "step": 5622 + }, + { + "epoch": 0.45667181028181597, + "grad_norm": 4.713194770845547, + "learning_rate": 2.96969349651121e-06, + "loss": 0.4847, + "step": 5623 + }, + { + "epoch": 0.45675302525785755, + "grad_norm": 3.630426739892531, + "learning_rate": 2.9690475687085795e-06, + "loss": 0.4093, + "step": 5624 + }, + { + "epoch": 0.4568342402338991, + "grad_norm": 5.486371575732508, + "learning_rate": 2.968401608450339e-06, + "loss": 0.6108, + "step": 5625 + }, + { + "epoch": 0.4569154552099407, + "grad_norm": 8.575804166016505, + "learning_rate": 2.967755615781186e-06, + "loss": 0.45, + "step": 5626 + }, + { + "epoch": 0.4569966701859823, + "grad_norm": 8.699853217920039, + "learning_rate": 2.9671095907458203e-06, + "loss": 0.5099, + "step": 5627 + }, + { + "epoch": 0.45707788516202386, + "grad_norm": 3.801057038330998, + "learning_rate": 2.966463533388943e-06, + "loss": 0.5301, + "step": 5628 + }, + { + "epoch": 0.45715910013806543, + "grad_norm": 6.044511448542259, + "learning_rate": 2.9658174437552577e-06, + "loss": 0.5236, + "step": 5629 + }, + { + "epoch": 0.45724031511410707, + "grad_norm": 5.875396782939016, + "learning_rate": 2.9651713218894706e-06, + "loss": 0.6417, + "step": 5630 + }, + { + "epoch": 0.45732153009014864, + "grad_norm": 4.000366715370684, + "learning_rate": 2.96452516783629e-06, + "loss": 0.5186, + "step": 5631 + }, + { + "epoch": 0.4574027450661902, + "grad_norm": 4.69919331836541, + "learning_rate": 2.9638789816404264e-06, + "loss": 0.5162, + "step": 5632 + }, + { + "epoch": 0.4574839600422318, + "grad_norm": 7.3603896088494185, + "learning_rate": 2.9632327633465917e-06, + "loss": 0.4416, + "step": 5633 + }, + { + "epoch": 0.4575651750182734, + "grad_norm": 7.25776019771969, + "learning_rate": 2.9625865129995023e-06, + "loss": 0.5364, + "step": 5634 + }, + { + "epoch": 0.45764638999431495, + "grad_norm": 5.174876153569623, + "learning_rate": 2.9619402306438738e-06, + "loss": 0.412, + "step": 5635 + }, + { + "epoch": 0.4577276049703565, + "grad_norm": 5.4010502742692355, + "learning_rate": 2.9612939163244266e-06, + "loss": 0.4053, + "step": 5636 + }, + { + "epoch": 0.4578088199463981, + "grad_norm": 3.835214497106897, + "learning_rate": 2.960647570085881e-06, + "loss": 0.4996, + "step": 5637 + }, + { + "epoch": 0.4578900349224397, + "grad_norm": 6.055794052072197, + "learning_rate": 2.960001191972963e-06, + "loss": 0.7144, + "step": 5638 + }, + { + "epoch": 0.45797124989848126, + "grad_norm": 4.158855522188884, + "learning_rate": 2.9593547820303954e-06, + "loss": 0.6862, + "step": 5639 + }, + { + "epoch": 0.45805246487452284, + "grad_norm": 3.708348870716642, + "learning_rate": 2.958708340302908e-06, + "loss": 0.5555, + "step": 5640 + }, + { + "epoch": 0.45813367985056447, + "grad_norm": 5.983420776955866, + "learning_rate": 2.958061866835232e-06, + "loss": 0.4188, + "step": 5641 + }, + { + "epoch": 0.45821489482660605, + "grad_norm": 8.29550387074976, + "learning_rate": 2.9574153616720986e-06, + "loss": 0.5177, + "step": 5642 + }, + { + "epoch": 0.4582961098026476, + "grad_norm": 5.444764698582431, + "learning_rate": 2.9567688248582436e-06, + "loss": 0.5237, + "step": 5643 + }, + { + "epoch": 0.4583773247786892, + "grad_norm": 10.29419590630684, + "learning_rate": 2.956122256438403e-06, + "loss": 0.4966, + "step": 5644 + }, + { + "epoch": 0.4584585397547308, + "grad_norm": 6.484747843668005, + "learning_rate": 2.955475656457316e-06, + "loss": 0.5255, + "step": 5645 + }, + { + "epoch": 0.45853975473077235, + "grad_norm": 5.119025976889396, + "learning_rate": 2.9548290249597246e-06, + "loss": 0.5482, + "step": 5646 + }, + { + "epoch": 0.45862096970681393, + "grad_norm": 6.402280776009877, + "learning_rate": 2.9541823619903716e-06, + "loss": 0.6823, + "step": 5647 + }, + { + "epoch": 0.4587021846828555, + "grad_norm": 7.6633301689603375, + "learning_rate": 2.9535356675940023e-06, + "loss": 0.4238, + "step": 5648 + }, + { + "epoch": 0.4587833996588971, + "grad_norm": 4.171651081928115, + "learning_rate": 2.952888941815366e-06, + "loss": 0.6524, + "step": 5649 + }, + { + "epoch": 0.45886461463493866, + "grad_norm": 5.133593022664716, + "learning_rate": 2.952242184699211e-06, + "loss": 0.5472, + "step": 5650 + }, + { + "epoch": 0.45894582961098024, + "grad_norm": 5.9594049521898205, + "learning_rate": 2.9515953962902914e-06, + "loss": 0.5447, + "step": 5651 + }, + { + "epoch": 0.45902704458702187, + "grad_norm": 6.3931523146679785, + "learning_rate": 2.950948576633359e-06, + "loss": 0.5729, + "step": 5652 + }, + { + "epoch": 0.45910825956306345, + "grad_norm": 5.8713619633879, + "learning_rate": 2.9503017257731727e-06, + "loss": 0.4821, + "step": 5653 + }, + { + "epoch": 0.459189474539105, + "grad_norm": 3.2931039167346743, + "learning_rate": 2.9496548437544905e-06, + "loss": 0.4733, + "step": 5654 + }, + { + "epoch": 0.4592706895151466, + "grad_norm": 5.1941445369327335, + "learning_rate": 2.9490079306220714e-06, + "loss": 0.3829, + "step": 5655 + }, + { + "epoch": 0.4593519044911882, + "grad_norm": 4.360689715599596, + "learning_rate": 2.9483609864206808e-06, + "loss": 0.5581, + "step": 5656 + }, + { + "epoch": 0.45943311946722976, + "grad_norm": 7.1909194878489195, + "learning_rate": 2.9477140111950834e-06, + "loss": 0.4832, + "step": 5657 + }, + { + "epoch": 0.45951433444327133, + "grad_norm": 5.716565178813941, + "learning_rate": 2.947067004990045e-06, + "loss": 0.458, + "step": 5658 + }, + { + "epoch": 0.4595955494193129, + "grad_norm": 5.12640741752529, + "learning_rate": 2.9464199678503364e-06, + "loss": 0.5096, + "step": 5659 + }, + { + "epoch": 0.4596767643953545, + "grad_norm": 4.73475880030873, + "learning_rate": 2.9457728998207286e-06, + "loss": 0.46, + "step": 5660 + }, + { + "epoch": 0.45975797937139606, + "grad_norm": 5.497100257560182, + "learning_rate": 2.9451258009459947e-06, + "loss": 0.4947, + "step": 5661 + }, + { + "epoch": 0.45983919434743764, + "grad_norm": 3.188631639464191, + "learning_rate": 2.9444786712709122e-06, + "loss": 0.4633, + "step": 5662 + }, + { + "epoch": 0.4599204093234793, + "grad_norm": 6.625940309443984, + "learning_rate": 2.943831510840257e-06, + "loss": 0.5155, + "step": 5663 + }, + { + "epoch": 0.46000162429952085, + "grad_norm": 6.021653211612759, + "learning_rate": 2.9431843196988107e-06, + "loss": 0.4657, + "step": 5664 + }, + { + "epoch": 0.4600828392755624, + "grad_norm": 4.027902151297978, + "learning_rate": 2.942537097891355e-06, + "loss": 0.5745, + "step": 5665 + }, + { + "epoch": 0.460164054251604, + "grad_norm": 4.556130080376428, + "learning_rate": 2.9418898454626744e-06, + "loss": 0.7196, + "step": 5666 + }, + { + "epoch": 0.4602452692276456, + "grad_norm": 7.887493386006087, + "learning_rate": 2.9412425624575553e-06, + "loss": 0.5629, + "step": 5667 + }, + { + "epoch": 0.46032648420368716, + "grad_norm": 11.085374599142877, + "learning_rate": 2.9405952489207858e-06, + "loss": 0.4118, + "step": 5668 + }, + { + "epoch": 0.46040769917972874, + "grad_norm": 5.492625782917754, + "learning_rate": 2.9399479048971567e-06, + "loss": 0.4832, + "step": 5669 + }, + { + "epoch": 0.4604889141557703, + "grad_norm": 3.5056589438634216, + "learning_rate": 2.939300530431462e-06, + "loss": 0.5736, + "step": 5670 + }, + { + "epoch": 0.4605701291318119, + "grad_norm": 4.465558544811287, + "learning_rate": 2.9386531255684942e-06, + "loss": 0.7238, + "step": 5671 + }, + { + "epoch": 0.46065134410785347, + "grad_norm": 48.57316377462239, + "learning_rate": 2.938005690353052e-06, + "loss": 0.5249, + "step": 5672 + }, + { + "epoch": 0.46073255908389504, + "grad_norm": 5.242926405538982, + "learning_rate": 2.937358224829935e-06, + "loss": 0.5339, + "step": 5673 + }, + { + "epoch": 0.4608137740599367, + "grad_norm": 5.8937374649076695, + "learning_rate": 2.936710729043943e-06, + "loss": 0.6889, + "step": 5674 + }, + { + "epoch": 0.46089498903597825, + "grad_norm": 5.462741459326538, + "learning_rate": 2.936063203039879e-06, + "loss": 0.5676, + "step": 5675 + }, + { + "epoch": 0.46097620401201983, + "grad_norm": 6.268297125768787, + "learning_rate": 2.93541564686255e-06, + "loss": 0.526, + "step": 5676 + }, + { + "epoch": 0.4610574189880614, + "grad_norm": 3.614454345003604, + "learning_rate": 2.9347680605567624e-06, + "loss": 0.4487, + "step": 5677 + }, + { + "epoch": 0.461138633964103, + "grad_norm": 8.297993020626132, + "learning_rate": 2.9341204441673267e-06, + "loss": 0.4531, + "step": 5678 + }, + { + "epoch": 0.46121984894014456, + "grad_norm": 4.364678727568973, + "learning_rate": 2.9334727977390526e-06, + "loss": 0.6027, + "step": 5679 + }, + { + "epoch": 0.46130106391618614, + "grad_norm": 8.596844992148577, + "learning_rate": 2.9328251213167557e-06, + "loss": 0.5818, + "step": 5680 + }, + { + "epoch": 0.4613822788922277, + "grad_norm": 4.6174846839314005, + "learning_rate": 2.9321774149452507e-06, + "loss": 0.4268, + "step": 5681 + }, + { + "epoch": 0.4614634938682693, + "grad_norm": 10.308065553433666, + "learning_rate": 2.9315296786693564e-06, + "loss": 0.5332, + "step": 5682 + }, + { + "epoch": 0.46154470884431087, + "grad_norm": 7.029864284315326, + "learning_rate": 2.9308819125338923e-06, + "loss": 0.6391, + "step": 5683 + }, + { + "epoch": 0.46162592382035245, + "grad_norm": 6.141294742654424, + "learning_rate": 2.9302341165836794e-06, + "loss": 0.7152, + "step": 5684 + }, + { + "epoch": 0.4617071387963941, + "grad_norm": 5.578226095205441, + "learning_rate": 2.9295862908635436e-06, + "loss": 0.5101, + "step": 5685 + }, + { + "epoch": 0.46178835377243566, + "grad_norm": 5.083560208470007, + "learning_rate": 2.92893843541831e-06, + "loss": 0.5236, + "step": 5686 + }, + { + "epoch": 0.46186956874847723, + "grad_norm": 3.7370396960027654, + "learning_rate": 2.928290550292806e-06, + "loss": 0.6088, + "step": 5687 + }, + { + "epoch": 0.4619507837245188, + "grad_norm": 8.921482639240043, + "learning_rate": 2.9276426355318625e-06, + "loss": 0.4895, + "step": 5688 + }, + { + "epoch": 0.4620319987005604, + "grad_norm": 3.2133566714283055, + "learning_rate": 2.9269946911803134e-06, + "loss": 0.5798, + "step": 5689 + }, + { + "epoch": 0.46211321367660196, + "grad_norm": 6.204381262030037, + "learning_rate": 2.92634671728299e-06, + "loss": 0.4417, + "step": 5690 + }, + { + "epoch": 0.46219442865264354, + "grad_norm": 5.691308583371654, + "learning_rate": 2.9256987138847302e-06, + "loss": 0.5512, + "step": 5691 + }, + { + "epoch": 0.4622756436286851, + "grad_norm": 4.319078154150966, + "learning_rate": 2.925050681030373e-06, + "loss": 0.5221, + "step": 5692 + }, + { + "epoch": 0.4623568586047267, + "grad_norm": 5.04058444742641, + "learning_rate": 2.9244026187647584e-06, + "loss": 0.647, + "step": 5693 + }, + { + "epoch": 0.46243807358076827, + "grad_norm": 5.620845349439251, + "learning_rate": 2.923754527132728e-06, + "loss": 0.4942, + "step": 5694 + }, + { + "epoch": 0.46251928855680985, + "grad_norm": 5.022513745576825, + "learning_rate": 2.9231064061791277e-06, + "loss": 0.3824, + "step": 5695 + }, + { + "epoch": 0.4626005035328515, + "grad_norm": 4.5284930571538835, + "learning_rate": 2.922458255948803e-06, + "loss": 0.4568, + "step": 5696 + }, + { + "epoch": 0.46268171850889306, + "grad_norm": 4.600857423481818, + "learning_rate": 2.9218100764866025e-06, + "loss": 0.6157, + "step": 5697 + }, + { + "epoch": 0.46276293348493464, + "grad_norm": 3.265626698109701, + "learning_rate": 2.9211618678373775e-06, + "loss": 0.4392, + "step": 5698 + }, + { + "epoch": 0.4628441484609762, + "grad_norm": 4.624889936593594, + "learning_rate": 2.9205136300459803e-06, + "loss": 0.463, + "step": 5699 + }, + { + "epoch": 0.4629253634370178, + "grad_norm": 5.8554370873391655, + "learning_rate": 2.919865363157265e-06, + "loss": 0.6943, + "step": 5700 + }, + { + "epoch": 0.46300657841305937, + "grad_norm": 10.564403475889055, + "learning_rate": 2.9192170672160892e-06, + "loss": 0.5849, + "step": 5701 + }, + { + "epoch": 0.46308779338910094, + "grad_norm": 4.570004688540801, + "learning_rate": 2.9185687422673103e-06, + "loss": 0.5091, + "step": 5702 + }, + { + "epoch": 0.4631690083651425, + "grad_norm": 7.370321832435026, + "learning_rate": 2.917920388355791e-06, + "loss": 0.4578, + "step": 5703 + }, + { + "epoch": 0.4632502233411841, + "grad_norm": 4.488088245228176, + "learning_rate": 2.9172720055263916e-06, + "loss": 0.3637, + "step": 5704 + }, + { + "epoch": 0.4633314383172257, + "grad_norm": 3.3278944590303086, + "learning_rate": 2.9166235938239785e-06, + "loss": 0.4415, + "step": 5705 + }, + { + "epoch": 0.46341265329326725, + "grad_norm": 25.965744402127452, + "learning_rate": 2.9159751532934165e-06, + "loss": 0.4605, + "step": 5706 + }, + { + "epoch": 0.4634938682693089, + "grad_norm": 5.292291969576549, + "learning_rate": 2.9153266839795756e-06, + "loss": 0.5707, + "step": 5707 + }, + { + "epoch": 0.46357508324535046, + "grad_norm": 7.540226421886441, + "learning_rate": 2.9146781859273276e-06, + "loss": 0.4891, + "step": 5708 + }, + { + "epoch": 0.46365629822139204, + "grad_norm": 3.896646180404308, + "learning_rate": 2.9140296591815425e-06, + "loss": 0.5333, + "step": 5709 + }, + { + "epoch": 0.4637375131974336, + "grad_norm": 5.700465799524542, + "learning_rate": 2.913381103787097e-06, + "loss": 0.391, + "step": 5710 + }, + { + "epoch": 0.4638187281734752, + "grad_norm": 4.798705492265267, + "learning_rate": 2.9127325197888663e-06, + "loss": 0.404, + "step": 5711 + }, + { + "epoch": 0.46389994314951677, + "grad_norm": 3.5471558345306877, + "learning_rate": 2.91208390723173e-06, + "loss": 0.7544, + "step": 5712 + }, + { + "epoch": 0.46398115812555835, + "grad_norm": 6.891850689361664, + "learning_rate": 2.911435266160568e-06, + "loss": 0.4801, + "step": 5713 + }, + { + "epoch": 0.4640623731015999, + "grad_norm": 4.753158541053688, + "learning_rate": 2.910786596620263e-06, + "loss": 0.4848, + "step": 5714 + }, + { + "epoch": 0.4641435880776415, + "grad_norm": 5.463566847148066, + "learning_rate": 2.9101378986556996e-06, + "loss": 0.4833, + "step": 5715 + }, + { + "epoch": 0.4642248030536831, + "grad_norm": 3.542984250968721, + "learning_rate": 2.909489172311765e-06, + "loss": 0.7555, + "step": 5716 + }, + { + "epoch": 0.46430601802972465, + "grad_norm": 4.081018198125554, + "learning_rate": 2.9088404176333456e-06, + "loss": 0.5655, + "step": 5717 + }, + { + "epoch": 0.4643872330057663, + "grad_norm": 5.0004125127548, + "learning_rate": 2.9081916346653333e-06, + "loss": 0.6157, + "step": 5718 + }, + { + "epoch": 0.46446844798180786, + "grad_norm": 6.309500154267012, + "learning_rate": 2.9075428234526215e-06, + "loss": 0.404, + "step": 5719 + }, + { + "epoch": 0.46454966295784944, + "grad_norm": 5.199891325095606, + "learning_rate": 2.9068939840401018e-06, + "loss": 0.4213, + "step": 5720 + }, + { + "epoch": 0.464630877933891, + "grad_norm": 3.3927116646573685, + "learning_rate": 2.906245116472672e-06, + "loss": 0.5366, + "step": 5721 + }, + { + "epoch": 0.4647120929099326, + "grad_norm": 5.355679239024366, + "learning_rate": 2.905596220795231e-06, + "loss": 0.5221, + "step": 5722 + }, + { + "epoch": 0.46479330788597417, + "grad_norm": 4.733947923123343, + "learning_rate": 2.9049472970526777e-06, + "loss": 0.49, + "step": 5723 + }, + { + "epoch": 0.46487452286201575, + "grad_norm": 6.624090170841801, + "learning_rate": 2.904298345289914e-06, + "loss": 0.4846, + "step": 5724 + }, + { + "epoch": 0.4649557378380573, + "grad_norm": 5.72885175850521, + "learning_rate": 2.9036493655518456e-06, + "loss": 0.4319, + "step": 5725 + }, + { + "epoch": 0.4650369528140989, + "grad_norm": 4.332865010791888, + "learning_rate": 2.9030003578833765e-06, + "loss": 0.5975, + "step": 5726 + }, + { + "epoch": 0.4651181677901405, + "grad_norm": 6.313800965150897, + "learning_rate": 2.902351322329416e-06, + "loss": 0.5159, + "step": 5727 + }, + { + "epoch": 0.46519938276618206, + "grad_norm": 5.227382270145744, + "learning_rate": 2.9017022589348733e-06, + "loss": 0.393, + "step": 5728 + }, + { + "epoch": 0.4652805977422237, + "grad_norm": 2.840558709344795, + "learning_rate": 2.9010531677446602e-06, + "loss": 0.52, + "step": 5729 + }, + { + "epoch": 0.46536181271826527, + "grad_norm": 5.366773140432362, + "learning_rate": 2.90040404880369e-06, + "loss": 0.506, + "step": 5730 + }, + { + "epoch": 0.46544302769430684, + "grad_norm": 3.2313831572145193, + "learning_rate": 2.8997549021568792e-06, + "loss": 0.6582, + "step": 5731 + }, + { + "epoch": 0.4655242426703484, + "grad_norm": 5.926467612756465, + "learning_rate": 2.899105727849145e-06, + "loss": 0.4414, + "step": 5732 + }, + { + "epoch": 0.46560545764639, + "grad_norm": 4.447511140460557, + "learning_rate": 2.898456525925406e-06, + "loss": 0.54, + "step": 5733 + }, + { + "epoch": 0.4656866726224316, + "grad_norm": 5.303134033820951, + "learning_rate": 2.8978072964305848e-06, + "loss": 0.4579, + "step": 5734 + }, + { + "epoch": 0.46576788759847315, + "grad_norm": 5.195062102266287, + "learning_rate": 2.8971580394096043e-06, + "loss": 0.4434, + "step": 5735 + }, + { + "epoch": 0.46584910257451473, + "grad_norm": 5.8469679642289165, + "learning_rate": 2.896508754907389e-06, + "loss": 0.5036, + "step": 5736 + }, + { + "epoch": 0.4659303175505563, + "grad_norm": 3.484649611013429, + "learning_rate": 2.8958594429688656e-06, + "loss": 0.483, + "step": 5737 + }, + { + "epoch": 0.4660115325265979, + "grad_norm": 7.276763839964852, + "learning_rate": 2.895210103638966e-06, + "loss": 0.5142, + "step": 5738 + }, + { + "epoch": 0.46609274750263946, + "grad_norm": 4.316468383561267, + "learning_rate": 2.894560736962617e-06, + "loss": 0.6067, + "step": 5739 + }, + { + "epoch": 0.4661739624786811, + "grad_norm": 3.8962363928693184, + "learning_rate": 2.893911342984754e-06, + "loss": 0.7335, + "step": 5740 + }, + { + "epoch": 0.46625517745472267, + "grad_norm": 4.124297053875989, + "learning_rate": 2.89326192175031e-06, + "loss": 0.4862, + "step": 5741 + }, + { + "epoch": 0.46633639243076425, + "grad_norm": 5.37302630861064, + "learning_rate": 2.8926124733042228e-06, + "loss": 0.3562, + "step": 5742 + }, + { + "epoch": 0.4664176074068058, + "grad_norm": 5.046203124192214, + "learning_rate": 2.89196299769143e-06, + "loss": 0.7901, + "step": 5743 + }, + { + "epoch": 0.4664988223828474, + "grad_norm": 8.053971161102101, + "learning_rate": 2.8913134949568726e-06, + "loss": 0.5199, + "step": 5744 + }, + { + "epoch": 0.466580037358889, + "grad_norm": 3.327749003210284, + "learning_rate": 2.890663965145492e-06, + "loss": 0.5486, + "step": 5745 + }, + { + "epoch": 0.46666125233493055, + "grad_norm": 5.0353868986052674, + "learning_rate": 2.890014408302233e-06, + "loss": 0.6335, + "step": 5746 + }, + { + "epoch": 0.46674246731097213, + "grad_norm": 6.139502965392141, + "learning_rate": 2.8893648244720406e-06, + "loss": 0.4263, + "step": 5747 + }, + { + "epoch": 0.4668236822870137, + "grad_norm": 5.019570855865613, + "learning_rate": 2.8887152136998644e-06, + "loss": 0.5657, + "step": 5748 + }, + { + "epoch": 0.4669048972630553, + "grad_norm": 16.504398142409077, + "learning_rate": 2.8880655760306507e-06, + "loss": 0.3836, + "step": 5749 + }, + { + "epoch": 0.46698611223909686, + "grad_norm": 5.525848295027518, + "learning_rate": 2.887415911509354e-06, + "loss": 0.6306, + "step": 5750 + }, + { + "epoch": 0.4670673272151385, + "grad_norm": 4.288458640459229, + "learning_rate": 2.8867662201809266e-06, + "loss": 0.6479, + "step": 5751 + }, + { + "epoch": 0.46714854219118007, + "grad_norm": 8.844918615051897, + "learning_rate": 2.8861165020903235e-06, + "loss": 0.5476, + "step": 5752 + }, + { + "epoch": 0.46722975716722165, + "grad_norm": 8.708089067131917, + "learning_rate": 2.8854667572825013e-06, + "loss": 0.4022, + "step": 5753 + }, + { + "epoch": 0.4673109721432632, + "grad_norm": 3.9421995828081897, + "learning_rate": 2.8848169858024206e-06, + "loss": 0.4588, + "step": 5754 + }, + { + "epoch": 0.4673921871193048, + "grad_norm": 3.8977256522289374, + "learning_rate": 2.8841671876950404e-06, + "loss": 0.4142, + "step": 5755 + }, + { + "epoch": 0.4674734020953464, + "grad_norm": 8.084593014494807, + "learning_rate": 2.8835173630053244e-06, + "loss": 0.4468, + "step": 5756 + }, + { + "epoch": 0.46755461707138796, + "grad_norm": 3.533484576984449, + "learning_rate": 2.882867511778237e-06, + "loss": 0.5784, + "step": 5757 + }, + { + "epoch": 0.46763583204742953, + "grad_norm": 4.263459871167963, + "learning_rate": 2.8822176340587434e-06, + "loss": 0.6078, + "step": 5758 + }, + { + "epoch": 0.4677170470234711, + "grad_norm": 3.9845148250837865, + "learning_rate": 2.881567729891812e-06, + "loss": 0.4972, + "step": 5759 + }, + { + "epoch": 0.4677982619995127, + "grad_norm": 7.062828717835998, + "learning_rate": 2.8809177993224143e-06, + "loss": 0.5564, + "step": 5760 + }, + { + "epoch": 0.46787947697555426, + "grad_norm": 4.435936639131757, + "learning_rate": 2.88026784239552e-06, + "loss": 0.573, + "step": 5761 + }, + { + "epoch": 0.4679606919515959, + "grad_norm": 4.512414174366236, + "learning_rate": 2.8796178591561035e-06, + "loss": 0.4828, + "step": 5762 + }, + { + "epoch": 0.4680419069276375, + "grad_norm": 5.094918568491417, + "learning_rate": 2.8789678496491407e-06, + "loss": 0.5475, + "step": 5763 + }, + { + "epoch": 0.46812312190367905, + "grad_norm": 2.6728500983692514, + "learning_rate": 2.878317813919608e-06, + "loss": 0.5026, + "step": 5764 + }, + { + "epoch": 0.46820433687972063, + "grad_norm": 4.7597544395019495, + "learning_rate": 2.877667752012485e-06, + "loss": 0.7266, + "step": 5765 + }, + { + "epoch": 0.4682855518557622, + "grad_norm": 6.8641010734258, + "learning_rate": 2.877017663972752e-06, + "loss": 0.4941, + "step": 5766 + }, + { + "epoch": 0.4683667668318038, + "grad_norm": 5.927018173381962, + "learning_rate": 2.876367549845393e-06, + "loss": 0.5093, + "step": 5767 + }, + { + "epoch": 0.46844798180784536, + "grad_norm": 3.4289610085754383, + "learning_rate": 2.875717409675391e-06, + "loss": 0.5454, + "step": 5768 + }, + { + "epoch": 0.46852919678388694, + "grad_norm": 7.9437461607693, + "learning_rate": 2.875067243507732e-06, + "loss": 0.4647, + "step": 5769 + }, + { + "epoch": 0.4686104117599285, + "grad_norm": 3.7116341834744078, + "learning_rate": 2.8744170513874054e-06, + "loss": 0.4881, + "step": 5770 + }, + { + "epoch": 0.4686916267359701, + "grad_norm": 4.2367362028156, + "learning_rate": 2.8737668333594005e-06, + "loss": 0.4672, + "step": 5771 + }, + { + "epoch": 0.46877284171201167, + "grad_norm": 4.273237686231666, + "learning_rate": 2.873116589468708e-06, + "loss": 0.4611, + "step": 5772 + }, + { + "epoch": 0.4688540566880533, + "grad_norm": 5.096615588091853, + "learning_rate": 2.872466319760323e-06, + "loss": 0.5954, + "step": 5773 + }, + { + "epoch": 0.4689352716640949, + "grad_norm": 4.825210645748332, + "learning_rate": 2.87181602427924e-06, + "loss": 0.5718, + "step": 5774 + }, + { + "epoch": 0.46901648664013645, + "grad_norm": 5.991461149755983, + "learning_rate": 2.8711657030704553e-06, + "loss": 0.4037, + "step": 5775 + }, + { + "epoch": 0.46909770161617803, + "grad_norm": 3.4800097173343776, + "learning_rate": 2.870515356178969e-06, + "loss": 0.513, + "step": 5776 + }, + { + "epoch": 0.4691789165922196, + "grad_norm": 6.256528064000865, + "learning_rate": 2.8698649836497805e-06, + "loss": 0.6655, + "step": 5777 + }, + { + "epoch": 0.4692601315682612, + "grad_norm": 5.04158800857517, + "learning_rate": 2.869214585527893e-06, + "loss": 0.4901, + "step": 5778 + }, + { + "epoch": 0.46934134654430276, + "grad_norm": 4.617317169571152, + "learning_rate": 2.8685641618583098e-06, + "loss": 0.4887, + "step": 5779 + }, + { + "epoch": 0.46942256152034434, + "grad_norm": 3.9855464372411147, + "learning_rate": 2.8679137126860373e-06, + "loss": 0.647, + "step": 5780 + }, + { + "epoch": 0.4695037764963859, + "grad_norm": 4.5232125393874965, + "learning_rate": 2.867263238056084e-06, + "loss": 0.5514, + "step": 5781 + }, + { + "epoch": 0.4695849914724275, + "grad_norm": 6.069972056894211, + "learning_rate": 2.866612738013457e-06, + "loss": 0.6096, + "step": 5782 + }, + { + "epoch": 0.46966620644846907, + "grad_norm": 4.326041253791683, + "learning_rate": 2.8659622126031687e-06, + "loss": 0.5519, + "step": 5783 + }, + { + "epoch": 0.4697474214245107, + "grad_norm": 6.02872157760923, + "learning_rate": 2.8653116618702338e-06, + "loss": 0.5394, + "step": 5784 + }, + { + "epoch": 0.4698286364005523, + "grad_norm": 12.764955573129805, + "learning_rate": 2.8646610858596635e-06, + "loss": 0.5991, + "step": 5785 + }, + { + "epoch": 0.46990985137659386, + "grad_norm": 5.345811501835503, + "learning_rate": 2.864010484616477e-06, + "loss": 0.54, + "step": 5786 + }, + { + "epoch": 0.46999106635263543, + "grad_norm": 4.1825362641291415, + "learning_rate": 2.8633598581856915e-06, + "loss": 0.515, + "step": 5787 + }, + { + "epoch": 0.470072281328677, + "grad_norm": 4.283001560216305, + "learning_rate": 2.8627092066123263e-06, + "loss": 0.4081, + "step": 5788 + }, + { + "epoch": 0.4701534963047186, + "grad_norm": 4.308589294597862, + "learning_rate": 2.8620585299414038e-06, + "loss": 0.5021, + "step": 5789 + }, + { + "epoch": 0.47023471128076016, + "grad_norm": 8.01361742978466, + "learning_rate": 2.861407828217947e-06, + "loss": 0.4754, + "step": 5790 + }, + { + "epoch": 0.47031592625680174, + "grad_norm": 5.074919316163974, + "learning_rate": 2.8607571014869816e-06, + "loss": 0.5081, + "step": 5791 + }, + { + "epoch": 0.4703971412328433, + "grad_norm": 4.9471885543147325, + "learning_rate": 2.860106349793534e-06, + "loss": 0.6083, + "step": 5792 + }, + { + "epoch": 0.4704783562088849, + "grad_norm": 5.520036465193203, + "learning_rate": 2.859455573182632e-06, + "loss": 0.5181, + "step": 5793 + }, + { + "epoch": 0.4705595711849265, + "grad_norm": 8.857086777181047, + "learning_rate": 2.8588047716993084e-06, + "loss": 0.402, + "step": 5794 + }, + { + "epoch": 0.4706407861609681, + "grad_norm": 5.525749548650858, + "learning_rate": 2.858153945388592e-06, + "loss": 0.4228, + "step": 5795 + }, + { + "epoch": 0.4707220011370097, + "grad_norm": 4.365954557012976, + "learning_rate": 2.8575030942955185e-06, + "loss": 0.3335, + "step": 5796 + }, + { + "epoch": 0.47080321611305126, + "grad_norm": 5.357132843700224, + "learning_rate": 2.856852218465124e-06, + "loss": 0.5517, + "step": 5797 + }, + { + "epoch": 0.47088443108909284, + "grad_norm": 5.2190595114312694, + "learning_rate": 2.856201317942443e-06, + "loss": 0.4705, + "step": 5798 + }, + { + "epoch": 0.4709656460651344, + "grad_norm": 3.424227721385115, + "learning_rate": 2.8555503927725164e-06, + "loss": 0.4894, + "step": 5799 + }, + { + "epoch": 0.471046861041176, + "grad_norm": 5.0569787088343325, + "learning_rate": 2.854899443000385e-06, + "loss": 0.4604, + "step": 5800 + }, + { + "epoch": 0.47112807601721757, + "grad_norm": 8.187150772848494, + "learning_rate": 2.8542484686710896e-06, + "loss": 0.5102, + "step": 5801 + }, + { + "epoch": 0.47120929099325914, + "grad_norm": 6.207010087185806, + "learning_rate": 2.8535974698296765e-06, + "loss": 0.3528, + "step": 5802 + }, + { + "epoch": 0.4712905059693007, + "grad_norm": 4.227888017544308, + "learning_rate": 2.8529464465211886e-06, + "loss": 0.6394, + "step": 5803 + }, + { + "epoch": 0.4713717209453423, + "grad_norm": 4.665116991045132, + "learning_rate": 2.852295398790675e-06, + "loss": 0.6159, + "step": 5804 + }, + { + "epoch": 0.4714529359213839, + "grad_norm": 3.8703424664055857, + "learning_rate": 2.8516443266831837e-06, + "loss": 0.673, + "step": 5805 + }, + { + "epoch": 0.4715341508974255, + "grad_norm": 3.9676189612736388, + "learning_rate": 2.8509932302437665e-06, + "loss": 0.5117, + "step": 5806 + }, + { + "epoch": 0.4716153658734671, + "grad_norm": 4.329273160183401, + "learning_rate": 2.850342109517475e-06, + "loss": 0.3692, + "step": 5807 + }, + { + "epoch": 0.47169658084950866, + "grad_norm": 11.340360099111553, + "learning_rate": 2.8496909645493642e-06, + "loss": 0.4246, + "step": 5808 + }, + { + "epoch": 0.47177779582555024, + "grad_norm": 3.8942256837186626, + "learning_rate": 2.849039795384489e-06, + "loss": 0.5581, + "step": 5809 + }, + { + "epoch": 0.4718590108015918, + "grad_norm": 9.155900656113834, + "learning_rate": 2.8483886020679075e-06, + "loss": 0.4519, + "step": 5810 + }, + { + "epoch": 0.4719402257776334, + "grad_norm": 6.0033962365746465, + "learning_rate": 2.847737384644678e-06, + "loss": 0.5612, + "step": 5811 + }, + { + "epoch": 0.47202144075367497, + "grad_norm": 5.889887605386142, + "learning_rate": 2.8470861431598623e-06, + "loss": 0.6559, + "step": 5812 + }, + { + "epoch": 0.47210265572971655, + "grad_norm": 4.599377492190549, + "learning_rate": 2.8464348776585234e-06, + "loss": 0.635, + "step": 5813 + }, + { + "epoch": 0.4721838707057581, + "grad_norm": 10.76890029919621, + "learning_rate": 2.8457835881857227e-06, + "loss": 0.4829, + "step": 5814 + }, + { + "epoch": 0.4722650856817997, + "grad_norm": 7.220464203607489, + "learning_rate": 2.8451322747865286e-06, + "loss": 0.5789, + "step": 5815 + }, + { + "epoch": 0.47234630065784133, + "grad_norm": 4.1108394338985725, + "learning_rate": 2.844480937506008e-06, + "loss": 0.609, + "step": 5816 + }, + { + "epoch": 0.4724275156338829, + "grad_norm": 9.04222984053817, + "learning_rate": 2.843829576389229e-06, + "loss": 0.4926, + "step": 5817 + }, + { + "epoch": 0.4725087306099245, + "grad_norm": 4.528427577254921, + "learning_rate": 2.843178191481263e-06, + "loss": 0.6624, + "step": 5818 + }, + { + "epoch": 0.47258994558596606, + "grad_norm": 3.3028903657064825, + "learning_rate": 2.842526782827183e-06, + "loss": 0.5929, + "step": 5819 + }, + { + "epoch": 0.47267116056200764, + "grad_norm": 5.026903257750425, + "learning_rate": 2.841875350472062e-06, + "loss": 0.5268, + "step": 5820 + }, + { + "epoch": 0.4727523755380492, + "grad_norm": 6.5104574690823975, + "learning_rate": 2.841223894460976e-06, + "loss": 0.6567, + "step": 5821 + }, + { + "epoch": 0.4728335905140908, + "grad_norm": 4.314065979269395, + "learning_rate": 2.8405724148390023e-06, + "loss": 0.449, + "step": 5822 + }, + { + "epoch": 0.4729148054901324, + "grad_norm": 3.312866780207277, + "learning_rate": 2.8399209116512204e-06, + "loss": 0.5964, + "step": 5823 + }, + { + "epoch": 0.47299602046617395, + "grad_norm": 5.37935898304279, + "learning_rate": 2.83926938494271e-06, + "loss": 0.5702, + "step": 5824 + }, + { + "epoch": 0.4730772354422155, + "grad_norm": 4.881465697522925, + "learning_rate": 2.838617834758554e-06, + "loss": 0.4697, + "step": 5825 + }, + { + "epoch": 0.4731584504182571, + "grad_norm": 5.744126603133613, + "learning_rate": 2.8379662611438356e-06, + "loss": 0.4512, + "step": 5826 + }, + { + "epoch": 0.47323966539429874, + "grad_norm": 5.291309755595148, + "learning_rate": 2.8373146641436413e-06, + "loss": 0.5276, + "step": 5827 + }, + { + "epoch": 0.4733208803703403, + "grad_norm": 3.9537639542161407, + "learning_rate": 2.836663043803057e-06, + "loss": 0.5143, + "step": 5828 + }, + { + "epoch": 0.4734020953463819, + "grad_norm": 7.745428674706201, + "learning_rate": 2.8360114001671724e-06, + "loss": 0.4687, + "step": 5829 + }, + { + "epoch": 0.47348331032242347, + "grad_norm": 4.510709091006437, + "learning_rate": 2.835359733281077e-06, + "loss": 0.4782, + "step": 5830 + }, + { + "epoch": 0.47356452529846504, + "grad_norm": 7.03651123334896, + "learning_rate": 2.834708043189862e-06, + "loss": 0.4816, + "step": 5831 + }, + { + "epoch": 0.4736457402745066, + "grad_norm": 4.581264549358578, + "learning_rate": 2.8340563299386226e-06, + "loss": 0.3454, + "step": 5832 + }, + { + "epoch": 0.4737269552505482, + "grad_norm": 3.39063784376629, + "learning_rate": 2.833404593572453e-06, + "loss": 0.3696, + "step": 5833 + }, + { + "epoch": 0.4738081702265898, + "grad_norm": 5.734013115764664, + "learning_rate": 2.832752834136449e-06, + "loss": 0.4435, + "step": 5834 + }, + { + "epoch": 0.47388938520263135, + "grad_norm": 4.343417166849715, + "learning_rate": 2.832101051675712e-06, + "loss": 0.6581, + "step": 5835 + }, + { + "epoch": 0.47397060017867293, + "grad_norm": 3.661535617789465, + "learning_rate": 2.8314492462353386e-06, + "loss": 0.5384, + "step": 5836 + }, + { + "epoch": 0.4740518151547145, + "grad_norm": 4.785958660433438, + "learning_rate": 2.8307974178604312e-06, + "loss": 0.5199, + "step": 5837 + }, + { + "epoch": 0.47413303013075614, + "grad_norm": 7.777693529299737, + "learning_rate": 2.830145566596094e-06, + "loss": 0.5311, + "step": 5838 + }, + { + "epoch": 0.4742142451067977, + "grad_norm": 6.275994532015221, + "learning_rate": 2.8294936924874304e-06, + "loss": 0.6261, + "step": 5839 + }, + { + "epoch": 0.4742954600828393, + "grad_norm": 5.198840400244198, + "learning_rate": 2.8288417955795476e-06, + "loss": 0.5628, + "step": 5840 + }, + { + "epoch": 0.47437667505888087, + "grad_norm": 3.7789894977053615, + "learning_rate": 2.828189875917553e-06, + "loss": 0.6039, + "step": 5841 + }, + { + "epoch": 0.47445789003492245, + "grad_norm": 4.356995519309679, + "learning_rate": 2.827537933546555e-06, + "loss": 0.6307, + "step": 5842 + }, + { + "epoch": 0.474539105010964, + "grad_norm": 4.052004440671419, + "learning_rate": 2.8268859685116663e-06, + "loss": 0.5372, + "step": 5843 + }, + { + "epoch": 0.4746203199870056, + "grad_norm": 3.622092711482535, + "learning_rate": 2.826233980857998e-06, + "loss": 0.534, + "step": 5844 + }, + { + "epoch": 0.4747015349630472, + "grad_norm": 6.265338779708121, + "learning_rate": 2.8255819706306653e-06, + "loss": 0.4377, + "step": 5845 + }, + { + "epoch": 0.47478274993908876, + "grad_norm": 5.902804627865366, + "learning_rate": 2.8249299378747833e-06, + "loss": 0.3812, + "step": 5846 + }, + { + "epoch": 0.47486396491513033, + "grad_norm": 7.988753927468825, + "learning_rate": 2.824277882635469e-06, + "loss": 0.5673, + "step": 5847 + }, + { + "epoch": 0.4749451798911719, + "grad_norm": 10.87808274886899, + "learning_rate": 2.8236258049578418e-06, + "loss": 0.5389, + "step": 5848 + }, + { + "epoch": 0.47502639486721354, + "grad_norm": 5.350053974435865, + "learning_rate": 2.8229737048870216e-06, + "loss": 0.4301, + "step": 5849 + }, + { + "epoch": 0.4751076098432551, + "grad_norm": 5.918757588156862, + "learning_rate": 2.8223215824681295e-06, + "loss": 0.5695, + "step": 5850 + }, + { + "epoch": 0.4751888248192967, + "grad_norm": 12.49565829789524, + "learning_rate": 2.821669437746291e-06, + "loss": 0.6008, + "step": 5851 + }, + { + "epoch": 0.4752700397953383, + "grad_norm": 7.165842116453641, + "learning_rate": 2.8210172707666296e-06, + "loss": 0.5504, + "step": 5852 + }, + { + "epoch": 0.47535125477137985, + "grad_norm": 5.786887937045634, + "learning_rate": 2.820365081574271e-06, + "loss": 0.5192, + "step": 5853 + }, + { + "epoch": 0.4754324697474214, + "grad_norm": 6.157943347201467, + "learning_rate": 2.819712870214345e-06, + "loss": 0.5769, + "step": 5854 + }, + { + "epoch": 0.475513684723463, + "grad_norm": 15.746368000393133, + "learning_rate": 2.8190606367319806e-06, + "loss": 0.4668, + "step": 5855 + }, + { + "epoch": 0.4755948996995046, + "grad_norm": 7.868624147224612, + "learning_rate": 2.8184083811723083e-06, + "loss": 0.4084, + "step": 5856 + }, + { + "epoch": 0.47567611467554616, + "grad_norm": 16.215616004236644, + "learning_rate": 2.817756103580461e-06, + "loss": 0.588, + "step": 5857 + }, + { + "epoch": 0.47575732965158773, + "grad_norm": 4.747406090572309, + "learning_rate": 2.8171038040015737e-06, + "loss": 0.5907, + "step": 5858 + }, + { + "epoch": 0.4758385446276293, + "grad_norm": 4.9449474953603305, + "learning_rate": 2.8164514824807814e-06, + "loss": 0.5015, + "step": 5859 + }, + { + "epoch": 0.47591975960367094, + "grad_norm": 13.074384603626209, + "learning_rate": 2.8157991390632206e-06, + "loss": 0.6872, + "step": 5860 + }, + { + "epoch": 0.4760009745797125, + "grad_norm": 6.893964042250645, + "learning_rate": 2.8151467737940312e-06, + "loss": 0.4646, + "step": 5861 + }, + { + "epoch": 0.4760821895557541, + "grad_norm": 11.048526461851749, + "learning_rate": 2.8144943867183535e-06, + "loss": 0.6181, + "step": 5862 + }, + { + "epoch": 0.4761634045317957, + "grad_norm": 5.148676616822644, + "learning_rate": 2.8138419778813274e-06, + "loss": 0.5295, + "step": 5863 + }, + { + "epoch": 0.47624461950783725, + "grad_norm": 7.31376261027769, + "learning_rate": 2.8131895473280985e-06, + "loss": 0.6112, + "step": 5864 + }, + { + "epoch": 0.47632583448387883, + "grad_norm": 8.847836919115894, + "learning_rate": 2.81253709510381e-06, + "loss": 0.5427, + "step": 5865 + }, + { + "epoch": 0.4764070494599204, + "grad_norm": 3.549876129014136, + "learning_rate": 2.811884621253608e-06, + "loss": 0.5135, + "step": 5866 + }, + { + "epoch": 0.476488264435962, + "grad_norm": 4.676010284588651, + "learning_rate": 2.811232125822642e-06, + "loss": 0.6499, + "step": 5867 + }, + { + "epoch": 0.47656947941200356, + "grad_norm": 4.475545565711852, + "learning_rate": 2.81057960885606e-06, + "loss": 0.4845, + "step": 5868 + }, + { + "epoch": 0.47665069438804514, + "grad_norm": 5.150829015821627, + "learning_rate": 2.8099270703990124e-06, + "loss": 0.4768, + "step": 5869 + }, + { + "epoch": 0.4767319093640867, + "grad_norm": 6.981268549767182, + "learning_rate": 2.8092745104966514e-06, + "loss": 0.5269, + "step": 5870 + }, + { + "epoch": 0.47681312434012835, + "grad_norm": 4.317381690472616, + "learning_rate": 2.8086219291941314e-06, + "loss": 0.5675, + "step": 5871 + }, + { + "epoch": 0.4768943393161699, + "grad_norm": 6.190644029515262, + "learning_rate": 2.807969326536607e-06, + "loss": 0.4985, + "step": 5872 + }, + { + "epoch": 0.4769755542922115, + "grad_norm": 3.4066085952901073, + "learning_rate": 2.8073167025692354e-06, + "loss": 0.5183, + "step": 5873 + }, + { + "epoch": 0.4770567692682531, + "grad_norm": 5.692886429945919, + "learning_rate": 2.8066640573371747e-06, + "loss": 0.5322, + "step": 5874 + }, + { + "epoch": 0.47713798424429466, + "grad_norm": 7.642876548549243, + "learning_rate": 2.8060113908855847e-06, + "loss": 0.6323, + "step": 5875 + }, + { + "epoch": 0.47721919922033623, + "grad_norm": 17.72297787277452, + "learning_rate": 2.805358703259624e-06, + "loss": 0.5077, + "step": 5876 + }, + { + "epoch": 0.4773004141963778, + "grad_norm": 5.581364142761338, + "learning_rate": 2.8047059945044585e-06, + "loss": 0.4381, + "step": 5877 + }, + { + "epoch": 0.4773816291724194, + "grad_norm": 9.67091630066501, + "learning_rate": 2.8040532646652515e-06, + "loss": 0.4763, + "step": 5878 + }, + { + "epoch": 0.47746284414846096, + "grad_norm": 4.1891971217435575, + "learning_rate": 2.803400513787166e-06, + "loss": 0.5316, + "step": 5879 + }, + { + "epoch": 0.47754405912450254, + "grad_norm": 6.17817844727735, + "learning_rate": 2.802747741915372e-06, + "loss": 0.5111, + "step": 5880 + }, + { + "epoch": 0.4776252741005441, + "grad_norm": 4.9847568946301015, + "learning_rate": 2.8020949490950367e-06, + "loss": 0.5109, + "step": 5881 + }, + { + "epoch": 0.47770648907658575, + "grad_norm": 4.473462329646816, + "learning_rate": 2.801442135371329e-06, + "loss": 0.6185, + "step": 5882 + }, + { + "epoch": 0.4777877040526273, + "grad_norm": 6.199201466381722, + "learning_rate": 2.800789300789421e-06, + "loss": 0.4709, + "step": 5883 + }, + { + "epoch": 0.4778689190286689, + "grad_norm": 3.4927971670876157, + "learning_rate": 2.8001364453944853e-06, + "loss": 0.658, + "step": 5884 + }, + { + "epoch": 0.4779501340047105, + "grad_norm": 9.252290801408948, + "learning_rate": 2.799483569231696e-06, + "loss": 0.3258, + "step": 5885 + }, + { + "epoch": 0.47803134898075206, + "grad_norm": 5.159111959600434, + "learning_rate": 2.798830672346229e-06, + "loss": 0.7327, + "step": 5886 + }, + { + "epoch": 0.47811256395679363, + "grad_norm": 4.997715592914455, + "learning_rate": 2.7981777547832604e-06, + "loss": 0.5373, + "step": 5887 + }, + { + "epoch": 0.4781937789328352, + "grad_norm": 7.611885925164203, + "learning_rate": 2.7975248165879697e-06, + "loss": 0.4249, + "step": 5888 + }, + { + "epoch": 0.4782749939088768, + "grad_norm": 8.402898696861591, + "learning_rate": 2.7968718578055365e-06, + "loss": 0.5686, + "step": 5889 + }, + { + "epoch": 0.47835620888491837, + "grad_norm": 4.514667421391678, + "learning_rate": 2.796218878481142e-06, + "loss": 0.5125, + "step": 5890 + }, + { + "epoch": 0.47843742386095994, + "grad_norm": 6.200592103587213, + "learning_rate": 2.7955658786599688e-06, + "loss": 0.4591, + "step": 5891 + }, + { + "epoch": 0.4785186388370015, + "grad_norm": 3.9141827656240262, + "learning_rate": 2.7949128583872e-06, + "loss": 0.6035, + "step": 5892 + }, + { + "epoch": 0.47859985381304315, + "grad_norm": 2.881200756727726, + "learning_rate": 2.7942598177080233e-06, + "loss": 0.5105, + "step": 5893 + }, + { + "epoch": 0.47868106878908473, + "grad_norm": 4.876865531343913, + "learning_rate": 2.7936067566676244e-06, + "loss": 0.4788, + "step": 5894 + }, + { + "epoch": 0.4787622837651263, + "grad_norm": 4.129321830180821, + "learning_rate": 2.792953675311192e-06, + "loss": 0.4933, + "step": 5895 + }, + { + "epoch": 0.4788434987411679, + "grad_norm": 4.975168884090526, + "learning_rate": 2.792300573683915e-06, + "loss": 0.6777, + "step": 5896 + }, + { + "epoch": 0.47892471371720946, + "grad_norm": 4.373706953851931, + "learning_rate": 2.7916474518309854e-06, + "loss": 0.6101, + "step": 5897 + }, + { + "epoch": 0.47900592869325104, + "grad_norm": 6.787308432476194, + "learning_rate": 2.790994309797596e-06, + "loss": 0.5524, + "step": 5898 + }, + { + "epoch": 0.4790871436692926, + "grad_norm": 3.130380915868563, + "learning_rate": 2.79034114762894e-06, + "loss": 0.5455, + "step": 5899 + }, + { + "epoch": 0.4791683586453342, + "grad_norm": 5.155056558227334, + "learning_rate": 2.789687965370214e-06, + "loss": 0.405, + "step": 5900 + }, + { + "epoch": 0.47924957362137577, + "grad_norm": 5.772687941455542, + "learning_rate": 2.7890347630666135e-06, + "loss": 0.498, + "step": 5901 + }, + { + "epoch": 0.47933078859741735, + "grad_norm": 4.464910286196713, + "learning_rate": 2.788381540763337e-06, + "loss": 0.4895, + "step": 5902 + }, + { + "epoch": 0.4794120035734589, + "grad_norm": 6.222904764201984, + "learning_rate": 2.787728298505584e-06, + "loss": 0.456, + "step": 5903 + }, + { + "epoch": 0.47949321854950055, + "grad_norm": 30.493510617091204, + "learning_rate": 2.787075036338556e-06, + "loss": 0.4607, + "step": 5904 + }, + { + "epoch": 0.47957443352554213, + "grad_norm": 12.321251377438697, + "learning_rate": 2.7864217543074544e-06, + "loss": 0.545, + "step": 5905 + }, + { + "epoch": 0.4796556485015837, + "grad_norm": 5.80807114653115, + "learning_rate": 2.7857684524574833e-06, + "loss": 0.6029, + "step": 5906 + }, + { + "epoch": 0.4797368634776253, + "grad_norm": 7.259661847105047, + "learning_rate": 2.7851151308338483e-06, + "loss": 0.5074, + "step": 5907 + }, + { + "epoch": 0.47981807845366686, + "grad_norm": 5.490831475001269, + "learning_rate": 2.784461789481754e-06, + "loss": 0.4931, + "step": 5908 + }, + { + "epoch": 0.47989929342970844, + "grad_norm": 10.451899945538207, + "learning_rate": 2.7838084284464105e-06, + "loss": 0.3872, + "step": 5909 + }, + { + "epoch": 0.47998050840575, + "grad_norm": 7.293146517834257, + "learning_rate": 2.7831550477730255e-06, + "loss": 0.6436, + "step": 5910 + }, + { + "epoch": 0.4800617233817916, + "grad_norm": 6.817978018689368, + "learning_rate": 2.78250164750681e-06, + "loss": 0.4786, + "step": 5911 + }, + { + "epoch": 0.48014293835783317, + "grad_norm": 5.17758792887419, + "learning_rate": 2.781848227692974e-06, + "loss": 0.316, + "step": 5912 + }, + { + "epoch": 0.48022415333387475, + "grad_norm": 9.128124210915745, + "learning_rate": 2.7811947883767343e-06, + "loss": 0.4606, + "step": 5913 + }, + { + "epoch": 0.4803053683099163, + "grad_norm": 6.296126142033211, + "learning_rate": 2.780541329603303e-06, + "loss": 0.4745, + "step": 5914 + }, + { + "epoch": 0.48038658328595796, + "grad_norm": 6.296563355774902, + "learning_rate": 2.7798878514178955e-06, + "loss": 0.5816, + "step": 5915 + }, + { + "epoch": 0.48046779826199953, + "grad_norm": 4.08831948588252, + "learning_rate": 2.779234353865731e-06, + "loss": 0.4533, + "step": 5916 + }, + { + "epoch": 0.4805490132380411, + "grad_norm": 6.198220168009877, + "learning_rate": 2.7785808369920263e-06, + "loss": 0.52, + "step": 5917 + }, + { + "epoch": 0.4806302282140827, + "grad_norm": 7.348946572016875, + "learning_rate": 2.777927300842003e-06, + "loss": 0.4988, + "step": 5918 + }, + { + "epoch": 0.48071144319012427, + "grad_norm": 7.4243657712578, + "learning_rate": 2.7772737454608804e-06, + "loss": 0.5427, + "step": 5919 + }, + { + "epoch": 0.48079265816616584, + "grad_norm": 6.240520113282074, + "learning_rate": 2.7766201708938823e-06, + "loss": 0.6156, + "step": 5920 + }, + { + "epoch": 0.4808738731422074, + "grad_norm": 4.092737052613775, + "learning_rate": 2.7759665771862324e-06, + "loss": 0.4755, + "step": 5921 + }, + { + "epoch": 0.480955088118249, + "grad_norm": 4.543429446499594, + "learning_rate": 2.775312964383156e-06, + "loss": 0.4982, + "step": 5922 + }, + { + "epoch": 0.4810363030942906, + "grad_norm": 4.139943817706591, + "learning_rate": 2.77465933252988e-06, + "loss": 0.6698, + "step": 5923 + }, + { + "epoch": 0.48111751807033215, + "grad_norm": 4.420260702078683, + "learning_rate": 2.7740056816716317e-06, + "loss": 0.4612, + "step": 5924 + }, + { + "epoch": 0.48119873304637373, + "grad_norm": 8.399576878392056, + "learning_rate": 2.7733520118536395e-06, + "loss": 0.4372, + "step": 5925 + }, + { + "epoch": 0.48127994802241536, + "grad_norm": 8.529789641771623, + "learning_rate": 2.772698323121135e-06, + "loss": 0.4788, + "step": 5926 + }, + { + "epoch": 0.48136116299845694, + "grad_norm": 6.033494247535142, + "learning_rate": 2.7720446155193503e-06, + "loss": 0.8274, + "step": 5927 + }, + { + "epoch": 0.4814423779744985, + "grad_norm": 3.680362638542244, + "learning_rate": 2.7713908890935177e-06, + "loss": 0.5214, + "step": 5928 + }, + { + "epoch": 0.4815235929505401, + "grad_norm": 7.379140973686685, + "learning_rate": 2.770737143888872e-06, + "loss": 0.7151, + "step": 5929 + }, + { + "epoch": 0.48160480792658167, + "grad_norm": 3.6816364647280584, + "learning_rate": 2.7700833799506487e-06, + "loss": 0.553, + "step": 5930 + }, + { + "epoch": 0.48168602290262325, + "grad_norm": 5.987951257678209, + "learning_rate": 2.7694295973240848e-06, + "loss": 0.4937, + "step": 5931 + }, + { + "epoch": 0.4817672378786648, + "grad_norm": 4.956448565368726, + "learning_rate": 2.7687757960544193e-06, + "loss": 0.4982, + "step": 5932 + }, + { + "epoch": 0.4818484528547064, + "grad_norm": 4.49940857003615, + "learning_rate": 2.7681219761868905e-06, + "loss": 0.6454, + "step": 5933 + }, + { + "epoch": 0.481929667830748, + "grad_norm": 4.009481443453415, + "learning_rate": 2.7674681377667403e-06, + "loss": 0.5949, + "step": 5934 + }, + { + "epoch": 0.48201088280678955, + "grad_norm": 6.276103431385119, + "learning_rate": 2.7668142808392102e-06, + "loss": 0.6751, + "step": 5935 + }, + { + "epoch": 0.48209209778283113, + "grad_norm": 5.264109990138893, + "learning_rate": 2.7661604054495447e-06, + "loss": 0.5605, + "step": 5936 + }, + { + "epoch": 0.48217331275887276, + "grad_norm": 6.623257155307936, + "learning_rate": 2.765506511642987e-06, + "loss": 0.5536, + "step": 5937 + }, + { + "epoch": 0.48225452773491434, + "grad_norm": 8.180552462619701, + "learning_rate": 2.764852599464784e-06, + "loss": 0.5155, + "step": 5938 + }, + { + "epoch": 0.4823357427109559, + "grad_norm": 3.6982416953033996, + "learning_rate": 2.764198668960183e-06, + "loss": 0.4975, + "step": 5939 + }, + { + "epoch": 0.4824169576869975, + "grad_norm": 4.58146111767092, + "learning_rate": 2.7635447201744324e-06, + "loss": 0.6719, + "step": 5940 + }, + { + "epoch": 0.48249817266303907, + "grad_norm": 4.792428868707111, + "learning_rate": 2.7628907531527815e-06, + "loss": 0.5106, + "step": 5941 + }, + { + "epoch": 0.48257938763908065, + "grad_norm": 5.140027653654479, + "learning_rate": 2.762236767940482e-06, + "loss": 0.3998, + "step": 5942 + }, + { + "epoch": 0.4826606026151222, + "grad_norm": 4.42487569925798, + "learning_rate": 2.761582764582787e-06, + "loss": 0.4726, + "step": 5943 + }, + { + "epoch": 0.4827418175911638, + "grad_norm": 4.997518890904461, + "learning_rate": 2.760928743124948e-06, + "loss": 0.5601, + "step": 5944 + }, + { + "epoch": 0.4828230325672054, + "grad_norm": 3.799079838934848, + "learning_rate": 2.7602747036122213e-06, + "loss": 0.6182, + "step": 5945 + }, + { + "epoch": 0.48290424754324696, + "grad_norm": 10.353271040341017, + "learning_rate": 2.759620646089863e-06, + "loss": 0.476, + "step": 5946 + }, + { + "epoch": 0.48298546251928853, + "grad_norm": 5.666756315530278, + "learning_rate": 2.758966570603129e-06, + "loss": 0.5052, + "step": 5947 + }, + { + "epoch": 0.48306667749533017, + "grad_norm": 6.165341654780337, + "learning_rate": 2.7583124771972797e-06, + "loss": 0.4994, + "step": 5948 + }, + { + "epoch": 0.48314789247137174, + "grad_norm": 4.569577065572749, + "learning_rate": 2.7576583659175738e-06, + "loss": 0.439, + "step": 5949 + }, + { + "epoch": 0.4832291074474133, + "grad_norm": 3.098941821564416, + "learning_rate": 2.7570042368092724e-06, + "loss": 0.5527, + "step": 5950 + }, + { + "epoch": 0.4833103224234549, + "grad_norm": 5.497608597420854, + "learning_rate": 2.7563500899176383e-06, + "loss": 0.4795, + "step": 5951 + }, + { + "epoch": 0.4833915373994965, + "grad_norm": 5.7165228874758505, + "learning_rate": 2.7556959252879345e-06, + "loss": 0.4909, + "step": 5952 + }, + { + "epoch": 0.48347275237553805, + "grad_norm": 5.874962862560617, + "learning_rate": 2.755041742965426e-06, + "loss": 0.4677, + "step": 5953 + }, + { + "epoch": 0.4835539673515796, + "grad_norm": 4.095451834207825, + "learning_rate": 2.7543875429953787e-06, + "loss": 0.6461, + "step": 5954 + }, + { + "epoch": 0.4836351823276212, + "grad_norm": 4.473317225148218, + "learning_rate": 2.7537333254230596e-06, + "loss": 0.4963, + "step": 5955 + }, + { + "epoch": 0.4837163973036628, + "grad_norm": 5.773342053411397, + "learning_rate": 2.7530790902937376e-06, + "loss": 0.4534, + "step": 5956 + }, + { + "epoch": 0.48379761227970436, + "grad_norm": 3.510346924849262, + "learning_rate": 2.752424837652681e-06, + "loss": 0.6875, + "step": 5957 + }, + { + "epoch": 0.48387882725574594, + "grad_norm": 7.132250653049774, + "learning_rate": 2.751770567545163e-06, + "loss": 0.5412, + "step": 5958 + }, + { + "epoch": 0.48396004223178757, + "grad_norm": 5.085327115862414, + "learning_rate": 2.7511162800164536e-06, + "loss": 0.7837, + "step": 5959 + }, + { + "epoch": 0.48404125720782915, + "grad_norm": 7.899170324549878, + "learning_rate": 2.7504619751118266e-06, + "loss": 0.5815, + "step": 5960 + }, + { + "epoch": 0.4841224721838707, + "grad_norm": 12.534688873820935, + "learning_rate": 2.749807652876556e-06, + "loss": 0.5867, + "step": 5961 + }, + { + "epoch": 0.4842036871599123, + "grad_norm": 5.888585867450327, + "learning_rate": 2.749153313355919e-06, + "loss": 0.4844, + "step": 5962 + }, + { + "epoch": 0.4842849021359539, + "grad_norm": 4.658146495411607, + "learning_rate": 2.74849895659519e-06, + "loss": 0.4221, + "step": 5963 + }, + { + "epoch": 0.48436611711199545, + "grad_norm": 8.00901759996569, + "learning_rate": 2.7478445826396495e-06, + "loss": 0.3995, + "step": 5964 + }, + { + "epoch": 0.48444733208803703, + "grad_norm": 4.722557156452973, + "learning_rate": 2.747190191534575e-06, + "loss": 0.5922, + "step": 5965 + }, + { + "epoch": 0.4845285470640786, + "grad_norm": 4.907315907025934, + "learning_rate": 2.7465357833252477e-06, + "loss": 0.483, + "step": 5966 + }, + { + "epoch": 0.4846097620401202, + "grad_norm": 3.780776893956639, + "learning_rate": 2.7458813580569487e-06, + "loss": 0.5136, + "step": 5967 + }, + { + "epoch": 0.48469097701616176, + "grad_norm": 4.759267167903013, + "learning_rate": 2.7452269157749614e-06, + "loss": 0.6048, + "step": 5968 + }, + { + "epoch": 0.48477219199220334, + "grad_norm": 5.977936036351052, + "learning_rate": 2.744572456524569e-06, + "loss": 0.5071, + "step": 5969 + }, + { + "epoch": 0.48485340696824497, + "grad_norm": 5.801052897330955, + "learning_rate": 2.7439179803510567e-06, + "loss": 0.3259, + "step": 5970 + }, + { + "epoch": 0.48493462194428655, + "grad_norm": 5.504537267230917, + "learning_rate": 2.7432634872997123e-06, + "loss": 0.454, + "step": 5971 + }, + { + "epoch": 0.4850158369203281, + "grad_norm": 5.980973589761268, + "learning_rate": 2.7426089774158217e-06, + "loss": 0.5151, + "step": 5972 + }, + { + "epoch": 0.4850970518963697, + "grad_norm": 6.090787296944255, + "learning_rate": 2.7419544507446727e-06, + "loss": 0.5434, + "step": 5973 + }, + { + "epoch": 0.4851782668724113, + "grad_norm": 5.184785779288021, + "learning_rate": 2.7412999073315567e-06, + "loss": 0.6548, + "step": 5974 + }, + { + "epoch": 0.48525948184845286, + "grad_norm": 3.168315464569746, + "learning_rate": 2.7406453472217654e-06, + "loss": 0.5838, + "step": 5975 + }, + { + "epoch": 0.48534069682449443, + "grad_norm": 5.7401424667695204, + "learning_rate": 2.7399907704605884e-06, + "loss": 0.4268, + "step": 5976 + }, + { + "epoch": 0.485421911800536, + "grad_norm": 8.886061000991853, + "learning_rate": 2.7393361770933198e-06, + "loss": 0.4986, + "step": 5977 + }, + { + "epoch": 0.4855031267765776, + "grad_norm": 4.804673355461681, + "learning_rate": 2.7386815671652556e-06, + "loss": 0.6466, + "step": 5978 + }, + { + "epoch": 0.48558434175261916, + "grad_norm": 5.321396681856843, + "learning_rate": 2.7380269407216896e-06, + "loss": 0.4684, + "step": 5979 + }, + { + "epoch": 0.48566555672866074, + "grad_norm": 5.686891915170021, + "learning_rate": 2.737372297807919e-06, + "loss": 0.6947, + "step": 5980 + }, + { + "epoch": 0.4857467717047024, + "grad_norm": 6.348245493300976, + "learning_rate": 2.7367176384692425e-06, + "loss": 0.4924, + "step": 5981 + }, + { + "epoch": 0.48582798668074395, + "grad_norm": 6.0903339297311545, + "learning_rate": 2.736062962750957e-06, + "loss": 0.5165, + "step": 5982 + }, + { + "epoch": 0.4859092016567855, + "grad_norm": 11.757378358765155, + "learning_rate": 2.735408270698364e-06, + "loss": 0.4462, + "step": 5983 + }, + { + "epoch": 0.4859904166328271, + "grad_norm": 16.683533350645916, + "learning_rate": 2.7347535623567656e-06, + "loss": 0.6067, + "step": 5984 + }, + { + "epoch": 0.4860716316088687, + "grad_norm": 7.944296108437706, + "learning_rate": 2.734098837771462e-06, + "loss": 0.5032, + "step": 5985 + }, + { + "epoch": 0.48615284658491026, + "grad_norm": 5.943425981392983, + "learning_rate": 2.7334440969877584e-06, + "loss": 0.5087, + "step": 5986 + }, + { + "epoch": 0.48623406156095184, + "grad_norm": 3.3987635620526686, + "learning_rate": 2.7327893400509586e-06, + "loss": 0.5818, + "step": 5987 + }, + { + "epoch": 0.4863152765369934, + "grad_norm": 6.620696249187532, + "learning_rate": 2.732134567006368e-06, + "loss": 0.4364, + "step": 5988 + }, + { + "epoch": 0.486396491513035, + "grad_norm": 5.572056490122121, + "learning_rate": 2.731479777899295e-06, + "loss": 0.4405, + "step": 5989 + }, + { + "epoch": 0.48647770648907657, + "grad_norm": 6.354284008710806, + "learning_rate": 2.730824972775045e-06, + "loss": 0.6067, + "step": 5990 + }, + { + "epoch": 0.48655892146511814, + "grad_norm": 5.802616735842974, + "learning_rate": 2.7301701516789303e-06, + "loss": 0.3966, + "step": 5991 + }, + { + "epoch": 0.4866401364411598, + "grad_norm": 4.459624839521087, + "learning_rate": 2.729515314656258e-06, + "loss": 0.5996, + "step": 5992 + }, + { + "epoch": 0.48672135141720135, + "grad_norm": 7.157939161361248, + "learning_rate": 2.7288604617523405e-06, + "loss": 0.5517, + "step": 5993 + }, + { + "epoch": 0.48680256639324293, + "grad_norm": 5.292457922278461, + "learning_rate": 2.728205593012491e-06, + "loss": 0.4596, + "step": 5994 + }, + { + "epoch": 0.4868837813692845, + "grad_norm": 5.048505829129327, + "learning_rate": 2.7275507084820226e-06, + "loss": 0.6614, + "step": 5995 + }, + { + "epoch": 0.4869649963453261, + "grad_norm": 5.999484690389678, + "learning_rate": 2.726895808206248e-06, + "loss": 0.4645, + "step": 5996 + }, + { + "epoch": 0.48704621132136766, + "grad_norm": 4.707719844104531, + "learning_rate": 2.7262408922304857e-06, + "loss": 0.5658, + "step": 5997 + }, + { + "epoch": 0.48712742629740924, + "grad_norm": 9.302118644783453, + "learning_rate": 2.72558596060005e-06, + "loss": 0.4844, + "step": 5998 + }, + { + "epoch": 0.4872086412734508, + "grad_norm": 6.41310409364449, + "learning_rate": 2.72493101336026e-06, + "loss": 0.4755, + "step": 5999 + }, + { + "epoch": 0.4872898562494924, + "grad_norm": 7.372592645487124, + "learning_rate": 2.7242760505564346e-06, + "loss": 0.4443, + "step": 6000 + }, + { + "epoch": 0.48737107122553397, + "grad_norm": 5.478164117803782, + "learning_rate": 2.7236210722338936e-06, + "loss": 0.6266, + "step": 6001 + }, + { + "epoch": 0.48745228620157555, + "grad_norm": 4.451621100299445, + "learning_rate": 2.7229660784379575e-06, + "loss": 0.6028, + "step": 6002 + }, + { + "epoch": 0.4875335011776172, + "grad_norm": 4.829301062695959, + "learning_rate": 2.7223110692139487e-06, + "loss": 0.3843, + "step": 6003 + }, + { + "epoch": 0.48761471615365876, + "grad_norm": 8.161182349000478, + "learning_rate": 2.7216560446071904e-06, + "loss": 0.5373, + "step": 6004 + }, + { + "epoch": 0.48769593112970033, + "grad_norm": 5.075301630204499, + "learning_rate": 2.721001004663008e-06, + "loss": 0.5209, + "step": 6005 + }, + { + "epoch": 0.4877771461057419, + "grad_norm": 3.5392282378757094, + "learning_rate": 2.7203459494267243e-06, + "loss": 0.4714, + "step": 6006 + }, + { + "epoch": 0.4878583610817835, + "grad_norm": 3.3493451655565147, + "learning_rate": 2.719690878943668e-06, + "loss": 0.6381, + "step": 6007 + }, + { + "epoch": 0.48793957605782506, + "grad_norm": 11.279877740153177, + "learning_rate": 2.7190357932591653e-06, + "loss": 0.5869, + "step": 6008 + }, + { + "epoch": 0.48802079103386664, + "grad_norm": 15.655656745532893, + "learning_rate": 2.7183806924185447e-06, + "loss": 0.4589, + "step": 6009 + }, + { + "epoch": 0.4881020060099082, + "grad_norm": 7.534260268163045, + "learning_rate": 2.717725576467136e-06, + "loss": 0.5564, + "step": 6010 + }, + { + "epoch": 0.4881832209859498, + "grad_norm": 8.16602240519616, + "learning_rate": 2.71707044545027e-06, + "loss": 0.4806, + "step": 6011 + }, + { + "epoch": 0.48826443596199137, + "grad_norm": 5.065942666535822, + "learning_rate": 2.716415299413278e-06, + "loss": 0.4948, + "step": 6012 + }, + { + "epoch": 0.48834565093803295, + "grad_norm": 6.1717583953297455, + "learning_rate": 2.7157601384014927e-06, + "loss": 0.5663, + "step": 6013 + }, + { + "epoch": 0.4884268659140746, + "grad_norm": 5.9450854471553125, + "learning_rate": 2.7151049624602473e-06, + "loss": 0.6468, + "step": 6014 + }, + { + "epoch": 0.48850808089011616, + "grad_norm": 6.376587879616875, + "learning_rate": 2.714449771634877e-06, + "loss": 0.4685, + "step": 6015 + }, + { + "epoch": 0.48858929586615774, + "grad_norm": 4.341090257406891, + "learning_rate": 2.713794565970718e-06, + "loss": 0.438, + "step": 6016 + }, + { + "epoch": 0.4886705108421993, + "grad_norm": 4.016157690853844, + "learning_rate": 2.7131393455131057e-06, + "loss": 0.6089, + "step": 6017 + }, + { + "epoch": 0.4887517258182409, + "grad_norm": 2.638895965006909, + "learning_rate": 2.7124841103073794e-06, + "loss": 0.3652, + "step": 6018 + }, + { + "epoch": 0.48883294079428247, + "grad_norm": 9.501098441788494, + "learning_rate": 2.711828860398877e-06, + "loss": 0.5685, + "step": 6019 + }, + { + "epoch": 0.48891415577032404, + "grad_norm": 6.054200225549085, + "learning_rate": 2.7111735958329383e-06, + "loss": 0.3793, + "step": 6020 + }, + { + "epoch": 0.4889953707463656, + "grad_norm": 5.616508580837288, + "learning_rate": 2.7105183166549048e-06, + "loss": 0.547, + "step": 6021 + }, + { + "epoch": 0.4890765857224072, + "grad_norm": 6.361440708041633, + "learning_rate": 2.7098630229101174e-06, + "loss": 0.5794, + "step": 6022 + }, + { + "epoch": 0.4891578006984488, + "grad_norm": 9.543869316221988, + "learning_rate": 2.70920771464392e-06, + "loss": 0.6857, + "step": 6023 + }, + { + "epoch": 0.48923901567449035, + "grad_norm": 4.931811725830596, + "learning_rate": 2.708552391901656e-06, + "loss": 0.6798, + "step": 6024 + }, + { + "epoch": 0.489320230650532, + "grad_norm": 7.691562606616547, + "learning_rate": 2.70789705472867e-06, + "loss": 0.6456, + "step": 6025 + }, + { + "epoch": 0.48940144562657356, + "grad_norm": 3.9609368324412286, + "learning_rate": 2.707241703170308e-06, + "loss": 0.4363, + "step": 6026 + }, + { + "epoch": 0.48948266060261514, + "grad_norm": 5.514465718417427, + "learning_rate": 2.706586337271917e-06, + "loss": 0.4752, + "step": 6027 + }, + { + "epoch": 0.4895638755786567, + "grad_norm": 4.986568690370059, + "learning_rate": 2.705930957078845e-06, + "loss": 0.5089, + "step": 6028 + }, + { + "epoch": 0.4896450905546983, + "grad_norm": 4.59026441450655, + "learning_rate": 2.705275562636441e-06, + "loss": 0.5348, + "step": 6029 + }, + { + "epoch": 0.48972630553073987, + "grad_norm": 5.882838457335099, + "learning_rate": 2.7046201539900537e-06, + "loss": 0.5789, + "step": 6030 + }, + { + "epoch": 0.48980752050678145, + "grad_norm": 6.273818536620701, + "learning_rate": 2.7039647311850347e-06, + "loss": 0.5857, + "step": 6031 + }, + { + "epoch": 0.489888735482823, + "grad_norm": 4.353727277852687, + "learning_rate": 2.7033092942667362e-06, + "loss": 0.4136, + "step": 6032 + }, + { + "epoch": 0.4899699504588646, + "grad_norm": 11.49874908610111, + "learning_rate": 2.70265384328051e-06, + "loss": 0.4196, + "step": 6033 + }, + { + "epoch": 0.4900511654349062, + "grad_norm": 8.579866589240709, + "learning_rate": 2.701998378271711e-06, + "loss": 0.4894, + "step": 6034 + }, + { + "epoch": 0.49013238041094775, + "grad_norm": 25.037657431765965, + "learning_rate": 2.7013428992856925e-06, + "loss": 0.5139, + "step": 6035 + }, + { + "epoch": 0.4902135953869894, + "grad_norm": 11.228244377773777, + "learning_rate": 2.700687406367812e-06, + "loss": 0.6387, + "step": 6036 + }, + { + "epoch": 0.49029481036303096, + "grad_norm": 4.944749953225672, + "learning_rate": 2.700031899563425e-06, + "loss": 0.5617, + "step": 6037 + }, + { + "epoch": 0.49037602533907254, + "grad_norm": 11.318432666073637, + "learning_rate": 2.6993763789178885e-06, + "loss": 0.6037, + "step": 6038 + }, + { + "epoch": 0.4904572403151141, + "grad_norm": 4.831316548854263, + "learning_rate": 2.698720844476562e-06, + "loss": 0.6919, + "step": 6039 + }, + { + "epoch": 0.4905384552911557, + "grad_norm": 6.464718545938012, + "learning_rate": 2.6980652962848055e-06, + "loss": 0.5024, + "step": 6040 + }, + { + "epoch": 0.49061967026719727, + "grad_norm": 8.179746403330494, + "learning_rate": 2.697409734387978e-06, + "loss": 0.4774, + "step": 6041 + }, + { + "epoch": 0.49070088524323885, + "grad_norm": 6.753372733627089, + "learning_rate": 2.6967541588314413e-06, + "loss": 0.4216, + "step": 6042 + }, + { + "epoch": 0.4907821002192804, + "grad_norm": 3.5059703839980707, + "learning_rate": 2.6960985696605583e-06, + "loss": 0.5735, + "step": 6043 + }, + { + "epoch": 0.490863315195322, + "grad_norm": 6.5528726281723335, + "learning_rate": 2.695442966920693e-06, + "loss": 0.4922, + "step": 6044 + }, + { + "epoch": 0.4909445301713636, + "grad_norm": 6.2801153498685744, + "learning_rate": 2.6947873506572083e-06, + "loss": 0.5194, + "step": 6045 + }, + { + "epoch": 0.49102574514740516, + "grad_norm": 2.729788705777957, + "learning_rate": 2.6941317209154694e-06, + "loss": 0.7296, + "step": 6046 + }, + { + "epoch": 0.4911069601234468, + "grad_norm": 5.110271057688403, + "learning_rate": 2.693476077740843e-06, + "loss": 0.4946, + "step": 6047 + }, + { + "epoch": 0.49118817509948837, + "grad_norm": 5.133921081919526, + "learning_rate": 2.6928204211786957e-06, + "loss": 0.5102, + "step": 6048 + }, + { + "epoch": 0.49126939007552994, + "grad_norm": 4.1327143511126, + "learning_rate": 2.6921647512743963e-06, + "loss": 0.4642, + "step": 6049 + }, + { + "epoch": 0.4913506050515715, + "grad_norm": 13.617171548443578, + "learning_rate": 2.691509068073313e-06, + "loss": 0.356, + "step": 6050 + }, + { + "epoch": 0.4914318200276131, + "grad_norm": 5.719645297157029, + "learning_rate": 2.6908533716208157e-06, + "loss": 0.5327, + "step": 6051 + }, + { + "epoch": 0.4915130350036547, + "grad_norm": 3.9479135140894224, + "learning_rate": 2.690197661962275e-06, + "loss": 0.6186, + "step": 6052 + }, + { + "epoch": 0.49159424997969625, + "grad_norm": 7.503761117502973, + "learning_rate": 2.6895419391430635e-06, + "loss": 0.3574, + "step": 6053 + }, + { + "epoch": 0.49167546495573783, + "grad_norm": 4.597046243404988, + "learning_rate": 2.688886203208552e-06, + "loss": 0.4214, + "step": 6054 + }, + { + "epoch": 0.4917566799317794, + "grad_norm": 19.152422694361487, + "learning_rate": 2.6882304542041147e-06, + "loss": 0.4696, + "step": 6055 + }, + { + "epoch": 0.491837894907821, + "grad_norm": 6.515260679020386, + "learning_rate": 2.687574692175127e-06, + "loss": 0.4959, + "step": 6056 + }, + { + "epoch": 0.49191910988386256, + "grad_norm": 4.2821690669001224, + "learning_rate": 2.6869189171669637e-06, + "loss": 0.3626, + "step": 6057 + }, + { + "epoch": 0.4920003248599042, + "grad_norm": 7.192130700433451, + "learning_rate": 2.686263129224999e-06, + "loss": 0.5963, + "step": 6058 + }, + { + "epoch": 0.49208153983594577, + "grad_norm": 7.6817361440233585, + "learning_rate": 2.685607328394613e-06, + "loss": 0.4168, + "step": 6059 + }, + { + "epoch": 0.49216275481198735, + "grad_norm": 11.249409008964918, + "learning_rate": 2.6849515147211814e-06, + "loss": 0.4758, + "step": 6060 + }, + { + "epoch": 0.4922439697880289, + "grad_norm": 3.9392524421599346, + "learning_rate": 2.6842956882500843e-06, + "loss": 0.445, + "step": 6061 + }, + { + "epoch": 0.4923251847640705, + "grad_norm": 9.559526227459175, + "learning_rate": 2.6836398490267006e-06, + "loss": 0.5492, + "step": 6062 + }, + { + "epoch": 0.4924063997401121, + "grad_norm": 3.917087406483406, + "learning_rate": 2.6829839970964112e-06, + "loss": 0.4452, + "step": 6063 + }, + { + "epoch": 0.49248761471615365, + "grad_norm": 3.7273772258833566, + "learning_rate": 2.682328132504598e-06, + "loss": 0.4458, + "step": 6064 + }, + { + "epoch": 0.49256882969219523, + "grad_norm": 4.9038911903534474, + "learning_rate": 2.6816722552966423e-06, + "loss": 0.5387, + "step": 6065 + }, + { + "epoch": 0.4926500446682368, + "grad_norm": 5.697167255856347, + "learning_rate": 2.6810163655179287e-06, + "loss": 0.4933, + "step": 6066 + }, + { + "epoch": 0.4927312596442784, + "grad_norm": 4.553361726145492, + "learning_rate": 2.6803604632138403e-06, + "loss": 0.381, + "step": 6067 + }, + { + "epoch": 0.49281247462031996, + "grad_norm": 4.7949903203120305, + "learning_rate": 2.6797045484297624e-06, + "loss": 0.5172, + "step": 6068 + }, + { + "epoch": 0.4928936895963616, + "grad_norm": 20.378872084713798, + "learning_rate": 2.6790486212110812e-06, + "loss": 0.4133, + "step": 6069 + }, + { + "epoch": 0.49297490457240317, + "grad_norm": 6.870346383155372, + "learning_rate": 2.678392681603183e-06, + "loss": 0.4685, + "step": 6070 + }, + { + "epoch": 0.49305611954844475, + "grad_norm": 4.53706289191984, + "learning_rate": 2.6777367296514547e-06, + "loss": 0.4881, + "step": 6071 + }, + { + "epoch": 0.4931373345244863, + "grad_norm": 5.005808865354581, + "learning_rate": 2.677080765401286e-06, + "loss": 0.5477, + "step": 6072 + }, + { + "epoch": 0.4932185495005279, + "grad_norm": 8.68217397091714, + "learning_rate": 2.6764247888980654e-06, + "loss": 0.4074, + "step": 6073 + }, + { + "epoch": 0.4932997644765695, + "grad_norm": 6.058970447210371, + "learning_rate": 2.675768800187182e-06, + "loss": 0.5763, + "step": 6074 + }, + { + "epoch": 0.49338097945261106, + "grad_norm": 2.9831711571626522, + "learning_rate": 2.67511279931403e-06, + "loss": 0.6046, + "step": 6075 + }, + { + "epoch": 0.49346219442865263, + "grad_norm": 3.841362882001304, + "learning_rate": 2.674456786323998e-06, + "loss": 0.505, + "step": 6076 + }, + { + "epoch": 0.4935434094046942, + "grad_norm": 5.296826365325529, + "learning_rate": 2.6738007612624792e-06, + "loss": 0.5466, + "step": 6077 + }, + { + "epoch": 0.4936246243807358, + "grad_norm": 4.678775568920299, + "learning_rate": 2.673144724174868e-06, + "loss": 0.5204, + "step": 6078 + }, + { + "epoch": 0.49370583935677737, + "grad_norm": 4.857667826957571, + "learning_rate": 2.6724886751065584e-06, + "loss": 0.5329, + "step": 6079 + }, + { + "epoch": 0.493787054332819, + "grad_norm": 5.592492702134483, + "learning_rate": 2.671832614102945e-06, + "loss": 0.529, + "step": 6080 + }, + { + "epoch": 0.4938682693088606, + "grad_norm": 61.655374457513666, + "learning_rate": 2.671176541209424e-06, + "loss": 0.5004, + "step": 6081 + }, + { + "epoch": 0.49394948428490215, + "grad_norm": 6.490521162736462, + "learning_rate": 2.6705204564713927e-06, + "loss": 0.4508, + "step": 6082 + }, + { + "epoch": 0.49403069926094373, + "grad_norm": 6.042977369262763, + "learning_rate": 2.669864359934249e-06, + "loss": 0.4754, + "step": 6083 + }, + { + "epoch": 0.4941119142369853, + "grad_norm": 7.550162317594159, + "learning_rate": 2.6692082516433886e-06, + "loss": 0.4356, + "step": 6084 + }, + { + "epoch": 0.4941931292130269, + "grad_norm": 3.954161538122901, + "learning_rate": 2.668552131644214e-06, + "loss": 0.6147, + "step": 6085 + }, + { + "epoch": 0.49427434418906846, + "grad_norm": 8.837611857542369, + "learning_rate": 2.667895999982124e-06, + "loss": 0.4406, + "step": 6086 + }, + { + "epoch": 0.49435555916511004, + "grad_norm": 5.060653912888538, + "learning_rate": 2.6672398567025188e-06, + "loss": 0.4472, + "step": 6087 + }, + { + "epoch": 0.4944367741411516, + "grad_norm": 9.229957236777977, + "learning_rate": 2.666583701850802e-06, + "loss": 0.507, + "step": 6088 + }, + { + "epoch": 0.4945179891171932, + "grad_norm": 4.035963295981821, + "learning_rate": 2.6659275354723735e-06, + "loss": 0.4867, + "step": 6089 + }, + { + "epoch": 0.49459920409323477, + "grad_norm": 8.351371921914652, + "learning_rate": 2.6652713576126376e-06, + "loss": 0.6755, + "step": 6090 + }, + { + "epoch": 0.4946804190692764, + "grad_norm": 5.78678933022576, + "learning_rate": 2.6646151683169985e-06, + "loss": 0.6439, + "step": 6091 + }, + { + "epoch": 0.494761634045318, + "grad_norm": 4.566506684904742, + "learning_rate": 2.6639589676308614e-06, + "loss": 0.7211, + "step": 6092 + }, + { + "epoch": 0.49484284902135955, + "grad_norm": 7.609049634242752, + "learning_rate": 2.663302755599631e-06, + "loss": 0.6008, + "step": 6093 + }, + { + "epoch": 0.49492406399740113, + "grad_norm": 3.3025767017322893, + "learning_rate": 2.6626465322687144e-06, + "loss": 0.5058, + "step": 6094 + }, + { + "epoch": 0.4950052789734427, + "grad_norm": 5.250107319379549, + "learning_rate": 2.6619902976835187e-06, + "loss": 0.3995, + "step": 6095 + }, + { + "epoch": 0.4950864939494843, + "grad_norm": 4.264694509387339, + "learning_rate": 2.6613340518894513e-06, + "loss": 0.4428, + "step": 6096 + }, + { + "epoch": 0.49516770892552586, + "grad_norm": 9.523848099068603, + "learning_rate": 2.6606777949319217e-06, + "loss": 0.4568, + "step": 6097 + }, + { + "epoch": 0.49524892390156744, + "grad_norm": 4.645197872989053, + "learning_rate": 2.6600215268563396e-06, + "loss": 0.7066, + "step": 6098 + }, + { + "epoch": 0.495330138877609, + "grad_norm": 6.723227949020278, + "learning_rate": 2.6593652477081146e-06, + "loss": 0.5934, + "step": 6099 + }, + { + "epoch": 0.4954113538536506, + "grad_norm": 6.836596280031364, + "learning_rate": 2.658708957532657e-06, + "loss": 0.4197, + "step": 6100 + }, + { + "epoch": 0.49549256882969217, + "grad_norm": 5.530454100104951, + "learning_rate": 2.6580526563753794e-06, + "loss": 0.5521, + "step": 6101 + }, + { + "epoch": 0.4955737838057338, + "grad_norm": 8.014803649260598, + "learning_rate": 2.6573963442816957e-06, + "loss": 0.4716, + "step": 6102 + }, + { + "epoch": 0.4956549987817754, + "grad_norm": 5.651578109021286, + "learning_rate": 2.656740021297017e-06, + "loss": 0.6337, + "step": 6103 + }, + { + "epoch": 0.49573621375781696, + "grad_norm": 4.497707754410902, + "learning_rate": 2.6560836874667584e-06, + "loss": 0.4835, + "step": 6104 + }, + { + "epoch": 0.49581742873385853, + "grad_norm": 6.748930438264841, + "learning_rate": 2.6554273428363352e-06, + "loss": 0.4341, + "step": 6105 + }, + { + "epoch": 0.4958986437099001, + "grad_norm": 4.813991480179138, + "learning_rate": 2.6547709874511622e-06, + "loss": 0.4681, + "step": 6106 + }, + { + "epoch": 0.4959798586859417, + "grad_norm": 4.661897396362085, + "learning_rate": 2.654114621356656e-06, + "loss": 0.4306, + "step": 6107 + }, + { + "epoch": 0.49606107366198327, + "grad_norm": 6.397578426337775, + "learning_rate": 2.6534582445982338e-06, + "loss": 0.4603, + "step": 6108 + }, + { + "epoch": 0.49614228863802484, + "grad_norm": 4.315053588094933, + "learning_rate": 2.6528018572213133e-06, + "loss": 0.6069, + "step": 6109 + }, + { + "epoch": 0.4962235036140664, + "grad_norm": 3.5982681139239876, + "learning_rate": 2.6521454592713125e-06, + "loss": 0.4975, + "step": 6110 + }, + { + "epoch": 0.496304718590108, + "grad_norm": 4.778399998810193, + "learning_rate": 2.6514890507936515e-06, + "loss": 0.4753, + "step": 6111 + }, + { + "epoch": 0.4963859335661496, + "grad_norm": 5.875526293696564, + "learning_rate": 2.6508326318337498e-06, + "loss": 0.6111, + "step": 6112 + }, + { + "epoch": 0.4964671485421912, + "grad_norm": 4.292823551355768, + "learning_rate": 2.6501762024370283e-06, + "loss": 0.3784, + "step": 6113 + }, + { + "epoch": 0.4965483635182328, + "grad_norm": 8.401173587655329, + "learning_rate": 2.6495197626489082e-06, + "loss": 0.582, + "step": 6114 + }, + { + "epoch": 0.49662957849427436, + "grad_norm": 4.321012772875746, + "learning_rate": 2.6488633125148127e-06, + "loss": 0.4502, + "step": 6115 + }, + { + "epoch": 0.49671079347031594, + "grad_norm": 5.379767516829725, + "learning_rate": 2.6482068520801625e-06, + "loss": 0.5668, + "step": 6116 + }, + { + "epoch": 0.4967920084463575, + "grad_norm": 5.218798786070938, + "learning_rate": 2.647550381390383e-06, + "loss": 0.4885, + "step": 6117 + }, + { + "epoch": 0.4968732234223991, + "grad_norm": 6.280161135864912, + "learning_rate": 2.6468939004908987e-06, + "loss": 0.5571, + "step": 6118 + }, + { + "epoch": 0.49695443839844067, + "grad_norm": 6.147745243094484, + "learning_rate": 2.646237409427133e-06, + "loss": 0.6213, + "step": 6119 + }, + { + "epoch": 0.49703565337448224, + "grad_norm": 4.237997643901119, + "learning_rate": 2.645580908244513e-06, + "loss": 0.5256, + "step": 6120 + }, + { + "epoch": 0.4971168683505238, + "grad_norm": 5.794983422357201, + "learning_rate": 2.644924396988465e-06, + "loss": 0.4717, + "step": 6121 + }, + { + "epoch": 0.4971980833265654, + "grad_norm": 3.028956483672569, + "learning_rate": 2.644267875704415e-06, + "loss": 0.605, + "step": 6122 + }, + { + "epoch": 0.497279298302607, + "grad_norm": 4.783447111346257, + "learning_rate": 2.6436113444377916e-06, + "loss": 0.3828, + "step": 6123 + }, + { + "epoch": 0.4973605132786486, + "grad_norm": 4.634896048686429, + "learning_rate": 2.6429548032340233e-06, + "loss": 0.4624, + "step": 6124 + }, + { + "epoch": 0.4974417282546902, + "grad_norm": 4.4180065168228575, + "learning_rate": 2.642298252138539e-06, + "loss": 0.471, + "step": 6125 + }, + { + "epoch": 0.49752294323073176, + "grad_norm": 7.029597857740861, + "learning_rate": 2.641641691196769e-06, + "loss": 0.3827, + "step": 6126 + }, + { + "epoch": 0.49760415820677334, + "grad_norm": 3.5588276162139922, + "learning_rate": 2.6409851204541435e-06, + "loss": 0.634, + "step": 6127 + }, + { + "epoch": 0.4976853731828149, + "grad_norm": 5.229212123221044, + "learning_rate": 2.640328539956094e-06, + "loss": 0.4562, + "step": 6128 + }, + { + "epoch": 0.4977665881588565, + "grad_norm": 4.266180122295215, + "learning_rate": 2.639671949748052e-06, + "loss": 0.5074, + "step": 6129 + }, + { + "epoch": 0.49784780313489807, + "grad_norm": 5.993522414034609, + "learning_rate": 2.6390153498754506e-06, + "loss": 0.5987, + "step": 6130 + }, + { + "epoch": 0.49792901811093965, + "grad_norm": 5.432626385109642, + "learning_rate": 2.638358740383723e-06, + "loss": 0.5943, + "step": 6131 + }, + { + "epoch": 0.4980102330869812, + "grad_norm": 7.744059010319982, + "learning_rate": 2.637702121318302e-06, + "loss": 0.4533, + "step": 6132 + }, + { + "epoch": 0.4980914480630228, + "grad_norm": 6.471183582152214, + "learning_rate": 2.6370454927246237e-06, + "loss": 0.4274, + "step": 6133 + }, + { + "epoch": 0.4981726630390644, + "grad_norm": 3.457874851254734, + "learning_rate": 2.6363888546481224e-06, + "loss": 0.4676, + "step": 6134 + }, + { + "epoch": 0.498253878015106, + "grad_norm": 8.481196069942994, + "learning_rate": 2.635732207134234e-06, + "loss": 0.4627, + "step": 6135 + }, + { + "epoch": 0.4983350929911476, + "grad_norm": 5.660075404955752, + "learning_rate": 2.635075550228395e-06, + "loss": 0.4784, + "step": 6136 + }, + { + "epoch": 0.49841630796718916, + "grad_norm": 4.313279838626725, + "learning_rate": 2.634418883976043e-06, + "loss": 0.4569, + "step": 6137 + }, + { + "epoch": 0.49849752294323074, + "grad_norm": 3.362477204863664, + "learning_rate": 2.6337622084226163e-06, + "loss": 0.5727, + "step": 6138 + }, + { + "epoch": 0.4985787379192723, + "grad_norm": 4.485740579944916, + "learning_rate": 2.633105523613551e-06, + "loss": 0.7045, + "step": 6139 + }, + { + "epoch": 0.4986599528953139, + "grad_norm": 3.989314569645018, + "learning_rate": 2.6324488295942897e-06, + "loss": 0.693, + "step": 6140 + }, + { + "epoch": 0.4987411678713555, + "grad_norm": 6.808212058806507, + "learning_rate": 2.6317921264102697e-06, + "loss": 0.4904, + "step": 6141 + }, + { + "epoch": 0.49882238284739705, + "grad_norm": 4.01346195857619, + "learning_rate": 2.6311354141069324e-06, + "loss": 0.6021, + "step": 6142 + }, + { + "epoch": 0.4989035978234386, + "grad_norm": 5.471950249286309, + "learning_rate": 2.630478692729718e-06, + "loss": 0.5009, + "step": 6143 + }, + { + "epoch": 0.4989848127994802, + "grad_norm": 4.820722399207098, + "learning_rate": 2.6298219623240685e-06, + "loss": 0.3804, + "step": 6144 + }, + { + "epoch": 0.4990660277755218, + "grad_norm": 3.164263910898727, + "learning_rate": 2.6291652229354264e-06, + "loss": 0.4665, + "step": 6145 + }, + { + "epoch": 0.4991472427515634, + "grad_norm": 5.584269076148355, + "learning_rate": 2.6285084746092347e-06, + "loss": 0.5254, + "step": 6146 + }, + { + "epoch": 0.499228457727605, + "grad_norm": 10.9678056140828, + "learning_rate": 2.627851717390936e-06, + "loss": 0.4869, + "step": 6147 + }, + { + "epoch": 0.49930967270364657, + "grad_norm": 6.984083124060342, + "learning_rate": 2.6271949513259764e-06, + "loss": 0.5502, + "step": 6148 + }, + { + "epoch": 0.49939088767968814, + "grad_norm": 4.28910430937024, + "learning_rate": 2.626538176459798e-06, + "loss": 0.5626, + "step": 6149 + }, + { + "epoch": 0.4994721026557297, + "grad_norm": 5.809158832217803, + "learning_rate": 2.625881392837849e-06, + "loss": 0.5614, + "step": 6150 + }, + { + "epoch": 0.4995533176317713, + "grad_norm": 4.895799235690263, + "learning_rate": 2.6252246005055725e-06, + "loss": 0.5047, + "step": 6151 + }, + { + "epoch": 0.4996345326078129, + "grad_norm": 3.584830115023326, + "learning_rate": 2.6245677995084163e-06, + "loss": 0.555, + "step": 6152 + }, + { + "epoch": 0.49971574758385445, + "grad_norm": 10.564765809267739, + "learning_rate": 2.6239109898918286e-06, + "loss": 0.4141, + "step": 6153 + }, + { + "epoch": 0.49979696255989603, + "grad_norm": 3.816580959787303, + "learning_rate": 2.6232541717012563e-06, + "loss": 0.5562, + "step": 6154 + }, + { + "epoch": 0.4998781775359376, + "grad_norm": 7.68222211219942, + "learning_rate": 2.6225973449821468e-06, + "loss": 0.4434, + "step": 6155 + }, + { + "epoch": 0.4999593925119792, + "grad_norm": 4.677405643402403, + "learning_rate": 2.6219405097799498e-06, + "loss": 0.67, + "step": 6156 + }, + { + "epoch": 0.5000406074880208, + "grad_norm": 6.735859677395967, + "learning_rate": 2.6212836661401154e-06, + "loss": 0.5044, + "step": 6157 + }, + { + "epoch": 0.5001218224640623, + "grad_norm": 5.559869392207337, + "learning_rate": 2.6206268141080924e-06, + "loss": 0.4942, + "step": 6158 + }, + { + "epoch": 0.500203037440104, + "grad_norm": 5.91589064626228, + "learning_rate": 2.619969953729333e-06, + "loss": 0.5946, + "step": 6159 + }, + { + "epoch": 0.5002842524161455, + "grad_norm": 9.884104436935436, + "learning_rate": 2.6193130850492876e-06, + "loss": 0.6033, + "step": 6160 + }, + { + "epoch": 0.5003654673921871, + "grad_norm": 5.191171578581443, + "learning_rate": 2.618656208113408e-06, + "loss": 0.4999, + "step": 6161 + }, + { + "epoch": 0.5004466823682288, + "grad_norm": 4.869819153010546, + "learning_rate": 2.6179993229671473e-06, + "loss": 0.5341, + "step": 6162 + }, + { + "epoch": 0.5005278973442703, + "grad_norm": 4.257762076221053, + "learning_rate": 2.6173424296559575e-06, + "loss": 0.5408, + "step": 6163 + }, + { + "epoch": 0.5006091123203119, + "grad_norm": 4.96043740363745, + "learning_rate": 2.6166855282252933e-06, + "loss": 0.5863, + "step": 6164 + }, + { + "epoch": 0.5006903272963534, + "grad_norm": 4.664873730144854, + "learning_rate": 2.616028618720607e-06, + "loss": 0.4936, + "step": 6165 + }, + { + "epoch": 0.5007715422723951, + "grad_norm": 5.757723223582411, + "learning_rate": 2.615371701187355e-06, + "loss": 0.5604, + "step": 6166 + }, + { + "epoch": 0.5008527572484366, + "grad_norm": 5.55920229153415, + "learning_rate": 2.6147147756709925e-06, + "loss": 0.5215, + "step": 6167 + }, + { + "epoch": 0.5009339722244782, + "grad_norm": 3.916022531924741, + "learning_rate": 2.614057842216973e-06, + "loss": 0.6314, + "step": 6168 + }, + { + "epoch": 0.5010151872005197, + "grad_norm": 3.748268469101728, + "learning_rate": 2.6134009008707555e-06, + "loss": 0.5058, + "step": 6169 + }, + { + "epoch": 0.5010964021765614, + "grad_norm": 3.866469109499023, + "learning_rate": 2.6127439516777956e-06, + "loss": 0.6175, + "step": 6170 + }, + { + "epoch": 0.5011776171526029, + "grad_norm": 9.784917821882507, + "learning_rate": 2.6120869946835513e-06, + "loss": 0.4369, + "step": 6171 + }, + { + "epoch": 0.5012588321286445, + "grad_norm": 3.918198159806199, + "learning_rate": 2.61143002993348e-06, + "loss": 0.5966, + "step": 6172 + }, + { + "epoch": 0.5013400471046862, + "grad_norm": 9.939958648361882, + "learning_rate": 2.61077305747304e-06, + "loss": 0.5353, + "step": 6173 + }, + { + "epoch": 0.5014212620807277, + "grad_norm": 6.322410360361211, + "learning_rate": 2.610116077347691e-06, + "loss": 0.4513, + "step": 6174 + }, + { + "epoch": 0.5015024770567693, + "grad_norm": 6.622717663567328, + "learning_rate": 2.609459089602892e-06, + "loss": 0.3957, + "step": 6175 + }, + { + "epoch": 0.5015836920328108, + "grad_norm": 8.064816332776429, + "learning_rate": 2.6088020942841034e-06, + "loss": 0.3698, + "step": 6176 + }, + { + "epoch": 0.5016649070088525, + "grad_norm": 7.086132865620177, + "learning_rate": 2.6081450914367864e-06, + "loss": 0.5031, + "step": 6177 + }, + { + "epoch": 0.501746121984894, + "grad_norm": 3.7954526083234077, + "learning_rate": 2.6074880811064003e-06, + "loss": 0.5348, + "step": 6178 + }, + { + "epoch": 0.5018273369609356, + "grad_norm": 5.020997847466201, + "learning_rate": 2.606831063338408e-06, + "loss": 0.4605, + "step": 6179 + }, + { + "epoch": 0.5019085519369771, + "grad_norm": 4.560830832766969, + "learning_rate": 2.6061740381782723e-06, + "loss": 0.5455, + "step": 6180 + }, + { + "epoch": 0.5019897669130188, + "grad_norm": 4.229120124206621, + "learning_rate": 2.605517005671454e-06, + "loss": 0.5157, + "step": 6181 + }, + { + "epoch": 0.5020709818890603, + "grad_norm": 4.581886089334441, + "learning_rate": 2.604859965863418e-06, + "loss": 0.4667, + "step": 6182 + }, + { + "epoch": 0.5021521968651019, + "grad_norm": 4.4186999593426055, + "learning_rate": 2.6042029187996277e-06, + "loss": 0.5935, + "step": 6183 + }, + { + "epoch": 0.5022334118411436, + "grad_norm": 4.736522021215294, + "learning_rate": 2.6035458645255467e-06, + "loss": 0.5252, + "step": 6184 + }, + { + "epoch": 0.5023146268171851, + "grad_norm": 4.668585962104124, + "learning_rate": 2.602888803086639e-06, + "loss": 0.5201, + "step": 6185 + }, + { + "epoch": 0.5023958417932267, + "grad_norm": 9.66768445142635, + "learning_rate": 2.602231734528372e-06, + "loss": 0.4524, + "step": 6186 + }, + { + "epoch": 0.5024770567692682, + "grad_norm": 4.15432156915695, + "learning_rate": 2.601574658896209e-06, + "loss": 0.6911, + "step": 6187 + }, + { + "epoch": 0.5025582717453099, + "grad_norm": 5.036750129329255, + "learning_rate": 2.6009175762356176e-06, + "loss": 0.587, + "step": 6188 + }, + { + "epoch": 0.5026394867213514, + "grad_norm": 9.244386485882309, + "learning_rate": 2.6002604865920645e-06, + "loss": 0.5369, + "step": 6189 + }, + { + "epoch": 0.502720701697393, + "grad_norm": 4.712476893869534, + "learning_rate": 2.5996033900110155e-06, + "loss": 0.6125, + "step": 6190 + }, + { + "epoch": 0.5028019166734345, + "grad_norm": 7.034161631409917, + "learning_rate": 2.5989462865379394e-06, + "loss": 0.6711, + "step": 6191 + }, + { + "epoch": 0.5028831316494762, + "grad_norm": 2.537027536386944, + "learning_rate": 2.598289176218304e-06, + "loss": 0.5937, + "step": 6192 + }, + { + "epoch": 0.5029643466255177, + "grad_norm": 3.394185226747455, + "learning_rate": 2.597632059097577e-06, + "loss": 0.6653, + "step": 6193 + }, + { + "epoch": 0.5030455616015593, + "grad_norm": 4.376129626575339, + "learning_rate": 2.5969749352212294e-06, + "loss": 0.6337, + "step": 6194 + }, + { + "epoch": 0.503126776577601, + "grad_norm": 4.748486742998477, + "learning_rate": 2.5963178046347286e-06, + "loss": 0.4952, + "step": 6195 + }, + { + "epoch": 0.5032079915536425, + "grad_norm": 7.369302442783826, + "learning_rate": 2.595660667383547e-06, + "loss": 0.3893, + "step": 6196 + }, + { + "epoch": 0.5032892065296841, + "grad_norm": 5.509513918719035, + "learning_rate": 2.5950035235131515e-06, + "loss": 0.5277, + "step": 6197 + }, + { + "epoch": 0.5033704215057256, + "grad_norm": 5.829645133957113, + "learning_rate": 2.594346373069016e-06, + "loss": 0.4237, + "step": 6198 + }, + { + "epoch": 0.5034516364817673, + "grad_norm": 5.626826411794362, + "learning_rate": 2.593689216096611e-06, + "loss": 0.4906, + "step": 6199 + }, + { + "epoch": 0.5035328514578088, + "grad_norm": 6.977550708147046, + "learning_rate": 2.5930320526414083e-06, + "loss": 0.548, + "step": 6200 + }, + { + "epoch": 0.5036140664338504, + "grad_norm": 7.209561596520351, + "learning_rate": 2.592374882748879e-06, + "loss": 0.4672, + "step": 6201 + }, + { + "epoch": 0.503695281409892, + "grad_norm": 5.543565192935603, + "learning_rate": 2.5917177064644974e-06, + "loss": 0.5535, + "step": 6202 + }, + { + "epoch": 0.5037764963859336, + "grad_norm": 5.366571236178294, + "learning_rate": 2.5910605238337355e-06, + "loss": 0.6972, + "step": 6203 + }, + { + "epoch": 0.5038577113619751, + "grad_norm": 6.261566607697583, + "learning_rate": 2.5904033349020675e-06, + "loss": 0.4503, + "step": 6204 + }, + { + "epoch": 0.5039389263380167, + "grad_norm": 5.805760768553691, + "learning_rate": 2.589746139714967e-06, + "loss": 0.4825, + "step": 6205 + }, + { + "epoch": 0.5040201413140584, + "grad_norm": 8.328312111239873, + "learning_rate": 2.5890889383179086e-06, + "loss": 0.4764, + "step": 6206 + }, + { + "epoch": 0.5041013562900999, + "grad_norm": 4.810122399520198, + "learning_rate": 2.588431730756367e-06, + "loss": 0.5084, + "step": 6207 + }, + { + "epoch": 0.5041825712661415, + "grad_norm": 3.7065509135197545, + "learning_rate": 2.5877745170758177e-06, + "loss": 0.524, + "step": 6208 + }, + { + "epoch": 0.504263786242183, + "grad_norm": 5.121308484330212, + "learning_rate": 2.5871172973217367e-06, + "loss": 0.5104, + "step": 6209 + }, + { + "epoch": 0.5043450012182247, + "grad_norm": 5.928958710988731, + "learning_rate": 2.5864600715396e-06, + "loss": 0.5344, + "step": 6210 + }, + { + "epoch": 0.5044262161942662, + "grad_norm": 5.887834972208363, + "learning_rate": 2.585802839774883e-06, + "loss": 0.4809, + "step": 6211 + }, + { + "epoch": 0.5045074311703078, + "grad_norm": 4.367368393282637, + "learning_rate": 2.5851456020730643e-06, + "loss": 0.6071, + "step": 6212 + }, + { + "epoch": 0.5045886461463494, + "grad_norm": 3.4461457206000374, + "learning_rate": 2.584488358479621e-06, + "loss": 0.6266, + "step": 6213 + }, + { + "epoch": 0.504669861122391, + "grad_norm": 3.1291003154252004, + "learning_rate": 2.5838311090400293e-06, + "loss": 0.5782, + "step": 6214 + }, + { + "epoch": 0.5047510760984325, + "grad_norm": 9.218149230525775, + "learning_rate": 2.58317385379977e-06, + "loss": 0.395, + "step": 6215 + }, + { + "epoch": 0.5048322910744741, + "grad_norm": 7.375352110089412, + "learning_rate": 2.582516592804319e-06, + "loss": 0.4645, + "step": 6216 + }, + { + "epoch": 0.5049135060505158, + "grad_norm": 6.15235163146789, + "learning_rate": 2.5818593260991565e-06, + "loss": 0.514, + "step": 6217 + }, + { + "epoch": 0.5049947210265573, + "grad_norm": 16.29462482704463, + "learning_rate": 2.581202053729762e-06, + "loss": 0.5103, + "step": 6218 + }, + { + "epoch": 0.5050759360025989, + "grad_norm": 4.3058949775554884, + "learning_rate": 2.580544775741616e-06, + "loss": 0.5023, + "step": 6219 + }, + { + "epoch": 0.5051571509786404, + "grad_norm": 3.2866867580977606, + "learning_rate": 2.579887492180197e-06, + "loss": 0.6187, + "step": 6220 + }, + { + "epoch": 0.5052383659546821, + "grad_norm": 5.219156744499874, + "learning_rate": 2.579230203090986e-06, + "loss": 0.3933, + "step": 6221 + }, + { + "epoch": 0.5053195809307236, + "grad_norm": 4.147926897905404, + "learning_rate": 2.578572908519465e-06, + "loss": 0.506, + "step": 6222 + }, + { + "epoch": 0.5054007959067652, + "grad_norm": 4.123405600563359, + "learning_rate": 2.577915608511114e-06, + "loss": 0.5677, + "step": 6223 + }, + { + "epoch": 0.5054820108828068, + "grad_norm": 7.522333120828346, + "learning_rate": 2.5772583031114157e-06, + "loss": 0.4562, + "step": 6224 + }, + { + "epoch": 0.5055632258588484, + "grad_norm": 6.046806353687886, + "learning_rate": 2.5766009923658516e-06, + "loss": 0.3849, + "step": 6225 + }, + { + "epoch": 0.5056444408348899, + "grad_norm": 4.707865925528092, + "learning_rate": 2.5759436763199047e-06, + "loss": 0.4868, + "step": 6226 + }, + { + "epoch": 0.5057256558109315, + "grad_norm": 4.715913190140127, + "learning_rate": 2.575286355019056e-06, + "loss": 0.587, + "step": 6227 + }, + { + "epoch": 0.5058068707869732, + "grad_norm": 3.2497965234461264, + "learning_rate": 2.5746290285087912e-06, + "loss": 0.7003, + "step": 6228 + }, + { + "epoch": 0.5058880857630147, + "grad_norm": 3.280084946712466, + "learning_rate": 2.5739716968345922e-06, + "loss": 0.6808, + "step": 6229 + }, + { + "epoch": 0.5059693007390563, + "grad_norm": 5.724019303106518, + "learning_rate": 2.573314360041943e-06, + "loss": 0.4032, + "step": 6230 + }, + { + "epoch": 0.5060505157150978, + "grad_norm": 4.723214741303482, + "learning_rate": 2.5726570181763286e-06, + "loss": 0.4832, + "step": 6231 + }, + { + "epoch": 0.5061317306911395, + "grad_norm": 3.7300991619481363, + "learning_rate": 2.571999671283233e-06, + "loss": 0.4762, + "step": 6232 + }, + { + "epoch": 0.506212945667181, + "grad_norm": 4.889315053935509, + "learning_rate": 2.5713423194081404e-06, + "loss": 0.5565, + "step": 6233 + }, + { + "epoch": 0.5062941606432226, + "grad_norm": 7.67062134201034, + "learning_rate": 2.570684962596538e-06, + "loss": 0.5569, + "step": 6234 + }, + { + "epoch": 0.5063753756192642, + "grad_norm": 4.100525163986654, + "learning_rate": 2.5700276008939096e-06, + "loss": 0.4171, + "step": 6235 + }, + { + "epoch": 0.5064565905953058, + "grad_norm": 4.883509578878357, + "learning_rate": 2.569370234345742e-06, + "loss": 0.4534, + "step": 6236 + }, + { + "epoch": 0.5065378055713473, + "grad_norm": 17.999174929712392, + "learning_rate": 2.568712862997522e-06, + "loss": 0.4791, + "step": 6237 + }, + { + "epoch": 0.5066190205473889, + "grad_norm": 3.501500494032202, + "learning_rate": 2.5680554868947346e-06, + "loss": 0.3923, + "step": 6238 + }, + { + "epoch": 0.5067002355234306, + "grad_norm": 5.490863237584197, + "learning_rate": 2.5673981060828672e-06, + "loss": 0.557, + "step": 6239 + }, + { + "epoch": 0.5067814504994721, + "grad_norm": 4.017635335289608, + "learning_rate": 2.5667407206074084e-06, + "loss": 0.4779, + "step": 6240 + }, + { + "epoch": 0.5068626654755137, + "grad_norm": 5.193790698559395, + "learning_rate": 2.566083330513845e-06, + "loss": 0.5404, + "step": 6241 + }, + { + "epoch": 0.5069438804515553, + "grad_norm": 3.781621395266743, + "learning_rate": 2.565425935847665e-06, + "loss": 0.5054, + "step": 6242 + }, + { + "epoch": 0.5070250954275969, + "grad_norm": 6.125227421255686, + "learning_rate": 2.564768536654356e-06, + "loss": 0.565, + "step": 6243 + }, + { + "epoch": 0.5071063104036384, + "grad_norm": 5.159395575524112, + "learning_rate": 2.564111132979407e-06, + "loss": 0.5639, + "step": 6244 + }, + { + "epoch": 0.50718752537968, + "grad_norm": 5.800011134638956, + "learning_rate": 2.563453724868308e-06, + "loss": 0.4247, + "step": 6245 + }, + { + "epoch": 0.5072687403557216, + "grad_norm": 3.2153805898476606, + "learning_rate": 2.5627963123665455e-06, + "loss": 0.4201, + "step": 6246 + }, + { + "epoch": 0.5073499553317632, + "grad_norm": 3.5092901295905405, + "learning_rate": 2.5621388955196113e-06, + "loss": 0.472, + "step": 6247 + }, + { + "epoch": 0.5074311703078047, + "grad_norm": 8.695984681785939, + "learning_rate": 2.561481474372995e-06, + "loss": 0.4915, + "step": 6248 + }, + { + "epoch": 0.5075123852838463, + "grad_norm": 4.710656984668509, + "learning_rate": 2.560824048972185e-06, + "loss": 0.4936, + "step": 6249 + }, + { + "epoch": 0.507593600259888, + "grad_norm": 7.293645049422129, + "learning_rate": 2.5601666193626735e-06, + "loss": 0.6042, + "step": 6250 + }, + { + "epoch": 0.5076748152359295, + "grad_norm": 5.2621850945387285, + "learning_rate": 2.55950918558995e-06, + "loss": 0.443, + "step": 6251 + }, + { + "epoch": 0.5077560302119711, + "grad_norm": 3.4152519821072578, + "learning_rate": 2.558851747699506e-06, + "loss": 0.697, + "step": 6252 + }, + { + "epoch": 0.5078372451880127, + "grad_norm": 5.202402703658923, + "learning_rate": 2.5581943057368317e-06, + "loss": 0.4773, + "step": 6253 + }, + { + "epoch": 0.5079184601640543, + "grad_norm": 7.606949152034, + "learning_rate": 2.5575368597474202e-06, + "loss": 0.3156, + "step": 6254 + }, + { + "epoch": 0.5079996751400958, + "grad_norm": 3.929776691209423, + "learning_rate": 2.5568794097767624e-06, + "loss": 0.4507, + "step": 6255 + }, + { + "epoch": 0.5080808901161374, + "grad_norm": 6.335298963383283, + "learning_rate": 2.5562219558703504e-06, + "loss": 0.6335, + "step": 6256 + }, + { + "epoch": 0.508162105092179, + "grad_norm": 7.8562095782475865, + "learning_rate": 2.555564498073677e-06, + "loss": 0.5108, + "step": 6257 + }, + { + "epoch": 0.5082433200682206, + "grad_norm": 4.8650466475894145, + "learning_rate": 2.554907036432235e-06, + "loss": 0.5093, + "step": 6258 + }, + { + "epoch": 0.5083245350442621, + "grad_norm": 5.229859161415607, + "learning_rate": 2.554249570991515e-06, + "loss": 0.5999, + "step": 6259 + }, + { + "epoch": 0.5084057500203037, + "grad_norm": 4.359502656532959, + "learning_rate": 2.5535921017970123e-06, + "loss": 0.435, + "step": 6260 + }, + { + "epoch": 0.5084869649963454, + "grad_norm": 15.539251038374521, + "learning_rate": 2.5529346288942203e-06, + "loss": 0.4546, + "step": 6261 + }, + { + "epoch": 0.5085681799723869, + "grad_norm": 4.359760496582075, + "learning_rate": 2.5522771523286317e-06, + "loss": 0.4273, + "step": 6262 + }, + { + "epoch": 0.5086493949484285, + "grad_norm": 5.6985722122365114, + "learning_rate": 2.551619672145741e-06, + "loss": 0.5175, + "step": 6263 + }, + { + "epoch": 0.5087306099244701, + "grad_norm": 4.451602003719397, + "learning_rate": 2.5509621883910424e-06, + "loss": 0.5247, + "step": 6264 + }, + { + "epoch": 0.5088118249005117, + "grad_norm": 6.12534073894256, + "learning_rate": 2.55030470111003e-06, + "loss": 0.4593, + "step": 6265 + }, + { + "epoch": 0.5088930398765532, + "grad_norm": 5.010881145167957, + "learning_rate": 2.5496472103481984e-06, + "loss": 0.523, + "step": 6266 + }, + { + "epoch": 0.5089742548525948, + "grad_norm": 6.63962229868484, + "learning_rate": 2.5489897161510425e-06, + "loss": 0.4728, + "step": 6267 + }, + { + "epoch": 0.5090554698286364, + "grad_norm": 4.6745969311887485, + "learning_rate": 2.5483322185640575e-06, + "loss": 0.5094, + "step": 6268 + }, + { + "epoch": 0.509136684804678, + "grad_norm": 7.671233857618601, + "learning_rate": 2.547674717632739e-06, + "loss": 0.5129, + "step": 6269 + }, + { + "epoch": 0.5092178997807195, + "grad_norm": 4.60865861862504, + "learning_rate": 2.547017213402582e-06, + "loss": 0.4483, + "step": 6270 + }, + { + "epoch": 0.5092991147567612, + "grad_norm": 6.2621430489481265, + "learning_rate": 2.546359705919083e-06, + "loss": 0.4052, + "step": 6271 + }, + { + "epoch": 0.5093803297328028, + "grad_norm": 5.501701524912082, + "learning_rate": 2.545702195227737e-06, + "loss": 0.504, + "step": 6272 + }, + { + "epoch": 0.5094615447088443, + "grad_norm": 9.43685395208808, + "learning_rate": 2.545044681374042e-06, + "loss": 0.7419, + "step": 6273 + }, + { + "epoch": 0.5095427596848859, + "grad_norm": 4.655667182094343, + "learning_rate": 2.544387164403493e-06, + "loss": 0.6057, + "step": 6274 + }, + { + "epoch": 0.5096239746609275, + "grad_norm": 6.207757916402813, + "learning_rate": 2.543729644361587e-06, + "loss": 0.3979, + "step": 6275 + }, + { + "epoch": 0.5097051896369691, + "grad_norm": 6.082343128329472, + "learning_rate": 2.5430721212938216e-06, + "loss": 0.4088, + "step": 6276 + }, + { + "epoch": 0.5097864046130106, + "grad_norm": 4.720976085216124, + "learning_rate": 2.542414595245693e-06, + "loss": 0.4561, + "step": 6277 + }, + { + "epoch": 0.5098676195890522, + "grad_norm": 4.73426813490108, + "learning_rate": 2.541757066262699e-06, + "loss": 0.6201, + "step": 6278 + }, + { + "epoch": 0.5099488345650938, + "grad_norm": 3.383613942551711, + "learning_rate": 2.541099534390336e-06, + "loss": 0.4821, + "step": 6279 + }, + { + "epoch": 0.5100300495411354, + "grad_norm": 8.433408631314988, + "learning_rate": 2.5404419996741042e-06, + "loss": 0.4494, + "step": 6280 + }, + { + "epoch": 0.5101112645171769, + "grad_norm": 5.815113828274007, + "learning_rate": 2.5397844621594997e-06, + "loss": 0.5385, + "step": 6281 + }, + { + "epoch": 0.5101924794932186, + "grad_norm": 4.687103952333867, + "learning_rate": 2.5391269218920202e-06, + "loss": 0.4344, + "step": 6282 + }, + { + "epoch": 0.5102736944692602, + "grad_norm": 5.089752144632809, + "learning_rate": 2.5384693789171656e-06, + "loss": 0.4559, + "step": 6283 + }, + { + "epoch": 0.5103549094453017, + "grad_norm": 5.854876777674836, + "learning_rate": 2.537811833280433e-06, + "loss": 0.5045, + "step": 6284 + }, + { + "epoch": 0.5104361244213433, + "grad_norm": 4.799615858367655, + "learning_rate": 2.5371542850273224e-06, + "loss": 0.4898, + "step": 6285 + }, + { + "epoch": 0.5105173393973849, + "grad_norm": 5.42400764526346, + "learning_rate": 2.5364967342033307e-06, + "loss": 0.6302, + "step": 6286 + }, + { + "epoch": 0.5105985543734265, + "grad_norm": 4.933143417479578, + "learning_rate": 2.5358391808539597e-06, + "loss": 0.4977, + "step": 6287 + }, + { + "epoch": 0.510679769349468, + "grad_norm": 3.4223345719076113, + "learning_rate": 2.535181625024706e-06, + "loss": 0.4809, + "step": 6288 + }, + { + "epoch": 0.5107609843255096, + "grad_norm": 9.061912369576826, + "learning_rate": 2.53452406676107e-06, + "loss": 0.3358, + "step": 6289 + }, + { + "epoch": 0.5108421993015512, + "grad_norm": 10.525328047917656, + "learning_rate": 2.5338665061085518e-06, + "loss": 0.6054, + "step": 6290 + }, + { + "epoch": 0.5109234142775928, + "grad_norm": 5.383474823413491, + "learning_rate": 2.5332089431126504e-06, + "loss": 0.6881, + "step": 6291 + }, + { + "epoch": 0.5110046292536343, + "grad_norm": 11.602351673058207, + "learning_rate": 2.532551377818866e-06, + "loss": 0.6387, + "step": 6292 + }, + { + "epoch": 0.511085844229676, + "grad_norm": 4.317046810663442, + "learning_rate": 2.5318938102726985e-06, + "loss": 0.6418, + "step": 6293 + }, + { + "epoch": 0.5111670592057176, + "grad_norm": 4.588601931278866, + "learning_rate": 2.5312362405196485e-06, + "loss": 0.4955, + "step": 6294 + }, + { + "epoch": 0.5112482741817591, + "grad_norm": 5.9618029800437204, + "learning_rate": 2.530578668605215e-06, + "loss": 0.4686, + "step": 6295 + }, + { + "epoch": 0.5113294891578007, + "grad_norm": 4.309047195757413, + "learning_rate": 2.5299210945749005e-06, + "loss": 0.624, + "step": 6296 + }, + { + "epoch": 0.5114107041338423, + "grad_norm": 6.062649085654284, + "learning_rate": 2.529263518474204e-06, + "loss": 0.5655, + "step": 6297 + }, + { + "epoch": 0.5114919191098839, + "grad_norm": 8.101899866072504, + "learning_rate": 2.5286059403486262e-06, + "loss": 0.6048, + "step": 6298 + }, + { + "epoch": 0.5115731340859254, + "grad_norm": 4.855939734927873, + "learning_rate": 2.52794836024367e-06, + "loss": 0.5202, + "step": 6299 + }, + { + "epoch": 0.511654349061967, + "grad_norm": 4.1533242601287315, + "learning_rate": 2.5272907782048343e-06, + "loss": 0.6758, + "step": 6300 + }, + { + "epoch": 0.5117355640380086, + "grad_norm": 44.854170715654014, + "learning_rate": 2.526633194277622e-06, + "loss": 0.4042, + "step": 6301 + }, + { + "epoch": 0.5118167790140502, + "grad_norm": 4.477675057177277, + "learning_rate": 2.5259756085075333e-06, + "loss": 0.641, + "step": 6302 + }, + { + "epoch": 0.5118979939900917, + "grad_norm": 5.00794940429514, + "learning_rate": 2.5253180209400697e-06, + "loss": 0.4153, + "step": 6303 + }, + { + "epoch": 0.5119792089661334, + "grad_norm": 35.355057930038036, + "learning_rate": 2.5246604316207327e-06, + "loss": 0.3842, + "step": 6304 + }, + { + "epoch": 0.512060423942175, + "grad_norm": 5.069001886917277, + "learning_rate": 2.524002840595025e-06, + "loss": 0.5766, + "step": 6305 + }, + { + "epoch": 0.5121416389182165, + "grad_norm": 4.7801204519502845, + "learning_rate": 2.523345247908448e-06, + "loss": 0.5046, + "step": 6306 + }, + { + "epoch": 0.5122228538942581, + "grad_norm": 13.094992788363395, + "learning_rate": 2.522687653606503e-06, + "loss": 0.4069, + "step": 6307 + }, + { + "epoch": 0.5123040688702997, + "grad_norm": 3.4115277705733273, + "learning_rate": 2.5220300577346925e-06, + "loss": 0.6193, + "step": 6308 + }, + { + "epoch": 0.5123852838463413, + "grad_norm": 4.934143576608908, + "learning_rate": 2.521372460338518e-06, + "loss": 0.4763, + "step": 6309 + }, + { + "epoch": 0.5124664988223828, + "grad_norm": 4.955171281673901, + "learning_rate": 2.5207148614634836e-06, + "loss": 0.5961, + "step": 6310 + }, + { + "epoch": 0.5125477137984245, + "grad_norm": 5.804504802010579, + "learning_rate": 2.5200572611550893e-06, + "loss": 0.4608, + "step": 6311 + }, + { + "epoch": 0.512628928774466, + "grad_norm": 3.3522351499110346, + "learning_rate": 2.5193996594588395e-06, + "loss": 0.5245, + "step": 6312 + }, + { + "epoch": 0.5127101437505076, + "grad_norm": 4.503464521908895, + "learning_rate": 2.5187420564202357e-06, + "loss": 0.6784, + "step": 6313 + }, + { + "epoch": 0.5127913587265491, + "grad_norm": 4.619599612350149, + "learning_rate": 2.518084452084781e-06, + "loss": 0.5239, + "step": 6314 + }, + { + "epoch": 0.5128725737025908, + "grad_norm": 6.2490361812009745, + "learning_rate": 2.5174268464979775e-06, + "loss": 0.4502, + "step": 6315 + }, + { + "epoch": 0.5129537886786324, + "grad_norm": 6.493719400660229, + "learning_rate": 2.516769239705328e-06, + "loss": 0.45, + "step": 6316 + }, + { + "epoch": 0.5130350036546739, + "grad_norm": 15.674054593332599, + "learning_rate": 2.5161116317523367e-06, + "loss": 0.5184, + "step": 6317 + }, + { + "epoch": 0.5131162186307155, + "grad_norm": 5.072386439299523, + "learning_rate": 2.5154540226845053e-06, + "loss": 0.4148, + "step": 6318 + }, + { + "epoch": 0.5131974336067571, + "grad_norm": 6.330463701552802, + "learning_rate": 2.514796412547337e-06, + "loss": 0.6593, + "step": 6319 + }, + { + "epoch": 0.5132786485827987, + "grad_norm": 6.329569453939536, + "learning_rate": 2.5141388013863366e-06, + "loss": 0.6006, + "step": 6320 + }, + { + "epoch": 0.5133598635588402, + "grad_norm": 4.995421040496941, + "learning_rate": 2.5134811892470046e-06, + "loss": 0.4651, + "step": 6321 + }, + { + "epoch": 0.5134410785348819, + "grad_norm": 6.312161136312402, + "learning_rate": 2.512823576174846e-06, + "loss": 0.34, + "step": 6322 + }, + { + "epoch": 0.5135222935109234, + "grad_norm": 4.377238702155564, + "learning_rate": 2.5121659622153643e-06, + "loss": 0.4297, + "step": 6323 + }, + { + "epoch": 0.513603508486965, + "grad_norm": 4.928516077514282, + "learning_rate": 2.511508347414062e-06, + "loss": 0.5385, + "step": 6324 + }, + { + "epoch": 0.5136847234630065, + "grad_norm": 7.762372366845343, + "learning_rate": 2.510850731816443e-06, + "loss": 0.3269, + "step": 6325 + }, + { + "epoch": 0.5137659384390482, + "grad_norm": 5.504193583882282, + "learning_rate": 2.510193115468011e-06, + "loss": 0.3859, + "step": 6326 + }, + { + "epoch": 0.5138471534150898, + "grad_norm": 6.152137635079484, + "learning_rate": 2.5095354984142682e-06, + "loss": 0.4028, + "step": 6327 + }, + { + "epoch": 0.5139283683911313, + "grad_norm": 7.085948581559598, + "learning_rate": 2.5088778807007203e-06, + "loss": 0.3954, + "step": 6328 + }, + { + "epoch": 0.514009583367173, + "grad_norm": 5.649925226999557, + "learning_rate": 2.5082202623728707e-06, + "loss": 0.4362, + "step": 6329 + }, + { + "epoch": 0.5140907983432145, + "grad_norm": 7.520999779577865, + "learning_rate": 2.507562643476222e-06, + "loss": 0.7279, + "step": 6330 + }, + { + "epoch": 0.5141720133192561, + "grad_norm": 3.4023713944158143, + "learning_rate": 2.5069050240562782e-06, + "loss": 0.745, + "step": 6331 + }, + { + "epoch": 0.5142532282952976, + "grad_norm": 5.4405959398028125, + "learning_rate": 2.5062474041585432e-06, + "loss": 0.5726, + "step": 6332 + }, + { + "epoch": 0.5143344432713393, + "grad_norm": 5.517222494769225, + "learning_rate": 2.5055897838285207e-06, + "loss": 0.5267, + "step": 6333 + }, + { + "epoch": 0.5144156582473808, + "grad_norm": 4.726623734798838, + "learning_rate": 2.504932163111715e-06, + "loss": 0.5198, + "step": 6334 + }, + { + "epoch": 0.5144968732234224, + "grad_norm": 4.440896263973417, + "learning_rate": 2.5042745420536295e-06, + "loss": 0.5967, + "step": 6335 + }, + { + "epoch": 0.5145780881994639, + "grad_norm": 3.37064042601589, + "learning_rate": 2.503616920699769e-06, + "loss": 0.5971, + "step": 6336 + }, + { + "epoch": 0.5146593031755056, + "grad_norm": 4.690221273319353, + "learning_rate": 2.502959299095636e-06, + "loss": 0.5007, + "step": 6337 + }, + { + "epoch": 0.5147405181515472, + "grad_norm": 4.908136131112301, + "learning_rate": 2.5023016772867353e-06, + "loss": 0.5678, + "step": 6338 + }, + { + "epoch": 0.5148217331275887, + "grad_norm": 5.443552173567539, + "learning_rate": 2.5016440553185718e-06, + "loss": 0.4118, + "step": 6339 + }, + { + "epoch": 0.5149029481036304, + "grad_norm": 6.1838875143833665, + "learning_rate": 2.5009864332366467e-06, + "loss": 0.6241, + "step": 6340 + }, + { + "epoch": 0.5149841630796719, + "grad_norm": 3.986184988106473, + "learning_rate": 2.5003288110864664e-06, + "loss": 0.5065, + "step": 6341 + }, + { + "epoch": 0.5150653780557135, + "grad_norm": 4.231672066174447, + "learning_rate": 2.4996711889135344e-06, + "loss": 0.6497, + "step": 6342 + }, + { + "epoch": 0.515146593031755, + "grad_norm": 5.815390035755676, + "learning_rate": 2.499013566763354e-06, + "loss": 0.4079, + "step": 6343 + }, + { + "epoch": 0.5152278080077967, + "grad_norm": 7.9133098452043455, + "learning_rate": 2.4983559446814295e-06, + "loss": 0.438, + "step": 6344 + }, + { + "epoch": 0.5153090229838382, + "grad_norm": 3.207812612582073, + "learning_rate": 2.497698322713265e-06, + "loss": 0.5876, + "step": 6345 + }, + { + "epoch": 0.5153902379598798, + "grad_norm": 4.353200703794427, + "learning_rate": 2.4970407009043646e-06, + "loss": 0.5272, + "step": 6346 + }, + { + "epoch": 0.5154714529359213, + "grad_norm": 5.892469802972017, + "learning_rate": 2.4963830793002313e-06, + "loss": 0.4392, + "step": 6347 + }, + { + "epoch": 0.515552667911963, + "grad_norm": 7.87771121157062, + "learning_rate": 2.495725457946371e-06, + "loss": 0.326, + "step": 6348 + }, + { + "epoch": 0.5156338828880046, + "grad_norm": 4.905801737169398, + "learning_rate": 2.4950678368882863e-06, + "loss": 0.5542, + "step": 6349 + }, + { + "epoch": 0.5157150978640461, + "grad_norm": 3.2384008143696117, + "learning_rate": 2.49441021617148e-06, + "loss": 0.5257, + "step": 6350 + }, + { + "epoch": 0.5157963128400878, + "grad_norm": 8.399266140485144, + "learning_rate": 2.4937525958414576e-06, + "loss": 0.6298, + "step": 6351 + }, + { + "epoch": 0.5158775278161293, + "grad_norm": 6.628609352541173, + "learning_rate": 2.4930949759437234e-06, + "loss": 0.3812, + "step": 6352 + }, + { + "epoch": 0.5159587427921709, + "grad_norm": 7.882154633725088, + "learning_rate": 2.492437356523779e-06, + "loss": 0.5552, + "step": 6353 + }, + { + "epoch": 0.5160399577682124, + "grad_norm": 9.337887778597489, + "learning_rate": 2.4917797376271297e-06, + "loss": 0.5093, + "step": 6354 + }, + { + "epoch": 0.5161211727442541, + "grad_norm": 7.376950318246357, + "learning_rate": 2.49112211929928e-06, + "loss": 0.5847, + "step": 6355 + }, + { + "epoch": 0.5162023877202956, + "grad_norm": 6.482321048724295, + "learning_rate": 2.4904645015857318e-06, + "loss": 0.5784, + "step": 6356 + }, + { + "epoch": 0.5162836026963372, + "grad_norm": 5.797801300962893, + "learning_rate": 2.48980688453199e-06, + "loss": 0.5229, + "step": 6357 + }, + { + "epoch": 0.5163648176723787, + "grad_norm": 5.360257972509032, + "learning_rate": 2.4891492681835584e-06, + "loss": 0.4893, + "step": 6358 + }, + { + "epoch": 0.5164460326484204, + "grad_norm": 5.202599723312153, + "learning_rate": 2.4884916525859386e-06, + "loss": 0.4775, + "step": 6359 + }, + { + "epoch": 0.516527247624462, + "grad_norm": 6.783080832818119, + "learning_rate": 2.4878340377846365e-06, + "loss": 0.4474, + "step": 6360 + }, + { + "epoch": 0.5166084626005035, + "grad_norm": 12.845276601042336, + "learning_rate": 2.4871764238251547e-06, + "loss": 0.4225, + "step": 6361 + }, + { + "epoch": 0.5166896775765452, + "grad_norm": 7.494098434686925, + "learning_rate": 2.4865188107529963e-06, + "loss": 0.5043, + "step": 6362 + }, + { + "epoch": 0.5167708925525867, + "grad_norm": 5.004627980302835, + "learning_rate": 2.485861198613664e-06, + "loss": 0.4393, + "step": 6363 + }, + { + "epoch": 0.5168521075286283, + "grad_norm": 6.726551242092707, + "learning_rate": 2.4852035874526632e-06, + "loss": 0.5499, + "step": 6364 + }, + { + "epoch": 0.5169333225046698, + "grad_norm": 4.666439480933289, + "learning_rate": 2.4845459773154964e-06, + "loss": 0.5416, + "step": 6365 + }, + { + "epoch": 0.5170145374807115, + "grad_norm": 5.180749129277349, + "learning_rate": 2.483888368247664e-06, + "loss": 0.5082, + "step": 6366 + }, + { + "epoch": 0.517095752456753, + "grad_norm": 4.494845006036604, + "learning_rate": 2.4832307602946726e-06, + "loss": 0.4328, + "step": 6367 + }, + { + "epoch": 0.5171769674327946, + "grad_norm": 7.722256382211164, + "learning_rate": 2.4825731535020242e-06, + "loss": 0.4709, + "step": 6368 + }, + { + "epoch": 0.5172581824088361, + "grad_norm": 5.336177066714621, + "learning_rate": 2.48191554791522e-06, + "loss": 0.5804, + "step": 6369 + }, + { + "epoch": 0.5173393973848778, + "grad_norm": 6.222339347839942, + "learning_rate": 2.481257943579765e-06, + "loss": 0.4719, + "step": 6370 + }, + { + "epoch": 0.5174206123609194, + "grad_norm": 7.613124790360803, + "learning_rate": 2.4806003405411617e-06, + "loss": 0.6593, + "step": 6371 + }, + { + "epoch": 0.5175018273369609, + "grad_norm": 6.354106819523852, + "learning_rate": 2.479942738844911e-06, + "loss": 0.4365, + "step": 6372 + }, + { + "epoch": 0.5175830423130026, + "grad_norm": 5.688366482514378, + "learning_rate": 2.479285138536517e-06, + "loss": 0.5308, + "step": 6373 + }, + { + "epoch": 0.5176642572890441, + "grad_norm": 5.415154693434397, + "learning_rate": 2.4786275396614823e-06, + "loss": 0.4804, + "step": 6374 + }, + { + "epoch": 0.5177454722650857, + "grad_norm": 28.82102125102892, + "learning_rate": 2.477969942265308e-06, + "loss": 0.3669, + "step": 6375 + }, + { + "epoch": 0.5178266872411272, + "grad_norm": 6.810481547484077, + "learning_rate": 2.4773123463934973e-06, + "loss": 0.6299, + "step": 6376 + }, + { + "epoch": 0.5179079022171689, + "grad_norm": 4.392949418938218, + "learning_rate": 2.476654752091553e-06, + "loss": 0.5736, + "step": 6377 + }, + { + "epoch": 0.5179891171932104, + "grad_norm": 4.2704517486876234, + "learning_rate": 2.4759971594049763e-06, + "loss": 0.5811, + "step": 6378 + }, + { + "epoch": 0.518070332169252, + "grad_norm": 6.692688448203209, + "learning_rate": 2.4753395683792677e-06, + "loss": 0.4761, + "step": 6379 + }, + { + "epoch": 0.5181515471452935, + "grad_norm": 7.334413784638161, + "learning_rate": 2.474681979059931e-06, + "loss": 0.5694, + "step": 6380 + }, + { + "epoch": 0.5182327621213352, + "grad_norm": 4.544276408270827, + "learning_rate": 2.474024391492468e-06, + "loss": 0.4231, + "step": 6381 + }, + { + "epoch": 0.5183139770973768, + "grad_norm": 4.593897927457953, + "learning_rate": 2.473366805722379e-06, + "loss": 0.6074, + "step": 6382 + }, + { + "epoch": 0.5183951920734183, + "grad_norm": 4.897073039683832, + "learning_rate": 2.472709221795166e-06, + "loss": 0.4831, + "step": 6383 + }, + { + "epoch": 0.51847640704946, + "grad_norm": 4.507107061670829, + "learning_rate": 2.4720516397563314e-06, + "loss": 0.5052, + "step": 6384 + }, + { + "epoch": 0.5185576220255015, + "grad_norm": 4.582461812162122, + "learning_rate": 2.471394059651374e-06, + "loss": 0.6067, + "step": 6385 + }, + { + "epoch": 0.5186388370015431, + "grad_norm": 5.866771836678322, + "learning_rate": 2.470736481525797e-06, + "loss": 0.6531, + "step": 6386 + }, + { + "epoch": 0.5187200519775846, + "grad_norm": 5.813106703432218, + "learning_rate": 2.470078905425101e-06, + "loss": 0.4097, + "step": 6387 + }, + { + "epoch": 0.5188012669536263, + "grad_norm": 7.253453752131115, + "learning_rate": 2.4694213313947855e-06, + "loss": 0.4998, + "step": 6388 + }, + { + "epoch": 0.5188824819296678, + "grad_norm": 3.081751300748125, + "learning_rate": 2.4687637594803527e-06, + "loss": 0.5751, + "step": 6389 + }, + { + "epoch": 0.5189636969057094, + "grad_norm": 5.45405710599491, + "learning_rate": 2.4681061897273028e-06, + "loss": 0.4083, + "step": 6390 + }, + { + "epoch": 0.519044911881751, + "grad_norm": 4.843125937608803, + "learning_rate": 2.4674486221811345e-06, + "loss": 0.4051, + "step": 6391 + }, + { + "epoch": 0.5191261268577926, + "grad_norm": 6.00791556685988, + "learning_rate": 2.46679105688735e-06, + "loss": 0.5527, + "step": 6392 + }, + { + "epoch": 0.5192073418338342, + "grad_norm": 3.547137678101548, + "learning_rate": 2.466133493891449e-06, + "loss": 0.4875, + "step": 6393 + }, + { + "epoch": 0.5192885568098757, + "grad_norm": 8.66243792163446, + "learning_rate": 2.46547593323893e-06, + "loss": 0.496, + "step": 6394 + }, + { + "epoch": 0.5193697717859174, + "grad_norm": 4.430513402849331, + "learning_rate": 2.464818374975295e-06, + "loss": 0.4364, + "step": 6395 + }, + { + "epoch": 0.5194509867619589, + "grad_norm": 3.7975317298710625, + "learning_rate": 2.4641608191460415e-06, + "loss": 0.6302, + "step": 6396 + }, + { + "epoch": 0.5195322017380005, + "grad_norm": 3.888846410063618, + "learning_rate": 2.46350326579667e-06, + "loss": 0.5352, + "step": 6397 + }, + { + "epoch": 0.519613416714042, + "grad_norm": 4.412943996478983, + "learning_rate": 2.462845714972679e-06, + "loss": 0.4854, + "step": 6398 + }, + { + "epoch": 0.5196946316900837, + "grad_norm": 3.706708426188841, + "learning_rate": 2.4621881667195676e-06, + "loss": 0.4375, + "step": 6399 + }, + { + "epoch": 0.5197758466661252, + "grad_norm": 3.8449883186160787, + "learning_rate": 2.4615306210828357e-06, + "loss": 0.5516, + "step": 6400 + }, + { + "epoch": 0.5198570616421668, + "grad_norm": 5.909942632055917, + "learning_rate": 2.46087307810798e-06, + "loss": 0.5237, + "step": 6401 + }, + { + "epoch": 0.5199382766182084, + "grad_norm": 3.547279382037591, + "learning_rate": 2.460215537840501e-06, + "loss": 0.4175, + "step": 6402 + }, + { + "epoch": 0.52001949159425, + "grad_norm": 4.882379724186628, + "learning_rate": 2.459558000325897e-06, + "loss": 0.6123, + "step": 6403 + }, + { + "epoch": 0.5201007065702916, + "grad_norm": 4.050921637738143, + "learning_rate": 2.458900465609664e-06, + "loss": 0.5215, + "step": 6404 + }, + { + "epoch": 0.5201819215463331, + "grad_norm": 3.382346078488962, + "learning_rate": 2.4582429337373018e-06, + "loss": 0.4617, + "step": 6405 + }, + { + "epoch": 0.5202631365223748, + "grad_norm": 7.257748442971193, + "learning_rate": 2.4575854047543082e-06, + "loss": 0.4909, + "step": 6406 + }, + { + "epoch": 0.5203443514984163, + "grad_norm": 6.459780012681047, + "learning_rate": 2.456927878706179e-06, + "loss": 0.4125, + "step": 6407 + }, + { + "epoch": 0.5204255664744579, + "grad_norm": 3.8936939687249863, + "learning_rate": 2.4562703556384136e-06, + "loss": 0.5746, + "step": 6408 + }, + { + "epoch": 0.5205067814504994, + "grad_norm": 5.650603779541892, + "learning_rate": 2.4556128355965076e-06, + "loss": 0.4194, + "step": 6409 + }, + { + "epoch": 0.5205879964265411, + "grad_norm": 6.233819656387275, + "learning_rate": 2.454955318625958e-06, + "loss": 0.5296, + "step": 6410 + }, + { + "epoch": 0.5206692114025826, + "grad_norm": 5.175873926721092, + "learning_rate": 2.4542978047722633e-06, + "loss": 0.5687, + "step": 6411 + }, + { + "epoch": 0.5207504263786242, + "grad_norm": 7.441015494915213, + "learning_rate": 2.453640294080918e-06, + "loss": 0.7367, + "step": 6412 + }, + { + "epoch": 0.5208316413546658, + "grad_norm": 7.282413948152037, + "learning_rate": 2.452982786597419e-06, + "loss": 0.5059, + "step": 6413 + }, + { + "epoch": 0.5209128563307074, + "grad_norm": 5.063377741570186, + "learning_rate": 2.452325282367262e-06, + "loss": 0.4086, + "step": 6414 + }, + { + "epoch": 0.520994071306749, + "grad_norm": 4.377231675145512, + "learning_rate": 2.4516677814359434e-06, + "loss": 0.5554, + "step": 6415 + }, + { + "epoch": 0.5210752862827905, + "grad_norm": 5.111222932344912, + "learning_rate": 2.4510102838489587e-06, + "loss": 0.5666, + "step": 6416 + }, + { + "epoch": 0.5211565012588322, + "grad_norm": 2.6572969128420363, + "learning_rate": 2.4503527896518025e-06, + "loss": 0.5856, + "step": 6417 + }, + { + "epoch": 0.5212377162348737, + "grad_norm": 3.467977660333967, + "learning_rate": 2.449695298889971e-06, + "loss": 0.5549, + "step": 6418 + }, + { + "epoch": 0.5213189312109153, + "grad_norm": 3.9586980008450547, + "learning_rate": 2.449037811608959e-06, + "loss": 0.5114, + "step": 6419 + }, + { + "epoch": 0.5214001461869568, + "grad_norm": 6.6024044152724795, + "learning_rate": 2.4483803278542594e-06, + "loss": 0.4881, + "step": 6420 + }, + { + "epoch": 0.5214813611629985, + "grad_norm": 3.872306998345536, + "learning_rate": 2.447722847671369e-06, + "loss": 0.4957, + "step": 6421 + }, + { + "epoch": 0.52156257613904, + "grad_norm": 4.928256043695277, + "learning_rate": 2.4470653711057805e-06, + "loss": 0.5453, + "step": 6422 + }, + { + "epoch": 0.5216437911150816, + "grad_norm": 5.00049623611133, + "learning_rate": 2.446407898202988e-06, + "loss": 0.4302, + "step": 6423 + }, + { + "epoch": 0.5217250060911232, + "grad_norm": 6.210683915021455, + "learning_rate": 2.445750429008486e-06, + "loss": 0.4867, + "step": 6424 + }, + { + "epoch": 0.5218062210671648, + "grad_norm": 4.733137071274775, + "learning_rate": 2.4450929635677667e-06, + "loss": 0.5342, + "step": 6425 + }, + { + "epoch": 0.5218874360432064, + "grad_norm": 3.573213441447143, + "learning_rate": 2.4444355019263235e-06, + "loss": 0.5391, + "step": 6426 + }, + { + "epoch": 0.5219686510192479, + "grad_norm": 3.9218556675383685, + "learning_rate": 2.44377804412965e-06, + "loss": 0.5641, + "step": 6427 + }, + { + "epoch": 0.5220498659952896, + "grad_norm": 3.72842203066307, + "learning_rate": 2.443120590223238e-06, + "loss": 0.5727, + "step": 6428 + }, + { + "epoch": 0.5221310809713311, + "grad_norm": 4.5179452250246355, + "learning_rate": 2.4424631402525797e-06, + "loss": 0.5027, + "step": 6429 + }, + { + "epoch": 0.5222122959473727, + "grad_norm": 7.296740350963874, + "learning_rate": 2.4418056942631687e-06, + "loss": 0.4627, + "step": 6430 + }, + { + "epoch": 0.5222935109234143, + "grad_norm": 4.733022729564796, + "learning_rate": 2.4411482523004946e-06, + "loss": 0.4634, + "step": 6431 + }, + { + "epoch": 0.5223747258994559, + "grad_norm": 5.089595564180344, + "learning_rate": 2.4404908144100513e-06, + "loss": 0.3586, + "step": 6432 + }, + { + "epoch": 0.5224559408754974, + "grad_norm": 5.358613043214558, + "learning_rate": 2.4398333806373274e-06, + "loss": 0.465, + "step": 6433 + }, + { + "epoch": 0.522537155851539, + "grad_norm": 5.524190371189003, + "learning_rate": 2.4391759510278153e-06, + "loss": 0.5472, + "step": 6434 + }, + { + "epoch": 0.5226183708275806, + "grad_norm": 4.103822972000649, + "learning_rate": 2.438518525627006e-06, + "loss": 0.5572, + "step": 6435 + }, + { + "epoch": 0.5226995858036222, + "grad_norm": 5.959127088930798, + "learning_rate": 2.4378611044803887e-06, + "loss": 0.4209, + "step": 6436 + }, + { + "epoch": 0.5227808007796638, + "grad_norm": 5.31529709254031, + "learning_rate": 2.437203687633455e-06, + "loss": 0.3897, + "step": 6437 + }, + { + "epoch": 0.5228620157557053, + "grad_norm": 3.7128349527804656, + "learning_rate": 2.436546275131693e-06, + "loss": 0.4428, + "step": 6438 + }, + { + "epoch": 0.522943230731747, + "grad_norm": 3.784355733006889, + "learning_rate": 2.435888867020593e-06, + "loss": 0.575, + "step": 6439 + }, + { + "epoch": 0.5230244457077885, + "grad_norm": 4.907099811863354, + "learning_rate": 2.435231463345645e-06, + "loss": 0.5123, + "step": 6440 + }, + { + "epoch": 0.5231056606838301, + "grad_norm": 3.094941647382156, + "learning_rate": 2.4345740641523362e-06, + "loss": 0.6211, + "step": 6441 + }, + { + "epoch": 0.5231868756598717, + "grad_norm": 5.697788871531617, + "learning_rate": 2.4339166694861553e-06, + "loss": 0.4124, + "step": 6442 + }, + { + "epoch": 0.5232680906359133, + "grad_norm": 5.825537615998602, + "learning_rate": 2.433259279392592e-06, + "loss": 0.4934, + "step": 6443 + }, + { + "epoch": 0.5233493056119548, + "grad_norm": 4.894893357625769, + "learning_rate": 2.432601893917133e-06, + "loss": 0.5467, + "step": 6444 + }, + { + "epoch": 0.5234305205879964, + "grad_norm": 7.012722371867207, + "learning_rate": 2.431944513105266e-06, + "loss": 0.5173, + "step": 6445 + }, + { + "epoch": 0.523511735564038, + "grad_norm": 3.8914614811391144, + "learning_rate": 2.4312871370024794e-06, + "loss": 0.5101, + "step": 6446 + }, + { + "epoch": 0.5235929505400796, + "grad_norm": 4.15063424291914, + "learning_rate": 2.4306297656542584e-06, + "loss": 0.4239, + "step": 6447 + }, + { + "epoch": 0.5236741655161212, + "grad_norm": 26.114252256627633, + "learning_rate": 2.4299723991060904e-06, + "loss": 0.5355, + "step": 6448 + }, + { + "epoch": 0.5237553804921627, + "grad_norm": 5.921632572418434, + "learning_rate": 2.4293150374034625e-06, + "loss": 0.3273, + "step": 6449 + }, + { + "epoch": 0.5238365954682044, + "grad_norm": 5.16163949915224, + "learning_rate": 2.4286576805918604e-06, + "loss": 0.4013, + "step": 6450 + }, + { + "epoch": 0.5239178104442459, + "grad_norm": 3.8705275409101643, + "learning_rate": 2.4280003287167684e-06, + "loss": 0.4692, + "step": 6451 + }, + { + "epoch": 0.5239990254202875, + "grad_norm": 4.713434827452213, + "learning_rate": 2.427342981823672e-06, + "loss": 0.4827, + "step": 6452 + }, + { + "epoch": 0.5240802403963291, + "grad_norm": 9.847453330969838, + "learning_rate": 2.426685639958058e-06, + "loss": 0.597, + "step": 6453 + }, + { + "epoch": 0.5241614553723707, + "grad_norm": 14.084536911041138, + "learning_rate": 2.426028303165409e-06, + "loss": 0.7119, + "step": 6454 + }, + { + "epoch": 0.5242426703484122, + "grad_norm": 4.162924811617758, + "learning_rate": 2.425370971491209e-06, + "loss": 0.5353, + "step": 6455 + }, + { + "epoch": 0.5243238853244538, + "grad_norm": 5.310608964267923, + "learning_rate": 2.424713644980945e-06, + "loss": 0.4668, + "step": 6456 + }, + { + "epoch": 0.5244051003004954, + "grad_norm": 3.485581874385866, + "learning_rate": 2.424056323680097e-06, + "loss": 0.5376, + "step": 6457 + }, + { + "epoch": 0.524486315276537, + "grad_norm": 4.466249110985916, + "learning_rate": 2.423399007634149e-06, + "loss": 0.5362, + "step": 6458 + }, + { + "epoch": 0.5245675302525786, + "grad_norm": 15.507085752685619, + "learning_rate": 2.422741696888585e-06, + "loss": 0.4823, + "step": 6459 + }, + { + "epoch": 0.5246487452286202, + "grad_norm": 3.42283227124507, + "learning_rate": 2.4220843914888865e-06, + "loss": 0.5139, + "step": 6460 + }, + { + "epoch": 0.5247299602046618, + "grad_norm": 4.471153348445109, + "learning_rate": 2.4214270914805353e-06, + "loss": 0.6101, + "step": 6461 + }, + { + "epoch": 0.5248111751807033, + "grad_norm": 4.937962838447406, + "learning_rate": 2.4207697969090145e-06, + "loss": 0.5725, + "step": 6462 + }, + { + "epoch": 0.5248923901567449, + "grad_norm": 3.4301025114527355, + "learning_rate": 2.420112507819804e-06, + "loss": 0.6678, + "step": 6463 + }, + { + "epoch": 0.5249736051327865, + "grad_norm": 9.810088340742176, + "learning_rate": 2.4194552242583845e-06, + "loss": 0.4788, + "step": 6464 + }, + { + "epoch": 0.5250548201088281, + "grad_norm": 5.247524236071172, + "learning_rate": 2.4187979462702382e-06, + "loss": 0.588, + "step": 6465 + }, + { + "epoch": 0.5251360350848696, + "grad_norm": 3.8501987558213133, + "learning_rate": 2.4181406739008443e-06, + "loss": 0.5239, + "step": 6466 + }, + { + "epoch": 0.5252172500609112, + "grad_norm": 4.286565115822655, + "learning_rate": 2.417483407195682e-06, + "loss": 0.4764, + "step": 6467 + }, + { + "epoch": 0.5252984650369528, + "grad_norm": 2.9196698091729347, + "learning_rate": 2.416826146200231e-06, + "loss": 0.5093, + "step": 6468 + }, + { + "epoch": 0.5253796800129944, + "grad_norm": 4.6923054622476, + "learning_rate": 2.4161688909599715e-06, + "loss": 0.6884, + "step": 6469 + }, + { + "epoch": 0.525460894989036, + "grad_norm": 3.966646784959062, + "learning_rate": 2.4155116415203804e-06, + "loss": 0.5132, + "step": 6470 + }, + { + "epoch": 0.5255421099650776, + "grad_norm": 3.6916733502842956, + "learning_rate": 2.4148543979269357e-06, + "loss": 0.5949, + "step": 6471 + }, + { + "epoch": 0.5256233249411192, + "grad_norm": 4.986568652074796, + "learning_rate": 2.4141971602251176e-06, + "loss": 0.5262, + "step": 6472 + }, + { + "epoch": 0.5257045399171607, + "grad_norm": 7.761776773353264, + "learning_rate": 2.4135399284604012e-06, + "loss": 0.3962, + "step": 6473 + }, + { + "epoch": 0.5257857548932023, + "grad_norm": 28.513128146924068, + "learning_rate": 2.4128827026782633e-06, + "loss": 0.655, + "step": 6474 + }, + { + "epoch": 0.5258669698692439, + "grad_norm": 5.030638393005198, + "learning_rate": 2.4122254829241827e-06, + "loss": 0.6157, + "step": 6475 + }, + { + "epoch": 0.5259481848452855, + "grad_norm": 3.759316711837708, + "learning_rate": 2.4115682692436337e-06, + "loss": 0.4513, + "step": 6476 + }, + { + "epoch": 0.526029399821327, + "grad_norm": 5.1999339488745395, + "learning_rate": 2.4109110616820918e-06, + "loss": 0.427, + "step": 6477 + }, + { + "epoch": 0.5261106147973686, + "grad_norm": 9.081659712327916, + "learning_rate": 2.4102538602850337e-06, + "loss": 0.4293, + "step": 6478 + }, + { + "epoch": 0.5261918297734102, + "grad_norm": 3.9469459629155574, + "learning_rate": 2.4095966650979342e-06, + "loss": 0.5247, + "step": 6479 + }, + { + "epoch": 0.5262730447494518, + "grad_norm": 4.830126815254179, + "learning_rate": 2.4089394761662653e-06, + "loss": 0.5248, + "step": 6480 + }, + { + "epoch": 0.5263542597254934, + "grad_norm": 5.427859891508367, + "learning_rate": 2.4082822935355035e-06, + "loss": 0.4429, + "step": 6481 + }, + { + "epoch": 0.526435474701535, + "grad_norm": 5.032571069469282, + "learning_rate": 2.4076251172511224e-06, + "loss": 0.4578, + "step": 6482 + }, + { + "epoch": 0.5265166896775766, + "grad_norm": 3.8795340568732364, + "learning_rate": 2.4069679473585925e-06, + "loss": 0.5519, + "step": 6483 + }, + { + "epoch": 0.5265979046536181, + "grad_norm": 3.9342593469573877, + "learning_rate": 2.4063107839033894e-06, + "loss": 0.5532, + "step": 6484 + }, + { + "epoch": 0.5266791196296597, + "grad_norm": 6.08364360680254, + "learning_rate": 2.4056536269309847e-06, + "loss": 0.5749, + "step": 6485 + }, + { + "epoch": 0.5267603346057013, + "grad_norm": 6.140920876184235, + "learning_rate": 2.4049964764868493e-06, + "loss": 0.3915, + "step": 6486 + }, + { + "epoch": 0.5268415495817429, + "grad_norm": 4.532054993744361, + "learning_rate": 2.4043393326164536e-06, + "loss": 0.4485, + "step": 6487 + }, + { + "epoch": 0.5269227645577844, + "grad_norm": 4.418973334061362, + "learning_rate": 2.403682195365272e-06, + "loss": 0.4327, + "step": 6488 + }, + { + "epoch": 0.527003979533826, + "grad_norm": 4.167489894614658, + "learning_rate": 2.4030250647787714e-06, + "loss": 0.545, + "step": 6489 + }, + { + "epoch": 0.5270851945098676, + "grad_norm": 5.917681962851331, + "learning_rate": 2.402367940902423e-06, + "loss": 0.5363, + "step": 6490 + }, + { + "epoch": 0.5271664094859092, + "grad_norm": 10.913996246318028, + "learning_rate": 2.401710823781697e-06, + "loss": 0.3902, + "step": 6491 + }, + { + "epoch": 0.5272476244619508, + "grad_norm": 25.430239522056482, + "learning_rate": 2.4010537134620614e-06, + "loss": 0.4407, + "step": 6492 + }, + { + "epoch": 0.5273288394379924, + "grad_norm": 9.226096785828776, + "learning_rate": 2.400396609988985e-06, + "loss": 0.5414, + "step": 6493 + }, + { + "epoch": 0.527410054414034, + "grad_norm": 4.553011519720875, + "learning_rate": 2.3997395134079367e-06, + "loss": 0.5125, + "step": 6494 + }, + { + "epoch": 0.5274912693900755, + "grad_norm": 3.85392516623142, + "learning_rate": 2.399082423764383e-06, + "loss": 0.5804, + "step": 6495 + }, + { + "epoch": 0.5275724843661171, + "grad_norm": 6.43430352311751, + "learning_rate": 2.3984253411037913e-06, + "loss": 0.4332, + "step": 6496 + }, + { + "epoch": 0.5276536993421587, + "grad_norm": 5.348598603512892, + "learning_rate": 2.397768265471629e-06, + "loss": 0.4464, + "step": 6497 + }, + { + "epoch": 0.5277349143182003, + "grad_norm": 5.503883803363463, + "learning_rate": 2.397111196913362e-06, + "loss": 0.5333, + "step": 6498 + }, + { + "epoch": 0.5278161292942418, + "grad_norm": 5.084027665154841, + "learning_rate": 2.396454135474454e-06, + "loss": 0.5963, + "step": 6499 + }, + { + "epoch": 0.5278973442702835, + "grad_norm": 4.104843883398135, + "learning_rate": 2.3957970812003727e-06, + "loss": 0.6582, + "step": 6500 + }, + { + "epoch": 0.527978559246325, + "grad_norm": 6.334197801454462, + "learning_rate": 2.3951400341365827e-06, + "loss": 0.5699, + "step": 6501 + }, + { + "epoch": 0.5280597742223666, + "grad_norm": 6.9437969081416435, + "learning_rate": 2.394482994328546e-06, + "loss": 0.4565, + "step": 6502 + }, + { + "epoch": 0.5281409891984082, + "grad_norm": 3.941363956260876, + "learning_rate": 2.393825961821728e-06, + "loss": 0.705, + "step": 6503 + }, + { + "epoch": 0.5282222041744498, + "grad_norm": 3.7273474058501974, + "learning_rate": 2.3931689366615926e-06, + "loss": 0.488, + "step": 6504 + }, + { + "epoch": 0.5283034191504914, + "grad_norm": 5.084976289181063, + "learning_rate": 2.392511918893601e-06, + "loss": 0.5142, + "step": 6505 + }, + { + "epoch": 0.5283846341265329, + "grad_norm": 6.4658935525713295, + "learning_rate": 2.3918549085632145e-06, + "loss": 0.4072, + "step": 6506 + }, + { + "epoch": 0.5284658491025745, + "grad_norm": 3.651970957929354, + "learning_rate": 2.3911979057158974e-06, + "loss": 0.4404, + "step": 6507 + }, + { + "epoch": 0.5285470640786161, + "grad_norm": 3.5446244109539045, + "learning_rate": 2.3905409103971096e-06, + "loss": 0.5651, + "step": 6508 + }, + { + "epoch": 0.5286282790546577, + "grad_norm": 3.9057802685762217, + "learning_rate": 2.38988392265231e-06, + "loss": 0.4663, + "step": 6509 + }, + { + "epoch": 0.5287094940306992, + "grad_norm": 2.871351243716771, + "learning_rate": 2.389226942526961e-06, + "loss": 0.5126, + "step": 6510 + }, + { + "epoch": 0.5287907090067409, + "grad_norm": 6.264655385586473, + "learning_rate": 2.3885699700665217e-06, + "loss": 0.463, + "step": 6511 + }, + { + "epoch": 0.5288719239827824, + "grad_norm": 6.176994725304146, + "learning_rate": 2.3879130053164495e-06, + "loss": 0.3438, + "step": 6512 + }, + { + "epoch": 0.528953138958824, + "grad_norm": 3.841956377687345, + "learning_rate": 2.3872560483222048e-06, + "loss": 0.6078, + "step": 6513 + }, + { + "epoch": 0.5290343539348656, + "grad_norm": 3.946093406576217, + "learning_rate": 2.3865990991292458e-06, + "loss": 0.49, + "step": 6514 + }, + { + "epoch": 0.5291155689109072, + "grad_norm": 5.815463605481333, + "learning_rate": 2.3859421577830276e-06, + "loss": 0.5979, + "step": 6515 + }, + { + "epoch": 0.5291967838869488, + "grad_norm": 7.2849496983935875, + "learning_rate": 2.385285224329009e-06, + "loss": 0.4522, + "step": 6516 + }, + { + "epoch": 0.5292779988629903, + "grad_norm": 6.703027017567743, + "learning_rate": 2.384628298812646e-06, + "loss": 0.4208, + "step": 6517 + }, + { + "epoch": 0.529359213839032, + "grad_norm": 4.068730154097392, + "learning_rate": 2.383971381279393e-06, + "loss": 0.649, + "step": 6518 + }, + { + "epoch": 0.5294404288150735, + "grad_norm": 3.9699040583000667, + "learning_rate": 2.383314471774707e-06, + "loss": 0.7164, + "step": 6519 + }, + { + "epoch": 0.5295216437911151, + "grad_norm": 11.734590811153446, + "learning_rate": 2.382657570344043e-06, + "loss": 0.5432, + "step": 6520 + }, + { + "epoch": 0.5296028587671566, + "grad_norm": 5.270660282246951, + "learning_rate": 2.382000677032854e-06, + "loss": 0.5122, + "step": 6521 + }, + { + "epoch": 0.5296840737431983, + "grad_norm": 4.145643825845797, + "learning_rate": 2.3813437918865925e-06, + "loss": 0.5513, + "step": 6522 + }, + { + "epoch": 0.5297652887192398, + "grad_norm": 5.695234292809091, + "learning_rate": 2.380686914950713e-06, + "loss": 0.5945, + "step": 6523 + }, + { + "epoch": 0.5298465036952814, + "grad_norm": 5.187560295102548, + "learning_rate": 2.380030046270668e-06, + "loss": 0.5533, + "step": 6524 + }, + { + "epoch": 0.529927718671323, + "grad_norm": 5.2980757209337215, + "learning_rate": 2.379373185891908e-06, + "loss": 0.5801, + "step": 6525 + }, + { + "epoch": 0.5300089336473646, + "grad_norm": 4.379144750425471, + "learning_rate": 2.3787163338598854e-06, + "loss": 0.6238, + "step": 6526 + }, + { + "epoch": 0.5300901486234062, + "grad_norm": 5.906641818249719, + "learning_rate": 2.3780594902200515e-06, + "loss": 0.5211, + "step": 6527 + }, + { + "epoch": 0.5301713635994477, + "grad_norm": 5.654605677438833, + "learning_rate": 2.377402655017854e-06, + "loss": 0.4426, + "step": 6528 + }, + { + "epoch": 0.5302525785754894, + "grad_norm": 9.600316549753119, + "learning_rate": 2.376745828298745e-06, + "loss": 0.4189, + "step": 6529 + }, + { + "epoch": 0.5303337935515309, + "grad_norm": 4.652507037908199, + "learning_rate": 2.376089010108172e-06, + "loss": 0.5635, + "step": 6530 + }, + { + "epoch": 0.5304150085275725, + "grad_norm": 6.872480563300781, + "learning_rate": 2.3754322004915837e-06, + "loss": 0.452, + "step": 6531 + }, + { + "epoch": 0.530496223503614, + "grad_norm": 6.133795123377434, + "learning_rate": 2.3747753994944283e-06, + "loss": 0.5212, + "step": 6532 + }, + { + "epoch": 0.5305774384796557, + "grad_norm": 5.688402297871484, + "learning_rate": 2.3741186071621523e-06, + "loss": 0.5803, + "step": 6533 + }, + { + "epoch": 0.5306586534556972, + "grad_norm": 4.147307899835006, + "learning_rate": 2.373461823540202e-06, + "loss": 0.4938, + "step": 6534 + }, + { + "epoch": 0.5307398684317388, + "grad_norm": 4.16411315568013, + "learning_rate": 2.3728050486740244e-06, + "loss": 0.4921, + "step": 6535 + }, + { + "epoch": 0.5308210834077804, + "grad_norm": 4.86479630808414, + "learning_rate": 2.3721482826090643e-06, + "loss": 0.5561, + "step": 6536 + }, + { + "epoch": 0.530902298383822, + "grad_norm": 4.549881463792877, + "learning_rate": 2.3714915253907657e-06, + "loss": 0.4727, + "step": 6537 + }, + { + "epoch": 0.5309835133598636, + "grad_norm": 3.1960496743376745, + "learning_rate": 2.370834777064574e-06, + "loss": 0.5856, + "step": 6538 + }, + { + "epoch": 0.5310647283359051, + "grad_norm": 6.183831960146142, + "learning_rate": 2.3701780376759323e-06, + "loss": 0.4186, + "step": 6539 + }, + { + "epoch": 0.5311459433119468, + "grad_norm": 6.1626582028588075, + "learning_rate": 2.3695213072702834e-06, + "loss": 0.4421, + "step": 6540 + }, + { + "epoch": 0.5312271582879883, + "grad_norm": 4.399409989452963, + "learning_rate": 2.368864585893069e-06, + "loss": 0.5147, + "step": 6541 + }, + { + "epoch": 0.5313083732640299, + "grad_norm": 4.73410977261382, + "learning_rate": 2.368207873589731e-06, + "loss": 0.56, + "step": 6542 + }, + { + "epoch": 0.5313895882400714, + "grad_norm": 4.575239688507527, + "learning_rate": 2.3675511704057115e-06, + "loss": 0.5333, + "step": 6543 + }, + { + "epoch": 0.5314708032161131, + "grad_norm": 7.035858437872166, + "learning_rate": 2.3668944763864486e-06, + "loss": 0.5508, + "step": 6544 + }, + { + "epoch": 0.5315520181921546, + "grad_norm": 5.241353250145363, + "learning_rate": 2.3662377915773845e-06, + "loss": 0.4836, + "step": 6545 + }, + { + "epoch": 0.5316332331681962, + "grad_norm": 7.382490118024742, + "learning_rate": 2.365581116023958e-06, + "loss": 0.4395, + "step": 6546 + }, + { + "epoch": 0.5317144481442379, + "grad_norm": 5.3401428892737455, + "learning_rate": 2.364924449771605e-06, + "loss": 0.427, + "step": 6547 + }, + { + "epoch": 0.5317956631202794, + "grad_norm": 3.7061230832301066, + "learning_rate": 2.364267792865767e-06, + "loss": 0.8927, + "step": 6548 + }, + { + "epoch": 0.531876878096321, + "grad_norm": 4.420036708733726, + "learning_rate": 2.363611145351879e-06, + "loss": 0.5167, + "step": 6549 + }, + { + "epoch": 0.5319580930723625, + "grad_norm": 4.783438045653566, + "learning_rate": 2.3629545072753767e-06, + "loss": 0.5279, + "step": 6550 + }, + { + "epoch": 0.5320393080484042, + "grad_norm": 4.749522841051883, + "learning_rate": 2.3622978786816984e-06, + "loss": 0.4024, + "step": 6551 + }, + { + "epoch": 0.5321205230244457, + "grad_norm": 7.7640179721652, + "learning_rate": 2.361641259616278e-06, + "loss": 0.5393, + "step": 6552 + }, + { + "epoch": 0.5322017380004873, + "grad_norm": 3.6547210717885865, + "learning_rate": 2.3609846501245494e-06, + "loss": 0.4325, + "step": 6553 + }, + { + "epoch": 0.5322829529765288, + "grad_norm": 7.3797843232620455, + "learning_rate": 2.3603280502519482e-06, + "loss": 0.5021, + "step": 6554 + }, + { + "epoch": 0.5323641679525705, + "grad_norm": 3.694146153232864, + "learning_rate": 2.3596714600439062e-06, + "loss": 0.5116, + "step": 6555 + }, + { + "epoch": 0.532445382928612, + "grad_norm": 5.0985701151843354, + "learning_rate": 2.3590148795458577e-06, + "loss": 0.4945, + "step": 6556 + }, + { + "epoch": 0.5325265979046536, + "grad_norm": 5.131669229194393, + "learning_rate": 2.3583583088032313e-06, + "loss": 0.5813, + "step": 6557 + }, + { + "epoch": 0.5326078128806953, + "grad_norm": 4.005833213164181, + "learning_rate": 2.3577017478614613e-06, + "loss": 0.5053, + "step": 6558 + }, + { + "epoch": 0.5326890278567368, + "grad_norm": 14.883116766398912, + "learning_rate": 2.357045196765978e-06, + "loss": 0.6758, + "step": 6559 + }, + { + "epoch": 0.5327702428327784, + "grad_norm": 4.9450151525987955, + "learning_rate": 2.3563886555622093e-06, + "loss": 0.561, + "step": 6560 + }, + { + "epoch": 0.5328514578088199, + "grad_norm": 9.194980096203347, + "learning_rate": 2.355732124295586e-06, + "loss": 0.5195, + "step": 6561 + }, + { + "epoch": 0.5329326727848616, + "grad_norm": 5.200787441007623, + "learning_rate": 2.3550756030115364e-06, + "loss": 0.6133, + "step": 6562 + }, + { + "epoch": 0.5330138877609031, + "grad_norm": 4.874445965778859, + "learning_rate": 2.3544190917554875e-06, + "loss": 0.456, + "step": 6563 + }, + { + "epoch": 0.5330951027369447, + "grad_norm": 4.734097340063761, + "learning_rate": 2.3537625905728677e-06, + "loss": 0.5303, + "step": 6564 + }, + { + "epoch": 0.5331763177129862, + "grad_norm": 5.317506693611895, + "learning_rate": 2.3531060995091026e-06, + "loss": 0.6195, + "step": 6565 + }, + { + "epoch": 0.5332575326890279, + "grad_norm": 5.085875518247843, + "learning_rate": 2.352449618609617e-06, + "loss": 0.6773, + "step": 6566 + }, + { + "epoch": 0.5333387476650694, + "grad_norm": 5.216473307627287, + "learning_rate": 2.3517931479198383e-06, + "loss": 0.4075, + "step": 6567 + }, + { + "epoch": 0.533419962641111, + "grad_norm": 4.73626486723172, + "learning_rate": 2.3511366874851885e-06, + "loss": 0.5928, + "step": 6568 + }, + { + "epoch": 0.5335011776171527, + "grad_norm": 4.068709628111098, + "learning_rate": 2.350480237351092e-06, + "loss": 0.5746, + "step": 6569 + }, + { + "epoch": 0.5335823925931942, + "grad_norm": 7.162771272344813, + "learning_rate": 2.3498237975629726e-06, + "loss": 0.6726, + "step": 6570 + }, + { + "epoch": 0.5336636075692358, + "grad_norm": 8.523429545622452, + "learning_rate": 2.349167368166251e-06, + "loss": 0.4304, + "step": 6571 + }, + { + "epoch": 0.5337448225452773, + "grad_norm": 5.274081883984625, + "learning_rate": 2.348510949206349e-06, + "loss": 0.5222, + "step": 6572 + }, + { + "epoch": 0.533826037521319, + "grad_norm": 3.8799868155561406, + "learning_rate": 2.3478545407286883e-06, + "loss": 0.6353, + "step": 6573 + }, + { + "epoch": 0.5339072524973605, + "grad_norm": 3.6165853886009844, + "learning_rate": 2.3471981427786875e-06, + "loss": 0.5822, + "step": 6574 + }, + { + "epoch": 0.5339884674734021, + "grad_norm": 6.353757597164967, + "learning_rate": 2.3465417554017675e-06, + "loss": 0.4707, + "step": 6575 + }, + { + "epoch": 0.5340696824494436, + "grad_norm": 5.10414117668738, + "learning_rate": 2.3458853786433444e-06, + "loss": 0.5357, + "step": 6576 + }, + { + "epoch": 0.5341508974254853, + "grad_norm": 3.3925342392484317, + "learning_rate": 2.345229012548838e-06, + "loss": 0.6855, + "step": 6577 + }, + { + "epoch": 0.5342321124015268, + "grad_norm": 3.3042399836502776, + "learning_rate": 2.3445726571636656e-06, + "loss": 0.4581, + "step": 6578 + }, + { + "epoch": 0.5343133273775684, + "grad_norm": 4.792803765102656, + "learning_rate": 2.3439163125332415e-06, + "loss": 0.4442, + "step": 6579 + }, + { + "epoch": 0.5343945423536101, + "grad_norm": 53.09136619996998, + "learning_rate": 2.343259978702984e-06, + "loss": 0.5999, + "step": 6580 + }, + { + "epoch": 0.5344757573296516, + "grad_norm": 4.366575308474977, + "learning_rate": 2.3426036557183056e-06, + "loss": 0.55, + "step": 6581 + }, + { + "epoch": 0.5345569723056932, + "grad_norm": 4.219747570498485, + "learning_rate": 2.3419473436246206e-06, + "loss": 0.6209, + "step": 6582 + }, + { + "epoch": 0.5346381872817347, + "grad_norm": 5.505458655811916, + "learning_rate": 2.341291042467344e-06, + "loss": 0.5682, + "step": 6583 + }, + { + "epoch": 0.5347194022577764, + "grad_norm": 5.438215231835009, + "learning_rate": 2.3406347522918866e-06, + "loss": 0.4587, + "step": 6584 + }, + { + "epoch": 0.5348006172338179, + "grad_norm": 9.221582577454688, + "learning_rate": 2.339978473143661e-06, + "loss": 0.6507, + "step": 6585 + }, + { + "epoch": 0.5348818322098595, + "grad_norm": 6.264732038581897, + "learning_rate": 2.3393222050680788e-06, + "loss": 0.5201, + "step": 6586 + }, + { + "epoch": 0.534963047185901, + "grad_norm": 5.801333658841605, + "learning_rate": 2.338665948110549e-06, + "loss": 0.5507, + "step": 6587 + }, + { + "epoch": 0.5350442621619427, + "grad_norm": 6.810336490229261, + "learning_rate": 2.3380097023164813e-06, + "loss": 0.67, + "step": 6588 + }, + { + "epoch": 0.5351254771379842, + "grad_norm": 5.781581076596003, + "learning_rate": 2.337353467731286e-06, + "loss": 0.391, + "step": 6589 + }, + { + "epoch": 0.5352066921140258, + "grad_norm": 4.370970902980655, + "learning_rate": 2.3366972444003698e-06, + "loss": 0.5351, + "step": 6590 + }, + { + "epoch": 0.5352879070900675, + "grad_norm": 12.469579377782821, + "learning_rate": 2.3360410323691386e-06, + "loss": 0.4929, + "step": 6591 + }, + { + "epoch": 0.535369122066109, + "grad_norm": 10.008332152388622, + "learning_rate": 2.335384831683002e-06, + "loss": 0.4752, + "step": 6592 + }, + { + "epoch": 0.5354503370421506, + "grad_norm": 9.797876571328075, + "learning_rate": 2.334728642387363e-06, + "loss": 0.4668, + "step": 6593 + }, + { + "epoch": 0.5355315520181921, + "grad_norm": 5.754796284833849, + "learning_rate": 2.334072464527628e-06, + "loss": 0.3572, + "step": 6594 + }, + { + "epoch": 0.5356127669942338, + "grad_norm": 4.316813249696164, + "learning_rate": 2.333416298149199e-06, + "loss": 0.4839, + "step": 6595 + }, + { + "epoch": 0.5356939819702753, + "grad_norm": 4.974375179617703, + "learning_rate": 2.3327601432974817e-06, + "loss": 0.3869, + "step": 6596 + }, + { + "epoch": 0.5357751969463169, + "grad_norm": 3.985538587500043, + "learning_rate": 2.332104000017877e-06, + "loss": 0.4175, + "step": 6597 + }, + { + "epoch": 0.5358564119223584, + "grad_norm": 3.2055836403047064, + "learning_rate": 2.3314478683557863e-06, + "loss": 0.5612, + "step": 6598 + }, + { + "epoch": 0.5359376268984001, + "grad_norm": 5.214849251275779, + "learning_rate": 2.330791748356612e-06, + "loss": 0.6605, + "step": 6599 + }, + { + "epoch": 0.5360188418744416, + "grad_norm": 5.888955286356365, + "learning_rate": 2.3301356400657527e-06, + "loss": 0.4432, + "step": 6600 + }, + { + "epoch": 0.5361000568504832, + "grad_norm": 4.505062527177114, + "learning_rate": 2.3294795435286073e-06, + "loss": 0.6324, + "step": 6601 + }, + { + "epoch": 0.5361812718265249, + "grad_norm": 79.62251681114921, + "learning_rate": 2.3288234587905767e-06, + "loss": 0.4921, + "step": 6602 + }, + { + "epoch": 0.5362624868025664, + "grad_norm": 7.158021669325846, + "learning_rate": 2.328167385897056e-06, + "loss": 0.4798, + "step": 6603 + }, + { + "epoch": 0.536343701778608, + "grad_norm": 5.336534392507409, + "learning_rate": 2.327511324893442e-06, + "loss": 0.6895, + "step": 6604 + }, + { + "epoch": 0.5364249167546495, + "grad_norm": 5.969891236348079, + "learning_rate": 2.3268552758251327e-06, + "loss": 0.5348, + "step": 6605 + }, + { + "epoch": 0.5365061317306912, + "grad_norm": 5.339767902188447, + "learning_rate": 2.3261992387375216e-06, + "loss": 0.4408, + "step": 6606 + }, + { + "epoch": 0.5365873467067327, + "grad_norm": 6.745067120318976, + "learning_rate": 2.3255432136760026e-06, + "loss": 0.5484, + "step": 6607 + }, + { + "epoch": 0.5366685616827743, + "grad_norm": 6.071978645783486, + "learning_rate": 2.324887200685971e-06, + "loss": 0.5867, + "step": 6608 + }, + { + "epoch": 0.5367497766588158, + "grad_norm": 6.2394195972445665, + "learning_rate": 2.3242311998128182e-06, + "loss": 0.5295, + "step": 6609 + }, + { + "epoch": 0.5368309916348575, + "grad_norm": 5.667495576556501, + "learning_rate": 2.3235752111019362e-06, + "loss": 0.4169, + "step": 6610 + }, + { + "epoch": 0.536912206610899, + "grad_norm": 7.823798452635792, + "learning_rate": 2.3229192345987146e-06, + "loss": 0.6219, + "step": 6611 + }, + { + "epoch": 0.5369934215869406, + "grad_norm": 6.983866314486278, + "learning_rate": 2.322263270348546e-06, + "loss": 0.5665, + "step": 6612 + }, + { + "epoch": 0.5370746365629823, + "grad_norm": 6.526144378674829, + "learning_rate": 2.3216073183968184e-06, + "loss": 0.3971, + "step": 6613 + }, + { + "epoch": 0.5371558515390238, + "grad_norm": 10.243690491824887, + "learning_rate": 2.320951378788919e-06, + "loss": 0.6378, + "step": 6614 + }, + { + "epoch": 0.5372370665150654, + "grad_norm": 4.954484713664851, + "learning_rate": 2.3202954515702384e-06, + "loss": 0.548, + "step": 6615 + }, + { + "epoch": 0.5373182814911069, + "grad_norm": 5.184321630416869, + "learning_rate": 2.3196395367861605e-06, + "loss": 0.5172, + "step": 6616 + }, + { + "epoch": 0.5373994964671486, + "grad_norm": 5.640278703192089, + "learning_rate": 2.3189836344820717e-06, + "loss": 0.4252, + "step": 6617 + }, + { + "epoch": 0.5374807114431901, + "grad_norm": 6.756103044876508, + "learning_rate": 2.318327744703358e-06, + "loss": 0.5939, + "step": 6618 + }, + { + "epoch": 0.5375619264192317, + "grad_norm": 4.587869437489592, + "learning_rate": 2.317671867495403e-06, + "loss": 0.5293, + "step": 6619 + }, + { + "epoch": 0.5376431413952732, + "grad_norm": 5.744181498371175, + "learning_rate": 2.317016002903589e-06, + "loss": 0.4909, + "step": 6620 + }, + { + "epoch": 0.5377243563713149, + "grad_norm": 5.53015069217161, + "learning_rate": 2.3163601509733e-06, + "loss": 0.52, + "step": 6621 + }, + { + "epoch": 0.5378055713473564, + "grad_norm": 6.7221739959300555, + "learning_rate": 2.3157043117499174e-06, + "loss": 0.4972, + "step": 6622 + }, + { + "epoch": 0.537886786323398, + "grad_norm": 4.038968186465272, + "learning_rate": 2.3150484852788186e-06, + "loss": 0.4686, + "step": 6623 + }, + { + "epoch": 0.5379680012994397, + "grad_norm": 5.334775479299188, + "learning_rate": 2.3143926716053876e-06, + "loss": 0.4335, + "step": 6624 + }, + { + "epoch": 0.5380492162754812, + "grad_norm": 8.59183543983034, + "learning_rate": 2.3137368707750018e-06, + "loss": 0.6309, + "step": 6625 + }, + { + "epoch": 0.5381304312515228, + "grad_norm": 4.578718128339019, + "learning_rate": 2.3130810828330375e-06, + "loss": 0.6889, + "step": 6626 + }, + { + "epoch": 0.5382116462275643, + "grad_norm": 7.819423595541969, + "learning_rate": 2.3124253078248734e-06, + "loss": 0.43, + "step": 6627 + }, + { + "epoch": 0.538292861203606, + "grad_norm": 3.229990826836947, + "learning_rate": 2.3117695457958857e-06, + "loss": 0.7549, + "step": 6628 + }, + { + "epoch": 0.5383740761796475, + "grad_norm": 4.562845996906024, + "learning_rate": 2.3111137967914492e-06, + "loss": 0.434, + "step": 6629 + }, + { + "epoch": 0.5384552911556891, + "grad_norm": 5.915615501905179, + "learning_rate": 2.310458060856937e-06, + "loss": 0.3945, + "step": 6630 + }, + { + "epoch": 0.5385365061317307, + "grad_norm": 10.277625736537166, + "learning_rate": 2.3098023380377257e-06, + "loss": 0.542, + "step": 6631 + }, + { + "epoch": 0.5386177211077723, + "grad_norm": 3.9982941253903843, + "learning_rate": 2.309146628379185e-06, + "loss": 0.4956, + "step": 6632 + }, + { + "epoch": 0.5386989360838138, + "grad_norm": 5.587736755912178, + "learning_rate": 2.308490931926687e-06, + "loss": 0.3932, + "step": 6633 + }, + { + "epoch": 0.5387801510598554, + "grad_norm": 6.249822940854045, + "learning_rate": 2.3078352487256045e-06, + "loss": 0.551, + "step": 6634 + }, + { + "epoch": 0.5388613660358971, + "grad_norm": 4.330260149960894, + "learning_rate": 2.3071795788213047e-06, + "loss": 0.419, + "step": 6635 + }, + { + "epoch": 0.5389425810119386, + "grad_norm": 3.9213426854190554, + "learning_rate": 2.3065239222591574e-06, + "loss": 0.912, + "step": 6636 + }, + { + "epoch": 0.5390237959879802, + "grad_norm": 5.204419720367973, + "learning_rate": 2.3058682790845314e-06, + "loss": 0.582, + "step": 6637 + }, + { + "epoch": 0.5391050109640217, + "grad_norm": 6.739029726775385, + "learning_rate": 2.3052126493427934e-06, + "loss": 0.6209, + "step": 6638 + }, + { + "epoch": 0.5391862259400634, + "grad_norm": 4.96555228670431, + "learning_rate": 2.304557033079308e-06, + "loss": 0.431, + "step": 6639 + }, + { + "epoch": 0.5392674409161049, + "grad_norm": 6.401695121326637, + "learning_rate": 2.303901430339442e-06, + "loss": 0.3933, + "step": 6640 + }, + { + "epoch": 0.5393486558921465, + "grad_norm": 6.539736744319487, + "learning_rate": 2.30324584116856e-06, + "loss": 0.5304, + "step": 6641 + }, + { + "epoch": 0.539429870868188, + "grad_norm": 4.494895959996267, + "learning_rate": 2.302590265612023e-06, + "loss": 0.4851, + "step": 6642 + }, + { + "epoch": 0.5395110858442297, + "grad_norm": 5.068900846135241, + "learning_rate": 2.301934703715196e-06, + "loss": 0.542, + "step": 6643 + }, + { + "epoch": 0.5395923008202712, + "grad_norm": 3.670185354744509, + "learning_rate": 2.301279155523439e-06, + "loss": 0.4716, + "step": 6644 + }, + { + "epoch": 0.5396735157963128, + "grad_norm": 3.9777847287273436, + "learning_rate": 2.3006236210821127e-06, + "loss": 0.4095, + "step": 6645 + }, + { + "epoch": 0.5397547307723545, + "grad_norm": 8.621397414880517, + "learning_rate": 2.2999681004365755e-06, + "loss": 0.4291, + "step": 6646 + }, + { + "epoch": 0.539835945748396, + "grad_norm": 4.348209634682363, + "learning_rate": 2.299312593632189e-06, + "loss": 0.5603, + "step": 6647 + }, + { + "epoch": 0.5399171607244376, + "grad_norm": 5.453003219785878, + "learning_rate": 2.298657100714308e-06, + "loss": 0.4495, + "step": 6648 + }, + { + "epoch": 0.5399983757004791, + "grad_norm": 6.222935801462418, + "learning_rate": 2.2980016217282892e-06, + "loss": 0.5602, + "step": 6649 + }, + { + "epoch": 0.5400795906765208, + "grad_norm": 6.456779266203264, + "learning_rate": 2.2973461567194903e-06, + "loss": 0.6055, + "step": 6650 + }, + { + "epoch": 0.5401608056525623, + "grad_norm": 2.812177928800366, + "learning_rate": 2.296690705733265e-06, + "loss": 0.5931, + "step": 6651 + }, + { + "epoch": 0.5402420206286039, + "grad_norm": 5.628644054902416, + "learning_rate": 2.2960352688149657e-06, + "loss": 0.5323, + "step": 6652 + }, + { + "epoch": 0.5403232356046455, + "grad_norm": 6.158902153206404, + "learning_rate": 2.295379846009947e-06, + "loss": 0.6064, + "step": 6653 + }, + { + "epoch": 0.5404044505806871, + "grad_norm": 4.596838238746014, + "learning_rate": 2.2947244373635608e-06, + "loss": 0.5383, + "step": 6654 + }, + { + "epoch": 0.5404856655567286, + "grad_norm": 10.141289999607887, + "learning_rate": 2.294069042921156e-06, + "loss": 0.4802, + "step": 6655 + }, + { + "epoch": 0.5405668805327702, + "grad_norm": 3.56367135718309, + "learning_rate": 2.2934136627280834e-06, + "loss": 0.5922, + "step": 6656 + }, + { + "epoch": 0.5406480955088119, + "grad_norm": 4.957818406371747, + "learning_rate": 2.292758296829693e-06, + "loss": 0.7018, + "step": 6657 + }, + { + "epoch": 0.5407293104848534, + "grad_norm": 4.552129764687744, + "learning_rate": 2.2921029452713305e-06, + "loss": 0.5328, + "step": 6658 + }, + { + "epoch": 0.540810525460895, + "grad_norm": 5.962108877745079, + "learning_rate": 2.291447608098345e-06, + "loss": 0.5459, + "step": 6659 + }, + { + "epoch": 0.5408917404369366, + "grad_norm": 3.831361014930282, + "learning_rate": 2.290792285356081e-06, + "loss": 0.445, + "step": 6660 + }, + { + "epoch": 0.5409729554129782, + "grad_norm": 4.039508905590606, + "learning_rate": 2.290136977089883e-06, + "loss": 0.6447, + "step": 6661 + }, + { + "epoch": 0.5410541703890197, + "grad_norm": 4.701839781470267, + "learning_rate": 2.289481683345096e-06, + "loss": 0.4856, + "step": 6662 + }, + { + "epoch": 0.5411353853650613, + "grad_norm": 5.28893125359557, + "learning_rate": 2.2888264041670625e-06, + "loss": 0.3886, + "step": 6663 + }, + { + "epoch": 0.5412166003411029, + "grad_norm": 4.226127643770201, + "learning_rate": 2.288171139601124e-06, + "loss": 0.451, + "step": 6664 + }, + { + "epoch": 0.5412978153171445, + "grad_norm": 4.150106558822198, + "learning_rate": 2.287515889692621e-06, + "loss": 0.585, + "step": 6665 + }, + { + "epoch": 0.541379030293186, + "grad_norm": 7.149187546807345, + "learning_rate": 2.2868606544868947e-06, + "loss": 0.5057, + "step": 6666 + }, + { + "epoch": 0.5414602452692276, + "grad_norm": 4.941704759283249, + "learning_rate": 2.2862054340292835e-06, + "loss": 0.5628, + "step": 6667 + }, + { + "epoch": 0.5415414602452693, + "grad_norm": 6.843536039362226, + "learning_rate": 2.2855502283651238e-06, + "loss": 0.5538, + "step": 6668 + }, + { + "epoch": 0.5416226752213108, + "grad_norm": 6.747892070688352, + "learning_rate": 2.284895037539753e-06, + "loss": 0.5538, + "step": 6669 + }, + { + "epoch": 0.5417038901973524, + "grad_norm": 4.985721031141658, + "learning_rate": 2.2842398615985086e-06, + "loss": 0.5889, + "step": 6670 + }, + { + "epoch": 0.541785105173394, + "grad_norm": 3.9696526712453686, + "learning_rate": 2.283584700586723e-06, + "loss": 0.4235, + "step": 6671 + }, + { + "epoch": 0.5418663201494356, + "grad_norm": 4.370310826153604, + "learning_rate": 2.2829295545497304e-06, + "loss": 0.565, + "step": 6672 + }, + { + "epoch": 0.5419475351254771, + "grad_norm": 7.303076408805493, + "learning_rate": 2.282274423532865e-06, + "loss": 0.4592, + "step": 6673 + }, + { + "epoch": 0.5420287501015187, + "grad_norm": 5.82847333716156, + "learning_rate": 2.2816193075814557e-06, + "loss": 0.519, + "step": 6674 + }, + { + "epoch": 0.5421099650775603, + "grad_norm": 3.1507733594386704, + "learning_rate": 2.280964206740835e-06, + "loss": 0.5885, + "step": 6675 + }, + { + "epoch": 0.5421911800536019, + "grad_norm": 5.983802017759874, + "learning_rate": 2.280309121056333e-06, + "loss": 0.4888, + "step": 6676 + }, + { + "epoch": 0.5422723950296434, + "grad_norm": 3.19378590139564, + "learning_rate": 2.279654050573276e-06, + "loss": 0.4794, + "step": 6677 + }, + { + "epoch": 0.542353610005685, + "grad_norm": 4.270799718455597, + "learning_rate": 2.2789989953369924e-06, + "loss": 0.5408, + "step": 6678 + }, + { + "epoch": 0.5424348249817267, + "grad_norm": 5.277701316589272, + "learning_rate": 2.27834395539281e-06, + "loss": 0.4466, + "step": 6679 + }, + { + "epoch": 0.5425160399577682, + "grad_norm": 5.962388380985894, + "learning_rate": 2.2776889307860513e-06, + "loss": 0.4871, + "step": 6680 + }, + { + "epoch": 0.5425972549338098, + "grad_norm": 4.937531617716085, + "learning_rate": 2.2770339215620433e-06, + "loss": 0.4022, + "step": 6681 + }, + { + "epoch": 0.5426784699098514, + "grad_norm": 4.989377028856358, + "learning_rate": 2.2763789277661077e-06, + "loss": 0.4676, + "step": 6682 + }, + { + "epoch": 0.542759684885893, + "grad_norm": 3.814875085144808, + "learning_rate": 2.2757239494435666e-06, + "loss": 0.4234, + "step": 6683 + }, + { + "epoch": 0.5428408998619345, + "grad_norm": 6.832068792784659, + "learning_rate": 2.2750689866397407e-06, + "loss": 0.6319, + "step": 6684 + }, + { + "epoch": 0.5429221148379761, + "grad_norm": 26.304115965543602, + "learning_rate": 2.2744140393999507e-06, + "loss": 0.3713, + "step": 6685 + }, + { + "epoch": 0.5430033298140177, + "grad_norm": 7.2769279119321135, + "learning_rate": 2.273759107769516e-06, + "loss": 0.6063, + "step": 6686 + }, + { + "epoch": 0.5430845447900593, + "grad_norm": 4.819897180382021, + "learning_rate": 2.2731041917937524e-06, + "loss": 0.5373, + "step": 6687 + }, + { + "epoch": 0.5431657597661008, + "grad_norm": 5.898798059857416, + "learning_rate": 2.2724492915179787e-06, + "loss": 0.5572, + "step": 6688 + }, + { + "epoch": 0.5432469747421425, + "grad_norm": 4.441345383933851, + "learning_rate": 2.27179440698751e-06, + "loss": 0.6711, + "step": 6689 + }, + { + "epoch": 0.5433281897181841, + "grad_norm": 3.8157128193761185, + "learning_rate": 2.2711395382476595e-06, + "loss": 0.6712, + "step": 6690 + }, + { + "epoch": 0.5434094046942256, + "grad_norm": 4.388902512240195, + "learning_rate": 2.2704846853437424e-06, + "loss": 0.5042, + "step": 6691 + }, + { + "epoch": 0.5434906196702672, + "grad_norm": 7.873275069114776, + "learning_rate": 2.269829848321071e-06, + "loss": 0.4374, + "step": 6692 + }, + { + "epoch": 0.5435718346463088, + "grad_norm": 5.506546269665942, + "learning_rate": 2.2691750272249545e-06, + "loss": 0.3936, + "step": 6693 + }, + { + "epoch": 0.5436530496223504, + "grad_norm": 4.315922170242298, + "learning_rate": 2.2685202221007057e-06, + "loss": 0.66, + "step": 6694 + }, + { + "epoch": 0.5437342645983919, + "grad_norm": 41.128866529656484, + "learning_rate": 2.2678654329936322e-06, + "loss": 0.4566, + "step": 6695 + }, + { + "epoch": 0.5438154795744335, + "grad_norm": 4.547047318098447, + "learning_rate": 2.267210659949042e-06, + "loss": 0.5562, + "step": 6696 + }, + { + "epoch": 0.5438966945504751, + "grad_norm": 5.996426610029105, + "learning_rate": 2.2665559030122424e-06, + "loss": 0.4198, + "step": 6697 + }, + { + "epoch": 0.5439779095265167, + "grad_norm": 4.366420114566318, + "learning_rate": 2.2659011622285383e-06, + "loss": 0.468, + "step": 6698 + }, + { + "epoch": 0.5440591245025582, + "grad_norm": 4.742561220461673, + "learning_rate": 2.265246437643236e-06, + "loss": 0.4206, + "step": 6699 + }, + { + "epoch": 0.5441403394785999, + "grad_norm": 5.981908620139316, + "learning_rate": 2.2645917293016363e-06, + "loss": 0.6577, + "step": 6700 + }, + { + "epoch": 0.5442215544546415, + "grad_norm": 3.9854122807772723, + "learning_rate": 2.2639370372490434e-06, + "loss": 0.442, + "step": 6701 + }, + { + "epoch": 0.544302769430683, + "grad_norm": 4.317869994988899, + "learning_rate": 2.263282361530759e-06, + "loss": 0.6317, + "step": 6702 + }, + { + "epoch": 0.5443839844067246, + "grad_norm": 3.2344627144715874, + "learning_rate": 2.2626277021920813e-06, + "loss": 0.4615, + "step": 6703 + }, + { + "epoch": 0.5444651993827662, + "grad_norm": 6.42718836422913, + "learning_rate": 2.2619730592783108e-06, + "loss": 0.5471, + "step": 6704 + }, + { + "epoch": 0.5445464143588078, + "grad_norm": 4.488283000655936, + "learning_rate": 2.2613184328347453e-06, + "loss": 0.4092, + "step": 6705 + }, + { + "epoch": 0.5446276293348493, + "grad_norm": 6.305967356057076, + "learning_rate": 2.2606638229066802e-06, + "loss": 0.6374, + "step": 6706 + }, + { + "epoch": 0.544708844310891, + "grad_norm": 4.2258432165081565, + "learning_rate": 2.2600092295394125e-06, + "loss": 0.5745, + "step": 6707 + }, + { + "epoch": 0.5447900592869325, + "grad_norm": 4.384272018008662, + "learning_rate": 2.2593546527782362e-06, + "loss": 0.433, + "step": 6708 + }, + { + "epoch": 0.5448712742629741, + "grad_norm": 6.576525934865906, + "learning_rate": 2.2587000926684432e-06, + "loss": 0.5326, + "step": 6709 + }, + { + "epoch": 0.5449524892390156, + "grad_norm": 5.555291917382588, + "learning_rate": 2.258045549255328e-06, + "loss": 0.4881, + "step": 6710 + }, + { + "epoch": 0.5450337042150573, + "grad_norm": 5.546197578764206, + "learning_rate": 2.25739102258418e-06, + "loss": 0.479, + "step": 6711 + }, + { + "epoch": 0.5451149191910989, + "grad_norm": 6.1439652570444325, + "learning_rate": 2.256736512700288e-06, + "loss": 0.4543, + "step": 6712 + }, + { + "epoch": 0.5451961341671404, + "grad_norm": 5.653213757078279, + "learning_rate": 2.2560820196489437e-06, + "loss": 0.5768, + "step": 6713 + }, + { + "epoch": 0.545277349143182, + "grad_norm": 4.21691379783433, + "learning_rate": 2.255427543475432e-06, + "loss": 0.4927, + "step": 6714 + }, + { + "epoch": 0.5453585641192236, + "grad_norm": 6.798777933865642, + "learning_rate": 2.254773084225039e-06, + "loss": 0.4237, + "step": 6715 + }, + { + "epoch": 0.5454397790952652, + "grad_norm": 4.819909494343674, + "learning_rate": 2.254118641943052e-06, + "loss": 0.4288, + "step": 6716 + }, + { + "epoch": 0.5455209940713067, + "grad_norm": 7.439931633022933, + "learning_rate": 2.253464216674753e-06, + "loss": 0.482, + "step": 6717 + }, + { + "epoch": 0.5456022090473484, + "grad_norm": 7.792402778020558, + "learning_rate": 2.2528098084654262e-06, + "loss": 0.614, + "step": 6718 + }, + { + "epoch": 0.5456834240233899, + "grad_norm": 4.495435217507245, + "learning_rate": 2.2521554173603513e-06, + "loss": 0.4562, + "step": 6719 + }, + { + "epoch": 0.5457646389994315, + "grad_norm": 4.419124752755961, + "learning_rate": 2.25150104340481e-06, + "loss": 0.5417, + "step": 6720 + }, + { + "epoch": 0.545845853975473, + "grad_norm": 6.332131149232858, + "learning_rate": 2.2508466866440824e-06, + "loss": 0.4154, + "step": 6721 + }, + { + "epoch": 0.5459270689515147, + "grad_norm": 6.610876686409083, + "learning_rate": 2.2501923471234444e-06, + "loss": 0.49, + "step": 6722 + }, + { + "epoch": 0.5460082839275563, + "grad_norm": 17.017885967526706, + "learning_rate": 2.249538024888174e-06, + "loss": 0.4733, + "step": 6723 + }, + { + "epoch": 0.5460894989035978, + "grad_norm": 7.28235162312456, + "learning_rate": 2.2488837199835477e-06, + "loss": 0.6985, + "step": 6724 + }, + { + "epoch": 0.5461707138796394, + "grad_norm": 6.165256861279543, + "learning_rate": 2.2482294324548376e-06, + "loss": 0.5424, + "step": 6725 + }, + { + "epoch": 0.546251928855681, + "grad_norm": 7.302972322656833, + "learning_rate": 2.2475751623473193e-06, + "loss": 0.6049, + "step": 6726 + }, + { + "epoch": 0.5463331438317226, + "grad_norm": 4.71990096240607, + "learning_rate": 2.2469209097062637e-06, + "loss": 0.4342, + "step": 6727 + }, + { + "epoch": 0.5464143588077641, + "grad_norm": 6.900667478651783, + "learning_rate": 2.246266674576941e-06, + "loss": 0.4572, + "step": 6728 + }, + { + "epoch": 0.5464955737838058, + "grad_norm": 2.982802060269296, + "learning_rate": 2.245612457004622e-06, + "loss": 0.4723, + "step": 6729 + }, + { + "epoch": 0.5465767887598473, + "grad_norm": 4.176653744332102, + "learning_rate": 2.244958257034575e-06, + "loss": 0.4191, + "step": 6730 + }, + { + "epoch": 0.5466580037358889, + "grad_norm": 4.511011865460297, + "learning_rate": 2.244304074712066e-06, + "loss": 0.4973, + "step": 6731 + }, + { + "epoch": 0.5467392187119304, + "grad_norm": 4.202369107086116, + "learning_rate": 2.243649910082363e-06, + "loss": 0.6125, + "step": 6732 + }, + { + "epoch": 0.5468204336879721, + "grad_norm": 7.339054515036435, + "learning_rate": 2.2429957631907285e-06, + "loss": 0.444, + "step": 6733 + }, + { + "epoch": 0.5469016486640137, + "grad_norm": 4.854339006065371, + "learning_rate": 2.2423416340824266e-06, + "loss": 0.4205, + "step": 6734 + }, + { + "epoch": 0.5469828636400552, + "grad_norm": 5.693035824826316, + "learning_rate": 2.241687522802721e-06, + "loss": 0.5089, + "step": 6735 + }, + { + "epoch": 0.5470640786160968, + "grad_norm": 4.855926081415383, + "learning_rate": 2.2410334293968716e-06, + "loss": 0.43, + "step": 6736 + }, + { + "epoch": 0.5471452935921384, + "grad_norm": 4.266917082773724, + "learning_rate": 2.2403793539101387e-06, + "loss": 0.4145, + "step": 6737 + }, + { + "epoch": 0.54722650856818, + "grad_norm": 5.411690627496338, + "learning_rate": 2.2397252963877795e-06, + "loss": 0.5073, + "step": 6738 + }, + { + "epoch": 0.5473077235442215, + "grad_norm": 2.6098814011197096, + "learning_rate": 2.239071256875053e-06, + "loss": 0.5423, + "step": 6739 + }, + { + "epoch": 0.5473889385202632, + "grad_norm": 5.3581580919616885, + "learning_rate": 2.238417235417214e-06, + "loss": 0.4069, + "step": 6740 + }, + { + "epoch": 0.5474701534963047, + "grad_norm": 4.544117591382317, + "learning_rate": 2.237763232059518e-06, + "loss": 0.4756, + "step": 6741 + }, + { + "epoch": 0.5475513684723463, + "grad_norm": 18.0511878541294, + "learning_rate": 2.2371092468472193e-06, + "loss": 0.5058, + "step": 6742 + }, + { + "epoch": 0.5476325834483878, + "grad_norm": 6.268300319177437, + "learning_rate": 2.236455279825569e-06, + "loss": 0.4712, + "step": 6743 + }, + { + "epoch": 0.5477137984244295, + "grad_norm": 3.663452778757879, + "learning_rate": 2.2358013310398174e-06, + "loss": 0.5031, + "step": 6744 + }, + { + "epoch": 0.5477950134004711, + "grad_norm": 4.936451070405641, + "learning_rate": 2.235147400535217e-06, + "loss": 0.4914, + "step": 6745 + }, + { + "epoch": 0.5478762283765126, + "grad_norm": 5.976409510815185, + "learning_rate": 2.2344934883570143e-06, + "loss": 0.5417, + "step": 6746 + }, + { + "epoch": 0.5479574433525543, + "grad_norm": 7.332839588724573, + "learning_rate": 2.2338395945504557e-06, + "loss": 0.525, + "step": 6747 + }, + { + "epoch": 0.5480386583285958, + "grad_norm": 5.026062475739817, + "learning_rate": 2.23318571916079e-06, + "loss": 0.6906, + "step": 6748 + }, + { + "epoch": 0.5481198733046374, + "grad_norm": 5.070833605866744, + "learning_rate": 2.2325318622332606e-06, + "loss": 0.4563, + "step": 6749 + }, + { + "epoch": 0.5482010882806789, + "grad_norm": 4.9673547801455955, + "learning_rate": 2.2318780238131095e-06, + "loss": 0.5338, + "step": 6750 + }, + { + "epoch": 0.5482823032567206, + "grad_norm": 3.9729119791581744, + "learning_rate": 2.2312242039455816e-06, + "loss": 0.503, + "step": 6751 + }, + { + "epoch": 0.5483635182327621, + "grad_norm": 3.9917626185437878, + "learning_rate": 2.230570402675916e-06, + "loss": 0.3857, + "step": 6752 + }, + { + "epoch": 0.5484447332088037, + "grad_norm": 4.014300920446785, + "learning_rate": 2.2299166200493526e-06, + "loss": 0.4657, + "step": 6753 + }, + { + "epoch": 0.5485259481848452, + "grad_norm": 5.148556943279063, + "learning_rate": 2.2292628561111285e-06, + "loss": 0.4921, + "step": 6754 + }, + { + "epoch": 0.5486071631608869, + "grad_norm": 3.2886345422623715, + "learning_rate": 2.228609110906483e-06, + "loss": 0.4815, + "step": 6755 + }, + { + "epoch": 0.5486883781369285, + "grad_norm": 9.805600602297103, + "learning_rate": 2.2279553844806506e-06, + "loss": 0.5324, + "step": 6756 + }, + { + "epoch": 0.54876959311297, + "grad_norm": 7.9338566902011385, + "learning_rate": 2.2273016768788653e-06, + "loss": 0.4963, + "step": 6757 + }, + { + "epoch": 0.5488508080890117, + "grad_norm": 4.352389013230028, + "learning_rate": 2.2266479881463614e-06, + "loss": 0.4975, + "step": 6758 + }, + { + "epoch": 0.5489320230650532, + "grad_norm": 5.035317373817535, + "learning_rate": 2.2259943183283696e-06, + "loss": 0.5316, + "step": 6759 + }, + { + "epoch": 0.5490132380410948, + "grad_norm": 6.329671082512638, + "learning_rate": 2.2253406674701206e-06, + "loss": 0.4758, + "step": 6760 + }, + { + "epoch": 0.5490944530171363, + "grad_norm": 4.350812102992751, + "learning_rate": 2.2246870356168447e-06, + "loss": 0.3731, + "step": 6761 + }, + { + "epoch": 0.549175667993178, + "grad_norm": 4.411297111261324, + "learning_rate": 2.224033422813768e-06, + "loss": 0.4601, + "step": 6762 + }, + { + "epoch": 0.5492568829692195, + "grad_norm": 3.3546196364460172, + "learning_rate": 2.2233798291061177e-06, + "loss": 0.5048, + "step": 6763 + }, + { + "epoch": 0.5493380979452611, + "grad_norm": 5.75969034058185, + "learning_rate": 2.2227262545391204e-06, + "loss": 0.5135, + "step": 6764 + }, + { + "epoch": 0.5494193129213026, + "grad_norm": 4.676293581342606, + "learning_rate": 2.222072699157998e-06, + "loss": 0.382, + "step": 6765 + }, + { + "epoch": 0.5495005278973443, + "grad_norm": 4.301771743349808, + "learning_rate": 2.2214191630079733e-06, + "loss": 0.4019, + "step": 6766 + }, + { + "epoch": 0.5495817428733859, + "grad_norm": 8.356786267103574, + "learning_rate": 2.2207656461342696e-06, + "loss": 0.486, + "step": 6767 + }, + { + "epoch": 0.5496629578494274, + "grad_norm": 5.187735340758213, + "learning_rate": 2.2201121485821053e-06, + "loss": 0.5106, + "step": 6768 + }, + { + "epoch": 0.5497441728254691, + "grad_norm": 14.002790250484441, + "learning_rate": 2.2194586703966976e-06, + "loss": 0.5786, + "step": 6769 + }, + { + "epoch": 0.5498253878015106, + "grad_norm": 4.598645970086092, + "learning_rate": 2.218805211623266e-06, + "loss": 0.6202, + "step": 6770 + }, + { + "epoch": 0.5499066027775522, + "grad_norm": 7.035714823525913, + "learning_rate": 2.2181517723070263e-06, + "loss": 0.3767, + "step": 6771 + }, + { + "epoch": 0.5499878177535937, + "grad_norm": 3.789629383739064, + "learning_rate": 2.2174983524931916e-06, + "loss": 0.4665, + "step": 6772 + }, + { + "epoch": 0.5500690327296354, + "grad_norm": 5.153278702932309, + "learning_rate": 2.216844952226975e-06, + "loss": 0.5535, + "step": 6773 + }, + { + "epoch": 0.5501502477056769, + "grad_norm": 5.090968399667655, + "learning_rate": 2.2161915715535903e-06, + "loss": 0.5416, + "step": 6774 + }, + { + "epoch": 0.5502314626817185, + "grad_norm": 4.880285899644429, + "learning_rate": 2.2155382105182462e-06, + "loss": 0.528, + "step": 6775 + }, + { + "epoch": 0.55031267765776, + "grad_norm": 7.133831222495926, + "learning_rate": 2.214884869166152e-06, + "loss": 0.4645, + "step": 6776 + }, + { + "epoch": 0.5503938926338017, + "grad_norm": 5.111553296411102, + "learning_rate": 2.214231547542517e-06, + "loss": 0.5628, + "step": 6777 + }, + { + "epoch": 0.5504751076098433, + "grad_norm": 4.076454306491572, + "learning_rate": 2.213578245692546e-06, + "loss": 0.4838, + "step": 6778 + }, + { + "epoch": 0.5505563225858848, + "grad_norm": 5.211719936927261, + "learning_rate": 2.2129249636614443e-06, + "loss": 0.563, + "step": 6779 + }, + { + "epoch": 0.5506375375619265, + "grad_norm": 12.302787876796701, + "learning_rate": 2.2122717014944167e-06, + "loss": 0.4786, + "step": 6780 + }, + { + "epoch": 0.550718752537968, + "grad_norm": 4.330340043687264, + "learning_rate": 2.2116184592366643e-06, + "loss": 0.5747, + "step": 6781 + }, + { + "epoch": 0.5507999675140096, + "grad_norm": 5.034937065269623, + "learning_rate": 2.2109652369333873e-06, + "loss": 0.4252, + "step": 6782 + }, + { + "epoch": 0.5508811824900511, + "grad_norm": 7.614248716990825, + "learning_rate": 2.2103120346297864e-06, + "loss": 0.4404, + "step": 6783 + }, + { + "epoch": 0.5509623974660928, + "grad_norm": 7.1625705175917425, + "learning_rate": 2.2096588523710606e-06, + "loss": 0.5373, + "step": 6784 + }, + { + "epoch": 0.5510436124421343, + "grad_norm": 4.064726317144616, + "learning_rate": 2.2090056902024045e-06, + "loss": 0.4773, + "step": 6785 + }, + { + "epoch": 0.5511248274181759, + "grad_norm": 4.1799707253524065, + "learning_rate": 2.208352548169015e-06, + "loss": 0.5596, + "step": 6786 + }, + { + "epoch": 0.5512060423942174, + "grad_norm": 5.2430508668568425, + "learning_rate": 2.2076994263160863e-06, + "loss": 0.5455, + "step": 6787 + }, + { + "epoch": 0.5512872573702591, + "grad_norm": 5.514510034389823, + "learning_rate": 2.2070463246888094e-06, + "loss": 0.4733, + "step": 6788 + }, + { + "epoch": 0.5513684723463007, + "grad_norm": 4.50738218564209, + "learning_rate": 2.206393243332376e-06, + "loss": 0.5448, + "step": 6789 + }, + { + "epoch": 0.5514496873223422, + "grad_norm": 4.34724451667795, + "learning_rate": 2.2057401822919775e-06, + "loss": 0.5013, + "step": 6790 + }, + { + "epoch": 0.5515309022983839, + "grad_norm": 5.578815799717539, + "learning_rate": 2.2050871416128005e-06, + "loss": 0.5814, + "step": 6791 + }, + { + "epoch": 0.5516121172744254, + "grad_norm": 6.701219984709083, + "learning_rate": 2.204434121340032e-06, + "loss": 0.3959, + "step": 6792 + }, + { + "epoch": 0.551693332250467, + "grad_norm": 3.6134666388120404, + "learning_rate": 2.203781121518859e-06, + "loss": 0.536, + "step": 6793 + }, + { + "epoch": 0.5517745472265085, + "grad_norm": 6.117283840596765, + "learning_rate": 2.2031281421944643e-06, + "loss": 0.4515, + "step": 6794 + }, + { + "epoch": 0.5518557622025502, + "grad_norm": 3.9112049782621003, + "learning_rate": 2.2024751834120302e-06, + "loss": 0.4622, + "step": 6795 + }, + { + "epoch": 0.5519369771785917, + "grad_norm": 3.8079103420465437, + "learning_rate": 2.20182224521674e-06, + "loss": 0.5821, + "step": 6796 + }, + { + "epoch": 0.5520181921546333, + "grad_norm": 5.019201884297034, + "learning_rate": 2.2011693276537722e-06, + "loss": 0.6276, + "step": 6797 + }, + { + "epoch": 0.5520994071306748, + "grad_norm": 5.4407005403339825, + "learning_rate": 2.2005164307683047e-06, + "loss": 0.4406, + "step": 6798 + }, + { + "epoch": 0.5521806221067165, + "grad_norm": 4.301332089064263, + "learning_rate": 2.199863554605515e-06, + "loss": 0.5167, + "step": 6799 + }, + { + "epoch": 0.5522618370827581, + "grad_norm": 7.493043226544131, + "learning_rate": 2.19921069921058e-06, + "loss": 0.5069, + "step": 6800 + }, + { + "epoch": 0.5523430520587996, + "grad_norm": 3.8437035341706967, + "learning_rate": 2.1985578646286717e-06, + "loss": 0.6306, + "step": 6801 + }, + { + "epoch": 0.5524242670348413, + "grad_norm": 3.805644122254566, + "learning_rate": 2.197905050904964e-06, + "loss": 0.4094, + "step": 6802 + }, + { + "epoch": 0.5525054820108828, + "grad_norm": 6.899378255822541, + "learning_rate": 2.197252258084629e-06, + "loss": 0.4034, + "step": 6803 + }, + { + "epoch": 0.5525866969869244, + "grad_norm": 7.710934181517784, + "learning_rate": 2.196599486212834e-06, + "loss": 0.5314, + "step": 6804 + }, + { + "epoch": 0.5526679119629659, + "grad_norm": 3.3575643225535856, + "learning_rate": 2.1959467353347494e-06, + "loss": 0.4281, + "step": 6805 + }, + { + "epoch": 0.5527491269390076, + "grad_norm": 3.714838991436428, + "learning_rate": 2.195294005495542e-06, + "loss": 0.5778, + "step": 6806 + }, + { + "epoch": 0.5528303419150491, + "grad_norm": 6.171375505356923, + "learning_rate": 2.1946412967403763e-06, + "loss": 0.5032, + "step": 6807 + }, + { + "epoch": 0.5529115568910907, + "grad_norm": 5.843750427655172, + "learning_rate": 2.1939886091144165e-06, + "loss": 0.4943, + "step": 6808 + }, + { + "epoch": 0.5529927718671322, + "grad_norm": 3.222741223173242, + "learning_rate": 2.193335942662826e-06, + "loss": 0.4707, + "step": 6809 + }, + { + "epoch": 0.5530739868431739, + "grad_norm": 3.916188730239071, + "learning_rate": 2.192683297430766e-06, + "loss": 0.4572, + "step": 6810 + }, + { + "epoch": 0.5531552018192155, + "grad_norm": 4.698273133823479, + "learning_rate": 2.1920306734633932e-06, + "loss": 0.4173, + "step": 6811 + }, + { + "epoch": 0.553236416795257, + "grad_norm": 5.974971528842556, + "learning_rate": 2.1913780708058694e-06, + "loss": 0.5129, + "step": 6812 + }, + { + "epoch": 0.5533176317712987, + "grad_norm": 4.899970058534168, + "learning_rate": 2.19072548950335e-06, + "loss": 0.4309, + "step": 6813 + }, + { + "epoch": 0.5533988467473402, + "grad_norm": 4.224538543746903, + "learning_rate": 2.190072929600989e-06, + "loss": 0.6603, + "step": 6814 + }, + { + "epoch": 0.5534800617233818, + "grad_norm": 9.421465257051333, + "learning_rate": 2.189420391143941e-06, + "loss": 0.4734, + "step": 6815 + }, + { + "epoch": 0.5535612766994233, + "grad_norm": 6.665428434932173, + "learning_rate": 2.1887678741773592e-06, + "loss": 0.5195, + "step": 6816 + }, + { + "epoch": 0.553642491675465, + "grad_norm": 2.89387484029611, + "learning_rate": 2.188115378746392e-06, + "loss": 0.4763, + "step": 6817 + }, + { + "epoch": 0.5537237066515065, + "grad_norm": 6.823773735416532, + "learning_rate": 2.1874629048961904e-06, + "loss": 0.5321, + "step": 6818 + }, + { + "epoch": 0.5538049216275481, + "grad_norm": 4.521778809799006, + "learning_rate": 2.1868104526719023e-06, + "loss": 0.5816, + "step": 6819 + }, + { + "epoch": 0.5538861366035897, + "grad_norm": 3.3396433922902906, + "learning_rate": 2.1861580221186726e-06, + "loss": 0.4168, + "step": 6820 + }, + { + "epoch": 0.5539673515796313, + "grad_norm": 3.3374583977198036, + "learning_rate": 2.185505613281647e-06, + "loss": 0.4614, + "step": 6821 + }, + { + "epoch": 0.5540485665556729, + "grad_norm": 6.068612637720992, + "learning_rate": 2.1848532262059696e-06, + "loss": 0.5797, + "step": 6822 + }, + { + "epoch": 0.5541297815317144, + "grad_norm": 5.0334311376181144, + "learning_rate": 2.1842008609367794e-06, + "loss": 0.6781, + "step": 6823 + }, + { + "epoch": 0.5542109965077561, + "grad_norm": 4.81150612829465, + "learning_rate": 2.183548517519219e-06, + "loss": 0.4113, + "step": 6824 + }, + { + "epoch": 0.5542922114837976, + "grad_norm": 7.137103277777565, + "learning_rate": 2.1828961959984267e-06, + "loss": 0.502, + "step": 6825 + }, + { + "epoch": 0.5543734264598392, + "grad_norm": 4.93103788302294, + "learning_rate": 2.18224389641954e-06, + "loss": 0.3781, + "step": 6826 + }, + { + "epoch": 0.5544546414358807, + "grad_norm": 4.17706299919582, + "learning_rate": 2.1815916188276925e-06, + "loss": 0.5585, + "step": 6827 + }, + { + "epoch": 0.5545358564119224, + "grad_norm": 6.194948711383427, + "learning_rate": 2.18093936326802e-06, + "loss": 0.5219, + "step": 6828 + }, + { + "epoch": 0.5546170713879639, + "grad_norm": 7.761708704297234, + "learning_rate": 2.180287129785656e-06, + "loss": 0.4665, + "step": 6829 + }, + { + "epoch": 0.5546982863640055, + "grad_norm": 5.591231620591249, + "learning_rate": 2.1796349184257294e-06, + "loss": 0.4973, + "step": 6830 + }, + { + "epoch": 0.554779501340047, + "grad_norm": 3.9216643074294595, + "learning_rate": 2.1789827292333717e-06, + "loss": 0.3763, + "step": 6831 + }, + { + "epoch": 0.5548607163160887, + "grad_norm": 4.444201370705435, + "learning_rate": 2.1783305622537106e-06, + "loss": 0.522, + "step": 6832 + }, + { + "epoch": 0.5549419312921303, + "grad_norm": 3.5234577542993972, + "learning_rate": 2.1776784175318705e-06, + "loss": 0.6173, + "step": 6833 + }, + { + "epoch": 0.5550231462681718, + "grad_norm": 5.522357961927791, + "learning_rate": 2.1770262951129792e-06, + "loss": 0.4304, + "step": 6834 + }, + { + "epoch": 0.5551043612442135, + "grad_norm": 2.753675801015695, + "learning_rate": 2.1763741950421595e-06, + "loss": 0.5176, + "step": 6835 + }, + { + "epoch": 0.555185576220255, + "grad_norm": 5.5643466943030715, + "learning_rate": 2.175722117364531e-06, + "loss": 0.6027, + "step": 6836 + }, + { + "epoch": 0.5552667911962966, + "grad_norm": 2.6499029128692957, + "learning_rate": 2.175070062125217e-06, + "loss": 0.472, + "step": 6837 + }, + { + "epoch": 0.5553480061723381, + "grad_norm": 11.241112599357884, + "learning_rate": 2.1744180293693355e-06, + "loss": 0.5573, + "step": 6838 + }, + { + "epoch": 0.5554292211483798, + "grad_norm": 5.094638009416633, + "learning_rate": 2.173766019142002e-06, + "loss": 0.4747, + "step": 6839 + }, + { + "epoch": 0.5555104361244213, + "grad_norm": 6.191994268871611, + "learning_rate": 2.1731140314883346e-06, + "loss": 0.5245, + "step": 6840 + }, + { + "epoch": 0.5555916511004629, + "grad_norm": 4.2743419408023495, + "learning_rate": 2.1724620664534453e-06, + "loss": 0.508, + "step": 6841 + }, + { + "epoch": 0.5556728660765045, + "grad_norm": 12.369287470212933, + "learning_rate": 2.1718101240824485e-06, + "loss": 0.6018, + "step": 6842 + }, + { + "epoch": 0.5557540810525461, + "grad_norm": 6.34277452834017, + "learning_rate": 2.171158204420453e-06, + "loss": 0.5897, + "step": 6843 + }, + { + "epoch": 0.5558352960285877, + "grad_norm": 3.6842271055521763, + "learning_rate": 2.17050630751257e-06, + "loss": 0.3811, + "step": 6844 + }, + { + "epoch": 0.5559165110046292, + "grad_norm": 4.1774312445120545, + "learning_rate": 2.169854433403907e-06, + "loss": 0.7328, + "step": 6845 + }, + { + "epoch": 0.5559977259806709, + "grad_norm": 4.702814140349739, + "learning_rate": 2.169202582139569e-06, + "loss": 0.5899, + "step": 6846 + }, + { + "epoch": 0.5560789409567124, + "grad_norm": 5.095589215614225, + "learning_rate": 2.1685507537646622e-06, + "loss": 0.4751, + "step": 6847 + }, + { + "epoch": 0.556160155932754, + "grad_norm": 5.703171345481269, + "learning_rate": 2.1678989483242896e-06, + "loss": 0.4403, + "step": 6848 + }, + { + "epoch": 0.5562413709087956, + "grad_norm": 4.75054750690896, + "learning_rate": 2.1672471658635506e-06, + "loss": 0.5135, + "step": 6849 + }, + { + "epoch": 0.5563225858848372, + "grad_norm": 3.3902339048047305, + "learning_rate": 2.166595406427548e-06, + "loss": 0.6971, + "step": 6850 + }, + { + "epoch": 0.5564038008608787, + "grad_norm": 5.852734969187135, + "learning_rate": 2.1659436700613787e-06, + "loss": 0.4691, + "step": 6851 + }, + { + "epoch": 0.5564850158369203, + "grad_norm": 3.727932064595546, + "learning_rate": 2.1652919568101386e-06, + "loss": 0.5747, + "step": 6852 + }, + { + "epoch": 0.5565662308129619, + "grad_norm": 5.153139343207781, + "learning_rate": 2.1646402667189245e-06, + "loss": 0.4892, + "step": 6853 + }, + { + "epoch": 0.5566474457890035, + "grad_norm": 3.8440190223031125, + "learning_rate": 2.1639885998328293e-06, + "loss": 0.4627, + "step": 6854 + }, + { + "epoch": 0.5567286607650451, + "grad_norm": 8.606115530142445, + "learning_rate": 2.1633369561969435e-06, + "loss": 0.4342, + "step": 6855 + }, + { + "epoch": 0.5568098757410866, + "grad_norm": 3.6671273924850407, + "learning_rate": 2.1626853358563595e-06, + "loss": 0.5365, + "step": 6856 + }, + { + "epoch": 0.5568910907171283, + "grad_norm": 3.835808189282639, + "learning_rate": 2.162033738856165e-06, + "loss": 0.491, + "step": 6857 + }, + { + "epoch": 0.5569723056931698, + "grad_norm": 6.132790993906602, + "learning_rate": 2.161382165241446e-06, + "loss": 0.4935, + "step": 6858 + }, + { + "epoch": 0.5570535206692114, + "grad_norm": 4.4718395237709485, + "learning_rate": 2.1607306150572905e-06, + "loss": 0.5225, + "step": 6859 + }, + { + "epoch": 0.557134735645253, + "grad_norm": 5.4291132456188445, + "learning_rate": 2.1600790883487805e-06, + "loss": 0.4914, + "step": 6860 + }, + { + "epoch": 0.5572159506212946, + "grad_norm": 3.396009857568952, + "learning_rate": 2.159427585160999e-06, + "loss": 0.6867, + "step": 6861 + }, + { + "epoch": 0.5572971655973361, + "grad_norm": 6.2945750984242945, + "learning_rate": 2.1587761055390247e-06, + "loss": 0.3675, + "step": 6862 + }, + { + "epoch": 0.5573783805733777, + "grad_norm": 10.344490630116207, + "learning_rate": 2.1581246495279388e-06, + "loss": 0.4676, + "step": 6863 + }, + { + "epoch": 0.5574595955494193, + "grad_norm": 4.03264581660929, + "learning_rate": 2.1574732171728187e-06, + "loss": 0.5014, + "step": 6864 + }, + { + "epoch": 0.5575408105254609, + "grad_norm": 5.595299416089842, + "learning_rate": 2.1568218085187375e-06, + "loss": 0.5567, + "step": 6865 + }, + { + "epoch": 0.5576220255015025, + "grad_norm": 4.615982346308546, + "learning_rate": 2.1561704236107715e-06, + "loss": 0.4441, + "step": 6866 + }, + { + "epoch": 0.557703240477544, + "grad_norm": 3.371649845113746, + "learning_rate": 2.1555190624939933e-06, + "loss": 0.4734, + "step": 6867 + }, + { + "epoch": 0.5577844554535857, + "grad_norm": 4.993213905650965, + "learning_rate": 2.154867725213472e-06, + "loss": 0.5791, + "step": 6868 + }, + { + "epoch": 0.5578656704296272, + "grad_norm": 5.433984223606404, + "learning_rate": 2.154216411814278e-06, + "loss": 0.6413, + "step": 6869 + }, + { + "epoch": 0.5579468854056688, + "grad_norm": 4.1089167750307505, + "learning_rate": 2.1535651223414783e-06, + "loss": 0.4065, + "step": 6870 + }, + { + "epoch": 0.5580281003817104, + "grad_norm": 6.410732794666789, + "learning_rate": 2.1529138568401377e-06, + "loss": 0.4132, + "step": 6871 + }, + { + "epoch": 0.558109315357752, + "grad_norm": 2.952240951187637, + "learning_rate": 2.1522626153553224e-06, + "loss": 0.5569, + "step": 6872 + }, + { + "epoch": 0.5581905303337935, + "grad_norm": 5.778680431168145, + "learning_rate": 2.1516113979320937e-06, + "loss": 0.615, + "step": 6873 + }, + { + "epoch": 0.5582717453098351, + "grad_norm": 5.40543144487348, + "learning_rate": 2.150960204615511e-06, + "loss": 0.4005, + "step": 6874 + }, + { + "epoch": 0.5583529602858767, + "grad_norm": 5.360050799209055, + "learning_rate": 2.1503090354506366e-06, + "loss": 0.4878, + "step": 6875 + }, + { + "epoch": 0.5584341752619183, + "grad_norm": 4.768908334280203, + "learning_rate": 2.1496578904825253e-06, + "loss": 0.606, + "step": 6876 + }, + { + "epoch": 0.5585153902379599, + "grad_norm": 4.8420372746526255, + "learning_rate": 2.149006769756234e-06, + "loss": 0.3823, + "step": 6877 + }, + { + "epoch": 0.5585966052140015, + "grad_norm": 7.2191793614780835, + "learning_rate": 2.148355673316817e-06, + "loss": 0.4933, + "step": 6878 + }, + { + "epoch": 0.5586778201900431, + "grad_norm": 5.635609934888895, + "learning_rate": 2.1477046012093263e-06, + "loss": 0.4284, + "step": 6879 + }, + { + "epoch": 0.5587590351660846, + "grad_norm": 4.860255565023494, + "learning_rate": 2.147053553478813e-06, + "loss": 0.4215, + "step": 6880 + }, + { + "epoch": 0.5588402501421262, + "grad_norm": 6.326189480768318, + "learning_rate": 2.1464025301703243e-06, + "loss": 0.3612, + "step": 6881 + }, + { + "epoch": 0.5589214651181678, + "grad_norm": 10.398795327314401, + "learning_rate": 2.145751531328911e-06, + "loss": 0.638, + "step": 6882 + }, + { + "epoch": 0.5590026800942094, + "grad_norm": 5.9840259751672455, + "learning_rate": 2.1451005569996157e-06, + "loss": 0.4752, + "step": 6883 + }, + { + "epoch": 0.5590838950702509, + "grad_norm": 6.729449032368889, + "learning_rate": 2.144449607227483e-06, + "loss": 0.4235, + "step": 6884 + }, + { + "epoch": 0.5591651100462925, + "grad_norm": 4.99524830446243, + "learning_rate": 2.143798682057558e-06, + "loss": 0.565, + "step": 6885 + }, + { + "epoch": 0.5592463250223341, + "grad_norm": 11.826585338715425, + "learning_rate": 2.1431477815348775e-06, + "loss": 0.644, + "step": 6886 + }, + { + "epoch": 0.5593275399983757, + "grad_norm": 4.772181141872521, + "learning_rate": 2.1424969057044815e-06, + "loss": 0.6231, + "step": 6887 + }, + { + "epoch": 0.5594087549744173, + "grad_norm": 5.569287892562355, + "learning_rate": 2.1418460546114087e-06, + "loss": 0.4135, + "step": 6888 + }, + { + "epoch": 0.5594899699504589, + "grad_norm": 3.637243154196817, + "learning_rate": 2.141195228300693e-06, + "loss": 0.4882, + "step": 6889 + }, + { + "epoch": 0.5595711849265005, + "grad_norm": 4.970081667441746, + "learning_rate": 2.140544426817368e-06, + "loss": 0.4451, + "step": 6890 + }, + { + "epoch": 0.559652399902542, + "grad_norm": 6.0593249594190715, + "learning_rate": 2.139893650206467e-06, + "loss": 0.4589, + "step": 6891 + }, + { + "epoch": 0.5597336148785836, + "grad_norm": 4.272966835485669, + "learning_rate": 2.1392428985130192e-06, + "loss": 0.4929, + "step": 6892 + }, + { + "epoch": 0.5598148298546252, + "grad_norm": 4.935701907357544, + "learning_rate": 2.138592171782053e-06, + "loss": 0.6183, + "step": 6893 + }, + { + "epoch": 0.5598960448306668, + "grad_norm": 4.965836504024703, + "learning_rate": 2.137941470058597e-06, + "loss": 0.5274, + "step": 6894 + }, + { + "epoch": 0.5599772598067083, + "grad_norm": 3.8763705524472245, + "learning_rate": 2.1372907933876745e-06, + "loss": 0.6235, + "step": 6895 + }, + { + "epoch": 0.56005847478275, + "grad_norm": 3.7922687516844387, + "learning_rate": 2.13664014181431e-06, + "loss": 0.4554, + "step": 6896 + }, + { + "epoch": 0.5601396897587915, + "grad_norm": 5.143643713764753, + "learning_rate": 2.1359895153835235e-06, + "loss": 0.7668, + "step": 6897 + }, + { + "epoch": 0.5602209047348331, + "grad_norm": 3.9929186611253686, + "learning_rate": 2.1353389141403373e-06, + "loss": 0.5105, + "step": 6898 + }, + { + "epoch": 0.5603021197108747, + "grad_norm": 7.4139111080640605, + "learning_rate": 2.134688338129768e-06, + "loss": 0.3908, + "step": 6899 + }, + { + "epoch": 0.5603833346869163, + "grad_norm": 4.298081345320913, + "learning_rate": 2.1340377873968313e-06, + "loss": 0.3907, + "step": 6900 + }, + { + "epoch": 0.5604645496629579, + "grad_norm": 3.6665218877774404, + "learning_rate": 2.133387261986544e-06, + "loss": 0.4315, + "step": 6901 + }, + { + "epoch": 0.5605457646389994, + "grad_norm": 3.345078936648102, + "learning_rate": 2.132736761943917e-06, + "loss": 0.4787, + "step": 6902 + }, + { + "epoch": 0.560626979615041, + "grad_norm": 5.603195213119475, + "learning_rate": 2.1320862873139627e-06, + "loss": 0.517, + "step": 6903 + }, + { + "epoch": 0.5607081945910826, + "grad_norm": 4.582210135280795, + "learning_rate": 2.1314358381416906e-06, + "loss": 0.4446, + "step": 6904 + }, + { + "epoch": 0.5607894095671242, + "grad_norm": 5.520444524716127, + "learning_rate": 2.130785414472108e-06, + "loss": 0.5447, + "step": 6905 + }, + { + "epoch": 0.5608706245431657, + "grad_norm": 3.594858666968153, + "learning_rate": 2.1301350163502194e-06, + "loss": 0.4709, + "step": 6906 + }, + { + "epoch": 0.5609518395192074, + "grad_norm": 5.775944364817623, + "learning_rate": 2.1294846438210316e-06, + "loss": 0.5108, + "step": 6907 + }, + { + "epoch": 0.5610330544952489, + "grad_norm": 4.911293017998235, + "learning_rate": 2.128834296929545e-06, + "loss": 0.458, + "step": 6908 + }, + { + "epoch": 0.5611142694712905, + "grad_norm": 5.313549191131658, + "learning_rate": 2.12818397572076e-06, + "loss": 0.5297, + "step": 6909 + }, + { + "epoch": 0.5611954844473321, + "grad_norm": 5.235033497622972, + "learning_rate": 2.1275336802396775e-06, + "loss": 0.6024, + "step": 6910 + }, + { + "epoch": 0.5612766994233737, + "grad_norm": 8.31540179839902, + "learning_rate": 2.1268834105312926e-06, + "loss": 0.4589, + "step": 6911 + }, + { + "epoch": 0.5613579143994153, + "grad_norm": 5.844285401664914, + "learning_rate": 2.1262331666406003e-06, + "loss": 0.687, + "step": 6912 + }, + { + "epoch": 0.5614391293754568, + "grad_norm": 5.506426123033813, + "learning_rate": 2.125582948612595e-06, + "loss": 0.3788, + "step": 6913 + }, + { + "epoch": 0.5615203443514984, + "grad_norm": 4.938418738241158, + "learning_rate": 2.124932756492269e-06, + "loss": 0.4786, + "step": 6914 + }, + { + "epoch": 0.56160155932754, + "grad_norm": 5.011490627219829, + "learning_rate": 2.1242825903246104e-06, + "loss": 0.5104, + "step": 6915 + }, + { + "epoch": 0.5616827743035816, + "grad_norm": 4.130351403264876, + "learning_rate": 2.1236324501546073e-06, + "loss": 0.5, + "step": 6916 + }, + { + "epoch": 0.5617639892796231, + "grad_norm": 5.131299996818871, + "learning_rate": 2.1229823360272483e-06, + "loss": 0.5647, + "step": 6917 + }, + { + "epoch": 0.5618452042556648, + "grad_norm": 4.930401157312547, + "learning_rate": 2.1223322479875157e-06, + "loss": 0.6934, + "step": 6918 + }, + { + "epoch": 0.5619264192317063, + "grad_norm": 4.218515382201296, + "learning_rate": 2.1216821860803922e-06, + "loss": 0.5194, + "step": 6919 + }, + { + "epoch": 0.5620076342077479, + "grad_norm": 4.955917407462902, + "learning_rate": 2.12103215035086e-06, + "loss": 0.548, + "step": 6920 + }, + { + "epoch": 0.5620888491837895, + "grad_norm": 7.097785428246493, + "learning_rate": 2.1203821408438973e-06, + "loss": 0.3326, + "step": 6921 + }, + { + "epoch": 0.5621700641598311, + "grad_norm": 5.512144520285594, + "learning_rate": 2.1197321576044803e-06, + "loss": 0.5434, + "step": 6922 + }, + { + "epoch": 0.5622512791358727, + "grad_norm": 4.559962704037825, + "learning_rate": 2.119082200677587e-06, + "loss": 0.706, + "step": 6923 + }, + { + "epoch": 0.5623324941119142, + "grad_norm": 3.716157164787584, + "learning_rate": 2.1184322701081884e-06, + "loss": 0.4711, + "step": 6924 + }, + { + "epoch": 0.5624137090879558, + "grad_norm": 5.005561099415801, + "learning_rate": 2.117782365941257e-06, + "loss": 0.6163, + "step": 6925 + }, + { + "epoch": 0.5624949240639974, + "grad_norm": 3.3652327554641808, + "learning_rate": 2.1171324882217644e-06, + "loss": 0.5811, + "step": 6926 + }, + { + "epoch": 0.562576139040039, + "grad_norm": 3.4936659274070485, + "learning_rate": 2.116482636994677e-06, + "loss": 0.4976, + "step": 6927 + }, + { + "epoch": 0.5626573540160805, + "grad_norm": 15.606156521919054, + "learning_rate": 2.11583281230496e-06, + "loss": 0.3923, + "step": 6928 + }, + { + "epoch": 0.5627385689921222, + "grad_norm": 8.796650533681337, + "learning_rate": 2.11518301419758e-06, + "loss": 0.5009, + "step": 6929 + }, + { + "epoch": 0.5628197839681637, + "grad_norm": 5.043513216497029, + "learning_rate": 2.1145332427174995e-06, + "loss": 0.3586, + "step": 6930 + }, + { + "epoch": 0.5629009989442053, + "grad_norm": 5.302763968985061, + "learning_rate": 2.1138834979096778e-06, + "loss": 0.4416, + "step": 6931 + }, + { + "epoch": 0.5629822139202469, + "grad_norm": 4.618030423336223, + "learning_rate": 2.1132337798190743e-06, + "loss": 0.5591, + "step": 6932 + }, + { + "epoch": 0.5630634288962885, + "grad_norm": 5.305923379739884, + "learning_rate": 2.112584088490647e-06, + "loss": 0.6132, + "step": 6933 + }, + { + "epoch": 0.5631446438723301, + "grad_norm": 8.514031471933345, + "learning_rate": 2.11193442396935e-06, + "loss": 0.4011, + "step": 6934 + }, + { + "epoch": 0.5632258588483716, + "grad_norm": 4.100042747142503, + "learning_rate": 2.111284786300137e-06, + "loss": 0.6595, + "step": 6935 + }, + { + "epoch": 0.5633070738244133, + "grad_norm": 6.623608750925487, + "learning_rate": 2.11063517552796e-06, + "loss": 0.4761, + "step": 6936 + }, + { + "epoch": 0.5633882888004548, + "grad_norm": 4.043561752629549, + "learning_rate": 2.1099855916977676e-06, + "loss": 0.4361, + "step": 6937 + }, + { + "epoch": 0.5634695037764964, + "grad_norm": 3.492107617173054, + "learning_rate": 2.109336034854508e-06, + "loss": 0.6809, + "step": 6938 + }, + { + "epoch": 0.5635507187525379, + "grad_norm": 5.163084090773521, + "learning_rate": 2.1086865050431283e-06, + "loss": 0.4603, + "step": 6939 + }, + { + "epoch": 0.5636319337285796, + "grad_norm": 4.691574509899781, + "learning_rate": 2.1080370023085713e-06, + "loss": 0.6142, + "step": 6940 + }, + { + "epoch": 0.5637131487046211, + "grad_norm": 4.128014703538365, + "learning_rate": 2.107387526695778e-06, + "loss": 0.4824, + "step": 6941 + }, + { + "epoch": 0.5637943636806627, + "grad_norm": 11.05696154296604, + "learning_rate": 2.106738078249691e-06, + "loss": 0.5197, + "step": 6942 + }, + { + "epoch": 0.5638755786567043, + "grad_norm": 5.222620839074155, + "learning_rate": 2.1060886570152477e-06, + "loss": 0.4981, + "step": 6943 + }, + { + "epoch": 0.5639567936327459, + "grad_norm": 4.697414322539678, + "learning_rate": 2.105439263037384e-06, + "loss": 0.4387, + "step": 6944 + }, + { + "epoch": 0.5640380086087875, + "grad_norm": 3.8406258019033004, + "learning_rate": 2.1047898963610354e-06, + "loss": 0.4677, + "step": 6945 + }, + { + "epoch": 0.564119223584829, + "grad_norm": 5.2714849848498115, + "learning_rate": 2.1041405570311348e-06, + "loss": 0.3368, + "step": 6946 + }, + { + "epoch": 0.5642004385608707, + "grad_norm": 7.429954267821541, + "learning_rate": 2.1034912450926114e-06, + "loss": 0.4286, + "step": 6947 + }, + { + "epoch": 0.5642816535369122, + "grad_norm": 4.461747197120303, + "learning_rate": 2.102841960590396e-06, + "loss": 0.4246, + "step": 6948 + }, + { + "epoch": 0.5643628685129538, + "grad_norm": 6.852938483133885, + "learning_rate": 2.102192703569416e-06, + "loss": 0.4819, + "step": 6949 + }, + { + "epoch": 0.5644440834889953, + "grad_norm": 4.945485942357694, + "learning_rate": 2.1015434740745944e-06, + "loss": 0.5224, + "step": 6950 + }, + { + "epoch": 0.564525298465037, + "grad_norm": 5.1220985098046095, + "learning_rate": 2.1008942721508553e-06, + "loss": 0.4534, + "step": 6951 + }, + { + "epoch": 0.5646065134410785, + "grad_norm": 4.645911138798781, + "learning_rate": 2.1002450978431216e-06, + "loss": 0.4874, + "step": 6952 + }, + { + "epoch": 0.5646877284171201, + "grad_norm": 4.138063372246911, + "learning_rate": 2.099595951196311e-06, + "loss": 0.4335, + "step": 6953 + }, + { + "epoch": 0.5647689433931617, + "grad_norm": 5.760613534796062, + "learning_rate": 2.09894683225534e-06, + "loss": 0.5575, + "step": 6954 + }, + { + "epoch": 0.5648501583692033, + "grad_norm": 5.719720985157515, + "learning_rate": 2.0982977410651276e-06, + "loss": 0.5573, + "step": 6955 + }, + { + "epoch": 0.5649313733452449, + "grad_norm": 5.21317528533535, + "learning_rate": 2.0976486776705853e-06, + "loss": 0.4523, + "step": 6956 + }, + { + "epoch": 0.5650125883212864, + "grad_norm": 6.455893264664072, + "learning_rate": 2.0969996421166243e-06, + "loss": 0.5827, + "step": 6957 + }, + { + "epoch": 0.5650938032973281, + "grad_norm": 3.152544873395565, + "learning_rate": 2.0963506344481556e-06, + "loss": 0.4638, + "step": 6958 + }, + { + "epoch": 0.5651750182733696, + "grad_norm": 3.711902575933437, + "learning_rate": 2.0957016547100867e-06, + "loss": 0.5174, + "step": 6959 + }, + { + "epoch": 0.5652562332494112, + "grad_norm": 3.1937905928416774, + "learning_rate": 2.095052702947323e-06, + "loss": 0.5568, + "step": 6960 + }, + { + "epoch": 0.5653374482254527, + "grad_norm": 3.8506487142937336, + "learning_rate": 2.09440377920477e-06, + "loss": 0.6765, + "step": 6961 + }, + { + "epoch": 0.5654186632014944, + "grad_norm": 4.161307544084725, + "learning_rate": 2.0937548835273285e-06, + "loss": 0.4279, + "step": 6962 + }, + { + "epoch": 0.5654998781775359, + "grad_norm": 5.783786158584384, + "learning_rate": 2.0931060159598986e-06, + "loss": 0.483, + "step": 6963 + }, + { + "epoch": 0.5655810931535775, + "grad_norm": 4.391669807993531, + "learning_rate": 2.0924571765473793e-06, + "loss": 0.5543, + "step": 6964 + }, + { + "epoch": 0.5656623081296192, + "grad_norm": 11.934622866843036, + "learning_rate": 2.091808365334667e-06, + "loss": 0.5861, + "step": 6965 + }, + { + "epoch": 0.5657435231056607, + "grad_norm": 6.272947824575495, + "learning_rate": 2.091159582366655e-06, + "loss": 0.4361, + "step": 6966 + }, + { + "epoch": 0.5658247380817023, + "grad_norm": 2.896511468026234, + "learning_rate": 2.0905108276882356e-06, + "loss": 0.4904, + "step": 6967 + }, + { + "epoch": 0.5659059530577438, + "grad_norm": 4.7888166758801205, + "learning_rate": 2.089862101344301e-06, + "loss": 0.4592, + "step": 6968 + }, + { + "epoch": 0.5659871680337855, + "grad_norm": 10.0912728353721, + "learning_rate": 2.0892134033797383e-06, + "loss": 0.4861, + "step": 6969 + }, + { + "epoch": 0.566068383009827, + "grad_norm": 2.573138862813564, + "learning_rate": 2.088564733839433e-06, + "loss": 0.3952, + "step": 6970 + }, + { + "epoch": 0.5661495979858686, + "grad_norm": 2.924408214469329, + "learning_rate": 2.087916092768271e-06, + "loss": 0.4989, + "step": 6971 + }, + { + "epoch": 0.5662308129619101, + "grad_norm": 5.175327347055258, + "learning_rate": 2.087267480211135e-06, + "loss": 0.593, + "step": 6972 + }, + { + "epoch": 0.5663120279379518, + "grad_norm": 5.327737397653494, + "learning_rate": 2.086618896212904e-06, + "loss": 0.4824, + "step": 6973 + }, + { + "epoch": 0.5663932429139933, + "grad_norm": 3.5981451480047344, + "learning_rate": 2.0859703408184583e-06, + "loss": 0.569, + "step": 6974 + }, + { + "epoch": 0.5664744578900349, + "grad_norm": 3.98952188483392, + "learning_rate": 2.085321814072674e-06, + "loss": 0.5055, + "step": 6975 + }, + { + "epoch": 0.5665556728660766, + "grad_norm": 4.760263855647576, + "learning_rate": 2.0846733160204244e-06, + "loss": 0.5048, + "step": 6976 + }, + { + "epoch": 0.5666368878421181, + "grad_norm": 4.040158060702142, + "learning_rate": 2.084024846706584e-06, + "loss": 0.474, + "step": 6977 + }, + { + "epoch": 0.5667181028181597, + "grad_norm": 3.3443500978774394, + "learning_rate": 2.083376406176023e-06, + "loss": 0.6342, + "step": 6978 + }, + { + "epoch": 0.5667993177942012, + "grad_norm": 6.809697122943693, + "learning_rate": 2.082727994473609e-06, + "loss": 0.4007, + "step": 6979 + }, + { + "epoch": 0.5668805327702429, + "grad_norm": 4.007751918991223, + "learning_rate": 2.08207961164421e-06, + "loss": 0.3879, + "step": 6980 + }, + { + "epoch": 0.5669617477462844, + "grad_norm": 4.107776861520255, + "learning_rate": 2.08143125773269e-06, + "loss": 0.4471, + "step": 6981 + }, + { + "epoch": 0.567042962722326, + "grad_norm": 3.806832164663562, + "learning_rate": 2.080782932783911e-06, + "loss": 0.4513, + "step": 6982 + }, + { + "epoch": 0.5671241776983675, + "grad_norm": 6.942546340030462, + "learning_rate": 2.0801346368427356e-06, + "loss": 0.6031, + "step": 6983 + }, + { + "epoch": 0.5672053926744092, + "grad_norm": 3.225972794340873, + "learning_rate": 2.0794863699540206e-06, + "loss": 0.6745, + "step": 6984 + }, + { + "epoch": 0.5672866076504507, + "grad_norm": 4.7018910207675955, + "learning_rate": 2.0788381321626237e-06, + "loss": 0.531, + "step": 6985 + }, + { + "epoch": 0.5673678226264923, + "grad_norm": 3.88377602984253, + "learning_rate": 2.0781899235133984e-06, + "loss": 0.6038, + "step": 6986 + }, + { + "epoch": 0.567449037602534, + "grad_norm": 4.743493354886736, + "learning_rate": 2.077541744051198e-06, + "loss": 0.5044, + "step": 6987 + }, + { + "epoch": 0.5675302525785755, + "grad_norm": 3.725682821827385, + "learning_rate": 2.0768935938208735e-06, + "loss": 0.545, + "step": 6988 + }, + { + "epoch": 0.5676114675546171, + "grad_norm": 4.23366272227734, + "learning_rate": 2.0762454728672727e-06, + "loss": 0.6513, + "step": 6989 + }, + { + "epoch": 0.5676926825306586, + "grad_norm": 3.219731697269269, + "learning_rate": 2.0755973812352424e-06, + "loss": 0.5489, + "step": 6990 + }, + { + "epoch": 0.5677738975067003, + "grad_norm": 5.426598112663618, + "learning_rate": 2.074949318969628e-06, + "loss": 0.5292, + "step": 6991 + }, + { + "epoch": 0.5678551124827418, + "grad_norm": 5.501771039935805, + "learning_rate": 2.07430128611527e-06, + "loss": 0.4586, + "step": 6992 + }, + { + "epoch": 0.5679363274587834, + "grad_norm": 4.038443635513911, + "learning_rate": 2.0736532827170107e-06, + "loss": 0.414, + "step": 6993 + }, + { + "epoch": 0.5680175424348249, + "grad_norm": 5.939660501414922, + "learning_rate": 2.0730053088196883e-06, + "loss": 0.4727, + "step": 6994 + }, + { + "epoch": 0.5680987574108666, + "grad_norm": 4.71954105859692, + "learning_rate": 2.072357364468138e-06, + "loss": 0.4108, + "step": 6995 + }, + { + "epoch": 0.5681799723869081, + "grad_norm": 12.024015711126426, + "learning_rate": 2.0717094497071945e-06, + "loss": 0.4805, + "step": 6996 + }, + { + "epoch": 0.5682611873629497, + "grad_norm": 4.1133512422429455, + "learning_rate": 2.0710615645816913e-06, + "loss": 0.4311, + "step": 6997 + }, + { + "epoch": 0.5683424023389914, + "grad_norm": 3.993669100784519, + "learning_rate": 2.0704137091364568e-06, + "loss": 0.5012, + "step": 6998 + }, + { + "epoch": 0.5684236173150329, + "grad_norm": 5.302686271074888, + "learning_rate": 2.069765883416321e-06, + "loss": 0.43, + "step": 6999 + }, + { + "epoch": 0.5685048322910745, + "grad_norm": 4.755390559997945, + "learning_rate": 2.0691180874661086e-06, + "loss": 0.5289, + "step": 7000 + }, + { + "epoch": 0.568586047267116, + "grad_norm": 2.9673265351316704, + "learning_rate": 2.0684703213306435e-06, + "loss": 0.4691, + "step": 7001 + }, + { + "epoch": 0.5686672622431577, + "grad_norm": 3.9211031513326486, + "learning_rate": 2.0678225850547497e-06, + "loss": 0.5234, + "step": 7002 + }, + { + "epoch": 0.5687484772191992, + "grad_norm": 4.581102578387739, + "learning_rate": 2.0671748786832447e-06, + "loss": 0.5117, + "step": 7003 + }, + { + "epoch": 0.5688296921952408, + "grad_norm": 6.219007379440202, + "learning_rate": 2.0665272022609482e-06, + "loss": 0.5086, + "step": 7004 + }, + { + "epoch": 0.5689109071712823, + "grad_norm": 6.0686864713771485, + "learning_rate": 2.0658795558326745e-06, + "loss": 0.4348, + "step": 7005 + }, + { + "epoch": 0.568992122147324, + "grad_norm": 3.6037676191639885, + "learning_rate": 2.065231939443238e-06, + "loss": 0.4322, + "step": 7006 + }, + { + "epoch": 0.5690733371233655, + "grad_norm": 3.160547290178154, + "learning_rate": 2.064584353137451e-06, + "loss": 0.4967, + "step": 7007 + }, + { + "epoch": 0.5691545520994071, + "grad_norm": 4.373900459536156, + "learning_rate": 2.0639367969601215e-06, + "loss": 0.4513, + "step": 7008 + }, + { + "epoch": 0.5692357670754488, + "grad_norm": 10.5295783218965, + "learning_rate": 2.063289270956058e-06, + "loss": 0.4657, + "step": 7009 + }, + { + "epoch": 0.5693169820514903, + "grad_norm": 6.441791257822959, + "learning_rate": 2.0626417751700664e-06, + "loss": 0.3973, + "step": 7010 + }, + { + "epoch": 0.5693981970275319, + "grad_norm": 7.65548642747038, + "learning_rate": 2.0619943096469484e-06, + "loss": 0.4463, + "step": 7011 + }, + { + "epoch": 0.5694794120035734, + "grad_norm": 2.7981688902513864, + "learning_rate": 2.061346874431507e-06, + "loss": 0.5198, + "step": 7012 + }, + { + "epoch": 0.5695606269796151, + "grad_norm": 3.8831757918226586, + "learning_rate": 2.0606994695685396e-06, + "loss": 0.3686, + "step": 7013 + }, + { + "epoch": 0.5696418419556566, + "grad_norm": 3.8701501681776156, + "learning_rate": 2.0600520951028437e-06, + "loss": 0.5688, + "step": 7014 + }, + { + "epoch": 0.5697230569316982, + "grad_norm": 3.537194498397044, + "learning_rate": 2.059404751079215e-06, + "loss": 0.4693, + "step": 7015 + }, + { + "epoch": 0.5698042719077397, + "grad_norm": 4.5238431030094794, + "learning_rate": 2.0587574375424456e-06, + "loss": 0.4932, + "step": 7016 + }, + { + "epoch": 0.5698854868837814, + "grad_norm": 6.314721658579622, + "learning_rate": 2.0581101545373255e-06, + "loss": 0.5723, + "step": 7017 + }, + { + "epoch": 0.5699667018598229, + "grad_norm": 4.543665245055623, + "learning_rate": 2.057462902108645e-06, + "loss": 0.6108, + "step": 7018 + }, + { + "epoch": 0.5700479168358645, + "grad_norm": 4.326213404795697, + "learning_rate": 2.0568156803011897e-06, + "loss": 0.5064, + "step": 7019 + }, + { + "epoch": 0.5701291318119062, + "grad_norm": 6.344731487919164, + "learning_rate": 2.056168489159744e-06, + "loss": 0.4286, + "step": 7020 + }, + { + "epoch": 0.5702103467879477, + "grad_norm": 6.407501842611541, + "learning_rate": 2.0555213287290886e-06, + "loss": 0.4281, + "step": 7021 + }, + { + "epoch": 0.5702915617639893, + "grad_norm": 3.9016889868541873, + "learning_rate": 2.0548741990540057e-06, + "loss": 0.5229, + "step": 7022 + }, + { + "epoch": 0.5703727767400308, + "grad_norm": 4.485329708138932, + "learning_rate": 2.0542271001792726e-06, + "loss": 0.4988, + "step": 7023 + }, + { + "epoch": 0.5704539917160725, + "grad_norm": 7.011271727948778, + "learning_rate": 2.0535800321496645e-06, + "loss": 0.4174, + "step": 7024 + }, + { + "epoch": 0.570535206692114, + "grad_norm": 21.104012028716472, + "learning_rate": 2.0529329950099554e-06, + "loss": 0.566, + "step": 7025 + }, + { + "epoch": 0.5706164216681556, + "grad_norm": 5.5743586700224, + "learning_rate": 2.052285988804918e-06, + "loss": 0.4475, + "step": 7026 + }, + { + "epoch": 0.5706976366441971, + "grad_norm": 4.416468163895723, + "learning_rate": 2.0516390135793192e-06, + "loss": 0.3382, + "step": 7027 + }, + { + "epoch": 0.5707788516202388, + "grad_norm": 4.986468294717369, + "learning_rate": 2.050992069377929e-06, + "loss": 0.3999, + "step": 7028 + }, + { + "epoch": 0.5708600665962803, + "grad_norm": 9.4114543685177, + "learning_rate": 2.050345156245511e-06, + "loss": 0.4653, + "step": 7029 + }, + { + "epoch": 0.5709412815723219, + "grad_norm": 4.613503552061298, + "learning_rate": 2.0496982742268273e-06, + "loss": 0.3624, + "step": 7030 + }, + { + "epoch": 0.5710224965483636, + "grad_norm": 6.062105080879825, + "learning_rate": 2.0490514233666413e-06, + "loss": 0.5138, + "step": 7031 + }, + { + "epoch": 0.5711037115244051, + "grad_norm": 3.9841008788202132, + "learning_rate": 2.04840460370971e-06, + "loss": 0.4635, + "step": 7032 + }, + { + "epoch": 0.5711849265004467, + "grad_norm": 5.48691904012669, + "learning_rate": 2.0477578153007887e-06, + "loss": 0.4516, + "step": 7033 + }, + { + "epoch": 0.5712661414764882, + "grad_norm": 3.3836486599021636, + "learning_rate": 2.047111058184635e-06, + "loss": 0.5242, + "step": 7034 + }, + { + "epoch": 0.5713473564525299, + "grad_norm": 3.8672222024944856, + "learning_rate": 2.046464332405998e-06, + "loss": 0.5653, + "step": 7035 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 4.314382703074949, + "learning_rate": 2.045817638009629e-06, + "loss": 0.5402, + "step": 7036 + }, + { + "epoch": 0.571509786404613, + "grad_norm": 3.567072195815995, + "learning_rate": 2.045170975040276e-06, + "loss": 0.4213, + "step": 7037 + }, + { + "epoch": 0.5715910013806546, + "grad_norm": 4.48375869912088, + "learning_rate": 2.0445243435426847e-06, + "loss": 0.6564, + "step": 7038 + }, + { + "epoch": 0.5716722163566962, + "grad_norm": 3.552656399851541, + "learning_rate": 2.043877743561598e-06, + "loss": 0.5135, + "step": 7039 + }, + { + "epoch": 0.5717534313327377, + "grad_norm": 3.8377915597339873, + "learning_rate": 2.0432311751417568e-06, + "loss": 0.6527, + "step": 7040 + }, + { + "epoch": 0.5718346463087793, + "grad_norm": 6.426418479233554, + "learning_rate": 2.042584638327902e-06, + "loss": 0.3723, + "step": 7041 + }, + { + "epoch": 0.571915861284821, + "grad_norm": 4.241032012671777, + "learning_rate": 2.0419381331647687e-06, + "loss": 0.4957, + "step": 7042 + }, + { + "epoch": 0.5719970762608625, + "grad_norm": 3.735539966685467, + "learning_rate": 2.0412916596970918e-06, + "loss": 0.566, + "step": 7043 + }, + { + "epoch": 0.5720782912369041, + "grad_norm": 3.8956911217710917, + "learning_rate": 2.040645217969606e-06, + "loss": 0.4935, + "step": 7044 + }, + { + "epoch": 0.5721595062129456, + "grad_norm": 20.493625970905985, + "learning_rate": 2.0399988080270384e-06, + "loss": 0.4629, + "step": 7045 + }, + { + "epoch": 0.5722407211889873, + "grad_norm": 29.937743604580216, + "learning_rate": 2.039352429914119e-06, + "loss": 0.4411, + "step": 7046 + }, + { + "epoch": 0.5723219361650288, + "grad_norm": 6.109372993647007, + "learning_rate": 2.038706083675574e-06, + "loss": 0.4187, + "step": 7047 + }, + { + "epoch": 0.5724031511410704, + "grad_norm": 4.3015816031863086, + "learning_rate": 2.038059769356127e-06, + "loss": 0.5831, + "step": 7048 + }, + { + "epoch": 0.572484366117112, + "grad_norm": 4.347753553845655, + "learning_rate": 2.037413487000498e-06, + "loss": 0.4969, + "step": 7049 + }, + { + "epoch": 0.5725655810931536, + "grad_norm": 5.01369388436399, + "learning_rate": 2.0367672366534087e-06, + "loss": 0.4838, + "step": 7050 + }, + { + "epoch": 0.5726467960691951, + "grad_norm": 6.956995413047726, + "learning_rate": 2.036121018359574e-06, + "loss": 0.4995, + "step": 7051 + }, + { + "epoch": 0.5727280110452367, + "grad_norm": 7.014322634335082, + "learning_rate": 2.03547483216371e-06, + "loss": 0.5013, + "step": 7052 + }, + { + "epoch": 0.5728092260212784, + "grad_norm": 6.622483650279607, + "learning_rate": 2.0348286781105302e-06, + "loss": 0.4124, + "step": 7053 + }, + { + "epoch": 0.5728904409973199, + "grad_norm": 4.159914158121862, + "learning_rate": 2.0341825562447427e-06, + "loss": 0.6059, + "step": 7054 + }, + { + "epoch": 0.5729716559733615, + "grad_norm": 5.011801450523911, + "learning_rate": 2.0335364666110572e-06, + "loss": 0.4856, + "step": 7055 + }, + { + "epoch": 0.573052870949403, + "grad_norm": 4.952363770440026, + "learning_rate": 2.03289040925418e-06, + "loss": 0.4473, + "step": 7056 + }, + { + "epoch": 0.5731340859254447, + "grad_norm": 7.83403745870332, + "learning_rate": 2.032244384218815e-06, + "loss": 0.6729, + "step": 7057 + }, + { + "epoch": 0.5732153009014862, + "grad_norm": 6.993413026032686, + "learning_rate": 2.031598391549662e-06, + "loss": 0.5622, + "step": 7058 + }, + { + "epoch": 0.5732965158775278, + "grad_norm": 6.260135249773573, + "learning_rate": 2.030952431291421e-06, + "loss": 0.5174, + "step": 7059 + }, + { + "epoch": 0.5733777308535694, + "grad_norm": 4.075720359030366, + "learning_rate": 2.0303065034887904e-06, + "loss": 0.5199, + "step": 7060 + }, + { + "epoch": 0.573458945829611, + "grad_norm": 3.9202159123881475, + "learning_rate": 2.0296606081864634e-06, + "loss": 0.5269, + "step": 7061 + }, + { + "epoch": 0.5735401608056525, + "grad_norm": 4.8268240982250274, + "learning_rate": 2.0290147454291323e-06, + "loss": 0.4606, + "step": 7062 + }, + { + "epoch": 0.5736213757816941, + "grad_norm": 5.603064677109073, + "learning_rate": 2.0283689152614896e-06, + "loss": 0.6205, + "step": 7063 + }, + { + "epoch": 0.5737025907577358, + "grad_norm": 4.633814523131622, + "learning_rate": 2.0277231177282213e-06, + "loss": 0.5806, + "step": 7064 + }, + { + "epoch": 0.5737838057337773, + "grad_norm": 11.093950775524117, + "learning_rate": 2.0270773528740127e-06, + "loss": 0.5424, + "step": 7065 + }, + { + "epoch": 0.5738650207098189, + "grad_norm": 4.963416503789594, + "learning_rate": 2.02643162074355e-06, + "loss": 0.5403, + "step": 7066 + }, + { + "epoch": 0.5739462356858605, + "grad_norm": 7.269597973886369, + "learning_rate": 2.0257859213815123e-06, + "loss": 0.649, + "step": 7067 + }, + { + "epoch": 0.5740274506619021, + "grad_norm": 4.685273966619034, + "learning_rate": 2.0251402548325783e-06, + "loss": 0.6087, + "step": 7068 + }, + { + "epoch": 0.5741086656379436, + "grad_norm": 4.846673111295444, + "learning_rate": 2.0244946211414267e-06, + "loss": 0.5899, + "step": 7069 + }, + { + "epoch": 0.5741898806139852, + "grad_norm": 3.9278515943561625, + "learning_rate": 2.0238490203527307e-06, + "loss": 0.5397, + "step": 7070 + }, + { + "epoch": 0.5742710955900268, + "grad_norm": 4.034403858762683, + "learning_rate": 2.0232034525111617e-06, + "loss": 0.5535, + "step": 7071 + }, + { + "epoch": 0.5743523105660684, + "grad_norm": 6.517927780743827, + "learning_rate": 2.0225579176613905e-06, + "loss": 0.4494, + "step": 7072 + }, + { + "epoch": 0.5744335255421099, + "grad_norm": 3.068515611615937, + "learning_rate": 2.0219124158480853e-06, + "loss": 0.3643, + "step": 7073 + }, + { + "epoch": 0.5745147405181515, + "grad_norm": 3.6556023426111084, + "learning_rate": 2.0212669471159098e-06, + "loss": 0.4402, + "step": 7074 + }, + { + "epoch": 0.5745959554941932, + "grad_norm": 3.202736868542401, + "learning_rate": 2.020621511509528e-06, + "loss": 0.5183, + "step": 7075 + }, + { + "epoch": 0.5746771704702347, + "grad_norm": 5.842839385752603, + "learning_rate": 2.019976109073601e-06, + "loss": 0.477, + "step": 7076 + }, + { + "epoch": 0.5747583854462763, + "grad_norm": 12.847781405058441, + "learning_rate": 2.0193307398527865e-06, + "loss": 0.4842, + "step": 7077 + }, + { + "epoch": 0.5748396004223179, + "grad_norm": 21.13344792621275, + "learning_rate": 2.0186854038917405e-06, + "loss": 0.4538, + "step": 7078 + }, + { + "epoch": 0.5749208153983595, + "grad_norm": 10.50726009998217, + "learning_rate": 2.0180401012351182e-06, + "loss": 0.4578, + "step": 7079 + }, + { + "epoch": 0.575002030374401, + "grad_norm": 5.987355841136969, + "learning_rate": 2.0173948319275696e-06, + "loss": 0.5223, + "step": 7080 + }, + { + "epoch": 0.5750832453504426, + "grad_norm": 5.722234729191682, + "learning_rate": 2.016749596013744e-06, + "loss": 0.4965, + "step": 7081 + }, + { + "epoch": 0.5751644603264842, + "grad_norm": 4.664393900667678, + "learning_rate": 2.0161043935382897e-06, + "loss": 0.4677, + "step": 7082 + }, + { + "epoch": 0.5752456753025258, + "grad_norm": 7.498350619883631, + "learning_rate": 2.0154592245458504e-06, + "loss": 0.5619, + "step": 7083 + }, + { + "epoch": 0.5753268902785673, + "grad_norm": 5.775102597864225, + "learning_rate": 2.014814089081067e-06, + "loss": 0.5719, + "step": 7084 + }, + { + "epoch": 0.575408105254609, + "grad_norm": 5.1155695309255025, + "learning_rate": 2.014168987188582e-06, + "loss": 0.7247, + "step": 7085 + }, + { + "epoch": 0.5754893202306506, + "grad_norm": 4.2278332646738175, + "learning_rate": 2.0135239189130325e-06, + "loss": 0.6624, + "step": 7086 + }, + { + "epoch": 0.5755705352066921, + "grad_norm": 3.7678111182822334, + "learning_rate": 2.0128788842990516e-06, + "loss": 0.7632, + "step": 7087 + }, + { + "epoch": 0.5756517501827337, + "grad_norm": 11.009075470970467, + "learning_rate": 2.0122338833912743e-06, + "loss": 0.4833, + "step": 7088 + }, + { + "epoch": 0.5757329651587753, + "grad_norm": 4.5002280122486535, + "learning_rate": 2.0115889162343316e-06, + "loss": 0.4961, + "step": 7089 + }, + { + "epoch": 0.5758141801348169, + "grad_norm": 4.760191730901538, + "learning_rate": 2.01094398287285e-06, + "loss": 0.475, + "step": 7090 + }, + { + "epoch": 0.5758953951108584, + "grad_norm": 3.118348188458208, + "learning_rate": 2.010299083351457e-06, + "loss": 0.353, + "step": 7091 + }, + { + "epoch": 0.5759766100869, + "grad_norm": 6.734823400251969, + "learning_rate": 2.009654217714776e-06, + "loss": 0.5105, + "step": 7092 + }, + { + "epoch": 0.5760578250629416, + "grad_norm": 5.003848469090975, + "learning_rate": 2.0090093860074273e-06, + "loss": 0.5659, + "step": 7093 + }, + { + "epoch": 0.5761390400389832, + "grad_norm": 3.690892599491099, + "learning_rate": 2.008364588274031e-06, + "loss": 0.5335, + "step": 7094 + }, + { + "epoch": 0.5762202550150247, + "grad_norm": 5.097697074216427, + "learning_rate": 2.0077198245592033e-06, + "loss": 0.5175, + "step": 7095 + }, + { + "epoch": 0.5763014699910664, + "grad_norm": 3.9972301907638133, + "learning_rate": 2.0070750949075584e-06, + "loss": 0.5674, + "step": 7096 + }, + { + "epoch": 0.576382684967108, + "grad_norm": 3.765506245804216, + "learning_rate": 2.0064303993637073e-06, + "loss": 0.4085, + "step": 7097 + }, + { + "epoch": 0.5764638999431495, + "grad_norm": 5.010605186017302, + "learning_rate": 2.005785737972262e-06, + "loss": 0.5775, + "step": 7098 + }, + { + "epoch": 0.5765451149191911, + "grad_norm": 3.786228217501147, + "learning_rate": 2.0051411107778273e-06, + "loss": 0.5185, + "step": 7099 + }, + { + "epoch": 0.5766263298952327, + "grad_norm": 9.0650925529505, + "learning_rate": 2.004496517825008e-06, + "loss": 0.4735, + "step": 7100 + }, + { + "epoch": 0.5767075448712743, + "grad_norm": 5.189793367646856, + "learning_rate": 2.0038519591584078e-06, + "loss": 0.5995, + "step": 7101 + }, + { + "epoch": 0.5767887598473158, + "grad_norm": 4.700410500709092, + "learning_rate": 2.0032074348226268e-06, + "loss": 0.5761, + "step": 7102 + }, + { + "epoch": 0.5768699748233574, + "grad_norm": 4.234749790705928, + "learning_rate": 2.002562944862261e-06, + "loss": 0.5744, + "step": 7103 + }, + { + "epoch": 0.576951189799399, + "grad_norm": 3.629545684362975, + "learning_rate": 2.0019184893219076e-06, + "loss": 0.5361, + "step": 7104 + }, + { + "epoch": 0.5770324047754406, + "grad_norm": 5.227066105381711, + "learning_rate": 2.0012740682461585e-06, + "loss": 0.424, + "step": 7105 + }, + { + "epoch": 0.5771136197514821, + "grad_norm": 8.03973120733085, + "learning_rate": 2.0006296816796037e-06, + "loss": 0.4179, + "step": 7106 + }, + { + "epoch": 0.5771948347275238, + "grad_norm": 3.3013104736748575, + "learning_rate": 1.9999853296668326e-06, + "loss": 0.657, + "step": 7107 + }, + { + "epoch": 0.5772760497035654, + "grad_norm": 4.319311268255759, + "learning_rate": 1.999341012252431e-06, + "loss": 0.4993, + "step": 7108 + }, + { + "epoch": 0.5773572646796069, + "grad_norm": 3.6319991209865945, + "learning_rate": 1.9986967294809804e-06, + "loss": 0.474, + "step": 7109 + }, + { + "epoch": 0.5774384796556485, + "grad_norm": 4.960374042694389, + "learning_rate": 1.9980524813970635e-06, + "loss": 0.5666, + "step": 7110 + }, + { + "epoch": 0.5775196946316901, + "grad_norm": 2.9962262052676314, + "learning_rate": 1.997408268045259e-06, + "loss": 0.6228, + "step": 7111 + }, + { + "epoch": 0.5776009096077317, + "grad_norm": 5.668495286840983, + "learning_rate": 1.9967640894701424e-06, + "loss": 0.5147, + "step": 7112 + }, + { + "epoch": 0.5776821245837732, + "grad_norm": 4.163547326343956, + "learning_rate": 1.9961199457162867e-06, + "loss": 0.4194, + "step": 7113 + }, + { + "epoch": 0.5777633395598148, + "grad_norm": 3.4932620061985555, + "learning_rate": 1.995475836828264e-06, + "loss": 0.5275, + "step": 7114 + }, + { + "epoch": 0.5778445545358564, + "grad_norm": 4.368032739037839, + "learning_rate": 1.9948317628506444e-06, + "loss": 0.4277, + "step": 7115 + }, + { + "epoch": 0.577925769511898, + "grad_norm": 3.876730464227753, + "learning_rate": 1.994187723827992e-06, + "loss": 0.4987, + "step": 7116 + }, + { + "epoch": 0.5780069844879395, + "grad_norm": 3.2062830745638506, + "learning_rate": 1.9935437198048722e-06, + "loss": 0.5308, + "step": 7117 + }, + { + "epoch": 0.5780881994639812, + "grad_norm": 7.356711618544573, + "learning_rate": 1.9928997508258475e-06, + "loss": 0.464, + "step": 7118 + }, + { + "epoch": 0.5781694144400228, + "grad_norm": 5.484693237704757, + "learning_rate": 1.9922558169354752e-06, + "loss": 0.4484, + "step": 7119 + }, + { + "epoch": 0.5782506294160643, + "grad_norm": 6.681361631875744, + "learning_rate": 1.9916119181783135e-06, + "loss": 0.4322, + "step": 7120 + }, + { + "epoch": 0.5783318443921059, + "grad_norm": 4.438508741670482, + "learning_rate": 1.9909680545989175e-06, + "loss": 0.567, + "step": 7121 + }, + { + "epoch": 0.5784130593681475, + "grad_norm": 5.678949037298267, + "learning_rate": 1.9903242262418366e-06, + "loss": 0.5764, + "step": 7122 + }, + { + "epoch": 0.5784942743441891, + "grad_norm": 4.194676378904089, + "learning_rate": 1.989680433151622e-06, + "loss": 0.7083, + "step": 7123 + }, + { + "epoch": 0.5785754893202306, + "grad_norm": 8.002991142166685, + "learning_rate": 1.989036675372822e-06, + "loss": 0.5588, + "step": 7124 + }, + { + "epoch": 0.5786567042962723, + "grad_norm": 3.764738047010275, + "learning_rate": 1.988392952949978e-06, + "loss": 0.5561, + "step": 7125 + }, + { + "epoch": 0.5787379192723138, + "grad_norm": 7.746991321514468, + "learning_rate": 1.9877492659276353e-06, + "loss": 0.473, + "step": 7126 + }, + { + "epoch": 0.5788191342483554, + "grad_norm": 5.541041864204196, + "learning_rate": 1.9871056143503322e-06, + "loss": 0.417, + "step": 7127 + }, + { + "epoch": 0.5789003492243969, + "grad_norm": 3.424180901190808, + "learning_rate": 1.9864619982626064e-06, + "loss": 0.743, + "step": 7128 + }, + { + "epoch": 0.5789815642004386, + "grad_norm": 16.31331645341726, + "learning_rate": 1.9858184177089915e-06, + "loss": 0.6825, + "step": 7129 + }, + { + "epoch": 0.5790627791764802, + "grad_norm": 3.593693975763408, + "learning_rate": 1.9851748727340214e-06, + "loss": 0.5404, + "step": 7130 + }, + { + "epoch": 0.5791439941525217, + "grad_norm": 6.678160792934489, + "learning_rate": 1.9845313633822255e-06, + "loss": 0.5591, + "step": 7131 + }, + { + "epoch": 0.5792252091285633, + "grad_norm": 4.456993833787158, + "learning_rate": 1.9838878896981303e-06, + "loss": 0.5188, + "step": 7132 + }, + { + "epoch": 0.5793064241046049, + "grad_norm": 4.230178294928199, + "learning_rate": 1.9832444517262625e-06, + "loss": 0.3748, + "step": 7133 + }, + { + "epoch": 0.5793876390806465, + "grad_norm": 4.218687976196768, + "learning_rate": 1.982601049511144e-06, + "loss": 0.4318, + "step": 7134 + }, + { + "epoch": 0.579468854056688, + "grad_norm": 3.9051525132078386, + "learning_rate": 1.9819576830972938e-06, + "loss": 0.5105, + "step": 7135 + }, + { + "epoch": 0.5795500690327297, + "grad_norm": 5.953375128637914, + "learning_rate": 1.9813143525292304e-06, + "loss": 0.5147, + "step": 7136 + }, + { + "epoch": 0.5796312840087712, + "grad_norm": 5.299949856256123, + "learning_rate": 1.980671057851469e-06, + "loss": 0.5801, + "step": 7137 + }, + { + "epoch": 0.5797124989848128, + "grad_norm": 5.575356667344601, + "learning_rate": 1.9800277991085217e-06, + "loss": 0.484, + "step": 7138 + }, + { + "epoch": 0.5797937139608543, + "grad_norm": 4.75965558776345, + "learning_rate": 1.9793845763448987e-06, + "loss": 0.3816, + "step": 7139 + }, + { + "epoch": 0.579874928936896, + "grad_norm": 3.335625778162441, + "learning_rate": 1.9787413896051084e-06, + "loss": 0.4796, + "step": 7140 + }, + { + "epoch": 0.5799561439129376, + "grad_norm": 4.108846663125449, + "learning_rate": 1.978098238933654e-06, + "loss": 0.4803, + "step": 7141 + }, + { + "epoch": 0.5800373588889791, + "grad_norm": 4.357715753076788, + "learning_rate": 1.9774551243750403e-06, + "loss": 0.3848, + "step": 7142 + }, + { + "epoch": 0.5801185738650207, + "grad_norm": 4.9420482058322435, + "learning_rate": 1.9768120459737663e-06, + "loss": 0.4834, + "step": 7143 + }, + { + "epoch": 0.5801997888410623, + "grad_norm": 3.610962949080674, + "learning_rate": 1.9761690037743293e-06, + "loss": 0.4072, + "step": 7144 + }, + { + "epoch": 0.5802810038171039, + "grad_norm": 4.466463892907367, + "learning_rate": 1.9755259978212253e-06, + "loss": 0.5826, + "step": 7145 + }, + { + "epoch": 0.5803622187931454, + "grad_norm": 5.2380939327824345, + "learning_rate": 1.9748830281589464e-06, + "loss": 0.6036, + "step": 7146 + }, + { + "epoch": 0.5804434337691871, + "grad_norm": 3.3553525016104704, + "learning_rate": 1.9742400948319838e-06, + "loss": 0.6842, + "step": 7147 + }, + { + "epoch": 0.5805246487452286, + "grad_norm": 5.736174896340501, + "learning_rate": 1.9735971978848224e-06, + "loss": 0.5493, + "step": 7148 + }, + { + "epoch": 0.5806058637212702, + "grad_norm": 3.703679606497901, + "learning_rate": 1.9729543373619497e-06, + "loss": 0.4979, + "step": 7149 + }, + { + "epoch": 0.5806870786973117, + "grad_norm": 9.728903722310404, + "learning_rate": 1.972311513307848e-06, + "loss": 0.45, + "step": 7150 + }, + { + "epoch": 0.5807682936733534, + "grad_norm": 5.187888574328153, + "learning_rate": 1.971668725766996e-06, + "loss": 0.7753, + "step": 7151 + }, + { + "epoch": 0.580849508649395, + "grad_norm": 3.3139229056956396, + "learning_rate": 1.971025974783872e-06, + "loss": 0.675, + "step": 7152 + }, + { + "epoch": 0.5809307236254365, + "grad_norm": 9.118516211139692, + "learning_rate": 1.9703832604029523e-06, + "loss": 0.5264, + "step": 7153 + }, + { + "epoch": 0.5810119386014782, + "grad_norm": 4.415066688147641, + "learning_rate": 1.9697405826687063e-06, + "loss": 0.5034, + "step": 7154 + }, + { + "epoch": 0.5810931535775197, + "grad_norm": 5.533151171983695, + "learning_rate": 1.9690979416256062e-06, + "loss": 0.5316, + "step": 7155 + }, + { + "epoch": 0.5811743685535613, + "grad_norm": 3.4203284332256274, + "learning_rate": 1.9684553373181197e-06, + "loss": 0.4886, + "step": 7156 + }, + { + "epoch": 0.5812555835296028, + "grad_norm": 7.5906819086747435, + "learning_rate": 1.967812769790709e-06, + "loss": 0.5435, + "step": 7157 + }, + { + "epoch": 0.5813367985056445, + "grad_norm": 5.6092591367647815, + "learning_rate": 1.9671702390878396e-06, + "loss": 0.4718, + "step": 7158 + }, + { + "epoch": 0.581418013481686, + "grad_norm": 4.25266168413679, + "learning_rate": 1.9665277452539696e-06, + "loss": 0.5233, + "step": 7159 + }, + { + "epoch": 0.5814992284577276, + "grad_norm": 3.843541994359124, + "learning_rate": 1.965885288333555e-06, + "loss": 0.4787, + "step": 7160 + }, + { + "epoch": 0.5815804434337691, + "grad_norm": 4.549147478757539, + "learning_rate": 1.965242868371053e-06, + "loss": 0.6732, + "step": 7161 + }, + { + "epoch": 0.5816616584098108, + "grad_norm": 7.3370587404827345, + "learning_rate": 1.9646004854109136e-06, + "loss": 0.3952, + "step": 7162 + }, + { + "epoch": 0.5817428733858524, + "grad_norm": 4.832489166222658, + "learning_rate": 1.963958139497588e-06, + "loss": 0.5612, + "step": 7163 + }, + { + "epoch": 0.5818240883618939, + "grad_norm": 4.612979610999288, + "learning_rate": 1.9633158306755206e-06, + "loss": 0.4501, + "step": 7164 + }, + { + "epoch": 0.5819053033379356, + "grad_norm": 3.842889295187542, + "learning_rate": 1.962673558989158e-06, + "loss": 0.479, + "step": 7165 + }, + { + "epoch": 0.5819865183139771, + "grad_norm": 3.615074737738207, + "learning_rate": 1.9620313244829423e-06, + "loss": 0.5962, + "step": 7166 + }, + { + "epoch": 0.5820677332900187, + "grad_norm": 8.385548343766278, + "learning_rate": 1.961389127201311e-06, + "loss": 0.5366, + "step": 7167 + }, + { + "epoch": 0.5821489482660602, + "grad_norm": 4.0857765219914945, + "learning_rate": 1.9607469671887015e-06, + "loss": 0.3576, + "step": 7168 + }, + { + "epoch": 0.5822301632421019, + "grad_norm": 4.7701105433833995, + "learning_rate": 1.960104844489548e-06, + "loss": 0.5, + "step": 7169 + }, + { + "epoch": 0.5823113782181434, + "grad_norm": 4.079111419609443, + "learning_rate": 1.9594627591482817e-06, + "loss": 0.6516, + "step": 7170 + }, + { + "epoch": 0.582392593194185, + "grad_norm": 5.075085483225634, + "learning_rate": 1.9588207112093324e-06, + "loss": 0.5357, + "step": 7171 + }, + { + "epoch": 0.5824738081702265, + "grad_norm": 3.9201086563815624, + "learning_rate": 1.958178700717125e-06, + "loss": 0.4631, + "step": 7172 + }, + { + "epoch": 0.5825550231462682, + "grad_norm": 4.27616202832829, + "learning_rate": 1.957536727716084e-06, + "loss": 0.4648, + "step": 7173 + }, + { + "epoch": 0.5826362381223098, + "grad_norm": 4.882524062302524, + "learning_rate": 1.956894792250631e-06, + "loss": 0.5446, + "step": 7174 + }, + { + "epoch": 0.5827174530983513, + "grad_norm": 3.8190132907051435, + "learning_rate": 1.9562528943651837e-06, + "loss": 0.54, + "step": 7175 + }, + { + "epoch": 0.582798668074393, + "grad_norm": 6.140657105349779, + "learning_rate": 1.955611034104158e-06, + "loss": 0.4492, + "step": 7176 + }, + { + "epoch": 0.5828798830504345, + "grad_norm": 4.341900527053686, + "learning_rate": 1.9549692115119685e-06, + "loss": 0.5207, + "step": 7177 + }, + { + "epoch": 0.5829610980264761, + "grad_norm": 3.7483344786953685, + "learning_rate": 1.9543274266330244e-06, + "loss": 0.4006, + "step": 7178 + }, + { + "epoch": 0.5830423130025176, + "grad_norm": 3.3504281004368854, + "learning_rate": 1.9536856795117344e-06, + "loss": 0.5293, + "step": 7179 + }, + { + "epoch": 0.5831235279785593, + "grad_norm": 6.428753807471014, + "learning_rate": 1.9530439701925046e-06, + "loss": 0.6898, + "step": 7180 + }, + { + "epoch": 0.5832047429546008, + "grad_norm": 9.364262509671228, + "learning_rate": 1.952402298719737e-06, + "loss": 0.3292, + "step": 7181 + }, + { + "epoch": 0.5832859579306424, + "grad_norm": 3.942276858330493, + "learning_rate": 1.951760665137832e-06, + "loss": 0.4671, + "step": 7182 + }, + { + "epoch": 0.5833671729066839, + "grad_norm": 6.118096625229616, + "learning_rate": 1.9511190694911875e-06, + "loss": 0.4731, + "step": 7183 + }, + { + "epoch": 0.5834483878827256, + "grad_norm": 4.555831582312394, + "learning_rate": 1.9504775118241987e-06, + "loss": 0.4638, + "step": 7184 + }, + { + "epoch": 0.5835296028587672, + "grad_norm": 3.4499740673047214, + "learning_rate": 1.9498359921812583e-06, + "loss": 0.6663, + "step": 7185 + }, + { + "epoch": 0.5836108178348087, + "grad_norm": 6.8178554531301705, + "learning_rate": 1.9491945106067544e-06, + "loss": 0.4266, + "step": 7186 + }, + { + "epoch": 0.5836920328108504, + "grad_norm": 4.578346378430701, + "learning_rate": 1.948553067145076e-06, + "loss": 0.5886, + "step": 7187 + }, + { + "epoch": 0.5837732477868919, + "grad_norm": 3.0536821725267718, + "learning_rate": 1.947911661840607e-06, + "loss": 0.3962, + "step": 7188 + }, + { + "epoch": 0.5838544627629335, + "grad_norm": 10.782933310564555, + "learning_rate": 1.947270294737728e-06, + "loss": 0.3495, + "step": 7189 + }, + { + "epoch": 0.583935677738975, + "grad_norm": 4.129900561498041, + "learning_rate": 1.9466289658808207e-06, + "loss": 0.4919, + "step": 7190 + }, + { + "epoch": 0.5840168927150167, + "grad_norm": 6.249402586479176, + "learning_rate": 1.9459876753142593e-06, + "loss": 0.541, + "step": 7191 + }, + { + "epoch": 0.5840981076910582, + "grad_norm": 2.926784134616971, + "learning_rate": 1.9453464230824186e-06, + "loss": 0.4936, + "step": 7192 + }, + { + "epoch": 0.5841793226670998, + "grad_norm": 3.6484600396561433, + "learning_rate": 1.9447052092296712e-06, + "loss": 0.5192, + "step": 7193 + }, + { + "epoch": 0.5842605376431413, + "grad_norm": 3.898125434059147, + "learning_rate": 1.9440640338003835e-06, + "loss": 0.4092, + "step": 7194 + }, + { + "epoch": 0.584341752619183, + "grad_norm": 3.683502965273959, + "learning_rate": 1.943422896838922e-06, + "loss": 0.5507, + "step": 7195 + }, + { + "epoch": 0.5844229675952246, + "grad_norm": 5.288308403487764, + "learning_rate": 1.9427817983896518e-06, + "loss": 0.4068, + "step": 7196 + }, + { + "epoch": 0.5845041825712661, + "grad_norm": 5.445064431182992, + "learning_rate": 1.942140738496931e-06, + "loss": 0.7837, + "step": 7197 + }, + { + "epoch": 0.5845853975473078, + "grad_norm": 6.058709142095907, + "learning_rate": 1.9414997172051184e-06, + "loss": 0.3739, + "step": 7198 + }, + { + "epoch": 0.5846666125233493, + "grad_norm": 3.970322164486651, + "learning_rate": 1.9408587345585707e-06, + "loss": 0.5194, + "step": 7199 + }, + { + "epoch": 0.5847478274993909, + "grad_norm": 5.3016872496159495, + "learning_rate": 1.9402177906016395e-06, + "loss": 0.4401, + "step": 7200 + }, + { + "epoch": 0.5848290424754324, + "grad_norm": 4.775951466036485, + "learning_rate": 1.939576885378674e-06, + "loss": 0.398, + "step": 7201 + }, + { + "epoch": 0.5849102574514741, + "grad_norm": 6.899462009323301, + "learning_rate": 1.9389360189340213e-06, + "loss": 0.5067, + "step": 7202 + }, + { + "epoch": 0.5849914724275156, + "grad_norm": 3.2063799922509464, + "learning_rate": 1.9382951913120276e-06, + "loss": 0.5829, + "step": 7203 + }, + { + "epoch": 0.5850726874035572, + "grad_norm": 9.321053496169446, + "learning_rate": 1.937654402557034e-06, + "loss": 0.4999, + "step": 7204 + }, + { + "epoch": 0.5851539023795987, + "grad_norm": 5.677431258954087, + "learning_rate": 1.937013652713378e-06, + "loss": 0.2643, + "step": 7205 + }, + { + "epoch": 0.5852351173556404, + "grad_norm": 3.3598751265364446, + "learning_rate": 1.9363729418253995e-06, + "loss": 0.7468, + "step": 7206 + }, + { + "epoch": 0.585316332331682, + "grad_norm": 4.156902235894881, + "learning_rate": 1.93573226993743e-06, + "loss": 0.5862, + "step": 7207 + }, + { + "epoch": 0.5853975473077235, + "grad_norm": 4.5056414028942084, + "learning_rate": 1.9350916370938004e-06, + "loss": 0.6588, + "step": 7208 + }, + { + "epoch": 0.5854787622837652, + "grad_norm": 5.759552128771596, + "learning_rate": 1.9344510433388405e-06, + "loss": 0.8105, + "step": 7209 + }, + { + "epoch": 0.5855599772598067, + "grad_norm": 4.3224511149017255, + "learning_rate": 1.9338104887168753e-06, + "loss": 0.5087, + "step": 7210 + }, + { + "epoch": 0.5856411922358483, + "grad_norm": 4.7674728901987455, + "learning_rate": 1.933169973272227e-06, + "loss": 0.4611, + "step": 7211 + }, + { + "epoch": 0.5857224072118898, + "grad_norm": 5.687185959055942, + "learning_rate": 1.932529497049217e-06, + "loss": 0.5603, + "step": 7212 + }, + { + "epoch": 0.5858036221879315, + "grad_norm": 6.09715437191108, + "learning_rate": 1.9318890600921638e-06, + "loss": 0.6074, + "step": 7213 + }, + { + "epoch": 0.585884837163973, + "grad_norm": 5.020060984695505, + "learning_rate": 1.9312486624453783e-06, + "loss": 0.6328, + "step": 7214 + }, + { + "epoch": 0.5859660521400146, + "grad_norm": 4.477953926588951, + "learning_rate": 1.9306083041531773e-06, + "loss": 0.5313, + "step": 7215 + }, + { + "epoch": 0.5860472671160561, + "grad_norm": 3.519455685248619, + "learning_rate": 1.9299679852598684e-06, + "loss": 0.6649, + "step": 7216 + }, + { + "epoch": 0.5861284820920978, + "grad_norm": 3.6704238824484325, + "learning_rate": 1.929327705809757e-06, + "loss": 0.4758, + "step": 7217 + }, + { + "epoch": 0.5862096970681394, + "grad_norm": 6.283905565847572, + "learning_rate": 1.928687465847148e-06, + "loss": 0.4217, + "step": 7218 + }, + { + "epoch": 0.5862909120441809, + "grad_norm": 6.945906876806998, + "learning_rate": 1.9280472654163436e-06, + "loss": 0.4945, + "step": 7219 + }, + { + "epoch": 0.5863721270202226, + "grad_norm": 8.370884932771967, + "learning_rate": 1.927407104561641e-06, + "loss": 0.4238, + "step": 7220 + }, + { + "epoch": 0.5864533419962641, + "grad_norm": 4.383870501450878, + "learning_rate": 1.926766983327336e-06, + "loss": 0.5634, + "step": 7221 + }, + { + "epoch": 0.5865345569723057, + "grad_norm": 9.241632141950406, + "learning_rate": 1.9261269017577228e-06, + "loss": 0.634, + "step": 7222 + }, + { + "epoch": 0.5866157719483472, + "grad_norm": 3.7586621345897853, + "learning_rate": 1.9254868598970904e-06, + "loss": 0.5618, + "step": 7223 + }, + { + "epoch": 0.5866969869243889, + "grad_norm": 4.209074416276335, + "learning_rate": 1.924846857789726e-06, + "loss": 0.6119, + "step": 7224 + }, + { + "epoch": 0.5867782019004304, + "grad_norm": 4.546889702288765, + "learning_rate": 1.924206895479916e-06, + "loss": 0.4853, + "step": 7225 + }, + { + "epoch": 0.586859416876472, + "grad_norm": 3.7823061514718423, + "learning_rate": 1.9235669730119415e-06, + "loss": 0.3448, + "step": 7226 + }, + { + "epoch": 0.5869406318525136, + "grad_norm": 4.256499430588995, + "learning_rate": 1.922927090430081e-06, + "loss": 0.5403, + "step": 7227 + }, + { + "epoch": 0.5870218468285552, + "grad_norm": 44.81613726842311, + "learning_rate": 1.9222872477786124e-06, + "loss": 0.4421, + "step": 7228 + }, + { + "epoch": 0.5871030618045968, + "grad_norm": 4.765715400723174, + "learning_rate": 1.921647445101809e-06, + "loss": 0.4627, + "step": 7229 + }, + { + "epoch": 0.5871842767806383, + "grad_norm": 20.740588315180986, + "learning_rate": 1.921007682443941e-06, + "loss": 0.4778, + "step": 7230 + }, + { + "epoch": 0.58726549175668, + "grad_norm": 5.271002543096211, + "learning_rate": 1.920367959849277e-06, + "loss": 0.4699, + "step": 7231 + }, + { + "epoch": 0.5873467067327215, + "grad_norm": 4.633497702253883, + "learning_rate": 1.919728277362083e-06, + "loss": 0.6411, + "step": 7232 + }, + { + "epoch": 0.5874279217087631, + "grad_norm": 2.739394598878589, + "learning_rate": 1.91908863502662e-06, + "loss": 0.3667, + "step": 7233 + }, + { + "epoch": 0.5875091366848046, + "grad_norm": 5.224714666393266, + "learning_rate": 1.9184490328871502e-06, + "loss": 0.5021, + "step": 7234 + }, + { + "epoch": 0.5875903516608463, + "grad_norm": 7.019642199719357, + "learning_rate": 1.9178094709879296e-06, + "loss": 0.4818, + "step": 7235 + }, + { + "epoch": 0.5876715666368878, + "grad_norm": 5.128939865748304, + "learning_rate": 1.9171699493732122e-06, + "loss": 0.455, + "step": 7236 + }, + { + "epoch": 0.5877527816129294, + "grad_norm": 7.227653257937603, + "learning_rate": 1.916530468087249e-06, + "loss": 0.4862, + "step": 7237 + }, + { + "epoch": 0.587833996588971, + "grad_norm": 5.527188153857498, + "learning_rate": 1.9158910271742905e-06, + "loss": 0.5265, + "step": 7238 + }, + { + "epoch": 0.5879152115650126, + "grad_norm": 2.6827965014779593, + "learning_rate": 1.9152516266785807e-06, + "loss": 0.5964, + "step": 7239 + }, + { + "epoch": 0.5879964265410542, + "grad_norm": 5.211112335596169, + "learning_rate": 1.9146122666443635e-06, + "loss": 0.5542, + "step": 7240 + }, + { + "epoch": 0.5880776415170957, + "grad_norm": 4.03076289953267, + "learning_rate": 1.91397294711588e-06, + "loss": 0.4605, + "step": 7241 + }, + { + "epoch": 0.5881588564931374, + "grad_norm": 5.996591886378709, + "learning_rate": 1.9133336681373673e-06, + "loss": 0.5431, + "step": 7242 + }, + { + "epoch": 0.5882400714691789, + "grad_norm": 4.7750369105364845, + "learning_rate": 1.912694429753059e-06, + "loss": 0.4664, + "step": 7243 + }, + { + "epoch": 0.5883212864452205, + "grad_norm": 4.118454697868038, + "learning_rate": 1.912055232007188e-06, + "loss": 0.4708, + "step": 7244 + }, + { + "epoch": 0.588402501421262, + "grad_norm": 5.287731670886094, + "learning_rate": 1.911416074943984e-06, + "loss": 0.4099, + "step": 7245 + }, + { + "epoch": 0.5884837163973037, + "grad_norm": 3.463709326164506, + "learning_rate": 1.9107769586076716e-06, + "loss": 0.4555, + "step": 7246 + }, + { + "epoch": 0.5885649313733452, + "grad_norm": 3.9158367444108855, + "learning_rate": 1.9101378830424758e-06, + "loss": 0.5739, + "step": 7247 + }, + { + "epoch": 0.5886461463493868, + "grad_norm": 4.558330070983507, + "learning_rate": 1.909498848292617e-06, + "loss": 0.561, + "step": 7248 + }, + { + "epoch": 0.5887273613254284, + "grad_norm": 5.495600846510651, + "learning_rate": 1.9088598544023118e-06, + "loss": 0.3776, + "step": 7249 + }, + { + "epoch": 0.58880857630147, + "grad_norm": 4.632815620841959, + "learning_rate": 1.908220901415777e-06, + "loss": 0.4483, + "step": 7250 + }, + { + "epoch": 0.5888897912775116, + "grad_norm": 4.978619608664077, + "learning_rate": 1.907581989377224e-06, + "loss": 0.5724, + "step": 7251 + }, + { + "epoch": 0.5889710062535531, + "grad_norm": 5.346540069308749, + "learning_rate": 1.9069431183308615e-06, + "loss": 0.5294, + "step": 7252 + }, + { + "epoch": 0.5890522212295948, + "grad_norm": 4.59400323772315, + "learning_rate": 1.906304288320896e-06, + "loss": 0.6616, + "step": 7253 + }, + { + "epoch": 0.5891334362056363, + "grad_norm": 6.3719239881505665, + "learning_rate": 1.9056654993915326e-06, + "loss": 0.6185, + "step": 7254 + }, + { + "epoch": 0.5892146511816779, + "grad_norm": 6.682687810031172, + "learning_rate": 1.9050267515869709e-06, + "loss": 0.4781, + "step": 7255 + }, + { + "epoch": 0.5892958661577195, + "grad_norm": 3.7501427973660397, + "learning_rate": 1.9043880449514085e-06, + "loss": 0.8261, + "step": 7256 + }, + { + "epoch": 0.5893770811337611, + "grad_norm": 5.558288691226038, + "learning_rate": 1.9037493795290421e-06, + "loss": 0.4972, + "step": 7257 + }, + { + "epoch": 0.5894582961098026, + "grad_norm": 5.421642176266866, + "learning_rate": 1.9031107553640632e-06, + "loss": 0.4625, + "step": 7258 + }, + { + "epoch": 0.5895395110858442, + "grad_norm": 5.221069697025148, + "learning_rate": 1.9024721725006598e-06, + "loss": 0.3844, + "step": 7259 + }, + { + "epoch": 0.5896207260618858, + "grad_norm": 7.976877536523762, + "learning_rate": 1.9018336309830202e-06, + "loss": 0.3608, + "step": 7260 + }, + { + "epoch": 0.5897019410379274, + "grad_norm": 9.70876511343524, + "learning_rate": 1.9011951308553284e-06, + "loss": 0.5597, + "step": 7261 + }, + { + "epoch": 0.589783156013969, + "grad_norm": 4.2405529559362405, + "learning_rate": 1.900556672161763e-06, + "loss": 0.5158, + "step": 7262 + }, + { + "epoch": 0.5898643709900105, + "grad_norm": 6.1774490371345845, + "learning_rate": 1.899918254946504e-06, + "loss": 0.5368, + "step": 7263 + }, + { + "epoch": 0.5899455859660522, + "grad_norm": 5.486539007392727, + "learning_rate": 1.8992798792537265e-06, + "loss": 0.6, + "step": 7264 + }, + { + "epoch": 0.5900268009420937, + "grad_norm": 4.8768276862477995, + "learning_rate": 1.898641545127601e-06, + "loss": 0.3974, + "step": 7265 + }, + { + "epoch": 0.5901080159181353, + "grad_norm": 5.691046376635863, + "learning_rate": 1.8980032526122985e-06, + "loss": 0.5479, + "step": 7266 + }, + { + "epoch": 0.5901892308941769, + "grad_norm": 5.4416215817890325, + "learning_rate": 1.8973650017519855e-06, + "loss": 0.4471, + "step": 7267 + }, + { + "epoch": 0.5902704458702185, + "grad_norm": 3.7432268661179324, + "learning_rate": 1.8967267925908237e-06, + "loss": 0.5173, + "step": 7268 + }, + { + "epoch": 0.59035166084626, + "grad_norm": 8.040626967946537, + "learning_rate": 1.8960886251729756e-06, + "loss": 0.4184, + "step": 7269 + }, + { + "epoch": 0.5904328758223016, + "grad_norm": 6.080428319987492, + "learning_rate": 1.8954504995425994e-06, + "loss": 0.4146, + "step": 7270 + }, + { + "epoch": 0.5905140907983432, + "grad_norm": 4.115370064107381, + "learning_rate": 1.8948124157438485e-06, + "loss": 0.4314, + "step": 7271 + }, + { + "epoch": 0.5905953057743848, + "grad_norm": 4.654941886199049, + "learning_rate": 1.8941743738208752e-06, + "loss": 0.4831, + "step": 7272 + }, + { + "epoch": 0.5906765207504264, + "grad_norm": 6.074230738214155, + "learning_rate": 1.8935363738178288e-06, + "loss": 0.5219, + "step": 7273 + }, + { + "epoch": 0.590757735726468, + "grad_norm": 4.209033114032256, + "learning_rate": 1.8928984157788565e-06, + "loss": 0.6185, + "step": 7274 + }, + { + "epoch": 0.5908389507025096, + "grad_norm": 6.854702739452237, + "learning_rate": 1.8922604997480998e-06, + "loss": 0.5069, + "step": 7275 + }, + { + "epoch": 0.5909201656785511, + "grad_norm": 5.0281604588869975, + "learning_rate": 1.8916226257697004e-06, + "loss": 0.4144, + "step": 7276 + }, + { + "epoch": 0.5910013806545927, + "grad_norm": 3.904693540082544, + "learning_rate": 1.8909847938877962e-06, + "loss": 0.697, + "step": 7277 + }, + { + "epoch": 0.5910825956306343, + "grad_norm": 4.638587360552655, + "learning_rate": 1.89034700414652e-06, + "loss": 0.3997, + "step": 7278 + }, + { + "epoch": 0.5911638106066759, + "grad_norm": 4.955343175267275, + "learning_rate": 1.8897092565900048e-06, + "loss": 0.5474, + "step": 7279 + }, + { + "epoch": 0.5912450255827174, + "grad_norm": 4.586670514862452, + "learning_rate": 1.8890715512623802e-06, + "loss": 0.5885, + "step": 7280 + }, + { + "epoch": 0.591326240558759, + "grad_norm": 6.135861467307459, + "learning_rate": 1.8884338882077697e-06, + "loss": 0.4193, + "step": 7281 + }, + { + "epoch": 0.5914074555348006, + "grad_norm": 8.91363405049214, + "learning_rate": 1.8877962674702977e-06, + "loss": 0.4749, + "step": 7282 + }, + { + "epoch": 0.5914886705108422, + "grad_norm": 3.5396738186174317, + "learning_rate": 1.8871586890940847e-06, + "loss": 0.4059, + "step": 7283 + }, + { + "epoch": 0.5915698854868838, + "grad_norm": 4.623629413772576, + "learning_rate": 1.886521153123246e-06, + "loss": 0.4456, + "step": 7284 + }, + { + "epoch": 0.5916511004629254, + "grad_norm": 4.952672810545965, + "learning_rate": 1.8858836596018973e-06, + "loss": 0.4957, + "step": 7285 + }, + { + "epoch": 0.591732315438967, + "grad_norm": 4.741145039545014, + "learning_rate": 1.8852462085741497e-06, + "loss": 0.5374, + "step": 7286 + }, + { + "epoch": 0.5918135304150085, + "grad_norm": 8.418571766778543, + "learning_rate": 1.8846088000841096e-06, + "loss": 0.558, + "step": 7287 + }, + { + "epoch": 0.5918947453910501, + "grad_norm": 3.602706463619327, + "learning_rate": 1.8839714341758847e-06, + "loss": 0.5848, + "step": 7288 + }, + { + "epoch": 0.5919759603670917, + "grad_norm": 10.651397817035003, + "learning_rate": 1.883334110893576e-06, + "loss": 0.5071, + "step": 7289 + }, + { + "epoch": 0.5920571753431333, + "grad_norm": 4.361973439215471, + "learning_rate": 1.8826968302812837e-06, + "loss": 0.4555, + "step": 7290 + }, + { + "epoch": 0.5921383903191748, + "grad_norm": 6.18398302417084, + "learning_rate": 1.8820595923831025e-06, + "loss": 0.6062, + "step": 7291 + }, + { + "epoch": 0.5922196052952164, + "grad_norm": 8.248248546443998, + "learning_rate": 1.8814223972431276e-06, + "loss": 0.5944, + "step": 7292 + }, + { + "epoch": 0.592300820271258, + "grad_norm": 7.741858864310114, + "learning_rate": 1.8807852449054497e-06, + "loss": 0.5043, + "step": 7293 + }, + { + "epoch": 0.5923820352472996, + "grad_norm": 3.9704756953268565, + "learning_rate": 1.8801481354141547e-06, + "loss": 0.5163, + "step": 7294 + }, + { + "epoch": 0.5924632502233412, + "grad_norm": 4.432871358786943, + "learning_rate": 1.8795110688133283e-06, + "loss": 0.4582, + "step": 7295 + }, + { + "epoch": 0.5925444651993828, + "grad_norm": 4.035829611958314, + "learning_rate": 1.878874045147053e-06, + "loss": 0.4393, + "step": 7296 + }, + { + "epoch": 0.5926256801754244, + "grad_norm": 3.2780411870082284, + "learning_rate": 1.8782370644594055e-06, + "loss": 0.62, + "step": 7297 + }, + { + "epoch": 0.5927068951514659, + "grad_norm": 5.278700539156712, + "learning_rate": 1.8776001267944628e-06, + "loss": 0.5785, + "step": 7298 + }, + { + "epoch": 0.5927881101275075, + "grad_norm": 3.827532458925898, + "learning_rate": 1.876963232196298e-06, + "loss": 0.57, + "step": 7299 + }, + { + "epoch": 0.5928693251035491, + "grad_norm": 4.0256389154851595, + "learning_rate": 1.876326380708979e-06, + "loss": 0.609, + "step": 7300 + }, + { + "epoch": 0.5929505400795907, + "grad_norm": 8.391342343121048, + "learning_rate": 1.8756895723765747e-06, + "loss": 0.5095, + "step": 7301 + }, + { + "epoch": 0.5930317550556322, + "grad_norm": 3.698568278989539, + "learning_rate": 1.8750528072431477e-06, + "loss": 0.5275, + "step": 7302 + }, + { + "epoch": 0.5931129700316738, + "grad_norm": 7.336981984750687, + "learning_rate": 1.8744160853527579e-06, + "loss": 0.6133, + "step": 7303 + }, + { + "epoch": 0.5931941850077154, + "grad_norm": 4.166581863576458, + "learning_rate": 1.8737794067494656e-06, + "loss": 0.4626, + "step": 7304 + }, + { + "epoch": 0.593275399983757, + "grad_norm": 4.274646132254657, + "learning_rate": 1.8731427714773233e-06, + "loss": 0.5315, + "step": 7305 + }, + { + "epoch": 0.5933566149597986, + "grad_norm": 3.911879665279761, + "learning_rate": 1.8725061795803846e-06, + "loss": 0.4508, + "step": 7306 + }, + { + "epoch": 0.5934378299358402, + "grad_norm": 2.642462457610074, + "learning_rate": 1.8718696311026956e-06, + "loss": 0.5147, + "step": 7307 + }, + { + "epoch": 0.5935190449118818, + "grad_norm": 4.675230745750343, + "learning_rate": 1.871233126088305e-06, + "loss": 0.3801, + "step": 7308 + }, + { + "epoch": 0.5936002598879233, + "grad_norm": 10.255898660318655, + "learning_rate": 1.8705966645812544e-06, + "loss": 0.5329, + "step": 7309 + }, + { + "epoch": 0.5936814748639649, + "grad_norm": 4.954517136307957, + "learning_rate": 1.8699602466255828e-06, + "loss": 0.4131, + "step": 7310 + }, + { + "epoch": 0.5937626898400065, + "grad_norm": 4.751776132607896, + "learning_rate": 1.8693238722653278e-06, + "loss": 0.4266, + "step": 7311 + }, + { + "epoch": 0.5938439048160481, + "grad_norm": 3.047587348683506, + "learning_rate": 1.8686875415445238e-06, + "loss": 0.4889, + "step": 7312 + }, + { + "epoch": 0.5939251197920896, + "grad_norm": 12.990954504278774, + "learning_rate": 1.8680512545071999e-06, + "loss": 0.4731, + "step": 7313 + }, + { + "epoch": 0.5940063347681313, + "grad_norm": 4.070689985920183, + "learning_rate": 1.8674150111973854e-06, + "loss": 0.6631, + "step": 7314 + }, + { + "epoch": 0.5940875497441728, + "grad_norm": 2.864000413306198, + "learning_rate": 1.866778811659104e-06, + "loss": 0.8279, + "step": 7315 + }, + { + "epoch": 0.5941687647202144, + "grad_norm": 3.763015690605599, + "learning_rate": 1.8661426559363768e-06, + "loss": 0.6624, + "step": 7316 + }, + { + "epoch": 0.594249979696256, + "grad_norm": 6.349440373441788, + "learning_rate": 1.8655065440732243e-06, + "loss": 0.5768, + "step": 7317 + }, + { + "epoch": 0.5943311946722976, + "grad_norm": 3.739887750298035, + "learning_rate": 1.8648704761136604e-06, + "loss": 0.54, + "step": 7318 + }, + { + "epoch": 0.5944124096483392, + "grad_norm": 3.8465470522434977, + "learning_rate": 1.8642344521016974e-06, + "loss": 0.4265, + "step": 7319 + }, + { + "epoch": 0.5944936246243807, + "grad_norm": 5.442222857316663, + "learning_rate": 1.8635984720813471e-06, + "loss": 0.5427, + "step": 7320 + }, + { + "epoch": 0.5945748396004223, + "grad_norm": 5.036275208595901, + "learning_rate": 1.8629625360966137e-06, + "loss": 0.4966, + "step": 7321 + }, + { + "epoch": 0.5946560545764639, + "grad_norm": 4.325139913021066, + "learning_rate": 1.8623266441915006e-06, + "loss": 0.4192, + "step": 7322 + }, + { + "epoch": 0.5947372695525055, + "grad_norm": 3.7229174258197437, + "learning_rate": 1.86169079641001e-06, + "loss": 0.4411, + "step": 7323 + }, + { + "epoch": 0.594818484528547, + "grad_norm": 4.325040819420997, + "learning_rate": 1.861054992796138e-06, + "loss": 0.5971, + "step": 7324 + }, + { + "epoch": 0.5948996995045887, + "grad_norm": 5.429278035591401, + "learning_rate": 1.860419233393879e-06, + "loss": 0.5484, + "step": 7325 + }, + { + "epoch": 0.5949809144806302, + "grad_norm": 4.384346726547922, + "learning_rate": 1.859783518247223e-06, + "loss": 0.5676, + "step": 7326 + }, + { + "epoch": 0.5950621294566718, + "grad_norm": 7.391929682556335, + "learning_rate": 1.8591478474001601e-06, + "loss": 0.4677, + "step": 7327 + }, + { + "epoch": 0.5951433444327134, + "grad_norm": 5.680231306618248, + "learning_rate": 1.858512220896675e-06, + "loss": 0.562, + "step": 7328 + }, + { + "epoch": 0.595224559408755, + "grad_norm": 4.0720446605312395, + "learning_rate": 1.857876638780748e-06, + "loss": 0.4176, + "step": 7329 + }, + { + "epoch": 0.5953057743847966, + "grad_norm": 4.5916594217289575, + "learning_rate": 1.85724110109636e-06, + "loss": 0.5309, + "step": 7330 + }, + { + "epoch": 0.5953869893608381, + "grad_norm": 4.6780280469096365, + "learning_rate": 1.8566056078874858e-06, + "loss": 0.4923, + "step": 7331 + }, + { + "epoch": 0.5954682043368797, + "grad_norm": 3.6599980641941734, + "learning_rate": 1.8559701591980977e-06, + "loss": 0.5152, + "step": 7332 + }, + { + "epoch": 0.5955494193129213, + "grad_norm": 2.978800119941686, + "learning_rate": 1.8553347550721672e-06, + "loss": 0.4539, + "step": 7333 + }, + { + "epoch": 0.5956306342889629, + "grad_norm": 3.411250854476356, + "learning_rate": 1.8546993955536597e-06, + "loss": 0.64, + "step": 7334 + }, + { + "epoch": 0.5957118492650044, + "grad_norm": 4.069278334737661, + "learning_rate": 1.8540640806865379e-06, + "loss": 0.5401, + "step": 7335 + }, + { + "epoch": 0.595793064241046, + "grad_norm": 7.8976077553265505, + "learning_rate": 1.8534288105147644e-06, + "loss": 0.4868, + "step": 7336 + }, + { + "epoch": 0.5958742792170876, + "grad_norm": 3.08341050427021, + "learning_rate": 1.8527935850822947e-06, + "loss": 0.4693, + "step": 7337 + }, + { + "epoch": 0.5959554941931292, + "grad_norm": 4.0472231931258955, + "learning_rate": 1.8521584044330832e-06, + "loss": 0.5314, + "step": 7338 + }, + { + "epoch": 0.5960367091691708, + "grad_norm": 7.289070610294687, + "learning_rate": 1.851523268611082e-06, + "loss": 0.6015, + "step": 7339 + }, + { + "epoch": 0.5961179241452124, + "grad_norm": 5.48979102747638, + "learning_rate": 1.8508881776602386e-06, + "loss": 0.4459, + "step": 7340 + }, + { + "epoch": 0.596199139121254, + "grad_norm": 15.0231623389192, + "learning_rate": 1.850253131624497e-06, + "loss": 0.5274, + "step": 7341 + }, + { + "epoch": 0.5962803540972955, + "grad_norm": 4.569897243481455, + "learning_rate": 1.8496181305478014e-06, + "loss": 0.7543, + "step": 7342 + }, + { + "epoch": 0.5963615690733371, + "grad_norm": 5.818717995667628, + "learning_rate": 1.8489831744740887e-06, + "loss": 0.4809, + "step": 7343 + }, + { + "epoch": 0.5964427840493787, + "grad_norm": 7.4916301022433505, + "learning_rate": 1.8483482634472948e-06, + "loss": 0.4216, + "step": 7344 + }, + { + "epoch": 0.5965239990254203, + "grad_norm": 2.952985348960789, + "learning_rate": 1.8477133975113516e-06, + "loss": 0.4636, + "step": 7345 + }, + { + "epoch": 0.5966052140014618, + "grad_norm": 5.3497885917720875, + "learning_rate": 1.8470785767101898e-06, + "loss": 0.5137, + "step": 7346 + }, + { + "epoch": 0.5966864289775035, + "grad_norm": 4.056347500018793, + "learning_rate": 1.8464438010877348e-06, + "loss": 0.6904, + "step": 7347 + }, + { + "epoch": 0.596767643953545, + "grad_norm": 3.334261926906368, + "learning_rate": 1.845809070687909e-06, + "loss": 0.5172, + "step": 7348 + }, + { + "epoch": 0.5968488589295866, + "grad_norm": 8.367386875652794, + "learning_rate": 1.8451743855546345e-06, + "loss": 0.434, + "step": 7349 + }, + { + "epoch": 0.5969300739056282, + "grad_norm": 6.559459547573149, + "learning_rate": 1.8445397457318265e-06, + "loss": 0.5824, + "step": 7350 + }, + { + "epoch": 0.5970112888816698, + "grad_norm": 6.094611931889219, + "learning_rate": 1.8439051512633984e-06, + "loss": 0.5153, + "step": 7351 + }, + { + "epoch": 0.5970925038577114, + "grad_norm": 25.80816640442915, + "learning_rate": 1.8432706021932627e-06, + "loss": 0.6593, + "step": 7352 + }, + { + "epoch": 0.5971737188337529, + "grad_norm": 4.977546000712229, + "learning_rate": 1.8426360985653248e-06, + "loss": 0.6459, + "step": 7353 + }, + { + "epoch": 0.5972549338097946, + "grad_norm": 5.821123685581579, + "learning_rate": 1.8420016404234897e-06, + "loss": 0.5861, + "step": 7354 + }, + { + "epoch": 0.5973361487858361, + "grad_norm": 6.3086144237425525, + "learning_rate": 1.8413672278116595e-06, + "loss": 0.4389, + "step": 7355 + }, + { + "epoch": 0.5974173637618777, + "grad_norm": 5.044008936737754, + "learning_rate": 1.840732860773731e-06, + "loss": 0.4021, + "step": 7356 + }, + { + "epoch": 0.5974985787379192, + "grad_norm": 4.6734719943119005, + "learning_rate": 1.8400985393535986e-06, + "loss": 0.4824, + "step": 7357 + }, + { + "epoch": 0.5975797937139609, + "grad_norm": 3.603622687404737, + "learning_rate": 1.8394642635951563e-06, + "loss": 0.5207, + "step": 7358 + }, + { + "epoch": 0.5976610086900024, + "grad_norm": 3.7989831364132, + "learning_rate": 1.838830033542291e-06, + "loss": 0.6815, + "step": 7359 + }, + { + "epoch": 0.597742223666044, + "grad_norm": 2.3766163305666153, + "learning_rate": 1.8381958492388873e-06, + "loss": 0.4749, + "step": 7360 + }, + { + "epoch": 0.5978234386420856, + "grad_norm": 3.0966906185261993, + "learning_rate": 1.837561710728828e-06, + "loss": 0.5948, + "step": 7361 + }, + { + "epoch": 0.5979046536181272, + "grad_norm": 8.468628755593905, + "learning_rate": 1.8369276180559933e-06, + "loss": 0.3566, + "step": 7362 + }, + { + "epoch": 0.5979858685941688, + "grad_norm": 9.11735062017391, + "learning_rate": 1.836293571264258e-06, + "loss": 0.4737, + "step": 7363 + }, + { + "epoch": 0.5980670835702103, + "grad_norm": 5.041492074022004, + "learning_rate": 1.835659570397494e-06, + "loss": 0.4476, + "step": 7364 + }, + { + "epoch": 0.598148298546252, + "grad_norm": 3.99736329645453, + "learning_rate": 1.8350256154995733e-06, + "loss": 0.6365, + "step": 7365 + }, + { + "epoch": 0.5982295135222935, + "grad_norm": 8.810624359575181, + "learning_rate": 1.8343917066143597e-06, + "loss": 0.4232, + "step": 7366 + }, + { + "epoch": 0.5983107284983351, + "grad_norm": 4.925256449200388, + "learning_rate": 1.8337578437857169e-06, + "loss": 0.4499, + "step": 7367 + }, + { + "epoch": 0.5983919434743766, + "grad_norm": 3.5049195280397565, + "learning_rate": 1.8331240270575062e-06, + "loss": 0.6362, + "step": 7368 + }, + { + "epoch": 0.5984731584504183, + "grad_norm": 8.10123639728082, + "learning_rate": 1.8324902564735834e-06, + "loss": 0.4814, + "step": 7369 + }, + { + "epoch": 0.5985543734264598, + "grad_norm": 3.656092093713114, + "learning_rate": 1.831856532077801e-06, + "loss": 0.6061, + "step": 7370 + }, + { + "epoch": 0.5986355884025014, + "grad_norm": 5.372488168989093, + "learning_rate": 1.831222853914012e-06, + "loss": 0.5401, + "step": 7371 + }, + { + "epoch": 0.598716803378543, + "grad_norm": 5.279583180258407, + "learning_rate": 1.830589222026062e-06, + "loss": 0.4336, + "step": 7372 + }, + { + "epoch": 0.5987980183545846, + "grad_norm": 8.322203353642944, + "learning_rate": 1.8299556364577936e-06, + "loss": 0.6477, + "step": 7373 + }, + { + "epoch": 0.5988792333306262, + "grad_norm": 4.359529690940866, + "learning_rate": 1.8293220972530498e-06, + "loss": 0.5287, + "step": 7374 + }, + { + "epoch": 0.5989604483066677, + "grad_norm": 4.599497389802597, + "learning_rate": 1.8286886044556678e-06, + "loss": 0.4167, + "step": 7375 + }, + { + "epoch": 0.5990416632827094, + "grad_norm": 4.944453273477391, + "learning_rate": 1.8280551581094808e-06, + "loss": 0.4743, + "step": 7376 + }, + { + "epoch": 0.5991228782587509, + "grad_norm": 6.818339130298646, + "learning_rate": 1.8274217582583207e-06, + "loss": 0.6592, + "step": 7377 + }, + { + "epoch": 0.5992040932347925, + "grad_norm": 7.668165030066059, + "learning_rate": 1.826788404946016e-06, + "loss": 0.5305, + "step": 7378 + }, + { + "epoch": 0.599285308210834, + "grad_norm": 4.13273227160322, + "learning_rate": 1.8261550982163904e-06, + "loss": 0.5133, + "step": 7379 + }, + { + "epoch": 0.5993665231868757, + "grad_norm": 2.7993562519365196, + "learning_rate": 1.825521838113265e-06, + "loss": 0.5664, + "step": 7380 + }, + { + "epoch": 0.5994477381629172, + "grad_norm": 9.626646869624484, + "learning_rate": 1.8248886246804598e-06, + "loss": 0.4671, + "step": 7381 + }, + { + "epoch": 0.5995289531389588, + "grad_norm": 7.444176905061055, + "learning_rate": 1.8242554579617883e-06, + "loss": 0.5715, + "step": 7382 + }, + { + "epoch": 0.5996101681150005, + "grad_norm": 9.588196848439587, + "learning_rate": 1.8236223380010625e-06, + "loss": 0.4073, + "step": 7383 + }, + { + "epoch": 0.599691383091042, + "grad_norm": 6.866343301362604, + "learning_rate": 1.8229892648420922e-06, + "loss": 0.6084, + "step": 7384 + }, + { + "epoch": 0.5997725980670836, + "grad_norm": 5.01419995093504, + "learning_rate": 1.8223562385286809e-06, + "loss": 0.4774, + "step": 7385 + }, + { + "epoch": 0.5998538130431251, + "grad_norm": 5.956266278463153, + "learning_rate": 1.8217232591046313e-06, + "loss": 0.5421, + "step": 7386 + }, + { + "epoch": 0.5999350280191668, + "grad_norm": 5.610744689095254, + "learning_rate": 1.8210903266137434e-06, + "loss": 0.7168, + "step": 7387 + }, + { + "epoch": 0.6000162429952083, + "grad_norm": 4.657987631786978, + "learning_rate": 1.8204574410998119e-06, + "loss": 0.5227, + "step": 7388 + }, + { + "epoch": 0.6000974579712499, + "grad_norm": 4.434514515163585, + "learning_rate": 1.8198246026066279e-06, + "loss": 0.4822, + "step": 7389 + }, + { + "epoch": 0.6001786729472914, + "grad_norm": 5.183467175096997, + "learning_rate": 1.819191811177982e-06, + "loss": 0.579, + "step": 7390 + }, + { + "epoch": 0.6002598879233331, + "grad_norm": 4.579031661588377, + "learning_rate": 1.8185590668576602e-06, + "loss": 0.6555, + "step": 7391 + }, + { + "epoch": 0.6003411028993746, + "grad_norm": 5.1765990102478066, + "learning_rate": 1.817926369689444e-06, + "loss": 0.541, + "step": 7392 + }, + { + "epoch": 0.6004223178754162, + "grad_norm": 3.4939919762553977, + "learning_rate": 1.817293719717113e-06, + "loss": 0.5579, + "step": 7393 + }, + { + "epoch": 0.6005035328514579, + "grad_norm": 4.962538720225719, + "learning_rate": 1.8166611169844444e-06, + "loss": 0.3529, + "step": 7394 + }, + { + "epoch": 0.6005847478274994, + "grad_norm": 5.042600514532005, + "learning_rate": 1.8160285615352092e-06, + "loss": 0.4768, + "step": 7395 + }, + { + "epoch": 0.600665962803541, + "grad_norm": 3.8637771148189097, + "learning_rate": 1.8153960534131774e-06, + "loss": 0.6365, + "step": 7396 + }, + { + "epoch": 0.6007471777795825, + "grad_norm": 4.22885301921691, + "learning_rate": 1.8147635926621162e-06, + "loss": 0.5797, + "step": 7397 + }, + { + "epoch": 0.6008283927556242, + "grad_norm": 3.6962740741884095, + "learning_rate": 1.8141311793257876e-06, + "loss": 0.3554, + "step": 7398 + }, + { + "epoch": 0.6009096077316657, + "grad_norm": 3.758699623360753, + "learning_rate": 1.813498813447951e-06, + "loss": 0.596, + "step": 7399 + }, + { + "epoch": 0.6009908227077073, + "grad_norm": 3.201807519185477, + "learning_rate": 1.812866495072364e-06, + "loss": 0.5367, + "step": 7400 + }, + { + "epoch": 0.6010720376837488, + "grad_norm": 4.447710834904164, + "learning_rate": 1.812234224242779e-06, + "loss": 0.4886, + "step": 7401 + }, + { + "epoch": 0.6011532526597905, + "grad_norm": 4.642639006126207, + "learning_rate": 1.8116020010029448e-06, + "loss": 0.5678, + "step": 7402 + }, + { + "epoch": 0.601234467635832, + "grad_norm": 3.848966652372118, + "learning_rate": 1.8109698253966092e-06, + "loss": 0.4742, + "step": 7403 + }, + { + "epoch": 0.6013156826118736, + "grad_norm": 4.125486033292355, + "learning_rate": 1.8103376974675157e-06, + "loss": 0.4872, + "step": 7404 + }, + { + "epoch": 0.6013968975879153, + "grad_norm": 5.747061637911401, + "learning_rate": 1.8097056172594023e-06, + "loss": 0.4748, + "step": 7405 + }, + { + "epoch": 0.6014781125639568, + "grad_norm": 4.074457822433304, + "learning_rate": 1.8090735848160079e-06, + "loss": 0.3921, + "step": 7406 + }, + { + "epoch": 0.6015593275399984, + "grad_norm": 9.807247879545486, + "learning_rate": 1.808441600181065e-06, + "loss": 0.5085, + "step": 7407 + }, + { + "epoch": 0.6016405425160399, + "grad_norm": 5.7128146988760475, + "learning_rate": 1.8078096633983023e-06, + "loss": 0.431, + "step": 7408 + }, + { + "epoch": 0.6017217574920816, + "grad_norm": 21.8207651189527, + "learning_rate": 1.8071777745114477e-06, + "loss": 0.5174, + "step": 7409 + }, + { + "epoch": 0.6018029724681231, + "grad_norm": 9.425825620792596, + "learning_rate": 1.8065459335642254e-06, + "loss": 0.4276, + "step": 7410 + }, + { + "epoch": 0.6018841874441647, + "grad_norm": 4.023515489113687, + "learning_rate": 1.8059141406003532e-06, + "loss": 0.4419, + "step": 7411 + }, + { + "epoch": 0.6019654024202062, + "grad_norm": 3.615004223601, + "learning_rate": 1.8052823956635496e-06, + "loss": 0.5893, + "step": 7412 + }, + { + "epoch": 0.6020466173962479, + "grad_norm": 5.159465901388711, + "learning_rate": 1.8046506987975278e-06, + "loss": 0.429, + "step": 7413 + }, + { + "epoch": 0.6021278323722894, + "grad_norm": 3.103825361755657, + "learning_rate": 1.804019050045998e-06, + "loss": 0.6653, + "step": 7414 + }, + { + "epoch": 0.602209047348331, + "grad_norm": 5.198427303718621, + "learning_rate": 1.8033874494526646e-06, + "loss": 0.5391, + "step": 7415 + }, + { + "epoch": 0.6022902623243727, + "grad_norm": 7.073819537725478, + "learning_rate": 1.8027558970612347e-06, + "loss": 0.7089, + "step": 7416 + }, + { + "epoch": 0.6023714773004142, + "grad_norm": 3.801629807198999, + "learning_rate": 1.8021243929154063e-06, + "loss": 0.3996, + "step": 7417 + }, + { + "epoch": 0.6024526922764558, + "grad_norm": 24.873498159563677, + "learning_rate": 1.8014929370588757e-06, + "loss": 0.5828, + "step": 7418 + }, + { + "epoch": 0.6025339072524973, + "grad_norm": 5.485907374005129, + "learning_rate": 1.8008615295353376e-06, + "loss": 0.5204, + "step": 7419 + }, + { + "epoch": 0.602615122228539, + "grad_norm": 3.9473453351114705, + "learning_rate": 1.8002301703884816e-06, + "loss": 0.4032, + "step": 7420 + }, + { + "epoch": 0.6026963372045805, + "grad_norm": 4.470083941751573, + "learning_rate": 1.799598859661994e-06, + "loss": 0.4807, + "step": 7421 + }, + { + "epoch": 0.6027775521806221, + "grad_norm": 5.031697404411579, + "learning_rate": 1.7989675973995585e-06, + "loss": 0.4221, + "step": 7422 + }, + { + "epoch": 0.6028587671566636, + "grad_norm": 6.979484374974144, + "learning_rate": 1.7983363836448559e-06, + "loss": 0.3792, + "step": 7423 + }, + { + "epoch": 0.6029399821327053, + "grad_norm": 5.457913677824293, + "learning_rate": 1.7977052184415606e-06, + "loss": 0.3426, + "step": 7424 + }, + { + "epoch": 0.6030211971087468, + "grad_norm": 4.4056991242013455, + "learning_rate": 1.7970741018333482e-06, + "loss": 0.5527, + "step": 7425 + }, + { + "epoch": 0.6031024120847884, + "grad_norm": 9.887330375392434, + "learning_rate": 1.7964430338638883e-06, + "loss": 0.392, + "step": 7426 + }, + { + "epoch": 0.6031836270608301, + "grad_norm": 4.068954581333052, + "learning_rate": 1.7958120145768457e-06, + "loss": 0.4971, + "step": 7427 + }, + { + "epoch": 0.6032648420368716, + "grad_norm": 4.434746948784985, + "learning_rate": 1.7951810440158853e-06, + "loss": 0.503, + "step": 7428 + }, + { + "epoch": 0.6033460570129132, + "grad_norm": 4.082130196766388, + "learning_rate": 1.7945501222246673e-06, + "loss": 0.4411, + "step": 7429 + }, + { + "epoch": 0.6034272719889547, + "grad_norm": 10.979428713512291, + "learning_rate": 1.793919249246846e-06, + "loss": 0.4922, + "step": 7430 + }, + { + "epoch": 0.6035084869649964, + "grad_norm": 4.413393240856426, + "learning_rate": 1.7932884251260767e-06, + "loss": 0.5367, + "step": 7431 + }, + { + "epoch": 0.6035897019410379, + "grad_norm": 3.815015398191507, + "learning_rate": 1.7926576499060078e-06, + "loss": 0.5533, + "step": 7432 + }, + { + "epoch": 0.6036709169170795, + "grad_norm": 4.993930990302795, + "learning_rate": 1.7920269236302868e-06, + "loss": 0.4204, + "step": 7433 + }, + { + "epoch": 0.603752131893121, + "grad_norm": 4.907619542220186, + "learning_rate": 1.7913962463425544e-06, + "loss": 0.5547, + "step": 7434 + }, + { + "epoch": 0.6038333468691627, + "grad_norm": 4.135750795376068, + "learning_rate": 1.7907656180864519e-06, + "loss": 0.6044, + "step": 7435 + }, + { + "epoch": 0.6039145618452042, + "grad_norm": 15.259932160511177, + "learning_rate": 1.790135038905616e-06, + "loss": 0.5229, + "step": 7436 + }, + { + "epoch": 0.6039957768212458, + "grad_norm": 5.126091226363883, + "learning_rate": 1.7895045088436772e-06, + "loss": 0.4935, + "step": 7437 + }, + { + "epoch": 0.6040769917972875, + "grad_norm": 4.347192454488315, + "learning_rate": 1.7888740279442669e-06, + "loss": 0.4883, + "step": 7438 + }, + { + "epoch": 0.604158206773329, + "grad_norm": 8.459728931856809, + "learning_rate": 1.7882435962510102e-06, + "loss": 0.6231, + "step": 7439 + }, + { + "epoch": 0.6042394217493706, + "grad_norm": 4.357956562088014, + "learning_rate": 1.7876132138075292e-06, + "loss": 0.6246, + "step": 7440 + }, + { + "epoch": 0.6043206367254121, + "grad_norm": 5.116492859940842, + "learning_rate": 1.786982880657444e-06, + "loss": 0.3546, + "step": 7441 + }, + { + "epoch": 0.6044018517014538, + "grad_norm": 4.281674880545698, + "learning_rate": 1.7863525968443705e-06, + "loss": 0.3998, + "step": 7442 + }, + { + "epoch": 0.6044830666774953, + "grad_norm": 4.231714096569343, + "learning_rate": 1.785722362411919e-06, + "loss": 0.434, + "step": 7443 + }, + { + "epoch": 0.6045642816535369, + "grad_norm": 4.3628558475147265, + "learning_rate": 1.7850921774037012e-06, + "loss": 0.4972, + "step": 7444 + }, + { + "epoch": 0.6046454966295784, + "grad_norm": 3.8871791003489866, + "learning_rate": 1.7844620418633202e-06, + "loss": 0.5716, + "step": 7445 + }, + { + "epoch": 0.6047267116056201, + "grad_norm": 5.078543576155098, + "learning_rate": 1.7838319558343786e-06, + "loss": 0.5441, + "step": 7446 + }, + { + "epoch": 0.6048079265816616, + "grad_norm": 11.062361952259796, + "learning_rate": 1.7832019193604767e-06, + "loss": 0.5189, + "step": 7447 + }, + { + "epoch": 0.6048891415577032, + "grad_norm": 5.15437180400868, + "learning_rate": 1.7825719324852075e-06, + "loss": 0.5409, + "step": 7448 + }, + { + "epoch": 0.6049703565337449, + "grad_norm": 6.299357609539531, + "learning_rate": 1.7819419952521645e-06, + "loss": 0.4818, + "step": 7449 + }, + { + "epoch": 0.6050515715097864, + "grad_norm": 3.001544405666737, + "learning_rate": 1.7813121077049336e-06, + "loss": 0.5361, + "step": 7450 + }, + { + "epoch": 0.605132786485828, + "grad_norm": 4.256801862178172, + "learning_rate": 1.7806822698871022e-06, + "loss": 0.6223, + "step": 7451 + }, + { + "epoch": 0.6052140014618695, + "grad_norm": 3.963672219119554, + "learning_rate": 1.780052481842251e-06, + "loss": 0.4201, + "step": 7452 + }, + { + "epoch": 0.6052952164379112, + "grad_norm": 3.995116455512054, + "learning_rate": 1.7794227436139569e-06, + "loss": 0.5345, + "step": 7453 + }, + { + "epoch": 0.6053764314139527, + "grad_norm": 6.799955515987135, + "learning_rate": 1.778793055245796e-06, + "loss": 0.5772, + "step": 7454 + }, + { + "epoch": 0.6054576463899943, + "grad_norm": 5.467231768671727, + "learning_rate": 1.7781634167813388e-06, + "loss": 0.6087, + "step": 7455 + }, + { + "epoch": 0.6055388613660359, + "grad_norm": 5.970835497003089, + "learning_rate": 1.7775338282641525e-06, + "loss": 0.4178, + "step": 7456 + }, + { + "epoch": 0.6056200763420775, + "grad_norm": 4.908729638196646, + "learning_rate": 1.776904289737802e-06, + "loss": 0.5252, + "step": 7457 + }, + { + "epoch": 0.605701291318119, + "grad_norm": 8.614648749798091, + "learning_rate": 1.7762748012458481e-06, + "loss": 0.527, + "step": 7458 + }, + { + "epoch": 0.6057825062941606, + "grad_norm": 5.933581667639831, + "learning_rate": 1.7756453628318465e-06, + "loss": 0.4565, + "step": 7459 + }, + { + "epoch": 0.6058637212702023, + "grad_norm": 3.8154357222510034, + "learning_rate": 1.7750159745393536e-06, + "loss": 0.4551, + "step": 7460 + }, + { + "epoch": 0.6059449362462438, + "grad_norm": 7.8464108786714055, + "learning_rate": 1.7743866364119175e-06, + "loss": 0.473, + "step": 7461 + }, + { + "epoch": 0.6060261512222854, + "grad_norm": 4.514141406557562, + "learning_rate": 1.7737573484930853e-06, + "loss": 0.6192, + "step": 7462 + }, + { + "epoch": 0.606107366198327, + "grad_norm": 4.9804191071501585, + "learning_rate": 1.7731281108264025e-06, + "loss": 0.5783, + "step": 7463 + }, + { + "epoch": 0.6061885811743686, + "grad_norm": 3.8086232760432837, + "learning_rate": 1.7724989234554068e-06, + "loss": 0.4875, + "step": 7464 + }, + { + "epoch": 0.6062697961504101, + "grad_norm": 4.453392068019001, + "learning_rate": 1.7718697864236344e-06, + "loss": 0.4404, + "step": 7465 + }, + { + "epoch": 0.6063510111264517, + "grad_norm": 6.631970307210538, + "learning_rate": 1.771240699774621e-06, + "loss": 0.3888, + "step": 7466 + }, + { + "epoch": 0.6064322261024933, + "grad_norm": 3.3473485905433815, + "learning_rate": 1.7706116635518933e-06, + "loss": 0.5312, + "step": 7467 + }, + { + "epoch": 0.6065134410785349, + "grad_norm": 8.74164772322533, + "learning_rate": 1.7699826777989788e-06, + "loss": 0.4054, + "step": 7468 + }, + { + "epoch": 0.6065946560545764, + "grad_norm": 4.144159190764816, + "learning_rate": 1.7693537425593984e-06, + "loss": 0.5257, + "step": 7469 + }, + { + "epoch": 0.606675871030618, + "grad_norm": 4.098313315005926, + "learning_rate": 1.7687248578766727e-06, + "loss": 0.3604, + "step": 7470 + }, + { + "epoch": 0.6067570860066597, + "grad_norm": 6.610586143194924, + "learning_rate": 1.7680960237943174e-06, + "loss": 0.6578, + "step": 7471 + }, + { + "epoch": 0.6068383009827012, + "grad_norm": 4.968803137717695, + "learning_rate": 1.7674672403558421e-06, + "loss": 0.4375, + "step": 7472 + }, + { + "epoch": 0.6069195159587428, + "grad_norm": 7.6507424354276, + "learning_rate": 1.7668385076047584e-06, + "loss": 0.4943, + "step": 7473 + }, + { + "epoch": 0.6070007309347843, + "grad_norm": 5.452374844312675, + "learning_rate": 1.7662098255845689e-06, + "loss": 0.6319, + "step": 7474 + }, + { + "epoch": 0.607081945910826, + "grad_norm": 10.145687214568056, + "learning_rate": 1.7655811943387758e-06, + "loss": 0.4869, + "step": 7475 + }, + { + "epoch": 0.6071631608868675, + "grad_norm": 4.614412678031245, + "learning_rate": 1.764952613910878e-06, + "loss": 0.5788, + "step": 7476 + }, + { + "epoch": 0.6072443758629091, + "grad_norm": 7.111983931763767, + "learning_rate": 1.7643240843443686e-06, + "loss": 0.505, + "step": 7477 + }, + { + "epoch": 0.6073255908389507, + "grad_norm": 8.127180595467415, + "learning_rate": 1.7636956056827384e-06, + "loss": 0.6297, + "step": 7478 + }, + { + "epoch": 0.6074068058149923, + "grad_norm": 6.123717134281918, + "learning_rate": 1.7630671779694768e-06, + "loss": 0.5801, + "step": 7479 + }, + { + "epoch": 0.6074880207910338, + "grad_norm": 5.652896769901682, + "learning_rate": 1.7624388012480656e-06, + "loss": 0.4418, + "step": 7480 + }, + { + "epoch": 0.6075692357670754, + "grad_norm": 5.923671663679729, + "learning_rate": 1.7618104755619852e-06, + "loss": 0.4421, + "step": 7481 + }, + { + "epoch": 0.6076504507431171, + "grad_norm": 5.6081581939098495, + "learning_rate": 1.7611822009547143e-06, + "loss": 0.7917, + "step": 7482 + }, + { + "epoch": 0.6077316657191586, + "grad_norm": 5.284990419872968, + "learning_rate": 1.7605539774697244e-06, + "loss": 0.567, + "step": 7483 + }, + { + "epoch": 0.6078128806952002, + "grad_norm": 7.660780757932287, + "learning_rate": 1.7599258051504856e-06, + "loss": 0.5853, + "step": 7484 + }, + { + "epoch": 0.6078940956712418, + "grad_norm": 4.687240171351279, + "learning_rate": 1.7592976840404652e-06, + "loss": 0.4899, + "step": 7485 + }, + { + "epoch": 0.6079753106472834, + "grad_norm": 4.470994352643192, + "learning_rate": 1.7586696141831242e-06, + "loss": 0.4768, + "step": 7486 + }, + { + "epoch": 0.6080565256233249, + "grad_norm": 5.24599057514584, + "learning_rate": 1.7580415956219229e-06, + "loss": 0.3794, + "step": 7487 + }, + { + "epoch": 0.6081377405993665, + "grad_norm": 4.900299331302958, + "learning_rate": 1.7574136284003158e-06, + "loss": 0.5302, + "step": 7488 + }, + { + "epoch": 0.6082189555754081, + "grad_norm": 4.555248134756306, + "learning_rate": 1.756785712561756e-06, + "loss": 0.4212, + "step": 7489 + }, + { + "epoch": 0.6083001705514497, + "grad_norm": 7.786367658307978, + "learning_rate": 1.7561578481496917e-06, + "loss": 0.4278, + "step": 7490 + }, + { + "epoch": 0.6083813855274912, + "grad_norm": 4.5786486381903515, + "learning_rate": 1.7555300352075662e-06, + "loss": 0.6145, + "step": 7491 + }, + { + "epoch": 0.6084626005035328, + "grad_norm": 7.245725994286198, + "learning_rate": 1.7549022737788241e-06, + "loss": 0.4126, + "step": 7492 + }, + { + "epoch": 0.6085438154795745, + "grad_norm": 3.67634983056109, + "learning_rate": 1.7542745639069004e-06, + "loss": 0.5058, + "step": 7493 + }, + { + "epoch": 0.608625030455616, + "grad_norm": 9.370780365887464, + "learning_rate": 1.7536469056352296e-06, + "loss": 0.3767, + "step": 7494 + }, + { + "epoch": 0.6087062454316576, + "grad_norm": 2.966365312799223, + "learning_rate": 1.7530192990072436e-06, + "loss": 0.5582, + "step": 7495 + }, + { + "epoch": 0.6087874604076992, + "grad_norm": 5.219714565139399, + "learning_rate": 1.7523917440663687e-06, + "loss": 0.409, + "step": 7496 + }, + { + "epoch": 0.6088686753837408, + "grad_norm": 7.275963784070534, + "learning_rate": 1.7517642408560278e-06, + "loss": 0.516, + "step": 7497 + }, + { + "epoch": 0.6089498903597823, + "grad_norm": 5.53725602534592, + "learning_rate": 1.7511367894196426e-06, + "loss": 0.5271, + "step": 7498 + }, + { + "epoch": 0.6090311053358239, + "grad_norm": 6.420027579961015, + "learning_rate": 1.7505093898006275e-06, + "loss": 0.4234, + "step": 7499 + }, + { + "epoch": 0.6091123203118655, + "grad_norm": 5.134669425660005, + "learning_rate": 1.749882042042396e-06, + "loss": 0.5215, + "step": 7500 + }, + { + "epoch": 0.6091935352879071, + "grad_norm": 3.969108933592243, + "learning_rate": 1.749254746188358e-06, + "loss": 0.4116, + "step": 7501 + }, + { + "epoch": 0.6092747502639486, + "grad_norm": 9.697561858550591, + "learning_rate": 1.7486275022819183e-06, + "loss": 0.4249, + "step": 7502 + }, + { + "epoch": 0.6093559652399902, + "grad_norm": 6.498690544614313, + "learning_rate": 1.748000310366478e-06, + "loss": 0.5176, + "step": 7503 + }, + { + "epoch": 0.6094371802160319, + "grad_norm": 3.9324281115521202, + "learning_rate": 1.7473731704854363e-06, + "loss": 0.566, + "step": 7504 + }, + { + "epoch": 0.6095183951920734, + "grad_norm": 8.158895435967443, + "learning_rate": 1.7467460826821885e-06, + "loss": 0.4352, + "step": 7505 + }, + { + "epoch": 0.609599610168115, + "grad_norm": 6.125377617707914, + "learning_rate": 1.7461190470001252e-06, + "loss": 0.4992, + "step": 7506 + }, + { + "epoch": 0.6096808251441566, + "grad_norm": 4.208156614795222, + "learning_rate": 1.7454920634826334e-06, + "loss": 0.4183, + "step": 7507 + }, + { + "epoch": 0.6097620401201982, + "grad_norm": 11.55413920993596, + "learning_rate": 1.7448651321730985e-06, + "loss": 0.4898, + "step": 7508 + }, + { + "epoch": 0.6098432550962397, + "grad_norm": 4.906701536886049, + "learning_rate": 1.7442382531148993e-06, + "loss": 0.4855, + "step": 7509 + }, + { + "epoch": 0.6099244700722813, + "grad_norm": 4.012360366132999, + "learning_rate": 1.743611426351413e-06, + "loss": 0.4437, + "step": 7510 + }, + { + "epoch": 0.6100056850483229, + "grad_norm": 4.507122998130638, + "learning_rate": 1.7429846519260139e-06, + "loss": 0.4269, + "step": 7511 + }, + { + "epoch": 0.6100869000243645, + "grad_norm": 7.688714435664871, + "learning_rate": 1.7423579298820698e-06, + "loss": 0.4297, + "step": 7512 + }, + { + "epoch": 0.610168115000406, + "grad_norm": 4.5702359534664145, + "learning_rate": 1.7417312602629466e-06, + "loss": 0.4187, + "step": 7513 + }, + { + "epoch": 0.6102493299764477, + "grad_norm": 7.677278210562309, + "learning_rate": 1.7411046431120082e-06, + "loss": 0.4983, + "step": 7514 + }, + { + "epoch": 0.6103305449524893, + "grad_norm": 6.1185795533491545, + "learning_rate": 1.7404780784726113e-06, + "loss": 0.6269, + "step": 7515 + }, + { + "epoch": 0.6104117599285308, + "grad_norm": 14.544008771708137, + "learning_rate": 1.7398515663881117e-06, + "loss": 0.595, + "step": 7516 + }, + { + "epoch": 0.6104929749045724, + "grad_norm": 10.989546459503224, + "learning_rate": 1.7392251069018612e-06, + "loss": 0.6077, + "step": 7517 + }, + { + "epoch": 0.610574189880614, + "grad_norm": 6.706531652553325, + "learning_rate": 1.7385987000572072e-06, + "loss": 0.558, + "step": 7518 + }, + { + "epoch": 0.6106554048566556, + "grad_norm": 3.482810028814553, + "learning_rate": 1.7379723458974923e-06, + "loss": 0.4935, + "step": 7519 + }, + { + "epoch": 0.6107366198326971, + "grad_norm": 4.216977832082727, + "learning_rate": 1.737346044466059e-06, + "loss": 0.5477, + "step": 7520 + }, + { + "epoch": 0.6108178348087387, + "grad_norm": 5.407693817919376, + "learning_rate": 1.7367197958062432e-06, + "loss": 0.561, + "step": 7521 + }, + { + "epoch": 0.6108990497847803, + "grad_norm": 7.6500405315944215, + "learning_rate": 1.7360935999613777e-06, + "loss": 0.7106, + "step": 7522 + }, + { + "epoch": 0.6109802647608219, + "grad_norm": 8.727840725862155, + "learning_rate": 1.7354674569747914e-06, + "loss": 0.4518, + "step": 7523 + }, + { + "epoch": 0.6110614797368634, + "grad_norm": 6.081313481209265, + "learning_rate": 1.7348413668898124e-06, + "loss": 0.4906, + "step": 7524 + }, + { + "epoch": 0.611142694712905, + "grad_norm": 4.575030134738263, + "learning_rate": 1.73421532974976e-06, + "loss": 0.5392, + "step": 7525 + }, + { + "epoch": 0.6112239096889467, + "grad_norm": 3.827031756918253, + "learning_rate": 1.7335893455979538e-06, + "loss": 0.5111, + "step": 7526 + }, + { + "epoch": 0.6113051246649882, + "grad_norm": 4.399973290074878, + "learning_rate": 1.7329634144777097e-06, + "loss": 0.7858, + "step": 7527 + }, + { + "epoch": 0.6113863396410298, + "grad_norm": 6.275043877297012, + "learning_rate": 1.7323375364323374e-06, + "loss": 0.4, + "step": 7528 + }, + { + "epoch": 0.6114675546170714, + "grad_norm": 3.7917580182798964, + "learning_rate": 1.731711711505144e-06, + "loss": 0.5784, + "step": 7529 + }, + { + "epoch": 0.611548769593113, + "grad_norm": 5.449393065833403, + "learning_rate": 1.7310859397394356e-06, + "loss": 0.4097, + "step": 7530 + }, + { + "epoch": 0.6116299845691545, + "grad_norm": 9.087763572815122, + "learning_rate": 1.7304602211785105e-06, + "loss": 0.4269, + "step": 7531 + }, + { + "epoch": 0.6117111995451961, + "grad_norm": 3.9239408883458196, + "learning_rate": 1.7298345558656643e-06, + "loss": 0.4189, + "step": 7532 + }, + { + "epoch": 0.6117924145212377, + "grad_norm": 3.9512250145217327, + "learning_rate": 1.7292089438441912e-06, + "loss": 0.5171, + "step": 7533 + }, + { + "epoch": 0.6118736294972793, + "grad_norm": 5.225768936804722, + "learning_rate": 1.7285833851573802e-06, + "loss": 0.5148, + "step": 7534 + }, + { + "epoch": 0.6119548444733209, + "grad_norm": 8.877862953830741, + "learning_rate": 1.727957879848516e-06, + "loss": 0.6574, + "step": 7535 + }, + { + "epoch": 0.6120360594493625, + "grad_norm": 3.7513803990699457, + "learning_rate": 1.72733242796088e-06, + "loss": 0.4649, + "step": 7536 + }, + { + "epoch": 0.6121172744254041, + "grad_norm": 6.526118406530345, + "learning_rate": 1.7267070295377519e-06, + "loss": 0.5629, + "step": 7537 + }, + { + "epoch": 0.6121984894014456, + "grad_norm": 8.83748922556791, + "learning_rate": 1.726081684622404e-06, + "loss": 0.483, + "step": 7538 + }, + { + "epoch": 0.6122797043774872, + "grad_norm": 4.032292821062164, + "learning_rate": 1.7254563932581072e-06, + "loss": 0.6093, + "step": 7539 + }, + { + "epoch": 0.6123609193535288, + "grad_norm": 19.261864290456085, + "learning_rate": 1.7248311554881297e-06, + "loss": 0.3226, + "step": 7540 + }, + { + "epoch": 0.6124421343295704, + "grad_norm": 6.178514406690231, + "learning_rate": 1.7242059713557336e-06, + "loss": 0.4116, + "step": 7541 + }, + { + "epoch": 0.6125233493056119, + "grad_norm": 6.272920000009786, + "learning_rate": 1.7235808409041775e-06, + "loss": 0.514, + "step": 7542 + }, + { + "epoch": 0.6126045642816536, + "grad_norm": 3.700775958812784, + "learning_rate": 1.7229557641767191e-06, + "loss": 0.3956, + "step": 7543 + }, + { + "epoch": 0.6126857792576951, + "grad_norm": 5.184180318926046, + "learning_rate": 1.7223307412166097e-06, + "loss": 0.5573, + "step": 7544 + }, + { + "epoch": 0.6127669942337367, + "grad_norm": 7.17746460216716, + "learning_rate": 1.7217057720670955e-06, + "loss": 0.4188, + "step": 7545 + }, + { + "epoch": 0.6128482092097783, + "grad_norm": 5.19044747938764, + "learning_rate": 1.7210808567714244e-06, + "loss": 0.6008, + "step": 7546 + }, + { + "epoch": 0.6129294241858199, + "grad_norm": 6.637207987486905, + "learning_rate": 1.7204559953728355e-06, + "loss": 0.4163, + "step": 7547 + }, + { + "epoch": 0.6130106391618615, + "grad_norm": 4.123118258455451, + "learning_rate": 1.7198311879145652e-06, + "loss": 0.4583, + "step": 7548 + }, + { + "epoch": 0.613091854137903, + "grad_norm": 4.634690421136184, + "learning_rate": 1.719206434439848e-06, + "loss": 0.6119, + "step": 7549 + }, + { + "epoch": 0.6131730691139446, + "grad_norm": 5.330581154155666, + "learning_rate": 1.7185817349919137e-06, + "loss": 0.679, + "step": 7550 + }, + { + "epoch": 0.6132542840899862, + "grad_norm": 4.733672202157837, + "learning_rate": 1.7179570896139869e-06, + "loss": 0.5463, + "step": 7551 + }, + { + "epoch": 0.6133354990660278, + "grad_norm": 3.417972262827162, + "learning_rate": 1.7173324983492912e-06, + "loss": 0.5625, + "step": 7552 + }, + { + "epoch": 0.6134167140420693, + "grad_norm": 5.5606216442015155, + "learning_rate": 1.7167079612410448e-06, + "loss": 0.541, + "step": 7553 + }, + { + "epoch": 0.613497929018111, + "grad_norm": 7.366074745822286, + "learning_rate": 1.7160834783324608e-06, + "loss": 0.5007, + "step": 7554 + }, + { + "epoch": 0.6135791439941525, + "grad_norm": 4.4511321208599925, + "learning_rate": 1.7154590496667523e-06, + "loss": 0.4106, + "step": 7555 + }, + { + "epoch": 0.6136603589701941, + "grad_norm": 4.380414416940069, + "learning_rate": 1.7148346752871253e-06, + "loss": 0.5297, + "step": 7556 + }, + { + "epoch": 0.6137415739462357, + "grad_norm": 5.789276347920089, + "learning_rate": 1.7142103552367834e-06, + "loss": 0.4956, + "step": 7557 + }, + { + "epoch": 0.6138227889222773, + "grad_norm": 5.435165522246494, + "learning_rate": 1.713586089558925e-06, + "loss": 0.5016, + "step": 7558 + }, + { + "epoch": 0.6139040038983189, + "grad_norm": 3.8853370254417796, + "learning_rate": 1.7129618782967488e-06, + "loss": 0.4796, + "step": 7559 + }, + { + "epoch": 0.6139852188743604, + "grad_norm": 6.1117435269535765, + "learning_rate": 1.712337721493445e-06, + "loss": 0.3512, + "step": 7560 + }, + { + "epoch": 0.614066433850402, + "grad_norm": 5.140667780868619, + "learning_rate": 1.7117136191922013e-06, + "loss": 0.383, + "step": 7561 + }, + { + "epoch": 0.6141476488264436, + "grad_norm": 5.239177233517926, + "learning_rate": 1.7110895714362035e-06, + "loss": 0.4811, + "step": 7562 + }, + { + "epoch": 0.6142288638024852, + "grad_norm": 5.786445992631991, + "learning_rate": 1.710465578268633e-06, + "loss": 0.5776, + "step": 7563 + }, + { + "epoch": 0.6143100787785267, + "grad_norm": 4.264725420672594, + "learning_rate": 1.7098416397326647e-06, + "loss": 0.548, + "step": 7564 + }, + { + "epoch": 0.6143912937545684, + "grad_norm": 5.098547500018314, + "learning_rate": 1.7092177558714735e-06, + "loss": 0.4055, + "step": 7565 + }, + { + "epoch": 0.6144725087306099, + "grad_norm": 3.972712496253238, + "learning_rate": 1.7085939267282292e-06, + "loss": 0.5024, + "step": 7566 + }, + { + "epoch": 0.6145537237066515, + "grad_norm": 6.4566948580335835, + "learning_rate": 1.7079701523460957e-06, + "loss": 0.3837, + "step": 7567 + }, + { + "epoch": 0.6146349386826931, + "grad_norm": 4.440870142659438, + "learning_rate": 1.707346432768236e-06, + "loss": 0.5221, + "step": 7568 + }, + { + "epoch": 0.6147161536587347, + "grad_norm": 4.191872882758366, + "learning_rate": 1.706722768037809e-06, + "loss": 0.5073, + "step": 7569 + }, + { + "epoch": 0.6147973686347763, + "grad_norm": 5.091945928993117, + "learning_rate": 1.7060991581979668e-06, + "loss": 0.4276, + "step": 7570 + }, + { + "epoch": 0.6148785836108178, + "grad_norm": 4.128925636949817, + "learning_rate": 1.7054756032918619e-06, + "loss": 0.5327, + "step": 7571 + }, + { + "epoch": 0.6149597985868595, + "grad_norm": 4.6497677030599265, + "learning_rate": 1.7048521033626406e-06, + "loss": 0.4233, + "step": 7572 + }, + { + "epoch": 0.615041013562901, + "grad_norm": 5.259086945381799, + "learning_rate": 1.7042286584534446e-06, + "loss": 0.5063, + "step": 7573 + }, + { + "epoch": 0.6151222285389426, + "grad_norm": 9.344094684965398, + "learning_rate": 1.703605268607415e-06, + "loss": 0.4642, + "step": 7574 + }, + { + "epoch": 0.6152034435149841, + "grad_norm": 6.059064695851907, + "learning_rate": 1.7029819338676851e-06, + "loss": 0.5159, + "step": 7575 + }, + { + "epoch": 0.6152846584910258, + "grad_norm": 7.950438579454252, + "learning_rate": 1.702358654277388e-06, + "loss": 0.4775, + "step": 7576 + }, + { + "epoch": 0.6153658734670673, + "grad_norm": 5.6577857752289695, + "learning_rate": 1.7017354298796495e-06, + "loss": 0.4521, + "step": 7577 + }, + { + "epoch": 0.6154470884431089, + "grad_norm": 6.694027039647233, + "learning_rate": 1.701112260717595e-06, + "loss": 0.5675, + "step": 7578 + }, + { + "epoch": 0.6155283034191505, + "grad_norm": 7.207274495499145, + "learning_rate": 1.7004891468343445e-06, + "loss": 0.4767, + "step": 7579 + }, + { + "epoch": 0.6156095183951921, + "grad_norm": 10.710533616510515, + "learning_rate": 1.6998660882730127e-06, + "loss": 0.4575, + "step": 7580 + }, + { + "epoch": 0.6156907333712337, + "grad_norm": 9.051943184654498, + "learning_rate": 1.6992430850767133e-06, + "loss": 0.4646, + "step": 7581 + }, + { + "epoch": 0.6157719483472752, + "grad_norm": 6.318629144815555, + "learning_rate": 1.6986201372885551e-06, + "loss": 0.5112, + "step": 7582 + }, + { + "epoch": 0.6158531633233169, + "grad_norm": 9.374156721440105, + "learning_rate": 1.6979972449516414e-06, + "loss": 0.5938, + "step": 7583 + }, + { + "epoch": 0.6159343782993584, + "grad_norm": 5.954834061127918, + "learning_rate": 1.6973744081090737e-06, + "loss": 0.5195, + "step": 7584 + }, + { + "epoch": 0.6160155932754, + "grad_norm": 4.686975998201248, + "learning_rate": 1.6967516268039502e-06, + "loss": 0.5192, + "step": 7585 + }, + { + "epoch": 0.6160968082514415, + "grad_norm": 5.255217548237502, + "learning_rate": 1.696128901079362e-06, + "loss": 0.5776, + "step": 7586 + }, + { + "epoch": 0.6161780232274832, + "grad_norm": 3.9588121129361915, + "learning_rate": 1.6955062309783993e-06, + "loss": 0.4449, + "step": 7587 + }, + { + "epoch": 0.6162592382035247, + "grad_norm": 2.910107855866022, + "learning_rate": 1.6948836165441487e-06, + "loss": 0.6015, + "step": 7588 + }, + { + "epoch": 0.6163404531795663, + "grad_norm": 4.0433138018943735, + "learning_rate": 1.6942610578196898e-06, + "loss": 0.4804, + "step": 7589 + }, + { + "epoch": 0.616421668155608, + "grad_norm": 12.510674806813109, + "learning_rate": 1.6936385548481022e-06, + "loss": 0.3454, + "step": 7590 + }, + { + "epoch": 0.6165028831316495, + "grad_norm": 4.596018618309467, + "learning_rate": 1.6930161076724586e-06, + "loss": 0.4479, + "step": 7591 + }, + { + "epoch": 0.6165840981076911, + "grad_norm": 4.171966658619839, + "learning_rate": 1.69239371633583e-06, + "loss": 0.5211, + "step": 7592 + }, + { + "epoch": 0.6166653130837326, + "grad_norm": 8.771836978846274, + "learning_rate": 1.6917713808812808e-06, + "loss": 0.4352, + "step": 7593 + }, + { + "epoch": 0.6167465280597743, + "grad_norm": 4.139812780948147, + "learning_rate": 1.6911491013518752e-06, + "loss": 0.5914, + "step": 7594 + }, + { + "epoch": 0.6168277430358158, + "grad_norm": 5.236782993816778, + "learning_rate": 1.6905268777906713e-06, + "loss": 0.5286, + "step": 7595 + }, + { + "epoch": 0.6169089580118574, + "grad_norm": 5.408332820716584, + "learning_rate": 1.6899047102407228e-06, + "loss": 0.3983, + "step": 7596 + }, + { + "epoch": 0.6169901729878989, + "grad_norm": 3.729915257776433, + "learning_rate": 1.6892825987450811e-06, + "loss": 0.5177, + "step": 7597 + }, + { + "epoch": 0.6170713879639406, + "grad_norm": 4.845431694776476, + "learning_rate": 1.6886605433467937e-06, + "loss": 0.5994, + "step": 7598 + }, + { + "epoch": 0.6171526029399821, + "grad_norm": 4.28204183668613, + "learning_rate": 1.6880385440889016e-06, + "loss": 0.4965, + "step": 7599 + }, + { + "epoch": 0.6172338179160237, + "grad_norm": 3.81321254807878, + "learning_rate": 1.6874166010144454e-06, + "loss": 0.5212, + "step": 7600 + }, + { + "epoch": 0.6173150328920654, + "grad_norm": 4.863898183735516, + "learning_rate": 1.6867947141664606e-06, + "loss": 0.6291, + "step": 7601 + }, + { + "epoch": 0.6173962478681069, + "grad_norm": 3.908697856611701, + "learning_rate": 1.6861728835879764e-06, + "loss": 0.5934, + "step": 7602 + }, + { + "epoch": 0.6174774628441485, + "grad_norm": 4.930386518701432, + "learning_rate": 1.685551109322023e-06, + "loss": 0.4434, + "step": 7603 + }, + { + "epoch": 0.61755867782019, + "grad_norm": 3.7606290186734586, + "learning_rate": 1.6849293914116215e-06, + "loss": 0.5494, + "step": 7604 + }, + { + "epoch": 0.6176398927962317, + "grad_norm": 4.342048831757059, + "learning_rate": 1.6843077298997924e-06, + "loss": 0.5169, + "step": 7605 + }, + { + "epoch": 0.6177211077722732, + "grad_norm": 5.829444571059767, + "learning_rate": 1.6836861248295522e-06, + "loss": 0.5501, + "step": 7606 + }, + { + "epoch": 0.6178023227483148, + "grad_norm": 4.7706445198897915, + "learning_rate": 1.6830645762439113e-06, + "loss": 0.4783, + "step": 7607 + }, + { + "epoch": 0.6178835377243563, + "grad_norm": 8.032579213522444, + "learning_rate": 1.6824430841858773e-06, + "loss": 0.4552, + "step": 7608 + }, + { + "epoch": 0.617964752700398, + "grad_norm": 6.670715215785492, + "learning_rate": 1.6818216486984565e-06, + "loss": 0.4983, + "step": 7609 + }, + { + "epoch": 0.6180459676764395, + "grad_norm": 6.128265762260737, + "learning_rate": 1.6812002698246468e-06, + "loss": 0.4088, + "step": 7610 + }, + { + "epoch": 0.6181271826524811, + "grad_norm": 5.814013730186966, + "learning_rate": 1.6805789476074457e-06, + "loss": 0.5589, + "step": 7611 + }, + { + "epoch": 0.6182083976285228, + "grad_norm": 6.00192645217164, + "learning_rate": 1.6799576820898433e-06, + "loss": 0.4175, + "step": 7612 + }, + { + "epoch": 0.6182896126045643, + "grad_norm": 4.073976353754178, + "learning_rate": 1.6793364733148299e-06, + "loss": 0.3606, + "step": 7613 + }, + { + "epoch": 0.6183708275806059, + "grad_norm": 4.36296820738616, + "learning_rate": 1.67871532132539e-06, + "loss": 0.6092, + "step": 7614 + }, + { + "epoch": 0.6184520425566474, + "grad_norm": 6.684933223291455, + "learning_rate": 1.6780942261645022e-06, + "loss": 0.4535, + "step": 7615 + }, + { + "epoch": 0.6185332575326891, + "grad_norm": 4.124907550937014, + "learning_rate": 1.6774731878751443e-06, + "loss": 0.5198, + "step": 7616 + }, + { + "epoch": 0.6186144725087306, + "grad_norm": 2.736304544029034, + "learning_rate": 1.6768522065002895e-06, + "loss": 0.6509, + "step": 7617 + }, + { + "epoch": 0.6186956874847722, + "grad_norm": 3.054312384173202, + "learning_rate": 1.676231282082904e-06, + "loss": 0.5161, + "step": 7618 + }, + { + "epoch": 0.6187769024608137, + "grad_norm": 8.402180136218686, + "learning_rate": 1.6756104146659557e-06, + "loss": 0.394, + "step": 7619 + }, + { + "epoch": 0.6188581174368554, + "grad_norm": 4.6791235959713315, + "learning_rate": 1.674989604292403e-06, + "loss": 0.6243, + "step": 7620 + }, + { + "epoch": 0.6189393324128969, + "grad_norm": 9.604477306580446, + "learning_rate": 1.6743688510052025e-06, + "loss": 0.4754, + "step": 7621 + }, + { + "epoch": 0.6190205473889385, + "grad_norm": 9.89281790814999, + "learning_rate": 1.6737481548473094e-06, + "loss": 0.4893, + "step": 7622 + }, + { + "epoch": 0.6191017623649802, + "grad_norm": 3.3991457520981383, + "learning_rate": 1.6731275158616706e-06, + "loss": 0.457, + "step": 7623 + }, + { + "epoch": 0.6191829773410217, + "grad_norm": 2.80593016197633, + "learning_rate": 1.6725069340912306e-06, + "loss": 0.6278, + "step": 7624 + }, + { + "epoch": 0.6192641923170633, + "grad_norm": 5.418301042196, + "learning_rate": 1.6718864095789328e-06, + "loss": 0.4187, + "step": 7625 + }, + { + "epoch": 0.6193454072931048, + "grad_norm": 13.06513067077772, + "learning_rate": 1.671265942367712e-06, + "loss": 0.4663, + "step": 7626 + }, + { + "epoch": 0.6194266222691465, + "grad_norm": 4.625063932915541, + "learning_rate": 1.6706455325005022e-06, + "loss": 0.4295, + "step": 7627 + }, + { + "epoch": 0.619507837245188, + "grad_norm": 3.5014064055098175, + "learning_rate": 1.6700251800202316e-06, + "loss": 0.4579, + "step": 7628 + }, + { + "epoch": 0.6195890522212296, + "grad_norm": 6.56770323902514, + "learning_rate": 1.6694048849698262e-06, + "loss": 0.6269, + "step": 7629 + }, + { + "epoch": 0.6196702671972711, + "grad_norm": 4.07016650700423, + "learning_rate": 1.668784647392208e-06, + "loss": 0.6907, + "step": 7630 + }, + { + "epoch": 0.6197514821733128, + "grad_norm": 5.113480533421879, + "learning_rate": 1.6681644673302915e-06, + "loss": 0.5111, + "step": 7631 + }, + { + "epoch": 0.6198326971493543, + "grad_norm": 4.254910794569682, + "learning_rate": 1.6675443448269924e-06, + "loss": 0.5239, + "step": 7632 + }, + { + "epoch": 0.6199139121253959, + "grad_norm": 5.408424083847769, + "learning_rate": 1.666924279925219e-06, + "loss": 0.5857, + "step": 7633 + }, + { + "epoch": 0.6199951271014376, + "grad_norm": 3.2011054128203766, + "learning_rate": 1.6663042726678752e-06, + "loss": 0.5784, + "step": 7634 + }, + { + "epoch": 0.6200763420774791, + "grad_norm": 5.235602080991919, + "learning_rate": 1.6656843230978647e-06, + "loss": 0.3975, + "step": 7635 + }, + { + "epoch": 0.6201575570535207, + "grad_norm": 5.165250298024449, + "learning_rate": 1.6650644312580833e-06, + "loss": 0.5924, + "step": 7636 + }, + { + "epoch": 0.6202387720295622, + "grad_norm": 4.340672727279216, + "learning_rate": 1.6644445971914235e-06, + "loss": 0.5221, + "step": 7637 + }, + { + "epoch": 0.6203199870056039, + "grad_norm": 6.841912089854011, + "learning_rate": 1.6638248209407767e-06, + "loss": 0.443, + "step": 7638 + }, + { + "epoch": 0.6204012019816454, + "grad_norm": 4.081953440344846, + "learning_rate": 1.6632051025490265e-06, + "loss": 0.3495, + "step": 7639 + }, + { + "epoch": 0.620482416957687, + "grad_norm": 3.7772718364769142, + "learning_rate": 1.6625854420590538e-06, + "loss": 0.5034, + "step": 7640 + }, + { + "epoch": 0.6205636319337285, + "grad_norm": 5.210380997814291, + "learning_rate": 1.6619658395137375e-06, + "loss": 0.6397, + "step": 7641 + }, + { + "epoch": 0.6206448469097702, + "grad_norm": 6.577887175557655, + "learning_rate": 1.6613462949559494e-06, + "loss": 0.4802, + "step": 7642 + }, + { + "epoch": 0.6207260618858117, + "grad_norm": 3.7841172613723657, + "learning_rate": 1.6607268084285587e-06, + "loss": 0.7127, + "step": 7643 + }, + { + "epoch": 0.6208072768618533, + "grad_norm": 8.73460364920805, + "learning_rate": 1.6601073799744322e-06, + "loss": 0.4822, + "step": 7644 + }, + { + "epoch": 0.620888491837895, + "grad_norm": 5.904973312911216, + "learning_rate": 1.6594880096364302e-06, + "loss": 0.5566, + "step": 7645 + }, + { + "epoch": 0.6209697068139365, + "grad_norm": 4.941626996148981, + "learning_rate": 1.6588686974574086e-06, + "loss": 0.3796, + "step": 7646 + }, + { + "epoch": 0.6210509217899781, + "grad_norm": 5.8291085465722325, + "learning_rate": 1.658249443480221e-06, + "loss": 0.4666, + "step": 7647 + }, + { + "epoch": 0.6211321367660196, + "grad_norm": 5.909142689212799, + "learning_rate": 1.6576302477477185e-06, + "loss": 0.4453, + "step": 7648 + }, + { + "epoch": 0.6212133517420613, + "grad_norm": 5.951228309961236, + "learning_rate": 1.6570111103027436e-06, + "loss": 0.3766, + "step": 7649 + }, + { + "epoch": 0.6212945667181028, + "grad_norm": 6.681405040737811, + "learning_rate": 1.6563920311881382e-06, + "loss": 0.3992, + "step": 7650 + }, + { + "epoch": 0.6213757816941444, + "grad_norm": 5.5260582323770135, + "learning_rate": 1.6557730104467407e-06, + "loss": 0.4259, + "step": 7651 + }, + { + "epoch": 0.621456996670186, + "grad_norm": 5.076655366887786, + "learning_rate": 1.6551540481213817e-06, + "loss": 0.5468, + "step": 7652 + }, + { + "epoch": 0.6215382116462276, + "grad_norm": 5.8919929585227155, + "learning_rate": 1.6545351442548915e-06, + "loss": 0.4871, + "step": 7653 + }, + { + "epoch": 0.6216194266222691, + "grad_norm": 4.042380796567633, + "learning_rate": 1.6539162988900952e-06, + "loss": 0.4153, + "step": 7654 + }, + { + "epoch": 0.6217006415983107, + "grad_norm": 6.784897682604056, + "learning_rate": 1.6532975120698133e-06, + "loss": 0.4534, + "step": 7655 + }, + { + "epoch": 0.6217818565743524, + "grad_norm": 9.65234999480218, + "learning_rate": 1.6526787838368616e-06, + "loss": 0.4155, + "step": 7656 + }, + { + "epoch": 0.6218630715503939, + "grad_norm": 17.08200480716379, + "learning_rate": 1.6520601142340549e-06, + "loss": 0.5921, + "step": 7657 + }, + { + "epoch": 0.6219442865264355, + "grad_norm": 5.175377372780213, + "learning_rate": 1.6514415033041997e-06, + "loss": 0.4526, + "step": 7658 + }, + { + "epoch": 0.622025501502477, + "grad_norm": 3.458058541785411, + "learning_rate": 1.6508229510901013e-06, + "loss": 0.4555, + "step": 7659 + }, + { + "epoch": 0.6221067164785187, + "grad_norm": 10.604824315531577, + "learning_rate": 1.6502044576345614e-06, + "loss": 0.452, + "step": 7660 + }, + { + "epoch": 0.6221879314545602, + "grad_norm": 5.411425272158323, + "learning_rate": 1.6495860229803756e-06, + "loss": 0.6222, + "step": 7661 + }, + { + "epoch": 0.6222691464306018, + "grad_norm": 5.1740005505908995, + "learning_rate": 1.6489676471703352e-06, + "loss": 0.4145, + "step": 7662 + }, + { + "epoch": 0.6223503614066433, + "grad_norm": 8.085499523946917, + "learning_rate": 1.6483493302472302e-06, + "loss": 0.513, + "step": 7663 + }, + { + "epoch": 0.622431576382685, + "grad_norm": 10.292791376029198, + "learning_rate": 1.6477310722538447e-06, + "loss": 0.6126, + "step": 7664 + }, + { + "epoch": 0.6225127913587265, + "grad_norm": 6.0268201310361995, + "learning_rate": 1.6471128732329579e-06, + "loss": 0.4292, + "step": 7665 + }, + { + "epoch": 0.6225940063347681, + "grad_norm": 5.022456342536074, + "learning_rate": 1.6464947332273459e-06, + "loss": 0.6152, + "step": 7666 + }, + { + "epoch": 0.6226752213108098, + "grad_norm": 8.543190424818972, + "learning_rate": 1.6458766522797822e-06, + "loss": 0.6125, + "step": 7667 + }, + { + "epoch": 0.6227564362868513, + "grad_norm": 4.1156125031275135, + "learning_rate": 1.6452586304330333e-06, + "loss": 0.462, + "step": 7668 + }, + { + "epoch": 0.6228376512628929, + "grad_norm": 7.81665689239009, + "learning_rate": 1.6446406677298632e-06, + "loss": 0.4678, + "step": 7669 + }, + { + "epoch": 0.6229188662389344, + "grad_norm": 6.949544264134675, + "learning_rate": 1.644022764213033e-06, + "loss": 0.4414, + "step": 7670 + }, + { + "epoch": 0.6230000812149761, + "grad_norm": 8.196141138164219, + "learning_rate": 1.6434049199252966e-06, + "loss": 0.4455, + "step": 7671 + }, + { + "epoch": 0.6230812961910176, + "grad_norm": 6.110724030611538, + "learning_rate": 1.6427871349094058e-06, + "loss": 0.3828, + "step": 7672 + }, + { + "epoch": 0.6231625111670592, + "grad_norm": 4.134192350462065, + "learning_rate": 1.6421694092081097e-06, + "loss": 0.5077, + "step": 7673 + }, + { + "epoch": 0.6232437261431008, + "grad_norm": 4.263913374724854, + "learning_rate": 1.6415517428641504e-06, + "loss": 0.6757, + "step": 7674 + }, + { + "epoch": 0.6233249411191424, + "grad_norm": 6.728733308443024, + "learning_rate": 1.640934135920266e-06, + "loss": 0.485, + "step": 7675 + }, + { + "epoch": 0.6234061560951839, + "grad_norm": 6.332399518498263, + "learning_rate": 1.6403165884191935e-06, + "loss": 0.4495, + "step": 7676 + }, + { + "epoch": 0.6234873710712255, + "grad_norm": 5.132785407111167, + "learning_rate": 1.6396991004036638e-06, + "loss": 0.4718, + "step": 7677 + }, + { + "epoch": 0.6235685860472672, + "grad_norm": 4.651566891974008, + "learning_rate": 1.6390816719164022e-06, + "loss": 0.5763, + "step": 7678 + }, + { + "epoch": 0.6236498010233087, + "grad_norm": 4.413484601105079, + "learning_rate": 1.6384643030001333e-06, + "loss": 0.722, + "step": 7679 + }, + { + "epoch": 0.6237310159993503, + "grad_norm": 8.71808254189496, + "learning_rate": 1.6378469936975752e-06, + "loss": 0.5068, + "step": 7680 + }, + { + "epoch": 0.6238122309753918, + "grad_norm": 6.479840818255055, + "learning_rate": 1.6372297440514417e-06, + "loss": 0.4299, + "step": 7681 + }, + { + "epoch": 0.6238934459514335, + "grad_norm": 3.683453972916311, + "learning_rate": 1.6366125541044435e-06, + "loss": 0.545, + "step": 7682 + }, + { + "epoch": 0.623974660927475, + "grad_norm": 9.69410298079771, + "learning_rate": 1.6359954238992882e-06, + "loss": 0.3748, + "step": 7683 + }, + { + "epoch": 0.6240558759035166, + "grad_norm": 4.97119687493066, + "learning_rate": 1.6353783534786763e-06, + "loss": 0.4015, + "step": 7684 + }, + { + "epoch": 0.6241370908795582, + "grad_norm": 7.7545114750163, + "learning_rate": 1.6347613428853059e-06, + "loss": 0.4756, + "step": 7685 + }, + { + "epoch": 0.6242183058555998, + "grad_norm": 3.5468434651588976, + "learning_rate": 1.634144392161872e-06, + "loss": 0.5798, + "step": 7686 + }, + { + "epoch": 0.6242995208316413, + "grad_norm": 4.706606608660131, + "learning_rate": 1.6335275013510638e-06, + "loss": 0.5285, + "step": 7687 + }, + { + "epoch": 0.6243807358076829, + "grad_norm": 6.147626373688174, + "learning_rate": 1.632910670495566e-06, + "loss": 0.3421, + "step": 7688 + }, + { + "epoch": 0.6244619507837246, + "grad_norm": 15.303934361158852, + "learning_rate": 1.6322938996380617e-06, + "loss": 0.46, + "step": 7689 + }, + { + "epoch": 0.6245431657597661, + "grad_norm": 14.350203161237948, + "learning_rate": 1.6316771888212275e-06, + "loss": 0.4245, + "step": 7690 + }, + { + "epoch": 0.6246243807358077, + "grad_norm": 4.190438584869176, + "learning_rate": 1.631060538087735e-06, + "loss": 0.566, + "step": 7691 + }, + { + "epoch": 0.6247055957118492, + "grad_norm": 3.82815155289053, + "learning_rate": 1.6304439474802554e-06, + "loss": 0.4528, + "step": 7692 + }, + { + "epoch": 0.6247868106878909, + "grad_norm": 6.351971066286823, + "learning_rate": 1.6298274170414524e-06, + "loss": 0.5233, + "step": 7693 + }, + { + "epoch": 0.6248680256639324, + "grad_norm": 4.36724324752686, + "learning_rate": 1.6292109468139863e-06, + "loss": 0.4774, + "step": 7694 + }, + { + "epoch": 0.624949240639974, + "grad_norm": 6.655112537446748, + "learning_rate": 1.6285945368405146e-06, + "loss": 0.5333, + "step": 7695 + }, + { + "epoch": 0.6250304556160156, + "grad_norm": 6.018071010184688, + "learning_rate": 1.6279781871636896e-06, + "loss": 0.4432, + "step": 7696 + }, + { + "epoch": 0.6251116705920572, + "grad_norm": 5.306711764250664, + "learning_rate": 1.6273618978261576e-06, + "loss": 0.5138, + "step": 7697 + }, + { + "epoch": 0.6251928855680987, + "grad_norm": 3.9883932484916325, + "learning_rate": 1.6267456688705647e-06, + "loss": 0.5547, + "step": 7698 + }, + { + "epoch": 0.6252741005441403, + "grad_norm": 6.6197356790826625, + "learning_rate": 1.6261295003395506e-06, + "loss": 0.4235, + "step": 7699 + }, + { + "epoch": 0.625355315520182, + "grad_norm": 3.6076841841661667, + "learning_rate": 1.6255133922757493e-06, + "loss": 0.4933, + "step": 7700 + }, + { + "epoch": 0.6254365304962235, + "grad_norm": 4.929421788606065, + "learning_rate": 1.6248973447217926e-06, + "loss": 0.5483, + "step": 7701 + }, + { + "epoch": 0.6255177454722651, + "grad_norm": 4.242914786049469, + "learning_rate": 1.6242813577203093e-06, + "loss": 0.4914, + "step": 7702 + }, + { + "epoch": 0.6255989604483067, + "grad_norm": 4.388993067736639, + "learning_rate": 1.6236654313139213e-06, + "loss": 0.428, + "step": 7703 + }, + { + "epoch": 0.6256801754243483, + "grad_norm": 6.684554645774749, + "learning_rate": 1.6230495655452466e-06, + "loss": 0.5199, + "step": 7704 + }, + { + "epoch": 0.6257613904003898, + "grad_norm": 4.683366955828708, + "learning_rate": 1.6224337604569012e-06, + "loss": 0.4485, + "step": 7705 + }, + { + "epoch": 0.6258426053764314, + "grad_norm": 4.786816490822921, + "learning_rate": 1.6218180160914959e-06, + "loss": 0.4682, + "step": 7706 + }, + { + "epoch": 0.625923820352473, + "grad_norm": 5.411731210706411, + "learning_rate": 1.6212023324916349e-06, + "loss": 0.423, + "step": 7707 + }, + { + "epoch": 0.6260050353285146, + "grad_norm": 4.143891933212883, + "learning_rate": 1.620586709699922e-06, + "loss": 0.7241, + "step": 7708 + }, + { + "epoch": 0.6260862503045561, + "grad_norm": 5.695424075779182, + "learning_rate": 1.6199711477589553e-06, + "loss": 0.4617, + "step": 7709 + }, + { + "epoch": 0.6261674652805977, + "grad_norm": 7.132319497435146, + "learning_rate": 1.6193556467113264e-06, + "loss": 0.3636, + "step": 7710 + }, + { + "epoch": 0.6262486802566394, + "grad_norm": 7.5139687849034225, + "learning_rate": 1.6187402065996267e-06, + "loss": 0.4507, + "step": 7711 + }, + { + "epoch": 0.6263298952326809, + "grad_norm": 3.7215878058767675, + "learning_rate": 1.6181248274664413e-06, + "loss": 0.5443, + "step": 7712 + }, + { + "epoch": 0.6264111102087225, + "grad_norm": 3.861480100387473, + "learning_rate": 1.617509509354349e-06, + "loss": 0.4875, + "step": 7713 + }, + { + "epoch": 0.626492325184764, + "grad_norm": 5.924413160412586, + "learning_rate": 1.616894252305929e-06, + "loss": 0.5528, + "step": 7714 + }, + { + "epoch": 0.6265735401608057, + "grad_norm": 6.36026377881963, + "learning_rate": 1.6162790563637538e-06, + "loss": 0.4045, + "step": 7715 + }, + { + "epoch": 0.6266547551368472, + "grad_norm": 4.4919168460790475, + "learning_rate": 1.6156639215703896e-06, + "loss": 0.4531, + "step": 7716 + }, + { + "epoch": 0.6267359701128888, + "grad_norm": 3.3752966345250583, + "learning_rate": 1.6150488479684022e-06, + "loss": 0.4291, + "step": 7717 + }, + { + "epoch": 0.6268171850889304, + "grad_norm": 4.683411418815497, + "learning_rate": 1.6144338356003513e-06, + "loss": 0.5847, + "step": 7718 + }, + { + "epoch": 0.626898400064972, + "grad_norm": 6.26290414561336, + "learning_rate": 1.6138188845087926e-06, + "loss": 0.4307, + "step": 7719 + }, + { + "epoch": 0.6269796150410135, + "grad_norm": 7.127187742803709, + "learning_rate": 1.613203994736276e-06, + "loss": 0.6484, + "step": 7720 + }, + { + "epoch": 0.6270608300170551, + "grad_norm": 4.465338617099092, + "learning_rate": 1.61258916632535e-06, + "loss": 0.4218, + "step": 7721 + }, + { + "epoch": 0.6271420449930968, + "grad_norm": 6.635578415699874, + "learning_rate": 1.6119743993185574e-06, + "loss": 0.5503, + "step": 7722 + }, + { + "epoch": 0.6272232599691383, + "grad_norm": 9.293378886349172, + "learning_rate": 1.6113596937584358e-06, + "loss": 0.4743, + "step": 7723 + }, + { + "epoch": 0.6273044749451799, + "grad_norm": 3.9282722976253432, + "learning_rate": 1.610745049687521e-06, + "loss": 0.4391, + "step": 7724 + }, + { + "epoch": 0.6273856899212215, + "grad_norm": 5.900142462141413, + "learning_rate": 1.6101304671483425e-06, + "loss": 0.4308, + "step": 7725 + }, + { + "epoch": 0.6274669048972631, + "grad_norm": 5.538553572045406, + "learning_rate": 1.6095159461834252e-06, + "loss": 0.3723, + "step": 7726 + }, + { + "epoch": 0.6275481198733046, + "grad_norm": 5.156039599008567, + "learning_rate": 1.6089014868352925e-06, + "loss": 0.5306, + "step": 7727 + }, + { + "epoch": 0.6276293348493462, + "grad_norm": 5.634533236318975, + "learning_rate": 1.608287089146461e-06, + "loss": 0.525, + "step": 7728 + }, + { + "epoch": 0.6277105498253878, + "grad_norm": 5.598968838519203, + "learning_rate": 1.6076727531594428e-06, + "loss": 0.3932, + "step": 7729 + }, + { + "epoch": 0.6277917648014294, + "grad_norm": 5.654175409392677, + "learning_rate": 1.607058478916748e-06, + "loss": 0.4358, + "step": 7730 + }, + { + "epoch": 0.6278729797774709, + "grad_norm": 7.282317038390533, + "learning_rate": 1.6064442664608808e-06, + "loss": 0.6522, + "step": 7731 + }, + { + "epoch": 0.6279541947535126, + "grad_norm": 6.6301535369071924, + "learning_rate": 1.6058301158343408e-06, + "loss": 0.4913, + "step": 7732 + }, + { + "epoch": 0.6280354097295542, + "grad_norm": 4.56677620849299, + "learning_rate": 1.6052160270796252e-06, + "loss": 0.4066, + "step": 7733 + }, + { + "epoch": 0.6281166247055957, + "grad_norm": 6.930735350423143, + "learning_rate": 1.6046020002392242e-06, + "loss": 0.6191, + "step": 7734 + }, + { + "epoch": 0.6281978396816373, + "grad_norm": 3.3385261614929584, + "learning_rate": 1.603988035355627e-06, + "loss": 0.4239, + "step": 7735 + }, + { + "epoch": 0.6282790546576789, + "grad_norm": 6.108614723656081, + "learning_rate": 1.6033741324713143e-06, + "loss": 0.5148, + "step": 7736 + }, + { + "epoch": 0.6283602696337205, + "grad_norm": 5.083243850769626, + "learning_rate": 1.6027602916287665e-06, + "loss": 0.5385, + "step": 7737 + }, + { + "epoch": 0.628441484609762, + "grad_norm": 4.3659844044456735, + "learning_rate": 1.6021465128704592e-06, + "loss": 0.4434, + "step": 7738 + }, + { + "epoch": 0.6285226995858036, + "grad_norm": 3.246650947537016, + "learning_rate": 1.60153279623886e-06, + "loss": 0.5304, + "step": 7739 + }, + { + "epoch": 0.6286039145618452, + "grad_norm": 6.965030627490836, + "learning_rate": 1.6009191417764366e-06, + "loss": 0.4603, + "step": 7740 + }, + { + "epoch": 0.6286851295378868, + "grad_norm": 4.3257443108941835, + "learning_rate": 1.600305549525651e-06, + "loss": 0.4716, + "step": 7741 + }, + { + "epoch": 0.6287663445139283, + "grad_norm": 5.777339582650028, + "learning_rate": 1.5996920195289586e-06, + "loss": 0.5073, + "step": 7742 + }, + { + "epoch": 0.62884755948997, + "grad_norm": 11.440189242706712, + "learning_rate": 1.5990785518288144e-06, + "loss": 0.6117, + "step": 7743 + }, + { + "epoch": 0.6289287744660116, + "grad_norm": 5.114208771227363, + "learning_rate": 1.5984651464676664e-06, + "loss": 0.4889, + "step": 7744 + }, + { + "epoch": 0.6290099894420531, + "grad_norm": 5.705508333315021, + "learning_rate": 1.5978518034879583e-06, + "loss": 0.3973, + "step": 7745 + }, + { + "epoch": 0.6290912044180947, + "grad_norm": 8.471787190410229, + "learning_rate": 1.5972385229321313e-06, + "loss": 0.5272, + "step": 7746 + }, + { + "epoch": 0.6291724193941363, + "grad_norm": 4.82503142826085, + "learning_rate": 1.5966253048426212e-06, + "loss": 0.6467, + "step": 7747 + }, + { + "epoch": 0.6292536343701779, + "grad_norm": 4.760084685878624, + "learning_rate": 1.596012149261858e-06, + "loss": 0.544, + "step": 7748 + }, + { + "epoch": 0.6293348493462194, + "grad_norm": 4.772517813956784, + "learning_rate": 1.5953990562322708e-06, + "loss": 0.4272, + "step": 7749 + }, + { + "epoch": 0.629416064322261, + "grad_norm": 8.327385517646334, + "learning_rate": 1.5947860257962808e-06, + "loss": 0.4509, + "step": 7750 + }, + { + "epoch": 0.6294972792983026, + "grad_norm": 4.076666918311391, + "learning_rate": 1.5941730579963065e-06, + "loss": 0.4959, + "step": 7751 + }, + { + "epoch": 0.6295784942743442, + "grad_norm": 6.179747838755071, + "learning_rate": 1.5935601528747635e-06, + "loss": 0.4375, + "step": 7752 + }, + { + "epoch": 0.6296597092503857, + "grad_norm": 12.93986219429753, + "learning_rate": 1.5929473104740605e-06, + "loss": 0.5545, + "step": 7753 + }, + { + "epoch": 0.6297409242264274, + "grad_norm": 3.6144579855996346, + "learning_rate": 1.5923345308366033e-06, + "loss": 0.4124, + "step": 7754 + }, + { + "epoch": 0.629822139202469, + "grad_norm": 3.832036350634625, + "learning_rate": 1.591721814004792e-06, + "loss": 0.6357, + "step": 7755 + }, + { + "epoch": 0.6299033541785105, + "grad_norm": 3.254583154769827, + "learning_rate": 1.5911091600210243e-06, + "loss": 0.582, + "step": 7756 + }, + { + "epoch": 0.6299845691545521, + "grad_norm": 4.7973970186134505, + "learning_rate": 1.5904965689276935e-06, + "loss": 0.809, + "step": 7757 + }, + { + "epoch": 0.6300657841305937, + "grad_norm": 4.07832937486864, + "learning_rate": 1.5898840407671854e-06, + "loss": 0.4779, + "step": 7758 + }, + { + "epoch": 0.6301469991066353, + "grad_norm": 11.649929406013856, + "learning_rate": 1.5892715755818855e-06, + "loss": 0.4955, + "step": 7759 + }, + { + "epoch": 0.6302282140826768, + "grad_norm": 7.298974738788142, + "learning_rate": 1.588659173414173e-06, + "loss": 0.3887, + "step": 7760 + }, + { + "epoch": 0.6303094290587185, + "grad_norm": 6.5690578488061115, + "learning_rate": 1.5880468343064215e-06, + "loss": 0.5319, + "step": 7761 + }, + { + "epoch": 0.63039064403476, + "grad_norm": 4.501194184210136, + "learning_rate": 1.5874345583010038e-06, + "loss": 0.5715, + "step": 7762 + }, + { + "epoch": 0.6304718590108016, + "grad_norm": 4.823495001127094, + "learning_rate": 1.5868223454402842e-06, + "loss": 0.5115, + "step": 7763 + }, + { + "epoch": 0.6305530739868431, + "grad_norm": 4.765174608509368, + "learning_rate": 1.5862101957666251e-06, + "loss": 0.4864, + "step": 7764 + }, + { + "epoch": 0.6306342889628848, + "grad_norm": 8.316397458431478, + "learning_rate": 1.5855981093223851e-06, + "loss": 0.5312, + "step": 7765 + }, + { + "epoch": 0.6307155039389264, + "grad_norm": 3.4813538772732096, + "learning_rate": 1.5849860861499161e-06, + "loss": 0.5063, + "step": 7766 + }, + { + "epoch": 0.6307967189149679, + "grad_norm": 5.709151262434003, + "learning_rate": 1.584374126291567e-06, + "loss": 0.6373, + "step": 7767 + }, + { + "epoch": 0.6308779338910095, + "grad_norm": 6.071499004825764, + "learning_rate": 1.5837622297896832e-06, + "loss": 0.4196, + "step": 7768 + }, + { + "epoch": 0.6309591488670511, + "grad_norm": 4.152234565584312, + "learning_rate": 1.5831503966866038e-06, + "loss": 0.4568, + "step": 7769 + }, + { + "epoch": 0.6310403638430927, + "grad_norm": 5.124374169356126, + "learning_rate": 1.5825386270246649e-06, + "loss": 0.5482, + "step": 7770 + }, + { + "epoch": 0.6311215788191342, + "grad_norm": 6.603032548895071, + "learning_rate": 1.5819269208461962e-06, + "loss": 0.4628, + "step": 7771 + }, + { + "epoch": 0.6312027937951759, + "grad_norm": 3.929016081455086, + "learning_rate": 1.5813152781935264e-06, + "loss": 0.4962, + "step": 7772 + }, + { + "epoch": 0.6312840087712174, + "grad_norm": 5.248036747944997, + "learning_rate": 1.5807036991089781e-06, + "loss": 0.5938, + "step": 7773 + }, + { + "epoch": 0.631365223747259, + "grad_norm": 4.602869975600319, + "learning_rate": 1.5800921836348671e-06, + "loss": 0.4762, + "step": 7774 + }, + { + "epoch": 0.6314464387233005, + "grad_norm": 3.998867630823634, + "learning_rate": 1.5794807318135097e-06, + "loss": 0.4337, + "step": 7775 + }, + { + "epoch": 0.6315276536993422, + "grad_norm": 4.511923050001, + "learning_rate": 1.5788693436872132e-06, + "loss": 0.5222, + "step": 7776 + }, + { + "epoch": 0.6316088686753838, + "grad_norm": 4.732480730008836, + "learning_rate": 1.5782580192982827e-06, + "loss": 0.5321, + "step": 7777 + }, + { + "epoch": 0.6316900836514253, + "grad_norm": 6.209675234125353, + "learning_rate": 1.57764675868902e-06, + "loss": 0.4472, + "step": 7778 + }, + { + "epoch": 0.631771298627467, + "grad_norm": 4.538684305840299, + "learning_rate": 1.5770355619017198e-06, + "loss": 0.5362, + "step": 7779 + }, + { + "epoch": 0.6318525136035085, + "grad_norm": 3.509225017297568, + "learning_rate": 1.5764244289786728e-06, + "loss": 0.4968, + "step": 7780 + }, + { + "epoch": 0.6319337285795501, + "grad_norm": 6.521031887625569, + "learning_rate": 1.575813359962169e-06, + "loss": 0.3996, + "step": 7781 + }, + { + "epoch": 0.6320149435555916, + "grad_norm": 12.929846426438688, + "learning_rate": 1.5752023548944889e-06, + "loss": 0.4625, + "step": 7782 + }, + { + "epoch": 0.6320961585316333, + "grad_norm": 9.52281730299787, + "learning_rate": 1.574591413817911e-06, + "loss": 0.5613, + "step": 7783 + }, + { + "epoch": 0.6321773735076748, + "grad_norm": 3.924489403653561, + "learning_rate": 1.57398053677471e-06, + "loss": 0.5386, + "step": 7784 + }, + { + "epoch": 0.6322585884837164, + "grad_norm": 3.8784061814789137, + "learning_rate": 1.5733697238071553e-06, + "loss": 0.4747, + "step": 7785 + }, + { + "epoch": 0.6323398034597579, + "grad_norm": 4.881789075504623, + "learning_rate": 1.5727589749575107e-06, + "loss": 0.5493, + "step": 7786 + }, + { + "epoch": 0.6324210184357996, + "grad_norm": 7.261971143003088, + "learning_rate": 1.5721482902680385e-06, + "loss": 0.5279, + "step": 7787 + }, + { + "epoch": 0.6325022334118412, + "grad_norm": 6.077300198937425, + "learning_rate": 1.5715376697809937e-06, + "loss": 0.5173, + "step": 7788 + }, + { + "epoch": 0.6325834483878827, + "grad_norm": 5.975876289320817, + "learning_rate": 1.570927113538629e-06, + "loss": 0.5638, + "step": 7789 + }, + { + "epoch": 0.6326646633639244, + "grad_norm": 3.670648216770045, + "learning_rate": 1.5703166215831899e-06, + "loss": 0.4479, + "step": 7790 + }, + { + "epoch": 0.6327458783399659, + "grad_norm": 3.6145233680649254, + "learning_rate": 1.5697061939569214e-06, + "loss": 0.5388, + "step": 7791 + }, + { + "epoch": 0.6328270933160075, + "grad_norm": 4.822710807501337, + "learning_rate": 1.56909583070206e-06, + "loss": 0.6051, + "step": 7792 + }, + { + "epoch": 0.632908308292049, + "grad_norm": 6.545220131213222, + "learning_rate": 1.56848553186084e-06, + "loss": 0.4538, + "step": 7793 + }, + { + "epoch": 0.6329895232680907, + "grad_norm": 6.080696544546954, + "learning_rate": 1.567875297475492e-06, + "loss": 0.4151, + "step": 7794 + }, + { + "epoch": 0.6330707382441322, + "grad_norm": 9.899327382181253, + "learning_rate": 1.56726512758824e-06, + "loss": 0.5212, + "step": 7795 + }, + { + "epoch": 0.6331519532201738, + "grad_norm": 4.273991889208243, + "learning_rate": 1.566655022241304e-06, + "loss": 0.5173, + "step": 7796 + }, + { + "epoch": 0.6332331681962153, + "grad_norm": 5.170717612782167, + "learning_rate": 1.5660449814769021e-06, + "loss": 0.4797, + "step": 7797 + }, + { + "epoch": 0.633314383172257, + "grad_norm": 5.7053519794438685, + "learning_rate": 1.5654350053372442e-06, + "loss": 0.4382, + "step": 7798 + }, + { + "epoch": 0.6333955981482986, + "grad_norm": 3.5081020926912667, + "learning_rate": 1.564825093864537e-06, + "loss": 0.4524, + "step": 7799 + }, + { + "epoch": 0.6334768131243401, + "grad_norm": 9.01478070880551, + "learning_rate": 1.5642152471009849e-06, + "loss": 0.4047, + "step": 7800 + }, + { + "epoch": 0.6335580281003818, + "grad_norm": 5.177815198718995, + "learning_rate": 1.563605465088785e-06, + "loss": 0.4582, + "step": 7801 + }, + { + "epoch": 0.6336392430764233, + "grad_norm": 8.412137360840807, + "learning_rate": 1.5629957478701303e-06, + "loss": 0.4636, + "step": 7802 + }, + { + "epoch": 0.6337204580524649, + "grad_norm": 5.581728871746328, + "learning_rate": 1.5623860954872116e-06, + "loss": 0.5674, + "step": 7803 + }, + { + "epoch": 0.6338016730285064, + "grad_norm": 4.471083148690759, + "learning_rate": 1.5617765079822133e-06, + "loss": 0.4592, + "step": 7804 + }, + { + "epoch": 0.6338828880045481, + "grad_norm": 3.475348192814086, + "learning_rate": 1.5611669853973141e-06, + "loss": 0.5273, + "step": 7805 + }, + { + "epoch": 0.6339641029805896, + "grad_norm": 5.120001622959669, + "learning_rate": 1.5605575277746912e-06, + "loss": 0.4363, + "step": 7806 + }, + { + "epoch": 0.6340453179566312, + "grad_norm": 4.543060760794114, + "learning_rate": 1.559948135156516e-06, + "loss": 0.4075, + "step": 7807 + }, + { + "epoch": 0.6341265329326727, + "grad_norm": 4.1227313596641, + "learning_rate": 1.559338807584954e-06, + "loss": 0.4653, + "step": 7808 + }, + { + "epoch": 0.6342077479087144, + "grad_norm": 4.831713571856866, + "learning_rate": 1.5587295451021678e-06, + "loss": 0.5367, + "step": 7809 + }, + { + "epoch": 0.634288962884756, + "grad_norm": 4.2655332482022414, + "learning_rate": 1.5581203477503166e-06, + "loss": 0.6405, + "step": 7810 + }, + { + "epoch": 0.6343701778607975, + "grad_norm": 8.517022361410643, + "learning_rate": 1.5575112155715516e-06, + "loss": 0.408, + "step": 7811 + }, + { + "epoch": 0.6344513928368392, + "grad_norm": 4.14581190989536, + "learning_rate": 1.5569021486080223e-06, + "loss": 0.4273, + "step": 7812 + }, + { + "epoch": 0.6345326078128807, + "grad_norm": 4.406741970957099, + "learning_rate": 1.5562931469018738e-06, + "loss": 0.688, + "step": 7813 + }, + { + "epoch": 0.6346138227889223, + "grad_norm": 4.483314144569286, + "learning_rate": 1.555684210495245e-06, + "loss": 0.5208, + "step": 7814 + }, + { + "epoch": 0.6346950377649638, + "grad_norm": 4.337838645295809, + "learning_rate": 1.5550753394302702e-06, + "loss": 0.5329, + "step": 7815 + }, + { + "epoch": 0.6347762527410055, + "grad_norm": 6.548340782272945, + "learning_rate": 1.5544665337490822e-06, + "loss": 0.5038, + "step": 7816 + }, + { + "epoch": 0.634857467717047, + "grad_norm": 26.26368001727583, + "learning_rate": 1.5538577934938051e-06, + "loss": 0.415, + "step": 7817 + }, + { + "epoch": 0.6349386826930886, + "grad_norm": 6.56905135630574, + "learning_rate": 1.5532491187065607e-06, + "loss": 0.4125, + "step": 7818 + }, + { + "epoch": 0.6350198976691301, + "grad_norm": 4.494023935708008, + "learning_rate": 1.5526405094294682e-06, + "loss": 0.4559, + "step": 7819 + }, + { + "epoch": 0.6351011126451718, + "grad_norm": 7.264576312866752, + "learning_rate": 1.5520319657046384e-06, + "loss": 0.4749, + "step": 7820 + }, + { + "epoch": 0.6351823276212134, + "grad_norm": 3.8422635582509037, + "learning_rate": 1.5514234875741785e-06, + "loss": 0.5266, + "step": 7821 + }, + { + "epoch": 0.6352635425972549, + "grad_norm": 4.7814474085953815, + "learning_rate": 1.550815075080193e-06, + "loss": 0.5509, + "step": 7822 + }, + { + "epoch": 0.6353447575732966, + "grad_norm": 14.680542072418953, + "learning_rate": 1.5502067282647821e-06, + "loss": 0.4687, + "step": 7823 + }, + { + "epoch": 0.6354259725493381, + "grad_norm": 5.102982342242324, + "learning_rate": 1.5495984471700382e-06, + "loss": 0.5625, + "step": 7824 + }, + { + "epoch": 0.6355071875253797, + "grad_norm": 6.340239866996015, + "learning_rate": 1.5489902318380512e-06, + "loss": 0.3983, + "step": 7825 + }, + { + "epoch": 0.6355884025014212, + "grad_norm": 5.566203991765062, + "learning_rate": 1.5483820823109078e-06, + "loss": 0.5833, + "step": 7826 + }, + { + "epoch": 0.6356696174774629, + "grad_norm": 4.137320108495999, + "learning_rate": 1.5477739986306878e-06, + "loss": 0.6481, + "step": 7827 + }, + { + "epoch": 0.6357508324535044, + "grad_norm": 4.613431700216779, + "learning_rate": 1.5471659808394669e-06, + "loss": 0.4265, + "step": 7828 + }, + { + "epoch": 0.635832047429546, + "grad_norm": 3.7043285271670507, + "learning_rate": 1.546558028979318e-06, + "loss": 0.3759, + "step": 7829 + }, + { + "epoch": 0.6359132624055875, + "grad_norm": 5.051376359459655, + "learning_rate": 1.5459501430923073e-06, + "loss": 0.4925, + "step": 7830 + }, + { + "epoch": 0.6359944773816292, + "grad_norm": 4.08262450327924, + "learning_rate": 1.5453423232204968e-06, + "loss": 0.4753, + "step": 7831 + }, + { + "epoch": 0.6360756923576708, + "grad_norm": 4.998294324944461, + "learning_rate": 1.5447345694059462e-06, + "loss": 0.4227, + "step": 7832 + }, + { + "epoch": 0.6361569073337123, + "grad_norm": 13.638204885625212, + "learning_rate": 1.5441268816907077e-06, + "loss": 0.5512, + "step": 7833 + }, + { + "epoch": 0.636238122309754, + "grad_norm": 22.49883181989276, + "learning_rate": 1.5435192601168293e-06, + "loss": 0.5912, + "step": 7834 + }, + { + "epoch": 0.6363193372857955, + "grad_norm": 5.137209289732004, + "learning_rate": 1.542911704726356e-06, + "loss": 0.425, + "step": 7835 + }, + { + "epoch": 0.6364005522618371, + "grad_norm": 5.554841196878051, + "learning_rate": 1.5423042155613283e-06, + "loss": 0.3496, + "step": 7836 + }, + { + "epoch": 0.6364817672378786, + "grad_norm": 6.808818397785924, + "learning_rate": 1.5416967926637793e-06, + "loss": 0.3677, + "step": 7837 + }, + { + "epoch": 0.6365629822139203, + "grad_norm": 10.986138436222886, + "learning_rate": 1.5410894360757408e-06, + "loss": 0.5055, + "step": 7838 + }, + { + "epoch": 0.6366441971899618, + "grad_norm": 7.0748129231448695, + "learning_rate": 1.540482145839239e-06, + "loss": 0.4956, + "step": 7839 + }, + { + "epoch": 0.6367254121660034, + "grad_norm": 5.517324400634849, + "learning_rate": 1.5398749219962935e-06, + "loss": 0.5188, + "step": 7840 + }, + { + "epoch": 0.636806627142045, + "grad_norm": 7.369254822891894, + "learning_rate": 1.5392677645889225e-06, + "loss": 0.4918, + "step": 7841 + }, + { + "epoch": 0.6368878421180866, + "grad_norm": 4.443831510239287, + "learning_rate": 1.5386606736591381e-06, + "loss": 0.5523, + "step": 7842 + }, + { + "epoch": 0.6369690570941282, + "grad_norm": 5.484153254041072, + "learning_rate": 1.5380536492489468e-06, + "loss": 0.4526, + "step": 7843 + }, + { + "epoch": 0.6370502720701697, + "grad_norm": 3.6793655308252364, + "learning_rate": 1.5374466914003516e-06, + "loss": 0.5204, + "step": 7844 + }, + { + "epoch": 0.6371314870462114, + "grad_norm": 5.6124428134045, + "learning_rate": 1.536839800155352e-06, + "loss": 0.5365, + "step": 7845 + }, + { + "epoch": 0.6372127020222529, + "grad_norm": 4.415081736558317, + "learning_rate": 1.5362329755559402e-06, + "loss": 0.4503, + "step": 7846 + }, + { + "epoch": 0.6372939169982945, + "grad_norm": 8.36645209753681, + "learning_rate": 1.5356262176441051e-06, + "loss": 0.5873, + "step": 7847 + }, + { + "epoch": 0.637375131974336, + "grad_norm": 4.410444200701278, + "learning_rate": 1.5350195264618333e-06, + "loss": 0.4172, + "step": 7848 + }, + { + "epoch": 0.6374563469503777, + "grad_norm": 5.471308320004208, + "learning_rate": 1.5344129020511029e-06, + "loss": 0.378, + "step": 7849 + }, + { + "epoch": 0.6375375619264192, + "grad_norm": 2.964980011323985, + "learning_rate": 1.5338063444538887e-06, + "loss": 0.4858, + "step": 7850 + }, + { + "epoch": 0.6376187769024608, + "grad_norm": 4.034123659647055, + "learning_rate": 1.533199853712162e-06, + "loss": 0.4601, + "step": 7851 + }, + { + "epoch": 0.6376999918785023, + "grad_norm": 5.360248478803197, + "learning_rate": 1.5325934298678896e-06, + "loss": 0.5198, + "step": 7852 + }, + { + "epoch": 0.637781206854544, + "grad_norm": 4.128851671530124, + "learning_rate": 1.5319870729630303e-06, + "loss": 0.5347, + "step": 7853 + }, + { + "epoch": 0.6378624218305856, + "grad_norm": 5.837367955901182, + "learning_rate": 1.5313807830395437e-06, + "loss": 0.4249, + "step": 7854 + }, + { + "epoch": 0.6379436368066271, + "grad_norm": 4.420374712053163, + "learning_rate": 1.5307745601393808e-06, + "loss": 0.5663, + "step": 7855 + }, + { + "epoch": 0.6380248517826688, + "grad_norm": 8.34561480709057, + "learning_rate": 1.5301684043044875e-06, + "loss": 0.569, + "step": 7856 + }, + { + "epoch": 0.6381060667587103, + "grad_norm": 11.076659493261435, + "learning_rate": 1.5295623155768086e-06, + "loss": 0.4581, + "step": 7857 + }, + { + "epoch": 0.6381872817347519, + "grad_norm": 5.281957951201278, + "learning_rate": 1.5289562939982822e-06, + "loss": 0.5524, + "step": 7858 + }, + { + "epoch": 0.6382684967107934, + "grad_norm": 4.075328659102114, + "learning_rate": 1.5283503396108401e-06, + "loss": 0.5083, + "step": 7859 + }, + { + "epoch": 0.6383497116868351, + "grad_norm": 4.739091105300953, + "learning_rate": 1.5277444524564117e-06, + "loss": 0.524, + "step": 7860 + }, + { + "epoch": 0.6384309266628766, + "grad_norm": 12.502865910338423, + "learning_rate": 1.5271386325769227e-06, + "loss": 0.2867, + "step": 7861 + }, + { + "epoch": 0.6385121416389182, + "grad_norm": 4.833851282125769, + "learning_rate": 1.526532880014292e-06, + "loss": 0.6359, + "step": 7862 + }, + { + "epoch": 0.6385933566149598, + "grad_norm": 4.157660884304162, + "learning_rate": 1.5259271948104323e-06, + "loss": 0.4139, + "step": 7863 + }, + { + "epoch": 0.6386745715910014, + "grad_norm": 4.699011951945198, + "learning_rate": 1.5253215770072564e-06, + "loss": 0.4898, + "step": 7864 + }, + { + "epoch": 0.638755786567043, + "grad_norm": 5.911530191898309, + "learning_rate": 1.5247160266466693e-06, + "loss": 0.5325, + "step": 7865 + }, + { + "epoch": 0.6388370015430845, + "grad_norm": 3.680767366819412, + "learning_rate": 1.5241105437705706e-06, + "loss": 0.5321, + "step": 7866 + }, + { + "epoch": 0.6389182165191262, + "grad_norm": 6.398260678627941, + "learning_rate": 1.523505128420858e-06, + "loss": 0.4228, + "step": 7867 + }, + { + "epoch": 0.6389994314951677, + "grad_norm": 3.398702627971208, + "learning_rate": 1.522899780639423e-06, + "loss": 0.5505, + "step": 7868 + }, + { + "epoch": 0.6390806464712093, + "grad_norm": 6.202752332409573, + "learning_rate": 1.5222945004681504e-06, + "loss": 0.7697, + "step": 7869 + }, + { + "epoch": 0.6391618614472508, + "grad_norm": 5.874268780003323, + "learning_rate": 1.5216892879489253e-06, + "loss": 0.4696, + "step": 7870 + }, + { + "epoch": 0.6392430764232925, + "grad_norm": 4.404120702182698, + "learning_rate": 1.521084143123624e-06, + "loss": 0.6132, + "step": 7871 + }, + { + "epoch": 0.639324291399334, + "grad_norm": 4.8946588555368695, + "learning_rate": 1.5204790660341178e-06, + "loss": 0.4298, + "step": 7872 + }, + { + "epoch": 0.6394055063753756, + "grad_norm": 4.660366909329026, + "learning_rate": 1.519874056722277e-06, + "loss": 0.6014, + "step": 7873 + }, + { + "epoch": 0.6394867213514172, + "grad_norm": 9.926138073456057, + "learning_rate": 1.5192691152299649e-06, + "loss": 0.5607, + "step": 7874 + }, + { + "epoch": 0.6395679363274588, + "grad_norm": 6.862534435729142, + "learning_rate": 1.5186642415990382e-06, + "loss": 0.4831, + "step": 7875 + }, + { + "epoch": 0.6396491513035004, + "grad_norm": 5.109846150321177, + "learning_rate": 1.518059435871353e-06, + "loss": 0.3118, + "step": 7876 + }, + { + "epoch": 0.6397303662795419, + "grad_norm": 4.824987683441751, + "learning_rate": 1.5174546980887585e-06, + "loss": 0.407, + "step": 7877 + }, + { + "epoch": 0.6398115812555836, + "grad_norm": 3.088737083022684, + "learning_rate": 1.516850028293099e-06, + "loss": 0.6486, + "step": 7878 + }, + { + "epoch": 0.6398927962316251, + "grad_norm": 4.687914435447199, + "learning_rate": 1.516245426526213e-06, + "loss": 0.6395, + "step": 7879 + }, + { + "epoch": 0.6399740112076667, + "grad_norm": 6.45233418054512, + "learning_rate": 1.5156408928299377e-06, + "loss": 0.5865, + "step": 7880 + }, + { + "epoch": 0.6400552261837082, + "grad_norm": 4.4953810242052, + "learning_rate": 1.5150364272461035e-06, + "loss": 0.4263, + "step": 7881 + }, + { + "epoch": 0.6401364411597499, + "grad_norm": 5.174417200023911, + "learning_rate": 1.5144320298165346e-06, + "loss": 0.4651, + "step": 7882 + }, + { + "epoch": 0.6402176561357914, + "grad_norm": 8.999793348428359, + "learning_rate": 1.5138277005830538e-06, + "loss": 0.4134, + "step": 7883 + }, + { + "epoch": 0.640298871111833, + "grad_norm": 4.942872220787506, + "learning_rate": 1.5132234395874773e-06, + "loss": 0.5613, + "step": 7884 + }, + { + "epoch": 0.6403800860878746, + "grad_norm": 3.7848669467000433, + "learning_rate": 1.5126192468716152e-06, + "loss": 0.4878, + "step": 7885 + }, + { + "epoch": 0.6404613010639162, + "grad_norm": 4.408963540747303, + "learning_rate": 1.5120151224772765e-06, + "loss": 0.6558, + "step": 7886 + }, + { + "epoch": 0.6405425160399578, + "grad_norm": 6.167053391776367, + "learning_rate": 1.5114110664462624e-06, + "loss": 0.5179, + "step": 7887 + }, + { + "epoch": 0.6406237310159993, + "grad_norm": 4.691809504965685, + "learning_rate": 1.5108070788203699e-06, + "loss": 0.5723, + "step": 7888 + }, + { + "epoch": 0.640704945992041, + "grad_norm": 4.8601836373152, + "learning_rate": 1.5102031596413927e-06, + "loss": 0.5001, + "step": 7889 + }, + { + "epoch": 0.6407861609680825, + "grad_norm": 4.771276325593275, + "learning_rate": 1.509599308951119e-06, + "loss": 0.4755, + "step": 7890 + }, + { + "epoch": 0.6408673759441241, + "grad_norm": 7.6294635480914135, + "learning_rate": 1.5089955267913303e-06, + "loss": 0.3362, + "step": 7891 + }, + { + "epoch": 0.6409485909201657, + "grad_norm": 4.020044219945879, + "learning_rate": 1.5083918132038072e-06, + "loss": 0.3609, + "step": 7892 + }, + { + "epoch": 0.6410298058962073, + "grad_norm": 8.75122403334399, + "learning_rate": 1.5077881682303225e-06, + "loss": 0.4598, + "step": 7893 + }, + { + "epoch": 0.6411110208722488, + "grad_norm": 12.260107016009812, + "learning_rate": 1.5071845919126448e-06, + "loss": 0.5443, + "step": 7894 + }, + { + "epoch": 0.6411922358482904, + "grad_norm": 6.260976119003239, + "learning_rate": 1.5065810842925399e-06, + "loss": 0.456, + "step": 7895 + }, + { + "epoch": 0.641273450824332, + "grad_norm": 4.832778705163437, + "learning_rate": 1.5059776454117658e-06, + "loss": 0.4861, + "step": 7896 + }, + { + "epoch": 0.6413546658003736, + "grad_norm": 4.117223333471925, + "learning_rate": 1.505374275312078e-06, + "loss": 0.4146, + "step": 7897 + }, + { + "epoch": 0.6414358807764152, + "grad_norm": 7.715748444314561, + "learning_rate": 1.504770974035226e-06, + "loss": 0.4328, + "step": 7898 + }, + { + "epoch": 0.6415170957524567, + "grad_norm": 3.698130972907306, + "learning_rate": 1.5041677416229556e-06, + "loss": 0.5428, + "step": 7899 + }, + { + "epoch": 0.6415983107284984, + "grad_norm": 8.80799380441798, + "learning_rate": 1.5035645781170078e-06, + "loss": 0.5696, + "step": 7900 + }, + { + "epoch": 0.6416795257045399, + "grad_norm": 5.951094606026148, + "learning_rate": 1.502961483559116e-06, + "loss": 0.6523, + "step": 7901 + }, + { + "epoch": 0.6417607406805815, + "grad_norm": 5.956077850717039, + "learning_rate": 1.502358457991014e-06, + "loss": 0.4707, + "step": 7902 + }, + { + "epoch": 0.641841955656623, + "grad_norm": 17.140276946196106, + "learning_rate": 1.5017555014544273e-06, + "loss": 0.4322, + "step": 7903 + }, + { + "epoch": 0.6419231706326647, + "grad_norm": 4.87844418439407, + "learning_rate": 1.5011526139910754e-06, + "loss": 0.523, + "step": 7904 + }, + { + "epoch": 0.6420043856087062, + "grad_norm": 4.800050176008073, + "learning_rate": 1.5005497956426773e-06, + "loss": 0.4496, + "step": 7905 + }, + { + "epoch": 0.6420856005847478, + "grad_norm": 15.944468140204993, + "learning_rate": 1.4999470464509432e-06, + "loss": 0.4226, + "step": 7906 + }, + { + "epoch": 0.6421668155607894, + "grad_norm": 10.898961363880764, + "learning_rate": 1.4993443664575807e-06, + "loss": 0.4787, + "step": 7907 + }, + { + "epoch": 0.642248030536831, + "grad_norm": 3.386355939222327, + "learning_rate": 1.4987417557042928e-06, + "loss": 0.4948, + "step": 7908 + }, + { + "epoch": 0.6423292455128726, + "grad_norm": 4.044784764443736, + "learning_rate": 1.4981392142327761e-06, + "loss": 0.504, + "step": 7909 + }, + { + "epoch": 0.6424104604889141, + "grad_norm": 7.7128980212700915, + "learning_rate": 1.4975367420847225e-06, + "loss": 0.5462, + "step": 7910 + }, + { + "epoch": 0.6424916754649558, + "grad_norm": 7.797675381291463, + "learning_rate": 1.4969343393018224e-06, + "loss": 0.4488, + "step": 7911 + }, + { + "epoch": 0.6425728904409973, + "grad_norm": 3.7094574496346198, + "learning_rate": 1.4963320059257565e-06, + "loss": 0.5137, + "step": 7912 + }, + { + "epoch": 0.6426541054170389, + "grad_norm": 6.476581873676165, + "learning_rate": 1.4957297419982047e-06, + "loss": 0.4704, + "step": 7913 + }, + { + "epoch": 0.6427353203930805, + "grad_norm": 4.73377261927438, + "learning_rate": 1.4951275475608387e-06, + "loss": 0.463, + "step": 7914 + }, + { + "epoch": 0.6428165353691221, + "grad_norm": 3.1667300398717386, + "learning_rate": 1.4945254226553288e-06, + "loss": 0.5192, + "step": 7915 + }, + { + "epoch": 0.6428977503451636, + "grad_norm": 4.733929299465015, + "learning_rate": 1.4939233673233387e-06, + "loss": 0.6137, + "step": 7916 + }, + { + "epoch": 0.6429789653212052, + "grad_norm": 5.618667686017774, + "learning_rate": 1.4933213816065257e-06, + "loss": 0.4966, + "step": 7917 + }, + { + "epoch": 0.6430601802972468, + "grad_norm": 3.847952094017611, + "learning_rate": 1.492719465546546e-06, + "loss": 0.4784, + "step": 7918 + }, + { + "epoch": 0.6431413952732884, + "grad_norm": 4.697213023502698, + "learning_rate": 1.492117619185049e-06, + "loss": 0.4471, + "step": 7919 + }, + { + "epoch": 0.64322261024933, + "grad_norm": 5.479689495939854, + "learning_rate": 1.4915158425636772e-06, + "loss": 0.5378, + "step": 7920 + }, + { + "epoch": 0.6433038252253716, + "grad_norm": 3.3937052015697367, + "learning_rate": 1.4909141357240731e-06, + "loss": 0.4669, + "step": 7921 + }, + { + "epoch": 0.6433850402014132, + "grad_norm": 4.013372087007401, + "learning_rate": 1.4903124987078698e-06, + "loss": 0.4361, + "step": 7922 + }, + { + "epoch": 0.6434662551774547, + "grad_norm": 3.746362491526315, + "learning_rate": 1.4897109315566974e-06, + "loss": 0.4443, + "step": 7923 + }, + { + "epoch": 0.6435474701534963, + "grad_norm": 4.102308375643804, + "learning_rate": 1.4891094343121827e-06, + "loss": 0.446, + "step": 7924 + }, + { + "epoch": 0.6436286851295379, + "grad_norm": 3.8787498068648634, + "learning_rate": 1.488508007015944e-06, + "loss": 0.7058, + "step": 7925 + }, + { + "epoch": 0.6437099001055795, + "grad_norm": 7.193908220842526, + "learning_rate": 1.487906649709598e-06, + "loss": 0.4578, + "step": 7926 + }, + { + "epoch": 0.643791115081621, + "grad_norm": 7.588715391068836, + "learning_rate": 1.4873053624347567e-06, + "loss": 0.5096, + "step": 7927 + }, + { + "epoch": 0.6438723300576626, + "grad_norm": 5.508197098569508, + "learning_rate": 1.4867041452330238e-06, + "loss": 0.5221, + "step": 7928 + }, + { + "epoch": 0.6439535450337042, + "grad_norm": 3.6948692770323928, + "learning_rate": 1.4861029981460007e-06, + "loss": 0.4898, + "step": 7929 + }, + { + "epoch": 0.6440347600097458, + "grad_norm": 3.770725561207981, + "learning_rate": 1.4855019212152852e-06, + "loss": 0.4907, + "step": 7930 + }, + { + "epoch": 0.6441159749857874, + "grad_norm": 6.044600128479535, + "learning_rate": 1.484900914482467e-06, + "loss": 0.3815, + "step": 7931 + }, + { + "epoch": 0.644197189961829, + "grad_norm": 5.141487441925867, + "learning_rate": 1.484299977989134e-06, + "loss": 0.5212, + "step": 7932 + }, + { + "epoch": 0.6442784049378706, + "grad_norm": 6.722063710271542, + "learning_rate": 1.4836991117768657e-06, + "loss": 0.465, + "step": 7933 + }, + { + "epoch": 0.6443596199139121, + "grad_norm": 6.60995189007181, + "learning_rate": 1.4830983158872414e-06, + "loss": 0.3926, + "step": 7934 + }, + { + "epoch": 0.6444408348899537, + "grad_norm": 5.159161950257783, + "learning_rate": 1.482497590361831e-06, + "loss": 0.5679, + "step": 7935 + }, + { + "epoch": 0.6445220498659953, + "grad_norm": 4.722697499573461, + "learning_rate": 1.4818969352422018e-06, + "loss": 0.4301, + "step": 7936 + }, + { + "epoch": 0.6446032648420369, + "grad_norm": 5.60413330763897, + "learning_rate": 1.4812963505699179e-06, + "loss": 0.5284, + "step": 7937 + }, + { + "epoch": 0.6446844798180784, + "grad_norm": 4.906758714354853, + "learning_rate": 1.4806958363865342e-06, + "loss": 0.4881, + "step": 7938 + }, + { + "epoch": 0.64476569479412, + "grad_norm": 5.416581495053993, + "learning_rate": 1.4800953927336036e-06, + "loss": 0.4121, + "step": 7939 + }, + { + "epoch": 0.6448469097701616, + "grad_norm": 3.1479449081387605, + "learning_rate": 1.4794950196526753e-06, + "loss": 0.4134, + "step": 7940 + }, + { + "epoch": 0.6449281247462032, + "grad_norm": 4.49719543896871, + "learning_rate": 1.4788947171852899e-06, + "loss": 0.3979, + "step": 7941 + }, + { + "epoch": 0.6450093397222448, + "grad_norm": 5.0629449929653205, + "learning_rate": 1.4782944853729856e-06, + "loss": 0.4008, + "step": 7942 + }, + { + "epoch": 0.6450905546982864, + "grad_norm": 4.398126234430731, + "learning_rate": 1.4776943242572966e-06, + "loss": 0.6205, + "step": 7943 + }, + { + "epoch": 0.645171769674328, + "grad_norm": 4.1699063676723425, + "learning_rate": 1.4770942338797491e-06, + "loss": 0.5619, + "step": 7944 + }, + { + "epoch": 0.6452529846503695, + "grad_norm": 83.64698952954932, + "learning_rate": 1.4764942142818667e-06, + "loss": 0.4946, + "step": 7945 + }, + { + "epoch": 0.6453341996264111, + "grad_norm": 5.2534053694927625, + "learning_rate": 1.475894265505169e-06, + "loss": 0.6233, + "step": 7946 + }, + { + "epoch": 0.6454154146024527, + "grad_norm": 3.941627067484926, + "learning_rate": 1.4752943875911673e-06, + "loss": 0.4209, + "step": 7947 + }, + { + "epoch": 0.6454966295784943, + "grad_norm": 9.922976413484275, + "learning_rate": 1.4746945805813707e-06, + "loss": 0.5124, + "step": 7948 + }, + { + "epoch": 0.6455778445545358, + "grad_norm": 6.9619380350835085, + "learning_rate": 1.4740948445172834e-06, + "loss": 0.3986, + "step": 7949 + }, + { + "epoch": 0.6456590595305775, + "grad_norm": 4.2939089133962325, + "learning_rate": 1.4734951794404035e-06, + "loss": 0.4537, + "step": 7950 + }, + { + "epoch": 0.645740274506619, + "grad_norm": 3.1416028655227293, + "learning_rate": 1.4728955853922238e-06, + "loss": 0.5554, + "step": 7951 + }, + { + "epoch": 0.6458214894826606, + "grad_norm": 17.016801772420305, + "learning_rate": 1.4722960624142336e-06, + "loss": 0.4172, + "step": 7952 + }, + { + "epoch": 0.6459027044587022, + "grad_norm": 5.911786929784765, + "learning_rate": 1.4716966105479175e-06, + "loss": 0.3805, + "step": 7953 + }, + { + "epoch": 0.6459839194347438, + "grad_norm": 4.538417255604256, + "learning_rate": 1.471097229834753e-06, + "loss": 0.4062, + "step": 7954 + }, + { + "epoch": 0.6460651344107854, + "grad_norm": 7.061375321676026, + "learning_rate": 1.4704979203162148e-06, + "loss": 0.4817, + "step": 7955 + }, + { + "epoch": 0.6461463493868269, + "grad_norm": 8.396993615373008, + "learning_rate": 1.4698986820337729e-06, + "loss": 0.3514, + "step": 7956 + }, + { + "epoch": 0.6462275643628685, + "grad_norm": 5.11600701972491, + "learning_rate": 1.4692995150288896e-06, + "loss": 0.4232, + "step": 7957 + }, + { + "epoch": 0.6463087793389101, + "grad_norm": 3.9201591191720446, + "learning_rate": 1.4687004193430248e-06, + "loss": 0.6237, + "step": 7958 + }, + { + "epoch": 0.6463899943149517, + "grad_norm": 4.347166743753607, + "learning_rate": 1.4681013950176338e-06, + "loss": 0.4223, + "step": 7959 + }, + { + "epoch": 0.6464712092909932, + "grad_norm": 4.5755798327245625, + "learning_rate": 1.4675024420941643e-06, + "loss": 0.4167, + "step": 7960 + }, + { + "epoch": 0.6465524242670349, + "grad_norm": 3.768261457483465, + "learning_rate": 1.4669035606140613e-06, + "loss": 0.531, + "step": 7961 + }, + { + "epoch": 0.6466336392430764, + "grad_norm": 4.347537189926498, + "learning_rate": 1.4663047506187649e-06, + "loss": 0.4719, + "step": 7962 + }, + { + "epoch": 0.646714854219118, + "grad_norm": 4.256186132283668, + "learning_rate": 1.4657060121497095e-06, + "loss": 0.55, + "step": 7963 + }, + { + "epoch": 0.6467960691951596, + "grad_norm": 4.071283516303743, + "learning_rate": 1.4651073452483228e-06, + "loss": 0.762, + "step": 7964 + }, + { + "epoch": 0.6468772841712012, + "grad_norm": 10.936892803098575, + "learning_rate": 1.4645087499560313e-06, + "loss": 0.5069, + "step": 7965 + }, + { + "epoch": 0.6469584991472428, + "grad_norm": 4.622669916018039, + "learning_rate": 1.4639102263142546e-06, + "loss": 0.4825, + "step": 7966 + }, + { + "epoch": 0.6470397141232843, + "grad_norm": 5.995534947389931, + "learning_rate": 1.463311774364406e-06, + "loss": 0.5098, + "step": 7967 + }, + { + "epoch": 0.647120929099326, + "grad_norm": 81.25869393551474, + "learning_rate": 1.4627133941478958e-06, + "loss": 0.6188, + "step": 7968 + }, + { + "epoch": 0.6472021440753675, + "grad_norm": 5.489822227742221, + "learning_rate": 1.46211508570613e-06, + "loss": 0.4715, + "step": 7969 + }, + { + "epoch": 0.6472833590514091, + "grad_norm": 5.211788919279321, + "learning_rate": 1.4615168490805066e-06, + "loss": 0.3925, + "step": 7970 + }, + { + "epoch": 0.6473645740274506, + "grad_norm": 4.441097732454784, + "learning_rate": 1.4609186843124208e-06, + "loss": 0.5827, + "step": 7971 + }, + { + "epoch": 0.6474457890034923, + "grad_norm": 4.918274629630136, + "learning_rate": 1.4603205914432638e-06, + "loss": 0.5865, + "step": 7972 + }, + { + "epoch": 0.6475270039795338, + "grad_norm": 5.381182030506248, + "learning_rate": 1.4597225705144189e-06, + "loss": 0.427, + "step": 7973 + }, + { + "epoch": 0.6476082189555754, + "grad_norm": 4.47373690879082, + "learning_rate": 1.459124621567266e-06, + "loss": 0.5468, + "step": 7974 + }, + { + "epoch": 0.647689433931617, + "grad_norm": 4.853763535922953, + "learning_rate": 1.4585267446431817e-06, + "loss": 0.4893, + "step": 7975 + }, + { + "epoch": 0.6477706489076586, + "grad_norm": 12.67331094044437, + "learning_rate": 1.4579289397835344e-06, + "loss": 0.6503, + "step": 7976 + }, + { + "epoch": 0.6478518638837002, + "grad_norm": 3.5376573305444725, + "learning_rate": 1.4573312070296885e-06, + "loss": 0.48, + "step": 7977 + }, + { + "epoch": 0.6479330788597417, + "grad_norm": 6.986996329380094, + "learning_rate": 1.4567335464230062e-06, + "loss": 0.4793, + "step": 7978 + }, + { + "epoch": 0.6480142938357834, + "grad_norm": 6.098834362052517, + "learning_rate": 1.4561359580048394e-06, + "loss": 0.567, + "step": 7979 + }, + { + "epoch": 0.6480955088118249, + "grad_norm": 5.728827122709186, + "learning_rate": 1.4555384418165405e-06, + "loss": 0.5839, + "step": 7980 + }, + { + "epoch": 0.6481767237878665, + "grad_norm": 6.108126965155767, + "learning_rate": 1.4549409978994543e-06, + "loss": 0.721, + "step": 7981 + }, + { + "epoch": 0.648257938763908, + "grad_norm": 12.175353754173727, + "learning_rate": 1.45434362629492e-06, + "loss": 0.3989, + "step": 7982 + }, + { + "epoch": 0.6483391537399497, + "grad_norm": 5.726270390332232, + "learning_rate": 1.453746327044272e-06, + "loss": 0.5943, + "step": 7983 + }, + { + "epoch": 0.6484203687159912, + "grad_norm": 8.228427503586767, + "learning_rate": 1.4531491001888421e-06, + "loss": 0.3702, + "step": 7984 + }, + { + "epoch": 0.6485015836920328, + "grad_norm": 6.718803197230006, + "learning_rate": 1.4525519457699527e-06, + "loss": 0.4701, + "step": 7985 + }, + { + "epoch": 0.6485827986680744, + "grad_norm": 6.288459331865035, + "learning_rate": 1.451954863828926e-06, + "loss": 0.454, + "step": 7986 + }, + { + "epoch": 0.648664013644116, + "grad_norm": 5.184071201153414, + "learning_rate": 1.4513578544070753e-06, + "loss": 0.4614, + "step": 7987 + }, + { + "epoch": 0.6487452286201576, + "grad_norm": 6.12187597162872, + "learning_rate": 1.4507609175457121e-06, + "loss": 0.4276, + "step": 7988 + }, + { + "epoch": 0.6488264435961991, + "grad_norm": 6.050006695871517, + "learning_rate": 1.4501640532861405e-06, + "loss": 0.4831, + "step": 7989 + }, + { + "epoch": 0.6489076585722408, + "grad_norm": 6.399026819639625, + "learning_rate": 1.4495672616696594e-06, + "loss": 0.3898, + "step": 7990 + }, + { + "epoch": 0.6489888735482823, + "grad_norm": 5.876464671442483, + "learning_rate": 1.448970542737565e-06, + "loss": 0.5073, + "step": 7991 + }, + { + "epoch": 0.6490700885243239, + "grad_norm": 4.813231306921125, + "learning_rate": 1.4483738965311455e-06, + "loss": 0.5262, + "step": 7992 + }, + { + "epoch": 0.6491513035003654, + "grad_norm": 3.7301648640363716, + "learning_rate": 1.4477773230916872e-06, + "loss": 0.4813, + "step": 7993 + }, + { + "epoch": 0.6492325184764071, + "grad_norm": 5.647909285913068, + "learning_rate": 1.44718082246047e-06, + "loss": 0.442, + "step": 7994 + }, + { + "epoch": 0.6493137334524486, + "grad_norm": 5.414289909475496, + "learning_rate": 1.4465843946787683e-06, + "loss": 0.3569, + "step": 7995 + }, + { + "epoch": 0.6493949484284902, + "grad_norm": 7.022409413689571, + "learning_rate": 1.44598803978785e-06, + "loss": 0.485, + "step": 7996 + }, + { + "epoch": 0.6494761634045318, + "grad_norm": 5.325732469754769, + "learning_rate": 1.4453917578289823e-06, + "loss": 0.6726, + "step": 7997 + }, + { + "epoch": 0.6495573783805734, + "grad_norm": 6.073037858190773, + "learning_rate": 1.4447955488434223e-06, + "loss": 0.5305, + "step": 7998 + }, + { + "epoch": 0.649638593356615, + "grad_norm": 7.0964517666533515, + "learning_rate": 1.4441994128724258e-06, + "loss": 0.4777, + "step": 7999 + }, + { + "epoch": 0.6497198083326565, + "grad_norm": 5.760466887618525, + "learning_rate": 1.443603349957243e-06, + "loss": 0.5041, + "step": 8000 + }, + { + "epoch": 0.6498010233086982, + "grad_norm": 4.591683516314596, + "learning_rate": 1.4430073601391175e-06, + "loss": 0.5213, + "step": 8001 + }, + { + "epoch": 0.6498822382847397, + "grad_norm": 5.711693932834374, + "learning_rate": 1.442411443459289e-06, + "loss": 0.5668, + "step": 8002 + }, + { + "epoch": 0.6499634532607813, + "grad_norm": 5.284351713643368, + "learning_rate": 1.44181559995899e-06, + "loss": 0.6332, + "step": 8003 + }, + { + "epoch": 0.6500446682368228, + "grad_norm": 7.634523671563961, + "learning_rate": 1.4412198296794516e-06, + "loss": 0.4598, + "step": 8004 + }, + { + "epoch": 0.6501258832128645, + "grad_norm": 4.421291562696544, + "learning_rate": 1.4406241326618981e-06, + "loss": 0.585, + "step": 8005 + }, + { + "epoch": 0.650207098188906, + "grad_norm": 4.721051742091286, + "learning_rate": 1.4400285089475468e-06, + "loss": 0.7653, + "step": 8006 + }, + { + "epoch": 0.6502883131649476, + "grad_norm": 5.351925711601577, + "learning_rate": 1.4394329585776143e-06, + "loss": 0.3979, + "step": 8007 + }, + { + "epoch": 0.6503695281409893, + "grad_norm": 4.447327868885781, + "learning_rate": 1.4388374815933078e-06, + "loss": 0.4846, + "step": 8008 + }, + { + "epoch": 0.6504507431170308, + "grad_norm": 4.016961136441612, + "learning_rate": 1.4382420780358306e-06, + "loss": 0.4739, + "step": 8009 + }, + { + "epoch": 0.6505319580930724, + "grad_norm": 5.414485646946576, + "learning_rate": 1.4376467479463832e-06, + "loss": 0.3597, + "step": 8010 + }, + { + "epoch": 0.6506131730691139, + "grad_norm": 11.24686014696604, + "learning_rate": 1.4370514913661576e-06, + "loss": 0.5058, + "step": 8011 + }, + { + "epoch": 0.6506943880451556, + "grad_norm": 3.574284491275758, + "learning_rate": 1.436456308336343e-06, + "loss": 0.6707, + "step": 8012 + }, + { + "epoch": 0.6507756030211971, + "grad_norm": 4.818858068808228, + "learning_rate": 1.4358611988981242e-06, + "loss": 0.4094, + "step": 8013 + }, + { + "epoch": 0.6508568179972387, + "grad_norm": 4.129816073543707, + "learning_rate": 1.4352661630926783e-06, + "loss": 0.5239, + "step": 8014 + }, + { + "epoch": 0.6509380329732802, + "grad_norm": 7.2250869844226155, + "learning_rate": 1.4346712009611786e-06, + "loss": 0.4763, + "step": 8015 + }, + { + "epoch": 0.6510192479493219, + "grad_norm": 6.519568020534992, + "learning_rate": 1.434076312544794e-06, + "loss": 0.4563, + "step": 8016 + }, + { + "epoch": 0.6511004629253634, + "grad_norm": 5.982613565742739, + "learning_rate": 1.4334814978846863e-06, + "loss": 0.4024, + "step": 8017 + }, + { + "epoch": 0.651181677901405, + "grad_norm": 6.189908706083811, + "learning_rate": 1.4328867570220148e-06, + "loss": 0.4997, + "step": 8018 + }, + { + "epoch": 0.6512628928774467, + "grad_norm": 6.213197790148053, + "learning_rate": 1.4322920899979327e-06, + "loss": 0.5354, + "step": 8019 + }, + { + "epoch": 0.6513441078534882, + "grad_norm": 4.184480691904144, + "learning_rate": 1.4316974968535873e-06, + "loss": 0.3249, + "step": 8020 + }, + { + "epoch": 0.6514253228295298, + "grad_norm": 4.810772139069584, + "learning_rate": 1.4311029776301216e-06, + "loss": 0.4938, + "step": 8021 + }, + { + "epoch": 0.6515065378055713, + "grad_norm": 6.385985541045838, + "learning_rate": 1.4305085323686714e-06, + "loss": 0.4544, + "step": 8022 + }, + { + "epoch": 0.651587752781613, + "grad_norm": 4.1470201913398155, + "learning_rate": 1.4299141611103717e-06, + "loss": 0.5221, + "step": 8023 + }, + { + "epoch": 0.6516689677576545, + "grad_norm": 4.404511419426839, + "learning_rate": 1.4293198638963476e-06, + "loss": 0.5142, + "step": 8024 + }, + { + "epoch": 0.6517501827336961, + "grad_norm": 4.771427690672761, + "learning_rate": 1.4287256407677225e-06, + "loss": 0.6298, + "step": 8025 + }, + { + "epoch": 0.6518313977097376, + "grad_norm": 5.110969640062156, + "learning_rate": 1.4281314917656144e-06, + "loss": 0.4597, + "step": 8026 + }, + { + "epoch": 0.6519126126857793, + "grad_norm": 4.07743528288326, + "learning_rate": 1.4275374169311345e-06, + "loss": 0.5317, + "step": 8027 + }, + { + "epoch": 0.6519938276618208, + "grad_norm": 3.381461916822027, + "learning_rate": 1.426943416305388e-06, + "loss": 0.6096, + "step": 8028 + }, + { + "epoch": 0.6520750426378624, + "grad_norm": 6.120291184514056, + "learning_rate": 1.4263494899294794e-06, + "loss": 0.3527, + "step": 8029 + }, + { + "epoch": 0.6521562576139041, + "grad_norm": 6.654443944669531, + "learning_rate": 1.4257556378445025e-06, + "loss": 0.479, + "step": 8030 + }, + { + "epoch": 0.6522374725899456, + "grad_norm": 5.964757640753549, + "learning_rate": 1.4251618600915503e-06, + "loss": 0.4986, + "step": 8031 + }, + { + "epoch": 0.6523186875659872, + "grad_norm": 4.555104801980313, + "learning_rate": 1.4245681567117097e-06, + "loss": 0.4979, + "step": 8032 + }, + { + "epoch": 0.6523999025420287, + "grad_norm": 4.9235533317425, + "learning_rate": 1.4239745277460614e-06, + "loss": 0.4747, + "step": 8033 + }, + { + "epoch": 0.6524811175180704, + "grad_norm": 6.579534318663932, + "learning_rate": 1.4233809732356798e-06, + "loss": 0.5299, + "step": 8034 + }, + { + "epoch": 0.6525623324941119, + "grad_norm": 5.083100688498042, + "learning_rate": 1.4227874932216378e-06, + "loss": 0.3675, + "step": 8035 + }, + { + "epoch": 0.6526435474701535, + "grad_norm": 23.029068662665217, + "learning_rate": 1.4221940877450006e-06, + "loss": 0.5651, + "step": 8036 + }, + { + "epoch": 0.652724762446195, + "grad_norm": 5.851390054577819, + "learning_rate": 1.4216007568468272e-06, + "loss": 0.52, + "step": 8037 + }, + { + "epoch": 0.6528059774222367, + "grad_norm": 3.2181868045877704, + "learning_rate": 1.4210075005681737e-06, + "loss": 0.5162, + "step": 8038 + }, + { + "epoch": 0.6528871923982782, + "grad_norm": 7.483188427836994, + "learning_rate": 1.420414318950092e-06, + "loss": 0.4435, + "step": 8039 + }, + { + "epoch": 0.6529684073743198, + "grad_norm": 4.4784108288995315, + "learning_rate": 1.4198212120336255e-06, + "loss": 0.441, + "step": 8040 + }, + { + "epoch": 0.6530496223503615, + "grad_norm": 6.391569173443781, + "learning_rate": 1.4192281798598133e-06, + "loss": 0.492, + "step": 8041 + }, + { + "epoch": 0.653130837326403, + "grad_norm": 5.75063446601935, + "learning_rate": 1.4186352224696926e-06, + "loss": 0.5992, + "step": 8042 + }, + { + "epoch": 0.6532120523024446, + "grad_norm": 6.211326632664685, + "learning_rate": 1.4180423399042902e-06, + "loss": 0.4642, + "step": 8043 + }, + { + "epoch": 0.6532932672784861, + "grad_norm": 11.997397806821628, + "learning_rate": 1.4174495322046316e-06, + "loss": 0.5696, + "step": 8044 + }, + { + "epoch": 0.6533744822545278, + "grad_norm": 7.193039419958394, + "learning_rate": 1.4168567994117375e-06, + "loss": 0.4744, + "step": 8045 + }, + { + "epoch": 0.6534556972305693, + "grad_norm": 4.494283159785974, + "learning_rate": 1.41626414156662e-06, + "loss": 0.3842, + "step": 8046 + }, + { + "epoch": 0.6535369122066109, + "grad_norm": 5.857275012139722, + "learning_rate": 1.4156715587102875e-06, + "loss": 0.4835, + "step": 8047 + }, + { + "epoch": 0.6536181271826524, + "grad_norm": 6.4200372823639675, + "learning_rate": 1.4150790508837453e-06, + "loss": 0.4392, + "step": 8048 + }, + { + "epoch": 0.6536993421586941, + "grad_norm": 3.22921759731994, + "learning_rate": 1.4144866181279908e-06, + "loss": 0.429, + "step": 8049 + }, + { + "epoch": 0.6537805571347356, + "grad_norm": 4.691847016227414, + "learning_rate": 1.4138942604840167e-06, + "loss": 0.4329, + "step": 8050 + }, + { + "epoch": 0.6538617721107772, + "grad_norm": 5.045037115274969, + "learning_rate": 1.4133019779928115e-06, + "loss": 0.4094, + "step": 8051 + }, + { + "epoch": 0.6539429870868189, + "grad_norm": 3.3537113111805317, + "learning_rate": 1.4127097706953591e-06, + "loss": 0.6154, + "step": 8052 + }, + { + "epoch": 0.6540242020628604, + "grad_norm": 6.660882673507178, + "learning_rate": 1.4121176386326352e-06, + "loss": 0.3856, + "step": 8053 + }, + { + "epoch": 0.654105417038902, + "grad_norm": 4.011343252532216, + "learning_rate": 1.4115255818456138e-06, + "loss": 0.5311, + "step": 8054 + }, + { + "epoch": 0.6541866320149435, + "grad_norm": 3.3532310150002513, + "learning_rate": 1.4109336003752619e-06, + "loss": 0.4813, + "step": 8055 + }, + { + "epoch": 0.6542678469909852, + "grad_norm": 4.86729352164219, + "learning_rate": 1.4103416942625397e-06, + "loss": 0.4928, + "step": 8056 + }, + { + "epoch": 0.6543490619670267, + "grad_norm": 3.96263482227693, + "learning_rate": 1.4097498635484057e-06, + "loss": 0.447, + "step": 8057 + }, + { + "epoch": 0.6544302769430683, + "grad_norm": 5.204864204379556, + "learning_rate": 1.4091581082738122e-06, + "loss": 0.3607, + "step": 8058 + }, + { + "epoch": 0.6545114919191098, + "grad_norm": 4.292415805341833, + "learning_rate": 1.4085664284797041e-06, + "loss": 0.448, + "step": 8059 + }, + { + "epoch": 0.6545927068951515, + "grad_norm": 7.880826154296216, + "learning_rate": 1.407974824207022e-06, + "loss": 0.6328, + "step": 8060 + }, + { + "epoch": 0.654673921871193, + "grad_norm": 5.310250315901305, + "learning_rate": 1.4073832954967032e-06, + "loss": 0.4029, + "step": 8061 + }, + { + "epoch": 0.6547551368472346, + "grad_norm": 6.728553995913191, + "learning_rate": 1.406791842389677e-06, + "loss": 0.5177, + "step": 8062 + }, + { + "epoch": 0.6548363518232763, + "grad_norm": 5.981311174156778, + "learning_rate": 1.4062004649268696e-06, + "loss": 0.4751, + "step": 8063 + }, + { + "epoch": 0.6549175667993178, + "grad_norm": 6.024059197477494, + "learning_rate": 1.405609163149202e-06, + "loss": 0.4305, + "step": 8064 + }, + { + "epoch": 0.6549987817753594, + "grad_norm": 7.798448146594784, + "learning_rate": 1.4050179370975886e-06, + "loss": 0.4975, + "step": 8065 + }, + { + "epoch": 0.6550799967514009, + "grad_norm": 4.4502104987754105, + "learning_rate": 1.4044267868129374e-06, + "loss": 0.4111, + "step": 8066 + }, + { + "epoch": 0.6551612117274426, + "grad_norm": 3.807078310048636, + "learning_rate": 1.4038357123361556e-06, + "loss": 0.5769, + "step": 8067 + }, + { + "epoch": 0.6552424267034841, + "grad_norm": 4.154767587614258, + "learning_rate": 1.4032447137081414e-06, + "loss": 0.5461, + "step": 8068 + }, + { + "epoch": 0.6553236416795257, + "grad_norm": 5.8170559826468935, + "learning_rate": 1.4026537909697873e-06, + "loss": 0.5508, + "step": 8069 + }, + { + "epoch": 0.6554048566555672, + "grad_norm": 6.732343822655561, + "learning_rate": 1.4020629441619831e-06, + "loss": 0.4174, + "step": 8070 + }, + { + "epoch": 0.6554860716316089, + "grad_norm": 15.814471928424329, + "learning_rate": 1.4014721733256137e-06, + "loss": 0.3694, + "step": 8071 + }, + { + "epoch": 0.6555672866076504, + "grad_norm": 5.728680709044823, + "learning_rate": 1.4008814785015548e-06, + "loss": 0.5148, + "step": 8072 + }, + { + "epoch": 0.655648501583692, + "grad_norm": 5.735426011218967, + "learning_rate": 1.4002908597306817e-06, + "loss": 0.3364, + "step": 8073 + }, + { + "epoch": 0.6557297165597337, + "grad_norm": 3.8279642749863916, + "learning_rate": 1.3997003170538608e-06, + "loss": 0.4943, + "step": 8074 + }, + { + "epoch": 0.6558109315357752, + "grad_norm": 8.191610658404883, + "learning_rate": 1.3991098505119537e-06, + "loss": 0.6124, + "step": 8075 + }, + { + "epoch": 0.6558921465118168, + "grad_norm": 22.562198626943964, + "learning_rate": 1.3985194601458192e-06, + "loss": 0.6547, + "step": 8076 + }, + { + "epoch": 0.6559733614878583, + "grad_norm": 5.841079008969222, + "learning_rate": 1.3979291459963087e-06, + "loss": 0.2968, + "step": 8077 + }, + { + "epoch": 0.6560545764639, + "grad_norm": 3.714067743777654, + "learning_rate": 1.397338908104269e-06, + "loss": 0.43, + "step": 8078 + }, + { + "epoch": 0.6561357914399415, + "grad_norm": 4.366280468082913, + "learning_rate": 1.3967487465105401e-06, + "loss": 0.643, + "step": 8079 + }, + { + "epoch": 0.6562170064159831, + "grad_norm": 4.043858317470031, + "learning_rate": 1.3961586612559602e-06, + "loss": 0.6495, + "step": 8080 + }, + { + "epoch": 0.6562982213920246, + "grad_norm": 4.59900089958901, + "learning_rate": 1.3955686523813588e-06, + "loss": 0.3702, + "step": 8081 + }, + { + "epoch": 0.6563794363680663, + "grad_norm": 6.449912994740261, + "learning_rate": 1.3949787199275606e-06, + "loss": 0.5347, + "step": 8082 + }, + { + "epoch": 0.6564606513441078, + "grad_norm": 5.373213623100858, + "learning_rate": 1.3943888639353866e-06, + "loss": 0.4742, + "step": 8083 + }, + { + "epoch": 0.6565418663201494, + "grad_norm": 5.296870374063085, + "learning_rate": 1.3937990844456528e-06, + "loss": 0.4213, + "step": 8084 + }, + { + "epoch": 0.6566230812961911, + "grad_norm": 3.8879241039555734, + "learning_rate": 1.393209381499167e-06, + "loss": 0.3738, + "step": 8085 + }, + { + "epoch": 0.6567042962722326, + "grad_norm": 4.289208191857055, + "learning_rate": 1.3926197551367355e-06, + "loss": 0.4136, + "step": 8086 + }, + { + "epoch": 0.6567855112482742, + "grad_norm": 5.234771066049035, + "learning_rate": 1.3920302053991564e-06, + "loss": 0.3542, + "step": 8087 + }, + { + "epoch": 0.6568667262243157, + "grad_norm": 5.571643681394787, + "learning_rate": 1.3914407323272216e-06, + "loss": 0.4513, + "step": 8088 + }, + { + "epoch": 0.6569479412003574, + "grad_norm": 7.213207108596739, + "learning_rate": 1.3908513359617217e-06, + "loss": 0.5061, + "step": 8089 + }, + { + "epoch": 0.6570291561763989, + "grad_norm": 4.780947638090955, + "learning_rate": 1.39026201634344e-06, + "loss": 0.3634, + "step": 8090 + }, + { + "epoch": 0.6571103711524405, + "grad_norm": 3.0166870867114053, + "learning_rate": 1.3896727735131538e-06, + "loss": 0.6613, + "step": 8091 + }, + { + "epoch": 0.657191586128482, + "grad_norm": 3.8519737800090974, + "learning_rate": 1.3890836075116343e-06, + "loss": 0.4642, + "step": 8092 + }, + { + "epoch": 0.6572728011045237, + "grad_norm": 4.334939110181824, + "learning_rate": 1.3884945183796505e-06, + "loss": 0.5912, + "step": 8093 + }, + { + "epoch": 0.6573540160805652, + "grad_norm": 6.294374675194089, + "learning_rate": 1.3879055061579635e-06, + "loss": 0.7303, + "step": 8094 + }, + { + "epoch": 0.6574352310566068, + "grad_norm": 3.7598306069040546, + "learning_rate": 1.3873165708873286e-06, + "loss": 0.5085, + "step": 8095 + }, + { + "epoch": 0.6575164460326485, + "grad_norm": 6.863418465636099, + "learning_rate": 1.3867277126084989e-06, + "loss": 0.5744, + "step": 8096 + }, + { + "epoch": 0.65759766100869, + "grad_norm": 7.054233776542322, + "learning_rate": 1.3861389313622197e-06, + "loss": 0.5576, + "step": 8097 + }, + { + "epoch": 0.6576788759847316, + "grad_norm": 4.4365820824808555, + "learning_rate": 1.3855502271892313e-06, + "loss": 0.423, + "step": 8098 + }, + { + "epoch": 0.6577600909607731, + "grad_norm": 6.862928291636716, + "learning_rate": 1.3849616001302696e-06, + "loss": 0.4621, + "step": 8099 + }, + { + "epoch": 0.6578413059368148, + "grad_norm": 5.667668412588925, + "learning_rate": 1.3843730502260639e-06, + "loss": 0.4926, + "step": 8100 + }, + { + "epoch": 0.6579225209128563, + "grad_norm": 4.227345644846646, + "learning_rate": 1.3837845775173375e-06, + "loss": 0.6139, + "step": 8101 + }, + { + "epoch": 0.6580037358888979, + "grad_norm": 4.744805542732661, + "learning_rate": 1.383196182044811e-06, + "loss": 0.5144, + "step": 8102 + }, + { + "epoch": 0.6580849508649395, + "grad_norm": 3.999861099199686, + "learning_rate": 1.3826078638491994e-06, + "loss": 0.561, + "step": 8103 + }, + { + "epoch": 0.6581661658409811, + "grad_norm": 4.070333113027189, + "learning_rate": 1.3820196229712085e-06, + "loss": 0.4381, + "step": 8104 + }, + { + "epoch": 0.6582473808170226, + "grad_norm": 6.884502103701524, + "learning_rate": 1.3814314594515443e-06, + "loss": 0.6255, + "step": 8105 + }, + { + "epoch": 0.6583285957930642, + "grad_norm": 4.918735322058878, + "learning_rate": 1.3808433733309028e-06, + "loss": 0.548, + "step": 8106 + }, + { + "epoch": 0.6584098107691059, + "grad_norm": 16.27622530232076, + "learning_rate": 1.380255364649976e-06, + "loss": 0.3939, + "step": 8107 + }, + { + "epoch": 0.6584910257451474, + "grad_norm": 10.25631338164915, + "learning_rate": 1.3796674334494529e-06, + "loss": 0.581, + "step": 8108 + }, + { + "epoch": 0.658572240721189, + "grad_norm": 5.903422438123663, + "learning_rate": 1.3790795797700129e-06, + "loss": 0.5879, + "step": 8109 + }, + { + "epoch": 0.6586534556972305, + "grad_norm": 5.425671652540845, + "learning_rate": 1.3784918036523346e-06, + "loss": 0.4939, + "step": 8110 + }, + { + "epoch": 0.6587346706732722, + "grad_norm": 6.259094931538714, + "learning_rate": 1.377904105137087e-06, + "loss": 0.5689, + "step": 8111 + }, + { + "epoch": 0.6588158856493137, + "grad_norm": 5.861839266316476, + "learning_rate": 1.3773164842649377e-06, + "loss": 0.4444, + "step": 8112 + }, + { + "epoch": 0.6588971006253553, + "grad_norm": 6.629921332417931, + "learning_rate": 1.376728941076546e-06, + "loss": 0.3919, + "step": 8113 + }, + { + "epoch": 0.6589783156013969, + "grad_norm": 6.496651649704183, + "learning_rate": 1.3761414756125658e-06, + "loss": 0.5354, + "step": 8114 + }, + { + "epoch": 0.6590595305774385, + "grad_norm": 4.3858502738087655, + "learning_rate": 1.3755540879136474e-06, + "loss": 0.5213, + "step": 8115 + }, + { + "epoch": 0.65914074555348, + "grad_norm": 5.705183500815472, + "learning_rate": 1.3749667780204365e-06, + "loss": 0.5628, + "step": 8116 + }, + { + "epoch": 0.6592219605295216, + "grad_norm": 3.5828316864822236, + "learning_rate": 1.3743795459735692e-06, + "loss": 0.518, + "step": 8117 + }, + { + "epoch": 0.6593031755055633, + "grad_norm": 4.1843405148298025, + "learning_rate": 1.373792391813681e-06, + "loss": 0.6037, + "step": 8118 + }, + { + "epoch": 0.6593843904816048, + "grad_norm": 5.543134932829188, + "learning_rate": 1.3732053155813987e-06, + "loss": 0.3632, + "step": 8119 + }, + { + "epoch": 0.6594656054576464, + "grad_norm": 3.9215981353205938, + "learning_rate": 1.3726183173173441e-06, + "loss": 0.4956, + "step": 8120 + }, + { + "epoch": 0.659546820433688, + "grad_norm": 6.56518519854498, + "learning_rate": 1.3720313970621369e-06, + "loss": 0.479, + "step": 8121 + }, + { + "epoch": 0.6596280354097296, + "grad_norm": 8.721756176365712, + "learning_rate": 1.3714445548563856e-06, + "loss": 0.5228, + "step": 8122 + }, + { + "epoch": 0.6597092503857711, + "grad_norm": 6.414976174126078, + "learning_rate": 1.3708577907406988e-06, + "loss": 0.5035, + "step": 8123 + }, + { + "epoch": 0.6597904653618127, + "grad_norm": 5.440110547817854, + "learning_rate": 1.3702711047556777e-06, + "loss": 0.4776, + "step": 8124 + }, + { + "epoch": 0.6598716803378543, + "grad_norm": 5.614130528625265, + "learning_rate": 1.3696844969419174e-06, + "loss": 0.3457, + "step": 8125 + }, + { + "epoch": 0.6599528953138959, + "grad_norm": 4.022888633337103, + "learning_rate": 1.3690979673400067e-06, + "loss": 0.445, + "step": 8126 + }, + { + "epoch": 0.6600341102899374, + "grad_norm": 3.741552498785602, + "learning_rate": 1.3685115159905325e-06, + "loss": 0.3523, + "step": 8127 + }, + { + "epoch": 0.660115325265979, + "grad_norm": 5.950815202545946, + "learning_rate": 1.3679251429340717e-06, + "loss": 0.738, + "step": 8128 + }, + { + "epoch": 0.6601965402420207, + "grad_norm": 9.51105805660359, + "learning_rate": 1.367338848211201e-06, + "loss": 0.4257, + "step": 8129 + }, + { + "epoch": 0.6602777552180622, + "grad_norm": 2.9131542456623314, + "learning_rate": 1.3667526318624862e-06, + "loss": 0.4203, + "step": 8130 + }, + { + "epoch": 0.6603589701941038, + "grad_norm": 6.754300006832807, + "learning_rate": 1.366166493928493e-06, + "loss": 0.6132, + "step": 8131 + }, + { + "epoch": 0.6604401851701454, + "grad_norm": 11.724662577057464, + "learning_rate": 1.3655804344497775e-06, + "loss": 0.4595, + "step": 8132 + }, + { + "epoch": 0.660521400146187, + "grad_norm": 5.387659360154549, + "learning_rate": 1.364994453466891e-06, + "loss": 0.4081, + "step": 8133 + }, + { + "epoch": 0.6606026151222285, + "grad_norm": 5.107977084416256, + "learning_rate": 1.3644085510203813e-06, + "loss": 0.4898, + "step": 8134 + }, + { + "epoch": 0.6606838300982701, + "grad_norm": 3.8833833190727183, + "learning_rate": 1.363822727150791e-06, + "loss": 0.5417, + "step": 8135 + }, + { + "epoch": 0.6607650450743117, + "grad_norm": 7.8141538363601795, + "learning_rate": 1.363236981898654e-06, + "loss": 0.4968, + "step": 8136 + }, + { + "epoch": 0.6608462600503533, + "grad_norm": 3.590897276111095, + "learning_rate": 1.3626513153045024e-06, + "loss": 0.5024, + "step": 8137 + }, + { + "epoch": 0.6609274750263948, + "grad_norm": 3.612475255145642, + "learning_rate": 1.3620657274088606e-06, + "loss": 0.4954, + "step": 8138 + }, + { + "epoch": 0.6610086900024364, + "grad_norm": 14.567400918143866, + "learning_rate": 1.3614802182522469e-06, + "loss": 0.668, + "step": 8139 + }, + { + "epoch": 0.6610899049784781, + "grad_norm": 11.017893005217958, + "learning_rate": 1.3608947878751777e-06, + "loss": 0.4346, + "step": 8140 + }, + { + "epoch": 0.6611711199545196, + "grad_norm": 3.7950075970163617, + "learning_rate": 1.3603094363181596e-06, + "loss": 0.5231, + "step": 8141 + }, + { + "epoch": 0.6612523349305612, + "grad_norm": 7.66686320728164, + "learning_rate": 1.3597241636216965e-06, + "loss": 0.6696, + "step": 8142 + }, + { + "epoch": 0.6613335499066028, + "grad_norm": 5.608044387753557, + "learning_rate": 1.3591389698262875e-06, + "loss": 0.5376, + "step": 8143 + }, + { + "epoch": 0.6614147648826444, + "grad_norm": 4.03444037697512, + "learning_rate": 1.3585538549724242e-06, + "loss": 0.526, + "step": 8144 + }, + { + "epoch": 0.6614959798586859, + "grad_norm": 5.082198135850972, + "learning_rate": 1.3579688191005926e-06, + "loss": 0.5056, + "step": 8145 + }, + { + "epoch": 0.6615771948347275, + "grad_norm": 3.3444467710171346, + "learning_rate": 1.3573838622512743e-06, + "loss": 0.6148, + "step": 8146 + }, + { + "epoch": 0.6616584098107691, + "grad_norm": 4.212117525236555, + "learning_rate": 1.3567989844649448e-06, + "loss": 0.5947, + "step": 8147 + }, + { + "epoch": 0.6617396247868107, + "grad_norm": 5.234823635180376, + "learning_rate": 1.3562141857820765e-06, + "loss": 0.5863, + "step": 8148 + }, + { + "epoch": 0.6618208397628522, + "grad_norm": 9.716406215450048, + "learning_rate": 1.3556294662431325e-06, + "loss": 0.4493, + "step": 8149 + }, + { + "epoch": 0.6619020547388939, + "grad_norm": 5.722283978029896, + "learning_rate": 1.3550448258885734e-06, + "loss": 0.4364, + "step": 8150 + }, + { + "epoch": 0.6619832697149355, + "grad_norm": 4.763814608798664, + "learning_rate": 1.3544602647588528e-06, + "loss": 0.742, + "step": 8151 + }, + { + "epoch": 0.662064484690977, + "grad_norm": 5.80232988441981, + "learning_rate": 1.3538757828944188e-06, + "loss": 0.4423, + "step": 8152 + }, + { + "epoch": 0.6621456996670186, + "grad_norm": 7.5957193433728145, + "learning_rate": 1.353291380335715e-06, + "loss": 0.5588, + "step": 8153 + }, + { + "epoch": 0.6622269146430602, + "grad_norm": 4.336543165787254, + "learning_rate": 1.3527070571231786e-06, + "loss": 0.5969, + "step": 8154 + }, + { + "epoch": 0.6623081296191018, + "grad_norm": 4.034182275364077, + "learning_rate": 1.3521228132972414e-06, + "loss": 0.4803, + "step": 8155 + }, + { + "epoch": 0.6623893445951433, + "grad_norm": 6.6167843827609865, + "learning_rate": 1.3515386488983317e-06, + "loss": 0.4521, + "step": 8156 + }, + { + "epoch": 0.662470559571185, + "grad_norm": 7.161780111206926, + "learning_rate": 1.3509545639668691e-06, + "loss": 0.3106, + "step": 8157 + }, + { + "epoch": 0.6625517745472265, + "grad_norm": 5.4096275250827945, + "learning_rate": 1.3503705585432687e-06, + "loss": 0.4119, + "step": 8158 + }, + { + "epoch": 0.6626329895232681, + "grad_norm": 7.137614931386881, + "learning_rate": 1.349786632667942e-06, + "loss": 0.6203, + "step": 8159 + }, + { + "epoch": 0.6627142044993096, + "grad_norm": 4.038451931034832, + "learning_rate": 1.3492027863812924e-06, + "loss": 0.749, + "step": 8160 + }, + { + "epoch": 0.6627954194753513, + "grad_norm": 4.101256157779943, + "learning_rate": 1.3486190197237189e-06, + "loss": 0.5766, + "step": 8161 + }, + { + "epoch": 0.6628766344513929, + "grad_norm": 3.827717281308639, + "learning_rate": 1.348035332735617e-06, + "loss": 0.3621, + "step": 8162 + }, + { + "epoch": 0.6629578494274344, + "grad_norm": 3.2797754851269856, + "learning_rate": 1.3474517254573731e-06, + "loss": 0.6257, + "step": 8163 + }, + { + "epoch": 0.663039064403476, + "grad_norm": 7.6698914171017245, + "learning_rate": 1.3468681979293702e-06, + "loss": 0.413, + "step": 8164 + }, + { + "epoch": 0.6631202793795176, + "grad_norm": 6.602624984901768, + "learning_rate": 1.3462847501919843e-06, + "loss": 0.3934, + "step": 8165 + }, + { + "epoch": 0.6632014943555592, + "grad_norm": 7.1560351882255215, + "learning_rate": 1.3457013822855886e-06, + "loss": 0.7646, + "step": 8166 + }, + { + "epoch": 0.6632827093316007, + "grad_norm": 6.274238416078604, + "learning_rate": 1.345118094250547e-06, + "loss": 0.408, + "step": 8167 + }, + { + "epoch": 0.6633639243076423, + "grad_norm": 4.400441017779448, + "learning_rate": 1.3445348861272217e-06, + "loss": 0.4495, + "step": 8168 + }, + { + "epoch": 0.6634451392836839, + "grad_norm": 4.941239230227576, + "learning_rate": 1.3439517579559675e-06, + "loss": 0.3428, + "step": 8169 + }, + { + "epoch": 0.6635263542597255, + "grad_norm": 6.964343826754793, + "learning_rate": 1.3433687097771337e-06, + "loss": 0.4209, + "step": 8170 + }, + { + "epoch": 0.663607569235767, + "grad_norm": 5.512709601692857, + "learning_rate": 1.3427857416310626e-06, + "loss": 0.5785, + "step": 8171 + }, + { + "epoch": 0.6636887842118087, + "grad_norm": 4.294567753627923, + "learning_rate": 1.3422028535580947e-06, + "loss": 0.4546, + "step": 8172 + }, + { + "epoch": 0.6637699991878503, + "grad_norm": 4.0701024523531135, + "learning_rate": 1.3416200455985607e-06, + "loss": 0.8187, + "step": 8173 + }, + { + "epoch": 0.6638512141638918, + "grad_norm": 7.655445454850961, + "learning_rate": 1.3410373177927893e-06, + "loss": 0.4059, + "step": 8174 + }, + { + "epoch": 0.6639324291399334, + "grad_norm": 2.7798324043491838, + "learning_rate": 1.3404546701811022e-06, + "loss": 0.5913, + "step": 8175 + }, + { + "epoch": 0.664013644115975, + "grad_norm": 3.646058875181944, + "learning_rate": 1.3398721028038155e-06, + "loss": 0.5026, + "step": 8176 + }, + { + "epoch": 0.6640948590920166, + "grad_norm": 4.496311517930436, + "learning_rate": 1.3392896157012386e-06, + "loss": 0.6419, + "step": 8177 + }, + { + "epoch": 0.6641760740680581, + "grad_norm": 17.84153072314891, + "learning_rate": 1.3387072089136776e-06, + "loss": 0.5958, + "step": 8178 + }, + { + "epoch": 0.6642572890440998, + "grad_norm": 3.8720341467833834, + "learning_rate": 1.3381248824814326e-06, + "loss": 0.5721, + "step": 8179 + }, + { + "epoch": 0.6643385040201413, + "grad_norm": 4.7847332539123935, + "learning_rate": 1.337542636444795e-06, + "loss": 0.44, + "step": 8180 + }, + { + "epoch": 0.6644197189961829, + "grad_norm": 6.021015620083501, + "learning_rate": 1.3369604708440548e-06, + "loss": 0.3977, + "step": 8181 + }, + { + "epoch": 0.6645009339722244, + "grad_norm": 11.600335989741517, + "learning_rate": 1.3363783857194957e-06, + "loss": 0.469, + "step": 8182 + }, + { + "epoch": 0.6645821489482661, + "grad_norm": 3.859966985311204, + "learning_rate": 1.3357963811113938e-06, + "loss": 0.4263, + "step": 8183 + }, + { + "epoch": 0.6646633639243077, + "grad_norm": 18.782716428504518, + "learning_rate": 1.3352144570600203e-06, + "loss": 0.6374, + "step": 8184 + }, + { + "epoch": 0.6647445789003492, + "grad_norm": 3.387091226384912, + "learning_rate": 1.3346326136056425e-06, + "loss": 0.4308, + "step": 8185 + }, + { + "epoch": 0.6648257938763908, + "grad_norm": 4.252596910764206, + "learning_rate": 1.3340508507885194e-06, + "loss": 0.5107, + "step": 8186 + }, + { + "epoch": 0.6649070088524324, + "grad_norm": 7.6038955526056995, + "learning_rate": 1.3334691686489064e-06, + "loss": 0.4946, + "step": 8187 + }, + { + "epoch": 0.664988223828474, + "grad_norm": 4.976169303513984, + "learning_rate": 1.3328875672270547e-06, + "loss": 0.4381, + "step": 8188 + }, + { + "epoch": 0.6650694388045155, + "grad_norm": 4.667554378601018, + "learning_rate": 1.332306046563206e-06, + "loss": 0.6222, + "step": 8189 + }, + { + "epoch": 0.6651506537805572, + "grad_norm": 4.379049488125857, + "learning_rate": 1.3317246066975981e-06, + "loss": 0.6358, + "step": 8190 + }, + { + "epoch": 0.6652318687565987, + "grad_norm": 3.794618143261633, + "learning_rate": 1.3311432476704655e-06, + "loss": 0.4243, + "step": 8191 + }, + { + "epoch": 0.6653130837326403, + "grad_norm": 5.072184639847683, + "learning_rate": 1.3305619695220332e-06, + "loss": 0.5623, + "step": 8192 + }, + { + "epoch": 0.6653942987086818, + "grad_norm": 7.59051423468418, + "learning_rate": 1.3299807722925231e-06, + "loss": 0.4993, + "step": 8193 + }, + { + "epoch": 0.6654755136847235, + "grad_norm": 8.819021574798183, + "learning_rate": 1.3293996560221526e-06, + "loss": 0.4178, + "step": 8194 + }, + { + "epoch": 0.6655567286607651, + "grad_norm": 6.287522877063349, + "learning_rate": 1.3288186207511303e-06, + "loss": 0.3488, + "step": 8195 + }, + { + "epoch": 0.6656379436368066, + "grad_norm": 5.563934142204648, + "learning_rate": 1.3282376665196603e-06, + "loss": 0.4812, + "step": 8196 + }, + { + "epoch": 0.6657191586128482, + "grad_norm": 3.033804351653404, + "learning_rate": 1.327656793367943e-06, + "loss": 0.4407, + "step": 8197 + }, + { + "epoch": 0.6658003735888898, + "grad_norm": 3.998217455418668, + "learning_rate": 1.3270760013361713e-06, + "loss": 0.4281, + "step": 8198 + }, + { + "epoch": 0.6658815885649314, + "grad_norm": 4.64706121281191, + "learning_rate": 1.3264952904645317e-06, + "loss": 0.6847, + "step": 8199 + }, + { + "epoch": 0.6659628035409729, + "grad_norm": 6.436231964294674, + "learning_rate": 1.325914660793207e-06, + "loss": 0.4931, + "step": 8200 + }, + { + "epoch": 0.6660440185170146, + "grad_norm": 3.707584274007003, + "learning_rate": 1.3253341123623756e-06, + "loss": 0.5129, + "step": 8201 + }, + { + "epoch": 0.6661252334930561, + "grad_norm": 6.262622978288038, + "learning_rate": 1.3247536452122064e-06, + "loss": 0.5491, + "step": 8202 + }, + { + "epoch": 0.6662064484690977, + "grad_norm": 4.982037820278029, + "learning_rate": 1.3241732593828644e-06, + "loss": 0.4305, + "step": 8203 + }, + { + "epoch": 0.6662876634451392, + "grad_norm": 6.381634581129901, + "learning_rate": 1.3235929549145105e-06, + "loss": 0.4896, + "step": 8204 + }, + { + "epoch": 0.6663688784211809, + "grad_norm": 7.969785119936784, + "learning_rate": 1.3230127318472972e-06, + "loss": 0.5373, + "step": 8205 + }, + { + "epoch": 0.6664500933972225, + "grad_norm": 6.0137267480338945, + "learning_rate": 1.3224325902213736e-06, + "loss": 0.476, + "step": 8206 + }, + { + "epoch": 0.666531308373264, + "grad_norm": 6.68986461696678, + "learning_rate": 1.3218525300768837e-06, + "loss": 0.4782, + "step": 8207 + }, + { + "epoch": 0.6666125233493057, + "grad_norm": 6.316034448019237, + "learning_rate": 1.3212725514539635e-06, + "loss": 0.4462, + "step": 8208 + }, + { + "epoch": 0.6666937383253472, + "grad_norm": 5.228047836489257, + "learning_rate": 1.3206926543927435e-06, + "loss": 0.3812, + "step": 8209 + }, + { + "epoch": 0.6667749533013888, + "grad_norm": 6.369850283074102, + "learning_rate": 1.320112838933351e-06, + "loss": 0.469, + "step": 8210 + }, + { + "epoch": 0.6668561682774303, + "grad_norm": 4.859894104230745, + "learning_rate": 1.3195331051159058e-06, + "loss": 0.4444, + "step": 8211 + }, + { + "epoch": 0.666937383253472, + "grad_norm": 4.0624444842962735, + "learning_rate": 1.3189534529805212e-06, + "loss": 0.4554, + "step": 8212 + }, + { + "epoch": 0.6670185982295135, + "grad_norm": 3.4817004946398287, + "learning_rate": 1.318373882567307e-06, + "loss": 0.5201, + "step": 8213 + }, + { + "epoch": 0.6670998132055551, + "grad_norm": 4.704982903669908, + "learning_rate": 1.3177943939163677e-06, + "loss": 0.4444, + "step": 8214 + }, + { + "epoch": 0.6671810281815966, + "grad_norm": 5.794219397597491, + "learning_rate": 1.3172149870677985e-06, + "loss": 0.4768, + "step": 8215 + }, + { + "epoch": 0.6672622431576383, + "grad_norm": 6.208905832820019, + "learning_rate": 1.3166356620616932e-06, + "loss": 0.5479, + "step": 8216 + }, + { + "epoch": 0.6673434581336799, + "grad_norm": 8.198766362247154, + "learning_rate": 1.3160564189381376e-06, + "loss": 0.4532, + "step": 8217 + }, + { + "epoch": 0.6674246731097214, + "grad_norm": 4.331763449279288, + "learning_rate": 1.3154772577372104e-06, + "loss": 0.4955, + "step": 8218 + }, + { + "epoch": 0.667505888085763, + "grad_norm": 5.10837125089982, + "learning_rate": 1.3148981784989884e-06, + "loss": 0.4523, + "step": 8219 + }, + { + "epoch": 0.6675871030618046, + "grad_norm": 3.5163841336361807, + "learning_rate": 1.3143191812635408e-06, + "loss": 0.6125, + "step": 8220 + }, + { + "epoch": 0.6676683180378462, + "grad_norm": 11.844210364989475, + "learning_rate": 1.3137402660709314e-06, + "loss": 0.4425, + "step": 8221 + }, + { + "epoch": 0.6677495330138877, + "grad_norm": 7.996601522252169, + "learning_rate": 1.3131614329612158e-06, + "loss": 0.6123, + "step": 8222 + }, + { + "epoch": 0.6678307479899294, + "grad_norm": 6.114487257718311, + "learning_rate": 1.3125826819744493e-06, + "loss": 0.5059, + "step": 8223 + }, + { + "epoch": 0.6679119629659709, + "grad_norm": 4.43142805188028, + "learning_rate": 1.3120040131506767e-06, + "loss": 0.4303, + "step": 8224 + }, + { + "epoch": 0.6679931779420125, + "grad_norm": 5.687962653675712, + "learning_rate": 1.3114254265299379e-06, + "loss": 0.4586, + "step": 8225 + }, + { + "epoch": 0.668074392918054, + "grad_norm": 4.1918238610968475, + "learning_rate": 1.310846922152269e-06, + "loss": 0.5445, + "step": 8226 + }, + { + "epoch": 0.6681556078940957, + "grad_norm": 5.179655298264121, + "learning_rate": 1.310268500057701e-06, + "loss": 0.5172, + "step": 8227 + }, + { + "epoch": 0.6682368228701373, + "grad_norm": 7.779574674479231, + "learning_rate": 1.309690160286255e-06, + "loss": 0.5864, + "step": 8228 + }, + { + "epoch": 0.6683180378461788, + "grad_norm": 6.122052156829049, + "learning_rate": 1.3091119028779514e-06, + "loss": 0.548, + "step": 8229 + }, + { + "epoch": 0.6683992528222205, + "grad_norm": 4.789126863334509, + "learning_rate": 1.308533727872801e-06, + "loss": 0.4313, + "step": 8230 + }, + { + "epoch": 0.668480467798262, + "grad_norm": 5.766521551851652, + "learning_rate": 1.3079556353108106e-06, + "loss": 0.532, + "step": 8231 + }, + { + "epoch": 0.6685616827743036, + "grad_norm": 6.05863385781591, + "learning_rate": 1.307377625231981e-06, + "loss": 0.443, + "step": 8232 + }, + { + "epoch": 0.6686428977503451, + "grad_norm": 4.181674552131915, + "learning_rate": 1.3067996976763086e-06, + "loss": 0.5801, + "step": 8233 + }, + { + "epoch": 0.6687241127263868, + "grad_norm": 3.206551391730835, + "learning_rate": 1.3062218526837828e-06, + "loss": 0.5704, + "step": 8234 + }, + { + "epoch": 0.6688053277024283, + "grad_norm": 10.62932913375947, + "learning_rate": 1.3056440902943856e-06, + "loss": 0.4123, + "step": 8235 + }, + { + "epoch": 0.6688865426784699, + "grad_norm": 4.544980879348178, + "learning_rate": 1.305066410548097e-06, + "loss": 0.5647, + "step": 8236 + }, + { + "epoch": 0.6689677576545114, + "grad_norm": 5.011004929662773, + "learning_rate": 1.304488813484889e-06, + "loss": 0.5001, + "step": 8237 + }, + { + "epoch": 0.6690489726305531, + "grad_norm": 5.610021787019349, + "learning_rate": 1.303911299144727e-06, + "loss": 0.4763, + "step": 8238 + }, + { + "epoch": 0.6691301876065947, + "grad_norm": 87.36388987621034, + "learning_rate": 1.3033338675675726e-06, + "loss": 0.4336, + "step": 8239 + }, + { + "epoch": 0.6692114025826362, + "grad_norm": 3.9673278123195566, + "learning_rate": 1.3027565187933828e-06, + "loss": 0.4406, + "step": 8240 + }, + { + "epoch": 0.6692926175586779, + "grad_norm": 4.0150982480618165, + "learning_rate": 1.3021792528621041e-06, + "loss": 0.4505, + "step": 8241 + }, + { + "epoch": 0.6693738325347194, + "grad_norm": 3.8387858911160473, + "learning_rate": 1.3016020698136827e-06, + "loss": 0.4101, + "step": 8242 + }, + { + "epoch": 0.669455047510761, + "grad_norm": 3.2719359158581245, + "learning_rate": 1.3010249696880558e-06, + "loss": 0.5057, + "step": 8243 + }, + { + "epoch": 0.6695362624868025, + "grad_norm": 8.689632620129073, + "learning_rate": 1.3004479525251545e-06, + "loss": 0.3984, + "step": 8244 + }, + { + "epoch": 0.6696174774628442, + "grad_norm": 4.334649919352921, + "learning_rate": 1.2998710183649066e-06, + "loss": 0.48, + "step": 8245 + }, + { + "epoch": 0.6696986924388857, + "grad_norm": 3.604389609859541, + "learning_rate": 1.2992941672472332e-06, + "loss": 0.4935, + "step": 8246 + }, + { + "epoch": 0.6697799074149273, + "grad_norm": 4.775702711045651, + "learning_rate": 1.2987173992120478e-06, + "loss": 0.4603, + "step": 8247 + }, + { + "epoch": 0.6698611223909688, + "grad_norm": 5.179644996464377, + "learning_rate": 1.2981407142992618e-06, + "loss": 0.4972, + "step": 8248 + }, + { + "epoch": 0.6699423373670105, + "grad_norm": 4.832968999586265, + "learning_rate": 1.2975641125487777e-06, + "loss": 0.5587, + "step": 8249 + }, + { + "epoch": 0.6700235523430521, + "grad_norm": 4.3612144381710065, + "learning_rate": 1.2969875940004923e-06, + "loss": 0.335, + "step": 8250 + }, + { + "epoch": 0.6701047673190936, + "grad_norm": 6.055517461311078, + "learning_rate": 1.2964111586942996e-06, + "loss": 0.6306, + "step": 8251 + }, + { + "epoch": 0.6701859822951353, + "grad_norm": 3.996269903879433, + "learning_rate": 1.2958348066700833e-06, + "loss": 0.5962, + "step": 8252 + }, + { + "epoch": 0.6702671972711768, + "grad_norm": 6.099178781598084, + "learning_rate": 1.2952585379677268e-06, + "loss": 0.453, + "step": 8253 + }, + { + "epoch": 0.6703484122472184, + "grad_norm": 6.116289571948397, + "learning_rate": 1.2946823526271023e-06, + "loss": 0.4299, + "step": 8254 + }, + { + "epoch": 0.6704296272232599, + "grad_norm": 5.092428454455709, + "learning_rate": 1.2941062506880811e-06, + "loss": 0.5551, + "step": 8255 + }, + { + "epoch": 0.6705108421993016, + "grad_norm": 5.682161639107994, + "learning_rate": 1.2935302321905252e-06, + "loss": 0.6384, + "step": 8256 + }, + { + "epoch": 0.6705920571753431, + "grad_norm": 4.3807459106680575, + "learning_rate": 1.292954297174291e-06, + "loss": 0.6603, + "step": 8257 + }, + { + "epoch": 0.6706732721513847, + "grad_norm": 3.7416878890359517, + "learning_rate": 1.2923784456792314e-06, + "loss": 0.4831, + "step": 8258 + }, + { + "epoch": 0.6707544871274262, + "grad_norm": 3.7742295359579874, + "learning_rate": 1.291802677745193e-06, + "loss": 0.4317, + "step": 8259 + }, + { + "epoch": 0.6708357021034679, + "grad_norm": 4.136648173116798, + "learning_rate": 1.2912269934120142e-06, + "loss": 0.5397, + "step": 8260 + }, + { + "epoch": 0.6709169170795095, + "grad_norm": 10.177523748838187, + "learning_rate": 1.2906513927195308e-06, + "loss": 0.4942, + "step": 8261 + }, + { + "epoch": 0.670998132055551, + "grad_norm": 5.041065114037551, + "learning_rate": 1.290075875707571e-06, + "loss": 0.3985, + "step": 8262 + }, + { + "epoch": 0.6710793470315927, + "grad_norm": 6.204016855298799, + "learning_rate": 1.2895004424159557e-06, + "loss": 0.4884, + "step": 8263 + }, + { + "epoch": 0.6711605620076342, + "grad_norm": 6.451950713200213, + "learning_rate": 1.2889250928845038e-06, + "loss": 0.3397, + "step": 8264 + }, + { + "epoch": 0.6712417769836758, + "grad_norm": 5.789167573594882, + "learning_rate": 1.2883498271530265e-06, + "loss": 0.5267, + "step": 8265 + }, + { + "epoch": 0.6713229919597173, + "grad_norm": 3.020900821630963, + "learning_rate": 1.2877746452613277e-06, + "loss": 0.5771, + "step": 8266 + }, + { + "epoch": 0.671404206935759, + "grad_norm": 7.136575144733493, + "learning_rate": 1.2871995472492088e-06, + "loss": 0.4145, + "step": 8267 + }, + { + "epoch": 0.6714854219118005, + "grad_norm": 6.86909884452662, + "learning_rate": 1.2866245331564627e-06, + "loss": 0.4899, + "step": 8268 + }, + { + "epoch": 0.6715666368878421, + "grad_norm": 4.392832664582703, + "learning_rate": 1.2860496030228763e-06, + "loss": 0.7517, + "step": 8269 + }, + { + "epoch": 0.6716478518638836, + "grad_norm": 4.036722831992974, + "learning_rate": 1.2854747568882336e-06, + "loss": 0.4853, + "step": 8270 + }, + { + "epoch": 0.6717290668399253, + "grad_norm": 4.039548813778698, + "learning_rate": 1.2848999947923089e-06, + "loss": 0.3944, + "step": 8271 + }, + { + "epoch": 0.6718102818159669, + "grad_norm": 4.0410979346726625, + "learning_rate": 1.2843253167748745e-06, + "loss": 0.4949, + "step": 8272 + }, + { + "epoch": 0.6718914967920084, + "grad_norm": 4.170268917322463, + "learning_rate": 1.2837507228756934e-06, + "loss": 0.7473, + "step": 8273 + }, + { + "epoch": 0.6719727117680501, + "grad_norm": 5.339351144770748, + "learning_rate": 1.2831762131345265e-06, + "loss": 0.3525, + "step": 8274 + }, + { + "epoch": 0.6720539267440916, + "grad_norm": 4.558326943954262, + "learning_rate": 1.2826017875911257e-06, + "loss": 0.5092, + "step": 8275 + }, + { + "epoch": 0.6721351417201332, + "grad_norm": 6.121469512209196, + "learning_rate": 1.2820274462852373e-06, + "loss": 0.3959, + "step": 8276 + }, + { + "epoch": 0.6722163566961747, + "grad_norm": 6.247553742372245, + "learning_rate": 1.2814531892566034e-06, + "loss": 0.4574, + "step": 8277 + }, + { + "epoch": 0.6722975716722164, + "grad_norm": 3.743110767471212, + "learning_rate": 1.2808790165449609e-06, + "loss": 0.4817, + "step": 8278 + }, + { + "epoch": 0.6723787866482579, + "grad_norm": 3.847254853806929, + "learning_rate": 1.280304928190037e-06, + "loss": 0.4439, + "step": 8279 + }, + { + "epoch": 0.6724600016242995, + "grad_norm": 5.111654340531649, + "learning_rate": 1.2797309242315584e-06, + "loss": 0.5166, + "step": 8280 + }, + { + "epoch": 0.672541216600341, + "grad_norm": 5.402468457889438, + "learning_rate": 1.2791570047092413e-06, + "loss": 0.4379, + "step": 8281 + }, + { + "epoch": 0.6726224315763827, + "grad_norm": 4.64148894732247, + "learning_rate": 1.2785831696627975e-06, + "loss": 0.4985, + "step": 8282 + }, + { + "epoch": 0.6727036465524243, + "grad_norm": 4.977759194515003, + "learning_rate": 1.2780094191319348e-06, + "loss": 0.3378, + "step": 8283 + }, + { + "epoch": 0.6727848615284658, + "grad_norm": 5.750546485168828, + "learning_rate": 1.2774357531563522e-06, + "loss": 0.5315, + "step": 8284 + }, + { + "epoch": 0.6728660765045075, + "grad_norm": 5.462788945858097, + "learning_rate": 1.276862171775745e-06, + "loss": 0.611, + "step": 8285 + }, + { + "epoch": 0.672947291480549, + "grad_norm": 9.040739646165454, + "learning_rate": 1.2762886750298033e-06, + "loss": 0.4435, + "step": 8286 + }, + { + "epoch": 0.6730285064565906, + "grad_norm": 5.419042990213366, + "learning_rate": 1.275715262958209e-06, + "loss": 0.585, + "step": 8287 + }, + { + "epoch": 0.6731097214326321, + "grad_norm": 8.622423592062997, + "learning_rate": 1.275141935600639e-06, + "loss": 0.4452, + "step": 8288 + }, + { + "epoch": 0.6731909364086738, + "grad_norm": 3.39665996833734, + "learning_rate": 1.2745686929967632e-06, + "loss": 0.5745, + "step": 8289 + }, + { + "epoch": 0.6732721513847153, + "grad_norm": 9.640446040935274, + "learning_rate": 1.2739955351862488e-06, + "loss": 0.4325, + "step": 8290 + }, + { + "epoch": 0.6733533663607569, + "grad_norm": 3.375245810027421, + "learning_rate": 1.2734224622087556e-06, + "loss": 0.742, + "step": 8291 + }, + { + "epoch": 0.6734345813367985, + "grad_norm": 6.070864539301688, + "learning_rate": 1.2728494741039354e-06, + "loss": 0.4282, + "step": 8292 + }, + { + "epoch": 0.6735157963128401, + "grad_norm": 3.1473742664954956, + "learning_rate": 1.2722765709114382e-06, + "loss": 0.5101, + "step": 8293 + }, + { + "epoch": 0.6735970112888817, + "grad_norm": 5.797541760307167, + "learning_rate": 1.2717037526709048e-06, + "loss": 0.4927, + "step": 8294 + }, + { + "epoch": 0.6736782262649232, + "grad_norm": 3.9968403294119192, + "learning_rate": 1.2711310194219695e-06, + "loss": 0.5272, + "step": 8295 + }, + { + "epoch": 0.6737594412409649, + "grad_norm": 8.686594608534724, + "learning_rate": 1.2705583712042654e-06, + "loss": 0.3865, + "step": 8296 + }, + { + "epoch": 0.6738406562170064, + "grad_norm": 5.299840930228158, + "learning_rate": 1.2699858080574141e-06, + "loss": 0.6049, + "step": 8297 + }, + { + "epoch": 0.673921871193048, + "grad_norm": 5.383383957246399, + "learning_rate": 1.2694133300210354e-06, + "loss": 0.5769, + "step": 8298 + }, + { + "epoch": 0.6740030861690895, + "grad_norm": 5.48276722761316, + "learning_rate": 1.2688409371347422e-06, + "loss": 0.5609, + "step": 8299 + }, + { + "epoch": 0.6740843011451312, + "grad_norm": 4.890248642899915, + "learning_rate": 1.2682686294381403e-06, + "loss": 0.5895, + "step": 8300 + }, + { + "epoch": 0.6741655161211727, + "grad_norm": 5.286027918182194, + "learning_rate": 1.2676964069708294e-06, + "loss": 0.406, + "step": 8301 + }, + { + "epoch": 0.6742467310972143, + "grad_norm": 4.144062037889405, + "learning_rate": 1.2671242697724061e-06, + "loss": 0.6805, + "step": 8302 + }, + { + "epoch": 0.6743279460732559, + "grad_norm": 3.10497422112379, + "learning_rate": 1.266552217882458e-06, + "loss": 0.4172, + "step": 8303 + }, + { + "epoch": 0.6744091610492975, + "grad_norm": 4.7159885122704495, + "learning_rate": 1.265980251340568e-06, + "loss": 0.3766, + "step": 8304 + }, + { + "epoch": 0.6744903760253391, + "grad_norm": 6.029787369464984, + "learning_rate": 1.265408370186315e-06, + "loss": 0.5068, + "step": 8305 + }, + { + "epoch": 0.6745715910013806, + "grad_norm": 9.477273387771245, + "learning_rate": 1.2648365744592683e-06, + "loss": 0.5335, + "step": 8306 + }, + { + "epoch": 0.6746528059774223, + "grad_norm": 5.274931921378115, + "learning_rate": 1.264264864198994e-06, + "loss": 0.3893, + "step": 8307 + }, + { + "epoch": 0.6747340209534638, + "grad_norm": 6.694925532405975, + "learning_rate": 1.2636932394450502e-06, + "loss": 0.3245, + "step": 8308 + }, + { + "epoch": 0.6748152359295054, + "grad_norm": 3.4329112146537994, + "learning_rate": 1.2631217002369917e-06, + "loss": 0.5724, + "step": 8309 + }, + { + "epoch": 0.674896450905547, + "grad_norm": 5.211887506168933, + "learning_rate": 1.2625502466143646e-06, + "loss": 0.6977, + "step": 8310 + }, + { + "epoch": 0.6749776658815886, + "grad_norm": 3.3614053784547258, + "learning_rate": 1.2619788786167113e-06, + "loss": 0.4674, + "step": 8311 + }, + { + "epoch": 0.6750588808576301, + "grad_norm": 3.329679930838226, + "learning_rate": 1.2614075962835688e-06, + "loss": 0.4857, + "step": 8312 + }, + { + "epoch": 0.6751400958336717, + "grad_norm": 5.737443755358583, + "learning_rate": 1.2608363996544654e-06, + "loss": 0.5106, + "step": 8313 + }, + { + "epoch": 0.6752213108097133, + "grad_norm": 6.125567742013193, + "learning_rate": 1.2602652887689237e-06, + "loss": 0.4443, + "step": 8314 + }, + { + "epoch": 0.6753025257857549, + "grad_norm": 9.237383534994807, + "learning_rate": 1.2596942636664638e-06, + "loss": 0.4483, + "step": 8315 + }, + { + "epoch": 0.6753837407617965, + "grad_norm": 5.106127682199849, + "learning_rate": 1.2591233243865958e-06, + "loss": 0.4114, + "step": 8316 + }, + { + "epoch": 0.675464955737838, + "grad_norm": 6.956467222846337, + "learning_rate": 1.2585524709688268e-06, + "loss": 0.5567, + "step": 8317 + }, + { + "epoch": 0.6755461707138797, + "grad_norm": 6.165264566742239, + "learning_rate": 1.257981703452657e-06, + "loss": 0.4624, + "step": 8318 + }, + { + "epoch": 0.6756273856899212, + "grad_norm": 3.908854775644603, + "learning_rate": 1.2574110218775804e-06, + "loss": 0.479, + "step": 8319 + }, + { + "epoch": 0.6757086006659628, + "grad_norm": 5.299167034989398, + "learning_rate": 1.2568404262830836e-06, + "loss": 0.4723, + "step": 8320 + }, + { + "epoch": 0.6757898156420044, + "grad_norm": 5.113207491776007, + "learning_rate": 1.256269916708651e-06, + "loss": 0.4358, + "step": 8321 + }, + { + "epoch": 0.675871030618046, + "grad_norm": 6.490258586098834, + "learning_rate": 1.2556994931937565e-06, + "loss": 0.4825, + "step": 8322 + }, + { + "epoch": 0.6759522455940875, + "grad_norm": 5.544338315054307, + "learning_rate": 1.2551291557778721e-06, + "loss": 0.4548, + "step": 8323 + }, + { + "epoch": 0.6760334605701291, + "grad_norm": 4.16562191240731, + "learning_rate": 1.2545589045004627e-06, + "loss": 0.5272, + "step": 8324 + }, + { + "epoch": 0.6761146755461707, + "grad_norm": 8.317648547442525, + "learning_rate": 1.2539887394009855e-06, + "loss": 0.5807, + "step": 8325 + }, + { + "epoch": 0.6761958905222123, + "grad_norm": 4.863432965134022, + "learning_rate": 1.2534186605188933e-06, + "loss": 0.4716, + "step": 8326 + }, + { + "epoch": 0.6762771054982539, + "grad_norm": 4.473812223631319, + "learning_rate": 1.2528486678936313e-06, + "loss": 0.3962, + "step": 8327 + }, + { + "epoch": 0.6763583204742954, + "grad_norm": 7.282308721393281, + "learning_rate": 1.2522787615646421e-06, + "loss": 0.4288, + "step": 8328 + }, + { + "epoch": 0.6764395354503371, + "grad_norm": 6.136375474933964, + "learning_rate": 1.251708941571358e-06, + "loss": 0.4208, + "step": 8329 + }, + { + "epoch": 0.6765207504263786, + "grad_norm": 8.250435251721406, + "learning_rate": 1.2511392079532087e-06, + "loss": 0.3731, + "step": 8330 + }, + { + "epoch": 0.6766019654024202, + "grad_norm": 8.8618136486422, + "learning_rate": 1.2505695607496176e-06, + "loss": 0.4431, + "step": 8331 + }, + { + "epoch": 0.6766831803784618, + "grad_norm": 6.809802423773869, + "learning_rate": 1.2500000000000007e-06, + "loss": 0.437, + "step": 8332 + }, + { + "epoch": 0.6767643953545034, + "grad_norm": 4.412176686499365, + "learning_rate": 1.2494305257437669e-06, + "loss": 0.6663, + "step": 8333 + }, + { + "epoch": 0.6768456103305449, + "grad_norm": 8.259279196472452, + "learning_rate": 1.2488611380203234e-06, + "loss": 0.5233, + "step": 8334 + }, + { + "epoch": 0.6769268253065865, + "grad_norm": 5.681125220205126, + "learning_rate": 1.2482918368690666e-06, + "loss": 0.4985, + "step": 8335 + }, + { + "epoch": 0.6770080402826281, + "grad_norm": 8.25956808119689, + "learning_rate": 1.24772262232939e-06, + "loss": 0.5764, + "step": 8336 + }, + { + "epoch": 0.6770892552586697, + "grad_norm": 5.608911686082642, + "learning_rate": 1.2471534944406813e-06, + "loss": 0.5972, + "step": 8337 + }, + { + "epoch": 0.6771704702347113, + "grad_norm": 3.3213708352263054, + "learning_rate": 1.2465844532423201e-06, + "loss": 0.4957, + "step": 8338 + }, + { + "epoch": 0.6772516852107529, + "grad_norm": 7.167255021326034, + "learning_rate": 1.2460154987736806e-06, + "loss": 0.5549, + "step": 8339 + }, + { + "epoch": 0.6773329001867945, + "grad_norm": 4.442760200949274, + "learning_rate": 1.2454466310741326e-06, + "loss": 0.427, + "step": 8340 + }, + { + "epoch": 0.677414115162836, + "grad_norm": 5.768478872426691, + "learning_rate": 1.244877850183038e-06, + "loss": 0.4285, + "step": 8341 + }, + { + "epoch": 0.6774953301388776, + "grad_norm": 4.225339833775357, + "learning_rate": 1.2443091561397527e-06, + "loss": 0.5469, + "step": 8342 + }, + { + "epoch": 0.6775765451149192, + "grad_norm": 6.322105186178602, + "learning_rate": 1.2437405489836282e-06, + "loss": 0.4678, + "step": 8343 + }, + { + "epoch": 0.6776577600909608, + "grad_norm": 12.80035724783234, + "learning_rate": 1.2431720287540097e-06, + "loss": 0.4633, + "step": 8344 + }, + { + "epoch": 0.6777389750670023, + "grad_norm": 6.865099951483512, + "learning_rate": 1.2426035954902356e-06, + "loss": 0.5027, + "step": 8345 + }, + { + "epoch": 0.677820190043044, + "grad_norm": 4.2665462756693975, + "learning_rate": 1.2420352492316368e-06, + "loss": 0.4114, + "step": 8346 + }, + { + "epoch": 0.6779014050190855, + "grad_norm": 4.207313035828698, + "learning_rate": 1.2414669900175423e-06, + "loss": 0.4601, + "step": 8347 + }, + { + "epoch": 0.6779826199951271, + "grad_norm": 5.802065214743892, + "learning_rate": 1.2408988178872699e-06, + "loss": 0.3487, + "step": 8348 + }, + { + "epoch": 0.6780638349711687, + "grad_norm": 5.071217203345165, + "learning_rate": 1.240330732880136e-06, + "loss": 0.4018, + "step": 8349 + }, + { + "epoch": 0.6781450499472103, + "grad_norm": 4.658634987558885, + "learning_rate": 1.2397627350354494e-06, + "loss": 0.5006, + "step": 8350 + }, + { + "epoch": 0.6782262649232519, + "grad_norm": 5.989526021591514, + "learning_rate": 1.2391948243925119e-06, + "loss": 0.3748, + "step": 8351 + }, + { + "epoch": 0.6783074798992934, + "grad_norm": 4.543912419314957, + "learning_rate": 1.238627000990619e-06, + "loss": 0.3414, + "step": 8352 + }, + { + "epoch": 0.678388694875335, + "grad_norm": 4.935095050500174, + "learning_rate": 1.2380592648690629e-06, + "loss": 0.4755, + "step": 8353 + }, + { + "epoch": 0.6784699098513766, + "grad_norm": 3.9622208310484024, + "learning_rate": 1.2374916160671268e-06, + "loss": 0.4053, + "step": 8354 + }, + { + "epoch": 0.6785511248274182, + "grad_norm": 4.65626323141216, + "learning_rate": 1.2369240546240881e-06, + "loss": 0.5236, + "step": 8355 + }, + { + "epoch": 0.6786323398034597, + "grad_norm": 5.707169042711556, + "learning_rate": 1.2363565805792202e-06, + "loss": 0.4848, + "step": 8356 + }, + { + "epoch": 0.6787135547795013, + "grad_norm": 5.695511088930679, + "learning_rate": 1.2357891939717903e-06, + "loss": 0.4024, + "step": 8357 + }, + { + "epoch": 0.6787947697555429, + "grad_norm": 5.28913545779317, + "learning_rate": 1.2352218948410563e-06, + "loss": 0.4305, + "step": 8358 + }, + { + "epoch": 0.6788759847315845, + "grad_norm": 3.680524773252283, + "learning_rate": 1.2346546832262743e-06, + "loss": 0.5757, + "step": 8359 + }, + { + "epoch": 0.6789571997076261, + "grad_norm": 4.937519409498853, + "learning_rate": 1.2340875591666917e-06, + "loss": 0.5091, + "step": 8360 + }, + { + "epoch": 0.6790384146836677, + "grad_norm": 6.252444692643531, + "learning_rate": 1.2335205227015494e-06, + "loss": 0.3707, + "step": 8361 + }, + { + "epoch": 0.6791196296597093, + "grad_norm": 6.749516651304727, + "learning_rate": 1.2329535738700838e-06, + "loss": 0.4521, + "step": 8362 + }, + { + "epoch": 0.6792008446357508, + "grad_norm": 6.554192643461585, + "learning_rate": 1.232386712711526e-06, + "loss": 0.4651, + "step": 8363 + }, + { + "epoch": 0.6792820596117924, + "grad_norm": 6.372502999598231, + "learning_rate": 1.2318199392650993e-06, + "loss": 0.5727, + "step": 8364 + }, + { + "epoch": 0.679363274587834, + "grad_norm": 7.402271446804732, + "learning_rate": 1.23125325357002e-06, + "loss": 0.3813, + "step": 8365 + }, + { + "epoch": 0.6794444895638756, + "grad_norm": 4.56010296625159, + "learning_rate": 1.2306866556655016e-06, + "loss": 0.3856, + "step": 8366 + }, + { + "epoch": 0.6795257045399171, + "grad_norm": 5.070138865153647, + "learning_rate": 1.2301201455907492e-06, + "loss": 0.3799, + "step": 8367 + }, + { + "epoch": 0.6796069195159588, + "grad_norm": 7.907043621920382, + "learning_rate": 1.2295537233849608e-06, + "loss": 0.4336, + "step": 8368 + }, + { + "epoch": 0.6796881344920003, + "grad_norm": 4.083402933054175, + "learning_rate": 1.2289873890873311e-06, + "loss": 0.4805, + "step": 8369 + }, + { + "epoch": 0.6797693494680419, + "grad_norm": 3.3225519411749134, + "learning_rate": 1.2284211427370483e-06, + "loss": 0.5801, + "step": 8370 + }, + { + "epoch": 0.6798505644440835, + "grad_norm": 5.893743990272938, + "learning_rate": 1.2278549843732915e-06, + "loss": 0.4576, + "step": 8371 + }, + { + "epoch": 0.6799317794201251, + "grad_norm": 4.351833235603589, + "learning_rate": 1.2272889140352382e-06, + "loss": 0.5446, + "step": 8372 + }, + { + "epoch": 0.6800129943961667, + "grad_norm": 6.1661761775892945, + "learning_rate": 1.2267229317620564e-06, + "loss": 0.4169, + "step": 8373 + }, + { + "epoch": 0.6800942093722082, + "grad_norm": 3.8116371519348404, + "learning_rate": 1.2261570375929077e-06, + "loss": 0.4341, + "step": 8374 + }, + { + "epoch": 0.6801754243482498, + "grad_norm": 8.305006268325108, + "learning_rate": 1.2255912315669507e-06, + "loss": 0.5365, + "step": 8375 + }, + { + "epoch": 0.6802566393242914, + "grad_norm": 3.87785735314886, + "learning_rate": 1.2250255137233363e-06, + "loss": 0.3122, + "step": 8376 + }, + { + "epoch": 0.680337854300333, + "grad_norm": 5.364139835040124, + "learning_rate": 1.224459884101209e-06, + "loss": 0.4254, + "step": 8377 + }, + { + "epoch": 0.6804190692763745, + "grad_norm": 5.505941463868022, + "learning_rate": 1.2238943427397059e-06, + "loss": 0.5725, + "step": 8378 + }, + { + "epoch": 0.6805002842524162, + "grad_norm": 3.2670918728153495, + "learning_rate": 1.2233288896779617e-06, + "loss": 0.4677, + "step": 8379 + }, + { + "epoch": 0.6805814992284577, + "grad_norm": 5.650997362758408, + "learning_rate": 1.2227635249551014e-06, + "loss": 0.4548, + "step": 8380 + }, + { + "epoch": 0.6806627142044993, + "grad_norm": 4.85029252716426, + "learning_rate": 1.2221982486102446e-06, + "loss": 0.533, + "step": 8381 + }, + { + "epoch": 0.6807439291805409, + "grad_norm": 6.520961416314167, + "learning_rate": 1.2216330606825063e-06, + "loss": 0.5872, + "step": 8382 + }, + { + "epoch": 0.6808251441565825, + "grad_norm": 4.358523321449324, + "learning_rate": 1.2210679612109957e-06, + "loss": 0.463, + "step": 8383 + }, + { + "epoch": 0.6809063591326241, + "grad_norm": 3.3319401584526256, + "learning_rate": 1.2205029502348123e-06, + "loss": 0.4915, + "step": 8384 + }, + { + "epoch": 0.6809875741086656, + "grad_norm": 5.916515103224009, + "learning_rate": 1.2199380277930542e-06, + "loss": 0.3504, + "step": 8385 + }, + { + "epoch": 0.6810687890847072, + "grad_norm": 3.720584300258901, + "learning_rate": 1.2193731939248098e-06, + "loss": 0.5314, + "step": 8386 + }, + { + "epoch": 0.6811500040607488, + "grad_norm": 6.706752035654295, + "learning_rate": 1.218808448669162e-06, + "loss": 0.4826, + "step": 8387 + }, + { + "epoch": 0.6812312190367904, + "grad_norm": 7.136044672639504, + "learning_rate": 1.218243792065189e-06, + "loss": 0.3701, + "step": 8388 + }, + { + "epoch": 0.6813124340128319, + "grad_norm": 15.62272184000503, + "learning_rate": 1.2176792241519628e-06, + "loss": 0.511, + "step": 8389 + }, + { + "epoch": 0.6813936489888736, + "grad_norm": 5.7775925878468275, + "learning_rate": 1.2171147449685469e-06, + "loss": 0.4626, + "step": 8390 + }, + { + "epoch": 0.6814748639649151, + "grad_norm": 8.76813533021635, + "learning_rate": 1.2165503545540017e-06, + "loss": 0.3721, + "step": 8391 + }, + { + "epoch": 0.6815560789409567, + "grad_norm": 2.7821535648469307, + "learning_rate": 1.2159860529473796e-06, + "loss": 0.5439, + "step": 8392 + }, + { + "epoch": 0.6816372939169983, + "grad_norm": 8.296103570527087, + "learning_rate": 1.2154218401877263e-06, + "loss": 0.4945, + "step": 8393 + }, + { + "epoch": 0.6817185088930399, + "grad_norm": 3.680639071610561, + "learning_rate": 1.214857716314083e-06, + "loss": 0.5059, + "step": 8394 + }, + { + "epoch": 0.6817997238690815, + "grad_norm": 3.5629979371942997, + "learning_rate": 1.2142936813654848e-06, + "loss": 0.4787, + "step": 8395 + }, + { + "epoch": 0.681880938845123, + "grad_norm": 13.02646445886847, + "learning_rate": 1.21372973538096e-06, + "loss": 0.5225, + "step": 8396 + }, + { + "epoch": 0.6819621538211647, + "grad_norm": 6.585522603186978, + "learning_rate": 1.2131658783995285e-06, + "loss": 0.3445, + "step": 8397 + }, + { + "epoch": 0.6820433687972062, + "grad_norm": 4.999188751543709, + "learning_rate": 1.212602110460209e-06, + "loss": 0.6751, + "step": 8398 + }, + { + "epoch": 0.6821245837732478, + "grad_norm": 5.137382890011477, + "learning_rate": 1.2120384316020098e-06, + "loss": 0.464, + "step": 8399 + }, + { + "epoch": 0.6822057987492893, + "grad_norm": 3.57144580039995, + "learning_rate": 1.2114748418639339e-06, + "loss": 0.3924, + "step": 8400 + }, + { + "epoch": 0.682287013725331, + "grad_norm": 6.448424519995378, + "learning_rate": 1.2109113412849792e-06, + "loss": 0.5204, + "step": 8401 + }, + { + "epoch": 0.6823682287013725, + "grad_norm": 4.80310339792193, + "learning_rate": 1.2103479299041388e-06, + "loss": 0.5435, + "step": 8402 + }, + { + "epoch": 0.6824494436774141, + "grad_norm": 11.6534555123514, + "learning_rate": 1.209784607760395e-06, + "loss": 0.514, + "step": 8403 + }, + { + "epoch": 0.6825306586534557, + "grad_norm": 14.006283811833011, + "learning_rate": 1.209221374892729e-06, + "loss": 0.5202, + "step": 8404 + }, + { + "epoch": 0.6826118736294973, + "grad_norm": 3.410546012906801, + "learning_rate": 1.2086582313401125e-06, + "loss": 0.6359, + "step": 8405 + }, + { + "epoch": 0.6826930886055389, + "grad_norm": 4.650396850410136, + "learning_rate": 1.208095177141511e-06, + "loss": 0.6307, + "step": 8406 + }, + { + "epoch": 0.6827743035815804, + "grad_norm": 4.067175768365543, + "learning_rate": 1.2075322123358857e-06, + "loss": 0.5884, + "step": 8407 + }, + { + "epoch": 0.682855518557622, + "grad_norm": 7.834411232708035, + "learning_rate": 1.2069693369621924e-06, + "loss": 0.4241, + "step": 8408 + }, + { + "epoch": 0.6829367335336636, + "grad_norm": 5.025485826761079, + "learning_rate": 1.2064065510593765e-06, + "loss": 0.4524, + "step": 8409 + }, + { + "epoch": 0.6830179485097052, + "grad_norm": 7.200545827802387, + "learning_rate": 1.205843854666382e-06, + "loss": 0.6125, + "step": 8410 + }, + { + "epoch": 0.6830991634857467, + "grad_norm": 5.911187736116562, + "learning_rate": 1.2052812478221437e-06, + "loss": 0.6645, + "step": 8411 + }, + { + "epoch": 0.6831803784617884, + "grad_norm": 7.850988389705168, + "learning_rate": 1.2047187305655898e-06, + "loss": 0.4591, + "step": 8412 + }, + { + "epoch": 0.6832615934378299, + "grad_norm": 4.511296495523972, + "learning_rate": 1.2041563029356454e-06, + "loss": 0.6595, + "step": 8413 + }, + { + "epoch": 0.6833428084138715, + "grad_norm": 5.3954091188162145, + "learning_rate": 1.203593964971226e-06, + "loss": 0.5137, + "step": 8414 + }, + { + "epoch": 0.6834240233899131, + "grad_norm": 3.303858814227981, + "learning_rate": 1.2030317167112438e-06, + "loss": 0.5769, + "step": 8415 + }, + { + "epoch": 0.6835052383659547, + "grad_norm": 3.763109647115813, + "learning_rate": 1.2024695581946016e-06, + "loss": 0.4826, + "step": 8416 + }, + { + "epoch": 0.6835864533419963, + "grad_norm": 17.92152213202901, + "learning_rate": 1.2019074894602005e-06, + "loss": 0.3603, + "step": 8417 + }, + { + "epoch": 0.6836676683180378, + "grad_norm": 6.268708930822519, + "learning_rate": 1.2013455105469304e-06, + "loss": 0.4264, + "step": 8418 + }, + { + "epoch": 0.6837488832940795, + "grad_norm": 4.52113050826669, + "learning_rate": 1.2007836214936773e-06, + "loss": 0.5712, + "step": 8419 + }, + { + "epoch": 0.683830098270121, + "grad_norm": 5.948974609805699, + "learning_rate": 1.2002218223393213e-06, + "loss": 0.5343, + "step": 8420 + }, + { + "epoch": 0.6839113132461626, + "grad_norm": 5.13305028563167, + "learning_rate": 1.1996601131227376e-06, + "loss": 0.6497, + "step": 8421 + }, + { + "epoch": 0.6839925282222041, + "grad_norm": 6.071521774254982, + "learning_rate": 1.1990984938827907e-06, + "loss": 0.6081, + "step": 8422 + }, + { + "epoch": 0.6840737431982458, + "grad_norm": 4.917477348975693, + "learning_rate": 1.1985369646583442e-06, + "loss": 0.5464, + "step": 8423 + }, + { + "epoch": 0.6841549581742873, + "grad_norm": 4.304996098971284, + "learning_rate": 1.1979755254882519e-06, + "loss": 0.4677, + "step": 8424 + }, + { + "epoch": 0.6842361731503289, + "grad_norm": 7.848305808110954, + "learning_rate": 1.1974141764113617e-06, + "loss": 0.3471, + "step": 8425 + }, + { + "epoch": 0.6843173881263706, + "grad_norm": 5.645856868747358, + "learning_rate": 1.1968529174665173e-06, + "loss": 0.4901, + "step": 8426 + }, + { + "epoch": 0.6843986031024121, + "grad_norm": 5.273055048309659, + "learning_rate": 1.1962917486925532e-06, + "loss": 0.5054, + "step": 8427 + }, + { + "epoch": 0.6844798180784537, + "grad_norm": 4.090778363630613, + "learning_rate": 1.1957306701283002e-06, + "loss": 0.3776, + "step": 8428 + }, + { + "epoch": 0.6845610330544952, + "grad_norm": 6.031487669992356, + "learning_rate": 1.1951696818125835e-06, + "loss": 0.5705, + "step": 8429 + }, + { + "epoch": 0.6846422480305369, + "grad_norm": 5.241073571056581, + "learning_rate": 1.1946087837842188e-06, + "loss": 0.3882, + "step": 8430 + }, + { + "epoch": 0.6847234630065784, + "grad_norm": 6.683258623229494, + "learning_rate": 1.1940479760820177e-06, + "loss": 0.486, + "step": 8431 + }, + { + "epoch": 0.68480467798262, + "grad_norm": 5.092498342825334, + "learning_rate": 1.1934872587447838e-06, + "loss": 0.4291, + "step": 8432 + }, + { + "epoch": 0.6848858929586615, + "grad_norm": 2.9158897378436746, + "learning_rate": 1.1929266318113172e-06, + "loss": 0.5162, + "step": 8433 + }, + { + "epoch": 0.6849671079347032, + "grad_norm": 6.199252123306311, + "learning_rate": 1.192366095320411e-06, + "loss": 0.4291, + "step": 8434 + }, + { + "epoch": 0.6850483229107447, + "grad_norm": 6.833595326483498, + "learning_rate": 1.1918056493108493e-06, + "loss": 0.398, + "step": 8435 + }, + { + "epoch": 0.6851295378867863, + "grad_norm": 4.488586364488279, + "learning_rate": 1.1912452938214142e-06, + "loss": 0.5889, + "step": 8436 + }, + { + "epoch": 0.685210752862828, + "grad_norm": 7.767681900407471, + "learning_rate": 1.1906850288908783e-06, + "loss": 0.5559, + "step": 8437 + }, + { + "epoch": 0.6852919678388695, + "grad_norm": 4.340043418686966, + "learning_rate": 1.1901248545580082e-06, + "loss": 0.4359, + "step": 8438 + }, + { + "epoch": 0.6853731828149111, + "grad_norm": 4.93465575158808, + "learning_rate": 1.1895647708615665e-06, + "loss": 0.4433, + "step": 8439 + }, + { + "epoch": 0.6854543977909526, + "grad_norm": 4.964788137277078, + "learning_rate": 1.1890047778403063e-06, + "loss": 0.4608, + "step": 8440 + }, + { + "epoch": 0.6855356127669943, + "grad_norm": 4.5014832518222905, + "learning_rate": 1.1884448755329772e-06, + "loss": 0.6192, + "step": 8441 + }, + { + "epoch": 0.6856168277430358, + "grad_norm": 3.814073027261063, + "learning_rate": 1.1878850639783224e-06, + "loss": 0.65, + "step": 8442 + }, + { + "epoch": 0.6856980427190774, + "grad_norm": 4.698487957966528, + "learning_rate": 1.1873253432150769e-06, + "loss": 0.4156, + "step": 8443 + }, + { + "epoch": 0.6857792576951189, + "grad_norm": 2.6979949526342173, + "learning_rate": 1.1867657132819693e-06, + "loss": 0.4989, + "step": 8444 + }, + { + "epoch": 0.6858604726711606, + "grad_norm": 4.663840811404981, + "learning_rate": 1.1862061742177253e-06, + "loss": 0.3499, + "step": 8445 + }, + { + "epoch": 0.6859416876472021, + "grad_norm": 4.603946365527777, + "learning_rate": 1.1856467260610597e-06, + "loss": 0.4283, + "step": 8446 + }, + { + "epoch": 0.6860229026232437, + "grad_norm": 6.004017761597916, + "learning_rate": 1.1850873688506847e-06, + "loss": 0.4741, + "step": 8447 + }, + { + "epoch": 0.6861041175992854, + "grad_norm": 4.91811982386808, + "learning_rate": 1.1845281026253055e-06, + "loss": 0.4488, + "step": 8448 + }, + { + "epoch": 0.6861853325753269, + "grad_norm": 4.124631666496995, + "learning_rate": 1.1839689274236197e-06, + "loss": 0.4576, + "step": 8449 + }, + { + "epoch": 0.6862665475513685, + "grad_norm": 4.967951805934966, + "learning_rate": 1.183409843284319e-06, + "loss": 0.4204, + "step": 8450 + }, + { + "epoch": 0.68634776252741, + "grad_norm": 3.6946567527289083, + "learning_rate": 1.1828508502460884e-06, + "loss": 0.501, + "step": 8451 + }, + { + "epoch": 0.6864289775034517, + "grad_norm": 4.3257591185306214, + "learning_rate": 1.1822919483476089e-06, + "loss": 0.5587, + "step": 8452 + }, + { + "epoch": 0.6865101924794932, + "grad_norm": 13.53355986373059, + "learning_rate": 1.1817331376275518e-06, + "loss": 0.3784, + "step": 8453 + }, + { + "epoch": 0.6865914074555348, + "grad_norm": 4.686471250690919, + "learning_rate": 1.181174418124585e-06, + "loss": 0.529, + "step": 8454 + }, + { + "epoch": 0.6866726224315763, + "grad_norm": 13.019567449891584, + "learning_rate": 1.1806157898773694e-06, + "loss": 0.521, + "step": 8455 + }, + { + "epoch": 0.686753837407618, + "grad_norm": 5.024504582924606, + "learning_rate": 1.1800572529245581e-06, + "loss": 0.4579, + "step": 8456 + }, + { + "epoch": 0.6868350523836595, + "grad_norm": 5.622740484481374, + "learning_rate": 1.1794988073047986e-06, + "loss": 0.4946, + "step": 8457 + }, + { + "epoch": 0.6869162673597011, + "grad_norm": 3.383472516465761, + "learning_rate": 1.1789404530567338e-06, + "loss": 0.454, + "step": 8458 + }, + { + "epoch": 0.6869974823357428, + "grad_norm": 39.06057548040847, + "learning_rate": 1.178382190218997e-06, + "loss": 0.4916, + "step": 8459 + }, + { + "epoch": 0.6870786973117843, + "grad_norm": 5.13942966732467, + "learning_rate": 1.1778240188302181e-06, + "loss": 0.5374, + "step": 8460 + }, + { + "epoch": 0.6871599122878259, + "grad_norm": 5.2491128634756805, + "learning_rate": 1.177265938929021e-06, + "loss": 0.4787, + "step": 8461 + }, + { + "epoch": 0.6872411272638674, + "grad_norm": 3.883090224748362, + "learning_rate": 1.1767079505540198e-06, + "loss": 0.4962, + "step": 8462 + }, + { + "epoch": 0.6873223422399091, + "grad_norm": 4.459207504966209, + "learning_rate": 1.1761500537438246e-06, + "loss": 0.4901, + "step": 8463 + }, + { + "epoch": 0.6874035572159506, + "grad_norm": 5.526598548112002, + "learning_rate": 1.1755922485370397e-06, + "loss": 0.6049, + "step": 8464 + }, + { + "epoch": 0.6874847721919922, + "grad_norm": 3.9962235924628, + "learning_rate": 1.1750345349722611e-06, + "loss": 0.6439, + "step": 8465 + }, + { + "epoch": 0.6875659871680337, + "grad_norm": 5.430777327111154, + "learning_rate": 1.1744769130880814e-06, + "loss": 0.5233, + "step": 8466 + }, + { + "epoch": 0.6876472021440754, + "grad_norm": 5.478225172555752, + "learning_rate": 1.1739193829230833e-06, + "loss": 0.4564, + "step": 8467 + }, + { + "epoch": 0.6877284171201169, + "grad_norm": 14.175270639083902, + "learning_rate": 1.1733619445158465e-06, + "loss": 0.5561, + "step": 8468 + }, + { + "epoch": 0.6878096320961585, + "grad_norm": 4.726662335713709, + "learning_rate": 1.1728045979049421e-06, + "loss": 0.439, + "step": 8469 + }, + { + "epoch": 0.6878908470722002, + "grad_norm": 6.585567352748735, + "learning_rate": 1.1722473431289344e-06, + "loss": 0.4297, + "step": 8470 + }, + { + "epoch": 0.6879720620482417, + "grad_norm": 7.151511500781442, + "learning_rate": 1.1716901802263845e-06, + "loss": 0.4433, + "step": 8471 + }, + { + "epoch": 0.6880532770242833, + "grad_norm": 13.543001844242685, + "learning_rate": 1.171133109235843e-06, + "loss": 0.3943, + "step": 8472 + }, + { + "epoch": 0.6881344920003248, + "grad_norm": 6.360919578390468, + "learning_rate": 1.1705761301958576e-06, + "loss": 0.3842, + "step": 8473 + }, + { + "epoch": 0.6882157069763665, + "grad_norm": 5.289828859594367, + "learning_rate": 1.170019243144969e-06, + "loss": 0.3475, + "step": 8474 + }, + { + "epoch": 0.688296921952408, + "grad_norm": 6.436765945910618, + "learning_rate": 1.16946244812171e-06, + "loss": 0.5738, + "step": 8475 + }, + { + "epoch": 0.6883781369284496, + "grad_norm": 9.265057636256202, + "learning_rate": 1.1689057451646072e-06, + "loss": 0.5015, + "step": 8476 + }, + { + "epoch": 0.6884593519044911, + "grad_norm": 5.62906939860262, + "learning_rate": 1.1683491343121825e-06, + "loss": 0.6312, + "step": 8477 + }, + { + "epoch": 0.6885405668805328, + "grad_norm": 3.608605687615943, + "learning_rate": 1.1677926156029495e-06, + "loss": 0.5705, + "step": 8478 + }, + { + "epoch": 0.6886217818565743, + "grad_norm": 7.326218578037191, + "learning_rate": 1.1672361890754165e-06, + "loss": 0.481, + "step": 8479 + }, + { + "epoch": 0.6887029968326159, + "grad_norm": 3.101458446740605, + "learning_rate": 1.1666798547680871e-06, + "loss": 0.5597, + "step": 8480 + }, + { + "epoch": 0.6887842118086576, + "grad_norm": 4.57887880856938, + "learning_rate": 1.166123612719455e-06, + "loss": 0.6213, + "step": 8481 + }, + { + "epoch": 0.6888654267846991, + "grad_norm": 10.086781507059271, + "learning_rate": 1.1655674629680083e-06, + "loss": 0.5009, + "step": 8482 + }, + { + "epoch": 0.6889466417607407, + "grad_norm": 5.0242271976636514, + "learning_rate": 1.165011405552232e-06, + "loss": 0.7435, + "step": 8483 + }, + { + "epoch": 0.6890278567367822, + "grad_norm": 7.2774357651661274, + "learning_rate": 1.164455440510601e-06, + "loss": 0.4607, + "step": 8484 + }, + { + "epoch": 0.6891090717128239, + "grad_norm": 4.136550631258544, + "learning_rate": 1.1638995678815843e-06, + "loss": 0.5113, + "step": 8485 + }, + { + "epoch": 0.6891902866888654, + "grad_norm": 5.117305452537372, + "learning_rate": 1.1633437877036462e-06, + "loss": 0.4338, + "step": 8486 + }, + { + "epoch": 0.689271501664907, + "grad_norm": 3.714646489723773, + "learning_rate": 1.162788100015245e-06, + "loss": 0.5338, + "step": 8487 + }, + { + "epoch": 0.6893527166409485, + "grad_norm": 3.9854040499315877, + "learning_rate": 1.1622325048548303e-06, + "loss": 0.6151, + "step": 8488 + }, + { + "epoch": 0.6894339316169902, + "grad_norm": 4.8147850700984804, + "learning_rate": 1.1616770022608447e-06, + "loss": 0.5104, + "step": 8489 + }, + { + "epoch": 0.6895151465930317, + "grad_norm": 12.49347709660076, + "learning_rate": 1.161121592271729e-06, + "loss": 0.4588, + "step": 8490 + }, + { + "epoch": 0.6895963615690733, + "grad_norm": 4.382600842190844, + "learning_rate": 1.1605662749259123e-06, + "loss": 0.5458, + "step": 8491 + }, + { + "epoch": 0.689677576545115, + "grad_norm": 3.4100489659901823, + "learning_rate": 1.1600110502618204e-06, + "loss": 0.5746, + "step": 8492 + }, + { + "epoch": 0.6897587915211565, + "grad_norm": 5.056416537418844, + "learning_rate": 1.1594559183178727e-06, + "loss": 0.4043, + "step": 8493 + }, + { + "epoch": 0.6898400064971981, + "grad_norm": 4.3441729287251105, + "learning_rate": 1.158900879132481e-06, + "loss": 0.5067, + "step": 8494 + }, + { + "epoch": 0.6899212214732396, + "grad_norm": 3.6975358584627447, + "learning_rate": 1.1583459327440496e-06, + "loss": 0.6198, + "step": 8495 + }, + { + "epoch": 0.6900024364492813, + "grad_norm": 5.765909988508904, + "learning_rate": 1.1577910791909802e-06, + "loss": 0.557, + "step": 8496 + }, + { + "epoch": 0.6900836514253228, + "grad_norm": 6.346348844345966, + "learning_rate": 1.1572363185116648e-06, + "loss": 0.5423, + "step": 8497 + }, + { + "epoch": 0.6901648664013644, + "grad_norm": 6.761851681144281, + "learning_rate": 1.1566816507444884e-06, + "loss": 0.7113, + "step": 8498 + }, + { + "epoch": 0.690246081377406, + "grad_norm": 4.9827548056260484, + "learning_rate": 1.1561270759278326e-06, + "loss": 0.4832, + "step": 8499 + }, + { + "epoch": 0.6903272963534476, + "grad_norm": 7.290496908601368, + "learning_rate": 1.1555725941000715e-06, + "loss": 0.4773, + "step": 8500 + }, + { + "epoch": 0.6904085113294891, + "grad_norm": 4.93347902991029, + "learning_rate": 1.1550182052995706e-06, + "loss": 0.4249, + "step": 8501 + }, + { + "epoch": 0.6904897263055307, + "grad_norm": 4.857856203684281, + "learning_rate": 1.154463909564693e-06, + "loss": 0.4705, + "step": 8502 + }, + { + "epoch": 0.6905709412815724, + "grad_norm": 16.432653414970492, + "learning_rate": 1.1539097069337913e-06, + "loss": 0.4833, + "step": 8503 + }, + { + "epoch": 0.6906521562576139, + "grad_norm": 4.517576424813998, + "learning_rate": 1.1533555974452128e-06, + "loss": 0.5461, + "step": 8504 + }, + { + "epoch": 0.6907333712336555, + "grad_norm": 6.387090563981134, + "learning_rate": 1.1528015811373004e-06, + "loss": 0.44, + "step": 8505 + }, + { + "epoch": 0.690814586209697, + "grad_norm": 7.168256261694638, + "learning_rate": 1.1522476580483893e-06, + "loss": 0.4357, + "step": 8506 + }, + { + "epoch": 0.6908958011857387, + "grad_norm": 5.267349807599555, + "learning_rate": 1.1516938282168074e-06, + "loss": 0.5764, + "step": 8507 + }, + { + "epoch": 0.6909770161617802, + "grad_norm": 5.446388451950695, + "learning_rate": 1.151140091680876e-06, + "loss": 0.4762, + "step": 8508 + }, + { + "epoch": 0.6910582311378218, + "grad_norm": 7.776144758103636, + "learning_rate": 1.1505864484789122e-06, + "loss": 0.4635, + "step": 8509 + }, + { + "epoch": 0.6911394461138634, + "grad_norm": 4.652490283523999, + "learning_rate": 1.1500328986492246e-06, + "loss": 0.4744, + "step": 8510 + }, + { + "epoch": 0.691220661089905, + "grad_norm": 5.281761321834011, + "learning_rate": 1.149479442230115e-06, + "loss": 0.5802, + "step": 8511 + }, + { + "epoch": 0.6913018760659465, + "grad_norm": 4.121830278672146, + "learning_rate": 1.1489260792598803e-06, + "loss": 0.4812, + "step": 8512 + }, + { + "epoch": 0.6913830910419881, + "grad_norm": 4.197838163994761, + "learning_rate": 1.1483728097768116e-06, + "loss": 0.4427, + "step": 8513 + }, + { + "epoch": 0.6914643060180298, + "grad_norm": 4.757540075293028, + "learning_rate": 1.14781963381919e-06, + "loss": 0.4649, + "step": 8514 + }, + { + "epoch": 0.6915455209940713, + "grad_norm": 4.228790147362145, + "learning_rate": 1.1472665514252943e-06, + "loss": 0.4501, + "step": 8515 + }, + { + "epoch": 0.6916267359701129, + "grad_norm": 3.198795799968073, + "learning_rate": 1.146713562633394e-06, + "loss": 0.4265, + "step": 8516 + }, + { + "epoch": 0.6917079509461544, + "grad_norm": 3.293645191890074, + "learning_rate": 1.1461606674817518e-06, + "loss": 0.5258, + "step": 8517 + }, + { + "epoch": 0.6917891659221961, + "grad_norm": 4.377855525825729, + "learning_rate": 1.1456078660086266e-06, + "loss": 0.4102, + "step": 8518 + }, + { + "epoch": 0.6918703808982376, + "grad_norm": 9.573511509775727, + "learning_rate": 1.1450551582522702e-06, + "loss": 0.3486, + "step": 8519 + }, + { + "epoch": 0.6919515958742792, + "grad_norm": 3.560198474639802, + "learning_rate": 1.1445025442509258e-06, + "loss": 0.4863, + "step": 8520 + }, + { + "epoch": 0.6920328108503208, + "grad_norm": 6.303365456673007, + "learning_rate": 1.1439500240428304e-06, + "loss": 0.5363, + "step": 8521 + }, + { + "epoch": 0.6921140258263624, + "grad_norm": 5.438200231090006, + "learning_rate": 1.1433975976662172e-06, + "loss": 0.5908, + "step": 8522 + }, + { + "epoch": 0.6921952408024039, + "grad_norm": 7.217503219294745, + "learning_rate": 1.1428452651593102e-06, + "loss": 0.5535, + "step": 8523 + }, + { + "epoch": 0.6922764557784455, + "grad_norm": 3.113332201559408, + "learning_rate": 1.142293026560328e-06, + "loss": 0.4252, + "step": 8524 + }, + { + "epoch": 0.6923576707544872, + "grad_norm": 3.7670638203585396, + "learning_rate": 1.1417408819074835e-06, + "loss": 0.5357, + "step": 8525 + }, + { + "epoch": 0.6924388857305287, + "grad_norm": 9.156807084218489, + "learning_rate": 1.1411888312389815e-06, + "loss": 0.5814, + "step": 8526 + }, + { + "epoch": 0.6925201007065703, + "grad_norm": 7.477340025472298, + "learning_rate": 1.1406368745930201e-06, + "loss": 0.5775, + "step": 8527 + }, + { + "epoch": 0.6926013156826119, + "grad_norm": 5.777723223269901, + "learning_rate": 1.140085012007794e-06, + "loss": 0.4531, + "step": 8528 + }, + { + "epoch": 0.6926825306586535, + "grad_norm": 17.932544473465622, + "learning_rate": 1.1395332435214873e-06, + "loss": 0.5404, + "step": 8529 + }, + { + "epoch": 0.692763745634695, + "grad_norm": 6.420149798086578, + "learning_rate": 1.138981569172279e-06, + "loss": 0.4261, + "step": 8530 + }, + { + "epoch": 0.6928449606107366, + "grad_norm": 9.184826721296508, + "learning_rate": 1.1384299889983432e-06, + "loss": 0.4474, + "step": 8531 + }, + { + "epoch": 0.6929261755867782, + "grad_norm": 8.376036553312167, + "learning_rate": 1.1378785030378473e-06, + "loss": 0.5644, + "step": 8532 + }, + { + "epoch": 0.6930073905628198, + "grad_norm": 4.928837681497556, + "learning_rate": 1.137327111328949e-06, + "loss": 0.3391, + "step": 8533 + }, + { + "epoch": 0.6930886055388613, + "grad_norm": 4.407213199355116, + "learning_rate": 1.1367758139098037e-06, + "loss": 0.453, + "step": 8534 + }, + { + "epoch": 0.693169820514903, + "grad_norm": 4.884167064220795, + "learning_rate": 1.1362246108185571e-06, + "loss": 0.4148, + "step": 8535 + }, + { + "epoch": 0.6932510354909446, + "grad_norm": 5.275613682866477, + "learning_rate": 1.135673502093349e-06, + "loss": 0.6555, + "step": 8536 + }, + { + "epoch": 0.6933322504669861, + "grad_norm": 4.72395706382707, + "learning_rate": 1.1351224877723137e-06, + "loss": 0.4664, + "step": 8537 + }, + { + "epoch": 0.6934134654430277, + "grad_norm": 6.683252761194182, + "learning_rate": 1.1345715678935802e-06, + "loss": 0.4335, + "step": 8538 + }, + { + "epoch": 0.6934946804190693, + "grad_norm": 3.6056858456464704, + "learning_rate": 1.1340207424952673e-06, + "loss": 0.4784, + "step": 8539 + }, + { + "epoch": 0.6935758953951109, + "grad_norm": 7.002949745318934, + "learning_rate": 1.133470011615489e-06, + "loss": 0.5434, + "step": 8540 + }, + { + "epoch": 0.6936571103711524, + "grad_norm": 6.659133719702008, + "learning_rate": 1.1329193752923543e-06, + "loss": 0.5708, + "step": 8541 + }, + { + "epoch": 0.693738325347194, + "grad_norm": 12.35767990560376, + "learning_rate": 1.1323688335639637e-06, + "loss": 0.4835, + "step": 8542 + }, + { + "epoch": 0.6938195403232356, + "grad_norm": 3.77158348926073, + "learning_rate": 1.131818386468411e-06, + "loss": 0.4171, + "step": 8543 + }, + { + "epoch": 0.6939007552992772, + "grad_norm": 5.1758054265529205, + "learning_rate": 1.1312680340437848e-06, + "loss": 0.4684, + "step": 8544 + }, + { + "epoch": 0.6939819702753187, + "grad_norm": 4.915722528576737, + "learning_rate": 1.130717776328168e-06, + "loss": 0.2615, + "step": 8545 + }, + { + "epoch": 0.6940631852513603, + "grad_norm": 3.1585274055113914, + "learning_rate": 1.130167613359633e-06, + "loss": 0.5899, + "step": 8546 + }, + { + "epoch": 0.694144400227402, + "grad_norm": 4.7926238667126055, + "learning_rate": 1.1296175451762504e-06, + "loss": 0.3803, + "step": 8547 + }, + { + "epoch": 0.6942256152034435, + "grad_norm": 4.742004176848653, + "learning_rate": 1.129067571816081e-06, + "loss": 0.5899, + "step": 8548 + }, + { + "epoch": 0.6943068301794851, + "grad_norm": 4.740522772470059, + "learning_rate": 1.128517693317179e-06, + "loss": 0.5, + "step": 8549 + }, + { + "epoch": 0.6943880451555267, + "grad_norm": 3.0796548456527457, + "learning_rate": 1.1279679097175944e-06, + "loss": 0.4379, + "step": 8550 + }, + { + "epoch": 0.6944692601315683, + "grad_norm": 4.872361456956494, + "learning_rate": 1.12741822105537e-06, + "loss": 0.4375, + "step": 8551 + }, + { + "epoch": 0.6945504751076098, + "grad_norm": 3.8126089904616625, + "learning_rate": 1.1268686273685391e-06, + "loss": 0.41, + "step": 8552 + }, + { + "epoch": 0.6946316900836514, + "grad_norm": 4.918083532251475, + "learning_rate": 1.1263191286951333e-06, + "loss": 0.7237, + "step": 8553 + }, + { + "epoch": 0.694712905059693, + "grad_norm": 4.980898147720527, + "learning_rate": 1.1257697250731735e-06, + "loss": 0.5379, + "step": 8554 + }, + { + "epoch": 0.6947941200357346, + "grad_norm": 3.3037282568689204, + "learning_rate": 1.1252204165406753e-06, + "loss": 0.5215, + "step": 8555 + }, + { + "epoch": 0.6948753350117761, + "grad_norm": 4.960611865059338, + "learning_rate": 1.1246712031356486e-06, + "loss": 0.5758, + "step": 8556 + }, + { + "epoch": 0.6949565499878178, + "grad_norm": 7.319288707757544, + "learning_rate": 1.1241220848960952e-06, + "loss": 0.4719, + "step": 8557 + }, + { + "epoch": 0.6950377649638594, + "grad_norm": 3.819399015745177, + "learning_rate": 1.1235730618600126e-06, + "loss": 0.4936, + "step": 8558 + }, + { + "epoch": 0.6951189799399009, + "grad_norm": 3.538376081354946, + "learning_rate": 1.1230241340653888e-06, + "loss": 0.5413, + "step": 8559 + }, + { + "epoch": 0.6952001949159425, + "grad_norm": 6.249841085120328, + "learning_rate": 1.122475301550208e-06, + "loss": 0.5515, + "step": 8560 + }, + { + "epoch": 0.6952814098919841, + "grad_norm": 8.110595035089235, + "learning_rate": 1.121926564352446e-06, + "loss": 0.6036, + "step": 8561 + }, + { + "epoch": 0.6953626248680257, + "grad_norm": 4.635874973816129, + "learning_rate": 1.1213779225100715e-06, + "loss": 0.7324, + "step": 8562 + }, + { + "epoch": 0.6954438398440672, + "grad_norm": 6.322869921153947, + "learning_rate": 1.1208293760610486e-06, + "loss": 0.52, + "step": 8563 + }, + { + "epoch": 0.6955250548201088, + "grad_norm": 7.6067666576253545, + "learning_rate": 1.1202809250433345e-06, + "loss": 0.3976, + "step": 8564 + }, + { + "epoch": 0.6956062697961504, + "grad_norm": 4.351083583151557, + "learning_rate": 1.1197325694948774e-06, + "loss": 0.5236, + "step": 8565 + }, + { + "epoch": 0.695687484772192, + "grad_norm": 5.090734435411529, + "learning_rate": 1.1191843094536225e-06, + "loss": 0.4614, + "step": 8566 + }, + { + "epoch": 0.6957686997482335, + "grad_norm": 6.001665213669127, + "learning_rate": 1.1186361449575055e-06, + "loss": 0.3699, + "step": 8567 + }, + { + "epoch": 0.6958499147242752, + "grad_norm": 6.350310371588752, + "learning_rate": 1.1180880760444558e-06, + "loss": 0.6122, + "step": 8568 + }, + { + "epoch": 0.6959311297003168, + "grad_norm": 4.918916151218767, + "learning_rate": 1.117540102752398e-06, + "loss": 0.4498, + "step": 8569 + }, + { + "epoch": 0.6960123446763583, + "grad_norm": 4.2337365050726286, + "learning_rate": 1.116992225119248e-06, + "loss": 0.3585, + "step": 8570 + }, + { + "epoch": 0.6960935596523999, + "grad_norm": 4.164412182555689, + "learning_rate": 1.1164444431829163e-06, + "loss": 0.5052, + "step": 8571 + }, + { + "epoch": 0.6961747746284415, + "grad_norm": 4.915037193176119, + "learning_rate": 1.1158967569813079e-06, + "loss": 0.4742, + "step": 8572 + }, + { + "epoch": 0.6962559896044831, + "grad_norm": 7.126882606950106, + "learning_rate": 1.1153491665523186e-06, + "loss": 0.5764, + "step": 8573 + }, + { + "epoch": 0.6963372045805246, + "grad_norm": 6.194568824850657, + "learning_rate": 1.1148016719338387e-06, + "loss": 0.3955, + "step": 8574 + }, + { + "epoch": 0.6964184195565662, + "grad_norm": 4.582251362036855, + "learning_rate": 1.1142542731637513e-06, + "loss": 0.2946, + "step": 8575 + }, + { + "epoch": 0.6964996345326078, + "grad_norm": 4.565291424432009, + "learning_rate": 1.1137069702799341e-06, + "loss": 0.5785, + "step": 8576 + }, + { + "epoch": 0.6965808495086494, + "grad_norm": 6.268014709041092, + "learning_rate": 1.1131597633202587e-06, + "loss": 0.5502, + "step": 8577 + }, + { + "epoch": 0.6966620644846909, + "grad_norm": 4.232016340106684, + "learning_rate": 1.1126126523225869e-06, + "loss": 0.4602, + "step": 8578 + }, + { + "epoch": 0.6967432794607326, + "grad_norm": 5.324130939463538, + "learning_rate": 1.112065637324778e-06, + "loss": 0.4341, + "step": 8579 + }, + { + "epoch": 0.6968244944367742, + "grad_norm": 3.6701229318957034, + "learning_rate": 1.1115187183646814e-06, + "loss": 0.5489, + "step": 8580 + }, + { + "epoch": 0.6969057094128157, + "grad_norm": 4.1075695508244365, + "learning_rate": 1.1109718954801398e-06, + "loss": 0.5336, + "step": 8581 + }, + { + "epoch": 0.6969869243888573, + "grad_norm": 4.14721530411858, + "learning_rate": 1.110425168708993e-06, + "loss": 0.5781, + "step": 8582 + }, + { + "epoch": 0.6970681393648989, + "grad_norm": 5.832775679972837, + "learning_rate": 1.1098785380890696e-06, + "loss": 0.427, + "step": 8583 + }, + { + "epoch": 0.6971493543409405, + "grad_norm": 3.33845605592306, + "learning_rate": 1.1093320036581936e-06, + "loss": 0.4442, + "step": 8584 + }, + { + "epoch": 0.697230569316982, + "grad_norm": 5.037481188891807, + "learning_rate": 1.1087855654541843e-06, + "loss": 0.368, + "step": 8585 + }, + { + "epoch": 0.6973117842930237, + "grad_norm": 4.499749726051026, + "learning_rate": 1.1082392235148509e-06, + "loss": 0.4573, + "step": 8586 + }, + { + "epoch": 0.6973929992690652, + "grad_norm": 4.154523490167273, + "learning_rate": 1.1076929778779965e-06, + "loss": 0.4741, + "step": 8587 + }, + { + "epoch": 0.6974742142451068, + "grad_norm": 6.620859889971063, + "learning_rate": 1.1071468285814201e-06, + "loss": 0.5117, + "step": 8588 + }, + { + "epoch": 0.6975554292211483, + "grad_norm": 5.322737847031269, + "learning_rate": 1.106600775662911e-06, + "loss": 0.428, + "step": 8589 + }, + { + "epoch": 0.69763664419719, + "grad_norm": 6.543366704108589, + "learning_rate": 1.1060548191602535e-06, + "loss": 0.5052, + "step": 8590 + }, + { + "epoch": 0.6977178591732316, + "grad_norm": 3.978052554444944, + "learning_rate": 1.105508959111226e-06, + "loss": 0.4051, + "step": 8591 + }, + { + "epoch": 0.6977990741492731, + "grad_norm": 4.013135595680058, + "learning_rate": 1.1049631955535985e-06, + "loss": 0.5263, + "step": 8592 + }, + { + "epoch": 0.6978802891253147, + "grad_norm": 7.931439768063147, + "learning_rate": 1.1044175285251348e-06, + "loss": 0.4994, + "step": 8593 + }, + { + "epoch": 0.6979615041013563, + "grad_norm": 7.9626002926512465, + "learning_rate": 1.1038719580635913e-06, + "loss": 0.5575, + "step": 8594 + }, + { + "epoch": 0.6980427190773979, + "grad_norm": 5.399616750795122, + "learning_rate": 1.103326484206719e-06, + "loss": 0.492, + "step": 8595 + }, + { + "epoch": 0.6981239340534394, + "grad_norm": 5.8829565085948765, + "learning_rate": 1.1027811069922634e-06, + "loss": 0.5324, + "step": 8596 + }, + { + "epoch": 0.698205149029481, + "grad_norm": 4.044823741608967, + "learning_rate": 1.1022358264579593e-06, + "loss": 0.4975, + "step": 8597 + }, + { + "epoch": 0.6982863640055226, + "grad_norm": 5.039936723673592, + "learning_rate": 1.1016906426415397e-06, + "loss": 0.6072, + "step": 8598 + }, + { + "epoch": 0.6983675789815642, + "grad_norm": 12.620558241769995, + "learning_rate": 1.1011455555807272e-06, + "loss": 0.4333, + "step": 8599 + }, + { + "epoch": 0.6984487939576057, + "grad_norm": 4.360858885346072, + "learning_rate": 1.1006005653132376e-06, + "loss": 0.4666, + "step": 8600 + }, + { + "epoch": 0.6985300089336474, + "grad_norm": 4.573558742038945, + "learning_rate": 1.100055671876784e-06, + "loss": 0.5247, + "step": 8601 + }, + { + "epoch": 0.698611223909689, + "grad_norm": 5.213935252357303, + "learning_rate": 1.0995108753090677e-06, + "loss": 0.4746, + "step": 8602 + }, + { + "epoch": 0.6986924388857305, + "grad_norm": 5.309900569340087, + "learning_rate": 1.0989661756477869e-06, + "loss": 0.4982, + "step": 8603 + }, + { + "epoch": 0.6987736538617721, + "grad_norm": 6.957485365514494, + "learning_rate": 1.0984215729306328e-06, + "loss": 0.6216, + "step": 8604 + }, + { + "epoch": 0.6988548688378137, + "grad_norm": 4.191127343883717, + "learning_rate": 1.097877067195288e-06, + "loss": 0.5468, + "step": 8605 + }, + { + "epoch": 0.6989360838138553, + "grad_norm": 4.214027815939813, + "learning_rate": 1.0973326584794286e-06, + "loss": 0.5753, + "step": 8606 + }, + { + "epoch": 0.6990172987898968, + "grad_norm": 5.627295117665019, + "learning_rate": 1.0967883468207265e-06, + "loss": 0.3873, + "step": 8607 + }, + { + "epoch": 0.6990985137659385, + "grad_norm": 5.494066620160367, + "learning_rate": 1.0962441322568437e-06, + "loss": 0.4377, + "step": 8608 + }, + { + "epoch": 0.69917972874198, + "grad_norm": 4.933052728810609, + "learning_rate": 1.0957000148254387e-06, + "loss": 0.4346, + "step": 8609 + }, + { + "epoch": 0.6992609437180216, + "grad_norm": 4.67963560490583, + "learning_rate": 1.0951559945641592e-06, + "loss": 0.4196, + "step": 8610 + }, + { + "epoch": 0.6993421586940631, + "grad_norm": 4.555476078942722, + "learning_rate": 1.094612071510651e-06, + "loss": 0.5331, + "step": 8611 + }, + { + "epoch": 0.6994233736701048, + "grad_norm": 4.542359966546809, + "learning_rate": 1.0940682457025498e-06, + "loss": 0.3957, + "step": 8612 + }, + { + "epoch": 0.6995045886461464, + "grad_norm": 4.4445515035232495, + "learning_rate": 1.0935245171774842e-06, + "loss": 0.6409, + "step": 8613 + }, + { + "epoch": 0.6995858036221879, + "grad_norm": 5.499967666697894, + "learning_rate": 1.092980885973079e-06, + "loss": 0.5317, + "step": 8614 + }, + { + "epoch": 0.6996670185982296, + "grad_norm": 4.090307339205739, + "learning_rate": 1.0924373521269492e-06, + "loss": 0.3992, + "step": 8615 + }, + { + "epoch": 0.6997482335742711, + "grad_norm": 5.094031142055631, + "learning_rate": 1.091893915676705e-06, + "loss": 0.8069, + "step": 8616 + }, + { + "epoch": 0.6998294485503127, + "grad_norm": 5.102016167710116, + "learning_rate": 1.0913505766599506e-06, + "loss": 0.5136, + "step": 8617 + }, + { + "epoch": 0.6999106635263542, + "grad_norm": 6.840823358662633, + "learning_rate": 1.090807335114281e-06, + "loss": 0.4567, + "step": 8618 + }, + { + "epoch": 0.6999918785023959, + "grad_norm": 6.604394180582011, + "learning_rate": 1.0902641910772852e-06, + "loss": 0.411, + "step": 8619 + }, + { + "epoch": 0.7000730934784374, + "grad_norm": 5.660730236893839, + "learning_rate": 1.0897211445865472e-06, + "loss": 0.432, + "step": 8620 + }, + { + "epoch": 0.700154308454479, + "grad_norm": 4.702707757036113, + "learning_rate": 1.089178195679641e-06, + "loss": 0.3665, + "step": 8621 + }, + { + "epoch": 0.7002355234305205, + "grad_norm": 4.96553057734693, + "learning_rate": 1.0886353443941373e-06, + "loss": 0.4387, + "step": 8622 + }, + { + "epoch": 0.7003167384065622, + "grad_norm": 8.264862786380114, + "learning_rate": 1.088092590767599e-06, + "loss": 0.3632, + "step": 8623 + }, + { + "epoch": 0.7003979533826038, + "grad_norm": 6.893091911108661, + "learning_rate": 1.0875499348375812e-06, + "loss": 0.5675, + "step": 8624 + }, + { + "epoch": 0.7004791683586453, + "grad_norm": 4.387426391179371, + "learning_rate": 1.0870073766416315e-06, + "loss": 0.4464, + "step": 8625 + }, + { + "epoch": 0.700560383334687, + "grad_norm": 5.027871465737512, + "learning_rate": 1.0864649162172941e-06, + "loss": 0.6075, + "step": 8626 + }, + { + "epoch": 0.7006415983107285, + "grad_norm": 6.143259232572587, + "learning_rate": 1.0859225536021034e-06, + "loss": 0.5228, + "step": 8627 + }, + { + "epoch": 0.7007228132867701, + "grad_norm": 6.464771014837394, + "learning_rate": 1.0853802888335874e-06, + "loss": 0.6185, + "step": 8628 + }, + { + "epoch": 0.7008040282628116, + "grad_norm": 4.205829517254945, + "learning_rate": 1.0848381219492684e-06, + "loss": 0.498, + "step": 8629 + }, + { + "epoch": 0.7008852432388533, + "grad_norm": 3.9072366994873855, + "learning_rate": 1.0842960529866627e-06, + "loss": 0.4887, + "step": 8630 + }, + { + "epoch": 0.7009664582148948, + "grad_norm": 5.509854679596712, + "learning_rate": 1.0837540819832779e-06, + "loss": 0.4346, + "step": 8631 + }, + { + "epoch": 0.7010476731909364, + "grad_norm": 4.3762550261485, + "learning_rate": 1.0832122089766143e-06, + "loss": 0.4646, + "step": 8632 + }, + { + "epoch": 0.7011288881669779, + "grad_norm": 4.21313805277909, + "learning_rate": 1.082670434004168e-06, + "loss": 0.605, + "step": 8633 + }, + { + "epoch": 0.7012101031430196, + "grad_norm": 5.0290295451057245, + "learning_rate": 1.0821287571034261e-06, + "loss": 0.5164, + "step": 8634 + }, + { + "epoch": 0.7012913181190612, + "grad_norm": 3.852504953053529, + "learning_rate": 1.0815871783118701e-06, + "loss": 0.5563, + "step": 8635 + }, + { + "epoch": 0.7013725330951027, + "grad_norm": 4.4031006382832265, + "learning_rate": 1.0810456976669753e-06, + "loss": 0.4816, + "step": 8636 + }, + { + "epoch": 0.7014537480711444, + "grad_norm": 5.020999898610326, + "learning_rate": 1.0805043152062086e-06, + "loss": 0.5762, + "step": 8637 + }, + { + "epoch": 0.7015349630471859, + "grad_norm": 12.438678796300241, + "learning_rate": 1.07996303096703e-06, + "loss": 0.4189, + "step": 8638 + }, + { + "epoch": 0.7016161780232275, + "grad_norm": 3.8964414970182455, + "learning_rate": 1.0794218449868948e-06, + "loss": 0.4803, + "step": 8639 + }, + { + "epoch": 0.701697392999269, + "grad_norm": 6.66037981965594, + "learning_rate": 1.07888075730325e-06, + "loss": 0.4304, + "step": 8640 + }, + { + "epoch": 0.7017786079753107, + "grad_norm": 9.601550182923392, + "learning_rate": 1.0783397679535343e-06, + "loss": 0.5567, + "step": 8641 + }, + { + "epoch": 0.7018598229513522, + "grad_norm": 5.837088176132076, + "learning_rate": 1.077798876975183e-06, + "loss": 0.3856, + "step": 8642 + }, + { + "epoch": 0.7019410379273938, + "grad_norm": 4.127084436289391, + "learning_rate": 1.0772580844056232e-06, + "loss": 0.5247, + "step": 8643 + }, + { + "epoch": 0.7020222529034353, + "grad_norm": 3.613384254118512, + "learning_rate": 1.0767173902822733e-06, + "loss": 0.3272, + "step": 8644 + }, + { + "epoch": 0.702103467879477, + "grad_norm": 2.5823659536071637, + "learning_rate": 1.0761767946425482e-06, + "loss": 0.5089, + "step": 8645 + }, + { + "epoch": 0.7021846828555186, + "grad_norm": 4.879797467260154, + "learning_rate": 1.0756362975238539e-06, + "loss": 0.8247, + "step": 8646 + }, + { + "epoch": 0.7022658978315601, + "grad_norm": 6.519914984541137, + "learning_rate": 1.0750958989635879e-06, + "loss": 0.4224, + "step": 8647 + }, + { + "epoch": 0.7023471128076018, + "grad_norm": 3.9501597340899903, + "learning_rate": 1.074555598999145e-06, + "loss": 0.571, + "step": 8648 + }, + { + "epoch": 0.7024283277836433, + "grad_norm": 6.770723225323952, + "learning_rate": 1.0740153976679114e-06, + "loss": 0.4328, + "step": 8649 + }, + { + "epoch": 0.7025095427596849, + "grad_norm": 13.76466570186757, + "learning_rate": 1.073475295007265e-06, + "loss": 0.424, + "step": 8650 + }, + { + "epoch": 0.7025907577357264, + "grad_norm": 5.062420028751364, + "learning_rate": 1.0729352910545779e-06, + "loss": 0.5359, + "step": 8651 + }, + { + "epoch": 0.7026719727117681, + "grad_norm": 8.4588766125872, + "learning_rate": 1.0723953858472167e-06, + "loss": 0.4192, + "step": 8652 + }, + { + "epoch": 0.7027531876878096, + "grad_norm": 4.594691832051315, + "learning_rate": 1.0718555794225385e-06, + "loss": 0.4356, + "step": 8653 + }, + { + "epoch": 0.7028344026638512, + "grad_norm": 5.910635422121678, + "learning_rate": 1.071315871817896e-06, + "loss": 0.4946, + "step": 8654 + }, + { + "epoch": 0.7029156176398927, + "grad_norm": 7.200371807778343, + "learning_rate": 1.0707762630706345e-06, + "loss": 0.6541, + "step": 8655 + }, + { + "epoch": 0.7029968326159344, + "grad_norm": 4.327245345198459, + "learning_rate": 1.0702367532180919e-06, + "loss": 0.5585, + "step": 8656 + }, + { + "epoch": 0.703078047591976, + "grad_norm": 5.70898727396328, + "learning_rate": 1.0696973422975978e-06, + "loss": 0.4336, + "step": 8657 + }, + { + "epoch": 0.7031592625680175, + "grad_norm": 3.3122955544251, + "learning_rate": 1.0691580303464791e-06, + "loss": 0.7113, + "step": 8658 + }, + { + "epoch": 0.7032404775440592, + "grad_norm": 6.853811044628905, + "learning_rate": 1.068618817402052e-06, + "loss": 0.5942, + "step": 8659 + }, + { + "epoch": 0.7033216925201007, + "grad_norm": 6.076640488626101, + "learning_rate": 1.0680797035016264e-06, + "loss": 0.657, + "step": 8660 + }, + { + "epoch": 0.7034029074961423, + "grad_norm": 6.2346543123925136, + "learning_rate": 1.0675406886825065e-06, + "loss": 0.5554, + "step": 8661 + }, + { + "epoch": 0.7034841224721838, + "grad_norm": 4.087802951829346, + "learning_rate": 1.0670017729819911e-06, + "loss": 0.6035, + "step": 8662 + }, + { + "epoch": 0.7035653374482255, + "grad_norm": 4.8340326949666, + "learning_rate": 1.066462956437369e-06, + "loss": 0.4762, + "step": 8663 + }, + { + "epoch": 0.703646552424267, + "grad_norm": 5.486314431686029, + "learning_rate": 1.0659242390859224e-06, + "loss": 0.474, + "step": 8664 + }, + { + "epoch": 0.7037277674003086, + "grad_norm": 5.968816148722602, + "learning_rate": 1.0653856209649297e-06, + "loss": 0.4181, + "step": 8665 + }, + { + "epoch": 0.7038089823763501, + "grad_norm": 4.467813005237297, + "learning_rate": 1.0648471021116584e-06, + "loss": 0.5303, + "step": 8666 + }, + { + "epoch": 0.7038901973523918, + "grad_norm": 3.4998502366425757, + "learning_rate": 1.0643086825633723e-06, + "loss": 0.5941, + "step": 8667 + }, + { + "epoch": 0.7039714123284334, + "grad_norm": 3.138792298225771, + "learning_rate": 1.0637703623573278e-06, + "loss": 0.6115, + "step": 8668 + }, + { + "epoch": 0.7040526273044749, + "grad_norm": 4.420119383519892, + "learning_rate": 1.0632321415307734e-06, + "loss": 0.4483, + "step": 8669 + }, + { + "epoch": 0.7041338422805166, + "grad_norm": 5.331801055403152, + "learning_rate": 1.0626940201209497e-06, + "loss": 0.3817, + "step": 8670 + }, + { + "epoch": 0.7042150572565581, + "grad_norm": 5.750793644056218, + "learning_rate": 1.062155998165094e-06, + "loss": 0.5817, + "step": 8671 + }, + { + "epoch": 0.7042962722325997, + "grad_norm": 3.4756270753695104, + "learning_rate": 1.0616180757004333e-06, + "loss": 0.6701, + "step": 8672 + }, + { + "epoch": 0.7043774872086412, + "grad_norm": 6.270791519856763, + "learning_rate": 1.0610802527641883e-06, + "loss": 0.4016, + "step": 8673 + }, + { + "epoch": 0.7044587021846829, + "grad_norm": 5.4889815502837775, + "learning_rate": 1.0605425293935748e-06, + "loss": 0.431, + "step": 8674 + }, + { + "epoch": 0.7045399171607244, + "grad_norm": 4.255319581514907, + "learning_rate": 1.0600049056258008e-06, + "loss": 0.623, + "step": 8675 + }, + { + "epoch": 0.704621132136766, + "grad_norm": 4.619912538008321, + "learning_rate": 1.0594673814980652e-06, + "loss": 0.4224, + "step": 8676 + }, + { + "epoch": 0.7047023471128075, + "grad_norm": 4.4624099848947445, + "learning_rate": 1.058929957047564e-06, + "loss": 0.4773, + "step": 8677 + }, + { + "epoch": 0.7047835620888492, + "grad_norm": 3.568435548062833, + "learning_rate": 1.0583926323114829e-06, + "loss": 0.5004, + "step": 8678 + }, + { + "epoch": 0.7048647770648908, + "grad_norm": 5.199277112036462, + "learning_rate": 1.057855407327001e-06, + "loss": 0.5205, + "step": 8679 + }, + { + "epoch": 0.7049459920409323, + "grad_norm": 4.806067495687931, + "learning_rate": 1.0573182821312927e-06, + "loss": 0.4348, + "step": 8680 + }, + { + "epoch": 0.705027207016974, + "grad_norm": 4.581743151795791, + "learning_rate": 1.056781256761525e-06, + "loss": 0.5282, + "step": 8681 + }, + { + "epoch": 0.7051084219930155, + "grad_norm": 4.187102501676517, + "learning_rate": 1.0562443312548558e-06, + "loss": 0.3811, + "step": 8682 + }, + { + "epoch": 0.7051896369690571, + "grad_norm": 10.94900482763677, + "learning_rate": 1.0557075056484373e-06, + "loss": 0.4645, + "step": 8683 + }, + { + "epoch": 0.7052708519450986, + "grad_norm": 3.286189799041891, + "learning_rate": 1.0551707799794164e-06, + "loss": 0.515, + "step": 8684 + }, + { + "epoch": 0.7053520669211403, + "grad_norm": 5.1687388035544535, + "learning_rate": 1.054634154284931e-06, + "loss": 0.3609, + "step": 8685 + }, + { + "epoch": 0.7054332818971818, + "grad_norm": 6.765257196850052, + "learning_rate": 1.0540976286021115e-06, + "loss": 0.5426, + "step": 8686 + }, + { + "epoch": 0.7055144968732234, + "grad_norm": 5.1893140007432885, + "learning_rate": 1.053561202968084e-06, + "loss": 0.4086, + "step": 8687 + }, + { + "epoch": 0.705595711849265, + "grad_norm": 4.075572539458918, + "learning_rate": 1.053024877419967e-06, + "loss": 0.4706, + "step": 8688 + }, + { + "epoch": 0.7056769268253066, + "grad_norm": 5.198722303974706, + "learning_rate": 1.0524886519948693e-06, + "loss": 0.5246, + "step": 8689 + }, + { + "epoch": 0.7057581418013482, + "grad_norm": 5.742252264604989, + "learning_rate": 1.0519525267298972e-06, + "loss": 0.3365, + "step": 8690 + }, + { + "epoch": 0.7058393567773897, + "grad_norm": 4.08553333166632, + "learning_rate": 1.0514165016621464e-06, + "loss": 0.6085, + "step": 8691 + }, + { + "epoch": 0.7059205717534314, + "grad_norm": 5.7021732693041995, + "learning_rate": 1.0508805768287061e-06, + "loss": 0.5561, + "step": 8692 + }, + { + "epoch": 0.7060017867294729, + "grad_norm": 11.500144742392935, + "learning_rate": 1.050344752266661e-06, + "loss": 0.4041, + "step": 8693 + }, + { + "epoch": 0.7060830017055145, + "grad_norm": 5.728188208611933, + "learning_rate": 1.0498090280130873e-06, + "loss": 0.5148, + "step": 8694 + }, + { + "epoch": 0.706164216681556, + "grad_norm": 3.8194604173660442, + "learning_rate": 1.0492734041050532e-06, + "loss": 0.5706, + "step": 8695 + }, + { + "epoch": 0.7062454316575977, + "grad_norm": 4.238106988426028, + "learning_rate": 1.0487378805796225e-06, + "loss": 0.4247, + "step": 8696 + }, + { + "epoch": 0.7063266466336392, + "grad_norm": 4.981104011286187, + "learning_rate": 1.0482024574738498e-06, + "loss": 0.4802, + "step": 8697 + }, + { + "epoch": 0.7064078616096808, + "grad_norm": 5.353614712971508, + "learning_rate": 1.0476671348247834e-06, + "loss": 0.3993, + "step": 8698 + }, + { + "epoch": 0.7064890765857224, + "grad_norm": 5.533743790772518, + "learning_rate": 1.047131912669464e-06, + "loss": 0.391, + "step": 8699 + }, + { + "epoch": 0.706570291561764, + "grad_norm": 5.99441767333963, + "learning_rate": 1.0465967910449274e-06, + "loss": 0.4343, + "step": 8700 + }, + { + "epoch": 0.7066515065378056, + "grad_norm": 10.472893307184739, + "learning_rate": 1.046061769988201e-06, + "loss": 0.3472, + "step": 8701 + }, + { + "epoch": 0.7067327215138471, + "grad_norm": 6.614917885036994, + "learning_rate": 1.045526849536305e-06, + "loss": 0.3806, + "step": 8702 + }, + { + "epoch": 0.7068139364898888, + "grad_norm": 6.161723653557723, + "learning_rate": 1.0449920297262542e-06, + "loss": 0.5681, + "step": 8703 + }, + { + "epoch": 0.7068951514659303, + "grad_norm": 4.341342853033297, + "learning_rate": 1.0444573105950543e-06, + "loss": 0.4967, + "step": 8704 + }, + { + "epoch": 0.7069763664419719, + "grad_norm": 3.964088413604789, + "learning_rate": 1.0439226921797042e-06, + "loss": 0.5641, + "step": 8705 + }, + { + "epoch": 0.7070575814180134, + "grad_norm": 7.436857002923769, + "learning_rate": 1.0433881745171976e-06, + "loss": 0.3931, + "step": 8706 + }, + { + "epoch": 0.7071387963940551, + "grad_norm": 21.38007720377836, + "learning_rate": 1.042853757644521e-06, + "loss": 0.5358, + "step": 8707 + }, + { + "epoch": 0.7072200113700966, + "grad_norm": 9.571044595510465, + "learning_rate": 1.0423194415986518e-06, + "loss": 0.394, + "step": 8708 + }, + { + "epoch": 0.7073012263461382, + "grad_norm": 3.4522432755903045, + "learning_rate": 1.0417852264165637e-06, + "loss": 0.7711, + "step": 8709 + }, + { + "epoch": 0.7073824413221798, + "grad_norm": 4.437337860525526, + "learning_rate": 1.0412511121352201e-06, + "loss": 0.5022, + "step": 8710 + }, + { + "epoch": 0.7074636562982214, + "grad_norm": 4.832102307573993, + "learning_rate": 1.0407170987915786e-06, + "loss": 0.3741, + "step": 8711 + }, + { + "epoch": 0.707544871274263, + "grad_norm": 4.424143419021889, + "learning_rate": 1.0401831864225915e-06, + "loss": 0.4477, + "step": 8712 + }, + { + "epoch": 0.7076260862503045, + "grad_norm": 4.167265583846372, + "learning_rate": 1.0396493750652008e-06, + "loss": 0.6794, + "step": 8713 + }, + { + "epoch": 0.7077073012263462, + "grad_norm": 5.174243065313732, + "learning_rate": 1.039115664756345e-06, + "loss": 0.4642, + "step": 8714 + }, + { + "epoch": 0.7077885162023877, + "grad_norm": 4.436027938464365, + "learning_rate": 1.0385820555329543e-06, + "loss": 0.5378, + "step": 8715 + }, + { + "epoch": 0.7078697311784293, + "grad_norm": 6.048625139053751, + "learning_rate": 1.0380485474319507e-06, + "loss": 0.4807, + "step": 8716 + }, + { + "epoch": 0.7079509461544709, + "grad_norm": 4.409054430942545, + "learning_rate": 1.0375151404902507e-06, + "loss": 0.4596, + "step": 8717 + }, + { + "epoch": 0.7080321611305125, + "grad_norm": 6.231874069745442, + "learning_rate": 1.0369818347447617e-06, + "loss": 0.396, + "step": 8718 + }, + { + "epoch": 0.708113376106554, + "grad_norm": 6.571192019178439, + "learning_rate": 1.0364486302323868e-06, + "loss": 0.6371, + "step": 8719 + }, + { + "epoch": 0.7081945910825956, + "grad_norm": 6.2588184473383555, + "learning_rate": 1.035915526990022e-06, + "loss": 0.4857, + "step": 8720 + }, + { + "epoch": 0.7082758060586372, + "grad_norm": 4.086683961249212, + "learning_rate": 1.0353825250545533e-06, + "loss": 0.4409, + "step": 8721 + }, + { + "epoch": 0.7083570210346788, + "grad_norm": 4.896811289532637, + "learning_rate": 1.0348496244628633e-06, + "loss": 0.4143, + "step": 8722 + }, + { + "epoch": 0.7084382360107204, + "grad_norm": 6.411254993382916, + "learning_rate": 1.0343168252518252e-06, + "loss": 0.4197, + "step": 8723 + }, + { + "epoch": 0.708519450986762, + "grad_norm": 6.5354079259733915, + "learning_rate": 1.0337841274583046e-06, + "loss": 0.6165, + "step": 8724 + }, + { + "epoch": 0.7086006659628036, + "grad_norm": 5.167176814080369, + "learning_rate": 1.0332515311191627e-06, + "loss": 0.5354, + "step": 8725 + }, + { + "epoch": 0.7086818809388451, + "grad_norm": 7.4442577463893835, + "learning_rate": 1.032719036271253e-06, + "loss": 0.5054, + "step": 8726 + }, + { + "epoch": 0.7087630959148867, + "grad_norm": 4.039848430781059, + "learning_rate": 1.0321866429514199e-06, + "loss": 0.5241, + "step": 8727 + }, + { + "epoch": 0.7088443108909283, + "grad_norm": 3.2336915820436736, + "learning_rate": 1.0316543511965035e-06, + "loss": 0.5754, + "step": 8728 + }, + { + "epoch": 0.7089255258669699, + "grad_norm": 4.7161410228484195, + "learning_rate": 1.031122161043335e-06, + "loss": 0.377, + "step": 8729 + }, + { + "epoch": 0.7090067408430114, + "grad_norm": 3.6485307474977624, + "learning_rate": 1.030590072528738e-06, + "loss": 0.4437, + "step": 8730 + }, + { + "epoch": 0.709087955819053, + "grad_norm": 4.868323212713634, + "learning_rate": 1.030058085689532e-06, + "loss": 0.4561, + "step": 8731 + }, + { + "epoch": 0.7091691707950946, + "grad_norm": 6.225865954562011, + "learning_rate": 1.0295262005625262e-06, + "loss": 0.413, + "step": 8732 + }, + { + "epoch": 0.7092503857711362, + "grad_norm": 4.393355867203619, + "learning_rate": 1.028994417184525e-06, + "loss": 0.5137, + "step": 8733 + }, + { + "epoch": 0.7093316007471778, + "grad_norm": 4.03858492787502, + "learning_rate": 1.0284627355923257e-06, + "loss": 0.5499, + "step": 8734 + }, + { + "epoch": 0.7094128157232193, + "grad_norm": 7.2081095740889225, + "learning_rate": 1.0279311558227174e-06, + "loss": 0.4145, + "step": 8735 + }, + { + "epoch": 0.709494030699261, + "grad_norm": 4.52684696449661, + "learning_rate": 1.027399677912482e-06, + "loss": 0.4802, + "step": 8736 + }, + { + "epoch": 0.7095752456753025, + "grad_norm": 4.627236682606212, + "learning_rate": 1.0268683018983944e-06, + "loss": 0.4124, + "step": 8737 + }, + { + "epoch": 0.7096564606513441, + "grad_norm": 4.60387331373005, + "learning_rate": 1.026337027817224e-06, + "loss": 0.4463, + "step": 8738 + }, + { + "epoch": 0.7097376756273857, + "grad_norm": 4.049679102453057, + "learning_rate": 1.0258058557057328e-06, + "loss": 0.4391, + "step": 8739 + }, + { + "epoch": 0.7098188906034273, + "grad_norm": 3.932889064119171, + "learning_rate": 1.0252747856006735e-06, + "loss": 0.4598, + "step": 8740 + }, + { + "epoch": 0.7099001055794688, + "grad_norm": 5.445177988596398, + "learning_rate": 1.0247438175387946e-06, + "loss": 0.3975, + "step": 8741 + }, + { + "epoch": 0.7099813205555104, + "grad_norm": 4.336033272668492, + "learning_rate": 1.0242129515568364e-06, + "loss": 0.6375, + "step": 8742 + }, + { + "epoch": 0.710062535531552, + "grad_norm": 5.942395150551147, + "learning_rate": 1.0236821876915303e-06, + "loss": 0.5762, + "step": 8743 + }, + { + "epoch": 0.7101437505075936, + "grad_norm": 7.955186448658773, + "learning_rate": 1.0231515259796046e-06, + "loss": 0.4297, + "step": 8744 + }, + { + "epoch": 0.7102249654836352, + "grad_norm": 4.589645135923486, + "learning_rate": 1.022620966457776e-06, + "loss": 0.5287, + "step": 8745 + }, + { + "epoch": 0.7103061804596768, + "grad_norm": 10.369934951756198, + "learning_rate": 1.0220905091627581e-06, + "loss": 0.3939, + "step": 8746 + }, + { + "epoch": 0.7103873954357184, + "grad_norm": 4.830026444654436, + "learning_rate": 1.0215601541312556e-06, + "loss": 0.4922, + "step": 8747 + }, + { + "epoch": 0.7104686104117599, + "grad_norm": 4.89390413686288, + "learning_rate": 1.0210299013999662e-06, + "loss": 0.4854, + "step": 8748 + }, + { + "epoch": 0.7105498253878015, + "grad_norm": 5.6081171633262015, + "learning_rate": 1.0204997510055793e-06, + "loss": 0.4383, + "step": 8749 + }, + { + "epoch": 0.7106310403638431, + "grad_norm": 7.598002540248723, + "learning_rate": 1.0199697029847804e-06, + "loss": 0.5754, + "step": 8750 + }, + { + "epoch": 0.7107122553398847, + "grad_norm": 6.496257353872819, + "learning_rate": 1.0194397573742442e-06, + "loss": 0.3061, + "step": 8751 + }, + { + "epoch": 0.7107934703159262, + "grad_norm": 7.92240606328125, + "learning_rate": 1.0189099142106421e-06, + "loss": 0.4262, + "step": 8752 + }, + { + "epoch": 0.7108746852919678, + "grad_norm": 27.70458593561067, + "learning_rate": 1.0183801735306342e-06, + "loss": 0.4334, + "step": 8753 + }, + { + "epoch": 0.7109559002680094, + "grad_norm": 4.449530450041586, + "learning_rate": 1.0178505353708779e-06, + "loss": 0.5682, + "step": 8754 + }, + { + "epoch": 0.711037115244051, + "grad_norm": 5.978122767767535, + "learning_rate": 1.0173209997680203e-06, + "loss": 0.5643, + "step": 8755 + }, + { + "epoch": 0.7111183302200926, + "grad_norm": 4.850468611110746, + "learning_rate": 1.0167915667587019e-06, + "loss": 0.47, + "step": 8756 + }, + { + "epoch": 0.7111995451961342, + "grad_norm": 4.3524338310305835, + "learning_rate": 1.016262236379558e-06, + "loss": 0.5099, + "step": 8757 + }, + { + "epoch": 0.7112807601721758, + "grad_norm": 6.685288002285286, + "learning_rate": 1.015733008667214e-06, + "loss": 0.6591, + "step": 8758 + }, + { + "epoch": 0.7113619751482173, + "grad_norm": 7.509285220548554, + "learning_rate": 1.0152038836582903e-06, + "loss": 0.5526, + "step": 8759 + }, + { + "epoch": 0.7114431901242589, + "grad_norm": 3.6292963710683366, + "learning_rate": 1.0146748613894005e-06, + "loss": 0.4786, + "step": 8760 + }, + { + "epoch": 0.7115244051003005, + "grad_norm": 4.730030440457719, + "learning_rate": 1.0141459418971496e-06, + "loss": 0.409, + "step": 8761 + }, + { + "epoch": 0.7116056200763421, + "grad_norm": 3.5510572572185977, + "learning_rate": 1.0136171252181348e-06, + "loss": 0.5859, + "step": 8762 + }, + { + "epoch": 0.7116868350523836, + "grad_norm": 7.350261219519113, + "learning_rate": 1.0130884113889491e-06, + "loss": 0.4996, + "step": 8763 + }, + { + "epoch": 0.7117680500284252, + "grad_norm": 4.1865654744906635, + "learning_rate": 1.0125598004461752e-06, + "loss": 0.395, + "step": 8764 + }, + { + "epoch": 0.7118492650044668, + "grad_norm": 8.196706389670227, + "learning_rate": 1.012031292426391e-06, + "loss": 0.4005, + "step": 8765 + }, + { + "epoch": 0.7119304799805084, + "grad_norm": 5.9136835481706855, + "learning_rate": 1.011502887366167e-06, + "loss": 0.4292, + "step": 8766 + }, + { + "epoch": 0.71201169495655, + "grad_norm": 5.09214562271127, + "learning_rate": 1.0109745853020655e-06, + "loss": 0.4259, + "step": 8767 + }, + { + "epoch": 0.7120929099325916, + "grad_norm": 8.352854444325088, + "learning_rate": 1.0104463862706414e-06, + "loss": 0.3907, + "step": 8768 + }, + { + "epoch": 0.7121741249086332, + "grad_norm": 4.539279498681904, + "learning_rate": 1.0099182903084448e-06, + "loss": 0.5434, + "step": 8769 + }, + { + "epoch": 0.7122553398846747, + "grad_norm": 6.108837746410746, + "learning_rate": 1.0093902974520165e-06, + "loss": 0.4994, + "step": 8770 + }, + { + "epoch": 0.7123365548607163, + "grad_norm": 6.827549453211144, + "learning_rate": 1.0088624077378897e-06, + "loss": 0.4736, + "step": 8771 + }, + { + "epoch": 0.7124177698367579, + "grad_norm": 5.0626039477295315, + "learning_rate": 1.0083346212025923e-06, + "loss": 0.6442, + "step": 8772 + }, + { + "epoch": 0.7124989848127995, + "grad_norm": 4.104258023545044, + "learning_rate": 1.0078069378826458e-06, + "loss": 0.579, + "step": 8773 + }, + { + "epoch": 0.712580199788841, + "grad_norm": 3.7706839195991297, + "learning_rate": 1.0072793578145618e-06, + "loss": 0.5327, + "step": 8774 + }, + { + "epoch": 0.7126614147648827, + "grad_norm": 7.155111652149057, + "learning_rate": 1.0067518810348453e-06, + "loss": 0.5282, + "step": 8775 + }, + { + "epoch": 0.7127426297409242, + "grad_norm": 7.637395079270046, + "learning_rate": 1.0062245075799966e-06, + "loss": 0.5045, + "step": 8776 + }, + { + "epoch": 0.7128238447169658, + "grad_norm": 6.413393628245962, + "learning_rate": 1.0056972374865054e-06, + "loss": 0.6264, + "step": 8777 + }, + { + "epoch": 0.7129050596930074, + "grad_norm": 5.7064526408846135, + "learning_rate": 1.0051700707908569e-06, + "loss": 0.7116, + "step": 8778 + }, + { + "epoch": 0.712986274669049, + "grad_norm": 5.902226546762958, + "learning_rate": 1.0046430075295287e-06, + "loss": 0.6247, + "step": 8779 + }, + { + "epoch": 0.7130674896450906, + "grad_norm": 5.831122076718099, + "learning_rate": 1.0041160477389909e-06, + "loss": 0.4892, + "step": 8780 + }, + { + "epoch": 0.7131487046211321, + "grad_norm": 5.215468338547022, + "learning_rate": 1.0035891914557044e-06, + "loss": 0.5625, + "step": 8781 + }, + { + "epoch": 0.7132299195971737, + "grad_norm": 7.447326637422664, + "learning_rate": 1.0030624387161273e-06, + "loss": 0.4898, + "step": 8782 + }, + { + "epoch": 0.7133111345732153, + "grad_norm": 4.541675365615177, + "learning_rate": 1.002535789556707e-06, + "loss": 0.5263, + "step": 8783 + }, + { + "epoch": 0.7133923495492569, + "grad_norm": 5.008687031433551, + "learning_rate": 1.0020092440138833e-06, + "loss": 0.57, + "step": 8784 + }, + { + "epoch": 0.7134735645252984, + "grad_norm": 4.218282129108528, + "learning_rate": 1.0014828021240932e-06, + "loss": 0.3873, + "step": 8785 + }, + { + "epoch": 0.71355477950134, + "grad_norm": 7.9910976481847875, + "learning_rate": 1.0009564639237627e-06, + "loss": 0.3644, + "step": 8786 + }, + { + "epoch": 0.7136359944773816, + "grad_norm": 4.348847062904255, + "learning_rate": 1.0004302294493104e-06, + "loss": 0.4983, + "step": 8787 + }, + { + "epoch": 0.7137172094534232, + "grad_norm": 6.4121947565646815, + "learning_rate": 9.999040987371505e-07, + "loss": 0.4464, + "step": 8788 + }, + { + "epoch": 0.7137984244294648, + "grad_norm": 4.829469885568656, + "learning_rate": 9.993780718236882e-07, + "loss": 0.3603, + "step": 8789 + }, + { + "epoch": 0.7138796394055064, + "grad_norm": 3.668444891519201, + "learning_rate": 9.988521487453203e-07, + "loss": 0.5219, + "step": 8790 + }, + { + "epoch": 0.713960854381548, + "grad_norm": 4.4085220444412245, + "learning_rate": 9.98326329538439e-07, + "loss": 0.48, + "step": 8791 + }, + { + "epoch": 0.7140420693575895, + "grad_norm": 6.942288338011259, + "learning_rate": 9.978006142394292e-07, + "loss": 0.6356, + "step": 8792 + }, + { + "epoch": 0.7141232843336311, + "grad_norm": 3.9011079567183353, + "learning_rate": 9.972750028846665e-07, + "loss": 0.4649, + "step": 8793 + }, + { + "epoch": 0.7142044993096727, + "grad_norm": 5.691178624558273, + "learning_rate": 9.967494955105197e-07, + "loss": 0.5606, + "step": 8794 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 4.779174640506254, + "learning_rate": 9.962240921533528e-07, + "loss": 0.4898, + "step": 8795 + }, + { + "epoch": 0.7143669292617558, + "grad_norm": 5.8645704612877445, + "learning_rate": 9.956987928495193e-07, + "loss": 0.4236, + "step": 8796 + }, + { + "epoch": 0.7144481442377975, + "grad_norm": 4.402062224686781, + "learning_rate": 9.951735976353677e-07, + "loss": 0.4852, + "step": 8797 + }, + { + "epoch": 0.714529359213839, + "grad_norm": 5.0026387695604875, + "learning_rate": 9.946485065472402e-07, + "loss": 0.357, + "step": 8798 + }, + { + "epoch": 0.7146105741898806, + "grad_norm": 7.498003830377879, + "learning_rate": 9.941235196214687e-07, + "loss": 0.5172, + "step": 8799 + }, + { + "epoch": 0.7146917891659222, + "grad_norm": 3.3478530645853675, + "learning_rate": 9.935986368943796e-07, + "loss": 0.4808, + "step": 8800 + }, + { + "epoch": 0.7147730041419638, + "grad_norm": 6.779585993646702, + "learning_rate": 9.930738584022925e-07, + "loss": 0.5855, + "step": 8801 + }, + { + "epoch": 0.7148542191180054, + "grad_norm": 4.424567878048419, + "learning_rate": 9.925491841815197e-07, + "loss": 0.4775, + "step": 8802 + }, + { + "epoch": 0.7149354340940469, + "grad_norm": 4.168929836677361, + "learning_rate": 9.92024614268364e-07, + "loss": 0.611, + "step": 8803 + }, + { + "epoch": 0.7150166490700886, + "grad_norm": 4.398671369510578, + "learning_rate": 9.915001486991243e-07, + "loss": 0.6159, + "step": 8804 + }, + { + "epoch": 0.7150978640461301, + "grad_norm": 4.041777750287028, + "learning_rate": 9.909757875100914e-07, + "loss": 0.4175, + "step": 8805 + }, + { + "epoch": 0.7151790790221717, + "grad_norm": 7.847660489045761, + "learning_rate": 9.904515307375478e-07, + "loss": 0.5913, + "step": 8806 + }, + { + "epoch": 0.7152602939982132, + "grad_norm": 4.250630000320927, + "learning_rate": 9.899273784177681e-07, + "loss": 0.3363, + "step": 8807 + }, + { + "epoch": 0.7153415089742549, + "grad_norm": 8.197363212404545, + "learning_rate": 9.894033305870229e-07, + "loss": 0.4725, + "step": 8808 + }, + { + "epoch": 0.7154227239502964, + "grad_norm": 3.91482587309766, + "learning_rate": 9.888793872815716e-07, + "loss": 0.4924, + "step": 8809 + }, + { + "epoch": 0.715503938926338, + "grad_norm": 4.933671181822034, + "learning_rate": 9.883555485376688e-07, + "loss": 0.6437, + "step": 8810 + }, + { + "epoch": 0.7155851539023796, + "grad_norm": 5.499179017080881, + "learning_rate": 9.878318143915633e-07, + "loss": 0.3706, + "step": 8811 + }, + { + "epoch": 0.7156663688784212, + "grad_norm": 7.743811775235963, + "learning_rate": 9.873081848794926e-07, + "loss": 0.4138, + "step": 8812 + }, + { + "epoch": 0.7157475838544628, + "grad_norm": 4.332868158341539, + "learning_rate": 9.867846600376892e-07, + "loss": 0.5267, + "step": 8813 + }, + { + "epoch": 0.7158287988305043, + "grad_norm": 4.072077480048703, + "learning_rate": 9.862612399023797e-07, + "loss": 0.6121, + "step": 8814 + }, + { + "epoch": 0.715910013806546, + "grad_norm": 6.275203078249558, + "learning_rate": 9.85737924509781e-07, + "loss": 0.4842, + "step": 8815 + }, + { + "epoch": 0.7159912287825875, + "grad_norm": 6.229912521186827, + "learning_rate": 9.852147138961026e-07, + "loss": 0.3927, + "step": 8816 + }, + { + "epoch": 0.7160724437586291, + "grad_norm": 5.84959994620228, + "learning_rate": 9.846916080975493e-07, + "loss": 0.4515, + "step": 8817 + }, + { + "epoch": 0.7161536587346706, + "grad_norm": 8.565273476837879, + "learning_rate": 9.841686071503178e-07, + "loss": 0.4579, + "step": 8818 + }, + { + "epoch": 0.7162348737107123, + "grad_norm": 5.7709811538417854, + "learning_rate": 9.836457110905956e-07, + "loss": 0.7484, + "step": 8819 + }, + { + "epoch": 0.7163160886867538, + "grad_norm": 5.3290551767188035, + "learning_rate": 9.831229199545659e-07, + "loss": 0.3563, + "step": 8820 + }, + { + "epoch": 0.7163973036627954, + "grad_norm": 6.90902242367057, + "learning_rate": 9.82600233778402e-07, + "loss": 0.5667, + "step": 8821 + }, + { + "epoch": 0.716478518638837, + "grad_norm": 5.0511931716125105, + "learning_rate": 9.820776525982703e-07, + "loss": 0.4209, + "step": 8822 + }, + { + "epoch": 0.7165597336148786, + "grad_norm": 4.8369460147576095, + "learning_rate": 9.815551764503317e-07, + "loss": 0.4668, + "step": 8823 + }, + { + "epoch": 0.7166409485909202, + "grad_norm": 4.486543984473168, + "learning_rate": 9.810328053707394e-07, + "loss": 0.8039, + "step": 8824 + }, + { + "epoch": 0.7167221635669617, + "grad_norm": 4.014813431510159, + "learning_rate": 9.805105393956378e-07, + "loss": 0.471, + "step": 8825 + }, + { + "epoch": 0.7168033785430034, + "grad_norm": 7.519778869225568, + "learning_rate": 9.799883785611647e-07, + "loss": 0.4735, + "step": 8826 + }, + { + "epoch": 0.7168845935190449, + "grad_norm": 5.5537814970796315, + "learning_rate": 9.794663229034518e-07, + "loss": 0.4381, + "step": 8827 + }, + { + "epoch": 0.7169658084950865, + "grad_norm": 19.344716592490663, + "learning_rate": 9.78944372458622e-07, + "loss": 0.4718, + "step": 8828 + }, + { + "epoch": 0.717047023471128, + "grad_norm": 5.884182711703903, + "learning_rate": 9.784225272627908e-07, + "loss": 0.5988, + "step": 8829 + }, + { + "epoch": 0.7171282384471697, + "grad_norm": 4.891986811172294, + "learning_rate": 9.77900787352068e-07, + "loss": 0.3812, + "step": 8830 + }, + { + "epoch": 0.7172094534232112, + "grad_norm": 10.276801237301445, + "learning_rate": 9.773791527625557e-07, + "loss": 0.6522, + "step": 8831 + }, + { + "epoch": 0.7172906683992528, + "grad_norm": 6.1468764794545505, + "learning_rate": 9.76857623530347e-07, + "loss": 0.5199, + "step": 8832 + }, + { + "epoch": 0.7173718833752945, + "grad_norm": 3.8035905037114097, + "learning_rate": 9.763361996915302e-07, + "loss": 0.5275, + "step": 8833 + }, + { + "epoch": 0.717453098351336, + "grad_norm": 6.897927489006048, + "learning_rate": 9.75814881282185e-07, + "loss": 0.5616, + "step": 8834 + }, + { + "epoch": 0.7175343133273776, + "grad_norm": 3.453903520221075, + "learning_rate": 9.752936683383822e-07, + "loss": 0.4339, + "step": 8835 + }, + { + "epoch": 0.7176155283034191, + "grad_norm": 4.99189702123876, + "learning_rate": 9.747725608961881e-07, + "loss": 0.5881, + "step": 8836 + }, + { + "epoch": 0.7176967432794608, + "grad_norm": 5.00035982299125, + "learning_rate": 9.742515589916615e-07, + "loss": 0.4863, + "step": 8837 + }, + { + "epoch": 0.7177779582555023, + "grad_norm": 4.906966447423704, + "learning_rate": 9.737306626608514e-07, + "loss": 0.5093, + "step": 8838 + }, + { + "epoch": 0.7178591732315439, + "grad_norm": 5.968171309664626, + "learning_rate": 9.732098719398025e-07, + "loss": 0.5323, + "step": 8839 + }, + { + "epoch": 0.7179403882075854, + "grad_norm": 4.720388953914238, + "learning_rate": 9.726891868645502e-07, + "loss": 0.4468, + "step": 8840 + }, + { + "epoch": 0.7180216031836271, + "grad_norm": 3.842805699155025, + "learning_rate": 9.721686074711228e-07, + "loss": 0.5385, + "step": 8841 + }, + { + "epoch": 0.7181028181596686, + "grad_norm": 9.473267130229601, + "learning_rate": 9.716481337955411e-07, + "loss": 0.4456, + "step": 8842 + }, + { + "epoch": 0.7181840331357102, + "grad_norm": 5.317653998758775, + "learning_rate": 9.711277658738197e-07, + "loss": 0.6716, + "step": 8843 + }, + { + "epoch": 0.7182652481117519, + "grad_norm": 4.085741117429908, + "learning_rate": 9.706075037419666e-07, + "loss": 0.553, + "step": 8844 + }, + { + "epoch": 0.7183464630877934, + "grad_norm": 21.53419743501485, + "learning_rate": 9.700873474359786e-07, + "loss": 0.4472, + "step": 8845 + }, + { + "epoch": 0.718427678063835, + "grad_norm": 6.994061607271889, + "learning_rate": 9.695672969918508e-07, + "loss": 0.4094, + "step": 8846 + }, + { + "epoch": 0.7185088930398765, + "grad_norm": 4.554212460035874, + "learning_rate": 9.69047352445566e-07, + "loss": 0.4225, + "step": 8847 + }, + { + "epoch": 0.7185901080159182, + "grad_norm": 7.821491253661923, + "learning_rate": 9.68527513833101e-07, + "loss": 0.4877, + "step": 8848 + }, + { + "epoch": 0.7186713229919597, + "grad_norm": 3.1283429984031783, + "learning_rate": 9.68007781190427e-07, + "loss": 0.3561, + "step": 8849 + }, + { + "epoch": 0.7187525379680013, + "grad_norm": 5.628658133939152, + "learning_rate": 9.674881545535073e-07, + "loss": 0.5745, + "step": 8850 + }, + { + "epoch": 0.7188337529440428, + "grad_norm": 11.726251049043716, + "learning_rate": 9.669686339582959e-07, + "loss": 0.6452, + "step": 8851 + }, + { + "epoch": 0.7189149679200845, + "grad_norm": 4.111507695592982, + "learning_rate": 9.664492194407425e-07, + "loss": 0.559, + "step": 8852 + }, + { + "epoch": 0.718996182896126, + "grad_norm": 6.507482959837951, + "learning_rate": 9.659299110367868e-07, + "loss": 0.4079, + "step": 8853 + }, + { + "epoch": 0.7190773978721676, + "grad_norm": 6.32505374062163, + "learning_rate": 9.654107087823613e-07, + "loss": 0.6939, + "step": 8854 + }, + { + "epoch": 0.7191586128482093, + "grad_norm": 7.74320454513088, + "learning_rate": 9.64891612713393e-07, + "loss": 0.6168, + "step": 8855 + }, + { + "epoch": 0.7192398278242508, + "grad_norm": 9.529588740791155, + "learning_rate": 9.643726228658017e-07, + "loss": 0.4821, + "step": 8856 + }, + { + "epoch": 0.7193210428002924, + "grad_norm": 5.534280027657455, + "learning_rate": 9.638537392754968e-07, + "loss": 0.4415, + "step": 8857 + }, + { + "epoch": 0.7194022577763339, + "grad_norm": 5.287392525426979, + "learning_rate": 9.63334961978384e-07, + "loss": 0.4648, + "step": 8858 + }, + { + "epoch": 0.7194834727523756, + "grad_norm": 5.411950885514694, + "learning_rate": 9.628162910103595e-07, + "loss": 0.4528, + "step": 8859 + }, + { + "epoch": 0.7195646877284171, + "grad_norm": 5.234971772019722, + "learning_rate": 9.62297726407312e-07, + "loss": 0.5357, + "step": 8860 + }, + { + "epoch": 0.7196459027044587, + "grad_norm": 3.5786127468446147, + "learning_rate": 9.617792682051228e-07, + "loss": 0.4252, + "step": 8861 + }, + { + "epoch": 0.7197271176805002, + "grad_norm": 4.004340303264486, + "learning_rate": 9.612609164396672e-07, + "loss": 0.4677, + "step": 8862 + }, + { + "epoch": 0.7198083326565419, + "grad_norm": 5.758524558297638, + "learning_rate": 9.607426711468135e-07, + "loss": 0.4476, + "step": 8863 + }, + { + "epoch": 0.7198895476325834, + "grad_norm": 8.535570347156773, + "learning_rate": 9.602245323624195e-07, + "loss": 0.4546, + "step": 8864 + }, + { + "epoch": 0.719970762608625, + "grad_norm": 4.001330283022169, + "learning_rate": 9.597065001223397e-07, + "loss": 0.4607, + "step": 8865 + }, + { + "epoch": 0.7200519775846667, + "grad_norm": 4.377728948837983, + "learning_rate": 9.591885744624183e-07, + "loss": 0.3572, + "step": 8866 + }, + { + "epoch": 0.7201331925607082, + "grad_norm": 3.8588903316964442, + "learning_rate": 9.586707554184918e-07, + "loss": 0.4552, + "step": 8867 + }, + { + "epoch": 0.7202144075367498, + "grad_norm": 6.457542883089068, + "learning_rate": 9.581530430263919e-07, + "loss": 0.4036, + "step": 8868 + }, + { + "epoch": 0.7202956225127913, + "grad_norm": 6.505717474355659, + "learning_rate": 9.57635437321942e-07, + "loss": 0.2751, + "step": 8869 + }, + { + "epoch": 0.720376837488833, + "grad_norm": 7.655308168808172, + "learning_rate": 9.571179383409561e-07, + "loss": 0.4116, + "step": 8870 + }, + { + "epoch": 0.7204580524648745, + "grad_norm": 3.1031621847917634, + "learning_rate": 9.566005461192444e-07, + "loss": 0.4976, + "step": 8871 + }, + { + "epoch": 0.7205392674409161, + "grad_norm": 6.088083430309052, + "learning_rate": 9.560832606926064e-07, + "loss": 0.4635, + "step": 8872 + }, + { + "epoch": 0.7206204824169576, + "grad_norm": 6.004449008237206, + "learning_rate": 9.55566082096835e-07, + "loss": 0.5112, + "step": 8873 + }, + { + "epoch": 0.7207016973929993, + "grad_norm": 9.024846945792124, + "learning_rate": 9.550490103677176e-07, + "loss": 0.3386, + "step": 8874 + }, + { + "epoch": 0.7207829123690408, + "grad_norm": 4.2699585973289915, + "learning_rate": 9.54532045541031e-07, + "loss": 0.4822, + "step": 8875 + }, + { + "epoch": 0.7208641273450824, + "grad_norm": 4.631080829221704, + "learning_rate": 9.54015187652548e-07, + "loss": 0.5532, + "step": 8876 + }, + { + "epoch": 0.7209453423211241, + "grad_norm": 6.8091971004552585, + "learning_rate": 9.534984367380329e-07, + "loss": 0.5322, + "step": 8877 + }, + { + "epoch": 0.7210265572971656, + "grad_norm": 3.589150990709112, + "learning_rate": 9.529817928332411e-07, + "loss": 0.523, + "step": 8878 + }, + { + "epoch": 0.7211077722732072, + "grad_norm": 8.421640744360598, + "learning_rate": 9.524652559739217e-07, + "loss": 0.3827, + "step": 8879 + }, + { + "epoch": 0.7211889872492487, + "grad_norm": 5.462634830370036, + "learning_rate": 9.519488261958157e-07, + "loss": 0.4339, + "step": 8880 + }, + { + "epoch": 0.7212702022252904, + "grad_norm": 6.652542616373296, + "learning_rate": 9.514325035346577e-07, + "loss": 0.5582, + "step": 8881 + }, + { + "epoch": 0.7213514172013319, + "grad_norm": 11.329982918292163, + "learning_rate": 9.509162880261757e-07, + "loss": 0.4287, + "step": 8882 + }, + { + "epoch": 0.7214326321773735, + "grad_norm": 4.346970981846385, + "learning_rate": 9.504001797060875e-07, + "loss": 0.3969, + "step": 8883 + }, + { + "epoch": 0.721513847153415, + "grad_norm": 10.533676550458823, + "learning_rate": 9.498841786101065e-07, + "loss": 0.3896, + "step": 8884 + }, + { + "epoch": 0.7215950621294567, + "grad_norm": 6.522166801681026, + "learning_rate": 9.493682847739363e-07, + "loss": 0.4312, + "step": 8885 + }, + { + "epoch": 0.7216762771054982, + "grad_norm": 15.683352653559364, + "learning_rate": 9.488524982332734e-07, + "loss": 0.4908, + "step": 8886 + }, + { + "epoch": 0.7217574920815398, + "grad_norm": 4.885082924071923, + "learning_rate": 9.483368190238093e-07, + "loss": 0.604, + "step": 8887 + }, + { + "epoch": 0.7218387070575815, + "grad_norm": 6.215187035651405, + "learning_rate": 9.478212471812242e-07, + "loss": 0.6215, + "step": 8888 + }, + { + "epoch": 0.721919922033623, + "grad_norm": 6.199409295607022, + "learning_rate": 9.473057827411941e-07, + "loss": 0.46, + "step": 8889 + }, + { + "epoch": 0.7220011370096646, + "grad_norm": 5.210688836874202, + "learning_rate": 9.467904257393873e-07, + "loss": 0.4037, + "step": 8890 + }, + { + "epoch": 0.7220823519857061, + "grad_norm": 5.580114042556584, + "learning_rate": 9.462751762114625e-07, + "loss": 0.5964, + "step": 8891 + }, + { + "epoch": 0.7221635669617478, + "grad_norm": 5.621455969727306, + "learning_rate": 9.45760034193072e-07, + "loss": 0.5682, + "step": 8892 + }, + { + "epoch": 0.7222447819377893, + "grad_norm": 11.91702063843931, + "learning_rate": 9.45244999719862e-07, + "loss": 0.3895, + "step": 8893 + }, + { + "epoch": 0.7223259969138309, + "grad_norm": 4.231827622581398, + "learning_rate": 9.447300728274689e-07, + "loss": 0.3848, + "step": 8894 + }, + { + "epoch": 0.7224072118898724, + "grad_norm": 4.914560874009442, + "learning_rate": 9.442152535515245e-07, + "loss": 0.4818, + "step": 8895 + }, + { + "epoch": 0.7224884268659141, + "grad_norm": 3.3941269423081386, + "learning_rate": 9.437005419276496e-07, + "loss": 0.5125, + "step": 8896 + }, + { + "epoch": 0.7225696418419557, + "grad_norm": 7.1114894868029515, + "learning_rate": 9.431859379914615e-07, + "loss": 0.5125, + "step": 8897 + }, + { + "epoch": 0.7226508568179972, + "grad_norm": 7.590278377377921, + "learning_rate": 9.426714417785673e-07, + "loss": 0.4268, + "step": 8898 + }, + { + "epoch": 0.7227320717940389, + "grad_norm": 8.118555760980742, + "learning_rate": 9.421570533245663e-07, + "loss": 0.6289, + "step": 8899 + }, + { + "epoch": 0.7228132867700804, + "grad_norm": 4.8344990060798025, + "learning_rate": 9.416427726650535e-07, + "loss": 0.4768, + "step": 8900 + }, + { + "epoch": 0.722894501746122, + "grad_norm": 3.812357829292625, + "learning_rate": 9.411285998356124e-07, + "loss": 0.588, + "step": 8901 + }, + { + "epoch": 0.7229757167221635, + "grad_norm": 4.3624783009325805, + "learning_rate": 9.406145348718218e-07, + "loss": 0.6037, + "step": 8902 + }, + { + "epoch": 0.7230569316982052, + "grad_norm": 7.735840837751049, + "learning_rate": 9.401005778092537e-07, + "loss": 0.5546, + "step": 8903 + }, + { + "epoch": 0.7231381466742467, + "grad_norm": 4.2176648353846184, + "learning_rate": 9.395867286834695e-07, + "loss": 0.4084, + "step": 8904 + }, + { + "epoch": 0.7232193616502883, + "grad_norm": 10.500833125072006, + "learning_rate": 9.390729875300247e-07, + "loss": 0.3893, + "step": 8905 + }, + { + "epoch": 0.7233005766263298, + "grad_norm": 5.323683099575961, + "learning_rate": 9.38559354384469e-07, + "loss": 0.48, + "step": 8906 + }, + { + "epoch": 0.7233817916023715, + "grad_norm": 4.623752481785489, + "learning_rate": 9.38045829282341e-07, + "loss": 0.6171, + "step": 8907 + }, + { + "epoch": 0.7234630065784131, + "grad_norm": 7.855386690391926, + "learning_rate": 9.375324122591753e-07, + "loss": 0.5059, + "step": 8908 + }, + { + "epoch": 0.7235442215544546, + "grad_norm": 4.500533084765437, + "learning_rate": 9.370191033504982e-07, + "loss": 0.5279, + "step": 8909 + }, + { + "epoch": 0.7236254365304963, + "grad_norm": 7.958925600183833, + "learning_rate": 9.365059025918274e-07, + "loss": 0.4437, + "step": 8910 + }, + { + "epoch": 0.7237066515065378, + "grad_norm": 3.95735207574078, + "learning_rate": 9.359928100186724e-07, + "loss": 0.5215, + "step": 8911 + }, + { + "epoch": 0.7237878664825794, + "grad_norm": 11.501156203158045, + "learning_rate": 9.354798256665384e-07, + "loss": 0.474, + "step": 8912 + }, + { + "epoch": 0.7238690814586209, + "grad_norm": 4.487504865241946, + "learning_rate": 9.349669495709208e-07, + "loss": 0.3921, + "step": 8913 + }, + { + "epoch": 0.7239502964346626, + "grad_norm": 8.160345585558959, + "learning_rate": 9.344541817673061e-07, + "loss": 0.5789, + "step": 8914 + }, + { + "epoch": 0.7240315114107041, + "grad_norm": 5.808899053157705, + "learning_rate": 9.339415222911766e-07, + "loss": 0.5123, + "step": 8915 + }, + { + "epoch": 0.7241127263867457, + "grad_norm": 3.2770410237558165, + "learning_rate": 9.334289711780062e-07, + "loss": 0.7287, + "step": 8916 + }, + { + "epoch": 0.7241939413627873, + "grad_norm": 6.106039871460762, + "learning_rate": 9.329165284632602e-07, + "loss": 0.4535, + "step": 8917 + }, + { + "epoch": 0.7242751563388289, + "grad_norm": 4.5914206885164, + "learning_rate": 9.324041941823961e-07, + "loss": 0.4845, + "step": 8918 + }, + { + "epoch": 0.7243563713148705, + "grad_norm": 8.623594737117811, + "learning_rate": 9.318919683708661e-07, + "loss": 0.5425, + "step": 8919 + }, + { + "epoch": 0.724437586290912, + "grad_norm": 5.503813897391588, + "learning_rate": 9.313798510641117e-07, + "loss": 0.507, + "step": 8920 + }, + { + "epoch": 0.7245188012669537, + "grad_norm": 27.071015075508583, + "learning_rate": 9.308678422975701e-07, + "loss": 0.4499, + "step": 8921 + }, + { + "epoch": 0.7246000162429952, + "grad_norm": 5.264809337126668, + "learning_rate": 9.303559421066699e-07, + "loss": 0.4108, + "step": 8922 + }, + { + "epoch": 0.7246812312190368, + "grad_norm": 4.739116238844419, + "learning_rate": 9.298441505268316e-07, + "loss": 0.459, + "step": 8923 + }, + { + "epoch": 0.7247624461950783, + "grad_norm": 7.57503395698494, + "learning_rate": 9.29332467593467e-07, + "loss": 0.4787, + "step": 8924 + }, + { + "epoch": 0.72484366117112, + "grad_norm": 7.320416604446665, + "learning_rate": 9.28820893341984e-07, + "loss": 0.4384, + "step": 8925 + }, + { + "epoch": 0.7249248761471615, + "grad_norm": 8.330410195117771, + "learning_rate": 9.28309427807779e-07, + "loss": 0.5397, + "step": 8926 + }, + { + "epoch": 0.7250060911232031, + "grad_norm": 8.653115313732869, + "learning_rate": 9.277980710262432e-07, + "loss": 0.5418, + "step": 8927 + }, + { + "epoch": 0.7250873060992447, + "grad_norm": 5.712763483431208, + "learning_rate": 9.272868230327614e-07, + "loss": 0.4397, + "step": 8928 + }, + { + "epoch": 0.7251685210752863, + "grad_norm": 7.20946070132443, + "learning_rate": 9.267756838627079e-07, + "loss": 0.619, + "step": 8929 + }, + { + "epoch": 0.7252497360513279, + "grad_norm": 3.836932313032167, + "learning_rate": 9.262646535514499e-07, + "loss": 0.5411, + "step": 8930 + }, + { + "epoch": 0.7253309510273694, + "grad_norm": 7.397925686014042, + "learning_rate": 9.257537321343499e-07, + "loss": 0.5804, + "step": 8931 + }, + { + "epoch": 0.7254121660034111, + "grad_norm": 7.414052669733408, + "learning_rate": 9.252429196467603e-07, + "loss": 0.5053, + "step": 8932 + }, + { + "epoch": 0.7254933809794526, + "grad_norm": 6.780735171626088, + "learning_rate": 9.247322161240252e-07, + "loss": 0.4038, + "step": 8933 + }, + { + "epoch": 0.7255745959554942, + "grad_norm": 4.08445905007205, + "learning_rate": 9.242216216014838e-07, + "loss": 0.6769, + "step": 8934 + }, + { + "epoch": 0.7256558109315357, + "grad_norm": 5.6918275065058275, + "learning_rate": 9.237111361144674e-07, + "loss": 0.3345, + "step": 8935 + }, + { + "epoch": 0.7257370259075774, + "grad_norm": 5.320211088421992, + "learning_rate": 9.232007596982978e-07, + "loss": 0.5265, + "step": 8936 + }, + { + "epoch": 0.7258182408836189, + "grad_norm": 5.5679637549611165, + "learning_rate": 9.226904923882901e-07, + "loss": 0.5231, + "step": 8937 + }, + { + "epoch": 0.7258994558596605, + "grad_norm": 9.0059520761274, + "learning_rate": 9.22180334219753e-07, + "loss": 0.4251, + "step": 8938 + }, + { + "epoch": 0.7259806708357021, + "grad_norm": 7.189840208919893, + "learning_rate": 9.216702852279857e-07, + "loss": 0.5836, + "step": 8939 + }, + { + "epoch": 0.7260618858117437, + "grad_norm": 8.24631932538056, + "learning_rate": 9.211603454482812e-07, + "loss": 0.5427, + "step": 8940 + }, + { + "epoch": 0.7261431007877853, + "grad_norm": 11.91801174885722, + "learning_rate": 9.206505149159259e-07, + "loss": 0.5369, + "step": 8941 + }, + { + "epoch": 0.7262243157638268, + "grad_norm": 4.788846104575723, + "learning_rate": 9.201407936661963e-07, + "loss": 0.6227, + "step": 8942 + }, + { + "epoch": 0.7263055307398685, + "grad_norm": 4.572117460437812, + "learning_rate": 9.196311817343618e-07, + "loss": 0.4761, + "step": 8943 + }, + { + "epoch": 0.72638674571591, + "grad_norm": 6.432401948520424, + "learning_rate": 9.191216791556864e-07, + "loss": 0.494, + "step": 8944 + }, + { + "epoch": 0.7264679606919516, + "grad_norm": 3.666688290179581, + "learning_rate": 9.18612285965424e-07, + "loss": 0.4724, + "step": 8945 + }, + { + "epoch": 0.7265491756679932, + "grad_norm": 3.558045855054741, + "learning_rate": 9.18103002198821e-07, + "loss": 0.6309, + "step": 8946 + }, + { + "epoch": 0.7266303906440348, + "grad_norm": 5.292568438307435, + "learning_rate": 9.175938278911184e-07, + "loss": 0.5463, + "step": 8947 + }, + { + "epoch": 0.7267116056200763, + "grad_norm": 8.253942436158207, + "learning_rate": 9.170847630775489e-07, + "loss": 0.4128, + "step": 8948 + }, + { + "epoch": 0.7267928205961179, + "grad_norm": 12.693589172331533, + "learning_rate": 9.165758077933365e-07, + "loss": 0.5094, + "step": 8949 + }, + { + "epoch": 0.7268740355721595, + "grad_norm": 4.3211523742949876, + "learning_rate": 9.160669620736973e-07, + "loss": 0.5676, + "step": 8950 + }, + { + "epoch": 0.7269552505482011, + "grad_norm": 11.192017775213207, + "learning_rate": 9.15558225953842e-07, + "loss": 0.5242, + "step": 8951 + }, + { + "epoch": 0.7270364655242427, + "grad_norm": 25.015456026993192, + "learning_rate": 9.150495994689712e-07, + "loss": 0.4027, + "step": 8952 + }, + { + "epoch": 0.7271176805002842, + "grad_norm": 5.152075474730206, + "learning_rate": 9.145410826542797e-07, + "loss": 0.4545, + "step": 8953 + }, + { + "epoch": 0.7271988954763259, + "grad_norm": 6.684848268615437, + "learning_rate": 9.140326755449555e-07, + "loss": 0.4933, + "step": 8954 + }, + { + "epoch": 0.7272801104523674, + "grad_norm": 5.088644961893354, + "learning_rate": 9.135243781761763e-07, + "loss": 0.4792, + "step": 8955 + }, + { + "epoch": 0.727361325428409, + "grad_norm": 3.810037495766052, + "learning_rate": 9.130161905831131e-07, + "loss": 0.5671, + "step": 8956 + }, + { + "epoch": 0.7274425404044506, + "grad_norm": 8.122117526021382, + "learning_rate": 9.125081128009314e-07, + "loss": 0.4428, + "step": 8957 + }, + { + "epoch": 0.7275237553804922, + "grad_norm": 6.6487819565990325, + "learning_rate": 9.120001448647867e-07, + "loss": 0.4813, + "step": 8958 + }, + { + "epoch": 0.7276049703565337, + "grad_norm": 6.992838856608024, + "learning_rate": 9.114922868098267e-07, + "loss": 0.4352, + "step": 8959 + }, + { + "epoch": 0.7276861853325753, + "grad_norm": 4.483263285402361, + "learning_rate": 9.109845386711932e-07, + "loss": 0.5618, + "step": 8960 + }, + { + "epoch": 0.7277674003086169, + "grad_norm": 4.2451531883975235, + "learning_rate": 9.104769004840208e-07, + "loss": 0.5705, + "step": 8961 + }, + { + "epoch": 0.7278486152846585, + "grad_norm": 3.5682877453830186, + "learning_rate": 9.099693722834336e-07, + "loss": 0.3645, + "step": 8962 + }, + { + "epoch": 0.7279298302607001, + "grad_norm": 4.835617927031836, + "learning_rate": 9.094619541045516e-07, + "loss": 0.6062, + "step": 8963 + }, + { + "epoch": 0.7280110452367416, + "grad_norm": 4.723308760739116, + "learning_rate": 9.089546459824846e-07, + "loss": 0.441, + "step": 8964 + }, + { + "epoch": 0.7280922602127833, + "grad_norm": 5.594812767897018, + "learning_rate": 9.084474479523347e-07, + "loss": 0.4843, + "step": 8965 + }, + { + "epoch": 0.7281734751888248, + "grad_norm": 5.966924171312079, + "learning_rate": 9.079403600491982e-07, + "loss": 0.4151, + "step": 8966 + }, + { + "epoch": 0.7282546901648664, + "grad_norm": 4.336193418701047, + "learning_rate": 9.074333823081638e-07, + "loss": 0.5172, + "step": 8967 + }, + { + "epoch": 0.728335905140908, + "grad_norm": 6.063173388232329, + "learning_rate": 9.069265147643109e-07, + "loss": 0.4559, + "step": 8968 + }, + { + "epoch": 0.7284171201169496, + "grad_norm": 4.628275139282452, + "learning_rate": 9.064197574527112e-07, + "loss": 0.5929, + "step": 8969 + }, + { + "epoch": 0.7284983350929911, + "grad_norm": 3.7566474590811447, + "learning_rate": 9.059131104084309e-07, + "loss": 0.6199, + "step": 8970 + }, + { + "epoch": 0.7285795500690327, + "grad_norm": 5.357753313412571, + "learning_rate": 9.054065736665268e-07, + "loss": 0.519, + "step": 8971 + }, + { + "epoch": 0.7286607650450743, + "grad_norm": 5.601299513674894, + "learning_rate": 9.049001472620481e-07, + "loss": 0.6267, + "step": 8972 + }, + { + "epoch": 0.7287419800211159, + "grad_norm": 6.85651492063141, + "learning_rate": 9.043938312300368e-07, + "loss": 0.4893, + "step": 8973 + }, + { + "epoch": 0.7288231949971575, + "grad_norm": 6.764798797003058, + "learning_rate": 9.038876256055288e-07, + "loss": 0.3582, + "step": 8974 + }, + { + "epoch": 0.728904409973199, + "grad_norm": 6.409476783815426, + "learning_rate": 9.033815304235488e-07, + "loss": 0.412, + "step": 8975 + }, + { + "epoch": 0.7289856249492407, + "grad_norm": 18.649132110530104, + "learning_rate": 9.028755457191179e-07, + "loss": 0.3983, + "step": 8976 + }, + { + "epoch": 0.7290668399252822, + "grad_norm": 6.513807711538606, + "learning_rate": 9.023696715272468e-07, + "loss": 0.3894, + "step": 8977 + }, + { + "epoch": 0.7291480549013238, + "grad_norm": 6.299055569156435, + "learning_rate": 9.018639078829378e-07, + "loss": 0.4124, + "step": 8978 + }, + { + "epoch": 0.7292292698773654, + "grad_norm": 3.514477375819592, + "learning_rate": 9.013582548211885e-07, + "loss": 0.568, + "step": 8979 + }, + { + "epoch": 0.729310484853407, + "grad_norm": 3.3460958135923318, + "learning_rate": 9.008527123769883e-07, + "loss": 0.4293, + "step": 8980 + }, + { + "epoch": 0.7293916998294485, + "grad_norm": 4.436979913495132, + "learning_rate": 9.003472805853161e-07, + "loss": 0.3699, + "step": 8981 + }, + { + "epoch": 0.7294729148054901, + "grad_norm": 5.276006150098665, + "learning_rate": 8.998419594811467e-07, + "loss": 0.5529, + "step": 8982 + }, + { + "epoch": 0.7295541297815317, + "grad_norm": 4.525908461414848, + "learning_rate": 8.993367490994451e-07, + "loss": 0.4417, + "step": 8983 + }, + { + "epoch": 0.7296353447575733, + "grad_norm": 6.180441338511432, + "learning_rate": 8.988316494751683e-07, + "loss": 0.4299, + "step": 8984 + }, + { + "epoch": 0.7297165597336149, + "grad_norm": 3.4284667198540983, + "learning_rate": 8.983266606432672e-07, + "loss": 0.5197, + "step": 8985 + }, + { + "epoch": 0.7297977747096565, + "grad_norm": 4.57248481351034, + "learning_rate": 8.978217826386853e-07, + "loss": 0.5724, + "step": 8986 + }, + { + "epoch": 0.7298789896856981, + "grad_norm": 3.3815176907013984, + "learning_rate": 8.973170154963567e-07, + "loss": 0.5855, + "step": 8987 + }, + { + "epoch": 0.7299602046617396, + "grad_norm": 4.911910066001664, + "learning_rate": 8.968123592512076e-07, + "loss": 0.5791, + "step": 8988 + }, + { + "epoch": 0.7300414196377812, + "grad_norm": 6.250038591037473, + "learning_rate": 8.963078139381595e-07, + "loss": 0.4979, + "step": 8989 + }, + { + "epoch": 0.7301226346138228, + "grad_norm": 4.472456064967235, + "learning_rate": 8.958033795921231e-07, + "loss": 0.4881, + "step": 8990 + }, + { + "epoch": 0.7302038495898644, + "grad_norm": 6.163691073085615, + "learning_rate": 8.952990562480021e-07, + "loss": 0.3618, + "step": 8991 + }, + { + "epoch": 0.7302850645659059, + "grad_norm": 6.473412100775378, + "learning_rate": 8.947948439406934e-07, + "loss": 0.6202, + "step": 8992 + }, + { + "epoch": 0.7303662795419475, + "grad_norm": 6.060384183610972, + "learning_rate": 8.94290742705087e-07, + "loss": 0.4327, + "step": 8993 + }, + { + "epoch": 0.7304474945179891, + "grad_norm": 7.291283308794519, + "learning_rate": 8.937867525760622e-07, + "loss": 0.6409, + "step": 8994 + }, + { + "epoch": 0.7305287094940307, + "grad_norm": 4.822306787639834, + "learning_rate": 8.932828735884944e-07, + "loss": 0.6233, + "step": 8995 + }, + { + "epoch": 0.7306099244700723, + "grad_norm": 4.5942352939506055, + "learning_rate": 8.927791057772481e-07, + "loss": 0.4074, + "step": 8996 + }, + { + "epoch": 0.7306911394461139, + "grad_norm": 5.662519945223263, + "learning_rate": 8.922754491771807e-07, + "loss": 0.4292, + "step": 8997 + }, + { + "epoch": 0.7307723544221555, + "grad_norm": 5.270491463874796, + "learning_rate": 8.917719038231437e-07, + "loss": 0.5461, + "step": 8998 + }, + { + "epoch": 0.730853569398197, + "grad_norm": 4.428969136605445, + "learning_rate": 8.912684697499801e-07, + "loss": 0.4651, + "step": 8999 + }, + { + "epoch": 0.7309347843742386, + "grad_norm": 6.961674772191147, + "learning_rate": 8.907651469925236e-07, + "loss": 0.4143, + "step": 9000 + }, + { + "epoch": 0.7310159993502802, + "grad_norm": 4.726617406201288, + "learning_rate": 8.902619355856032e-07, + "loss": 0.4823, + "step": 9001 + }, + { + "epoch": 0.7310972143263218, + "grad_norm": 5.037522415851766, + "learning_rate": 8.897588355640371e-07, + "loss": 0.4849, + "step": 9002 + }, + { + "epoch": 0.7311784293023633, + "grad_norm": 4.702839968633091, + "learning_rate": 8.892558469626375e-07, + "loss": 0.4989, + "step": 9003 + }, + { + "epoch": 0.731259644278405, + "grad_norm": 4.121133343260713, + "learning_rate": 8.887529698162079e-07, + "loss": 0.4176, + "step": 9004 + }, + { + "epoch": 0.7313408592544465, + "grad_norm": 5.799397892585551, + "learning_rate": 8.882502041595454e-07, + "loss": 0.4902, + "step": 9005 + }, + { + "epoch": 0.7314220742304881, + "grad_norm": 10.129599048058408, + "learning_rate": 8.877475500274393e-07, + "loss": 0.5977, + "step": 9006 + }, + { + "epoch": 0.7315032892065297, + "grad_norm": 6.254376447594514, + "learning_rate": 8.872450074546696e-07, + "loss": 0.4533, + "step": 9007 + }, + { + "epoch": 0.7315845041825713, + "grad_norm": 7.497553888308231, + "learning_rate": 8.867425764760104e-07, + "loss": 0.4203, + "step": 9008 + }, + { + "epoch": 0.7316657191586129, + "grad_norm": 7.377753478494686, + "learning_rate": 8.862402571262272e-07, + "loss": 0.5435, + "step": 9009 + }, + { + "epoch": 0.7317469341346544, + "grad_norm": 3.964499330804677, + "learning_rate": 8.857380494400764e-07, + "loss": 0.6699, + "step": 9010 + }, + { + "epoch": 0.731828149110696, + "grad_norm": 4.556955979542769, + "learning_rate": 8.852359534523091e-07, + "loss": 0.4287, + "step": 9011 + }, + { + "epoch": 0.7319093640867376, + "grad_norm": 5.283260885622843, + "learning_rate": 8.847339691976689e-07, + "loss": 0.6336, + "step": 9012 + }, + { + "epoch": 0.7319905790627792, + "grad_norm": 8.239303453615642, + "learning_rate": 8.842320967108886e-07, + "loss": 0.5459, + "step": 9013 + }, + { + "epoch": 0.7320717940388207, + "grad_norm": 9.181392373672194, + "learning_rate": 8.837303360266966e-07, + "loss": 0.4328, + "step": 9014 + }, + { + "epoch": 0.7321530090148624, + "grad_norm": 9.020007347941661, + "learning_rate": 8.832286871798113e-07, + "loss": 0.4695, + "step": 9015 + }, + { + "epoch": 0.7322342239909039, + "grad_norm": 11.695346831333792, + "learning_rate": 8.827271502049434e-07, + "loss": 0.4455, + "step": 9016 + }, + { + "epoch": 0.7323154389669455, + "grad_norm": 5.166457906877146, + "learning_rate": 8.822257251367983e-07, + "loss": 0.4783, + "step": 9017 + }, + { + "epoch": 0.7323966539429871, + "grad_norm": 3.9452559803067575, + "learning_rate": 8.817244120100702e-07, + "loss": 0.4167, + "step": 9018 + }, + { + "epoch": 0.7324778689190287, + "grad_norm": 5.030514569697522, + "learning_rate": 8.812232108594482e-07, + "loss": 0.3934, + "step": 9019 + }, + { + "epoch": 0.7325590838950703, + "grad_norm": 10.567800558944477, + "learning_rate": 8.807221217196135e-07, + "loss": 0.5891, + "step": 9020 + }, + { + "epoch": 0.7326402988711118, + "grad_norm": 7.700337486933208, + "learning_rate": 8.802211446252379e-07, + "loss": 0.5597, + "step": 9021 + }, + { + "epoch": 0.7327215138471534, + "grad_norm": 11.713465789679601, + "learning_rate": 8.797202796109869e-07, + "loss": 0.5172, + "step": 9022 + }, + { + "epoch": 0.732802728823195, + "grad_norm": 7.145291394301389, + "learning_rate": 8.792195267115163e-07, + "loss": 0.4569, + "step": 9023 + }, + { + "epoch": 0.7328839437992366, + "grad_norm": 4.598031815516408, + "learning_rate": 8.787188859614768e-07, + "loss": 0.6056, + "step": 9024 + }, + { + "epoch": 0.7329651587752781, + "grad_norm": 5.275503679314213, + "learning_rate": 8.782183573955105e-07, + "loss": 0.4652, + "step": 9025 + }, + { + "epoch": 0.7330463737513198, + "grad_norm": 6.295243362632932, + "learning_rate": 8.777179410482498e-07, + "loss": 0.6455, + "step": 9026 + }, + { + "epoch": 0.7331275887273613, + "grad_norm": 4.4474547984782395, + "learning_rate": 8.772176369543229e-07, + "loss": 0.3385, + "step": 9027 + }, + { + "epoch": 0.7332088037034029, + "grad_norm": 3.172334169434078, + "learning_rate": 8.767174451483468e-07, + "loss": 0.5206, + "step": 9028 + }, + { + "epoch": 0.7332900186794445, + "grad_norm": 4.6456662676555815, + "learning_rate": 8.762173656649317e-07, + "loss": 0.6199, + "step": 9029 + }, + { + "epoch": 0.7333712336554861, + "grad_norm": 7.265881502649945, + "learning_rate": 8.757173985386819e-07, + "loss": 0.4957, + "step": 9030 + }, + { + "epoch": 0.7334524486315277, + "grad_norm": 3.4982261171160833, + "learning_rate": 8.752175438041908e-07, + "loss": 0.5664, + "step": 9031 + }, + { + "epoch": 0.7335336636075692, + "grad_norm": 3.6827798694601754, + "learning_rate": 8.747178014960467e-07, + "loss": 0.4881, + "step": 9032 + }, + { + "epoch": 0.7336148785836109, + "grad_norm": 7.960332905031216, + "learning_rate": 8.742181716488302e-07, + "loss": 0.3276, + "step": 9033 + }, + { + "epoch": 0.7336960935596524, + "grad_norm": 4.4974178890676075, + "learning_rate": 8.737186542971115e-07, + "loss": 0.4636, + "step": 9034 + }, + { + "epoch": 0.733777308535694, + "grad_norm": 4.122814660625721, + "learning_rate": 8.732192494754541e-07, + "loss": 0.3476, + "step": 9035 + }, + { + "epoch": 0.7338585235117355, + "grad_norm": 4.213370810352235, + "learning_rate": 8.727199572184161e-07, + "loss": 0.5909, + "step": 9036 + }, + { + "epoch": 0.7339397384877772, + "grad_norm": 10.14479503528053, + "learning_rate": 8.722207775605437e-07, + "loss": 0.4764, + "step": 9037 + }, + { + "epoch": 0.7340209534638187, + "grad_norm": 4.624804246473112, + "learning_rate": 8.717217105363798e-07, + "loss": 0.5015, + "step": 9038 + }, + { + "epoch": 0.7341021684398603, + "grad_norm": 5.183928674417439, + "learning_rate": 8.712227561804548e-07, + "loss": 0.4768, + "step": 9039 + }, + { + "epoch": 0.734183383415902, + "grad_norm": 7.897979278065682, + "learning_rate": 8.707239145272958e-07, + "loss": 0.5025, + "step": 9040 + }, + { + "epoch": 0.7342645983919435, + "grad_norm": 16.857325758216245, + "learning_rate": 8.702251856114191e-07, + "loss": 0.4793, + "step": 9041 + }, + { + "epoch": 0.7343458133679851, + "grad_norm": 6.305732473052533, + "learning_rate": 8.697265694673334e-07, + "loss": 0.4738, + "step": 9042 + }, + { + "epoch": 0.7344270283440266, + "grad_norm": 4.796761988404685, + "learning_rate": 8.692280661295419e-07, + "loss": 0.334, + "step": 9043 + }, + { + "epoch": 0.7345082433200683, + "grad_norm": 5.71378461104089, + "learning_rate": 8.687296756325364e-07, + "loss": 0.5107, + "step": 9044 + }, + { + "epoch": 0.7345894582961098, + "grad_norm": 10.227432767121579, + "learning_rate": 8.68231398010804e-07, + "loss": 0.6141, + "step": 9045 + }, + { + "epoch": 0.7346706732721514, + "grad_norm": 6.276424519094259, + "learning_rate": 8.677332332988236e-07, + "loss": 0.4475, + "step": 9046 + }, + { + "epoch": 0.7347518882481929, + "grad_norm": 5.843493313646698, + "learning_rate": 8.672351815310651e-07, + "loss": 0.4045, + "step": 9047 + }, + { + "epoch": 0.7348331032242346, + "grad_norm": 7.797951199210248, + "learning_rate": 8.667372427419895e-07, + "loss": 0.4015, + "step": 9048 + }, + { + "epoch": 0.7349143182002761, + "grad_norm": 7.633315491295807, + "learning_rate": 8.66239416966054e-07, + "loss": 0.4516, + "step": 9049 + }, + { + "epoch": 0.7349955331763177, + "grad_norm": 5.0774721644669025, + "learning_rate": 8.657417042377034e-07, + "loss": 0.5822, + "step": 9050 + }, + { + "epoch": 0.7350767481523593, + "grad_norm": 3.3621612548870967, + "learning_rate": 8.652441045913775e-07, + "loss": 0.5267, + "step": 9051 + }, + { + "epoch": 0.7351579631284009, + "grad_norm": 6.871554047086195, + "learning_rate": 8.647466180615085e-07, + "loss": 0.5536, + "step": 9052 + }, + { + "epoch": 0.7352391781044425, + "grad_norm": 6.572776544165536, + "learning_rate": 8.642492446825193e-07, + "loss": 0.5135, + "step": 9053 + }, + { + "epoch": 0.735320393080484, + "grad_norm": 5.770321723227472, + "learning_rate": 8.637519844888245e-07, + "loss": 0.4804, + "step": 9054 + }, + { + "epoch": 0.7354016080565257, + "grad_norm": 10.272898706445313, + "learning_rate": 8.632548375148333e-07, + "loss": 0.5766, + "step": 9055 + }, + { + "epoch": 0.7354828230325672, + "grad_norm": 7.89177634147062, + "learning_rate": 8.627578037949441e-07, + "loss": 0.6254, + "step": 9056 + }, + { + "epoch": 0.7355640380086088, + "grad_norm": 6.509795135122351, + "learning_rate": 8.62260883363551e-07, + "loss": 0.4276, + "step": 9057 + }, + { + "epoch": 0.7356452529846503, + "grad_norm": 5.327797962449124, + "learning_rate": 8.617640762550361e-07, + "loss": 0.4104, + "step": 9058 + }, + { + "epoch": 0.735726467960692, + "grad_norm": 10.83596502585155, + "learning_rate": 8.612673825037776e-07, + "loss": 0.6383, + "step": 9059 + }, + { + "epoch": 0.7358076829367335, + "grad_norm": 3.2930515840707257, + "learning_rate": 8.607708021441436e-07, + "loss": 0.5538, + "step": 9060 + }, + { + "epoch": 0.7358888979127751, + "grad_norm": 6.123300825914036, + "learning_rate": 8.602743352104936e-07, + "loss": 0.5416, + "step": 9061 + }, + { + "epoch": 0.7359701128888168, + "grad_norm": 4.126241878471722, + "learning_rate": 8.597779817371824e-07, + "loss": 0.4344, + "step": 9062 + }, + { + "epoch": 0.7360513278648583, + "grad_norm": 4.001896575407721, + "learning_rate": 8.592817417585534e-07, + "loss": 0.5276, + "step": 9063 + }, + { + "epoch": 0.7361325428408999, + "grad_norm": 4.410919527265607, + "learning_rate": 8.587856153089444e-07, + "loss": 0.5128, + "step": 9064 + }, + { + "epoch": 0.7362137578169414, + "grad_norm": 4.6300581291782885, + "learning_rate": 8.582896024226855e-07, + "loss": 0.5539, + "step": 9065 + }, + { + "epoch": 0.7362949727929831, + "grad_norm": 5.804508870075955, + "learning_rate": 8.577937031340975e-07, + "loss": 0.4621, + "step": 9066 + }, + { + "epoch": 0.7363761877690246, + "grad_norm": 3.9641442301684653, + "learning_rate": 8.572979174774934e-07, + "loss": 0.4759, + "step": 9067 + }, + { + "epoch": 0.7364574027450662, + "grad_norm": 3.935697034995573, + "learning_rate": 8.568022454871802e-07, + "loss": 0.5874, + "step": 9068 + }, + { + "epoch": 0.7365386177211077, + "grad_norm": 5.777703937802822, + "learning_rate": 8.563066871974543e-07, + "loss": 0.4736, + "step": 9069 + }, + { + "epoch": 0.7366198326971494, + "grad_norm": 3.7700387674835176, + "learning_rate": 8.558112426426062e-07, + "loss": 0.4482, + "step": 9070 + }, + { + "epoch": 0.7367010476731909, + "grad_norm": 4.042064333038825, + "learning_rate": 8.553159118569196e-07, + "loss": 0.3638, + "step": 9071 + }, + { + "epoch": 0.7367822626492325, + "grad_norm": 5.379208558888325, + "learning_rate": 8.548206948746673e-07, + "loss": 0.359, + "step": 9072 + }, + { + "epoch": 0.7368634776252742, + "grad_norm": 5.80342467828285, + "learning_rate": 8.543255917301163e-07, + "loss": 0.5691, + "step": 9073 + }, + { + "epoch": 0.7369446926013157, + "grad_norm": 9.897018766300208, + "learning_rate": 8.538306024575235e-07, + "loss": 0.5253, + "step": 9074 + }, + { + "epoch": 0.7370259075773573, + "grad_norm": 5.1025049749314055, + "learning_rate": 8.533357270911419e-07, + "loss": 0.4075, + "step": 9075 + }, + { + "epoch": 0.7371071225533988, + "grad_norm": 8.535318151291682, + "learning_rate": 8.52840965665212e-07, + "loss": 0.4842, + "step": 9076 + }, + { + "epoch": 0.7371883375294405, + "grad_norm": 8.277801039325004, + "learning_rate": 8.523463182139699e-07, + "loss": 0.4553, + "step": 9077 + }, + { + "epoch": 0.737269552505482, + "grad_norm": 4.521612840769156, + "learning_rate": 8.518517847716435e-07, + "loss": 0.627, + "step": 9078 + }, + { + "epoch": 0.7373507674815236, + "grad_norm": 3.8960485335075603, + "learning_rate": 8.513573653724508e-07, + "loss": 0.5943, + "step": 9079 + }, + { + "epoch": 0.7374319824575651, + "grad_norm": 4.606452564353779, + "learning_rate": 8.508630600506021e-07, + "loss": 0.6701, + "step": 9080 + }, + { + "epoch": 0.7375131974336068, + "grad_norm": 7.052763751543803, + "learning_rate": 8.503688688403028e-07, + "loss": 0.5393, + "step": 9081 + }, + { + "epoch": 0.7375944124096483, + "grad_norm": 4.538323480054813, + "learning_rate": 8.498747917757464e-07, + "loss": 0.434, + "step": 9082 + }, + { + "epoch": 0.7376756273856899, + "grad_norm": 3.679103857972724, + "learning_rate": 8.49380828891121e-07, + "loss": 0.561, + "step": 9083 + }, + { + "epoch": 0.7377568423617316, + "grad_norm": 9.599527757034656, + "learning_rate": 8.488869802206073e-07, + "loss": 0.4462, + "step": 9084 + }, + { + "epoch": 0.7378380573377731, + "grad_norm": 5.51962161938916, + "learning_rate": 8.483932457983765e-07, + "loss": 0.5055, + "step": 9085 + }, + { + "epoch": 0.7379192723138147, + "grad_norm": 7.720325892853574, + "learning_rate": 8.478996256585909e-07, + "loss": 0.3904, + "step": 9086 + }, + { + "epoch": 0.7380004872898562, + "grad_norm": 4.8150352416660125, + "learning_rate": 8.474061198354086e-07, + "loss": 0.3425, + "step": 9087 + }, + { + "epoch": 0.7380817022658979, + "grad_norm": 4.123061421099957, + "learning_rate": 8.469127283629766e-07, + "loss": 0.4812, + "step": 9088 + }, + { + "epoch": 0.7381629172419394, + "grad_norm": 4.2839585199281975, + "learning_rate": 8.464194512754339e-07, + "loss": 0.6729, + "step": 9089 + }, + { + "epoch": 0.738244132217981, + "grad_norm": 10.07664977406758, + "learning_rate": 8.459262886069139e-07, + "loss": 0.4227, + "step": 9090 + }, + { + "epoch": 0.7383253471940225, + "grad_norm": 5.667705109796953, + "learning_rate": 8.454332403915416e-07, + "loss": 0.4117, + "step": 9091 + }, + { + "epoch": 0.7384065621700642, + "grad_norm": 5.360148423288729, + "learning_rate": 8.44940306663432e-07, + "loss": 0.4129, + "step": 9092 + }, + { + "epoch": 0.7384877771461057, + "grad_norm": 4.840577845593679, + "learning_rate": 8.444474874566935e-07, + "loss": 0.43, + "step": 9093 + }, + { + "epoch": 0.7385689921221473, + "grad_norm": 5.228586184130316, + "learning_rate": 8.439547828054276e-07, + "loss": 0.5745, + "step": 9094 + }, + { + "epoch": 0.738650207098189, + "grad_norm": 5.804073223258344, + "learning_rate": 8.434621927437253e-07, + "loss": 0.6014, + "step": 9095 + }, + { + "epoch": 0.7387314220742305, + "grad_norm": 6.711353565911964, + "learning_rate": 8.429697173056726e-07, + "loss": 0.4756, + "step": 9096 + }, + { + "epoch": 0.7388126370502721, + "grad_norm": 5.514320352974961, + "learning_rate": 8.42477356525346e-07, + "loss": 0.5374, + "step": 9097 + }, + { + "epoch": 0.7388938520263136, + "grad_norm": 3.835666081477301, + "learning_rate": 8.419851104368143e-07, + "loss": 0.3875, + "step": 9098 + }, + { + "epoch": 0.7389750670023553, + "grad_norm": 12.210910906950325, + "learning_rate": 8.414929790741371e-07, + "loss": 0.4524, + "step": 9099 + }, + { + "epoch": 0.7390562819783968, + "grad_norm": 5.875608947253129, + "learning_rate": 8.410009624713691e-07, + "loss": 0.5712, + "step": 9100 + }, + { + "epoch": 0.7391374969544384, + "grad_norm": 4.500274285625198, + "learning_rate": 8.405090606625547e-07, + "loss": 0.6175, + "step": 9101 + }, + { + "epoch": 0.7392187119304799, + "grad_norm": 7.944159885650859, + "learning_rate": 8.400172736817294e-07, + "loss": 0.5565, + "step": 9102 + }, + { + "epoch": 0.7392999269065216, + "grad_norm": 10.979049183670785, + "learning_rate": 8.395256015629233e-07, + "loss": 0.4571, + "step": 9103 + }, + { + "epoch": 0.7393811418825631, + "grad_norm": 6.5837339811285815, + "learning_rate": 8.390340443401588e-07, + "loss": 0.4318, + "step": 9104 + }, + { + "epoch": 0.7394623568586047, + "grad_norm": 6.045591348963159, + "learning_rate": 8.385426020474468e-07, + "loss": 0.4703, + "step": 9105 + }, + { + "epoch": 0.7395435718346464, + "grad_norm": 11.087294536746336, + "learning_rate": 8.380512747187944e-07, + "loss": 0.3586, + "step": 9106 + }, + { + "epoch": 0.7396247868106879, + "grad_norm": 5.179365997405158, + "learning_rate": 8.375600623881983e-07, + "loss": 0.3442, + "step": 9107 + }, + { + "epoch": 0.7397060017867295, + "grad_norm": 5.5937991392764355, + "learning_rate": 8.370689650896465e-07, + "loss": 0.4803, + "step": 9108 + }, + { + "epoch": 0.739787216762771, + "grad_norm": 5.345681951321185, + "learning_rate": 8.365779828571214e-07, + "loss": 0.5371, + "step": 9109 + }, + { + "epoch": 0.7398684317388127, + "grad_norm": 10.438164567037536, + "learning_rate": 8.360871157245973e-07, + "loss": 0.4857, + "step": 9110 + }, + { + "epoch": 0.7399496467148542, + "grad_norm": 12.570837392968885, + "learning_rate": 8.355963637260387e-07, + "loss": 0.4761, + "step": 9111 + }, + { + "epoch": 0.7400308616908958, + "grad_norm": 5.293881936064758, + "learning_rate": 8.351057268954019e-07, + "loss": 0.6403, + "step": 9112 + }, + { + "epoch": 0.7401120766669373, + "grad_norm": 8.074420587248548, + "learning_rate": 8.346152052666385e-07, + "loss": 0.4348, + "step": 9113 + }, + { + "epoch": 0.740193291642979, + "grad_norm": 4.833671703517281, + "learning_rate": 8.341247988736889e-07, + "loss": 0.5021, + "step": 9114 + }, + { + "epoch": 0.7402745066190205, + "grad_norm": 5.173257511917773, + "learning_rate": 8.336345077504851e-07, + "loss": 0.4408, + "step": 9115 + }, + { + "epoch": 0.7403557215950621, + "grad_norm": 4.696108390642895, + "learning_rate": 8.331443319309557e-07, + "loss": 0.4978, + "step": 9116 + }, + { + "epoch": 0.7404369365711038, + "grad_norm": 6.333906802407053, + "learning_rate": 8.326542714490172e-07, + "loss": 0.4354, + "step": 9117 + }, + { + "epoch": 0.7405181515471453, + "grad_norm": 4.361831325871427, + "learning_rate": 8.321643263385776e-07, + "loss": 0.4368, + "step": 9118 + }, + { + "epoch": 0.7405993665231869, + "grad_norm": 4.6714291210784085, + "learning_rate": 8.316744966335408e-07, + "loss": 0.6296, + "step": 9119 + }, + { + "epoch": 0.7406805814992284, + "grad_norm": 4.933127806189035, + "learning_rate": 8.31184782367799e-07, + "loss": 0.5809, + "step": 9120 + }, + { + "epoch": 0.7407617964752701, + "grad_norm": 4.025591077863814, + "learning_rate": 8.306951835752378e-07, + "loss": 0.6013, + "step": 9121 + }, + { + "epoch": 0.7408430114513116, + "grad_norm": 11.41144080285039, + "learning_rate": 8.302057002897349e-07, + "loss": 0.5021, + "step": 9122 + }, + { + "epoch": 0.7409242264273532, + "grad_norm": 4.627955700774956, + "learning_rate": 8.297163325451612e-07, + "loss": 0.5904, + "step": 9123 + }, + { + "epoch": 0.7410054414033947, + "grad_norm": 7.878246043105683, + "learning_rate": 8.292270803753765e-07, + "loss": 0.457, + "step": 9124 + }, + { + "epoch": 0.7410866563794364, + "grad_norm": 4.436045720835732, + "learning_rate": 8.287379438142365e-07, + "loss": 0.5311, + "step": 9125 + }, + { + "epoch": 0.7411678713554779, + "grad_norm": 5.747604125886795, + "learning_rate": 8.282489228955856e-07, + "loss": 0.4093, + "step": 9126 + }, + { + "epoch": 0.7412490863315195, + "grad_norm": 26.14780023838032, + "learning_rate": 8.277600176532608e-07, + "loss": 0.4506, + "step": 9127 + }, + { + "epoch": 0.7413303013075612, + "grad_norm": 5.408380773620181, + "learning_rate": 8.272712281210926e-07, + "loss": 0.5179, + "step": 9128 + }, + { + "epoch": 0.7414115162836027, + "grad_norm": 3.881322907472951, + "learning_rate": 8.267825543329033e-07, + "loss": 0.5395, + "step": 9129 + }, + { + "epoch": 0.7414927312596443, + "grad_norm": 4.612767077216341, + "learning_rate": 8.262939963225058e-07, + "loss": 0.4454, + "step": 9130 + }, + { + "epoch": 0.7415739462356858, + "grad_norm": 20.881362818292562, + "learning_rate": 8.258055541237054e-07, + "loss": 0.4422, + "step": 9131 + }, + { + "epoch": 0.7416551612117275, + "grad_norm": 10.331638585893913, + "learning_rate": 8.253172277703006e-07, + "loss": 0.4213, + "step": 9132 + }, + { + "epoch": 0.741736376187769, + "grad_norm": 7.471565014697083, + "learning_rate": 8.248290172960804e-07, + "loss": 0.4839, + "step": 9133 + }, + { + "epoch": 0.7418175911638106, + "grad_norm": 6.969777149983467, + "learning_rate": 8.24340922734826e-07, + "loss": 0.4021, + "step": 9134 + }, + { + "epoch": 0.7418988061398522, + "grad_norm": 6.91819887817937, + "learning_rate": 8.238529441203111e-07, + "loss": 0.3838, + "step": 9135 + }, + { + "epoch": 0.7419800211158938, + "grad_norm": 3.8948651109688703, + "learning_rate": 8.233650814863026e-07, + "loss": 0.4054, + "step": 9136 + }, + { + "epoch": 0.7420612360919353, + "grad_norm": 4.018293826197548, + "learning_rate": 8.228773348665561e-07, + "loss": 0.4144, + "step": 9137 + }, + { + "epoch": 0.7421424510679769, + "grad_norm": 3.9662330274627355, + "learning_rate": 8.223897042948228e-07, + "loss": 0.4907, + "step": 9138 + }, + { + "epoch": 0.7422236660440186, + "grad_norm": 10.723781510737455, + "learning_rate": 8.219021898048435e-07, + "loss": 0.5779, + "step": 9139 + }, + { + "epoch": 0.7423048810200601, + "grad_norm": 4.79787389560584, + "learning_rate": 8.214147914303505e-07, + "loss": 0.4803, + "step": 9140 + }, + { + "epoch": 0.7423860959961017, + "grad_norm": 3.222506085754893, + "learning_rate": 8.209275092050701e-07, + "loss": 0.5073, + "step": 9141 + }, + { + "epoch": 0.7424673109721432, + "grad_norm": 7.127778601731678, + "learning_rate": 8.204403431627206e-07, + "loss": 0.4199, + "step": 9142 + }, + { + "epoch": 0.7425485259481849, + "grad_norm": 6.285963377503724, + "learning_rate": 8.199532933370094e-07, + "loss": 0.3786, + "step": 9143 + }, + { + "epoch": 0.7426297409242264, + "grad_norm": 8.21223183631982, + "learning_rate": 8.194663597616398e-07, + "loss": 0.5224, + "step": 9144 + }, + { + "epoch": 0.742710955900268, + "grad_norm": 9.976782891147915, + "learning_rate": 8.18979542470304e-07, + "loss": 0.5302, + "step": 9145 + }, + { + "epoch": 0.7427921708763096, + "grad_norm": 6.028309491334264, + "learning_rate": 8.184928414966873e-07, + "loss": 0.4914, + "step": 9146 + }, + { + "epoch": 0.7428733858523512, + "grad_norm": 13.676857941967034, + "learning_rate": 8.180062568744657e-07, + "loss": 0.5282, + "step": 9147 + }, + { + "epoch": 0.7429546008283927, + "grad_norm": 17.032146668354567, + "learning_rate": 8.175197886373093e-07, + "loss": 0.4902, + "step": 9148 + }, + { + "epoch": 0.7430358158044343, + "grad_norm": 5.4622799815547936, + "learning_rate": 8.170334368188798e-07, + "loss": 0.3934, + "step": 9149 + }, + { + "epoch": 0.743117030780476, + "grad_norm": 8.517487460268729, + "learning_rate": 8.16547201452829e-07, + "loss": 0.5146, + "step": 9150 + }, + { + "epoch": 0.7431982457565175, + "grad_norm": 5.038501260406377, + "learning_rate": 8.160610825728029e-07, + "loss": 0.5049, + "step": 9151 + }, + { + "epoch": 0.7432794607325591, + "grad_norm": 4.455377373168075, + "learning_rate": 8.155750802124379e-07, + "loss": 0.5832, + "step": 9152 + }, + { + "epoch": 0.7433606757086006, + "grad_norm": 3.9339441762573863, + "learning_rate": 8.150891944053615e-07, + "loss": 0.4169, + "step": 9153 + }, + { + "epoch": 0.7434418906846423, + "grad_norm": 3.517549583636786, + "learning_rate": 8.146034251851959e-07, + "loss": 0.5344, + "step": 9154 + }, + { + "epoch": 0.7435231056606838, + "grad_norm": 4.5372086466348085, + "learning_rate": 8.141177725855543e-07, + "loss": 0.5208, + "step": 9155 + }, + { + "epoch": 0.7436043206367254, + "grad_norm": 4.458967507222066, + "learning_rate": 8.136322366400396e-07, + "loss": 0.4537, + "step": 9156 + }, + { + "epoch": 0.743685535612767, + "grad_norm": 3.5505929046873104, + "learning_rate": 8.131468173822499e-07, + "loss": 0.3824, + "step": 9157 + }, + { + "epoch": 0.7437667505888086, + "grad_norm": 4.049572608845729, + "learning_rate": 8.126615148457728e-07, + "loss": 0.4316, + "step": 9158 + }, + { + "epoch": 0.7438479655648501, + "grad_norm": 6.329253089634774, + "learning_rate": 8.121763290641879e-07, + "loss": 0.3844, + "step": 9159 + }, + { + "epoch": 0.7439291805408917, + "grad_norm": 5.7289617458205235, + "learning_rate": 8.116912600710694e-07, + "loss": 0.5157, + "step": 9160 + }, + { + "epoch": 0.7440103955169334, + "grad_norm": 7.461558435663731, + "learning_rate": 8.112063078999794e-07, + "loss": 0.4108, + "step": 9161 + }, + { + "epoch": 0.7440916104929749, + "grad_norm": 6.260601884198078, + "learning_rate": 8.107214725844753e-07, + "loss": 0.4063, + "step": 9162 + }, + { + "epoch": 0.7441728254690165, + "grad_norm": 7.096380076386157, + "learning_rate": 8.102367541581055e-07, + "loss": 0.375, + "step": 9163 + }, + { + "epoch": 0.744254040445058, + "grad_norm": 4.583892134072468, + "learning_rate": 8.097521526544094e-07, + "loss": 0.5004, + "step": 9164 + }, + { + "epoch": 0.7443352554210997, + "grad_norm": 4.285669547146368, + "learning_rate": 8.092676681069189e-07, + "loss": 0.4534, + "step": 9165 + }, + { + "epoch": 0.7444164703971412, + "grad_norm": 5.84102693467306, + "learning_rate": 8.087833005491568e-07, + "loss": 0.6221, + "step": 9166 + }, + { + "epoch": 0.7444976853731828, + "grad_norm": 4.581219331241057, + "learning_rate": 8.082990500146398e-07, + "loss": 0.5261, + "step": 9167 + }, + { + "epoch": 0.7445789003492244, + "grad_norm": 6.253344718728393, + "learning_rate": 8.078149165368762e-07, + "loss": 0.4019, + "step": 9168 + }, + { + "epoch": 0.744660115325266, + "grad_norm": 4.961021707376709, + "learning_rate": 8.073309001493637e-07, + "loss": 0.5145, + "step": 9169 + }, + { + "epoch": 0.7447413303013075, + "grad_norm": 60.59463933382603, + "learning_rate": 8.068470008855953e-07, + "loss": 0.4394, + "step": 9170 + }, + { + "epoch": 0.7448225452773491, + "grad_norm": 5.570185363264054, + "learning_rate": 8.063632187790538e-07, + "loss": 0.4144, + "step": 9171 + }, + { + "epoch": 0.7449037602533908, + "grad_norm": 6.346150984764303, + "learning_rate": 8.05879553863213e-07, + "loss": 0.4612, + "step": 9172 + }, + { + "epoch": 0.7449849752294323, + "grad_norm": 5.7858397897115745, + "learning_rate": 8.053960061715421e-07, + "loss": 0.5154, + "step": 9173 + }, + { + "epoch": 0.7450661902054739, + "grad_norm": 9.04581065503539, + "learning_rate": 8.049125757374978e-07, + "loss": 0.4543, + "step": 9174 + }, + { + "epoch": 0.7451474051815155, + "grad_norm": 4.5831119885995575, + "learning_rate": 8.044292625945327e-07, + "loss": 0.5, + "step": 9175 + }, + { + "epoch": 0.7452286201575571, + "grad_norm": 4.73034396532834, + "learning_rate": 8.039460667760892e-07, + "loss": 0.6158, + "step": 9176 + }, + { + "epoch": 0.7453098351335986, + "grad_norm": 7.297415511486316, + "learning_rate": 8.034629883156019e-07, + "loss": 0.4434, + "step": 9177 + }, + { + "epoch": 0.7453910501096402, + "grad_norm": 7.087502286901482, + "learning_rate": 8.029800272464963e-07, + "loss": 0.3851, + "step": 9178 + }, + { + "epoch": 0.7454722650856818, + "grad_norm": 21.28250462127868, + "learning_rate": 8.024971836021922e-07, + "loss": 0.3312, + "step": 9179 + }, + { + "epoch": 0.7455534800617234, + "grad_norm": 4.028786793175537, + "learning_rate": 8.020144574160984e-07, + "loss": 0.5862, + "step": 9180 + }, + { + "epoch": 0.7456346950377649, + "grad_norm": 8.089625182959997, + "learning_rate": 8.015318487216184e-07, + "loss": 0.4179, + "step": 9181 + }, + { + "epoch": 0.7457159100138065, + "grad_norm": 10.703076445335201, + "learning_rate": 8.010493575521444e-07, + "loss": 0.5247, + "step": 9182 + }, + { + "epoch": 0.7457971249898482, + "grad_norm": 5.996668534107049, + "learning_rate": 8.005669839410643e-07, + "loss": 0.6692, + "step": 9183 + }, + { + "epoch": 0.7458783399658897, + "grad_norm": 4.264762756411719, + "learning_rate": 8.00084727921755e-07, + "loss": 0.7248, + "step": 9184 + }, + { + "epoch": 0.7459595549419313, + "grad_norm": 6.337959117205082, + "learning_rate": 7.996025895275846e-07, + "loss": 0.4624, + "step": 9185 + }, + { + "epoch": 0.7460407699179729, + "grad_norm": 4.73694838657318, + "learning_rate": 7.991205687919163e-07, + "loss": 0.5874, + "step": 9186 + }, + { + "epoch": 0.7461219848940145, + "grad_norm": 4.444846677665557, + "learning_rate": 7.986386657481032e-07, + "loss": 0.4305, + "step": 9187 + }, + { + "epoch": 0.746203199870056, + "grad_norm": 4.371046177977482, + "learning_rate": 7.981568804294895e-07, + "loss": 0.4158, + "step": 9188 + }, + { + "epoch": 0.7462844148460976, + "grad_norm": 5.23612698457551, + "learning_rate": 7.976752128694134e-07, + "loss": 0.5297, + "step": 9189 + }, + { + "epoch": 0.7463656298221392, + "grad_norm": 5.6064882601787325, + "learning_rate": 7.971936631012033e-07, + "loss": 0.4, + "step": 9190 + }, + { + "epoch": 0.7464468447981808, + "grad_norm": 4.736891493880583, + "learning_rate": 7.96712231158179e-07, + "loss": 0.465, + "step": 9191 + }, + { + "epoch": 0.7465280597742223, + "grad_norm": 16.20683819170043, + "learning_rate": 7.962309170736546e-07, + "loss": 0.538, + "step": 9192 + }, + { + "epoch": 0.746609274750264, + "grad_norm": 4.072174973984312, + "learning_rate": 7.957497208809328e-07, + "loss": 0.5302, + "step": 9193 + }, + { + "epoch": 0.7466904897263056, + "grad_norm": 3.329371490211698, + "learning_rate": 7.952686426133105e-07, + "loss": 0.5949, + "step": 9194 + }, + { + "epoch": 0.7467717047023471, + "grad_norm": 10.181237849562704, + "learning_rate": 7.947876823040771e-07, + "loss": 0.5202, + "step": 9195 + }, + { + "epoch": 0.7468529196783887, + "grad_norm": 4.700711476652391, + "learning_rate": 7.943068399865111e-07, + "loss": 0.3624, + "step": 9196 + }, + { + "epoch": 0.7469341346544303, + "grad_norm": 4.764991430944277, + "learning_rate": 7.93826115693884e-07, + "loss": 0.3774, + "step": 9197 + }, + { + "epoch": 0.7470153496304719, + "grad_norm": 7.85688314418302, + "learning_rate": 7.933455094594602e-07, + "loss": 0.3546, + "step": 9198 + }, + { + "epoch": 0.7470965646065134, + "grad_norm": 4.120541672766695, + "learning_rate": 7.928650213164945e-07, + "loss": 0.6123, + "step": 9199 + }, + { + "epoch": 0.747177779582555, + "grad_norm": 7.327958848564803, + "learning_rate": 7.92384651298235e-07, + "loss": 0.5695, + "step": 9200 + }, + { + "epoch": 0.7472589945585966, + "grad_norm": 5.719769810682618, + "learning_rate": 7.919043994379194e-07, + "loss": 0.4224, + "step": 9201 + }, + { + "epoch": 0.7473402095346382, + "grad_norm": 4.094688049104328, + "learning_rate": 7.914242657687804e-07, + "loss": 0.5687, + "step": 9202 + }, + { + "epoch": 0.7474214245106797, + "grad_norm": 8.405367942793522, + "learning_rate": 7.909442503240395e-07, + "loss": 0.429, + "step": 9203 + }, + { + "epoch": 0.7475026394867214, + "grad_norm": 6.01950206469752, + "learning_rate": 7.904643531369108e-07, + "loss": 0.4256, + "step": 9204 + }, + { + "epoch": 0.747583854462763, + "grad_norm": 7.930996919339839, + "learning_rate": 7.899845742406017e-07, + "loss": 0.4707, + "step": 9205 + }, + { + "epoch": 0.7476650694388045, + "grad_norm": 16.262367161009937, + "learning_rate": 7.895049136683095e-07, + "loss": 0.411, + "step": 9206 + }, + { + "epoch": 0.7477462844148461, + "grad_norm": 3.3605717819153527, + "learning_rate": 7.890253714532245e-07, + "loss": 0.4796, + "step": 9207 + }, + { + "epoch": 0.7478274993908877, + "grad_norm": 4.04997390894807, + "learning_rate": 7.885459476285292e-07, + "loss": 0.5579, + "step": 9208 + }, + { + "epoch": 0.7479087143669293, + "grad_norm": 4.435766819763639, + "learning_rate": 7.880666422273969e-07, + "loss": 0.3457, + "step": 9209 + }, + { + "epoch": 0.7479899293429708, + "grad_norm": 5.1365435166693505, + "learning_rate": 7.875874552829918e-07, + "loss": 0.5797, + "step": 9210 + }, + { + "epoch": 0.7480711443190124, + "grad_norm": 5.591130735882937, + "learning_rate": 7.871083868284726e-07, + "loss": 0.5462, + "step": 9211 + }, + { + "epoch": 0.748152359295054, + "grad_norm": 8.16910174430336, + "learning_rate": 7.866294368969871e-07, + "loss": 0.4643, + "step": 9212 + }, + { + "epoch": 0.7482335742710956, + "grad_norm": 7.038360149078053, + "learning_rate": 7.861506055216764e-07, + "loss": 0.5222, + "step": 9213 + }, + { + "epoch": 0.7483147892471371, + "grad_norm": 5.659068513220842, + "learning_rate": 7.856718927356743e-07, + "loss": 0.5721, + "step": 9214 + }, + { + "epoch": 0.7483960042231788, + "grad_norm": 7.479766446282722, + "learning_rate": 7.851932985721042e-07, + "loss": 0.5484, + "step": 9215 + }, + { + "epoch": 0.7484772191992204, + "grad_norm": 7.0753754777583575, + "learning_rate": 7.847148230640825e-07, + "loss": 0.3887, + "step": 9216 + }, + { + "epoch": 0.7485584341752619, + "grad_norm": 6.933072652515654, + "learning_rate": 7.842364662447161e-07, + "loss": 0.4933, + "step": 9217 + }, + { + "epoch": 0.7486396491513035, + "grad_norm": 4.082184303346161, + "learning_rate": 7.837582281471065e-07, + "loss": 0.4314, + "step": 9218 + }, + { + "epoch": 0.7487208641273451, + "grad_norm": 5.120311127792642, + "learning_rate": 7.832801088043438e-07, + "loss": 0.3486, + "step": 9219 + }, + { + "epoch": 0.7488020791033867, + "grad_norm": 5.389253627233221, + "learning_rate": 7.828021082495118e-07, + "loss": 0.5207, + "step": 9220 + }, + { + "epoch": 0.7488832940794282, + "grad_norm": 5.865053171467202, + "learning_rate": 7.823242265156866e-07, + "loss": 0.3661, + "step": 9221 + }, + { + "epoch": 0.7489645090554699, + "grad_norm": 4.518682135966551, + "learning_rate": 7.818464636359344e-07, + "loss": 0.6257, + "step": 9222 + }, + { + "epoch": 0.7490457240315114, + "grad_norm": 5.19395467966571, + "learning_rate": 7.813688196433125e-07, + "loss": 0.4796, + "step": 9223 + }, + { + "epoch": 0.749126939007553, + "grad_norm": 4.850339059401898, + "learning_rate": 7.808912945708738e-07, + "loss": 0.4585, + "step": 9224 + }, + { + "epoch": 0.7492081539835945, + "grad_norm": 5.013059286403891, + "learning_rate": 7.804138884516583e-07, + "loss": 0.3884, + "step": 9225 + }, + { + "epoch": 0.7492893689596362, + "grad_norm": 4.841130061593439, + "learning_rate": 7.799366013187007e-07, + "loss": 0.4484, + "step": 9226 + }, + { + "epoch": 0.7493705839356778, + "grad_norm": 5.230772273048009, + "learning_rate": 7.794594332050282e-07, + "loss": 0.4274, + "step": 9227 + }, + { + "epoch": 0.7494517989117193, + "grad_norm": 7.075947384663193, + "learning_rate": 7.789823841436567e-07, + "loss": 0.4516, + "step": 9228 + }, + { + "epoch": 0.749533013887761, + "grad_norm": 6.992483714956555, + "learning_rate": 7.785054541675954e-07, + "loss": 0.3975, + "step": 9229 + }, + { + "epoch": 0.7496142288638025, + "grad_norm": 4.561345669683036, + "learning_rate": 7.780286433098464e-07, + "loss": 0.7948, + "step": 9230 + }, + { + "epoch": 0.7496954438398441, + "grad_norm": 4.478198203586605, + "learning_rate": 7.775519516034019e-07, + "loss": 0.4752, + "step": 9231 + }, + { + "epoch": 0.7497766588158856, + "grad_norm": 3.9360754902947894, + "learning_rate": 7.770753790812455e-07, + "loss": 0.5631, + "step": 9232 + }, + { + "epoch": 0.7498578737919273, + "grad_norm": 6.792888436524307, + "learning_rate": 7.765989257763545e-07, + "loss": 0.4384, + "step": 9233 + }, + { + "epoch": 0.7499390887679688, + "grad_norm": 5.737542594203367, + "learning_rate": 7.761225917216978e-07, + "loss": 0.4428, + "step": 9234 + }, + { + "epoch": 0.7500203037440104, + "grad_norm": 10.689284036969283, + "learning_rate": 7.75646376950234e-07, + "loss": 0.4604, + "step": 9235 + }, + { + "epoch": 0.7501015187200519, + "grad_norm": 22.015199504327047, + "learning_rate": 7.751702814949145e-07, + "loss": 0.5013, + "step": 9236 + }, + { + "epoch": 0.7501827336960936, + "grad_norm": 9.731543225761502, + "learning_rate": 7.746943053886835e-07, + "loss": 0.4218, + "step": 9237 + }, + { + "epoch": 0.7502639486721352, + "grad_norm": 6.82059578534053, + "learning_rate": 7.742184486644746e-07, + "loss": 0.5271, + "step": 9238 + }, + { + "epoch": 0.7503451636481767, + "grad_norm": 4.580942490166404, + "learning_rate": 7.737427113552157e-07, + "loss": 0.5683, + "step": 9239 + }, + { + "epoch": 0.7504263786242183, + "grad_norm": 3.654932263444829, + "learning_rate": 7.732670934938257e-07, + "loss": 0.4838, + "step": 9240 + }, + { + "epoch": 0.7505075936002599, + "grad_norm": 5.579207786127818, + "learning_rate": 7.727915951132145e-07, + "loss": 0.478, + "step": 9241 + }, + { + "epoch": 0.7505888085763015, + "grad_norm": 4.861880759749431, + "learning_rate": 7.723162162462827e-07, + "loss": 0.5107, + "step": 9242 + }, + { + "epoch": 0.750670023552343, + "grad_norm": 3.336615456803476, + "learning_rate": 7.718409569259261e-07, + "loss": 0.439, + "step": 9243 + }, + { + "epoch": 0.7507512385283847, + "grad_norm": 3.594363219165978, + "learning_rate": 7.713658171850289e-07, + "loss": 0.5717, + "step": 9244 + }, + { + "epoch": 0.7508324535044262, + "grad_norm": 6.352249538029256, + "learning_rate": 7.708907970564672e-07, + "loss": 0.4124, + "step": 9245 + }, + { + "epoch": 0.7509136684804678, + "grad_norm": 5.815211228408834, + "learning_rate": 7.704158965731126e-07, + "loss": 0.6313, + "step": 9246 + }, + { + "epoch": 0.7509948834565093, + "grad_norm": 4.251212567683483, + "learning_rate": 7.699411157678241e-07, + "loss": 0.4166, + "step": 9247 + }, + { + "epoch": 0.751076098432551, + "grad_norm": 4.650261805037805, + "learning_rate": 7.694664546734534e-07, + "loss": 0.5619, + "step": 9248 + }, + { + "epoch": 0.7511573134085926, + "grad_norm": 7.169544455959594, + "learning_rate": 7.689919133228462e-07, + "loss": 0.3819, + "step": 9249 + }, + { + "epoch": 0.7512385283846341, + "grad_norm": 5.930269929305272, + "learning_rate": 7.685174917488375e-07, + "loss": 0.45, + "step": 9250 + }, + { + "epoch": 0.7513197433606758, + "grad_norm": 4.069763452953068, + "learning_rate": 7.680431899842538e-07, + "loss": 0.4904, + "step": 9251 + }, + { + "epoch": 0.7514009583367173, + "grad_norm": 6.375436578366889, + "learning_rate": 7.67569008061915e-07, + "loss": 0.42, + "step": 9252 + }, + { + "epoch": 0.7514821733127589, + "grad_norm": 4.507854006209749, + "learning_rate": 7.670949460146329e-07, + "loss": 0.5423, + "step": 9253 + }, + { + "epoch": 0.7515633882888004, + "grad_norm": 4.6919308246764615, + "learning_rate": 7.666210038752092e-07, + "loss": 0.4196, + "step": 9254 + }, + { + "epoch": 0.7516446032648421, + "grad_norm": 7.409437183391873, + "learning_rate": 7.661471816764377e-07, + "loss": 0.5427, + "step": 9255 + }, + { + "epoch": 0.7517258182408836, + "grad_norm": 3.4419438252999446, + "learning_rate": 7.656734794511056e-07, + "loss": 0.4405, + "step": 9256 + }, + { + "epoch": 0.7518070332169252, + "grad_norm": 12.815965237950453, + "learning_rate": 7.65199897231989e-07, + "loss": 0.4739, + "step": 9257 + }, + { + "epoch": 0.7518882481929667, + "grad_norm": 6.654081291463507, + "learning_rate": 7.647264350518582e-07, + "loss": 0.4545, + "step": 9258 + }, + { + "epoch": 0.7519694631690084, + "grad_norm": 4.719867564770338, + "learning_rate": 7.642530929434752e-07, + "loss": 0.4012, + "step": 9259 + }, + { + "epoch": 0.75205067814505, + "grad_norm": 5.37752756201268, + "learning_rate": 7.637798709395919e-07, + "loss": 0.5197, + "step": 9260 + }, + { + "epoch": 0.7521318931210915, + "grad_norm": 5.47385742990871, + "learning_rate": 7.633067690729517e-07, + "loss": 0.367, + "step": 9261 + }, + { + "epoch": 0.7522131080971332, + "grad_norm": 4.4733368861841045, + "learning_rate": 7.628337873762928e-07, + "loss": 0.4252, + "step": 9262 + }, + { + "epoch": 0.7522943230731747, + "grad_norm": 6.510631608383568, + "learning_rate": 7.62360925882342e-07, + "loss": 0.4848, + "step": 9263 + }, + { + "epoch": 0.7523755380492163, + "grad_norm": 8.2985790641125, + "learning_rate": 7.618881846238177e-07, + "loss": 0.3856, + "step": 9264 + }, + { + "epoch": 0.7524567530252578, + "grad_norm": 8.29108898122942, + "learning_rate": 7.614155636334325e-07, + "loss": 0.284, + "step": 9265 + }, + { + "epoch": 0.7525379680012995, + "grad_norm": 14.447143227318355, + "learning_rate": 7.609430629438896e-07, + "loss": 0.4909, + "step": 9266 + }, + { + "epoch": 0.752619182977341, + "grad_norm": 9.786187786821417, + "learning_rate": 7.604706825878822e-07, + "loss": 0.5262, + "step": 9267 + }, + { + "epoch": 0.7527003979533826, + "grad_norm": 6.314038643222093, + "learning_rate": 7.59998422598098e-07, + "loss": 0.4241, + "step": 9268 + }, + { + "epoch": 0.7527816129294241, + "grad_norm": 3.701696525893301, + "learning_rate": 7.595262830072142e-07, + "loss": 0.4264, + "step": 9269 + }, + { + "epoch": 0.7528628279054658, + "grad_norm": 6.147998028522049, + "learning_rate": 7.590542638478992e-07, + "loss": 0.5874, + "step": 9270 + }, + { + "epoch": 0.7529440428815074, + "grad_norm": 5.173315947912089, + "learning_rate": 7.585823651528157e-07, + "loss": 0.5389, + "step": 9271 + }, + { + "epoch": 0.7530252578575489, + "grad_norm": 5.039146817437555, + "learning_rate": 7.581105869546168e-07, + "loss": 0.3099, + "step": 9272 + }, + { + "epoch": 0.7531064728335906, + "grad_norm": 6.7461838507335505, + "learning_rate": 7.576389292859465e-07, + "loss": 0.5686, + "step": 9273 + }, + { + "epoch": 0.7531876878096321, + "grad_norm": 4.776535888289513, + "learning_rate": 7.5716739217944e-07, + "loss": 0.5206, + "step": 9274 + }, + { + "epoch": 0.7532689027856737, + "grad_norm": 5.586476353122101, + "learning_rate": 7.566959756677272e-07, + "loss": 0.4071, + "step": 9275 + }, + { + "epoch": 0.7533501177617152, + "grad_norm": 6.574615954432533, + "learning_rate": 7.562246797834266e-07, + "loss": 0.4455, + "step": 9276 + }, + { + "epoch": 0.7534313327377569, + "grad_norm": 3.8731896502314855, + "learning_rate": 7.557535045591485e-07, + "loss": 0.5776, + "step": 9277 + }, + { + "epoch": 0.7535125477137984, + "grad_norm": 4.1035501788561906, + "learning_rate": 7.552824500274963e-07, + "loss": 0.4607, + "step": 9278 + }, + { + "epoch": 0.75359376268984, + "grad_norm": 3.37747957490476, + "learning_rate": 7.548115162210659e-07, + "loss": 0.5089, + "step": 9279 + }, + { + "epoch": 0.7536749776658815, + "grad_norm": 5.307633133304405, + "learning_rate": 7.543407031724415e-07, + "loss": 0.7355, + "step": 9280 + }, + { + "epoch": 0.7537561926419232, + "grad_norm": 5.0482613355790695, + "learning_rate": 7.538700109142022e-07, + "loss": 0.5037, + "step": 9281 + }, + { + "epoch": 0.7538374076179648, + "grad_norm": 5.458953154438223, + "learning_rate": 7.533994394789171e-07, + "loss": 0.4972, + "step": 9282 + }, + { + "epoch": 0.7539186225940063, + "grad_norm": 8.54249426936385, + "learning_rate": 7.529289888991462e-07, + "loss": 0.5581, + "step": 9283 + }, + { + "epoch": 0.753999837570048, + "grad_norm": 7.083450524943488, + "learning_rate": 7.524586592074432e-07, + "loss": 0.4335, + "step": 9284 + }, + { + "epoch": 0.7540810525460895, + "grad_norm": 3.984907663593172, + "learning_rate": 7.519884504363525e-07, + "loss": 0.5737, + "step": 9285 + }, + { + "epoch": 0.7541622675221311, + "grad_norm": 11.241450055916163, + "learning_rate": 7.515183626184095e-07, + "loss": 0.4297, + "step": 9286 + }, + { + "epoch": 0.7542434824981726, + "grad_norm": 5.7339323931383035, + "learning_rate": 7.510483957861428e-07, + "loss": 0.5938, + "step": 9287 + }, + { + "epoch": 0.7543246974742143, + "grad_norm": 4.59002005480556, + "learning_rate": 7.505785499720708e-07, + "loss": 0.3925, + "step": 9288 + }, + { + "epoch": 0.7544059124502558, + "grad_norm": 4.295589622295897, + "learning_rate": 7.501088252087046e-07, + "loss": 0.5777, + "step": 9289 + }, + { + "epoch": 0.7544871274262974, + "grad_norm": 6.677871787585794, + "learning_rate": 7.496392215285456e-07, + "loss": 0.5826, + "step": 9290 + }, + { + "epoch": 0.7545683424023389, + "grad_norm": 6.4999364694050366, + "learning_rate": 7.49169738964089e-07, + "loss": 0.6462, + "step": 9291 + }, + { + "epoch": 0.7546495573783806, + "grad_norm": 4.2437533825373706, + "learning_rate": 7.487003775478208e-07, + "loss": 0.4204, + "step": 9292 + }, + { + "epoch": 0.7547307723544222, + "grad_norm": 6.9886624820729475, + "learning_rate": 7.482311373122173e-07, + "loss": 0.4936, + "step": 9293 + }, + { + "epoch": 0.7548119873304637, + "grad_norm": 9.509767514085462, + "learning_rate": 7.477620182897485e-07, + "loss": 0.5231, + "step": 9294 + }, + { + "epoch": 0.7548932023065054, + "grad_norm": 6.97430223457955, + "learning_rate": 7.472930205128748e-07, + "loss": 0.614, + "step": 9295 + }, + { + "epoch": 0.7549744172825469, + "grad_norm": 6.802220975306286, + "learning_rate": 7.46824144014047e-07, + "loss": 0.4198, + "step": 9296 + }, + { + "epoch": 0.7550556322585885, + "grad_norm": 5.419177843717326, + "learning_rate": 7.4635538882571e-07, + "loss": 0.5168, + "step": 9297 + }, + { + "epoch": 0.75513684723463, + "grad_norm": 4.5756834213715365, + "learning_rate": 7.458867549802998e-07, + "loss": 0.4739, + "step": 9298 + }, + { + "epoch": 0.7552180622106717, + "grad_norm": 10.932740692189409, + "learning_rate": 7.454182425102418e-07, + "loss": 0.3756, + "step": 9299 + }, + { + "epoch": 0.7552992771867132, + "grad_norm": 7.458052877323758, + "learning_rate": 7.449498514479564e-07, + "loss": 0.4295, + "step": 9300 + }, + { + "epoch": 0.7553804921627548, + "grad_norm": 7.572226076657359, + "learning_rate": 7.444815818258527e-07, + "loss": 0.5706, + "step": 9301 + }, + { + "epoch": 0.7554617071387963, + "grad_norm": 7.895090561106518, + "learning_rate": 7.440134336763316e-07, + "loss": 0.7025, + "step": 9302 + }, + { + "epoch": 0.755542922114838, + "grad_norm": 7.629608085828636, + "learning_rate": 7.435454070317885e-07, + "loss": 0.4952, + "step": 9303 + }, + { + "epoch": 0.7556241370908796, + "grad_norm": 5.105832653252446, + "learning_rate": 7.430775019246064e-07, + "loss": 0.3872, + "step": 9304 + }, + { + "epoch": 0.7557053520669211, + "grad_norm": 5.491484533703616, + "learning_rate": 7.426097183871636e-07, + "loss": 0.3978, + "step": 9305 + }, + { + "epoch": 0.7557865670429628, + "grad_norm": 6.946802048002631, + "learning_rate": 7.421420564518267e-07, + "loss": 0.6585, + "step": 9306 + }, + { + "epoch": 0.7558677820190043, + "grad_norm": 3.9872313781539206, + "learning_rate": 7.41674516150957e-07, + "loss": 0.6047, + "step": 9307 + }, + { + "epoch": 0.7559489969950459, + "grad_norm": 7.213117113503657, + "learning_rate": 7.412070975169047e-07, + "loss": 0.583, + "step": 9308 + }, + { + "epoch": 0.7560302119710874, + "grad_norm": 5.24742138382295, + "learning_rate": 7.407398005820123e-07, + "loss": 0.4034, + "step": 9309 + }, + { + "epoch": 0.7561114269471291, + "grad_norm": 4.211946251001222, + "learning_rate": 7.402726253786152e-07, + "loss": 0.4058, + "step": 9310 + }, + { + "epoch": 0.7561926419231706, + "grad_norm": 4.415557038084326, + "learning_rate": 7.398055719390399e-07, + "loss": 0.5176, + "step": 9311 + }, + { + "epoch": 0.7562738568992122, + "grad_norm": 4.229473804773957, + "learning_rate": 7.39338640295602e-07, + "loss": 0.533, + "step": 9312 + }, + { + "epoch": 0.7563550718752537, + "grad_norm": 6.390722780847675, + "learning_rate": 7.388718304806133e-07, + "loss": 0.4943, + "step": 9313 + }, + { + "epoch": 0.7564362868512954, + "grad_norm": 3.9879443116206224, + "learning_rate": 7.384051425263733e-07, + "loss": 0.6664, + "step": 9314 + }, + { + "epoch": 0.756517501827337, + "grad_norm": 4.396256081895415, + "learning_rate": 7.379385764651737e-07, + "loss": 0.3858, + "step": 9315 + }, + { + "epoch": 0.7565987168033785, + "grad_norm": 9.650686792105313, + "learning_rate": 7.374721323292985e-07, + "loss": 0.6082, + "step": 9316 + }, + { + "epoch": 0.7566799317794202, + "grad_norm": 4.52159603222765, + "learning_rate": 7.370058101510249e-07, + "loss": 0.4969, + "step": 9317 + }, + { + "epoch": 0.7567611467554617, + "grad_norm": 4.592670810869141, + "learning_rate": 7.365396099626176e-07, + "loss": 0.6284, + "step": 9318 + }, + { + "epoch": 0.7568423617315033, + "grad_norm": 14.987779633029737, + "learning_rate": 7.360735317963374e-07, + "loss": 0.4282, + "step": 9319 + }, + { + "epoch": 0.7569235767075448, + "grad_norm": 5.108452215084867, + "learning_rate": 7.356075756844333e-07, + "loss": 0.5101, + "step": 9320 + }, + { + "epoch": 0.7570047916835865, + "grad_norm": 6.336779399554521, + "learning_rate": 7.351417416591461e-07, + "loss": 0.4218, + "step": 9321 + }, + { + "epoch": 0.757086006659628, + "grad_norm": 3.2420955010959633, + "learning_rate": 7.346760297527109e-07, + "loss": 0.4596, + "step": 9322 + }, + { + "epoch": 0.7571672216356696, + "grad_norm": 14.31442060248585, + "learning_rate": 7.342104399973507e-07, + "loss": 0.6222, + "step": 9323 + }, + { + "epoch": 0.7572484366117112, + "grad_norm": 4.841998000446461, + "learning_rate": 7.337449724252837e-07, + "loss": 0.3906, + "step": 9324 + }, + { + "epoch": 0.7573296515877528, + "grad_norm": 6.189602667818374, + "learning_rate": 7.332796270687159e-07, + "loss": 0.5206, + "step": 9325 + }, + { + "epoch": 0.7574108665637944, + "grad_norm": 3.4330814806419747, + "learning_rate": 7.328144039598487e-07, + "loss": 0.4608, + "step": 9326 + }, + { + "epoch": 0.7574920815398359, + "grad_norm": 8.246364139369895, + "learning_rate": 7.323493031308718e-07, + "loss": 0.3708, + "step": 9327 + }, + { + "epoch": 0.7575732965158776, + "grad_norm": 11.977800366186257, + "learning_rate": 7.318843246139673e-07, + "loss": 0.4251, + "step": 9328 + }, + { + "epoch": 0.7576545114919191, + "grad_norm": 4.480080892757191, + "learning_rate": 7.314194684413098e-07, + "loss": 0.5398, + "step": 9329 + }, + { + "epoch": 0.7577357264679607, + "grad_norm": 5.761110269898293, + "learning_rate": 7.309547346450658e-07, + "loss": 0.486, + "step": 9330 + }, + { + "epoch": 0.7578169414440022, + "grad_norm": 4.670162793343675, + "learning_rate": 7.304901232573908e-07, + "loss": 0.4607, + "step": 9331 + }, + { + "epoch": 0.7578981564200439, + "grad_norm": 7.987770600523061, + "learning_rate": 7.300256343104351e-07, + "loss": 0.4614, + "step": 9332 + }, + { + "epoch": 0.7579793713960854, + "grad_norm": 5.479915469263117, + "learning_rate": 7.295612678363382e-07, + "loss": 0.3639, + "step": 9333 + }, + { + "epoch": 0.758060586372127, + "grad_norm": 25.580608798332342, + "learning_rate": 7.290970238672307e-07, + "loss": 0.3886, + "step": 9334 + }, + { + "epoch": 0.7581418013481686, + "grad_norm": 3.80358687651955, + "learning_rate": 7.286329024352376e-07, + "loss": 0.5083, + "step": 9335 + }, + { + "epoch": 0.7582230163242102, + "grad_norm": 3.6188064823034747, + "learning_rate": 7.281689035724718e-07, + "loss": 0.4203, + "step": 9336 + }, + { + "epoch": 0.7583042313002518, + "grad_norm": 6.662801100557623, + "learning_rate": 7.277050273110408e-07, + "loss": 0.428, + "step": 9337 + }, + { + "epoch": 0.7583854462762933, + "grad_norm": 4.198195373303361, + "learning_rate": 7.272412736830431e-07, + "loss": 0.5499, + "step": 9338 + }, + { + "epoch": 0.758466661252335, + "grad_norm": 7.26119245955303, + "learning_rate": 7.26777642720567e-07, + "loss": 0.5308, + "step": 9339 + }, + { + "epoch": 0.7585478762283765, + "grad_norm": 6.6134602398961295, + "learning_rate": 7.263141344556924e-07, + "loss": 0.4514, + "step": 9340 + }, + { + "epoch": 0.7586290912044181, + "grad_norm": 3.9208951218117427, + "learning_rate": 7.258507489204935e-07, + "loss": 0.3762, + "step": 9341 + }, + { + "epoch": 0.7587103061804596, + "grad_norm": 7.76615893471226, + "learning_rate": 7.253874861470325e-07, + "loss": 0.5326, + "step": 9342 + }, + { + "epoch": 0.7587915211565013, + "grad_norm": 7.24244627028924, + "learning_rate": 7.24924346167366e-07, + "loss": 0.4045, + "step": 9343 + }, + { + "epoch": 0.7588727361325428, + "grad_norm": 3.373479639826732, + "learning_rate": 7.244613290135396e-07, + "loss": 0.4882, + "step": 9344 + }, + { + "epoch": 0.7589539511085844, + "grad_norm": 6.628941975131967, + "learning_rate": 7.239984347175932e-07, + "loss": 0.3442, + "step": 9345 + }, + { + "epoch": 0.759035166084626, + "grad_norm": 3.784372823794077, + "learning_rate": 7.235356633115559e-07, + "loss": 0.5905, + "step": 9346 + }, + { + "epoch": 0.7591163810606676, + "grad_norm": 4.27156618640782, + "learning_rate": 7.230730148274478e-07, + "loss": 0.3702, + "step": 9347 + }, + { + "epoch": 0.7591975960367092, + "grad_norm": 3.9436102310547327, + "learning_rate": 7.226104892972838e-07, + "loss": 0.3701, + "step": 9348 + }, + { + "epoch": 0.7592788110127507, + "grad_norm": 3.9798514443021196, + "learning_rate": 7.221480867530664e-07, + "loss": 0.4805, + "step": 9349 + }, + { + "epoch": 0.7593600259887924, + "grad_norm": 3.87348251530302, + "learning_rate": 7.216858072267924e-07, + "loss": 0.5473, + "step": 9350 + }, + { + "epoch": 0.7594412409648339, + "grad_norm": 5.915769976523819, + "learning_rate": 7.212236507504494e-07, + "loss": 0.5054, + "step": 9351 + }, + { + "epoch": 0.7595224559408755, + "grad_norm": 3.8732533424148605, + "learning_rate": 7.207616173560158e-07, + "loss": 0.4806, + "step": 9352 + }, + { + "epoch": 0.759603670916917, + "grad_norm": 6.11681123799206, + "learning_rate": 7.202997070754613e-07, + "loss": 0.46, + "step": 9353 + }, + { + "epoch": 0.7596848858929587, + "grad_norm": 8.365104395894681, + "learning_rate": 7.198379199407488e-07, + "loss": 0.5624, + "step": 9354 + }, + { + "epoch": 0.7597661008690002, + "grad_norm": 6.4866744161399685, + "learning_rate": 7.193762559838299e-07, + "loss": 0.5142, + "step": 9355 + }, + { + "epoch": 0.7598473158450418, + "grad_norm": 6.4484042472139675, + "learning_rate": 7.189147152366504e-07, + "loss": 0.3683, + "step": 9356 + }, + { + "epoch": 0.7599285308210834, + "grad_norm": 4.5783151146826935, + "learning_rate": 7.184532977311471e-07, + "loss": 0.4517, + "step": 9357 + }, + { + "epoch": 0.760009745797125, + "grad_norm": 4.142951500808283, + "learning_rate": 7.179920034992469e-07, + "loss": 0.4783, + "step": 9358 + }, + { + "epoch": 0.7600909607731666, + "grad_norm": 3.327558136006068, + "learning_rate": 7.175308325728689e-07, + "loss": 0.6403, + "step": 9359 + }, + { + "epoch": 0.7601721757492081, + "grad_norm": 7.732587779509069, + "learning_rate": 7.170697849839229e-07, + "loss": 0.4863, + "step": 9360 + }, + { + "epoch": 0.7602533907252498, + "grad_norm": 3.4261727551662866, + "learning_rate": 7.166088607643123e-07, + "loss": 0.4994, + "step": 9361 + }, + { + "epoch": 0.7603346057012913, + "grad_norm": 7.420461816144346, + "learning_rate": 7.161480599459297e-07, + "loss": 0.3715, + "step": 9362 + }, + { + "epoch": 0.7604158206773329, + "grad_norm": 4.177142639059444, + "learning_rate": 7.156873825606603e-07, + "loss": 0.5635, + "step": 9363 + }, + { + "epoch": 0.7604970356533745, + "grad_norm": 6.465161714985024, + "learning_rate": 7.152268286403813e-07, + "loss": 0.4047, + "step": 9364 + }, + { + "epoch": 0.7605782506294161, + "grad_norm": 6.173359562536338, + "learning_rate": 7.147663982169601e-07, + "loss": 0.4028, + "step": 9365 + }, + { + "epoch": 0.7606594656054576, + "grad_norm": 3.9455176413170583, + "learning_rate": 7.143060913222552e-07, + "loss": 0.6058, + "step": 9366 + }, + { + "epoch": 0.7607406805814992, + "grad_norm": 10.642099423017335, + "learning_rate": 7.138459079881188e-07, + "loss": 0.706, + "step": 9367 + }, + { + "epoch": 0.7608218955575408, + "grad_norm": 4.19417453033392, + "learning_rate": 7.133858482463918e-07, + "loss": 0.6098, + "step": 9368 + }, + { + "epoch": 0.7609031105335824, + "grad_norm": 4.253649965044964, + "learning_rate": 7.129259121289086e-07, + "loss": 0.4453, + "step": 9369 + }, + { + "epoch": 0.760984325509624, + "grad_norm": 3.668621601717135, + "learning_rate": 7.124660996674951e-07, + "loss": 0.5733, + "step": 9370 + }, + { + "epoch": 0.7610655404856655, + "grad_norm": 6.048356569124348, + "learning_rate": 7.12006410893967e-07, + "loss": 0.4285, + "step": 9371 + }, + { + "epoch": 0.7611467554617072, + "grad_norm": 4.317558436319294, + "learning_rate": 7.115468458401317e-07, + "loss": 0.6383, + "step": 9372 + }, + { + "epoch": 0.7612279704377487, + "grad_norm": 7.296251737717159, + "learning_rate": 7.110874045377902e-07, + "loss": 0.551, + "step": 9373 + }, + { + "epoch": 0.7613091854137903, + "grad_norm": 5.66808374273542, + "learning_rate": 7.106280870187326e-07, + "loss": 0.4665, + "step": 9374 + }, + { + "epoch": 0.7613904003898319, + "grad_norm": 3.897913983532991, + "learning_rate": 7.101688933147397e-07, + "loss": 0.5898, + "step": 9375 + }, + { + "epoch": 0.7614716153658735, + "grad_norm": 5.913213010761257, + "learning_rate": 7.097098234575883e-07, + "loss": 0.379, + "step": 9376 + }, + { + "epoch": 0.761552830341915, + "grad_norm": 8.586921010693551, + "learning_rate": 7.092508774790424e-07, + "loss": 0.3769, + "step": 9377 + }, + { + "epoch": 0.7616340453179566, + "grad_norm": 6.837790290025689, + "learning_rate": 7.087920554108582e-07, + "loss": 0.5177, + "step": 9378 + }, + { + "epoch": 0.7617152602939982, + "grad_norm": 4.809392070744878, + "learning_rate": 7.083333572847831e-07, + "loss": 0.4528, + "step": 9379 + }, + { + "epoch": 0.7617964752700398, + "grad_norm": 3.467857419045606, + "learning_rate": 7.078747831325583e-07, + "loss": 0.5457, + "step": 9380 + }, + { + "epoch": 0.7618776902460814, + "grad_norm": 4.495338193171126, + "learning_rate": 7.074163329859129e-07, + "loss": 0.3799, + "step": 9381 + }, + { + "epoch": 0.761958905222123, + "grad_norm": 4.00025930868917, + "learning_rate": 7.069580068765702e-07, + "loss": 0.3478, + "step": 9382 + }, + { + "epoch": 0.7620401201981646, + "grad_norm": 10.827446952884547, + "learning_rate": 7.064998048362448e-07, + "loss": 0.4943, + "step": 9383 + }, + { + "epoch": 0.7621213351742061, + "grad_norm": 5.064306842835891, + "learning_rate": 7.060417268966408e-07, + "loss": 0.4528, + "step": 9384 + }, + { + "epoch": 0.7622025501502477, + "grad_norm": 4.468211696665271, + "learning_rate": 7.055837730894541e-07, + "loss": 0.4532, + "step": 9385 + }, + { + "epoch": 0.7622837651262893, + "grad_norm": 6.800829039851487, + "learning_rate": 7.051259434463745e-07, + "loss": 0.347, + "step": 9386 + }, + { + "epoch": 0.7623649801023309, + "grad_norm": 6.208378537952931, + "learning_rate": 7.046682379990794e-07, + "loss": 0.4928, + "step": 9387 + }, + { + "epoch": 0.7624461950783724, + "grad_norm": 4.2620464174120425, + "learning_rate": 7.042106567792406e-07, + "loss": 0.5339, + "step": 9388 + }, + { + "epoch": 0.762527410054414, + "grad_norm": 5.564383633638113, + "learning_rate": 7.03753199818521e-07, + "loss": 0.4415, + "step": 9389 + }, + { + "epoch": 0.7626086250304556, + "grad_norm": 6.375756161119263, + "learning_rate": 7.032958671485734e-07, + "loss": 0.4055, + "step": 9390 + }, + { + "epoch": 0.7626898400064972, + "grad_norm": 6.484850680260223, + "learning_rate": 7.028386588010421e-07, + "loss": 0.5106, + "step": 9391 + }, + { + "epoch": 0.7627710549825388, + "grad_norm": 6.127381595989031, + "learning_rate": 7.023815748075651e-07, + "loss": 0.324, + "step": 9392 + }, + { + "epoch": 0.7628522699585804, + "grad_norm": 4.069418782141938, + "learning_rate": 7.019246151997694e-07, + "loss": 0.4221, + "step": 9393 + }, + { + "epoch": 0.762933484934622, + "grad_norm": 3.966681143690813, + "learning_rate": 7.014677800092734e-07, + "loss": 0.5576, + "step": 9394 + }, + { + "epoch": 0.7630146999106635, + "grad_norm": 6.497751818986687, + "learning_rate": 7.010110692676886e-07, + "loss": 0.4702, + "step": 9395 + }, + { + "epoch": 0.7630959148867051, + "grad_norm": 6.139900453454705, + "learning_rate": 7.005544830066172e-07, + "loss": 0.3902, + "step": 9396 + }, + { + "epoch": 0.7631771298627467, + "grad_norm": 5.005470766044911, + "learning_rate": 7.000980212576522e-07, + "loss": 0.4043, + "step": 9397 + }, + { + "epoch": 0.7632583448387883, + "grad_norm": 13.852454716609795, + "learning_rate": 6.996416840523776e-07, + "loss": 0.5992, + "step": 9398 + }, + { + "epoch": 0.7633395598148298, + "grad_norm": 4.487243506708304, + "learning_rate": 6.991854714223711e-07, + "loss": 0.4657, + "step": 9399 + }, + { + "epoch": 0.7634207747908714, + "grad_norm": 5.202920899583729, + "learning_rate": 6.987293833991984e-07, + "loss": 0.6265, + "step": 9400 + }, + { + "epoch": 0.763501989766913, + "grad_norm": 17.066516911339072, + "learning_rate": 6.982734200144192e-07, + "loss": 0.4925, + "step": 9401 + }, + { + "epoch": 0.7635832047429546, + "grad_norm": 3.6601744793382305, + "learning_rate": 6.978175812995847e-07, + "loss": 0.4923, + "step": 9402 + }, + { + "epoch": 0.7636644197189962, + "grad_norm": 6.162261867859467, + "learning_rate": 6.973618672862357e-07, + "loss": 0.3402, + "step": 9403 + }, + { + "epoch": 0.7637456346950378, + "grad_norm": 3.383805533605749, + "learning_rate": 6.969062780059041e-07, + "loss": 0.4892, + "step": 9404 + }, + { + "epoch": 0.7638268496710794, + "grad_norm": 5.034659251576166, + "learning_rate": 6.964508134901162e-07, + "loss": 0.4553, + "step": 9405 + }, + { + "epoch": 0.7639080646471209, + "grad_norm": 4.219393214106526, + "learning_rate": 6.959954737703872e-07, + "loss": 0.3549, + "step": 9406 + }, + { + "epoch": 0.7639892796231625, + "grad_norm": 7.125686847574229, + "learning_rate": 6.955402588782229e-07, + "loss": 0.4162, + "step": 9407 + }, + { + "epoch": 0.7640704945992041, + "grad_norm": 4.00897406634947, + "learning_rate": 6.950851688451224e-07, + "loss": 0.4606, + "step": 9408 + }, + { + "epoch": 0.7641517095752457, + "grad_norm": 5.368532207744499, + "learning_rate": 6.94630203702577e-07, + "loss": 0.6832, + "step": 9409 + }, + { + "epoch": 0.7642329245512872, + "grad_norm": 4.61391210253197, + "learning_rate": 6.941753634820658e-07, + "loss": 0.4458, + "step": 9410 + }, + { + "epoch": 0.7643141395273289, + "grad_norm": 10.571864997786388, + "learning_rate": 6.93720648215063e-07, + "loss": 0.3812, + "step": 9411 + }, + { + "epoch": 0.7643953545033704, + "grad_norm": 6.071699040314471, + "learning_rate": 6.932660579330317e-07, + "loss": 0.3526, + "step": 9412 + }, + { + "epoch": 0.764476569479412, + "grad_norm": 5.121163071457247, + "learning_rate": 6.928115926674265e-07, + "loss": 0.5468, + "step": 9413 + }, + { + "epoch": 0.7645577844554536, + "grad_norm": 4.5789958968091025, + "learning_rate": 6.923572524496946e-07, + "loss": 0.4688, + "step": 9414 + }, + { + "epoch": 0.7646389994314952, + "grad_norm": 4.61496141221473, + "learning_rate": 6.919030373112748e-07, + "loss": 0.538, + "step": 9415 + }, + { + "epoch": 0.7647202144075368, + "grad_norm": 5.530010793903644, + "learning_rate": 6.914489472835959e-07, + "loss": 0.3921, + "step": 9416 + }, + { + "epoch": 0.7648014293835783, + "grad_norm": 4.053580386916616, + "learning_rate": 6.909949823980772e-07, + "loss": 0.4411, + "step": 9417 + }, + { + "epoch": 0.76488264435962, + "grad_norm": 5.361420424547812, + "learning_rate": 6.905411426861322e-07, + "loss": 0.4674, + "step": 9418 + }, + { + "epoch": 0.7649638593356615, + "grad_norm": 6.357458261429573, + "learning_rate": 6.900874281791639e-07, + "loss": 0.4666, + "step": 9419 + }, + { + "epoch": 0.7650450743117031, + "grad_norm": 6.195545553218025, + "learning_rate": 6.89633838908566e-07, + "loss": 0.4103, + "step": 9420 + }, + { + "epoch": 0.7651262892877446, + "grad_norm": 5.87439782429603, + "learning_rate": 6.891803749057255e-07, + "loss": 0.415, + "step": 9421 + }, + { + "epoch": 0.7652075042637863, + "grad_norm": 3.5630192542232497, + "learning_rate": 6.887270362020199e-07, + "loss": 0.4371, + "step": 9422 + }, + { + "epoch": 0.7652887192398278, + "grad_norm": 6.836068046454834, + "learning_rate": 6.882738228288166e-07, + "loss": 0.3986, + "step": 9423 + }, + { + "epoch": 0.7653699342158694, + "grad_norm": 6.446997290383441, + "learning_rate": 6.87820734817477e-07, + "loss": 0.4896, + "step": 9424 + }, + { + "epoch": 0.765451149191911, + "grad_norm": 4.358114968607592, + "learning_rate": 6.873677721993518e-07, + "loss": 0.514, + "step": 9425 + }, + { + "epoch": 0.7655323641679526, + "grad_norm": 8.732219383382438, + "learning_rate": 6.86914935005783e-07, + "loss": 0.5265, + "step": 9426 + }, + { + "epoch": 0.7656135791439942, + "grad_norm": 4.310805179732659, + "learning_rate": 6.864622232681048e-07, + "loss": 0.4076, + "step": 9427 + }, + { + "epoch": 0.7656947941200357, + "grad_norm": 4.045216285940855, + "learning_rate": 6.860096370176436e-07, + "loss": 0.5051, + "step": 9428 + }, + { + "epoch": 0.7657760090960773, + "grad_norm": 7.3004627003203435, + "learning_rate": 6.855571762857144e-07, + "loss": 0.5137, + "step": 9429 + }, + { + "epoch": 0.7658572240721189, + "grad_norm": 4.355642197018315, + "learning_rate": 6.851048411036265e-07, + "loss": 0.4596, + "step": 9430 + }, + { + "epoch": 0.7659384390481605, + "grad_norm": 4.107096289312408, + "learning_rate": 6.846526315026783e-07, + "loss": 0.5929, + "step": 9431 + }, + { + "epoch": 0.766019654024202, + "grad_norm": 4.55847735238035, + "learning_rate": 6.842005475141606e-07, + "loss": 0.5062, + "step": 9432 + }, + { + "epoch": 0.7661008690002437, + "grad_norm": 4.084279279942593, + "learning_rate": 6.837485891693541e-07, + "loss": 0.5043, + "step": 9433 + }, + { + "epoch": 0.7661820839762852, + "grad_norm": 13.22441226336881, + "learning_rate": 6.83296756499533e-07, + "loss": 0.4094, + "step": 9434 + }, + { + "epoch": 0.7662632989523268, + "grad_norm": 4.334505119567936, + "learning_rate": 6.828450495359623e-07, + "loss": 0.4747, + "step": 9435 + }, + { + "epoch": 0.7663445139283684, + "grad_norm": 5.930691768024282, + "learning_rate": 6.823934683098963e-07, + "loss": 0.6257, + "step": 9436 + }, + { + "epoch": 0.76642572890441, + "grad_norm": 3.3704120793280197, + "learning_rate": 6.819420128525834e-07, + "loss": 0.3181, + "step": 9437 + }, + { + "epoch": 0.7665069438804516, + "grad_norm": 3.842942498154423, + "learning_rate": 6.814906831952611e-07, + "loss": 0.4773, + "step": 9438 + }, + { + "epoch": 0.7665881588564931, + "grad_norm": 8.895568429556429, + "learning_rate": 6.810394793691585e-07, + "loss": 0.3918, + "step": 9439 + }, + { + "epoch": 0.7666693738325348, + "grad_norm": 6.8916662279772565, + "learning_rate": 6.805884014054975e-07, + "loss": 0.4543, + "step": 9440 + }, + { + "epoch": 0.7667505888085763, + "grad_norm": 4.932759072758281, + "learning_rate": 6.801374493354907e-07, + "loss": 0.555, + "step": 9441 + }, + { + "epoch": 0.7668318037846179, + "grad_norm": 3.165718589290986, + "learning_rate": 6.796866231903402e-07, + "loss": 0.5693, + "step": 9442 + }, + { + "epoch": 0.7669130187606594, + "grad_norm": 5.661881562952782, + "learning_rate": 6.792359230012418e-07, + "loss": 0.3955, + "step": 9443 + }, + { + "epoch": 0.7669942337367011, + "grad_norm": 5.857023328763498, + "learning_rate": 6.787853487993817e-07, + "loss": 0.2898, + "step": 9444 + }, + { + "epoch": 0.7670754487127426, + "grad_norm": 6.46410699371152, + "learning_rate": 6.783349006159359e-07, + "loss": 0.326, + "step": 9445 + }, + { + "epoch": 0.7671566636887842, + "grad_norm": 6.9335177221214614, + "learning_rate": 6.778845784820739e-07, + "loss": 0.4844, + "step": 9446 + }, + { + "epoch": 0.7672378786648258, + "grad_norm": 5.439529483084923, + "learning_rate": 6.774343824289567e-07, + "loss": 0.6114, + "step": 9447 + }, + { + "epoch": 0.7673190936408674, + "grad_norm": 5.645470096440396, + "learning_rate": 6.769843124877343e-07, + "loss": 0.5204, + "step": 9448 + }, + { + "epoch": 0.767400308616909, + "grad_norm": 24.655262909769625, + "learning_rate": 6.765343686895484e-07, + "loss": 0.4941, + "step": 9449 + }, + { + "epoch": 0.7674815235929505, + "grad_norm": 6.241765051887523, + "learning_rate": 6.760845510655345e-07, + "loss": 0.3861, + "step": 9450 + }, + { + "epoch": 0.7675627385689922, + "grad_norm": 8.035908170373448, + "learning_rate": 6.756348596468168e-07, + "loss": 0.5332, + "step": 9451 + }, + { + "epoch": 0.7676439535450337, + "grad_norm": 4.6392581562003405, + "learning_rate": 6.751852944645107e-07, + "loss": 0.4359, + "step": 9452 + }, + { + "epoch": 0.7677251685210753, + "grad_norm": 6.355510758018541, + "learning_rate": 6.747358555497244e-07, + "loss": 0.4006, + "step": 9453 + }, + { + "epoch": 0.7678063834971168, + "grad_norm": 5.191185254070647, + "learning_rate": 6.742865429335576e-07, + "loss": 0.6034, + "step": 9454 + }, + { + "epoch": 0.7678875984731585, + "grad_norm": 4.336515041136279, + "learning_rate": 6.738373566470991e-07, + "loss": 0.3565, + "step": 9455 + }, + { + "epoch": 0.7679688134492, + "grad_norm": 7.028294785077958, + "learning_rate": 6.733882967214312e-07, + "loss": 0.4001, + "step": 9456 + }, + { + "epoch": 0.7680500284252416, + "grad_norm": 3.345393934309023, + "learning_rate": 6.729393631876257e-07, + "loss": 0.4984, + "step": 9457 + }, + { + "epoch": 0.7681312434012832, + "grad_norm": 4.618093066401069, + "learning_rate": 6.724905560767464e-07, + "loss": 0.3027, + "step": 9458 + }, + { + "epoch": 0.7682124583773248, + "grad_norm": 5.511844154772235, + "learning_rate": 6.720418754198485e-07, + "loss": 0.3492, + "step": 9459 + }, + { + "epoch": 0.7682936733533664, + "grad_norm": 4.512587186638037, + "learning_rate": 6.715933212479791e-07, + "loss": 0.3126, + "step": 9460 + }, + { + "epoch": 0.7683748883294079, + "grad_norm": 3.8768216178706996, + "learning_rate": 6.711448935921744e-07, + "loss": 0.4618, + "step": 9461 + }, + { + "epoch": 0.7684561033054496, + "grad_norm": 4.849694261108834, + "learning_rate": 6.706965924834649e-07, + "loss": 0.3315, + "step": 9462 + }, + { + "epoch": 0.7685373182814911, + "grad_norm": 4.801051258082802, + "learning_rate": 6.702484179528699e-07, + "loss": 0.5456, + "step": 9463 + }, + { + "epoch": 0.7686185332575327, + "grad_norm": 9.352209783795086, + "learning_rate": 6.698003700313993e-07, + "loss": 0.427, + "step": 9464 + }, + { + "epoch": 0.7686997482335742, + "grad_norm": 7.643828934266029, + "learning_rate": 6.69352448750058e-07, + "loss": 0.7298, + "step": 9465 + }, + { + "epoch": 0.7687809632096159, + "grad_norm": 5.286073745430146, + "learning_rate": 6.689046541398378e-07, + "loss": 0.5492, + "step": 9466 + }, + { + "epoch": 0.7688621781856574, + "grad_norm": 7.104403560909407, + "learning_rate": 6.684569862317255e-07, + "loss": 0.4082, + "step": 9467 + }, + { + "epoch": 0.768943393161699, + "grad_norm": 21.80499840324673, + "learning_rate": 6.680094450566957e-07, + "loss": 0.4536, + "step": 9468 + }, + { + "epoch": 0.7690246081377407, + "grad_norm": 9.81351894485181, + "learning_rate": 6.675620306457172e-07, + "loss": 0.4041, + "step": 9469 + }, + { + "epoch": 0.7691058231137822, + "grad_norm": 5.220882981566686, + "learning_rate": 6.671147430297481e-07, + "loss": 0.4332, + "step": 9470 + }, + { + "epoch": 0.7691870380898238, + "grad_norm": 5.429857205093562, + "learning_rate": 6.666675822397378e-07, + "loss": 0.4061, + "step": 9471 + }, + { + "epoch": 0.7692682530658653, + "grad_norm": 8.052028153479819, + "learning_rate": 6.662205483066281e-07, + "loss": 0.326, + "step": 9472 + }, + { + "epoch": 0.769349468041907, + "grad_norm": 4.655903640053554, + "learning_rate": 6.65773641261352e-07, + "loss": 0.469, + "step": 9473 + }, + { + "epoch": 0.7694306830179485, + "grad_norm": 6.279574614073089, + "learning_rate": 6.653268611348315e-07, + "loss": 0.3736, + "step": 9474 + }, + { + "epoch": 0.7695118979939901, + "grad_norm": 4.349805162366064, + "learning_rate": 6.64880207957983e-07, + "loss": 0.4281, + "step": 9475 + }, + { + "epoch": 0.7695931129700316, + "grad_norm": 4.485659687329684, + "learning_rate": 6.644336817617122e-07, + "loss": 0.4795, + "step": 9476 + }, + { + "epoch": 0.7696743279460733, + "grad_norm": 5.1817758651078885, + "learning_rate": 6.63987282576915e-07, + "loss": 0.4077, + "step": 9477 + }, + { + "epoch": 0.7697555429221148, + "grad_norm": 3.6533998487921298, + "learning_rate": 6.635410104344819e-07, + "loss": 0.5877, + "step": 9478 + }, + { + "epoch": 0.7698367578981564, + "grad_norm": 5.823071091152919, + "learning_rate": 6.630948653652905e-07, + "loss": 0.4759, + "step": 9479 + }, + { + "epoch": 0.769917972874198, + "grad_norm": 6.866234496364005, + "learning_rate": 6.62648847400213e-07, + "loss": 0.4428, + "step": 9480 + }, + { + "epoch": 0.7699991878502396, + "grad_norm": 5.054351987364898, + "learning_rate": 6.622029565701118e-07, + "loss": 0.4417, + "step": 9481 + }, + { + "epoch": 0.7700804028262812, + "grad_norm": 4.79929266732889, + "learning_rate": 6.617571929058397e-07, + "loss": 0.4682, + "step": 9482 + }, + { + "epoch": 0.7701616178023227, + "grad_norm": 4.783027204950755, + "learning_rate": 6.613115564382403e-07, + "loss": 0.402, + "step": 9483 + }, + { + "epoch": 0.7702428327783644, + "grad_norm": 11.930675421175312, + "learning_rate": 6.608660471981509e-07, + "loss": 0.4791, + "step": 9484 + }, + { + "epoch": 0.7703240477544059, + "grad_norm": 6.054291067646692, + "learning_rate": 6.604206652163967e-07, + "loss": 0.4235, + "step": 9485 + }, + { + "epoch": 0.7704052627304475, + "grad_norm": 7.228203275275898, + "learning_rate": 6.599754105237974e-07, + "loss": 0.4832, + "step": 9486 + }, + { + "epoch": 0.770486477706489, + "grad_norm": 3.7391869318420285, + "learning_rate": 6.595302831511607e-07, + "loss": 0.537, + "step": 9487 + }, + { + "epoch": 0.7705676926825307, + "grad_norm": 5.637019086673706, + "learning_rate": 6.590852831292885e-07, + "loss": 0.6264, + "step": 9488 + }, + { + "epoch": 0.7706489076585722, + "grad_norm": 7.91711166378353, + "learning_rate": 6.586404104889721e-07, + "loss": 0.477, + "step": 9489 + }, + { + "epoch": 0.7707301226346138, + "grad_norm": 5.980422311755343, + "learning_rate": 6.58195665260993e-07, + "loss": 0.433, + "step": 9490 + }, + { + "epoch": 0.7708113376106555, + "grad_norm": 10.886812979952378, + "learning_rate": 6.577510474761272e-07, + "loss": 0.4323, + "step": 9491 + }, + { + "epoch": 0.770892552586697, + "grad_norm": 5.242170156688564, + "learning_rate": 6.573065571651383e-07, + "loss": 0.3652, + "step": 9492 + }, + { + "epoch": 0.7709737675627386, + "grad_norm": 5.032479849055025, + "learning_rate": 6.56862194358783e-07, + "loss": 0.4055, + "step": 9493 + }, + { + "epoch": 0.7710549825387801, + "grad_norm": 9.525750208818708, + "learning_rate": 6.5641795908781e-07, + "loss": 0.5108, + "step": 9494 + }, + { + "epoch": 0.7711361975148218, + "grad_norm": 4.492371142698525, + "learning_rate": 6.559738513829572e-07, + "loss": 0.679, + "step": 9495 + }, + { + "epoch": 0.7712174124908633, + "grad_norm": 5.27190852592138, + "learning_rate": 6.555298712749538e-07, + "loss": 0.5479, + "step": 9496 + }, + { + "epoch": 0.7712986274669049, + "grad_norm": 5.955473036514189, + "learning_rate": 6.550860187945227e-07, + "loss": 0.4585, + "step": 9497 + }, + { + "epoch": 0.7713798424429464, + "grad_norm": 5.382341493930171, + "learning_rate": 6.546422939723738e-07, + "loss": 0.4537, + "step": 9498 + }, + { + "epoch": 0.7714610574189881, + "grad_norm": 5.842094824500748, + "learning_rate": 6.541986968392119e-07, + "loss": 0.4458, + "step": 9499 + }, + { + "epoch": 0.7715422723950296, + "grad_norm": 4.8906020204774165, + "learning_rate": 6.537552274257322e-07, + "loss": 0.4932, + "step": 9500 + }, + { + "epoch": 0.7716234873710712, + "grad_norm": 8.934213855225288, + "learning_rate": 6.533118857626194e-07, + "loss": 0.5223, + "step": 9501 + }, + { + "epoch": 0.7717047023471129, + "grad_norm": 4.746748520348421, + "learning_rate": 6.52868671880551e-07, + "loss": 0.5565, + "step": 9502 + }, + { + "epoch": 0.7717859173231544, + "grad_norm": 8.532705058823186, + "learning_rate": 6.524255858101938e-07, + "loss": 0.5495, + "step": 9503 + }, + { + "epoch": 0.771867132299196, + "grad_norm": 10.865485946925974, + "learning_rate": 6.519826275822086e-07, + "loss": 0.5132, + "step": 9504 + }, + { + "epoch": 0.7719483472752375, + "grad_norm": 3.470727903343789, + "learning_rate": 6.515397972272444e-07, + "loss": 0.4055, + "step": 9505 + }, + { + "epoch": 0.7720295622512792, + "grad_norm": 4.190361234303148, + "learning_rate": 6.510970947759434e-07, + "loss": 0.4745, + "step": 9506 + }, + { + "epoch": 0.7721107772273207, + "grad_norm": 4.1269789461455515, + "learning_rate": 6.50654520258939e-07, + "loss": 0.4198, + "step": 9507 + }, + { + "epoch": 0.7721919922033623, + "grad_norm": 5.796517123286705, + "learning_rate": 6.502120737068543e-07, + "loss": 0.5517, + "step": 9508 + }, + { + "epoch": 0.7722732071794038, + "grad_norm": 3.9798948730265002, + "learning_rate": 6.497697551503032e-07, + "loss": 0.4771, + "step": 9509 + }, + { + "epoch": 0.7723544221554455, + "grad_norm": 3.3670043032275445, + "learning_rate": 6.493275646198941e-07, + "loss": 0.507, + "step": 9510 + }, + { + "epoch": 0.772435637131487, + "grad_norm": 4.789052775696503, + "learning_rate": 6.488855021462218e-07, + "loss": 0.651, + "step": 9511 + }, + { + "epoch": 0.7725168521075286, + "grad_norm": 4.574449211149501, + "learning_rate": 6.484435677598761e-07, + "loss": 0.3833, + "step": 9512 + }, + { + "epoch": 0.7725980670835703, + "grad_norm": 5.744708779873541, + "learning_rate": 6.480017614914369e-07, + "loss": 0.52, + "step": 9513 + }, + { + "epoch": 0.7726792820596118, + "grad_norm": 6.185709557761946, + "learning_rate": 6.475600833714743e-07, + "loss": 0.5064, + "step": 9514 + }, + { + "epoch": 0.7727604970356534, + "grad_norm": 6.093796766128401, + "learning_rate": 6.471185334305491e-07, + "loss": 0.5049, + "step": 9515 + }, + { + "epoch": 0.7728417120116949, + "grad_norm": 5.325643999422, + "learning_rate": 6.466771116992162e-07, + "loss": 0.4234, + "step": 9516 + }, + { + "epoch": 0.7729229269877366, + "grad_norm": 5.854284079602375, + "learning_rate": 6.462358182080175e-07, + "loss": 0.504, + "step": 9517 + }, + { + "epoch": 0.7730041419637781, + "grad_norm": 6.5341398235283865, + "learning_rate": 6.457946529874895e-07, + "loss": 0.6357, + "step": 9518 + }, + { + "epoch": 0.7730853569398197, + "grad_norm": 3.7271101675803027, + "learning_rate": 6.453536160681592e-07, + "loss": 0.484, + "step": 9519 + }, + { + "epoch": 0.7731665719158612, + "grad_norm": 3.610155546754207, + "learning_rate": 6.449127074805428e-07, + "loss": 0.464, + "step": 9520 + }, + { + "epoch": 0.7732477868919029, + "grad_norm": 4.384516099481292, + "learning_rate": 6.444719272551491e-07, + "loss": 0.4131, + "step": 9521 + }, + { + "epoch": 0.7733290018679444, + "grad_norm": 7.395867107582111, + "learning_rate": 6.440312754224773e-07, + "loss": 0.489, + "step": 9522 + }, + { + "epoch": 0.773410216843986, + "grad_norm": 3.52191839672315, + "learning_rate": 6.435907520130191e-07, + "loss": 0.3974, + "step": 9523 + }, + { + "epoch": 0.7734914318200277, + "grad_norm": 5.735438136851437, + "learning_rate": 6.431503570572554e-07, + "loss": 0.3681, + "step": 9524 + }, + { + "epoch": 0.7735726467960692, + "grad_norm": 4.138098595663441, + "learning_rate": 6.427100905856598e-07, + "loss": 0.46, + "step": 9525 + }, + { + "epoch": 0.7736538617721108, + "grad_norm": 4.627439119735813, + "learning_rate": 6.422699526286969e-07, + "loss": 0.5792, + "step": 9526 + }, + { + "epoch": 0.7737350767481523, + "grad_norm": 4.326175948208715, + "learning_rate": 6.418299432168215e-07, + "loss": 0.6043, + "step": 9527 + }, + { + "epoch": 0.773816291724194, + "grad_norm": 4.7243209100130334, + "learning_rate": 6.413900623804792e-07, + "loss": 0.3815, + "step": 9528 + }, + { + "epoch": 0.7738975067002355, + "grad_norm": 6.9628262078532535, + "learning_rate": 6.409503101501086e-07, + "loss": 0.5016, + "step": 9529 + }, + { + "epoch": 0.7739787216762771, + "grad_norm": 3.781446710484677, + "learning_rate": 6.405106865561367e-07, + "loss": 0.6054, + "step": 9530 + }, + { + "epoch": 0.7740599366523186, + "grad_norm": 8.30986430715804, + "learning_rate": 6.400711916289846e-07, + "loss": 0.443, + "step": 9531 + }, + { + "epoch": 0.7741411516283603, + "grad_norm": 4.838062384790877, + "learning_rate": 6.396318253990628e-07, + "loss": 0.4772, + "step": 9532 + }, + { + "epoch": 0.7742223666044018, + "grad_norm": 6.597678306183141, + "learning_rate": 6.391925878967728e-07, + "loss": 0.4941, + "step": 9533 + }, + { + "epoch": 0.7743035815804434, + "grad_norm": 4.6205393788485845, + "learning_rate": 6.387534791525072e-07, + "loss": 0.5037, + "step": 9534 + }, + { + "epoch": 0.7743847965564851, + "grad_norm": 4.269745258902035, + "learning_rate": 6.383144991966508e-07, + "loss": 0.7124, + "step": 9535 + }, + { + "epoch": 0.7744660115325266, + "grad_norm": 7.2941195850877145, + "learning_rate": 6.378756480595782e-07, + "loss": 0.4697, + "step": 9536 + }, + { + "epoch": 0.7745472265085682, + "grad_norm": 8.30813779565925, + "learning_rate": 6.374369257716548e-07, + "loss": 0.4272, + "step": 9537 + }, + { + "epoch": 0.7746284414846097, + "grad_norm": 5.268713325301926, + "learning_rate": 6.369983323632389e-07, + "loss": 0.3696, + "step": 9538 + }, + { + "epoch": 0.7747096564606514, + "grad_norm": 6.353077785001505, + "learning_rate": 6.365598678646793e-07, + "loss": 0.499, + "step": 9539 + }, + { + "epoch": 0.7747908714366929, + "grad_norm": 7.787690788411613, + "learning_rate": 6.361215323063144e-07, + "loss": 0.517, + "step": 9540 + }, + { + "epoch": 0.7748720864127345, + "grad_norm": 6.712726861513404, + "learning_rate": 6.356833257184747e-07, + "loss": 0.3541, + "step": 9541 + }, + { + "epoch": 0.774953301388776, + "grad_norm": 7.036101207768536, + "learning_rate": 6.352452481314825e-07, + "loss": 0.5433, + "step": 9542 + }, + { + "epoch": 0.7750345163648177, + "grad_norm": 4.440795097972874, + "learning_rate": 6.348072995756497e-07, + "loss": 0.4672, + "step": 9543 + }, + { + "epoch": 0.7751157313408592, + "grad_norm": 6.623356794323855, + "learning_rate": 6.3436948008128e-07, + "loss": 0.479, + "step": 9544 + }, + { + "epoch": 0.7751969463169008, + "grad_norm": 3.870234328714397, + "learning_rate": 6.339317896786693e-07, + "loss": 0.5124, + "step": 9545 + }, + { + "epoch": 0.7752781612929425, + "grad_norm": 7.935205106400605, + "learning_rate": 6.33494228398103e-07, + "loss": 0.5108, + "step": 9546 + }, + { + "epoch": 0.775359376268984, + "grad_norm": 4.477009003593662, + "learning_rate": 6.33056796269857e-07, + "loss": 0.5893, + "step": 9547 + }, + { + "epoch": 0.7754405912450256, + "grad_norm": 6.221717457872238, + "learning_rate": 6.326194933242006e-07, + "loss": 0.5316, + "step": 9548 + }, + { + "epoch": 0.7755218062210671, + "grad_norm": 5.507373178005721, + "learning_rate": 6.321823195913924e-07, + "loss": 0.5283, + "step": 9549 + }, + { + "epoch": 0.7756030211971088, + "grad_norm": 5.749251136653241, + "learning_rate": 6.317452751016815e-07, + "loss": 0.5322, + "step": 9550 + }, + { + "epoch": 0.7756842361731503, + "grad_norm": 4.194072441757716, + "learning_rate": 6.313083598853101e-07, + "loss": 0.5895, + "step": 9551 + }, + { + "epoch": 0.7757654511491919, + "grad_norm": 5.534135119543315, + "learning_rate": 6.308715739725108e-07, + "loss": 0.3903, + "step": 9552 + }, + { + "epoch": 0.7758466661252335, + "grad_norm": 5.890217163199354, + "learning_rate": 6.30434917393506e-07, + "loss": 0.4641, + "step": 9553 + }, + { + "epoch": 0.7759278811012751, + "grad_norm": 4.610182903829491, + "learning_rate": 6.299983901785109e-07, + "loss": 0.5238, + "step": 9554 + }, + { + "epoch": 0.7760090960773166, + "grad_norm": 5.475473491313324, + "learning_rate": 6.295619923577303e-07, + "loss": 0.4373, + "step": 9555 + }, + { + "epoch": 0.7760903110533582, + "grad_norm": 3.7085653093083364, + "learning_rate": 6.291257239613599e-07, + "loss": 0.5475, + "step": 9556 + }, + { + "epoch": 0.7761715260293999, + "grad_norm": 5.431007075610515, + "learning_rate": 6.286895850195882e-07, + "loss": 0.521, + "step": 9557 + }, + { + "epoch": 0.7762527410054414, + "grad_norm": 7.326528629894263, + "learning_rate": 6.28253575562594e-07, + "loss": 0.5119, + "step": 9558 + }, + { + "epoch": 0.776333955981483, + "grad_norm": 6.69328051954586, + "learning_rate": 6.278176956205462e-07, + "loss": 0.468, + "step": 9559 + }, + { + "epoch": 0.7764151709575245, + "grad_norm": 7.6181081748087776, + "learning_rate": 6.273819452236049e-07, + "loss": 0.4013, + "step": 9560 + }, + { + "epoch": 0.7764963859335662, + "grad_norm": 5.517368857197591, + "learning_rate": 6.269463244019231e-07, + "loss": 0.4919, + "step": 9561 + }, + { + "epoch": 0.7765776009096077, + "grad_norm": 5.2478286059509145, + "learning_rate": 6.265108331856423e-07, + "loss": 0.4883, + "step": 9562 + }, + { + "epoch": 0.7766588158856493, + "grad_norm": 8.241994509786712, + "learning_rate": 6.260754716048961e-07, + "loss": 0.4527, + "step": 9563 + }, + { + "epoch": 0.7767400308616909, + "grad_norm": 5.796864584164602, + "learning_rate": 6.256402396898095e-07, + "loss": 0.4505, + "step": 9564 + }, + { + "epoch": 0.7768212458377325, + "grad_norm": 5.288449361519552, + "learning_rate": 6.252051374704992e-07, + "loss": 0.5593, + "step": 9565 + }, + { + "epoch": 0.776902460813774, + "grad_norm": 4.215256696565076, + "learning_rate": 6.247701649770707e-07, + "loss": 0.4833, + "step": 9566 + }, + { + "epoch": 0.7769836757898156, + "grad_norm": 4.625763848409002, + "learning_rate": 6.243353222396229e-07, + "loss": 0.4169, + "step": 9567 + }, + { + "epoch": 0.7770648907658573, + "grad_norm": 7.386761076675003, + "learning_rate": 6.239006092882438e-07, + "loss": 0.5572, + "step": 9568 + }, + { + "epoch": 0.7771461057418988, + "grad_norm": 3.960429753595598, + "learning_rate": 6.234660261530126e-07, + "loss": 0.448, + "step": 9569 + }, + { + "epoch": 0.7772273207179404, + "grad_norm": 6.218816667980998, + "learning_rate": 6.23031572864001e-07, + "loss": 0.4399, + "step": 9570 + }, + { + "epoch": 0.777308535693982, + "grad_norm": 11.37451355769962, + "learning_rate": 6.225972494512719e-07, + "loss": 0.4474, + "step": 9571 + }, + { + "epoch": 0.7773897506700236, + "grad_norm": 4.63095111539401, + "learning_rate": 6.22163055944876e-07, + "loss": 0.4599, + "step": 9572 + }, + { + "epoch": 0.7774709656460651, + "grad_norm": 4.637853908423072, + "learning_rate": 6.217289923748592e-07, + "loss": 0.3144, + "step": 9573 + }, + { + "epoch": 0.7775521806221067, + "grad_norm": 7.081685491650372, + "learning_rate": 6.212950587712557e-07, + "loss": 0.6146, + "step": 9574 + }, + { + "epoch": 0.7776333955981483, + "grad_norm": 4.122348761014315, + "learning_rate": 6.20861255164091e-07, + "loss": 0.5314, + "step": 9575 + }, + { + "epoch": 0.7777146105741899, + "grad_norm": 5.74857146041133, + "learning_rate": 6.204275815833807e-07, + "loss": 0.4767, + "step": 9576 + }, + { + "epoch": 0.7777958255502314, + "grad_norm": 5.874273219932277, + "learning_rate": 6.19994038059136e-07, + "loss": 0.4707, + "step": 9577 + }, + { + "epoch": 0.777877040526273, + "grad_norm": 6.134515006712459, + "learning_rate": 6.19560624621354e-07, + "loss": 0.5144, + "step": 9578 + }, + { + "epoch": 0.7779582555023147, + "grad_norm": 5.019484047549089, + "learning_rate": 6.191273413000237e-07, + "loss": 0.3622, + "step": 9579 + }, + { + "epoch": 0.7780394704783562, + "grad_norm": 4.813176127586792, + "learning_rate": 6.186941881251279e-07, + "loss": 0.3609, + "step": 9580 + }, + { + "epoch": 0.7781206854543978, + "grad_norm": 6.957964558660779, + "learning_rate": 6.182611651266376e-07, + "loss": 0.4625, + "step": 9581 + }, + { + "epoch": 0.7782019004304394, + "grad_norm": 4.786938388125974, + "learning_rate": 6.17828272334515e-07, + "loss": 0.4196, + "step": 9582 + }, + { + "epoch": 0.778283115406481, + "grad_norm": 14.455541969389369, + "learning_rate": 6.173955097787149e-07, + "loss": 0.3105, + "step": 9583 + }, + { + "epoch": 0.7783643303825225, + "grad_norm": 6.015474094967977, + "learning_rate": 6.169628774891826e-07, + "loss": 0.6127, + "step": 9584 + }, + { + "epoch": 0.7784455453585641, + "grad_norm": 7.69183168267593, + "learning_rate": 6.165303754958524e-07, + "loss": 0.5329, + "step": 9585 + }, + { + "epoch": 0.7785267603346057, + "grad_norm": 7.494666404519941, + "learning_rate": 6.160980038286529e-07, + "loss": 0.3892, + "step": 9586 + }, + { + "epoch": 0.7786079753106473, + "grad_norm": 3.6352807891152508, + "learning_rate": 6.156657625175011e-07, + "loss": 0.4718, + "step": 9587 + }, + { + "epoch": 0.7786891902866888, + "grad_norm": 3.6374040934237066, + "learning_rate": 6.152336515923052e-07, + "loss": 0.49, + "step": 9588 + }, + { + "epoch": 0.7787704052627304, + "grad_norm": 4.790151735557977, + "learning_rate": 6.148016710829654e-07, + "loss": 0.5964, + "step": 9589 + }, + { + "epoch": 0.7788516202387721, + "grad_norm": 6.6959826045900614, + "learning_rate": 6.143698210193738e-07, + "loss": 0.7207, + "step": 9590 + }, + { + "epoch": 0.7789328352148136, + "grad_norm": 5.021956045573845, + "learning_rate": 6.139381014314108e-07, + "loss": 0.4336, + "step": 9591 + }, + { + "epoch": 0.7790140501908552, + "grad_norm": 8.574991586907696, + "learning_rate": 6.135065123489486e-07, + "loss": 0.4282, + "step": 9592 + }, + { + "epoch": 0.7790952651668968, + "grad_norm": 4.869738833833407, + "learning_rate": 6.130750538018524e-07, + "loss": 0.5189, + "step": 9593 + }, + { + "epoch": 0.7791764801429384, + "grad_norm": 5.876692632158238, + "learning_rate": 6.12643725819976e-07, + "loss": 0.4978, + "step": 9594 + }, + { + "epoch": 0.7792576951189799, + "grad_norm": 7.302404610138221, + "learning_rate": 6.122125284331646e-07, + "loss": 0.5031, + "step": 9595 + }, + { + "epoch": 0.7793389100950215, + "grad_norm": 6.239496444525358, + "learning_rate": 6.117814616712548e-07, + "loss": 0.4399, + "step": 9596 + }, + { + "epoch": 0.7794201250710631, + "grad_norm": 4.341820954785043, + "learning_rate": 6.113505255640756e-07, + "loss": 0.4661, + "step": 9597 + }, + { + "epoch": 0.7795013400471047, + "grad_norm": 4.82168987167205, + "learning_rate": 6.109197201414438e-07, + "loss": 0.4285, + "step": 9598 + }, + { + "epoch": 0.7795825550231462, + "grad_norm": 5.620360996597726, + "learning_rate": 6.104890454331702e-07, + "loss": 0.6044, + "step": 9599 + }, + { + "epoch": 0.7796637699991879, + "grad_norm": 4.809738135444571, + "learning_rate": 6.100585014690547e-07, + "loss": 0.6426, + "step": 9600 + }, + { + "epoch": 0.7797449849752295, + "grad_norm": 3.621342250605208, + "learning_rate": 6.096280882788874e-07, + "loss": 0.5404, + "step": 9601 + }, + { + "epoch": 0.779826199951271, + "grad_norm": 8.468527887913863, + "learning_rate": 6.091978058924522e-07, + "loss": 0.3656, + "step": 9602 + }, + { + "epoch": 0.7799074149273126, + "grad_norm": 3.464489359474417, + "learning_rate": 6.087676543395224e-07, + "loss": 0.6374, + "step": 9603 + }, + { + "epoch": 0.7799886299033542, + "grad_norm": 5.245897271466931, + "learning_rate": 6.083376336498608e-07, + "loss": 0.5772, + "step": 9604 + }, + { + "epoch": 0.7800698448793958, + "grad_norm": 6.62332037882836, + "learning_rate": 6.079077438532246e-07, + "loss": 0.5533, + "step": 9605 + }, + { + "epoch": 0.7801510598554373, + "grad_norm": 5.690296444200083, + "learning_rate": 6.074779849793585e-07, + "loss": 0.5408, + "step": 9606 + }, + { + "epoch": 0.780232274831479, + "grad_norm": 4.804271179305116, + "learning_rate": 6.07048357057999e-07, + "loss": 0.558, + "step": 9607 + }, + { + "epoch": 0.7803134898075205, + "grad_norm": 6.074710914942616, + "learning_rate": 6.066188601188757e-07, + "loss": 0.5146, + "step": 9608 + }, + { + "epoch": 0.7803947047835621, + "grad_norm": 6.506228847573674, + "learning_rate": 6.061894941917062e-07, + "loss": 0.5551, + "step": 9609 + }, + { + "epoch": 0.7804759197596036, + "grad_norm": 5.035663800312012, + "learning_rate": 6.057602593062015e-07, + "loss": 0.4042, + "step": 9610 + }, + { + "epoch": 0.7805571347356453, + "grad_norm": 3.7689993752107234, + "learning_rate": 6.053311554920607e-07, + "loss": 0.4688, + "step": 9611 + }, + { + "epoch": 0.7806383497116869, + "grad_norm": 6.095711004984003, + "learning_rate": 6.049021827789774e-07, + "loss": 0.3031, + "step": 9612 + }, + { + "epoch": 0.7807195646877284, + "grad_norm": 5.01460034158621, + "learning_rate": 6.044733411966336e-07, + "loss": 0.4798, + "step": 9613 + }, + { + "epoch": 0.78080077966377, + "grad_norm": 8.19175212779431, + "learning_rate": 6.040446307747019e-07, + "loss": 0.5674, + "step": 9614 + }, + { + "epoch": 0.7808819946398116, + "grad_norm": 4.961605893602875, + "learning_rate": 6.036160515428475e-07, + "loss": 0.4449, + "step": 9615 + }, + { + "epoch": 0.7809632096158532, + "grad_norm": 6.915675610043426, + "learning_rate": 6.031876035307263e-07, + "loss": 0.4569, + "step": 9616 + }, + { + "epoch": 0.7810444245918947, + "grad_norm": 5.568377529756666, + "learning_rate": 6.027592867679838e-07, + "loss": 0.3962, + "step": 9617 + }, + { + "epoch": 0.7811256395679363, + "grad_norm": 6.129156630011155, + "learning_rate": 6.023311012842581e-07, + "loss": 0.5745, + "step": 9618 + }, + { + "epoch": 0.7812068545439779, + "grad_norm": 5.108160511033742, + "learning_rate": 6.019030471091772e-07, + "loss": 0.3951, + "step": 9619 + }, + { + "epoch": 0.7812880695200195, + "grad_norm": 5.561892142462487, + "learning_rate": 6.014751242723591e-07, + "loss": 0.5265, + "step": 9620 + }, + { + "epoch": 0.781369284496061, + "grad_norm": 3.7511127265919244, + "learning_rate": 6.010473328034153e-07, + "loss": 0.5185, + "step": 9621 + }, + { + "epoch": 0.7814504994721027, + "grad_norm": 5.167066450794399, + "learning_rate": 6.006196727319452e-07, + "loss": 0.4383, + "step": 9622 + }, + { + "epoch": 0.7815317144481443, + "grad_norm": 7.96179114023694, + "learning_rate": 6.001921440875414e-07, + "loss": 0.3846, + "step": 9623 + }, + { + "epoch": 0.7816129294241858, + "grad_norm": 5.371452420693181, + "learning_rate": 5.997647468997875e-07, + "loss": 0.6281, + "step": 9624 + }, + { + "epoch": 0.7816941444002274, + "grad_norm": 5.8732462009916, + "learning_rate": 5.99337481198256e-07, + "loss": 0.5173, + "step": 9625 + }, + { + "epoch": 0.781775359376269, + "grad_norm": 5.56845044238569, + "learning_rate": 5.989103470125113e-07, + "loss": 0.5523, + "step": 9626 + }, + { + "epoch": 0.7818565743523106, + "grad_norm": 6.424489397897517, + "learning_rate": 5.984833443721097e-07, + "loss": 0.3735, + "step": 9627 + }, + { + "epoch": 0.7819377893283521, + "grad_norm": 4.1718433726191275, + "learning_rate": 5.980564733065963e-07, + "loss": 0.4501, + "step": 9628 + }, + { + "epoch": 0.7820190043043938, + "grad_norm": 5.012432986771707, + "learning_rate": 5.976297338455101e-07, + "loss": 0.5626, + "step": 9629 + }, + { + "epoch": 0.7821002192804353, + "grad_norm": 13.705889019793014, + "learning_rate": 5.972031260183772e-07, + "loss": 0.5116, + "step": 9630 + }, + { + "epoch": 0.7821814342564769, + "grad_norm": 5.262747778591087, + "learning_rate": 5.967766498547181e-07, + "loss": 0.4009, + "step": 9631 + }, + { + "epoch": 0.7822626492325184, + "grad_norm": 4.146136341129552, + "learning_rate": 5.963503053840425e-07, + "loss": 0.4744, + "step": 9632 + }, + { + "epoch": 0.7823438642085601, + "grad_norm": 6.971385385680695, + "learning_rate": 5.959240926358501e-07, + "loss": 0.4348, + "step": 9633 + }, + { + "epoch": 0.7824250791846017, + "grad_norm": 4.565877029510119, + "learning_rate": 5.954980116396336e-07, + "loss": 0.6681, + "step": 9634 + }, + { + "epoch": 0.7825062941606432, + "grad_norm": 4.089174310097666, + "learning_rate": 5.950720624248749e-07, + "loss": 0.484, + "step": 9635 + }, + { + "epoch": 0.7825875091366848, + "grad_norm": 5.870091842126987, + "learning_rate": 5.946462450210477e-07, + "loss": 0.4509, + "step": 9636 + }, + { + "epoch": 0.7826687241127264, + "grad_norm": 17.233940029392116, + "learning_rate": 5.942205594576173e-07, + "loss": 0.5734, + "step": 9637 + }, + { + "epoch": 0.782749939088768, + "grad_norm": 12.113335333152698, + "learning_rate": 5.937950057640376e-07, + "loss": 0.4828, + "step": 9638 + }, + { + "epoch": 0.7828311540648095, + "grad_norm": 7.210552071495601, + "learning_rate": 5.933695839697548e-07, + "loss": 0.4928, + "step": 9639 + }, + { + "epoch": 0.7829123690408512, + "grad_norm": 4.338185170282578, + "learning_rate": 5.929442941042066e-07, + "loss": 0.578, + "step": 9640 + }, + { + "epoch": 0.7829935840168927, + "grad_norm": 6.223280812892159, + "learning_rate": 5.925191361968194e-07, + "loss": 0.4616, + "step": 9641 + }, + { + "epoch": 0.7830747989929343, + "grad_norm": 3.9906953099976397, + "learning_rate": 5.920941102770128e-07, + "loss": 0.539, + "step": 9642 + }, + { + "epoch": 0.7831560139689758, + "grad_norm": 4.895008165041246, + "learning_rate": 5.916692163741972e-07, + "loss": 0.5437, + "step": 9643 + }, + { + "epoch": 0.7832372289450175, + "grad_norm": 7.001022857461772, + "learning_rate": 5.91244454517772e-07, + "loss": 0.4387, + "step": 9644 + }, + { + "epoch": 0.7833184439210591, + "grad_norm": 8.865782780347201, + "learning_rate": 5.908198247371289e-07, + "loss": 0.4938, + "step": 9645 + }, + { + "epoch": 0.7833996588971006, + "grad_norm": 3.951135802126336, + "learning_rate": 5.903953270616486e-07, + "loss": 0.4084, + "step": 9646 + }, + { + "epoch": 0.7834808738731422, + "grad_norm": 6.755133249113319, + "learning_rate": 5.899709615207055e-07, + "loss": 0.5784, + "step": 9647 + }, + { + "epoch": 0.7835620888491838, + "grad_norm": 7.946028938098305, + "learning_rate": 5.895467281436637e-07, + "loss": 0.5064, + "step": 9648 + }, + { + "epoch": 0.7836433038252254, + "grad_norm": 5.014225943190868, + "learning_rate": 5.891226269598768e-07, + "loss": 0.6636, + "step": 9649 + }, + { + "epoch": 0.7837245188012669, + "grad_norm": 5.034789109316721, + "learning_rate": 5.886986579986917e-07, + "loss": 0.4543, + "step": 9650 + }, + { + "epoch": 0.7838057337773086, + "grad_norm": 5.031634092156704, + "learning_rate": 5.882748212894441e-07, + "loss": 0.5141, + "step": 9651 + }, + { + "epoch": 0.7838869487533501, + "grad_norm": 3.651635786477034, + "learning_rate": 5.878511168614601e-07, + "loss": 0.5764, + "step": 9652 + }, + { + "epoch": 0.7839681637293917, + "grad_norm": 4.577152340868677, + "learning_rate": 5.874275447440599e-07, + "loss": 0.4339, + "step": 9653 + }, + { + "epoch": 0.7840493787054332, + "grad_norm": 6.2520933191135875, + "learning_rate": 5.870041049665507e-07, + "loss": 0.5523, + "step": 9654 + }, + { + "epoch": 0.7841305936814749, + "grad_norm": 4.385286574382671, + "learning_rate": 5.86580797558233e-07, + "loss": 0.4017, + "step": 9655 + }, + { + "epoch": 0.7842118086575165, + "grad_norm": 3.6357368477226233, + "learning_rate": 5.861576225483984e-07, + "loss": 0.4594, + "step": 9656 + }, + { + "epoch": 0.784293023633558, + "grad_norm": 5.476827776987912, + "learning_rate": 5.857345799663272e-07, + "loss": 0.3793, + "step": 9657 + }, + { + "epoch": 0.7843742386095996, + "grad_norm": 5.500782599304935, + "learning_rate": 5.853116698412913e-07, + "loss": 0.4516, + "step": 9658 + }, + { + "epoch": 0.7844554535856412, + "grad_norm": 4.882146338011125, + "learning_rate": 5.848888922025553e-07, + "loss": 0.5161, + "step": 9659 + }, + { + "epoch": 0.7845366685616828, + "grad_norm": 5.737418334120377, + "learning_rate": 5.844662470793716e-07, + "loss": 0.4623, + "step": 9660 + }, + { + "epoch": 0.7846178835377243, + "grad_norm": 5.3911366434680685, + "learning_rate": 5.840437345009859e-07, + "loss": 0.5734, + "step": 9661 + }, + { + "epoch": 0.784699098513766, + "grad_norm": 4.290906281542307, + "learning_rate": 5.83621354496634e-07, + "loss": 0.5312, + "step": 9662 + }, + { + "epoch": 0.7847803134898075, + "grad_norm": 5.746160856248301, + "learning_rate": 5.831991070955426e-07, + "loss": 0.4219, + "step": 9663 + }, + { + "epoch": 0.7848615284658491, + "grad_norm": 5.04902095963468, + "learning_rate": 5.827769923269283e-07, + "loss": 0.5044, + "step": 9664 + }, + { + "epoch": 0.7849427434418906, + "grad_norm": 10.70114453686239, + "learning_rate": 5.823550102199985e-07, + "loss": 0.4453, + "step": 9665 + }, + { + "epoch": 0.7850239584179323, + "grad_norm": 4.150840098492769, + "learning_rate": 5.819331608039538e-07, + "loss": 0.4819, + "step": 9666 + }, + { + "epoch": 0.7851051733939739, + "grad_norm": 7.526924883890027, + "learning_rate": 5.815114441079825e-07, + "loss": 0.5268, + "step": 9667 + }, + { + "epoch": 0.7851863883700154, + "grad_norm": 6.803056853476825, + "learning_rate": 5.810898601612657e-07, + "loss": 0.6169, + "step": 9668 + }, + { + "epoch": 0.785267603346057, + "grad_norm": 6.656336422437745, + "learning_rate": 5.806684089929756e-07, + "loss": 0.4574, + "step": 9669 + }, + { + "epoch": 0.7853488183220986, + "grad_norm": 4.099463565565569, + "learning_rate": 5.802470906322738e-07, + "loss": 0.5343, + "step": 9670 + }, + { + "epoch": 0.7854300332981402, + "grad_norm": 9.804778473842848, + "learning_rate": 5.798259051083124e-07, + "loss": 0.4658, + "step": 9671 + }, + { + "epoch": 0.7855112482741817, + "grad_norm": 5.831632381423531, + "learning_rate": 5.794048524502366e-07, + "loss": 0.3633, + "step": 9672 + }, + { + "epoch": 0.7855924632502234, + "grad_norm": 4.512592207772697, + "learning_rate": 5.789839326871799e-07, + "loss": 0.5674, + "step": 9673 + }, + { + "epoch": 0.7856736782262649, + "grad_norm": 7.058643124685738, + "learning_rate": 5.785631458482679e-07, + "loss": 0.4566, + "step": 9674 + }, + { + "epoch": 0.7857548932023065, + "grad_norm": 9.677711342457231, + "learning_rate": 5.781424919626183e-07, + "loss": 0.5513, + "step": 9675 + }, + { + "epoch": 0.785836108178348, + "grad_norm": 7.654772369572434, + "learning_rate": 5.777219710593365e-07, + "loss": 0.4374, + "step": 9676 + }, + { + "epoch": 0.7859173231543897, + "grad_norm": 5.412971281189123, + "learning_rate": 5.773015831675204e-07, + "loss": 0.518, + "step": 9677 + }, + { + "epoch": 0.7859985381304313, + "grad_norm": 7.042856826106027, + "learning_rate": 5.768813283162597e-07, + "loss": 0.4422, + "step": 9678 + }, + { + "epoch": 0.7860797531064728, + "grad_norm": 8.697761416414153, + "learning_rate": 5.764612065346328e-07, + "loss": 0.3727, + "step": 9679 + }, + { + "epoch": 0.7861609680825145, + "grad_norm": 11.951299771554737, + "learning_rate": 5.760412178517099e-07, + "loss": 0.3518, + "step": 9680 + }, + { + "epoch": 0.786242183058556, + "grad_norm": 5.170910615421707, + "learning_rate": 5.75621362296552e-07, + "loss": 0.7013, + "step": 9681 + }, + { + "epoch": 0.7863233980345976, + "grad_norm": 15.391790408942455, + "learning_rate": 5.752016398982122e-07, + "loss": 0.4973, + "step": 9682 + }, + { + "epoch": 0.7864046130106391, + "grad_norm": 6.4992455503524145, + "learning_rate": 5.747820506857318e-07, + "loss": 0.5486, + "step": 9683 + }, + { + "epoch": 0.7864858279866808, + "grad_norm": 5.514244182216503, + "learning_rate": 5.74362594688144e-07, + "loss": 0.434, + "step": 9684 + }, + { + "epoch": 0.7865670429627223, + "grad_norm": 4.203294166143777, + "learning_rate": 5.739432719344737e-07, + "loss": 0.4048, + "step": 9685 + }, + { + "epoch": 0.7866482579387639, + "grad_norm": 5.401361035345732, + "learning_rate": 5.73524082453735e-07, + "loss": 0.4279, + "step": 9686 + }, + { + "epoch": 0.7867294729148054, + "grad_norm": 4.158513777957124, + "learning_rate": 5.731050262749341e-07, + "loss": 0.4841, + "step": 9687 + }, + { + "epoch": 0.7868106878908471, + "grad_norm": 7.879580004180549, + "learning_rate": 5.726861034270681e-07, + "loss": 0.3681, + "step": 9688 + }, + { + "epoch": 0.7868919028668887, + "grad_norm": 3.9835235897565107, + "learning_rate": 5.722673139391236e-07, + "loss": 0.4581, + "step": 9689 + }, + { + "epoch": 0.7869731178429302, + "grad_norm": 5.506433493815765, + "learning_rate": 5.718486578400775e-07, + "loss": 0.4241, + "step": 9690 + }, + { + "epoch": 0.7870543328189719, + "grad_norm": 6.423378795375947, + "learning_rate": 5.714301351589008e-07, + "loss": 0.6751, + "step": 9691 + }, + { + "epoch": 0.7871355477950134, + "grad_norm": 4.6037563920362095, + "learning_rate": 5.710117459245518e-07, + "loss": 0.6218, + "step": 9692 + }, + { + "epoch": 0.787216762771055, + "grad_norm": 5.535444733765645, + "learning_rate": 5.705934901659804e-07, + "loss": 0.609, + "step": 9693 + }, + { + "epoch": 0.7872979777470965, + "grad_norm": 6.072040161253787, + "learning_rate": 5.70175367912128e-07, + "loss": 0.3255, + "step": 9694 + }, + { + "epoch": 0.7873791927231382, + "grad_norm": 3.930200276565603, + "learning_rate": 5.697573791919275e-07, + "loss": 0.3883, + "step": 9695 + }, + { + "epoch": 0.7874604076991797, + "grad_norm": 13.405961625131877, + "learning_rate": 5.693395240343e-07, + "loss": 0.469, + "step": 9696 + }, + { + "epoch": 0.7875416226752213, + "grad_norm": 3.8674362850461357, + "learning_rate": 5.689218024681603e-07, + "loss": 0.4996, + "step": 9697 + }, + { + "epoch": 0.7876228376512628, + "grad_norm": 6.411118676070839, + "learning_rate": 5.685042145224118e-07, + "loss": 0.3813, + "step": 9698 + }, + { + "epoch": 0.7877040526273045, + "grad_norm": 3.36119530692789, + "learning_rate": 5.680867602259485e-07, + "loss": 0.449, + "step": 9699 + }, + { + "epoch": 0.7877852676033461, + "grad_norm": 5.637861341121901, + "learning_rate": 5.676694396076568e-07, + "loss": 0.5444, + "step": 9700 + }, + { + "epoch": 0.7878664825793876, + "grad_norm": 8.011959392935337, + "learning_rate": 5.672522526964141e-07, + "loss": 0.4141, + "step": 9701 + }, + { + "epoch": 0.7879476975554293, + "grad_norm": 4.254934260559165, + "learning_rate": 5.668351995210866e-07, + "loss": 0.5489, + "step": 9702 + }, + { + "epoch": 0.7880289125314708, + "grad_norm": 4.2266389347699995, + "learning_rate": 5.664182801105314e-07, + "loss": 0.4893, + "step": 9703 + }, + { + "epoch": 0.7881101275075124, + "grad_norm": 9.12782592389075, + "learning_rate": 5.660014944935985e-07, + "loss": 0.5584, + "step": 9704 + }, + { + "epoch": 0.7881913424835539, + "grad_norm": 4.637756752610576, + "learning_rate": 5.655848426991267e-07, + "loss": 0.4832, + "step": 9705 + }, + { + "epoch": 0.7882725574595956, + "grad_norm": 5.685502111243551, + "learning_rate": 5.651683247559445e-07, + "loss": 0.3528, + "step": 9706 + }, + { + "epoch": 0.7883537724356371, + "grad_norm": 6.879039707986046, + "learning_rate": 5.647519406928758e-07, + "loss": 0.3939, + "step": 9707 + }, + { + "epoch": 0.7884349874116787, + "grad_norm": 4.090096342166239, + "learning_rate": 5.643356905387307e-07, + "loss": 0.6541, + "step": 9708 + }, + { + "epoch": 0.7885162023877202, + "grad_norm": 5.705199569403093, + "learning_rate": 5.639195743223105e-07, + "loss": 0.4791, + "step": 9709 + }, + { + "epoch": 0.7885974173637619, + "grad_norm": 6.503352654339194, + "learning_rate": 5.635035920724102e-07, + "loss": 0.4197, + "step": 9710 + }, + { + "epoch": 0.7886786323398035, + "grad_norm": 3.9110515788245204, + "learning_rate": 5.630877438178126e-07, + "loss": 0.6805, + "step": 9711 + }, + { + "epoch": 0.788759847315845, + "grad_norm": 6.1493240468982275, + "learning_rate": 5.626720295872911e-07, + "loss": 0.4074, + "step": 9712 + }, + { + "epoch": 0.7888410622918867, + "grad_norm": 3.943608994312866, + "learning_rate": 5.622564494096122e-07, + "loss": 0.4565, + "step": 9713 + }, + { + "epoch": 0.7889222772679282, + "grad_norm": 7.779376580705911, + "learning_rate": 5.618410033135325e-07, + "loss": 0.5792, + "step": 9714 + }, + { + "epoch": 0.7890034922439698, + "grad_norm": 3.3797672800758134, + "learning_rate": 5.614256913277968e-07, + "loss": 0.3237, + "step": 9715 + }, + { + "epoch": 0.7890847072200113, + "grad_norm": 5.330674659642353, + "learning_rate": 5.610105134811444e-07, + "loss": 0.4209, + "step": 9716 + }, + { + "epoch": 0.789165922196053, + "grad_norm": 4.322221219959529, + "learning_rate": 5.605954698023023e-07, + "loss": 0.4035, + "step": 9717 + }, + { + "epoch": 0.7892471371720945, + "grad_norm": 9.488626438377107, + "learning_rate": 5.601805603199889e-07, + "loss": 0.3763, + "step": 9718 + }, + { + "epoch": 0.7893283521481361, + "grad_norm": 5.681079603298455, + "learning_rate": 5.597657850629145e-07, + "loss": 0.5129, + "step": 9719 + }, + { + "epoch": 0.7894095671241776, + "grad_norm": 4.177871819811436, + "learning_rate": 5.593511440597799e-07, + "loss": 0.4432, + "step": 9720 + }, + { + "epoch": 0.7894907821002193, + "grad_norm": 7.924561057142348, + "learning_rate": 5.589366373392754e-07, + "loss": 0.4561, + "step": 9721 + }, + { + "epoch": 0.7895719970762609, + "grad_norm": 3.32290128813931, + "learning_rate": 5.58522264930082e-07, + "loss": 0.5691, + "step": 9722 + }, + { + "epoch": 0.7896532120523024, + "grad_norm": 7.189607322217405, + "learning_rate": 5.581080268608733e-07, + "loss": 0.5209, + "step": 9723 + }, + { + "epoch": 0.7897344270283441, + "grad_norm": 6.847085179945522, + "learning_rate": 5.576939231603118e-07, + "loss": 0.533, + "step": 9724 + }, + { + "epoch": 0.7898156420043856, + "grad_norm": 5.441399177870424, + "learning_rate": 5.572799538570506e-07, + "loss": 0.3767, + "step": 9725 + }, + { + "epoch": 0.7898968569804272, + "grad_norm": 5.540677770063936, + "learning_rate": 5.56866118979735e-07, + "loss": 0.5119, + "step": 9726 + }, + { + "epoch": 0.7899780719564687, + "grad_norm": 8.524810246072764, + "learning_rate": 5.564524185570008e-07, + "loss": 0.5606, + "step": 9727 + }, + { + "epoch": 0.7900592869325104, + "grad_norm": 3.4073724271339327, + "learning_rate": 5.560388526174723e-07, + "loss": 0.6541, + "step": 9728 + }, + { + "epoch": 0.7901405019085519, + "grad_norm": 4.045724757251306, + "learning_rate": 5.556254211897677e-07, + "loss": 0.481, + "step": 9729 + }, + { + "epoch": 0.7902217168845935, + "grad_norm": 6.886529740016336, + "learning_rate": 5.552121243024935e-07, + "loss": 0.456, + "step": 9730 + }, + { + "epoch": 0.790302931860635, + "grad_norm": 4.136880017146439, + "learning_rate": 5.54798961984247e-07, + "loss": 0.4138, + "step": 9731 + }, + { + "epoch": 0.7903841468366767, + "grad_norm": 5.315316660423206, + "learning_rate": 5.543859342636177e-07, + "loss": 0.4179, + "step": 9732 + }, + { + "epoch": 0.7904653618127183, + "grad_norm": 9.808582227554735, + "learning_rate": 5.539730411691851e-07, + "loss": 0.4608, + "step": 9733 + }, + { + "epoch": 0.7905465767887598, + "grad_norm": 3.7605530791600725, + "learning_rate": 5.535602827295189e-07, + "loss": 0.4465, + "step": 9734 + }, + { + "epoch": 0.7906277917648015, + "grad_norm": 5.688896710047547, + "learning_rate": 5.53147658973179e-07, + "loss": 0.3428, + "step": 9735 + }, + { + "epoch": 0.790709006740843, + "grad_norm": 11.37494567313138, + "learning_rate": 5.527351699287184e-07, + "loss": 0.4684, + "step": 9736 + }, + { + "epoch": 0.7907902217168846, + "grad_norm": 6.335210866254778, + "learning_rate": 5.523228156246782e-07, + "loss": 0.5216, + "step": 9737 + }, + { + "epoch": 0.7908714366929261, + "grad_norm": 3.543537249807746, + "learning_rate": 5.519105960895904e-07, + "loss": 0.6721, + "step": 9738 + }, + { + "epoch": 0.7909526516689678, + "grad_norm": 3.746923394479714, + "learning_rate": 5.514985113519794e-07, + "loss": 0.5059, + "step": 9739 + }, + { + "epoch": 0.7910338666450093, + "grad_norm": 3.243317942178259, + "learning_rate": 5.510865614403599e-07, + "loss": 0.5028, + "step": 9740 + }, + { + "epoch": 0.7911150816210509, + "grad_norm": 5.231091301479268, + "learning_rate": 5.506747463832348e-07, + "loss": 0.506, + "step": 9741 + }, + { + "epoch": 0.7911962965970925, + "grad_norm": 3.037936622624663, + "learning_rate": 5.502630662091016e-07, + "loss": 0.4508, + "step": 9742 + }, + { + "epoch": 0.7912775115731341, + "grad_norm": 5.011070638322092, + "learning_rate": 5.498515209464453e-07, + "loss": 0.5612, + "step": 9743 + }, + { + "epoch": 0.7913587265491757, + "grad_norm": 57.93538906369309, + "learning_rate": 5.49440110623742e-07, + "loss": 0.4245, + "step": 9744 + }, + { + "epoch": 0.7914399415252172, + "grad_norm": 4.391572013936799, + "learning_rate": 5.490288352694598e-07, + "loss": 0.4482, + "step": 9745 + }, + { + "epoch": 0.7915211565012589, + "grad_norm": 4.558873733467807, + "learning_rate": 5.486176949120575e-07, + "loss": 0.6039, + "step": 9746 + }, + { + "epoch": 0.7916023714773004, + "grad_norm": 7.119700732402204, + "learning_rate": 5.482066895799825e-07, + "loss": 0.467, + "step": 9747 + }, + { + "epoch": 0.791683586453342, + "grad_norm": 9.302916508581008, + "learning_rate": 5.477958193016758e-07, + "loss": 0.589, + "step": 9748 + }, + { + "epoch": 0.7917648014293835, + "grad_norm": 3.810258704896879, + "learning_rate": 5.473850841055664e-07, + "loss": 0.5026, + "step": 9749 + }, + { + "epoch": 0.7918460164054252, + "grad_norm": 3.970336831775221, + "learning_rate": 5.469744840200741e-07, + "loss": 0.4808, + "step": 9750 + }, + { + "epoch": 0.7919272313814667, + "grad_norm": 4.344838984297026, + "learning_rate": 5.465640190736124e-07, + "loss": 0.5038, + "step": 9751 + }, + { + "epoch": 0.7920084463575083, + "grad_norm": 4.912118996595724, + "learning_rate": 5.461536892945812e-07, + "loss": 0.4581, + "step": 9752 + }, + { + "epoch": 0.7920896613335499, + "grad_norm": 5.652749892181998, + "learning_rate": 5.457434947113749e-07, + "loss": 0.4009, + "step": 9753 + }, + { + "epoch": 0.7921708763095915, + "grad_norm": 17.57147989196119, + "learning_rate": 5.453334353523754e-07, + "loss": 0.4287, + "step": 9754 + }, + { + "epoch": 0.7922520912856331, + "grad_norm": 3.898615950581783, + "learning_rate": 5.449235112459577e-07, + "loss": 0.4907, + "step": 9755 + }, + { + "epoch": 0.7923333062616746, + "grad_norm": 6.661559536211671, + "learning_rate": 5.445137224204861e-07, + "loss": 0.6232, + "step": 9756 + }, + { + "epoch": 0.7924145212377163, + "grad_norm": 4.6703720755130265, + "learning_rate": 5.441040689043148e-07, + "loss": 0.3485, + "step": 9757 + }, + { + "epoch": 0.7924957362137578, + "grad_norm": 7.3100697341525995, + "learning_rate": 5.436945507257907e-07, + "loss": 0.485, + "step": 9758 + }, + { + "epoch": 0.7925769511897994, + "grad_norm": 5.877076789096071, + "learning_rate": 5.432851679132506e-07, + "loss": 0.5487, + "step": 9759 + }, + { + "epoch": 0.792658166165841, + "grad_norm": 6.67706951439948, + "learning_rate": 5.428759204950204e-07, + "loss": 0.8112, + "step": 9760 + }, + { + "epoch": 0.7927393811418826, + "grad_norm": 5.439821280458846, + "learning_rate": 5.424668084994195e-07, + "loss": 0.5156, + "step": 9761 + }, + { + "epoch": 0.7928205961179241, + "grad_norm": 4.572295130779405, + "learning_rate": 5.420578319547551e-07, + "loss": 0.3297, + "step": 9762 + }, + { + "epoch": 0.7929018110939657, + "grad_norm": 5.618034848926441, + "learning_rate": 5.416489908893258e-07, + "loss": 0.5123, + "step": 9763 + }, + { + "epoch": 0.7929830260700073, + "grad_norm": 5.221377474566307, + "learning_rate": 5.412402853314227e-07, + "loss": 0.3917, + "step": 9764 + }, + { + "epoch": 0.7930642410460489, + "grad_norm": 10.356233466887792, + "learning_rate": 5.408317153093245e-07, + "loss": 0.6027, + "step": 9765 + }, + { + "epoch": 0.7931454560220905, + "grad_norm": 4.822402093871006, + "learning_rate": 5.404232808513027e-07, + "loss": 0.5062, + "step": 9766 + }, + { + "epoch": 0.793226670998132, + "grad_norm": 4.342762770694148, + "learning_rate": 5.400149819856199e-07, + "loss": 0.4948, + "step": 9767 + }, + { + "epoch": 0.7933078859741737, + "grad_norm": 5.561331791979438, + "learning_rate": 5.396068187405273e-07, + "loss": 0.4032, + "step": 9768 + }, + { + "epoch": 0.7933891009502152, + "grad_norm": 8.670840918114324, + "learning_rate": 5.391987911442667e-07, + "loss": 0.613, + "step": 9769 + }, + { + "epoch": 0.7934703159262568, + "grad_norm": 4.613030257259587, + "learning_rate": 5.387908992250731e-07, + "loss": 0.3999, + "step": 9770 + }, + { + "epoch": 0.7935515309022984, + "grad_norm": 5.329616352899454, + "learning_rate": 5.383831430111691e-07, + "loss": 0.4021, + "step": 9771 + }, + { + "epoch": 0.79363274587834, + "grad_norm": 5.051235158011703, + "learning_rate": 5.379755225307707e-07, + "loss": 0.6117, + "step": 9772 + }, + { + "epoch": 0.7937139608543815, + "grad_norm": 4.393658723431004, + "learning_rate": 5.375680378120812e-07, + "loss": 0.5466, + "step": 9773 + }, + { + "epoch": 0.7937951758304231, + "grad_norm": 8.092758743122241, + "learning_rate": 5.371606888832984e-07, + "loss": 0.4675, + "step": 9774 + }, + { + "epoch": 0.7938763908064647, + "grad_norm": 4.672078958128614, + "learning_rate": 5.367534757726079e-07, + "loss": 0.5704, + "step": 9775 + }, + { + "epoch": 0.7939576057825063, + "grad_norm": 5.017160554885112, + "learning_rate": 5.363463985081854e-07, + "loss": 0.5743, + "step": 9776 + }, + { + "epoch": 0.7940388207585479, + "grad_norm": 3.292901108068312, + "learning_rate": 5.359394571182e-07, + "loss": 0.5656, + "step": 9777 + }, + { + "epoch": 0.7941200357345894, + "grad_norm": 5.959333091031521, + "learning_rate": 5.355326516308102e-07, + "loss": 0.484, + "step": 9778 + }, + { + "epoch": 0.7942012507106311, + "grad_norm": 3.555927641182116, + "learning_rate": 5.351259820741633e-07, + "loss": 0.5468, + "step": 9779 + }, + { + "epoch": 0.7942824656866726, + "grad_norm": 9.33015466014825, + "learning_rate": 5.347194484764001e-07, + "loss": 0.5981, + "step": 9780 + }, + { + "epoch": 0.7943636806627142, + "grad_norm": 5.675486153355801, + "learning_rate": 5.343130508656502e-07, + "loss": 0.5224, + "step": 9781 + }, + { + "epoch": 0.7944448956387558, + "grad_norm": 4.362949089225925, + "learning_rate": 5.339067892700331e-07, + "loss": 0.4188, + "step": 9782 + }, + { + "epoch": 0.7945261106147974, + "grad_norm": 4.529916181314969, + "learning_rate": 5.335006637176612e-07, + "loss": 0.4768, + "step": 9783 + }, + { + "epoch": 0.7946073255908389, + "grad_norm": 6.84665053039972, + "learning_rate": 5.330946742366356e-07, + "loss": 0.4384, + "step": 9784 + }, + { + "epoch": 0.7946885405668805, + "grad_norm": 5.58529418435716, + "learning_rate": 5.326888208550485e-07, + "loss": 0.3712, + "step": 9785 + }, + { + "epoch": 0.7947697555429221, + "grad_norm": 5.963815630595506, + "learning_rate": 5.32283103600984e-07, + "loss": 0.5674, + "step": 9786 + }, + { + "epoch": 0.7948509705189637, + "grad_norm": 5.054597490597483, + "learning_rate": 5.318775225025147e-07, + "loss": 0.5324, + "step": 9787 + }, + { + "epoch": 0.7949321854950053, + "grad_norm": 5.080349599593879, + "learning_rate": 5.314720775877046e-07, + "loss": 0.4342, + "step": 9788 + }, + { + "epoch": 0.7950134004710468, + "grad_norm": 4.048899362069842, + "learning_rate": 5.31066768884608e-07, + "loss": 0.3672, + "step": 9789 + }, + { + "epoch": 0.7950946154470885, + "grad_norm": 33.20617438804645, + "learning_rate": 5.306615964212705e-07, + "loss": 0.5476, + "step": 9790 + }, + { + "epoch": 0.79517583042313, + "grad_norm": 7.450168497762338, + "learning_rate": 5.302565602257285e-07, + "loss": 0.6486, + "step": 9791 + }, + { + "epoch": 0.7952570453991716, + "grad_norm": 7.689105107124658, + "learning_rate": 5.298516603260071e-07, + "loss": 0.3788, + "step": 9792 + }, + { + "epoch": 0.7953382603752132, + "grad_norm": 4.276239675857061, + "learning_rate": 5.294468967501248e-07, + "loss": 0.4255, + "step": 9793 + }, + { + "epoch": 0.7954194753512548, + "grad_norm": 5.366407144652406, + "learning_rate": 5.29042269526088e-07, + "loss": 0.5787, + "step": 9794 + }, + { + "epoch": 0.7955006903272963, + "grad_norm": 3.8340051916962503, + "learning_rate": 5.286377786818944e-07, + "loss": 0.6228, + "step": 9795 + }, + { + "epoch": 0.7955819053033379, + "grad_norm": 4.50626750141108, + "learning_rate": 5.282334242455339e-07, + "loss": 0.5358, + "step": 9796 + }, + { + "epoch": 0.7956631202793795, + "grad_norm": 6.044552408083624, + "learning_rate": 5.278292062449844e-07, + "loss": 0.5658, + "step": 9797 + }, + { + "epoch": 0.7957443352554211, + "grad_norm": 4.404044522106774, + "learning_rate": 5.274251247082163e-07, + "loss": 0.5208, + "step": 9798 + }, + { + "epoch": 0.7958255502314627, + "grad_norm": 7.333443652190031, + "learning_rate": 5.270211796631905e-07, + "loss": 0.3324, + "step": 9799 + }, + { + "epoch": 0.7959067652075043, + "grad_norm": 4.223546024424326, + "learning_rate": 5.266173711378572e-07, + "loss": 0.5734, + "step": 9800 + }, + { + "epoch": 0.7959879801835459, + "grad_norm": 5.341487545794119, + "learning_rate": 5.262136991601572e-07, + "loss": 0.385, + "step": 9801 + }, + { + "epoch": 0.7960691951595874, + "grad_norm": 4.803777217199065, + "learning_rate": 5.258101637580238e-07, + "loss": 0.4812, + "step": 9802 + }, + { + "epoch": 0.796150410135629, + "grad_norm": 6.650426970575839, + "learning_rate": 5.254067649593781e-07, + "loss": 0.4336, + "step": 9803 + }, + { + "epoch": 0.7962316251116706, + "grad_norm": 5.441981893530827, + "learning_rate": 5.250035027921338e-07, + "loss": 0.3946, + "step": 9804 + }, + { + "epoch": 0.7963128400877122, + "grad_norm": 5.107486747720455, + "learning_rate": 5.246003772841953e-07, + "loss": 0.4776, + "step": 9805 + }, + { + "epoch": 0.7963940550637537, + "grad_norm": 3.5626105502835497, + "learning_rate": 5.24197388463456e-07, + "loss": 0.328, + "step": 9806 + }, + { + "epoch": 0.7964752700397953, + "grad_norm": 6.074463700483569, + "learning_rate": 5.237945363578006e-07, + "loss": 0.4275, + "step": 9807 + }, + { + "epoch": 0.7965564850158369, + "grad_norm": 12.182536694266675, + "learning_rate": 5.233918209951039e-07, + "loss": 0.3317, + "step": 9808 + }, + { + "epoch": 0.7966376999918785, + "grad_norm": 6.106295220054585, + "learning_rate": 5.229892424032326e-07, + "loss": 0.4446, + "step": 9809 + }, + { + "epoch": 0.7967189149679201, + "grad_norm": 4.589038843159026, + "learning_rate": 5.225868006100421e-07, + "loss": 0.4815, + "step": 9810 + }, + { + "epoch": 0.7968001299439617, + "grad_norm": 13.701424714841433, + "learning_rate": 5.221844956433794e-07, + "loss": 0.3585, + "step": 9811 + }, + { + "epoch": 0.7968813449200033, + "grad_norm": 4.331772567167185, + "learning_rate": 5.21782327531083e-07, + "loss": 0.426, + "step": 9812 + }, + { + "epoch": 0.7969625598960448, + "grad_norm": 5.814768311032915, + "learning_rate": 5.213802963009798e-07, + "loss": 0.5894, + "step": 9813 + }, + { + "epoch": 0.7970437748720864, + "grad_norm": 4.7885091116888505, + "learning_rate": 5.209784019808877e-07, + "loss": 0.401, + "step": 9814 + }, + { + "epoch": 0.797124989848128, + "grad_norm": 3.7476316440320403, + "learning_rate": 5.205766445986174e-07, + "loss": 0.4015, + "step": 9815 + }, + { + "epoch": 0.7972062048241696, + "grad_norm": 4.916810203123699, + "learning_rate": 5.201750241819664e-07, + "loss": 0.3485, + "step": 9816 + }, + { + "epoch": 0.7972874198002111, + "grad_norm": 4.506048343346604, + "learning_rate": 5.197735407587257e-07, + "loss": 0.4305, + "step": 9817 + }, + { + "epoch": 0.7973686347762527, + "grad_norm": 3.516529711458559, + "learning_rate": 5.193721943566762e-07, + "loss": 0.5148, + "step": 9818 + }, + { + "epoch": 0.7974498497522943, + "grad_norm": 4.681381707890613, + "learning_rate": 5.189709850035887e-07, + "loss": 0.4342, + "step": 9819 + }, + { + "epoch": 0.7975310647283359, + "grad_norm": 5.7757941641398665, + "learning_rate": 5.185699127272243e-07, + "loss": 0.3261, + "step": 9820 + }, + { + "epoch": 0.7976122797043775, + "grad_norm": 3.0498451518722196, + "learning_rate": 5.181689775553355e-07, + "loss": 0.5548, + "step": 9821 + }, + { + "epoch": 0.7976934946804191, + "grad_norm": 7.950302605138208, + "learning_rate": 5.17768179515665e-07, + "loss": 0.3438, + "step": 9822 + }, + { + "epoch": 0.7977747096564607, + "grad_norm": 6.18464348555484, + "learning_rate": 5.173675186359451e-07, + "loss": 0.4751, + "step": 9823 + }, + { + "epoch": 0.7978559246325022, + "grad_norm": 7.611226667202139, + "learning_rate": 5.169669949438996e-07, + "loss": 0.3245, + "step": 9824 + }, + { + "epoch": 0.7979371396085438, + "grad_norm": 6.064790144719224, + "learning_rate": 5.165666084672439e-07, + "loss": 0.4096, + "step": 9825 + }, + { + "epoch": 0.7980183545845854, + "grad_norm": 4.770411960870846, + "learning_rate": 5.161663592336815e-07, + "loss": 0.6826, + "step": 9826 + }, + { + "epoch": 0.798099569560627, + "grad_norm": 7.597577840704956, + "learning_rate": 5.157662472709075e-07, + "loss": 0.5609, + "step": 9827 + }, + { + "epoch": 0.7981807845366685, + "grad_norm": 8.243596223621436, + "learning_rate": 5.153662726066083e-07, + "loss": 0.4814, + "step": 9828 + }, + { + "epoch": 0.7982619995127102, + "grad_norm": 9.676688199336681, + "learning_rate": 5.149664352684586e-07, + "loss": 0.5136, + "step": 9829 + }, + { + "epoch": 0.7983432144887517, + "grad_norm": 5.795108480675768, + "learning_rate": 5.14566735284126e-07, + "loss": 0.3747, + "step": 9830 + }, + { + "epoch": 0.7984244294647933, + "grad_norm": 6.471676661405976, + "learning_rate": 5.141671726812683e-07, + "loss": 0.5808, + "step": 9831 + }, + { + "epoch": 0.7985056444408349, + "grad_norm": 4.438123959791211, + "learning_rate": 5.137677474875324e-07, + "loss": 0.5517, + "step": 9832 + }, + { + "epoch": 0.7985868594168765, + "grad_norm": 7.424549295374417, + "learning_rate": 5.133684597305557e-07, + "loss": 0.3537, + "step": 9833 + }, + { + "epoch": 0.7986680743929181, + "grad_norm": 5.275736853110713, + "learning_rate": 5.129693094379684e-07, + "loss": 0.3372, + "step": 9834 + }, + { + "epoch": 0.7987492893689596, + "grad_norm": 3.2450613994463056, + "learning_rate": 5.125702966373883e-07, + "loss": 0.3272, + "step": 9835 + }, + { + "epoch": 0.7988305043450012, + "grad_norm": 12.566077839285736, + "learning_rate": 5.121714213564249e-07, + "loss": 0.4256, + "step": 9836 + }, + { + "epoch": 0.7989117193210428, + "grad_norm": 3.457305067918944, + "learning_rate": 5.117726836226786e-07, + "loss": 0.4961, + "step": 9837 + }, + { + "epoch": 0.7989929342970844, + "grad_norm": 5.900089910314316, + "learning_rate": 5.113740834637407e-07, + "loss": 0.3549, + "step": 9838 + }, + { + "epoch": 0.7990741492731259, + "grad_norm": 8.54709344532498, + "learning_rate": 5.109756209071908e-07, + "loss": 0.469, + "step": 9839 + }, + { + "epoch": 0.7991553642491676, + "grad_norm": 3.7791884771682147, + "learning_rate": 5.105772959806021e-07, + "loss": 0.5095, + "step": 9840 + }, + { + "epoch": 0.7992365792252091, + "grad_norm": 6.060813178732931, + "learning_rate": 5.101791087115354e-07, + "loss": 0.5591, + "step": 9841 + }, + { + "epoch": 0.7993177942012507, + "grad_norm": 3.1018063396354605, + "learning_rate": 5.097810591275429e-07, + "loss": 0.3985, + "step": 9842 + }, + { + "epoch": 0.7993990091772923, + "grad_norm": 5.327968286749991, + "learning_rate": 5.093831472561681e-07, + "loss": 0.4773, + "step": 9843 + }, + { + "epoch": 0.7994802241533339, + "grad_norm": 7.284888223378119, + "learning_rate": 5.089853731249448e-07, + "loss": 0.4411, + "step": 9844 + }, + { + "epoch": 0.7995614391293755, + "grad_norm": 9.03698387870564, + "learning_rate": 5.085877367613964e-07, + "loss": 0.4432, + "step": 9845 + }, + { + "epoch": 0.799642654105417, + "grad_norm": 4.655253921964928, + "learning_rate": 5.081902381930365e-07, + "loss": 0.5223, + "step": 9846 + }, + { + "epoch": 0.7997238690814586, + "grad_norm": 6.893829661308056, + "learning_rate": 5.077928774473714e-07, + "loss": 0.3228, + "step": 9847 + }, + { + "epoch": 0.7998050840575002, + "grad_norm": 4.268542033494732, + "learning_rate": 5.073956545518949e-07, + "loss": 0.4231, + "step": 9848 + }, + { + "epoch": 0.7998862990335418, + "grad_norm": 14.574424300264582, + "learning_rate": 5.069985695340931e-07, + "loss": 0.4657, + "step": 9849 + }, + { + "epoch": 0.7999675140095833, + "grad_norm": 5.388314959320381, + "learning_rate": 5.066016224214435e-07, + "loss": 0.6228, + "step": 9850 + }, + { + "epoch": 0.800048728985625, + "grad_norm": 5.9704183797863575, + "learning_rate": 5.062048132414116e-07, + "loss": 0.4077, + "step": 9851 + }, + { + "epoch": 0.8001299439616665, + "grad_norm": 7.054787471974678, + "learning_rate": 5.058081420214538e-07, + "loss": 0.394, + "step": 9852 + }, + { + "epoch": 0.8002111589377081, + "grad_norm": 9.477648614150743, + "learning_rate": 5.054116087890196e-07, + "loss": 0.5202, + "step": 9853 + }, + { + "epoch": 0.8002923739137497, + "grad_norm": 5.337615316923885, + "learning_rate": 5.050152135715453e-07, + "loss": 0.4712, + "step": 9854 + }, + { + "epoch": 0.8003735888897913, + "grad_norm": 4.045630006380179, + "learning_rate": 5.046189563964595e-07, + "loss": 0.4629, + "step": 9855 + }, + { + "epoch": 0.8004548038658329, + "grad_norm": 4.540887627720557, + "learning_rate": 5.042228372911815e-07, + "loss": 0.4685, + "step": 9856 + }, + { + "epoch": 0.8005360188418744, + "grad_norm": 5.474722039970322, + "learning_rate": 5.038268562831214e-07, + "loss": 0.4574, + "step": 9857 + }, + { + "epoch": 0.800617233817916, + "grad_norm": 4.262273306813277, + "learning_rate": 5.034310133996772e-07, + "loss": 0.6693, + "step": 9858 + }, + { + "epoch": 0.8006984487939576, + "grad_norm": 4.349294849404306, + "learning_rate": 5.030353086682413e-07, + "loss": 0.4987, + "step": 9859 + }, + { + "epoch": 0.8007796637699992, + "grad_norm": 3.749623526864484, + "learning_rate": 5.02639742116193e-07, + "loss": 0.5335, + "step": 9860 + }, + { + "epoch": 0.8008608787460407, + "grad_norm": 3.6511910926652926, + "learning_rate": 5.022443137709032e-07, + "loss": 0.5024, + "step": 9861 + }, + { + "epoch": 0.8009420937220824, + "grad_norm": 7.167220329931862, + "learning_rate": 5.018490236597337e-07, + "loss": 0.7358, + "step": 9862 + }, + { + "epoch": 0.8010233086981239, + "grad_norm": 4.748703110219761, + "learning_rate": 5.014538718100373e-07, + "loss": 0.5198, + "step": 9863 + }, + { + "epoch": 0.8011045236741655, + "grad_norm": 6.53063347027075, + "learning_rate": 5.01058858249156e-07, + "loss": 0.5499, + "step": 9864 + }, + { + "epoch": 0.8011857386502071, + "grad_norm": 4.349589259638556, + "learning_rate": 5.006639830044219e-07, + "loss": 0.4342, + "step": 9865 + }, + { + "epoch": 0.8012669536262487, + "grad_norm": 4.143371415401599, + "learning_rate": 5.002692461031591e-07, + "loss": 0.5895, + "step": 9866 + }, + { + "epoch": 0.8013481686022903, + "grad_norm": 7.7254724395274055, + "learning_rate": 4.998746475726815e-07, + "loss": 0.5336, + "step": 9867 + }, + { + "epoch": 0.8014293835783318, + "grad_norm": 5.175670084157273, + "learning_rate": 4.994801874402918e-07, + "loss": 0.6204, + "step": 9868 + }, + { + "epoch": 0.8015105985543735, + "grad_norm": 6.735892978669596, + "learning_rate": 4.990858657332856e-07, + "loss": 0.3911, + "step": 9869 + }, + { + "epoch": 0.801591813530415, + "grad_norm": 5.104643492256398, + "learning_rate": 4.986916824789484e-07, + "loss": 0.5035, + "step": 9870 + }, + { + "epoch": 0.8016730285064566, + "grad_norm": 3.5531328145211476, + "learning_rate": 4.982976377045546e-07, + "loss": 0.4066, + "step": 9871 + }, + { + "epoch": 0.8017542434824981, + "grad_norm": 4.292790999663058, + "learning_rate": 4.979037314373708e-07, + "loss": 0.7253, + "step": 9872 + }, + { + "epoch": 0.8018354584585398, + "grad_norm": 3.9126112690178934, + "learning_rate": 4.975099637046529e-07, + "loss": 0.5173, + "step": 9873 + }, + { + "epoch": 0.8019166734345813, + "grad_norm": 4.952119344921789, + "learning_rate": 4.971163345336469e-07, + "loss": 0.546, + "step": 9874 + }, + { + "epoch": 0.8019978884106229, + "grad_norm": 4.007491175094282, + "learning_rate": 4.967228439515903e-07, + "loss": 0.5014, + "step": 9875 + }, + { + "epoch": 0.8020791033866645, + "grad_norm": 7.4996807066549405, + "learning_rate": 4.963294919857115e-07, + "loss": 0.4807, + "step": 9876 + }, + { + "epoch": 0.8021603183627061, + "grad_norm": 3.8718307958429308, + "learning_rate": 4.959362786632274e-07, + "loss": 0.3412, + "step": 9877 + }, + { + "epoch": 0.8022415333387477, + "grad_norm": 14.309366650475116, + "learning_rate": 4.955432040113459e-07, + "loss": 0.4632, + "step": 9878 + }, + { + "epoch": 0.8023227483147892, + "grad_norm": 6.072346159252271, + "learning_rate": 4.95150268057267e-07, + "loss": 0.3375, + "step": 9879 + }, + { + "epoch": 0.8024039632908309, + "grad_norm": 3.9060655122044956, + "learning_rate": 4.947574708281788e-07, + "loss": 0.4548, + "step": 9880 + }, + { + "epoch": 0.8024851782668724, + "grad_norm": 4.543113575494821, + "learning_rate": 4.943648123512607e-07, + "loss": 0.5799, + "step": 9881 + }, + { + "epoch": 0.802566393242914, + "grad_norm": 8.154618850928907, + "learning_rate": 4.939722926536825e-07, + "loss": 0.3684, + "step": 9882 + }, + { + "epoch": 0.8026476082189555, + "grad_norm": 3.3302480665945673, + "learning_rate": 4.935799117626058e-07, + "loss": 0.5121, + "step": 9883 + }, + { + "epoch": 0.8027288231949972, + "grad_norm": 5.010095824925431, + "learning_rate": 4.931876697051797e-07, + "loss": 0.4584, + "step": 9884 + }, + { + "epoch": 0.8028100381710387, + "grad_norm": 8.290730780611321, + "learning_rate": 4.927955665085466e-07, + "loss": 0.5904, + "step": 9885 + }, + { + "epoch": 0.8028912531470803, + "grad_norm": 6.530421557967707, + "learning_rate": 4.924036021998372e-07, + "loss": 0.6501, + "step": 9886 + }, + { + "epoch": 0.802972468123122, + "grad_norm": 3.933371390397304, + "learning_rate": 4.92011776806173e-07, + "loss": 0.4631, + "step": 9887 + }, + { + "epoch": 0.8030536830991635, + "grad_norm": 4.630936993671216, + "learning_rate": 4.916200903546664e-07, + "loss": 0.4694, + "step": 9888 + }, + { + "epoch": 0.8031348980752051, + "grad_norm": 8.920541461832672, + "learning_rate": 4.912285428724214e-07, + "loss": 0.4291, + "step": 9889 + }, + { + "epoch": 0.8032161130512466, + "grad_norm": 4.683558801065991, + "learning_rate": 4.908371343865289e-07, + "loss": 0.5715, + "step": 9890 + }, + { + "epoch": 0.8032973280272883, + "grad_norm": 4.064081828276734, + "learning_rate": 4.904458649240742e-07, + "loss": 0.6187, + "step": 9891 + }, + { + "epoch": 0.8033785430033298, + "grad_norm": 13.844068111838718, + "learning_rate": 4.900547345121304e-07, + "loss": 0.4852, + "step": 9892 + }, + { + "epoch": 0.8034597579793714, + "grad_norm": 7.334487582827237, + "learning_rate": 4.896637431777607e-07, + "loss": 0.5241, + "step": 9893 + }, + { + "epoch": 0.8035409729554129, + "grad_norm": 4.075794059330378, + "learning_rate": 4.89272890948021e-07, + "loss": 0.4231, + "step": 9894 + }, + { + "epoch": 0.8036221879314546, + "grad_norm": 3.1839585809732913, + "learning_rate": 4.88882177849955e-07, + "loss": 0.3873, + "step": 9895 + }, + { + "epoch": 0.8037034029074961, + "grad_norm": 3.9421773033440233, + "learning_rate": 4.884916039105994e-07, + "loss": 0.4194, + "step": 9896 + }, + { + "epoch": 0.8037846178835377, + "grad_norm": 4.937157672903017, + "learning_rate": 4.881011691569781e-07, + "loss": 0.3529, + "step": 9897 + }, + { + "epoch": 0.8038658328595794, + "grad_norm": 9.136038936527344, + "learning_rate": 4.877108736161091e-07, + "loss": 0.5727, + "step": 9898 + }, + { + "epoch": 0.8039470478356209, + "grad_norm": 4.5845637025408, + "learning_rate": 4.873207173149974e-07, + "loss": 0.608, + "step": 9899 + }, + { + "epoch": 0.8040282628116625, + "grad_norm": 5.642715408471168, + "learning_rate": 4.869307002806397e-07, + "loss": 0.471, + "step": 9900 + }, + { + "epoch": 0.804109477787704, + "grad_norm": 6.2233329502733294, + "learning_rate": 4.865408225400234e-07, + "loss": 0.51, + "step": 9901 + }, + { + "epoch": 0.8041906927637457, + "grad_norm": 7.993507725893066, + "learning_rate": 4.861510841201266e-07, + "loss": 0.3892, + "step": 9902 + }, + { + "epoch": 0.8042719077397872, + "grad_norm": 4.96226956095677, + "learning_rate": 4.857614850479161e-07, + "loss": 0.5246, + "step": 9903 + }, + { + "epoch": 0.8043531227158288, + "grad_norm": 11.603584675552659, + "learning_rate": 4.853720253503514e-07, + "loss": 0.4751, + "step": 9904 + }, + { + "epoch": 0.8044343376918703, + "grad_norm": 4.342808900661657, + "learning_rate": 4.849827050543801e-07, + "loss": 0.5307, + "step": 9905 + }, + { + "epoch": 0.804515552667912, + "grad_norm": 6.60599795238213, + "learning_rate": 4.845935241869409e-07, + "loss": 0.4751, + "step": 9906 + }, + { + "epoch": 0.8045967676439535, + "grad_norm": 4.155426698257137, + "learning_rate": 4.842044827749632e-07, + "loss": 0.511, + "step": 9907 + }, + { + "epoch": 0.8046779826199951, + "grad_norm": 5.569956854051911, + "learning_rate": 4.838155808453676e-07, + "loss": 0.6412, + "step": 9908 + }, + { + "epoch": 0.8047591975960368, + "grad_norm": 8.778391998701604, + "learning_rate": 4.834268184250626e-07, + "loss": 0.3876, + "step": 9909 + }, + { + "epoch": 0.8048404125720783, + "grad_norm": 6.809131934976907, + "learning_rate": 4.830381955409497e-07, + "loss": 0.438, + "step": 9910 + }, + { + "epoch": 0.8049216275481199, + "grad_norm": 6.2072199104772885, + "learning_rate": 4.826497122199191e-07, + "loss": 0.5043, + "step": 9911 + }, + { + "epoch": 0.8050028425241614, + "grad_norm": 9.779692232356227, + "learning_rate": 4.822613684888519e-07, + "loss": 0.4799, + "step": 9912 + }, + { + "epoch": 0.8050840575002031, + "grad_norm": 5.12858071171813, + "learning_rate": 4.818731643746186e-07, + "loss": 0.5888, + "step": 9913 + }, + { + "epoch": 0.8051652724762446, + "grad_norm": 3.86454745630488, + "learning_rate": 4.814850999040816e-07, + "loss": 0.551, + "step": 9914 + }, + { + "epoch": 0.8052464874522862, + "grad_norm": 4.189470752357414, + "learning_rate": 4.810971751040932e-07, + "loss": 0.4416, + "step": 9915 + }, + { + "epoch": 0.8053277024283277, + "grad_norm": 8.768793347385872, + "learning_rate": 4.80709390001495e-07, + "loss": 0.3583, + "step": 9916 + }, + { + "epoch": 0.8054089174043694, + "grad_norm": 5.814244672682726, + "learning_rate": 4.803217446231206e-07, + "loss": 0.5113, + "step": 9917 + }, + { + "epoch": 0.8054901323804109, + "grad_norm": 4.585923445910094, + "learning_rate": 4.799342389957925e-07, + "loss": 0.4861, + "step": 9918 + }, + { + "epoch": 0.8055713473564525, + "grad_norm": 5.831696175264736, + "learning_rate": 4.795468731463232e-07, + "loss": 0.5333, + "step": 9919 + }, + { + "epoch": 0.8056525623324942, + "grad_norm": 4.199244928525552, + "learning_rate": 4.791596471015175e-07, + "loss": 0.5757, + "step": 9920 + }, + { + "epoch": 0.8057337773085357, + "grad_norm": 4.550642371506264, + "learning_rate": 4.787725608881694e-07, + "loss": 0.5091, + "step": 9921 + }, + { + "epoch": 0.8058149922845773, + "grad_norm": 6.904305095897743, + "learning_rate": 4.783856145330624e-07, + "loss": 0.3853, + "step": 9922 + }, + { + "epoch": 0.8058962072606188, + "grad_norm": 5.386276895757226, + "learning_rate": 4.779988080629722e-07, + "loss": 0.547, + "step": 9923 + }, + { + "epoch": 0.8059774222366605, + "grad_norm": 5.592802881169542, + "learning_rate": 4.776121415046634e-07, + "loss": 0.6014, + "step": 9924 + }, + { + "epoch": 0.806058637212702, + "grad_norm": 6.5204999954371825, + "learning_rate": 4.772256148848903e-07, + "loss": 0.5031, + "step": 9925 + }, + { + "epoch": 0.8061398521887436, + "grad_norm": 4.641348135597619, + "learning_rate": 4.768392282303999e-07, + "loss": 0.4754, + "step": 9926 + }, + { + "epoch": 0.8062210671647851, + "grad_norm": 6.611205228762905, + "learning_rate": 4.7645298156792667e-07, + "loss": 0.6335, + "step": 9927 + }, + { + "epoch": 0.8063022821408268, + "grad_norm": 4.080391885424033, + "learning_rate": 4.7606687492419785e-07, + "loss": 0.5242, + "step": 9928 + }, + { + "epoch": 0.8063834971168683, + "grad_norm": 12.801912551363564, + "learning_rate": 4.7568090832593033e-07, + "loss": 0.4403, + "step": 9929 + }, + { + "epoch": 0.8064647120929099, + "grad_norm": 6.118657536304109, + "learning_rate": 4.752950817998303e-07, + "loss": 0.505, + "step": 9930 + }, + { + "epoch": 0.8065459270689516, + "grad_norm": 5.362943206753638, + "learning_rate": 4.7490939537259527e-07, + "loss": 0.4619, + "step": 9931 + }, + { + "epoch": 0.8066271420449931, + "grad_norm": 6.975033167992144, + "learning_rate": 4.745238490709117e-07, + "loss": 0.3267, + "step": 9932 + }, + { + "epoch": 0.8067083570210347, + "grad_norm": 4.258954432113065, + "learning_rate": 4.741384429214579e-07, + "loss": 0.4316, + "step": 9933 + }, + { + "epoch": 0.8067895719970762, + "grad_norm": 3.8337908412938972, + "learning_rate": 4.7375317695090295e-07, + "loss": 0.5411, + "step": 9934 + }, + { + "epoch": 0.8068707869731179, + "grad_norm": 3.8911807697106213, + "learning_rate": 4.7336805118590375e-07, + "loss": 0.4551, + "step": 9935 + }, + { + "epoch": 0.8069520019491594, + "grad_norm": 3.9612535233371764, + "learning_rate": 4.729830656531101e-07, + "loss": 0.5383, + "step": 9936 + }, + { + "epoch": 0.807033216925201, + "grad_norm": 4.951917020944689, + "learning_rate": 4.725982203791607e-07, + "loss": 0.6442, + "step": 9937 + }, + { + "epoch": 0.8071144319012425, + "grad_norm": 9.313306419072807, + "learning_rate": 4.7221351539068374e-07, + "loss": 0.4088, + "step": 9938 + }, + { + "epoch": 0.8071956468772842, + "grad_norm": 5.792989399998622, + "learning_rate": 4.7182895071430036e-07, + "loss": 0.4883, + "step": 9939 + }, + { + "epoch": 0.8072768618533257, + "grad_norm": 6.612743860989568, + "learning_rate": 4.7144452637661875e-07, + "loss": 0.522, + "step": 9940 + }, + { + "epoch": 0.8073580768293673, + "grad_norm": 5.3380622522214916, + "learning_rate": 4.7106024240424014e-07, + "loss": 0.5033, + "step": 9941 + }, + { + "epoch": 0.807439291805409, + "grad_norm": 10.643012463139256, + "learning_rate": 4.706760988237555e-07, + "loss": 0.4336, + "step": 9942 + }, + { + "epoch": 0.8075205067814505, + "grad_norm": 6.794117107098659, + "learning_rate": 4.702920956617446e-07, + "loss": 0.4817, + "step": 9943 + }, + { + "epoch": 0.8076017217574921, + "grad_norm": 10.634630209323328, + "learning_rate": 4.6990823294477795e-07, + "loss": 0.3929, + "step": 9944 + }, + { + "epoch": 0.8076829367335336, + "grad_norm": 6.387003003213154, + "learning_rate": 4.695245106994181e-07, + "loss": 0.4227, + "step": 9945 + }, + { + "epoch": 0.8077641517095753, + "grad_norm": 5.885749845817612, + "learning_rate": 4.691409289522156e-07, + "loss": 0.4674, + "step": 9946 + }, + { + "epoch": 0.8078453666856168, + "grad_norm": 5.744010803243097, + "learning_rate": 4.6875748772971244e-07, + "loss": 0.6015, + "step": 9947 + }, + { + "epoch": 0.8079265816616584, + "grad_norm": 3.2133616441041704, + "learning_rate": 4.683741870584413e-07, + "loss": 0.4788, + "step": 9948 + }, + { + "epoch": 0.8080077966377, + "grad_norm": 6.807257172917265, + "learning_rate": 4.679910269649246e-07, + "loss": 0.5901, + "step": 9949 + }, + { + "epoch": 0.8080890116137416, + "grad_norm": 3.9285075156101823, + "learning_rate": 4.676080074756745e-07, + "loss": 0.5877, + "step": 9950 + }, + { + "epoch": 0.8081702265897831, + "grad_norm": 5.276676978089501, + "learning_rate": 4.6722512861719304e-07, + "loss": 0.5472, + "step": 9951 + }, + { + "epoch": 0.8082514415658247, + "grad_norm": 5.815263402068283, + "learning_rate": 4.6684239041597524e-07, + "loss": 0.6121, + "step": 9952 + }, + { + "epoch": 0.8083326565418664, + "grad_norm": 8.366540950724946, + "learning_rate": 4.6645979289850316e-07, + "loss": 0.3672, + "step": 9953 + }, + { + "epoch": 0.8084138715179079, + "grad_norm": 4.915630857560634, + "learning_rate": 4.66077336091251e-07, + "loss": 0.4365, + "step": 9954 + }, + { + "epoch": 0.8084950864939495, + "grad_norm": 14.896314047238715, + "learning_rate": 4.6569502002068336e-07, + "loss": 0.3652, + "step": 9955 + }, + { + "epoch": 0.808576301469991, + "grad_norm": 5.45782242106025, + "learning_rate": 4.6531284471325375e-07, + "loss": 0.4285, + "step": 9956 + }, + { + "epoch": 0.8086575164460327, + "grad_norm": 4.550708998274069, + "learning_rate": 4.649308101954064e-07, + "loss": 0.5137, + "step": 9957 + }, + { + "epoch": 0.8087387314220742, + "grad_norm": 4.5179592494433125, + "learning_rate": 4.645489164935774e-07, + "loss": 0.4994, + "step": 9958 + }, + { + "epoch": 0.8088199463981158, + "grad_norm": 8.186932272267061, + "learning_rate": 4.641671636341899e-07, + "loss": 0.4727, + "step": 9959 + }, + { + "epoch": 0.8089011613741574, + "grad_norm": 7.7788457747580635, + "learning_rate": 4.637855516436604e-07, + "loss": 0.4331, + "step": 9960 + }, + { + "epoch": 0.808982376350199, + "grad_norm": 9.523483750496569, + "learning_rate": 4.634040805483947e-07, + "loss": 0.4042, + "step": 9961 + }, + { + "epoch": 0.8090635913262405, + "grad_norm": 4.262182822939964, + "learning_rate": 4.6302275037478804e-07, + "loss": 0.6089, + "step": 9962 + }, + { + "epoch": 0.8091448063022821, + "grad_norm": 5.01145075636782, + "learning_rate": 4.6264156114922605e-07, + "loss": 0.6344, + "step": 9963 + }, + { + "epoch": 0.8092260212783238, + "grad_norm": 6.573486459689827, + "learning_rate": 4.622605128980862e-07, + "loss": 0.3984, + "step": 9964 + }, + { + "epoch": 0.8093072362543653, + "grad_norm": 6.380655374815469, + "learning_rate": 4.61879605647734e-07, + "loss": 0.3601, + "step": 9965 + }, + { + "epoch": 0.8093884512304069, + "grad_norm": 11.839284886283753, + "learning_rate": 4.6149883942452595e-07, + "loss": 0.626, + "step": 9966 + }, + { + "epoch": 0.8094696662064484, + "grad_norm": 6.373110585879932, + "learning_rate": 4.6111821425480956e-07, + "loss": 0.7225, + "step": 9967 + }, + { + "epoch": 0.8095508811824901, + "grad_norm": 7.7580649386331455, + "learning_rate": 4.6073773016492267e-07, + "loss": 0.5471, + "step": 9968 + }, + { + "epoch": 0.8096320961585316, + "grad_norm": 4.824098525076564, + "learning_rate": 4.603573871811923e-07, + "loss": 0.5152, + "step": 9969 + }, + { + "epoch": 0.8097133111345732, + "grad_norm": 2.881153799640226, + "learning_rate": 4.5997718532993535e-07, + "loss": 0.5088, + "step": 9970 + }, + { + "epoch": 0.8097945261106148, + "grad_norm": 6.514585014658824, + "learning_rate": 4.5959712463746144e-07, + "loss": 0.4321, + "step": 9971 + }, + { + "epoch": 0.8098757410866564, + "grad_norm": 7.85731483603247, + "learning_rate": 4.5921720513006697e-07, + "loss": 0.3917, + "step": 9972 + }, + { + "epoch": 0.8099569560626979, + "grad_norm": 5.1598024248620495, + "learning_rate": 4.588374268340412e-07, + "loss": 0.5726, + "step": 9973 + }, + { + "epoch": 0.8100381710387395, + "grad_norm": 4.512666934916995, + "learning_rate": 4.584577897756634e-07, + "loss": 0.5283, + "step": 9974 + }, + { + "epoch": 0.8101193860147812, + "grad_norm": 8.411566127602082, + "learning_rate": 4.58078293981202e-07, + "loss": 0.4524, + "step": 9975 + }, + { + "epoch": 0.8102006009908227, + "grad_norm": 8.15739766766053, + "learning_rate": 4.5769893947691517e-07, + "loss": 0.4705, + "step": 9976 + }, + { + "epoch": 0.8102818159668643, + "grad_norm": 6.8035924289379235, + "learning_rate": 4.5731972628905357e-07, + "loss": 0.4252, + "step": 9977 + }, + { + "epoch": 0.8103630309429058, + "grad_norm": 4.227519769293889, + "learning_rate": 4.5694065444385564e-07, + "loss": 0.5061, + "step": 9978 + }, + { + "epoch": 0.8104442459189475, + "grad_norm": 6.581692909390787, + "learning_rate": 4.5656172396755156e-07, + "loss": 0.4785, + "step": 9979 + }, + { + "epoch": 0.810525460894989, + "grad_norm": 4.606311339058521, + "learning_rate": 4.561829348863622e-07, + "loss": 0.5214, + "step": 9980 + }, + { + "epoch": 0.8106066758710306, + "grad_norm": 6.277335985489802, + "learning_rate": 4.55804287226497e-07, + "loss": 0.2991, + "step": 9981 + }, + { + "epoch": 0.8106878908470722, + "grad_norm": 5.904104493248545, + "learning_rate": 4.5542578101415576e-07, + "loss": 0.4301, + "step": 9982 + }, + { + "epoch": 0.8107691058231138, + "grad_norm": 3.728461827087679, + "learning_rate": 4.550474162755303e-07, + "loss": 0.6069, + "step": 9983 + }, + { + "epoch": 0.8108503207991553, + "grad_norm": 4.995689618250689, + "learning_rate": 4.546691930368008e-07, + "loss": 0.4381, + "step": 9984 + }, + { + "epoch": 0.8109315357751969, + "grad_norm": 6.939274659064955, + "learning_rate": 4.5429111132413773e-07, + "loss": 0.5178, + "step": 9985 + }, + { + "epoch": 0.8110127507512386, + "grad_norm": 6.822025362372183, + "learning_rate": 4.539131711637032e-07, + "loss": 0.3963, + "step": 9986 + }, + { + "epoch": 0.8110939657272801, + "grad_norm": 6.644364425158026, + "learning_rate": 4.535353725816488e-07, + "loss": 0.504, + "step": 9987 + }, + { + "epoch": 0.8111751807033217, + "grad_norm": 5.480802203291707, + "learning_rate": 4.5315771560411617e-07, + "loss": 0.4743, + "step": 9988 + }, + { + "epoch": 0.8112563956793633, + "grad_norm": 5.912237232094158, + "learning_rate": 4.5278020025723596e-07, + "loss": 0.4656, + "step": 9989 + }, + { + "epoch": 0.8113376106554049, + "grad_norm": 5.0776992221582935, + "learning_rate": 4.524028265671318e-07, + "loss": 0.4281, + "step": 9990 + }, + { + "epoch": 0.8114188256314464, + "grad_norm": 5.3270257232616025, + "learning_rate": 4.5202559455991473e-07, + "loss": 0.4505, + "step": 9991 + }, + { + "epoch": 0.811500040607488, + "grad_norm": 6.190730407880878, + "learning_rate": 4.516485042616878e-07, + "loss": 0.4437, + "step": 9992 + }, + { + "epoch": 0.8115812555835296, + "grad_norm": 5.542928858101199, + "learning_rate": 4.512715556985442e-07, + "loss": 0.4011, + "step": 9993 + }, + { + "epoch": 0.8116624705595712, + "grad_norm": 7.642739194941673, + "learning_rate": 4.508947488965662e-07, + "loss": 0.4349, + "step": 9994 + }, + { + "epoch": 0.8117436855356127, + "grad_norm": 7.8072925191375395, + "learning_rate": 4.505180838818263e-07, + "loss": 0.5984, + "step": 9995 + }, + { + "epoch": 0.8118249005116543, + "grad_norm": 4.656078397528526, + "learning_rate": 4.501415606803888e-07, + "loss": 0.4646, + "step": 9996 + }, + { + "epoch": 0.811906115487696, + "grad_norm": 4.525381591463549, + "learning_rate": 4.4976517931830637e-07, + "loss": 0.4135, + "step": 9997 + }, + { + "epoch": 0.8119873304637375, + "grad_norm": 5.890129159202458, + "learning_rate": 4.4938893982162253e-07, + "loss": 0.5532, + "step": 9998 + }, + { + "epoch": 0.8120685454397791, + "grad_norm": 6.903081683256655, + "learning_rate": 4.4901284221637113e-07, + "loss": 0.5935, + "step": 9999 + }, + { + "epoch": 0.8121497604158207, + "grad_norm": 19.18119241326282, + "learning_rate": 4.48636886528577e-07, + "loss": 0.4778, + "step": 10000 + }, + { + "epoch": 0.8122309753918623, + "grad_norm": 6.157895091798137, + "learning_rate": 4.482610727842532e-07, + "loss": 0.69, + "step": 10001 + }, + { + "epoch": 0.8123121903679038, + "grad_norm": 5.599909239938712, + "learning_rate": 4.47885401009405e-07, + "loss": 0.4685, + "step": 10002 + }, + { + "epoch": 0.8123934053439454, + "grad_norm": 6.484452032525161, + "learning_rate": 4.475098712300263e-07, + "loss": 0.5125, + "step": 10003 + }, + { + "epoch": 0.812474620319987, + "grad_norm": 8.535162150811436, + "learning_rate": 4.4713448347210114e-07, + "loss": 0.3792, + "step": 10004 + }, + { + "epoch": 0.8125558352960286, + "grad_norm": 13.187357588897884, + "learning_rate": 4.4675923776160533e-07, + "loss": 0.4935, + "step": 10005 + }, + { + "epoch": 0.8126370502720701, + "grad_norm": 7.720879346905562, + "learning_rate": 4.463841341245043e-07, + "loss": 0.6142, + "step": 10006 + }, + { + "epoch": 0.8127182652481117, + "grad_norm": 4.332760377433552, + "learning_rate": 4.460091725867524e-07, + "loss": 0.436, + "step": 10007 + }, + { + "epoch": 0.8127994802241534, + "grad_norm": 6.514614871896753, + "learning_rate": 4.456343531742946e-07, + "loss": 0.4141, + "step": 10008 + }, + { + "epoch": 0.8128806952001949, + "grad_norm": 5.3062919238625605, + "learning_rate": 4.4525967591306757e-07, + "loss": 0.4301, + "step": 10009 + }, + { + "epoch": 0.8129619101762365, + "grad_norm": 9.134817655594835, + "learning_rate": 4.448851408289964e-07, + "loss": 0.5738, + "step": 10010 + }, + { + "epoch": 0.8130431251522781, + "grad_norm": 4.873683627050256, + "learning_rate": 4.4451074794799627e-07, + "loss": 0.304, + "step": 10011 + }, + { + "epoch": 0.8131243401283197, + "grad_norm": 3.4718602879812437, + "learning_rate": 4.4413649729597386e-07, + "loss": 0.3923, + "step": 10012 + }, + { + "epoch": 0.8132055551043612, + "grad_norm": 4.570485265053012, + "learning_rate": 4.43762388898826e-07, + "loss": 0.4372, + "step": 10013 + }, + { + "epoch": 0.8132867700804028, + "grad_norm": 3.9431671288156664, + "learning_rate": 4.4338842278243784e-07, + "loss": 0.4041, + "step": 10014 + }, + { + "epoch": 0.8133679850564444, + "grad_norm": 9.625205916829707, + "learning_rate": 4.4301459897268695e-07, + "loss": 0.4428, + "step": 10015 + }, + { + "epoch": 0.813449200032486, + "grad_norm": 7.135399774793007, + "learning_rate": 4.426409174954391e-07, + "loss": 0.5457, + "step": 10016 + }, + { + "epoch": 0.8135304150085275, + "grad_norm": 7.241634226766735, + "learning_rate": 4.4226737837655106e-07, + "loss": 0.4478, + "step": 10017 + }, + { + "epoch": 0.8136116299845692, + "grad_norm": 5.474906261114699, + "learning_rate": 4.418939816418699e-07, + "loss": 0.4813, + "step": 10018 + }, + { + "epoch": 0.8136928449606108, + "grad_norm": 7.248288071920974, + "learning_rate": 4.4152072731723336e-07, + "loss": 0.5229, + "step": 10019 + }, + { + "epoch": 0.8137740599366523, + "grad_norm": 4.68601406629194, + "learning_rate": 4.411476154284683e-07, + "loss": 0.4383, + "step": 10020 + }, + { + "epoch": 0.8138552749126939, + "grad_norm": 8.548985633110162, + "learning_rate": 4.407746460013912e-07, + "loss": 0.434, + "step": 10021 + }, + { + "epoch": 0.8139364898887355, + "grad_norm": 3.598258827038912, + "learning_rate": 4.404018190618109e-07, + "loss": 0.6293, + "step": 10022 + }, + { + "epoch": 0.8140177048647771, + "grad_norm": 4.9162560687761525, + "learning_rate": 4.4002913463552457e-07, + "loss": 0.6338, + "step": 10023 + }, + { + "epoch": 0.8140989198408186, + "grad_norm": 5.176333776815743, + "learning_rate": 4.39656592748319e-07, + "loss": 0.4134, + "step": 10024 + }, + { + "epoch": 0.8141801348168602, + "grad_norm": 5.309773965001737, + "learning_rate": 4.392841934259731e-07, + "loss": 0.4122, + "step": 10025 + }, + { + "epoch": 0.8142613497929018, + "grad_norm": 4.204661866235209, + "learning_rate": 4.3891193669425567e-07, + "loss": 0.5778, + "step": 10026 + }, + { + "epoch": 0.8143425647689434, + "grad_norm": 5.654465245751199, + "learning_rate": 4.3853982257892335e-07, + "loss": 0.5514, + "step": 10027 + }, + { + "epoch": 0.8144237797449849, + "grad_norm": 4.506512234014459, + "learning_rate": 4.3816785110572554e-07, + "loss": 0.537, + "step": 10028 + }, + { + "epoch": 0.8145049947210266, + "grad_norm": 3.728172094291727, + "learning_rate": 4.3779602230040075e-07, + "loss": 0.4989, + "step": 10029 + }, + { + "epoch": 0.8145862096970682, + "grad_norm": 7.112854813373821, + "learning_rate": 4.3742433618867623e-07, + "loss": 0.4539, + "step": 10030 + }, + { + "epoch": 0.8146674246731097, + "grad_norm": 4.9529582527943745, + "learning_rate": 4.370527927962717e-07, + "loss": 0.6694, + "step": 10031 + }, + { + "epoch": 0.8147486396491513, + "grad_norm": 5.619899087759572, + "learning_rate": 4.366813921488966e-07, + "loss": 0.5285, + "step": 10032 + }, + { + "epoch": 0.8148298546251929, + "grad_norm": 5.707386464605084, + "learning_rate": 4.363101342722484e-07, + "loss": 0.4412, + "step": 10033 + }, + { + "epoch": 0.8149110696012345, + "grad_norm": 4.86389548489138, + "learning_rate": 4.359390191920176e-07, + "loss": 0.5749, + "step": 10034 + }, + { + "epoch": 0.814992284577276, + "grad_norm": 5.994879397108235, + "learning_rate": 4.35568046933883e-07, + "loss": 0.3602, + "step": 10035 + }, + { + "epoch": 0.8150734995533176, + "grad_norm": 3.0944128092213523, + "learning_rate": 4.3519721752351305e-07, + "loss": 0.448, + "step": 10036 + }, + { + "epoch": 0.8151547145293592, + "grad_norm": 6.293183554065846, + "learning_rate": 4.3482653098656764e-07, + "loss": 0.5155, + "step": 10037 + }, + { + "epoch": 0.8152359295054008, + "grad_norm": 6.726079260676008, + "learning_rate": 4.3445598734869725e-07, + "loss": 0.5044, + "step": 10038 + }, + { + "epoch": 0.8153171444814423, + "grad_norm": 5.929353123479193, + "learning_rate": 4.340855866355409e-07, + "loss": 0.3846, + "step": 10039 + }, + { + "epoch": 0.815398359457484, + "grad_norm": 4.346247885330788, + "learning_rate": 4.3371532887272747e-07, + "loss": 0.4431, + "step": 10040 + }, + { + "epoch": 0.8154795744335256, + "grad_norm": 6.564210241083292, + "learning_rate": 4.333452140858782e-07, + "loss": 0.4179, + "step": 10041 + }, + { + "epoch": 0.8155607894095671, + "grad_norm": 6.160516432704893, + "learning_rate": 4.3297524230060257e-07, + "loss": 0.4109, + "step": 10042 + }, + { + "epoch": 0.8156420043856087, + "grad_norm": 4.046212520398475, + "learning_rate": 4.326054135425001e-07, + "loss": 0.4727, + "step": 10043 + }, + { + "epoch": 0.8157232193616503, + "grad_norm": 4.21989184407862, + "learning_rate": 4.322357278371614e-07, + "loss": 0.5261, + "step": 10044 + }, + { + "epoch": 0.8158044343376919, + "grad_norm": 5.065472356667533, + "learning_rate": 4.3186618521016745e-07, + "loss": 0.4124, + "step": 10045 + }, + { + "epoch": 0.8158856493137334, + "grad_norm": 6.317472489620383, + "learning_rate": 4.314967856870872e-07, + "loss": 0.439, + "step": 10046 + }, + { + "epoch": 0.815966864289775, + "grad_norm": 6.622990894266353, + "learning_rate": 4.31127529293483e-07, + "loss": 0.2964, + "step": 10047 + }, + { + "epoch": 0.8160480792658166, + "grad_norm": 5.343326462409636, + "learning_rate": 4.3075841605490414e-07, + "loss": 0.4928, + "step": 10048 + }, + { + "epoch": 0.8161292942418582, + "grad_norm": 7.146378891333632, + "learning_rate": 4.3038944599689105e-07, + "loss": 0.3974, + "step": 10049 + }, + { + "epoch": 0.8162105092178997, + "grad_norm": 4.603775445290206, + "learning_rate": 4.300206191449749e-07, + "loss": 0.3021, + "step": 10050 + }, + { + "epoch": 0.8162917241939414, + "grad_norm": 6.382432355081491, + "learning_rate": 4.2965193552467753e-07, + "loss": 0.4456, + "step": 10051 + }, + { + "epoch": 0.816372939169983, + "grad_norm": 5.191619470887934, + "learning_rate": 4.292833951615083e-07, + "loss": 0.3823, + "step": 10052 + }, + { + "epoch": 0.8164541541460245, + "grad_norm": 3.9888610071010224, + "learning_rate": 4.289149980809698e-07, + "loss": 0.6601, + "step": 10053 + }, + { + "epoch": 0.8165353691220661, + "grad_norm": 8.500802989835256, + "learning_rate": 4.2854674430855224e-07, + "loss": 0.4692, + "step": 10054 + }, + { + "epoch": 0.8166165840981077, + "grad_norm": 4.251250937241956, + "learning_rate": 4.281786338697369e-07, + "loss": 0.4196, + "step": 10055 + }, + { + "epoch": 0.8166977990741493, + "grad_norm": 5.93357022678489, + "learning_rate": 4.278106667899945e-07, + "loss": 0.7008, + "step": 10056 + }, + { + "epoch": 0.8167790140501908, + "grad_norm": 4.588729939434148, + "learning_rate": 4.274428430947872e-07, + "loss": 0.4552, + "step": 10057 + }, + { + "epoch": 0.8168602290262325, + "grad_norm": 6.368147028406131, + "learning_rate": 4.270751628095668e-07, + "loss": 0.3992, + "step": 10058 + }, + { + "epoch": 0.816941444002274, + "grad_norm": 6.6517461490094565, + "learning_rate": 4.2670762595977356e-07, + "loss": 0.4615, + "step": 10059 + }, + { + "epoch": 0.8170226589783156, + "grad_norm": 8.295784988993061, + "learning_rate": 4.2634023257084074e-07, + "loss": 0.5174, + "step": 10060 + }, + { + "epoch": 0.8171038739543571, + "grad_norm": 5.523298295622445, + "learning_rate": 4.259729826681891e-07, + "loss": 0.3388, + "step": 10061 + }, + { + "epoch": 0.8171850889303988, + "grad_norm": 3.8441813036836137, + "learning_rate": 4.2560587627722973e-07, + "loss": 0.4553, + "step": 10062 + }, + { + "epoch": 0.8172663039064404, + "grad_norm": 6.951945297996189, + "learning_rate": 4.2523891342336506e-07, + "loss": 0.5312, + "step": 10063 + }, + { + "epoch": 0.8173475188824819, + "grad_norm": 4.871582562337795, + "learning_rate": 4.2487209413198784e-07, + "loss": 0.3208, + "step": 10064 + }, + { + "epoch": 0.8174287338585235, + "grad_norm": 7.346570602660871, + "learning_rate": 4.245054184284786e-07, + "loss": 0.4115, + "step": 10065 + }, + { + "epoch": 0.8175099488345651, + "grad_norm": 4.392641843655781, + "learning_rate": 4.2413888633821064e-07, + "loss": 0.5406, + "step": 10066 + }, + { + "epoch": 0.8175911638106067, + "grad_norm": 7.23578723028704, + "learning_rate": 4.237724978865454e-07, + "loss": 0.4598, + "step": 10067 + }, + { + "epoch": 0.8176723787866482, + "grad_norm": 3.553356832932114, + "learning_rate": 4.234062530988342e-07, + "loss": 0.5109, + "step": 10068 + }, + { + "epoch": 0.8177535937626899, + "grad_norm": 5.0655040941198255, + "learning_rate": 4.2304015200042095e-07, + "loss": 0.6022, + "step": 10069 + }, + { + "epoch": 0.8178348087387314, + "grad_norm": 4.3230856397717226, + "learning_rate": 4.2267419461663626e-07, + "loss": 0.4315, + "step": 10070 + }, + { + "epoch": 0.817916023714773, + "grad_norm": 4.6434028931530715, + "learning_rate": 4.223083809728032e-07, + "loss": 0.489, + "step": 10071 + }, + { + "epoch": 0.8179972386908145, + "grad_norm": 3.9428152943512673, + "learning_rate": 4.219427110942348e-07, + "loss": 0.5609, + "step": 10072 + }, + { + "epoch": 0.8180784536668562, + "grad_norm": 3.9140488822139243, + "learning_rate": 4.215771850062328e-07, + "loss": 0.5247, + "step": 10073 + }, + { + "epoch": 0.8181596686428978, + "grad_norm": 5.179289360939204, + "learning_rate": 4.2121180273408976e-07, + "loss": 0.4456, + "step": 10074 + }, + { + "epoch": 0.8182408836189393, + "grad_norm": 6.433424694858643, + "learning_rate": 4.2084656430308765e-07, + "loss": 0.4248, + "step": 10075 + }, + { + "epoch": 0.818322098594981, + "grad_norm": 3.365422337217589, + "learning_rate": 4.204814697384993e-07, + "loss": 0.4796, + "step": 10076 + }, + { + "epoch": 0.8184033135710225, + "grad_norm": 7.511774642841241, + "learning_rate": 4.2011651906558815e-07, + "loss": 0.6421, + "step": 10077 + }, + { + "epoch": 0.8184845285470641, + "grad_norm": 8.546191005512076, + "learning_rate": 4.1975171230960563e-07, + "loss": 0.3865, + "step": 10078 + }, + { + "epoch": 0.8185657435231056, + "grad_norm": 4.417370933628543, + "learning_rate": 4.193870494957958e-07, + "loss": 0.453, + "step": 10079 + }, + { + "epoch": 0.8186469584991473, + "grad_norm": 6.377398869498588, + "learning_rate": 4.190225306493906e-07, + "loss": 0.4526, + "step": 10080 + }, + { + "epoch": 0.8187281734751888, + "grad_norm": 4.132468182015889, + "learning_rate": 4.186581557956124e-07, + "loss": 0.5708, + "step": 10081 + }, + { + "epoch": 0.8188093884512304, + "grad_norm": 5.42105998479533, + "learning_rate": 4.1829392495967485e-07, + "loss": 0.5129, + "step": 10082 + }, + { + "epoch": 0.8188906034272719, + "grad_norm": 5.495216782158462, + "learning_rate": 4.1792983816677987e-07, + "loss": 0.5831, + "step": 10083 + }, + { + "epoch": 0.8189718184033136, + "grad_norm": 3.395888728224807, + "learning_rate": 4.175658954421208e-07, + "loss": 0.5072, + "step": 10084 + }, + { + "epoch": 0.8190530333793552, + "grad_norm": 5.212070130995405, + "learning_rate": 4.172020968108814e-07, + "loss": 0.5714, + "step": 10085 + }, + { + "epoch": 0.8191342483553967, + "grad_norm": 5.41209526251742, + "learning_rate": 4.168384422982338e-07, + "loss": 0.5516, + "step": 10086 + }, + { + "epoch": 0.8192154633314384, + "grad_norm": 5.766740901549195, + "learning_rate": 4.164749319293404e-07, + "loss": 0.54, + "step": 10087 + }, + { + "epoch": 0.8192966783074799, + "grad_norm": 12.522728877358098, + "learning_rate": 4.1611156572935545e-07, + "loss": 0.5872, + "step": 10088 + }, + { + "epoch": 0.8193778932835215, + "grad_norm": 7.055171169509863, + "learning_rate": 4.1574834372342053e-07, + "loss": 0.3578, + "step": 10089 + }, + { + "epoch": 0.819459108259563, + "grad_norm": 10.127142685814183, + "learning_rate": 4.153852659366697e-07, + "loss": 0.4256, + "step": 10090 + }, + { + "epoch": 0.8195403232356047, + "grad_norm": 6.833361098639403, + "learning_rate": 4.1502233239422624e-07, + "loss": 0.5271, + "step": 10091 + }, + { + "epoch": 0.8196215382116462, + "grad_norm": 3.7527822726714057, + "learning_rate": 4.14659543121203e-07, + "loss": 0.4952, + "step": 10092 + }, + { + "epoch": 0.8197027531876878, + "grad_norm": 8.853385017010872, + "learning_rate": 4.1429689814270284e-07, + "loss": 0.4493, + "step": 10093 + }, + { + "epoch": 0.8197839681637293, + "grad_norm": 4.804086833503905, + "learning_rate": 4.139343974838181e-07, + "loss": 0.4145, + "step": 10094 + }, + { + "epoch": 0.819865183139771, + "grad_norm": 2.9666099986165473, + "learning_rate": 4.135720411696334e-07, + "loss": 0.3967, + "step": 10095 + }, + { + "epoch": 0.8199463981158126, + "grad_norm": 6.498314301540783, + "learning_rate": 4.132098292252204e-07, + "loss": 0.3838, + "step": 10096 + }, + { + "epoch": 0.8200276130918541, + "grad_norm": 5.829495180353205, + "learning_rate": 4.128477616756432e-07, + "loss": 0.4869, + "step": 10097 + }, + { + "epoch": 0.8201088280678958, + "grad_norm": 5.33074506215202, + "learning_rate": 4.124858385459554e-07, + "loss": 0.4165, + "step": 10098 + }, + { + "epoch": 0.8201900430439373, + "grad_norm": 4.833962665019804, + "learning_rate": 4.1212405986119975e-07, + "loss": 0.546, + "step": 10099 + }, + { + "epoch": 0.8202712580199789, + "grad_norm": 4.555452441900101, + "learning_rate": 4.117624256464084e-07, + "loss": 0.4589, + "step": 10100 + }, + { + "epoch": 0.8203524729960204, + "grad_norm": 3.2866233085274623, + "learning_rate": 4.114009359266061e-07, + "loss": 0.482, + "step": 10101 + }, + { + "epoch": 0.8204336879720621, + "grad_norm": 5.633628638636074, + "learning_rate": 4.1103959072680446e-07, + "loss": 0.3467, + "step": 10102 + }, + { + "epoch": 0.8205149029481036, + "grad_norm": 2.6695213124316703, + "learning_rate": 4.106783900720074e-07, + "loss": 0.5319, + "step": 10103 + }, + { + "epoch": 0.8205961179241452, + "grad_norm": 6.398738658758001, + "learning_rate": 4.1031733398720906e-07, + "loss": 0.4174, + "step": 10104 + }, + { + "epoch": 0.8206773329001867, + "grad_norm": 10.457002673876866, + "learning_rate": 4.099564224973915e-07, + "loss": 0.4107, + "step": 10105 + }, + { + "epoch": 0.8207585478762284, + "grad_norm": 3.806681811308465, + "learning_rate": 4.0959565562752767e-07, + "loss": 0.5535, + "step": 10106 + }, + { + "epoch": 0.82083976285227, + "grad_norm": 3.11228162293575, + "learning_rate": 4.092350334025816e-07, + "loss": 0.5205, + "step": 10107 + }, + { + "epoch": 0.8209209778283115, + "grad_norm": 6.448242144302385, + "learning_rate": 4.0887455584750547e-07, + "loss": 0.5066, + "step": 10108 + }, + { + "epoch": 0.8210021928043532, + "grad_norm": 4.566144679361284, + "learning_rate": 4.0851422298724354e-07, + "loss": 0.6172, + "step": 10109 + }, + { + "epoch": 0.8210834077803947, + "grad_norm": 5.487995399366438, + "learning_rate": 4.081540348467278e-07, + "loss": 0.4025, + "step": 10110 + }, + { + "epoch": 0.8211646227564363, + "grad_norm": 5.156550511751783, + "learning_rate": 4.0779399145088247e-07, + "loss": 0.4171, + "step": 10111 + }, + { + "epoch": 0.8212458377324778, + "grad_norm": 5.693851091075354, + "learning_rate": 4.074340928246201e-07, + "loss": 0.4858, + "step": 10112 + }, + { + "epoch": 0.8213270527085195, + "grad_norm": 10.430588612793592, + "learning_rate": 4.0707433899284333e-07, + "loss": 0.4029, + "step": 10113 + }, + { + "epoch": 0.821408267684561, + "grad_norm": 6.3587031868115975, + "learning_rate": 4.067147299804458e-07, + "loss": 0.4363, + "step": 10114 + }, + { + "epoch": 0.8214894826606026, + "grad_norm": 6.255964758565154, + "learning_rate": 4.063552658123102e-07, + "loss": 0.5734, + "step": 10115 + }, + { + "epoch": 0.8215706976366441, + "grad_norm": 5.605733474613555, + "learning_rate": 4.0599594651330956e-07, + "loss": 0.5963, + "step": 10116 + }, + { + "epoch": 0.8216519126126858, + "grad_norm": 4.079641751194994, + "learning_rate": 4.0563677210830763e-07, + "loss": 0.5508, + "step": 10117 + }, + { + "epoch": 0.8217331275887274, + "grad_norm": 6.640081496102607, + "learning_rate": 4.0527774262215687e-07, + "loss": 0.3412, + "step": 10118 + }, + { + "epoch": 0.8218143425647689, + "grad_norm": 5.1973586672338845, + "learning_rate": 4.049188580796995e-07, + "loss": 0.3926, + "step": 10119 + }, + { + "epoch": 0.8218955575408106, + "grad_norm": 6.292394610949972, + "learning_rate": 4.0456011850576985e-07, + "loss": 0.3678, + "step": 10120 + }, + { + "epoch": 0.8219767725168521, + "grad_norm": 7.307142444437324, + "learning_rate": 4.0420152392518926e-07, + "loss": 0.4533, + "step": 10121 + }, + { + "epoch": 0.8220579874928937, + "grad_norm": 4.1068923254830345, + "learning_rate": 4.038430743627714e-07, + "loss": 0.535, + "step": 10122 + }, + { + "epoch": 0.8221392024689352, + "grad_norm": 10.080633399311306, + "learning_rate": 4.0348476984331977e-07, + "loss": 0.5241, + "step": 10123 + }, + { + "epoch": 0.8222204174449769, + "grad_norm": 5.11781610289055, + "learning_rate": 4.031266103916262e-07, + "loss": 0.6826, + "step": 10124 + }, + { + "epoch": 0.8223016324210184, + "grad_norm": 10.246403450559914, + "learning_rate": 4.0276859603247317e-07, + "loss": 0.5244, + "step": 10125 + }, + { + "epoch": 0.82238284739706, + "grad_norm": 5.888899275267595, + "learning_rate": 4.0241072679063437e-07, + "loss": 0.5099, + "step": 10126 + }, + { + "epoch": 0.8224640623731015, + "grad_norm": 4.555118993832531, + "learning_rate": 4.02053002690872e-07, + "loss": 0.4915, + "step": 10127 + }, + { + "epoch": 0.8225452773491432, + "grad_norm": 6.444960720915754, + "learning_rate": 4.016954237579382e-07, + "loss": 0.6126, + "step": 10128 + }, + { + "epoch": 0.8226264923251848, + "grad_norm": 5.028533921984391, + "learning_rate": 4.013379900165756e-07, + "loss": 0.4286, + "step": 10129 + }, + { + "epoch": 0.8227077073012263, + "grad_norm": 9.57264830043398, + "learning_rate": 4.009807014915179e-07, + "loss": 0.634, + "step": 10130 + }, + { + "epoch": 0.822788922277268, + "grad_norm": 3.535035189189592, + "learning_rate": 4.006235582074866e-07, + "loss": 0.519, + "step": 10131 + }, + { + "epoch": 0.8228701372533095, + "grad_norm": 5.20695512400172, + "learning_rate": 4.002665601891939e-07, + "loss": 0.4938, + "step": 10132 + }, + { + "epoch": 0.8229513522293511, + "grad_norm": 4.97816750794527, + "learning_rate": 3.9990970746134283e-07, + "loss": 0.3337, + "step": 10133 + }, + { + "epoch": 0.8230325672053926, + "grad_norm": 4.2871354698006945, + "learning_rate": 3.99553000048625e-07, + "loss": 0.4538, + "step": 10134 + }, + { + "epoch": 0.8231137821814343, + "grad_norm": 17.838493735891614, + "learning_rate": 3.991964379757232e-07, + "loss": 0.5731, + "step": 10135 + }, + { + "epoch": 0.8231949971574758, + "grad_norm": 5.170687504793261, + "learning_rate": 3.988400212673099e-07, + "loss": 0.5173, + "step": 10136 + }, + { + "epoch": 0.8232762121335174, + "grad_norm": 6.426083270641829, + "learning_rate": 3.9848374994804734e-07, + "loss": 0.5226, + "step": 10137 + }, + { + "epoch": 0.823357427109559, + "grad_norm": 3.7713552353157476, + "learning_rate": 3.9812762404258605e-07, + "loss": 0.4277, + "step": 10138 + }, + { + "epoch": 0.8234386420856006, + "grad_norm": 6.423516094367269, + "learning_rate": 3.977716435755702e-07, + "loss": 0.4386, + "step": 10139 + }, + { + "epoch": 0.8235198570616422, + "grad_norm": 5.7394083306414325, + "learning_rate": 3.9741580857163036e-07, + "loss": 0.38, + "step": 10140 + }, + { + "epoch": 0.8236010720376837, + "grad_norm": 5.863968524558383, + "learning_rate": 3.9706011905538827e-07, + "loss": 0.3539, + "step": 10141 + }, + { + "epoch": 0.8236822870137254, + "grad_norm": 7.197622618993387, + "learning_rate": 3.9670457505145643e-07, + "loss": 0.6156, + "step": 10142 + }, + { + "epoch": 0.8237635019897669, + "grad_norm": 9.307547582177493, + "learning_rate": 3.963491765844371e-07, + "loss": 0.5406, + "step": 10143 + }, + { + "epoch": 0.8238447169658085, + "grad_norm": 3.8307158333938185, + "learning_rate": 3.959939236789212e-07, + "loss": 0.431, + "step": 10144 + }, + { + "epoch": 0.82392593194185, + "grad_norm": 5.112333570814876, + "learning_rate": 3.9563881635948984e-07, + "loss": 0.4988, + "step": 10145 + }, + { + "epoch": 0.8240071469178917, + "grad_norm": 5.863847427204812, + "learning_rate": 3.9528385465071594e-07, + "loss": 0.6043, + "step": 10146 + }, + { + "epoch": 0.8240883618939332, + "grad_norm": 5.626678027585737, + "learning_rate": 3.949290385771595e-07, + "loss": 0.431, + "step": 10147 + }, + { + "epoch": 0.8241695768699748, + "grad_norm": 4.081512795131598, + "learning_rate": 3.945743681633729e-07, + "loss": 0.5401, + "step": 10148 + }, + { + "epoch": 0.8242507918460164, + "grad_norm": 4.533747792384772, + "learning_rate": 3.9421984343389756e-07, + "loss": 0.5523, + "step": 10149 + }, + { + "epoch": 0.824332006822058, + "grad_norm": 3.743850075034431, + "learning_rate": 3.9386546441326444e-07, + "loss": 0.4706, + "step": 10150 + }, + { + "epoch": 0.8244132217980996, + "grad_norm": 23.913229954006653, + "learning_rate": 3.9351123112599393e-07, + "loss": 0.6842, + "step": 10151 + }, + { + "epoch": 0.8244944367741411, + "grad_norm": 7.100866725720028, + "learning_rate": 3.931571435965986e-07, + "loss": 0.377, + "step": 10152 + }, + { + "epoch": 0.8245756517501828, + "grad_norm": 8.290123785692964, + "learning_rate": 3.9280320184957864e-07, + "loss": 0.5232, + "step": 10153 + }, + { + "epoch": 0.8246568667262243, + "grad_norm": 4.89062234331338, + "learning_rate": 3.9244940590942413e-07, + "loss": 0.4422, + "step": 10154 + }, + { + "epoch": 0.8247380817022659, + "grad_norm": 7.550251223417641, + "learning_rate": 3.9209575580061663e-07, + "loss": 0.2946, + "step": 10155 + }, + { + "epoch": 0.8248192966783074, + "grad_norm": 10.05450235709694, + "learning_rate": 3.9174225154762766e-07, + "loss": 0.6482, + "step": 10156 + }, + { + "epoch": 0.8249005116543491, + "grad_norm": 5.365029681588859, + "learning_rate": 3.9138889317491656e-07, + "loss": 0.5071, + "step": 10157 + }, + { + "epoch": 0.8249817266303906, + "grad_norm": 5.609914610484028, + "learning_rate": 3.9103568070693485e-07, + "loss": 0.5451, + "step": 10158 + }, + { + "epoch": 0.8250629416064322, + "grad_norm": 4.514680074383672, + "learning_rate": 3.906826141681225e-07, + "loss": 0.532, + "step": 10159 + }, + { + "epoch": 0.8251441565824738, + "grad_norm": 10.206347105991203, + "learning_rate": 3.903296935829093e-07, + "loss": 0.5132, + "step": 10160 + }, + { + "epoch": 0.8252253715585154, + "grad_norm": 6.858018717895062, + "learning_rate": 3.8997691897571577e-07, + "loss": 0.4462, + "step": 10161 + }, + { + "epoch": 0.825306586534557, + "grad_norm": 5.173227278123777, + "learning_rate": 3.896242903709532e-07, + "loss": 0.5436, + "step": 10162 + }, + { + "epoch": 0.8253878015105985, + "grad_norm": 11.450687404556978, + "learning_rate": 3.8927180779302076e-07, + "loss": 0.4657, + "step": 10163 + }, + { + "epoch": 0.8254690164866402, + "grad_norm": 4.609261708122115, + "learning_rate": 3.889194712663075e-07, + "loss": 0.4993, + "step": 10164 + }, + { + "epoch": 0.8255502314626817, + "grad_norm": 3.5084080535675026, + "learning_rate": 3.885672808151947e-07, + "loss": 0.4571, + "step": 10165 + }, + { + "epoch": 0.8256314464387233, + "grad_norm": 21.30482869821724, + "learning_rate": 3.882152364640518e-07, + "loss": 0.3259, + "step": 10166 + }, + { + "epoch": 0.8257126614147648, + "grad_norm": 4.848731610369329, + "learning_rate": 3.878633382372371e-07, + "loss": 0.5129, + "step": 10167 + }, + { + "epoch": 0.8257938763908065, + "grad_norm": 4.713993311348796, + "learning_rate": 3.875115861591014e-07, + "loss": 0.4037, + "step": 10168 + }, + { + "epoch": 0.825875091366848, + "grad_norm": 6.885554383643488, + "learning_rate": 3.871599802539841e-07, + "loss": 0.2991, + "step": 10169 + }, + { + "epoch": 0.8259563063428896, + "grad_norm": 11.560653980725935, + "learning_rate": 3.868085205462135e-07, + "loss": 0.4264, + "step": 10170 + }, + { + "epoch": 0.8260375213189312, + "grad_norm": 18.94305982548915, + "learning_rate": 3.8645720706010997e-07, + "loss": 0.4572, + "step": 10171 + }, + { + "epoch": 0.8261187362949728, + "grad_norm": 4.3868517626424754, + "learning_rate": 3.8610603981998204e-07, + "loss": 0.4054, + "step": 10172 + }, + { + "epoch": 0.8261999512710144, + "grad_norm": 10.379799445814022, + "learning_rate": 3.85755018850128e-07, + "loss": 0.4782, + "step": 10173 + }, + { + "epoch": 0.8262811662470559, + "grad_norm": 5.464482264230086, + "learning_rate": 3.854041441748371e-07, + "loss": 0.4246, + "step": 10174 + }, + { + "epoch": 0.8263623812230976, + "grad_norm": 5.259022216933499, + "learning_rate": 3.8505341581838854e-07, + "loss": 0.4936, + "step": 10175 + }, + { + "epoch": 0.8264435961991391, + "grad_norm": 3.834344996242316, + "learning_rate": 3.8470283380504987e-07, + "loss": 0.47, + "step": 10176 + }, + { + "epoch": 0.8265248111751807, + "grad_norm": 9.45059948818576, + "learning_rate": 3.8435239815908077e-07, + "loss": 0.4509, + "step": 10177 + }, + { + "epoch": 0.8266060261512223, + "grad_norm": 4.5939757403211585, + "learning_rate": 3.8400210890472883e-07, + "loss": 0.3479, + "step": 10178 + }, + { + "epoch": 0.8266872411272639, + "grad_norm": 5.823737388475345, + "learning_rate": 3.836519660662313e-07, + "loss": 0.3906, + "step": 10179 + }, + { + "epoch": 0.8267684561033054, + "grad_norm": 4.6299433684789095, + "learning_rate": 3.8330196966781723e-07, + "loss": 0.5363, + "step": 10180 + }, + { + "epoch": 0.826849671079347, + "grad_norm": 6.306388306208667, + "learning_rate": 3.829521197337052e-07, + "loss": 0.433, + "step": 10181 + }, + { + "epoch": 0.8269308860553886, + "grad_norm": 5.650157408413531, + "learning_rate": 3.8260241628810203e-07, + "loss": 0.5346, + "step": 10182 + }, + { + "epoch": 0.8270121010314302, + "grad_norm": 4.139343726577053, + "learning_rate": 3.8225285935520493e-07, + "loss": 0.5034, + "step": 10183 + }, + { + "epoch": 0.8270933160074718, + "grad_norm": 4.27842292241429, + "learning_rate": 3.8190344895920246e-07, + "loss": 0.4205, + "step": 10184 + }, + { + "epoch": 0.8271745309835133, + "grad_norm": 4.60689072938466, + "learning_rate": 3.815541851242713e-07, + "loss": 0.5121, + "step": 10185 + }, + { + "epoch": 0.827255745959555, + "grad_norm": 14.92261155989774, + "learning_rate": 3.812050678745785e-07, + "loss": 0.3954, + "step": 10186 + }, + { + "epoch": 0.8273369609355965, + "grad_norm": 4.53044644344942, + "learning_rate": 3.808560972342812e-07, + "loss": 0.5392, + "step": 10187 + }, + { + "epoch": 0.8274181759116381, + "grad_norm": 5.540716412331475, + "learning_rate": 3.8050727322752726e-07, + "loss": 0.5574, + "step": 10188 + }, + { + "epoch": 0.8274993908876797, + "grad_norm": 8.417967136559637, + "learning_rate": 3.8015859587845233e-07, + "loss": 0.3885, + "step": 10189 + }, + { + "epoch": 0.8275806058637213, + "grad_norm": 53.802360428112415, + "learning_rate": 3.798100652111839e-07, + "loss": 0.4015, + "step": 10190 + }, + { + "epoch": 0.8276618208397628, + "grad_norm": 4.877120364803925, + "learning_rate": 3.7946168124983776e-07, + "loss": 0.3768, + "step": 10191 + }, + { + "epoch": 0.8277430358158044, + "grad_norm": 9.495267916803392, + "learning_rate": 3.791134440185201e-07, + "loss": 0.5635, + "step": 10192 + }, + { + "epoch": 0.827824250791846, + "grad_norm": 5.733855636295819, + "learning_rate": 3.787653535413277e-07, + "loss": 0.4829, + "step": 10193 + }, + { + "epoch": 0.8279054657678876, + "grad_norm": 5.873682615843861, + "learning_rate": 3.784174098423465e-07, + "loss": 0.5228, + "step": 10194 + }, + { + "epoch": 0.8279866807439292, + "grad_norm": 4.833822246575063, + "learning_rate": 3.780696129456521e-07, + "loss": 0.5421, + "step": 10195 + }, + { + "epoch": 0.8280678957199707, + "grad_norm": 5.208263826542225, + "learning_rate": 3.7772196287531066e-07, + "loss": 0.5473, + "step": 10196 + }, + { + "epoch": 0.8281491106960124, + "grad_norm": 8.378333393225612, + "learning_rate": 3.773744596553774e-07, + "loss": 0.4702, + "step": 10197 + }, + { + "epoch": 0.8282303256720539, + "grad_norm": 3.4663578157064583, + "learning_rate": 3.7702710330989765e-07, + "loss": 0.5652, + "step": 10198 + }, + { + "epoch": 0.8283115406480955, + "grad_norm": 6.827267196321197, + "learning_rate": 3.766798938629063e-07, + "loss": 0.4027, + "step": 10199 + }, + { + "epoch": 0.8283927556241371, + "grad_norm": 9.444190295092335, + "learning_rate": 3.7633283133842845e-07, + "loss": 0.5462, + "step": 10200 + }, + { + "epoch": 0.8284739706001787, + "grad_norm": 6.929524668677438, + "learning_rate": 3.7598591576048e-07, + "loss": 0.5488, + "step": 10201 + }, + { + "epoch": 0.8285551855762202, + "grad_norm": 5.6038151431290935, + "learning_rate": 3.756391471530646e-07, + "loss": 0.556, + "step": 10202 + }, + { + "epoch": 0.8286364005522618, + "grad_norm": 4.185345966650429, + "learning_rate": 3.7529252554017765e-07, + "loss": 0.5079, + "step": 10203 + }, + { + "epoch": 0.8287176155283034, + "grad_norm": 6.844019960274571, + "learning_rate": 3.7494605094580305e-07, + "loss": 0.427, + "step": 10204 + }, + { + "epoch": 0.828798830504345, + "grad_norm": 5.36417941344362, + "learning_rate": 3.7459972339391445e-07, + "loss": 0.797, + "step": 10205 + }, + { + "epoch": 0.8288800454803866, + "grad_norm": 5.328404842131845, + "learning_rate": 3.742535429084765e-07, + "loss": 0.3947, + "step": 10206 + }, + { + "epoch": 0.8289612604564282, + "grad_norm": 5.223058995984122, + "learning_rate": 3.739075095134437e-07, + "loss": 0.4567, + "step": 10207 + }, + { + "epoch": 0.8290424754324698, + "grad_norm": 4.706033163133879, + "learning_rate": 3.735616232327582e-07, + "loss": 0.4547, + "step": 10208 + }, + { + "epoch": 0.8291236904085113, + "grad_norm": 5.7396745415353845, + "learning_rate": 3.732158840903552e-07, + "loss": 0.5567, + "step": 10209 + }, + { + "epoch": 0.8292049053845529, + "grad_norm": 4.958530850319359, + "learning_rate": 3.728702921101571e-07, + "loss": 0.5436, + "step": 10210 + }, + { + "epoch": 0.8292861203605945, + "grad_norm": 4.861206028038226, + "learning_rate": 3.725248473160764e-07, + "loss": 0.5996, + "step": 10211 + }, + { + "epoch": 0.8293673353366361, + "grad_norm": 4.682688015412765, + "learning_rate": 3.721795497320174e-07, + "loss": 0.5646, + "step": 10212 + }, + { + "epoch": 0.8294485503126776, + "grad_norm": 5.262659812480152, + "learning_rate": 3.718343993818718e-07, + "loss": 0.5358, + "step": 10213 + }, + { + "epoch": 0.8295297652887192, + "grad_norm": 9.851823169517866, + "learning_rate": 3.7148939628952246e-07, + "loss": 0.4423, + "step": 10214 + }, + { + "epoch": 0.8296109802647608, + "grad_norm": 5.776223337431074, + "learning_rate": 3.7114454047884247e-07, + "loss": 0.3799, + "step": 10215 + }, + { + "epoch": 0.8296921952408024, + "grad_norm": 8.26924554342392, + "learning_rate": 3.707998319736936e-07, + "loss": 0.4953, + "step": 10216 + }, + { + "epoch": 0.829773410216844, + "grad_norm": 5.982564098065872, + "learning_rate": 3.7045527079792753e-07, + "loss": 0.593, + "step": 10217 + }, + { + "epoch": 0.8298546251928856, + "grad_norm": 5.7895130886280635, + "learning_rate": 3.7011085697538587e-07, + "loss": 0.4198, + "step": 10218 + }, + { + "epoch": 0.8299358401689272, + "grad_norm": 4.460178263466723, + "learning_rate": 3.6976659052990056e-07, + "loss": 0.6089, + "step": 10219 + }, + { + "epoch": 0.8300170551449687, + "grad_norm": 5.700291972867315, + "learning_rate": 3.694224714852937e-07, + "loss": 0.405, + "step": 10220 + }, + { + "epoch": 0.8300982701210103, + "grad_norm": 7.327954495107345, + "learning_rate": 3.6907849986537516e-07, + "loss": 0.4625, + "step": 10221 + }, + { + "epoch": 0.8301794850970519, + "grad_norm": 5.605599034700318, + "learning_rate": 3.687346756939475e-07, + "loss": 0.4958, + "step": 10222 + }, + { + "epoch": 0.8302607000730935, + "grad_norm": 4.754692475561762, + "learning_rate": 3.6839099899480033e-07, + "loss": 0.3742, + "step": 10223 + }, + { + "epoch": 0.830341915049135, + "grad_norm": 3.9081424392387327, + "learning_rate": 3.680474697917144e-07, + "loss": 0.6079, + "step": 10224 + }, + { + "epoch": 0.8304231300251766, + "grad_norm": 4.983983156364833, + "learning_rate": 3.677040881084609e-07, + "loss": 0.4596, + "step": 10225 + }, + { + "epoch": 0.8305043450012182, + "grad_norm": 6.779953311673761, + "learning_rate": 3.6736085396879896e-07, + "loss": 0.368, + "step": 10226 + }, + { + "epoch": 0.8305855599772598, + "grad_norm": 3.867084328558502, + "learning_rate": 3.6701776739647893e-07, + "loss": 0.5043, + "step": 10227 + }, + { + "epoch": 0.8306667749533014, + "grad_norm": 5.763370281929799, + "learning_rate": 3.666748284152413e-07, + "loss": 0.6709, + "step": 10228 + }, + { + "epoch": 0.830747989929343, + "grad_norm": 5.039857359159814, + "learning_rate": 3.663320370488152e-07, + "loss": 0.4427, + "step": 10229 + }, + { + "epoch": 0.8308292049053846, + "grad_norm": 3.9366691516556678, + "learning_rate": 3.659893933209191e-07, + "loss": 0.5353, + "step": 10230 + }, + { + "epoch": 0.8309104198814261, + "grad_norm": 16.823744196747114, + "learning_rate": 3.6564689725526377e-07, + "loss": 0.455, + "step": 10231 + }, + { + "epoch": 0.8309916348574677, + "grad_norm": 3.5430059865653996, + "learning_rate": 3.6530454887554636e-07, + "loss": 0.6269, + "step": 10232 + }, + { + "epoch": 0.8310728498335093, + "grad_norm": 4.922946722027889, + "learning_rate": 3.649623482054565e-07, + "loss": 0.4448, + "step": 10233 + }, + { + "epoch": 0.8311540648095509, + "grad_norm": 5.418331879921831, + "learning_rate": 3.6462029526867335e-07, + "loss": 0.3769, + "step": 10234 + }, + { + "epoch": 0.8312352797855924, + "grad_norm": 6.912181302002352, + "learning_rate": 3.642783900888644e-07, + "loss": 0.4543, + "step": 10235 + }, + { + "epoch": 0.831316494761634, + "grad_norm": 7.405882641111188, + "learning_rate": 3.639366326896876e-07, + "loss": 0.4988, + "step": 10236 + }, + { + "epoch": 0.8313977097376756, + "grad_norm": 6.548258048190026, + "learning_rate": 3.635950230947902e-07, + "loss": 0.4483, + "step": 10237 + }, + { + "epoch": 0.8314789247137172, + "grad_norm": 7.4684554933443925, + "learning_rate": 3.632535613278107e-07, + "loss": 0.4406, + "step": 10238 + }, + { + "epoch": 0.8315601396897588, + "grad_norm": 3.8512394389587703, + "learning_rate": 3.629122474123767e-07, + "loss": 0.4396, + "step": 10239 + }, + { + "epoch": 0.8316413546658004, + "grad_norm": 6.464656562967151, + "learning_rate": 3.6257108137210396e-07, + "loss": 0.37, + "step": 10240 + }, + { + "epoch": 0.831722569641842, + "grad_norm": 5.940372384619711, + "learning_rate": 3.622300632306011e-07, + "loss": 0.3829, + "step": 10241 + }, + { + "epoch": 0.8318037846178835, + "grad_norm": 7.830539757818605, + "learning_rate": 3.6188919301146375e-07, + "loss": 0.4114, + "step": 10242 + }, + { + "epoch": 0.8318849995939251, + "grad_norm": 4.839330335943199, + "learning_rate": 3.615484707382777e-07, + "loss": 0.5915, + "step": 10243 + }, + { + "epoch": 0.8319662145699667, + "grad_norm": 3.688765962182601, + "learning_rate": 3.6120789643462053e-07, + "loss": 0.5409, + "step": 10244 + }, + { + "epoch": 0.8320474295460083, + "grad_norm": 6.537020216802781, + "learning_rate": 3.608674701240572e-07, + "loss": 0.3567, + "step": 10245 + }, + { + "epoch": 0.8321286445220498, + "grad_norm": 4.589211449298252, + "learning_rate": 3.605271918301434e-07, + "loss": 0.5638, + "step": 10246 + }, + { + "epoch": 0.8322098594980915, + "grad_norm": 6.29504528521005, + "learning_rate": 3.601870615764258e-07, + "loss": 0.5068, + "step": 10247 + }, + { + "epoch": 0.832291074474133, + "grad_norm": 4.262119801386439, + "learning_rate": 3.5984707938643864e-07, + "loss": 0.4094, + "step": 10248 + }, + { + "epoch": 0.8323722894501746, + "grad_norm": 4.841197784154483, + "learning_rate": 3.5950724528370615e-07, + "loss": 0.4699, + "step": 10249 + }, + { + "epoch": 0.8324535044262162, + "grad_norm": 5.42937376505632, + "learning_rate": 3.591675592917449e-07, + "loss": 0.4166, + "step": 10250 + }, + { + "epoch": 0.8325347194022578, + "grad_norm": 6.296856237155061, + "learning_rate": 3.5882802143405755e-07, + "loss": 0.449, + "step": 10251 + }, + { + "epoch": 0.8326159343782994, + "grad_norm": 7.636038816244131, + "learning_rate": 3.584886317341396e-07, + "loss": 0.4694, + "step": 10252 + }, + { + "epoch": 0.8326971493543409, + "grad_norm": 21.61523367073273, + "learning_rate": 3.58149390215474e-07, + "loss": 0.5688, + "step": 10253 + }, + { + "epoch": 0.8327783643303825, + "grad_norm": 5.877406518099175, + "learning_rate": 3.5781029690153567e-07, + "loss": 0.5595, + "step": 10254 + }, + { + "epoch": 0.8328595793064241, + "grad_norm": 4.889479170900449, + "learning_rate": 3.574713518157874e-07, + "loss": 0.5255, + "step": 10255 + }, + { + "epoch": 0.8329407942824657, + "grad_norm": 6.633095888789909, + "learning_rate": 3.571325549816818e-07, + "loss": 0.3982, + "step": 10256 + }, + { + "epoch": 0.8330220092585072, + "grad_norm": 4.06737934876213, + "learning_rate": 3.56793906422663e-07, + "loss": 0.5486, + "step": 10257 + }, + { + "epoch": 0.8331032242345489, + "grad_norm": 4.0549895036371, + "learning_rate": 3.564554061621625e-07, + "loss": 0.4597, + "step": 10258 + }, + { + "epoch": 0.8331844392105904, + "grad_norm": 8.500590717560147, + "learning_rate": 3.5611705422360335e-07, + "loss": 0.491, + "step": 10259 + }, + { + "epoch": 0.833265654186632, + "grad_norm": 5.841348510557204, + "learning_rate": 3.557788506303986e-07, + "loss": 0.3571, + "step": 10260 + }, + { + "epoch": 0.8333468691626736, + "grad_norm": 8.428998076590059, + "learning_rate": 3.5544079540594884e-07, + "loss": 0.5252, + "step": 10261 + }, + { + "epoch": 0.8334280841387152, + "grad_norm": 6.0161944380779255, + "learning_rate": 3.551028885736457e-07, + "loss": 0.5061, + "step": 10262 + }, + { + "epoch": 0.8335092991147568, + "grad_norm": 26.06934398233002, + "learning_rate": 3.5476513015687136e-07, + "loss": 0.4221, + "step": 10263 + }, + { + "epoch": 0.8335905140907983, + "grad_norm": 6.725770937675891, + "learning_rate": 3.5442752017899625e-07, + "loss": 0.5557, + "step": 10264 + }, + { + "epoch": 0.83367172906684, + "grad_norm": 5.63216646803754, + "learning_rate": 3.5409005866338134e-07, + "loss": 0.4044, + "step": 10265 + }, + { + "epoch": 0.8337529440428815, + "grad_norm": 5.540976364008194, + "learning_rate": 3.537527456333778e-07, + "loss": 0.4748, + "step": 10266 + }, + { + "epoch": 0.8338341590189231, + "grad_norm": 4.118260128090111, + "learning_rate": 3.5341558111232547e-07, + "loss": 0.5088, + "step": 10267 + }, + { + "epoch": 0.8339153739949646, + "grad_norm": 4.454821659286079, + "learning_rate": 3.5307856512355354e-07, + "loss": 0.408, + "step": 10268 + }, + { + "epoch": 0.8339965889710063, + "grad_norm": 7.906091232376149, + "learning_rate": 3.527416976903833e-07, + "loss": 0.4369, + "step": 10269 + }, + { + "epoch": 0.8340778039470479, + "grad_norm": 4.946330213556263, + "learning_rate": 3.5240497883612333e-07, + "loss": 0.5383, + "step": 10270 + }, + { + "epoch": 0.8341590189230894, + "grad_norm": 3.921389705656694, + "learning_rate": 3.5206840858407225e-07, + "loss": 0.5159, + "step": 10271 + }, + { + "epoch": 0.834240233899131, + "grad_norm": 9.466775431263171, + "learning_rate": 3.517319869575195e-07, + "loss": 0.3524, + "step": 10272 + }, + { + "epoch": 0.8343214488751726, + "grad_norm": 7.781642113894127, + "learning_rate": 3.5139571397974416e-07, + "loss": 0.5091, + "step": 10273 + }, + { + "epoch": 0.8344026638512142, + "grad_norm": 5.031639260527779, + "learning_rate": 3.5105958967401404e-07, + "loss": 0.3777, + "step": 10274 + }, + { + "epoch": 0.8344838788272557, + "grad_norm": 4.936172284543063, + "learning_rate": 3.5072361406358696e-07, + "loss": 0.5142, + "step": 10275 + }, + { + "epoch": 0.8345650938032974, + "grad_norm": 5.62972700250256, + "learning_rate": 3.5038778717171123e-07, + "loss": 0.5169, + "step": 10276 + }, + { + "epoch": 0.8346463087793389, + "grad_norm": 4.876718013101768, + "learning_rate": 3.500521090216233e-07, + "loss": 0.5267, + "step": 10277 + }, + { + "epoch": 0.8347275237553805, + "grad_norm": 5.004111684696393, + "learning_rate": 3.497165796365512e-07, + "loss": 0.6079, + "step": 10278 + }, + { + "epoch": 0.834808738731422, + "grad_norm": 5.227016363738208, + "learning_rate": 3.4938119903971195e-07, + "loss": 0.3386, + "step": 10279 + }, + { + "epoch": 0.8348899537074637, + "grad_norm": 4.70841750512664, + "learning_rate": 3.49045967254312e-07, + "loss": 0.4524, + "step": 10280 + }, + { + "epoch": 0.8349711686835053, + "grad_norm": 4.398665081558391, + "learning_rate": 3.487108843035467e-07, + "loss": 0.4655, + "step": 10281 + }, + { + "epoch": 0.8350523836595468, + "grad_norm": 4.351497987088346, + "learning_rate": 3.4837595021060296e-07, + "loss": 0.4796, + "step": 10282 + }, + { + "epoch": 0.8351335986355884, + "grad_norm": 4.803405743264782, + "learning_rate": 3.480411649986565e-07, + "loss": 0.3933, + "step": 10283 + }, + { + "epoch": 0.83521481361163, + "grad_norm": 4.927789971086148, + "learning_rate": 3.477065286908715e-07, + "loss": 0.407, + "step": 10284 + }, + { + "epoch": 0.8352960285876716, + "grad_norm": 9.580044334289088, + "learning_rate": 3.4737204131040397e-07, + "loss": 0.5534, + "step": 10285 + }, + { + "epoch": 0.8353772435637131, + "grad_norm": 7.407853324575902, + "learning_rate": 3.470377028803992e-07, + "loss": 0.4651, + "step": 10286 + }, + { + "epoch": 0.8354584585397548, + "grad_norm": 4.484117272427251, + "learning_rate": 3.46703513423991e-07, + "loss": 0.5232, + "step": 10287 + }, + { + "epoch": 0.8355396735157963, + "grad_norm": 4.330327720269298, + "learning_rate": 3.4636947296430274e-07, + "loss": 0.4451, + "step": 10288 + }, + { + "epoch": 0.8356208884918379, + "grad_norm": 5.965597862522399, + "learning_rate": 3.460355815244498e-07, + "loss": 0.5906, + "step": 10289 + }, + { + "epoch": 0.8357021034678794, + "grad_norm": 8.22500497243567, + "learning_rate": 3.457018391275341e-07, + "loss": 0.3985, + "step": 10290 + }, + { + "epoch": 0.8357833184439211, + "grad_norm": 5.544151542569365, + "learning_rate": 3.4536824579665007e-07, + "loss": 0.4722, + "step": 10291 + }, + { + "epoch": 0.8358645334199627, + "grad_norm": 5.097926053416765, + "learning_rate": 3.4503480155488044e-07, + "loss": 0.5343, + "step": 10292 + }, + { + "epoch": 0.8359457483960042, + "grad_norm": 11.549757808229108, + "learning_rate": 3.447015064252976e-07, + "loss": 0.3995, + "step": 10293 + }, + { + "epoch": 0.8360269633720459, + "grad_norm": 4.072804987437322, + "learning_rate": 3.443683604309633e-07, + "loss": 0.4749, + "step": 10294 + }, + { + "epoch": 0.8361081783480874, + "grad_norm": 5.09382553179545, + "learning_rate": 3.4403536359493034e-07, + "loss": 0.4529, + "step": 10295 + }, + { + "epoch": 0.836189393324129, + "grad_norm": 4.720065781209744, + "learning_rate": 3.437025159402399e-07, + "loss": 0.58, + "step": 10296 + }, + { + "epoch": 0.8362706083001705, + "grad_norm": 4.694987719383441, + "learning_rate": 3.43369817489923e-07, + "loss": 0.4097, + "step": 10297 + }, + { + "epoch": 0.8363518232762122, + "grad_norm": 5.4566621726960145, + "learning_rate": 3.430372682670008e-07, + "loss": 0.4231, + "step": 10298 + }, + { + "epoch": 0.8364330382522537, + "grad_norm": 8.027749216651072, + "learning_rate": 3.4270486829448476e-07, + "loss": 0.4286, + "step": 10299 + }, + { + "epoch": 0.8365142532282953, + "grad_norm": 5.684585620616569, + "learning_rate": 3.423726175953737e-07, + "loss": 0.4205, + "step": 10300 + }, + { + "epoch": 0.8365954682043368, + "grad_norm": 3.9316108403026178, + "learning_rate": 3.4204051619265905e-07, + "loss": 0.4315, + "step": 10301 + }, + { + "epoch": 0.8366766831803785, + "grad_norm": 5.5134529395223915, + "learning_rate": 3.4170856410931986e-07, + "loss": 0.3852, + "step": 10302 + }, + { + "epoch": 0.8367578981564201, + "grad_norm": 6.0566636394615285, + "learning_rate": 3.41376761368325e-07, + "loss": 0.544, + "step": 10303 + }, + { + "epoch": 0.8368391131324616, + "grad_norm": 5.68024482168099, + "learning_rate": 3.4104510799263356e-07, + "loss": 0.4424, + "step": 10304 + }, + { + "epoch": 0.8369203281085033, + "grad_norm": 8.931124333462172, + "learning_rate": 3.407136040051953e-07, + "loss": 0.4358, + "step": 10305 + }, + { + "epoch": 0.8370015430845448, + "grad_norm": 6.202340802104501, + "learning_rate": 3.40382249428948e-07, + "loss": 0.6703, + "step": 10306 + }, + { + "epoch": 0.8370827580605864, + "grad_norm": 5.36293568403821, + "learning_rate": 3.400510442868185e-07, + "loss": 0.5796, + "step": 10307 + }, + { + "epoch": 0.8371639730366279, + "grad_norm": 5.206542300226407, + "learning_rate": 3.3971998860172605e-07, + "loss": 0.5902, + "step": 10308 + }, + { + "epoch": 0.8372451880126696, + "grad_norm": 3.560373786193406, + "learning_rate": 3.393890823965768e-07, + "loss": 0.4523, + "step": 10309 + }, + { + "epoch": 0.8373264029887111, + "grad_norm": 5.239048746596322, + "learning_rate": 3.390583256942681e-07, + "loss": 0.5946, + "step": 10310 + }, + { + "epoch": 0.8374076179647527, + "grad_norm": 4.838192804018576, + "learning_rate": 3.3872771851768737e-07, + "loss": 0.4644, + "step": 10311 + }, + { + "epoch": 0.8374888329407942, + "grad_norm": 7.90289534921277, + "learning_rate": 3.383972608897099e-07, + "loss": 0.4878, + "step": 10312 + }, + { + "epoch": 0.8375700479168359, + "grad_norm": 3.8504253330468092, + "learning_rate": 3.3806695283320145e-07, + "loss": 0.3547, + "step": 10313 + }, + { + "epoch": 0.8376512628928775, + "grad_norm": 11.90866871558082, + "learning_rate": 3.377367943710183e-07, + "loss": 0.4281, + "step": 10314 + }, + { + "epoch": 0.837732477868919, + "grad_norm": 4.797876474078643, + "learning_rate": 3.374067855260055e-07, + "loss": 0.4421, + "step": 10315 + }, + { + "epoch": 0.8378136928449607, + "grad_norm": 4.661745082017848, + "learning_rate": 3.370769263209975e-07, + "loss": 0.3877, + "step": 10316 + }, + { + "epoch": 0.8378949078210022, + "grad_norm": 4.545258248925443, + "learning_rate": 3.3674721677881853e-07, + "loss": 0.5798, + "step": 10317 + }, + { + "epoch": 0.8379761227970438, + "grad_norm": 6.305655433495236, + "learning_rate": 3.364176569222843e-07, + "loss": 0.5197, + "step": 10318 + }, + { + "epoch": 0.8380573377730853, + "grad_norm": 5.2525205813121945, + "learning_rate": 3.360882467741969e-07, + "loss": 0.4127, + "step": 10319 + }, + { + "epoch": 0.838138552749127, + "grad_norm": 4.370136047848057, + "learning_rate": 3.35758986357351e-07, + "loss": 0.6077, + "step": 10320 + }, + { + "epoch": 0.8382197677251685, + "grad_norm": 10.028481598001118, + "learning_rate": 3.354298756945293e-07, + "loss": 0.3249, + "step": 10321 + }, + { + "epoch": 0.8383009827012101, + "grad_norm": 4.972555184812446, + "learning_rate": 3.351009148085038e-07, + "loss": 0.5867, + "step": 10322 + }, + { + "epoch": 0.8383821976772516, + "grad_norm": 3.9492347620594224, + "learning_rate": 3.347721037220372e-07, + "loss": 0.4575, + "step": 10323 + }, + { + "epoch": 0.8384634126532933, + "grad_norm": 24.137740084826742, + "learning_rate": 3.344434424578824e-07, + "loss": 0.5544, + "step": 10324 + }, + { + "epoch": 0.8385446276293349, + "grad_norm": 7.000852410859473, + "learning_rate": 3.3411493103878036e-07, + "loss": 0.4401, + "step": 10325 + }, + { + "epoch": 0.8386258426053764, + "grad_norm": 6.577988762799303, + "learning_rate": 3.3378656948746176e-07, + "loss": 0.4975, + "step": 10326 + }, + { + "epoch": 0.8387070575814181, + "grad_norm": 5.455565928416991, + "learning_rate": 3.334583578266487e-07, + "loss": 0.489, + "step": 10327 + }, + { + "epoch": 0.8387882725574596, + "grad_norm": 4.806945898095566, + "learning_rate": 3.3313029607905087e-07, + "loss": 0.394, + "step": 10328 + }, + { + "epoch": 0.8388694875335012, + "grad_norm": 5.601127834993915, + "learning_rate": 3.328023842673678e-07, + "loss": 0.5171, + "step": 10329 + }, + { + "epoch": 0.8389507025095427, + "grad_norm": 8.23233143842366, + "learning_rate": 3.324746224142902e-07, + "loss": 0.3461, + "step": 10330 + }, + { + "epoch": 0.8390319174855844, + "grad_norm": 5.534926193999312, + "learning_rate": 3.321470105424979e-07, + "loss": 0.4056, + "step": 10331 + }, + { + "epoch": 0.8391131324616259, + "grad_norm": 4.70029893588722, + "learning_rate": 3.3181954867465864e-07, + "loss": 0.5831, + "step": 10332 + }, + { + "epoch": 0.8391943474376675, + "grad_norm": 4.310642438062851, + "learning_rate": 3.314922368334322e-07, + "loss": 0.4913, + "step": 10333 + }, + { + "epoch": 0.839275562413709, + "grad_norm": 3.3683032390188012, + "learning_rate": 3.3116507504146633e-07, + "loss": 0.5738, + "step": 10334 + }, + { + "epoch": 0.8393567773897507, + "grad_norm": 4.726658134420679, + "learning_rate": 3.3083806332139837e-07, + "loss": 0.3906, + "step": 10335 + }, + { + "epoch": 0.8394379923657923, + "grad_norm": 4.793027997624646, + "learning_rate": 3.305112016958562e-07, + "loss": 0.3441, + "step": 10336 + }, + { + "epoch": 0.8395192073418338, + "grad_norm": 15.53691157648671, + "learning_rate": 3.3018449018745765e-07, + "loss": 0.4705, + "step": 10337 + }, + { + "epoch": 0.8396004223178755, + "grad_norm": 5.292837088334354, + "learning_rate": 3.298579288188081e-07, + "loss": 0.5973, + "step": 10338 + }, + { + "epoch": 0.839681637293917, + "grad_norm": 7.13356115130351, + "learning_rate": 3.2953151761250526e-07, + "loss": 0.4081, + "step": 10339 + }, + { + "epoch": 0.8397628522699586, + "grad_norm": 4.657968605497615, + "learning_rate": 3.292052565911344e-07, + "loss": 0.3476, + "step": 10340 + }, + { + "epoch": 0.8398440672460001, + "grad_norm": 5.343433901408026, + "learning_rate": 3.288791457772708e-07, + "loss": 0.4242, + "step": 10341 + }, + { + "epoch": 0.8399252822220418, + "grad_norm": 7.943551058264865, + "learning_rate": 3.2855318519347924e-07, + "loss": 0.3954, + "step": 10342 + }, + { + "epoch": 0.8400064971980833, + "grad_norm": 4.253480461006163, + "learning_rate": 3.282273748623152e-07, + "loss": 0.6146, + "step": 10343 + }, + { + "epoch": 0.8400877121741249, + "grad_norm": 3.649658017480646, + "learning_rate": 3.279017148063235e-07, + "loss": 0.4316, + "step": 10344 + }, + { + "epoch": 0.8401689271501664, + "grad_norm": 6.247382983100074, + "learning_rate": 3.275762050480369e-07, + "loss": 0.6303, + "step": 10345 + }, + { + "epoch": 0.8402501421262081, + "grad_norm": 5.081757914347488, + "learning_rate": 3.272508456099799e-07, + "loss": 0.3951, + "step": 10346 + }, + { + "epoch": 0.8403313571022497, + "grad_norm": 4.2849275181266515, + "learning_rate": 3.269256365146653e-07, + "loss": 0.4885, + "step": 10347 + }, + { + "epoch": 0.8404125720782912, + "grad_norm": 5.408392743532504, + "learning_rate": 3.2660057778459513e-07, + "loss": 0.5593, + "step": 10348 + }, + { + "epoch": 0.8404937870543329, + "grad_norm": 16.566283426785315, + "learning_rate": 3.262756694422628e-07, + "loss": 0.4464, + "step": 10349 + }, + { + "epoch": 0.8405750020303744, + "grad_norm": 9.235201670008626, + "learning_rate": 3.2595091151015e-07, + "loss": 0.4379, + "step": 10350 + }, + { + "epoch": 0.840656217006416, + "grad_norm": 3.706535071305654, + "learning_rate": 3.2562630401072796e-07, + "loss": 0.3997, + "step": 10351 + }, + { + "epoch": 0.8407374319824575, + "grad_norm": 6.291131527025278, + "learning_rate": 3.2530184696645846e-07, + "loss": 0.3929, + "step": 10352 + }, + { + "epoch": 0.8408186469584992, + "grad_norm": 6.059066502473035, + "learning_rate": 3.249775403997915e-07, + "loss": 0.5271, + "step": 10353 + }, + { + "epoch": 0.8408998619345407, + "grad_norm": 5.828503261223924, + "learning_rate": 3.24653384333167e-07, + "loss": 0.4112, + "step": 10354 + }, + { + "epoch": 0.8409810769105823, + "grad_norm": 5.607819748297354, + "learning_rate": 3.243293787890162e-07, + "loss": 0.5535, + "step": 10355 + }, + { + "epoch": 0.8410622918866238, + "grad_norm": 4.947445738957033, + "learning_rate": 3.2400552378975744e-07, + "loss": 0.5297, + "step": 10356 + }, + { + "epoch": 0.8411435068626655, + "grad_norm": 12.539916176915135, + "learning_rate": 3.236818193577998e-07, + "loss": 0.5609, + "step": 10357 + }, + { + "epoch": 0.8412247218387071, + "grad_norm": 7.269085157428185, + "learning_rate": 3.233582655155429e-07, + "loss": 0.4367, + "step": 10358 + }, + { + "epoch": 0.8413059368147486, + "grad_norm": 7.3313861075084725, + "learning_rate": 3.2303486228537436e-07, + "loss": 0.5678, + "step": 10359 + }, + { + "epoch": 0.8413871517907903, + "grad_norm": 6.798734073899363, + "learning_rate": 3.227116096896718e-07, + "loss": 0.4094, + "step": 10360 + }, + { + "epoch": 0.8414683667668318, + "grad_norm": 5.446235513662214, + "learning_rate": 3.223885077508024e-07, + "loss": 0.5598, + "step": 10361 + }, + { + "epoch": 0.8415495817428734, + "grad_norm": 5.464432211318365, + "learning_rate": 3.220655564911232e-07, + "loss": 0.5195, + "step": 10362 + }, + { + "epoch": 0.8416307967189149, + "grad_norm": 3.4734065266272767, + "learning_rate": 3.217427559329814e-07, + "loss": 0.4573, + "step": 10363 + }, + { + "epoch": 0.8417120116949566, + "grad_norm": 4.612719279186468, + "learning_rate": 3.2142010609871236e-07, + "loss": 0.4274, + "step": 10364 + }, + { + "epoch": 0.8417932266709981, + "grad_norm": 5.548612820561558, + "learning_rate": 3.2109760701064227e-07, + "loss": 0.4346, + "step": 10365 + }, + { + "epoch": 0.8418744416470397, + "grad_norm": 7.436228501727278, + "learning_rate": 3.207752586910862e-07, + "loss": 0.4898, + "step": 10366 + }, + { + "epoch": 0.8419556566230812, + "grad_norm": 7.589666228758146, + "learning_rate": 3.2045306116234824e-07, + "loss": 0.5607, + "step": 10367 + }, + { + "epoch": 0.8420368715991229, + "grad_norm": 5.181080777503049, + "learning_rate": 3.2013101444672345e-07, + "loss": 0.4397, + "step": 10368 + }, + { + "epoch": 0.8421180865751645, + "grad_norm": 7.40645836332802, + "learning_rate": 3.198091185664964e-07, + "loss": 0.5543, + "step": 10369 + }, + { + "epoch": 0.842199301551206, + "grad_norm": 4.3886565655110035, + "learning_rate": 3.194873735439391e-07, + "loss": 0.394, + "step": 10370 + }, + { + "epoch": 0.8422805165272477, + "grad_norm": 5.3852633366108655, + "learning_rate": 3.1916577940131585e-07, + "loss": 0.6436, + "step": 10371 + }, + { + "epoch": 0.8423617315032892, + "grad_norm": 4.336066931433411, + "learning_rate": 3.188443361608787e-07, + "loss": 0.5485, + "step": 10372 + }, + { + "epoch": 0.8424429464793308, + "grad_norm": 7.090559367854015, + "learning_rate": 3.185230438448694e-07, + "loss": 0.5496, + "step": 10373 + }, + { + "epoch": 0.8425241614553723, + "grad_norm": 5.001322891880096, + "learning_rate": 3.182019024755209e-07, + "loss": 0.5178, + "step": 10374 + }, + { + "epoch": 0.842605376431414, + "grad_norm": 8.355308016625518, + "learning_rate": 3.1788091207505285e-07, + "loss": 0.5462, + "step": 10375 + }, + { + "epoch": 0.8426865914074555, + "grad_norm": 5.941932747298044, + "learning_rate": 3.175600726656772e-07, + "loss": 0.3143, + "step": 10376 + }, + { + "epoch": 0.8427678063834971, + "grad_norm": 13.808992644334168, + "learning_rate": 3.172393842695948e-07, + "loss": 0.5841, + "step": 10377 + }, + { + "epoch": 0.8428490213595387, + "grad_norm": 4.737960132276757, + "learning_rate": 3.169188469089945e-07, + "loss": 0.6052, + "step": 10378 + }, + { + "epoch": 0.8429302363355803, + "grad_norm": 6.490398809975363, + "learning_rate": 3.165984606060565e-07, + "loss": 0.4312, + "step": 10379 + }, + { + "epoch": 0.8430114513116219, + "grad_norm": 6.469336726754756, + "learning_rate": 3.1627822538294883e-07, + "loss": 0.381, + "step": 10380 + }, + { + "epoch": 0.8430926662876634, + "grad_norm": 4.644733440027643, + "learning_rate": 3.159581412618309e-07, + "loss": 0.488, + "step": 10381 + }, + { + "epoch": 0.8431738812637051, + "grad_norm": 5.02222613398405, + "learning_rate": 3.1563820826485127e-07, + "loss": 0.4021, + "step": 10382 + }, + { + "epoch": 0.8432550962397466, + "grad_norm": 5.6081490476021285, + "learning_rate": 3.153184264141465e-07, + "loss": 0.4034, + "step": 10383 + }, + { + "epoch": 0.8433363112157882, + "grad_norm": 12.62989950175683, + "learning_rate": 3.1499879573184486e-07, + "loss": 0.4546, + "step": 10384 + }, + { + "epoch": 0.8434175261918297, + "grad_norm": 4.275321566511599, + "learning_rate": 3.146793162400627e-07, + "loss": 0.3961, + "step": 10385 + }, + { + "epoch": 0.8434987411678714, + "grad_norm": 5.910594667196357, + "learning_rate": 3.143599879609055e-07, + "loss": 0.4436, + "step": 10386 + }, + { + "epoch": 0.8435799561439129, + "grad_norm": 4.218477387154232, + "learning_rate": 3.1404081091647027e-07, + "loss": 0.5959, + "step": 10387 + }, + { + "epoch": 0.8436611711199545, + "grad_norm": 7.34082449004889, + "learning_rate": 3.1372178512884154e-07, + "loss": 0.6363, + "step": 10388 + }, + { + "epoch": 0.843742386095996, + "grad_norm": 5.370383548087424, + "learning_rate": 3.1340291062009446e-07, + "loss": 0.5443, + "step": 10389 + }, + { + "epoch": 0.8438236010720377, + "grad_norm": 3.265960200811652, + "learning_rate": 3.130841874122942e-07, + "loss": 0.4581, + "step": 10390 + }, + { + "epoch": 0.8439048160480793, + "grad_norm": 8.155289410608034, + "learning_rate": 3.1276561552749415e-07, + "loss": 0.5283, + "step": 10391 + }, + { + "epoch": 0.8439860310241208, + "grad_norm": 4.910342387320026, + "learning_rate": 3.1244719498773693e-07, + "loss": 0.4737, + "step": 10392 + }, + { + "epoch": 0.8440672460001625, + "grad_norm": 7.838611981148996, + "learning_rate": 3.1212892581505697e-07, + "loss": 0.5164, + "step": 10393 + }, + { + "epoch": 0.844148460976204, + "grad_norm": 3.137926778602015, + "learning_rate": 3.118108080314758e-07, + "loss": 0.6133, + "step": 10394 + }, + { + "epoch": 0.8442296759522456, + "grad_norm": 6.8077151459524, + "learning_rate": 3.1149284165900627e-07, + "loss": 0.3985, + "step": 10395 + }, + { + "epoch": 0.8443108909282871, + "grad_norm": 4.575748337907764, + "learning_rate": 3.111750267196492e-07, + "loss": 0.6132, + "step": 10396 + }, + { + "epoch": 0.8443921059043288, + "grad_norm": 3.4885659741848833, + "learning_rate": 3.1085736323539647e-07, + "loss": 0.584, + "step": 10397 + }, + { + "epoch": 0.8444733208803703, + "grad_norm": 5.967423076508702, + "learning_rate": 3.1053985122822844e-07, + "loss": 0.6819, + "step": 10398 + }, + { + "epoch": 0.8445545358564119, + "grad_norm": 4.9263444027598124, + "learning_rate": 3.1022249072011455e-07, + "loss": 0.5207, + "step": 10399 + }, + { + "epoch": 0.8446357508324535, + "grad_norm": 7.840385643435142, + "learning_rate": 3.0990528173301557e-07, + "loss": 0.4711, + "step": 10400 + }, + { + "epoch": 0.8447169658084951, + "grad_norm": 6.013199743602192, + "learning_rate": 3.095882242888795e-07, + "loss": 0.4817, + "step": 10401 + }, + { + "epoch": 0.8447981807845367, + "grad_norm": 6.574555556543331, + "learning_rate": 3.09271318409646e-07, + "loss": 0.4581, + "step": 10402 + }, + { + "epoch": 0.8448793957605782, + "grad_norm": 3.106518820205016, + "learning_rate": 3.089545641172434e-07, + "loss": 0.549, + "step": 10403 + }, + { + "epoch": 0.8449606107366199, + "grad_norm": 4.4853372930685715, + "learning_rate": 3.086379614335891e-07, + "loss": 0.3649, + "step": 10404 + }, + { + "epoch": 0.8450418257126614, + "grad_norm": 6.692195853008346, + "learning_rate": 3.083215103805895e-07, + "loss": 0.5595, + "step": 10405 + }, + { + "epoch": 0.845123040688703, + "grad_norm": 3.961593687618352, + "learning_rate": 3.080052109801429e-07, + "loss": 0.4905, + "step": 10406 + }, + { + "epoch": 0.8452042556647446, + "grad_norm": 5.657024124857219, + "learning_rate": 3.0768906325413404e-07, + "loss": 0.304, + "step": 10407 + }, + { + "epoch": 0.8452854706407862, + "grad_norm": 4.947735022977894, + "learning_rate": 3.073730672244393e-07, + "loss": 0.5874, + "step": 10408 + }, + { + "epoch": 0.8453666856168277, + "grad_norm": 4.804603939591969, + "learning_rate": 3.0705722291292457e-07, + "loss": 0.5255, + "step": 10409 + }, + { + "epoch": 0.8454479005928693, + "grad_norm": 8.27849842792289, + "learning_rate": 3.067415303414442e-07, + "loss": 0.5203, + "step": 10410 + }, + { + "epoch": 0.8455291155689109, + "grad_norm": 10.32641648689632, + "learning_rate": 3.0642598953184164e-07, + "loss": 0.4933, + "step": 10411 + }, + { + "epoch": 0.8456103305449525, + "grad_norm": 3.764527737605284, + "learning_rate": 3.0611060050595166e-07, + "loss": 0.5453, + "step": 10412 + }, + { + "epoch": 0.8456915455209941, + "grad_norm": 5.190043324766475, + "learning_rate": 3.057953632855973e-07, + "loss": 0.5636, + "step": 10413 + }, + { + "epoch": 0.8457727604970356, + "grad_norm": 4.262101482407405, + "learning_rate": 3.0548027789259057e-07, + "loss": 0.5088, + "step": 10414 + }, + { + "epoch": 0.8458539754730773, + "grad_norm": 4.496146141780251, + "learning_rate": 3.05165344348734e-07, + "loss": 0.4839, + "step": 10415 + }, + { + "epoch": 0.8459351904491188, + "grad_norm": 5.9108085665989645, + "learning_rate": 3.0485056267582054e-07, + "loss": 0.6495, + "step": 10416 + }, + { + "epoch": 0.8460164054251604, + "grad_norm": 5.4963646098218355, + "learning_rate": 3.0453593289563015e-07, + "loss": 0.4643, + "step": 10417 + }, + { + "epoch": 0.846097620401202, + "grad_norm": 11.437063436049456, + "learning_rate": 3.0422145502993355e-07, + "loss": 0.4108, + "step": 10418 + }, + { + "epoch": 0.8461788353772436, + "grad_norm": 6.788003957398547, + "learning_rate": 3.0390712910049166e-07, + "loss": 0.3901, + "step": 10419 + }, + { + "epoch": 0.8462600503532851, + "grad_norm": 4.716129840076831, + "learning_rate": 3.035929551290534e-07, + "loss": 0.3811, + "step": 10420 + }, + { + "epoch": 0.8463412653293267, + "grad_norm": 4.429133558600282, + "learning_rate": 3.0327893313735814e-07, + "loss": 0.3569, + "step": 10421 + }, + { + "epoch": 0.8464224803053683, + "grad_norm": 5.2938219799131865, + "learning_rate": 3.0296506314713534e-07, + "loss": 0.4368, + "step": 10422 + }, + { + "epoch": 0.8465036952814099, + "grad_norm": 4.19248350579702, + "learning_rate": 3.0265134518010274e-07, + "loss": 0.5749, + "step": 10423 + }, + { + "epoch": 0.8465849102574515, + "grad_norm": 8.957100181001996, + "learning_rate": 3.0233777925796683e-07, + "loss": 0.4146, + "step": 10424 + }, + { + "epoch": 0.846666125233493, + "grad_norm": 4.999603068660096, + "learning_rate": 3.020243654024266e-07, + "loss": 0.3864, + "step": 10425 + }, + { + "epoch": 0.8467473402095347, + "grad_norm": 18.067549039426147, + "learning_rate": 3.017111036351672e-07, + "loss": 0.4653, + "step": 10426 + }, + { + "epoch": 0.8468285551855762, + "grad_norm": 6.599994188483718, + "learning_rate": 3.01397993977865e-07, + "loss": 0.4479, + "step": 10427 + }, + { + "epoch": 0.8469097701616178, + "grad_norm": 5.582107888533948, + "learning_rate": 3.010850364521853e-07, + "loss": 0.5203, + "step": 10428 + }, + { + "epoch": 0.8469909851376594, + "grad_norm": 4.2550759235692555, + "learning_rate": 3.007722310797842e-07, + "loss": 0.463, + "step": 10429 + }, + { + "epoch": 0.847072200113701, + "grad_norm": 4.496695788396444, + "learning_rate": 3.004595778823055e-07, + "loss": 0.6158, + "step": 10430 + }, + { + "epoch": 0.8471534150897425, + "grad_norm": 4.611503155703553, + "learning_rate": 3.0014707688138244e-07, + "loss": 0.504, + "step": 10431 + }, + { + "epoch": 0.8472346300657841, + "grad_norm": 5.19185166511966, + "learning_rate": 2.9983472809863996e-07, + "loss": 0.5022, + "step": 10432 + }, + { + "epoch": 0.8473158450418257, + "grad_norm": 4.311886074274771, + "learning_rate": 2.995225315556891e-07, + "loss": 0.5097, + "step": 10433 + }, + { + "epoch": 0.8473970600178673, + "grad_norm": 5.401400848063594, + "learning_rate": 2.992104872741336e-07, + "loss": 0.4966, + "step": 10434 + }, + { + "epoch": 0.8474782749939089, + "grad_norm": 4.369156116773576, + "learning_rate": 2.9889859527556517e-07, + "loss": 0.5447, + "step": 10435 + }, + { + "epoch": 0.8475594899699505, + "grad_norm": 4.166723822178224, + "learning_rate": 2.985868555815646e-07, + "loss": 0.5355, + "step": 10436 + }, + { + "epoch": 0.8476407049459921, + "grad_norm": 6.75124539957616, + "learning_rate": 2.9827526821370274e-07, + "loss": 0.6095, + "step": 10437 + }, + { + "epoch": 0.8477219199220336, + "grad_norm": 31.475529417596338, + "learning_rate": 2.9796383319353997e-07, + "loss": 0.4393, + "step": 10438 + }, + { + "epoch": 0.8478031348980752, + "grad_norm": 5.167153691651564, + "learning_rate": 2.976525505426253e-07, + "loss": 0.546, + "step": 10439 + }, + { + "epoch": 0.8478843498741168, + "grad_norm": 5.678318484729858, + "learning_rate": 2.9734142028249867e-07, + "loss": 0.5173, + "step": 10440 + }, + { + "epoch": 0.8479655648501584, + "grad_norm": 3.293870862181777, + "learning_rate": 2.970304424346887e-07, + "loss": 0.4497, + "step": 10441 + }, + { + "epoch": 0.8480467798261999, + "grad_norm": 7.221626978308798, + "learning_rate": 2.9671961702071314e-07, + "loss": 0.4407, + "step": 10442 + }, + { + "epoch": 0.8481279948022415, + "grad_norm": 17.363810182049495, + "learning_rate": 2.9640894406207875e-07, + "loss": 0.5855, + "step": 10443 + }, + { + "epoch": 0.8482092097782831, + "grad_norm": 74.21346545661238, + "learning_rate": 2.960984235802836e-07, + "loss": 0.5255, + "step": 10444 + }, + { + "epoch": 0.8482904247543247, + "grad_norm": 5.175532691743031, + "learning_rate": 2.957880555968137e-07, + "loss": 0.6829, + "step": 10445 + }, + { + "epoch": 0.8483716397303663, + "grad_norm": 6.761383142028467, + "learning_rate": 2.95477840133144e-07, + "loss": 0.4693, + "step": 10446 + }, + { + "epoch": 0.8484528547064079, + "grad_norm": 3.230874015786307, + "learning_rate": 2.951677772107406e-07, + "loss": 0.4897, + "step": 10447 + }, + { + "epoch": 0.8485340696824495, + "grad_norm": 4.35547207648812, + "learning_rate": 2.9485786685105876e-07, + "loss": 0.3499, + "step": 10448 + }, + { + "epoch": 0.848615284658491, + "grad_norm": 4.106258252858656, + "learning_rate": 2.945481090755417e-07, + "loss": 0.5062, + "step": 10449 + }, + { + "epoch": 0.8486964996345326, + "grad_norm": 7.391399136928735, + "learning_rate": 2.942385039056231e-07, + "loss": 0.5064, + "step": 10450 + }, + { + "epoch": 0.8487777146105742, + "grad_norm": 4.053993280844301, + "learning_rate": 2.939290513627266e-07, + "loss": 0.5534, + "step": 10451 + }, + { + "epoch": 0.8488589295866158, + "grad_norm": 7.209839450243032, + "learning_rate": 2.936197514682637e-07, + "loss": 0.5145, + "step": 10452 + }, + { + "epoch": 0.8489401445626573, + "grad_norm": 4.068133193825577, + "learning_rate": 2.933106042436368e-07, + "loss": 0.5653, + "step": 10453 + }, + { + "epoch": 0.849021359538699, + "grad_norm": 7.446752715397537, + "learning_rate": 2.930016097102378e-07, + "loss": 0.4394, + "step": 10454 + }, + { + "epoch": 0.8491025745147405, + "grad_norm": 8.151515842737028, + "learning_rate": 2.9269276788944726e-07, + "loss": 0.4768, + "step": 10455 + }, + { + "epoch": 0.8491837894907821, + "grad_norm": 4.210590324590446, + "learning_rate": 2.923840788026347e-07, + "loss": 0.5837, + "step": 10456 + }, + { + "epoch": 0.8492650044668237, + "grad_norm": 4.153598656728726, + "learning_rate": 2.9207554247116047e-07, + "loss": 0.4728, + "step": 10457 + }, + { + "epoch": 0.8493462194428653, + "grad_norm": 4.033479881839166, + "learning_rate": 2.917671589163737e-07, + "loss": 0.5434, + "step": 10458 + }, + { + "epoch": 0.8494274344189069, + "grad_norm": 4.9531557828779045, + "learning_rate": 2.9145892815961194e-07, + "loss": 0.3945, + "step": 10459 + }, + { + "epoch": 0.8495086493949484, + "grad_norm": 5.232476687979792, + "learning_rate": 2.911508502222041e-07, + "loss": 0.6322, + "step": 10460 + }, + { + "epoch": 0.84958986437099, + "grad_norm": 2.92678651363929, + "learning_rate": 2.908429251254674e-07, + "loss": 0.4652, + "step": 10461 + }, + { + "epoch": 0.8496710793470316, + "grad_norm": 6.325678937933874, + "learning_rate": 2.90535152890708e-07, + "loss": 0.4732, + "step": 10462 + }, + { + "epoch": 0.8497522943230732, + "grad_norm": 5.685417644326699, + "learning_rate": 2.902275335392232e-07, + "loss": 0.6602, + "step": 10463 + }, + { + "epoch": 0.8498335092991147, + "grad_norm": 4.163035284682951, + "learning_rate": 2.8992006709229803e-07, + "loss": 0.3988, + "step": 10464 + }, + { + "epoch": 0.8499147242751564, + "grad_norm": 4.57874278542473, + "learning_rate": 2.8961275357120704e-07, + "loss": 0.4362, + "step": 10465 + }, + { + "epoch": 0.8499959392511979, + "grad_norm": 5.572554269055086, + "learning_rate": 2.893055929972152e-07, + "loss": 0.4934, + "step": 10466 + }, + { + "epoch": 0.8500771542272395, + "grad_norm": 10.65165020002116, + "learning_rate": 2.8899858539157694e-07, + "loss": 0.5137, + "step": 10467 + }, + { + "epoch": 0.8501583692032811, + "grad_norm": 9.776998737199031, + "learning_rate": 2.886917307755349e-07, + "loss": 0.4341, + "step": 10468 + }, + { + "epoch": 0.8502395841793227, + "grad_norm": 6.586810382710384, + "learning_rate": 2.8838502917032136e-07, + "loss": 0.4922, + "step": 10469 + }, + { + "epoch": 0.8503207991553643, + "grad_norm": 5.154039748490755, + "learning_rate": 2.880784805971595e-07, + "loss": 0.3545, + "step": 10470 + }, + { + "epoch": 0.8504020141314058, + "grad_norm": 3.9568608557183516, + "learning_rate": 2.8777208507726056e-07, + "loss": 0.4876, + "step": 10471 + }, + { + "epoch": 0.8504832291074474, + "grad_norm": 4.288857280998075, + "learning_rate": 2.874658426318244e-07, + "loss": 0.4853, + "step": 10472 + }, + { + "epoch": 0.850564444083489, + "grad_norm": 4.368402834061044, + "learning_rate": 2.871597532820425e-07, + "loss": 0.4401, + "step": 10473 + }, + { + "epoch": 0.8506456590595306, + "grad_norm": 4.033924407915728, + "learning_rate": 2.86853817049095e-07, + "loss": 0.5619, + "step": 10474 + }, + { + "epoch": 0.8507268740355721, + "grad_norm": 5.6815820407123585, + "learning_rate": 2.865480339541496e-07, + "loss": 0.4642, + "step": 10475 + }, + { + "epoch": 0.8508080890116138, + "grad_norm": 3.948873040109591, + "learning_rate": 2.8624240401836647e-07, + "loss": 0.4468, + "step": 10476 + }, + { + "epoch": 0.8508893039876553, + "grad_norm": 3.2353533973872084, + "learning_rate": 2.859369272628928e-07, + "loss": 0.532, + "step": 10477 + }, + { + "epoch": 0.8509705189636969, + "grad_norm": 3.308823715003122, + "learning_rate": 2.856316037088655e-07, + "loss": 0.4157, + "step": 10478 + }, + { + "epoch": 0.8510517339397385, + "grad_norm": 13.811883074908797, + "learning_rate": 2.8532643337741195e-07, + "loss": 0.4894, + "step": 10479 + }, + { + "epoch": 0.8511329489157801, + "grad_norm": 3.2224568567146945, + "learning_rate": 2.8502141628964836e-07, + "loss": 0.4306, + "step": 10480 + }, + { + "epoch": 0.8512141638918217, + "grad_norm": 3.9763076127585313, + "learning_rate": 2.8471655246668007e-07, + "loss": 0.5575, + "step": 10481 + }, + { + "epoch": 0.8512953788678632, + "grad_norm": 6.740981385096001, + "learning_rate": 2.844118419296024e-07, + "loss": 0.4055, + "step": 10482 + }, + { + "epoch": 0.8513765938439048, + "grad_norm": 20.280874148042784, + "learning_rate": 2.841072846994994e-07, + "loss": 0.3875, + "step": 10483 + }, + { + "epoch": 0.8514578088199464, + "grad_norm": 9.735322905143065, + "learning_rate": 2.8380288079744494e-07, + "loss": 0.3133, + "step": 10484 + }, + { + "epoch": 0.851539023795988, + "grad_norm": 4.313636812248141, + "learning_rate": 2.8349863024450143e-07, + "loss": 0.4564, + "step": 10485 + }, + { + "epoch": 0.8516202387720295, + "grad_norm": 6.764374322762755, + "learning_rate": 2.8319453306172225e-07, + "loss": 0.396, + "step": 10486 + }, + { + "epoch": 0.8517014537480712, + "grad_norm": 5.159578175212937, + "learning_rate": 2.8289058927014944e-07, + "loss": 0.5508, + "step": 10487 + }, + { + "epoch": 0.8517826687241127, + "grad_norm": 9.297209132292254, + "learning_rate": 2.8258679889081346e-07, + "loss": 0.4523, + "step": 10488 + }, + { + "epoch": 0.8518638837001543, + "grad_norm": 3.958399617586093, + "learning_rate": 2.8228316194473607e-07, + "loss": 0.4515, + "step": 10489 + }, + { + "epoch": 0.8519450986761959, + "grad_norm": 6.059112536774565, + "learning_rate": 2.8197967845292687e-07, + "loss": 0.5989, + "step": 10490 + }, + { + "epoch": 0.8520263136522375, + "grad_norm": 7.811507918796633, + "learning_rate": 2.8167634843638434e-07, + "loss": 0.3805, + "step": 10491 + }, + { + "epoch": 0.8521075286282791, + "grad_norm": 6.346071562908303, + "learning_rate": 2.8137317191609864e-07, + "loss": 0.3511, + "step": 10492 + }, + { + "epoch": 0.8521887436043206, + "grad_norm": 8.889395270642837, + "learning_rate": 2.810701489130477e-07, + "loss": 0.3602, + "step": 10493 + }, + { + "epoch": 0.8522699585803623, + "grad_norm": 5.77337789424289, + "learning_rate": 2.807672794481986e-07, + "loss": 0.3249, + "step": 10494 + }, + { + "epoch": 0.8523511735564038, + "grad_norm": 5.856871874864799, + "learning_rate": 2.804645635425091e-07, + "loss": 0.4549, + "step": 10495 + }, + { + "epoch": 0.8524323885324454, + "grad_norm": 3.9708905797147924, + "learning_rate": 2.801620012169251e-07, + "loss": 0.4752, + "step": 10496 + }, + { + "epoch": 0.8525136035084869, + "grad_norm": 5.58347667715412, + "learning_rate": 2.7985959249238165e-07, + "loss": 0.5097, + "step": 10497 + }, + { + "epoch": 0.8525948184845286, + "grad_norm": 5.356550864085213, + "learning_rate": 2.7955733738980443e-07, + "loss": 0.3949, + "step": 10498 + }, + { + "epoch": 0.8526760334605701, + "grad_norm": 5.227735405567097, + "learning_rate": 2.792552359301087e-07, + "loss": 0.4579, + "step": 10499 + }, + { + "epoch": 0.8527572484366117, + "grad_norm": 10.79645556327445, + "learning_rate": 2.789532881341969e-07, + "loss": 0.5133, + "step": 10500 + }, + { + "epoch": 0.8528384634126533, + "grad_norm": 9.986241964497967, + "learning_rate": 2.786514940229634e-07, + "loss": 0.4597, + "step": 10501 + }, + { + "epoch": 0.8529196783886949, + "grad_norm": 5.528139225360127, + "learning_rate": 2.7834985361728987e-07, + "loss": 0.312, + "step": 10502 + }, + { + "epoch": 0.8530008933647365, + "grad_norm": 5.4346809701316685, + "learning_rate": 2.7804836693804905e-07, + "loss": 0.6169, + "step": 10503 + }, + { + "epoch": 0.853082108340778, + "grad_norm": 4.421996352718119, + "learning_rate": 2.7774703400610086e-07, + "loss": 0.5074, + "step": 10504 + }, + { + "epoch": 0.8531633233168197, + "grad_norm": 12.764502226614324, + "learning_rate": 2.7744585484229674e-07, + "loss": 0.2709, + "step": 10505 + }, + { + "epoch": 0.8532445382928612, + "grad_norm": 5.85671022027329, + "learning_rate": 2.771448294674775e-07, + "loss": 0.4455, + "step": 10506 + }, + { + "epoch": 0.8533257532689028, + "grad_norm": 8.599669196683427, + "learning_rate": 2.768439579024712e-07, + "loss": 0.5653, + "step": 10507 + }, + { + "epoch": 0.8534069682449443, + "grad_norm": 6.732667207212168, + "learning_rate": 2.7654324016809757e-07, + "loss": 0.4152, + "step": 10508 + }, + { + "epoch": 0.853488183220986, + "grad_norm": 4.779828616945185, + "learning_rate": 2.7624267628516445e-07, + "loss": 0.4589, + "step": 10509 + }, + { + "epoch": 0.8535693981970275, + "grad_norm": 11.113748713116502, + "learning_rate": 2.759422662744682e-07, + "loss": 0.3779, + "step": 10510 + }, + { + "epoch": 0.8536506131730691, + "grad_norm": 4.481185872009129, + "learning_rate": 2.7564201015679664e-07, + "loss": 0.4627, + "step": 10511 + }, + { + "epoch": 0.8537318281491107, + "grad_norm": 4.723745824336893, + "learning_rate": 2.7534190795292626e-07, + "loss": 0.5168, + "step": 10512 + }, + { + "epoch": 0.8538130431251523, + "grad_norm": 4.5772924241630895, + "learning_rate": 2.750419596836215e-07, + "loss": 0.5351, + "step": 10513 + }, + { + "epoch": 0.8538942581011939, + "grad_norm": 4.890888601625891, + "learning_rate": 2.7474216536963803e-07, + "loss": 0.358, + "step": 10514 + }, + { + "epoch": 0.8539754730772354, + "grad_norm": 9.34358416833541, + "learning_rate": 2.744425250317201e-07, + "loss": 0.3687, + "step": 10515 + }, + { + "epoch": 0.8540566880532771, + "grad_norm": 4.739733132295761, + "learning_rate": 2.7414303869059994e-07, + "loss": 0.5659, + "step": 10516 + }, + { + "epoch": 0.8541379030293186, + "grad_norm": 4.5314061539867145, + "learning_rate": 2.7384370636700187e-07, + "loss": 0.4997, + "step": 10517 + }, + { + "epoch": 0.8542191180053602, + "grad_norm": 10.166709743253572, + "learning_rate": 2.735445280816373e-07, + "loss": 0.3698, + "step": 10518 + }, + { + "epoch": 0.8543003329814017, + "grad_norm": 6.233844850352306, + "learning_rate": 2.7324550385520844e-07, + "loss": 0.4164, + "step": 10519 + }, + { + "epoch": 0.8543815479574434, + "grad_norm": 4.1121635178190665, + "learning_rate": 2.72946633708405e-07, + "loss": 0.5892, + "step": 10520 + }, + { + "epoch": 0.8544627629334849, + "grad_norm": 10.454864690447497, + "learning_rate": 2.726479176619087e-07, + "loss": 0.5313, + "step": 10521 + }, + { + "epoch": 0.8545439779095265, + "grad_norm": 3.298486331376736, + "learning_rate": 2.723493557363885e-07, + "loss": 0.6448, + "step": 10522 + }, + { + "epoch": 0.8546251928855682, + "grad_norm": 4.677659267745939, + "learning_rate": 2.720509479525027e-07, + "loss": 0.5786, + "step": 10523 + }, + { + "epoch": 0.8547064078616097, + "grad_norm": 4.621472290562395, + "learning_rate": 2.7175269433089984e-07, + "loss": 0.4443, + "step": 10524 + }, + { + "epoch": 0.8547876228376513, + "grad_norm": 4.075558864406144, + "learning_rate": 2.7145459489221845e-07, + "loss": 0.4766, + "step": 10525 + }, + { + "epoch": 0.8548688378136928, + "grad_norm": 3.9376155282925427, + "learning_rate": 2.7115664965708387e-07, + "loss": 0.5365, + "step": 10526 + }, + { + "epoch": 0.8549500527897345, + "grad_norm": 4.9988126907241135, + "learning_rate": 2.708588586461139e-07, + "loss": 0.5849, + "step": 10527 + }, + { + "epoch": 0.855031267765776, + "grad_norm": 4.260322647357954, + "learning_rate": 2.7056122187991306e-07, + "loss": 0.5862, + "step": 10528 + }, + { + "epoch": 0.8551124827418176, + "grad_norm": 4.653669694516675, + "learning_rate": 2.7026373937907636e-07, + "loss": 0.6337, + "step": 10529 + }, + { + "epoch": 0.8551936977178591, + "grad_norm": 16.425837953856785, + "learning_rate": 2.6996641116418863e-07, + "loss": 0.3838, + "step": 10530 + }, + { + "epoch": 0.8552749126939008, + "grad_norm": 8.7333194311729, + "learning_rate": 2.696692372558224e-07, + "loss": 0.3394, + "step": 10531 + }, + { + "epoch": 0.8553561276699423, + "grad_norm": 6.271008188155812, + "learning_rate": 2.6937221767454086e-07, + "loss": 0.4557, + "step": 10532 + }, + { + "epoch": 0.8554373426459839, + "grad_norm": 4.33027681898429, + "learning_rate": 2.690753524408973e-07, + "loss": 0.4263, + "step": 10533 + }, + { + "epoch": 0.8555185576220256, + "grad_norm": 8.869308357187672, + "learning_rate": 2.6877864157543204e-07, + "loss": 0.4403, + "step": 10534 + }, + { + "epoch": 0.8555997725980671, + "grad_norm": 2.978988832273135, + "learning_rate": 2.684820850986758e-07, + "loss": 0.5313, + "step": 10535 + }, + { + "epoch": 0.8556809875741087, + "grad_norm": 8.360644599601576, + "learning_rate": 2.6818568303114967e-07, + "loss": 0.422, + "step": 10536 + }, + { + "epoch": 0.8557622025501502, + "grad_norm": 11.386127329062182, + "learning_rate": 2.67889435393362e-07, + "loss": 0.3573, + "step": 10537 + }, + { + "epoch": 0.8558434175261919, + "grad_norm": 5.2062587599165475, + "learning_rate": 2.6759334220581273e-07, + "loss": 0.6315, + "step": 10538 + }, + { + "epoch": 0.8559246325022334, + "grad_norm": 4.445713015076414, + "learning_rate": 2.6729740348898886e-07, + "loss": 0.4583, + "step": 10539 + }, + { + "epoch": 0.856005847478275, + "grad_norm": 5.948969244528773, + "learning_rate": 2.670016192633687e-07, + "loss": 0.4175, + "step": 10540 + }, + { + "epoch": 0.8560870624543165, + "grad_norm": 7.049516828972635, + "learning_rate": 2.667059895494184e-07, + "loss": 0.4651, + "step": 10541 + }, + { + "epoch": 0.8561682774303582, + "grad_norm": 12.341170101279898, + "learning_rate": 2.6641051436759353e-07, + "loss": 0.397, + "step": 10542 + }, + { + "epoch": 0.8562494924063997, + "grad_norm": 4.024117199833696, + "learning_rate": 2.6611519373834076e-07, + "loss": 0.5794, + "step": 10543 + }, + { + "epoch": 0.8563307073824413, + "grad_norm": 9.173352219765476, + "learning_rate": 2.6582002768209326e-07, + "loss": 0.4269, + "step": 10544 + }, + { + "epoch": 0.856411922358483, + "grad_norm": 6.424827613404485, + "learning_rate": 2.6552501621927544e-07, + "loss": 0.5124, + "step": 10545 + }, + { + "epoch": 0.8564931373345245, + "grad_norm": 4.601328438451911, + "learning_rate": 2.6523015937030136e-07, + "loss": 0.4117, + "step": 10546 + }, + { + "epoch": 0.8565743523105661, + "grad_norm": 5.439497421746022, + "learning_rate": 2.649354571555729e-07, + "loss": 0.3709, + "step": 10547 + }, + { + "epoch": 0.8566555672866076, + "grad_norm": 5.6866827995608755, + "learning_rate": 2.6464090959548135e-07, + "loss": 0.4981, + "step": 10548 + }, + { + "epoch": 0.8567367822626493, + "grad_norm": 8.576641000068408, + "learning_rate": 2.6434651671040894e-07, + "loss": 0.4026, + "step": 10549 + }, + { + "epoch": 0.8568179972386908, + "grad_norm": 4.32299331991155, + "learning_rate": 2.6405227852072504e-07, + "loss": 0.5568, + "step": 10550 + }, + { + "epoch": 0.8568992122147324, + "grad_norm": 6.515668343832238, + "learning_rate": 2.637581950467896e-07, + "loss": 0.7008, + "step": 10551 + }, + { + "epoch": 0.8569804271907739, + "grad_norm": 4.115866997613363, + "learning_rate": 2.634642663089529e-07, + "loss": 0.5308, + "step": 10552 + }, + { + "epoch": 0.8570616421668156, + "grad_norm": 5.714701058196075, + "learning_rate": 2.6317049232755185e-07, + "loss": 0.4151, + "step": 10553 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 6.258820706442769, + "learning_rate": 2.628768731229142e-07, + "loss": 0.3542, + "step": 10554 + }, + { + "epoch": 0.8572240721188987, + "grad_norm": 7.447014078813493, + "learning_rate": 2.6258340871535753e-07, + "loss": 0.3555, + "step": 10555 + }, + { + "epoch": 0.8573052870949404, + "grad_norm": 4.687623344122876, + "learning_rate": 2.6229009912518754e-07, + "loss": 0.3792, + "step": 10556 + }, + { + "epoch": 0.8573865020709819, + "grad_norm": 4.765486433369644, + "learning_rate": 2.619969443726994e-07, + "loss": 0.5262, + "step": 10557 + }, + { + "epoch": 0.8574677170470235, + "grad_norm": 4.429572901564648, + "learning_rate": 2.6170394447817824e-07, + "loss": 0.3549, + "step": 10558 + }, + { + "epoch": 0.857548932023065, + "grad_norm": 5.054699713881342, + "learning_rate": 2.6141109946189874e-07, + "loss": 0.6322, + "step": 10559 + }, + { + "epoch": 0.8576301469991067, + "grad_norm": 5.352387291113756, + "learning_rate": 2.611184093441232e-07, + "loss": 0.4139, + "step": 10560 + }, + { + "epoch": 0.8577113619751482, + "grad_norm": 7.045189522256243, + "learning_rate": 2.608258741451045e-07, + "loss": 0.6806, + "step": 10561 + }, + { + "epoch": 0.8577925769511898, + "grad_norm": 4.4588250538525935, + "learning_rate": 2.605334938850851e-07, + "loss": 0.5799, + "step": 10562 + }, + { + "epoch": 0.8578737919272313, + "grad_norm": 4.741068158876179, + "learning_rate": 2.6024126858429503e-07, + "loss": 0.4641, + "step": 10563 + }, + { + "epoch": 0.857955006903273, + "grad_norm": 6.387856164555808, + "learning_rate": 2.599491982629554e-07, + "loss": 0.4024, + "step": 10564 + }, + { + "epoch": 0.8580362218793145, + "grad_norm": 11.111320718396847, + "learning_rate": 2.596572829412766e-07, + "loss": 0.5682, + "step": 10565 + }, + { + "epoch": 0.8581174368553561, + "grad_norm": 3.9432874147413366, + "learning_rate": 2.59365522639457e-07, + "loss": 0.4614, + "step": 10566 + }, + { + "epoch": 0.8581986518313978, + "grad_norm": 4.644099435339523, + "learning_rate": 2.590739173776841e-07, + "loss": 0.5058, + "step": 10567 + }, + { + "epoch": 0.8582798668074393, + "grad_norm": 6.2728581984877865, + "learning_rate": 2.5878246717613684e-07, + "loss": 0.3661, + "step": 10568 + }, + { + "epoch": 0.8583610817834809, + "grad_norm": 4.322705528890997, + "learning_rate": 2.5849117205498096e-07, + "loss": 0.6458, + "step": 10569 + }, + { + "epoch": 0.8584422967595224, + "grad_norm": 3.749986780079194, + "learning_rate": 2.582000320343728e-07, + "loss": 0.6983, + "step": 10570 + }, + { + "epoch": 0.8585235117355641, + "grad_norm": 7.654907749368039, + "learning_rate": 2.579090471344584e-07, + "loss": 0.4619, + "step": 10571 + }, + { + "epoch": 0.8586047267116056, + "grad_norm": 7.384217550856351, + "learning_rate": 2.576182173753719e-07, + "loss": 0.4562, + "step": 10572 + }, + { + "epoch": 0.8586859416876472, + "grad_norm": 6.675039979352513, + "learning_rate": 2.5732754277723703e-07, + "loss": 0.5025, + "step": 10573 + }, + { + "epoch": 0.8587671566636887, + "grad_norm": 4.025631310651602, + "learning_rate": 2.5703702336016654e-07, + "loss": 0.4889, + "step": 10574 + }, + { + "epoch": 0.8588483716397304, + "grad_norm": 4.195667201107953, + "learning_rate": 2.567466591442638e-07, + "loss": 0.3889, + "step": 10575 + }, + { + "epoch": 0.8589295866157719, + "grad_norm": 5.444485183310965, + "learning_rate": 2.5645645014961947e-07, + "loss": 0.4681, + "step": 10576 + }, + { + "epoch": 0.8590108015918135, + "grad_norm": 4.670632142056175, + "learning_rate": 2.561663963963151e-07, + "loss": 0.558, + "step": 10577 + }, + { + "epoch": 0.8590920165678552, + "grad_norm": 5.0298060033672645, + "learning_rate": 2.558764979044212e-07, + "loss": 0.5758, + "step": 10578 + }, + { + "epoch": 0.8591732315438967, + "grad_norm": 3.3846837426199845, + "learning_rate": 2.555867546939969e-07, + "loss": 0.4184, + "step": 10579 + }, + { + "epoch": 0.8592544465199383, + "grad_norm": 3.2790619896944957, + "learning_rate": 2.5529716678509007e-07, + "loss": 0.3676, + "step": 10580 + }, + { + "epoch": 0.8593356614959798, + "grad_norm": 8.215528066018926, + "learning_rate": 2.5500773419774e-07, + "loss": 0.5089, + "step": 10581 + }, + { + "epoch": 0.8594168764720215, + "grad_norm": 7.985001045047634, + "learning_rate": 2.547184569519728e-07, + "loss": 0.5952, + "step": 10582 + }, + { + "epoch": 0.859498091448063, + "grad_norm": 9.533391213718112, + "learning_rate": 2.5442933506780536e-07, + "loss": 0.4201, + "step": 10583 + }, + { + "epoch": 0.8595793064241046, + "grad_norm": 3.854806058643972, + "learning_rate": 2.541403685652438e-07, + "loss": 0.5651, + "step": 10584 + }, + { + "epoch": 0.8596605214001461, + "grad_norm": 10.475802887731723, + "learning_rate": 2.53851557464283e-07, + "loss": 0.4844, + "step": 10585 + }, + { + "epoch": 0.8597417363761878, + "grad_norm": 6.07171775635679, + "learning_rate": 2.535629017849062e-07, + "loss": 0.4356, + "step": 10586 + }, + { + "epoch": 0.8598229513522293, + "grad_norm": 9.000523149669734, + "learning_rate": 2.532744015470878e-07, + "loss": 0.3976, + "step": 10587 + }, + { + "epoch": 0.8599041663282709, + "grad_norm": 3.9919734198421164, + "learning_rate": 2.529860567707904e-07, + "loss": 0.4646, + "step": 10588 + }, + { + "epoch": 0.8599853813043126, + "grad_norm": 5.113963141533617, + "learning_rate": 2.5269786747596504e-07, + "loss": 0.5287, + "step": 10589 + }, + { + "epoch": 0.8600665962803541, + "grad_norm": 6.0753816648158345, + "learning_rate": 2.5240983368255365e-07, + "loss": 0.3916, + "step": 10590 + }, + { + "epoch": 0.8601478112563957, + "grad_norm": 5.777553433923641, + "learning_rate": 2.52121955410487e-07, + "loss": 0.7061, + "step": 10591 + }, + { + "epoch": 0.8602290262324372, + "grad_norm": 6.304746787186769, + "learning_rate": 2.518342326796844e-07, + "loss": 0.4157, + "step": 10592 + }, + { + "epoch": 0.8603102412084789, + "grad_norm": 8.184175586338531, + "learning_rate": 2.515466655100543e-07, + "loss": 0.4792, + "step": 10593 + }, + { + "epoch": 0.8603914561845204, + "grad_norm": 7.804180717198223, + "learning_rate": 2.5125925392149533e-07, + "loss": 0.4817, + "step": 10594 + }, + { + "epoch": 0.860472671160562, + "grad_norm": 7.390682075003671, + "learning_rate": 2.5097199793389456e-07, + "loss": 0.3667, + "step": 10595 + }, + { + "epoch": 0.8605538861366036, + "grad_norm": 4.844008089075363, + "learning_rate": 2.506848975671283e-07, + "loss": 0.4751, + "step": 10596 + }, + { + "epoch": 0.8606351011126452, + "grad_norm": 15.0313162783177, + "learning_rate": 2.5039795284106354e-07, + "loss": 0.47, + "step": 10597 + }, + { + "epoch": 0.8607163160886867, + "grad_norm": 7.827029414244254, + "learning_rate": 2.5011116377555463e-07, + "loss": 0.3705, + "step": 10598 + }, + { + "epoch": 0.8607975310647283, + "grad_norm": 4.165034621143684, + "learning_rate": 2.4982453039044536e-07, + "loss": 0.5141, + "step": 10599 + }, + { + "epoch": 0.86087874604077, + "grad_norm": 9.403450320448428, + "learning_rate": 2.495380527055699e-07, + "loss": 0.5018, + "step": 10600 + }, + { + "epoch": 0.8609599610168115, + "grad_norm": 3.669900811182216, + "learning_rate": 2.49251730740751e-07, + "loss": 0.5024, + "step": 10601 + }, + { + "epoch": 0.8610411759928531, + "grad_norm": 8.666914052435182, + "learning_rate": 2.4896556451579985e-07, + "loss": 0.3806, + "step": 10602 + }, + { + "epoch": 0.8611223909688946, + "grad_norm": 6.5382282595398955, + "learning_rate": 2.4867955405051826e-07, + "loss": 0.436, + "step": 10603 + }, + { + "epoch": 0.8612036059449363, + "grad_norm": 6.652949951974725, + "learning_rate": 2.483936993646971e-07, + "loss": 0.4528, + "step": 10604 + }, + { + "epoch": 0.8612848209209778, + "grad_norm": 7.183747958159308, + "learning_rate": 2.48108000478115e-07, + "loss": 0.4787, + "step": 10605 + }, + { + "epoch": 0.8613660358970194, + "grad_norm": 10.226892509366555, + "learning_rate": 2.4782245741054175e-07, + "loss": 0.5123, + "step": 10606 + }, + { + "epoch": 0.861447250873061, + "grad_norm": 6.451808047428951, + "learning_rate": 2.475370701817348e-07, + "loss": 0.3932, + "step": 10607 + }, + { + "epoch": 0.8615284658491026, + "grad_norm": 4.898759701665131, + "learning_rate": 2.4725183881144114e-07, + "loss": 0.5491, + "step": 10608 + }, + { + "epoch": 0.8616096808251441, + "grad_norm": 31.142791051653955, + "learning_rate": 2.4696676331939786e-07, + "loss": 0.4651, + "step": 10609 + }, + { + "epoch": 0.8616908958011857, + "grad_norm": 3.5669616787142697, + "learning_rate": 2.46681843725331e-07, + "loss": 0.5049, + "step": 10610 + }, + { + "epoch": 0.8617721107772274, + "grad_norm": 5.205643471968962, + "learning_rate": 2.4639708004895515e-07, + "loss": 0.4391, + "step": 10611 + }, + { + "epoch": 0.8618533257532689, + "grad_norm": 5.483632769288808, + "learning_rate": 2.4611247230997366e-07, + "loss": 0.5294, + "step": 10612 + }, + { + "epoch": 0.8619345407293105, + "grad_norm": 7.848858229369638, + "learning_rate": 2.458280205280811e-07, + "loss": 0.4576, + "step": 10613 + }, + { + "epoch": 0.862015755705352, + "grad_norm": 4.862179433393447, + "learning_rate": 2.455437247229595e-07, + "loss": 0.5552, + "step": 10614 + }, + { + "epoch": 0.8620969706813937, + "grad_norm": 7.004075796989462, + "learning_rate": 2.4525958491428026e-07, + "loss": 0.573, + "step": 10615 + }, + { + "epoch": 0.8621781856574352, + "grad_norm": 27.404691438629225, + "learning_rate": 2.4497560112170444e-07, + "loss": 0.5041, + "step": 10616 + }, + { + "epoch": 0.8622594006334768, + "grad_norm": 3.9710080611622955, + "learning_rate": 2.446917733648829e-07, + "loss": 0.6183, + "step": 10617 + }, + { + "epoch": 0.8623406156095184, + "grad_norm": 7.736011644339195, + "learning_rate": 2.444081016634545e-07, + "loss": 0.6269, + "step": 10618 + }, + { + "epoch": 0.86242183058556, + "grad_norm": 3.296357608833365, + "learning_rate": 2.4412458603704806e-07, + "loss": 0.6128, + "step": 10619 + }, + { + "epoch": 0.8625030455616015, + "grad_norm": 4.26650357660672, + "learning_rate": 2.438412265052814e-07, + "loss": 0.4254, + "step": 10620 + }, + { + "epoch": 0.8625842605376431, + "grad_norm": 7.332608647465775, + "learning_rate": 2.4355802308776073e-07, + "loss": 0.4871, + "step": 10621 + }, + { + "epoch": 0.8626654755136848, + "grad_norm": 4.755644735563533, + "learning_rate": 2.4327497580408285e-07, + "loss": 0.5566, + "step": 10622 + }, + { + "epoch": 0.8627466904897263, + "grad_norm": 3.8321180175370824, + "learning_rate": 2.4299208467383347e-07, + "loss": 0.4417, + "step": 10623 + }, + { + "epoch": 0.8628279054657679, + "grad_norm": 4.133812851532381, + "learning_rate": 2.427093497165864e-07, + "loss": 0.4393, + "step": 10624 + }, + { + "epoch": 0.8629091204418095, + "grad_norm": 4.292765424503046, + "learning_rate": 2.4242677095190623e-07, + "loss": 0.5653, + "step": 10625 + }, + { + "epoch": 0.8629903354178511, + "grad_norm": 5.735964641093858, + "learning_rate": 2.4214434839934545e-07, + "loss": 0.3592, + "step": 10626 + }, + { + "epoch": 0.8630715503938926, + "grad_norm": 8.285033867612887, + "learning_rate": 2.418620820784462e-07, + "loss": 0.5062, + "step": 10627 + }, + { + "epoch": 0.8631527653699342, + "grad_norm": 4.512095996980739, + "learning_rate": 2.4157997200873945e-07, + "loss": 0.3959, + "step": 10628 + }, + { + "epoch": 0.8632339803459758, + "grad_norm": 5.106558948145591, + "learning_rate": 2.4129801820974604e-07, + "loss": 0.4257, + "step": 10629 + }, + { + "epoch": 0.8633151953220174, + "grad_norm": 8.542776714887058, + "learning_rate": 2.410162207009761e-07, + "loss": 0.4211, + "step": 10630 + }, + { + "epoch": 0.8633964102980589, + "grad_norm": 4.654592367436493, + "learning_rate": 2.4073457950192806e-07, + "loss": 0.4189, + "step": 10631 + }, + { + "epoch": 0.8634776252741005, + "grad_norm": 6.844838480590244, + "learning_rate": 2.404530946320904e-07, + "loss": 0.5268, + "step": 10632 + }, + { + "epoch": 0.8635588402501422, + "grad_norm": 5.39856003020653, + "learning_rate": 2.401717661109401e-07, + "loss": 0.5319, + "step": 10633 + }, + { + "epoch": 0.8636400552261837, + "grad_norm": 6.044733492723436, + "learning_rate": 2.398905939579432e-07, + "loss": 0.4763, + "step": 10634 + }, + { + "epoch": 0.8637212702022253, + "grad_norm": 3.216826620946339, + "learning_rate": 2.396095781925556e-07, + "loss": 0.4624, + "step": 10635 + }, + { + "epoch": 0.8638024851782669, + "grad_norm": 4.322010048301707, + "learning_rate": 2.3932871883422276e-07, + "loss": 0.3858, + "step": 10636 + }, + { + "epoch": 0.8638837001543085, + "grad_norm": 6.581640577707594, + "learning_rate": 2.3904801590237783e-07, + "loss": 0.3858, + "step": 10637 + }, + { + "epoch": 0.86396491513035, + "grad_norm": 5.430902885102804, + "learning_rate": 2.3876746941644464e-07, + "loss": 0.547, + "step": 10638 + }, + { + "epoch": 0.8640461301063916, + "grad_norm": 9.539650370477867, + "learning_rate": 2.384870793958349e-07, + "loss": 0.4549, + "step": 10639 + }, + { + "epoch": 0.8641273450824332, + "grad_norm": 6.254836784455941, + "learning_rate": 2.3820684585995012e-07, + "loss": 0.5427, + "step": 10640 + }, + { + "epoch": 0.8642085600584748, + "grad_norm": 7.936489779390335, + "learning_rate": 2.379267688281814e-07, + "loss": 0.394, + "step": 10641 + }, + { + "epoch": 0.8642897750345163, + "grad_norm": 4.9403538410304, + "learning_rate": 2.3764684831990874e-07, + "loss": 0.5883, + "step": 10642 + }, + { + "epoch": 0.864370990010558, + "grad_norm": 5.718951894452898, + "learning_rate": 2.3736708435450033e-07, + "loss": 0.4907, + "step": 10643 + }, + { + "epoch": 0.8644522049865996, + "grad_norm": 4.730922288439977, + "learning_rate": 2.370874769513154e-07, + "loss": 0.4677, + "step": 10644 + }, + { + "epoch": 0.8645334199626411, + "grad_norm": 7.22516165081707, + "learning_rate": 2.3680802612970068e-07, + "loss": 0.4901, + "step": 10645 + }, + { + "epoch": 0.8646146349386827, + "grad_norm": 5.535751530022896, + "learning_rate": 2.365287319089929e-07, + "loss": 0.4482, + "step": 10646 + }, + { + "epoch": 0.8646958499147243, + "grad_norm": 4.330430490684841, + "learning_rate": 2.362495943085172e-07, + "loss": 0.3961, + "step": 10647 + }, + { + "epoch": 0.8647770648907659, + "grad_norm": 5.262984242628663, + "learning_rate": 2.3597061334758864e-07, + "loss": 0.4269, + "step": 10648 + }, + { + "epoch": 0.8648582798668074, + "grad_norm": 3.5730441803712014, + "learning_rate": 2.3569178904551181e-07, + "loss": 0.4865, + "step": 10649 + }, + { + "epoch": 0.864939494842849, + "grad_norm": 4.35982646863664, + "learning_rate": 2.3541312142157934e-07, + "loss": 0.62, + "step": 10650 + }, + { + "epoch": 0.8650207098188906, + "grad_norm": 5.501060251786152, + "learning_rate": 2.3513461049507385e-07, + "loss": 0.309, + "step": 10651 + }, + { + "epoch": 0.8651019247949322, + "grad_norm": 18.5509951879782, + "learning_rate": 2.3485625628526688e-07, + "loss": 0.3867, + "step": 10652 + }, + { + "epoch": 0.8651831397709737, + "grad_norm": 4.778384806825011, + "learning_rate": 2.3457805881141854e-07, + "loss": 0.4658, + "step": 10653 + }, + { + "epoch": 0.8652643547470154, + "grad_norm": 7.32649293368322, + "learning_rate": 2.3430001809277873e-07, + "loss": 0.483, + "step": 10654 + }, + { + "epoch": 0.865345569723057, + "grad_norm": 6.199897652234001, + "learning_rate": 2.340221341485871e-07, + "loss": 0.435, + "step": 10655 + }, + { + "epoch": 0.8654267846990985, + "grad_norm": 4.754389895318495, + "learning_rate": 2.3374440699807072e-07, + "loss": 0.5084, + "step": 10656 + }, + { + "epoch": 0.8655079996751401, + "grad_norm": 4.649740673308111, + "learning_rate": 2.334668366604481e-07, + "loss": 0.5881, + "step": 10657 + }, + { + "epoch": 0.8655892146511817, + "grad_norm": 6.638508068101479, + "learning_rate": 2.3318942315492477e-07, + "loss": 0.38, + "step": 10658 + }, + { + "epoch": 0.8656704296272233, + "grad_norm": 3.952860937759617, + "learning_rate": 2.3291216650069587e-07, + "loss": 0.4626, + "step": 10659 + }, + { + "epoch": 0.8657516446032648, + "grad_norm": 6.6956489482132, + "learning_rate": 2.3263506671694747e-07, + "loss": 0.4982, + "step": 10660 + }, + { + "epoch": 0.8658328595793064, + "grad_norm": 3.478866841645943, + "learning_rate": 2.323581238228517e-07, + "loss": 0.4391, + "step": 10661 + }, + { + "epoch": 0.865914074555348, + "grad_norm": 4.525751483245147, + "learning_rate": 2.3208133783757302e-07, + "loss": 0.5142, + "step": 10662 + }, + { + "epoch": 0.8659952895313896, + "grad_norm": 4.621330522104981, + "learning_rate": 2.3180470878026275e-07, + "loss": 0.5449, + "step": 10663 + }, + { + "epoch": 0.8660765045074311, + "grad_norm": 4.973654556392699, + "learning_rate": 2.3152823667006248e-07, + "loss": 0.4468, + "step": 10664 + }, + { + "epoch": 0.8661577194834728, + "grad_norm": 5.396466731566874, + "learning_rate": 2.3125192152610277e-07, + "loss": 0.633, + "step": 10665 + }, + { + "epoch": 0.8662389344595144, + "grad_norm": 4.918829274592111, + "learning_rate": 2.3097576336750248e-07, + "loss": 0.4153, + "step": 10666 + }, + { + "epoch": 0.8663201494355559, + "grad_norm": 6.0092199166872, + "learning_rate": 2.3069976221337054e-07, + "loss": 0.7968, + "step": 10667 + }, + { + "epoch": 0.8664013644115975, + "grad_norm": 13.224667725694793, + "learning_rate": 2.304239180828055e-07, + "loss": 0.6541, + "step": 10668 + }, + { + "epoch": 0.8664825793876391, + "grad_norm": 4.332528390591684, + "learning_rate": 2.3014823099489326e-07, + "loss": 0.4286, + "step": 10669 + }, + { + "epoch": 0.8665637943636807, + "grad_norm": 8.678328469216439, + "learning_rate": 2.2987270096871072e-07, + "loss": 0.457, + "step": 10670 + }, + { + "epoch": 0.8666450093397222, + "grad_norm": 6.543536578014087, + "learning_rate": 2.2959732802332296e-07, + "loss": 0.3446, + "step": 10671 + }, + { + "epoch": 0.8667262243157638, + "grad_norm": 5.10238287168412, + "learning_rate": 2.2932211217778388e-07, + "loss": 0.6514, + "step": 10672 + }, + { + "epoch": 0.8668074392918054, + "grad_norm": 4.902163867519704, + "learning_rate": 2.2904705345113743e-07, + "loss": 0.6377, + "step": 10673 + }, + { + "epoch": 0.866888654267847, + "grad_norm": 4.914208356693128, + "learning_rate": 2.287721518624156e-07, + "loss": 0.5384, + "step": 10674 + }, + { + "epoch": 0.8669698692438885, + "grad_norm": 4.695162916684552, + "learning_rate": 2.2849740743064063e-07, + "loss": 0.3763, + "step": 10675 + }, + { + "epoch": 0.8670510842199302, + "grad_norm": 5.516447784018673, + "learning_rate": 2.282228201748238e-07, + "loss": 0.4784, + "step": 10676 + }, + { + "epoch": 0.8671322991959718, + "grad_norm": 4.784027348664785, + "learning_rate": 2.2794839011396453e-07, + "loss": 0.3715, + "step": 10677 + }, + { + "epoch": 0.8672135141720133, + "grad_norm": 4.524507243360301, + "learning_rate": 2.2767411726705157e-07, + "loss": 0.66, + "step": 10678 + }, + { + "epoch": 0.8672947291480549, + "grad_norm": 6.2607763304203194, + "learning_rate": 2.2740000165306393e-07, + "loss": 0.4065, + "step": 10679 + }, + { + "epoch": 0.8673759441240965, + "grad_norm": 6.045389943291753, + "learning_rate": 2.2712604329096833e-07, + "loss": 0.5403, + "step": 10680 + }, + { + "epoch": 0.8674571591001381, + "grad_norm": 4.945546166899713, + "learning_rate": 2.2685224219972185e-07, + "loss": 0.6084, + "step": 10681 + }, + { + "epoch": 0.8675383740761796, + "grad_norm": 7.020025507043163, + "learning_rate": 2.2657859839826934e-07, + "loss": 0.4117, + "step": 10682 + }, + { + "epoch": 0.8676195890522213, + "grad_norm": 3.378493580413376, + "learning_rate": 2.2630511190554621e-07, + "loss": 0.3219, + "step": 10683 + }, + { + "epoch": 0.8677008040282628, + "grad_norm": 14.896543415086516, + "learning_rate": 2.260317827404762e-07, + "loss": 0.3889, + "step": 10684 + }, + { + "epoch": 0.8677820190043044, + "grad_norm": 4.531796982257481, + "learning_rate": 2.2575861092197143e-07, + "loss": 0.6512, + "step": 10685 + }, + { + "epoch": 0.8678632339803459, + "grad_norm": 7.006910779412939, + "learning_rate": 2.254855964689351e-07, + "loss": 0.5622, + "step": 10686 + }, + { + "epoch": 0.8679444489563876, + "grad_norm": 5.778562835427389, + "learning_rate": 2.2521273940025705e-07, + "loss": 0.5571, + "step": 10687 + }, + { + "epoch": 0.8680256639324292, + "grad_norm": 5.550998523968033, + "learning_rate": 2.2494003973481864e-07, + "loss": 0.481, + "step": 10688 + }, + { + "epoch": 0.8681068789084707, + "grad_norm": 5.8427464651275836, + "learning_rate": 2.2466749749148919e-07, + "loss": 0.4879, + "step": 10689 + }, + { + "epoch": 0.8681880938845123, + "grad_norm": 4.136202285236531, + "learning_rate": 2.2439511268912666e-07, + "loss": 0.3003, + "step": 10690 + }, + { + "epoch": 0.8682693088605539, + "grad_norm": 7.410829723701502, + "learning_rate": 2.2412288534657878e-07, + "loss": 0.556, + "step": 10691 + }, + { + "epoch": 0.8683505238365955, + "grad_norm": 5.668425487445851, + "learning_rate": 2.2385081548268268e-07, + "loss": 0.5302, + "step": 10692 + }, + { + "epoch": 0.868431738812637, + "grad_norm": 5.0755641679869505, + "learning_rate": 2.2357890311626328e-07, + "loss": 0.4406, + "step": 10693 + }, + { + "epoch": 0.8685129537886787, + "grad_norm": 4.6801666968241795, + "learning_rate": 2.2330714826613586e-07, + "loss": 0.3976, + "step": 10694 + }, + { + "epoch": 0.8685941687647202, + "grad_norm": 5.777100294603406, + "learning_rate": 2.2303555095110507e-07, + "loss": 0.4059, + "step": 10695 + }, + { + "epoch": 0.8686753837407618, + "grad_norm": 5.314872055548993, + "learning_rate": 2.2276411118996366e-07, + "loss": 0.4719, + "step": 10696 + }, + { + "epoch": 0.8687565987168033, + "grad_norm": 4.410426094256012, + "learning_rate": 2.22492829001493e-07, + "loss": 0.4898, + "step": 10697 + }, + { + "epoch": 0.868837813692845, + "grad_norm": 4.7733912494094755, + "learning_rate": 2.2222170440446557e-07, + "loss": 0.4248, + "step": 10698 + }, + { + "epoch": 0.8689190286688866, + "grad_norm": 6.898790830573995, + "learning_rate": 2.219507374176408e-07, + "loss": 0.4269, + "step": 10699 + }, + { + "epoch": 0.8690002436449281, + "grad_norm": 5.2231893468594635, + "learning_rate": 2.2167992805976896e-07, + "loss": 0.5232, + "step": 10700 + }, + { + "epoch": 0.8690814586209697, + "grad_norm": 7.520502969621975, + "learning_rate": 2.2140927634958788e-07, + "loss": 0.3991, + "step": 10701 + }, + { + "epoch": 0.8691626735970113, + "grad_norm": 4.652004187169794, + "learning_rate": 2.2113878230582615e-07, + "loss": 0.5095, + "step": 10702 + }, + { + "epoch": 0.8692438885730529, + "grad_norm": 3.840996235904483, + "learning_rate": 2.2086844594719993e-07, + "loss": 0.5209, + "step": 10703 + }, + { + "epoch": 0.8693251035490944, + "grad_norm": 5.428257976832266, + "learning_rate": 2.205982672924145e-07, + "loss": 0.4342, + "step": 10704 + }, + { + "epoch": 0.8694063185251361, + "grad_norm": 12.646126359342446, + "learning_rate": 2.203282463601661e-07, + "loss": 0.5407, + "step": 10705 + }, + { + "epoch": 0.8694875335011776, + "grad_norm": 8.151893101320535, + "learning_rate": 2.2005838316913746e-07, + "loss": 0.3985, + "step": 10706 + }, + { + "epoch": 0.8695687484772192, + "grad_norm": 3.6426553923017337, + "learning_rate": 2.1978867773800205e-07, + "loss": 0.6897, + "step": 10707 + }, + { + "epoch": 0.8696499634532607, + "grad_norm": 5.392872165186957, + "learning_rate": 2.1951913008542297e-07, + "loss": 0.318, + "step": 10708 + }, + { + "epoch": 0.8697311784293024, + "grad_norm": 6.652109485767294, + "learning_rate": 2.1924974023005086e-07, + "loss": 0.396, + "step": 10709 + }, + { + "epoch": 0.869812393405344, + "grad_norm": 6.329696488288848, + "learning_rate": 2.189805081905255e-07, + "loss": 0.3915, + "step": 10710 + }, + { + "epoch": 0.8698936083813855, + "grad_norm": 3.238598078513282, + "learning_rate": 2.1871143398547735e-07, + "loss": 0.5392, + "step": 10711 + }, + { + "epoch": 0.8699748233574272, + "grad_norm": 6.202895386014103, + "learning_rate": 2.184425176335239e-07, + "loss": 0.535, + "step": 10712 + }, + { + "epoch": 0.8700560383334687, + "grad_norm": 7.1313443357236075, + "learning_rate": 2.1817375915327342e-07, + "loss": 0.3804, + "step": 10713 + }, + { + "epoch": 0.8701372533095103, + "grad_norm": 4.686543627354921, + "learning_rate": 2.1790515856332268e-07, + "loss": 0.485, + "step": 10714 + }, + { + "epoch": 0.8702184682855518, + "grad_norm": 4.348632743659829, + "learning_rate": 2.1763671588225705e-07, + "loss": 0.4805, + "step": 10715 + }, + { + "epoch": 0.8702996832615935, + "grad_norm": 4.845358239649171, + "learning_rate": 2.173684311286517e-07, + "loss": 0.4082, + "step": 10716 + }, + { + "epoch": 0.870380898237635, + "grad_norm": 5.562307404701293, + "learning_rate": 2.1710030432106982e-07, + "loss": 0.5136, + "step": 10717 + }, + { + "epoch": 0.8704621132136766, + "grad_norm": 15.50221360580663, + "learning_rate": 2.1683233547806494e-07, + "loss": 0.4594, + "step": 10718 + }, + { + "epoch": 0.8705433281897181, + "grad_norm": 8.184235968979692, + "learning_rate": 2.1656452461817883e-07, + "loss": 0.5134, + "step": 10719 + }, + { + "epoch": 0.8706245431657598, + "grad_norm": 5.086338805312825, + "learning_rate": 2.162968717599423e-07, + "loss": 0.6206, + "step": 10720 + }, + { + "epoch": 0.8707057581418014, + "grad_norm": 6.866907411125338, + "learning_rate": 2.1602937692187685e-07, + "loss": 0.3373, + "step": 10721 + }, + { + "epoch": 0.8707869731178429, + "grad_norm": 6.472579080212052, + "learning_rate": 2.1576204012249053e-07, + "loss": 0.4828, + "step": 10722 + }, + { + "epoch": 0.8708681880938846, + "grad_norm": 6.9868588515529915, + "learning_rate": 2.1549486138028125e-07, + "loss": 0.537, + "step": 10723 + }, + { + "epoch": 0.8709494030699261, + "grad_norm": 4.648588890106703, + "learning_rate": 2.152278407137376e-07, + "loss": 0.4446, + "step": 10724 + }, + { + "epoch": 0.8710306180459677, + "grad_norm": 5.2378357215552915, + "learning_rate": 2.1496097814133503e-07, + "loss": 0.4319, + "step": 10725 + }, + { + "epoch": 0.8711118330220092, + "grad_norm": 4.349980838222139, + "learning_rate": 2.146942736815391e-07, + "loss": 0.3548, + "step": 10726 + }, + { + "epoch": 0.8711930479980509, + "grad_norm": 3.1974318652567026, + "learning_rate": 2.1442772735280532e-07, + "loss": 0.574, + "step": 10727 + }, + { + "epoch": 0.8712742629740924, + "grad_norm": 6.948883443858628, + "learning_rate": 2.1416133917357668e-07, + "loss": 0.5418, + "step": 10728 + }, + { + "epoch": 0.871355477950134, + "grad_norm": 7.741555115120562, + "learning_rate": 2.1389510916228513e-07, + "loss": 0.5892, + "step": 10729 + }, + { + "epoch": 0.8714366929261755, + "grad_norm": 4.815397769745156, + "learning_rate": 2.136290373373534e-07, + "loss": 0.6456, + "step": 10730 + }, + { + "epoch": 0.8715179079022172, + "grad_norm": 4.733327642778111, + "learning_rate": 2.1336312371719182e-07, + "loss": 0.5199, + "step": 10731 + }, + { + "epoch": 0.8715991228782588, + "grad_norm": 6.863949439219225, + "learning_rate": 2.130973683201998e-07, + "loss": 0.489, + "step": 10732 + }, + { + "epoch": 0.8716803378543003, + "grad_norm": 6.500756310540661, + "learning_rate": 2.128317711647665e-07, + "loss": 0.4344, + "step": 10733 + }, + { + "epoch": 0.871761552830342, + "grad_norm": 7.69706976013433, + "learning_rate": 2.125663322692706e-07, + "loss": 0.5088, + "step": 10734 + }, + { + "epoch": 0.8718427678063835, + "grad_norm": 6.331349228694531, + "learning_rate": 2.1230105165207848e-07, + "loss": 0.6559, + "step": 10735 + }, + { + "epoch": 0.8719239827824251, + "grad_norm": 5.041214163229692, + "learning_rate": 2.120359293315455e-07, + "loss": 0.6072, + "step": 10736 + }, + { + "epoch": 0.8720051977584666, + "grad_norm": 5.582839372557744, + "learning_rate": 2.1177096532601777e-07, + "loss": 0.4588, + "step": 10737 + }, + { + "epoch": 0.8720864127345083, + "grad_norm": 7.814578266112515, + "learning_rate": 2.115061596538287e-07, + "loss": 0.4815, + "step": 10738 + }, + { + "epoch": 0.8721676277105498, + "grad_norm": 5.583906428631362, + "learning_rate": 2.112415123333014e-07, + "loss": 0.5404, + "step": 10739 + }, + { + "epoch": 0.8722488426865914, + "grad_norm": 7.942670374026383, + "learning_rate": 2.1097702338274907e-07, + "loss": 0.4854, + "step": 10740 + }, + { + "epoch": 0.8723300576626329, + "grad_norm": 4.655699223151669, + "learning_rate": 2.1071269282047196e-07, + "loss": 0.6385, + "step": 10741 + }, + { + "epoch": 0.8724112726386746, + "grad_norm": 5.347652457276179, + "learning_rate": 2.1044852066476052e-07, + "loss": 0.5986, + "step": 10742 + }, + { + "epoch": 0.8724924876147162, + "grad_norm": 6.78105342737064, + "learning_rate": 2.1018450693389452e-07, + "loss": 0.4435, + "step": 10743 + }, + { + "epoch": 0.8725737025907577, + "grad_norm": 4.350453860498677, + "learning_rate": 2.099206516461419e-07, + "loss": 0.5274, + "step": 10744 + }, + { + "epoch": 0.8726549175667994, + "grad_norm": 4.768332070430383, + "learning_rate": 2.096569548197594e-07, + "loss": 0.551, + "step": 10745 + }, + { + "epoch": 0.8727361325428409, + "grad_norm": 6.078372849255497, + "learning_rate": 2.0939341647299437e-07, + "loss": 0.4384, + "step": 10746 + }, + { + "epoch": 0.8728173475188825, + "grad_norm": 11.608495384238454, + "learning_rate": 2.0913003662408254e-07, + "loss": 0.6075, + "step": 10747 + }, + { + "epoch": 0.872898562494924, + "grad_norm": 8.027806223001297, + "learning_rate": 2.0886681529124765e-07, + "loss": 0.4888, + "step": 10748 + }, + { + "epoch": 0.8729797774709657, + "grad_norm": 4.082351604317076, + "learning_rate": 2.086037524927037e-07, + "loss": 0.5281, + "step": 10749 + }, + { + "epoch": 0.8730609924470072, + "grad_norm": 5.82956001643256, + "learning_rate": 2.0834084824665314e-07, + "loss": 0.5854, + "step": 10750 + }, + { + "epoch": 0.8731422074230488, + "grad_norm": 9.953252455414237, + "learning_rate": 2.0807810257128692e-07, + "loss": 0.4345, + "step": 10751 + }, + { + "epoch": 0.8732234223990903, + "grad_norm": 5.897867057523133, + "learning_rate": 2.0781551548478607e-07, + "loss": 0.5252, + "step": 10752 + }, + { + "epoch": 0.873304637375132, + "grad_norm": 5.203890577661351, + "learning_rate": 2.0755308700532077e-07, + "loss": 0.3069, + "step": 10753 + }, + { + "epoch": 0.8733858523511736, + "grad_norm": 9.388507828923037, + "learning_rate": 2.0729081715104958e-07, + "loss": 0.3058, + "step": 10754 + }, + { + "epoch": 0.8734670673272151, + "grad_norm": 4.665994646948376, + "learning_rate": 2.070287059401191e-07, + "loss": 0.4658, + "step": 10755 + }, + { + "epoch": 0.8735482823032568, + "grad_norm": 4.509183704578975, + "learning_rate": 2.0676675339066726e-07, + "loss": 0.4733, + "step": 10756 + }, + { + "epoch": 0.8736294972792983, + "grad_norm": 4.982775473415814, + "learning_rate": 2.0650495952081935e-07, + "loss": 0.388, + "step": 10757 + }, + { + "epoch": 0.8737107122553399, + "grad_norm": 3.915834558563621, + "learning_rate": 2.062433243486897e-07, + "loss": 0.4357, + "step": 10758 + }, + { + "epoch": 0.8737919272313814, + "grad_norm": 5.779273705619936, + "learning_rate": 2.059818478923825e-07, + "loss": 0.4962, + "step": 10759 + }, + { + "epoch": 0.8738731422074231, + "grad_norm": 7.124975275931914, + "learning_rate": 2.0572053016999079e-07, + "loss": 0.4915, + "step": 10760 + }, + { + "epoch": 0.8739543571834646, + "grad_norm": 4.688739212653256, + "learning_rate": 2.0545937119959557e-07, + "loss": 0.5523, + "step": 10761 + }, + { + "epoch": 0.8740355721595062, + "grad_norm": 6.512142297147514, + "learning_rate": 2.0519837099926888e-07, + "loss": 0.4864, + "step": 10762 + }, + { + "epoch": 0.8741167871355477, + "grad_norm": 3.968204310846973, + "learning_rate": 2.0493752958706982e-07, + "loss": 0.4545, + "step": 10763 + }, + { + "epoch": 0.8741980021115894, + "grad_norm": 5.589051586917989, + "learning_rate": 2.0467684698104674e-07, + "loss": 0.3359, + "step": 10764 + }, + { + "epoch": 0.874279217087631, + "grad_norm": 9.371561135244747, + "learning_rate": 2.0441632319923798e-07, + "loss": 0.3767, + "step": 10765 + }, + { + "epoch": 0.8743604320636725, + "grad_norm": 6.212854338465917, + "learning_rate": 2.0415595825967084e-07, + "loss": 0.5425, + "step": 10766 + }, + { + "epoch": 0.8744416470397142, + "grad_norm": 4.1144373279605135, + "learning_rate": 2.0389575218036057e-07, + "loss": 0.6139, + "step": 10767 + }, + { + "epoch": 0.8745228620157557, + "grad_norm": 4.023569708161643, + "learning_rate": 2.0363570497931252e-07, + "loss": 0.4479, + "step": 10768 + }, + { + "epoch": 0.8746040769917973, + "grad_norm": 3.8449587292031686, + "learning_rate": 2.0337581667452034e-07, + "loss": 0.5775, + "step": 10769 + }, + { + "epoch": 0.8746852919678388, + "grad_norm": 6.240516560084869, + "learning_rate": 2.0311608728396658e-07, + "loss": 0.5565, + "step": 10770 + }, + { + "epoch": 0.8747665069438805, + "grad_norm": 7.754381694503205, + "learning_rate": 2.0285651682562357e-07, + "loss": 0.4689, + "step": 10771 + }, + { + "epoch": 0.874847721919922, + "grad_norm": 5.711700208395236, + "learning_rate": 2.0259710531745247e-07, + "loss": 0.4805, + "step": 10772 + }, + { + "epoch": 0.8749289368959636, + "grad_norm": 5.1893458798213326, + "learning_rate": 2.023378527774028e-07, + "loss": 0.4845, + "step": 10773 + }, + { + "epoch": 0.8750101518720051, + "grad_norm": 7.133320338283911, + "learning_rate": 2.020787592234133e-07, + "loss": 0.4978, + "step": 10774 + }, + { + "epoch": 0.8750913668480468, + "grad_norm": 22.519170871083364, + "learning_rate": 2.0181982467341238e-07, + "loss": 0.4116, + "step": 10775 + }, + { + "epoch": 0.8751725818240884, + "grad_norm": 9.287767664431676, + "learning_rate": 2.0156104914531656e-07, + "loss": 0.4641, + "step": 10776 + }, + { + "epoch": 0.8752537968001299, + "grad_norm": 4.893583526059918, + "learning_rate": 2.0130243265703148e-07, + "loss": 0.481, + "step": 10777 + }, + { + "epoch": 0.8753350117761716, + "grad_norm": 5.320206234300975, + "learning_rate": 2.010439752264523e-07, + "loss": 0.3727, + "step": 10778 + }, + { + "epoch": 0.8754162267522131, + "grad_norm": 4.677489372940032, + "learning_rate": 2.0078567687146333e-07, + "loss": 0.447, + "step": 10779 + }, + { + "epoch": 0.8754974417282547, + "grad_norm": 4.0249196656831465, + "learning_rate": 2.0052753760993693e-07, + "loss": 0.4569, + "step": 10780 + }, + { + "epoch": 0.8755786567042962, + "grad_norm": 6.529303245559503, + "learning_rate": 2.002695574597352e-07, + "loss": 0.3912, + "step": 10781 + }, + { + "epoch": 0.8756598716803379, + "grad_norm": 5.418332568962745, + "learning_rate": 2.0001173643870915e-07, + "loss": 0.5883, + "step": 10782 + }, + { + "epoch": 0.8757410866563794, + "grad_norm": 5.33583924657257, + "learning_rate": 1.9975407456469808e-07, + "loss": 0.4482, + "step": 10783 + }, + { + "epoch": 0.875822301632421, + "grad_norm": 5.518754450917286, + "learning_rate": 1.9949657185553113e-07, + "loss": 0.4846, + "step": 10784 + }, + { + "epoch": 0.8759035166084626, + "grad_norm": 4.466728527582831, + "learning_rate": 1.992392283290265e-07, + "loss": 0.4897, + "step": 10785 + }, + { + "epoch": 0.8759847315845042, + "grad_norm": 6.062215529147515, + "learning_rate": 1.9898204400299021e-07, + "loss": 0.4688, + "step": 10786 + }, + { + "epoch": 0.8760659465605458, + "grad_norm": 6.196645398811499, + "learning_rate": 1.9872501889521916e-07, + "loss": 0.4591, + "step": 10787 + }, + { + "epoch": 0.8761471615365873, + "grad_norm": 5.38690541555504, + "learning_rate": 1.984681530234972e-07, + "loss": 0.3928, + "step": 10788 + }, + { + "epoch": 0.876228376512629, + "grad_norm": 9.821582116837323, + "learning_rate": 1.9821144640559842e-07, + "loss": 0.4919, + "step": 10789 + }, + { + "epoch": 0.8763095914886705, + "grad_norm": 7.245646513525235, + "learning_rate": 1.9795489905928527e-07, + "loss": 0.459, + "step": 10790 + }, + { + "epoch": 0.8763908064647121, + "grad_norm": 7.6962316221213385, + "learning_rate": 1.976985110023094e-07, + "loss": 0.4777, + "step": 10791 + }, + { + "epoch": 0.8764720214407536, + "grad_norm": 4.354060604688436, + "learning_rate": 1.9744228225241248e-07, + "loss": 0.4522, + "step": 10792 + }, + { + "epoch": 0.8765532364167953, + "grad_norm": 4.660763544694212, + "learning_rate": 1.9718621282732302e-07, + "loss": 0.4305, + "step": 10793 + }, + { + "epoch": 0.8766344513928368, + "grad_norm": 6.796070237851726, + "learning_rate": 1.9693030274476054e-07, + "loss": 0.4301, + "step": 10794 + }, + { + "epoch": 0.8767156663688784, + "grad_norm": 6.8550929057120635, + "learning_rate": 1.9667455202243223e-07, + "loss": 0.4308, + "step": 10795 + }, + { + "epoch": 0.87679688134492, + "grad_norm": 10.526313953549316, + "learning_rate": 1.9641896067803452e-07, + "loss": 0.4715, + "step": 10796 + }, + { + "epoch": 0.8768780963209616, + "grad_norm": 4.6047250304925615, + "learning_rate": 1.9616352872925293e-07, + "loss": 0.529, + "step": 10797 + }, + { + "epoch": 0.8769593112970032, + "grad_norm": 6.880938367850307, + "learning_rate": 1.959082561937628e-07, + "loss": 0.5294, + "step": 10798 + }, + { + "epoch": 0.8770405262730447, + "grad_norm": 3.8531909306263272, + "learning_rate": 1.9565314308922666e-07, + "loss": 0.4086, + "step": 10799 + }, + { + "epoch": 0.8771217412490864, + "grad_norm": 5.451866169993255, + "learning_rate": 1.9539818943329792e-07, + "loss": 0.4557, + "step": 10800 + }, + { + "epoch": 0.8772029562251279, + "grad_norm": 6.999546249788755, + "learning_rate": 1.9514339524361742e-07, + "loss": 0.4339, + "step": 10801 + }, + { + "epoch": 0.8772841712011695, + "grad_norm": 5.662909641489243, + "learning_rate": 1.9488876053781552e-07, + "loss": 0.4453, + "step": 10802 + }, + { + "epoch": 0.877365386177211, + "grad_norm": 4.898755716797307, + "learning_rate": 1.9463428533351202e-07, + "loss": 0.6062, + "step": 10803 + }, + { + "epoch": 0.8774466011532527, + "grad_norm": 4.183435356460552, + "learning_rate": 1.943799696483145e-07, + "loss": 0.4585, + "step": 10804 + }, + { + "epoch": 0.8775278161292942, + "grad_norm": 4.396794358678346, + "learning_rate": 1.9412581349982113e-07, + "loss": 0.4212, + "step": 10805 + }, + { + "epoch": 0.8776090311053358, + "grad_norm": 4.627929173347972, + "learning_rate": 1.938718169056175e-07, + "loss": 0.5645, + "step": 10806 + }, + { + "epoch": 0.8776902460813774, + "grad_norm": 4.99764782498259, + "learning_rate": 1.9361797988327961e-07, + "loss": 0.6663, + "step": 10807 + }, + { + "epoch": 0.877771461057419, + "grad_norm": 3.9890801417363937, + "learning_rate": 1.933643024503712e-07, + "loss": 0.5455, + "step": 10808 + }, + { + "epoch": 0.8778526760334606, + "grad_norm": 5.065903978646239, + "learning_rate": 1.9311078462444484e-07, + "loss": 0.633, + "step": 10809 + }, + { + "epoch": 0.8779338910095021, + "grad_norm": 7.184247053486219, + "learning_rate": 1.928574264230429e-07, + "loss": 0.3581, + "step": 10810 + }, + { + "epoch": 0.8780151059855438, + "grad_norm": 7.648845912445987, + "learning_rate": 1.9260422786369747e-07, + "loss": 0.3958, + "step": 10811 + }, + { + "epoch": 0.8780963209615853, + "grad_norm": 2.778368867161788, + "learning_rate": 1.9235118896392706e-07, + "loss": 0.3277, + "step": 10812 + }, + { + "epoch": 0.8781775359376269, + "grad_norm": 4.8097717739494685, + "learning_rate": 1.9209830974124183e-07, + "loss": 0.4247, + "step": 10813 + }, + { + "epoch": 0.8782587509136685, + "grad_norm": 10.137967528923353, + "learning_rate": 1.9184559021313914e-07, + "loss": 0.4587, + "step": 10814 + }, + { + "epoch": 0.8783399658897101, + "grad_norm": 4.796708634400599, + "learning_rate": 1.9159303039710558e-07, + "loss": 0.4826, + "step": 10815 + }, + { + "epoch": 0.8784211808657516, + "grad_norm": 9.314388410278989, + "learning_rate": 1.9134063031061744e-07, + "loss": 0.516, + "step": 10816 + }, + { + "epoch": 0.8785023958417932, + "grad_norm": 7.232628959274363, + "learning_rate": 1.910883899711391e-07, + "loss": 0.5164, + "step": 10817 + }, + { + "epoch": 0.8785836108178348, + "grad_norm": 3.9022829908795322, + "learning_rate": 1.9083630939612407e-07, + "loss": 0.5444, + "step": 10818 + }, + { + "epoch": 0.8786648257938764, + "grad_norm": 3.8942957114269627, + "learning_rate": 1.9058438860301621e-07, + "loss": 0.3569, + "step": 10819 + }, + { + "epoch": 0.878746040769918, + "grad_norm": 3.166404623353523, + "learning_rate": 1.9033262760924598e-07, + "loss": 0.4069, + "step": 10820 + }, + { + "epoch": 0.8788272557459595, + "grad_norm": 5.035265769137359, + "learning_rate": 1.900810264322339e-07, + "loss": 0.3724, + "step": 10821 + }, + { + "epoch": 0.8789084707220012, + "grad_norm": 6.137216581765903, + "learning_rate": 1.8982958508938998e-07, + "loss": 0.4983, + "step": 10822 + }, + { + "epoch": 0.8789896856980427, + "grad_norm": 6.449621491427784, + "learning_rate": 1.895783035981119e-07, + "loss": 0.5643, + "step": 10823 + }, + { + "epoch": 0.8790709006740843, + "grad_norm": 7.017513815097841, + "learning_rate": 1.8932718197578802e-07, + "loss": 0.4371, + "step": 10824 + }, + { + "epoch": 0.8791521156501259, + "grad_norm": 4.915420085875843, + "learning_rate": 1.890762202397936e-07, + "loss": 0.7439, + "step": 10825 + }, + { + "epoch": 0.8792333306261675, + "grad_norm": 5.057095171528424, + "learning_rate": 1.8882541840749475e-07, + "loss": 0.5297, + "step": 10826 + }, + { + "epoch": 0.879314545602209, + "grad_norm": 4.992472857392407, + "learning_rate": 1.8857477649624533e-07, + "loss": 0.5945, + "step": 10827 + }, + { + "epoch": 0.8793957605782506, + "grad_norm": 8.570114515507747, + "learning_rate": 1.883242945233879e-07, + "loss": 0.5583, + "step": 10828 + }, + { + "epoch": 0.8794769755542922, + "grad_norm": 4.881453778874329, + "learning_rate": 1.8807397250625497e-07, + "loss": 0.3574, + "step": 10829 + }, + { + "epoch": 0.8795581905303338, + "grad_norm": 9.774498600782247, + "learning_rate": 1.878238104621677e-07, + "loss": 0.2625, + "step": 10830 + }, + { + "epoch": 0.8796394055063754, + "grad_norm": 5.93121202645404, + "learning_rate": 1.8757380840843526e-07, + "loss": 0.4918, + "step": 10831 + }, + { + "epoch": 0.879720620482417, + "grad_norm": 3.681222565715722, + "learning_rate": 1.8732396636235744e-07, + "loss": 0.6349, + "step": 10832 + }, + { + "epoch": 0.8798018354584586, + "grad_norm": 6.108142266361864, + "learning_rate": 1.8707428434122155e-07, + "loss": 0.576, + "step": 10833 + }, + { + "epoch": 0.8798830504345001, + "grad_norm": 6.054085118997518, + "learning_rate": 1.8682476236230372e-07, + "loss": 0.4904, + "step": 10834 + }, + { + "epoch": 0.8799642654105417, + "grad_norm": 7.843292571737394, + "learning_rate": 1.8657540044287047e-07, + "loss": 0.3619, + "step": 10835 + }, + { + "epoch": 0.8800454803865833, + "grad_norm": 3.6574632924680044, + "learning_rate": 1.8632619860017547e-07, + "loss": 0.3475, + "step": 10836 + }, + { + "epoch": 0.8801266953626249, + "grad_norm": 5.74599899447566, + "learning_rate": 1.8607715685146244e-07, + "loss": 0.463, + "step": 10837 + }, + { + "epoch": 0.8802079103386664, + "grad_norm": 4.130892724817378, + "learning_rate": 1.8582827521396453e-07, + "loss": 0.5069, + "step": 10838 + }, + { + "epoch": 0.880289125314708, + "grad_norm": 6.536393003461804, + "learning_rate": 1.855795537049021e-07, + "loss": 0.4378, + "step": 10839 + }, + { + "epoch": 0.8803703402907496, + "grad_norm": 5.520938271169849, + "learning_rate": 1.853309923414856e-07, + "loss": 0.481, + "step": 10840 + }, + { + "epoch": 0.8804515552667912, + "grad_norm": 13.77624681570704, + "learning_rate": 1.8508259114091432e-07, + "loss": 0.5875, + "step": 10841 + }, + { + "epoch": 0.8805327702428328, + "grad_norm": 4.241071474476681, + "learning_rate": 1.8483435012037587e-07, + "loss": 0.3521, + "step": 10842 + }, + { + "epoch": 0.8806139852188744, + "grad_norm": 8.981810629766965, + "learning_rate": 1.8458626929704821e-07, + "loss": 0.4218, + "step": 10843 + }, + { + "epoch": 0.880695200194916, + "grad_norm": 4.1379268820071236, + "learning_rate": 1.843383486880959e-07, + "loss": 0.6531, + "step": 10844 + }, + { + "epoch": 0.8807764151709575, + "grad_norm": 4.30762221426034, + "learning_rate": 1.840905883106747e-07, + "loss": 0.6161, + "step": 10845 + }, + { + "epoch": 0.8808576301469991, + "grad_norm": 6.207428929843423, + "learning_rate": 1.8384298818192814e-07, + "loss": 0.4597, + "step": 10846 + }, + { + "epoch": 0.8809388451230407, + "grad_norm": 6.194628295310705, + "learning_rate": 1.835955483189883e-07, + "loss": 0.4932, + "step": 10847 + }, + { + "epoch": 0.8810200600990823, + "grad_norm": 4.398170364958908, + "learning_rate": 1.833482687389776e-07, + "loss": 0.4016, + "step": 10848 + }, + { + "epoch": 0.8811012750751238, + "grad_norm": 6.570347236455752, + "learning_rate": 1.831011494590054e-07, + "loss": 0.4765, + "step": 10849 + }, + { + "epoch": 0.8811824900511654, + "grad_norm": 5.039000129540002, + "learning_rate": 1.828541904961717e-07, + "loss": 0.4721, + "step": 10850 + }, + { + "epoch": 0.881263705027207, + "grad_norm": 6.470581102201411, + "learning_rate": 1.8260739186756527e-07, + "loss": 0.7508, + "step": 10851 + }, + { + "epoch": 0.8813449200032486, + "grad_norm": 4.439815787899128, + "learning_rate": 1.8236075359026246e-07, + "loss": 0.4522, + "step": 10852 + }, + { + "epoch": 0.8814261349792902, + "grad_norm": 6.875926190964903, + "learning_rate": 1.8211427568132932e-07, + "loss": 0.4571, + "step": 10853 + }, + { + "epoch": 0.8815073499553318, + "grad_norm": 5.042487513250472, + "learning_rate": 1.8186795815782143e-07, + "loss": 0.4715, + "step": 10854 + }, + { + "epoch": 0.8815885649313734, + "grad_norm": 6.319914060990893, + "learning_rate": 1.8162180103678177e-07, + "loss": 0.357, + "step": 10855 + }, + { + "epoch": 0.8816697799074149, + "grad_norm": 5.98665836166037, + "learning_rate": 1.813758043352437e-07, + "loss": 0.6732, + "step": 10856 + }, + { + "epoch": 0.8817509948834565, + "grad_norm": 4.90779966317718, + "learning_rate": 1.8112996807022943e-07, + "loss": 0.4064, + "step": 10857 + }, + { + "epoch": 0.8818322098594981, + "grad_norm": 4.954078590442837, + "learning_rate": 1.8088429225874865e-07, + "loss": 0.5962, + "step": 10858 + }, + { + "epoch": 0.8819134248355397, + "grad_norm": 7.208628926964807, + "learning_rate": 1.8063877691780114e-07, + "loss": 0.4485, + "step": 10859 + }, + { + "epoch": 0.8819946398115812, + "grad_norm": 5.399228378212703, + "learning_rate": 1.8039342206437494e-07, + "loss": 0.6773, + "step": 10860 + }, + { + "epoch": 0.8820758547876228, + "grad_norm": 6.733238765318234, + "learning_rate": 1.8014822771544787e-07, + "loss": 0.5487, + "step": 10861 + }, + { + "epoch": 0.8821570697636644, + "grad_norm": 6.204508713784901, + "learning_rate": 1.7990319388798527e-07, + "loss": 0.379, + "step": 10862 + }, + { + "epoch": 0.882238284739706, + "grad_norm": 4.9418711968437545, + "learning_rate": 1.79658320598943e-07, + "loss": 0.4973, + "step": 10863 + }, + { + "epoch": 0.8823194997157476, + "grad_norm": 7.6609348179930805, + "learning_rate": 1.79413607865265e-07, + "loss": 0.5288, + "step": 10864 + }, + { + "epoch": 0.8824007146917892, + "grad_norm": 5.302913931640557, + "learning_rate": 1.7916905570388387e-07, + "loss": 0.4702, + "step": 10865 + }, + { + "epoch": 0.8824819296678308, + "grad_norm": 5.228383412229427, + "learning_rate": 1.7892466413172076e-07, + "loss": 0.3797, + "step": 10866 + }, + { + "epoch": 0.8825631446438723, + "grad_norm": 4.948041110181531, + "learning_rate": 1.7868043316568718e-07, + "loss": 0.3931, + "step": 10867 + }, + { + "epoch": 0.8826443596199139, + "grad_norm": 8.069840917352794, + "learning_rate": 1.784363628226818e-07, + "loss": 0.4122, + "step": 10868 + }, + { + "epoch": 0.8827255745959555, + "grad_norm": 5.771520601557648, + "learning_rate": 1.781924531195933e-07, + "loss": 0.4682, + "step": 10869 + }, + { + "epoch": 0.8828067895719971, + "grad_norm": 5.964777339901852, + "learning_rate": 1.7794870407329968e-07, + "loss": 0.3881, + "step": 10870 + }, + { + "epoch": 0.8828880045480386, + "grad_norm": 17.63419248026244, + "learning_rate": 1.7770511570066622e-07, + "loss": 0.4282, + "step": 10871 + }, + { + "epoch": 0.8829692195240803, + "grad_norm": 4.937868582682188, + "learning_rate": 1.7746168801854786e-07, + "loss": 0.4074, + "step": 10872 + }, + { + "epoch": 0.8830504345001218, + "grad_norm": 7.45687522217002, + "learning_rate": 1.772184210437894e-07, + "loss": 0.3702, + "step": 10873 + }, + { + "epoch": 0.8831316494761634, + "grad_norm": 4.456345546727624, + "learning_rate": 1.7697531479322294e-07, + "loss": 0.4297, + "step": 10874 + }, + { + "epoch": 0.883212864452205, + "grad_norm": 4.869647315752035, + "learning_rate": 1.7673236928366976e-07, + "loss": 0.4171, + "step": 10875 + }, + { + "epoch": 0.8832940794282466, + "grad_norm": 4.695411262956829, + "learning_rate": 1.7648958453194086e-07, + "loss": 0.4707, + "step": 10876 + }, + { + "epoch": 0.8833752944042882, + "grad_norm": 7.882022541168915, + "learning_rate": 1.7624696055483643e-07, + "loss": 0.4886, + "step": 10877 + }, + { + "epoch": 0.8834565093803297, + "grad_norm": 7.071815658627316, + "learning_rate": 1.7600449736914384e-07, + "loss": 0.4957, + "step": 10878 + }, + { + "epoch": 0.8835377243563713, + "grad_norm": 5.21615020327953, + "learning_rate": 1.7576219499163995e-07, + "loss": 0.395, + "step": 10879 + }, + { + "epoch": 0.8836189393324129, + "grad_norm": 3.7677226715193015, + "learning_rate": 1.7552005343909162e-07, + "loss": 0.6283, + "step": 10880 + }, + { + "epoch": 0.8837001543084545, + "grad_norm": 6.221136511822739, + "learning_rate": 1.7527807272825326e-07, + "loss": 0.4107, + "step": 10881 + }, + { + "epoch": 0.883781369284496, + "grad_norm": 4.483843051687907, + "learning_rate": 1.7503625287586896e-07, + "loss": 0.4108, + "step": 10882 + }, + { + "epoch": 0.8838625842605377, + "grad_norm": 5.371900604040972, + "learning_rate": 1.7479459389867141e-07, + "loss": 0.463, + "step": 10883 + }, + { + "epoch": 0.8839437992365792, + "grad_norm": 4.729136507795324, + "learning_rate": 1.7455309581338204e-07, + "loss": 0.5657, + "step": 10884 + }, + { + "epoch": 0.8840250142126208, + "grad_norm": 5.952224993059543, + "learning_rate": 1.7431175863671102e-07, + "loss": 0.462, + "step": 10885 + }, + { + "epoch": 0.8841062291886624, + "grad_norm": 5.332245081683401, + "learning_rate": 1.740705823853578e-07, + "loss": 0.5023, + "step": 10886 + }, + { + "epoch": 0.884187444164704, + "grad_norm": 4.301376869737246, + "learning_rate": 1.7382956707601068e-07, + "loss": 0.6023, + "step": 10887 + }, + { + "epoch": 0.8842686591407456, + "grad_norm": 7.399999113034231, + "learning_rate": 1.7358871272534604e-07, + "loss": 0.383, + "step": 10888 + }, + { + "epoch": 0.8843498741167871, + "grad_norm": 6.01935069429763, + "learning_rate": 1.7334801935003003e-07, + "loss": 0.3966, + "step": 10889 + }, + { + "epoch": 0.8844310890928287, + "grad_norm": 12.267510974208244, + "learning_rate": 1.7310748696671791e-07, + "loss": 0.4748, + "step": 10890 + }, + { + "epoch": 0.8845123040688703, + "grad_norm": 4.471683226123735, + "learning_rate": 1.728671155920525e-07, + "loss": 0.557, + "step": 10891 + }, + { + "epoch": 0.8845935190449119, + "grad_norm": 4.5797052007927626, + "learning_rate": 1.7262690524266658e-07, + "loss": 0.3825, + "step": 10892 + }, + { + "epoch": 0.8846747340209534, + "grad_norm": 5.819985449696891, + "learning_rate": 1.7238685593518157e-07, + "loss": 0.4803, + "step": 10893 + }, + { + "epoch": 0.8847559489969951, + "grad_norm": 12.610582423388232, + "learning_rate": 1.7214696768620699e-07, + "loss": 0.4021, + "step": 10894 + }, + { + "epoch": 0.8848371639730366, + "grad_norm": 5.295182105134932, + "learning_rate": 1.719072405123423e-07, + "loss": 0.4256, + "step": 10895 + }, + { + "epoch": 0.8849183789490782, + "grad_norm": 4.518250999036192, + "learning_rate": 1.7166767443017567e-07, + "loss": 0.4549, + "step": 10896 + }, + { + "epoch": 0.8849995939251198, + "grad_norm": 3.165277818453232, + "learning_rate": 1.7142826945628353e-07, + "loss": 0.7135, + "step": 10897 + }, + { + "epoch": 0.8850808089011614, + "grad_norm": 14.285562335378861, + "learning_rate": 1.7118902560723072e-07, + "loss": 0.5167, + "step": 10898 + }, + { + "epoch": 0.885162023877203, + "grad_norm": 5.520164322684935, + "learning_rate": 1.7094994289957285e-07, + "loss": 0.4811, + "step": 10899 + }, + { + "epoch": 0.8852432388532445, + "grad_norm": 6.172181533567202, + "learning_rate": 1.7071102134985224e-07, + "loss": 0.4762, + "step": 10900 + }, + { + "epoch": 0.8853244538292862, + "grad_norm": 4.846940632513523, + "learning_rate": 1.7047226097460123e-07, + "loss": 0.3914, + "step": 10901 + }, + { + "epoch": 0.8854056688053277, + "grad_norm": 3.9531646427451927, + "learning_rate": 1.7023366179034135e-07, + "loss": 0.4715, + "step": 10902 + }, + { + "epoch": 0.8854868837813693, + "grad_norm": 7.419094636905311, + "learning_rate": 1.6999522381358187e-07, + "loss": 0.3049, + "step": 10903 + }, + { + "epoch": 0.8855680987574108, + "grad_norm": 8.695140393852423, + "learning_rate": 1.6975694706082125e-07, + "loss": 0.4374, + "step": 10904 + }, + { + "epoch": 0.8856493137334525, + "grad_norm": 4.688891686843388, + "learning_rate": 1.6951883154854771e-07, + "loss": 0.3473, + "step": 10905 + }, + { + "epoch": 0.885730528709494, + "grad_norm": 9.242824312960995, + "learning_rate": 1.6928087729323695e-07, + "loss": 0.4753, + "step": 10906 + }, + { + "epoch": 0.8858117436855356, + "grad_norm": 21.34205228899008, + "learning_rate": 1.6904308431135414e-07, + "loss": 0.4638, + "step": 10907 + }, + { + "epoch": 0.8858929586615772, + "grad_norm": 7.234575319695479, + "learning_rate": 1.6880545261935333e-07, + "loss": 0.4495, + "step": 10908 + }, + { + "epoch": 0.8859741736376188, + "grad_norm": 4.584768372230779, + "learning_rate": 1.6856798223367777e-07, + "loss": 0.4903, + "step": 10909 + }, + { + "epoch": 0.8860553886136604, + "grad_norm": 3.8405465855706966, + "learning_rate": 1.6833067317075875e-07, + "loss": 0.5838, + "step": 10910 + }, + { + "epoch": 0.8861366035897019, + "grad_norm": 11.336953000753638, + "learning_rate": 1.680935254470173e-07, + "loss": 0.3647, + "step": 10911 + }, + { + "epoch": 0.8862178185657436, + "grad_norm": 3.4138677524491783, + "learning_rate": 1.6785653907886251e-07, + "loss": 0.4593, + "step": 10912 + }, + { + "epoch": 0.8862990335417851, + "grad_norm": 7.281119584083047, + "learning_rate": 1.6761971408269184e-07, + "loss": 0.4414, + "step": 10913 + }, + { + "epoch": 0.8863802485178267, + "grad_norm": 8.096587754790656, + "learning_rate": 1.673830504748933e-07, + "loss": 0.5625, + "step": 10914 + }, + { + "epoch": 0.8864614634938682, + "grad_norm": 4.3613469827373725, + "learning_rate": 1.6714654827184263e-07, + "loss": 0.6731, + "step": 10915 + }, + { + "epoch": 0.8865426784699099, + "grad_norm": 36.26226393417871, + "learning_rate": 1.6691020748990455e-07, + "loss": 0.4585, + "step": 10916 + }, + { + "epoch": 0.8866238934459514, + "grad_norm": 4.750194227102958, + "learning_rate": 1.6667402814543209e-07, + "loss": 0.5141, + "step": 10917 + }, + { + "epoch": 0.886705108421993, + "grad_norm": 7.5959663569614495, + "learning_rate": 1.66438010254768e-07, + "loss": 0.6047, + "step": 10918 + }, + { + "epoch": 0.8867863233980346, + "grad_norm": 6.985478995191811, + "learning_rate": 1.662021538342437e-07, + "loss": 0.5965, + "step": 10919 + }, + { + "epoch": 0.8868675383740762, + "grad_norm": 4.967674400774454, + "learning_rate": 1.6596645890017832e-07, + "loss": 0.4358, + "step": 10920 + }, + { + "epoch": 0.8869487533501178, + "grad_norm": 4.562428672417342, + "learning_rate": 1.6573092546888132e-07, + "loss": 0.4619, + "step": 10921 + }, + { + "epoch": 0.8870299683261593, + "grad_norm": 4.147461101360203, + "learning_rate": 1.6549555355665076e-07, + "loss": 0.452, + "step": 10922 + }, + { + "epoch": 0.887111183302201, + "grad_norm": 3.695271801611688, + "learning_rate": 1.6526034317977225e-07, + "loss": 0.5393, + "step": 10923 + }, + { + "epoch": 0.8871923982782425, + "grad_norm": 4.837405408033375, + "learning_rate": 1.650252943545222e-07, + "loss": 0.4885, + "step": 10924 + }, + { + "epoch": 0.8872736132542841, + "grad_norm": 4.704508929048984, + "learning_rate": 1.647904070971637e-07, + "loss": 0.5001, + "step": 10925 + }, + { + "epoch": 0.8873548282303256, + "grad_norm": 3.9115802054782427, + "learning_rate": 1.645556814239499e-07, + "loss": 0.4963, + "step": 10926 + }, + { + "epoch": 0.8874360432063673, + "grad_norm": 7.938004047544957, + "learning_rate": 1.6432111735112277e-07, + "loss": 0.4367, + "step": 10927 + }, + { + "epoch": 0.8875172581824088, + "grad_norm": 12.387873573548433, + "learning_rate": 1.6408671489491323e-07, + "loss": 0.3794, + "step": 10928 + }, + { + "epoch": 0.8875984731584504, + "grad_norm": 7.148993736986469, + "learning_rate": 1.6385247407154025e-07, + "loss": 0.4554, + "step": 10929 + }, + { + "epoch": 0.887679688134492, + "grad_norm": 6.143994601377806, + "learning_rate": 1.6361839489721227e-07, + "loss": 0.46, + "step": 10930 + }, + { + "epoch": 0.8877609031105336, + "grad_norm": 21.816971684053996, + "learning_rate": 1.6338447738812628e-07, + "loss": 0.6431, + "step": 10931 + }, + { + "epoch": 0.8878421180865752, + "grad_norm": 6.378758988903642, + "learning_rate": 1.631507215604683e-07, + "loss": 0.4062, + "step": 10932 + }, + { + "epoch": 0.8879233330626167, + "grad_norm": 8.818910417308302, + "learning_rate": 1.6291712743041226e-07, + "loss": 0.4481, + "step": 10933 + }, + { + "epoch": 0.8880045480386584, + "grad_norm": 3.3592851595714537, + "learning_rate": 1.6268369501412195e-07, + "loss": 0.6451, + "step": 10934 + }, + { + "epoch": 0.8880857630146999, + "grad_norm": 3.537250723511631, + "learning_rate": 1.6245042432775054e-07, + "loss": 0.4928, + "step": 10935 + }, + { + "epoch": 0.8881669779907415, + "grad_norm": 6.147519517852104, + "learning_rate": 1.622173153874379e-07, + "loss": 0.5083, + "step": 10936 + }, + { + "epoch": 0.888248192966783, + "grad_norm": 3.472301797973062, + "learning_rate": 1.61984368209315e-07, + "loss": 0.4876, + "step": 10937 + }, + { + "epoch": 0.8883294079428247, + "grad_norm": 4.755539554685603, + "learning_rate": 1.617515828095001e-07, + "loss": 0.3319, + "step": 10938 + }, + { + "epoch": 0.8884106229188662, + "grad_norm": 9.123690619357854, + "learning_rate": 1.615189592041e-07, + "loss": 0.5498, + "step": 10939 + }, + { + "epoch": 0.8884918378949078, + "grad_norm": 5.947150990803091, + "learning_rate": 1.6128649740921182e-07, + "loss": 0.4218, + "step": 10940 + }, + { + "epoch": 0.8885730528709495, + "grad_norm": 6.785328375370574, + "learning_rate": 1.6105419744092105e-07, + "loss": 0.5246, + "step": 10941 + }, + { + "epoch": 0.888654267846991, + "grad_norm": 7.518002211907693, + "learning_rate": 1.6082205931530064e-07, + "loss": 0.4103, + "step": 10942 + }, + { + "epoch": 0.8887354828230326, + "grad_norm": 5.241816204201098, + "learning_rate": 1.6059008304841417e-07, + "loss": 0.422, + "step": 10943 + }, + { + "epoch": 0.8888166977990741, + "grad_norm": 4.9351607203741175, + "learning_rate": 1.6035826865631292e-07, + "loss": 0.4762, + "step": 10944 + }, + { + "epoch": 0.8888979127751158, + "grad_norm": 19.003799619323694, + "learning_rate": 1.601266161550366e-07, + "loss": 0.6662, + "step": 10945 + }, + { + "epoch": 0.8889791277511573, + "grad_norm": 3.8825556090436786, + "learning_rate": 1.5989512556061516e-07, + "loss": 0.6389, + "step": 10946 + }, + { + "epoch": 0.8890603427271989, + "grad_norm": 7.947300455523199, + "learning_rate": 1.5966379688906576e-07, + "loss": 0.4424, + "step": 10947 + }, + { + "epoch": 0.8891415577032404, + "grad_norm": 5.795613017876368, + "learning_rate": 1.5943263015639614e-07, + "loss": 0.4964, + "step": 10948 + }, + { + "epoch": 0.8892227726792821, + "grad_norm": 6.8505646268664515, + "learning_rate": 1.592016253786008e-07, + "loss": 0.6119, + "step": 10949 + }, + { + "epoch": 0.8893039876553236, + "grad_norm": 8.889358889295613, + "learning_rate": 1.5897078257166492e-07, + "loss": 0.4562, + "step": 10950 + }, + { + "epoch": 0.8893852026313652, + "grad_norm": 5.6468959215119225, + "learning_rate": 1.5874010175156106e-07, + "loss": 0.4509, + "step": 10951 + }, + { + "epoch": 0.8894664176074069, + "grad_norm": 5.671002281059167, + "learning_rate": 1.585095829342509e-07, + "loss": 0.3751, + "step": 10952 + }, + { + "epoch": 0.8895476325834484, + "grad_norm": 7.946535661783076, + "learning_rate": 1.5827922613568524e-07, + "loss": 0.4659, + "step": 10953 + }, + { + "epoch": 0.88962884755949, + "grad_norm": 5.9522004926872025, + "learning_rate": 1.5804903137180415e-07, + "loss": 0.3708, + "step": 10954 + }, + { + "epoch": 0.8897100625355315, + "grad_norm": 10.178045190705534, + "learning_rate": 1.5781899865853544e-07, + "loss": 0.5587, + "step": 10955 + }, + { + "epoch": 0.8897912775115732, + "grad_norm": 4.253031697214437, + "learning_rate": 1.5758912801179637e-07, + "loss": 0.4611, + "step": 10956 + }, + { + "epoch": 0.8898724924876147, + "grad_norm": 6.692152756992295, + "learning_rate": 1.5735941944749255e-07, + "loss": 0.4811, + "step": 10957 + }, + { + "epoch": 0.8899537074636563, + "grad_norm": 4.694878917526122, + "learning_rate": 1.571298729815182e-07, + "loss": 0.3912, + "step": 10958 + }, + { + "epoch": 0.8900349224396978, + "grad_norm": 5.740301752503594, + "learning_rate": 1.569004886297576e-07, + "loss": 0.5173, + "step": 10959 + }, + { + "epoch": 0.8901161374157395, + "grad_norm": 4.500280402580769, + "learning_rate": 1.5667126640808216e-07, + "loss": 0.5058, + "step": 10960 + }, + { + "epoch": 0.890197352391781, + "grad_norm": 9.359821537794131, + "learning_rate": 1.564422063323534e-07, + "loss": 0.4839, + "step": 10961 + }, + { + "epoch": 0.8902785673678226, + "grad_norm": 11.943572724510128, + "learning_rate": 1.5621330841842086e-07, + "loss": 0.3788, + "step": 10962 + }, + { + "epoch": 0.8903597823438643, + "grad_norm": 5.141666870810457, + "learning_rate": 1.5598457268212353e-07, + "loss": 0.4711, + "step": 10963 + }, + { + "epoch": 0.8904409973199058, + "grad_norm": 3.330373351694768, + "learning_rate": 1.5575599913928735e-07, + "loss": 0.3897, + "step": 10964 + }, + { + "epoch": 0.8905222122959474, + "grad_norm": 3.825856052673269, + "learning_rate": 1.5552758780572995e-07, + "loss": 0.4227, + "step": 10965 + }, + { + "epoch": 0.8906034272719889, + "grad_norm": 5.142764433332306, + "learning_rate": 1.552993386972551e-07, + "loss": 0.4816, + "step": 10966 + }, + { + "epoch": 0.8906846422480306, + "grad_norm": 6.3600498260391785, + "learning_rate": 1.5507125182965737e-07, + "loss": 0.684, + "step": 10967 + }, + { + "epoch": 0.8907658572240721, + "grad_norm": 8.817725414443457, + "learning_rate": 1.5484332721871804e-07, + "loss": 0.5193, + "step": 10968 + }, + { + "epoch": 0.8908470722001137, + "grad_norm": 7.292802786468984, + "learning_rate": 1.5461556488020945e-07, + "loss": 0.4265, + "step": 10969 + }, + { + "epoch": 0.8909282871761552, + "grad_norm": 18.816459578152827, + "learning_rate": 1.5438796482989072e-07, + "loss": 0.4049, + "step": 10970 + }, + { + "epoch": 0.8910095021521969, + "grad_norm": 4.741507710988063, + "learning_rate": 1.541605270835106e-07, + "loss": 0.444, + "step": 10971 + }, + { + "epoch": 0.8910907171282384, + "grad_norm": 11.630609215202053, + "learning_rate": 1.5393325165680707e-07, + "loss": 0.4207, + "step": 10972 + }, + { + "epoch": 0.89117193210428, + "grad_norm": 3.301339255185129, + "learning_rate": 1.5370613856550615e-07, + "loss": 0.5021, + "step": 10973 + }, + { + "epoch": 0.8912531470803217, + "grad_norm": 4.4372396364921105, + "learning_rate": 1.534791878253228e-07, + "loss": 0.3981, + "step": 10974 + }, + { + "epoch": 0.8913343620563632, + "grad_norm": 8.507200043869714, + "learning_rate": 1.5325239945196108e-07, + "loss": 0.4659, + "step": 10975 + }, + { + "epoch": 0.8914155770324048, + "grad_norm": 6.152072516412701, + "learning_rate": 1.530257734611132e-07, + "loss": 0.4745, + "step": 10976 + }, + { + "epoch": 0.8914967920084463, + "grad_norm": 3.5425159922458582, + "learning_rate": 1.5279930986846047e-07, + "loss": 0.6653, + "step": 10977 + }, + { + "epoch": 0.891578006984488, + "grad_norm": 5.233121096684976, + "learning_rate": 1.5257300868967344e-07, + "loss": 0.5044, + "step": 10978 + }, + { + "epoch": 0.8916592219605295, + "grad_norm": 6.291818310904449, + "learning_rate": 1.5234686994041016e-07, + "loss": 0.4987, + "step": 10979 + }, + { + "epoch": 0.8917404369365711, + "grad_norm": 5.098504129650742, + "learning_rate": 1.521208936363186e-07, + "loss": 0.3563, + "step": 10980 + }, + { + "epoch": 0.8918216519126126, + "grad_norm": 4.803655200270107, + "learning_rate": 1.5189507979303575e-07, + "loss": 0.7008, + "step": 10981 + }, + { + "epoch": 0.8919028668886543, + "grad_norm": 4.05749114440078, + "learning_rate": 1.5166942842618632e-07, + "loss": 0.3977, + "step": 10982 + }, + { + "epoch": 0.8919840818646958, + "grad_norm": 6.222213392758151, + "learning_rate": 1.5144393955138336e-07, + "loss": 0.4221, + "step": 10983 + }, + { + "epoch": 0.8920652968407374, + "grad_norm": 3.854822685859984, + "learning_rate": 1.512186131842308e-07, + "loss": 0.4855, + "step": 10984 + }, + { + "epoch": 0.8921465118167791, + "grad_norm": 7.555916028857482, + "learning_rate": 1.5099344934031923e-07, + "loss": 0.4516, + "step": 10985 + }, + { + "epoch": 0.8922277267928206, + "grad_norm": 8.539386703282604, + "learning_rate": 1.507684480352292e-07, + "loss": 0.415, + "step": 10986 + }, + { + "epoch": 0.8923089417688622, + "grad_norm": 4.964839789056717, + "learning_rate": 1.5054360928452915e-07, + "loss": 0.4186, + "step": 10987 + }, + { + "epoch": 0.8923901567449037, + "grad_norm": 6.084627536003772, + "learning_rate": 1.5031893310377716e-07, + "loss": 0.5064, + "step": 10988 + }, + { + "epoch": 0.8924713717209454, + "grad_norm": 3.940604600092638, + "learning_rate": 1.5009441950851965e-07, + "loss": 0.5235, + "step": 10989 + }, + { + "epoch": 0.8925525866969869, + "grad_norm": 5.3695438746577375, + "learning_rate": 1.4987006851429147e-07, + "loss": 0.3932, + "step": 10990 + }, + { + "epoch": 0.8926338016730285, + "grad_norm": 9.145034230965127, + "learning_rate": 1.4964588013661657e-07, + "loss": 0.3775, + "step": 10991 + }, + { + "epoch": 0.89271501664907, + "grad_norm": 7.658853077669966, + "learning_rate": 1.4942185439100753e-07, + "loss": 0.4791, + "step": 10992 + }, + { + "epoch": 0.8927962316251117, + "grad_norm": 7.168128470178995, + "learning_rate": 1.4919799129296615e-07, + "loss": 0.3937, + "step": 10993 + }, + { + "epoch": 0.8928774466011532, + "grad_norm": 5.628929909843431, + "learning_rate": 1.489742908579822e-07, + "loss": 0.4437, + "step": 10994 + }, + { + "epoch": 0.8929586615771948, + "grad_norm": 6.054942900449149, + "learning_rate": 1.4875075310153504e-07, + "loss": 0.4699, + "step": 10995 + }, + { + "epoch": 0.8930398765532365, + "grad_norm": 5.835599987062635, + "learning_rate": 1.4852737803909167e-07, + "loss": 0.6048, + "step": 10996 + }, + { + "epoch": 0.893121091529278, + "grad_norm": 7.175331718017556, + "learning_rate": 1.4830416568610893e-07, + "loss": 0.5113, + "step": 10997 + }, + { + "epoch": 0.8932023065053196, + "grad_norm": 8.62919238424243, + "learning_rate": 1.4808111605803117e-07, + "loss": 0.4906, + "step": 10998 + }, + { + "epoch": 0.8932835214813611, + "grad_norm": 11.718705876342495, + "learning_rate": 1.4785822917029318e-07, + "loss": 0.415, + "step": 10999 + }, + { + "epoch": 0.8933647364574028, + "grad_norm": 6.786219876737145, + "learning_rate": 1.476355050383174e-07, + "loss": 0.483, + "step": 11000 + }, + { + "epoch": 0.8934459514334443, + "grad_norm": 4.549779356360936, + "learning_rate": 1.4741294367751484e-07, + "loss": 0.5062, + "step": 11001 + }, + { + "epoch": 0.8935271664094859, + "grad_norm": 4.594436506496313, + "learning_rate": 1.4719054510328595e-07, + "loss": 0.3839, + "step": 11002 + }, + { + "epoch": 0.8936083813855275, + "grad_norm": 3.87676469142472, + "learning_rate": 1.4696830933101868e-07, + "loss": 0.4757, + "step": 11003 + }, + { + "epoch": 0.8936895963615691, + "grad_norm": 4.757027256089799, + "learning_rate": 1.467462363760916e-07, + "loss": 0.5542, + "step": 11004 + }, + { + "epoch": 0.8937708113376106, + "grad_norm": 3.9669910321207587, + "learning_rate": 1.4652432625387013e-07, + "loss": 0.4679, + "step": 11005 + }, + { + "epoch": 0.8938520263136522, + "grad_norm": 5.6200804103086295, + "learning_rate": 1.4630257897970985e-07, + "loss": 0.4068, + "step": 11006 + }, + { + "epoch": 0.8939332412896939, + "grad_norm": 3.8084388168574894, + "learning_rate": 1.4608099456895452e-07, + "loss": 0.4993, + "step": 11007 + }, + { + "epoch": 0.8940144562657354, + "grad_norm": 6.108741515365576, + "learning_rate": 1.4585957303693664e-07, + "loss": 0.4431, + "step": 11008 + }, + { + "epoch": 0.894095671241777, + "grad_norm": 5.264223037773271, + "learning_rate": 1.4563831439897647e-07, + "loss": 0.5353, + "step": 11009 + }, + { + "epoch": 0.8941768862178185, + "grad_norm": 5.540285793593693, + "learning_rate": 1.4541721867038532e-07, + "loss": 0.4353, + "step": 11010 + }, + { + "epoch": 0.8942581011938602, + "grad_norm": 21.662383239919144, + "learning_rate": 1.4519628586646073e-07, + "loss": 0.4418, + "step": 11011 + }, + { + "epoch": 0.8943393161699017, + "grad_norm": 7.463645051609318, + "learning_rate": 1.4497551600249044e-07, + "loss": 0.4228, + "step": 11012 + }, + { + "epoch": 0.8944205311459433, + "grad_norm": 28.77597330752949, + "learning_rate": 1.447549090937511e-07, + "loss": 0.5227, + "step": 11013 + }, + { + "epoch": 0.8945017461219849, + "grad_norm": 3.957758324457693, + "learning_rate": 1.4453446515550724e-07, + "loss": 0.4077, + "step": 11014 + }, + { + "epoch": 0.8945829610980265, + "grad_norm": 7.878758982064296, + "learning_rate": 1.4431418420301157e-07, + "loss": 0.5104, + "step": 11015 + }, + { + "epoch": 0.894664176074068, + "grad_norm": 5.62814185101261, + "learning_rate": 1.440940662515075e-07, + "loss": 0.6012, + "step": 11016 + }, + { + "epoch": 0.8947453910501096, + "grad_norm": 5.517645951231921, + "learning_rate": 1.4387411131622592e-07, + "loss": 0.4324, + "step": 11017 + }, + { + "epoch": 0.8948266060261513, + "grad_norm": 4.575883407647924, + "learning_rate": 1.4365431941238544e-07, + "loss": 0.4455, + "step": 11018 + }, + { + "epoch": 0.8949078210021928, + "grad_norm": 4.43419935645762, + "learning_rate": 1.434346905551956e-07, + "loss": 0.3927, + "step": 11019 + }, + { + "epoch": 0.8949890359782344, + "grad_norm": 5.32989054760345, + "learning_rate": 1.432152247598534e-07, + "loss": 0.456, + "step": 11020 + }, + { + "epoch": 0.895070250954276, + "grad_norm": 4.528118555469795, + "learning_rate": 1.4299592204154445e-07, + "loss": 0.4092, + "step": 11021 + }, + { + "epoch": 0.8951514659303176, + "grad_norm": 5.024689885658865, + "learning_rate": 1.4277678241544328e-07, + "loss": 0.3977, + "step": 11022 + }, + { + "epoch": 0.8952326809063591, + "grad_norm": 6.316835150350028, + "learning_rate": 1.4255780589671337e-07, + "loss": 0.4807, + "step": 11023 + }, + { + "epoch": 0.8953138958824007, + "grad_norm": 5.3616028545471845, + "learning_rate": 1.423389925005067e-07, + "loss": 0.4474, + "step": 11024 + }, + { + "epoch": 0.8953951108584423, + "grad_norm": 3.2813886037054543, + "learning_rate": 1.421203422419637e-07, + "loss": 0.488, + "step": 11025 + }, + { + "epoch": 0.8954763258344839, + "grad_norm": 8.181117262341452, + "learning_rate": 1.4190185513621473e-07, + "loss": 0.4516, + "step": 11026 + }, + { + "epoch": 0.8955575408105254, + "grad_norm": 5.938608632171009, + "learning_rate": 1.416835311983772e-07, + "loss": 0.486, + "step": 11027 + }, + { + "epoch": 0.895638755786567, + "grad_norm": 9.269220540893665, + "learning_rate": 1.4146537044355785e-07, + "loss": 0.4106, + "step": 11028 + }, + { + "epoch": 0.8957199707626087, + "grad_norm": 18.999226973217382, + "learning_rate": 1.412473728868527e-07, + "loss": 0.4675, + "step": 11029 + }, + { + "epoch": 0.8958011857386502, + "grad_norm": 3.9421317838103103, + "learning_rate": 1.410295385433455e-07, + "loss": 0.4947, + "step": 11030 + }, + { + "epoch": 0.8958824007146918, + "grad_norm": 4.337954956064237, + "learning_rate": 1.4081186742810948e-07, + "loss": 0.4467, + "step": 11031 + }, + { + "epoch": 0.8959636156907334, + "grad_norm": 5.626043047717378, + "learning_rate": 1.4059435955620704e-07, + "loss": 0.5096, + "step": 11032 + }, + { + "epoch": 0.896044830666775, + "grad_norm": 4.67727289541783, + "learning_rate": 1.403770149426878e-07, + "loss": 0.4278, + "step": 11033 + }, + { + "epoch": 0.8961260456428165, + "grad_norm": 10.738068197611934, + "learning_rate": 1.4015983360259055e-07, + "loss": 0.5335, + "step": 11034 + }, + { + "epoch": 0.8962072606188581, + "grad_norm": 4.8693579129133235, + "learning_rate": 1.3994281555094386e-07, + "loss": 0.3716, + "step": 11035 + }, + { + "epoch": 0.8962884755948997, + "grad_norm": 2.9347782845843136, + "learning_rate": 1.3972596080276402e-07, + "loss": 0.7287, + "step": 11036 + }, + { + "epoch": 0.8963696905709413, + "grad_norm": 4.942121031320022, + "learning_rate": 1.395092693730557e-07, + "loss": 0.3883, + "step": 11037 + }, + { + "epoch": 0.8964509055469828, + "grad_norm": 5.046562171098466, + "learning_rate": 1.3929274127681303e-07, + "loss": 0.4705, + "step": 11038 + }, + { + "epoch": 0.8965321205230244, + "grad_norm": 3.9477589888855906, + "learning_rate": 1.3907637652901957e-07, + "loss": 0.5613, + "step": 11039 + }, + { + "epoch": 0.8966133354990661, + "grad_norm": 4.192994245868831, + "learning_rate": 1.3886017514464555e-07, + "loss": 0.39, + "step": 11040 + }, + { + "epoch": 0.8966945504751076, + "grad_norm": 4.637113922674406, + "learning_rate": 1.3864413713865098e-07, + "loss": 0.4266, + "step": 11041 + }, + { + "epoch": 0.8967757654511492, + "grad_norm": 5.8016593697423975, + "learning_rate": 1.38428262525985e-07, + "loss": 0.5181, + "step": 11042 + }, + { + "epoch": 0.8968569804271908, + "grad_norm": 10.961817792927185, + "learning_rate": 1.3821255132158456e-07, + "loss": 0.4214, + "step": 11043 + }, + { + "epoch": 0.8969381954032324, + "grad_norm": 6.98872156389094, + "learning_rate": 1.3799700354037605e-07, + "loss": 0.3722, + "step": 11044 + }, + { + "epoch": 0.8970194103792739, + "grad_norm": 6.014898811863191, + "learning_rate": 1.3778161919727472e-07, + "loss": 0.4123, + "step": 11045 + }, + { + "epoch": 0.8971006253553155, + "grad_norm": 4.681509306562985, + "learning_rate": 1.3756639830718316e-07, + "loss": 0.4154, + "step": 11046 + }, + { + "epoch": 0.8971818403313571, + "grad_norm": 5.1158065640859425, + "learning_rate": 1.373513408849936e-07, + "loss": 0.4988, + "step": 11047 + }, + { + "epoch": 0.8972630553073987, + "grad_norm": 5.355479142787348, + "learning_rate": 1.3713644694558742e-07, + "loss": 0.447, + "step": 11048 + }, + { + "epoch": 0.8973442702834402, + "grad_norm": 5.532841373770518, + "learning_rate": 1.369217165038339e-07, + "loss": 0.4586, + "step": 11049 + }, + { + "epoch": 0.8974254852594818, + "grad_norm": 14.185804561707538, + "learning_rate": 1.367071495745906e-07, + "loss": 0.4633, + "step": 11050 + }, + { + "epoch": 0.8975067002355235, + "grad_norm": 5.325288616133615, + "learning_rate": 1.3649274617270531e-07, + "loss": 0.539, + "step": 11051 + }, + { + "epoch": 0.897587915211565, + "grad_norm": 3.940700522997248, + "learning_rate": 1.3627850631301344e-07, + "loss": 0.4839, + "step": 11052 + }, + { + "epoch": 0.8976691301876066, + "grad_norm": 4.124491341239392, + "learning_rate": 1.3606443001033864e-07, + "loss": 0.4044, + "step": 11053 + }, + { + "epoch": 0.8977503451636482, + "grad_norm": 8.104468950407128, + "learning_rate": 1.3585051727949494e-07, + "loss": 0.4023, + "step": 11054 + }, + { + "epoch": 0.8978315601396898, + "grad_norm": 9.103539674813327, + "learning_rate": 1.3563676813528325e-07, + "loss": 0.3839, + "step": 11055 + }, + { + "epoch": 0.8979127751157313, + "grad_norm": 4.227081794199914, + "learning_rate": 1.354231825924937e-07, + "loss": 0.5269, + "step": 11056 + }, + { + "epoch": 0.8979939900917729, + "grad_norm": 4.986647735979627, + "learning_rate": 1.3520976066590557e-07, + "loss": 0.3988, + "step": 11057 + }, + { + "epoch": 0.8980752050678145, + "grad_norm": 4.88191309684984, + "learning_rate": 1.3499650237028677e-07, + "loss": 0.3746, + "step": 11058 + }, + { + "epoch": 0.8981564200438561, + "grad_norm": 8.27159257192499, + "learning_rate": 1.3478340772039328e-07, + "loss": 0.5004, + "step": 11059 + }, + { + "epoch": 0.8982376350198976, + "grad_norm": 7.866676776482577, + "learning_rate": 1.3457047673097024e-07, + "loss": 0.3978, + "step": 11060 + }, + { + "epoch": 0.8983188499959393, + "grad_norm": 6.3754067681303805, + "learning_rate": 1.343577094167514e-07, + "loss": 0.4521, + "step": 11061 + }, + { + "epoch": 0.8984000649719809, + "grad_norm": 3.9619701694967744, + "learning_rate": 1.341451057924592e-07, + "loss": 0.545, + "step": 11062 + }, + { + "epoch": 0.8984812799480224, + "grad_norm": 4.982039565280355, + "learning_rate": 1.3393266587280434e-07, + "loss": 0.8769, + "step": 11063 + }, + { + "epoch": 0.898562494924064, + "grad_norm": 4.163415192247937, + "learning_rate": 1.3372038967248647e-07, + "loss": 0.5124, + "step": 11064 + }, + { + "epoch": 0.8986437099001056, + "grad_norm": 5.048906567605772, + "learning_rate": 1.335082772061949e-07, + "loss": 0.5187, + "step": 11065 + }, + { + "epoch": 0.8987249248761472, + "grad_norm": 6.207042181936884, + "learning_rate": 1.3329632848860545e-07, + "loss": 0.6352, + "step": 11066 + }, + { + "epoch": 0.8988061398521887, + "grad_norm": 7.073497034721812, + "learning_rate": 1.33084543534385e-07, + "loss": 0.5995, + "step": 11067 + }, + { + "epoch": 0.8988873548282303, + "grad_norm": 9.491444143304246, + "learning_rate": 1.3287292235818732e-07, + "loss": 0.4552, + "step": 11068 + }, + { + "epoch": 0.8989685698042719, + "grad_norm": 3.2830020992185185, + "learning_rate": 1.326614649746555e-07, + "loss": 0.3663, + "step": 11069 + }, + { + "epoch": 0.8990497847803135, + "grad_norm": 4.692919809966446, + "learning_rate": 1.324501713984211e-07, + "loss": 0.443, + "step": 11070 + }, + { + "epoch": 0.899130999756355, + "grad_norm": 6.01529901714284, + "learning_rate": 1.3223904164410494e-07, + "loss": 0.3483, + "step": 11071 + }, + { + "epoch": 0.8992122147323967, + "grad_norm": 8.122333020006561, + "learning_rate": 1.3202807572631564e-07, + "loss": 0.5003, + "step": 11072 + }, + { + "epoch": 0.8992934297084383, + "grad_norm": 8.111867294079145, + "learning_rate": 1.318172736596518e-07, + "loss": 0.4683, + "step": 11073 + }, + { + "epoch": 0.8993746446844798, + "grad_norm": 9.069724308334372, + "learning_rate": 1.3160663545869896e-07, + "loss": 0.4858, + "step": 11074 + }, + { + "epoch": 0.8994558596605214, + "grad_norm": 7.1277675224613075, + "learning_rate": 1.3139616113803238e-07, + "loss": 0.6833, + "step": 11075 + }, + { + "epoch": 0.899537074636563, + "grad_norm": 4.1361872264740285, + "learning_rate": 1.3118585071221546e-07, + "loss": 0.6296, + "step": 11076 + }, + { + "epoch": 0.8996182896126046, + "grad_norm": 4.28970175862567, + "learning_rate": 1.3097570419580096e-07, + "loss": 0.5692, + "step": 11077 + }, + { + "epoch": 0.8996995045886461, + "grad_norm": 8.454970827458345, + "learning_rate": 1.3076572160333007e-07, + "loss": 0.431, + "step": 11078 + }, + { + "epoch": 0.8997807195646877, + "grad_norm": 9.012725656230002, + "learning_rate": 1.3055590294933196e-07, + "loss": 0.5454, + "step": 11079 + }, + { + "epoch": 0.8998619345407293, + "grad_norm": 5.94575386452439, + "learning_rate": 1.303462482483256e-07, + "loss": 0.4561, + "step": 11080 + }, + { + "epoch": 0.8999431495167709, + "grad_norm": 5.789262579104675, + "learning_rate": 1.301367575148177e-07, + "loss": 0.4424, + "step": 11081 + }, + { + "epoch": 0.9000243644928124, + "grad_norm": 38.814186598162095, + "learning_rate": 1.299274307633036e-07, + "loss": 0.5148, + "step": 11082 + }, + { + "epoch": 0.9001055794688541, + "grad_norm": 9.66072321360139, + "learning_rate": 1.297182680082676e-07, + "loss": 0.5525, + "step": 11083 + }, + { + "epoch": 0.9001867944448957, + "grad_norm": 4.4257331245909555, + "learning_rate": 1.2950926926418362e-07, + "loss": 0.5021, + "step": 11084 + }, + { + "epoch": 0.9002680094209372, + "grad_norm": 3.0817044530551954, + "learning_rate": 1.2930043454551178e-07, + "loss": 0.468, + "step": 11085 + }, + { + "epoch": 0.9003492243969788, + "grad_norm": 5.56969653241233, + "learning_rate": 1.2909176386670385e-07, + "loss": 0.3354, + "step": 11086 + }, + { + "epoch": 0.9004304393730204, + "grad_norm": 4.504308093649935, + "learning_rate": 1.2888325724219775e-07, + "loss": 0.5716, + "step": 11087 + }, + { + "epoch": 0.900511654349062, + "grad_norm": 4.663416416080067, + "learning_rate": 1.2867491468642106e-07, + "loss": 0.4541, + "step": 11088 + }, + { + "epoch": 0.9005928693251035, + "grad_norm": 4.148002537213163, + "learning_rate": 1.2846673621379035e-07, + "loss": 0.514, + "step": 11089 + }, + { + "epoch": 0.9006740843011452, + "grad_norm": 4.589279907686427, + "learning_rate": 1.282587218387102e-07, + "loss": 0.5123, + "step": 11090 + }, + { + "epoch": 0.9007552992771867, + "grad_norm": 3.974192487774201, + "learning_rate": 1.2805087157557434e-07, + "loss": 0.5089, + "step": 11091 + }, + { + "epoch": 0.9008365142532283, + "grad_norm": 5.602158178066266, + "learning_rate": 1.2784318543876463e-07, + "loss": 0.4654, + "step": 11092 + }, + { + "epoch": 0.9009177292292698, + "grad_norm": 8.541653052285914, + "learning_rate": 1.276356634426526e-07, + "loss": 0.4957, + "step": 11093 + }, + { + "epoch": 0.9009989442053115, + "grad_norm": 3.690935781414827, + "learning_rate": 1.274283056015968e-07, + "loss": 0.6248, + "step": 11094 + }, + { + "epoch": 0.9010801591813531, + "grad_norm": 4.875190589469171, + "learning_rate": 1.272211119299452e-07, + "loss": 0.4652, + "step": 11095 + }, + { + "epoch": 0.9011613741573946, + "grad_norm": 5.525144142434985, + "learning_rate": 1.270140824420349e-07, + "loss": 0.4016, + "step": 11096 + }, + { + "epoch": 0.9012425891334362, + "grad_norm": 3.8356059525271387, + "learning_rate": 1.2680721715219168e-07, + "loss": 0.4344, + "step": 11097 + }, + { + "epoch": 0.9013238041094778, + "grad_norm": 4.807969276574533, + "learning_rate": 1.2660051607472885e-07, + "loss": 0.6449, + "step": 11098 + }, + { + "epoch": 0.9014050190855194, + "grad_norm": 5.802173378442104, + "learning_rate": 1.2639397922394963e-07, + "loss": 0.5645, + "step": 11099 + }, + { + "epoch": 0.9014862340615609, + "grad_norm": 6.895498171533333, + "learning_rate": 1.261876066141446e-07, + "loss": 0.499, + "step": 11100 + }, + { + "epoch": 0.9015674490376026, + "grad_norm": 7.515564655019646, + "learning_rate": 1.2598139825959393e-07, + "loss": 0.4524, + "step": 11101 + }, + { + "epoch": 0.9016486640136441, + "grad_norm": 10.60238190605972, + "learning_rate": 1.2577535417456599e-07, + "loss": 0.3594, + "step": 11102 + }, + { + "epoch": 0.9017298789896857, + "grad_norm": 5.744098794749828, + "learning_rate": 1.255694743733185e-07, + "loss": 0.3964, + "step": 11103 + }, + { + "epoch": 0.9018110939657272, + "grad_norm": 3.9046557359930865, + "learning_rate": 1.253637588700965e-07, + "loss": 0.4082, + "step": 11104 + }, + { + "epoch": 0.9018923089417689, + "grad_norm": 5.1997775870318765, + "learning_rate": 1.251582076791352e-07, + "loss": 0.5643, + "step": 11105 + }, + { + "epoch": 0.9019735239178105, + "grad_norm": 8.139019581385137, + "learning_rate": 1.2495282081465747e-07, + "loss": 0.5973, + "step": 11106 + }, + { + "epoch": 0.902054738893852, + "grad_norm": 6.919395171153576, + "learning_rate": 1.2474759829087413e-07, + "loss": 0.5088, + "step": 11107 + }, + { + "epoch": 0.9021359538698936, + "grad_norm": 7.322279551123576, + "learning_rate": 1.2454254012198657e-07, + "loss": 0.5294, + "step": 11108 + }, + { + "epoch": 0.9022171688459352, + "grad_norm": 4.725685255377581, + "learning_rate": 1.2433764632218293e-07, + "loss": 0.4372, + "step": 11109 + }, + { + "epoch": 0.9022983838219768, + "grad_norm": 14.72704680471624, + "learning_rate": 1.2413291690564154e-07, + "loss": 0.4079, + "step": 11110 + }, + { + "epoch": 0.9023795987980183, + "grad_norm": 5.242560339794683, + "learning_rate": 1.239283518865278e-07, + "loss": 0.5958, + "step": 11111 + }, + { + "epoch": 0.90246081377406, + "grad_norm": 3.152784855668021, + "learning_rate": 1.2372395127899728e-07, + "loss": 0.4431, + "step": 11112 + }, + { + "epoch": 0.9025420287501015, + "grad_norm": 12.381085280329925, + "learning_rate": 1.2351971509719312e-07, + "loss": 0.3836, + "step": 11113 + }, + { + "epoch": 0.9026232437261431, + "grad_norm": 6.282293943148033, + "learning_rate": 1.233156433552471e-07, + "loss": 0.4327, + "step": 11114 + }, + { + "epoch": 0.9027044587021846, + "grad_norm": 9.80917559710566, + "learning_rate": 1.2311173606727982e-07, + "loss": 0.5872, + "step": 11115 + }, + { + "epoch": 0.9027856736782263, + "grad_norm": 4.868387447251448, + "learning_rate": 1.2290799324740144e-07, + "loss": 0.604, + "step": 11116 + }, + { + "epoch": 0.9028668886542679, + "grad_norm": 4.469666914330099, + "learning_rate": 1.2270441490970897e-07, + "loss": 0.4203, + "step": 11117 + }, + { + "epoch": 0.9029481036303094, + "grad_norm": 5.0365987552230935, + "learning_rate": 1.2250100106828978e-07, + "loss": 0.4397, + "step": 11118 + }, + { + "epoch": 0.903029318606351, + "grad_norm": 4.266209336176858, + "learning_rate": 1.222977517372184e-07, + "loss": 0.8365, + "step": 11119 + }, + { + "epoch": 0.9031105335823926, + "grad_norm": 5.161680402157924, + "learning_rate": 1.2209466693055867e-07, + "loss": 0.5762, + "step": 11120 + }, + { + "epoch": 0.9031917485584342, + "grad_norm": 4.148007256784386, + "learning_rate": 1.2189174666236314e-07, + "loss": 0.4376, + "step": 11121 + }, + { + "epoch": 0.9032729635344757, + "grad_norm": 5.331795822073706, + "learning_rate": 1.2168899094667257e-07, + "loss": 0.6871, + "step": 11122 + }, + { + "epoch": 0.9033541785105174, + "grad_norm": 4.452121593101198, + "learning_rate": 1.2148639979751686e-07, + "loss": 0.4611, + "step": 11123 + }, + { + "epoch": 0.9034353934865589, + "grad_norm": 5.092994028700953, + "learning_rate": 1.212839732289145e-07, + "loss": 0.3484, + "step": 11124 + }, + { + "epoch": 0.9035166084626005, + "grad_norm": 4.802229400487032, + "learning_rate": 1.2108171125487177e-07, + "loss": 0.3757, + "step": 11125 + }, + { + "epoch": 0.903597823438642, + "grad_norm": 5.17753286975903, + "learning_rate": 1.2087961388938473e-07, + "loss": 0.5139, + "step": 11126 + }, + { + "epoch": 0.9036790384146837, + "grad_norm": 4.289775333983445, + "learning_rate": 1.2067768114643635e-07, + "loss": 0.5869, + "step": 11127 + }, + { + "epoch": 0.9037602533907253, + "grad_norm": 5.723264204379845, + "learning_rate": 1.2047591304000044e-07, + "loss": 0.4739, + "step": 11128 + }, + { + "epoch": 0.9038414683667668, + "grad_norm": 10.15998509588337, + "learning_rate": 1.2027430958403808e-07, + "loss": 0.4689, + "step": 11129 + }, + { + "epoch": 0.9039226833428085, + "grad_norm": 4.420651137514857, + "learning_rate": 1.2007287079249863e-07, + "loss": 0.4664, + "step": 11130 + }, + { + "epoch": 0.90400389831885, + "grad_norm": 4.274612039891857, + "learning_rate": 1.1987159667932124e-07, + "loss": 0.5313, + "step": 11131 + }, + { + "epoch": 0.9040851132948916, + "grad_norm": 7.759017785750078, + "learning_rate": 1.1967048725843256e-07, + "loss": 0.4139, + "step": 11132 + }, + { + "epoch": 0.9041663282709331, + "grad_norm": 6.646277040961836, + "learning_rate": 1.1946954254374838e-07, + "loss": 0.5634, + "step": 11133 + }, + { + "epoch": 0.9042475432469748, + "grad_norm": 4.609209324854486, + "learning_rate": 1.1926876254917314e-07, + "loss": 0.4933, + "step": 11134 + }, + { + "epoch": 0.9043287582230163, + "grad_norm": 4.565133981153573, + "learning_rate": 1.190681472885996e-07, + "loss": 0.4394, + "step": 11135 + }, + { + "epoch": 0.9044099731990579, + "grad_norm": 8.18111120557696, + "learning_rate": 1.188676967759092e-07, + "loss": 0.5565, + "step": 11136 + }, + { + "epoch": 0.9044911881750994, + "grad_norm": 5.013373635726462, + "learning_rate": 1.1866741102497275e-07, + "loss": 0.5724, + "step": 11137 + }, + { + "epoch": 0.9045724031511411, + "grad_norm": 4.362396063750319, + "learning_rate": 1.1846729004964835e-07, + "loss": 0.4643, + "step": 11138 + }, + { + "epoch": 0.9046536181271827, + "grad_norm": 5.980174069654119, + "learning_rate": 1.1826733386378297e-07, + "loss": 0.4188, + "step": 11139 + }, + { + "epoch": 0.9047348331032242, + "grad_norm": 4.951897983742022, + "learning_rate": 1.1806754248121333e-07, + "loss": 0.4468, + "step": 11140 + }, + { + "epoch": 0.9048160480792659, + "grad_norm": 4.454466679854169, + "learning_rate": 1.1786791591576307e-07, + "loss": 0.5959, + "step": 11141 + }, + { + "epoch": 0.9048972630553074, + "grad_norm": 4.311779144133112, + "learning_rate": 1.176684541812459e-07, + "loss": 0.7119, + "step": 11142 + }, + { + "epoch": 0.904978478031349, + "grad_norm": 5.216361658357243, + "learning_rate": 1.174691572914638e-07, + "loss": 0.5982, + "step": 11143 + }, + { + "epoch": 0.9050596930073905, + "grad_norm": 4.598499041200991, + "learning_rate": 1.1727002526020631e-07, + "loss": 0.5352, + "step": 11144 + }, + { + "epoch": 0.9051409079834322, + "grad_norm": 3.192274398240333, + "learning_rate": 1.1707105810125297e-07, + "loss": 0.5892, + "step": 11145 + }, + { + "epoch": 0.9052221229594737, + "grad_norm": 11.971740880860915, + "learning_rate": 1.1687225582837052e-07, + "loss": 0.3351, + "step": 11146 + }, + { + "epoch": 0.9053033379355153, + "grad_norm": 8.973707498145792, + "learning_rate": 1.1667361845531578e-07, + "loss": 0.368, + "step": 11147 + }, + { + "epoch": 0.9053845529115568, + "grad_norm": 4.2462620886968825, + "learning_rate": 1.164751459958327e-07, + "loss": 0.3972, + "step": 11148 + }, + { + "epoch": 0.9054657678875985, + "grad_norm": 4.590797394683434, + "learning_rate": 1.1627683846365478e-07, + "loss": 0.548, + "step": 11149 + }, + { + "epoch": 0.9055469828636401, + "grad_norm": 7.826708434408468, + "learning_rate": 1.1607869587250464e-07, + "loss": 0.4099, + "step": 11150 + }, + { + "epoch": 0.9056281978396816, + "grad_norm": 4.643007677952196, + "learning_rate": 1.1588071823609159e-07, + "loss": 0.6002, + "step": 11151 + }, + { + "epoch": 0.9057094128157233, + "grad_norm": 6.195712206260984, + "learning_rate": 1.1568290556811495e-07, + "loss": 0.5826, + "step": 11152 + }, + { + "epoch": 0.9057906277917648, + "grad_norm": 5.225028473648337, + "learning_rate": 1.1548525788226267e-07, + "loss": 0.5314, + "step": 11153 + }, + { + "epoch": 0.9058718427678064, + "grad_norm": 9.685757941498554, + "learning_rate": 1.1528777519221046e-07, + "loss": 0.5205, + "step": 11154 + }, + { + "epoch": 0.9059530577438479, + "grad_norm": 3.9456543583410526, + "learning_rate": 1.1509045751162324e-07, + "loss": 0.4093, + "step": 11155 + }, + { + "epoch": 0.9060342727198896, + "grad_norm": 5.960099778891394, + "learning_rate": 1.1489330485415479e-07, + "loss": 0.3512, + "step": 11156 + }, + { + "epoch": 0.9061154876959311, + "grad_norm": 5.908029100963964, + "learning_rate": 1.1469631723344671e-07, + "loss": 0.4409, + "step": 11157 + }, + { + "epoch": 0.9061967026719727, + "grad_norm": 4.606484674584257, + "learning_rate": 1.1449949466312893e-07, + "loss": 0.4521, + "step": 11158 + }, + { + "epoch": 0.9062779176480142, + "grad_norm": 5.177826704703417, + "learning_rate": 1.1430283715682139e-07, + "loss": 0.5358, + "step": 11159 + }, + { + "epoch": 0.9063591326240559, + "grad_norm": 4.966905493065145, + "learning_rate": 1.1410634472813098e-07, + "loss": 0.4514, + "step": 11160 + }, + { + "epoch": 0.9064403476000975, + "grad_norm": 5.793696903665879, + "learning_rate": 1.1391001739065432e-07, + "loss": 0.5214, + "step": 11161 + }, + { + "epoch": 0.906521562576139, + "grad_norm": 5.420921312113236, + "learning_rate": 1.1371385515797695e-07, + "loss": 0.4365, + "step": 11162 + }, + { + "epoch": 0.9066027775521807, + "grad_norm": 3.588449960902482, + "learning_rate": 1.1351785804367105e-07, + "loss": 0.5352, + "step": 11163 + }, + { + "epoch": 0.9066839925282222, + "grad_norm": 5.796738656501365, + "learning_rate": 1.1332202606129938e-07, + "loss": 0.4415, + "step": 11164 + }, + { + "epoch": 0.9067652075042638, + "grad_norm": 5.625515444116812, + "learning_rate": 1.1312635922441195e-07, + "loss": 0.4904, + "step": 11165 + }, + { + "epoch": 0.9068464224803053, + "grad_norm": 4.830094151436432, + "learning_rate": 1.129308575465482e-07, + "loss": 0.6403, + "step": 11166 + }, + { + "epoch": 0.906927637456347, + "grad_norm": 4.562587755650938, + "learning_rate": 1.1273552104123564e-07, + "loss": 0.4543, + "step": 11167 + }, + { + "epoch": 0.9070088524323885, + "grad_norm": 7.282911765724218, + "learning_rate": 1.125403497219904e-07, + "loss": 0.4899, + "step": 11168 + }, + { + "epoch": 0.9070900674084301, + "grad_norm": 4.877190705118263, + "learning_rate": 1.123453436023178e-07, + "loss": 0.3675, + "step": 11169 + }, + { + "epoch": 0.9071712823844716, + "grad_norm": 6.299432110607247, + "learning_rate": 1.121505026957112e-07, + "loss": 0.4837, + "step": 11170 + }, + { + "epoch": 0.9072524973605133, + "grad_norm": 5.585090356105831, + "learning_rate": 1.1195582701565177e-07, + "loss": 0.5353, + "step": 11171 + }, + { + "epoch": 0.9073337123365549, + "grad_norm": 5.1062027169899435, + "learning_rate": 1.1176131657561095e-07, + "loss": 0.5091, + "step": 11172 + }, + { + "epoch": 0.9074149273125964, + "grad_norm": 3.970272507111287, + "learning_rate": 1.1156697138904715e-07, + "loss": 0.73, + "step": 11173 + }, + { + "epoch": 0.9074961422886381, + "grad_norm": 4.9753615625859196, + "learning_rate": 1.1137279146940821e-07, + "loss": 0.3436, + "step": 11174 + }, + { + "epoch": 0.9075773572646796, + "grad_norm": 4.2196215333931, + "learning_rate": 1.111787768301309e-07, + "loss": 0.5066, + "step": 11175 + }, + { + "epoch": 0.9076585722407212, + "grad_norm": 9.64461171338353, + "learning_rate": 1.1098492748463945e-07, + "loss": 0.4379, + "step": 11176 + }, + { + "epoch": 0.9077397872167627, + "grad_norm": 7.251627691507904, + "learning_rate": 1.1079124344634707e-07, + "loss": 0.4515, + "step": 11177 + }, + { + "epoch": 0.9078210021928044, + "grad_norm": 4.400388440896803, + "learning_rate": 1.1059772472865632e-07, + "loss": 0.408, + "step": 11178 + }, + { + "epoch": 0.9079022171688459, + "grad_norm": 4.694092152410516, + "learning_rate": 1.1040437134495708e-07, + "loss": 0.4494, + "step": 11179 + }, + { + "epoch": 0.9079834321448875, + "grad_norm": 4.510297602747996, + "learning_rate": 1.1021118330862835e-07, + "loss": 0.455, + "step": 11180 + }, + { + "epoch": 0.908064647120929, + "grad_norm": 3.8454716633958546, + "learning_rate": 1.1001816063303805e-07, + "loss": 0.5094, + "step": 11181 + }, + { + "epoch": 0.9081458620969707, + "grad_norm": 7.989031039920255, + "learning_rate": 1.0982530333154245e-07, + "loss": 0.6907, + "step": 11182 + }, + { + "epoch": 0.9082270770730123, + "grad_norm": 5.9615319524726695, + "learning_rate": 1.0963261141748616e-07, + "loss": 0.3874, + "step": 11183 + }, + { + "epoch": 0.9083082920490538, + "grad_norm": 4.85080597593589, + "learning_rate": 1.0944008490420183e-07, + "loss": 0.6927, + "step": 11184 + }, + { + "epoch": 0.9083895070250955, + "grad_norm": 12.043245022297707, + "learning_rate": 1.0924772380501215e-07, + "loss": 0.4814, + "step": 11185 + }, + { + "epoch": 0.908470722001137, + "grad_norm": 3.70058521293832, + "learning_rate": 1.0905552813322701e-07, + "loss": 0.43, + "step": 11186 + }, + { + "epoch": 0.9085519369771786, + "grad_norm": 5.677961162801236, + "learning_rate": 1.0886349790214495e-07, + "loss": 0.4644, + "step": 11187 + }, + { + "epoch": 0.9086331519532201, + "grad_norm": 5.958066806913473, + "learning_rate": 1.0867163312505452e-07, + "loss": 0.5697, + "step": 11188 + }, + { + "epoch": 0.9087143669292618, + "grad_norm": 8.122121992747614, + "learning_rate": 1.084799338152312e-07, + "loss": 0.4075, + "step": 11189 + }, + { + "epoch": 0.9087955819053033, + "grad_norm": 3.68707963935213, + "learning_rate": 1.082883999859391e-07, + "loss": 0.61, + "step": 11190 + }, + { + "epoch": 0.9088767968813449, + "grad_norm": 6.469919999736764, + "learning_rate": 1.0809703165043206e-07, + "loss": 0.4208, + "step": 11191 + }, + { + "epoch": 0.9089580118573864, + "grad_norm": 13.337941491052447, + "learning_rate": 1.0790582882195172e-07, + "loss": 0.3323, + "step": 11192 + }, + { + "epoch": 0.9090392268334281, + "grad_norm": 5.027652600451383, + "learning_rate": 1.0771479151372749e-07, + "loss": 0.3798, + "step": 11193 + }, + { + "epoch": 0.9091204418094697, + "grad_norm": 19.702276711092185, + "learning_rate": 1.0752391973897852e-07, + "loss": 0.4353, + "step": 11194 + }, + { + "epoch": 0.9092016567855112, + "grad_norm": 6.736698974070218, + "learning_rate": 1.0733321351091286e-07, + "loss": 0.4836, + "step": 11195 + }, + { + "epoch": 0.9092828717615529, + "grad_norm": 8.31372301888567, + "learning_rate": 1.071426728427255e-07, + "loss": 0.4187, + "step": 11196 + }, + { + "epoch": 0.9093640867375944, + "grad_norm": 5.818025708926807, + "learning_rate": 1.0695229774760147e-07, + "loss": 0.4106, + "step": 11197 + }, + { + "epoch": 0.909445301713636, + "grad_norm": 4.4508095887650105, + "learning_rate": 1.0676208823871326e-07, + "loss": 0.545, + "step": 11198 + }, + { + "epoch": 0.9095265166896775, + "grad_norm": 8.123684613091736, + "learning_rate": 1.065720443292223e-07, + "loss": 0.3934, + "step": 11199 + }, + { + "epoch": 0.9096077316657192, + "grad_norm": 5.968329057688115, + "learning_rate": 1.0638216603227892e-07, + "loss": 0.5218, + "step": 11200 + }, + { + "epoch": 0.9096889466417607, + "grad_norm": 8.367605133725334, + "learning_rate": 1.0619245336102174e-07, + "loss": 0.4462, + "step": 11201 + }, + { + "epoch": 0.9097701616178023, + "grad_norm": 7.432108062190521, + "learning_rate": 1.060029063285778e-07, + "loss": 0.5753, + "step": 11202 + }, + { + "epoch": 0.9098513765938439, + "grad_norm": 5.076717044768981, + "learning_rate": 1.0581352494806241e-07, + "loss": 0.3382, + "step": 11203 + }, + { + "epoch": 0.9099325915698855, + "grad_norm": 6.177656873247412, + "learning_rate": 1.0562430923258037e-07, + "loss": 0.5211, + "step": 11204 + }, + { + "epoch": 0.9100138065459271, + "grad_norm": 6.778475917464361, + "learning_rate": 1.0543525919522401e-07, + "loss": 0.4251, + "step": 11205 + }, + { + "epoch": 0.9100950215219686, + "grad_norm": 6.121707124021081, + "learning_rate": 1.0524637484907424e-07, + "loss": 0.3896, + "step": 11206 + }, + { + "epoch": 0.9101762364980103, + "grad_norm": 6.136173762658393, + "learning_rate": 1.0505765620720143e-07, + "loss": 0.6354, + "step": 11207 + }, + { + "epoch": 0.9102574514740518, + "grad_norm": 7.9613881203228525, + "learning_rate": 1.0486910328266403e-07, + "loss": 0.3988, + "step": 11208 + }, + { + "epoch": 0.9103386664500934, + "grad_norm": 5.222762969931984, + "learning_rate": 1.0468071608850827e-07, + "loss": 0.537, + "step": 11209 + }, + { + "epoch": 0.910419881426135, + "grad_norm": 4.889505356591718, + "learning_rate": 1.0449249463777039e-07, + "loss": 0.41, + "step": 11210 + }, + { + "epoch": 0.9105010964021766, + "grad_norm": 5.393977337842649, + "learning_rate": 1.0430443894347358e-07, + "loss": 0.8052, + "step": 11211 + }, + { + "epoch": 0.9105823113782181, + "grad_norm": 4.927676914766112, + "learning_rate": 1.041165490186305e-07, + "loss": 0.4754, + "step": 11212 + }, + { + "epoch": 0.9106635263542597, + "grad_norm": 4.738799420453396, + "learning_rate": 1.0392882487624212e-07, + "loss": 0.5263, + "step": 11213 + }, + { + "epoch": 0.9107447413303013, + "grad_norm": 3.4587373103230474, + "learning_rate": 1.0374126652929805e-07, + "loss": 0.3904, + "step": 11214 + }, + { + "epoch": 0.9108259563063429, + "grad_norm": 4.320689046060023, + "learning_rate": 1.0355387399077627e-07, + "loss": 0.5351, + "step": 11215 + }, + { + "epoch": 0.9109071712823845, + "grad_norm": 6.811312905006839, + "learning_rate": 1.033666472736436e-07, + "loss": 0.4286, + "step": 11216 + }, + { + "epoch": 0.910988386258426, + "grad_norm": 3.4936475176998845, + "learning_rate": 1.0317958639085524e-07, + "loss": 0.5945, + "step": 11217 + }, + { + "epoch": 0.9110696012344677, + "grad_norm": 4.327867349353077, + "learning_rate": 1.0299269135535416e-07, + "loss": 0.3791, + "step": 11218 + }, + { + "epoch": 0.9111508162105092, + "grad_norm": 7.91773166597089, + "learning_rate": 1.0280596218007254e-07, + "loss": 0.4511, + "step": 11219 + }, + { + "epoch": 0.9112320311865508, + "grad_norm": 5.982009386221902, + "learning_rate": 1.0261939887793143e-07, + "loss": 0.4298, + "step": 11220 + }, + { + "epoch": 0.9113132461625923, + "grad_norm": 9.681222480637057, + "learning_rate": 1.0243300146184048e-07, + "loss": 0.5457, + "step": 11221 + }, + { + "epoch": 0.911394461138634, + "grad_norm": 8.645718538389323, + "learning_rate": 1.0224676994469635e-07, + "loss": 0.4311, + "step": 11222 + }, + { + "epoch": 0.9114756761146755, + "grad_norm": 6.818193527467812, + "learning_rate": 1.020607043393862e-07, + "loss": 0.4284, + "step": 11223 + }, + { + "epoch": 0.9115568910907171, + "grad_norm": 4.876516812885544, + "learning_rate": 1.0187480465878418e-07, + "loss": 0.4452, + "step": 11224 + }, + { + "epoch": 0.9116381060667587, + "grad_norm": 4.240926353476526, + "learning_rate": 1.0168907091575364e-07, + "loss": 0.5541, + "step": 11225 + }, + { + "epoch": 0.9117193210428003, + "grad_norm": 5.868350340007438, + "learning_rate": 1.015035031231465e-07, + "loss": 0.4558, + "step": 11226 + }, + { + "epoch": 0.9118005360188419, + "grad_norm": 3.823767663089306, + "learning_rate": 1.0131810129380332e-07, + "loss": 0.6506, + "step": 11227 + }, + { + "epoch": 0.9118817509948834, + "grad_norm": 7.147187550366974, + "learning_rate": 1.0113286544055245e-07, + "loss": 0.4312, + "step": 11228 + }, + { + "epoch": 0.9119629659709251, + "grad_norm": 50.85174427248752, + "learning_rate": 1.0094779557621171e-07, + "loss": 0.4037, + "step": 11229 + }, + { + "epoch": 0.9120441809469666, + "grad_norm": 10.89692418155859, + "learning_rate": 1.0076289171358695e-07, + "loss": 0.5095, + "step": 11230 + }, + { + "epoch": 0.9121253959230082, + "grad_norm": 6.406674309261182, + "learning_rate": 1.0057815386547181e-07, + "loss": 0.6489, + "step": 11231 + }, + { + "epoch": 0.9122066108990498, + "grad_norm": 4.4242464166233955, + "learning_rate": 1.0039358204464943e-07, + "loss": 0.5091, + "step": 11232 + }, + { + "epoch": 0.9122878258750914, + "grad_norm": 4.743762192538487, + "learning_rate": 1.0020917626389209e-07, + "loss": 0.4469, + "step": 11233 + }, + { + "epoch": 0.9123690408511329, + "grad_norm": 4.14370710399411, + "learning_rate": 1.0002493653595902e-07, + "loss": 0.39, + "step": 11234 + }, + { + "epoch": 0.9124502558271745, + "grad_norm": 14.280052275876619, + "learning_rate": 9.984086287359806e-08, + "loss": 0.5105, + "step": 11235 + }, + { + "epoch": 0.9125314708032161, + "grad_norm": 6.244441893382094, + "learning_rate": 9.965695528954711e-08, + "loss": 0.357, + "step": 11236 + }, + { + "epoch": 0.9126126857792577, + "grad_norm": 7.898334173798384, + "learning_rate": 9.947321379653152e-08, + "loss": 0.4236, + "step": 11237 + }, + { + "epoch": 0.9126939007552993, + "grad_norm": 4.788515846324651, + "learning_rate": 9.928963840726418e-08, + "loss": 0.5023, + "step": 11238 + }, + { + "epoch": 0.9127751157313408, + "grad_norm": 25.04518107661059, + "learning_rate": 9.910622913444856e-08, + "loss": 0.5852, + "step": 11239 + }, + { + "epoch": 0.9128563307073825, + "grad_norm": 4.841830459052142, + "learning_rate": 9.89229859907756e-08, + "loss": 0.4615, + "step": 11240 + }, + { + "epoch": 0.912937545683424, + "grad_norm": 7.939078029749193, + "learning_rate": 9.873990898892405e-08, + "loss": 0.4802, + "step": 11241 + }, + { + "epoch": 0.9130187606594656, + "grad_norm": 9.609761146624544, + "learning_rate": 9.855699814156266e-08, + "loss": 0.4418, + "step": 11242 + }, + { + "epoch": 0.9130999756355072, + "grad_norm": 4.6087214575599145, + "learning_rate": 9.837425346134771e-08, + "loss": 0.5425, + "step": 11243 + }, + { + "epoch": 0.9131811906115488, + "grad_norm": 5.893925469137472, + "learning_rate": 9.819167496092352e-08, + "loss": 0.4984, + "step": 11244 + }, + { + "epoch": 0.9132624055875903, + "grad_norm": 5.730428639464196, + "learning_rate": 9.800926265292415e-08, + "loss": 0.4071, + "step": 11245 + }, + { + "epoch": 0.9133436205636319, + "grad_norm": 3.5844043953410853, + "learning_rate": 9.782701654997145e-08, + "loss": 0.5314, + "step": 11246 + }, + { + "epoch": 0.9134248355396735, + "grad_norm": 4.657341875734708, + "learning_rate": 9.764493666467589e-08, + "loss": 0.5415, + "step": 11247 + }, + { + "epoch": 0.9135060505157151, + "grad_norm": 5.362707479330835, + "learning_rate": 9.746302300963656e-08, + "loss": 0.5306, + "step": 11248 + }, + { + "epoch": 0.9135872654917567, + "grad_norm": 5.144669261464033, + "learning_rate": 9.728127559744089e-08, + "loss": 0.4642, + "step": 11249 + }, + { + "epoch": 0.9136684804677982, + "grad_norm": 8.491237111181297, + "learning_rate": 9.709969444066436e-08, + "loss": 0.4926, + "step": 11250 + }, + { + "epoch": 0.9137496954438399, + "grad_norm": 4.176442395340095, + "learning_rate": 9.691827955187222e-08, + "loss": 0.6502, + "step": 11251 + }, + { + "epoch": 0.9138309104198814, + "grad_norm": 9.913453531194941, + "learning_rate": 9.673703094361664e-08, + "loss": 0.609, + "step": 11252 + }, + { + "epoch": 0.913912125395923, + "grad_norm": 4.875302448335454, + "learning_rate": 9.655594862843953e-08, + "loss": 0.3666, + "step": 11253 + }, + { + "epoch": 0.9139933403719646, + "grad_norm": 4.516782702007909, + "learning_rate": 9.63750326188706e-08, + "loss": 0.4876, + "step": 11254 + }, + { + "epoch": 0.9140745553480062, + "grad_norm": 9.6025358191127, + "learning_rate": 9.619428292742872e-08, + "loss": 0.4328, + "step": 11255 + }, + { + "epoch": 0.9141557703240477, + "grad_norm": 6.319994193854443, + "learning_rate": 9.601369956662054e-08, + "loss": 0.5117, + "step": 11256 + }, + { + "epoch": 0.9142369853000893, + "grad_norm": 5.66199645711789, + "learning_rate": 9.583328254894109e-08, + "loss": 0.657, + "step": 11257 + }, + { + "epoch": 0.9143182002761309, + "grad_norm": 3.0411540989807317, + "learning_rate": 9.565303188687453e-08, + "loss": 0.6704, + "step": 11258 + }, + { + "epoch": 0.9143994152521725, + "grad_norm": 4.937602813989695, + "learning_rate": 9.547294759289366e-08, + "loss": 0.3464, + "step": 11259 + }, + { + "epoch": 0.9144806302282141, + "grad_norm": 4.5620361618138885, + "learning_rate": 9.52930296794588e-08, + "loss": 0.4903, + "step": 11260 + }, + { + "epoch": 0.9145618452042557, + "grad_norm": 4.986561186017303, + "learning_rate": 9.511327815902e-08, + "loss": 0.5204, + "step": 11261 + }, + { + "epoch": 0.9146430601802973, + "grad_norm": 3.2596608136193366, + "learning_rate": 9.493369304401423e-08, + "loss": 0.3049, + "step": 11262 + }, + { + "epoch": 0.9147242751563388, + "grad_norm": 3.798410117128246, + "learning_rate": 9.475427434686824e-08, + "loss": 0.5748, + "step": 11263 + }, + { + "epoch": 0.9148054901323804, + "grad_norm": 6.954981893846417, + "learning_rate": 9.457502207999736e-08, + "loss": 0.5748, + "step": 11264 + }, + { + "epoch": 0.914886705108422, + "grad_norm": 3.8501120004641143, + "learning_rate": 9.43959362558039e-08, + "loss": 0.525, + "step": 11265 + }, + { + "epoch": 0.9149679200844636, + "grad_norm": 4.333034848842079, + "learning_rate": 9.421701688668017e-08, + "loss": 0.4368, + "step": 11266 + }, + { + "epoch": 0.9150491350605051, + "grad_norm": 5.609179824260918, + "learning_rate": 9.403826398500654e-08, + "loss": 0.4947, + "step": 11267 + }, + { + "epoch": 0.9151303500365467, + "grad_norm": 4.516842058122665, + "learning_rate": 9.385967756315201e-08, + "loss": 0.5003, + "step": 11268 + }, + { + "epoch": 0.9152115650125883, + "grad_norm": 5.024303496834808, + "learning_rate": 9.368125763347336e-08, + "loss": 0.502, + "step": 11269 + }, + { + "epoch": 0.9152927799886299, + "grad_norm": 3.8890995639637858, + "learning_rate": 9.350300420831599e-08, + "loss": 0.4421, + "step": 11270 + }, + { + "epoch": 0.9153739949646715, + "grad_norm": 11.469385591277504, + "learning_rate": 9.332491730001448e-08, + "loss": 0.6776, + "step": 11271 + }, + { + "epoch": 0.915455209940713, + "grad_norm": 8.781322680809653, + "learning_rate": 9.314699692089202e-08, + "loss": 0.5468, + "step": 11272 + }, + { + "epoch": 0.9155364249167547, + "grad_norm": 5.122536771418473, + "learning_rate": 9.296924308325905e-08, + "loss": 0.5422, + "step": 11273 + }, + { + "epoch": 0.9156176398927962, + "grad_norm": 5.107338953835333, + "learning_rate": 9.279165579941546e-08, + "loss": 0.5195, + "step": 11274 + }, + { + "epoch": 0.9156988548688378, + "grad_norm": 7.603688810286707, + "learning_rate": 9.261423508164947e-08, + "loss": 0.4056, + "step": 11275 + }, + { + "epoch": 0.9157800698448794, + "grad_norm": 3.231334009016635, + "learning_rate": 9.243698094223735e-08, + "loss": 0.3926, + "step": 11276 + }, + { + "epoch": 0.915861284820921, + "grad_norm": 4.19277721624568, + "learning_rate": 9.225989339344432e-08, + "loss": 0.5653, + "step": 11277 + }, + { + "epoch": 0.9159424997969625, + "grad_norm": 4.727861994059561, + "learning_rate": 9.208297244752362e-08, + "loss": 0.6066, + "step": 11278 + }, + { + "epoch": 0.9160237147730041, + "grad_norm": 4.796597984182049, + "learning_rate": 9.190621811671769e-08, + "loss": 0.4402, + "step": 11279 + }, + { + "epoch": 0.9161049297490457, + "grad_norm": 17.756697315100613, + "learning_rate": 9.1729630413257e-08, + "loss": 0.4518, + "step": 11280 + }, + { + "epoch": 0.9161861447250873, + "grad_norm": 6.095353870840864, + "learning_rate": 9.155320934936041e-08, + "loss": 0.5031, + "step": 11281 + }, + { + "epoch": 0.9162673597011289, + "grad_norm": 6.9639308227226, + "learning_rate": 9.137695493723481e-08, + "loss": 0.4176, + "step": 11282 + }, + { + "epoch": 0.9163485746771705, + "grad_norm": 4.5604405756876485, + "learning_rate": 9.120086718907657e-08, + "loss": 0.4873, + "step": 11283 + }, + { + "epoch": 0.9164297896532121, + "grad_norm": 5.467910329584174, + "learning_rate": 9.10249461170698e-08, + "loss": 0.3554, + "step": 11284 + }, + { + "epoch": 0.9165110046292536, + "grad_norm": 5.007337212383075, + "learning_rate": 9.084919173338758e-08, + "loss": 0.499, + "step": 11285 + }, + { + "epoch": 0.9165922196052952, + "grad_norm": 9.584147277340062, + "learning_rate": 9.067360405019099e-08, + "loss": 0.4328, + "step": 11286 + }, + { + "epoch": 0.9166734345813368, + "grad_norm": 4.7954029212995355, + "learning_rate": 9.049818307963004e-08, + "loss": 0.4119, + "step": 11287 + }, + { + "epoch": 0.9167546495573784, + "grad_norm": 7.375196976362717, + "learning_rate": 9.03229288338428e-08, + "loss": 0.4901, + "step": 11288 + }, + { + "epoch": 0.9168358645334199, + "grad_norm": 5.606112983529189, + "learning_rate": 9.014784132495542e-08, + "loss": 0.5983, + "step": 11289 + }, + { + "epoch": 0.9169170795094616, + "grad_norm": 5.297505673484214, + "learning_rate": 8.997292056508372e-08, + "loss": 0.3459, + "step": 11290 + }, + { + "epoch": 0.9169982944855031, + "grad_norm": 9.227347098755619, + "learning_rate": 8.979816656633084e-08, + "loss": 0.4612, + "step": 11291 + }, + { + "epoch": 0.9170795094615447, + "grad_norm": 2.966819498823606, + "learning_rate": 8.962357934078874e-08, + "loss": 0.5246, + "step": 11292 + }, + { + "epoch": 0.9171607244375863, + "grad_norm": 6.824208201533333, + "learning_rate": 8.944915890053891e-08, + "loss": 0.4562, + "step": 11293 + }, + { + "epoch": 0.9172419394136279, + "grad_norm": 5.376195914843917, + "learning_rate": 8.927490525764942e-08, + "loss": 0.6399, + "step": 11294 + }, + { + "epoch": 0.9173231543896695, + "grad_norm": 4.79660630950142, + "learning_rate": 8.910081842417761e-08, + "loss": 0.5716, + "step": 11295 + }, + { + "epoch": 0.917404369365711, + "grad_norm": 4.642397495613151, + "learning_rate": 8.892689841216995e-08, + "loss": 0.5504, + "step": 11296 + }, + { + "epoch": 0.9174855843417526, + "grad_norm": 3.4372668067850913, + "learning_rate": 8.875314523366014e-08, + "loss": 0.5241, + "step": 11297 + }, + { + "epoch": 0.9175667993177942, + "grad_norm": 5.952139631988675, + "learning_rate": 8.857955890067132e-08, + "loss": 0.4511, + "step": 11298 + }, + { + "epoch": 0.9176480142938358, + "grad_norm": 6.811625003043075, + "learning_rate": 8.840613942521503e-08, + "loss": 0.4252, + "step": 11299 + }, + { + "epoch": 0.9177292292698773, + "grad_norm": 5.451600879549154, + "learning_rate": 8.823288681929082e-08, + "loss": 0.4875, + "step": 11300 + }, + { + "epoch": 0.917810444245919, + "grad_norm": 4.68192691825503, + "learning_rate": 8.80598010948866e-08, + "loss": 0.6457, + "step": 11301 + }, + { + "epoch": 0.9178916592219605, + "grad_norm": 4.012662107572786, + "learning_rate": 8.788688226397917e-08, + "loss": 0.4899, + "step": 11302 + }, + { + "epoch": 0.9179728741980021, + "grad_norm": 6.570602578436052, + "learning_rate": 8.771413033853343e-08, + "loss": 0.5515, + "step": 11303 + }, + { + "epoch": 0.9180540891740437, + "grad_norm": 7.278974590840044, + "learning_rate": 8.754154533050285e-08, + "loss": 0.3594, + "step": 11304 + }, + { + "epoch": 0.9181353041500853, + "grad_norm": 5.819321823620186, + "learning_rate": 8.736912725182983e-08, + "loss": 0.4738, + "step": 11305 + }, + { + "epoch": 0.9182165191261269, + "grad_norm": 4.523374291477625, + "learning_rate": 8.719687611444483e-08, + "loss": 0.3854, + "step": 11306 + }, + { + "epoch": 0.9182977341021684, + "grad_norm": 3.277263217530252, + "learning_rate": 8.702479193026608e-08, + "loss": 0.4129, + "step": 11307 + }, + { + "epoch": 0.91837894907821, + "grad_norm": 4.810473743314004, + "learning_rate": 8.68528747112013e-08, + "loss": 0.3563, + "step": 11308 + }, + { + "epoch": 0.9184601640542516, + "grad_norm": 4.361251259939714, + "learning_rate": 8.668112446914622e-08, + "loss": 0.4657, + "step": 11309 + }, + { + "epoch": 0.9185413790302932, + "grad_norm": 5.725837124786675, + "learning_rate": 8.650954121598471e-08, + "loss": 0.4554, + "step": 11310 + }, + { + "epoch": 0.9186225940063347, + "grad_norm": 7.948527051871499, + "learning_rate": 8.633812496358973e-08, + "loss": 0.4677, + "step": 11311 + }, + { + "epoch": 0.9187038089823764, + "grad_norm": 8.490193251568819, + "learning_rate": 8.616687572382293e-08, + "loss": 0.4757, + "step": 11312 + }, + { + "epoch": 0.9187850239584179, + "grad_norm": 10.643140675212129, + "learning_rate": 8.599579350853288e-08, + "loss": 0.5886, + "step": 11313 + }, + { + "epoch": 0.9188662389344595, + "grad_norm": 5.12172366794015, + "learning_rate": 8.582487832955788e-08, + "loss": 0.4511, + "step": 11314 + }, + { + "epoch": 0.9189474539105011, + "grad_norm": 4.267431188352919, + "learning_rate": 8.565413019872488e-08, + "loss": 0.4457, + "step": 11315 + }, + { + "epoch": 0.9190286688865427, + "grad_norm": 6.46937742703463, + "learning_rate": 8.548354912784801e-08, + "loss": 0.5193, + "step": 11316 + }, + { + "epoch": 0.9191098838625843, + "grad_norm": 10.178148390235016, + "learning_rate": 8.531313512873063e-08, + "loss": 0.4415, + "step": 11317 + }, + { + "epoch": 0.9191910988386258, + "grad_norm": 5.783583498365241, + "learning_rate": 8.514288821316524e-08, + "loss": 0.3627, + "step": 11318 + }, + { + "epoch": 0.9192723138146675, + "grad_norm": 4.717466038627756, + "learning_rate": 8.497280839293159e-08, + "loss": 0.4085, + "step": 11319 + }, + { + "epoch": 0.919353528790709, + "grad_norm": 4.386734491499693, + "learning_rate": 8.480289567979776e-08, + "loss": 0.5006, + "step": 11320 + }, + { + "epoch": 0.9194347437667506, + "grad_norm": 4.236143189731592, + "learning_rate": 8.463315008552158e-08, + "loss": 0.5219, + "step": 11321 + }, + { + "epoch": 0.9195159587427921, + "grad_norm": 6.377920595401447, + "learning_rate": 8.446357162184838e-08, + "loss": 0.5412, + "step": 11322 + }, + { + "epoch": 0.9195971737188338, + "grad_norm": 4.393949522713649, + "learning_rate": 8.429416030051179e-08, + "loss": 0.6075, + "step": 11323 + }, + { + "epoch": 0.9196783886948753, + "grad_norm": 3.2693096226160097, + "learning_rate": 8.412491613323415e-08, + "loss": 0.4949, + "step": 11324 + }, + { + "epoch": 0.9197596036709169, + "grad_norm": 5.690081161082877, + "learning_rate": 8.39558391317269e-08, + "loss": 0.333, + "step": 11325 + }, + { + "epoch": 0.9198408186469585, + "grad_norm": 4.591799853216551, + "learning_rate": 8.378692930768873e-08, + "loss": 0.5179, + "step": 11326 + }, + { + "epoch": 0.9199220336230001, + "grad_norm": 3.4431361012922017, + "learning_rate": 8.361818667280724e-08, + "loss": 0.429, + "step": 11327 + }, + { + "epoch": 0.9200032485990417, + "grad_norm": 7.374506756234914, + "learning_rate": 8.344961123875895e-08, + "loss": 0.4514, + "step": 11328 + }, + { + "epoch": 0.9200844635750832, + "grad_norm": 3.5074173112977345, + "learning_rate": 8.328120301720783e-08, + "loss": 0.5645, + "step": 11329 + }, + { + "epoch": 0.9201656785511249, + "grad_norm": 4.89262703046508, + "learning_rate": 8.311296201980734e-08, + "loss": 0.6032, + "step": 11330 + }, + { + "epoch": 0.9202468935271664, + "grad_norm": 3.5016861133769526, + "learning_rate": 8.294488825819875e-08, + "loss": 0.9195, + "step": 11331 + }, + { + "epoch": 0.920328108503208, + "grad_norm": 5.597223977106012, + "learning_rate": 8.277698174401189e-08, + "loss": 0.644, + "step": 11332 + }, + { + "epoch": 0.9204093234792495, + "grad_norm": 4.477628857500898, + "learning_rate": 8.260924248886471e-08, + "loss": 0.5054, + "step": 11333 + }, + { + "epoch": 0.9204905384552912, + "grad_norm": 5.7562752348647885, + "learning_rate": 8.244167050436402e-08, + "loss": 0.4682, + "step": 11334 + }, + { + "epoch": 0.9205717534313327, + "grad_norm": 5.880650217377458, + "learning_rate": 8.22742658021053e-08, + "loss": 0.6894, + "step": 11335 + }, + { + "epoch": 0.9206529684073743, + "grad_norm": 5.322917229821901, + "learning_rate": 8.210702839367146e-08, + "loss": 0.4346, + "step": 11336 + }, + { + "epoch": 0.920734183383416, + "grad_norm": 4.796792390299345, + "learning_rate": 8.193995829063467e-08, + "loss": 0.4673, + "step": 11337 + }, + { + "epoch": 0.9208153983594575, + "grad_norm": 8.87203052295575, + "learning_rate": 8.177305550455566e-08, + "loss": 0.444, + "step": 11338 + }, + { + "epoch": 0.9208966133354991, + "grad_norm": 4.841733687058009, + "learning_rate": 8.160632004698271e-08, + "loss": 0.4699, + "step": 11339 + }, + { + "epoch": 0.9209778283115406, + "grad_norm": 4.075654039554347, + "learning_rate": 8.143975192945325e-08, + "loss": 0.7124, + "step": 11340 + }, + { + "epoch": 0.9210590432875823, + "grad_norm": 4.698000621909273, + "learning_rate": 8.127335116349305e-08, + "loss": 0.4646, + "step": 11341 + }, + { + "epoch": 0.9211402582636238, + "grad_norm": 4.494572377526055, + "learning_rate": 8.110711776061597e-08, + "loss": 0.4942, + "step": 11342 + }, + { + "epoch": 0.9212214732396654, + "grad_norm": 7.279761243226819, + "learning_rate": 8.09410517323242e-08, + "loss": 0.4229, + "step": 11343 + }, + { + "epoch": 0.9213026882157069, + "grad_norm": 4.863071295173902, + "learning_rate": 8.077515309010936e-08, + "loss": 0.3941, + "step": 11344 + }, + { + "epoch": 0.9213839031917486, + "grad_norm": 5.892354030279855, + "learning_rate": 8.060942184545034e-08, + "loss": 0.5919, + "step": 11345 + }, + { + "epoch": 0.9214651181677901, + "grad_norm": 4.708709314273925, + "learning_rate": 8.044385800981464e-08, + "loss": 0.3786, + "step": 11346 + }, + { + "epoch": 0.9215463331438317, + "grad_norm": 8.147418646635652, + "learning_rate": 8.02784615946589e-08, + "loss": 0.5678, + "step": 11347 + }, + { + "epoch": 0.9216275481198734, + "grad_norm": 5.376975658563418, + "learning_rate": 8.011323261142734e-08, + "loss": 0.4418, + "step": 11348 + }, + { + "epoch": 0.9217087630959149, + "grad_norm": 5.665831071686028, + "learning_rate": 7.994817107155301e-08, + "loss": 0.4363, + "step": 11349 + }, + { + "epoch": 0.9217899780719565, + "grad_norm": 4.582354055457941, + "learning_rate": 7.978327698645705e-08, + "loss": 0.4622, + "step": 11350 + }, + { + "epoch": 0.921871193047998, + "grad_norm": 7.540612277290582, + "learning_rate": 7.961855036754978e-08, + "loss": 0.3925, + "step": 11351 + }, + { + "epoch": 0.9219524080240397, + "grad_norm": 4.845886462847071, + "learning_rate": 7.945399122622904e-08, + "loss": 0.494, + "step": 11352 + }, + { + "epoch": 0.9220336230000812, + "grad_norm": 3.6709111199072013, + "learning_rate": 7.928959957388154e-08, + "loss": 0.625, + "step": 11353 + }, + { + "epoch": 0.9221148379761228, + "grad_norm": 4.457865490095833, + "learning_rate": 7.912537542188264e-08, + "loss": 0.4271, + "step": 11354 + }, + { + "epoch": 0.9221960529521643, + "grad_norm": 6.9852450558056365, + "learning_rate": 7.89613187815949e-08, + "loss": 0.5399, + "step": 11355 + }, + { + "epoch": 0.922277267928206, + "grad_norm": 5.777646011684341, + "learning_rate": 7.879742966437092e-08, + "loss": 0.4892, + "step": 11356 + }, + { + "epoch": 0.9223584829042475, + "grad_norm": 3.543496182377441, + "learning_rate": 7.86337080815508e-08, + "loss": 0.5512, + "step": 11357 + }, + { + "epoch": 0.9224396978802891, + "grad_norm": 4.258648901552076, + "learning_rate": 7.847015404446352e-08, + "loss": 0.3128, + "step": 11358 + }, + { + "epoch": 0.9225209128563308, + "grad_norm": 5.717748386729547, + "learning_rate": 7.830676756442529e-08, + "loss": 0.5077, + "step": 11359 + }, + { + "epoch": 0.9226021278323723, + "grad_norm": 5.465873996372227, + "learning_rate": 7.814354865274237e-08, + "loss": 0.4398, + "step": 11360 + }, + { + "epoch": 0.9226833428084139, + "grad_norm": 9.321969343150906, + "learning_rate": 7.798049732070822e-08, + "loss": 0.5184, + "step": 11361 + }, + { + "epoch": 0.9227645577844554, + "grad_norm": 6.551159045640905, + "learning_rate": 7.78176135796052e-08, + "loss": 0.4808, + "step": 11362 + }, + { + "epoch": 0.9228457727604971, + "grad_norm": 6.18279810098398, + "learning_rate": 7.765489744070459e-08, + "loss": 0.495, + "step": 11363 + }, + { + "epoch": 0.9229269877365386, + "grad_norm": 4.489976108293694, + "learning_rate": 7.749234891526486e-08, + "loss": 0.5977, + "step": 11364 + }, + { + "epoch": 0.9230082027125802, + "grad_norm": 4.0322329291396475, + "learning_rate": 7.732996801453313e-08, + "loss": 0.7121, + "step": 11365 + }, + { + "epoch": 0.9230894176886217, + "grad_norm": 7.911313584451032, + "learning_rate": 7.716775474974625e-08, + "loss": 0.3647, + "step": 11366 + }, + { + "epoch": 0.9231706326646634, + "grad_norm": 4.246785998863406, + "learning_rate": 7.70057091321283e-08, + "loss": 0.3889, + "step": 11367 + }, + { + "epoch": 0.9232518476407049, + "grad_norm": 21.982620189789714, + "learning_rate": 7.684383117289141e-08, + "loss": 0.5681, + "step": 11368 + }, + { + "epoch": 0.9233330626167465, + "grad_norm": 16.563097433915587, + "learning_rate": 7.66821208832369e-08, + "loss": 0.4741, + "step": 11369 + }, + { + "epoch": 0.9234142775927882, + "grad_norm": 4.738887055673006, + "learning_rate": 7.652057827435444e-08, + "loss": 0.5937, + "step": 11370 + }, + { + "epoch": 0.9234954925688297, + "grad_norm": 5.91709828726191, + "learning_rate": 7.635920335742203e-08, + "loss": 0.458, + "step": 11371 + }, + { + "epoch": 0.9235767075448713, + "grad_norm": 4.025266759061047, + "learning_rate": 7.619799614360573e-08, + "loss": 0.5648, + "step": 11372 + }, + { + "epoch": 0.9236579225209128, + "grad_norm": 5.402057661799795, + "learning_rate": 7.603695664406053e-08, + "loss": 0.4054, + "step": 11373 + }, + { + "epoch": 0.9237391374969545, + "grad_norm": 5.575279520689622, + "learning_rate": 7.587608486992915e-08, + "loss": 0.4741, + "step": 11374 + }, + { + "epoch": 0.923820352472996, + "grad_norm": 3.7480482517554115, + "learning_rate": 7.571538083234298e-08, + "loss": 0.4311, + "step": 11375 + }, + { + "epoch": 0.9239015674490376, + "grad_norm": 3.873434245465527, + "learning_rate": 7.555484454242229e-08, + "loss": 0.5094, + "step": 11376 + }, + { + "epoch": 0.9239827824250791, + "grad_norm": 4.404320153846423, + "learning_rate": 7.539447601127542e-08, + "loss": 0.4184, + "step": 11377 + }, + { + "epoch": 0.9240639974011208, + "grad_norm": 4.253052401961252, + "learning_rate": 7.523427524999822e-08, + "loss": 0.4376, + "step": 11378 + }, + { + "epoch": 0.9241452123771623, + "grad_norm": 7.182394888454542, + "learning_rate": 7.507424226967681e-08, + "loss": 0.4695, + "step": 11379 + }, + { + "epoch": 0.9242264273532039, + "grad_norm": 6.303359290206109, + "learning_rate": 7.491437708138372e-08, + "loss": 0.5214, + "step": 11380 + }, + { + "epoch": 0.9243076423292456, + "grad_norm": 4.296067574108232, + "learning_rate": 7.475467969618122e-08, + "loss": 0.4966, + "step": 11381 + }, + { + "epoch": 0.9243888573052871, + "grad_norm": 3.9154299069883187, + "learning_rate": 7.459515012511937e-08, + "loss": 0.6921, + "step": 11382 + }, + { + "epoch": 0.9244700722813287, + "grad_norm": 3.5610883326964538, + "learning_rate": 7.443578837923709e-08, + "loss": 0.4532, + "step": 11383 + }, + { + "epoch": 0.9245512872573702, + "grad_norm": 4.0069328169869625, + "learning_rate": 7.427659446956087e-08, + "loss": 0.6126, + "step": 11384 + }, + { + "epoch": 0.9246325022334119, + "grad_norm": 3.4747848677201554, + "learning_rate": 7.41175684071066e-08, + "loss": 0.5281, + "step": 11385 + }, + { + "epoch": 0.9247137172094534, + "grad_norm": 4.640308832890803, + "learning_rate": 7.39587102028777e-08, + "loss": 0.448, + "step": 11386 + }, + { + "epoch": 0.924794932185495, + "grad_norm": 4.482953706151723, + "learning_rate": 7.38000198678665e-08, + "loss": 0.5122, + "step": 11387 + }, + { + "epoch": 0.9248761471615365, + "grad_norm": 7.944047198700474, + "learning_rate": 7.36414974130531e-08, + "loss": 0.4532, + "step": 11388 + }, + { + "epoch": 0.9249573621375782, + "grad_norm": 3.2439380546377583, + "learning_rate": 7.348314284940706e-08, + "loss": 0.4904, + "step": 11389 + }, + { + "epoch": 0.9250385771136197, + "grad_norm": 4.141858638205531, + "learning_rate": 7.332495618788516e-08, + "loss": 0.5272, + "step": 11390 + }, + { + "epoch": 0.9251197920896613, + "grad_norm": 3.3315239217657773, + "learning_rate": 7.316693743943364e-08, + "loss": 0.4314, + "step": 11391 + }, + { + "epoch": 0.925201007065703, + "grad_norm": 3.9642024980963573, + "learning_rate": 7.300908661498602e-08, + "loss": 0.545, + "step": 11392 + }, + { + "epoch": 0.9252822220417445, + "grad_norm": 6.271707028230578, + "learning_rate": 7.28514037254649e-08, + "loss": 0.4979, + "step": 11393 + }, + { + "epoch": 0.9253634370177861, + "grad_norm": 6.887788740548668, + "learning_rate": 7.26938887817813e-08, + "loss": 0.5243, + "step": 11394 + }, + { + "epoch": 0.9254446519938276, + "grad_norm": 4.351861411172992, + "learning_rate": 7.2536541794834e-08, + "loss": 0.563, + "step": 11395 + }, + { + "epoch": 0.9255258669698693, + "grad_norm": 13.306179850830969, + "learning_rate": 7.237936277551095e-08, + "loss": 0.4579, + "step": 11396 + }, + { + "epoch": 0.9256070819459108, + "grad_norm": 8.741284990101086, + "learning_rate": 7.22223517346879e-08, + "loss": 0.4368, + "step": 11397 + }, + { + "epoch": 0.9256882969219524, + "grad_norm": 4.724782847856221, + "learning_rate": 7.206550868322947e-08, + "loss": 0.5499, + "step": 11398 + }, + { + "epoch": 0.925769511897994, + "grad_norm": 4.443264413967751, + "learning_rate": 7.190883363198815e-08, + "loss": 0.4174, + "step": 11399 + }, + { + "epoch": 0.9258507268740356, + "grad_norm": 10.053412856400966, + "learning_rate": 7.175232659180492e-08, + "loss": 0.6192, + "step": 11400 + }, + { + "epoch": 0.9259319418500771, + "grad_norm": 8.327053855258349, + "learning_rate": 7.159598757350922e-08, + "loss": 0.4117, + "step": 11401 + }, + { + "epoch": 0.9260131568261187, + "grad_norm": 4.431594842283679, + "learning_rate": 7.143981658791933e-08, + "loss": 0.4699, + "step": 11402 + }, + { + "epoch": 0.9260943718021604, + "grad_norm": 3.250722005433043, + "learning_rate": 7.128381364584075e-08, + "loss": 0.5342, + "step": 11403 + }, + { + "epoch": 0.9261755867782019, + "grad_norm": 2.957886228743401, + "learning_rate": 7.112797875806904e-08, + "loss": 0.4175, + "step": 11404 + }, + { + "epoch": 0.9262568017542435, + "grad_norm": 6.59101582446327, + "learning_rate": 7.09723119353864e-08, + "loss": 0.4268, + "step": 11405 + }, + { + "epoch": 0.926338016730285, + "grad_norm": 4.053080793203663, + "learning_rate": 7.081681318856392e-08, + "loss": 0.4142, + "step": 11406 + }, + { + "epoch": 0.9264192317063267, + "grad_norm": 48.79556971471609, + "learning_rate": 7.066148252836219e-08, + "loss": 0.4385, + "step": 11407 + }, + { + "epoch": 0.9265004466823682, + "grad_norm": 4.586752487959576, + "learning_rate": 7.050631996552842e-08, + "loss": 0.4813, + "step": 11408 + }, + { + "epoch": 0.9265816616584098, + "grad_norm": 5.714365886095958, + "learning_rate": 7.035132551079932e-08, + "loss": 0.3704, + "step": 11409 + }, + { + "epoch": 0.9266628766344513, + "grad_norm": 8.579523295772237, + "learning_rate": 7.019649917490018e-08, + "loss": 0.3965, + "step": 11410 + }, + { + "epoch": 0.926744091610493, + "grad_norm": 3.0236689032912842, + "learning_rate": 7.004184096854356e-08, + "loss": 0.572, + "step": 11411 + }, + { + "epoch": 0.9268253065865345, + "grad_norm": 6.812565797682661, + "learning_rate": 6.988735090243142e-08, + "loss": 0.4516, + "step": 11412 + }, + { + "epoch": 0.9269065215625761, + "grad_norm": 7.700307129522538, + "learning_rate": 6.973302898725303e-08, + "loss": 0.4734, + "step": 11413 + }, + { + "epoch": 0.9269877365386178, + "grad_norm": 4.263009068805309, + "learning_rate": 6.957887523368678e-08, + "loss": 0.4478, + "step": 11414 + }, + { + "epoch": 0.9270689515146593, + "grad_norm": 5.759822571490719, + "learning_rate": 6.942488965240024e-08, + "loss": 0.4749, + "step": 11415 + }, + { + "epoch": 0.9271501664907009, + "grad_norm": 7.781687526540662, + "learning_rate": 6.92710722540471e-08, + "loss": 0.398, + "step": 11416 + }, + { + "epoch": 0.9272313814667424, + "grad_norm": 6.317228165360641, + "learning_rate": 6.911742304927166e-08, + "loss": 0.3865, + "step": 11417 + }, + { + "epoch": 0.9273125964427841, + "grad_norm": 6.2711857470132095, + "learning_rate": 6.896394204870538e-08, + "loss": 0.4675, + "step": 11418 + }, + { + "epoch": 0.9273938114188256, + "grad_norm": 4.60041184087192, + "learning_rate": 6.881062926296783e-08, + "loss": 0.5341, + "step": 11419 + }, + { + "epoch": 0.9274750263948672, + "grad_norm": 5.061233791327525, + "learning_rate": 6.865748470266803e-08, + "loss": 0.5314, + "step": 11420 + }, + { + "epoch": 0.9275562413709088, + "grad_norm": 3.7105712181882007, + "learning_rate": 6.85045083784025e-08, + "loss": 0.3024, + "step": 11421 + }, + { + "epoch": 0.9276374563469504, + "grad_norm": 12.81065039260799, + "learning_rate": 6.835170030075638e-08, + "loss": 0.4527, + "step": 11422 + }, + { + "epoch": 0.9277186713229919, + "grad_norm": 6.058599141728342, + "learning_rate": 6.819906048030345e-08, + "loss": 0.4631, + "step": 11423 + }, + { + "epoch": 0.9277998862990335, + "grad_norm": 6.482547478325477, + "learning_rate": 6.804658892760552e-08, + "loss": 0.4521, + "step": 11424 + }, + { + "epoch": 0.9278811012750752, + "grad_norm": 14.54413960358652, + "learning_rate": 6.789428565321249e-08, + "loss": 0.3769, + "step": 11425 + }, + { + "epoch": 0.9279623162511167, + "grad_norm": 5.183303848496903, + "learning_rate": 6.774215066766344e-08, + "loss": 0.535, + "step": 11426 + }, + { + "epoch": 0.9280435312271583, + "grad_norm": 4.383546792589708, + "learning_rate": 6.759018398148464e-08, + "loss": 0.3777, + "step": 11427 + }, + { + "epoch": 0.9281247462031998, + "grad_norm": 14.039870214824262, + "learning_rate": 6.743838560519189e-08, + "loss": 0.508, + "step": 11428 + }, + { + "epoch": 0.9282059611792415, + "grad_norm": 6.608844886900423, + "learning_rate": 6.728675554928898e-08, + "loss": 0.3648, + "step": 11429 + }, + { + "epoch": 0.928287176155283, + "grad_norm": 4.663438602760716, + "learning_rate": 6.713529382426726e-08, + "loss": 0.3972, + "step": 11430 + }, + { + "epoch": 0.9283683911313246, + "grad_norm": 3.8710006305176226, + "learning_rate": 6.698400044060777e-08, + "loss": 0.4688, + "step": 11431 + }, + { + "epoch": 0.9284496061073662, + "grad_norm": 3.7449549283487324, + "learning_rate": 6.683287540877853e-08, + "loss": 0.539, + "step": 11432 + }, + { + "epoch": 0.9285308210834078, + "grad_norm": 4.662838653734012, + "learning_rate": 6.668191873923701e-08, + "loss": 0.5233, + "step": 11433 + }, + { + "epoch": 0.9286120360594493, + "grad_norm": 4.3917574513396955, + "learning_rate": 6.653113044242904e-08, + "loss": 0.5587, + "step": 11434 + }, + { + "epoch": 0.9286932510354909, + "grad_norm": 9.052354440821007, + "learning_rate": 6.638051052878736e-08, + "loss": 0.5373, + "step": 11435 + }, + { + "epoch": 0.9287744660115326, + "grad_norm": 3.219724415933315, + "learning_rate": 6.623005900873474e-08, + "loss": 0.4253, + "step": 11436 + }, + { + "epoch": 0.9288556809875741, + "grad_norm": 3.5215886145233584, + "learning_rate": 6.607977589268177e-08, + "loss": 0.447, + "step": 11437 + }, + { + "epoch": 0.9289368959636157, + "grad_norm": 6.121975994953795, + "learning_rate": 6.59296611910265e-08, + "loss": 0.4416, + "step": 11438 + }, + { + "epoch": 0.9290181109396572, + "grad_norm": 7.404459895688475, + "learning_rate": 6.577971491415674e-08, + "loss": 0.4477, + "step": 11439 + }, + { + "epoch": 0.9290993259156989, + "grad_norm": 9.284787320280776, + "learning_rate": 6.56299370724478e-08, + "loss": 0.5049, + "step": 11440 + }, + { + "epoch": 0.9291805408917404, + "grad_norm": 7.115335886705074, + "learning_rate": 6.548032767626333e-08, + "loss": 0.4512, + "step": 11441 + }, + { + "epoch": 0.929261755867782, + "grad_norm": 6.530284682735083, + "learning_rate": 6.533088673595589e-08, + "loss": 0.3873, + "step": 11442 + }, + { + "epoch": 0.9293429708438236, + "grad_norm": 5.05350752987569, + "learning_rate": 6.51816142618658e-08, + "loss": 0.5982, + "step": 11443 + }, + { + "epoch": 0.9294241858198652, + "grad_norm": 8.402220447701428, + "learning_rate": 6.503251026432179e-08, + "loss": 0.517, + "step": 11444 + }, + { + "epoch": 0.9295054007959067, + "grad_norm": 5.712552445076794, + "learning_rate": 6.48835747536411e-08, + "loss": 0.4236, + "step": 11445 + }, + { + "epoch": 0.9295866157719483, + "grad_norm": 8.723406889128468, + "learning_rate": 6.473480774012941e-08, + "loss": 0.4614, + "step": 11446 + }, + { + "epoch": 0.92966783074799, + "grad_norm": 3.518539374576921, + "learning_rate": 6.458620923408044e-08, + "loss": 0.697, + "step": 11447 + }, + { + "epoch": 0.9297490457240315, + "grad_norm": 4.996661458005611, + "learning_rate": 6.443777924577676e-08, + "loss": 0.7767, + "step": 11448 + }, + { + "epoch": 0.9298302607000731, + "grad_norm": 6.029619825429544, + "learning_rate": 6.428951778548881e-08, + "loss": 0.5338, + "step": 11449 + }, + { + "epoch": 0.9299114756761147, + "grad_norm": 6.072175958874135, + "learning_rate": 6.414142486347557e-08, + "loss": 0.53, + "step": 11450 + }, + { + "epoch": 0.9299926906521563, + "grad_norm": 4.106779337844846, + "learning_rate": 6.39935004899836e-08, + "loss": 0.5014, + "step": 11451 + }, + { + "epoch": 0.9300739056281978, + "grad_norm": 5.028928964661914, + "learning_rate": 6.38457446752494e-08, + "loss": 0.3896, + "step": 11452 + }, + { + "epoch": 0.9301551206042394, + "grad_norm": 3.916926797126131, + "learning_rate": 6.36981574294962e-08, + "loss": 0.4988, + "step": 11453 + }, + { + "epoch": 0.930236335580281, + "grad_norm": 11.573875343354155, + "learning_rate": 6.355073876293638e-08, + "loss": 0.4861, + "step": 11454 + }, + { + "epoch": 0.9303175505563226, + "grad_norm": 2.953512192237326, + "learning_rate": 6.340348868577123e-08, + "loss": 0.4556, + "step": 11455 + }, + { + "epoch": 0.9303987655323641, + "grad_norm": 5.593041199500861, + "learning_rate": 6.325640720818899e-08, + "loss": 0.4905, + "step": 11456 + }, + { + "epoch": 0.9304799805084057, + "grad_norm": 5.055425216937131, + "learning_rate": 6.310949434036707e-08, + "loss": 0.5173, + "step": 11457 + }, + { + "epoch": 0.9305611954844474, + "grad_norm": 5.280470361998423, + "learning_rate": 6.296275009247121e-08, + "loss": 0.4225, + "step": 11458 + }, + { + "epoch": 0.9306424104604889, + "grad_norm": 3.6832709176499785, + "learning_rate": 6.28161744746547e-08, + "loss": 0.453, + "step": 11459 + }, + { + "epoch": 0.9307236254365305, + "grad_norm": 7.653920651385478, + "learning_rate": 6.266976749706055e-08, + "loss": 0.4662, + "step": 11460 + }, + { + "epoch": 0.930804840412572, + "grad_norm": 5.873778228921937, + "learning_rate": 6.252352916981924e-08, + "loss": 0.4553, + "step": 11461 + }, + { + "epoch": 0.9308860553886137, + "grad_norm": 4.554092544444619, + "learning_rate": 6.237745950304963e-08, + "loss": 0.6166, + "step": 11462 + }, + { + "epoch": 0.9309672703646552, + "grad_norm": 5.080682795743635, + "learning_rate": 6.223155850685864e-08, + "loss": 0.5027, + "step": 11463 + }, + { + "epoch": 0.9310484853406968, + "grad_norm": 16.051986684845595, + "learning_rate": 6.208582619134234e-08, + "loss": 0.5546, + "step": 11464 + }, + { + "epoch": 0.9311297003167384, + "grad_norm": 7.21160759933066, + "learning_rate": 6.194026256658437e-08, + "loss": 0.412, + "step": 11465 + }, + { + "epoch": 0.93121091529278, + "grad_norm": 7.017201794326772, + "learning_rate": 6.179486764265663e-08, + "loss": 0.5635, + "step": 11466 + }, + { + "epoch": 0.9312921302688215, + "grad_norm": 6.919895696187786, + "learning_rate": 6.164964142962027e-08, + "loss": 0.4289, + "step": 11467 + }, + { + "epoch": 0.9313733452448631, + "grad_norm": 6.03620856801876, + "learning_rate": 6.15045839375239e-08, + "loss": 0.5504, + "step": 11468 + }, + { + "epoch": 0.9314545602209048, + "grad_norm": 6.279814421491574, + "learning_rate": 6.135969517640506e-08, + "loss": 0.3537, + "step": 11469 + }, + { + "epoch": 0.9315357751969463, + "grad_norm": 9.154160836609854, + "learning_rate": 6.12149751562885e-08, + "loss": 0.6001, + "step": 11470 + }, + { + "epoch": 0.9316169901729879, + "grad_norm": 6.53932524076683, + "learning_rate": 6.107042388718898e-08, + "loss": 0.3881, + "step": 11471 + }, + { + "epoch": 0.9316982051490295, + "grad_norm": 6.633007691995465, + "learning_rate": 6.092604137910768e-08, + "loss": 0.5065, + "step": 11472 + }, + { + "epoch": 0.9317794201250711, + "grad_norm": 5.199512015394908, + "learning_rate": 6.078182764203605e-08, + "loss": 0.5226, + "step": 11473 + }, + { + "epoch": 0.9318606351011126, + "grad_norm": 7.046681958143693, + "learning_rate": 6.063778268595278e-08, + "loss": 0.468, + "step": 11474 + }, + { + "epoch": 0.9319418500771542, + "grad_norm": 7.741942160235145, + "learning_rate": 6.04939065208246e-08, + "loss": 0.4611, + "step": 11475 + }, + { + "epoch": 0.9320230650531958, + "grad_norm": 6.7144122182681745, + "learning_rate": 6.035019915660717e-08, + "loss": 0.4898, + "step": 11476 + }, + { + "epoch": 0.9321042800292374, + "grad_norm": 4.677019846346351, + "learning_rate": 6.020666060324448e-08, + "loss": 0.4645, + "step": 11477 + }, + { + "epoch": 0.9321854950052789, + "grad_norm": 4.133353885157457, + "learning_rate": 6.006329087066831e-08, + "loss": 0.36, + "step": 11478 + }, + { + "epoch": 0.9322667099813206, + "grad_norm": 6.172109607468845, + "learning_rate": 5.992008996879906e-08, + "loss": 0.4392, + "step": 11479 + }, + { + "epoch": 0.9323479249573622, + "grad_norm": 4.757665364741502, + "learning_rate": 5.977705790754546e-08, + "loss": 0.5154, + "step": 11480 + }, + { + "epoch": 0.9324291399334037, + "grad_norm": 9.820132371604062, + "learning_rate": 5.963419469680543e-08, + "loss": 0.4637, + "step": 11481 + }, + { + "epoch": 0.9325103549094453, + "grad_norm": 4.899371695926488, + "learning_rate": 5.9491500346463005e-08, + "loss": 0.5009, + "step": 11482 + }, + { + "epoch": 0.9325915698854869, + "grad_norm": 3.7801882505750366, + "learning_rate": 5.934897486639307e-08, + "loss": 0.4272, + "step": 11483 + }, + { + "epoch": 0.9326727848615285, + "grad_norm": 7.561614245206794, + "learning_rate": 5.9206618266456904e-08, + "loss": 0.5861, + "step": 11484 + }, + { + "epoch": 0.93275399983757, + "grad_norm": 4.538003207030416, + "learning_rate": 5.906443055650496e-08, + "loss": 0.6691, + "step": 11485 + }, + { + "epoch": 0.9328352148136116, + "grad_norm": 5.474716210455667, + "learning_rate": 5.892241174637575e-08, + "loss": 0.3982, + "step": 11486 + }, + { + "epoch": 0.9329164297896532, + "grad_norm": 8.069204894922803, + "learning_rate": 5.8780561845896697e-08, + "loss": 0.52, + "step": 11487 + }, + { + "epoch": 0.9329976447656948, + "grad_norm": 5.907171852729622, + "learning_rate": 5.863888086488301e-08, + "loss": 0.5574, + "step": 11488 + }, + { + "epoch": 0.9330788597417363, + "grad_norm": 6.090121506693037, + "learning_rate": 5.849736881313767e-08, + "loss": 0.4025, + "step": 11489 + }, + { + "epoch": 0.933160074717778, + "grad_norm": 8.516034082485882, + "learning_rate": 5.835602570045312e-08, + "loss": 0.4993, + "step": 11490 + }, + { + "epoch": 0.9332412896938196, + "grad_norm": 6.277551920267633, + "learning_rate": 5.8214851536609326e-08, + "loss": 0.4205, + "step": 11491 + }, + { + "epoch": 0.9333225046698611, + "grad_norm": 8.065629651818462, + "learning_rate": 5.807384633137459e-08, + "loss": 0.4213, + "step": 11492 + }, + { + "epoch": 0.9334037196459027, + "grad_norm": 4.95647438456384, + "learning_rate": 5.793301009450636e-08, + "loss": 0.4459, + "step": 11493 + }, + { + "epoch": 0.9334849346219443, + "grad_norm": 4.983399216729912, + "learning_rate": 5.779234283574936e-08, + "loss": 0.5453, + "step": 11494 + }, + { + "epoch": 0.9335661495979859, + "grad_norm": 5.591248101931953, + "learning_rate": 5.765184456483664e-08, + "loss": 0.453, + "step": 11495 + }, + { + "epoch": 0.9336473645740274, + "grad_norm": 5.18268156667367, + "learning_rate": 5.7511515291490686e-08, + "loss": 0.4371, + "step": 11496 + }, + { + "epoch": 0.933728579550069, + "grad_norm": 6.395159139697958, + "learning_rate": 5.737135502542124e-08, + "loss": 0.5132, + "step": 11497 + }, + { + "epoch": 0.9338097945261106, + "grad_norm": 5.026018785036258, + "learning_rate": 5.7231363776326096e-08, + "loss": 0.4138, + "step": 11498 + }, + { + "epoch": 0.9338910095021522, + "grad_norm": 4.941851075904659, + "learning_rate": 5.709154155389279e-08, + "loss": 0.5284, + "step": 11499 + }, + { + "epoch": 0.9339722244781937, + "grad_norm": 5.637616770839022, + "learning_rate": 5.6951888367795804e-08, + "loss": 0.5964, + "step": 11500 + }, + { + "epoch": 0.9340534394542354, + "grad_norm": 6.004225890183753, + "learning_rate": 5.681240422769879e-08, + "loss": 0.4451, + "step": 11501 + }, + { + "epoch": 0.934134654430277, + "grad_norm": 4.723000628789615, + "learning_rate": 5.6673089143252646e-08, + "loss": 0.3078, + "step": 11502 + }, + { + "epoch": 0.9342158694063185, + "grad_norm": 6.077929375109591, + "learning_rate": 5.653394312409771e-08, + "loss": 0.6627, + "step": 11503 + }, + { + "epoch": 0.9342970843823601, + "grad_norm": 6.472316849971879, + "learning_rate": 5.639496617986184e-08, + "loss": 0.4792, + "step": 11504 + }, + { + "epoch": 0.9343782993584017, + "grad_norm": 6.293958361792907, + "learning_rate": 5.625615832016179e-08, + "loss": 0.438, + "step": 11505 + }, + { + "epoch": 0.9344595143344433, + "grad_norm": 5.09268953575727, + "learning_rate": 5.6117519554602375e-08, + "loss": 0.5255, + "step": 11506 + }, + { + "epoch": 0.9345407293104848, + "grad_norm": 5.054483428815471, + "learning_rate": 5.597904989277675e-08, + "loss": 0.437, + "step": 11507 + }, + { + "epoch": 0.9346219442865265, + "grad_norm": 3.9858596078394943, + "learning_rate": 5.584074934426559e-08, + "loss": 0.5096, + "step": 11508 + }, + { + "epoch": 0.934703159262568, + "grad_norm": 8.170161084504295, + "learning_rate": 5.570261791863957e-08, + "loss": 0.4268, + "step": 11509 + }, + { + "epoch": 0.9347843742386096, + "grad_norm": 5.199339732120409, + "learning_rate": 5.5564655625455766e-08, + "loss": 0.7232, + "step": 11510 + }, + { + "epoch": 0.9348655892146511, + "grad_norm": 4.643135079968665, + "learning_rate": 5.5426862474260986e-08, + "loss": 0.4998, + "step": 11511 + }, + { + "epoch": 0.9349468041906928, + "grad_norm": 4.163342887629166, + "learning_rate": 5.528923847458928e-08, + "loss": 0.4594, + "step": 11512 + }, + { + "epoch": 0.9350280191667344, + "grad_norm": 5.970898841399132, + "learning_rate": 5.5151783635964126e-08, + "loss": 0.46, + "step": 11513 + }, + { + "epoch": 0.9351092341427759, + "grad_norm": 5.478270806254457, + "learning_rate": 5.5014497967896266e-08, + "loss": 0.4887, + "step": 11514 + }, + { + "epoch": 0.9351904491188175, + "grad_norm": 4.460190714960654, + "learning_rate": 5.4877381479885307e-08, + "loss": 0.5598, + "step": 11515 + }, + { + "epoch": 0.9352716640948591, + "grad_norm": 4.1338050878154355, + "learning_rate": 5.4740434181418945e-08, + "loss": 0.47, + "step": 11516 + }, + { + "epoch": 0.9353528790709007, + "grad_norm": 6.435871850940438, + "learning_rate": 5.460365608197293e-08, + "loss": 0.4158, + "step": 11517 + }, + { + "epoch": 0.9354340940469422, + "grad_norm": 7.643223731934275, + "learning_rate": 5.4467047191011924e-08, + "loss": 0.3233, + "step": 11518 + }, + { + "epoch": 0.9355153090229839, + "grad_norm": 5.4260804495525825, + "learning_rate": 5.4330607517988635e-08, + "loss": 0.5388, + "step": 11519 + }, + { + "epoch": 0.9355965239990254, + "grad_norm": 4.953162854169758, + "learning_rate": 5.419433707234356e-08, + "loss": 0.4487, + "step": 11520 + }, + { + "epoch": 0.935677738975067, + "grad_norm": 3.6992673910297293, + "learning_rate": 5.4058235863506116e-08, + "loss": 0.5167, + "step": 11521 + }, + { + "epoch": 0.9357589539511085, + "grad_norm": 4.996367276417503, + "learning_rate": 5.392230390089404e-08, + "loss": 0.5104, + "step": 11522 + }, + { + "epoch": 0.9358401689271502, + "grad_norm": 4.62381128315659, + "learning_rate": 5.3786541193912854e-08, + "loss": 0.5088, + "step": 11523 + }, + { + "epoch": 0.9359213839031918, + "grad_norm": 7.587560626305793, + "learning_rate": 5.3650947751956174e-08, + "loss": 0.3284, + "step": 11524 + }, + { + "epoch": 0.9360025988792333, + "grad_norm": 5.005872247413698, + "learning_rate": 5.351552358440704e-08, + "loss": 0.4261, + "step": 11525 + }, + { + "epoch": 0.936083813855275, + "grad_norm": 5.695031699056669, + "learning_rate": 5.3380268700636006e-08, + "loss": 0.4862, + "step": 11526 + }, + { + "epoch": 0.9361650288313165, + "grad_norm": 5.066704771887348, + "learning_rate": 5.324518311000143e-08, + "loss": 0.5233, + "step": 11527 + }, + { + "epoch": 0.9362462438073581, + "grad_norm": 3.891689553812398, + "learning_rate": 5.311026682185139e-08, + "loss": 0.4053, + "step": 11528 + }, + { + "epoch": 0.9363274587833996, + "grad_norm": 4.149090685885997, + "learning_rate": 5.297551984552063e-08, + "loss": 0.5201, + "step": 11529 + }, + { + "epoch": 0.9364086737594413, + "grad_norm": 4.452851895114947, + "learning_rate": 5.2840942190333086e-08, + "loss": 0.4647, + "step": 11530 + }, + { + "epoch": 0.9364898887354828, + "grad_norm": 5.464100401282328, + "learning_rate": 5.270653386560104e-08, + "loss": 0.4444, + "step": 11531 + }, + { + "epoch": 0.9365711037115244, + "grad_norm": 7.585338070087896, + "learning_rate": 5.257229488062482e-08, + "loss": 0.4242, + "step": 11532 + }, + { + "epoch": 0.9366523186875659, + "grad_norm": 9.756341519461458, + "learning_rate": 5.243822524469283e-08, + "loss": 0.3792, + "step": 11533 + }, + { + "epoch": 0.9367335336636076, + "grad_norm": 6.0305409951018225, + "learning_rate": 5.23043249670821e-08, + "loss": 0.4986, + "step": 11534 + }, + { + "epoch": 0.9368147486396492, + "grad_norm": 8.933422659020032, + "learning_rate": 5.2170594057058264e-08, + "loss": 0.647, + "step": 11535 + }, + { + "epoch": 0.9368959636156907, + "grad_norm": 7.945666217999835, + "learning_rate": 5.2037032523873654e-08, + "loss": 0.4578, + "step": 11536 + }, + { + "epoch": 0.9369771785917324, + "grad_norm": 8.210836981044634, + "learning_rate": 5.190364037677142e-08, + "loss": 0.4933, + "step": 11537 + }, + { + "epoch": 0.9370583935677739, + "grad_norm": 4.671044337470819, + "learning_rate": 5.1770417624980306e-08, + "loss": 0.4541, + "step": 11538 + }, + { + "epoch": 0.9371396085438155, + "grad_norm": 6.043521315334802, + "learning_rate": 5.1637364277719595e-08, + "loss": 0.5181, + "step": 11539 + }, + { + "epoch": 0.937220823519857, + "grad_norm": 5.1011629271022185, + "learning_rate": 5.150448034419525e-08, + "loss": 0.5324, + "step": 11540 + }, + { + "epoch": 0.9373020384958987, + "grad_norm": 3.8201585871136734, + "learning_rate": 5.1371765833602703e-08, + "loss": 0.4803, + "step": 11541 + }, + { + "epoch": 0.9373832534719402, + "grad_norm": 5.758818308718375, + "learning_rate": 5.123922075512461e-08, + "loss": 0.3395, + "step": 11542 + }, + { + "epoch": 0.9374644684479818, + "grad_norm": 3.435971230625552, + "learning_rate": 5.110684511793251e-08, + "loss": 0.6348, + "step": 11543 + }, + { + "epoch": 0.9375456834240233, + "grad_norm": 7.13731160639292, + "learning_rate": 5.0974638931186036e-08, + "loss": 0.3566, + "step": 11544 + }, + { + "epoch": 0.937626898400065, + "grad_norm": 9.350076135939414, + "learning_rate": 5.084260220403342e-08, + "loss": 0.4923, + "step": 11545 + }, + { + "epoch": 0.9377081133761066, + "grad_norm": 4.901127986761642, + "learning_rate": 5.0710734945610686e-08, + "loss": 0.4283, + "step": 11546 + }, + { + "epoch": 0.9377893283521481, + "grad_norm": 4.685757580589546, + "learning_rate": 5.057903716504248e-08, + "loss": 0.491, + "step": 11547 + }, + { + "epoch": 0.9378705433281898, + "grad_norm": 5.048648910312251, + "learning_rate": 5.044750887144151e-08, + "loss": 0.2839, + "step": 11548 + }, + { + "epoch": 0.9379517583042313, + "grad_norm": 6.1324537629824345, + "learning_rate": 5.0316150073908555e-08, + "loss": 0.4318, + "step": 11549 + }, + { + "epoch": 0.9380329732802729, + "grad_norm": 5.300490311250102, + "learning_rate": 5.0184960781533844e-08, + "loss": 0.3291, + "step": 11550 + }, + { + "epoch": 0.9381141882563144, + "grad_norm": 4.481525918571603, + "learning_rate": 5.005394100339373e-08, + "loss": 0.6305, + "step": 11551 + }, + { + "epoch": 0.9381954032323561, + "grad_norm": 5.628772875047986, + "learning_rate": 4.992309074855484e-08, + "loss": 0.4441, + "step": 11552 + }, + { + "epoch": 0.9382766182083976, + "grad_norm": 5.631338094522123, + "learning_rate": 4.97924100260716e-08, + "loss": 0.362, + "step": 11553 + }, + { + "epoch": 0.9383578331844392, + "grad_norm": 4.450334044578378, + "learning_rate": 4.966189884498596e-08, + "loss": 0.4233, + "step": 11554 + }, + { + "epoch": 0.9384390481604807, + "grad_norm": 4.584301414669546, + "learning_rate": 4.953155721432873e-08, + "loss": 0.4751, + "step": 11555 + }, + { + "epoch": 0.9385202631365224, + "grad_norm": 4.749314594480385, + "learning_rate": 4.940138514311854e-08, + "loss": 0.5722, + "step": 11556 + }, + { + "epoch": 0.938601478112564, + "grad_norm": 4.863986365737658, + "learning_rate": 4.927138264036291e-08, + "loss": 0.4222, + "step": 11557 + }, + { + "epoch": 0.9386826930886055, + "grad_norm": 5.649739794167962, + "learning_rate": 4.9141549715057415e-08, + "loss": 0.4626, + "step": 11558 + }, + { + "epoch": 0.9387639080646472, + "grad_norm": 3.4200650534492807, + "learning_rate": 4.90118863761857e-08, + "loss": 0.412, + "step": 11559 + }, + { + "epoch": 0.9388451230406887, + "grad_norm": 8.38158690698774, + "learning_rate": 4.888239263271977e-08, + "loss": 0.5341, + "step": 11560 + }, + { + "epoch": 0.9389263380167303, + "grad_norm": 3.2373374116022893, + "learning_rate": 4.875306849361994e-08, + "loss": 0.4586, + "step": 11561 + }, + { + "epoch": 0.9390075529927718, + "grad_norm": 4.301803597627936, + "learning_rate": 4.862391396783461e-08, + "loss": 0.3589, + "step": 11562 + }, + { + "epoch": 0.9390887679688135, + "grad_norm": 5.310812898753721, + "learning_rate": 4.849492906430081e-08, + "loss": 0.4699, + "step": 11563 + }, + { + "epoch": 0.939169982944855, + "grad_norm": 5.309327329673865, + "learning_rate": 4.836611379194334e-08, + "loss": 0.5494, + "step": 11564 + }, + { + "epoch": 0.9392511979208966, + "grad_norm": 8.887102738638378, + "learning_rate": 4.8237468159675896e-08, + "loss": 0.3454, + "step": 11565 + }, + { + "epoch": 0.9393324128969381, + "grad_norm": 4.344793062039761, + "learning_rate": 4.810899217639997e-08, + "loss": 0.3838, + "step": 11566 + }, + { + "epoch": 0.9394136278729798, + "grad_norm": 3.970983027129392, + "learning_rate": 4.798068585100513e-08, + "loss": 0.463, + "step": 11567 + }, + { + "epoch": 0.9394948428490214, + "grad_norm": 6.57362417638814, + "learning_rate": 4.785254919236954e-08, + "loss": 0.4031, + "step": 11568 + }, + { + "epoch": 0.9395760578250629, + "grad_norm": 3.7805553967871552, + "learning_rate": 4.772458220936027e-08, + "loss": 0.4996, + "step": 11569 + }, + { + "epoch": 0.9396572728011046, + "grad_norm": 5.108413544198211, + "learning_rate": 4.7596784910830804e-08, + "loss": 0.4602, + "step": 11570 + }, + { + "epoch": 0.9397384877771461, + "grad_norm": 4.497648219050411, + "learning_rate": 4.74691573056249e-08, + "loss": 0.3966, + "step": 11571 + }, + { + "epoch": 0.9398197027531877, + "grad_norm": 4.1059390155620115, + "learning_rate": 4.7341699402573546e-08, + "loss": 0.3698, + "step": 11572 + }, + { + "epoch": 0.9399009177292292, + "grad_norm": 4.091197560447974, + "learning_rate": 4.721441121049608e-08, + "loss": 0.4798, + "step": 11573 + }, + { + "epoch": 0.9399821327052709, + "grad_norm": 5.006114532781862, + "learning_rate": 4.7087292738200454e-08, + "loss": 0.3846, + "step": 11574 + }, + { + "epoch": 0.9400633476813124, + "grad_norm": 7.010800653937726, + "learning_rate": 4.696034399448185e-08, + "loss": 0.3729, + "step": 11575 + }, + { + "epoch": 0.940144562657354, + "grad_norm": 5.687679281302264, + "learning_rate": 4.6833564988124914e-08, + "loss": 0.4974, + "step": 11576 + }, + { + "epoch": 0.9402257776333955, + "grad_norm": 5.740323370278037, + "learning_rate": 4.670695572790235e-08, + "loss": 0.4069, + "step": 11577 + }, + { + "epoch": 0.9403069926094372, + "grad_norm": 8.241228285955831, + "learning_rate": 4.658051622257437e-08, + "loss": 0.6067, + "step": 11578 + }, + { + "epoch": 0.9403882075854788, + "grad_norm": 4.495511178750625, + "learning_rate": 4.6454246480890084e-08, + "loss": 0.5117, + "step": 11579 + }, + { + "epoch": 0.9404694225615203, + "grad_norm": 6.551988717623826, + "learning_rate": 4.632814651158696e-08, + "loss": 0.4359, + "step": 11580 + }, + { + "epoch": 0.940550637537562, + "grad_norm": 9.683649036003146, + "learning_rate": 4.620221632338995e-08, + "loss": 0.4789, + "step": 11581 + }, + { + "epoch": 0.9406318525136035, + "grad_norm": 5.298725894305153, + "learning_rate": 4.607645592501347e-08, + "loss": 0.4102, + "step": 11582 + }, + { + "epoch": 0.9407130674896451, + "grad_norm": 5.2892762046601165, + "learning_rate": 4.5950865325158636e-08, + "loss": 0.5595, + "step": 11583 + }, + { + "epoch": 0.9407942824656866, + "grad_norm": 6.532711062097573, + "learning_rate": 4.582544453251597e-08, + "loss": 0.3798, + "step": 11584 + }, + { + "epoch": 0.9408754974417283, + "grad_norm": 7.025731896892658, + "learning_rate": 4.57001935557641e-08, + "loss": 0.444, + "step": 11585 + }, + { + "epoch": 0.9409567124177698, + "grad_norm": 4.380484134636201, + "learning_rate": 4.5575112403569985e-08, + "loss": 0.4212, + "step": 11586 + }, + { + "epoch": 0.9410379273938114, + "grad_norm": 78.3339524286403, + "learning_rate": 4.545020108458781e-08, + "loss": 0.428, + "step": 11587 + }, + { + "epoch": 0.941119142369853, + "grad_norm": 4.997755103806688, + "learning_rate": 4.5325459607461485e-08, + "loss": 0.374, + "step": 11588 + }, + { + "epoch": 0.9412003573458946, + "grad_norm": 4.4071049437250505, + "learning_rate": 4.5200887980821897e-08, + "loss": 0.6705, + "step": 11589 + }, + { + "epoch": 0.9412815723219362, + "grad_norm": 4.496642306863949, + "learning_rate": 4.5076486213289086e-08, + "loss": 0.4301, + "step": 11590 + }, + { + "epoch": 0.9413627872979777, + "grad_norm": 6.033449529408409, + "learning_rate": 4.495225431347089e-08, + "loss": 0.4866, + "step": 11591 + }, + { + "epoch": 0.9414440022740194, + "grad_norm": 8.981792164965075, + "learning_rate": 4.482819228996377e-08, + "loss": 0.6155, + "step": 11592 + }, + { + "epoch": 0.9415252172500609, + "grad_norm": 6.80461573911351, + "learning_rate": 4.470430015135197e-08, + "loss": 0.5014, + "step": 11593 + }, + { + "epoch": 0.9416064322261025, + "grad_norm": 3.2551970239032495, + "learning_rate": 4.458057790620779e-08, + "loss": 0.4495, + "step": 11594 + }, + { + "epoch": 0.941687647202144, + "grad_norm": 6.288968375149101, + "learning_rate": 4.4457025563092724e-08, + "loss": 0.4876, + "step": 11595 + }, + { + "epoch": 0.9417688621781857, + "grad_norm": 5.114911602404954, + "learning_rate": 4.433364313055549e-08, + "loss": 0.6019, + "step": 11596 + }, + { + "epoch": 0.9418500771542272, + "grad_norm": 10.285824379728702, + "learning_rate": 4.42104306171337e-08, + "loss": 0.3655, + "step": 11597 + }, + { + "epoch": 0.9419312921302688, + "grad_norm": 4.250288445214354, + "learning_rate": 4.4087388031353316e-08, + "loss": 0.5237, + "step": 11598 + }, + { + "epoch": 0.9420125071063103, + "grad_norm": 4.607829438175469, + "learning_rate": 4.39645153817278e-08, + "loss": 0.5283, + "step": 11599 + }, + { + "epoch": 0.942093722082352, + "grad_norm": 7.915466583935277, + "learning_rate": 4.384181267675952e-08, + "loss": 0.4855, + "step": 11600 + }, + { + "epoch": 0.9421749370583936, + "grad_norm": 5.46136708291508, + "learning_rate": 4.3719279924938626e-08, + "loss": 0.4598, + "step": 11601 + }, + { + "epoch": 0.9422561520344351, + "grad_norm": 9.165200607430856, + "learning_rate": 4.35969171347439e-08, + "loss": 0.6132, + "step": 11602 + }, + { + "epoch": 0.9423373670104768, + "grad_norm": 5.415018652981148, + "learning_rate": 4.347472431464217e-08, + "loss": 0.5896, + "step": 11603 + }, + { + "epoch": 0.9424185819865183, + "grad_norm": 7.296700648217176, + "learning_rate": 4.335270147308862e-08, + "loss": 0.378, + "step": 11604 + }, + { + "epoch": 0.9424997969625599, + "grad_norm": 4.175947964206813, + "learning_rate": 4.32308486185265e-08, + "loss": 0.4858, + "step": 11605 + }, + { + "epoch": 0.9425810119386014, + "grad_norm": 5.139558164610286, + "learning_rate": 4.3109165759387115e-08, + "loss": 0.3888, + "step": 11606 + }, + { + "epoch": 0.9426622269146431, + "grad_norm": 4.109603857789413, + "learning_rate": 4.298765290409096e-08, + "loss": 0.5018, + "step": 11607 + }, + { + "epoch": 0.9427434418906846, + "grad_norm": 5.23991591103504, + "learning_rate": 4.286631006104547e-08, + "loss": 0.523, + "step": 11608 + }, + { + "epoch": 0.9428246568667262, + "grad_norm": 5.661548234950453, + "learning_rate": 4.2745137238646984e-08, + "loss": 0.4094, + "step": 11609 + }, + { + "epoch": 0.9429058718427678, + "grad_norm": 5.499414318975432, + "learning_rate": 4.2624134445280186e-08, + "loss": 0.6268, + "step": 11610 + }, + { + "epoch": 0.9429870868188094, + "grad_norm": 4.495827244130177, + "learning_rate": 4.25033016893181e-08, + "loss": 0.548, + "step": 11611 + }, + { + "epoch": 0.943068301794851, + "grad_norm": 3.8103963823755094, + "learning_rate": 4.238263897912126e-08, + "loss": 0.7681, + "step": 11612 + }, + { + "epoch": 0.9431495167708925, + "grad_norm": 9.357203074745977, + "learning_rate": 4.22621463230391e-08, + "loss": 0.3692, + "step": 11613 + }, + { + "epoch": 0.9432307317469342, + "grad_norm": 3.6750136809793412, + "learning_rate": 4.214182372940884e-08, + "loss": 0.601, + "step": 11614 + }, + { + "epoch": 0.9433119467229757, + "grad_norm": 7.385444469733557, + "learning_rate": 4.202167120655631e-08, + "loss": 0.4761, + "step": 11615 + }, + { + "epoch": 0.9433931616990173, + "grad_norm": 5.303774677219012, + "learning_rate": 4.190168876279571e-08, + "loss": 0.4371, + "step": 11616 + }, + { + "epoch": 0.9434743766750588, + "grad_norm": 4.763056521648861, + "learning_rate": 4.1781876406428725e-08, + "loss": 0.4442, + "step": 11617 + }, + { + "epoch": 0.9435555916511005, + "grad_norm": 5.484993623084503, + "learning_rate": 4.1662234145746214e-08, + "loss": 0.5426, + "step": 11618 + }, + { + "epoch": 0.943636806627142, + "grad_norm": 4.303398425305994, + "learning_rate": 4.154276198902629e-08, + "loss": 0.5846, + "step": 11619 + }, + { + "epoch": 0.9437180216031836, + "grad_norm": 5.295687518325545, + "learning_rate": 4.1423459944536224e-08, + "loss": 0.6349, + "step": 11620 + }, + { + "epoch": 0.9437992365792252, + "grad_norm": 4.737076827144997, + "learning_rate": 4.1304328020530804e-08, + "loss": 0.4874, + "step": 11621 + }, + { + "epoch": 0.9438804515552668, + "grad_norm": 15.165489379351301, + "learning_rate": 4.118536622525315e-08, + "loss": 0.3641, + "step": 11622 + }, + { + "epoch": 0.9439616665313084, + "grad_norm": 3.5716624018017877, + "learning_rate": 4.10665745669353e-08, + "loss": 0.5441, + "step": 11623 + }, + { + "epoch": 0.9440428815073499, + "grad_norm": 10.121245968679512, + "learning_rate": 4.094795305379679e-08, + "loss": 0.4997, + "step": 11624 + }, + { + "epoch": 0.9441240964833916, + "grad_norm": 7.249621116373978, + "learning_rate": 4.082950169404548e-08, + "loss": 0.4358, + "step": 11625 + }, + { + "epoch": 0.9442053114594331, + "grad_norm": 7.423045226596472, + "learning_rate": 4.071122049587789e-08, + "loss": 0.4766, + "step": 11626 + }, + { + "epoch": 0.9442865264354747, + "grad_norm": 7.43889332245654, + "learning_rate": 4.059310946747802e-08, + "loss": 0.3625, + "step": 11627 + }, + { + "epoch": 0.9443677414115162, + "grad_norm": 5.999420084859953, + "learning_rate": 4.047516861701878e-08, + "loss": 0.4215, + "step": 11628 + }, + { + "epoch": 0.9444489563875579, + "grad_norm": 3.9495688216293363, + "learning_rate": 4.035739795266086e-08, + "loss": 0.5308, + "step": 11629 + }, + { + "epoch": 0.9445301713635994, + "grad_norm": 8.225837590539006, + "learning_rate": 4.0239797482553856e-08, + "loss": 0.4483, + "step": 11630 + }, + { + "epoch": 0.944611386339641, + "grad_norm": 4.507895023066098, + "learning_rate": 4.012236721483487e-08, + "loss": 0.5678, + "step": 11631 + }, + { + "epoch": 0.9446926013156827, + "grad_norm": 3.795653600393925, + "learning_rate": 4.0005107157628786e-08, + "loss": 0.5971, + "step": 11632 + }, + { + "epoch": 0.9447738162917242, + "grad_norm": 5.829598757301327, + "learning_rate": 3.988801731905051e-08, + "loss": 0.4818, + "step": 11633 + }, + { + "epoch": 0.9448550312677658, + "grad_norm": 6.893765169454947, + "learning_rate": 3.9771097707201056e-08, + "loss": 0.3942, + "step": 11634 + }, + { + "epoch": 0.9449362462438073, + "grad_norm": 7.0587960070765075, + "learning_rate": 3.965434833017118e-08, + "loss": 0.5054, + "step": 11635 + }, + { + "epoch": 0.945017461219849, + "grad_norm": 8.051189199148112, + "learning_rate": 3.9537769196039134e-08, + "loss": 0.3643, + "step": 11636 + }, + { + "epoch": 0.9450986761958905, + "grad_norm": 4.735681936699135, + "learning_rate": 3.9421360312871804e-08, + "loss": 0.6727, + "step": 11637 + }, + { + "epoch": 0.9451798911719321, + "grad_norm": 3.9449239206953925, + "learning_rate": 3.9305121688723855e-08, + "loss": 0.5668, + "step": 11638 + }, + { + "epoch": 0.9452611061479737, + "grad_norm": 6.946560704332956, + "learning_rate": 3.918905333163858e-08, + "loss": 0.6188, + "step": 11639 + }, + { + "epoch": 0.9453423211240153, + "grad_norm": 8.04163254796676, + "learning_rate": 3.9073155249647055e-08, + "loss": 0.6601, + "step": 11640 + }, + { + "epoch": 0.9454235361000568, + "grad_norm": 7.0432614921051675, + "learning_rate": 3.895742745076869e-08, + "loss": 0.4597, + "step": 11641 + }, + { + "epoch": 0.9455047510760984, + "grad_norm": 5.740385711050734, + "learning_rate": 3.8841869943011534e-08, + "loss": 0.4694, + "step": 11642 + }, + { + "epoch": 0.9455859660521401, + "grad_norm": 3.826367448009286, + "learning_rate": 3.872648273437168e-08, + "loss": 0.4556, + "step": 11643 + }, + { + "epoch": 0.9456671810281816, + "grad_norm": 3.153187096292459, + "learning_rate": 3.861126583283303e-08, + "loss": 0.3736, + "step": 11644 + }, + { + "epoch": 0.9457483960042232, + "grad_norm": 5.786888401113132, + "learning_rate": 3.849621924636809e-08, + "loss": 0.5045, + "step": 11645 + }, + { + "epoch": 0.9458296109802647, + "grad_norm": 5.70036944601648, + "learning_rate": 3.838134298293744e-08, + "loss": 0.3993, + "step": 11646 + }, + { + "epoch": 0.9459108259563064, + "grad_norm": 5.14700753120722, + "learning_rate": 3.8266637050489716e-08, + "loss": 0.6705, + "step": 11647 + }, + { + "epoch": 0.9459920409323479, + "grad_norm": 5.028518558037041, + "learning_rate": 3.815210145696219e-08, + "loss": 0.5472, + "step": 11648 + }, + { + "epoch": 0.9460732559083895, + "grad_norm": 5.08541011233092, + "learning_rate": 3.803773621028045e-08, + "loss": 0.4755, + "step": 11649 + }, + { + "epoch": 0.946154470884431, + "grad_norm": 5.674196216201576, + "learning_rate": 3.792354131835735e-08, + "loss": 0.4256, + "step": 11650 + }, + { + "epoch": 0.9462356858604727, + "grad_norm": 5.863265288069775, + "learning_rate": 3.780951678909489e-08, + "loss": 0.5258, + "step": 11651 + }, + { + "epoch": 0.9463169008365142, + "grad_norm": 3.5058013583786476, + "learning_rate": 3.769566263038288e-08, + "loss": 0.534, + "step": 11652 + }, + { + "epoch": 0.9463981158125558, + "grad_norm": 12.342326134411538, + "learning_rate": 3.7581978850099456e-08, + "loss": 0.5344, + "step": 11653 + }, + { + "epoch": 0.9464793307885975, + "grad_norm": 4.819965937928789, + "learning_rate": 3.7468465456110825e-08, + "loss": 0.4811, + "step": 11654 + }, + { + "epoch": 0.946560545764639, + "grad_norm": 7.405750445686644, + "learning_rate": 3.735512245627182e-08, + "loss": 0.3502, + "step": 11655 + }, + { + "epoch": 0.9466417607406806, + "grad_norm": 9.446771351873101, + "learning_rate": 3.7241949858424777e-08, + "loss": 0.4258, + "step": 11656 + }, + { + "epoch": 0.9467229757167221, + "grad_norm": 6.936102522174853, + "learning_rate": 3.712894767040093e-08, + "loss": 0.5589, + "step": 11657 + }, + { + "epoch": 0.9468041906927638, + "grad_norm": 4.033630186749134, + "learning_rate": 3.7016115900019575e-08, + "loss": 0.4372, + "step": 11658 + }, + { + "epoch": 0.9468854056688053, + "grad_norm": 5.941857393690612, + "learning_rate": 3.690345455508754e-08, + "loss": 0.5026, + "step": 11659 + }, + { + "epoch": 0.9469666206448469, + "grad_norm": 9.14091774177606, + "learning_rate": 3.679096364340079e-08, + "loss": 0.4604, + "step": 11660 + }, + { + "epoch": 0.9470478356208885, + "grad_norm": 4.223097671499599, + "learning_rate": 3.6678643172742836e-08, + "loss": 0.4393, + "step": 11661 + }, + { + "epoch": 0.9471290505969301, + "grad_norm": 3.9468831332746643, + "learning_rate": 3.656649315088606e-08, + "loss": 0.5762, + "step": 11662 + }, + { + "epoch": 0.9472102655729716, + "grad_norm": 4.743190373097494, + "learning_rate": 3.6454513585590376e-08, + "loss": 0.4932, + "step": 11663 + }, + { + "epoch": 0.9472914805490132, + "grad_norm": 3.896032850566264, + "learning_rate": 3.634270448460403e-08, + "loss": 0.5384, + "step": 11664 + }, + { + "epoch": 0.9473726955250549, + "grad_norm": 4.452680361174725, + "learning_rate": 3.623106585566388e-08, + "loss": 0.3909, + "step": 11665 + }, + { + "epoch": 0.9474539105010964, + "grad_norm": 6.087481905533953, + "learning_rate": 3.611959770649487e-08, + "loss": 0.4937, + "step": 11666 + }, + { + "epoch": 0.947535125477138, + "grad_norm": 4.040614567901036, + "learning_rate": 3.600830004480943e-08, + "loss": 0.5005, + "step": 11667 + }, + { + "epoch": 0.9476163404531796, + "grad_norm": 4.44420665256172, + "learning_rate": 3.589717287830946e-08, + "loss": 0.4575, + "step": 11668 + }, + { + "epoch": 0.9476975554292212, + "grad_norm": 5.616498527932605, + "learning_rate": 3.578621621468381e-08, + "loss": 0.5212, + "step": 11669 + }, + { + "epoch": 0.9477787704052627, + "grad_norm": 5.201502802582349, + "learning_rate": 3.567543006161051e-08, + "loss": 0.4217, + "step": 11670 + }, + { + "epoch": 0.9478599853813043, + "grad_norm": 16.378504345168306, + "learning_rate": 3.556481442675508e-08, + "loss": 0.4972, + "step": 11671 + }, + { + "epoch": 0.9479412003573459, + "grad_norm": 5.083869322555315, + "learning_rate": 3.5454369317771686e-08, + "loss": 0.4814, + "step": 11672 + }, + { + "epoch": 0.9480224153333875, + "grad_norm": 7.335908845803885, + "learning_rate": 3.534409474230255e-08, + "loss": 0.4003, + "step": 11673 + }, + { + "epoch": 0.948103630309429, + "grad_norm": 10.162528148573347, + "learning_rate": 3.523399070797795e-08, + "loss": 0.4699, + "step": 11674 + }, + { + "epoch": 0.9481848452854706, + "grad_norm": 4.64890275823902, + "learning_rate": 3.512405722241652e-08, + "loss": 0.4235, + "step": 11675 + }, + { + "epoch": 0.9482660602615123, + "grad_norm": 5.016364672410445, + "learning_rate": 3.501429429322522e-08, + "loss": 0.5031, + "step": 11676 + }, + { + "epoch": 0.9483472752375538, + "grad_norm": 5.858582608768708, + "learning_rate": 3.4904701927999385e-08, + "loss": 0.4388, + "step": 11677 + }, + { + "epoch": 0.9484284902135954, + "grad_norm": 4.81391584316998, + "learning_rate": 3.479528013432154e-08, + "loss": 0.4828, + "step": 11678 + }, + { + "epoch": 0.948509705189637, + "grad_norm": 8.006015757453834, + "learning_rate": 3.468602891976314e-08, + "loss": 0.5023, + "step": 11679 + }, + { + "epoch": 0.9485909201656786, + "grad_norm": 13.29141782498199, + "learning_rate": 3.457694829188452e-08, + "loss": 0.4342, + "step": 11680 + }, + { + "epoch": 0.9486721351417201, + "grad_norm": 10.20237535145513, + "learning_rate": 3.446803825823269e-08, + "loss": 0.3647, + "step": 11681 + }, + { + "epoch": 0.9487533501177617, + "grad_norm": 5.890040192499068, + "learning_rate": 3.435929882634415e-08, + "loss": 0.3275, + "step": 11682 + }, + { + "epoch": 0.9488345650938033, + "grad_norm": 10.83456434309621, + "learning_rate": 3.425073000374257e-08, + "loss": 0.4327, + "step": 11683 + }, + { + "epoch": 0.9489157800698449, + "grad_norm": 4.338209138202084, + "learning_rate": 3.4142331797940855e-08, + "loss": 0.5753, + "step": 11684 + }, + { + "epoch": 0.9489969950458864, + "grad_norm": 4.739057059307357, + "learning_rate": 3.4034104216439655e-08, + "loss": 0.5673, + "step": 11685 + }, + { + "epoch": 0.949078210021928, + "grad_norm": 7.495421522605058, + "learning_rate": 3.3926047266727155e-08, + "loss": 0.3894, + "step": 11686 + }, + { + "epoch": 0.9491594249979697, + "grad_norm": 7.2165806929599166, + "learning_rate": 3.381816095628071e-08, + "loss": 0.5159, + "step": 11687 + }, + { + "epoch": 0.9492406399740112, + "grad_norm": 3.9257600126354624, + "learning_rate": 3.371044529256573e-08, + "loss": 0.389, + "step": 11688 + }, + { + "epoch": 0.9493218549500528, + "grad_norm": 9.018221626405468, + "learning_rate": 3.360290028303487e-08, + "loss": 0.4364, + "step": 11689 + }, + { + "epoch": 0.9494030699260944, + "grad_norm": 4.246553322453479, + "learning_rate": 3.34955259351305e-08, + "loss": 0.5857, + "step": 11690 + }, + { + "epoch": 0.949484284902136, + "grad_norm": 8.521074043393677, + "learning_rate": 3.3388322256281694e-08, + "loss": 0.4385, + "step": 11691 + }, + { + "epoch": 0.9495654998781775, + "grad_norm": 8.037563989138466, + "learning_rate": 3.328128925390667e-08, + "loss": 0.5179, + "step": 11692 + }, + { + "epoch": 0.9496467148542191, + "grad_norm": 5.337969212336468, + "learning_rate": 3.317442693541145e-08, + "loss": 0.5983, + "step": 11693 + }, + { + "epoch": 0.9497279298302607, + "grad_norm": 5.550986004035013, + "learning_rate": 3.306773530819041e-08, + "loss": 0.5231, + "step": 11694 + }, + { + "epoch": 0.9498091448063023, + "grad_norm": 5.0987699618930975, + "learning_rate": 3.296121437962624e-08, + "loss": 0.6063, + "step": 11695 + }, + { + "epoch": 0.9498903597823438, + "grad_norm": 6.3006147509879105, + "learning_rate": 3.2854864157089164e-08, + "loss": 0.3196, + "step": 11696 + }, + { + "epoch": 0.9499715747583855, + "grad_norm": 4.720216946251891, + "learning_rate": 3.2748684647938564e-08, + "loss": 0.5137, + "step": 11697 + }, + { + "epoch": 0.9500527897344271, + "grad_norm": 17.425444481965904, + "learning_rate": 3.264267585952108e-08, + "loss": 0.5927, + "step": 11698 + }, + { + "epoch": 0.9501340047104686, + "grad_norm": 6.961626143865487, + "learning_rate": 3.253683779917194e-08, + "loss": 0.4547, + "step": 11699 + }, + { + "epoch": 0.9502152196865102, + "grad_norm": 7.6904422344104955, + "learning_rate": 3.243117047421501e-08, + "loss": 0.5546, + "step": 11700 + }, + { + "epoch": 0.9502964346625518, + "grad_norm": 5.910333731484439, + "learning_rate": 3.2325673891961394e-08, + "loss": 0.641, + "step": 11701 + }, + { + "epoch": 0.9503776496385934, + "grad_norm": 7.209494401473691, + "learning_rate": 3.222034805971136e-08, + "loss": 0.4257, + "step": 11702 + }, + { + "epoch": 0.9504588646146349, + "grad_norm": 5.915799539490019, + "learning_rate": 3.2115192984752684e-08, + "loss": 0.3847, + "step": 11703 + }, + { + "epoch": 0.9505400795906765, + "grad_norm": 8.657367196517342, + "learning_rate": 3.2010208674361774e-08, + "loss": 0.6126, + "step": 11704 + }, + { + "epoch": 0.9506212945667181, + "grad_norm": 5.775245158747669, + "learning_rate": 3.190539513580226e-08, + "loss": 0.4298, + "step": 11705 + }, + { + "epoch": 0.9507025095427597, + "grad_norm": 5.310693978477437, + "learning_rate": 3.1800752376327515e-08, + "loss": 0.5534, + "step": 11706 + }, + { + "epoch": 0.9507837245188012, + "grad_norm": 4.560752491636321, + "learning_rate": 3.169628040317785e-08, + "loss": 0.3459, + "step": 11707 + }, + { + "epoch": 0.9508649394948429, + "grad_norm": 3.5975956386159296, + "learning_rate": 3.15919792235822e-08, + "loss": 0.4466, + "step": 11708 + }, + { + "epoch": 0.9509461544708845, + "grad_norm": 4.190834372560731, + "learning_rate": 3.1487848844757865e-08, + "loss": 0.5566, + "step": 11709 + }, + { + "epoch": 0.951027369446926, + "grad_norm": 4.688075724324062, + "learning_rate": 3.138388927391017e-08, + "loss": 0.5138, + "step": 11710 + }, + { + "epoch": 0.9511085844229676, + "grad_norm": 6.558872976862139, + "learning_rate": 3.1280100518231994e-08, + "loss": 0.4949, + "step": 11711 + }, + { + "epoch": 0.9511897993990092, + "grad_norm": 6.1291704390663515, + "learning_rate": 3.1176482584905356e-08, + "loss": 0.6129, + "step": 11712 + }, + { + "epoch": 0.9512710143750508, + "grad_norm": 4.609364781861959, + "learning_rate": 3.107303548110008e-08, + "loss": 0.5307, + "step": 11713 + }, + { + "epoch": 0.9513522293510923, + "grad_norm": 4.543991780518672, + "learning_rate": 3.0969759213974324e-08, + "loss": 0.4039, + "step": 11714 + }, + { + "epoch": 0.951433444327134, + "grad_norm": 5.065242928777515, + "learning_rate": 3.086665379067405e-08, + "loss": 0.4611, + "step": 11715 + }, + { + "epoch": 0.9515146593031755, + "grad_norm": 7.643230789019235, + "learning_rate": 3.0763719218333545e-08, + "loss": 0.4837, + "step": 11716 + }, + { + "epoch": 0.9515958742792171, + "grad_norm": 3.926885842127398, + "learning_rate": 3.066095550407544e-08, + "loss": 0.4451, + "step": 11717 + }, + { + "epoch": 0.9516770892552586, + "grad_norm": 5.510721045832869, + "learning_rate": 3.0558362655010443e-08, + "loss": 0.5284, + "step": 11718 + }, + { + "epoch": 0.9517583042313003, + "grad_norm": 4.815199357131891, + "learning_rate": 3.045594067823704e-08, + "loss": 0.4785, + "step": 11719 + }, + { + "epoch": 0.9518395192073419, + "grad_norm": 9.394045320573086, + "learning_rate": 3.0353689580843174e-08, + "loss": 0.5275, + "step": 11720 + }, + { + "epoch": 0.9519207341833834, + "grad_norm": 7.043341703763583, + "learning_rate": 3.025160936990318e-08, + "loss": 0.4034, + "step": 11721 + }, + { + "epoch": 0.952001949159425, + "grad_norm": 5.105234133065711, + "learning_rate": 3.0149700052481135e-08, + "loss": 0.4658, + "step": 11722 + }, + { + "epoch": 0.9520831641354666, + "grad_norm": 20.843773029070668, + "learning_rate": 3.004796163562834e-08, + "loss": 0.4, + "step": 11723 + }, + { + "epoch": 0.9521643791115082, + "grad_norm": 4.834619084091482, + "learning_rate": 2.994639412638445e-08, + "loss": 0.4094, + "step": 11724 + }, + { + "epoch": 0.9522455940875497, + "grad_norm": 5.958564630244604, + "learning_rate": 2.984499753177772e-08, + "loss": 0.4347, + "step": 11725 + }, + { + "epoch": 0.9523268090635914, + "grad_norm": 7.525314088951139, + "learning_rate": 2.9743771858823657e-08, + "loss": 0.3909, + "step": 11726 + }, + { + "epoch": 0.9524080240396329, + "grad_norm": 4.660331555963379, + "learning_rate": 2.9642717114527208e-08, + "loss": 0.4202, + "step": 11727 + }, + { + "epoch": 0.9524892390156745, + "grad_norm": 4.986110534638319, + "learning_rate": 2.9541833305880287e-08, + "loss": 0.3993, + "step": 11728 + }, + { + "epoch": 0.952570453991716, + "grad_norm": 3.464614961858637, + "learning_rate": 2.9441120439864246e-08, + "loss": 0.5475, + "step": 11729 + }, + { + "epoch": 0.9526516689677577, + "grad_norm": 21.551152924357776, + "learning_rate": 2.9340578523447127e-08, + "loss": 0.4057, + "step": 11730 + }, + { + "epoch": 0.9527328839437993, + "grad_norm": 3.6579610094520283, + "learning_rate": 2.9240207563586142e-08, + "loss": 0.5294, + "step": 11731 + }, + { + "epoch": 0.9528140989198408, + "grad_norm": 5.768773438976465, + "learning_rate": 2.914000756722657e-08, + "loss": 0.4105, + "step": 11732 + }, + { + "epoch": 0.9528953138958824, + "grad_norm": 6.3584167885272285, + "learning_rate": 2.903997854130147e-08, + "loss": 0.5214, + "step": 11733 + }, + { + "epoch": 0.952976528871924, + "grad_norm": 7.692904252140693, + "learning_rate": 2.8940120492732537e-08, + "loss": 0.4954, + "step": 11734 + }, + { + "epoch": 0.9530577438479656, + "grad_norm": 5.655701924850993, + "learning_rate": 2.8840433428429514e-08, + "loss": 0.7178, + "step": 11735 + }, + { + "epoch": 0.9531389588240071, + "grad_norm": 4.970188795620924, + "learning_rate": 2.8740917355290222e-08, + "loss": 0.5701, + "step": 11736 + }, + { + "epoch": 0.9532201738000488, + "grad_norm": 5.290922267712118, + "learning_rate": 2.864157228019998e-08, + "loss": 0.4525, + "step": 11737 + }, + { + "epoch": 0.9533013887760903, + "grad_norm": 4.627627408383265, + "learning_rate": 2.854239821003385e-08, + "loss": 0.4818, + "step": 11738 + }, + { + "epoch": 0.9533826037521319, + "grad_norm": 7.8256164051009325, + "learning_rate": 2.8443395151653562e-08, + "loss": 0.5673, + "step": 11739 + }, + { + "epoch": 0.9534638187281734, + "grad_norm": 6.249505047993975, + "learning_rate": 2.834456311190975e-08, + "loss": 0.3818, + "step": 11740 + }, + { + "epoch": 0.9535450337042151, + "grad_norm": 6.005374198159657, + "learning_rate": 2.8245902097641388e-08, + "loss": 0.6104, + "step": 11741 + }, + { + "epoch": 0.9536262486802567, + "grad_norm": 5.110767892723842, + "learning_rate": 2.8147412115674955e-08, + "loss": 0.4915, + "step": 11742 + }, + { + "epoch": 0.9537074636562982, + "grad_norm": 4.5731007153331955, + "learning_rate": 2.8049093172825282e-08, + "loss": 0.4256, + "step": 11743 + }, + { + "epoch": 0.9537886786323398, + "grad_norm": 10.106582363128767, + "learning_rate": 2.795094527589609e-08, + "loss": 0.4245, + "step": 11744 + }, + { + "epoch": 0.9538698936083814, + "grad_norm": 4.273795431273181, + "learning_rate": 2.7852968431678064e-08, + "loss": 0.4379, + "step": 11745 + }, + { + "epoch": 0.953951108584423, + "grad_norm": 4.307573087745445, + "learning_rate": 2.7755162646950773e-08, + "loss": 0.6883, + "step": 11746 + }, + { + "epoch": 0.9540323235604645, + "grad_norm": 5.5704933532695105, + "learning_rate": 2.7657527928482418e-08, + "loss": 0.503, + "step": 11747 + }, + { + "epoch": 0.9541135385365062, + "grad_norm": 5.927129889565711, + "learning_rate": 2.756006428302843e-08, + "loss": 0.5112, + "step": 11748 + }, + { + "epoch": 0.9541947535125477, + "grad_norm": 7.739005076724821, + "learning_rate": 2.746277171733258e-08, + "loss": 0.4949, + "step": 11749 + }, + { + "epoch": 0.9542759684885893, + "grad_norm": 4.857345101026582, + "learning_rate": 2.736565023812754e-08, + "loss": 0.4114, + "step": 11750 + }, + { + "epoch": 0.9543571834646308, + "grad_norm": 5.671784265799197, + "learning_rate": 2.726869985213293e-08, + "loss": 0.4542, + "step": 11751 + }, + { + "epoch": 0.9544383984406725, + "grad_norm": 4.542817164476761, + "learning_rate": 2.717192056605783e-08, + "loss": 0.4281, + "step": 11752 + }, + { + "epoch": 0.9545196134167141, + "grad_norm": 4.657810531667513, + "learning_rate": 2.7075312386598274e-08, + "loss": 0.5609, + "step": 11753 + }, + { + "epoch": 0.9546008283927556, + "grad_norm": 4.663673614763122, + "learning_rate": 2.697887532043947e-08, + "loss": 0.4775, + "step": 11754 + }, + { + "epoch": 0.9546820433687973, + "grad_norm": 5.2321773718184765, + "learning_rate": 2.688260937425413e-08, + "loss": 0.5095, + "step": 11755 + }, + { + "epoch": 0.9547632583448388, + "grad_norm": 4.5112988407221835, + "learning_rate": 2.67865145547036e-08, + "loss": 0.4052, + "step": 11756 + }, + { + "epoch": 0.9548444733208804, + "grad_norm": 4.994296884071479, + "learning_rate": 2.6690590868436728e-08, + "loss": 0.5502, + "step": 11757 + }, + { + "epoch": 0.9549256882969219, + "grad_norm": 5.907644955879376, + "learning_rate": 2.6594838322091255e-08, + "loss": 0.5292, + "step": 11758 + }, + { + "epoch": 0.9550069032729636, + "grad_norm": 4.76086278575601, + "learning_rate": 2.6499256922292715e-08, + "loss": 0.532, + "step": 11759 + }, + { + "epoch": 0.9550881182490051, + "grad_norm": 10.420115891569926, + "learning_rate": 2.640384667565471e-08, + "loss": 0.5241, + "step": 11760 + }, + { + "epoch": 0.9551693332250467, + "grad_norm": 8.326204967482434, + "learning_rate": 2.6308607588779177e-08, + "loss": 0.3722, + "step": 11761 + }, + { + "epoch": 0.9552505482010882, + "grad_norm": 4.358771745047045, + "learning_rate": 2.6213539668256126e-08, + "loss": 0.4494, + "step": 11762 + }, + { + "epoch": 0.9553317631771299, + "grad_norm": 3.6696692614668676, + "learning_rate": 2.6118642920663906e-08, + "loss": 0.4823, + "step": 11763 + }, + { + "epoch": 0.9554129781531715, + "grad_norm": 3.432418349481183, + "learning_rate": 2.6023917352568652e-08, + "loss": 0.4218, + "step": 11764 + }, + { + "epoch": 0.955494193129213, + "grad_norm": 5.4984450992011515, + "learning_rate": 2.592936297052512e-08, + "loss": 0.5265, + "step": 11765 + }, + { + "epoch": 0.9555754081052547, + "grad_norm": 4.899194056779067, + "learning_rate": 2.5834979781075854e-08, + "loss": 0.3908, + "step": 11766 + }, + { + "epoch": 0.9556566230812962, + "grad_norm": 3.7039379030025317, + "learning_rate": 2.5740767790751463e-08, + "loss": 0.3868, + "step": 11767 + }, + { + "epoch": 0.9557378380573378, + "grad_norm": 7.5200178859973725, + "learning_rate": 2.5646727006071182e-08, + "loss": 0.4753, + "step": 11768 + }, + { + "epoch": 0.9558190530333793, + "grad_norm": 5.533797272476705, + "learning_rate": 2.55528574335423e-08, + "loss": 0.5694, + "step": 11769 + }, + { + "epoch": 0.955900268009421, + "grad_norm": 4.804791486887786, + "learning_rate": 2.5459159079659625e-08, + "loss": 0.4299, + "step": 11770 + }, + { + "epoch": 0.9559814829854625, + "grad_norm": 5.494381293218291, + "learning_rate": 2.5365631950906856e-08, + "loss": 0.2743, + "step": 11771 + }, + { + "epoch": 0.9560626979615041, + "grad_norm": 5.394100187529947, + "learning_rate": 2.5272276053755207e-08, + "loss": 0.5166, + "step": 11772 + }, + { + "epoch": 0.9561439129375456, + "grad_norm": 3.979565293315754, + "learning_rate": 2.5179091394665346e-08, + "loss": 0.3647, + "step": 11773 + }, + { + "epoch": 0.9562251279135873, + "grad_norm": 5.9411168688277245, + "learning_rate": 2.5086077980084057e-08, + "loss": 0.4242, + "step": 11774 + }, + { + "epoch": 0.9563063428896289, + "grad_norm": 6.4302270705636975, + "learning_rate": 2.4993235816448136e-08, + "loss": 0.4753, + "step": 11775 + }, + { + "epoch": 0.9563875578656704, + "grad_norm": 4.533132653222595, + "learning_rate": 2.4900564910181334e-08, + "loss": 0.4724, + "step": 11776 + }, + { + "epoch": 0.9564687728417121, + "grad_norm": 6.793870761327284, + "learning_rate": 2.4808065267696303e-08, + "loss": 0.3712, + "step": 11777 + }, + { + "epoch": 0.9565499878177536, + "grad_norm": 6.118222175954092, + "learning_rate": 2.4715736895393195e-08, + "loss": 0.4916, + "step": 11778 + }, + { + "epoch": 0.9566312027937952, + "grad_norm": 6.201932383609763, + "learning_rate": 2.462357979966107e-08, + "loss": 0.4406, + "step": 11779 + }, + { + "epoch": 0.9567124177698367, + "grad_norm": 5.730634163072851, + "learning_rate": 2.453159398687649e-08, + "loss": 0.47, + "step": 11780 + }, + { + "epoch": 0.9567936327458784, + "grad_norm": 5.240490008736156, + "learning_rate": 2.443977946340409e-08, + "loss": 0.5144, + "step": 11781 + }, + { + "epoch": 0.9568748477219199, + "grad_norm": 2.6161093483379783, + "learning_rate": 2.4348136235597398e-08, + "loss": 0.4603, + "step": 11782 + }, + { + "epoch": 0.9569560626979615, + "grad_norm": 4.15867662957049, + "learning_rate": 2.425666430979773e-08, + "loss": 0.5706, + "step": 11783 + }, + { + "epoch": 0.957037277674003, + "grad_norm": 9.450975853907856, + "learning_rate": 2.416536369233391e-08, + "loss": 0.477, + "step": 11784 + }, + { + "epoch": 0.9571184926500447, + "grad_norm": 7.347938995622321, + "learning_rate": 2.4074234389523665e-08, + "loss": 0.502, + "step": 11785 + }, + { + "epoch": 0.9571997076260863, + "grad_norm": 4.253055407962608, + "learning_rate": 2.3983276407672784e-08, + "loss": 0.4742, + "step": 11786 + }, + { + "epoch": 0.9572809226021278, + "grad_norm": 3.0805970942274135, + "learning_rate": 2.389248975307512e-08, + "loss": 0.6164, + "step": 11787 + }, + { + "epoch": 0.9573621375781695, + "grad_norm": 5.749655956883042, + "learning_rate": 2.3801874432012594e-08, + "loss": 0.5219, + "step": 11788 + }, + { + "epoch": 0.957443352554211, + "grad_norm": 4.8340592635662345, + "learning_rate": 2.371143045075519e-08, + "loss": 0.5919, + "step": 11789 + }, + { + "epoch": 0.9575245675302526, + "grad_norm": 3.8977820934124066, + "learning_rate": 2.3621157815561237e-08, + "loss": 0.6845, + "step": 11790 + }, + { + "epoch": 0.9576057825062941, + "grad_norm": 9.678166602329553, + "learning_rate": 2.3531056532677122e-08, + "loss": 0.5546, + "step": 11791 + }, + { + "epoch": 0.9576869974823358, + "grad_norm": 7.305637570872725, + "learning_rate": 2.3441126608337304e-08, + "loss": 0.455, + "step": 11792 + }, + { + "epoch": 0.9577682124583773, + "grad_norm": 3.8733070193102384, + "learning_rate": 2.335136804876459e-08, + "loss": 0.5323, + "step": 11793 + }, + { + "epoch": 0.9578494274344189, + "grad_norm": 3.9403704900752636, + "learning_rate": 2.3261780860169558e-08, + "loss": 0.4325, + "step": 11794 + }, + { + "epoch": 0.9579306424104604, + "grad_norm": 5.6576542673173575, + "learning_rate": 2.31723650487517e-08, + "loss": 0.4517, + "step": 11795 + }, + { + "epoch": 0.9580118573865021, + "grad_norm": 6.1828309207281755, + "learning_rate": 2.3083120620697453e-08, + "loss": 0.3581, + "step": 11796 + }, + { + "epoch": 0.9580930723625437, + "grad_norm": 4.945073828835356, + "learning_rate": 2.2994047582182433e-08, + "loss": 0.3434, + "step": 11797 + }, + { + "epoch": 0.9581742873385852, + "grad_norm": 6.8876285655205285, + "learning_rate": 2.2905145939369765e-08, + "loss": 0.4171, + "step": 11798 + }, + { + "epoch": 0.9582555023146269, + "grad_norm": 5.814280241290536, + "learning_rate": 2.2816415698411475e-08, + "loss": 0.518, + "step": 11799 + }, + { + "epoch": 0.9583367172906684, + "grad_norm": 6.165510069966149, + "learning_rate": 2.272785686544682e-08, + "loss": 0.4784, + "step": 11800 + }, + { + "epoch": 0.95841793226671, + "grad_norm": 5.755892898880744, + "learning_rate": 2.263946944660367e-08, + "loss": 0.4269, + "step": 11801 + }, + { + "epoch": 0.9584991472427515, + "grad_norm": 10.987936416240586, + "learning_rate": 2.2551253447997968e-08, + "loss": 0.3444, + "step": 11802 + }, + { + "epoch": 0.9585803622187932, + "grad_norm": 15.074930100573221, + "learning_rate": 2.2463208875733723e-08, + "loss": 0.5635, + "step": 11803 + }, + { + "epoch": 0.9586615771948347, + "grad_norm": 4.1773061107164935, + "learning_rate": 2.237533573590328e-08, + "loss": 0.5389, + "step": 11804 + }, + { + "epoch": 0.9587427921708763, + "grad_norm": 4.63238528075123, + "learning_rate": 2.228763403458706e-08, + "loss": 0.6207, + "step": 11805 + }, + { + "epoch": 0.9588240071469178, + "grad_norm": 5.9729583458173945, + "learning_rate": 2.2200103777853255e-08, + "loss": 0.5594, + "step": 11806 + }, + { + "epoch": 0.9589052221229595, + "grad_norm": 9.831624315329341, + "learning_rate": 2.211274497175897e-08, + "loss": 0.58, + "step": 11807 + }, + { + "epoch": 0.9589864370990011, + "grad_norm": 6.030561079423977, + "learning_rate": 2.2025557622348537e-08, + "loss": 0.5508, + "step": 11808 + }, + { + "epoch": 0.9590676520750426, + "grad_norm": 5.732610023291474, + "learning_rate": 2.1938541735655183e-08, + "loss": 0.5854, + "step": 11809 + }, + { + "epoch": 0.9591488670510843, + "grad_norm": 16.34752832829806, + "learning_rate": 2.1851697317699373e-08, + "loss": 0.3753, + "step": 11810 + }, + { + "epoch": 0.9592300820271258, + "grad_norm": 8.516533326876681, + "learning_rate": 2.1765024374491018e-08, + "loss": 0.4773, + "step": 11811 + }, + { + "epoch": 0.9593112970031674, + "grad_norm": 4.534008961064627, + "learning_rate": 2.1678522912026988e-08, + "loss": 0.4447, + "step": 11812 + }, + { + "epoch": 0.9593925119792089, + "grad_norm": 5.048729203766718, + "learning_rate": 2.1592192936292777e-08, + "loss": 0.4867, + "step": 11813 + }, + { + "epoch": 0.9594737269552506, + "grad_norm": 5.401071455659973, + "learning_rate": 2.1506034453262214e-08, + "loss": 0.4589, + "step": 11814 + }, + { + "epoch": 0.9595549419312921, + "grad_norm": 4.550782933767591, + "learning_rate": 2.142004746889692e-08, + "loss": 0.4399, + "step": 11815 + }, + { + "epoch": 0.9596361569073337, + "grad_norm": 7.657034015101817, + "learning_rate": 2.1334231989146304e-08, + "loss": 0.5894, + "step": 11816 + }, + { + "epoch": 0.9597173718833752, + "grad_norm": 4.68873295706115, + "learning_rate": 2.124858801994867e-08, + "loss": 0.5836, + "step": 11817 + }, + { + "epoch": 0.9597985868594169, + "grad_norm": 5.932018957670042, + "learning_rate": 2.1163115567230386e-08, + "loss": 0.6573, + "step": 11818 + }, + { + "epoch": 0.9598798018354585, + "grad_norm": 6.610287249201689, + "learning_rate": 2.1077814636905337e-08, + "loss": 0.5133, + "step": 11819 + }, + { + "epoch": 0.9599610168115, + "grad_norm": 4.576678178324454, + "learning_rate": 2.099268523487602e-08, + "loss": 0.406, + "step": 11820 + }, + { + "epoch": 0.9600422317875417, + "grad_norm": 5.480905722183299, + "learning_rate": 2.0907727367033005e-08, + "loss": 0.5465, + "step": 11821 + }, + { + "epoch": 0.9601234467635832, + "grad_norm": 10.7296131406781, + "learning_rate": 2.0822941039254642e-08, + "loss": 0.5529, + "step": 11822 + }, + { + "epoch": 0.9602046617396248, + "grad_norm": 4.499843737247379, + "learning_rate": 2.0738326257407904e-08, + "loss": 0.4012, + "step": 11823 + }, + { + "epoch": 0.9602858767156663, + "grad_norm": 5.585081499968746, + "learning_rate": 2.0653883027347832e-08, + "loss": 0.4078, + "step": 11824 + }, + { + "epoch": 0.960367091691708, + "grad_norm": 5.781778186861071, + "learning_rate": 2.056961135491725e-08, + "loss": 0.4095, + "step": 11825 + }, + { + "epoch": 0.9604483066677495, + "grad_norm": 5.2006226427597415, + "learning_rate": 2.048551124594733e-08, + "loss": 0.5062, + "step": 11826 + }, + { + "epoch": 0.9605295216437911, + "grad_norm": 9.000238161766775, + "learning_rate": 2.0401582706257304e-08, + "loss": 0.4405, + "step": 11827 + }, + { + "epoch": 0.9606107366198327, + "grad_norm": 5.406010740569412, + "learning_rate": 2.031782574165475e-08, + "loss": 0.5036, + "step": 11828 + }, + { + "epoch": 0.9606919515958743, + "grad_norm": 5.004999912991991, + "learning_rate": 2.0234240357935032e-08, + "loss": 0.4662, + "step": 11829 + }, + { + "epoch": 0.9607731665719159, + "grad_norm": 4.837946844168023, + "learning_rate": 2.015082656088213e-08, + "loss": 0.496, + "step": 11830 + }, + { + "epoch": 0.9608543815479574, + "grad_norm": 6.627370947037891, + "learning_rate": 2.0067584356267545e-08, + "loss": 0.375, + "step": 11831 + }, + { + "epoch": 0.9609355965239991, + "grad_norm": 10.674671755581318, + "learning_rate": 1.998451374985111e-08, + "loss": 0.5067, + "step": 11832 + }, + { + "epoch": 0.9610168115000406, + "grad_norm": 17.13229424192217, + "learning_rate": 1.9901614747381004e-08, + "loss": 0.3741, + "step": 11833 + }, + { + "epoch": 0.9610980264760822, + "grad_norm": 6.635134277566872, + "learning_rate": 1.981888735459375e-08, + "loss": 0.5692, + "step": 11834 + }, + { + "epoch": 0.9611792414521237, + "grad_norm": 3.8371011025366055, + "learning_rate": 1.973633157721283e-08, + "loss": 0.5985, + "step": 11835 + }, + { + "epoch": 0.9612604564281654, + "grad_norm": 5.13228830947514, + "learning_rate": 1.9653947420951448e-08, + "loss": 0.492, + "step": 11836 + }, + { + "epoch": 0.9613416714042069, + "grad_norm": 6.733892581752105, + "learning_rate": 1.9571734891509763e-08, + "loss": 0.4515, + "step": 11837 + }, + { + "epoch": 0.9614228863802485, + "grad_norm": 5.22909116303276, + "learning_rate": 1.9489693994576563e-08, + "loss": 0.6163, + "step": 11838 + }, + { + "epoch": 0.96150410135629, + "grad_norm": 4.302408834520267, + "learning_rate": 1.9407824735828696e-08, + "loss": 0.4538, + "step": 11839 + }, + { + "epoch": 0.9615853163323317, + "grad_norm": 5.043172467524965, + "learning_rate": 1.932612712093107e-08, + "loss": 0.4024, + "step": 11840 + }, + { + "epoch": 0.9616665313083733, + "grad_norm": 8.816715558441784, + "learning_rate": 1.9244601155536392e-08, + "loss": 0.4981, + "step": 11841 + }, + { + "epoch": 0.9617477462844148, + "grad_norm": 4.8079654103051475, + "learning_rate": 1.9163246845286253e-08, + "loss": 0.4698, + "step": 11842 + }, + { + "epoch": 0.9618289612604565, + "grad_norm": 6.163595049432749, + "learning_rate": 1.908206419580977e-08, + "loss": 0.5798, + "step": 11843 + }, + { + "epoch": 0.961910176236498, + "grad_norm": 4.866986542895558, + "learning_rate": 1.9001053212724387e-08, + "loss": 0.5856, + "step": 11844 + }, + { + "epoch": 0.9619913912125396, + "grad_norm": 5.782563129780745, + "learning_rate": 1.892021390163562e-08, + "loss": 0.4907, + "step": 11845 + }, + { + "epoch": 0.9620726061885811, + "grad_norm": 5.712857730829221, + "learning_rate": 1.8839546268137054e-08, + "loss": 0.4423, + "step": 11846 + }, + { + "epoch": 0.9621538211646228, + "grad_norm": 4.414806524066914, + "learning_rate": 1.8759050317810612e-08, + "loss": 0.4499, + "step": 11847 + }, + { + "epoch": 0.9622350361406643, + "grad_norm": 4.949387208955272, + "learning_rate": 1.8678726056226004e-08, + "loss": 0.4297, + "step": 11848 + }, + { + "epoch": 0.9623162511167059, + "grad_norm": 4.396280286867585, + "learning_rate": 1.8598573488941285e-08, + "loss": 0.5549, + "step": 11849 + }, + { + "epoch": 0.9623974660927475, + "grad_norm": 6.050149072377, + "learning_rate": 1.8518592621502852e-08, + "loss": 0.5692, + "step": 11850 + }, + { + "epoch": 0.9624786810687891, + "grad_norm": 5.0620158081409246, + "learning_rate": 1.8438783459444608e-08, + "loss": 0.5378, + "step": 11851 + }, + { + "epoch": 0.9625598960448307, + "grad_norm": 3.929260770758667, + "learning_rate": 1.8359146008289087e-08, + "loss": 0.5249, + "step": 11852 + }, + { + "epoch": 0.9626411110208722, + "grad_norm": 6.751642731964542, + "learning_rate": 1.8279680273546874e-08, + "loss": 0.3583, + "step": 11853 + }, + { + "epoch": 0.9627223259969139, + "grad_norm": 7.566915713105148, + "learning_rate": 1.8200386260716352e-08, + "loss": 0.5411, + "step": 11854 + }, + { + "epoch": 0.9628035409729554, + "grad_norm": 4.154405022428572, + "learning_rate": 1.812126397528452e-08, + "loss": 0.5512, + "step": 11855 + }, + { + "epoch": 0.962884755948997, + "grad_norm": 7.42761465533202, + "learning_rate": 1.804231342272589e-08, + "loss": 0.5403, + "step": 11856 + }, + { + "epoch": 0.9629659709250386, + "grad_norm": 7.694593764622218, + "learning_rate": 1.796353460850331e-08, + "loss": 0.6331, + "step": 11857 + }, + { + "epoch": 0.9630471859010802, + "grad_norm": 16.086802457999514, + "learning_rate": 1.7884927538068532e-08, + "loss": 0.4567, + "step": 11858 + }, + { + "epoch": 0.9631284008771217, + "grad_norm": 6.812124405495696, + "learning_rate": 1.7806492216860537e-08, + "loss": 0.5277, + "step": 11859 + }, + { + "epoch": 0.9632096158531633, + "grad_norm": 6.335913061374134, + "learning_rate": 1.77282286503061e-08, + "loss": 0.4104, + "step": 11860 + }, + { + "epoch": 0.9632908308292049, + "grad_norm": 5.290742984147669, + "learning_rate": 1.7650136843821163e-08, + "loss": 0.5395, + "step": 11861 + }, + { + "epoch": 0.9633720458052465, + "grad_norm": 7.397405822219676, + "learning_rate": 1.7572216802808907e-08, + "loss": 0.5077, + "step": 11862 + }, + { + "epoch": 0.9634532607812881, + "grad_norm": 5.393822284388817, + "learning_rate": 1.74944685326614e-08, + "loss": 0.447, + "step": 11863 + }, + { + "epoch": 0.9635344757573296, + "grad_norm": 4.001728592267749, + "learning_rate": 1.741689203875796e-08, + "loss": 0.5923, + "step": 11864 + }, + { + "epoch": 0.9636156907333713, + "grad_norm": 6.112171929702209, + "learning_rate": 1.7339487326466787e-08, + "loss": 0.5556, + "step": 11865 + }, + { + "epoch": 0.9636969057094128, + "grad_norm": 4.7616632757082575, + "learning_rate": 1.7262254401143873e-08, + "loss": 0.5483, + "step": 11866 + }, + { + "epoch": 0.9637781206854544, + "grad_norm": 5.833975112122491, + "learning_rate": 1.7185193268133282e-08, + "loss": 0.4426, + "step": 11867 + }, + { + "epoch": 0.963859335661496, + "grad_norm": 3.698744760296142, + "learning_rate": 1.7108303932767135e-08, + "loss": 0.4634, + "step": 11868 + }, + { + "epoch": 0.9639405506375376, + "grad_norm": 6.198330496132483, + "learning_rate": 1.7031586400365895e-08, + "loss": 0.4089, + "step": 11869 + }, + { + "epoch": 0.9640217656135791, + "grad_norm": 4.92658425213048, + "learning_rate": 1.695504067623782e-08, + "loss": 0.571, + "step": 11870 + }, + { + "epoch": 0.9641029805896207, + "grad_norm": 4.570520680219551, + "learning_rate": 1.6878666765679507e-08, + "loss": 0.4378, + "step": 11871 + }, + { + "epoch": 0.9641841955656623, + "grad_norm": 4.130327563075009, + "learning_rate": 1.6802464673975893e-08, + "loss": 0.4161, + "step": 11872 + }, + { + "epoch": 0.9642654105417039, + "grad_norm": 5.2911892010001775, + "learning_rate": 1.6726434406399704e-08, + "loss": 0.4437, + "step": 11873 + }, + { + "epoch": 0.9643466255177455, + "grad_norm": 7.92651230061654, + "learning_rate": 1.6650575968211458e-08, + "loss": 0.3474, + "step": 11874 + }, + { + "epoch": 0.964427840493787, + "grad_norm": 4.955382254039267, + "learning_rate": 1.6574889364660564e-08, + "loss": 0.4664, + "step": 11875 + }, + { + "epoch": 0.9645090554698287, + "grad_norm": 5.462298377112138, + "learning_rate": 1.6499374600983943e-08, + "loss": 0.4326, + "step": 11876 + }, + { + "epoch": 0.9645902704458702, + "grad_norm": 3.395717603402873, + "learning_rate": 1.642403168240686e-08, + "loss": 0.5458, + "step": 11877 + }, + { + "epoch": 0.9646714854219118, + "grad_norm": 6.599864821668275, + "learning_rate": 1.6348860614142646e-08, + "loss": 0.5629, + "step": 11878 + }, + { + "epoch": 0.9647527003979534, + "grad_norm": 3.790645168309616, + "learning_rate": 1.62738614013927e-08, + "loss": 0.511, + "step": 11879 + }, + { + "epoch": 0.964833915373995, + "grad_norm": 2.6841332503368918, + "learning_rate": 1.6199034049346474e-08, + "loss": 0.3197, + "step": 11880 + }, + { + "epoch": 0.9649151303500365, + "grad_norm": 8.916212568990264, + "learning_rate": 1.6124378563182053e-08, + "loss": 0.4072, + "step": 11881 + }, + { + "epoch": 0.9649963453260781, + "grad_norm": 6.5493673229576945, + "learning_rate": 1.6049894948064748e-08, + "loss": 0.5502, + "step": 11882 + }, + { + "epoch": 0.9650775603021197, + "grad_norm": 3.681237380060799, + "learning_rate": 1.597558320914849e-08, + "loss": 0.5055, + "step": 11883 + }, + { + "epoch": 0.9651587752781613, + "grad_norm": 5.868912226870429, + "learning_rate": 1.5901443351575563e-08, + "loss": 0.4587, + "step": 11884 + }, + { + "epoch": 0.9652399902542029, + "grad_norm": 3.216888608216173, + "learning_rate": 1.5827475380475744e-08, + "loss": 0.4016, + "step": 11885 + }, + { + "epoch": 0.9653212052302445, + "grad_norm": 6.658602355248441, + "learning_rate": 1.575367930096716e-08, + "loss": 0.6319, + "step": 11886 + }, + { + "epoch": 0.9654024202062861, + "grad_norm": 7.62706341624403, + "learning_rate": 1.5680055118156566e-08, + "loss": 0.4729, + "step": 11887 + }, + { + "epoch": 0.9654836351823276, + "grad_norm": 5.573341730209802, + "learning_rate": 1.5606602837137942e-08, + "loss": 0.3682, + "step": 11888 + }, + { + "epoch": 0.9655648501583692, + "grad_norm": 9.836559250337336, + "learning_rate": 1.5533322462993884e-08, + "loss": 0.3449, + "step": 11889 + }, + { + "epoch": 0.9656460651344108, + "grad_norm": 5.722705721907483, + "learning_rate": 1.546021400079506e-08, + "loss": 0.5415, + "step": 11890 + }, + { + "epoch": 0.9657272801104524, + "grad_norm": 13.038255642017731, + "learning_rate": 1.538727745560048e-08, + "loss": 0.5975, + "step": 11891 + }, + { + "epoch": 0.9658084950864939, + "grad_norm": 4.658075474008751, + "learning_rate": 1.5314512832456385e-08, + "loss": 0.3905, + "step": 11892 + }, + { + "epoch": 0.9658897100625355, + "grad_norm": 3.609827209694475, + "learning_rate": 1.5241920136397913e-08, + "loss": 0.6306, + "step": 11893 + }, + { + "epoch": 0.9659709250385771, + "grad_norm": 4.924247089415893, + "learning_rate": 1.516949937244827e-08, + "loss": 0.6758, + "step": 11894 + }, + { + "epoch": 0.9660521400146187, + "grad_norm": 7.844480475751513, + "learning_rate": 1.5097250545618447e-08, + "loss": 0.5907, + "step": 11895 + }, + { + "epoch": 0.9661333549906603, + "grad_norm": 7.5845268596408655, + "learning_rate": 1.5025173660907776e-08, + "loss": 0.3675, + "step": 11896 + }, + { + "epoch": 0.9662145699667019, + "grad_norm": 6.1316891281247425, + "learning_rate": 1.495326872330366e-08, + "loss": 0.5117, + "step": 11897 + }, + { + "epoch": 0.9662957849427435, + "grad_norm": 13.323521108000474, + "learning_rate": 1.4881535737781282e-08, + "loss": 0.2842, + "step": 11898 + }, + { + "epoch": 0.966376999918785, + "grad_norm": 6.657166105699875, + "learning_rate": 1.4809974709304176e-08, + "loss": 0.4965, + "step": 11899 + }, + { + "epoch": 0.9664582148948266, + "grad_norm": 4.572946832553962, + "learning_rate": 1.4738585642824488e-08, + "loss": 0.5393, + "step": 11900 + }, + { + "epoch": 0.9665394298708682, + "grad_norm": 5.179202728578802, + "learning_rate": 1.4667368543281324e-08, + "loss": 0.3911, + "step": 11901 + }, + { + "epoch": 0.9666206448469098, + "grad_norm": 3.397470952294324, + "learning_rate": 1.4596323415602965e-08, + "loss": 0.5457, + "step": 11902 + }, + { + "epoch": 0.9667018598229513, + "grad_norm": 5.014658354565728, + "learning_rate": 1.4525450264705198e-08, + "loss": 0.495, + "step": 11903 + }, + { + "epoch": 0.966783074798993, + "grad_norm": 4.876907477649273, + "learning_rate": 1.4454749095491883e-08, + "loss": 0.386, + "step": 11904 + }, + { + "epoch": 0.9668642897750345, + "grad_norm": 6.641652702622383, + "learning_rate": 1.438421991285549e-08, + "loss": 0.5017, + "step": 11905 + }, + { + "epoch": 0.9669455047510761, + "grad_norm": 6.852642511727459, + "learning_rate": 1.4313862721676285e-08, + "loss": 0.5132, + "step": 11906 + }, + { + "epoch": 0.9670267197271177, + "grad_norm": 4.757471303632329, + "learning_rate": 1.4243677526822319e-08, + "loss": 0.5419, + "step": 11907 + }, + { + "epoch": 0.9671079347031593, + "grad_norm": 7.349401986982847, + "learning_rate": 1.4173664333149983e-08, + "loss": 0.4691, + "step": 11908 + }, + { + "epoch": 0.9671891496792009, + "grad_norm": 4.683264212562743, + "learning_rate": 1.4103823145504292e-08, + "loss": 0.396, + "step": 11909 + }, + { + "epoch": 0.9672703646552424, + "grad_norm": 8.004000185912112, + "learning_rate": 1.4034153968717768e-08, + "loss": 0.5567, + "step": 11910 + }, + { + "epoch": 0.967351579631284, + "grad_norm": 9.131569052331146, + "learning_rate": 1.3964656807610721e-08, + "loss": 0.4341, + "step": 11911 + }, + { + "epoch": 0.9674327946073256, + "grad_norm": 4.584784368488165, + "learning_rate": 1.3895331666992361e-08, + "loss": 0.5043, + "step": 11912 + }, + { + "epoch": 0.9675140095833672, + "grad_norm": 5.0665939824517165, + "learning_rate": 1.3826178551659686e-08, + "loss": 0.4195, + "step": 11913 + }, + { + "epoch": 0.9675952245594087, + "grad_norm": 4.105167424178751, + "learning_rate": 1.37571974663972e-08, + "loss": 0.4746, + "step": 11914 + }, + { + "epoch": 0.9676764395354504, + "grad_norm": 7.389787805986678, + "learning_rate": 1.3688388415978581e-08, + "loss": 0.5362, + "step": 11915 + }, + { + "epoch": 0.9677576545114919, + "grad_norm": 5.680461317779846, + "learning_rate": 1.361975140516475e-08, + "loss": 0.5635, + "step": 11916 + }, + { + "epoch": 0.9678388694875335, + "grad_norm": 3.8294537161607476, + "learning_rate": 1.3551286438705513e-08, + "loss": 0.517, + "step": 11917 + }, + { + "epoch": 0.9679200844635751, + "grad_norm": 5.093597645059693, + "learning_rate": 1.3482993521337362e-08, + "loss": 0.4311, + "step": 11918 + }, + { + "epoch": 0.9680012994396167, + "grad_norm": 4.15007656176377, + "learning_rate": 1.3414872657786793e-08, + "loss": 0.5585, + "step": 11919 + }, + { + "epoch": 0.9680825144156583, + "grad_norm": 5.618091315100742, + "learning_rate": 1.3346923852766702e-08, + "loss": 0.4277, + "step": 11920 + }, + { + "epoch": 0.9681637293916998, + "grad_norm": 3.77740666889434, + "learning_rate": 1.3279147110979163e-08, + "loss": 0.4445, + "step": 11921 + }, + { + "epoch": 0.9682449443677414, + "grad_norm": 14.059774837034583, + "learning_rate": 1.3211542437113755e-08, + "loss": 0.4987, + "step": 11922 + }, + { + "epoch": 0.968326159343783, + "grad_norm": 5.451347013670637, + "learning_rate": 1.3144109835848685e-08, + "loss": 0.4612, + "step": 11923 + }, + { + "epoch": 0.9684073743198246, + "grad_norm": 9.206132420636997, + "learning_rate": 1.3076849311849382e-08, + "loss": 0.5038, + "step": 11924 + }, + { + "epoch": 0.9684885892958661, + "grad_norm": 11.483754762302455, + "learning_rate": 1.300976086977046e-08, + "loss": 0.4655, + "step": 11925 + }, + { + "epoch": 0.9685698042719078, + "grad_norm": 6.743288245848484, + "learning_rate": 1.2942844514254038e-08, + "loss": 0.5264, + "step": 11926 + }, + { + "epoch": 0.9686510192479493, + "grad_norm": 6.053479065590689, + "learning_rate": 1.2876100249930024e-08, + "loss": 0.4499, + "step": 11927 + }, + { + "epoch": 0.9687322342239909, + "grad_norm": 3.6207902961878853, + "learning_rate": 1.2809528081416667e-08, + "loss": 0.536, + "step": 11928 + }, + { + "epoch": 0.9688134492000325, + "grad_norm": 6.866156397584753, + "learning_rate": 1.2743128013321115e-08, + "loss": 0.3062, + "step": 11929 + }, + { + "epoch": 0.9688946641760741, + "grad_norm": 5.924869562979613, + "learning_rate": 1.2676900050237472e-08, + "loss": 0.4935, + "step": 11930 + }, + { + "epoch": 0.9689758791521157, + "grad_norm": 5.999357098508331, + "learning_rate": 1.2610844196748184e-08, + "loss": 0.3896, + "step": 11931 + }, + { + "epoch": 0.9690570941281572, + "grad_norm": 24.091678245998175, + "learning_rate": 1.2544960457424316e-08, + "loss": 0.4804, + "step": 11932 + }, + { + "epoch": 0.9691383091041988, + "grad_norm": 4.839750186365689, + "learning_rate": 1.2479248836824165e-08, + "loss": 0.4291, + "step": 11933 + }, + { + "epoch": 0.9692195240802404, + "grad_norm": 5.821546388018869, + "learning_rate": 1.2413709339495205e-08, + "loss": 0.3908, + "step": 11934 + }, + { + "epoch": 0.969300739056282, + "grad_norm": 4.807004678084896, + "learning_rate": 1.2348341969972143e-08, + "loss": 0.461, + "step": 11935 + }, + { + "epoch": 0.9693819540323235, + "grad_norm": 11.290614300170317, + "learning_rate": 1.2283146732778306e-08, + "loss": 0.498, + "step": 11936 + }, + { + "epoch": 0.9694631690083652, + "grad_norm": 4.900177210666115, + "learning_rate": 1.2218123632424527e-08, + "loss": 0.5445, + "step": 11937 + }, + { + "epoch": 0.9695443839844067, + "grad_norm": 4.9843232783971665, + "learning_rate": 1.2153272673409989e-08, + "loss": 0.4344, + "step": 11938 + }, + { + "epoch": 0.9696255989604483, + "grad_norm": 9.329861274666198, + "learning_rate": 1.2088593860222487e-08, + "loss": 0.523, + "step": 11939 + }, + { + "epoch": 0.9697068139364899, + "grad_norm": 5.949707334731747, + "learning_rate": 1.2024087197337053e-08, + "loss": 0.4087, + "step": 11940 + }, + { + "epoch": 0.9697880289125315, + "grad_norm": 3.2312999306605934, + "learning_rate": 1.1959752689217342e-08, + "loss": 0.4933, + "step": 11941 + }, + { + "epoch": 0.9698692438885731, + "grad_norm": 4.579568524123334, + "learning_rate": 1.1895590340315343e-08, + "loss": 0.5973, + "step": 11942 + }, + { + "epoch": 0.9699504588646146, + "grad_norm": 4.04721463907932, + "learning_rate": 1.183160015507001e-08, + "loss": 0.5191, + "step": 11943 + }, + { + "epoch": 0.9700316738406563, + "grad_norm": 3.4507628169869564, + "learning_rate": 1.1767782137909467e-08, + "loss": 0.4671, + "step": 11944 + }, + { + "epoch": 0.9701128888166978, + "grad_norm": 4.199682308907727, + "learning_rate": 1.17041362932499e-08, + "loss": 0.5631, + "step": 11945 + }, + { + "epoch": 0.9701941037927394, + "grad_norm": 4.406904013569409, + "learning_rate": 1.1640662625494737e-08, + "loss": 0.5799, + "step": 11946 + }, + { + "epoch": 0.9702753187687809, + "grad_norm": 6.977075217955481, + "learning_rate": 1.1577361139036292e-08, + "loss": 0.5053, + "step": 11947 + }, + { + "epoch": 0.9703565337448226, + "grad_norm": 5.4502036515579295, + "learning_rate": 1.1514231838254674e-08, + "loss": 0.4786, + "step": 11948 + }, + { + "epoch": 0.9704377487208641, + "grad_norm": 11.48385762088185, + "learning_rate": 1.1451274727518058e-08, + "loss": 0.4975, + "step": 11949 + }, + { + "epoch": 0.9705189636969057, + "grad_norm": 4.167108685353742, + "learning_rate": 1.1388489811182957e-08, + "loss": 0.5192, + "step": 11950 + }, + { + "epoch": 0.9706001786729473, + "grad_norm": 5.2318458897159115, + "learning_rate": 1.1325877093593396e-08, + "loss": 0.5431, + "step": 11951 + }, + { + "epoch": 0.9706813936489889, + "grad_norm": 6.6175704623771505, + "learning_rate": 1.1263436579082022e-08, + "loss": 0.6257, + "step": 11952 + }, + { + "epoch": 0.9707626086250305, + "grad_norm": 3.4493770443599465, + "learning_rate": 1.1201168271969266e-08, + "loss": 0.5008, + "step": 11953 + }, + { + "epoch": 0.970843823601072, + "grad_norm": 9.823049761761046, + "learning_rate": 1.1139072176564181e-08, + "loss": 0.6227, + "step": 11954 + }, + { + "epoch": 0.9709250385771137, + "grad_norm": 12.037056297654933, + "learning_rate": 1.1077148297163053e-08, + "loss": 0.4436, + "step": 11955 + }, + { + "epoch": 0.9710062535531552, + "grad_norm": 47.76636023695995, + "learning_rate": 1.101539663805079e-08, + "loss": 0.44, + "step": 11956 + }, + { + "epoch": 0.9710874685291968, + "grad_norm": 5.334949326289944, + "learning_rate": 1.0953817203500084e-08, + "loss": 0.4165, + "step": 11957 + }, + { + "epoch": 0.9711686835052383, + "grad_norm": 9.47687334833281, + "learning_rate": 1.0892409997772524e-08, + "loss": 0.4979, + "step": 11958 + }, + { + "epoch": 0.97124989848128, + "grad_norm": 5.061927562596902, + "learning_rate": 1.0831175025116658e-08, + "loss": 0.4791, + "step": 11959 + }, + { + "epoch": 0.9713311134573215, + "grad_norm": 8.5596022285608, + "learning_rate": 1.0770112289769653e-08, + "loss": 0.5829, + "step": 11960 + }, + { + "epoch": 0.9714123284333631, + "grad_norm": 4.508772275279344, + "learning_rate": 1.0709221795956738e-08, + "loss": 0.5874, + "step": 11961 + }, + { + "epoch": 0.9714935434094047, + "grad_norm": 4.701424170250187, + "learning_rate": 1.0648503547891487e-08, + "loss": 0.4767, + "step": 11962 + }, + { + "epoch": 0.9715747583854463, + "grad_norm": 3.965969498170774, + "learning_rate": 1.0587957549774986e-08, + "loss": 0.5583, + "step": 11963 + }, + { + "epoch": 0.9716559733614879, + "grad_norm": 7.964781868944689, + "learning_rate": 1.052758380579666e-08, + "loss": 0.5368, + "step": 11964 + }, + { + "epoch": 0.9717371883375294, + "grad_norm": 5.249845918760695, + "learning_rate": 1.0467382320134279e-08, + "loss": 0.4113, + "step": 11965 + }, + { + "epoch": 0.971818403313571, + "grad_norm": 5.825530015816844, + "learning_rate": 1.0407353096953398e-08, + "loss": 0.3504, + "step": 11966 + }, + { + "epoch": 0.9718996182896126, + "grad_norm": 9.06281122515128, + "learning_rate": 1.034749614040792e-08, + "loss": 0.3699, + "step": 11967 + }, + { + "epoch": 0.9719808332656542, + "grad_norm": 7.498622251328044, + "learning_rate": 1.0287811454639252e-08, + "loss": 0.4268, + "step": 11968 + }, + { + "epoch": 0.9720620482416957, + "grad_norm": 6.263774598272872, + "learning_rate": 1.0228299043777146e-08, + "loss": 0.4959, + "step": 11969 + }, + { + "epoch": 0.9721432632177374, + "grad_norm": 5.061985215032839, + "learning_rate": 1.0168958911939975e-08, + "loss": 0.5099, + "step": 11970 + }, + { + "epoch": 0.9722244781937789, + "grad_norm": 8.537598436187121, + "learning_rate": 1.0109791063233898e-08, + "loss": 0.3745, + "step": 11971 + }, + { + "epoch": 0.9723056931698205, + "grad_norm": 3.710316240876283, + "learning_rate": 1.0050795501752309e-08, + "loss": 0.462, + "step": 11972 + }, + { + "epoch": 0.9723869081458621, + "grad_norm": 4.819345845034754, + "learning_rate": 9.991972231577774e-09, + "loss": 0.568, + "step": 11973 + }, + { + "epoch": 0.9724681231219037, + "grad_norm": 7.126395810841205, + "learning_rate": 9.933321256780925e-09, + "loss": 0.4745, + "step": 11974 + }, + { + "epoch": 0.9725493380979453, + "grad_norm": 4.57445118125883, + "learning_rate": 9.874842581419631e-09, + "loss": 0.3644, + "step": 11975 + }, + { + "epoch": 0.9726305530739868, + "grad_norm": 8.751285985895777, + "learning_rate": 9.816536209540373e-09, + "loss": 0.3933, + "step": 11976 + }, + { + "epoch": 0.9727117680500285, + "grad_norm": 6.666232456269325, + "learning_rate": 9.758402145177703e-09, + "loss": 0.499, + "step": 11977 + }, + { + "epoch": 0.97279298302607, + "grad_norm": 7.576271556752231, + "learning_rate": 9.70044039235396e-09, + "loss": 0.4075, + "step": 11978 + }, + { + "epoch": 0.9728741980021116, + "grad_norm": 4.487672059560348, + "learning_rate": 9.642650955080379e-09, + "loss": 0.3907, + "step": 11979 + }, + { + "epoch": 0.9729554129781531, + "grad_norm": 3.6526471218089864, + "learning_rate": 9.585033837355151e-09, + "loss": 0.5327, + "step": 11980 + }, + { + "epoch": 0.9730366279541948, + "grad_norm": 4.83213034119808, + "learning_rate": 9.527589043165086e-09, + "loss": 0.5155, + "step": 11981 + }, + { + "epoch": 0.9731178429302363, + "grad_norm": 4.370525558131413, + "learning_rate": 9.470316576485616e-09, + "loss": 0.4436, + "step": 11982 + }, + { + "epoch": 0.9731990579062779, + "grad_norm": 6.8962180265204065, + "learning_rate": 9.41321644127885e-09, + "loss": 0.4835, + "step": 11983 + }, + { + "epoch": 0.9732802728823196, + "grad_norm": 5.928084396311675, + "learning_rate": 9.356288641496624e-09, + "loss": 0.3822, + "step": 11984 + }, + { + "epoch": 0.9733614878583611, + "grad_norm": 5.61471404924446, + "learning_rate": 9.299533181077458e-09, + "loss": 0.36, + "step": 11985 + }, + { + "epoch": 0.9734427028344027, + "grad_norm": 4.928900628567097, + "learning_rate": 9.242950063948763e-09, + "loss": 0.4844, + "step": 11986 + }, + { + "epoch": 0.9735239178104442, + "grad_norm": 6.227465912824045, + "learning_rate": 9.18653929402602e-09, + "loss": 0.6315, + "step": 11987 + }, + { + "epoch": 0.9736051327864859, + "grad_norm": 5.256061365992016, + "learning_rate": 9.13030087521194e-09, + "loss": 0.5286, + "step": 11988 + }, + { + "epoch": 0.9736863477625274, + "grad_norm": 5.4504099987714465, + "learning_rate": 9.074234811398408e-09, + "loss": 0.4854, + "step": 11989 + }, + { + "epoch": 0.973767562738569, + "grad_norm": 6.539725065060604, + "learning_rate": 9.018341106464823e-09, + "loss": 0.4206, + "step": 11990 + }, + { + "epoch": 0.9738487777146105, + "grad_norm": 4.947085140988941, + "learning_rate": 8.962619764278923e-09, + "loss": 0.4808, + "step": 11991 + }, + { + "epoch": 0.9739299926906522, + "grad_norm": 6.855737833780214, + "learning_rate": 8.907070788695681e-09, + "loss": 0.3113, + "step": 11992 + }, + { + "epoch": 0.9740112076666937, + "grad_norm": 5.986387589137889, + "learning_rate": 8.851694183559523e-09, + "loss": 0.3221, + "step": 11993 + }, + { + "epoch": 0.9740924226427353, + "grad_norm": 6.396980239435106, + "learning_rate": 8.796489952701825e-09, + "loss": 0.5371, + "step": 11994 + }, + { + "epoch": 0.974173637618777, + "grad_norm": 4.469906653866564, + "learning_rate": 8.741458099942313e-09, + "loss": 0.5594, + "step": 11995 + }, + { + "epoch": 0.9742548525948185, + "grad_norm": 6.102062048541683, + "learning_rate": 8.686598629089326e-09, + "loss": 0.382, + "step": 11996 + }, + { + "epoch": 0.9743360675708601, + "grad_norm": 4.275407491240544, + "learning_rate": 8.63191154393872e-09, + "loss": 0.4869, + "step": 11997 + }, + { + "epoch": 0.9744172825469016, + "grad_norm": 6.27848767223886, + "learning_rate": 8.577396848274134e-09, + "loss": 0.4735, + "step": 11998 + }, + { + "epoch": 0.9744984975229433, + "grad_norm": 6.216106726624828, + "learning_rate": 8.523054545868381e-09, + "loss": 0.5145, + "step": 11999 + }, + { + "epoch": 0.9745797124989848, + "grad_norm": 6.83477643803915, + "learning_rate": 8.468884640480956e-09, + "loss": 0.4235, + "step": 12000 + }, + { + "epoch": 0.9746609274750264, + "grad_norm": 12.60083591448807, + "learning_rate": 8.414887135860528e-09, + "loss": 0.3258, + "step": 12001 + }, + { + "epoch": 0.9747421424510679, + "grad_norm": 4.645778183923307, + "learning_rate": 8.36106203574355e-09, + "loss": 0.5243, + "step": 12002 + }, + { + "epoch": 0.9748233574271096, + "grad_norm": 6.124524623697457, + "learning_rate": 8.307409343854267e-09, + "loss": 0.599, + "step": 12003 + }, + { + "epoch": 0.9749045724031511, + "grad_norm": 21.473022616832324, + "learning_rate": 8.253929063904986e-09, + "loss": 0.382, + "step": 12004 + }, + { + "epoch": 0.9749857873791927, + "grad_norm": 4.661852786954021, + "learning_rate": 8.200621199596359e-09, + "loss": 0.6825, + "step": 12005 + }, + { + "epoch": 0.9750670023552344, + "grad_norm": 3.538632513896874, + "learning_rate": 8.147485754617379e-09, + "loss": 0.4224, + "step": 12006 + }, + { + "epoch": 0.9751482173312759, + "grad_norm": 4.889222428461826, + "learning_rate": 8.094522732644272e-09, + "loss": 0.5328, + "step": 12007 + }, + { + "epoch": 0.9752294323073175, + "grad_norm": 8.835696874530043, + "learning_rate": 8.041732137341885e-09, + "loss": 0.3872, + "step": 12008 + }, + { + "epoch": 0.975310647283359, + "grad_norm": 5.49476829680145, + "learning_rate": 7.989113972363406e-09, + "loss": 0.6145, + "step": 12009 + }, + { + "epoch": 0.9753918622594007, + "grad_norm": 5.131999174481017, + "learning_rate": 7.936668241349255e-09, + "loss": 0.5043, + "step": 12010 + }, + { + "epoch": 0.9754730772354422, + "grad_norm": 4.572903995548805, + "learning_rate": 7.884394947928476e-09, + "loss": 0.5749, + "step": 12011 + }, + { + "epoch": 0.9755542922114838, + "grad_norm": 4.3477540547148825, + "learning_rate": 7.832294095718452e-09, + "loss": 0.4026, + "step": 12012 + }, + { + "epoch": 0.9756355071875253, + "grad_norm": 7.454961091377597, + "learning_rate": 7.780365688323798e-09, + "loss": 0.3078, + "step": 12013 + }, + { + "epoch": 0.975716722163567, + "grad_norm": 4.170795190072248, + "learning_rate": 7.72860972933831e-09, + "loss": 0.5571, + "step": 12014 + }, + { + "epoch": 0.9757979371396085, + "grad_norm": 7.707305765402371, + "learning_rate": 7.677026222342454e-09, + "loss": 0.3938, + "step": 12015 + }, + { + "epoch": 0.9758791521156501, + "grad_norm": 4.686956759781069, + "learning_rate": 7.625615170906153e-09, + "loss": 0.6371, + "step": 12016 + }, + { + "epoch": 0.9759603670916918, + "grad_norm": 6.526544319588936, + "learning_rate": 7.57437657858684e-09, + "loss": 0.4409, + "step": 12017 + }, + { + "epoch": 0.9760415820677333, + "grad_norm": 5.749185856694462, + "learning_rate": 7.523310448929178e-09, + "loss": 0.2862, + "step": 12018 + }, + { + "epoch": 0.9761227970437749, + "grad_norm": 6.413191599955744, + "learning_rate": 7.472416785467563e-09, + "loss": 0.5205, + "step": 12019 + }, + { + "epoch": 0.9762040120198164, + "grad_norm": 4.702110106071757, + "learning_rate": 7.421695591723066e-09, + "loss": 0.4963, + "step": 12020 + }, + { + "epoch": 0.9762852269958581, + "grad_norm": 4.552204118507779, + "learning_rate": 7.371146871205381e-09, + "loss": 0.5284, + "step": 12021 + }, + { + "epoch": 0.9763664419718996, + "grad_norm": 5.639485771969086, + "learning_rate": 7.320770627412543e-09, + "loss": 0.5295, + "step": 12022 + }, + { + "epoch": 0.9764476569479412, + "grad_norm": 6.8637890745131145, + "learning_rate": 7.27056686382982e-09, + "loss": 0.4764, + "step": 12023 + }, + { + "epoch": 0.9765288719239827, + "grad_norm": 5.248727371974205, + "learning_rate": 7.220535583931099e-09, + "loss": 0.4443, + "step": 12024 + }, + { + "epoch": 0.9766100869000244, + "grad_norm": 5.140443422767862, + "learning_rate": 7.17067679117861e-09, + "loss": 0.5761, + "step": 12025 + }, + { + "epoch": 0.9766913018760659, + "grad_norm": 4.603187702423827, + "learning_rate": 7.120990489022373e-09, + "loss": 0.4652, + "step": 12026 + }, + { + "epoch": 0.9767725168521075, + "grad_norm": 5.4771501562832245, + "learning_rate": 7.071476680900191e-09, + "loss": 0.3577, + "step": 12027 + }, + { + "epoch": 0.9768537318281492, + "grad_norm": 9.457901018585092, + "learning_rate": 7.022135370237937e-09, + "loss": 0.5396, + "step": 12028 + }, + { + "epoch": 0.9769349468041907, + "grad_norm": 6.7496814308010995, + "learning_rate": 6.972966560450101e-09, + "loss": 0.4375, + "step": 12029 + }, + { + "epoch": 0.9770161617802323, + "grad_norm": 9.618733980000252, + "learning_rate": 6.923970254938961e-09, + "loss": 0.463, + "step": 12030 + }, + { + "epoch": 0.9770973767562738, + "grad_norm": 3.6437791716015155, + "learning_rate": 6.875146457094583e-09, + "loss": 0.466, + "step": 12031 + }, + { + "epoch": 0.9771785917323155, + "grad_norm": 4.087276926389463, + "learning_rate": 6.8264951702951e-09, + "loss": 0.3838, + "step": 12032 + }, + { + "epoch": 0.977259806708357, + "grad_norm": 7.648508375870743, + "learning_rate": 6.778016397907539e-09, + "loss": 0.3517, + "step": 12033 + }, + { + "epoch": 0.9773410216843986, + "grad_norm": 6.478817726316786, + "learning_rate": 6.729710143286161e-09, + "loss": 0.3296, + "step": 12034 + }, + { + "epoch": 0.9774222366604401, + "grad_norm": 4.785457296804816, + "learning_rate": 6.681576409773016e-09, + "loss": 0.5188, + "step": 12035 + }, + { + "epoch": 0.9775034516364818, + "grad_norm": 7.296274375209841, + "learning_rate": 6.633615200699328e-09, + "loss": 0.6136, + "step": 12036 + }, + { + "epoch": 0.9775846666125233, + "grad_norm": 4.160049221454699, + "learning_rate": 6.5858265193835536e-09, + "loss": 0.514, + "step": 12037 + }, + { + "epoch": 0.9776658815885649, + "grad_norm": 5.830756185483621, + "learning_rate": 6.538210369132214e-09, + "loss": 0.4866, + "step": 12038 + }, + { + "epoch": 0.9777470965646066, + "grad_norm": 5.407452911485265, + "learning_rate": 6.490766753240174e-09, + "loss": 0.6008, + "step": 12039 + }, + { + "epoch": 0.9778283115406481, + "grad_norm": 5.873928827822523, + "learning_rate": 6.443495674990641e-09, + "loss": 0.5819, + "step": 12040 + }, + { + "epoch": 0.9779095265166897, + "grad_norm": 6.00288359521902, + "learning_rate": 6.396397137654054e-09, + "loss": 0.3921, + "step": 12041 + }, + { + "epoch": 0.9779907414927312, + "grad_norm": 7.297473039045752, + "learning_rate": 6.3494711444897495e-09, + "loss": 0.4012, + "step": 12042 + }, + { + "epoch": 0.9780719564687729, + "grad_norm": 4.082939335655915, + "learning_rate": 6.302717698744298e-09, + "loss": 0.4805, + "step": 12043 + }, + { + "epoch": 0.9781531714448144, + "grad_norm": 9.09294805829235, + "learning_rate": 6.2561368036531676e-09, + "loss": 0.5639, + "step": 12044 + }, + { + "epoch": 0.978234386420856, + "grad_norm": 6.290896132053336, + "learning_rate": 6.209728462439613e-09, + "loss": 0.5002, + "step": 12045 + }, + { + "epoch": 0.9783156013968975, + "grad_norm": 8.382100561674394, + "learning_rate": 6.1634926783143975e-09, + "loss": 0.4165, + "step": 12046 + }, + { + "epoch": 0.9783968163729392, + "grad_norm": 5.891844640379527, + "learning_rate": 6.117429454477186e-09, + "loss": 0.3735, + "step": 12047 + }, + { + "epoch": 0.9784780313489807, + "grad_norm": 5.2181840686707766, + "learning_rate": 6.071538794115151e-09, + "loss": 0.5358, + "step": 12048 + }, + { + "epoch": 0.9785592463250223, + "grad_norm": 5.170788538583498, + "learning_rate": 6.025820700403529e-09, + "loss": 0.5949, + "step": 12049 + }, + { + "epoch": 0.978640461301064, + "grad_norm": 9.342590625760383, + "learning_rate": 5.9802751765061785e-09, + "loss": 0.553, + "step": 12050 + }, + { + "epoch": 0.9787216762771055, + "grad_norm": 6.856775571985887, + "learning_rate": 5.9349022255741905e-09, + "loss": 0.5531, + "step": 12051 + }, + { + "epoch": 0.9788028912531471, + "grad_norm": 4.222463014742318, + "learning_rate": 5.889701850747276e-09, + "loss": 0.5136, + "step": 12052 + }, + { + "epoch": 0.9788841062291886, + "grad_norm": 6.316498496538767, + "learning_rate": 5.844674055153487e-09, + "loss": 0.4999, + "step": 12053 + }, + { + "epoch": 0.9789653212052303, + "grad_norm": 5.951917290095282, + "learning_rate": 5.799818841907556e-09, + "loss": 0.4719, + "step": 12054 + }, + { + "epoch": 0.9790465361812718, + "grad_norm": 10.835272741624479, + "learning_rate": 5.7551362141142205e-09, + "loss": 0.3861, + "step": 12055 + }, + { + "epoch": 0.9791277511573134, + "grad_norm": 6.226477036302456, + "learning_rate": 5.71062617486462e-09, + "loss": 0.5043, + "step": 12056 + }, + { + "epoch": 0.979208966133355, + "grad_norm": 9.360240176470688, + "learning_rate": 5.666288727239066e-09, + "loss": 0.4041, + "step": 12057 + }, + { + "epoch": 0.9792901811093966, + "grad_norm": 8.608220193471967, + "learning_rate": 5.622123874305108e-09, + "loss": 0.3628, + "step": 12058 + }, + { + "epoch": 0.9793713960854381, + "grad_norm": 5.828438095145518, + "learning_rate": 5.578131619118909e-09, + "loss": 0.3776, + "step": 12059 + }, + { + "epoch": 0.9794526110614797, + "grad_norm": 4.52241346057629, + "learning_rate": 5.534311964724426e-09, + "loss": 0.4952, + "step": 12060 + }, + { + "epoch": 0.9795338260375214, + "grad_norm": 3.8196975497917736, + "learning_rate": 5.490664914153676e-09, + "loss": 0.4629, + "step": 12061 + }, + { + "epoch": 0.9796150410135629, + "grad_norm": 4.37711626897485, + "learning_rate": 5.447190470427022e-09, + "loss": 0.4722, + "step": 12062 + }, + { + "epoch": 0.9796962559896045, + "grad_norm": 3.977099684985591, + "learning_rate": 5.4038886365523346e-09, + "loss": 0.5534, + "step": 12063 + }, + { + "epoch": 0.979777470965646, + "grad_norm": 3.64657798618061, + "learning_rate": 5.360759415526385e-09, + "loss": 0.4216, + "step": 12064 + }, + { + "epoch": 0.9798586859416877, + "grad_norm": 5.625694324125192, + "learning_rate": 5.3178028103331725e-09, + "loss": 0.4299, + "step": 12065 + }, + { + "epoch": 0.9799399009177292, + "grad_norm": 3.7515462457345823, + "learning_rate": 5.275018823945044e-09, + "loss": 0.4106, + "step": 12066 + }, + { + "epoch": 0.9800211158937708, + "grad_norm": 4.423453741648708, + "learning_rate": 5.232407459322408e-09, + "loss": 0.3944, + "step": 12067 + }, + { + "epoch": 0.9801023308698124, + "grad_norm": 5.1224553290352635, + "learning_rate": 5.189968719413741e-09, + "loss": 0.379, + "step": 12068 + }, + { + "epoch": 0.980183545845854, + "grad_norm": 7.181016487036675, + "learning_rate": 5.14770260715558e-09, + "loss": 0.4096, + "step": 12069 + }, + { + "epoch": 0.9802647608218955, + "grad_norm": 5.486473028790214, + "learning_rate": 5.10560912547281e-09, + "loss": 0.5059, + "step": 12070 + }, + { + "epoch": 0.9803459757979371, + "grad_norm": 4.843785469577583, + "learning_rate": 5.063688277277545e-09, + "loss": 0.444, + "step": 12071 + }, + { + "epoch": 0.9804271907739788, + "grad_norm": 12.938046238864795, + "learning_rate": 5.021940065471076e-09, + "loss": 0.4866, + "step": 12072 + }, + { + "epoch": 0.9805084057500203, + "grad_norm": 10.093029172739994, + "learning_rate": 4.980364492941924e-09, + "loss": 0.4679, + "step": 12073 + }, + { + "epoch": 0.9805896207260619, + "grad_norm": 7.273329994281363, + "learning_rate": 4.938961562566402e-09, + "loss": 0.3914, + "step": 12074 + }, + { + "epoch": 0.9806708357021034, + "grad_norm": 3.3725698256307437, + "learning_rate": 4.8977312772102715e-09, + "loss": 0.5183, + "step": 12075 + }, + { + "epoch": 0.9807520506781451, + "grad_norm": 5.1217541737470835, + "learning_rate": 4.856673639725695e-09, + "loss": 0.498, + "step": 12076 + }, + { + "epoch": 0.9808332656541866, + "grad_norm": 3.8730107181355535, + "learning_rate": 4.815788652954012e-09, + "loss": 0.4615, + "step": 12077 + }, + { + "epoch": 0.9809144806302282, + "grad_norm": 8.294253938971737, + "learning_rate": 4.775076319724348e-09, + "loss": 0.5011, + "step": 12078 + }, + { + "epoch": 0.9809956956062698, + "grad_norm": 3.478676476091108, + "learning_rate": 4.734536642853338e-09, + "loss": 0.5997, + "step": 12079 + }, + { + "epoch": 0.9810769105823114, + "grad_norm": 4.181120233195193, + "learning_rate": 4.6941696251465165e-09, + "loss": 0.6451, + "step": 12080 + }, + { + "epoch": 0.9811581255583529, + "grad_norm": 3.6134990519276466, + "learning_rate": 4.6539752693969265e-09, + "loss": 0.4854, + "step": 12081 + }, + { + "epoch": 0.9812393405343945, + "grad_norm": 11.700469427119463, + "learning_rate": 4.613953578385954e-09, + "loss": 0.3663, + "step": 12082 + }, + { + "epoch": 0.9813205555104362, + "grad_norm": 4.369290046988292, + "learning_rate": 4.574104554882497e-09, + "loss": 0.7481, + "step": 12083 + }, + { + "epoch": 0.9814017704864777, + "grad_norm": 5.401987698332693, + "learning_rate": 4.534428201644348e-09, + "loss": 0.423, + "step": 12084 + }, + { + "epoch": 0.9814829854625193, + "grad_norm": 9.430350487972829, + "learning_rate": 4.494924521416533e-09, + "loss": 0.4895, + "step": 12085 + }, + { + "epoch": 0.9815642004385609, + "grad_norm": 3.7707234936613565, + "learning_rate": 4.455593516932699e-09, + "loss": 0.5178, + "step": 12086 + }, + { + "epoch": 0.9816454154146025, + "grad_norm": 6.985985247371312, + "learning_rate": 4.4164351909142815e-09, + "loss": 0.3872, + "step": 12087 + }, + { + "epoch": 0.981726630390644, + "grad_norm": 5.1330117460195215, + "learning_rate": 4.377449546071055e-09, + "loss": 0.4994, + "step": 12088 + }, + { + "epoch": 0.9818078453666856, + "grad_norm": 6.234609568273921, + "learning_rate": 4.338636585100309e-09, + "loss": 0.5062, + "step": 12089 + }, + { + "epoch": 0.9818890603427272, + "grad_norm": 7.581218288677617, + "learning_rate": 4.299996310687671e-09, + "loss": 0.5828, + "step": 12090 + }, + { + "epoch": 0.9819702753187688, + "grad_norm": 5.845752308546046, + "learning_rate": 4.261528725507113e-09, + "loss": 0.472, + "step": 12091 + }, + { + "epoch": 0.9820514902948103, + "grad_norm": 7.146820683688464, + "learning_rate": 4.223233832220397e-09, + "loss": 0.4632, + "step": 12092 + }, + { + "epoch": 0.982132705270852, + "grad_norm": 3.2381775735767935, + "learning_rate": 4.18511163347679e-09, + "loss": 0.6537, + "step": 12093 + }, + { + "epoch": 0.9822139202468936, + "grad_norm": 5.730027453232988, + "learning_rate": 4.147162131914739e-09, + "loss": 0.3759, + "step": 12094 + }, + { + "epoch": 0.9822951352229351, + "grad_norm": 3.5984616089033827, + "learning_rate": 4.109385330159921e-09, + "loss": 0.5452, + "step": 12095 + }, + { + "epoch": 0.9823763501989767, + "grad_norm": 6.769530777534317, + "learning_rate": 4.071781230826355e-09, + "loss": 0.4365, + "step": 12096 + }, + { + "epoch": 0.9824575651750183, + "grad_norm": 5.819933789150477, + "learning_rate": 4.034349836516127e-09, + "loss": 0.4541, + "step": 12097 + }, + { + "epoch": 0.9825387801510599, + "grad_norm": 5.491397739648694, + "learning_rate": 3.99709114981911e-09, + "loss": 0.3554, + "step": 12098 + }, + { + "epoch": 0.9826199951271014, + "grad_norm": 5.29270938282257, + "learning_rate": 3.960005173313519e-09, + "loss": 0.3631, + "step": 12099 + }, + { + "epoch": 0.982701210103143, + "grad_norm": 5.1898426034045615, + "learning_rate": 3.923091909565357e-09, + "loss": 0.4277, + "step": 12100 + }, + { + "epoch": 0.9827824250791846, + "grad_norm": 4.856901331420813, + "learning_rate": 3.88635136112897e-09, + "loss": 0.4089, + "step": 12101 + }, + { + "epoch": 0.9828636400552262, + "grad_norm": 3.4872568138932762, + "learning_rate": 3.8497835305464915e-09, + "loss": 0.6127, + "step": 12102 + }, + { + "epoch": 0.9829448550312677, + "grad_norm": 13.086059636580629, + "learning_rate": 3.813388420348396e-09, + "loss": 0.4843, + "step": 12103 + }, + { + "epoch": 0.9830260700073093, + "grad_norm": 5.395590802101605, + "learning_rate": 3.777166033052948e-09, + "loss": 0.4887, + "step": 12104 + }, + { + "epoch": 0.983107284983351, + "grad_norm": 4.809183849119693, + "learning_rate": 3.741116371166476e-09, + "loss": 0.5797, + "step": 12105 + }, + { + "epoch": 0.9831884999593925, + "grad_norm": 7.000518609996093, + "learning_rate": 3.705239437183372e-09, + "loss": 0.6074, + "step": 12106 + }, + { + "epoch": 0.9832697149354341, + "grad_norm": 7.749862840244079, + "learning_rate": 3.6695352335863745e-09, + "loss": 0.3449, + "step": 12107 + }, + { + "epoch": 0.9833509299114757, + "grad_norm": 5.2466883835337494, + "learning_rate": 3.6340037628460057e-09, + "loss": 0.4941, + "step": 12108 + }, + { + "epoch": 0.9834321448875173, + "grad_norm": 20.385619121552477, + "learning_rate": 3.5986450274205776e-09, + "loss": 0.4583, + "step": 12109 + }, + { + "epoch": 0.9835133598635588, + "grad_norm": 4.0593617565984035, + "learning_rate": 3.5634590297570215e-09, + "loss": 0.4225, + "step": 12110 + }, + { + "epoch": 0.9835945748396004, + "grad_norm": 6.465413464659027, + "learning_rate": 3.528445772289779e-09, + "loss": 0.5616, + "step": 12111 + }, + { + "epoch": 0.983675789815642, + "grad_norm": 8.991522570276532, + "learning_rate": 3.4936052574416345e-09, + "loss": 0.3596, + "step": 12112 + }, + { + "epoch": 0.9837570047916836, + "grad_norm": 4.818729279084131, + "learning_rate": 3.458937487623437e-09, + "loss": 0.4936, + "step": 12113 + }, + { + "epoch": 0.9838382197677251, + "grad_norm": 6.112039073938186, + "learning_rate": 3.424442465234101e-09, + "loss": 0.4451, + "step": 12114 + }, + { + "epoch": 0.9839194347437668, + "grad_norm": 6.296027842842914, + "learning_rate": 3.3901201926606063e-09, + "loss": 0.5395, + "step": 12115 + }, + { + "epoch": 0.9840006497198084, + "grad_norm": 5.253597703880921, + "learning_rate": 3.3559706722774423e-09, + "loss": 0.6561, + "step": 12116 + }, + { + "epoch": 0.9840818646958499, + "grad_norm": 5.551682606805065, + "learning_rate": 3.3219939064477182e-09, + "loss": 0.3641, + "step": 12117 + }, + { + "epoch": 0.9841630796718915, + "grad_norm": 7.465132500098328, + "learning_rate": 3.288189897522609e-09, + "loss": 0.4635, + "step": 12118 + }, + { + "epoch": 0.9842442946479331, + "grad_norm": 8.173920927315894, + "learning_rate": 3.254558647841077e-09, + "loss": 0.5209, + "step": 12119 + }, + { + "epoch": 0.9843255096239747, + "grad_norm": 4.788329870369596, + "learning_rate": 3.2211001597304283e-09, + "loss": 0.5823, + "step": 12120 + }, + { + "epoch": 0.9844067246000162, + "grad_norm": 6.8587617655903355, + "learning_rate": 3.187814435505199e-09, + "loss": 0.4951, + "step": 12121 + }, + { + "epoch": 0.9844879395760578, + "grad_norm": 5.481814887439901, + "learning_rate": 3.1547014774693797e-09, + "loss": 0.3174, + "step": 12122 + }, + { + "epoch": 0.9845691545520994, + "grad_norm": 8.885860015107669, + "learning_rate": 3.1217612879139158e-09, + "loss": 0.4645, + "step": 12123 + }, + { + "epoch": 0.984650369528141, + "grad_norm": 4.284808548816115, + "learning_rate": 3.088993869117818e-09, + "loss": 0.4884, + "step": 12124 + }, + { + "epoch": 0.9847315845041825, + "grad_norm": 4.896143196735318, + "learning_rate": 3.056399223348716e-09, + "loss": 0.3905, + "step": 12125 + }, + { + "epoch": 0.9848127994802242, + "grad_norm": 4.786981373349127, + "learning_rate": 3.023977352861751e-09, + "loss": 0.6363, + "step": 12126 + }, + { + "epoch": 0.9848940144562658, + "grad_norm": 5.440565139296447, + "learning_rate": 2.991728259900684e-09, + "loss": 0.4777, + "step": 12127 + }, + { + "epoch": 0.9849752294323073, + "grad_norm": 4.432486814000843, + "learning_rate": 2.959651946696507e-09, + "loss": 0.4696, + "step": 12128 + }, + { + "epoch": 0.9850564444083489, + "grad_norm": 4.593612284410864, + "learning_rate": 2.927748415469389e-09, + "loss": 0.4412, + "step": 12129 + }, + { + "epoch": 0.9851376593843905, + "grad_norm": 11.349530667232669, + "learning_rate": 2.8960176684261767e-09, + "loss": 0.5133, + "step": 12130 + }, + { + "epoch": 0.9852188743604321, + "grad_norm": 7.977641323262256, + "learning_rate": 2.86445970776289e-09, + "loss": 0.4928, + "step": 12131 + }, + { + "epoch": 0.9853000893364736, + "grad_norm": 4.0771155412598805, + "learning_rate": 2.833074535663338e-09, + "loss": 0.4539, + "step": 12132 + }, + { + "epoch": 0.9853813043125152, + "grad_norm": 4.941422589211022, + "learning_rate": 2.8018621542988402e-09, + "loss": 0.4797, + "step": 12133 + }, + { + "epoch": 0.9854625192885568, + "grad_norm": 8.421332670522345, + "learning_rate": 2.7708225658290566e-09, + "loss": 0.5164, + "step": 12134 + }, + { + "epoch": 0.9855437342645984, + "grad_norm": 7.591462565867755, + "learning_rate": 2.739955772401992e-09, + "loss": 0.621, + "step": 12135 + }, + { + "epoch": 0.9856249492406399, + "grad_norm": 5.772643166679314, + "learning_rate": 2.709261776153438e-09, + "loss": 0.4788, + "step": 12136 + }, + { + "epoch": 0.9857061642166816, + "grad_norm": 6.132821992629694, + "learning_rate": 2.6787405792072507e-09, + "loss": 0.4194, + "step": 12137 + }, + { + "epoch": 0.9857873791927232, + "grad_norm": 5.663712178693497, + "learning_rate": 2.6483921836753525e-09, + "loss": 0.4067, + "step": 12138 + }, + { + "epoch": 0.9858685941687647, + "grad_norm": 5.534184126963757, + "learning_rate": 2.6182165916577295e-09, + "loss": 0.547, + "step": 12139 + }, + { + "epoch": 0.9859498091448063, + "grad_norm": 4.047838425287923, + "learning_rate": 2.5882138052421567e-09, + "loss": 0.4595, + "step": 12140 + }, + { + "epoch": 0.9860310241208479, + "grad_norm": 3.5712541301580667, + "learning_rate": 2.5583838265050286e-09, + "loss": 0.4067, + "step": 12141 + }, + { + "epoch": 0.9861122390968895, + "grad_norm": 5.1536702438204065, + "learning_rate": 2.52872665751025e-09, + "loss": 0.5706, + "step": 12142 + }, + { + "epoch": 0.986193454072931, + "grad_norm": 16.616810366998326, + "learning_rate": 2.4992423003095124e-09, + "loss": 0.5131, + "step": 12143 + }, + { + "epoch": 0.9862746690489727, + "grad_norm": 5.2196354867906685, + "learning_rate": 2.4699307569436835e-09, + "loss": 0.4475, + "step": 12144 + }, + { + "epoch": 0.9863558840250142, + "grad_norm": 4.94188443326249, + "learning_rate": 2.4407920294405864e-09, + "loss": 0.381, + "step": 12145 + }, + { + "epoch": 0.9864370990010558, + "grad_norm": 6.982635794914591, + "learning_rate": 2.4118261198166625e-09, + "loss": 0.4683, + "step": 12146 + }, + { + "epoch": 0.9865183139770973, + "grad_norm": 8.68074477826504, + "learning_rate": 2.383033030075865e-09, + "loss": 0.4486, + "step": 12147 + }, + { + "epoch": 0.986599528953139, + "grad_norm": 4.488613369870494, + "learning_rate": 2.354412762210767e-09, + "loss": 0.5145, + "step": 12148 + }, + { + "epoch": 0.9866807439291806, + "grad_norm": 8.342369544568935, + "learning_rate": 2.325965318201728e-09, + "loss": 0.4742, + "step": 12149 + }, + { + "epoch": 0.9867619589052221, + "grad_norm": 4.473725413880338, + "learning_rate": 2.2976907000171743e-09, + "loss": 0.4517, + "step": 12150 + }, + { + "epoch": 0.9868431738812637, + "grad_norm": 5.732059155908079, + "learning_rate": 2.2695889096133184e-09, + "loss": 0.5598, + "step": 12151 + }, + { + "epoch": 0.9869243888573053, + "grad_norm": 6.627352082908765, + "learning_rate": 2.2416599489349933e-09, + "loss": 0.4432, + "step": 12152 + }, + { + "epoch": 0.9870056038333469, + "grad_norm": 6.4462903139711525, + "learning_rate": 2.2139038199145424e-09, + "loss": 0.641, + "step": 12153 + }, + { + "epoch": 0.9870868188093884, + "grad_norm": 4.235134074339075, + "learning_rate": 2.1863205244726514e-09, + "loss": 0.3059, + "step": 12154 + }, + { + "epoch": 0.98716803378543, + "grad_norm": 10.34898874466498, + "learning_rate": 2.1589100645180715e-09, + "loss": 0.4217, + "step": 12155 + }, + { + "epoch": 0.9872492487614716, + "grad_norm": 7.083106400400486, + "learning_rate": 2.1316724419470637e-09, + "loss": 0.4262, + "step": 12156 + }, + { + "epoch": 0.9873304637375132, + "grad_norm": 4.880842030469988, + "learning_rate": 2.1046076586445084e-09, + "loss": 0.4115, + "step": 12157 + }, + { + "epoch": 0.9874116787135547, + "grad_norm": 5.428807688681806, + "learning_rate": 2.077715716483353e-09, + "loss": 0.4031, + "step": 12158 + }, + { + "epoch": 0.9874928936895964, + "grad_norm": 3.5615581342610594, + "learning_rate": 2.0509966173240524e-09, + "loss": 0.4478, + "step": 12159 + }, + { + "epoch": 0.987574108665638, + "grad_norm": 4.9802028956802245, + "learning_rate": 2.0244503630154066e-09, + "loss": 0.566, + "step": 12160 + }, + { + "epoch": 0.9876553236416795, + "grad_norm": 3.88910810121843, + "learning_rate": 1.9980769553948344e-09, + "loss": 0.3933, + "step": 12161 + }, + { + "epoch": 0.9877365386177211, + "grad_norm": 4.551525974114428, + "learning_rate": 1.9718763962867094e-09, + "loss": 0.4622, + "step": 12162 + }, + { + "epoch": 0.9878177535937627, + "grad_norm": 6.650520927884947, + "learning_rate": 1.945848687504026e-09, + "loss": 0.5204, + "step": 12163 + }, + { + "epoch": 0.9878989685698043, + "grad_norm": 5.106322645005586, + "learning_rate": 1.919993830847844e-09, + "loss": 0.3685, + "step": 12164 + }, + { + "epoch": 0.9879801835458458, + "grad_norm": 5.293912542968066, + "learning_rate": 1.8943118281070095e-09, + "loss": 0.4464, + "step": 12165 + }, + { + "epoch": 0.9880613985218875, + "grad_norm": 4.885311151399439, + "learning_rate": 1.86880268105899e-09, + "loss": 0.5068, + "step": 12166 + }, + { + "epoch": 0.988142613497929, + "grad_norm": 5.1380448467955295, + "learning_rate": 1.8434663914687623e-09, + "loss": 0.4485, + "step": 12167 + }, + { + "epoch": 0.9882238284739706, + "grad_norm": 4.9192525339798925, + "learning_rate": 1.8183029610890912e-09, + "loss": 0.5346, + "step": 12168 + }, + { + "epoch": 0.9883050434500121, + "grad_norm": 4.9277595849293565, + "learning_rate": 1.7933123916613614e-09, + "loss": 0.4065, + "step": 12169 + }, + { + "epoch": 0.9883862584260538, + "grad_norm": 5.8993559652023215, + "learning_rate": 1.7684946849150232e-09, + "loss": 0.459, + "step": 12170 + }, + { + "epoch": 0.9884674734020954, + "grad_norm": 4.936973153328386, + "learning_rate": 1.7438498425673135e-09, + "loss": 0.5248, + "step": 12171 + }, + { + "epoch": 0.9885486883781369, + "grad_norm": 4.4227762528236125, + "learning_rate": 1.7193778663229799e-09, + "loss": 0.5324, + "step": 12172 + }, + { + "epoch": 0.9886299033541786, + "grad_norm": 4.957741580907209, + "learning_rate": 1.6950787578759453e-09, + "loss": 0.5095, + "step": 12173 + }, + { + "epoch": 0.9887111183302201, + "grad_norm": 6.64145740938836, + "learning_rate": 1.6709525189073649e-09, + "loss": 0.4332, + "step": 12174 + }, + { + "epoch": 0.9887923333062617, + "grad_norm": 6.145921162498026, + "learning_rate": 1.646999151086459e-09, + "loss": 0.3923, + "step": 12175 + }, + { + "epoch": 0.9888735482823032, + "grad_norm": 5.254183282644911, + "learning_rate": 1.6232186560710684e-09, + "loss": 0.4596, + "step": 12176 + }, + { + "epoch": 0.9889547632583449, + "grad_norm": 17.99187941281074, + "learning_rate": 1.599611035506543e-09, + "loss": 0.6023, + "step": 12177 + }, + { + "epoch": 0.9890359782343864, + "grad_norm": 10.290258462437267, + "learning_rate": 1.5761762910260214e-09, + "loss": 0.5383, + "step": 12178 + }, + { + "epoch": 0.989117193210428, + "grad_norm": 6.646644372837433, + "learning_rate": 1.5529144242518167e-09, + "loss": 0.5216, + "step": 12179 + }, + { + "epoch": 0.9891984081864695, + "grad_norm": 4.4766333642428675, + "learning_rate": 1.5298254367926424e-09, + "loss": 0.4035, + "step": 12180 + }, + { + "epoch": 0.9892796231625112, + "grad_norm": 9.914986537758677, + "learning_rate": 1.5069093302469418e-09, + "loss": 0.4324, + "step": 12181 + }, + { + "epoch": 0.9893608381385528, + "grad_norm": 4.176667675043893, + "learning_rate": 1.4841661061998358e-09, + "loss": 0.4502, + "step": 12182 + }, + { + "epoch": 0.9894420531145943, + "grad_norm": 3.795368320920534, + "learning_rate": 1.4615957662250657e-09, + "loss": 0.4701, + "step": 12183 + }, + { + "epoch": 0.989523268090636, + "grad_norm": 5.32972800464418, + "learning_rate": 1.4391983118847152e-09, + "loss": 0.605, + "step": 12184 + }, + { + "epoch": 0.9896044830666775, + "grad_norm": 6.008043856391167, + "learning_rate": 1.4169737447283782e-09, + "loss": 0.3262, + "step": 12185 + }, + { + "epoch": 0.9896856980427191, + "grad_norm": 7.84003745927447, + "learning_rate": 1.394922066293991e-09, + "loss": 0.5486, + "step": 12186 + }, + { + "epoch": 0.9897669130187606, + "grad_norm": 6.066959827307744, + "learning_rate": 1.3730432781070002e-09, + "loss": 0.4331, + "step": 12187 + }, + { + "epoch": 0.9898481279948023, + "grad_norm": 17.157061321750632, + "learning_rate": 1.3513373816820274e-09, + "loss": 0.642, + "step": 12188 + }, + { + "epoch": 0.9899293429708438, + "grad_norm": 3.932173196085788, + "learning_rate": 1.3298043785203718e-09, + "loss": 0.4469, + "step": 12189 + }, + { + "epoch": 0.9900105579468854, + "grad_norm": 8.805868997589002, + "learning_rate": 1.30844427011223e-09, + "loss": 0.485, + "step": 12190 + }, + { + "epoch": 0.9900917729229269, + "grad_norm": 4.027618081932802, + "learning_rate": 1.287257057935587e-09, + "loss": 0.4275, + "step": 12191 + }, + { + "epoch": 0.9901729878989686, + "grad_norm": 8.085780436510987, + "learning_rate": 1.2662427434564916e-09, + "loss": 0.5487, + "step": 12192 + }, + { + "epoch": 0.9902542028750102, + "grad_norm": 6.6055275665447875, + "learning_rate": 1.2454013281290589e-09, + "loss": 0.4155, + "step": 12193 + }, + { + "epoch": 0.9903354178510517, + "grad_norm": 5.7796904505260365, + "learning_rate": 1.2247328133954683e-09, + "loss": 0.4356, + "step": 12194 + }, + { + "epoch": 0.9904166328270934, + "grad_norm": 4.4243001082018445, + "learning_rate": 1.2042372006856873e-09, + "loss": 0.4525, + "step": 12195 + }, + { + "epoch": 0.9904978478031349, + "grad_norm": 5.706811651819559, + "learning_rate": 1.1839144914180256e-09, + "loss": 0.3391, + "step": 12196 + }, + { + "epoch": 0.9905790627791765, + "grad_norm": 5.337953154660965, + "learning_rate": 1.1637646869985809e-09, + "loss": 0.4905, + "step": 12197 + }, + { + "epoch": 0.990660277755218, + "grad_norm": 7.132197947930379, + "learning_rate": 1.143787788821793e-09, + "loss": 0.4736, + "step": 12198 + }, + { + "epoch": 0.9907414927312597, + "grad_norm": 8.488570779233747, + "learning_rate": 1.1239837982698898e-09, + "loss": 0.4538, + "step": 12199 + }, + { + "epoch": 0.9908227077073012, + "grad_norm": 8.566390662400726, + "learning_rate": 1.104352716713164e-09, + "loss": 0.5289, + "step": 12200 + }, + { + "epoch": 0.9909039226833428, + "grad_norm": 7.713592201697891, + "learning_rate": 1.0848945455099734e-09, + "loss": 0.4059, + "step": 12201 + }, + { + "epoch": 0.9909851376593843, + "grad_norm": 8.262715346834725, + "learning_rate": 1.0656092860067413e-09, + "loss": 0.367, + "step": 12202 + }, + { + "epoch": 0.991066352635426, + "grad_norm": 5.432388233712127, + "learning_rate": 1.046496939538233e-09, + "loss": 0.3928, + "step": 12203 + }, + { + "epoch": 0.9911475676114676, + "grad_norm": 5.147447230251187, + "learning_rate": 1.027557507426169e-09, + "loss": 0.5214, + "step": 12204 + }, + { + "epoch": 0.9912287825875091, + "grad_norm": 5.0721034321084515, + "learning_rate": 1.0087909909817228e-09, + "loss": 0.3809, + "step": 12205 + }, + { + "epoch": 0.9913099975635508, + "grad_norm": 3.1443588440586963, + "learning_rate": 9.901973915033004e-10, + "loss": 0.5828, + "step": 12206 + }, + { + "epoch": 0.9913912125395923, + "grad_norm": 7.70458840955126, + "learning_rate": 9.717767102770947e-10, + "loss": 0.3964, + "step": 12207 + }, + { + "epoch": 0.9914724275156339, + "grad_norm": 5.013950694674269, + "learning_rate": 9.535289485781973e-10, + "loss": 0.5808, + "step": 12208 + }, + { + "epoch": 0.9915536424916754, + "grad_norm": 5.467259982799362, + "learning_rate": 9.354541076692092e-10, + "loss": 0.3382, + "step": 12209 + }, + { + "epoch": 0.9916348574677171, + "grad_norm": 3.502450749974113, + "learning_rate": 9.17552188800519e-10, + "loss": 0.4828, + "step": 12210 + }, + { + "epoch": 0.9917160724437586, + "grad_norm": 4.810040707477097, + "learning_rate": 8.998231932108581e-10, + "loss": 0.4266, + "step": 12211 + }, + { + "epoch": 0.9917972874198002, + "grad_norm": 4.375638079472969, + "learning_rate": 8.822671221273005e-10, + "loss": 0.5797, + "step": 12212 + }, + { + "epoch": 0.9918785023958417, + "grad_norm": 9.293670755342289, + "learning_rate": 8.648839767644302e-10, + "loss": 0.621, + "step": 12213 + }, + { + "epoch": 0.9919597173718834, + "grad_norm": 6.6261486011288335, + "learning_rate": 8.476737583251737e-10, + "loss": 0.4257, + "step": 12214 + }, + { + "epoch": 0.992040932347925, + "grad_norm": 4.4991871759075375, + "learning_rate": 8.306364680002454e-10, + "loss": 0.522, + "step": 12215 + }, + { + "epoch": 0.9921221473239665, + "grad_norm": 4.881228997280039, + "learning_rate": 8.137721069687021e-10, + "loss": 0.526, + "step": 12216 + }, + { + "epoch": 0.9922033623000082, + "grad_norm": 3.6297898118275964, + "learning_rate": 7.970806763973882e-10, + "loss": 0.4575, + "step": 12217 + }, + { + "epoch": 0.9922845772760497, + "grad_norm": 6.246461103137779, + "learning_rate": 7.805621774409356e-10, + "loss": 0.3662, + "step": 12218 + }, + { + "epoch": 0.9923657922520913, + "grad_norm": 5.111388107129699, + "learning_rate": 7.642166112428739e-10, + "loss": 0.5867, + "step": 12219 + }, + { + "epoch": 0.9924470072281328, + "grad_norm": 6.103417103656217, + "learning_rate": 7.480439789339655e-10, + "loss": 0.489, + "step": 12220 + }, + { + "epoch": 0.9925282222041745, + "grad_norm": 5.872345642365211, + "learning_rate": 7.320442816333151e-10, + "loss": 0.4852, + "step": 12221 + }, + { + "epoch": 0.992609437180216, + "grad_norm": 4.896384077479555, + "learning_rate": 7.162175204480926e-10, + "loss": 0.415, + "step": 12222 + }, + { + "epoch": 0.9926906521562576, + "grad_norm": 11.24619397959855, + "learning_rate": 7.005636964732554e-10, + "loss": 0.5892, + "step": 12223 + }, + { + "epoch": 0.9927718671322991, + "grad_norm": 6.417486574394487, + "learning_rate": 6.850828107921037e-10, + "loss": 0.4542, + "step": 12224 + }, + { + "epoch": 0.9928530821083408, + "grad_norm": 5.669079347074385, + "learning_rate": 6.697748644757252e-10, + "loss": 0.4736, + "step": 12225 + }, + { + "epoch": 0.9929342970843824, + "grad_norm": 5.1508249817000475, + "learning_rate": 6.546398585832725e-10, + "loss": 0.3881, + "step": 12226 + }, + { + "epoch": 0.9930155120604239, + "grad_norm": 6.284781187211057, + "learning_rate": 6.396777941622412e-10, + "loss": 0.5204, + "step": 12227 + }, + { + "epoch": 0.9930967270364656, + "grad_norm": 5.867023818166328, + "learning_rate": 6.248886722479142e-10, + "loss": 0.3868, + "step": 12228 + }, + { + "epoch": 0.9931779420125071, + "grad_norm": 7.865759003313058, + "learning_rate": 6.10272493863362e-10, + "loss": 0.4652, + "step": 12229 + }, + { + "epoch": 0.9932591569885487, + "grad_norm": 5.771846652668555, + "learning_rate": 5.958292600202753e-10, + "loss": 0.4368, + "step": 12230 + }, + { + "epoch": 0.9933403719645902, + "grad_norm": 5.555877566179887, + "learning_rate": 5.81558971717855e-10, + "loss": 0.4511, + "step": 12231 + }, + { + "epoch": 0.9934215869406319, + "grad_norm": 7.602271947993169, + "learning_rate": 5.674616299436441e-10, + "loss": 0.3559, + "step": 12232 + }, + { + "epoch": 0.9935028019166734, + "grad_norm": 3.168714257439046, + "learning_rate": 5.53537235672974e-10, + "loss": 0.6782, + "step": 12233 + }, + { + "epoch": 0.993584016892715, + "grad_norm": 8.0565315209248, + "learning_rate": 5.397857898692404e-10, + "loss": 0.4928, + "step": 12234 + }, + { + "epoch": 0.9936652318687565, + "grad_norm": 6.7281592050291525, + "learning_rate": 5.262072934841822e-10, + "loss": 0.4097, + "step": 12235 + }, + { + "epoch": 0.9937464468447982, + "grad_norm": 5.03523479370938, + "learning_rate": 5.128017474573254e-10, + "loss": 0.5035, + "step": 12236 + }, + { + "epoch": 0.9938276618208398, + "grad_norm": 5.581578649372743, + "learning_rate": 4.995691527162616e-10, + "loss": 0.6748, + "step": 12237 + }, + { + "epoch": 0.9939088767968813, + "grad_norm": 5.1959863608312755, + "learning_rate": 4.86509510176647e-10, + "loss": 0.325, + "step": 12238 + }, + { + "epoch": 0.993990091772923, + "grad_norm": 5.218818979766321, + "learning_rate": 4.736228207419258e-10, + "loss": 0.5546, + "step": 12239 + }, + { + "epoch": 0.9940713067489645, + "grad_norm": 6.565937064015396, + "learning_rate": 4.60909085304162e-10, + "loss": 0.4902, + "step": 12240 + }, + { + "epoch": 0.9941525217250061, + "grad_norm": 4.111169296382956, + "learning_rate": 4.4836830474265235e-10, + "loss": 0.4279, + "step": 12241 + }, + { + "epoch": 0.9942337367010476, + "grad_norm": 4.28308564761494, + "learning_rate": 4.3600047992559124e-10, + "loss": 0.5878, + "step": 12242 + }, + { + "epoch": 0.9943149516770893, + "grad_norm": 7.57733637051035, + "learning_rate": 4.2380561170840553e-10, + "loss": 0.7041, + "step": 12243 + }, + { + "epoch": 0.9943961666531308, + "grad_norm": 6.372067043265081, + "learning_rate": 4.1178370093486463e-10, + "loss": 0.4418, + "step": 12244 + }, + { + "epoch": 0.9944773816291724, + "grad_norm": 3.884746049499681, + "learning_rate": 3.9993474843735837e-10, + "loss": 0.4731, + "step": 12245 + }, + { + "epoch": 0.994558596605214, + "grad_norm": 8.66268673867393, + "learning_rate": 3.882587550349537e-10, + "loss": 0.4845, + "step": 12246 + }, + { + "epoch": 0.9946398115812556, + "grad_norm": 4.113892528273869, + "learning_rate": 3.7675572153644814e-10, + "loss": 0.4564, + "step": 12247 + }, + { + "epoch": 0.9947210265572972, + "grad_norm": 4.958163424799454, + "learning_rate": 3.6542564873731645e-10, + "loss": 0.4016, + "step": 12248 + }, + { + "epoch": 0.9948022415333387, + "grad_norm": 3.6688498786954162, + "learning_rate": 3.5426853742137613e-10, + "loss": 0.6589, + "step": 12249 + }, + { + "epoch": 0.9948834565093804, + "grad_norm": 10.717003613916313, + "learning_rate": 3.432843883610648e-10, + "loss": 0.4469, + "step": 12250 + }, + { + "epoch": 0.9949646714854219, + "grad_norm": 4.462573986139782, + "learning_rate": 3.3247320231605265e-10, + "loss": 0.5045, + "step": 12251 + }, + { + "epoch": 0.9950458864614635, + "grad_norm": 5.007934033669896, + "learning_rate": 3.218349800346299e-10, + "loss": 0.4665, + "step": 12252 + }, + { + "epoch": 0.995127101437505, + "grad_norm": 3.873104868996485, + "learning_rate": 3.1136972225315197e-10, + "loss": 0.5173, + "step": 12253 + }, + { + "epoch": 0.9952083164135467, + "grad_norm": 3.3876902753690796, + "learning_rate": 3.0107742969520683e-10, + "loss": 0.5485, + "step": 12254 + }, + { + "epoch": 0.9952895313895882, + "grad_norm": 6.699091681394484, + "learning_rate": 2.9095810307328e-10, + "loss": 0.367, + "step": 12255 + }, + { + "epoch": 0.9953707463656298, + "grad_norm": 6.171401454308493, + "learning_rate": 2.810117430873671e-10, + "loss": 0.3962, + "step": 12256 + }, + { + "epoch": 0.9954519613416714, + "grad_norm": 4.923317857251655, + "learning_rate": 2.71238350426084e-10, + "loss": 0.4938, + "step": 12257 + }, + { + "epoch": 0.995533176317713, + "grad_norm": 5.816170040992201, + "learning_rate": 2.61637925765279e-10, + "loss": 0.3092, + "step": 12258 + }, + { + "epoch": 0.9956143912937546, + "grad_norm": 4.783682438235011, + "learning_rate": 2.522104697696981e-10, + "loss": 0.4861, + "step": 12259 + }, + { + "epoch": 0.9956956062697961, + "grad_norm": 6.542965672565677, + "learning_rate": 2.4295598309131973e-10, + "loss": 0.4059, + "step": 12260 + }, + { + "epoch": 0.9957768212458378, + "grad_norm": 5.290685230558423, + "learning_rate": 2.3387446637046506e-10, + "loss": 0.433, + "step": 12261 + }, + { + "epoch": 0.9958580362218793, + "grad_norm": 4.569512260893426, + "learning_rate": 2.2496592023579789e-10, + "loss": 0.5698, + "step": 12262 + }, + { + "epoch": 0.9959392511979209, + "grad_norm": 13.1705343773864, + "learning_rate": 2.1623034530349197e-10, + "loss": 0.4349, + "step": 12263 + }, + { + "epoch": 0.9960204661739624, + "grad_norm": 11.027548037308877, + "learning_rate": 2.076677421783413e-10, + "loss": 0.3677, + "step": 12264 + }, + { + "epoch": 0.9961016811500041, + "grad_norm": 7.761548829661015, + "learning_rate": 1.992781114523723e-10, + "loss": 0.4086, + "step": 12265 + }, + { + "epoch": 0.9961828961260456, + "grad_norm": 4.496137975811519, + "learning_rate": 1.910614537065092e-10, + "loss": 0.523, + "step": 12266 + }, + { + "epoch": 0.9962641111020872, + "grad_norm": 4.01367922274939, + "learning_rate": 1.8301776950918615e-10, + "loss": 0.4518, + "step": 12267 + }, + { + "epoch": 0.9963453260781288, + "grad_norm": 11.271093585704282, + "learning_rate": 1.7514705941690247e-10, + "loss": 0.4174, + "step": 12268 + }, + { + "epoch": 0.9964265410541704, + "grad_norm": 5.748127242285962, + "learning_rate": 1.6744932397422254e-10, + "loss": 0.5133, + "step": 12269 + }, + { + "epoch": 0.996507756030212, + "grad_norm": 5.1019116955334916, + "learning_rate": 1.5992456371377584e-10, + "loss": 0.3742, + "step": 12270 + }, + { + "epoch": 0.9965889710062535, + "grad_norm": 6.408098676761947, + "learning_rate": 1.5257277915653458e-10, + "loss": 0.4672, + "step": 12271 + }, + { + "epoch": 0.9966701859822952, + "grad_norm": 6.484656082022969, + "learning_rate": 1.4539397081070328e-10, + "loss": 0.3491, + "step": 12272 + }, + { + "epoch": 0.9967514009583367, + "grad_norm": 3.999338555705399, + "learning_rate": 1.3838813917366188e-10, + "loss": 0.5493, + "step": 12273 + }, + { + "epoch": 0.9968326159343783, + "grad_norm": 5.4475266405766565, + "learning_rate": 1.3155528472974523e-10, + "loss": 0.3952, + "step": 12274 + }, + { + "epoch": 0.9969138309104199, + "grad_norm": 4.872599475892686, + "learning_rate": 1.2489540795163068e-10, + "loss": 0.5474, + "step": 12275 + }, + { + "epoch": 0.9969950458864615, + "grad_norm": 3.670731259776475, + "learning_rate": 1.18408509300616e-10, + "loss": 0.5644, + "step": 12276 + }, + { + "epoch": 0.997076260862503, + "grad_norm": 6.2602727752630045, + "learning_rate": 1.1209458922495365e-10, + "loss": 0.5351, + "step": 12277 + }, + { + "epoch": 0.9971574758385446, + "grad_norm": 4.875410969826889, + "learning_rate": 1.0595364816207155e-10, + "loss": 0.5048, + "step": 12278 + }, + { + "epoch": 0.9972386908145862, + "grad_norm": 4.9678315360142395, + "learning_rate": 9.998568653690754e-11, + "loss": 0.5401, + "step": 12279 + }, + { + "epoch": 0.9973199057906278, + "grad_norm": 4.51671927225658, + "learning_rate": 9.41907047619095e-11, + "loss": 0.516, + "step": 12280 + }, + { + "epoch": 0.9974011207666694, + "grad_norm": 4.930864987720356, + "learning_rate": 8.856870323842304e-11, + "loss": 0.5275, + "step": 12281 + }, + { + "epoch": 0.997482335742711, + "grad_norm": 6.047707965039309, + "learning_rate": 8.311968235530376e-11, + "loss": 0.4833, + "step": 12282 + }, + { + "epoch": 0.9975635507187526, + "grad_norm": 7.174138199102842, + "learning_rate": 7.784364248974996e-11, + "loss": 0.5413, + "step": 12283 + }, + { + "epoch": 0.9976447656947941, + "grad_norm": 6.751919593589131, + "learning_rate": 7.274058400674744e-11, + "loss": 0.4525, + "step": 12284 + }, + { + "epoch": 0.9977259806708357, + "grad_norm": 5.318816576138242, + "learning_rate": 6.781050725962468e-11, + "loss": 0.4845, + "step": 12285 + }, + { + "epoch": 0.9978071956468773, + "grad_norm": 4.612324143619252, + "learning_rate": 6.30534125889426e-11, + "loss": 0.5208, + "step": 12286 + }, + { + "epoch": 0.9978884106229189, + "grad_norm": 6.7018444490349065, + "learning_rate": 5.846930032443743e-11, + "loss": 0.4859, + "step": 12287 + }, + { + "epoch": 0.9979696255989604, + "grad_norm": 7.423300865438458, + "learning_rate": 5.4058170783077845e-11, + "loss": 0.3708, + "step": 12288 + }, + { + "epoch": 0.998050840575002, + "grad_norm": 6.668113758835644, + "learning_rate": 4.982002427017518e-11, + "loss": 0.5712, + "step": 12289 + }, + { + "epoch": 0.9981320555510436, + "grad_norm": 7.202885642026081, + "learning_rate": 4.5754861078828314e-11, + "loss": 0.4483, + "step": 12290 + }, + { + "epoch": 0.9982132705270852, + "grad_norm": 8.215415290405625, + "learning_rate": 4.186268149047879e-11, + "loss": 0.4036, + "step": 12291 + }, + { + "epoch": 0.9982944855031268, + "grad_norm": 3.4403988676406008, + "learning_rate": 3.814348577435567e-11, + "loss": 0.4738, + "step": 12292 + }, + { + "epoch": 0.9983757004791683, + "grad_norm": 4.536469686726864, + "learning_rate": 3.4597274187753163e-11, + "loss": 0.5479, + "step": 12293 + }, + { + "epoch": 0.99845691545521, + "grad_norm": 4.166942290927568, + "learning_rate": 3.122404697603054e-11, + "loss": 0.4988, + "step": 12294 + }, + { + "epoch": 0.9985381304312515, + "grad_norm": 3.821490393417459, + "learning_rate": 2.8023804372889762e-11, + "loss": 0.6298, + "step": 12295 + }, + { + "epoch": 0.9986193454072931, + "grad_norm": 4.579385987949685, + "learning_rate": 2.499654659954276e-11, + "loss": 0.4335, + "step": 12296 + }, + { + "epoch": 0.9987005603833347, + "grad_norm": 3.5660288116414067, + "learning_rate": 2.214227386554413e-11, + "loss": 0.4252, + "step": 12297 + }, + { + "epoch": 0.9987817753593763, + "grad_norm": 3.8887534314797567, + "learning_rate": 1.9460986368513568e-11, + "loss": 0.5141, + "step": 12298 + }, + { + "epoch": 0.9988629903354178, + "grad_norm": 9.174267597596703, + "learning_rate": 1.6952684293580767e-11, + "loss": 0.4097, + "step": 12299 + }, + { + "epoch": 0.9989442053114594, + "grad_norm": 4.342933188190943, + "learning_rate": 1.4617367814495632e-11, + "loss": 0.498, + "step": 12300 + }, + { + "epoch": 0.999025420287501, + "grad_norm": 5.079424217042842, + "learning_rate": 1.2455037093073163e-11, + "loss": 0.5183, + "step": 12301 + }, + { + "epoch": 0.9991066352635426, + "grad_norm": 8.135085893159895, + "learning_rate": 1.0465692278638361e-11, + "loss": 0.5642, + "step": 12302 + }, + { + "epoch": 0.9991878502395842, + "grad_norm": 30.1448448129456, + "learning_rate": 8.649333509136438e-12, + "loss": 0.4073, + "step": 12303 + }, + { + "epoch": 0.9992690652156258, + "grad_norm": 3.5750066680144217, + "learning_rate": 7.005960910022591e-12, + "loss": 0.4412, + "step": 12304 + }, + { + "epoch": 0.9993502801916674, + "grad_norm": 13.396727140828338, + "learning_rate": 5.535574594817128e-12, + "loss": 0.3997, + "step": 12305 + }, + { + "epoch": 0.9994314951677089, + "grad_norm": 6.163825951561884, + "learning_rate": 4.238174665938122e-12, + "loss": 0.3945, + "step": 12306 + }, + { + "epoch": 0.9995127101437505, + "grad_norm": 5.7285348007196975, + "learning_rate": 3.11376121248097e-12, + "loss": 0.4673, + "step": 12307 + }, + { + "epoch": 0.9995939251197921, + "grad_norm": 4.103396802719272, + "learning_rate": 2.1623343124388405e-12, + "loss": 0.4399, + "step": 12308 + }, + { + "epoch": 0.9996751400958337, + "grad_norm": 4.630608713223542, + "learning_rate": 1.3838940318700034e-12, + "loss": 0.4208, + "step": 12309 + }, + { + "epoch": 0.9997563550718752, + "grad_norm": 5.594155269906072, + "learning_rate": 7.784404243427191e-13, + "loss": 0.5584, + "step": 12310 + }, + { + "epoch": 0.9998375700479168, + "grad_norm": 7.544818372955951, + "learning_rate": 3.4597353176790696e-13, + "loss": 0.4048, + "step": 12311 + }, + { + "epoch": 0.9999187850239584, + "grad_norm": 4.9742349717120895, + "learning_rate": 8.649338439914445e-14, + "loss": 0.5781, + "step": 12312 + }, + { + "epoch": 1.0, + "grad_norm": 4.419415245809011, + "learning_rate": 0.0, + "loss": 0.4966, + "step": 12313 + }, + { + "epoch": 1.0, + "step": 12313, + "total_flos": 582090100039680.0, + "train_loss": 0.5386334688529102, + "train_runtime": 33230.6995, + "train_samples_per_second": 11.857, + "train_steps_per_second": 0.371 + } + ], + "logging_steps": 1.0, + "max_steps": 12313, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 4101, + "total_flos": 582090100039680.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}