diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,58857 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9998215029452013, + "eval_steps": 500, + "global_step": 8402, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00023799607306479442, + "grad_norm": 5.3633106806179, + "learning_rate": 0.0, + "loss": 0.7123, + "step": 1 + }, + { + "epoch": 0.00047599214612958885, + "grad_norm": 3.8186829684019856, + "learning_rate": 3.952569169960474e-08, + "loss": 0.6005, + "step": 2 + }, + { + "epoch": 0.0007139882191943833, + "grad_norm": 3.650677826595722, + "learning_rate": 7.905138339920948e-08, + "loss": 0.5807, + "step": 3 + }, + { + "epoch": 0.0009519842922591777, + "grad_norm": 3.7298913667173106, + "learning_rate": 1.1857707509881423e-07, + "loss": 0.7382, + "step": 4 + }, + { + "epoch": 0.0011899803653239722, + "grad_norm": 5.228820067839456, + "learning_rate": 1.5810276679841897e-07, + "loss": 0.6882, + "step": 5 + }, + { + "epoch": 0.0014279764383887665, + "grad_norm": 3.4388897772378844, + "learning_rate": 1.9762845849802374e-07, + "loss": 0.5336, + "step": 6 + }, + { + "epoch": 0.001665972511453561, + "grad_norm": 3.788288992632434, + "learning_rate": 2.3715415019762845e-07, + "loss": 0.6794, + "step": 7 + }, + { + "epoch": 0.0019039685845183554, + "grad_norm": 4.769198995741846, + "learning_rate": 2.766798418972332e-07, + "loss": 0.6865, + "step": 8 + }, + { + "epoch": 0.00214196465758315, + "grad_norm": 4.131645314597065, + "learning_rate": 3.1620553359683794e-07, + "loss": 0.6119, + "step": 9 + }, + { + "epoch": 0.0023799607306479445, + "grad_norm": 3.453158794881233, + "learning_rate": 3.5573122529644276e-07, + "loss": 0.5578, + "step": 10 + }, + { + "epoch": 0.0026179568037127386, + "grad_norm": 4.116364371771232, + "learning_rate": 3.9525691699604747e-07, + "loss": 0.6975, + "step": 11 + }, + { + "epoch": 0.002855952876777533, + "grad_norm": 4.918270216385396, + "learning_rate": 4.347826086956522e-07, + "loss": 0.6732, + "step": 12 + }, + { + "epoch": 0.0030939489498423276, + "grad_norm": 3.817650403686872, + "learning_rate": 4.743083003952569e-07, + "loss": 0.6094, + "step": 13 + }, + { + "epoch": 0.003331945022907122, + "grad_norm": 3.369346282433572, + "learning_rate": 5.138339920948617e-07, + "loss": 0.6053, + "step": 14 + }, + { + "epoch": 0.0035699410959719163, + "grad_norm": 3.988126056910898, + "learning_rate": 5.533596837944664e-07, + "loss": 0.7204, + "step": 15 + }, + { + "epoch": 0.003807937169036711, + "grad_norm": 3.8112595042647035, + "learning_rate": 5.928853754940712e-07, + "loss": 0.5855, + "step": 16 + }, + { + "epoch": 0.004045933242101505, + "grad_norm": 2.48224547819952, + "learning_rate": 6.324110671936759e-07, + "loss": 0.5257, + "step": 17 + }, + { + "epoch": 0.0042839293151663, + "grad_norm": 2.733961457300926, + "learning_rate": 6.719367588932807e-07, + "loss": 0.6612, + "step": 18 + }, + { + "epoch": 0.004521925388231094, + "grad_norm": 3.2790957516477235, + "learning_rate": 7.114624505928855e-07, + "loss": 0.6498, + "step": 19 + }, + { + "epoch": 0.004759921461295889, + "grad_norm": 2.5124910550047352, + "learning_rate": 7.509881422924902e-07, + "loss": 0.5692, + "step": 20 + }, + { + "epoch": 0.0049979175343606835, + "grad_norm": 2.659608784971935, + "learning_rate": 7.905138339920949e-07, + "loss": 0.5649, + "step": 21 + }, + { + "epoch": 0.005235913607425477, + "grad_norm": 2.198967450226314, + "learning_rate": 8.300395256916997e-07, + "loss": 0.6861, + "step": 22 + }, + { + "epoch": 0.005473909680490272, + "grad_norm": 1.8298179674027764, + "learning_rate": 8.695652173913044e-07, + "loss": 0.5675, + "step": 23 + }, + { + "epoch": 0.005711905753555066, + "grad_norm": 1.696370720789889, + "learning_rate": 9.090909090909091e-07, + "loss": 0.5139, + "step": 24 + }, + { + "epoch": 0.005949901826619861, + "grad_norm": 1.8785219368535397, + "learning_rate": 9.486166007905138e-07, + "loss": 0.6436, + "step": 25 + }, + { + "epoch": 0.006187897899684655, + "grad_norm": 2.08357255427808, + "learning_rate": 9.881422924901187e-07, + "loss": 0.6934, + "step": 26 + }, + { + "epoch": 0.00642589397274945, + "grad_norm": 1.6150355998820158, + "learning_rate": 1.0276679841897233e-06, + "loss": 0.5594, + "step": 27 + }, + { + "epoch": 0.006663890045814244, + "grad_norm": 1.5788592814751126, + "learning_rate": 1.067193675889328e-06, + "loss": 0.5212, + "step": 28 + }, + { + "epoch": 0.006901886118879039, + "grad_norm": 1.7395751013863352, + "learning_rate": 1.1067193675889329e-06, + "loss": 0.6369, + "step": 29 + }, + { + "epoch": 0.0071398821919438325, + "grad_norm": 1.7730246621108292, + "learning_rate": 1.1462450592885378e-06, + "loss": 0.636, + "step": 30 + }, + { + "epoch": 0.007377878265008627, + "grad_norm": 1.245992545689414, + "learning_rate": 1.1857707509881424e-06, + "loss": 0.5341, + "step": 31 + }, + { + "epoch": 0.007615874338073422, + "grad_norm": 1.259904616724805, + "learning_rate": 1.225296442687747e-06, + "loss": 0.5601, + "step": 32 + }, + { + "epoch": 0.007853870411138216, + "grad_norm": 1.427704060064298, + "learning_rate": 1.2648221343873517e-06, + "loss": 0.6261, + "step": 33 + }, + { + "epoch": 0.00809186648420301, + "grad_norm": 1.4266849280066096, + "learning_rate": 1.3043478260869566e-06, + "loss": 0.5241, + "step": 34 + }, + { + "epoch": 0.008329862557267805, + "grad_norm": 1.1492899123050067, + "learning_rate": 1.3438735177865615e-06, + "loss": 0.5351, + "step": 35 + }, + { + "epoch": 0.0085678586303326, + "grad_norm": 1.3426107399500602, + "learning_rate": 1.3833992094861662e-06, + "loss": 0.6201, + "step": 36 + }, + { + "epoch": 0.008805854703397394, + "grad_norm": 1.3943538319404358, + "learning_rate": 1.422924901185771e-06, + "loss": 0.5752, + "step": 37 + }, + { + "epoch": 0.009043850776462189, + "grad_norm": 1.2295476918388797, + "learning_rate": 1.4624505928853755e-06, + "loss": 0.5134, + "step": 38 + }, + { + "epoch": 0.009281846849526983, + "grad_norm": 1.0233852069436877, + "learning_rate": 1.5019762845849804e-06, + "loss": 0.5413, + "step": 39 + }, + { + "epoch": 0.009519842922591778, + "grad_norm": 1.095723970715187, + "learning_rate": 1.541501976284585e-06, + "loss": 0.6027, + "step": 40 + }, + { + "epoch": 0.009757838995656572, + "grad_norm": 0.9021710359927738, + "learning_rate": 1.5810276679841899e-06, + "loss": 0.5096, + "step": 41 + }, + { + "epoch": 0.009995835068721367, + "grad_norm": 0.8271645037826616, + "learning_rate": 1.6205533596837948e-06, + "loss": 0.4744, + "step": 42 + }, + { + "epoch": 0.01023383114178616, + "grad_norm": 0.9178185602286086, + "learning_rate": 1.6600790513833994e-06, + "loss": 0.5669, + "step": 43 + }, + { + "epoch": 0.010471827214850954, + "grad_norm": 0.7642553443300366, + "learning_rate": 1.699604743083004e-06, + "loss": 0.5444, + "step": 44 + }, + { + "epoch": 0.010709823287915749, + "grad_norm": 0.7527668145998719, + "learning_rate": 1.7391304347826088e-06, + "loss": 0.48, + "step": 45 + }, + { + "epoch": 0.010947819360980543, + "grad_norm": 0.7022856285565289, + "learning_rate": 1.7786561264822136e-06, + "loss": 0.5277, + "step": 46 + }, + { + "epoch": 0.011185815434045338, + "grad_norm": 0.7455577433162014, + "learning_rate": 1.8181818181818183e-06, + "loss": 0.6125, + "step": 47 + }, + { + "epoch": 0.011423811507110132, + "grad_norm": 0.7225816964027125, + "learning_rate": 1.8577075098814232e-06, + "loss": 0.4642, + "step": 48 + }, + { + "epoch": 0.011661807580174927, + "grad_norm": 0.6856376423837439, + "learning_rate": 1.8972332015810276e-06, + "loss": 0.4817, + "step": 49 + }, + { + "epoch": 0.011899803653239721, + "grad_norm": 0.7776257134551714, + "learning_rate": 1.9367588932806323e-06, + "loss": 0.5714, + "step": 50 + }, + { + "epoch": 0.012137799726304516, + "grad_norm": 0.6598276781972773, + "learning_rate": 1.9762845849802374e-06, + "loss": 0.5405, + "step": 51 + }, + { + "epoch": 0.01237579579936931, + "grad_norm": 0.7519528179240484, + "learning_rate": 2.015810276679842e-06, + "loss": 0.4796, + "step": 52 + }, + { + "epoch": 0.012613791872434105, + "grad_norm": 0.6641024419350181, + "learning_rate": 2.0553359683794467e-06, + "loss": 0.4447, + "step": 53 + }, + { + "epoch": 0.0128517879454989, + "grad_norm": 0.7974261768047928, + "learning_rate": 2.0948616600790518e-06, + "loss": 0.5726, + "step": 54 + }, + { + "epoch": 0.013089784018563694, + "grad_norm": 0.701806597200162, + "learning_rate": 2.134387351778656e-06, + "loss": 0.5425, + "step": 55 + }, + { + "epoch": 0.013327780091628489, + "grad_norm": 0.6533475855597715, + "learning_rate": 2.173913043478261e-06, + "loss": 0.4617, + "step": 56 + }, + { + "epoch": 0.013565776164693283, + "grad_norm": 0.6176303785724955, + "learning_rate": 2.2134387351778658e-06, + "loss": 0.525, + "step": 57 + }, + { + "epoch": 0.013803772237758078, + "grad_norm": 0.6092588831474826, + "learning_rate": 2.2529644268774704e-06, + "loss": 0.5486, + "step": 58 + }, + { + "epoch": 0.014041768310822872, + "grad_norm": 0.6409287405924915, + "learning_rate": 2.2924901185770755e-06, + "loss": 0.4525, + "step": 59 + }, + { + "epoch": 0.014279764383887665, + "grad_norm": 0.5448182821305082, + "learning_rate": 2.33201581027668e-06, + "loss": 0.4333, + "step": 60 + }, + { + "epoch": 0.01451776045695246, + "grad_norm": 0.5985899326268923, + "learning_rate": 2.371541501976285e-06, + "loss": 0.5382, + "step": 61 + }, + { + "epoch": 0.014755756530017254, + "grad_norm": 0.5562460345435642, + "learning_rate": 2.4110671936758895e-06, + "loss": 0.5455, + "step": 62 + }, + { + "epoch": 0.014993752603082049, + "grad_norm": 0.5762099087323772, + "learning_rate": 2.450592885375494e-06, + "loss": 0.4017, + "step": 63 + }, + { + "epoch": 0.015231748676146843, + "grad_norm": 0.5743174512558051, + "learning_rate": 2.4901185770750993e-06, + "loss": 0.4655, + "step": 64 + }, + { + "epoch": 0.015469744749211638, + "grad_norm": 0.5478944257204411, + "learning_rate": 2.5296442687747035e-06, + "loss": 0.5516, + "step": 65 + }, + { + "epoch": 0.015707740822276432, + "grad_norm": 0.5348787537296499, + "learning_rate": 2.5691699604743086e-06, + "loss": 0.4859, + "step": 66 + }, + { + "epoch": 0.015945736895341227, + "grad_norm": 0.513662426262249, + "learning_rate": 2.6086956521739132e-06, + "loss": 0.4405, + "step": 67 + }, + { + "epoch": 0.01618373296840602, + "grad_norm": 0.5477522117902747, + "learning_rate": 2.6482213438735183e-06, + "loss": 0.5469, + "step": 68 + }, + { + "epoch": 0.016421729041470816, + "grad_norm": 0.5353133875838187, + "learning_rate": 2.687747035573123e-06, + "loss": 0.5546, + "step": 69 + }, + { + "epoch": 0.01665972511453561, + "grad_norm": 0.5247379742690176, + "learning_rate": 2.7272727272727272e-06, + "loss": 0.45, + "step": 70 + }, + { + "epoch": 0.016897721187600405, + "grad_norm": 0.48632866471872, + "learning_rate": 2.7667984189723323e-06, + "loss": 0.4319, + "step": 71 + }, + { + "epoch": 0.0171357172606652, + "grad_norm": 0.5339643875178105, + "learning_rate": 2.806324110671937e-06, + "loss": 0.5189, + "step": 72 + }, + { + "epoch": 0.017373713333729994, + "grad_norm": 0.5148248669651565, + "learning_rate": 2.845849802371542e-06, + "loss": 0.5148, + "step": 73 + }, + { + "epoch": 0.01761170940679479, + "grad_norm": 0.4906003639934845, + "learning_rate": 2.8853754940711463e-06, + "loss": 0.4181, + "step": 74 + }, + { + "epoch": 0.017849705479859583, + "grad_norm": 0.5090667895636138, + "learning_rate": 2.924901185770751e-06, + "loss": 0.4989, + "step": 75 + }, + { + "epoch": 0.018087701552924378, + "grad_norm": 0.47929822302895847, + "learning_rate": 2.964426877470356e-06, + "loss": 0.51, + "step": 76 + }, + { + "epoch": 0.018325697625989172, + "grad_norm": 0.49337262495522655, + "learning_rate": 3.0039525691699607e-06, + "loss": 0.416, + "step": 77 + }, + { + "epoch": 0.018563693699053967, + "grad_norm": 0.46721565809919324, + "learning_rate": 3.043478260869566e-06, + "loss": 0.4243, + "step": 78 + }, + { + "epoch": 0.01880168977211876, + "grad_norm": 0.47787380347630104, + "learning_rate": 3.08300395256917e-06, + "loss": 0.5134, + "step": 79 + }, + { + "epoch": 0.019039685845183556, + "grad_norm": 0.4888904145322131, + "learning_rate": 3.1225296442687747e-06, + "loss": 0.4965, + "step": 80 + }, + { + "epoch": 0.01927768191824835, + "grad_norm": 0.474440273062018, + "learning_rate": 3.1620553359683798e-06, + "loss": 0.4231, + "step": 81 + }, + { + "epoch": 0.019515677991313145, + "grad_norm": 0.4917931480050054, + "learning_rate": 3.2015810276679844e-06, + "loss": 0.5044, + "step": 82 + }, + { + "epoch": 0.01975367406437794, + "grad_norm": 0.4850752880482312, + "learning_rate": 3.2411067193675895e-06, + "loss": 0.4966, + "step": 83 + }, + { + "epoch": 0.019991670137442734, + "grad_norm": 0.504493166936379, + "learning_rate": 3.2806324110671938e-06, + "loss": 0.416, + "step": 84 + }, + { + "epoch": 0.020229666210507525, + "grad_norm": 0.5145462573417079, + "learning_rate": 3.320158102766799e-06, + "loss": 0.405, + "step": 85 + }, + { + "epoch": 0.02046766228357232, + "grad_norm": 0.4950180063312114, + "learning_rate": 3.3596837944664035e-06, + "loss": 0.5179, + "step": 86 + }, + { + "epoch": 0.020705658356637114, + "grad_norm": 0.4499055758325075, + "learning_rate": 3.399209486166008e-06, + "loss": 0.5045, + "step": 87 + }, + { + "epoch": 0.02094365442970191, + "grad_norm": 0.4886696541409708, + "learning_rate": 3.4387351778656133e-06, + "loss": 0.4223, + "step": 88 + }, + { + "epoch": 0.021181650502766703, + "grad_norm": 0.47670535706005207, + "learning_rate": 3.4782608695652175e-06, + "loss": 0.4569, + "step": 89 + }, + { + "epoch": 0.021419646575831498, + "grad_norm": 0.4382725012174603, + "learning_rate": 3.5177865612648226e-06, + "loss": 0.5505, + "step": 90 + }, + { + "epoch": 0.021657642648896292, + "grad_norm": 0.47014527706939824, + "learning_rate": 3.5573122529644273e-06, + "loss": 0.4423, + "step": 91 + }, + { + "epoch": 0.021895638721961087, + "grad_norm": 0.4776076310968795, + "learning_rate": 3.5968379446640315e-06, + "loss": 0.4014, + "step": 92 + }, + { + "epoch": 0.02213363479502588, + "grad_norm": 0.48030073876619656, + "learning_rate": 3.6363636363636366e-06, + "loss": 0.483, + "step": 93 + }, + { + "epoch": 0.022371630868090676, + "grad_norm": 0.4298351648691775, + "learning_rate": 3.6758893280632412e-06, + "loss": 0.5171, + "step": 94 + }, + { + "epoch": 0.02260962694115547, + "grad_norm": 0.4368816040607989, + "learning_rate": 3.7154150197628463e-06, + "loss": 0.4107, + "step": 95 + }, + { + "epoch": 0.022847623014220265, + "grad_norm": 0.47265191527306455, + "learning_rate": 3.754940711462451e-06, + "loss": 0.4379, + "step": 96 + }, + { + "epoch": 0.02308561908728506, + "grad_norm": 0.44472115691541, + "learning_rate": 3.7944664031620552e-06, + "loss": 0.5029, + "step": 97 + }, + { + "epoch": 0.023323615160349854, + "grad_norm": 0.47516731747233143, + "learning_rate": 3.833992094861661e-06, + "loss": 0.4948, + "step": 98 + }, + { + "epoch": 0.02356161123341465, + "grad_norm": 0.4756183678548411, + "learning_rate": 3.8735177865612646e-06, + "loss": 0.4284, + "step": 99 + }, + { + "epoch": 0.023799607306479443, + "grad_norm": 0.4648909223572498, + "learning_rate": 3.91304347826087e-06, + "loss": 0.4613, + "step": 100 + }, + { + "epoch": 0.024037603379544237, + "grad_norm": 0.45494168373864374, + "learning_rate": 3.952569169960475e-06, + "loss": 0.5071, + "step": 101 + }, + { + "epoch": 0.024275599452609032, + "grad_norm": 0.47165993720085936, + "learning_rate": 3.992094861660079e-06, + "loss": 0.4297, + "step": 102 + }, + { + "epoch": 0.024513595525673827, + "grad_norm": 0.46600664566797045, + "learning_rate": 4.031620553359684e-06, + "loss": 0.4006, + "step": 103 + }, + { + "epoch": 0.02475159159873862, + "grad_norm": 0.440136381866, + "learning_rate": 4.071146245059289e-06, + "loss": 0.5229, + "step": 104 + }, + { + "epoch": 0.024989587671803416, + "grad_norm": 0.4874483108078875, + "learning_rate": 4.110671936758893e-06, + "loss": 0.4793, + "step": 105 + }, + { + "epoch": 0.02522758374486821, + "grad_norm": 0.4412999202522836, + "learning_rate": 4.150197628458498e-06, + "loss": 0.3896, + "step": 106 + }, + { + "epoch": 0.025465579817933005, + "grad_norm": 0.4843482283829763, + "learning_rate": 4.1897233201581036e-06, + "loss": 0.5004, + "step": 107 + }, + { + "epoch": 0.0257035758909978, + "grad_norm": 0.5192505494175473, + "learning_rate": 4.229249011857708e-06, + "loss": 0.4848, + "step": 108 + }, + { + "epoch": 0.025941571964062594, + "grad_norm": 0.4697882778878796, + "learning_rate": 4.268774703557312e-06, + "loss": 0.4183, + "step": 109 + }, + { + "epoch": 0.026179568037127388, + "grad_norm": 0.47243383971128333, + "learning_rate": 4.3083003952569175e-06, + "loss": 0.4165, + "step": 110 + }, + { + "epoch": 0.026417564110192183, + "grad_norm": 0.46058732243836664, + "learning_rate": 4.347826086956522e-06, + "loss": 0.4516, + "step": 111 + }, + { + "epoch": 0.026655560183256977, + "grad_norm": 0.439182011456468, + "learning_rate": 4.387351778656127e-06, + "loss": 0.4817, + "step": 112 + }, + { + "epoch": 0.026893556256321772, + "grad_norm": 0.43801194497466545, + "learning_rate": 4.4268774703557315e-06, + "loss": 0.4033, + "step": 113 + }, + { + "epoch": 0.027131552329386566, + "grad_norm": 0.47578753276213764, + "learning_rate": 4.466403162055336e-06, + "loss": 0.4751, + "step": 114 + }, + { + "epoch": 0.02736954840245136, + "grad_norm": 0.44222504076871677, + "learning_rate": 4.505928853754941e-06, + "loss": 0.5385, + "step": 115 + }, + { + "epoch": 0.027607544475516155, + "grad_norm": 0.4619468918214023, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.4023, + "step": 116 + }, + { + "epoch": 0.02784554054858095, + "grad_norm": 0.4554362965127849, + "learning_rate": 4.584980237154151e-06, + "loss": 0.3981, + "step": 117 + }, + { + "epoch": 0.028083536621645745, + "grad_norm": 0.42154073719188745, + "learning_rate": 4.624505928853755e-06, + "loss": 0.4805, + "step": 118 + }, + { + "epoch": 0.028321532694710536, + "grad_norm": 0.4407938952654761, + "learning_rate": 4.66403162055336e-06, + "loss": 0.4883, + "step": 119 + }, + { + "epoch": 0.02855952876777533, + "grad_norm": 0.4533865445704307, + "learning_rate": 4.703557312252965e-06, + "loss": 0.4043, + "step": 120 + }, + { + "epoch": 0.028797524840840125, + "grad_norm": 0.4501769062330425, + "learning_rate": 4.74308300395257e-06, + "loss": 0.424, + "step": 121 + }, + { + "epoch": 0.02903552091390492, + "grad_norm": 0.4964204618463292, + "learning_rate": 4.782608695652174e-06, + "loss": 0.5057, + "step": 122 + }, + { + "epoch": 0.029273516986969714, + "grad_norm": 0.43608831339404674, + "learning_rate": 4.822134387351779e-06, + "loss": 0.4411, + "step": 123 + }, + { + "epoch": 0.029511513060034508, + "grad_norm": 0.5134189283501325, + "learning_rate": 4.861660079051384e-06, + "loss": 0.3686, + "step": 124 + }, + { + "epoch": 0.029749509133099303, + "grad_norm": 0.4454413745824674, + "learning_rate": 4.901185770750988e-06, + "loss": 0.4342, + "step": 125 + }, + { + "epoch": 0.029987505206164097, + "grad_norm": 0.4206745271958203, + "learning_rate": 4.940711462450593e-06, + "loss": 0.5246, + "step": 126 + }, + { + "epoch": 0.030225501279228892, + "grad_norm": 0.45531729242028907, + "learning_rate": 4.9802371541501985e-06, + "loss": 0.4269, + "step": 127 + }, + { + "epoch": 0.030463497352293686, + "grad_norm": 0.43018413490903146, + "learning_rate": 5.019762845849802e-06, + "loss": 0.4158, + "step": 128 + }, + { + "epoch": 0.03070149342535848, + "grad_norm": 0.4452267415803146, + "learning_rate": 5.059288537549407e-06, + "loss": 0.498, + "step": 129 + }, + { + "epoch": 0.030939489498423275, + "grad_norm": 0.44751910045276433, + "learning_rate": 5.0988142292490125e-06, + "loss": 0.4583, + "step": 130 + }, + { + "epoch": 0.03117748557148807, + "grad_norm": 0.4474828998483563, + "learning_rate": 5.138339920948617e-06, + "loss": 0.3873, + "step": 131 + }, + { + "epoch": 0.031415481644552865, + "grad_norm": 0.46141860455017775, + "learning_rate": 5.177865612648222e-06, + "loss": 0.4635, + "step": 132 + }, + { + "epoch": 0.03165347771761766, + "grad_norm": 0.4406433909527717, + "learning_rate": 5.2173913043478265e-06, + "loss": 0.5175, + "step": 133 + }, + { + "epoch": 0.031891473790682454, + "grad_norm": 0.46817970001375386, + "learning_rate": 5.256916996047431e-06, + "loss": 0.4269, + "step": 134 + }, + { + "epoch": 0.03212946986374725, + "grad_norm": 0.4582029027978222, + "learning_rate": 5.296442687747037e-06, + "loss": 0.3855, + "step": 135 + }, + { + "epoch": 0.03236746593681204, + "grad_norm": 0.4483408124347049, + "learning_rate": 5.335968379446641e-06, + "loss": 0.5035, + "step": 136 + }, + { + "epoch": 0.03260546200987684, + "grad_norm": 0.4521188947407404, + "learning_rate": 5.375494071146246e-06, + "loss": 0.4773, + "step": 137 + }, + { + "epoch": 0.03284345808294163, + "grad_norm": 0.4584775312885165, + "learning_rate": 5.41501976284585e-06, + "loss": 0.4263, + "step": 138 + }, + { + "epoch": 0.033081454156006426, + "grad_norm": 0.44388523673544455, + "learning_rate": 5.4545454545454545e-06, + "loss": 0.4206, + "step": 139 + }, + { + "epoch": 0.03331945022907122, + "grad_norm": 0.4443963263357755, + "learning_rate": 5.49407114624506e-06, + "loss": 0.5354, + "step": 140 + }, + { + "epoch": 0.033557446302136015, + "grad_norm": 0.4624952425984824, + "learning_rate": 5.533596837944665e-06, + "loss": 0.4237, + "step": 141 + }, + { + "epoch": 0.03379544237520081, + "grad_norm": 0.4372307380751775, + "learning_rate": 5.573122529644269e-06, + "loss": 0.4082, + "step": 142 + }, + { + "epoch": 0.034033438448265604, + "grad_norm": 0.4494964774973684, + "learning_rate": 5.612648221343874e-06, + "loss": 0.5033, + "step": 143 + }, + { + "epoch": 0.0342714345213304, + "grad_norm": 0.4308495087586641, + "learning_rate": 5.652173913043479e-06, + "loss": 0.4855, + "step": 144 + }, + { + "epoch": 0.03450943059439519, + "grad_norm": 0.45898397608033087, + "learning_rate": 5.691699604743084e-06, + "loss": 0.3943, + "step": 145 + }, + { + "epoch": 0.03474742666745999, + "grad_norm": 0.444667683614168, + "learning_rate": 5.731225296442689e-06, + "loss": 0.3907, + "step": 146 + }, + { + "epoch": 0.03498542274052478, + "grad_norm": 0.43190369082425845, + "learning_rate": 5.770750988142293e-06, + "loss": 0.5306, + "step": 147 + }, + { + "epoch": 0.03522341881358958, + "grad_norm": 0.47772442543782057, + "learning_rate": 5.810276679841897e-06, + "loss": 0.4328, + "step": 148 + }, + { + "epoch": 0.03546141488665437, + "grad_norm": 0.5149054927207557, + "learning_rate": 5.849802371541502e-06, + "loss": 0.4038, + "step": 149 + }, + { + "epoch": 0.035699410959719166, + "grad_norm": 0.46489298468292706, + "learning_rate": 5.8893280632411074e-06, + "loss": 0.4641, + "step": 150 + }, + { + "epoch": 0.03593740703278396, + "grad_norm": 0.43822016358847443, + "learning_rate": 5.928853754940712e-06, + "loss": 0.511, + "step": 151 + }, + { + "epoch": 0.036175403105848755, + "grad_norm": 0.4780094956775413, + "learning_rate": 5.968379446640317e-06, + "loss": 0.4091, + "step": 152 + }, + { + "epoch": 0.03641339917891355, + "grad_norm": 0.5193809604213568, + "learning_rate": 6.007905138339921e-06, + "loss": 0.3704, + "step": 153 + }, + { + "epoch": 0.036651395251978344, + "grad_norm": 0.47203611226409087, + "learning_rate": 6.047430830039526e-06, + "loss": 0.5057, + "step": 154 + }, + { + "epoch": 0.03688939132504314, + "grad_norm": 0.4255995467291753, + "learning_rate": 6.086956521739132e-06, + "loss": 0.4863, + "step": 155 + }, + { + "epoch": 0.03712738739810793, + "grad_norm": 0.4317511844695257, + "learning_rate": 6.126482213438736e-06, + "loss": 0.4005, + "step": 156 + }, + { + "epoch": 0.03736538347117273, + "grad_norm": 0.4463988044172272, + "learning_rate": 6.16600790513834e-06, + "loss": 0.4169, + "step": 157 + }, + { + "epoch": 0.03760337954423752, + "grad_norm": 0.4301302395556984, + "learning_rate": 6.205533596837945e-06, + "loss": 0.4749, + "step": 158 + }, + { + "epoch": 0.03784137561730232, + "grad_norm": 0.4885006908777435, + "learning_rate": 6.245059288537549e-06, + "loss": 0.4176, + "step": 159 + }, + { + "epoch": 0.03807937169036711, + "grad_norm": 0.490807059909575, + "learning_rate": 6.284584980237155e-06, + "loss": 0.3878, + "step": 160 + }, + { + "epoch": 0.038317367763431906, + "grad_norm": 0.43132409383451087, + "learning_rate": 6.3241106719367596e-06, + "loss": 0.4834, + "step": 161 + }, + { + "epoch": 0.0385553638364967, + "grad_norm": 0.44865911014332444, + "learning_rate": 6.363636363636364e-06, + "loss": 0.4821, + "step": 162 + }, + { + "epoch": 0.038793359909561495, + "grad_norm": 0.4260431090618812, + "learning_rate": 6.403162055335969e-06, + "loss": 0.4174, + "step": 163 + }, + { + "epoch": 0.03903135598262629, + "grad_norm": 0.49901659393595554, + "learning_rate": 6.442687747035574e-06, + "loss": 0.4296, + "step": 164 + }, + { + "epoch": 0.039269352055691084, + "grad_norm": 0.4897547343200327, + "learning_rate": 6.482213438735179e-06, + "loss": 0.512, + "step": 165 + }, + { + "epoch": 0.03950734812875588, + "grad_norm": 0.4813236527289336, + "learning_rate": 6.521739130434783e-06, + "loss": 0.431, + "step": 166 + }, + { + "epoch": 0.03974534420182067, + "grad_norm": 0.45927226014349076, + "learning_rate": 6.5612648221343875e-06, + "loss": 0.3718, + "step": 167 + }, + { + "epoch": 0.03998334027488547, + "grad_norm": 0.4965136015728435, + "learning_rate": 6.600790513833992e-06, + "loss": 0.4695, + "step": 168 + }, + { + "epoch": 0.04022133634795026, + "grad_norm": 0.4683557415256707, + "learning_rate": 6.640316205533598e-06, + "loss": 0.4778, + "step": 169 + }, + { + "epoch": 0.04045933242101505, + "grad_norm": 0.4634555253102173, + "learning_rate": 6.679841897233202e-06, + "loss": 0.3995, + "step": 170 + }, + { + "epoch": 0.040697328494079844, + "grad_norm": 0.48973289383270246, + "learning_rate": 6.719367588932807e-06, + "loss": 0.3976, + "step": 171 + }, + { + "epoch": 0.04093532456714464, + "grad_norm": 0.42723303350236147, + "learning_rate": 6.758893280632412e-06, + "loss": 0.494, + "step": 172 + }, + { + "epoch": 0.04117332064020943, + "grad_norm": 0.7587112192005009, + "learning_rate": 6.798418972332016e-06, + "loss": 0.3958, + "step": 173 + }, + { + "epoch": 0.04141131671327423, + "grad_norm": 0.4915235015843637, + "learning_rate": 6.837944664031622e-06, + "loss": 0.3719, + "step": 174 + }, + { + "epoch": 0.04164931278633902, + "grad_norm": 0.48028138363352874, + "learning_rate": 6.8774703557312265e-06, + "loss": 0.4144, + "step": 175 + }, + { + "epoch": 0.04188730885940382, + "grad_norm": 0.43339371237253094, + "learning_rate": 6.91699604743083e-06, + "loss": 0.523, + "step": 176 + }, + { + "epoch": 0.04212530493246861, + "grad_norm": 0.45447194682552483, + "learning_rate": 6.956521739130435e-06, + "loss": 0.3567, + "step": 177 + }, + { + "epoch": 0.042363301005533406, + "grad_norm": 0.4552581773897187, + "learning_rate": 6.99604743083004e-06, + "loss": 0.3722, + "step": 178 + }, + { + "epoch": 0.0426012970785982, + "grad_norm": 0.4796620850546896, + "learning_rate": 7.035573122529645e-06, + "loss": 0.4874, + "step": 179 + }, + { + "epoch": 0.042839293151662995, + "grad_norm": 0.43749845312110847, + "learning_rate": 7.07509881422925e-06, + "loss": 0.4922, + "step": 180 + }, + { + "epoch": 0.04307728922472779, + "grad_norm": 0.46789960493230326, + "learning_rate": 7.1146245059288545e-06, + "loss": 0.383, + "step": 181 + }, + { + "epoch": 0.043315285297792584, + "grad_norm": 0.5224298682432643, + "learning_rate": 7.154150197628459e-06, + "loss": 0.4526, + "step": 182 + }, + { + "epoch": 0.04355328137085738, + "grad_norm": 0.4746636793562093, + "learning_rate": 7.193675889328063e-06, + "loss": 0.4934, + "step": 183 + }, + { + "epoch": 0.04379127744392217, + "grad_norm": 0.4701682318036117, + "learning_rate": 7.233201581027669e-06, + "loss": 0.4034, + "step": 184 + }, + { + "epoch": 0.04402927351698697, + "grad_norm": 0.5015499630814017, + "learning_rate": 7.272727272727273e-06, + "loss": 0.3577, + "step": 185 + }, + { + "epoch": 0.04426726959005176, + "grad_norm": 0.447979905112201, + "learning_rate": 7.312252964426878e-06, + "loss": 0.4454, + "step": 186 + }, + { + "epoch": 0.04450526566311656, + "grad_norm": 0.4360535244392955, + "learning_rate": 7.3517786561264825e-06, + "loss": 0.4708, + "step": 187 + }, + { + "epoch": 0.04474326173618135, + "grad_norm": 0.4906094221266247, + "learning_rate": 7.391304347826087e-06, + "loss": 0.3734, + "step": 188 + }, + { + "epoch": 0.044981257809246146, + "grad_norm": 0.47012438815001395, + "learning_rate": 7.430830039525693e-06, + "loss": 0.4194, + "step": 189 + }, + { + "epoch": 0.04521925388231094, + "grad_norm": 0.4668496482235658, + "learning_rate": 7.470355731225297e-06, + "loss": 0.4804, + "step": 190 + }, + { + "epoch": 0.045457249955375735, + "grad_norm": 0.43523540977913394, + "learning_rate": 7.509881422924902e-06, + "loss": 0.4067, + "step": 191 + }, + { + "epoch": 0.04569524602844053, + "grad_norm": 0.4388184201414626, + "learning_rate": 7.549407114624507e-06, + "loss": 0.405, + "step": 192 + }, + { + "epoch": 0.045933242101505324, + "grad_norm": 0.4710177222648498, + "learning_rate": 7.5889328063241105e-06, + "loss": 0.4461, + "step": 193 + }, + { + "epoch": 0.04617123817457012, + "grad_norm": 0.45506913551818734, + "learning_rate": 7.628458498023717e-06, + "loss": 0.5013, + "step": 194 + }, + { + "epoch": 0.04640923424763491, + "grad_norm": 0.4503230582686603, + "learning_rate": 7.667984189723321e-06, + "loss": 0.3812, + "step": 195 + }, + { + "epoch": 0.04664723032069971, + "grad_norm": 0.4900687410351956, + "learning_rate": 7.707509881422925e-06, + "loss": 0.3929, + "step": 196 + }, + { + "epoch": 0.0468852263937645, + "grad_norm": 0.4422937878413219, + "learning_rate": 7.747035573122529e-06, + "loss": 0.4826, + "step": 197 + }, + { + "epoch": 0.0471232224668293, + "grad_norm": 0.4696233806991265, + "learning_rate": 7.786561264822135e-06, + "loss": 0.4321, + "step": 198 + }, + { + "epoch": 0.04736121853989409, + "grad_norm": 0.5036764448381432, + "learning_rate": 7.82608695652174e-06, + "loss": 0.3737, + "step": 199 + }, + { + "epoch": 0.047599214612958886, + "grad_norm": 0.4460954367231951, + "learning_rate": 7.865612648221344e-06, + "loss": 0.4502, + "step": 200 + }, + { + "epoch": 0.04783721068602368, + "grad_norm": 0.4121996924774416, + "learning_rate": 7.90513833992095e-06, + "loss": 0.4872, + "step": 201 + }, + { + "epoch": 0.048075206759088475, + "grad_norm": 0.5377437642053337, + "learning_rate": 7.944664031620553e-06, + "loss": 0.3756, + "step": 202 + }, + { + "epoch": 0.04831320283215327, + "grad_norm": 0.4869568221691017, + "learning_rate": 7.984189723320159e-06, + "loss": 0.3994, + "step": 203 + }, + { + "epoch": 0.048551198905218064, + "grad_norm": 0.45051544052407894, + "learning_rate": 8.023715415019764e-06, + "loss": 0.4758, + "step": 204 + }, + { + "epoch": 0.04878919497828286, + "grad_norm": 0.46693597119879243, + "learning_rate": 8.063241106719368e-06, + "loss": 0.4724, + "step": 205 + }, + { + "epoch": 0.04902719105134765, + "grad_norm": 0.4434729096123986, + "learning_rate": 8.102766798418974e-06, + "loss": 0.3654, + "step": 206 + }, + { + "epoch": 0.04926518712441245, + "grad_norm": 0.4516410800019458, + "learning_rate": 8.142292490118577e-06, + "loss": 0.4328, + "step": 207 + }, + { + "epoch": 0.04950318319747724, + "grad_norm": 0.4092851533761814, + "learning_rate": 8.181818181818183e-06, + "loss": 0.4973, + "step": 208 + }, + { + "epoch": 0.04974117927054204, + "grad_norm": 0.4458979348243189, + "learning_rate": 8.221343873517787e-06, + "loss": 0.3875, + "step": 209 + }, + { + "epoch": 0.04997917534360683, + "grad_norm": 0.4575088617794585, + "learning_rate": 8.260869565217392e-06, + "loss": 0.3812, + "step": 210 + }, + { + "epoch": 0.050217171416671626, + "grad_norm": 0.42998279360778746, + "learning_rate": 8.300395256916996e-06, + "loss": 0.4689, + "step": 211 + }, + { + "epoch": 0.05045516748973642, + "grad_norm": 0.4664154763779984, + "learning_rate": 8.339920948616602e-06, + "loss": 0.4628, + "step": 212 + }, + { + "epoch": 0.050693163562801215, + "grad_norm": 0.48372522149295644, + "learning_rate": 8.379446640316207e-06, + "loss": 0.3713, + "step": 213 + }, + { + "epoch": 0.05093115963586601, + "grad_norm": 0.45614467932799935, + "learning_rate": 8.418972332015811e-06, + "loss": 0.4565, + "step": 214 + }, + { + "epoch": 0.051169155708930804, + "grad_norm": 0.4521533500555388, + "learning_rate": 8.458498023715416e-06, + "loss": 0.4939, + "step": 215 + }, + { + "epoch": 0.0514071517819956, + "grad_norm": 0.44133282573284666, + "learning_rate": 8.49802371541502e-06, + "loss": 0.4448, + "step": 216 + }, + { + "epoch": 0.05164514785506039, + "grad_norm": 0.4461748694701973, + "learning_rate": 8.537549407114624e-06, + "loss": 0.3685, + "step": 217 + }, + { + "epoch": 0.05188314392812519, + "grad_norm": 0.43361066733503273, + "learning_rate": 8.57707509881423e-06, + "loss": 0.4443, + "step": 218 + }, + { + "epoch": 0.05212114000118998, + "grad_norm": 0.44192635288128085, + "learning_rate": 8.616600790513835e-06, + "loss": 0.4858, + "step": 219 + }, + { + "epoch": 0.052359136074254777, + "grad_norm": 0.4765904637426461, + "learning_rate": 8.656126482213439e-06, + "loss": 0.37, + "step": 220 + }, + { + "epoch": 0.05259713214731957, + "grad_norm": 0.42950301730198376, + "learning_rate": 8.695652173913044e-06, + "loss": 0.4087, + "step": 221 + }, + { + "epoch": 0.052835128220384366, + "grad_norm": 0.4493355585707809, + "learning_rate": 8.735177865612648e-06, + "loss": 0.4808, + "step": 222 + }, + { + "epoch": 0.05307312429344916, + "grad_norm": 0.4473843798566356, + "learning_rate": 8.774703557312254e-06, + "loss": 0.4421, + "step": 223 + }, + { + "epoch": 0.053311120366513955, + "grad_norm": 0.4662717413273855, + "learning_rate": 8.81422924901186e-06, + "loss": 0.3826, + "step": 224 + }, + { + "epoch": 0.05354911643957875, + "grad_norm": 0.4753294327238269, + "learning_rate": 8.853754940711463e-06, + "loss": 0.435, + "step": 225 + }, + { + "epoch": 0.053787112512643544, + "grad_norm": 0.43479966227192995, + "learning_rate": 8.893280632411067e-06, + "loss": 0.4577, + "step": 226 + }, + { + "epoch": 0.05402510858570834, + "grad_norm": 0.4540970891602704, + "learning_rate": 8.932806324110672e-06, + "loss": 0.3734, + "step": 227 + }, + { + "epoch": 0.05426310465877313, + "grad_norm": 0.47014652353944864, + "learning_rate": 8.972332015810278e-06, + "loss": 0.4018, + "step": 228 + }, + { + "epoch": 0.05450110073183793, + "grad_norm": 0.4588196915897715, + "learning_rate": 9.011857707509882e-06, + "loss": 0.4821, + "step": 229 + }, + { + "epoch": 0.05473909680490272, + "grad_norm": 0.4290397451443549, + "learning_rate": 9.051383399209487e-06, + "loss": 0.4443, + "step": 230 + }, + { + "epoch": 0.054977092877967516, + "grad_norm": 0.4634653667823266, + "learning_rate": 9.090909090909091e-06, + "loss": 0.3622, + "step": 231 + }, + { + "epoch": 0.05521508895103231, + "grad_norm": 0.4799790289966676, + "learning_rate": 9.130434782608697e-06, + "loss": 0.4462, + "step": 232 + }, + { + "epoch": 0.055453085024097105, + "grad_norm": 0.4480981798171716, + "learning_rate": 9.169960474308302e-06, + "loss": 0.4328, + "step": 233 + }, + { + "epoch": 0.0556910810971619, + "grad_norm": 0.46767604607383606, + "learning_rate": 9.209486166007906e-06, + "loss": 0.412, + "step": 234 + }, + { + "epoch": 0.055929077170226695, + "grad_norm": 0.5333947256403828, + "learning_rate": 9.24901185770751e-06, + "loss": 0.3851, + "step": 235 + }, + { + "epoch": 0.05616707324329149, + "grad_norm": 0.44207186013928046, + "learning_rate": 9.288537549407115e-06, + "loss": 0.4775, + "step": 236 + }, + { + "epoch": 0.056405069316356284, + "grad_norm": 0.4579089731862815, + "learning_rate": 9.32806324110672e-06, + "loss": 0.4796, + "step": 237 + }, + { + "epoch": 0.05664306538942107, + "grad_norm": 0.4916045682404132, + "learning_rate": 9.367588932806325e-06, + "loss": 0.3495, + "step": 238 + }, + { + "epoch": 0.056881061462485866, + "grad_norm": 0.48592356270000286, + "learning_rate": 9.40711462450593e-06, + "loss": 0.3762, + "step": 239 + }, + { + "epoch": 0.05711905753555066, + "grad_norm": 0.40760679787076876, + "learning_rate": 9.446640316205534e-06, + "loss": 0.4946, + "step": 240 + }, + { + "epoch": 0.057357053608615455, + "grad_norm": 0.5404446805301054, + "learning_rate": 9.48616600790514e-06, + "loss": 0.3878, + "step": 241 + }, + { + "epoch": 0.05759504968168025, + "grad_norm": 0.5023864741183144, + "learning_rate": 9.525691699604745e-06, + "loss": 0.3769, + "step": 242 + }, + { + "epoch": 0.057833045754745044, + "grad_norm": 0.4291166201410665, + "learning_rate": 9.565217391304349e-06, + "loss": 0.4405, + "step": 243 + }, + { + "epoch": 0.05807104182780984, + "grad_norm": 0.4370205569354994, + "learning_rate": 9.604743083003954e-06, + "loss": 0.4543, + "step": 244 + }, + { + "epoch": 0.05830903790087463, + "grad_norm": 0.4726343079593376, + "learning_rate": 9.644268774703558e-06, + "loss": 0.3977, + "step": 245 + }, + { + "epoch": 0.05854703397393943, + "grad_norm": 0.48561129532487857, + "learning_rate": 9.683794466403162e-06, + "loss": 0.3915, + "step": 246 + }, + { + "epoch": 0.05878503004700422, + "grad_norm": 0.4757606568781063, + "learning_rate": 9.723320158102767e-06, + "loss": 0.4472, + "step": 247 + }, + { + "epoch": 0.059023026120069016, + "grad_norm": 0.515327431255371, + "learning_rate": 9.762845849802373e-06, + "loss": 0.4027, + "step": 248 + }, + { + "epoch": 0.05926102219313381, + "grad_norm": 0.4762224245393142, + "learning_rate": 9.802371541501977e-06, + "loss": 0.3695, + "step": 249 + }, + { + "epoch": 0.059499018266198606, + "grad_norm": 0.4459966492940902, + "learning_rate": 9.841897233201582e-06, + "loss": 0.4651, + "step": 250 + }, + { + "epoch": 0.0597370143392634, + "grad_norm": 0.4460885254332536, + "learning_rate": 9.881422924901186e-06, + "loss": 0.4789, + "step": 251 + }, + { + "epoch": 0.059975010412328195, + "grad_norm": 0.5301749599674928, + "learning_rate": 9.920948616600791e-06, + "loss": 0.3894, + "step": 252 + }, + { + "epoch": 0.06021300648539299, + "grad_norm": 0.5074601139708311, + "learning_rate": 9.960474308300397e-06, + "loss": 0.379, + "step": 253 + }, + { + "epoch": 0.060451002558457784, + "grad_norm": 0.46943388848534723, + "learning_rate": 1e-05, + "loss": 0.4968, + "step": 254 + }, + { + "epoch": 0.06068899863152258, + "grad_norm": 0.4849058663631589, + "learning_rate": 9.999999628438155e-06, + "loss": 0.4239, + "step": 255 + }, + { + "epoch": 0.06092699470458737, + "grad_norm": 0.5159279619948409, + "learning_rate": 9.999998513752668e-06, + "loss": 0.3897, + "step": 256 + }, + { + "epoch": 0.06116499077765217, + "grad_norm": 0.4913799031325082, + "learning_rate": 9.999996655943708e-06, + "loss": 0.4329, + "step": 257 + }, + { + "epoch": 0.06140298685071696, + "grad_norm": 0.4609134555633486, + "learning_rate": 9.999994055011552e-06, + "loss": 0.4635, + "step": 258 + }, + { + "epoch": 0.061640982923781756, + "grad_norm": 0.4600665037690504, + "learning_rate": 9.999990710956586e-06, + "loss": 0.361, + "step": 259 + }, + { + "epoch": 0.06187897899684655, + "grad_norm": 0.4893892245206209, + "learning_rate": 9.999986623779307e-06, + "loss": 0.3929, + "step": 260 + }, + { + "epoch": 0.062116975069911345, + "grad_norm": 0.47317754883053104, + "learning_rate": 9.99998179348032e-06, + "loss": 0.4411, + "step": 261 + }, + { + "epoch": 0.06235497114297614, + "grad_norm": 0.46027884774340916, + "learning_rate": 9.999976220060347e-06, + "loss": 0.4357, + "step": 262 + }, + { + "epoch": 0.06259296721604093, + "grad_norm": 0.5110985377219065, + "learning_rate": 9.999969903520212e-06, + "loss": 0.368, + "step": 263 + }, + { + "epoch": 0.06283096328910573, + "grad_norm": 0.5424172208623367, + "learning_rate": 9.999962843860858e-06, + "loss": 0.3945, + "step": 264 + }, + { + "epoch": 0.06306895936217052, + "grad_norm": 0.4455661724192156, + "learning_rate": 9.999955041083332e-06, + "loss": 0.4574, + "step": 265 + }, + { + "epoch": 0.06330695543523532, + "grad_norm": 0.4967774114157658, + "learning_rate": 9.999946495188793e-06, + "loss": 0.4139, + "step": 266 + }, + { + "epoch": 0.06354495150830011, + "grad_norm": 0.48548245385801053, + "learning_rate": 9.999937206178512e-06, + "loss": 0.369, + "step": 267 + }, + { + "epoch": 0.06378294758136491, + "grad_norm": 0.44764818752534175, + "learning_rate": 9.999927174053872e-06, + "loss": 0.4449, + "step": 268 + }, + { + "epoch": 0.0640209436544297, + "grad_norm": 0.47571255336778956, + "learning_rate": 9.999916398816359e-06, + "loss": 0.4252, + "step": 269 + }, + { + "epoch": 0.0642589397274945, + "grad_norm": 0.5176454688899305, + "learning_rate": 9.999904880467579e-06, + "loss": 0.4172, + "step": 270 + }, + { + "epoch": 0.06449693580055929, + "grad_norm": 0.47797256428291063, + "learning_rate": 9.99989261900924e-06, + "loss": 0.4234, + "step": 271 + }, + { + "epoch": 0.06473493187362409, + "grad_norm": 0.45140623928149853, + "learning_rate": 9.999879614443168e-06, + "loss": 0.4936, + "step": 272 + }, + { + "epoch": 0.06497292794668888, + "grad_norm": 0.4619192022175719, + "learning_rate": 9.999865866771295e-06, + "loss": 0.472, + "step": 273 + }, + { + "epoch": 0.06521092401975367, + "grad_norm": 0.5262244410688721, + "learning_rate": 9.999851375995662e-06, + "loss": 0.3445, + "step": 274 + }, + { + "epoch": 0.06544892009281847, + "grad_norm": 0.4520208956154366, + "learning_rate": 9.999836142118424e-06, + "loss": 0.4155, + "step": 275 + }, + { + "epoch": 0.06568691616588326, + "grad_norm": 0.46440432038472335, + "learning_rate": 9.999820165141845e-06, + "loss": 0.4586, + "step": 276 + }, + { + "epoch": 0.06592491223894806, + "grad_norm": 0.4573062726758063, + "learning_rate": 9.9998034450683e-06, + "loss": 0.3835, + "step": 277 + }, + { + "epoch": 0.06616290831201285, + "grad_norm": 0.4475402421191899, + "learning_rate": 9.999785981900277e-06, + "loss": 0.3686, + "step": 278 + }, + { + "epoch": 0.06640090438507765, + "grad_norm": 0.4696018693236434, + "learning_rate": 9.999767775640364e-06, + "loss": 0.4623, + "step": 279 + }, + { + "epoch": 0.06663890045814244, + "grad_norm": 0.4618737104284479, + "learning_rate": 9.999748826291273e-06, + "loss": 0.4177, + "step": 280 + }, + { + "epoch": 0.06687689653120724, + "grad_norm": 0.4799653750371601, + "learning_rate": 9.99972913385582e-06, + "loss": 0.3504, + "step": 281 + }, + { + "epoch": 0.06711489260427203, + "grad_norm": 0.4497108981102174, + "learning_rate": 9.999708698336929e-06, + "loss": 0.4563, + "step": 282 + }, + { + "epoch": 0.06735288867733683, + "grad_norm": 0.44395236286884626, + "learning_rate": 9.999687519737639e-06, + "loss": 0.4782, + "step": 283 + }, + { + "epoch": 0.06759088475040162, + "grad_norm": 0.6091422041147246, + "learning_rate": 9.999665598061097e-06, + "loss": 0.3892, + "step": 284 + }, + { + "epoch": 0.06782888082346641, + "grad_norm": 0.49705585154406684, + "learning_rate": 9.999642933310561e-06, + "loss": 0.3826, + "step": 285 + }, + { + "epoch": 0.06806687689653121, + "grad_norm": 0.4568270789179068, + "learning_rate": 9.9996195254894e-06, + "loss": 0.4423, + "step": 286 + }, + { + "epoch": 0.068304872969596, + "grad_norm": 0.45526888232059454, + "learning_rate": 9.999595374601093e-06, + "loss": 0.4749, + "step": 287 + }, + { + "epoch": 0.0685428690426608, + "grad_norm": 0.46039335829876826, + "learning_rate": 9.99957048064923e-06, + "loss": 0.3574, + "step": 288 + }, + { + "epoch": 0.06878086511572559, + "grad_norm": 0.4484215237280295, + "learning_rate": 9.999544843637509e-06, + "loss": 0.4244, + "step": 289 + }, + { + "epoch": 0.06901886118879039, + "grad_norm": 0.4156896188113592, + "learning_rate": 9.999518463569742e-06, + "loss": 0.4771, + "step": 290 + }, + { + "epoch": 0.06925685726185518, + "grad_norm": 0.4207781838747178, + "learning_rate": 9.99949134044985e-06, + "loss": 0.3879, + "step": 291 + }, + { + "epoch": 0.06949485333491998, + "grad_norm": 0.45221148207375844, + "learning_rate": 9.999463474281862e-06, + "loss": 0.403, + "step": 292 + }, + { + "epoch": 0.06973284940798477, + "grad_norm": 0.4377876809771397, + "learning_rate": 9.999434865069922e-06, + "loss": 0.4193, + "step": 293 + }, + { + "epoch": 0.06997084548104957, + "grad_norm": 0.4024216767935093, + "learning_rate": 9.99940551281828e-06, + "loss": 0.4875, + "step": 294 + }, + { + "epoch": 0.07020884155411436, + "grad_norm": 0.43322473693066027, + "learning_rate": 9.999375417531301e-06, + "loss": 0.3583, + "step": 295 + }, + { + "epoch": 0.07044683762717915, + "grad_norm": 0.471599149075093, + "learning_rate": 9.999344579213455e-06, + "loss": 0.42, + "step": 296 + }, + { + "epoch": 0.07068483370024395, + "grad_norm": 0.44747663908303237, + "learning_rate": 9.999312997869326e-06, + "loss": 0.503, + "step": 297 + }, + { + "epoch": 0.07092282977330874, + "grad_norm": 0.5069193101593006, + "learning_rate": 9.99928067350361e-06, + "loss": 0.4093, + "step": 298 + }, + { + "epoch": 0.07116082584637354, + "grad_norm": 0.4859190395369954, + "learning_rate": 9.99924760612111e-06, + "loss": 0.3736, + "step": 299 + }, + { + "epoch": 0.07139882191943833, + "grad_norm": 0.4497898536509786, + "learning_rate": 9.999213795726738e-06, + "loss": 0.4518, + "step": 300 + }, + { + "epoch": 0.07163681799250313, + "grad_norm": 0.4399132919727615, + "learning_rate": 9.999179242325523e-06, + "loss": 0.5143, + "step": 301 + }, + { + "epoch": 0.07187481406556792, + "grad_norm": 0.40980962676569144, + "learning_rate": 9.999143945922599e-06, + "loss": 0.3701, + "step": 302 + }, + { + "epoch": 0.07211281013863272, + "grad_norm": 0.4266818493401362, + "learning_rate": 9.999107906523212e-06, + "loss": 0.3847, + "step": 303 + }, + { + "epoch": 0.07235080621169751, + "grad_norm": 0.4120201125570514, + "learning_rate": 9.999071124132717e-06, + "loss": 0.4753, + "step": 304 + }, + { + "epoch": 0.0725888022847623, + "grad_norm": 0.5036108776564726, + "learning_rate": 9.999033598756583e-06, + "loss": 0.4513, + "step": 305 + }, + { + "epoch": 0.0728267983578271, + "grad_norm": 0.45433796497230666, + "learning_rate": 9.998995330400385e-06, + "loss": 0.38, + "step": 306 + }, + { + "epoch": 0.0730647944308919, + "grad_norm": 0.4604572613372604, + "learning_rate": 9.998956319069813e-06, + "loss": 0.4294, + "step": 307 + }, + { + "epoch": 0.07330279050395669, + "grad_norm": 0.4239655605570626, + "learning_rate": 9.998916564770662e-06, + "loss": 0.5023, + "step": 308 + }, + { + "epoch": 0.07354078657702148, + "grad_norm": 0.49639603018946726, + "learning_rate": 9.998876067508846e-06, + "loss": 0.3821, + "step": 309 + }, + { + "epoch": 0.07377878265008628, + "grad_norm": 0.4498046706633747, + "learning_rate": 9.998834827290376e-06, + "loss": 0.3897, + "step": 310 + }, + { + "epoch": 0.07401677872315107, + "grad_norm": 0.4847626369166094, + "learning_rate": 9.998792844121386e-06, + "loss": 0.4587, + "step": 311 + }, + { + "epoch": 0.07425477479621587, + "grad_norm": 0.4754430506699406, + "learning_rate": 9.998750118008117e-06, + "loss": 0.4725, + "step": 312 + }, + { + "epoch": 0.07449277086928066, + "grad_norm": 0.48598157686223126, + "learning_rate": 9.998706648956916e-06, + "loss": 0.3919, + "step": 313 + }, + { + "epoch": 0.07473076694234546, + "grad_norm": 0.4660284468209422, + "learning_rate": 9.998662436974246e-06, + "loss": 0.422, + "step": 314 + }, + { + "epoch": 0.07496876301541025, + "grad_norm": 0.4755957103806313, + "learning_rate": 9.998617482066677e-06, + "loss": 0.4716, + "step": 315 + }, + { + "epoch": 0.07520675908847504, + "grad_norm": 0.49198072627395434, + "learning_rate": 9.998571784240889e-06, + "loss": 0.3805, + "step": 316 + }, + { + "epoch": 0.07544475516153984, + "grad_norm": 0.45039434089900343, + "learning_rate": 9.998525343503676e-06, + "loss": 0.3578, + "step": 317 + }, + { + "epoch": 0.07568275123460463, + "grad_norm": 0.4567179079014157, + "learning_rate": 9.998478159861938e-06, + "loss": 0.4384, + "step": 318 + }, + { + "epoch": 0.07592074730766943, + "grad_norm": 0.44281871668390654, + "learning_rate": 9.99843023332269e-06, + "loss": 0.4717, + "step": 319 + }, + { + "epoch": 0.07615874338073422, + "grad_norm": 0.5238647769899825, + "learning_rate": 9.998381563893056e-06, + "loss": 0.4097, + "step": 320 + }, + { + "epoch": 0.07639673945379902, + "grad_norm": 0.4477956066826517, + "learning_rate": 9.998332151580266e-06, + "loss": 0.3769, + "step": 321 + }, + { + "epoch": 0.07663473552686381, + "grad_norm": 0.44274206034195474, + "learning_rate": 9.998281996391665e-06, + "loss": 0.4546, + "step": 322 + }, + { + "epoch": 0.0768727315999286, + "grad_norm": 0.43869730085122627, + "learning_rate": 9.998231098334708e-06, + "loss": 0.4046, + "step": 323 + }, + { + "epoch": 0.0771107276729934, + "grad_norm": 0.5146875829687024, + "learning_rate": 9.99817945741696e-06, + "loss": 0.3706, + "step": 324 + }, + { + "epoch": 0.0773487237460582, + "grad_norm": 0.4036470374218696, + "learning_rate": 9.998127073646095e-06, + "loss": 0.4352, + "step": 325 + }, + { + "epoch": 0.07758671981912299, + "grad_norm": 0.4630902945017874, + "learning_rate": 9.998073947029899e-06, + "loss": 0.4768, + "step": 326 + }, + { + "epoch": 0.07782471589218778, + "grad_norm": 0.5458520408430351, + "learning_rate": 9.99802007757627e-06, + "loss": 0.3639, + "step": 327 + }, + { + "epoch": 0.07806271196525258, + "grad_norm": 0.42241640395474855, + "learning_rate": 9.997965465293208e-06, + "loss": 0.3646, + "step": 328 + }, + { + "epoch": 0.07830070803831737, + "grad_norm": 0.4473593312203704, + "learning_rate": 9.99791011018884e-06, + "loss": 0.4494, + "step": 329 + }, + { + "epoch": 0.07853870411138217, + "grad_norm": 0.4617723155081359, + "learning_rate": 9.997854012271383e-06, + "loss": 0.4168, + "step": 330 + }, + { + "epoch": 0.07877670018444696, + "grad_norm": 0.4796048928242152, + "learning_rate": 9.99779717154918e-06, + "loss": 0.3781, + "step": 331 + }, + { + "epoch": 0.07901469625751176, + "grad_norm": 0.4409776472086194, + "learning_rate": 9.99773958803068e-06, + "loss": 0.4194, + "step": 332 + }, + { + "epoch": 0.07925269233057655, + "grad_norm": 0.49694630682051916, + "learning_rate": 9.997681261724436e-06, + "loss": 0.4955, + "step": 333 + }, + { + "epoch": 0.07949068840364135, + "grad_norm": 0.5521495917382526, + "learning_rate": 9.99762219263912e-06, + "loss": 0.3829, + "step": 334 + }, + { + "epoch": 0.07972868447670614, + "grad_norm": 0.4448039189655745, + "learning_rate": 9.997562380783512e-06, + "loss": 0.3625, + "step": 335 + }, + { + "epoch": 0.07996668054977094, + "grad_norm": 0.4233213810496793, + "learning_rate": 9.997501826166502e-06, + "loss": 0.4431, + "step": 336 + }, + { + "epoch": 0.08020467662283573, + "grad_norm": 0.4511409424625309, + "learning_rate": 9.997440528797087e-06, + "loss": 0.4574, + "step": 337 + }, + { + "epoch": 0.08044267269590052, + "grad_norm": 0.5076129237328914, + "learning_rate": 9.997378488684376e-06, + "loss": 0.3615, + "step": 338 + }, + { + "epoch": 0.0806806687689653, + "grad_norm": 0.41838318534134256, + "learning_rate": 9.997315705837596e-06, + "loss": 0.3779, + "step": 339 + }, + { + "epoch": 0.0809186648420301, + "grad_norm": 0.44373668851344966, + "learning_rate": 9.997252180266074e-06, + "loss": 0.4479, + "step": 340 + }, + { + "epoch": 0.0811566609150949, + "grad_norm": 0.5204118151673891, + "learning_rate": 9.997187911979252e-06, + "loss": 0.4187, + "step": 341 + }, + { + "epoch": 0.08139465698815969, + "grad_norm": 0.5043252971282625, + "learning_rate": 9.99712290098668e-06, + "loss": 0.3434, + "step": 342 + }, + { + "epoch": 0.08163265306122448, + "grad_norm": 0.44526891013124525, + "learning_rate": 9.997057147298024e-06, + "loss": 0.4474, + "step": 343 + }, + { + "epoch": 0.08187064913428928, + "grad_norm": 0.477769396408338, + "learning_rate": 9.996990650923053e-06, + "loss": 0.4611, + "step": 344 + }, + { + "epoch": 0.08210864520735407, + "grad_norm": 0.5121141417084246, + "learning_rate": 9.996923411871653e-06, + "loss": 0.3596, + "step": 345 + }, + { + "epoch": 0.08234664128041887, + "grad_norm": 0.4424669784191923, + "learning_rate": 9.996855430153816e-06, + "loss": 0.3949, + "step": 346 + }, + { + "epoch": 0.08258463735348366, + "grad_norm": 0.4508136073799909, + "learning_rate": 9.996786705779645e-06, + "loss": 0.4567, + "step": 347 + }, + { + "epoch": 0.08282263342654846, + "grad_norm": 0.5106951134749175, + "learning_rate": 9.996717238759355e-06, + "loss": 0.3982, + "step": 348 + }, + { + "epoch": 0.08306062949961325, + "grad_norm": 0.4710547751602934, + "learning_rate": 9.996647029103271e-06, + "loss": 0.334, + "step": 349 + }, + { + "epoch": 0.08329862557267805, + "grad_norm": 0.45527398227968663, + "learning_rate": 9.996576076821827e-06, + "loss": 0.4494, + "step": 350 + }, + { + "epoch": 0.08353662164574284, + "grad_norm": 0.43961786486243276, + "learning_rate": 9.99650438192557e-06, + "loss": 0.4702, + "step": 351 + }, + { + "epoch": 0.08377461771880763, + "grad_norm": 0.4920860251140083, + "learning_rate": 9.996431944425154e-06, + "loss": 0.3757, + "step": 352 + }, + { + "epoch": 0.08401261379187243, + "grad_norm": 0.4544455547851751, + "learning_rate": 9.996358764331344e-06, + "loss": 0.3715, + "step": 353 + }, + { + "epoch": 0.08425060986493722, + "grad_norm": 0.4383129033433832, + "learning_rate": 9.996284841655017e-06, + "loss": 0.4829, + "step": 354 + }, + { + "epoch": 0.08448860593800202, + "grad_norm": 0.4911519487114972, + "learning_rate": 9.996210176407163e-06, + "loss": 0.4288, + "step": 355 + }, + { + "epoch": 0.08472660201106681, + "grad_norm": 0.4165648910362421, + "learning_rate": 9.996134768598874e-06, + "loss": 0.3398, + "step": 356 + }, + { + "epoch": 0.0849645980841316, + "grad_norm": 0.4583575292505661, + "learning_rate": 9.996058618241362e-06, + "loss": 0.4005, + "step": 357 + }, + { + "epoch": 0.0852025941571964, + "grad_norm": 0.41965836884164454, + "learning_rate": 9.995981725345941e-06, + "loss": 0.4569, + "step": 358 + }, + { + "epoch": 0.0854405902302612, + "grad_norm": 0.45798270095341687, + "learning_rate": 9.995904089924043e-06, + "loss": 0.4057, + "step": 359 + }, + { + "epoch": 0.08567858630332599, + "grad_norm": 0.46062392413472286, + "learning_rate": 9.995825711987202e-06, + "loss": 0.3556, + "step": 360 + }, + { + "epoch": 0.08591658237639078, + "grad_norm": 0.4533838682476381, + "learning_rate": 9.99574659154707e-06, + "loss": 0.4538, + "step": 361 + }, + { + "epoch": 0.08615457844945558, + "grad_norm": 0.46749855571344134, + "learning_rate": 9.995666728615407e-06, + "loss": 0.4573, + "step": 362 + }, + { + "epoch": 0.08639257452252037, + "grad_norm": 0.41387520189963245, + "learning_rate": 9.995586123204083e-06, + "loss": 0.3657, + "step": 363 + }, + { + "epoch": 0.08663057059558517, + "grad_norm": 0.4516587880902474, + "learning_rate": 9.995504775325073e-06, + "loss": 0.4202, + "step": 364 + }, + { + "epoch": 0.08686856666864996, + "grad_norm": 0.42225242395315815, + "learning_rate": 9.99542268499047e-06, + "loss": 0.4667, + "step": 365 + }, + { + "epoch": 0.08710656274171476, + "grad_norm": 0.44765554289425097, + "learning_rate": 9.995339852212478e-06, + "loss": 0.4053, + "step": 366 + }, + { + "epoch": 0.08734455881477955, + "grad_norm": 0.40737919321911104, + "learning_rate": 9.995256277003403e-06, + "loss": 0.3616, + "step": 367 + }, + { + "epoch": 0.08758255488784435, + "grad_norm": 0.47373973714542855, + "learning_rate": 9.99517195937567e-06, + "loss": 0.4084, + "step": 368 + }, + { + "epoch": 0.08782055096090914, + "grad_norm": 0.4530682065428116, + "learning_rate": 9.995086899341808e-06, + "loss": 0.4571, + "step": 369 + }, + { + "epoch": 0.08805854703397394, + "grad_norm": 1.3913274762621268, + "learning_rate": 9.995001096914462e-06, + "loss": 0.3454, + "step": 370 + }, + { + "epoch": 0.08829654310703873, + "grad_norm": 0.47478901107913046, + "learning_rate": 9.99491455210638e-06, + "loss": 0.3953, + "step": 371 + }, + { + "epoch": 0.08853453918010352, + "grad_norm": 0.5018434335802849, + "learning_rate": 9.994827264930432e-06, + "loss": 0.4965, + "step": 372 + }, + { + "epoch": 0.08877253525316832, + "grad_norm": 0.4450551192390846, + "learning_rate": 9.994739235399581e-06, + "loss": 0.4243, + "step": 373 + }, + { + "epoch": 0.08901053132623311, + "grad_norm": 0.4515333750014817, + "learning_rate": 9.994650463526918e-06, + "loss": 0.3676, + "step": 374 + }, + { + "epoch": 0.08924852739929791, + "grad_norm": 0.48851016231525185, + "learning_rate": 9.994560949325636e-06, + "loss": 0.467, + "step": 375 + }, + { + "epoch": 0.0894865234723627, + "grad_norm": 0.4195656189924732, + "learning_rate": 9.994470692809036e-06, + "loss": 0.4756, + "step": 376 + }, + { + "epoch": 0.0897245195454275, + "grad_norm": 0.43584449035060197, + "learning_rate": 9.994379693990533e-06, + "loss": 0.3715, + "step": 377 + }, + { + "epoch": 0.08996251561849229, + "grad_norm": 0.4385497336895332, + "learning_rate": 9.994287952883652e-06, + "loss": 0.3578, + "step": 378 + }, + { + "epoch": 0.09020051169155709, + "grad_norm": 0.47979109705837614, + "learning_rate": 9.994195469502031e-06, + "loss": 0.4422, + "step": 379 + }, + { + "epoch": 0.09043850776462188, + "grad_norm": 0.4035725189989237, + "learning_rate": 9.994102243859412e-06, + "loss": 0.4285, + "step": 380 + }, + { + "epoch": 0.09067650383768668, + "grad_norm": 0.40727590335667724, + "learning_rate": 9.99400827596965e-06, + "loss": 0.3412, + "step": 381 + }, + { + "epoch": 0.09091449991075147, + "grad_norm": 0.4480428671921397, + "learning_rate": 9.993913565846713e-06, + "loss": 0.4233, + "step": 382 + }, + { + "epoch": 0.09115249598381626, + "grad_norm": 0.45603055794812214, + "learning_rate": 9.993818113504678e-06, + "loss": 0.4748, + "step": 383 + }, + { + "epoch": 0.09139049205688106, + "grad_norm": 0.4662947179384981, + "learning_rate": 9.993721918957728e-06, + "loss": 0.3761, + "step": 384 + }, + { + "epoch": 0.09162848812994585, + "grad_norm": 0.4974648803891763, + "learning_rate": 9.993624982220164e-06, + "loss": 0.3555, + "step": 385 + }, + { + "epoch": 0.09186648420301065, + "grad_norm": 0.4643066974713487, + "learning_rate": 9.99352730330639e-06, + "loss": 0.4187, + "step": 386 + }, + { + "epoch": 0.09210448027607544, + "grad_norm": 0.43172365645329547, + "learning_rate": 9.993428882230925e-06, + "loss": 0.4664, + "step": 387 + }, + { + "epoch": 0.09234247634914024, + "grad_norm": 0.4714894865640499, + "learning_rate": 9.993329719008397e-06, + "loss": 0.3685, + "step": 388 + }, + { + "epoch": 0.09258047242220503, + "grad_norm": 0.41913422983030557, + "learning_rate": 9.993229813653544e-06, + "loss": 0.3675, + "step": 389 + }, + { + "epoch": 0.09281846849526983, + "grad_norm": 0.43975663296852435, + "learning_rate": 9.993129166181215e-06, + "loss": 0.4556, + "step": 390 + }, + { + "epoch": 0.09305646456833462, + "grad_norm": 0.4494552356684672, + "learning_rate": 9.993027776606365e-06, + "loss": 0.4227, + "step": 391 + }, + { + "epoch": 0.09329446064139942, + "grad_norm": 0.4288268452457883, + "learning_rate": 9.992925644944068e-06, + "loss": 0.3461, + "step": 392 + }, + { + "epoch": 0.09353245671446421, + "grad_norm": 0.4489798380818497, + "learning_rate": 9.992822771209501e-06, + "loss": 0.4179, + "step": 393 + }, + { + "epoch": 0.093770452787529, + "grad_norm": 0.4509441961780933, + "learning_rate": 9.992719155417954e-06, + "loss": 0.4886, + "step": 394 + }, + { + "epoch": 0.0940084488605938, + "grad_norm": 0.4449441936420067, + "learning_rate": 9.992614797584825e-06, + "loss": 0.3609, + "step": 395 + }, + { + "epoch": 0.0942464449336586, + "grad_norm": 0.4517902195455245, + "learning_rate": 9.992509697725627e-06, + "loss": 0.3913, + "step": 396 + }, + { + "epoch": 0.09448444100672339, + "grad_norm": 0.4272371117143219, + "learning_rate": 9.992403855855979e-06, + "loss": 0.4825, + "step": 397 + }, + { + "epoch": 0.09472243707978818, + "grad_norm": 0.49237207377468334, + "learning_rate": 9.992297271991611e-06, + "loss": 0.4321, + "step": 398 + }, + { + "epoch": 0.09496043315285298, + "grad_norm": 0.4742011509459983, + "learning_rate": 9.992189946148366e-06, + "loss": 0.3706, + "step": 399 + }, + { + "epoch": 0.09519842922591777, + "grad_norm": 0.481623368264721, + "learning_rate": 9.992081878342196e-06, + "loss": 0.433, + "step": 400 + }, + { + "epoch": 0.09543642529898257, + "grad_norm": 0.43969308523488704, + "learning_rate": 9.991973068589157e-06, + "loss": 0.497, + "step": 401 + }, + { + "epoch": 0.09567442137204736, + "grad_norm": 0.45002640361694246, + "learning_rate": 9.991863516905428e-06, + "loss": 0.3763, + "step": 402 + }, + { + "epoch": 0.09591241744511216, + "grad_norm": 0.4140916324473366, + "learning_rate": 9.991753223307283e-06, + "loss": 0.4141, + "step": 403 + }, + { + "epoch": 0.09615041351817695, + "grad_norm": 0.4987232508457122, + "learning_rate": 9.991642187811122e-06, + "loss": 0.4597, + "step": 404 + }, + { + "epoch": 0.09638840959124174, + "grad_norm": 0.502335430018636, + "learning_rate": 9.991530410433446e-06, + "loss": 0.4302, + "step": 405 + }, + { + "epoch": 0.09662640566430654, + "grad_norm": 0.4984534349500167, + "learning_rate": 9.991417891190864e-06, + "loss": 0.3477, + "step": 406 + }, + { + "epoch": 0.09686440173737133, + "grad_norm": 0.43148480208491835, + "learning_rate": 9.991304630100103e-06, + "loss": 0.4022, + "step": 407 + }, + { + "epoch": 0.09710239781043613, + "grad_norm": 0.42523411463325067, + "learning_rate": 9.991190627177993e-06, + "loss": 0.4572, + "step": 408 + }, + { + "epoch": 0.09734039388350092, + "grad_norm": 0.5251992385161286, + "learning_rate": 9.991075882441482e-06, + "loss": 0.3742, + "step": 409 + }, + { + "epoch": 0.09757838995656572, + "grad_norm": 0.5071319884695096, + "learning_rate": 9.990960395907621e-06, + "loss": 0.3707, + "step": 410 + }, + { + "epoch": 0.09781638602963051, + "grad_norm": 0.449266770757162, + "learning_rate": 9.990844167593574e-06, + "loss": 0.4296, + "step": 411 + }, + { + "epoch": 0.0980543821026953, + "grad_norm": 0.48923465192436794, + "learning_rate": 9.990727197516617e-06, + "loss": 0.4722, + "step": 412 + }, + { + "epoch": 0.0982923781757601, + "grad_norm": 0.525715202859848, + "learning_rate": 9.990609485694133e-06, + "loss": 0.3591, + "step": 413 + }, + { + "epoch": 0.0985303742488249, + "grad_norm": 0.4460753119588959, + "learning_rate": 9.990491032143619e-06, + "loss": 0.3947, + "step": 414 + }, + { + "epoch": 0.09876837032188969, + "grad_norm": 0.4540712349304704, + "learning_rate": 9.990371836882678e-06, + "loss": 0.4902, + "step": 415 + }, + { + "epoch": 0.09900636639495448, + "grad_norm": 0.45967087446771393, + "learning_rate": 9.990251899929026e-06, + "loss": 0.3735, + "step": 416 + }, + { + "epoch": 0.09924436246801928, + "grad_norm": 0.5344550556485966, + "learning_rate": 9.990131221300489e-06, + "loss": 0.3667, + "step": 417 + }, + { + "epoch": 0.09948235854108407, + "grad_norm": 0.44602952588771577, + "learning_rate": 9.990009801015003e-06, + "loss": 0.4267, + "step": 418 + }, + { + "epoch": 0.09972035461414887, + "grad_norm": 0.5195582033175876, + "learning_rate": 9.989887639090614e-06, + "loss": 0.4233, + "step": 419 + }, + { + "epoch": 0.09995835068721366, + "grad_norm": 0.5142299290549335, + "learning_rate": 9.989764735545477e-06, + "loss": 0.378, + "step": 420 + }, + { + "epoch": 0.10019634676027846, + "grad_norm": 0.4854121376426571, + "learning_rate": 9.98964109039786e-06, + "loss": 0.361, + "step": 421 + }, + { + "epoch": 0.10043434283334325, + "grad_norm": 0.4550651737012537, + "learning_rate": 9.98951670366614e-06, + "loss": 0.4618, + "step": 422 + }, + { + "epoch": 0.10067233890640805, + "grad_norm": 0.5234336286250443, + "learning_rate": 9.989391575368802e-06, + "loss": 0.3958, + "step": 423 + }, + { + "epoch": 0.10091033497947284, + "grad_norm": 0.47063073761951685, + "learning_rate": 9.989265705524444e-06, + "loss": 0.3747, + "step": 424 + }, + { + "epoch": 0.10114833105253764, + "grad_norm": 0.46952341342697657, + "learning_rate": 9.989139094151773e-06, + "loss": 0.4112, + "step": 425 + }, + { + "epoch": 0.10138632712560243, + "grad_norm": 0.42526909585554723, + "learning_rate": 9.98901174126961e-06, + "loss": 0.4603, + "step": 426 + }, + { + "epoch": 0.10162432319866722, + "grad_norm": 0.43907211057538165, + "learning_rate": 9.988883646896877e-06, + "loss": 0.3813, + "step": 427 + }, + { + "epoch": 0.10186231927173202, + "grad_norm": 0.4940877209700441, + "learning_rate": 9.988754811052616e-06, + "loss": 0.355, + "step": 428 + }, + { + "epoch": 0.10210031534479681, + "grad_norm": 0.4302285563808865, + "learning_rate": 9.988625233755975e-06, + "loss": 0.4783, + "step": 429 + }, + { + "epoch": 0.10233831141786161, + "grad_norm": 0.4399773493860619, + "learning_rate": 9.988494915026213e-06, + "loss": 0.4771, + "step": 430 + }, + { + "epoch": 0.1025763074909264, + "grad_norm": 0.5749592576382808, + "learning_rate": 9.988363854882694e-06, + "loss": 0.3388, + "step": 431 + }, + { + "epoch": 0.1028143035639912, + "grad_norm": 0.4171737508333019, + "learning_rate": 9.988232053344901e-06, + "loss": 0.4032, + "step": 432 + }, + { + "epoch": 0.10305229963705599, + "grad_norm": 0.44690528937324775, + "learning_rate": 9.988099510432422e-06, + "loss": 0.4354, + "step": 433 + }, + { + "epoch": 0.10329029571012079, + "grad_norm": 0.7413219284083276, + "learning_rate": 9.987966226164958e-06, + "loss": 0.3871, + "step": 434 + }, + { + "epoch": 0.10352829178318558, + "grad_norm": 0.572534433584837, + "learning_rate": 9.987832200562315e-06, + "loss": 0.3343, + "step": 435 + }, + { + "epoch": 0.10376628785625037, + "grad_norm": 0.4024474823593315, + "learning_rate": 9.987697433644414e-06, + "loss": 0.4353, + "step": 436 + }, + { + "epoch": 0.10400428392931517, + "grad_norm": 0.5053428492479292, + "learning_rate": 9.987561925431283e-06, + "loss": 0.456, + "step": 437 + }, + { + "epoch": 0.10424228000237996, + "grad_norm": 0.5684801045695161, + "learning_rate": 9.987425675943067e-06, + "loss": 0.3426, + "step": 438 + }, + { + "epoch": 0.10448027607544476, + "grad_norm": 0.5703263329939863, + "learning_rate": 9.98728868520001e-06, + "loss": 0.417, + "step": 439 + }, + { + "epoch": 0.10471827214850955, + "grad_norm": 0.5736516226532222, + "learning_rate": 9.987150953222476e-06, + "loss": 0.4352, + "step": 440 + }, + { + "epoch": 0.10495626822157435, + "grad_norm": 0.4287015110638932, + "learning_rate": 9.987012480030934e-06, + "loss": 0.4033, + "step": 441 + }, + { + "epoch": 0.10519426429463914, + "grad_norm": 0.46545342863477973, + "learning_rate": 9.986873265645965e-06, + "loss": 0.3712, + "step": 442 + }, + { + "epoch": 0.10543226036770394, + "grad_norm": 0.4709132222065424, + "learning_rate": 9.98673331008826e-06, + "loss": 0.4198, + "step": 443 + }, + { + "epoch": 0.10567025644076873, + "grad_norm": 0.5082536813625272, + "learning_rate": 9.986592613378616e-06, + "loss": 0.4497, + "step": 444 + }, + { + "epoch": 0.10590825251383353, + "grad_norm": 0.4722946115830572, + "learning_rate": 9.98645117553795e-06, + "loss": 0.3466, + "step": 445 + }, + { + "epoch": 0.10614624858689832, + "grad_norm": 1.2081855072445622, + "learning_rate": 9.98630899658728e-06, + "loss": 0.3711, + "step": 446 + }, + { + "epoch": 0.10638424465996311, + "grad_norm": 0.49995351880997874, + "learning_rate": 9.98616607654774e-06, + "loss": 0.4477, + "step": 447 + }, + { + "epoch": 0.10662224073302791, + "grad_norm": 0.5032188297810777, + "learning_rate": 9.986022415440564e-06, + "loss": 0.4162, + "step": 448 + }, + { + "epoch": 0.1068602368060927, + "grad_norm": 0.7647181678703971, + "learning_rate": 9.985878013287113e-06, + "loss": 0.3694, + "step": 449 + }, + { + "epoch": 0.1070982328791575, + "grad_norm": 0.6789559238085723, + "learning_rate": 9.985732870108843e-06, + "loss": 0.4079, + "step": 450 + }, + { + "epoch": 0.10733622895222229, + "grad_norm": 0.6758534206958869, + "learning_rate": 9.985586985927328e-06, + "loss": 0.4949, + "step": 451 + }, + { + "epoch": 0.10757422502528709, + "grad_norm": 0.63305874198617, + "learning_rate": 9.98544036076425e-06, + "loss": 0.3443, + "step": 452 + }, + { + "epoch": 0.10781222109835188, + "grad_norm": 0.5712028770173395, + "learning_rate": 9.985292994641398e-06, + "loss": 0.3686, + "step": 453 + }, + { + "epoch": 0.10805021717141668, + "grad_norm": 0.42711184550985515, + "learning_rate": 9.98514488758068e-06, + "loss": 0.4819, + "step": 454 + }, + { + "epoch": 0.10828821324448147, + "grad_norm": 0.45616716468152596, + "learning_rate": 9.984996039604102e-06, + "loss": 0.4337, + "step": 455 + }, + { + "epoch": 0.10852620931754627, + "grad_norm": 0.44388087258594616, + "learning_rate": 9.98484645073379e-06, + "loss": 0.3341, + "step": 456 + }, + { + "epoch": 0.10876420539061106, + "grad_norm": 0.5001916577790845, + "learning_rate": 9.984696120991979e-06, + "loss": 0.431, + "step": 457 + }, + { + "epoch": 0.10900220146367585, + "grad_norm": 0.44705716190669925, + "learning_rate": 9.984545050401007e-06, + "loss": 0.4467, + "step": 458 + }, + { + "epoch": 0.10924019753674065, + "grad_norm": 0.45150670551113553, + "learning_rate": 9.98439323898333e-06, + "loss": 0.3919, + "step": 459 + }, + { + "epoch": 0.10947819360980544, + "grad_norm": 0.520796434754494, + "learning_rate": 9.98424068676151e-06, + "loss": 0.3459, + "step": 460 + }, + { + "epoch": 0.10971618968287024, + "grad_norm": 0.4208531093058114, + "learning_rate": 9.984087393758218e-06, + "loss": 0.444, + "step": 461 + }, + { + "epoch": 0.10995418575593503, + "grad_norm": 0.45915739817950607, + "learning_rate": 9.983933359996241e-06, + "loss": 0.4736, + "step": 462 + }, + { + "epoch": 0.11019218182899983, + "grad_norm": 0.4461278219351183, + "learning_rate": 9.983778585498468e-06, + "loss": 0.3765, + "step": 463 + }, + { + "epoch": 0.11043017790206462, + "grad_norm": 0.40738699092689734, + "learning_rate": 9.983623070287905e-06, + "loss": 0.3933, + "step": 464 + }, + { + "epoch": 0.11066817397512942, + "grad_norm": 0.4913832741443862, + "learning_rate": 9.983466814387666e-06, + "loss": 0.4552, + "step": 465 + }, + { + "epoch": 0.11090617004819421, + "grad_norm": 0.530231130629648, + "learning_rate": 9.983309817820972e-06, + "loss": 0.3586, + "step": 466 + }, + { + "epoch": 0.111144166121259, + "grad_norm": 0.4241816209307345, + "learning_rate": 9.983152080611158e-06, + "loss": 0.326, + "step": 467 + }, + { + "epoch": 0.1113821621943238, + "grad_norm": 0.43804873868170846, + "learning_rate": 9.98299360278167e-06, + "loss": 0.4251, + "step": 468 + }, + { + "epoch": 0.1116201582673886, + "grad_norm": 0.4393810388350614, + "learning_rate": 9.982834384356057e-06, + "loss": 0.4675, + "step": 469 + }, + { + "epoch": 0.11185815434045339, + "grad_norm": 0.46372765200679533, + "learning_rate": 9.982674425357985e-06, + "loss": 0.3493, + "step": 470 + }, + { + "epoch": 0.11209615041351818, + "grad_norm": 0.4436475358645403, + "learning_rate": 9.982513725811228e-06, + "loss": 0.4243, + "step": 471 + }, + { + "epoch": 0.11233414648658298, + "grad_norm": 0.4173010199129987, + "learning_rate": 9.98235228573967e-06, + "loss": 0.472, + "step": 472 + }, + { + "epoch": 0.11257214255964777, + "grad_norm": 0.535729642414528, + "learning_rate": 9.982190105167306e-06, + "loss": 0.4164, + "step": 473 + }, + { + "epoch": 0.11281013863271257, + "grad_norm": 0.4772334936227505, + "learning_rate": 9.982027184118236e-06, + "loss": 0.349, + "step": 474 + }, + { + "epoch": 0.11304813470577735, + "grad_norm": 0.47313830652761185, + "learning_rate": 9.981863522616681e-06, + "loss": 0.4435, + "step": 475 + }, + { + "epoch": 0.11328613077884214, + "grad_norm": 0.47425353304897183, + "learning_rate": 9.981699120686959e-06, + "loss": 0.4695, + "step": 476 + }, + { + "epoch": 0.11352412685190694, + "grad_norm": 0.44292841465417715, + "learning_rate": 9.981533978353508e-06, + "loss": 0.3857, + "step": 477 + }, + { + "epoch": 0.11376212292497173, + "grad_norm": 0.5239164407002358, + "learning_rate": 9.981368095640868e-06, + "loss": 0.3726, + "step": 478 + }, + { + "epoch": 0.11400011899803653, + "grad_norm": 0.44488449687995085, + "learning_rate": 9.981201472573698e-06, + "loss": 0.4432, + "step": 479 + }, + { + "epoch": 0.11423811507110132, + "grad_norm": 0.43967538016218216, + "learning_rate": 9.98103410917676e-06, + "loss": 0.4076, + "step": 480 + }, + { + "epoch": 0.11447611114416611, + "grad_norm": 0.739424564589288, + "learning_rate": 9.980866005474928e-06, + "loss": 0.3675, + "step": 481 + }, + { + "epoch": 0.11471410721723091, + "grad_norm": 0.4993736830245461, + "learning_rate": 9.980697161493185e-06, + "loss": 0.4131, + "step": 482 + }, + { + "epoch": 0.1149521032902957, + "grad_norm": 1.3276469711665722, + "learning_rate": 9.980527577256629e-06, + "loss": 0.4678, + "step": 483 + }, + { + "epoch": 0.1151900993633605, + "grad_norm": 0.46128218449640923, + "learning_rate": 9.980357252790464e-06, + "loss": 0.3607, + "step": 484 + }, + { + "epoch": 0.1154280954364253, + "grad_norm": 0.41316023220047166, + "learning_rate": 9.980186188120002e-06, + "loss": 0.351, + "step": 485 + }, + { + "epoch": 0.11566609150949009, + "grad_norm": 0.482093707108989, + "learning_rate": 9.980014383270668e-06, + "loss": 0.3955, + "step": 486 + }, + { + "epoch": 0.11590408758255488, + "grad_norm": 0.5011032501000284, + "learning_rate": 9.979841838267999e-06, + "loss": 0.458, + "step": 487 + }, + { + "epoch": 0.11614208365561968, + "grad_norm": 0.4135106798022244, + "learning_rate": 9.979668553137635e-06, + "loss": 0.3878, + "step": 488 + }, + { + "epoch": 0.11638007972868447, + "grad_norm": 0.44904272180640614, + "learning_rate": 9.979494527905334e-06, + "loss": 0.3706, + "step": 489 + }, + { + "epoch": 0.11661807580174927, + "grad_norm": 0.41254283183456614, + "learning_rate": 9.979319762596959e-06, + "loss": 0.4731, + "step": 490 + }, + { + "epoch": 0.11685607187481406, + "grad_norm": 0.4326477250584368, + "learning_rate": 9.979144257238484e-06, + "loss": 0.3882, + "step": 491 + }, + { + "epoch": 0.11709406794787885, + "grad_norm": 0.47385412746747546, + "learning_rate": 9.978968011855996e-06, + "loss": 0.3596, + "step": 492 + }, + { + "epoch": 0.11733206402094365, + "grad_norm": 0.48119232908966797, + "learning_rate": 9.978791026475689e-06, + "loss": 0.4342, + "step": 493 + }, + { + "epoch": 0.11757006009400844, + "grad_norm": 0.4441552118826538, + "learning_rate": 9.978613301123864e-06, + "loss": 0.4489, + "step": 494 + }, + { + "epoch": 0.11780805616707324, + "grad_norm": 0.46690414644784983, + "learning_rate": 9.978434835826937e-06, + "loss": 0.3777, + "step": 495 + }, + { + "epoch": 0.11804605224013803, + "grad_norm": 0.44970400252197734, + "learning_rate": 9.978255630611432e-06, + "loss": 0.3701, + "step": 496 + }, + { + "epoch": 0.11828404831320283, + "grad_norm": 0.42146124042697336, + "learning_rate": 9.978075685503988e-06, + "loss": 0.4581, + "step": 497 + }, + { + "epoch": 0.11852204438626762, + "grad_norm": 0.4287576458047421, + "learning_rate": 9.977895000531343e-06, + "loss": 0.4068, + "step": 498 + }, + { + "epoch": 0.11876004045933242, + "grad_norm": 0.4661011135993105, + "learning_rate": 9.977713575720354e-06, + "loss": 0.3648, + "step": 499 + }, + { + "epoch": 0.11899803653239721, + "grad_norm": 0.44336193791681305, + "learning_rate": 9.977531411097985e-06, + "loss": 0.3956, + "step": 500 + }, + { + "epoch": 0.119236032605462, + "grad_norm": 0.40135410660420456, + "learning_rate": 9.97734850669131e-06, + "loss": 0.4393, + "step": 501 + }, + { + "epoch": 0.1194740286785268, + "grad_norm": 0.4559807876897745, + "learning_rate": 9.977164862527512e-06, + "loss": 0.3646, + "step": 502 + }, + { + "epoch": 0.1197120247515916, + "grad_norm": 0.4347794693076568, + "learning_rate": 9.976980478633888e-06, + "loss": 0.3785, + "step": 503 + }, + { + "epoch": 0.11995002082465639, + "grad_norm": 0.46153771301045676, + "learning_rate": 9.97679535503784e-06, + "loss": 0.4069, + "step": 504 + }, + { + "epoch": 0.12018801689772118, + "grad_norm": 0.424717397560516, + "learning_rate": 9.976609491766883e-06, + "loss": 0.4598, + "step": 505 + }, + { + "epoch": 0.12042601297078598, + "grad_norm": 0.4434747636911381, + "learning_rate": 9.97642288884864e-06, + "loss": 0.356, + "step": 506 + }, + { + "epoch": 0.12066400904385077, + "grad_norm": 0.4452496263517166, + "learning_rate": 9.976235546310844e-06, + "loss": 0.4129, + "step": 507 + }, + { + "epoch": 0.12090200511691557, + "grad_norm": 0.4578774709736023, + "learning_rate": 9.97604746418134e-06, + "loss": 0.4812, + "step": 508 + }, + { + "epoch": 0.12114000118998036, + "grad_norm": 0.41675783377655806, + "learning_rate": 9.975858642488081e-06, + "loss": 0.3771, + "step": 509 + }, + { + "epoch": 0.12137799726304516, + "grad_norm": 0.40069287579572055, + "learning_rate": 9.975669081259132e-06, + "loss": 0.3734, + "step": 510 + }, + { + "epoch": 0.12161599333610995, + "grad_norm": 0.4309476794229487, + "learning_rate": 9.975478780522664e-06, + "loss": 0.4206, + "step": 511 + }, + { + "epoch": 0.12185398940917475, + "grad_norm": 0.46800262518826174, + "learning_rate": 9.975287740306962e-06, + "loss": 0.4462, + "step": 512 + }, + { + "epoch": 0.12209198548223954, + "grad_norm": 0.42699693782693693, + "learning_rate": 9.97509596064042e-06, + "loss": 0.3419, + "step": 513 + }, + { + "epoch": 0.12232998155530433, + "grad_norm": 0.4386357295282374, + "learning_rate": 9.97490344155154e-06, + "loss": 0.398, + "step": 514 + }, + { + "epoch": 0.12256797762836913, + "grad_norm": 0.5979447179868536, + "learning_rate": 9.974710183068935e-06, + "loss": 0.4266, + "step": 515 + }, + { + "epoch": 0.12280597370143392, + "grad_norm": 0.4391789380414681, + "learning_rate": 9.97451618522133e-06, + "loss": 0.3549, + "step": 516 + }, + { + "epoch": 0.12304396977449872, + "grad_norm": 0.4254442498856795, + "learning_rate": 9.974321448037553e-06, + "loss": 0.372, + "step": 517 + }, + { + "epoch": 0.12328196584756351, + "grad_norm": 0.4203710488596454, + "learning_rate": 9.974125971546553e-06, + "loss": 0.4029, + "step": 518 + }, + { + "epoch": 0.12351996192062831, + "grad_norm": 0.4235704728114593, + "learning_rate": 9.973929755777379e-06, + "loss": 0.4569, + "step": 519 + }, + { + "epoch": 0.1237579579936931, + "grad_norm": 0.43253576502031255, + "learning_rate": 9.973732800759193e-06, + "loss": 0.3386, + "step": 520 + }, + { + "epoch": 0.1239959540667579, + "grad_norm": 0.4130102595578799, + "learning_rate": 9.97353510652127e-06, + "loss": 0.3849, + "step": 521 + }, + { + "epoch": 0.12423395013982269, + "grad_norm": 0.43914912023331804, + "learning_rate": 9.97333667309299e-06, + "loss": 0.4497, + "step": 522 + }, + { + "epoch": 0.12447194621288749, + "grad_norm": 0.3972463467100798, + "learning_rate": 9.973137500503846e-06, + "loss": 0.3995, + "step": 523 + }, + { + "epoch": 0.12470994228595228, + "grad_norm": 0.4452000977643881, + "learning_rate": 9.97293758878344e-06, + "loss": 0.3296, + "step": 524 + }, + { + "epoch": 0.12494793835901707, + "grad_norm": 0.4194230322974543, + "learning_rate": 9.972736937961484e-06, + "loss": 0.4134, + "step": 525 + }, + { + "epoch": 0.12518593443208187, + "grad_norm": 0.4146732557797122, + "learning_rate": 9.9725355480678e-06, + "loss": 0.474, + "step": 526 + }, + { + "epoch": 0.12542393050514666, + "grad_norm": 0.40700998263113747, + "learning_rate": 9.972333419132319e-06, + "loss": 0.383, + "step": 527 + }, + { + "epoch": 0.12566192657821146, + "grad_norm": 0.4023415477618789, + "learning_rate": 9.97213055118508e-06, + "loss": 0.3658, + "step": 528 + }, + { + "epoch": 0.12589992265127625, + "grad_norm": 0.44336944898631714, + "learning_rate": 9.971926944256239e-06, + "loss": 0.476, + "step": 529 + }, + { + "epoch": 0.12613791872434105, + "grad_norm": 0.40291923826893716, + "learning_rate": 9.971722598376054e-06, + "loss": 0.4248, + "step": 530 + }, + { + "epoch": 0.12637591479740584, + "grad_norm": 0.3986265875109195, + "learning_rate": 9.971517513574896e-06, + "loss": 0.3415, + "step": 531 + }, + { + "epoch": 0.12661391087047064, + "grad_norm": 0.43032291172443293, + "learning_rate": 9.971311689883247e-06, + "loss": 0.4259, + "step": 532 + }, + { + "epoch": 0.12685190694353543, + "grad_norm": 0.39866454624485, + "learning_rate": 9.971105127331695e-06, + "loss": 0.4444, + "step": 533 + }, + { + "epoch": 0.12708990301660023, + "grad_norm": 0.43444250324758876, + "learning_rate": 9.970897825950942e-06, + "loss": 0.3788, + "step": 534 + }, + { + "epoch": 0.12732789908966502, + "grad_norm": 0.419118191593301, + "learning_rate": 9.970689785771798e-06, + "loss": 0.3715, + "step": 535 + }, + { + "epoch": 0.12756589516272981, + "grad_norm": 0.4333465547745647, + "learning_rate": 9.970481006825185e-06, + "loss": 0.4261, + "step": 536 + }, + { + "epoch": 0.1278038912357946, + "grad_norm": 0.4353660970004805, + "learning_rate": 9.970271489142127e-06, + "loss": 0.4315, + "step": 537 + }, + { + "epoch": 0.1280418873088594, + "grad_norm": 0.45342539645281116, + "learning_rate": 9.97006123275377e-06, + "loss": 0.4001, + "step": 538 + }, + { + "epoch": 0.1282798833819242, + "grad_norm": 0.44126076267178, + "learning_rate": 9.96985023769136e-06, + "loss": 0.3999, + "step": 539 + }, + { + "epoch": 0.128517879454989, + "grad_norm": 0.42194551508228195, + "learning_rate": 9.969638503986256e-06, + "loss": 0.4385, + "step": 540 + }, + { + "epoch": 0.1287558755280538, + "grad_norm": 0.4069592336480195, + "learning_rate": 9.969426031669928e-06, + "loss": 0.3883, + "step": 541 + }, + { + "epoch": 0.12899387160111858, + "grad_norm": 0.4178468875829833, + "learning_rate": 9.969212820773952e-06, + "loss": 0.3655, + "step": 542 + }, + { + "epoch": 0.12923186767418338, + "grad_norm": 0.4570434100827094, + "learning_rate": 9.968998871330021e-06, + "loss": 0.4334, + "step": 543 + }, + { + "epoch": 0.12946986374724817, + "grad_norm": 0.4166848553496299, + "learning_rate": 9.968784183369929e-06, + "loss": 0.4484, + "step": 544 + }, + { + "epoch": 0.12970785982031297, + "grad_norm": 0.42519002468336536, + "learning_rate": 9.968568756925588e-06, + "loss": 0.3469, + "step": 545 + }, + { + "epoch": 0.12994585589337776, + "grad_norm": 0.44962945270540455, + "learning_rate": 9.968352592029011e-06, + "loss": 0.3744, + "step": 546 + }, + { + "epoch": 0.13018385196644255, + "grad_norm": 0.40876516260944795, + "learning_rate": 9.968135688712328e-06, + "loss": 0.4431, + "step": 547 + }, + { + "epoch": 0.13042184803950735, + "grad_norm": 0.47209127285023517, + "learning_rate": 9.967918047007775e-06, + "loss": 0.3748, + "step": 548 + }, + { + "epoch": 0.13065984411257214, + "grad_norm": 0.4089317804173208, + "learning_rate": 9.967699666947702e-06, + "loss": 0.3127, + "step": 549 + }, + { + "epoch": 0.13089784018563694, + "grad_norm": 0.4235984982251744, + "learning_rate": 9.96748054856456e-06, + "loss": 0.4103, + "step": 550 + }, + { + "epoch": 0.13113583625870173, + "grad_norm": 0.47319011240693865, + "learning_rate": 9.967260691890924e-06, + "loss": 0.4523, + "step": 551 + }, + { + "epoch": 0.13137383233176653, + "grad_norm": 0.5145055531378172, + "learning_rate": 9.967040096959462e-06, + "loss": 0.3992, + "step": 552 + }, + { + "epoch": 0.13161182840483132, + "grad_norm": 0.4195472052055503, + "learning_rate": 9.966818763802963e-06, + "loss": 0.344, + "step": 553 + }, + { + "epoch": 0.13184982447789612, + "grad_norm": 0.43253895948369897, + "learning_rate": 9.966596692454323e-06, + "loss": 0.4628, + "step": 554 + }, + { + "epoch": 0.1320878205509609, + "grad_norm": 0.418918991234706, + "learning_rate": 9.966373882946546e-06, + "loss": 0.3967, + "step": 555 + }, + { + "epoch": 0.1323258166240257, + "grad_norm": 0.45069506384756325, + "learning_rate": 9.966150335312747e-06, + "loss": 0.3592, + "step": 556 + }, + { + "epoch": 0.1325638126970905, + "grad_norm": 0.4112688079328426, + "learning_rate": 9.965926049586154e-06, + "loss": 0.4051, + "step": 557 + }, + { + "epoch": 0.1328018087701553, + "grad_norm": 0.4306546680664368, + "learning_rate": 9.965701025800098e-06, + "loss": 0.4501, + "step": 558 + }, + { + "epoch": 0.1330398048432201, + "grad_norm": 0.47742995855119286, + "learning_rate": 9.965475263988024e-06, + "loss": 0.3709, + "step": 559 + }, + { + "epoch": 0.13327780091628488, + "grad_norm": 0.43252541349342494, + "learning_rate": 9.965248764183486e-06, + "loss": 0.3474, + "step": 560 + }, + { + "epoch": 0.13351579698934968, + "grad_norm": 0.4244516806580105, + "learning_rate": 9.965021526420146e-06, + "loss": 0.4157, + "step": 561 + }, + { + "epoch": 0.13375379306241447, + "grad_norm": 0.4501211384384804, + "learning_rate": 9.96479355073178e-06, + "loss": 0.4489, + "step": 562 + }, + { + "epoch": 0.13399178913547927, + "grad_norm": 0.433684978151453, + "learning_rate": 9.964564837152268e-06, + "loss": 0.3325, + "step": 563 + }, + { + "epoch": 0.13422978520854406, + "grad_norm": 0.4143350623973339, + "learning_rate": 9.964335385715607e-06, + "loss": 0.4103, + "step": 564 + }, + { + "epoch": 0.13446778128160886, + "grad_norm": 0.4356764057938835, + "learning_rate": 9.964105196455892e-06, + "loss": 0.4426, + "step": 565 + }, + { + "epoch": 0.13470577735467365, + "grad_norm": 0.43904123994773603, + "learning_rate": 9.963874269407342e-06, + "loss": 0.3823, + "step": 566 + }, + { + "epoch": 0.13494377342773844, + "grad_norm": 0.4195562006737919, + "learning_rate": 9.963642604604273e-06, + "loss": 0.358, + "step": 567 + }, + { + "epoch": 0.13518176950080324, + "grad_norm": 0.4134567373776786, + "learning_rate": 9.963410202081118e-06, + "loss": 0.3699, + "step": 568 + }, + { + "epoch": 0.13541976557386803, + "grad_norm": 0.4677127657760346, + "learning_rate": 9.96317706187242e-06, + "loss": 0.4375, + "step": 569 + }, + { + "epoch": 0.13565776164693283, + "grad_norm": 0.459487545289008, + "learning_rate": 9.962943184012826e-06, + "loss": 0.3227, + "step": 570 + }, + { + "epoch": 0.13589575771999762, + "grad_norm": 0.46349644278250035, + "learning_rate": 9.962708568537099e-06, + "loss": 0.382, + "step": 571 + }, + { + "epoch": 0.13613375379306242, + "grad_norm": 0.42793811916759666, + "learning_rate": 9.962473215480106e-06, + "loss": 0.4328, + "step": 572 + }, + { + "epoch": 0.1363717498661272, + "grad_norm": 0.4132709478039074, + "learning_rate": 9.962237124876828e-06, + "loss": 0.4126, + "step": 573 + }, + { + "epoch": 0.136609745939192, + "grad_norm": 0.4876228484575214, + "learning_rate": 9.962000296762352e-06, + "loss": 0.3429, + "step": 574 + }, + { + "epoch": 0.1368477420122568, + "grad_norm": 0.3977015514378766, + "learning_rate": 9.96176273117188e-06, + "loss": 0.4365, + "step": 575 + }, + { + "epoch": 0.1370857380853216, + "grad_norm": 0.4078608665053635, + "learning_rate": 9.961524428140716e-06, + "loss": 0.4907, + "step": 576 + }, + { + "epoch": 0.1373237341583864, + "grad_norm": 0.43371567468393196, + "learning_rate": 9.961285387704283e-06, + "loss": 0.3612, + "step": 577 + }, + { + "epoch": 0.13756173023145118, + "grad_norm": 0.4879930467640949, + "learning_rate": 9.961045609898103e-06, + "loss": 0.3276, + "step": 578 + }, + { + "epoch": 0.13779972630451598, + "grad_norm": 0.40451489231885396, + "learning_rate": 9.960805094757815e-06, + "loss": 0.4356, + "step": 579 + }, + { + "epoch": 0.13803772237758077, + "grad_norm": 0.44097468379761057, + "learning_rate": 9.960563842319164e-06, + "loss": 0.4218, + "step": 580 + }, + { + "epoch": 0.13827571845064557, + "grad_norm": 0.5834242401504829, + "learning_rate": 9.96032185261801e-06, + "loss": 0.3877, + "step": 581 + }, + { + "epoch": 0.13851371452371036, + "grad_norm": 0.45653361970962053, + "learning_rate": 9.960079125690317e-06, + "loss": 0.3879, + "step": 582 + }, + { + "epoch": 0.13875171059677516, + "grad_norm": 0.4476140474436517, + "learning_rate": 9.959835661572158e-06, + "loss": 0.4525, + "step": 583 + }, + { + "epoch": 0.13898970666983995, + "grad_norm": 0.4543506355962775, + "learning_rate": 9.959591460299719e-06, + "loss": 0.4012, + "step": 584 + }, + { + "epoch": 0.13922770274290475, + "grad_norm": 0.4800146363435457, + "learning_rate": 9.959346521909295e-06, + "loss": 0.3524, + "step": 585 + }, + { + "epoch": 0.13946569881596954, + "grad_norm": 0.4366124496205618, + "learning_rate": 9.95910084643729e-06, + "loss": 0.4189, + "step": 586 + }, + { + "epoch": 0.13970369488903434, + "grad_norm": 0.44571391749959416, + "learning_rate": 9.958854433920215e-06, + "loss": 0.4703, + "step": 587 + }, + { + "epoch": 0.13994169096209913, + "grad_norm": 0.49239121417665377, + "learning_rate": 9.958607284394696e-06, + "loss": 0.343, + "step": 588 + }, + { + "epoch": 0.14017968703516392, + "grad_norm": 0.4317901276773715, + "learning_rate": 9.958359397897465e-06, + "loss": 0.3861, + "step": 589 + }, + { + "epoch": 0.14041768310822872, + "grad_norm": 0.4147928486152337, + "learning_rate": 9.958110774465364e-06, + "loss": 0.4516, + "step": 590 + }, + { + "epoch": 0.1406556791812935, + "grad_norm": 0.4611062418346612, + "learning_rate": 9.957861414135343e-06, + "loss": 0.3849, + "step": 591 + }, + { + "epoch": 0.1408936752543583, + "grad_norm": 0.5052327412000163, + "learning_rate": 9.957611316944465e-06, + "loss": 0.4016, + "step": 592 + }, + { + "epoch": 0.1411316713274231, + "grad_norm": 0.4732644464070684, + "learning_rate": 9.957360482929898e-06, + "loss": 0.3911, + "step": 593 + }, + { + "epoch": 0.1413696674004879, + "grad_norm": 0.45055177739669783, + "learning_rate": 9.957108912128927e-06, + "loss": 0.4721, + "step": 594 + }, + { + "epoch": 0.1416076634735527, + "grad_norm": 0.5207702754193928, + "learning_rate": 9.956856604578937e-06, + "loss": 0.3663, + "step": 595 + }, + { + "epoch": 0.1418456595466175, + "grad_norm": 0.4532039740900448, + "learning_rate": 9.95660356031743e-06, + "loss": 0.3646, + "step": 596 + }, + { + "epoch": 0.14208365561968228, + "grad_norm": 0.45255299216112244, + "learning_rate": 9.956349779382014e-06, + "loss": 0.4761, + "step": 597 + }, + { + "epoch": 0.14232165169274708, + "grad_norm": 0.4294348039394876, + "learning_rate": 9.956095261810404e-06, + "loss": 0.4312, + "step": 598 + }, + { + "epoch": 0.14255964776581187, + "grad_norm": 0.4145607881038628, + "learning_rate": 9.955840007640432e-06, + "loss": 0.3167, + "step": 599 + }, + { + "epoch": 0.14279764383887666, + "grad_norm": 0.44934929004475394, + "learning_rate": 9.955584016910033e-06, + "loss": 0.3974, + "step": 600 + }, + { + "epoch": 0.14303563991194146, + "grad_norm": 0.39995571799722107, + "learning_rate": 9.955327289657253e-06, + "loss": 0.4574, + "step": 601 + }, + { + "epoch": 0.14327363598500625, + "grad_norm": 0.4365868901329556, + "learning_rate": 9.955069825920249e-06, + "loss": 0.3627, + "step": 602 + }, + { + "epoch": 0.14351163205807105, + "grad_norm": 0.4542607300367256, + "learning_rate": 9.954811625737289e-06, + "loss": 0.3208, + "step": 603 + }, + { + "epoch": 0.14374962813113584, + "grad_norm": 0.42854644501077194, + "learning_rate": 9.954552689146743e-06, + "loss": 0.4154, + "step": 604 + }, + { + "epoch": 0.14398762420420064, + "grad_norm": 0.43253450331881993, + "learning_rate": 9.954293016187098e-06, + "loss": 0.4209, + "step": 605 + }, + { + "epoch": 0.14422562027726543, + "grad_norm": 0.4283765316302236, + "learning_rate": 9.954032606896946e-06, + "loss": 0.3312, + "step": 606 + }, + { + "epoch": 0.14446361635033023, + "grad_norm": 0.4341091606907487, + "learning_rate": 9.953771461314994e-06, + "loss": 0.4196, + "step": 607 + }, + { + "epoch": 0.14470161242339502, + "grad_norm": 0.41550595013594926, + "learning_rate": 9.953509579480052e-06, + "loss": 0.4728, + "step": 608 + }, + { + "epoch": 0.14493960849645982, + "grad_norm": 0.41104915967555633, + "learning_rate": 9.953246961431043e-06, + "loss": 0.3532, + "step": 609 + }, + { + "epoch": 0.1451776045695246, + "grad_norm": 0.44491387505754176, + "learning_rate": 9.952983607206996e-06, + "loss": 0.3567, + "step": 610 + }, + { + "epoch": 0.1454156006425894, + "grad_norm": 0.41859179197947394, + "learning_rate": 9.952719516847055e-06, + "loss": 0.4096, + "step": 611 + }, + { + "epoch": 0.1456535967156542, + "grad_norm": 0.422145361038327, + "learning_rate": 9.95245469039047e-06, + "loss": 0.4246, + "step": 612 + }, + { + "epoch": 0.145891592788719, + "grad_norm": 0.4500980509847598, + "learning_rate": 9.9521891278766e-06, + "loss": 0.3426, + "step": 613 + }, + { + "epoch": 0.1461295888617838, + "grad_norm": 0.43000982194705367, + "learning_rate": 9.951922829344914e-06, + "loss": 0.3946, + "step": 614 + }, + { + "epoch": 0.14636758493484858, + "grad_norm": 0.41053172531397664, + "learning_rate": 9.951655794834991e-06, + "loss": 0.4663, + "step": 615 + }, + { + "epoch": 0.14660558100791338, + "grad_norm": 0.3923350216414785, + "learning_rate": 9.951388024386519e-06, + "loss": 0.3757, + "step": 616 + }, + { + "epoch": 0.14684357708097817, + "grad_norm": 0.4599433113284599, + "learning_rate": 9.951119518039297e-06, + "loss": 0.3385, + "step": 617 + }, + { + "epoch": 0.14708157315404297, + "grad_norm": 0.39908104730389854, + "learning_rate": 9.950850275833226e-06, + "loss": 0.4074, + "step": 618 + }, + { + "epoch": 0.14731956922710776, + "grad_norm": 0.4511403028324386, + "learning_rate": 9.950580297808329e-06, + "loss": 0.4194, + "step": 619 + }, + { + "epoch": 0.14755756530017256, + "grad_norm": 0.4617542670919041, + "learning_rate": 9.950309584004728e-06, + "loss": 0.3355, + "step": 620 + }, + { + "epoch": 0.14779556137323735, + "grad_norm": 0.4226130485474674, + "learning_rate": 9.950038134462655e-06, + "loss": 0.3681, + "step": 621 + }, + { + "epoch": 0.14803355744630214, + "grad_norm": 0.4247694573529077, + "learning_rate": 9.949765949222461e-06, + "loss": 0.4761, + "step": 622 + }, + { + "epoch": 0.14827155351936694, + "grad_norm": 0.4516914981541404, + "learning_rate": 9.949493028324593e-06, + "loss": 0.417, + "step": 623 + }, + { + "epoch": 0.14850954959243173, + "grad_norm": 0.45995529369267013, + "learning_rate": 9.949219371809618e-06, + "loss": 0.355, + "step": 624 + }, + { + "epoch": 0.14874754566549653, + "grad_norm": 0.4610870216065006, + "learning_rate": 9.948944979718206e-06, + "loss": 0.4184, + "step": 625 + }, + { + "epoch": 0.14898554173856132, + "grad_norm": 0.46221460180343904, + "learning_rate": 9.94866985209114e-06, + "loss": 0.4689, + "step": 626 + }, + { + "epoch": 0.14922353781162612, + "grad_norm": 0.4241292231021412, + "learning_rate": 9.948393988969307e-06, + "loss": 0.3911, + "step": 627 + }, + { + "epoch": 0.1494615338846909, + "grad_norm": 0.40337434368955516, + "learning_rate": 9.948117390393713e-06, + "loss": 0.3635, + "step": 628 + }, + { + "epoch": 0.1496995299577557, + "grad_norm": 0.40666940123628886, + "learning_rate": 9.947840056405461e-06, + "loss": 0.4265, + "step": 629 + }, + { + "epoch": 0.1499375260308205, + "grad_norm": 0.408720859695881, + "learning_rate": 9.947561987045777e-06, + "loss": 0.4675, + "step": 630 + }, + { + "epoch": 0.1501755221038853, + "grad_norm": 0.46059437009326065, + "learning_rate": 9.947283182355982e-06, + "loss": 0.3846, + "step": 631 + }, + { + "epoch": 0.1504135181769501, + "grad_norm": 0.40219876977724317, + "learning_rate": 9.947003642377517e-06, + "loss": 0.3583, + "step": 632 + }, + { + "epoch": 0.15065151425001488, + "grad_norm": 0.40741994114516356, + "learning_rate": 9.946723367151929e-06, + "loss": 0.4467, + "step": 633 + }, + { + "epoch": 0.15088951032307968, + "grad_norm": 0.4170974461418337, + "learning_rate": 9.94644235672087e-06, + "loss": 0.3964, + "step": 634 + }, + { + "epoch": 0.15112750639614447, + "grad_norm": 0.3959538443481256, + "learning_rate": 9.94616061112611e-06, + "loss": 0.3624, + "step": 635 + }, + { + "epoch": 0.15136550246920927, + "grad_norm": 0.41399916071040127, + "learning_rate": 9.94587813040952e-06, + "loss": 0.3988, + "step": 636 + }, + { + "epoch": 0.15160349854227406, + "grad_norm": 0.42565733949269147, + "learning_rate": 9.945594914613085e-06, + "loss": 0.4244, + "step": 637 + }, + { + "epoch": 0.15184149461533886, + "grad_norm": 0.4176195376045644, + "learning_rate": 9.945310963778897e-06, + "loss": 0.3422, + "step": 638 + }, + { + "epoch": 0.15207949068840365, + "grad_norm": 0.4091797435607871, + "learning_rate": 9.945026277949159e-06, + "loss": 0.391, + "step": 639 + }, + { + "epoch": 0.15231748676146845, + "grad_norm": 0.3752152693514474, + "learning_rate": 9.944740857166181e-06, + "loss": 0.4218, + "step": 640 + }, + { + "epoch": 0.15255548283453324, + "grad_norm": 0.7050883051614025, + "learning_rate": 9.944454701472387e-06, + "loss": 0.3827, + "step": 641 + }, + { + "epoch": 0.15279347890759803, + "grad_norm": 0.4433283675728129, + "learning_rate": 9.944167810910304e-06, + "loss": 0.3533, + "step": 642 + }, + { + "epoch": 0.15303147498066283, + "grad_norm": 0.42554987379842735, + "learning_rate": 9.94388018552257e-06, + "loss": 0.4089, + "step": 643 + }, + { + "epoch": 0.15326947105372762, + "grad_norm": 0.45679111789324556, + "learning_rate": 9.943591825351934e-06, + "loss": 0.4456, + "step": 644 + }, + { + "epoch": 0.15350746712679242, + "grad_norm": 0.42999137088818135, + "learning_rate": 9.943302730441258e-06, + "loss": 0.3426, + "step": 645 + }, + { + "epoch": 0.1537454631998572, + "grad_norm": 0.43170003868689516, + "learning_rate": 9.943012900833503e-06, + "loss": 0.3613, + "step": 646 + }, + { + "epoch": 0.153983459272922, + "grad_norm": 0.40514208501535537, + "learning_rate": 9.942722336571746e-06, + "loss": 0.459, + "step": 647 + }, + { + "epoch": 0.1542214553459868, + "grad_norm": 0.42503224905946857, + "learning_rate": 9.942431037699171e-06, + "loss": 0.4123, + "step": 648 + }, + { + "epoch": 0.1544594514190516, + "grad_norm": 0.43534422977552745, + "learning_rate": 9.942139004259077e-06, + "loss": 0.3489, + "step": 649 + }, + { + "epoch": 0.1546974474921164, + "grad_norm": 0.4042874898448957, + "learning_rate": 9.941846236294863e-06, + "loss": 0.417, + "step": 650 + }, + { + "epoch": 0.15493544356518119, + "grad_norm": 0.41685565190135837, + "learning_rate": 9.941552733850044e-06, + "loss": 0.4741, + "step": 651 + }, + { + "epoch": 0.15517343963824598, + "grad_norm": 0.41165103596561803, + "learning_rate": 9.941258496968238e-06, + "loss": 0.3744, + "step": 652 + }, + { + "epoch": 0.15541143571131077, + "grad_norm": 0.4215966209056301, + "learning_rate": 9.940963525693181e-06, + "loss": 0.3437, + "step": 653 + }, + { + "epoch": 0.15564943178437557, + "grad_norm": 0.4524386189484754, + "learning_rate": 9.94066782006871e-06, + "loss": 0.4291, + "step": 654 + }, + { + "epoch": 0.15588742785744036, + "grad_norm": 0.43482975599739715, + "learning_rate": 9.940371380138774e-06, + "loss": 0.4164, + "step": 655 + }, + { + "epoch": 0.15612542393050516, + "grad_norm": 0.4120821183177027, + "learning_rate": 9.940074205947432e-06, + "loss": 0.3363, + "step": 656 + }, + { + "epoch": 0.15636342000356995, + "grad_norm": 0.4309229287404229, + "learning_rate": 9.939776297538853e-06, + "loss": 0.389, + "step": 657 + }, + { + "epoch": 0.15660141607663475, + "grad_norm": 0.416688460486632, + "learning_rate": 9.93947765495731e-06, + "loss": 0.4333, + "step": 658 + }, + { + "epoch": 0.15683941214969954, + "grad_norm": 0.3965370666176588, + "learning_rate": 9.939178278247192e-06, + "loss": 0.3478, + "step": 659 + }, + { + "epoch": 0.15707740822276434, + "grad_norm": 0.439104289476318, + "learning_rate": 9.938878167452991e-06, + "loss": 0.3427, + "step": 660 + }, + { + "epoch": 0.15731540429582913, + "grad_norm": 0.398185404048493, + "learning_rate": 9.938577322619315e-06, + "loss": 0.3966, + "step": 661 + }, + { + "epoch": 0.15755340036889393, + "grad_norm": 0.4488481583859823, + "learning_rate": 9.938275743790872e-06, + "loss": 0.4251, + "step": 662 + }, + { + "epoch": 0.15779139644195872, + "grad_norm": 0.4417275681933899, + "learning_rate": 9.937973431012488e-06, + "loss": 0.3735, + "step": 663 + }, + { + "epoch": 0.15802939251502351, + "grad_norm": 0.4693818789749731, + "learning_rate": 9.937670384329092e-06, + "loss": 0.3718, + "step": 664 + }, + { + "epoch": 0.1582673885880883, + "grad_norm": 0.44060498282158367, + "learning_rate": 9.937366603785725e-06, + "loss": 0.4519, + "step": 665 + }, + { + "epoch": 0.1585053846611531, + "grad_norm": 0.4132724605397321, + "learning_rate": 9.937062089427534e-06, + "loss": 0.3665, + "step": 666 + }, + { + "epoch": 0.1587433807342179, + "grad_norm": 0.43487153487168945, + "learning_rate": 9.936756841299782e-06, + "loss": 0.3501, + "step": 667 + }, + { + "epoch": 0.1589813768072827, + "grad_norm": 0.4454465202025332, + "learning_rate": 9.936450859447833e-06, + "loss": 0.4349, + "step": 668 + }, + { + "epoch": 0.1592193728803475, + "grad_norm": 0.43517274964693736, + "learning_rate": 9.936144143917164e-06, + "loss": 0.4639, + "step": 669 + }, + { + "epoch": 0.15945736895341228, + "grad_norm": 0.4325185752961809, + "learning_rate": 9.935836694753363e-06, + "loss": 0.355, + "step": 670 + }, + { + "epoch": 0.15969536502647708, + "grad_norm": 0.41185000783549897, + "learning_rate": 9.93552851200212e-06, + "loss": 0.3643, + "step": 671 + }, + { + "epoch": 0.15993336109954187, + "grad_norm": 0.44265593925954544, + "learning_rate": 9.935219595709242e-06, + "loss": 0.4427, + "step": 672 + }, + { + "epoch": 0.16017135717260667, + "grad_norm": 0.47046598610709517, + "learning_rate": 9.93490994592064e-06, + "loss": 0.3921, + "step": 673 + }, + { + "epoch": 0.16040935324567146, + "grad_norm": 0.427596260881768, + "learning_rate": 9.934599562682337e-06, + "loss": 0.3758, + "step": 674 + }, + { + "epoch": 0.16064734931873625, + "grad_norm": 0.42891689676165107, + "learning_rate": 9.934288446040462e-06, + "loss": 0.3723, + "step": 675 + }, + { + "epoch": 0.16088534539180105, + "grad_norm": 0.40195671977273745, + "learning_rate": 9.933976596041257e-06, + "loss": 0.4701, + "step": 676 + }, + { + "epoch": 0.16112334146486584, + "grad_norm": 0.43868503957164495, + "learning_rate": 9.933664012731067e-06, + "loss": 0.3516, + "step": 677 + }, + { + "epoch": 0.1613613375379306, + "grad_norm": 0.45694726273935127, + "learning_rate": 9.933350696156354e-06, + "loss": 0.3707, + "step": 678 + }, + { + "epoch": 0.1615993336109954, + "grad_norm": 0.42986712701679414, + "learning_rate": 9.933036646363681e-06, + "loss": 0.4296, + "step": 679 + }, + { + "epoch": 0.1618373296840602, + "grad_norm": 0.38103639863281863, + "learning_rate": 9.932721863399726e-06, + "loss": 0.4153, + "step": 680 + }, + { + "epoch": 0.162075325757125, + "grad_norm": 0.4320260369164701, + "learning_rate": 9.93240634731127e-06, + "loss": 0.3337, + "step": 681 + }, + { + "epoch": 0.1623133218301898, + "grad_norm": 0.42202425560257006, + "learning_rate": 9.93209009814521e-06, + "loss": 0.3749, + "step": 682 + }, + { + "epoch": 0.16255131790325458, + "grad_norm": 0.39918790424517825, + "learning_rate": 9.93177311594855e-06, + "loss": 0.4151, + "step": 683 + }, + { + "epoch": 0.16278931397631938, + "grad_norm": 0.41085741596254716, + "learning_rate": 9.931455400768396e-06, + "loss": 0.3713, + "step": 684 + }, + { + "epoch": 0.16302731004938417, + "grad_norm": 0.4935289189499038, + "learning_rate": 9.931136952651971e-06, + "loss": 0.3496, + "step": 685 + }, + { + "epoch": 0.16326530612244897, + "grad_norm": 0.45454318553894524, + "learning_rate": 9.930817771646606e-06, + "loss": 0.4423, + "step": 686 + }, + { + "epoch": 0.16350330219551376, + "grad_norm": 0.48363104373210164, + "learning_rate": 9.930497857799737e-06, + "loss": 0.4216, + "step": 687 + }, + { + "epoch": 0.16374129826857856, + "grad_norm": 0.4024082102755574, + "learning_rate": 9.93017721115891e-06, + "loss": 0.3559, + "step": 688 + }, + { + "epoch": 0.16397929434164335, + "grad_norm": 0.4423776261521702, + "learning_rate": 9.929855831771787e-06, + "loss": 0.3834, + "step": 689 + }, + { + "epoch": 0.16421729041470814, + "grad_norm": 0.44126311420707715, + "learning_rate": 9.929533719686123e-06, + "loss": 0.4315, + "step": 690 + }, + { + "epoch": 0.16445528648777294, + "grad_norm": 0.4387429322912089, + "learning_rate": 9.929210874949802e-06, + "loss": 0.3487, + "step": 691 + }, + { + "epoch": 0.16469328256083773, + "grad_norm": 0.48730072451609946, + "learning_rate": 9.928887297610801e-06, + "loss": 0.3254, + "step": 692 + }, + { + "epoch": 0.16493127863390253, + "grad_norm": 0.4229571083939119, + "learning_rate": 9.928562987717211e-06, + "loss": 0.3936, + "step": 693 + }, + { + "epoch": 0.16516927470696732, + "grad_norm": 0.3987563067271361, + "learning_rate": 9.928237945317237e-06, + "loss": 0.4242, + "step": 694 + }, + { + "epoch": 0.16540727078003212, + "grad_norm": 0.4566722968242409, + "learning_rate": 9.927912170459183e-06, + "loss": 0.346, + "step": 695 + }, + { + "epoch": 0.1656452668530969, + "grad_norm": 0.46544117565173826, + "learning_rate": 9.927585663191472e-06, + "loss": 0.377, + "step": 696 + }, + { + "epoch": 0.1658832629261617, + "grad_norm": 0.42065889884419416, + "learning_rate": 9.927258423562628e-06, + "loss": 0.4574, + "step": 697 + }, + { + "epoch": 0.1661212589992265, + "grad_norm": 0.41117147375225527, + "learning_rate": 9.926930451621286e-06, + "loss": 0.411, + "step": 698 + }, + { + "epoch": 0.1663592550722913, + "grad_norm": 0.41917410595753857, + "learning_rate": 9.926601747416194e-06, + "loss": 0.3413, + "step": 699 + }, + { + "epoch": 0.1665972511453561, + "grad_norm": 0.513199643744508, + "learning_rate": 9.926272310996205e-06, + "loss": 0.4063, + "step": 700 + }, + { + "epoch": 0.16683524721842088, + "grad_norm": 0.42672332270069796, + "learning_rate": 9.92594214241028e-06, + "loss": 0.4861, + "step": 701 + }, + { + "epoch": 0.16707324329148568, + "grad_norm": 0.42431192157186515, + "learning_rate": 9.925611241707491e-06, + "loss": 0.3939, + "step": 702 + }, + { + "epoch": 0.16731123936455047, + "grad_norm": 0.44467532229285445, + "learning_rate": 9.925279608937014e-06, + "loss": 0.3696, + "step": 703 + }, + { + "epoch": 0.16754923543761527, + "grad_norm": 0.4515098096528452, + "learning_rate": 9.924947244148147e-06, + "loss": 0.4285, + "step": 704 + }, + { + "epoch": 0.16778723151068006, + "grad_norm": 0.5043978783638837, + "learning_rate": 9.924614147390278e-06, + "loss": 0.407, + "step": 705 + }, + { + "epoch": 0.16802522758374486, + "grad_norm": 0.44962332273904415, + "learning_rate": 9.92428031871292e-06, + "loss": 0.3473, + "step": 706 + }, + { + "epoch": 0.16826322365680965, + "grad_norm": 0.4212768922385841, + "learning_rate": 9.923945758165686e-06, + "loss": 0.3899, + "step": 707 + }, + { + "epoch": 0.16850121972987445, + "grad_norm": 0.42396104130969997, + "learning_rate": 9.923610465798298e-06, + "loss": 0.4148, + "step": 708 + }, + { + "epoch": 0.16873921580293924, + "grad_norm": 0.43780183253810606, + "learning_rate": 9.923274441660593e-06, + "loss": 0.3524, + "step": 709 + }, + { + "epoch": 0.16897721187600404, + "grad_norm": 0.4610507977132175, + "learning_rate": 9.922937685802508e-06, + "loss": 0.3738, + "step": 710 + }, + { + "epoch": 0.16921520794906883, + "grad_norm": 0.41170477402274724, + "learning_rate": 9.922600198274096e-06, + "loss": 0.4127, + "step": 711 + }, + { + "epoch": 0.16945320402213362, + "grad_norm": 0.42904482538288863, + "learning_rate": 9.922261979125516e-06, + "loss": 0.4375, + "step": 712 + }, + { + "epoch": 0.16969120009519842, + "grad_norm": 0.4288980919174409, + "learning_rate": 9.921923028407034e-06, + "loss": 0.3382, + "step": 713 + }, + { + "epoch": 0.1699291961682632, + "grad_norm": 0.400850498314156, + "learning_rate": 9.921583346169027e-06, + "loss": 0.3488, + "step": 714 + }, + { + "epoch": 0.170167192241328, + "grad_norm": 0.46108960454734477, + "learning_rate": 9.92124293246198e-06, + "loss": 0.4419, + "step": 715 + }, + { + "epoch": 0.1704051883143928, + "grad_norm": 0.41992077014825, + "learning_rate": 9.92090178733649e-06, + "loss": 0.4189, + "step": 716 + }, + { + "epoch": 0.1706431843874576, + "grad_norm": 0.48252193952318556, + "learning_rate": 9.920559910843255e-06, + "loss": 0.3313, + "step": 717 + }, + { + "epoch": 0.1708811804605224, + "grad_norm": 0.40035812359955575, + "learning_rate": 9.920217303033091e-06, + "loss": 0.4049, + "step": 718 + }, + { + "epoch": 0.1711191765335872, + "grad_norm": 0.4431295892233984, + "learning_rate": 9.919873963956914e-06, + "loss": 0.4627, + "step": 719 + }, + { + "epoch": 0.17135717260665198, + "grad_norm": 0.4546851162600903, + "learning_rate": 9.919529893665753e-06, + "loss": 0.3459, + "step": 720 + }, + { + "epoch": 0.17159516867971678, + "grad_norm": 0.41399727681843856, + "learning_rate": 9.919185092210748e-06, + "loss": 0.3504, + "step": 721 + }, + { + "epoch": 0.17183316475278157, + "grad_norm": 0.39447751341186205, + "learning_rate": 9.918839559643143e-06, + "loss": 0.4322, + "step": 722 + }, + { + "epoch": 0.17207116082584636, + "grad_norm": 0.4316965191862936, + "learning_rate": 9.918493296014294e-06, + "loss": 0.3914, + "step": 723 + }, + { + "epoch": 0.17230915689891116, + "grad_norm": 0.4190761857592741, + "learning_rate": 9.918146301375663e-06, + "loss": 0.3526, + "step": 724 + }, + { + "epoch": 0.17254715297197595, + "grad_norm": 0.4405259759469165, + "learning_rate": 9.917798575778821e-06, + "loss": 0.405, + "step": 725 + }, + { + "epoch": 0.17278514904504075, + "grad_norm": 0.39930877974901136, + "learning_rate": 9.917450119275452e-06, + "loss": 0.4576, + "step": 726 + }, + { + "epoch": 0.17302314511810554, + "grad_norm": 0.4312658018371688, + "learning_rate": 9.917100931917343e-06, + "loss": 0.3494, + "step": 727 + }, + { + "epoch": 0.17326114119117034, + "grad_norm": 0.3952105575090353, + "learning_rate": 9.916751013756393e-06, + "loss": 0.3349, + "step": 728 + }, + { + "epoch": 0.17349913726423513, + "grad_norm": 0.4114863899271299, + "learning_rate": 9.916400364844608e-06, + "loss": 0.4229, + "step": 729 + }, + { + "epoch": 0.17373713333729993, + "grad_norm": 0.42634304050454525, + "learning_rate": 9.916048985234102e-06, + "loss": 0.4014, + "step": 730 + }, + { + "epoch": 0.17397512941036472, + "grad_norm": 0.46609894460053153, + "learning_rate": 9.9156968749771e-06, + "loss": 0.3438, + "step": 731 + }, + { + "epoch": 0.17421312548342952, + "grad_norm": 0.4204390869113515, + "learning_rate": 9.915344034125931e-06, + "loss": 0.4, + "step": 732 + }, + { + "epoch": 0.1744511215564943, + "grad_norm": 0.411331032799109, + "learning_rate": 9.914990462733042e-06, + "loss": 0.4429, + "step": 733 + }, + { + "epoch": 0.1746891176295591, + "grad_norm": 0.4311784563828495, + "learning_rate": 9.914636160850979e-06, + "loss": 0.3685, + "step": 734 + }, + { + "epoch": 0.1749271137026239, + "grad_norm": 0.43611688575958385, + "learning_rate": 9.914281128532399e-06, + "loss": 0.3606, + "step": 735 + }, + { + "epoch": 0.1751651097756887, + "grad_norm": 0.4083035624121461, + "learning_rate": 9.91392536583007e-06, + "loss": 0.4403, + "step": 736 + }, + { + "epoch": 0.1754031058487535, + "grad_norm": 0.4515206561127267, + "learning_rate": 9.913568872796867e-06, + "loss": 0.4095, + "step": 737 + }, + { + "epoch": 0.17564110192181828, + "grad_norm": 0.39490410731530035, + "learning_rate": 9.913211649485776e-06, + "loss": 0.3601, + "step": 738 + }, + { + "epoch": 0.17587909799488308, + "grad_norm": 0.4383233004999705, + "learning_rate": 9.912853695949884e-06, + "loss": 0.3656, + "step": 739 + }, + { + "epoch": 0.17611709406794787, + "grad_norm": 0.4395334470031347, + "learning_rate": 9.912495012242396e-06, + "loss": 0.4211, + "step": 740 + }, + { + "epoch": 0.17635509014101267, + "grad_norm": 0.4397061820033599, + "learning_rate": 9.91213559841662e-06, + "loss": 0.3788, + "step": 741 + }, + { + "epoch": 0.17659308621407746, + "grad_norm": 0.4378799122528206, + "learning_rate": 9.911775454525974e-06, + "loss": 0.3375, + "step": 742 + }, + { + "epoch": 0.17683108228714225, + "grad_norm": 0.40859435165523883, + "learning_rate": 9.911414580623983e-06, + "loss": 0.3983, + "step": 743 + }, + { + "epoch": 0.17706907836020705, + "grad_norm": 0.4344842442523242, + "learning_rate": 9.911052976764282e-06, + "loss": 0.4477, + "step": 744 + }, + { + "epoch": 0.17730707443327184, + "grad_norm": 0.42409343119659304, + "learning_rate": 9.910690643000617e-06, + "loss": 0.331, + "step": 745 + }, + { + "epoch": 0.17754507050633664, + "grad_norm": 0.44353774653745326, + "learning_rate": 9.910327579386836e-06, + "loss": 0.3685, + "step": 746 + }, + { + "epoch": 0.17778306657940143, + "grad_norm": 0.39112060910154156, + "learning_rate": 9.909963785976902e-06, + "loss": 0.4222, + "step": 747 + }, + { + "epoch": 0.17802106265246623, + "grad_norm": 0.41719770556324215, + "learning_rate": 9.909599262824882e-06, + "loss": 0.4003, + "step": 748 + }, + { + "epoch": 0.17825905872553102, + "grad_norm": 0.43782801010917444, + "learning_rate": 9.909234009984956e-06, + "loss": 0.3264, + "step": 749 + }, + { + "epoch": 0.17849705479859582, + "grad_norm": 0.43932280484024017, + "learning_rate": 9.908868027511407e-06, + "loss": 0.3948, + "step": 750 + }, + { + "epoch": 0.1787350508716606, + "grad_norm": 0.40931591350618013, + "learning_rate": 9.908501315458628e-06, + "loss": 0.4369, + "step": 751 + }, + { + "epoch": 0.1789730469447254, + "grad_norm": 0.440394461817193, + "learning_rate": 9.908133873881125e-06, + "loss": 0.3308, + "step": 752 + }, + { + "epoch": 0.1792110430177902, + "grad_norm": 0.45585993680493553, + "learning_rate": 9.907765702833506e-06, + "loss": 0.3327, + "step": 753 + }, + { + "epoch": 0.179449039090855, + "grad_norm": 0.39891546180528054, + "learning_rate": 9.90739680237049e-06, + "loss": 0.4372, + "step": 754 + }, + { + "epoch": 0.1796870351639198, + "grad_norm": 0.4449856003881457, + "learning_rate": 9.907027172546907e-06, + "loss": 0.3646, + "step": 755 + }, + { + "epoch": 0.17992503123698458, + "grad_norm": 0.43483545394451123, + "learning_rate": 9.906656813417692e-06, + "loss": 0.366, + "step": 756 + }, + { + "epoch": 0.18016302731004938, + "grad_norm": 0.422673166996465, + "learning_rate": 9.90628572503789e-06, + "loss": 0.3759, + "step": 757 + }, + { + "epoch": 0.18040102338311417, + "grad_norm": 0.39567142752341666, + "learning_rate": 9.905913907462655e-06, + "loss": 0.4366, + "step": 758 + }, + { + "epoch": 0.18063901945617897, + "grad_norm": 0.4683314806260047, + "learning_rate": 9.905541360747244e-06, + "loss": 0.362, + "step": 759 + }, + { + "epoch": 0.18087701552924376, + "grad_norm": 0.41522141951305347, + "learning_rate": 9.90516808494703e-06, + "loss": 0.3395, + "step": 760 + }, + { + "epoch": 0.18111501160230856, + "grad_norm": 0.4303510055280143, + "learning_rate": 9.904794080117493e-06, + "loss": 0.4321, + "step": 761 + }, + { + "epoch": 0.18135300767537335, + "grad_norm": 0.4228437966345767, + "learning_rate": 9.904419346314216e-06, + "loss": 0.441, + "step": 762 + }, + { + "epoch": 0.18159100374843815, + "grad_norm": 0.4453386428802394, + "learning_rate": 9.904043883592893e-06, + "loss": 0.3773, + "step": 763 + }, + { + "epoch": 0.18182899982150294, + "grad_norm": 1.180486578415128, + "learning_rate": 9.90366769200933e-06, + "loss": 0.3609, + "step": 764 + }, + { + "epoch": 0.18206699589456773, + "grad_norm": 0.42781342072889006, + "learning_rate": 9.903290771619437e-06, + "loss": 0.4432, + "step": 765 + }, + { + "epoch": 0.18230499196763253, + "grad_norm": 0.38960119770687374, + "learning_rate": 9.902913122479235e-06, + "loss": 0.3728, + "step": 766 + }, + { + "epoch": 0.18254298804069732, + "grad_norm": 0.4321322538249792, + "learning_rate": 9.902534744644848e-06, + "loss": 0.3431, + "step": 767 + }, + { + "epoch": 0.18278098411376212, + "grad_norm": 0.41307757787063704, + "learning_rate": 9.902155638172517e-06, + "loss": 0.395, + "step": 768 + }, + { + "epoch": 0.1830189801868269, + "grad_norm": 0.43556469923967867, + "learning_rate": 9.901775803118584e-06, + "loss": 0.4441, + "step": 769 + }, + { + "epoch": 0.1832569762598917, + "grad_norm": 0.43167762486626765, + "learning_rate": 9.901395239539502e-06, + "loss": 0.3372, + "step": 770 + }, + { + "epoch": 0.1834949723329565, + "grad_norm": 0.416467157274216, + "learning_rate": 9.901013947491834e-06, + "loss": 0.355, + "step": 771 + }, + { + "epoch": 0.1837329684060213, + "grad_norm": 0.4141392350996442, + "learning_rate": 9.900631927032247e-06, + "loss": 0.4116, + "step": 772 + }, + { + "epoch": 0.1839709644790861, + "grad_norm": 0.42131938024420973, + "learning_rate": 9.90024917821752e-06, + "loss": 0.3775, + "step": 773 + }, + { + "epoch": 0.18420896055215089, + "grad_norm": 0.38656999194996833, + "learning_rate": 9.899865701104542e-06, + "loss": 0.3404, + "step": 774 + }, + { + "epoch": 0.18444695662521568, + "grad_norm": 0.4002326366629186, + "learning_rate": 9.8994814957503e-06, + "loss": 0.4006, + "step": 775 + }, + { + "epoch": 0.18468495269828047, + "grad_norm": 0.389329793747449, + "learning_rate": 9.899096562211902e-06, + "loss": 0.4635, + "step": 776 + }, + { + "epoch": 0.18492294877134527, + "grad_norm": 0.412679750416995, + "learning_rate": 9.898710900546557e-06, + "loss": 0.3451, + "step": 777 + }, + { + "epoch": 0.18516094484441006, + "grad_norm": 0.42487703444957303, + "learning_rate": 9.898324510811583e-06, + "loss": 0.3428, + "step": 778 + }, + { + "epoch": 0.18539894091747486, + "grad_norm": 0.3907619553000587, + "learning_rate": 9.89793739306441e-06, + "loss": 0.4392, + "step": 779 + }, + { + "epoch": 0.18563693699053965, + "grad_norm": 0.4193572680245468, + "learning_rate": 9.897549547362569e-06, + "loss": 0.4373, + "step": 780 + }, + { + "epoch": 0.18587493306360445, + "grad_norm": 0.39493589356753456, + "learning_rate": 9.897160973763706e-06, + "loss": 0.3539, + "step": 781 + }, + { + "epoch": 0.18611292913666924, + "grad_norm": 0.4440516227501279, + "learning_rate": 9.896771672325574e-06, + "loss": 0.3769, + "step": 782 + }, + { + "epoch": 0.18635092520973404, + "grad_norm": 0.44352023507130256, + "learning_rate": 9.89638164310603e-06, + "loss": 0.4338, + "step": 783 + }, + { + "epoch": 0.18658892128279883, + "grad_norm": 0.7671181173054992, + "learning_rate": 9.895990886163043e-06, + "loss": 0.3319, + "step": 784 + }, + { + "epoch": 0.18682691735586363, + "grad_norm": 0.4685322094472268, + "learning_rate": 9.89559940155469e-06, + "loss": 0.3454, + "step": 785 + }, + { + "epoch": 0.18706491342892842, + "grad_norm": 0.3828784304357339, + "learning_rate": 9.895207189339154e-06, + "loss": 0.4445, + "step": 786 + }, + { + "epoch": 0.18730290950199321, + "grad_norm": 0.42324641037940064, + "learning_rate": 9.89481424957473e-06, + "loss": 0.451, + "step": 787 + }, + { + "epoch": 0.187540905575058, + "grad_norm": 0.4365750038109837, + "learning_rate": 9.894420582319814e-06, + "loss": 0.3191, + "step": 788 + }, + { + "epoch": 0.1877789016481228, + "grad_norm": 0.40443174222338457, + "learning_rate": 9.894026187632917e-06, + "loss": 0.3748, + "step": 789 + }, + { + "epoch": 0.1880168977211876, + "grad_norm": 0.43152637845700487, + "learning_rate": 9.893631065572659e-06, + "loss": 0.4441, + "step": 790 + }, + { + "epoch": 0.1882548937942524, + "grad_norm": 0.43199571990589086, + "learning_rate": 9.893235216197761e-06, + "loss": 0.3711, + "step": 791 + }, + { + "epoch": 0.1884928898673172, + "grad_norm": 0.46404959339010077, + "learning_rate": 9.892838639567057e-06, + "loss": 0.359, + "step": 792 + }, + { + "epoch": 0.18873088594038198, + "grad_norm": 0.4441492172576122, + "learning_rate": 9.892441335739487e-06, + "loss": 0.4042, + "step": 793 + }, + { + "epoch": 0.18896888201344678, + "grad_norm": 0.42927081504797776, + "learning_rate": 9.892043304774102e-06, + "loss": 0.4278, + "step": 794 + }, + { + "epoch": 0.18920687808651157, + "grad_norm": 0.402576668783229, + "learning_rate": 9.89164454673006e-06, + "loss": 0.38, + "step": 795 + }, + { + "epoch": 0.18944487415957637, + "grad_norm": 0.37582462659523325, + "learning_rate": 9.891245061666622e-06, + "loss": 0.3614, + "step": 796 + }, + { + "epoch": 0.18968287023264116, + "grad_norm": 0.3988214342258959, + "learning_rate": 9.890844849643166e-06, + "loss": 0.4387, + "step": 797 + }, + { + "epoch": 0.18992086630570595, + "grad_norm": 0.4910334602397654, + "learning_rate": 9.890443910719171e-06, + "loss": 0.3962, + "step": 798 + }, + { + "epoch": 0.19015886237877075, + "grad_norm": 0.44461874782652483, + "learning_rate": 9.890042244954229e-06, + "loss": 0.3385, + "step": 799 + }, + { + "epoch": 0.19039685845183554, + "grad_norm": 0.4129631941986688, + "learning_rate": 9.889639852408035e-06, + "loss": 0.3928, + "step": 800 + }, + { + "epoch": 0.19063485452490034, + "grad_norm": 0.41914251475794334, + "learning_rate": 9.889236733140393e-06, + "loss": 0.4463, + "step": 801 + }, + { + "epoch": 0.19087285059796513, + "grad_norm": 0.41664422509463567, + "learning_rate": 9.88883288721122e-06, + "loss": 0.3765, + "step": 802 + }, + { + "epoch": 0.19111084667102993, + "grad_norm": 0.4045420912022183, + "learning_rate": 9.888428314680536e-06, + "loss": 0.3567, + "step": 803 + }, + { + "epoch": 0.19134884274409472, + "grad_norm": 0.4044005074831901, + "learning_rate": 9.888023015608471e-06, + "loss": 0.3931, + "step": 804 + }, + { + "epoch": 0.19158683881715952, + "grad_norm": 0.4368135710203652, + "learning_rate": 9.887616990055262e-06, + "loss": 0.4112, + "step": 805 + }, + { + "epoch": 0.1918248348902243, + "grad_norm": 0.4882695812113364, + "learning_rate": 9.887210238081253e-06, + "loss": 0.3695, + "step": 806 + }, + { + "epoch": 0.1920628309632891, + "grad_norm": 0.4246207428107935, + "learning_rate": 9.8868027597469e-06, + "loss": 0.3797, + "step": 807 + }, + { + "epoch": 0.1923008270363539, + "grad_norm": 0.46283957801455083, + "learning_rate": 9.886394555112764e-06, + "loss": 0.4513, + "step": 808 + }, + { + "epoch": 0.1925388231094187, + "grad_norm": 0.4047912499967522, + "learning_rate": 9.885985624239513e-06, + "loss": 0.3578, + "step": 809 + }, + { + "epoch": 0.1927768191824835, + "grad_norm": 0.4174808369463013, + "learning_rate": 9.885575967187924e-06, + "loss": 0.3537, + "step": 810 + }, + { + "epoch": 0.19301481525554828, + "grad_norm": 0.4347320503909233, + "learning_rate": 9.885165584018882e-06, + "loss": 0.3847, + "step": 811 + }, + { + "epoch": 0.19325281132861308, + "grad_norm": 0.44226534184677574, + "learning_rate": 9.884754474793383e-06, + "loss": 0.4511, + "step": 812 + }, + { + "epoch": 0.19349080740167787, + "grad_norm": 0.4019420309353, + "learning_rate": 9.884342639572526e-06, + "loss": 0.3352, + "step": 813 + }, + { + "epoch": 0.19372880347474267, + "grad_norm": 0.42514275612922164, + "learning_rate": 9.88393007841752e-06, + "loss": 0.3673, + "step": 814 + }, + { + "epoch": 0.19396679954780746, + "grad_norm": 0.4524089013354289, + "learning_rate": 9.88351679138968e-06, + "loss": 0.4277, + "step": 815 + }, + { + "epoch": 0.19420479562087226, + "grad_norm": 0.4380590877999043, + "learning_rate": 9.883102778550434e-06, + "loss": 0.3627, + "step": 816 + }, + { + "epoch": 0.19444279169393705, + "grad_norm": 0.4165421579055438, + "learning_rate": 9.882688039961312e-06, + "loss": 0.345, + "step": 817 + }, + { + "epoch": 0.19468078776700184, + "grad_norm": 0.43180936146351057, + "learning_rate": 9.882272575683956e-06, + "loss": 0.4111, + "step": 818 + }, + { + "epoch": 0.19491878384006664, + "grad_norm": 0.41648821424267424, + "learning_rate": 9.881856385780115e-06, + "loss": 0.4578, + "step": 819 + }, + { + "epoch": 0.19515677991313143, + "grad_norm": 0.4246404967531619, + "learning_rate": 9.881439470311642e-06, + "loss": 0.3842, + "step": 820 + }, + { + "epoch": 0.19539477598619623, + "grad_norm": 0.4158895838872154, + "learning_rate": 9.881021829340502e-06, + "loss": 0.41, + "step": 821 + }, + { + "epoch": 0.19563277205926102, + "grad_norm": 0.4149994996559407, + "learning_rate": 9.880603462928769e-06, + "loss": 0.4231, + "step": 822 + }, + { + "epoch": 0.19587076813232582, + "grad_norm": 0.4089944684040092, + "learning_rate": 9.880184371138621e-06, + "loss": 0.3609, + "step": 823 + }, + { + "epoch": 0.1961087642053906, + "grad_norm": 0.44441855579372713, + "learning_rate": 9.879764554032345e-06, + "loss": 0.3333, + "step": 824 + }, + { + "epoch": 0.1963467602784554, + "grad_norm": 0.4230323557425423, + "learning_rate": 9.879344011672337e-06, + "loss": 0.4026, + "step": 825 + }, + { + "epoch": 0.1965847563515202, + "grad_norm": 0.42605983334806596, + "learning_rate": 9.8789227441211e-06, + "loss": 0.4439, + "step": 826 + }, + { + "epoch": 0.196822752424585, + "grad_norm": 0.44788672346466435, + "learning_rate": 9.878500751441244e-06, + "loss": 0.3576, + "step": 827 + }, + { + "epoch": 0.1970607484976498, + "grad_norm": 0.42604961830620175, + "learning_rate": 9.878078033695488e-06, + "loss": 0.3765, + "step": 828 + }, + { + "epoch": 0.19729874457071458, + "grad_norm": 0.4348482551029076, + "learning_rate": 9.877654590946659e-06, + "loss": 0.4327, + "step": 829 + }, + { + "epoch": 0.19753674064377938, + "grad_norm": 0.41378312933719785, + "learning_rate": 9.87723042325769e-06, + "loss": 0.4134, + "step": 830 + }, + { + "epoch": 0.19777473671684417, + "grad_norm": 0.473934565450725, + "learning_rate": 9.876805530691622e-06, + "loss": 0.3382, + "step": 831 + }, + { + "epoch": 0.19801273278990897, + "grad_norm": 0.45241327431142664, + "learning_rate": 9.876379913311607e-06, + "loss": 0.3837, + "step": 832 + }, + { + "epoch": 0.19825072886297376, + "grad_norm": 0.4046931569037168, + "learning_rate": 9.875953571180901e-06, + "loss": 0.4148, + "step": 833 + }, + { + "epoch": 0.19848872493603856, + "grad_norm": 0.4515187001854004, + "learning_rate": 9.875526504362868e-06, + "loss": 0.3804, + "step": 834 + }, + { + "epoch": 0.19872672100910335, + "grad_norm": 0.4533213069296073, + "learning_rate": 9.875098712920983e-06, + "loss": 0.3442, + "step": 835 + }, + { + "epoch": 0.19896471708216815, + "grad_norm": 0.4523183437793231, + "learning_rate": 9.874670196918824e-06, + "loss": 0.4091, + "step": 836 + }, + { + "epoch": 0.19920271315523294, + "grad_norm": 0.4119738644745412, + "learning_rate": 9.874240956420082e-06, + "loss": 0.4687, + "step": 837 + }, + { + "epoch": 0.19944070922829774, + "grad_norm": 0.4699475156689954, + "learning_rate": 9.87381099148855e-06, + "loss": 0.3553, + "step": 838 + }, + { + "epoch": 0.19967870530136253, + "grad_norm": 0.41221446850629756, + "learning_rate": 9.873380302188133e-06, + "loss": 0.3856, + "step": 839 + }, + { + "epoch": 0.19991670137442732, + "grad_norm": 0.3967941653688478, + "learning_rate": 9.87294888858284e-06, + "loss": 0.4608, + "step": 840 + }, + { + "epoch": 0.20015469744749212, + "grad_norm": 0.4446657496487925, + "learning_rate": 9.872516750736793e-06, + "loss": 0.4013, + "step": 841 + }, + { + "epoch": 0.2003926935205569, + "grad_norm": 0.4896648889382981, + "learning_rate": 9.872083888714217e-06, + "loss": 0.356, + "step": 842 + }, + { + "epoch": 0.2006306895936217, + "grad_norm": 0.4087173120218746, + "learning_rate": 9.871650302579443e-06, + "loss": 0.3968, + "step": 843 + }, + { + "epoch": 0.2008686856666865, + "grad_norm": 0.4824564537672377, + "learning_rate": 9.871215992396917e-06, + "loss": 0.4077, + "step": 844 + }, + { + "epoch": 0.2011066817397513, + "grad_norm": 0.44137153458158945, + "learning_rate": 9.870780958231186e-06, + "loss": 0.3466, + "step": 845 + }, + { + "epoch": 0.2013446778128161, + "grad_norm": 0.5156882053665172, + "learning_rate": 9.870345200146907e-06, + "loss": 0.375, + "step": 846 + }, + { + "epoch": 0.2015826738858809, + "grad_norm": 0.3651462313643267, + "learning_rate": 9.869908718208845e-06, + "loss": 0.4293, + "step": 847 + }, + { + "epoch": 0.20182066995894568, + "grad_norm": 0.4248226402638282, + "learning_rate": 9.869471512481872e-06, + "loss": 0.4093, + "step": 848 + }, + { + "epoch": 0.20205866603201048, + "grad_norm": 0.4715061071976561, + "learning_rate": 9.869033583030967e-06, + "loss": 0.3187, + "step": 849 + }, + { + "epoch": 0.20229666210507527, + "grad_norm": 0.4360335200519038, + "learning_rate": 9.868594929921217e-06, + "loss": 0.3947, + "step": 850 + }, + { + "epoch": 0.20253465817814006, + "grad_norm": 0.4128931217238059, + "learning_rate": 9.86815555321782e-06, + "loss": 0.4361, + "step": 851 + }, + { + "epoch": 0.20277265425120486, + "grad_norm": 0.46321878255592136, + "learning_rate": 9.867715452986073e-06, + "loss": 0.3231, + "step": 852 + }, + { + "epoch": 0.20301065032426965, + "grad_norm": 0.4537539113370709, + "learning_rate": 9.867274629291387e-06, + "loss": 0.3619, + "step": 853 + }, + { + "epoch": 0.20324864639733445, + "grad_norm": 0.38249330366239725, + "learning_rate": 9.866833082199283e-06, + "loss": 0.3993, + "step": 854 + }, + { + "epoch": 0.20348664247039924, + "grad_norm": 0.43771271278034846, + "learning_rate": 9.866390811775382e-06, + "loss": 0.3913, + "step": 855 + }, + { + "epoch": 0.20372463854346404, + "grad_norm": 0.46356127634456645, + "learning_rate": 9.865947818085417e-06, + "loss": 0.3215, + "step": 856 + }, + { + "epoch": 0.20396263461652883, + "grad_norm": 0.495299460550108, + "learning_rate": 9.86550410119523e-06, + "loss": 0.3485, + "step": 857 + }, + { + "epoch": 0.20420063068959363, + "grad_norm": 0.4143122108558567, + "learning_rate": 9.865059661170767e-06, + "loss": 0.4334, + "step": 858 + }, + { + "epoch": 0.20443862676265842, + "grad_norm": 0.4325780620093191, + "learning_rate": 9.86461449807808e-06, + "loss": 0.379, + "step": 859 + }, + { + "epoch": 0.20467662283572322, + "grad_norm": 0.5084472841414205, + "learning_rate": 9.864168611983336e-06, + "loss": 0.3371, + "step": 860 + }, + { + "epoch": 0.204914618908788, + "grad_norm": 0.4451974603394198, + "learning_rate": 9.863722002952803e-06, + "loss": 0.3918, + "step": 861 + }, + { + "epoch": 0.2051526149818528, + "grad_norm": 0.4293240665529044, + "learning_rate": 9.863274671052857e-06, + "loss": 0.4285, + "step": 862 + }, + { + "epoch": 0.2053906110549176, + "grad_norm": 0.42020107235283133, + "learning_rate": 9.862826616349981e-06, + "loss": 0.3356, + "step": 863 + }, + { + "epoch": 0.2056286071279824, + "grad_norm": 0.45803176046948774, + "learning_rate": 9.862377838910771e-06, + "loss": 0.4254, + "step": 864 + }, + { + "epoch": 0.2058666032010472, + "grad_norm": 0.3886605229465983, + "learning_rate": 9.861928338801926e-06, + "loss": 0.4262, + "step": 865 + }, + { + "epoch": 0.20610459927411198, + "grad_norm": 0.43472442169007486, + "learning_rate": 9.86147811609025e-06, + "loss": 0.3713, + "step": 866 + }, + { + "epoch": 0.20634259534717678, + "grad_norm": 0.4414211701916826, + "learning_rate": 9.861027170842659e-06, + "loss": 0.3361, + "step": 867 + }, + { + "epoch": 0.20658059142024157, + "grad_norm": 0.408979611759581, + "learning_rate": 9.860575503126175e-06, + "loss": 0.4036, + "step": 868 + }, + { + "epoch": 0.20681858749330637, + "grad_norm": 0.446536065672617, + "learning_rate": 9.860123113007928e-06, + "loss": 0.4488, + "step": 869 + }, + { + "epoch": 0.20705658356637116, + "grad_norm": 0.436694557262441, + "learning_rate": 9.85967000055515e-06, + "loss": 0.3634, + "step": 870 + }, + { + "epoch": 0.20729457963943596, + "grad_norm": 0.4541956461930287, + "learning_rate": 9.859216165835188e-06, + "loss": 0.3835, + "step": 871 + }, + { + "epoch": 0.20753257571250075, + "grad_norm": 0.4213848003370073, + "learning_rate": 9.858761608915492e-06, + "loss": 0.4334, + "step": 872 + }, + { + "epoch": 0.20777057178556554, + "grad_norm": 0.3930202275134771, + "learning_rate": 9.858306329863623e-06, + "loss": 0.3955, + "step": 873 + }, + { + "epoch": 0.20800856785863034, + "grad_norm": 0.427845820752473, + "learning_rate": 9.857850328747243e-06, + "loss": 0.3356, + "step": 874 + }, + { + "epoch": 0.20824656393169513, + "grad_norm": 0.4116624892541286, + "learning_rate": 9.857393605634126e-06, + "loss": 0.3699, + "step": 875 + }, + { + "epoch": 0.20848456000475993, + "grad_norm": 0.3935251767233762, + "learning_rate": 9.856936160592155e-06, + "loss": 0.4307, + "step": 876 + }, + { + "epoch": 0.20872255607782472, + "grad_norm": 0.42170977712231184, + "learning_rate": 9.856477993689316e-06, + "loss": 0.3545, + "step": 877 + }, + { + "epoch": 0.20896055215088952, + "grad_norm": 0.41048028309068124, + "learning_rate": 9.856019104993702e-06, + "loss": 0.3381, + "step": 878 + }, + { + "epoch": 0.2091985482239543, + "grad_norm": 0.40569183356117694, + "learning_rate": 9.855559494573517e-06, + "loss": 0.4362, + "step": 879 + }, + { + "epoch": 0.2094365442970191, + "grad_norm": 0.42666793677125886, + "learning_rate": 9.855099162497071e-06, + "loss": 0.3783, + "step": 880 + }, + { + "epoch": 0.2096745403700839, + "grad_norm": 0.43438959276226846, + "learning_rate": 9.854638108832781e-06, + "loss": 0.3769, + "step": 881 + }, + { + "epoch": 0.2099125364431487, + "grad_norm": 0.44294773883559535, + "learning_rate": 9.854176333649169e-06, + "loss": 0.401, + "step": 882 + }, + { + "epoch": 0.2101505325162135, + "grad_norm": 0.40033962525880623, + "learning_rate": 9.853713837014867e-06, + "loss": 0.4307, + "step": 883 + }, + { + "epoch": 0.21038852858927828, + "grad_norm": 0.42544362225337135, + "learning_rate": 9.853250618998612e-06, + "loss": 0.3605, + "step": 884 + }, + { + "epoch": 0.21062652466234308, + "grad_norm": 0.43069551338393347, + "learning_rate": 9.852786679669256e-06, + "loss": 0.3538, + "step": 885 + }, + { + "epoch": 0.21086452073540787, + "grad_norm": 0.4061340048491632, + "learning_rate": 9.852322019095744e-06, + "loss": 0.3855, + "step": 886 + }, + { + "epoch": 0.21110251680847267, + "grad_norm": 0.4200438920047967, + "learning_rate": 9.85185663734714e-06, + "loss": 0.4657, + "step": 887 + }, + { + "epoch": 0.21134051288153746, + "grad_norm": 0.40315771492500546, + "learning_rate": 9.85139053449261e-06, + "loss": 0.323, + "step": 888 + }, + { + "epoch": 0.21157850895460226, + "grad_norm": 0.451869373976093, + "learning_rate": 9.85092371060143e-06, + "loss": 0.4123, + "step": 889 + }, + { + "epoch": 0.21181650502766705, + "grad_norm": 0.41009915304865363, + "learning_rate": 9.85045616574298e-06, + "loss": 0.4342, + "step": 890 + }, + { + "epoch": 0.21205450110073185, + "grad_norm": 0.41018214294921573, + "learning_rate": 9.84998789998675e-06, + "loss": 0.4131, + "step": 891 + }, + { + "epoch": 0.21229249717379664, + "grad_norm": 0.41395012445520085, + "learning_rate": 9.849518913402334e-06, + "loss": 0.3472, + "step": 892 + }, + { + "epoch": 0.21253049324686143, + "grad_norm": 0.41478482807464195, + "learning_rate": 9.849049206059435e-06, + "loss": 0.4359, + "step": 893 + }, + { + "epoch": 0.21276848931992623, + "grad_norm": 0.3969815086554544, + "learning_rate": 9.848578778027867e-06, + "loss": 0.4184, + "step": 894 + }, + { + "epoch": 0.21300648539299102, + "grad_norm": 0.4297335515729988, + "learning_rate": 9.848107629377544e-06, + "loss": 0.3719, + "step": 895 + }, + { + "epoch": 0.21324448146605582, + "grad_norm": 0.3855698648491326, + "learning_rate": 9.84763576017849e-06, + "loss": 0.3658, + "step": 896 + }, + { + "epoch": 0.2134824775391206, + "grad_norm": 0.40093853952594977, + "learning_rate": 9.847163170500837e-06, + "loss": 0.4394, + "step": 897 + }, + { + "epoch": 0.2137204736121854, + "grad_norm": 0.4485624156969946, + "learning_rate": 9.846689860414824e-06, + "loss": 0.3776, + "step": 898 + }, + { + "epoch": 0.2139584696852502, + "grad_norm": 0.3794469849541886, + "learning_rate": 9.846215829990797e-06, + "loss": 0.3115, + "step": 899 + }, + { + "epoch": 0.214196465758315, + "grad_norm": 0.3972841317444796, + "learning_rate": 9.84574107929921e-06, + "loss": 0.3894, + "step": 900 + }, + { + "epoch": 0.2144344618313798, + "grad_norm": 0.38348004951058207, + "learning_rate": 9.845265608410616e-06, + "loss": 0.4287, + "step": 901 + }, + { + "epoch": 0.21467245790444459, + "grad_norm": 0.41303667553057977, + "learning_rate": 9.84478941739569e-06, + "loss": 0.3239, + "step": 902 + }, + { + "epoch": 0.21491045397750938, + "grad_norm": 0.40879152143826697, + "learning_rate": 9.844312506325202e-06, + "loss": 0.3797, + "step": 903 + }, + { + "epoch": 0.21514845005057417, + "grad_norm": 0.4604571778006916, + "learning_rate": 9.843834875270032e-06, + "loss": 0.4199, + "step": 904 + }, + { + "epoch": 0.21538644612363897, + "grad_norm": 0.4152085389985221, + "learning_rate": 9.84335652430117e-06, + "loss": 0.3847, + "step": 905 + }, + { + "epoch": 0.21562444219670376, + "grad_norm": 0.4149816757480418, + "learning_rate": 9.842877453489708e-06, + "loss": 0.3592, + "step": 906 + }, + { + "epoch": 0.21586243826976856, + "grad_norm": 0.4299620220648408, + "learning_rate": 9.84239766290685e-06, + "loss": 0.4043, + "step": 907 + }, + { + "epoch": 0.21610043434283335, + "grad_norm": 0.41035423107133623, + "learning_rate": 9.841917152623905e-06, + "loss": 0.4585, + "step": 908 + }, + { + "epoch": 0.21633843041589815, + "grad_norm": 0.44913348339639636, + "learning_rate": 9.841435922712288e-06, + "loss": 0.3911, + "step": 909 + }, + { + "epoch": 0.21657642648896294, + "grad_norm": 0.405292221298209, + "learning_rate": 9.84095397324352e-06, + "loss": 0.3582, + "step": 910 + }, + { + "epoch": 0.21681442256202774, + "grad_norm": 0.38536854563574374, + "learning_rate": 9.840471304289233e-06, + "loss": 0.4007, + "step": 911 + }, + { + "epoch": 0.21705241863509253, + "grad_norm": 0.47808379504573534, + "learning_rate": 9.839987915921163e-06, + "loss": 0.4611, + "step": 912 + }, + { + "epoch": 0.21729041470815733, + "grad_norm": 0.3934343577154057, + "learning_rate": 9.839503808211153e-06, + "loss": 0.3176, + "step": 913 + }, + { + "epoch": 0.21752841078122212, + "grad_norm": 0.4253557198903114, + "learning_rate": 9.839018981231151e-06, + "loss": 0.3818, + "step": 914 + }, + { + "epoch": 0.21776640685428691, + "grad_norm": 0.4060612395908892, + "learning_rate": 9.838533435053221e-06, + "loss": 0.4327, + "step": 915 + }, + { + "epoch": 0.2180044029273517, + "grad_norm": 0.4044627954830839, + "learning_rate": 9.83804716974952e-06, + "loss": 0.3697, + "step": 916 + }, + { + "epoch": 0.2182423990004165, + "grad_norm": 0.3898450504134681, + "learning_rate": 9.837560185392325e-06, + "loss": 0.3444, + "step": 917 + }, + { + "epoch": 0.2184803950734813, + "grad_norm": 0.41317608963767616, + "learning_rate": 9.837072482054009e-06, + "loss": 0.4076, + "step": 918 + }, + { + "epoch": 0.2187183911465461, + "grad_norm": 0.4138083448258549, + "learning_rate": 9.83658405980706e-06, + "loss": 0.4545, + "step": 919 + }, + { + "epoch": 0.2189563872196109, + "grad_norm": 0.41247702913642026, + "learning_rate": 9.836094918724067e-06, + "loss": 0.3613, + "step": 920 + }, + { + "epoch": 0.21919438329267568, + "grad_norm": 0.4121156299354065, + "learning_rate": 9.83560505887773e-06, + "loss": 0.3765, + "step": 921 + }, + { + "epoch": 0.21943237936574048, + "grad_norm": 0.43268431314703193, + "learning_rate": 9.835114480340855e-06, + "loss": 0.4259, + "step": 922 + }, + { + "epoch": 0.21967037543880527, + "grad_norm": 0.38798899747199356, + "learning_rate": 9.834623183186352e-06, + "loss": 0.3945, + "step": 923 + }, + { + "epoch": 0.21990837151187007, + "grad_norm": 0.40463819440452653, + "learning_rate": 9.834131167487241e-06, + "loss": 0.3419, + "step": 924 + }, + { + "epoch": 0.22014636758493486, + "grad_norm": 0.4158947107356108, + "learning_rate": 9.833638433316647e-06, + "loss": 0.3918, + "step": 925 + }, + { + "epoch": 0.22038436365799965, + "grad_norm": 0.3951265208744111, + "learning_rate": 9.833144980747806e-06, + "loss": 0.4467, + "step": 926 + }, + { + "epoch": 0.22062235973106445, + "grad_norm": 0.3866457296285503, + "learning_rate": 9.832650809854054e-06, + "loss": 0.3626, + "step": 927 + }, + { + "epoch": 0.22086035580412924, + "grad_norm": 0.4148351326369386, + "learning_rate": 9.832155920708838e-06, + "loss": 0.3362, + "step": 928 + }, + { + "epoch": 0.22109835187719404, + "grad_norm": 0.3976030627374797, + "learning_rate": 9.831660313385709e-06, + "loss": 0.4314, + "step": 929 + }, + { + "epoch": 0.22133634795025883, + "grad_norm": 0.4157333689112259, + "learning_rate": 9.831163987958329e-06, + "loss": 0.4044, + "step": 930 + }, + { + "epoch": 0.22157434402332363, + "grad_norm": 0.4408772739009037, + "learning_rate": 9.830666944500462e-06, + "loss": 0.3282, + "step": 931 + }, + { + "epoch": 0.22181234009638842, + "grad_norm": 0.41164918567935477, + "learning_rate": 9.830169183085983e-06, + "loss": 0.381, + "step": 932 + }, + { + "epoch": 0.22205033616945322, + "grad_norm": 0.3644162693747717, + "learning_rate": 9.829670703788873e-06, + "loss": 0.4151, + "step": 933 + }, + { + "epoch": 0.222288332242518, + "grad_norm": 0.4207226846601998, + "learning_rate": 9.829171506683211e-06, + "loss": 0.3884, + "step": 934 + }, + { + "epoch": 0.2225263283155828, + "grad_norm": 0.45066402524896876, + "learning_rate": 9.828671591843198e-06, + "loss": 0.3369, + "step": 935 + }, + { + "epoch": 0.2227643243886476, + "grad_norm": 0.4168689897275966, + "learning_rate": 9.828170959343131e-06, + "loss": 0.3938, + "step": 936 + }, + { + "epoch": 0.2230023204617124, + "grad_norm": 0.44074947992378183, + "learning_rate": 9.827669609257417e-06, + "loss": 0.433, + "step": 937 + }, + { + "epoch": 0.2232403165347772, + "grad_norm": 0.47103576178023904, + "learning_rate": 9.827167541660568e-06, + "loss": 0.3565, + "step": 938 + }, + { + "epoch": 0.22347831260784198, + "grad_norm": 0.4532327477806086, + "learning_rate": 9.826664756627202e-06, + "loss": 0.3961, + "step": 939 + }, + { + "epoch": 0.22371630868090678, + "grad_norm": 0.39014864862033083, + "learning_rate": 9.826161254232048e-06, + "loss": 0.4257, + "step": 940 + }, + { + "epoch": 0.22395430475397157, + "grad_norm": 0.40042087107333996, + "learning_rate": 9.825657034549939e-06, + "loss": 0.3866, + "step": 941 + }, + { + "epoch": 0.22419230082703637, + "grad_norm": 0.44006439544231385, + "learning_rate": 9.825152097655813e-06, + "loss": 0.3589, + "step": 942 + }, + { + "epoch": 0.22443029690010116, + "grad_norm": 0.3999068012035896, + "learning_rate": 9.824646443624717e-06, + "loss": 0.3891, + "step": 943 + }, + { + "epoch": 0.22466829297316596, + "grad_norm": 0.4539949228586678, + "learning_rate": 9.824140072531805e-06, + "loss": 0.4532, + "step": 944 + }, + { + "epoch": 0.22490628904623075, + "grad_norm": 0.3981879315816523, + "learning_rate": 9.823632984452331e-06, + "loss": 0.3465, + "step": 945 + }, + { + "epoch": 0.22514428511929555, + "grad_norm": 0.6439992315492964, + "learning_rate": 9.823125179461668e-06, + "loss": 0.376, + "step": 946 + }, + { + "epoch": 0.22538228119236034, + "grad_norm": 0.404887917419206, + "learning_rate": 9.822616657635284e-06, + "loss": 0.4472, + "step": 947 + }, + { + "epoch": 0.22562027726542513, + "grad_norm": 0.37708651982188657, + "learning_rate": 9.822107419048759e-06, + "loss": 0.4146, + "step": 948 + }, + { + "epoch": 0.2258582733384899, + "grad_norm": 0.44302674607158987, + "learning_rate": 9.821597463777779e-06, + "loss": 0.321, + "step": 949 + }, + { + "epoch": 0.2260962694115547, + "grad_norm": 0.412240348327169, + "learning_rate": 9.821086791898133e-06, + "loss": 0.3731, + "step": 950 + }, + { + "epoch": 0.2263342654846195, + "grad_norm": 0.40579745891438523, + "learning_rate": 9.820575403485724e-06, + "loss": 0.4699, + "step": 951 + }, + { + "epoch": 0.22657226155768428, + "grad_norm": 0.415668474457593, + "learning_rate": 9.820063298616553e-06, + "loss": 0.3718, + "step": 952 + }, + { + "epoch": 0.22681025763074908, + "grad_norm": 0.416764959736151, + "learning_rate": 9.819550477366735e-06, + "loss": 0.3393, + "step": 953 + }, + { + "epoch": 0.22704825370381387, + "grad_norm": 0.44453596559445757, + "learning_rate": 9.819036939812485e-06, + "loss": 0.4483, + "step": 954 + }, + { + "epoch": 0.22728624977687867, + "grad_norm": 0.3709861408947193, + "learning_rate": 9.818522686030127e-06, + "loss": 0.4047, + "step": 955 + }, + { + "epoch": 0.22752424584994346, + "grad_norm": 0.40210692784551644, + "learning_rate": 9.818007716096096e-06, + "loss": 0.3185, + "step": 956 + }, + { + "epoch": 0.22776224192300826, + "grad_norm": 0.3862126896605156, + "learning_rate": 9.817492030086926e-06, + "loss": 0.3905, + "step": 957 + }, + { + "epoch": 0.22800023799607305, + "grad_norm": 0.43152498874354184, + "learning_rate": 9.816975628079261e-06, + "loss": 0.4296, + "step": 958 + }, + { + "epoch": 0.22823823406913785, + "grad_norm": 0.38525294582007225, + "learning_rate": 9.816458510149852e-06, + "loss": 0.3463, + "step": 959 + }, + { + "epoch": 0.22847623014220264, + "grad_norm": 0.47012279193131934, + "learning_rate": 9.815940676375554e-06, + "loss": 0.3733, + "step": 960 + }, + { + "epoch": 0.22871422621526744, + "grad_norm": 0.4002528394439282, + "learning_rate": 9.815422126833332e-06, + "loss": 0.4243, + "step": 961 + }, + { + "epoch": 0.22895222228833223, + "grad_norm": 0.4248826037103133, + "learning_rate": 9.814902861600252e-06, + "loss": 0.4592, + "step": 962 + }, + { + "epoch": 0.22919021836139702, + "grad_norm": 0.4022803320480314, + "learning_rate": 9.814382880753493e-06, + "loss": 0.361, + "step": 963 + }, + { + "epoch": 0.22942821443446182, + "grad_norm": 0.424538658023901, + "learning_rate": 9.813862184370338e-06, + "loss": 0.3863, + "step": 964 + }, + { + "epoch": 0.2296662105075266, + "grad_norm": 0.44976984034315987, + "learning_rate": 9.81334077252817e-06, + "loss": 0.4503, + "step": 965 + }, + { + "epoch": 0.2299042065805914, + "grad_norm": 0.406304483755596, + "learning_rate": 9.812818645304488e-06, + "loss": 0.3875, + "step": 966 + }, + { + "epoch": 0.2301422026536562, + "grad_norm": 0.48831090846163266, + "learning_rate": 9.812295802776893e-06, + "loss": 0.3351, + "step": 967 + }, + { + "epoch": 0.230380198726721, + "grad_norm": 0.4010981600344825, + "learning_rate": 9.81177224502309e-06, + "loss": 0.4074, + "step": 968 + }, + { + "epoch": 0.2306181947997858, + "grad_norm": 0.39821279677246363, + "learning_rate": 9.811247972120895e-06, + "loss": 0.4441, + "step": 969 + }, + { + "epoch": 0.2308561908728506, + "grad_norm": 0.40402056505396167, + "learning_rate": 9.810722984148224e-06, + "loss": 0.3292, + "step": 970 + }, + { + "epoch": 0.23109418694591538, + "grad_norm": 0.4031749039115374, + "learning_rate": 9.810197281183109e-06, + "loss": 0.3967, + "step": 971 + }, + { + "epoch": 0.23133218301898018, + "grad_norm": 0.4254043618777442, + "learning_rate": 9.809670863303678e-06, + "loss": 0.4345, + "step": 972 + }, + { + "epoch": 0.23157017909204497, + "grad_norm": 0.41821250677571714, + "learning_rate": 9.809143730588172e-06, + "loss": 0.3698, + "step": 973 + }, + { + "epoch": 0.23180817516510976, + "grad_norm": 0.4186518496968692, + "learning_rate": 9.808615883114935e-06, + "loss": 0.3435, + "step": 974 + }, + { + "epoch": 0.23204617123817456, + "grad_norm": 0.447054433602328, + "learning_rate": 9.808087320962418e-06, + "loss": 0.3868, + "step": 975 + }, + { + "epoch": 0.23228416731123935, + "grad_norm": 0.40528303959506257, + "learning_rate": 9.807558044209178e-06, + "loss": 0.4353, + "step": 976 + }, + { + "epoch": 0.23252216338430415, + "grad_norm": 0.4201076442750277, + "learning_rate": 9.80702805293388e-06, + "loss": 0.3498, + "step": 977 + }, + { + "epoch": 0.23276015945736894, + "grad_norm": 0.43371214248349416, + "learning_rate": 9.806497347215294e-06, + "loss": 0.3148, + "step": 978 + }, + { + "epoch": 0.23299815553043374, + "grad_norm": 0.41147178302674675, + "learning_rate": 9.805965927132294e-06, + "loss": 0.4017, + "step": 979 + }, + { + "epoch": 0.23323615160349853, + "grad_norm": 0.4447084847180371, + "learning_rate": 9.805433792763866e-06, + "loss": 0.4182, + "step": 980 + }, + { + "epoch": 0.23347414767656333, + "grad_norm": 0.41126728345921754, + "learning_rate": 9.804900944189093e-06, + "loss": 0.3648, + "step": 981 + }, + { + "epoch": 0.23371214374962812, + "grad_norm": 0.41273761768511535, + "learning_rate": 9.804367381487172e-06, + "loss": 0.385, + "step": 982 + }, + { + "epoch": 0.23395013982269292, + "grad_norm": 0.39008444226884603, + "learning_rate": 9.803833104737406e-06, + "loss": 0.4549, + "step": 983 + }, + { + "epoch": 0.2341881358957577, + "grad_norm": 0.40144010395161167, + "learning_rate": 9.803298114019198e-06, + "loss": 0.371, + "step": 984 + }, + { + "epoch": 0.2344261319688225, + "grad_norm": 0.39554229899897064, + "learning_rate": 9.802762409412062e-06, + "loss": 0.3357, + "step": 985 + }, + { + "epoch": 0.2346641280418873, + "grad_norm": 0.7571829198075979, + "learning_rate": 9.802225990995618e-06, + "loss": 0.3972, + "step": 986 + }, + { + "epoch": 0.2349021241149521, + "grad_norm": 0.4175979906090598, + "learning_rate": 9.801688858849589e-06, + "loss": 0.4162, + "step": 987 + }, + { + "epoch": 0.2351401201880169, + "grad_norm": 0.39744051393120755, + "learning_rate": 9.80115101305381e-06, + "loss": 0.3587, + "step": 988 + }, + { + "epoch": 0.23537811626108168, + "grad_norm": 0.3999755158754268, + "learning_rate": 9.800612453688214e-06, + "loss": 0.3679, + "step": 989 + }, + { + "epoch": 0.23561611233414648, + "grad_norm": 0.41712508601670123, + "learning_rate": 9.800073180832848e-06, + "loss": 0.4463, + "step": 990 + }, + { + "epoch": 0.23585410840721127, + "grad_norm": 0.4401207780378245, + "learning_rate": 9.799533194567856e-06, + "loss": 0.3739, + "step": 991 + }, + { + "epoch": 0.23609210448027607, + "grad_norm": 0.47720852526507335, + "learning_rate": 9.7989924949735e-06, + "loss": 0.3634, + "step": 992 + }, + { + "epoch": 0.23633010055334086, + "grad_norm": 0.41441117101780556, + "learning_rate": 9.798451082130136e-06, + "loss": 0.4021, + "step": 993 + }, + { + "epoch": 0.23656809662640566, + "grad_norm": 0.4079927127031793, + "learning_rate": 9.797908956118233e-06, + "loss": 0.4398, + "step": 994 + }, + { + "epoch": 0.23680609269947045, + "grad_norm": 0.4514084993876973, + "learning_rate": 9.797366117018365e-06, + "loss": 0.353, + "step": 995 + }, + { + "epoch": 0.23704408877253524, + "grad_norm": 0.41156120093358467, + "learning_rate": 9.79682256491121e-06, + "loss": 0.3714, + "step": 996 + }, + { + "epoch": 0.23728208484560004, + "grad_norm": 0.39736420037308545, + "learning_rate": 9.796278299877556e-06, + "loss": 0.4682, + "step": 997 + }, + { + "epoch": 0.23752008091866483, + "grad_norm": 0.419750377621859, + "learning_rate": 9.795733321998291e-06, + "loss": 0.3953, + "step": 998 + }, + { + "epoch": 0.23775807699172963, + "grad_norm": 0.4523615063724095, + "learning_rate": 9.795187631354415e-06, + "loss": 0.3176, + "step": 999 + }, + { + "epoch": 0.23799607306479442, + "grad_norm": 0.43280687276990515, + "learning_rate": 9.794641228027029e-06, + "loss": 0.3767, + "step": 1000 + }, + { + "epoch": 0.23823406913785922, + "grad_norm": 0.41701831566090125, + "learning_rate": 9.794094112097342e-06, + "loss": 0.4651, + "step": 1001 + }, + { + "epoch": 0.238472065210924, + "grad_norm": 0.4670265558164397, + "learning_rate": 9.793546283646671e-06, + "loss": 0.3804, + "step": 1002 + }, + { + "epoch": 0.2387100612839888, + "grad_norm": 0.4073253123192443, + "learning_rate": 9.792997742756433e-06, + "loss": 0.3447, + "step": 1003 + }, + { + "epoch": 0.2389480573570536, + "grad_norm": 0.41912347417284207, + "learning_rate": 9.792448489508161e-06, + "loss": 0.4335, + "step": 1004 + }, + { + "epoch": 0.2391860534301184, + "grad_norm": 0.4601404438983729, + "learning_rate": 9.791898523983483e-06, + "loss": 0.3913, + "step": 1005 + }, + { + "epoch": 0.2394240495031832, + "grad_norm": 0.40669062578592385, + "learning_rate": 9.791347846264137e-06, + "loss": 0.3174, + "step": 1006 + }, + { + "epoch": 0.23966204557624798, + "grad_norm": 0.40618171733993275, + "learning_rate": 9.790796456431971e-06, + "loss": 0.4043, + "step": 1007 + }, + { + "epoch": 0.23990004164931278, + "grad_norm": 0.41827989422393813, + "learning_rate": 9.79024435456893e-06, + "loss": 0.4413, + "step": 1008 + }, + { + "epoch": 0.24013803772237757, + "grad_norm": 0.431949416274619, + "learning_rate": 9.789691540757076e-06, + "loss": 0.3945, + "step": 1009 + }, + { + "epoch": 0.24037603379544237, + "grad_norm": 0.41413010096326724, + "learning_rate": 9.789138015078565e-06, + "loss": 0.3339, + "step": 1010 + }, + { + "epoch": 0.24061402986850716, + "grad_norm": 0.4140278014836118, + "learning_rate": 9.78858377761567e-06, + "loss": 0.4029, + "step": 1011 + }, + { + "epoch": 0.24085202594157196, + "grad_norm": 0.4285228881221896, + "learning_rate": 9.78802882845076e-06, + "loss": 0.4099, + "step": 1012 + }, + { + "epoch": 0.24109002201463675, + "grad_norm": 0.4491959646131581, + "learning_rate": 9.787473167666316e-06, + "loss": 0.3299, + "step": 1013 + }, + { + "epoch": 0.24132801808770155, + "grad_norm": 0.45078633145305924, + "learning_rate": 9.786916795344925e-06, + "loss": 0.3761, + "step": 1014 + }, + { + "epoch": 0.24156601416076634, + "grad_norm": 0.40484830252152926, + "learning_rate": 9.786359711569273e-06, + "loss": 0.4174, + "step": 1015 + }, + { + "epoch": 0.24180401023383113, + "grad_norm": 0.4129343063024655, + "learning_rate": 9.785801916422162e-06, + "loss": 0.3641, + "step": 1016 + }, + { + "epoch": 0.24204200630689593, + "grad_norm": 0.3896049676049958, + "learning_rate": 9.78524340998649e-06, + "loss": 0.3469, + "step": 1017 + }, + { + "epoch": 0.24228000237996072, + "grad_norm": 0.397046837899395, + "learning_rate": 9.784684192345264e-06, + "loss": 0.3873, + "step": 1018 + }, + { + "epoch": 0.24251799845302552, + "grad_norm": 0.46413478846406936, + "learning_rate": 9.7841242635816e-06, + "loss": 0.4772, + "step": 1019 + }, + { + "epoch": 0.2427559945260903, + "grad_norm": 0.4277807175436547, + "learning_rate": 9.78356362377872e-06, + "loss": 0.3502, + "step": 1020 + }, + { + "epoch": 0.2429939905991551, + "grad_norm": 0.40246848535234875, + "learning_rate": 9.783002273019942e-06, + "loss": 0.3317, + "step": 1021 + }, + { + "epoch": 0.2432319866722199, + "grad_norm": 0.40816497903302856, + "learning_rate": 9.782440211388703e-06, + "loss": 0.4474, + "step": 1022 + }, + { + "epoch": 0.2434699827452847, + "grad_norm": 0.43384610531907175, + "learning_rate": 9.781877438968536e-06, + "loss": 0.3799, + "step": 1023 + }, + { + "epoch": 0.2437079788183495, + "grad_norm": 0.425087792025929, + "learning_rate": 9.781313955843084e-06, + "loss": 0.3573, + "step": 1024 + }, + { + "epoch": 0.24394597489141429, + "grad_norm": 0.4026711207325404, + "learning_rate": 9.780749762096093e-06, + "loss": 0.3878, + "step": 1025 + }, + { + "epoch": 0.24418397096447908, + "grad_norm": 0.4098774528539441, + "learning_rate": 9.780184857811419e-06, + "loss": 0.4786, + "step": 1026 + }, + { + "epoch": 0.24442196703754387, + "grad_norm": 0.4396169899920244, + "learning_rate": 9.779619243073017e-06, + "loss": 0.3444, + "step": 1027 + }, + { + "epoch": 0.24465996311060867, + "grad_norm": 0.4136758278535869, + "learning_rate": 9.779052917964955e-06, + "loss": 0.3948, + "step": 1028 + }, + { + "epoch": 0.24489795918367346, + "grad_norm": 0.3739736169388403, + "learning_rate": 9.7784858825714e-06, + "loss": 0.3961, + "step": 1029 + }, + { + "epoch": 0.24513595525673826, + "grad_norm": 0.4184534974361922, + "learning_rate": 9.77791813697663e-06, + "loss": 0.4134, + "step": 1030 + }, + { + "epoch": 0.24537395132980305, + "grad_norm": 0.4728932911361564, + "learning_rate": 9.777349681265024e-06, + "loss": 0.3557, + "step": 1031 + }, + { + "epoch": 0.24561194740286785, + "grad_norm": 0.41399344692303164, + "learning_rate": 9.77678051552107e-06, + "loss": 0.3869, + "step": 1032 + }, + { + "epoch": 0.24584994347593264, + "grad_norm": 0.49034199619398566, + "learning_rate": 9.77621063982936e-06, + "loss": 0.4363, + "step": 1033 + }, + { + "epoch": 0.24608793954899744, + "grad_norm": 0.4165647555586178, + "learning_rate": 9.775640054274591e-06, + "loss": 0.3841, + "step": 1034 + }, + { + "epoch": 0.24632593562206223, + "grad_norm": 0.4360715981984538, + "learning_rate": 9.775068758941566e-06, + "loss": 0.3205, + "step": 1035 + }, + { + "epoch": 0.24656393169512703, + "grad_norm": 0.389169460851251, + "learning_rate": 9.774496753915193e-06, + "loss": 0.3785, + "step": 1036 + }, + { + "epoch": 0.24680192776819182, + "grad_norm": 0.39337607363580057, + "learning_rate": 9.773924039280488e-06, + "loss": 0.4302, + "step": 1037 + }, + { + "epoch": 0.24703992384125661, + "grad_norm": 0.39250700422998375, + "learning_rate": 9.77335061512257e-06, + "loss": 0.3382, + "step": 1038 + }, + { + "epoch": 0.2472779199143214, + "grad_norm": 0.43938604330305114, + "learning_rate": 9.772776481526662e-06, + "loss": 0.3888, + "step": 1039 + }, + { + "epoch": 0.2475159159873862, + "grad_norm": 0.39206815609324325, + "learning_rate": 9.772201638578099e-06, + "loss": 0.4238, + "step": 1040 + }, + { + "epoch": 0.247753912060451, + "grad_norm": 0.4053806301912741, + "learning_rate": 9.77162608636231e-06, + "loss": 0.3748, + "step": 1041 + }, + { + "epoch": 0.2479919081335158, + "grad_norm": 0.4413688344206532, + "learning_rate": 9.771049824964843e-06, + "loss": 0.3448, + "step": 1042 + }, + { + "epoch": 0.2482299042065806, + "grad_norm": 0.4272845904110967, + "learning_rate": 9.77047285447134e-06, + "loss": 0.4019, + "step": 1043 + }, + { + "epoch": 0.24846790027964538, + "grad_norm": 0.41367780286322975, + "learning_rate": 9.769895174967556e-06, + "loss": 0.4339, + "step": 1044 + }, + { + "epoch": 0.24870589635271018, + "grad_norm": 0.41454713221485506, + "learning_rate": 9.769316786539349e-06, + "loss": 0.368, + "step": 1045 + }, + { + "epoch": 0.24894389242577497, + "grad_norm": 0.40425323505335387, + "learning_rate": 9.768737689272678e-06, + "loss": 0.3679, + "step": 1046 + }, + { + "epoch": 0.24918188849883977, + "grad_norm": 0.38710204326010633, + "learning_rate": 9.768157883253616e-06, + "loss": 0.4258, + "step": 1047 + }, + { + "epoch": 0.24941988457190456, + "grad_norm": 0.40041946638737086, + "learning_rate": 9.76757736856833e-06, + "loss": 0.4006, + "step": 1048 + }, + { + "epoch": 0.24965788064496935, + "grad_norm": 0.36660667068666375, + "learning_rate": 9.766996145303107e-06, + "loss": 0.3208, + "step": 1049 + }, + { + "epoch": 0.24989587671803415, + "grad_norm": 0.40641295360789265, + "learning_rate": 9.766414213544325e-06, + "loss": 0.3991, + "step": 1050 + }, + { + "epoch": 0.25013387279109894, + "grad_norm": 0.3963317481756286, + "learning_rate": 9.765831573378474e-06, + "loss": 0.467, + "step": 1051 + }, + { + "epoch": 0.25037186886416374, + "grad_norm": 0.4102266739054517, + "learning_rate": 9.765248224892152e-06, + "loss": 0.3432, + "step": 1052 + }, + { + "epoch": 0.25060986493722853, + "grad_norm": 0.5225965842019219, + "learning_rate": 9.764664168172057e-06, + "loss": 0.3248, + "step": 1053 + }, + { + "epoch": 0.2508478610102933, + "grad_norm": 0.39422579626773063, + "learning_rate": 9.764079403304996e-06, + "loss": 0.441, + "step": 1054 + }, + { + "epoch": 0.2510858570833581, + "grad_norm": 0.43871315920548154, + "learning_rate": 9.763493930377877e-06, + "loss": 0.4206, + "step": 1055 + }, + { + "epoch": 0.2513238531564229, + "grad_norm": 0.39618156243497527, + "learning_rate": 9.762907749477717e-06, + "loss": 0.3543, + "step": 1056 + }, + { + "epoch": 0.2515618492294877, + "grad_norm": 0.40119944869118124, + "learning_rate": 9.762320860691636e-06, + "loss": 0.3664, + "step": 1057 + }, + { + "epoch": 0.2517998453025525, + "grad_norm": 0.4448657055439697, + "learning_rate": 9.76173326410686e-06, + "loss": 0.4404, + "step": 1058 + }, + { + "epoch": 0.2520378413756173, + "grad_norm": 0.4504360863126996, + "learning_rate": 9.761144959810723e-06, + "loss": 0.3729, + "step": 1059 + }, + { + "epoch": 0.2522758374486821, + "grad_norm": 0.3858257157958738, + "learning_rate": 9.760555947890659e-06, + "loss": 0.3264, + "step": 1060 + }, + { + "epoch": 0.2525138335217469, + "grad_norm": 0.42098269355466916, + "learning_rate": 9.759966228434212e-06, + "loss": 0.4308, + "step": 1061 + }, + { + "epoch": 0.2527518295948117, + "grad_norm": 0.4414996944483038, + "learning_rate": 9.759375801529026e-06, + "loss": 0.4122, + "step": 1062 + }, + { + "epoch": 0.2529898256678765, + "grad_norm": 0.4083823838111461, + "learning_rate": 9.758784667262856e-06, + "loss": 0.3455, + "step": 1063 + }, + { + "epoch": 0.2532278217409413, + "grad_norm": 0.39532890350795247, + "learning_rate": 9.758192825723556e-06, + "loss": 0.3742, + "step": 1064 + }, + { + "epoch": 0.25346581781400607, + "grad_norm": 0.7090467989608195, + "learning_rate": 9.757600276999092e-06, + "loss": 0.4249, + "step": 1065 + }, + { + "epoch": 0.25370381388707086, + "grad_norm": 0.40621648230909274, + "learning_rate": 9.757007021177529e-06, + "loss": 0.3793, + "step": 1066 + }, + { + "epoch": 0.25394180996013566, + "grad_norm": 0.4208314833352698, + "learning_rate": 9.756413058347039e-06, + "loss": 0.3424, + "step": 1067 + }, + { + "epoch": 0.25417980603320045, + "grad_norm": 0.377078139573875, + "learning_rate": 9.7558183885959e-06, + "loss": 0.3902, + "step": 1068 + }, + { + "epoch": 0.25441780210626525, + "grad_norm": 0.44783916724457257, + "learning_rate": 9.755223012012498e-06, + "loss": 0.4407, + "step": 1069 + }, + { + "epoch": 0.25465579817933004, + "grad_norm": 0.39045199889729937, + "learning_rate": 9.754626928685315e-06, + "loss": 0.3372, + "step": 1070 + }, + { + "epoch": 0.25489379425239483, + "grad_norm": 0.4319090741276457, + "learning_rate": 9.754030138702948e-06, + "loss": 0.3226, + "step": 1071 + }, + { + "epoch": 0.25513179032545963, + "grad_norm": 0.3789725941806326, + "learning_rate": 9.753432642154093e-06, + "loss": 0.4311, + "step": 1072 + }, + { + "epoch": 0.2553697863985244, + "grad_norm": 0.4119736475459839, + "learning_rate": 9.752834439127554e-06, + "loss": 0.4179, + "step": 1073 + }, + { + "epoch": 0.2556077824715892, + "grad_norm": 0.46500296910275435, + "learning_rate": 9.752235529712237e-06, + "loss": 0.3472, + "step": 1074 + }, + { + "epoch": 0.255845778544654, + "grad_norm": 0.410321050312457, + "learning_rate": 9.751635913997155e-06, + "loss": 0.3981, + "step": 1075 + }, + { + "epoch": 0.2560837746177188, + "grad_norm": 0.41821928052237395, + "learning_rate": 9.751035592071427e-06, + "loss": 0.4358, + "step": 1076 + }, + { + "epoch": 0.2563217706907836, + "grad_norm": 0.4250687984115567, + "learning_rate": 9.750434564024276e-06, + "loss": 0.3738, + "step": 1077 + }, + { + "epoch": 0.2565597667638484, + "grad_norm": 0.4407759908452419, + "learning_rate": 9.749832829945027e-06, + "loss": 0.3292, + "step": 1078 + }, + { + "epoch": 0.2567977628369132, + "grad_norm": 0.36605112835248904, + "learning_rate": 9.749230389923117e-06, + "loss": 0.4139, + "step": 1079 + }, + { + "epoch": 0.257035758909978, + "grad_norm": 0.46252554242389143, + "learning_rate": 9.748627244048077e-06, + "loss": 0.4072, + "step": 1080 + }, + { + "epoch": 0.2572737549830428, + "grad_norm": 0.4131656517840768, + "learning_rate": 9.748023392409556e-06, + "loss": 0.3444, + "step": 1081 + }, + { + "epoch": 0.2575117510561076, + "grad_norm": 0.4244606448210475, + "learning_rate": 9.747418835097298e-06, + "loss": 0.385, + "step": 1082 + }, + { + "epoch": 0.25774974712917237, + "grad_norm": 0.39038280156155775, + "learning_rate": 9.746813572201154e-06, + "loss": 0.442, + "step": 1083 + }, + { + "epoch": 0.25798774320223716, + "grad_norm": 0.4367941002506421, + "learning_rate": 9.746207603811085e-06, + "loss": 0.3848, + "step": 1084 + }, + { + "epoch": 0.25822573927530196, + "grad_norm": 0.43484071040684835, + "learning_rate": 9.74560093001715e-06, + "loss": 0.317, + "step": 1085 + }, + { + "epoch": 0.25846373534836675, + "grad_norm": 0.35367482678247547, + "learning_rate": 9.744993550909514e-06, + "loss": 0.3873, + "step": 1086 + }, + { + "epoch": 0.25870173142143155, + "grad_norm": 0.4941729589297459, + "learning_rate": 9.744385466578453e-06, + "loss": 0.4777, + "step": 1087 + }, + { + "epoch": 0.25893972749449634, + "grad_norm": 0.4272644514387727, + "learning_rate": 9.74377667711434e-06, + "loss": 0.3518, + "step": 1088 + }, + { + "epoch": 0.25917772356756114, + "grad_norm": 0.4169441718338264, + "learning_rate": 9.743167182607659e-06, + "loss": 0.3746, + "step": 1089 + }, + { + "epoch": 0.25941571964062593, + "grad_norm": 0.4436977383688021, + "learning_rate": 9.742556983148994e-06, + "loss": 0.4359, + "step": 1090 + }, + { + "epoch": 0.2596537157136907, + "grad_norm": 0.44907822644616785, + "learning_rate": 9.741946078829035e-06, + "loss": 0.3661, + "step": 1091 + }, + { + "epoch": 0.2598917117867555, + "grad_norm": 0.5085499247152285, + "learning_rate": 9.74133446973858e-06, + "loss": 0.3236, + "step": 1092 + }, + { + "epoch": 0.2601297078598203, + "grad_norm": 0.4095219628657235, + "learning_rate": 9.740722155968527e-06, + "loss": 0.3725, + "step": 1093 + }, + { + "epoch": 0.2603677039328851, + "grad_norm": 0.4492820629768859, + "learning_rate": 9.74010913760988e-06, + "loss": 0.4523, + "step": 1094 + }, + { + "epoch": 0.2606057000059499, + "grad_norm": 0.4632589956141342, + "learning_rate": 9.739495414753754e-06, + "loss": 0.3544, + "step": 1095 + }, + { + "epoch": 0.2608436960790147, + "grad_norm": 0.43509465444418566, + "learning_rate": 9.738880987491357e-06, + "loss": 0.3868, + "step": 1096 + }, + { + "epoch": 0.2610816921520795, + "grad_norm": 0.4155977515794823, + "learning_rate": 9.738265855914014e-06, + "loss": 0.413, + "step": 1097 + }, + { + "epoch": 0.2613196882251443, + "grad_norm": 0.4944891223357048, + "learning_rate": 9.737650020113143e-06, + "loss": 0.385, + "step": 1098 + }, + { + "epoch": 0.2615576842982091, + "grad_norm": 0.4408252014561249, + "learning_rate": 9.737033480180276e-06, + "loss": 0.3339, + "step": 1099 + }, + { + "epoch": 0.2617956803712739, + "grad_norm": 0.40705862767289636, + "learning_rate": 9.736416236207046e-06, + "loss": 0.3871, + "step": 1100 + }, + { + "epoch": 0.26203367644433867, + "grad_norm": 0.4087671198515162, + "learning_rate": 9.735798288285189e-06, + "loss": 0.4766, + "step": 1101 + }, + { + "epoch": 0.26227167251740346, + "grad_norm": 0.4129939242716965, + "learning_rate": 9.73517963650655e-06, + "loss": 0.3562, + "step": 1102 + }, + { + "epoch": 0.26250966859046826, + "grad_norm": 0.3870745733554611, + "learning_rate": 9.734560280963072e-06, + "loss": 0.3615, + "step": 1103 + }, + { + "epoch": 0.26274766466353305, + "grad_norm": 0.39750563414632717, + "learning_rate": 9.73394022174681e-06, + "loss": 0.4273, + "step": 1104 + }, + { + "epoch": 0.26298566073659785, + "grad_norm": 0.4523534366058027, + "learning_rate": 9.73331945894992e-06, + "loss": 0.3872, + "step": 1105 + }, + { + "epoch": 0.26322365680966264, + "grad_norm": 0.48990020782286, + "learning_rate": 9.73269799266466e-06, + "loss": 0.3577, + "step": 1106 + }, + { + "epoch": 0.26346165288272744, + "grad_norm": 0.43917343471689085, + "learning_rate": 9.732075822983398e-06, + "loss": 0.3667, + "step": 1107 + }, + { + "epoch": 0.26369964895579223, + "grad_norm": 0.4263835822823487, + "learning_rate": 9.731452949998603e-06, + "loss": 0.4421, + "step": 1108 + }, + { + "epoch": 0.263937645028857, + "grad_norm": 0.46278837021200253, + "learning_rate": 9.730829373802847e-06, + "loss": 0.3615, + "step": 1109 + }, + { + "epoch": 0.2641756411019218, + "grad_norm": 0.40000848194964195, + "learning_rate": 9.730205094488813e-06, + "loss": 0.3516, + "step": 1110 + }, + { + "epoch": 0.2644136371749866, + "grad_norm": 0.40932668897348323, + "learning_rate": 9.729580112149283e-06, + "loss": 0.3911, + "step": 1111 + }, + { + "epoch": 0.2646516332480514, + "grad_norm": 0.42423822678370915, + "learning_rate": 9.728954426877142e-06, + "loss": 0.3937, + "step": 1112 + }, + { + "epoch": 0.2648896293211162, + "grad_norm": 0.4536666880057892, + "learning_rate": 9.728328038765387e-06, + "loss": 0.3364, + "step": 1113 + }, + { + "epoch": 0.265127625394181, + "grad_norm": 0.4096764042742354, + "learning_rate": 9.72770094790711e-06, + "loss": 0.3995, + "step": 1114 + }, + { + "epoch": 0.2653656214672458, + "grad_norm": 0.41042568452857936, + "learning_rate": 9.727073154395516e-06, + "loss": 0.4235, + "step": 1115 + }, + { + "epoch": 0.2656036175403106, + "grad_norm": 0.4929851761073694, + "learning_rate": 9.726444658323908e-06, + "loss": 0.3269, + "step": 1116 + }, + { + "epoch": 0.2658416136133754, + "grad_norm": 0.44283586974793293, + "learning_rate": 9.725815459785696e-06, + "loss": 0.3496, + "step": 1117 + }, + { + "epoch": 0.2660796096864402, + "grad_norm": 0.4146273039324895, + "learning_rate": 9.725185558874399e-06, + "loss": 0.3658, + "step": 1118 + }, + { + "epoch": 0.26631760575950497, + "grad_norm": 0.4319382476751413, + "learning_rate": 9.72455495568363e-06, + "loss": 0.4564, + "step": 1119 + }, + { + "epoch": 0.26655560183256977, + "grad_norm": 0.48931654146756204, + "learning_rate": 9.723923650307116e-06, + "loss": 0.3553, + "step": 1120 + }, + { + "epoch": 0.26679359790563456, + "grad_norm": 0.4701876581191389, + "learning_rate": 9.723291642838682e-06, + "loss": 0.3524, + "step": 1121 + }, + { + "epoch": 0.26703159397869936, + "grad_norm": 0.39377148630113995, + "learning_rate": 9.722658933372262e-06, + "loss": 0.4599, + "step": 1122 + }, + { + "epoch": 0.26726959005176415, + "grad_norm": 0.4558315940897939, + "learning_rate": 9.722025522001892e-06, + "loss": 0.389, + "step": 1123 + }, + { + "epoch": 0.26750758612482894, + "grad_norm": 0.4215971407730172, + "learning_rate": 9.721391408821713e-06, + "loss": 0.3064, + "step": 1124 + }, + { + "epoch": 0.26774558219789374, + "grad_norm": 0.3911192317304776, + "learning_rate": 9.720756593925967e-06, + "loss": 0.424, + "step": 1125 + }, + { + "epoch": 0.26798357827095853, + "grad_norm": 0.3867608031443001, + "learning_rate": 9.720121077409006e-06, + "loss": 0.4329, + "step": 1126 + }, + { + "epoch": 0.26822157434402333, + "grad_norm": 0.4335469962868136, + "learning_rate": 9.719484859365283e-06, + "loss": 0.3384, + "step": 1127 + }, + { + "epoch": 0.2684595704170881, + "grad_norm": 0.41163503862716694, + "learning_rate": 9.718847939889354e-06, + "loss": 0.3526, + "step": 1128 + }, + { + "epoch": 0.2686975664901529, + "grad_norm": 0.3892507716482463, + "learning_rate": 9.718210319075883e-06, + "loss": 0.428, + "step": 1129 + }, + { + "epoch": 0.2689355625632177, + "grad_norm": 0.4116306866869224, + "learning_rate": 9.717571997019637e-06, + "loss": 0.3982, + "step": 1130 + }, + { + "epoch": 0.2691735586362825, + "grad_norm": 0.39033870830232814, + "learning_rate": 9.716932973815485e-06, + "loss": 0.3799, + "step": 1131 + }, + { + "epoch": 0.2694115547093473, + "grad_norm": 0.40225942909727946, + "learning_rate": 9.716293249558401e-06, + "loss": 0.3807, + "step": 1132 + }, + { + "epoch": 0.2696495507824121, + "grad_norm": 0.39112529026808723, + "learning_rate": 9.715652824343465e-06, + "loss": 0.4281, + "step": 1133 + }, + { + "epoch": 0.2698875468554769, + "grad_norm": 0.3851730979067281, + "learning_rate": 9.71501169826586e-06, + "loss": 0.3295, + "step": 1134 + }, + { + "epoch": 0.2701255429285417, + "grad_norm": 0.41373661540628287, + "learning_rate": 9.714369871420872e-06, + "loss": 0.3928, + "step": 1135 + }, + { + "epoch": 0.2703635390016065, + "grad_norm": 0.4461170158222069, + "learning_rate": 9.713727343903893e-06, + "loss": 0.4109, + "step": 1136 + }, + { + "epoch": 0.2706015350746713, + "grad_norm": 0.4212053908749594, + "learning_rate": 9.71308411581042e-06, + "loss": 0.4206, + "step": 1137 + }, + { + "epoch": 0.27083953114773607, + "grad_norm": 0.4563483334376799, + "learning_rate": 9.71244018723605e-06, + "loss": 0.3494, + "step": 1138 + }, + { + "epoch": 0.27107752722080086, + "grad_norm": 0.4063198246744073, + "learning_rate": 9.711795558276489e-06, + "loss": 0.3797, + "step": 1139 + }, + { + "epoch": 0.27131552329386566, + "grad_norm": 0.4766611835404524, + "learning_rate": 9.711150229027544e-06, + "loss": 0.4136, + "step": 1140 + }, + { + "epoch": 0.27155351936693045, + "grad_norm": 0.43835373298275027, + "learning_rate": 9.710504199585127e-06, + "loss": 0.3632, + "step": 1141 + }, + { + "epoch": 0.27179151543999525, + "grad_norm": 0.4164970849311708, + "learning_rate": 9.709857470045251e-06, + "loss": 0.329, + "step": 1142 + }, + { + "epoch": 0.27202951151306004, + "grad_norm": 0.40401054483859816, + "learning_rate": 9.709210040504042e-06, + "loss": 0.3835, + "step": 1143 + }, + { + "epoch": 0.27226750758612484, + "grad_norm": 0.39714074001954824, + "learning_rate": 9.708561911057719e-06, + "loss": 0.4795, + "step": 1144 + }, + { + "epoch": 0.27250550365918963, + "grad_norm": 0.4680110103103199, + "learning_rate": 9.707913081802613e-06, + "loss": 0.341, + "step": 1145 + }, + { + "epoch": 0.2727434997322544, + "grad_norm": 0.4089665368213675, + "learning_rate": 9.707263552835153e-06, + "loss": 0.3453, + "step": 1146 + }, + { + "epoch": 0.2729814958053192, + "grad_norm": 0.39910736552697396, + "learning_rate": 9.70661332425188e-06, + "loss": 0.3954, + "step": 1147 + }, + { + "epoch": 0.273219491878384, + "grad_norm": 0.4278429044590152, + "learning_rate": 9.705962396149428e-06, + "loss": 0.3727, + "step": 1148 + }, + { + "epoch": 0.2734574879514488, + "grad_norm": 0.4381769229333954, + "learning_rate": 9.705310768624545e-06, + "loss": 0.3003, + "step": 1149 + }, + { + "epoch": 0.2736954840245136, + "grad_norm": 0.3892819098989479, + "learning_rate": 9.704658441774078e-06, + "loss": 0.3448, + "step": 1150 + }, + { + "epoch": 0.2739334800975784, + "grad_norm": 0.45797697785561536, + "learning_rate": 9.704005415694979e-06, + "loss": 0.4175, + "step": 1151 + }, + { + "epoch": 0.2741714761706432, + "grad_norm": 0.4458801671185096, + "learning_rate": 9.703351690484305e-06, + "loss": 0.3442, + "step": 1152 + }, + { + "epoch": 0.274409472243708, + "grad_norm": 0.43193201208559157, + "learning_rate": 9.702697266239211e-06, + "loss": 0.3373, + "step": 1153 + }, + { + "epoch": 0.2746474683167728, + "grad_norm": 0.41679902408312985, + "learning_rate": 9.702042143056966e-06, + "loss": 0.3958, + "step": 1154 + }, + { + "epoch": 0.2748854643898376, + "grad_norm": 0.443366075981643, + "learning_rate": 9.701386321034937e-06, + "loss": 0.4387, + "step": 1155 + }, + { + "epoch": 0.27512346046290237, + "grad_norm": 0.4278061958287098, + "learning_rate": 9.700729800270592e-06, + "loss": 0.3615, + "step": 1156 + }, + { + "epoch": 0.27536145653596716, + "grad_norm": 0.4281403145666002, + "learning_rate": 9.700072580861511e-06, + "loss": 0.3827, + "step": 1157 + }, + { + "epoch": 0.27559945260903196, + "grad_norm": 0.4404470075524074, + "learning_rate": 9.699414662905368e-06, + "loss": 0.4316, + "step": 1158 + }, + { + "epoch": 0.27583744868209675, + "grad_norm": 0.44475673605890376, + "learning_rate": 9.698756046499948e-06, + "loss": 0.3357, + "step": 1159 + }, + { + "epoch": 0.27607544475516155, + "grad_norm": 0.43252690813011324, + "learning_rate": 9.698096731743139e-06, + "loss": 0.3298, + "step": 1160 + }, + { + "epoch": 0.27631344082822634, + "grad_norm": 0.40596838215176706, + "learning_rate": 9.69743671873293e-06, + "loss": 0.3876, + "step": 1161 + }, + { + "epoch": 0.27655143690129114, + "grad_norm": 0.38244031961085745, + "learning_rate": 9.696776007567414e-06, + "loss": 0.42, + "step": 1162 + }, + { + "epoch": 0.27678943297435593, + "grad_norm": 0.39348329580696617, + "learning_rate": 9.696114598344794e-06, + "loss": 0.3526, + "step": 1163 + }, + { + "epoch": 0.2770274290474207, + "grad_norm": 0.40702007665049295, + "learning_rate": 9.695452491163367e-06, + "loss": 0.3586, + "step": 1164 + }, + { + "epoch": 0.2772654251204855, + "grad_norm": 0.3806659463696473, + "learning_rate": 9.694789686121538e-06, + "loss": 0.4213, + "step": 1165 + }, + { + "epoch": 0.2775034211935503, + "grad_norm": 0.526039242215381, + "learning_rate": 9.69412618331782e-06, + "loss": 0.3702, + "step": 1166 + }, + { + "epoch": 0.2777414172666151, + "grad_norm": 0.4034865628814754, + "learning_rate": 9.693461982850824e-06, + "loss": 0.3582, + "step": 1167 + }, + { + "epoch": 0.2779794133396799, + "grad_norm": 0.36522029047320065, + "learning_rate": 9.692797084819265e-06, + "loss": 0.3887, + "step": 1168 + }, + { + "epoch": 0.2782174094127447, + "grad_norm": 0.40180604628740285, + "learning_rate": 9.692131489321968e-06, + "loss": 0.4349, + "step": 1169 + }, + { + "epoch": 0.2784554054858095, + "grad_norm": 0.4153489962964815, + "learning_rate": 9.691465196457852e-06, + "loss": 0.3318, + "step": 1170 + }, + { + "epoch": 0.2786934015588743, + "grad_norm": 0.3762423285238886, + "learning_rate": 9.690798206325947e-06, + "loss": 0.3601, + "step": 1171 + }, + { + "epoch": 0.2789313976319391, + "grad_norm": 0.41201247864367047, + "learning_rate": 9.690130519025382e-06, + "loss": 0.444, + "step": 1172 + }, + { + "epoch": 0.2791693937050039, + "grad_norm": 0.4139424779362683, + "learning_rate": 9.689462134655396e-06, + "loss": 0.4238, + "step": 1173 + }, + { + "epoch": 0.27940738977806867, + "grad_norm": 0.4213437892645641, + "learning_rate": 9.688793053315324e-06, + "loss": 0.3575, + "step": 1174 + }, + { + "epoch": 0.27964538585113347, + "grad_norm": 0.42068466169572954, + "learning_rate": 9.688123275104611e-06, + "loss": 0.3755, + "step": 1175 + }, + { + "epoch": 0.27988338192419826, + "grad_norm": 0.44836158304136864, + "learning_rate": 9.6874528001228e-06, + "loss": 0.4326, + "step": 1176 + }, + { + "epoch": 0.28012137799726305, + "grad_norm": 0.4033459951159842, + "learning_rate": 9.686781628469537e-06, + "loss": 0.366, + "step": 1177 + }, + { + "epoch": 0.28035937407032785, + "grad_norm": 0.427641362560405, + "learning_rate": 9.686109760244583e-06, + "loss": 0.3167, + "step": 1178 + }, + { + "epoch": 0.28059737014339264, + "grad_norm": 0.4312381613704766, + "learning_rate": 9.685437195547788e-06, + "loss": 0.4068, + "step": 1179 + }, + { + "epoch": 0.28083536621645744, + "grad_norm": 0.38271427698464, + "learning_rate": 9.684763934479116e-06, + "loss": 0.3542, + "step": 1180 + }, + { + "epoch": 0.28107336228952223, + "grad_norm": 0.40970093005177965, + "learning_rate": 9.684089977138625e-06, + "loss": 0.3241, + "step": 1181 + }, + { + "epoch": 0.281311358362587, + "grad_norm": 0.47135849095032084, + "learning_rate": 9.683415323626487e-06, + "loss": 0.3948, + "step": 1182 + }, + { + "epoch": 0.2815493544356518, + "grad_norm": 0.4150750744550006, + "learning_rate": 9.682739974042967e-06, + "loss": 0.4717, + "step": 1183 + }, + { + "epoch": 0.2817873505087166, + "grad_norm": 0.3988087657033761, + "learning_rate": 9.682063928488444e-06, + "loss": 0.3406, + "step": 1184 + }, + { + "epoch": 0.2820253465817814, + "grad_norm": 0.3730302298116086, + "learning_rate": 9.68138718706339e-06, + "loss": 0.3291, + "step": 1185 + }, + { + "epoch": 0.2822633426548462, + "grad_norm": 0.41362346247623033, + "learning_rate": 9.68070974986839e-06, + "loss": 0.4121, + "step": 1186 + }, + { + "epoch": 0.282501338727911, + "grad_norm": 0.4049749435958661, + "learning_rate": 9.680031617004127e-06, + "loss": 0.4326, + "step": 1187 + }, + { + "epoch": 0.2827393348009758, + "grad_norm": 0.43281535830805595, + "learning_rate": 9.679352788571385e-06, + "loss": 0.3082, + "step": 1188 + }, + { + "epoch": 0.2829773308740406, + "grad_norm": 0.38623462120774466, + "learning_rate": 9.678673264671057e-06, + "loss": 0.3458, + "step": 1189 + }, + { + "epoch": 0.2832153269471054, + "grad_norm": 0.40708256399448933, + "learning_rate": 9.677993045404138e-06, + "loss": 0.4467, + "step": 1190 + }, + { + "epoch": 0.2834533230201702, + "grad_norm": 0.3996160590648185, + "learning_rate": 9.677312130871724e-06, + "loss": 0.3843, + "step": 1191 + }, + { + "epoch": 0.283691319093235, + "grad_norm": 0.43448067080567276, + "learning_rate": 9.676630521175017e-06, + "loss": 0.3182, + "step": 1192 + }, + { + "epoch": 0.28392931516629977, + "grad_norm": 0.42327090704304354, + "learning_rate": 9.67594821641532e-06, + "loss": 0.3932, + "step": 1193 + }, + { + "epoch": 0.28416731123936456, + "grad_norm": 0.39790381975349265, + "learning_rate": 9.675265216694041e-06, + "loss": 0.4357, + "step": 1194 + }, + { + "epoch": 0.28440530731242936, + "grad_norm": 0.42504381205655906, + "learning_rate": 9.67458152211269e-06, + "loss": 0.3491, + "step": 1195 + }, + { + "epoch": 0.28464330338549415, + "grad_norm": 0.37577693430446824, + "learning_rate": 9.673897132772881e-06, + "loss": 0.3572, + "step": 1196 + }, + { + "epoch": 0.28488129945855895, + "grad_norm": 0.38908763308437694, + "learning_rate": 9.67321204877633e-06, + "loss": 0.4457, + "step": 1197 + }, + { + "epoch": 0.28511929553162374, + "grad_norm": 0.4153450452315847, + "learning_rate": 9.672526270224861e-06, + "loss": 0.4017, + "step": 1198 + }, + { + "epoch": 0.28535729160468853, + "grad_norm": 0.3652029600299754, + "learning_rate": 9.671839797220394e-06, + "loss": 0.312, + "step": 1199 + }, + { + "epoch": 0.28559528767775333, + "grad_norm": 0.3965275761201531, + "learning_rate": 9.67115262986496e-06, + "loss": 0.3594, + "step": 1200 + }, + { + "epoch": 0.2858332837508181, + "grad_norm": 0.3978539830135749, + "learning_rate": 9.670464768260684e-06, + "loss": 0.433, + "step": 1201 + }, + { + "epoch": 0.2860712798238829, + "grad_norm": 0.3911610545469143, + "learning_rate": 9.669776212509802e-06, + "loss": 0.3459, + "step": 1202 + }, + { + "epoch": 0.2863092758969477, + "grad_norm": 0.4563276723079206, + "learning_rate": 9.669086962714651e-06, + "loss": 0.3656, + "step": 1203 + }, + { + "epoch": 0.2865472719700125, + "grad_norm": 0.37157585507019325, + "learning_rate": 9.66839701897767e-06, + "loss": 0.4093, + "step": 1204 + }, + { + "epoch": 0.2867852680430773, + "grad_norm": 0.43109471368785796, + "learning_rate": 9.667706381401401e-06, + "loss": 0.4173, + "step": 1205 + }, + { + "epoch": 0.2870232641161421, + "grad_norm": 0.4106000046269324, + "learning_rate": 9.667015050088489e-06, + "loss": 0.3157, + "step": 1206 + }, + { + "epoch": 0.2872612601892069, + "grad_norm": 0.49437005628521224, + "learning_rate": 9.666323025141687e-06, + "loss": 0.4168, + "step": 1207 + }, + { + "epoch": 0.2874992562622717, + "grad_norm": 0.423786447227811, + "learning_rate": 9.66563030666384e-06, + "loss": 0.4489, + "step": 1208 + }, + { + "epoch": 0.2877372523353365, + "grad_norm": 0.41056267338791924, + "learning_rate": 9.66493689475791e-06, + "loss": 0.3675, + "step": 1209 + }, + { + "epoch": 0.2879752484084013, + "grad_norm": 0.38996061322766, + "learning_rate": 9.664242789526952e-06, + "loss": 0.3332, + "step": 1210 + }, + { + "epoch": 0.28821324448146607, + "grad_norm": 0.38150783205902883, + "learning_rate": 9.663547991074129e-06, + "loss": 0.3972, + "step": 1211 + }, + { + "epoch": 0.28845124055453086, + "grad_norm": 0.4577771253949555, + "learning_rate": 9.662852499502702e-06, + "loss": 0.4332, + "step": 1212 + }, + { + "epoch": 0.28868923662759566, + "grad_norm": 0.39041968301837926, + "learning_rate": 9.66215631491604e-06, + "loss": 0.3268, + "step": 1213 + }, + { + "epoch": 0.28892723270066045, + "grad_norm": 0.39588825022995383, + "learning_rate": 9.661459437417616e-06, + "loss": 0.3653, + "step": 1214 + }, + { + "epoch": 0.28916522877372525, + "grad_norm": 0.38238412045047987, + "learning_rate": 9.660761867110997e-06, + "loss": 0.4436, + "step": 1215 + }, + { + "epoch": 0.28940322484679004, + "grad_norm": 0.4388597314826315, + "learning_rate": 9.660063604099866e-06, + "loss": 0.3574, + "step": 1216 + }, + { + "epoch": 0.28964122091985484, + "grad_norm": 0.3979764946768326, + "learning_rate": 9.659364648487997e-06, + "loss": 0.3016, + "step": 1217 + }, + { + "epoch": 0.28987921699291963, + "grad_norm": 0.38473163200776395, + "learning_rate": 9.658665000379275e-06, + "loss": 0.4002, + "step": 1218 + }, + { + "epoch": 0.2901172130659844, + "grad_norm": 0.410009814193448, + "learning_rate": 9.657964659877683e-06, + "loss": 0.4234, + "step": 1219 + }, + { + "epoch": 0.2903552091390492, + "grad_norm": 0.44464576009309903, + "learning_rate": 9.657263627087312e-06, + "loss": 0.3546, + "step": 1220 + }, + { + "epoch": 0.290593205212114, + "grad_norm": 0.3867603419054925, + "learning_rate": 9.656561902112349e-06, + "loss": 0.3863, + "step": 1221 + }, + { + "epoch": 0.2908312012851788, + "grad_norm": 0.3979315475026651, + "learning_rate": 9.655859485057091e-06, + "loss": 0.4364, + "step": 1222 + }, + { + "epoch": 0.2910691973582436, + "grad_norm": 0.3855514558708465, + "learning_rate": 9.655156376025932e-06, + "loss": 0.3804, + "step": 1223 + }, + { + "epoch": 0.2913071934313084, + "grad_norm": 0.39833146644821477, + "learning_rate": 9.654452575123373e-06, + "loss": 0.3113, + "step": 1224 + }, + { + "epoch": 0.2915451895043732, + "grad_norm": 0.3873575017477939, + "learning_rate": 9.653748082454016e-06, + "loss": 0.4139, + "step": 1225 + }, + { + "epoch": 0.291783185577438, + "grad_norm": 0.4339801773731958, + "learning_rate": 9.653042898122565e-06, + "loss": 0.4278, + "step": 1226 + }, + { + "epoch": 0.2920211816505028, + "grad_norm": 0.4448702833601803, + "learning_rate": 9.652337022233829e-06, + "loss": 0.3272, + "step": 1227 + }, + { + "epoch": 0.2922591777235676, + "grad_norm": 0.42298609998301084, + "learning_rate": 9.651630454892718e-06, + "loss": 0.3342, + "step": 1228 + }, + { + "epoch": 0.29249717379663237, + "grad_norm": 0.386945622028957, + "learning_rate": 9.650923196204248e-06, + "loss": 0.4296, + "step": 1229 + }, + { + "epoch": 0.29273516986969716, + "grad_norm": 0.41831065108508075, + "learning_rate": 9.650215246273529e-06, + "loss": 0.4312, + "step": 1230 + }, + { + "epoch": 0.29297316594276196, + "grad_norm": 0.39310689224557216, + "learning_rate": 9.649506605205786e-06, + "loss": 0.3573, + "step": 1231 + }, + { + "epoch": 0.29321116201582675, + "grad_norm": 0.40652348372519026, + "learning_rate": 9.648797273106338e-06, + "loss": 0.3865, + "step": 1232 + }, + { + "epoch": 0.29344915808889155, + "grad_norm": 0.38816699963135914, + "learning_rate": 9.648087250080609e-06, + "loss": 0.4138, + "step": 1233 + }, + { + "epoch": 0.29368715416195634, + "grad_norm": 0.42050242112991676, + "learning_rate": 9.647376536234126e-06, + "loss": 0.3695, + "step": 1234 + }, + { + "epoch": 0.29392515023502114, + "grad_norm": 0.45027811269540524, + "learning_rate": 9.64666513167252e-06, + "loss": 0.3329, + "step": 1235 + }, + { + "epoch": 0.29416314630808593, + "grad_norm": 0.4324780377574711, + "learning_rate": 9.645953036501521e-06, + "loss": 0.412, + "step": 1236 + }, + { + "epoch": 0.2944011423811507, + "grad_norm": 0.4229530775773719, + "learning_rate": 9.645240250826969e-06, + "loss": 0.4417, + "step": 1237 + }, + { + "epoch": 0.2946391384542155, + "grad_norm": 0.5100450045808651, + "learning_rate": 9.644526774754794e-06, + "loss": 0.3331, + "step": 1238 + }, + { + "epoch": 0.2948771345272803, + "grad_norm": 0.39386712906627963, + "learning_rate": 9.643812608391042e-06, + "loss": 0.4048, + "step": 1239 + }, + { + "epoch": 0.2951151306003451, + "grad_norm": 0.4161776575953358, + "learning_rate": 9.643097751841854e-06, + "loss": 0.4666, + "step": 1240 + }, + { + "epoch": 0.2953531266734099, + "grad_norm": 0.40609935662404656, + "learning_rate": 9.642382205213476e-06, + "loss": 0.3557, + "step": 1241 + }, + { + "epoch": 0.2955911227464747, + "grad_norm": 0.40786129971537194, + "learning_rate": 9.641665968612254e-06, + "loss": 0.368, + "step": 1242 + }, + { + "epoch": 0.2958291188195395, + "grad_norm": 0.39892048689403614, + "learning_rate": 9.640949042144641e-06, + "loss": 0.374, + "step": 1243 + }, + { + "epoch": 0.2960671148926043, + "grad_norm": 0.42072884743876504, + "learning_rate": 9.640231425917186e-06, + "loss": 0.4409, + "step": 1244 + }, + { + "epoch": 0.2963051109656691, + "grad_norm": 0.3809774278312238, + "learning_rate": 9.63951312003655e-06, + "loss": 0.3276, + "step": 1245 + }, + { + "epoch": 0.2965431070387339, + "grad_norm": 0.4112816053420641, + "learning_rate": 9.638794124609487e-06, + "loss": 0.3342, + "step": 1246 + }, + { + "epoch": 0.29678110311179867, + "grad_norm": 0.41137858878488315, + "learning_rate": 9.63807443974286e-06, + "loss": 0.4281, + "step": 1247 + }, + { + "epoch": 0.29701909918486347, + "grad_norm": 0.4301212258272213, + "learning_rate": 9.63735406554363e-06, + "loss": 0.333, + "step": 1248 + }, + { + "epoch": 0.29725709525792826, + "grad_norm": 0.47855201036627526, + "learning_rate": 9.636633002118865e-06, + "loss": 0.3555, + "step": 1249 + }, + { + "epoch": 0.29749509133099306, + "grad_norm": 0.3926815914537034, + "learning_rate": 9.635911249575729e-06, + "loss": 0.3723, + "step": 1250 + }, + { + "epoch": 0.29773308740405785, + "grad_norm": 0.41169709632873663, + "learning_rate": 9.635188808021496e-06, + "loss": 0.443, + "step": 1251 + }, + { + "epoch": 0.29797108347712264, + "grad_norm": 0.41928978072848383, + "learning_rate": 9.634465677563537e-06, + "loss": 0.3731, + "step": 1252 + }, + { + "epoch": 0.29820907955018744, + "grad_norm": 0.4466280351365597, + "learning_rate": 9.633741858309325e-06, + "loss": 0.3359, + "step": 1253 + }, + { + "epoch": 0.29844707562325223, + "grad_norm": 0.5973113828554666, + "learning_rate": 9.633017350366441e-06, + "loss": 0.4, + "step": 1254 + }, + { + "epoch": 0.29868507169631703, + "grad_norm": 0.4181046582171295, + "learning_rate": 9.632292153842565e-06, + "loss": 0.4306, + "step": 1255 + }, + { + "epoch": 0.2989230677693818, + "grad_norm": 0.4310021297808129, + "learning_rate": 9.631566268845476e-06, + "loss": 0.3564, + "step": 1256 + }, + { + "epoch": 0.2991610638424466, + "grad_norm": 0.38843714102902427, + "learning_rate": 9.630839695483059e-06, + "loss": 0.3675, + "step": 1257 + }, + { + "epoch": 0.2993990599155114, + "grad_norm": 0.40697877470197674, + "learning_rate": 9.630112433863304e-06, + "loss": 0.4735, + "step": 1258 + }, + { + "epoch": 0.2996370559885762, + "grad_norm": 0.4383586613694374, + "learning_rate": 9.629384484094296e-06, + "loss": 0.3543, + "step": 1259 + }, + { + "epoch": 0.299875052061641, + "grad_norm": 0.44985065245385947, + "learning_rate": 9.628655846284228e-06, + "loss": 0.3336, + "step": 1260 + }, + { + "epoch": 0.3001130481347058, + "grad_norm": 0.394156383482113, + "learning_rate": 9.627926520541395e-06, + "loss": 0.3836, + "step": 1261 + }, + { + "epoch": 0.3003510442077706, + "grad_norm": 0.48348036503403014, + "learning_rate": 9.627196506974192e-06, + "loss": 0.4122, + "step": 1262 + }, + { + "epoch": 0.3005890402808354, + "grad_norm": 0.4046212158723506, + "learning_rate": 9.626465805691117e-06, + "loss": 0.3412, + "step": 1263 + }, + { + "epoch": 0.3008270363539002, + "grad_norm": 0.4637536508877546, + "learning_rate": 9.625734416800768e-06, + "loss": 0.3617, + "step": 1264 + }, + { + "epoch": 0.301065032426965, + "grad_norm": 0.3696735431596717, + "learning_rate": 9.625002340411851e-06, + "loss": 0.421, + "step": 1265 + }, + { + "epoch": 0.30130302850002977, + "grad_norm": 0.44039294060962975, + "learning_rate": 9.624269576633168e-06, + "loss": 0.342, + "step": 1266 + }, + { + "epoch": 0.30154102457309456, + "grad_norm": 0.4066558094391624, + "learning_rate": 9.623536125573628e-06, + "loss": 0.3259, + "step": 1267 + }, + { + "epoch": 0.30177902064615936, + "grad_norm": 0.4228494081970754, + "learning_rate": 9.622801987342239e-06, + "loss": 0.3966, + "step": 1268 + }, + { + "epoch": 0.30201701671922415, + "grad_norm": 0.3929044823680207, + "learning_rate": 9.622067162048111e-06, + "loss": 0.4214, + "step": 1269 + }, + { + "epoch": 0.30225501279228895, + "grad_norm": 0.4364618202087389, + "learning_rate": 9.62133164980046e-06, + "loss": 0.3539, + "step": 1270 + }, + { + "epoch": 0.30249300886535374, + "grad_norm": 0.3973240447603393, + "learning_rate": 9.620595450708598e-06, + "loss": 0.3228, + "step": 1271 + }, + { + "epoch": 0.30273100493841854, + "grad_norm": 0.423749076343449, + "learning_rate": 9.619858564881945e-06, + "loss": 0.4197, + "step": 1272 + }, + { + "epoch": 0.30296900101148333, + "grad_norm": 0.42928761287682166, + "learning_rate": 9.61912099243002e-06, + "loss": 0.3675, + "step": 1273 + }, + { + "epoch": 0.3032069970845481, + "grad_norm": 0.4262017901034838, + "learning_rate": 9.618382733462443e-06, + "loss": 0.3333, + "step": 1274 + }, + { + "epoch": 0.3034449931576129, + "grad_norm": 0.41851797958081777, + "learning_rate": 9.617643788088938e-06, + "loss": 0.4031, + "step": 1275 + }, + { + "epoch": 0.3036829892306777, + "grad_norm": 0.44040318016484536, + "learning_rate": 9.616904156419332e-06, + "loss": 0.4259, + "step": 1276 + }, + { + "epoch": 0.3039209853037425, + "grad_norm": 0.41176036786394593, + "learning_rate": 9.616163838563551e-06, + "loss": 0.323, + "step": 1277 + }, + { + "epoch": 0.3041589813768073, + "grad_norm": 0.39969851565063835, + "learning_rate": 9.615422834631627e-06, + "loss": 0.311, + "step": 1278 + }, + { + "epoch": 0.3043969774498721, + "grad_norm": 0.4226290204089705, + "learning_rate": 9.614681144733688e-06, + "loss": 0.4125, + "step": 1279 + }, + { + "epoch": 0.3046349735229369, + "grad_norm": 0.43676015567681714, + "learning_rate": 9.61393876897997e-06, + "loss": 0.4041, + "step": 1280 + }, + { + "epoch": 0.3048729695960017, + "grad_norm": 0.3863010410381727, + "learning_rate": 9.613195707480808e-06, + "loss": 0.3125, + "step": 1281 + }, + { + "epoch": 0.3051109656690665, + "grad_norm": 0.41088520843304066, + "learning_rate": 9.612451960346636e-06, + "loss": 0.3645, + "step": 1282 + }, + { + "epoch": 0.3053489617421313, + "grad_norm": 0.4163362500331567, + "learning_rate": 9.611707527688e-06, + "loss": 0.4114, + "step": 1283 + }, + { + "epoch": 0.30558695781519607, + "grad_norm": 0.4627242806057053, + "learning_rate": 9.610962409615534e-06, + "loss": 0.3343, + "step": 1284 + }, + { + "epoch": 0.30582495388826086, + "grad_norm": 0.4027539448469817, + "learning_rate": 9.610216606239987e-06, + "loss": 0.3279, + "step": 1285 + }, + { + "epoch": 0.30606294996132566, + "grad_norm": 0.4145181397037847, + "learning_rate": 9.609470117672199e-06, + "loss": 0.4067, + "step": 1286 + }, + { + "epoch": 0.30630094603439045, + "grad_norm": 0.41297813308066056, + "learning_rate": 9.608722944023119e-06, + "loss": 0.3966, + "step": 1287 + }, + { + "epoch": 0.30653894210745525, + "grad_norm": 0.410910470454594, + "learning_rate": 9.607975085403796e-06, + "loss": 0.3297, + "step": 1288 + }, + { + "epoch": 0.30677693818052004, + "grad_norm": 0.3854845755046187, + "learning_rate": 9.607226541925379e-06, + "loss": 0.3767, + "step": 1289 + }, + { + "epoch": 0.30701493425358484, + "grad_norm": 0.4443276950244607, + "learning_rate": 9.60647731369912e-06, + "loss": 0.4304, + "step": 1290 + }, + { + "epoch": 0.30725293032664963, + "grad_norm": 0.48085590127195105, + "learning_rate": 9.605727400836373e-06, + "loss": 0.3869, + "step": 1291 + }, + { + "epoch": 0.3074909263997144, + "grad_norm": 0.421202050029993, + "learning_rate": 9.604976803448596e-06, + "loss": 0.3321, + "step": 1292 + }, + { + "epoch": 0.3077289224727792, + "grad_norm": 0.4177390717152937, + "learning_rate": 9.604225521647343e-06, + "loss": 0.4087, + "step": 1293 + }, + { + "epoch": 0.307966918545844, + "grad_norm": 0.38933105632433157, + "learning_rate": 9.603473555544277e-06, + "loss": 0.4267, + "step": 1294 + }, + { + "epoch": 0.3082049146189088, + "grad_norm": 0.4249818846999146, + "learning_rate": 9.602720905251153e-06, + "loss": 0.3674, + "step": 1295 + }, + { + "epoch": 0.3084429106919736, + "grad_norm": 0.3917934403028404, + "learning_rate": 9.601967570879837e-06, + "loss": 0.3504, + "step": 1296 + }, + { + "epoch": 0.3086809067650384, + "grad_norm": 0.4115539403873786, + "learning_rate": 9.601213552542295e-06, + "loss": 0.4318, + "step": 1297 + }, + { + "epoch": 0.3089189028381032, + "grad_norm": 0.40804809857958424, + "learning_rate": 9.600458850350588e-06, + "loss": 0.3798, + "step": 1298 + }, + { + "epoch": 0.309156898911168, + "grad_norm": 0.4101236081509535, + "learning_rate": 9.599703464416888e-06, + "loss": 0.322, + "step": 1299 + }, + { + "epoch": 0.3093948949842328, + "grad_norm": 0.40151490793067635, + "learning_rate": 9.598947394853459e-06, + "loss": 0.3762, + "step": 1300 + }, + { + "epoch": 0.3096328910572976, + "grad_norm": 0.4266339246416687, + "learning_rate": 9.598190641772678e-06, + "loss": 0.4228, + "step": 1301 + }, + { + "epoch": 0.30987088713036237, + "grad_norm": 0.43973887455955873, + "learning_rate": 9.597433205287013e-06, + "loss": 0.3686, + "step": 1302 + }, + { + "epoch": 0.31010888320342717, + "grad_norm": 0.3914498413085817, + "learning_rate": 9.596675085509037e-06, + "loss": 0.3618, + "step": 1303 + }, + { + "epoch": 0.31034687927649196, + "grad_norm": 0.3984178847846324, + "learning_rate": 9.595916282551429e-06, + "loss": 0.4209, + "step": 1304 + }, + { + "epoch": 0.31058487534955675, + "grad_norm": 0.39984926069708504, + "learning_rate": 9.595156796526963e-06, + "loss": 0.4215, + "step": 1305 + }, + { + "epoch": 0.31082287142262155, + "grad_norm": 0.4070793173356604, + "learning_rate": 9.59439662754852e-06, + "loss": 0.3554, + "step": 1306 + }, + { + "epoch": 0.31106086749568634, + "grad_norm": 0.3874699043146027, + "learning_rate": 9.593635775729075e-06, + "loss": 0.3398, + "step": 1307 + }, + { + "epoch": 0.31129886356875114, + "grad_norm": 0.39786057856428775, + "learning_rate": 9.592874241181715e-06, + "loss": 0.4606, + "step": 1308 + }, + { + "epoch": 0.31153685964181593, + "grad_norm": 0.4280746953470422, + "learning_rate": 9.59211202401962e-06, + "loss": 0.3604, + "step": 1309 + }, + { + "epoch": 0.3117748557148807, + "grad_norm": 0.4021864469571394, + "learning_rate": 9.591349124356075e-06, + "loss": 0.3332, + "step": 1310 + }, + { + "epoch": 0.3120128517879455, + "grad_norm": 0.39554183184913133, + "learning_rate": 9.590585542304466e-06, + "loss": 0.3999, + "step": 1311 + }, + { + "epoch": 0.3122508478610103, + "grad_norm": 0.4424536440928366, + "learning_rate": 9.58982127797828e-06, + "loss": 0.4063, + "step": 1312 + }, + { + "epoch": 0.3124888439340751, + "grad_norm": 0.45440939559298216, + "learning_rate": 9.589056331491103e-06, + "loss": 0.3491, + "step": 1313 + }, + { + "epoch": 0.3127268400071399, + "grad_norm": 0.39330929295389805, + "learning_rate": 9.58829070295663e-06, + "loss": 0.3901, + "step": 1314 + }, + { + "epoch": 0.3129648360802047, + "grad_norm": 0.38833530931998134, + "learning_rate": 9.587524392488647e-06, + "loss": 0.4466, + "step": 1315 + }, + { + "epoch": 0.3132028321532695, + "grad_norm": 0.44664852368249736, + "learning_rate": 9.586757400201052e-06, + "loss": 0.3538, + "step": 1316 + }, + { + "epoch": 0.3134408282263343, + "grad_norm": 0.40740414428696925, + "learning_rate": 9.585989726207837e-06, + "loss": 0.3446, + "step": 1317 + }, + { + "epoch": 0.3136788242993991, + "grad_norm": 0.38148457875580716, + "learning_rate": 9.585221370623095e-06, + "loss": 0.3682, + "step": 1318 + }, + { + "epoch": 0.3139168203724639, + "grad_norm": 0.43478324832719745, + "learning_rate": 9.584452333561024e-06, + "loss": 0.424, + "step": 1319 + }, + { + "epoch": 0.3141548164455287, + "grad_norm": 0.4000748087214559, + "learning_rate": 9.583682615135923e-06, + "loss": 0.3392, + "step": 1320 + }, + { + "epoch": 0.31439281251859347, + "grad_norm": 0.3835145914512324, + "learning_rate": 9.58291221546219e-06, + "loss": 0.3482, + "step": 1321 + }, + { + "epoch": 0.31463080859165826, + "grad_norm": 0.45176179860066545, + "learning_rate": 9.582141134654327e-06, + "loss": 0.4232, + "step": 1322 + }, + { + "epoch": 0.31486880466472306, + "grad_norm": 0.4268907832728068, + "learning_rate": 9.581369372826933e-06, + "loss": 0.4082, + "step": 1323 + }, + { + "epoch": 0.31510680073778785, + "grad_norm": 0.41052383905112105, + "learning_rate": 9.580596930094716e-06, + "loss": 0.3438, + "step": 1324 + }, + { + "epoch": 0.31534479681085265, + "grad_norm": 0.37945978663945285, + "learning_rate": 9.579823806572474e-06, + "loss": 0.4025, + "step": 1325 + }, + { + "epoch": 0.31558279288391744, + "grad_norm": 0.46441757305264597, + "learning_rate": 9.579050002375115e-06, + "loss": 0.4172, + "step": 1326 + }, + { + "epoch": 0.31582078895698223, + "grad_norm": 0.4393276338549456, + "learning_rate": 9.578275517617646e-06, + "loss": 0.3395, + "step": 1327 + }, + { + "epoch": 0.31605878503004703, + "grad_norm": 0.41000616260473466, + "learning_rate": 9.577500352415174e-06, + "loss": 0.3254, + "step": 1328 + }, + { + "epoch": 0.3162967811031118, + "grad_norm": 0.420002315041422, + "learning_rate": 9.576724506882908e-06, + "loss": 0.4023, + "step": 1329 + }, + { + "epoch": 0.3165347771761766, + "grad_norm": 0.4236036611226363, + "learning_rate": 9.575947981136158e-06, + "loss": 0.367, + "step": 1330 + }, + { + "epoch": 0.3167727732492414, + "grad_norm": 0.4483840411549537, + "learning_rate": 9.575170775290333e-06, + "loss": 0.366, + "step": 1331 + }, + { + "epoch": 0.3170107693223062, + "grad_norm": 0.4313647137240431, + "learning_rate": 9.574392889460947e-06, + "loss": 0.3813, + "step": 1332 + }, + { + "epoch": 0.317248765395371, + "grad_norm": 0.38650979945066577, + "learning_rate": 9.573614323763613e-06, + "loss": 0.4281, + "step": 1333 + }, + { + "epoch": 0.3174867614684358, + "grad_norm": 0.41737500297562424, + "learning_rate": 9.572835078314044e-06, + "loss": 0.3513, + "step": 1334 + }, + { + "epoch": 0.3177247575415006, + "grad_norm": 0.4348449187581223, + "learning_rate": 9.572055153228056e-06, + "loss": 0.3564, + "step": 1335 + }, + { + "epoch": 0.3179627536145654, + "grad_norm": 0.39019537145273137, + "learning_rate": 9.571274548621566e-06, + "loss": 0.3876, + "step": 1336 + }, + { + "epoch": 0.3182007496876302, + "grad_norm": 0.4246028933708349, + "learning_rate": 9.570493264610589e-06, + "loss": 0.397, + "step": 1337 + }, + { + "epoch": 0.318438745760695, + "grad_norm": 0.39930253543441263, + "learning_rate": 9.569711301311247e-06, + "loss": 0.3201, + "step": 1338 + }, + { + "epoch": 0.31867674183375977, + "grad_norm": 0.45407334734341276, + "learning_rate": 9.568928658839754e-06, + "loss": 0.3434, + "step": 1339 + }, + { + "epoch": 0.31891473790682456, + "grad_norm": 0.39643025758667066, + "learning_rate": 9.568145337312432e-06, + "loss": 0.4361, + "step": 1340 + }, + { + "epoch": 0.31915273397988936, + "grad_norm": 0.421815069567987, + "learning_rate": 9.567361336845704e-06, + "loss": 0.3701, + "step": 1341 + }, + { + "epoch": 0.31939073005295415, + "grad_norm": 0.38833337925881534, + "learning_rate": 9.566576657556089e-06, + "loss": 0.3495, + "step": 1342 + }, + { + "epoch": 0.31962872612601895, + "grad_norm": 0.4180768293262518, + "learning_rate": 9.565791299560211e-06, + "loss": 0.397, + "step": 1343 + }, + { + "epoch": 0.31986672219908374, + "grad_norm": 0.4164700070432731, + "learning_rate": 9.565005262974795e-06, + "loss": 0.4271, + "step": 1344 + }, + { + "epoch": 0.32010471827214854, + "grad_norm": 0.4468347817517033, + "learning_rate": 9.564218547916664e-06, + "loss": 0.3495, + "step": 1345 + }, + { + "epoch": 0.32034271434521333, + "grad_norm": 0.40663558186072163, + "learning_rate": 9.563431154502742e-06, + "loss": 0.3238, + "step": 1346 + }, + { + "epoch": 0.3205807104182781, + "grad_norm": 0.3817636234384891, + "learning_rate": 9.562643082850058e-06, + "loss": 0.4335, + "step": 1347 + }, + { + "epoch": 0.3208187064913429, + "grad_norm": 0.4623172946195981, + "learning_rate": 9.561854333075737e-06, + "loss": 0.3936, + "step": 1348 + }, + { + "epoch": 0.3210567025644077, + "grad_norm": 0.41347372584642295, + "learning_rate": 9.561064905297007e-06, + "loss": 0.3192, + "step": 1349 + }, + { + "epoch": 0.3212946986374725, + "grad_norm": 0.45698235146730654, + "learning_rate": 9.560274799631196e-06, + "loss": 0.4274, + "step": 1350 + }, + { + "epoch": 0.3215326947105373, + "grad_norm": 0.4125490994148022, + "learning_rate": 9.559484016195734e-06, + "loss": 0.4421, + "step": 1351 + }, + { + "epoch": 0.3217706907836021, + "grad_norm": 0.3839103030987149, + "learning_rate": 9.558692555108153e-06, + "loss": 0.3393, + "step": 1352 + }, + { + "epoch": 0.3220086868566669, + "grad_norm": 0.4091260775678065, + "learning_rate": 9.557900416486082e-06, + "loss": 0.3491, + "step": 1353 + }, + { + "epoch": 0.3222466829297317, + "grad_norm": 0.3807018578165834, + "learning_rate": 9.55710760044725e-06, + "loss": 0.4185, + "step": 1354 + }, + { + "epoch": 0.3224846790027965, + "grad_norm": 0.40209441835271953, + "learning_rate": 9.556314107109492e-06, + "loss": 0.4234, + "step": 1355 + }, + { + "epoch": 0.3227226750758612, + "grad_norm": 0.40937014075451966, + "learning_rate": 9.555519936590739e-06, + "loss": 0.3316, + "step": 1356 + }, + { + "epoch": 0.322960671148926, + "grad_norm": 0.3884304266780269, + "learning_rate": 9.554725089009028e-06, + "loss": 0.4138, + "step": 1357 + }, + { + "epoch": 0.3231986672219908, + "grad_norm": 0.44485363699581787, + "learning_rate": 9.553929564482486e-06, + "loss": 0.4333, + "step": 1358 + }, + { + "epoch": 0.3234366632950556, + "grad_norm": 0.3994706589980082, + "learning_rate": 9.553133363129354e-06, + "loss": 0.3354, + "step": 1359 + }, + { + "epoch": 0.3236746593681204, + "grad_norm": 0.38575800184976405, + "learning_rate": 9.552336485067966e-06, + "loss": 0.3318, + "step": 1360 + }, + { + "epoch": 0.3239126554411852, + "grad_norm": 0.38907588895075834, + "learning_rate": 9.551538930416757e-06, + "loss": 0.3931, + "step": 1361 + }, + { + "epoch": 0.32415065151425, + "grad_norm": 0.4005989165863969, + "learning_rate": 9.550740699294263e-06, + "loss": 0.4475, + "step": 1362 + }, + { + "epoch": 0.3243886475873148, + "grad_norm": 0.3978604964741817, + "learning_rate": 9.54994179181912e-06, + "loss": 0.3326, + "step": 1363 + }, + { + "epoch": 0.3246266436603796, + "grad_norm": 0.41103874966576015, + "learning_rate": 9.549142208110069e-06, + "loss": 0.3415, + "step": 1364 + }, + { + "epoch": 0.32486463973344437, + "grad_norm": 0.407764483418257, + "learning_rate": 9.548341948285945e-06, + "loss": 0.4316, + "step": 1365 + }, + { + "epoch": 0.32510263580650917, + "grad_norm": 0.41035785125072904, + "learning_rate": 9.547541012465684e-06, + "loss": 0.3737, + "step": 1366 + }, + { + "epoch": 0.32534063187957396, + "grad_norm": 0.43499657258118146, + "learning_rate": 9.54673940076833e-06, + "loss": 0.3175, + "step": 1367 + }, + { + "epoch": 0.32557862795263875, + "grad_norm": 0.4004695765130625, + "learning_rate": 9.545937113313019e-06, + "loss": 0.3881, + "step": 1368 + }, + { + "epoch": 0.32581662402570355, + "grad_norm": 0.4020663184428851, + "learning_rate": 9.545134150218993e-06, + "loss": 0.4156, + "step": 1369 + }, + { + "epoch": 0.32605462009876834, + "grad_norm": 0.40642741251741493, + "learning_rate": 9.544330511605591e-06, + "loss": 0.3247, + "step": 1370 + }, + { + "epoch": 0.32629261617183314, + "grad_norm": 0.3641358027632288, + "learning_rate": 9.543526197592255e-06, + "loss": 0.345, + "step": 1371 + }, + { + "epoch": 0.32653061224489793, + "grad_norm": 0.3989061307234226, + "learning_rate": 9.542721208298522e-06, + "loss": 0.4211, + "step": 1372 + }, + { + "epoch": 0.3267686083179627, + "grad_norm": 0.38215478134894165, + "learning_rate": 9.541915543844036e-06, + "loss": 0.3929, + "step": 1373 + }, + { + "epoch": 0.3270066043910275, + "grad_norm": 0.39691981690953293, + "learning_rate": 9.54110920434854e-06, + "loss": 0.3495, + "step": 1374 + }, + { + "epoch": 0.3272446004640923, + "grad_norm": 0.38215194964724575, + "learning_rate": 9.540302189931874e-06, + "loss": 0.385, + "step": 1375 + }, + { + "epoch": 0.3274825965371571, + "grad_norm": 0.3606406479597386, + "learning_rate": 9.539494500713982e-06, + "loss": 0.4526, + "step": 1376 + }, + { + "epoch": 0.3277205926102219, + "grad_norm": 0.40105227105638763, + "learning_rate": 9.538686136814905e-06, + "loss": 0.3464, + "step": 1377 + }, + { + "epoch": 0.3279585886832867, + "grad_norm": 0.41070873761480936, + "learning_rate": 9.537877098354787e-06, + "loss": 0.3559, + "step": 1378 + }, + { + "epoch": 0.3281965847563515, + "grad_norm": 0.40931042414878005, + "learning_rate": 9.53706738545387e-06, + "loss": 0.3865, + "step": 1379 + }, + { + "epoch": 0.3284345808294163, + "grad_norm": 0.4254957758069491, + "learning_rate": 9.536256998232496e-06, + "loss": 0.3928, + "step": 1380 + }, + { + "epoch": 0.3286725769024811, + "grad_norm": 0.4198060992786074, + "learning_rate": 9.535445936811111e-06, + "loss": 0.3368, + "step": 1381 + }, + { + "epoch": 0.3289105729755459, + "grad_norm": 0.39019263851197356, + "learning_rate": 9.53463420131026e-06, + "loss": 0.3788, + "step": 1382 + }, + { + "epoch": 0.3291485690486107, + "grad_norm": 0.3881968657734908, + "learning_rate": 9.533821791850585e-06, + "loss": 0.4478, + "step": 1383 + }, + { + "epoch": 0.32938656512167547, + "grad_norm": 0.41104330210033657, + "learning_rate": 9.533008708552829e-06, + "loss": 0.3468, + "step": 1384 + }, + { + "epoch": 0.32962456119474026, + "grad_norm": 0.4668376033750086, + "learning_rate": 9.532194951537838e-06, + "loss": 0.339, + "step": 1385 + }, + { + "epoch": 0.32986255726780506, + "grad_norm": 0.41125619080230763, + "learning_rate": 9.531380520926559e-06, + "loss": 0.3966, + "step": 1386 + }, + { + "epoch": 0.33010055334086985, + "grad_norm": 0.4471468164842655, + "learning_rate": 9.53056541684003e-06, + "loss": 0.4227, + "step": 1387 + }, + { + "epoch": 0.33033854941393465, + "grad_norm": 0.3796087714093602, + "learning_rate": 9.5297496393994e-06, + "loss": 0.3417, + "step": 1388 + }, + { + "epoch": 0.33057654548699944, + "grad_norm": 0.4097059136074371, + "learning_rate": 9.528933188725913e-06, + "loss": 0.3784, + "step": 1389 + }, + { + "epoch": 0.33081454156006423, + "grad_norm": 0.41994702732943573, + "learning_rate": 9.528116064940915e-06, + "loss": 0.4296, + "step": 1390 + }, + { + "epoch": 0.33105253763312903, + "grad_norm": 0.43876123590734234, + "learning_rate": 9.52729826816585e-06, + "loss": 0.3459, + "step": 1391 + }, + { + "epoch": 0.3312905337061938, + "grad_norm": 0.44759563573787553, + "learning_rate": 9.526479798522261e-06, + "loss": 0.3338, + "step": 1392 + }, + { + "epoch": 0.3315285297792586, + "grad_norm": 0.39997856436860535, + "learning_rate": 9.525660656131794e-06, + "loss": 0.3782, + "step": 1393 + }, + { + "epoch": 0.3317665258523234, + "grad_norm": 0.4496886369666572, + "learning_rate": 9.524840841116194e-06, + "loss": 0.4685, + "step": 1394 + }, + { + "epoch": 0.3320045219253882, + "grad_norm": 0.4074594148653707, + "learning_rate": 9.524020353597306e-06, + "loss": 0.3464, + "step": 1395 + }, + { + "epoch": 0.332242517998453, + "grad_norm": 0.4240780629108365, + "learning_rate": 9.523199193697076e-06, + "loss": 0.3664, + "step": 1396 + }, + { + "epoch": 0.3324805140715178, + "grad_norm": 0.3724547733959635, + "learning_rate": 9.522377361537546e-06, + "loss": 0.4263, + "step": 1397 + }, + { + "epoch": 0.3327185101445826, + "grad_norm": 0.3848768663283912, + "learning_rate": 9.521554857240863e-06, + "loss": 0.3997, + "step": 1398 + }, + { + "epoch": 0.3329565062176474, + "grad_norm": 0.3747787306372805, + "learning_rate": 9.520731680929268e-06, + "loss": 0.3324, + "step": 1399 + }, + { + "epoch": 0.3331945022907122, + "grad_norm": 0.40168499777226635, + "learning_rate": 9.51990783272511e-06, + "loss": 0.3988, + "step": 1400 + }, + { + "epoch": 0.333432498363777, + "grad_norm": 0.37263893468315484, + "learning_rate": 9.519083312750829e-06, + "loss": 0.433, + "step": 1401 + }, + { + "epoch": 0.33367049443684177, + "grad_norm": 0.39450747544417814, + "learning_rate": 9.518258121128971e-06, + "loss": 0.3332, + "step": 1402 + }, + { + "epoch": 0.33390849050990656, + "grad_norm": 0.4159109527655681, + "learning_rate": 9.517432257982182e-06, + "loss": 0.3519, + "step": 1403 + }, + { + "epoch": 0.33414648658297136, + "grad_norm": 0.3758964436379061, + "learning_rate": 9.516605723433202e-06, + "loss": 0.404, + "step": 1404 + }, + { + "epoch": 0.33438448265603615, + "grad_norm": 0.4167452335375169, + "learning_rate": 9.515778517604876e-06, + "loss": 0.3771, + "step": 1405 + }, + { + "epoch": 0.33462247872910095, + "grad_norm": 0.4231708121211018, + "learning_rate": 9.514950640620148e-06, + "loss": 0.3153, + "step": 1406 + }, + { + "epoch": 0.33486047480216574, + "grad_norm": 0.4358774633836512, + "learning_rate": 9.514122092602061e-06, + "loss": 0.3527, + "step": 1407 + }, + { + "epoch": 0.33509847087523054, + "grad_norm": 0.3773178127174748, + "learning_rate": 9.513292873673757e-06, + "loss": 0.4515, + "step": 1408 + }, + { + "epoch": 0.33533646694829533, + "grad_norm": 0.39844835991429833, + "learning_rate": 9.512462983958476e-06, + "loss": 0.3841, + "step": 1409 + }, + { + "epoch": 0.3355744630213601, + "grad_norm": 0.41246380594274273, + "learning_rate": 9.511632423579564e-06, + "loss": 0.312, + "step": 1410 + }, + { + "epoch": 0.3358124590944249, + "grad_norm": 0.39771084162538206, + "learning_rate": 9.510801192660463e-06, + "loss": 0.4066, + "step": 1411 + }, + { + "epoch": 0.3360504551674897, + "grad_norm": 0.40735314602043377, + "learning_rate": 9.509969291324711e-06, + "loss": 0.4026, + "step": 1412 + }, + { + "epoch": 0.3362884512405545, + "grad_norm": 0.4306392515796982, + "learning_rate": 9.509136719695952e-06, + "loss": 0.3391, + "step": 1413 + }, + { + "epoch": 0.3365264473136193, + "grad_norm": 0.38787745703798865, + "learning_rate": 9.508303477897925e-06, + "loss": 0.3623, + "step": 1414 + }, + { + "epoch": 0.3367644433866841, + "grad_norm": 0.38221096427891993, + "learning_rate": 9.507469566054472e-06, + "loss": 0.4347, + "step": 1415 + }, + { + "epoch": 0.3370024394597489, + "grad_norm": 0.39794787975787377, + "learning_rate": 9.506634984289532e-06, + "loss": 0.3685, + "step": 1416 + }, + { + "epoch": 0.3372404355328137, + "grad_norm": 0.42203140787297544, + "learning_rate": 9.505799732727144e-06, + "loss": 0.3255, + "step": 1417 + }, + { + "epoch": 0.3374784316058785, + "grad_norm": 0.4130150729651241, + "learning_rate": 9.504963811491448e-06, + "loss": 0.3852, + "step": 1418 + }, + { + "epoch": 0.3377164276789433, + "grad_norm": 0.4035329355042889, + "learning_rate": 9.50412722070668e-06, + "loss": 0.4231, + "step": 1419 + }, + { + "epoch": 0.33795442375200807, + "grad_norm": 0.42264021258045936, + "learning_rate": 9.503289960497184e-06, + "loss": 0.3096, + "step": 1420 + }, + { + "epoch": 0.33819241982507287, + "grad_norm": 0.41033520433424203, + "learning_rate": 9.502452030987392e-06, + "loss": 0.3462, + "step": 1421 + }, + { + "epoch": 0.33843041589813766, + "grad_norm": 0.3734652547743581, + "learning_rate": 9.501613432301843e-06, + "loss": 0.4026, + "step": 1422 + }, + { + "epoch": 0.33866841197120245, + "grad_norm": 0.44488771827168927, + "learning_rate": 9.500774164565172e-06, + "loss": 0.403, + "step": 1423 + }, + { + "epoch": 0.33890640804426725, + "grad_norm": 0.41540692997233103, + "learning_rate": 9.499934227902119e-06, + "loss": 0.3038, + "step": 1424 + }, + { + "epoch": 0.33914440411733204, + "grad_norm": 0.3759433684152908, + "learning_rate": 9.499093622437516e-06, + "loss": 0.3459, + "step": 1425 + }, + { + "epoch": 0.33938240019039684, + "grad_norm": 0.3790112278132182, + "learning_rate": 9.498252348296298e-06, + "loss": 0.4454, + "step": 1426 + }, + { + "epoch": 0.33962039626346163, + "grad_norm": 0.4478868649304047, + "learning_rate": 9.4974104056035e-06, + "loss": 0.3263, + "step": 1427 + }, + { + "epoch": 0.3398583923365264, + "grad_norm": 0.3687692738652499, + "learning_rate": 9.496567794484254e-06, + "loss": 0.3447, + "step": 1428 + }, + { + "epoch": 0.3400963884095912, + "grad_norm": 0.3815394491514148, + "learning_rate": 9.495724515063795e-06, + "loss": 0.411, + "step": 1429 + }, + { + "epoch": 0.340334384482656, + "grad_norm": 0.41607171762494266, + "learning_rate": 9.494880567467454e-06, + "loss": 0.4161, + "step": 1430 + }, + { + "epoch": 0.3405723805557208, + "grad_norm": 0.39572328438968524, + "learning_rate": 9.494035951820662e-06, + "loss": 0.332, + "step": 1431 + }, + { + "epoch": 0.3408103766287856, + "grad_norm": 0.37481472394853776, + "learning_rate": 9.493190668248951e-06, + "loss": 0.3546, + "step": 1432 + }, + { + "epoch": 0.3410483727018504, + "grad_norm": 0.40603528903956776, + "learning_rate": 9.49234471687795e-06, + "loss": 0.4331, + "step": 1433 + }, + { + "epoch": 0.3412863687749152, + "grad_norm": 0.3562649954495181, + "learning_rate": 9.491498097833391e-06, + "loss": 0.3442, + "step": 1434 + }, + { + "epoch": 0.34152436484798, + "grad_norm": 0.4633907342034106, + "learning_rate": 9.490650811241098e-06, + "loss": 0.3301, + "step": 1435 + }, + { + "epoch": 0.3417623609210448, + "grad_norm": 0.4196029970855259, + "learning_rate": 9.489802857227001e-06, + "loss": 0.4032, + "step": 1436 + }, + { + "epoch": 0.3420003569941096, + "grad_norm": 0.4036864305636982, + "learning_rate": 9.488954235917129e-06, + "loss": 0.4371, + "step": 1437 + }, + { + "epoch": 0.3422383530671744, + "grad_norm": 0.4424631609113763, + "learning_rate": 9.488104947437606e-06, + "loss": 0.3401, + "step": 1438 + }, + { + "epoch": 0.34247634914023917, + "grad_norm": 0.3868950566216425, + "learning_rate": 9.487254991914655e-06, + "loss": 0.3546, + "step": 1439 + }, + { + "epoch": 0.34271434521330396, + "grad_norm": 0.3939081607602161, + "learning_rate": 9.486404369474605e-06, + "loss": 0.4171, + "step": 1440 + }, + { + "epoch": 0.34295234128636876, + "grad_norm": 0.42524765774393825, + "learning_rate": 9.485553080243877e-06, + "loss": 0.4112, + "step": 1441 + }, + { + "epoch": 0.34319033735943355, + "grad_norm": 0.40156763127279305, + "learning_rate": 9.484701124348994e-06, + "loss": 0.3195, + "step": 1442 + }, + { + "epoch": 0.34342833343249834, + "grad_norm": 0.3769325847215234, + "learning_rate": 9.483848501916578e-06, + "loss": 0.3893, + "step": 1443 + }, + { + "epoch": 0.34366632950556314, + "grad_norm": 0.38885175403043437, + "learning_rate": 9.482995213073349e-06, + "loss": 0.4137, + "step": 1444 + }, + { + "epoch": 0.34390432557862793, + "grad_norm": 0.41829331584548823, + "learning_rate": 9.482141257946128e-06, + "loss": 0.3256, + "step": 1445 + }, + { + "epoch": 0.34414232165169273, + "grad_norm": 0.39698859193541414, + "learning_rate": 9.481286636661832e-06, + "loss": 0.3461, + "step": 1446 + }, + { + "epoch": 0.3443803177247575, + "grad_norm": 0.40947635708434343, + "learning_rate": 9.480431349347482e-06, + "loss": 0.4253, + "step": 1447 + }, + { + "epoch": 0.3446183137978223, + "grad_norm": 0.4008658220372033, + "learning_rate": 9.479575396130192e-06, + "loss": 0.3822, + "step": 1448 + }, + { + "epoch": 0.3448563098708871, + "grad_norm": 0.38802614165640026, + "learning_rate": 9.478718777137178e-06, + "loss": 0.3252, + "step": 1449 + }, + { + "epoch": 0.3450943059439519, + "grad_norm": 0.40732799270557263, + "learning_rate": 9.477861492495757e-06, + "loss": 0.3823, + "step": 1450 + }, + { + "epoch": 0.3453323020170167, + "grad_norm": 0.41644066379485684, + "learning_rate": 9.47700354233334e-06, + "loss": 0.4282, + "step": 1451 + }, + { + "epoch": 0.3455702980900815, + "grad_norm": 0.4224474904646012, + "learning_rate": 9.476144926777441e-06, + "loss": 0.3412, + "step": 1452 + }, + { + "epoch": 0.3458082941631463, + "grad_norm": 0.3909474423211058, + "learning_rate": 9.475285645955672e-06, + "loss": 0.3453, + "step": 1453 + }, + { + "epoch": 0.3460462902362111, + "grad_norm": 0.3842803363987257, + "learning_rate": 9.474425699995741e-06, + "loss": 0.4122, + "step": 1454 + }, + { + "epoch": 0.3462842863092759, + "grad_norm": 0.3862238147346478, + "learning_rate": 9.473565089025463e-06, + "loss": 0.3803, + "step": 1455 + }, + { + "epoch": 0.3465222823823407, + "grad_norm": 0.39766464008162167, + "learning_rate": 9.472703813172739e-06, + "loss": 0.3174, + "step": 1456 + }, + { + "epoch": 0.34676027845540547, + "grad_norm": 0.37570231724988296, + "learning_rate": 9.47184187256558e-06, + "loss": 0.3791, + "step": 1457 + }, + { + "epoch": 0.34699827452847026, + "grad_norm": 0.4142156067032061, + "learning_rate": 9.47097926733209e-06, + "loss": 0.4024, + "step": 1458 + }, + { + "epoch": 0.34723627060153506, + "grad_norm": 0.398792512942407, + "learning_rate": 9.470115997600474e-06, + "loss": 0.3421, + "step": 1459 + }, + { + "epoch": 0.34747426667459985, + "grad_norm": 0.43012148252168575, + "learning_rate": 9.469252063499036e-06, + "loss": 0.3314, + "step": 1460 + }, + { + "epoch": 0.34771226274766465, + "grad_norm": 0.4927776656728705, + "learning_rate": 9.468387465156176e-06, + "loss": 0.4153, + "step": 1461 + }, + { + "epoch": 0.34795025882072944, + "grad_norm": 0.4096929455276192, + "learning_rate": 9.467522202700399e-06, + "loss": 0.4071, + "step": 1462 + }, + { + "epoch": 0.34818825489379424, + "grad_norm": 0.44407738784995165, + "learning_rate": 9.4666562762603e-06, + "loss": 0.3292, + "step": 1463 + }, + { + "epoch": 0.34842625096685903, + "grad_norm": 0.4261438734366603, + "learning_rate": 9.465789685964579e-06, + "loss": 0.3609, + "step": 1464 + }, + { + "epoch": 0.3486642470399238, + "grad_norm": 0.3842203365009242, + "learning_rate": 9.464922431942032e-06, + "loss": 0.4201, + "step": 1465 + }, + { + "epoch": 0.3489022431129886, + "grad_norm": 0.378083104719912, + "learning_rate": 9.464054514321554e-06, + "loss": 0.3694, + "step": 1466 + }, + { + "epoch": 0.3491402391860534, + "grad_norm": 0.4040049074787795, + "learning_rate": 9.46318593323214e-06, + "loss": 0.3564, + "step": 1467 + }, + { + "epoch": 0.3493782352591182, + "grad_norm": 0.425890416302795, + "learning_rate": 9.462316688802884e-06, + "loss": 0.389, + "step": 1468 + }, + { + "epoch": 0.349616231332183, + "grad_norm": 0.3803658953106735, + "learning_rate": 9.461446781162974e-06, + "loss": 0.4331, + "step": 1469 + }, + { + "epoch": 0.3498542274052478, + "grad_norm": 0.4060084138224938, + "learning_rate": 9.4605762104417e-06, + "loss": 0.3327, + "step": 1470 + }, + { + "epoch": 0.3500922234783126, + "grad_norm": 0.640815748060444, + "learning_rate": 9.459704976768455e-06, + "loss": 0.3434, + "step": 1471 + }, + { + "epoch": 0.3503302195513774, + "grad_norm": 0.42837101898379226, + "learning_rate": 9.458833080272723e-06, + "loss": 0.424, + "step": 1472 + }, + { + "epoch": 0.3505682156244422, + "grad_norm": 0.4030658639433663, + "learning_rate": 9.457960521084087e-06, + "loss": 0.3982, + "step": 1473 + }, + { + "epoch": 0.350806211697507, + "grad_norm": 0.3984195226956701, + "learning_rate": 9.457087299332232e-06, + "loss": 0.3275, + "step": 1474 + }, + { + "epoch": 0.35104420777057177, + "grad_norm": 0.39898439381782036, + "learning_rate": 9.456213415146943e-06, + "loss": 0.3891, + "step": 1475 + }, + { + "epoch": 0.35128220384363656, + "grad_norm": 0.39964981876808653, + "learning_rate": 9.4553388686581e-06, + "loss": 0.4239, + "step": 1476 + }, + { + "epoch": 0.35152019991670136, + "grad_norm": 0.41473687287806216, + "learning_rate": 9.454463659995678e-06, + "loss": 0.3502, + "step": 1477 + }, + { + "epoch": 0.35175819598976615, + "grad_norm": 0.40780054376337166, + "learning_rate": 9.453587789289762e-06, + "loss": 0.3565, + "step": 1478 + }, + { + "epoch": 0.35199619206283095, + "grad_norm": 0.4040364947021049, + "learning_rate": 9.452711256670521e-06, + "loss": 0.4009, + "step": 1479 + }, + { + "epoch": 0.35223418813589574, + "grad_norm": 0.43117573330093506, + "learning_rate": 9.451834062268234e-06, + "loss": 0.3894, + "step": 1480 + }, + { + "epoch": 0.35247218420896054, + "grad_norm": 0.4115001277743943, + "learning_rate": 9.450956206213272e-06, + "loss": 0.3176, + "step": 1481 + }, + { + "epoch": 0.35271018028202533, + "grad_norm": 0.3928594010225446, + "learning_rate": 9.450077688636107e-06, + "loss": 0.3696, + "step": 1482 + }, + { + "epoch": 0.3529481763550901, + "grad_norm": 0.3924725511243915, + "learning_rate": 9.449198509667307e-06, + "loss": 0.429, + "step": 1483 + }, + { + "epoch": 0.3531861724281549, + "grad_norm": 0.4026567257458122, + "learning_rate": 9.448318669437541e-06, + "loss": 0.3303, + "step": 1484 + }, + { + "epoch": 0.3534241685012197, + "grad_norm": 0.38358768538776655, + "learning_rate": 9.447438168077574e-06, + "loss": 0.3477, + "step": 1485 + }, + { + "epoch": 0.3536621645742845, + "grad_norm": 0.38097335550299927, + "learning_rate": 9.446557005718271e-06, + "loss": 0.3577, + "step": 1486 + }, + { + "epoch": 0.3539001606473493, + "grad_norm": 0.3625205031776034, + "learning_rate": 9.445675182490594e-06, + "loss": 0.4126, + "step": 1487 + }, + { + "epoch": 0.3541381567204141, + "grad_norm": 0.3598134954454926, + "learning_rate": 9.444792698525606e-06, + "loss": 0.3341, + "step": 1488 + }, + { + "epoch": 0.3543761527934789, + "grad_norm": 0.4479832164810966, + "learning_rate": 9.443909553954463e-06, + "loss": 0.3451, + "step": 1489 + }, + { + "epoch": 0.3546141488665437, + "grad_norm": 0.38803228377964066, + "learning_rate": 9.443025748908423e-06, + "loss": 0.4234, + "step": 1490 + }, + { + "epoch": 0.3548521449396085, + "grad_norm": 0.4066075957863286, + "learning_rate": 9.442141283518842e-06, + "loss": 0.3549, + "step": 1491 + }, + { + "epoch": 0.3550901410126733, + "grad_norm": 0.42403239997363124, + "learning_rate": 9.441256157917174e-06, + "loss": 0.3344, + "step": 1492 + }, + { + "epoch": 0.35532813708573807, + "grad_norm": 0.38290202011308166, + "learning_rate": 9.440370372234968e-06, + "loss": 0.3861, + "step": 1493 + }, + { + "epoch": 0.35556613315880287, + "grad_norm": 0.3890748893970797, + "learning_rate": 9.439483926603876e-06, + "loss": 0.4325, + "step": 1494 + }, + { + "epoch": 0.35580412923186766, + "grad_norm": 0.41264784304893215, + "learning_rate": 9.438596821155644e-06, + "loss": 0.3185, + "step": 1495 + }, + { + "epoch": 0.35604212530493246, + "grad_norm": 0.4150247829602727, + "learning_rate": 9.43770905602212e-06, + "loss": 0.3659, + "step": 1496 + }, + { + "epoch": 0.35628012137799725, + "grad_norm": 0.3925170437737852, + "learning_rate": 9.436820631335245e-06, + "loss": 0.4141, + "step": 1497 + }, + { + "epoch": 0.35651811745106204, + "grad_norm": 0.4286671248087841, + "learning_rate": 9.435931547227064e-06, + "loss": 0.381, + "step": 1498 + }, + { + "epoch": 0.35675611352412684, + "grad_norm": 0.4388357656753384, + "learning_rate": 9.435041803829716e-06, + "loss": 0.3278, + "step": 1499 + }, + { + "epoch": 0.35699410959719163, + "grad_norm": 0.418842687865229, + "learning_rate": 9.434151401275436e-06, + "loss": 0.3708, + "step": 1500 + }, + { + "epoch": 0.35723210567025643, + "grad_norm": 0.4811664724649453, + "learning_rate": 9.433260339696564e-06, + "loss": 0.4451, + "step": 1501 + }, + { + "epoch": 0.3574701017433212, + "grad_norm": 0.4847264758830864, + "learning_rate": 9.432368619225532e-06, + "loss": 0.3434, + "step": 1502 + }, + { + "epoch": 0.357708097816386, + "grad_norm": 0.4157855613581623, + "learning_rate": 9.43147623999487e-06, + "loss": 0.3456, + "step": 1503 + }, + { + "epoch": 0.3579460938894508, + "grad_norm": 0.39518229863812704, + "learning_rate": 9.43058320213721e-06, + "loss": 0.3947, + "step": 1504 + }, + { + "epoch": 0.3581840899625156, + "grad_norm": 0.4171833220322581, + "learning_rate": 9.42968950578528e-06, + "loss": 0.3934, + "step": 1505 + }, + { + "epoch": 0.3584220860355804, + "grad_norm": 0.46935586691478637, + "learning_rate": 9.428795151071904e-06, + "loss": 0.335, + "step": 1506 + }, + { + "epoch": 0.3586600821086452, + "grad_norm": 0.4076327299219336, + "learning_rate": 9.427900138130005e-06, + "loss": 0.3517, + "step": 1507 + }, + { + "epoch": 0.35889807818171, + "grad_norm": 0.39393198965417636, + "learning_rate": 9.427004467092604e-06, + "loss": 0.4382, + "step": 1508 + }, + { + "epoch": 0.3591360742547748, + "grad_norm": 0.4165484912582241, + "learning_rate": 9.42610813809282e-06, + "loss": 0.3512, + "step": 1509 + }, + { + "epoch": 0.3593740703278396, + "grad_norm": 0.45933344516028873, + "learning_rate": 9.425211151263871e-06, + "loss": 0.3291, + "step": 1510 + }, + { + "epoch": 0.3596120664009044, + "grad_norm": 0.3722603137882553, + "learning_rate": 9.42431350673907e-06, + "loss": 0.3777, + "step": 1511 + }, + { + "epoch": 0.35985006247396917, + "grad_norm": 0.4539895853130569, + "learning_rate": 9.42341520465183e-06, + "loss": 0.4187, + "step": 1512 + }, + { + "epoch": 0.36008805854703396, + "grad_norm": 0.4630126425642641, + "learning_rate": 9.42251624513566e-06, + "loss": 0.3322, + "step": 1513 + }, + { + "epoch": 0.36032605462009876, + "grad_norm": 0.4571830447922558, + "learning_rate": 9.421616628324168e-06, + "loss": 0.376, + "step": 1514 + }, + { + "epoch": 0.36056405069316355, + "grad_norm": 0.3805355362272922, + "learning_rate": 9.42071635435106e-06, + "loss": 0.4204, + "step": 1515 + }, + { + "epoch": 0.36080204676622835, + "grad_norm": 0.43191632003281544, + "learning_rate": 9.41981542335014e-06, + "loss": 0.3545, + "step": 1516 + }, + { + "epoch": 0.36104004283929314, + "grad_norm": 0.40851897479644855, + "learning_rate": 9.418913835455306e-06, + "loss": 0.3339, + "step": 1517 + }, + { + "epoch": 0.36127803891235793, + "grad_norm": 0.37903195693910496, + "learning_rate": 9.418011590800556e-06, + "loss": 0.3913, + "step": 1518 + }, + { + "epoch": 0.36151603498542273, + "grad_norm": 0.4520503442337851, + "learning_rate": 9.41710868951999e-06, + "loss": 0.4169, + "step": 1519 + }, + { + "epoch": 0.3617540310584875, + "grad_norm": 0.43239012543775074, + "learning_rate": 9.416205131747796e-06, + "loss": 0.3376, + "step": 1520 + }, + { + "epoch": 0.3619920271315523, + "grad_norm": 0.41380758515497446, + "learning_rate": 9.415300917618269e-06, + "loss": 0.3672, + "step": 1521 + }, + { + "epoch": 0.3622300232046171, + "grad_norm": 0.3843142207624058, + "learning_rate": 9.414396047265797e-06, + "loss": 0.4091, + "step": 1522 + }, + { + "epoch": 0.3624680192776819, + "grad_norm": 0.37767641451983713, + "learning_rate": 9.413490520824864e-06, + "loss": 0.3759, + "step": 1523 + }, + { + "epoch": 0.3627060153507467, + "grad_norm": 0.3684023556846619, + "learning_rate": 9.412584338430056e-06, + "loss": 0.3418, + "step": 1524 + }, + { + "epoch": 0.3629440114238115, + "grad_norm": 0.5087384806908768, + "learning_rate": 9.411677500216053e-06, + "loss": 0.3908, + "step": 1525 + }, + { + "epoch": 0.3631820074968763, + "grad_norm": 0.36310042219702077, + "learning_rate": 9.410770006317634e-06, + "loss": 0.4402, + "step": 1526 + }, + { + "epoch": 0.3634200035699411, + "grad_norm": 0.36444105379216124, + "learning_rate": 9.409861856869676e-06, + "loss": 0.3427, + "step": 1527 + }, + { + "epoch": 0.3636579996430059, + "grad_norm": 0.48371401020099114, + "learning_rate": 9.40895305200715e-06, + "loss": 0.3294, + "step": 1528 + }, + { + "epoch": 0.3638959957160707, + "grad_norm": 0.4123934901314048, + "learning_rate": 9.408043591865129e-06, + "loss": 0.3941, + "step": 1529 + }, + { + "epoch": 0.36413399178913547, + "grad_norm": 0.39143950957309054, + "learning_rate": 9.407133476578778e-06, + "loss": 0.3836, + "step": 1530 + }, + { + "epoch": 0.36437198786220026, + "grad_norm": 0.4513000975255898, + "learning_rate": 9.406222706283368e-06, + "loss": 0.3188, + "step": 1531 + }, + { + "epoch": 0.36460998393526506, + "grad_norm": 0.39125708494077566, + "learning_rate": 9.405311281114258e-06, + "loss": 0.3845, + "step": 1532 + }, + { + "epoch": 0.36484798000832985, + "grad_norm": 0.39160593726214166, + "learning_rate": 9.404399201206908e-06, + "loss": 0.4172, + "step": 1533 + }, + { + "epoch": 0.36508597608139465, + "grad_norm": 0.46690194148815245, + "learning_rate": 9.40348646669688e-06, + "loss": 0.3384, + "step": 1534 + }, + { + "epoch": 0.36532397215445944, + "grad_norm": 0.43376276227001326, + "learning_rate": 9.402573077719825e-06, + "loss": 0.3107, + "step": 1535 + }, + { + "epoch": 0.36556196822752424, + "grad_norm": 0.3748979225551678, + "learning_rate": 9.401659034411496e-06, + "loss": 0.3881, + "step": 1536 + }, + { + "epoch": 0.36579996430058903, + "grad_norm": 0.41357476791350894, + "learning_rate": 9.400744336907743e-06, + "loss": 0.4555, + "step": 1537 + }, + { + "epoch": 0.3660379603736538, + "grad_norm": 0.40289808520289133, + "learning_rate": 9.399828985344513e-06, + "loss": 0.3289, + "step": 1538 + }, + { + "epoch": 0.3662759564467186, + "grad_norm": 0.37195330338030785, + "learning_rate": 9.398912979857848e-06, + "loss": 0.3662, + "step": 1539 + }, + { + "epoch": 0.3665139525197834, + "grad_norm": 0.379044773247951, + "learning_rate": 9.39799632058389e-06, + "loss": 0.4099, + "step": 1540 + }, + { + "epoch": 0.3667519485928482, + "grad_norm": 0.40409817593250036, + "learning_rate": 9.397079007658878e-06, + "loss": 0.3912, + "step": 1541 + }, + { + "epoch": 0.366989944665913, + "grad_norm": 0.38093441918353926, + "learning_rate": 9.396161041219147e-06, + "loss": 0.3345, + "step": 1542 + }, + { + "epoch": 0.3672279407389778, + "grad_norm": 0.4006873027568648, + "learning_rate": 9.39524242140113e-06, + "loss": 0.386, + "step": 1543 + }, + { + "epoch": 0.3674659368120426, + "grad_norm": 0.407045960911348, + "learning_rate": 9.394323148341355e-06, + "loss": 0.4196, + "step": 1544 + }, + { + "epoch": 0.3677039328851074, + "grad_norm": 0.45641130604553176, + "learning_rate": 9.393403222176451e-06, + "loss": 0.3589, + "step": 1545 + }, + { + "epoch": 0.3679419289581722, + "grad_norm": 0.44633345529058044, + "learning_rate": 9.392482643043142e-06, + "loss": 0.3553, + "step": 1546 + }, + { + "epoch": 0.368179925031237, + "grad_norm": 0.3856503746950216, + "learning_rate": 9.391561411078245e-06, + "loss": 0.414, + "step": 1547 + }, + { + "epoch": 0.36841792110430177, + "grad_norm": 0.43687712395015343, + "learning_rate": 9.39063952641868e-06, + "loss": 0.3678, + "step": 1548 + }, + { + "epoch": 0.36865591717736657, + "grad_norm": 0.43545658546660954, + "learning_rate": 9.389716989201464e-06, + "loss": 0.3333, + "step": 1549 + }, + { + "epoch": 0.36889391325043136, + "grad_norm": 0.41050982682577164, + "learning_rate": 9.388793799563706e-06, + "loss": 0.392, + "step": 1550 + }, + { + "epoch": 0.36913190932349615, + "grad_norm": 0.37175911777408144, + "learning_rate": 9.387869957642616e-06, + "loss": 0.4177, + "step": 1551 + }, + { + "epoch": 0.36936990539656095, + "grad_norm": 0.4825250816525954, + "learning_rate": 9.3869454635755e-06, + "loss": 0.3373, + "step": 1552 + }, + { + "epoch": 0.36960790146962574, + "grad_norm": 0.46259012792132853, + "learning_rate": 9.38602031749976e-06, + "loss": 0.3139, + "step": 1553 + }, + { + "epoch": 0.36984589754269054, + "grad_norm": 0.43835976647878444, + "learning_rate": 9.385094519552896e-06, + "loss": 0.368, + "step": 1554 + }, + { + "epoch": 0.37008389361575533, + "grad_norm": 0.4476474289989832, + "learning_rate": 9.384168069872505e-06, + "loss": 0.4029, + "step": 1555 + }, + { + "epoch": 0.3703218896888201, + "grad_norm": 0.4670049546010331, + "learning_rate": 9.38324096859628e-06, + "loss": 0.3323, + "step": 1556 + }, + { + "epoch": 0.3705598857618849, + "grad_norm": 0.379594355730863, + "learning_rate": 9.382313215862009e-06, + "loss": 0.376, + "step": 1557 + }, + { + "epoch": 0.3707978818349497, + "grad_norm": 0.44935069232834013, + "learning_rate": 9.38138481180758e-06, + "loss": 0.4237, + "step": 1558 + }, + { + "epoch": 0.3710358779080145, + "grad_norm": 0.5090826653616003, + "learning_rate": 9.38045575657098e-06, + "loss": 0.3653, + "step": 1559 + }, + { + "epoch": 0.3712738739810793, + "grad_norm": 0.47245779020299394, + "learning_rate": 9.37952605029029e-06, + "loss": 0.3275, + "step": 1560 + }, + { + "epoch": 0.3715118700541441, + "grad_norm": 0.40765167623096316, + "learning_rate": 9.378595693103681e-06, + "loss": 0.4096, + "step": 1561 + }, + { + "epoch": 0.3717498661272089, + "grad_norm": 0.4459617175575773, + "learning_rate": 9.37766468514943e-06, + "loss": 0.4226, + "step": 1562 + }, + { + "epoch": 0.3719878622002737, + "grad_norm": 0.43777206352478504, + "learning_rate": 9.376733026565911e-06, + "loss": 0.3128, + "step": 1563 + }, + { + "epoch": 0.3722258582733385, + "grad_norm": 0.43171693526067784, + "learning_rate": 9.375800717491588e-06, + "loss": 0.3511, + "step": 1564 + }, + { + "epoch": 0.3724638543464033, + "grad_norm": 0.37666167095835446, + "learning_rate": 9.374867758065027e-06, + "loss": 0.4001, + "step": 1565 + }, + { + "epoch": 0.3727018504194681, + "grad_norm": 0.45130931365820925, + "learning_rate": 9.373934148424887e-06, + "loss": 0.371, + "step": 1566 + }, + { + "epoch": 0.37293984649253287, + "grad_norm": 0.4067510576017569, + "learning_rate": 9.372999888709927e-06, + "loss": 0.3598, + "step": 1567 + }, + { + "epoch": 0.37317784256559766, + "grad_norm": 0.3956103473115734, + "learning_rate": 9.372064979059001e-06, + "loss": 0.3735, + "step": 1568 + }, + { + "epoch": 0.37341583863866246, + "grad_norm": 0.38688699762267664, + "learning_rate": 9.371129419611059e-06, + "loss": 0.4281, + "step": 1569 + }, + { + "epoch": 0.37365383471172725, + "grad_norm": 0.42553445781670524, + "learning_rate": 9.37019321050515e-06, + "loss": 0.3071, + "step": 1570 + }, + { + "epoch": 0.37389183078479205, + "grad_norm": 0.43085996359356, + "learning_rate": 9.369256351880415e-06, + "loss": 0.3452, + "step": 1571 + }, + { + "epoch": 0.37412982685785684, + "grad_norm": 0.40686052760995856, + "learning_rate": 9.368318843876097e-06, + "loss": 0.4233, + "step": 1572 + }, + { + "epoch": 0.37436782293092163, + "grad_norm": 0.359734329282973, + "learning_rate": 9.36738068663153e-06, + "loss": 0.3651, + "step": 1573 + }, + { + "epoch": 0.37460581900398643, + "grad_norm": 0.3856761545147769, + "learning_rate": 9.36644188028615e-06, + "loss": 0.3182, + "step": 1574 + }, + { + "epoch": 0.3748438150770512, + "grad_norm": 0.3957759378316436, + "learning_rate": 9.365502424979488e-06, + "loss": 0.4013, + "step": 1575 + }, + { + "epoch": 0.375081811150116, + "grad_norm": 0.40850014589920525, + "learning_rate": 9.364562320851167e-06, + "loss": 0.4193, + "step": 1576 + }, + { + "epoch": 0.3753198072231808, + "grad_norm": 0.38661039932841545, + "learning_rate": 9.36362156804091e-06, + "loss": 0.3553, + "step": 1577 + }, + { + "epoch": 0.3755578032962456, + "grad_norm": 0.4003228203902698, + "learning_rate": 9.362680166688538e-06, + "loss": 0.3396, + "step": 1578 + }, + { + "epoch": 0.3757957993693104, + "grad_norm": 0.3831137546008049, + "learning_rate": 9.361738116933967e-06, + "loss": 0.3872, + "step": 1579 + }, + { + "epoch": 0.3760337954423752, + "grad_norm": 0.44314506167594236, + "learning_rate": 9.360795418917205e-06, + "loss": 0.3819, + "step": 1580 + }, + { + "epoch": 0.37627179151544, + "grad_norm": 0.46088081221952837, + "learning_rate": 9.359852072778365e-06, + "loss": 0.3196, + "step": 1581 + }, + { + "epoch": 0.3765097875885048, + "grad_norm": 0.481649845998446, + "learning_rate": 9.35890807865765e-06, + "loss": 0.3606, + "step": 1582 + }, + { + "epoch": 0.3767477836615696, + "grad_norm": 0.4324693237486104, + "learning_rate": 9.357963436695357e-06, + "loss": 0.4206, + "step": 1583 + }, + { + "epoch": 0.3769857797346344, + "grad_norm": 0.47833760040095646, + "learning_rate": 9.357018147031888e-06, + "loss": 0.3558, + "step": 1584 + }, + { + "epoch": 0.37722377580769917, + "grad_norm": 0.3956699015096483, + "learning_rate": 9.356072209807737e-06, + "loss": 0.3198, + "step": 1585 + }, + { + "epoch": 0.37746177188076396, + "grad_norm": 0.3864788423378403, + "learning_rate": 9.35512562516349e-06, + "loss": 0.4045, + "step": 1586 + }, + { + "epoch": 0.37769976795382876, + "grad_norm": 0.4160431742514611, + "learning_rate": 9.354178393239834e-06, + "loss": 0.4082, + "step": 1587 + }, + { + "epoch": 0.37793776402689355, + "grad_norm": 0.42646842694183207, + "learning_rate": 9.353230514177553e-06, + "loss": 0.3299, + "step": 1588 + }, + { + "epoch": 0.37817576009995835, + "grad_norm": 0.4132760310497568, + "learning_rate": 9.352281988117521e-06, + "loss": 0.4134, + "step": 1589 + }, + { + "epoch": 0.37841375617302314, + "grad_norm": 0.39054592818013045, + "learning_rate": 9.35133281520072e-06, + "loss": 0.4301, + "step": 1590 + }, + { + "epoch": 0.37865175224608794, + "grad_norm": 0.4404186715801395, + "learning_rate": 9.350382995568213e-06, + "loss": 0.3641, + "step": 1591 + }, + { + "epoch": 0.37888974831915273, + "grad_norm": 0.38490449743758554, + "learning_rate": 9.349432529361168e-06, + "loss": 0.3679, + "step": 1592 + }, + { + "epoch": 0.3791277443922175, + "grad_norm": 0.38256739427952213, + "learning_rate": 9.348481416720852e-06, + "loss": 0.4013, + "step": 1593 + }, + { + "epoch": 0.3793657404652823, + "grad_norm": 0.392281809140338, + "learning_rate": 9.34752965778862e-06, + "loss": 0.4262, + "step": 1594 + }, + { + "epoch": 0.3796037365383471, + "grad_norm": 0.3603426933987028, + "learning_rate": 9.346577252705929e-06, + "loss": 0.3434, + "step": 1595 + }, + { + "epoch": 0.3798417326114119, + "grad_norm": 0.3862895458269031, + "learning_rate": 9.345624201614328e-06, + "loss": 0.3273, + "step": 1596 + }, + { + "epoch": 0.3800797286844767, + "grad_norm": 0.4424213661034337, + "learning_rate": 9.344670504655466e-06, + "loss": 0.4163, + "step": 1597 + }, + { + "epoch": 0.3803177247575415, + "grad_norm": 0.3946042222856275, + "learning_rate": 9.343716161971084e-06, + "loss": 0.3853, + "step": 1598 + }, + { + "epoch": 0.3805557208306063, + "grad_norm": 0.48228594107422296, + "learning_rate": 9.342761173703023e-06, + "loss": 0.3121, + "step": 1599 + }, + { + "epoch": 0.3807937169036711, + "grad_norm": 0.4055270748007401, + "learning_rate": 9.341805539993216e-06, + "loss": 0.3568, + "step": 1600 + }, + { + "epoch": 0.3810317129767359, + "grad_norm": 0.37266993056107267, + "learning_rate": 9.340849260983695e-06, + "loss": 0.4455, + "step": 1601 + }, + { + "epoch": 0.3812697090498007, + "grad_norm": 0.4040859091869553, + "learning_rate": 9.339892336816587e-06, + "loss": 0.316, + "step": 1602 + }, + { + "epoch": 0.38150770512286547, + "grad_norm": 0.3984553047662032, + "learning_rate": 9.338934767634114e-06, + "loss": 0.3299, + "step": 1603 + }, + { + "epoch": 0.38174570119593026, + "grad_norm": 0.3937006805260035, + "learning_rate": 9.337976553578593e-06, + "loss": 0.4225, + "step": 1604 + }, + { + "epoch": 0.38198369726899506, + "grad_norm": 0.37712895414520553, + "learning_rate": 9.337017694792441e-06, + "loss": 0.3939, + "step": 1605 + }, + { + "epoch": 0.38222169334205985, + "grad_norm": 0.4069249598197311, + "learning_rate": 9.336058191418167e-06, + "loss": 0.3414, + "step": 1606 + }, + { + "epoch": 0.38245968941512465, + "grad_norm": 0.37857978246964663, + "learning_rate": 9.335098043598376e-06, + "loss": 0.3751, + "step": 1607 + }, + { + "epoch": 0.38269768548818944, + "grad_norm": 0.37160587062598166, + "learning_rate": 9.334137251475771e-06, + "loss": 0.4226, + "step": 1608 + }, + { + "epoch": 0.38293568156125424, + "grad_norm": 0.4083708370687077, + "learning_rate": 9.333175815193149e-06, + "loss": 0.3667, + "step": 1609 + }, + { + "epoch": 0.38317367763431903, + "grad_norm": 0.39386664440771063, + "learning_rate": 9.332213734893406e-06, + "loss": 0.3182, + "step": 1610 + }, + { + "epoch": 0.3834116737073838, + "grad_norm": 0.4343970310481392, + "learning_rate": 9.331251010719525e-06, + "loss": 0.4, + "step": 1611 + }, + { + "epoch": 0.3836496697804486, + "grad_norm": 0.43461958864883965, + "learning_rate": 9.330287642814593e-06, + "loss": 0.4267, + "step": 1612 + }, + { + "epoch": 0.3838876658535134, + "grad_norm": 0.41875602341308255, + "learning_rate": 9.329323631321793e-06, + "loss": 0.3314, + "step": 1613 + }, + { + "epoch": 0.3841256619265782, + "grad_norm": 0.3891910114438827, + "learning_rate": 9.328358976384398e-06, + "loss": 0.3693, + "step": 1614 + }, + { + "epoch": 0.384363657999643, + "grad_norm": 0.3988395541503049, + "learning_rate": 9.327393678145781e-06, + "loss": 0.4159, + "step": 1615 + }, + { + "epoch": 0.3846016540727078, + "grad_norm": 0.4273306553986297, + "learning_rate": 9.32642773674941e-06, + "loss": 0.3702, + "step": 1616 + }, + { + "epoch": 0.3848396501457726, + "grad_norm": 0.4297415267388038, + "learning_rate": 9.325461152338846e-06, + "loss": 0.3125, + "step": 1617 + }, + { + "epoch": 0.3850776462188374, + "grad_norm": 0.3713890087288812, + "learning_rate": 9.324493925057747e-06, + "loss": 0.3745, + "step": 1618 + }, + { + "epoch": 0.3853156422919022, + "grad_norm": 0.397582778645666, + "learning_rate": 9.32352605504987e-06, + "loss": 0.4028, + "step": 1619 + }, + { + "epoch": 0.385553638364967, + "grad_norm": 0.3879589456647183, + "learning_rate": 9.322557542459061e-06, + "loss": 0.3442, + "step": 1620 + }, + { + "epoch": 0.38579163443803177, + "grad_norm": 0.3734347689407556, + "learning_rate": 9.321588387429266e-06, + "loss": 0.3492, + "step": 1621 + }, + { + "epoch": 0.38602963051109657, + "grad_norm": 0.38372821085957537, + "learning_rate": 9.320618590104525e-06, + "loss": 0.4057, + "step": 1622 + }, + { + "epoch": 0.38626762658416136, + "grad_norm": 0.39004512408218667, + "learning_rate": 9.319648150628978e-06, + "loss": 0.3837, + "step": 1623 + }, + { + "epoch": 0.38650562265722616, + "grad_norm": 0.43356887512118736, + "learning_rate": 9.318677069146848e-06, + "loss": 0.3141, + "step": 1624 + }, + { + "epoch": 0.38674361873029095, + "grad_norm": 0.4399012491269116, + "learning_rate": 9.31770534580247e-06, + "loss": 0.4022, + "step": 1625 + }, + { + "epoch": 0.38698161480335574, + "grad_norm": 0.40492521303478335, + "learning_rate": 9.316732980740262e-06, + "loss": 0.419, + "step": 1626 + }, + { + "epoch": 0.38721961087642054, + "grad_norm": 0.4061119104072092, + "learning_rate": 9.315759974104741e-06, + "loss": 0.3166, + "step": 1627 + }, + { + "epoch": 0.38745760694948533, + "grad_norm": 0.3909427341832737, + "learning_rate": 9.314786326040523e-06, + "loss": 0.3332, + "step": 1628 + }, + { + "epoch": 0.38769560302255013, + "grad_norm": 0.42034544526119255, + "learning_rate": 9.313812036692314e-06, + "loss": 0.3961, + "step": 1629 + }, + { + "epoch": 0.3879335990956149, + "grad_norm": 0.43657367145331466, + "learning_rate": 9.312837106204916e-06, + "loss": 0.373, + "step": 1630 + }, + { + "epoch": 0.3881715951686797, + "grad_norm": 0.37018141911486274, + "learning_rate": 9.31186153472323e-06, + "loss": 0.3114, + "step": 1631 + }, + { + "epoch": 0.3884095912417445, + "grad_norm": 0.38503056353803755, + "learning_rate": 9.31088532239225e-06, + "loss": 0.386, + "step": 1632 + }, + { + "epoch": 0.3886475873148093, + "grad_norm": 0.39233949025594983, + "learning_rate": 9.309908469357067e-06, + "loss": 0.416, + "step": 1633 + }, + { + "epoch": 0.3888855833878741, + "grad_norm": 0.4433267496831673, + "learning_rate": 9.308930975762862e-06, + "loss": 0.3621, + "step": 1634 + }, + { + "epoch": 0.3891235794609389, + "grad_norm": 0.3952777289255298, + "learning_rate": 9.307952841754916e-06, + "loss": 0.3255, + "step": 1635 + }, + { + "epoch": 0.3893615755340037, + "grad_norm": 0.4022306208679625, + "learning_rate": 9.306974067478602e-06, + "loss": 0.3977, + "step": 1636 + }, + { + "epoch": 0.3895995716070685, + "grad_norm": 0.41753431253514406, + "learning_rate": 9.305994653079396e-06, + "loss": 0.4056, + "step": 1637 + }, + { + "epoch": 0.3898375676801333, + "grad_norm": 0.40246752404914526, + "learning_rate": 9.305014598702857e-06, + "loss": 0.3407, + "step": 1638 + }, + { + "epoch": 0.3900755637531981, + "grad_norm": 0.400365444870483, + "learning_rate": 9.304033904494649e-06, + "loss": 0.3742, + "step": 1639 + }, + { + "epoch": 0.39031355982626287, + "grad_norm": 0.4157295616816406, + "learning_rate": 9.303052570600524e-06, + "loss": 0.4111, + "step": 1640 + }, + { + "epoch": 0.39055155589932766, + "grad_norm": 0.4525080007717071, + "learning_rate": 9.302070597166337e-06, + "loss": 0.3378, + "step": 1641 + }, + { + "epoch": 0.39078955197239246, + "grad_norm": 0.3854211007538121, + "learning_rate": 9.301087984338029e-06, + "loss": 0.3288, + "step": 1642 + }, + { + "epoch": 0.39102754804545725, + "grad_norm": 0.4832368919472511, + "learning_rate": 9.300104732261645e-06, + "loss": 0.3952, + "step": 1643 + }, + { + "epoch": 0.39126554411852205, + "grad_norm": 0.42640640785474865, + "learning_rate": 9.299120841083317e-06, + "loss": 0.4237, + "step": 1644 + }, + { + "epoch": 0.39150354019158684, + "grad_norm": 0.41078488904477306, + "learning_rate": 9.298136310949278e-06, + "loss": 0.3299, + "step": 1645 + }, + { + "epoch": 0.39174153626465164, + "grad_norm": 0.3496318552591008, + "learning_rate": 9.297151142005852e-06, + "loss": 0.3274, + "step": 1646 + }, + { + "epoch": 0.39197953233771643, + "grad_norm": 0.45341271456511995, + "learning_rate": 9.296165334399458e-06, + "loss": 0.4078, + "step": 1647 + }, + { + "epoch": 0.3922175284107812, + "grad_norm": 0.4210029643414087, + "learning_rate": 9.295178888276615e-06, + "loss": 0.3651, + "step": 1648 + }, + { + "epoch": 0.392455524483846, + "grad_norm": 0.38599905665403594, + "learning_rate": 9.294191803783931e-06, + "loss": 0.3477, + "step": 1649 + }, + { + "epoch": 0.3926935205569108, + "grad_norm": 0.4189352579336025, + "learning_rate": 9.293204081068113e-06, + "loss": 0.3932, + "step": 1650 + }, + { + "epoch": 0.3929315166299756, + "grad_norm": 0.43257183707928304, + "learning_rate": 9.292215720275959e-06, + "loss": 0.422, + "step": 1651 + }, + { + "epoch": 0.3931695127030404, + "grad_norm": 0.4838474825444332, + "learning_rate": 9.291226721554364e-06, + "loss": 0.3335, + "step": 1652 + }, + { + "epoch": 0.3934075087761052, + "grad_norm": 0.36325017737845905, + "learning_rate": 9.290237085050318e-06, + "loss": 0.3438, + "step": 1653 + }, + { + "epoch": 0.39364550484917, + "grad_norm": 0.412707690424985, + "learning_rate": 9.289246810910909e-06, + "loss": 0.4129, + "step": 1654 + }, + { + "epoch": 0.3938835009222348, + "grad_norm": 0.41393730782449073, + "learning_rate": 9.288255899283309e-06, + "loss": 0.4021, + "step": 1655 + }, + { + "epoch": 0.3941214969952996, + "grad_norm": 0.4386727788302991, + "learning_rate": 9.287264350314797e-06, + "loss": 0.3621, + "step": 1656 + }, + { + "epoch": 0.3943594930683644, + "grad_norm": 0.3681925197909611, + "learning_rate": 9.286272164152744e-06, + "loss": 0.3769, + "step": 1657 + }, + { + "epoch": 0.39459748914142917, + "grad_norm": 0.4635910656535038, + "learning_rate": 9.285279340944607e-06, + "loss": 0.4101, + "step": 1658 + }, + { + "epoch": 0.39483548521449396, + "grad_norm": 0.4598290354163402, + "learning_rate": 9.284285880837947e-06, + "loss": 0.3532, + "step": 1659 + }, + { + "epoch": 0.39507348128755876, + "grad_norm": 0.3933380949916356, + "learning_rate": 9.283291783980418e-06, + "loss": 0.3303, + "step": 1660 + }, + { + "epoch": 0.39531147736062355, + "grad_norm": 0.39884801926811836, + "learning_rate": 9.282297050519767e-06, + "loss": 0.3897, + "step": 1661 + }, + { + "epoch": 0.39554947343368835, + "grad_norm": 0.5083918488666362, + "learning_rate": 9.281301680603834e-06, + "loss": 0.4365, + "step": 1662 + }, + { + "epoch": 0.39578746950675314, + "grad_norm": 0.5222377151000596, + "learning_rate": 9.280305674380558e-06, + "loss": 0.3347, + "step": 1663 + }, + { + "epoch": 0.39602546557981794, + "grad_norm": 0.40550870332567346, + "learning_rate": 9.279309031997968e-06, + "loss": 0.3609, + "step": 1664 + }, + { + "epoch": 0.39626346165288273, + "grad_norm": 0.4047710761598781, + "learning_rate": 9.278311753604192e-06, + "loss": 0.437, + "step": 1665 + }, + { + "epoch": 0.3965014577259475, + "grad_norm": 0.5151918734857975, + "learning_rate": 9.277313839347449e-06, + "loss": 0.3752, + "step": 1666 + }, + { + "epoch": 0.3967394537990123, + "grad_norm": 0.4697431600744339, + "learning_rate": 9.276315289376052e-06, + "loss": 0.3242, + "step": 1667 + }, + { + "epoch": 0.3969774498720771, + "grad_norm": 0.43709669423640757, + "learning_rate": 9.275316103838414e-06, + "loss": 0.3797, + "step": 1668 + }, + { + "epoch": 0.3972154459451419, + "grad_norm": 0.4192015191302519, + "learning_rate": 9.274316282883037e-06, + "loss": 0.4335, + "step": 1669 + }, + { + "epoch": 0.3974534420182067, + "grad_norm": 0.4117716004967471, + "learning_rate": 9.273315826658518e-06, + "loss": 0.3522, + "step": 1670 + }, + { + "epoch": 0.3976914380912715, + "grad_norm": 0.42633382432239425, + "learning_rate": 9.27231473531355e-06, + "loss": 0.3689, + "step": 1671 + }, + { + "epoch": 0.3979294341643363, + "grad_norm": 0.38633621983522715, + "learning_rate": 9.271313008996922e-06, + "loss": 0.4201, + "step": 1672 + }, + { + "epoch": 0.3981674302374011, + "grad_norm": 0.42827859761102866, + "learning_rate": 9.270310647857513e-06, + "loss": 0.3973, + "step": 1673 + }, + { + "epoch": 0.3984054263104659, + "grad_norm": 0.5003093170938062, + "learning_rate": 9.269307652044298e-06, + "loss": 0.326, + "step": 1674 + }, + { + "epoch": 0.3986434223835307, + "grad_norm": 0.43808182708918314, + "learning_rate": 9.26830402170635e-06, + "loss": 0.3888, + "step": 1675 + }, + { + "epoch": 0.39888141845659547, + "grad_norm": 0.41053386389897284, + "learning_rate": 9.267299756992829e-06, + "loss": 0.4315, + "step": 1676 + }, + { + "epoch": 0.39911941452966027, + "grad_norm": 0.4230397692191716, + "learning_rate": 9.266294858052998e-06, + "loss": 0.3346, + "step": 1677 + }, + { + "epoch": 0.39935741060272506, + "grad_norm": 0.431898235869182, + "learning_rate": 9.265289325036209e-06, + "loss": 0.3241, + "step": 1678 + }, + { + "epoch": 0.39959540667578985, + "grad_norm": 0.39729706567685696, + "learning_rate": 9.264283158091909e-06, + "loss": 0.4009, + "step": 1679 + }, + { + "epoch": 0.39983340274885465, + "grad_norm": 0.4004504040408847, + "learning_rate": 9.263276357369635e-06, + "loss": 0.3942, + "step": 1680 + }, + { + "epoch": 0.40007139882191944, + "grad_norm": 0.4300597724152983, + "learning_rate": 9.262268923019028e-06, + "loss": 0.3161, + "step": 1681 + }, + { + "epoch": 0.40030939489498424, + "grad_norm": 0.5008687287891455, + "learning_rate": 9.261260855189815e-06, + "loss": 0.3539, + "step": 1682 + }, + { + "epoch": 0.40054739096804903, + "grad_norm": 0.36119133039165174, + "learning_rate": 9.26025215403182e-06, + "loss": 0.4348, + "step": 1683 + }, + { + "epoch": 0.4007853870411138, + "grad_norm": 0.42846844104954446, + "learning_rate": 9.259242819694963e-06, + "loss": 0.3417, + "step": 1684 + }, + { + "epoch": 0.4010233831141786, + "grad_norm": 0.4165944881715832, + "learning_rate": 9.258232852329253e-06, + "loss": 0.317, + "step": 1685 + }, + { + "epoch": 0.4012613791872434, + "grad_norm": 0.3890173772649005, + "learning_rate": 9.257222252084798e-06, + "loss": 0.3775, + "step": 1686 + }, + { + "epoch": 0.4014993752603082, + "grad_norm": 0.38942060350209845, + "learning_rate": 9.256211019111799e-06, + "loss": 0.4156, + "step": 1687 + }, + { + "epoch": 0.401737371333373, + "grad_norm": 0.4134669751208099, + "learning_rate": 9.255199153560546e-06, + "loss": 0.3203, + "step": 1688 + }, + { + "epoch": 0.4019753674064378, + "grad_norm": 0.40855285972159666, + "learning_rate": 9.254186655581431e-06, + "loss": 0.3485, + "step": 1689 + }, + { + "epoch": 0.4022133634795026, + "grad_norm": 0.4066844623707453, + "learning_rate": 9.253173525324937e-06, + "loss": 0.448, + "step": 1690 + }, + { + "epoch": 0.4024513595525674, + "grad_norm": 0.40579610697102486, + "learning_rate": 9.252159762941638e-06, + "loss": 0.3493, + "step": 1691 + }, + { + "epoch": 0.4026893556256322, + "grad_norm": 0.4612752035521552, + "learning_rate": 9.251145368582204e-06, + "loss": 0.314, + "step": 1692 + }, + { + "epoch": 0.402927351698697, + "grad_norm": 0.38917703961071853, + "learning_rate": 9.2501303423974e-06, + "loss": 0.3809, + "step": 1693 + }, + { + "epoch": 0.4031653477717618, + "grad_norm": 0.42688170513775625, + "learning_rate": 9.249114684538087e-06, + "loss": 0.439, + "step": 1694 + }, + { + "epoch": 0.40340334384482657, + "grad_norm": 0.4270682354405054, + "learning_rate": 9.248098395155212e-06, + "loss": 0.3197, + "step": 1695 + }, + { + "epoch": 0.40364133991789136, + "grad_norm": 0.38427191838232466, + "learning_rate": 9.247081474399821e-06, + "loss": 0.3653, + "step": 1696 + }, + { + "epoch": 0.40387933599095616, + "grad_norm": 0.3635373879235468, + "learning_rate": 9.246063922423057e-06, + "loss": 0.3965, + "step": 1697 + }, + { + "epoch": 0.40411733206402095, + "grad_norm": 0.36800213725949676, + "learning_rate": 9.24504573937615e-06, + "loss": 0.3629, + "step": 1698 + }, + { + "epoch": 0.40435532813708575, + "grad_norm": 0.46951813533096415, + "learning_rate": 9.24402692541043e-06, + "loss": 0.3301, + "step": 1699 + }, + { + "epoch": 0.40459332421015054, + "grad_norm": 0.40731159678572715, + "learning_rate": 9.243007480677317e-06, + "loss": 0.3892, + "step": 1700 + }, + { + "epoch": 0.40483132028321533, + "grad_norm": 0.39470797179107836, + "learning_rate": 9.241987405328325e-06, + "loss": 0.4199, + "step": 1701 + }, + { + "epoch": 0.40506931635628013, + "grad_norm": 0.379892047471591, + "learning_rate": 9.240966699515062e-06, + "loss": 0.339, + "step": 1702 + }, + { + "epoch": 0.4053073124293449, + "grad_norm": 0.4299870200864259, + "learning_rate": 9.239945363389233e-06, + "loss": 0.3226, + "step": 1703 + }, + { + "epoch": 0.4055453085024097, + "grad_norm": 0.38016909458014425, + "learning_rate": 9.238923397102629e-06, + "loss": 0.4155, + "step": 1704 + }, + { + "epoch": 0.4057833045754745, + "grad_norm": 0.41162849886265007, + "learning_rate": 9.237900800807144e-06, + "loss": 0.3961, + "step": 1705 + }, + { + "epoch": 0.4060213006485393, + "grad_norm": 0.41727989058008336, + "learning_rate": 9.23687757465476e-06, + "loss": 0.3234, + "step": 1706 + }, + { + "epoch": 0.4062592967216041, + "grad_norm": 0.4257234199843476, + "learning_rate": 9.235853718797552e-06, + "loss": 0.3652, + "step": 1707 + }, + { + "epoch": 0.4064972927946689, + "grad_norm": 0.38093238010454294, + "learning_rate": 9.234829233387692e-06, + "loss": 0.4513, + "step": 1708 + }, + { + "epoch": 0.4067352888677337, + "grad_norm": 0.42134937392720456, + "learning_rate": 9.233804118577442e-06, + "loss": 0.3595, + "step": 1709 + }, + { + "epoch": 0.4069732849407985, + "grad_norm": 0.36011012728718184, + "learning_rate": 9.232778374519162e-06, + "loss": 0.3139, + "step": 1710 + }, + { + "epoch": 0.4072112810138633, + "grad_norm": 0.3848429350956219, + "learning_rate": 9.231752001365301e-06, + "loss": 0.3874, + "step": 1711 + }, + { + "epoch": 0.4074492770869281, + "grad_norm": 0.42989698919103714, + "learning_rate": 9.230724999268405e-06, + "loss": 0.4137, + "step": 1712 + }, + { + "epoch": 0.40768727315999287, + "grad_norm": 0.40356043597179625, + "learning_rate": 9.22969736838111e-06, + "loss": 0.3497, + "step": 1713 + }, + { + "epoch": 0.40792526923305766, + "grad_norm": 0.38088846927432707, + "learning_rate": 9.22866910885615e-06, + "loss": 0.3511, + "step": 1714 + }, + { + "epoch": 0.40816326530612246, + "grad_norm": 0.36991122947115884, + "learning_rate": 9.227640220846345e-06, + "loss": 0.4034, + "step": 1715 + }, + { + "epoch": 0.40840126137918725, + "grad_norm": 0.4121306552946537, + "learning_rate": 9.226610704504619e-06, + "loss": 0.3843, + "step": 1716 + }, + { + "epoch": 0.40863925745225205, + "grad_norm": 0.39375147468162974, + "learning_rate": 9.225580559983982e-06, + "loss": 0.3329, + "step": 1717 + }, + { + "epoch": 0.40887725352531684, + "grad_norm": 0.39350805991360777, + "learning_rate": 9.224549787437536e-06, + "loss": 0.398, + "step": 1718 + }, + { + "epoch": 0.40911524959838164, + "grad_norm": 0.40232745688418725, + "learning_rate": 9.223518387018481e-06, + "loss": 0.4178, + "step": 1719 + }, + { + "epoch": 0.40935324567144643, + "grad_norm": 0.39632952276294475, + "learning_rate": 9.22248635888011e-06, + "loss": 0.3531, + "step": 1720 + }, + { + "epoch": 0.4095912417445112, + "grad_norm": 0.38179777362704265, + "learning_rate": 9.221453703175805e-06, + "loss": 0.3252, + "step": 1721 + }, + { + "epoch": 0.409829237817576, + "grad_norm": 0.3717944610724434, + "learning_rate": 9.220420420059048e-06, + "loss": 0.4133, + "step": 1722 + }, + { + "epoch": 0.4100672338906408, + "grad_norm": 0.3996883821281656, + "learning_rate": 9.219386509683408e-06, + "loss": 0.38, + "step": 1723 + }, + { + "epoch": 0.4103052299637056, + "grad_norm": 0.47689973154409665, + "learning_rate": 9.21835197220255e-06, + "loss": 0.3705, + "step": 1724 + }, + { + "epoch": 0.4105432260367704, + "grad_norm": 0.40061030822431853, + "learning_rate": 9.217316807770232e-06, + "loss": 0.3798, + "step": 1725 + }, + { + "epoch": 0.4107812221098352, + "grad_norm": 0.3891661943835348, + "learning_rate": 9.216281016540305e-06, + "loss": 0.413, + "step": 1726 + }, + { + "epoch": 0.4110192181829, + "grad_norm": 0.40874575169057586, + "learning_rate": 9.215244598666712e-06, + "loss": 0.3299, + "step": 1727 + }, + { + "epoch": 0.4112572142559648, + "grad_norm": 0.4192275317369746, + "learning_rate": 9.214207554303492e-06, + "loss": 0.3232, + "step": 1728 + }, + { + "epoch": 0.4114952103290296, + "grad_norm": 0.39895282478246985, + "learning_rate": 9.213169883604776e-06, + "loss": 0.4003, + "step": 1729 + }, + { + "epoch": 0.4117332064020944, + "grad_norm": 0.4354043645900782, + "learning_rate": 9.212131586724787e-06, + "loss": 0.3975, + "step": 1730 + }, + { + "epoch": 0.41197120247515917, + "grad_norm": 0.4120078987709819, + "learning_rate": 9.211092663817839e-06, + "loss": 0.3097, + "step": 1731 + }, + { + "epoch": 0.41220919854822397, + "grad_norm": 0.389265114227728, + "learning_rate": 9.210053115038345e-06, + "loss": 0.3601, + "step": 1732 + }, + { + "epoch": 0.41244719462128876, + "grad_norm": 0.5583703836161701, + "learning_rate": 9.209012940540806e-06, + "loss": 0.4214, + "step": 1733 + }, + { + "epoch": 0.41268519069435355, + "grad_norm": 0.3795249796311343, + "learning_rate": 9.207972140479817e-06, + "loss": 0.3467, + "step": 1734 + }, + { + "epoch": 0.41292318676741835, + "grad_norm": 0.37887544114873867, + "learning_rate": 9.206930715010069e-06, + "loss": 0.3176, + "step": 1735 + }, + { + "epoch": 0.41316118284048314, + "grad_norm": 0.3908429675593901, + "learning_rate": 9.205888664286343e-06, + "loss": 0.3866, + "step": 1736 + }, + { + "epoch": 0.41339917891354794, + "grad_norm": 0.4053593309349944, + "learning_rate": 9.20484598846351e-06, + "loss": 0.4207, + "step": 1737 + }, + { + "epoch": 0.41363717498661273, + "grad_norm": 0.40431227924333274, + "learning_rate": 9.203802687696543e-06, + "loss": 0.2908, + "step": 1738 + }, + { + "epoch": 0.4138751710596775, + "grad_norm": 0.41350971082279164, + "learning_rate": 9.2027587621405e-06, + "loss": 0.3968, + "step": 1739 + }, + { + "epoch": 0.4141131671327423, + "grad_norm": 0.37481649448861415, + "learning_rate": 9.201714211950532e-06, + "loss": 0.4003, + "step": 1740 + }, + { + "epoch": 0.4143511632058071, + "grad_norm": 0.3800874685713264, + "learning_rate": 9.200669037281888e-06, + "loss": 0.3346, + "step": 1741 + }, + { + "epoch": 0.4145891592788719, + "grad_norm": 0.4080865424809329, + "learning_rate": 9.199623238289903e-06, + "loss": 0.3263, + "step": 1742 + }, + { + "epoch": 0.4148271553519367, + "grad_norm": 0.38620720186828605, + "learning_rate": 9.198576815130013e-06, + "loss": 0.3869, + "step": 1743 + }, + { + "epoch": 0.4150651514250015, + "grad_norm": 0.42012676664637677, + "learning_rate": 9.197529767957742e-06, + "loss": 0.4334, + "step": 1744 + }, + { + "epoch": 0.4153031474980663, + "grad_norm": 0.44141968095141493, + "learning_rate": 9.196482096928702e-06, + "loss": 0.3451, + "step": 1745 + }, + { + "epoch": 0.4155411435711311, + "grad_norm": 0.3868284133888196, + "learning_rate": 9.19543380219861e-06, + "loss": 0.3473, + "step": 1746 + }, + { + "epoch": 0.4157791396441959, + "grad_norm": 0.3881735751224557, + "learning_rate": 9.194384883923262e-06, + "loss": 0.4139, + "step": 1747 + }, + { + "epoch": 0.4160171357172607, + "grad_norm": 0.4022882687609258, + "learning_rate": 9.193335342258558e-06, + "loss": 0.3668, + "step": 1748 + }, + { + "epoch": 0.41625513179032547, + "grad_norm": 0.36310020776516483, + "learning_rate": 9.192285177360482e-06, + "loss": 0.3155, + "step": 1749 + }, + { + "epoch": 0.41649312786339027, + "grad_norm": 0.3999886426578984, + "learning_rate": 9.191234389385119e-06, + "loss": 0.3557, + "step": 1750 + }, + { + "epoch": 0.41673112393645506, + "grad_norm": 0.3616937980400372, + "learning_rate": 9.19018297848864e-06, + "loss": 0.4089, + "step": 1751 + }, + { + "epoch": 0.41696912000951986, + "grad_norm": 0.3834384131011674, + "learning_rate": 9.189130944827308e-06, + "loss": 0.3247, + "step": 1752 + }, + { + "epoch": 0.41720711608258465, + "grad_norm": 0.38320709923324137, + "learning_rate": 9.188078288557485e-06, + "loss": 0.3301, + "step": 1753 + }, + { + "epoch": 0.41744511215564944, + "grad_norm": 0.38222131872575443, + "learning_rate": 9.18702500983562e-06, + "loss": 0.4085, + "step": 1754 + }, + { + "epoch": 0.41768310822871424, + "grad_norm": 0.3824160404741556, + "learning_rate": 9.185971108818254e-06, + "loss": 0.4017, + "step": 1755 + }, + { + "epoch": 0.41792110430177903, + "grad_norm": 0.36169585842835983, + "learning_rate": 9.184916585662029e-06, + "loss": 0.3161, + "step": 1756 + }, + { + "epoch": 0.41815910037484383, + "grad_norm": 0.372151927298045, + "learning_rate": 9.183861440523667e-06, + "loss": 0.386, + "step": 1757 + }, + { + "epoch": 0.4183970964479086, + "grad_norm": 0.39912606436044784, + "learning_rate": 9.182805673559993e-06, + "loss": 0.4162, + "step": 1758 + }, + { + "epoch": 0.4186350925209734, + "grad_norm": 0.4088578633797585, + "learning_rate": 9.181749284927917e-06, + "loss": 0.3041, + "step": 1759 + }, + { + "epoch": 0.4188730885940382, + "grad_norm": 0.3872367130929281, + "learning_rate": 9.180692274784445e-06, + "loss": 0.3263, + "step": 1760 + }, + { + "epoch": 0.419111084667103, + "grad_norm": 0.3909524702284084, + "learning_rate": 9.179634643286677e-06, + "loss": 0.4072, + "step": 1761 + }, + { + "epoch": 0.4193490807401678, + "grad_norm": 0.39296104928130404, + "learning_rate": 9.178576390591803e-06, + "loss": 0.4156, + "step": 1762 + }, + { + "epoch": 0.4195870768132326, + "grad_norm": 0.39592073252457005, + "learning_rate": 9.177517516857102e-06, + "loss": 0.3354, + "step": 1763 + }, + { + "epoch": 0.4198250728862974, + "grad_norm": 0.4058493679855248, + "learning_rate": 9.176458022239954e-06, + "loss": 0.3626, + "step": 1764 + }, + { + "epoch": 0.4200630689593622, + "grad_norm": 0.3971663194653234, + "learning_rate": 9.175397906897821e-06, + "loss": 0.4147, + "step": 1765 + }, + { + "epoch": 0.420301065032427, + "grad_norm": 0.39799197552647586, + "learning_rate": 9.174337170988265e-06, + "loss": 0.3773, + "step": 1766 + }, + { + "epoch": 0.4205390611054918, + "grad_norm": 0.3674682210725563, + "learning_rate": 9.173275814668937e-06, + "loss": 0.3324, + "step": 1767 + }, + { + "epoch": 0.42077705717855657, + "grad_norm": 0.4437292514829072, + "learning_rate": 9.17221383809758e-06, + "loss": 0.3865, + "step": 1768 + }, + { + "epoch": 0.42101505325162136, + "grad_norm": 0.3802556994977754, + "learning_rate": 9.171151241432034e-06, + "loss": 0.4228, + "step": 1769 + }, + { + "epoch": 0.42125304932468616, + "grad_norm": 0.4701590991382848, + "learning_rate": 9.170088024830223e-06, + "loss": 0.3363, + "step": 1770 + }, + { + "epoch": 0.42149104539775095, + "grad_norm": 0.40536949953318796, + "learning_rate": 9.169024188450169e-06, + "loss": 0.3709, + "step": 1771 + }, + { + "epoch": 0.42172904147081575, + "grad_norm": 0.36959719177247113, + "learning_rate": 9.167959732449983e-06, + "loss": 0.407, + "step": 1772 + }, + { + "epoch": 0.42196703754388054, + "grad_norm": 0.4019427295156592, + "learning_rate": 9.16689465698787e-06, + "loss": 0.387, + "step": 1773 + }, + { + "epoch": 0.42220503361694534, + "grad_norm": 0.3795424586658338, + "learning_rate": 9.165828962222128e-06, + "loss": 0.3037, + "step": 1774 + }, + { + "epoch": 0.42244302969001013, + "grad_norm": 0.41841783370483765, + "learning_rate": 9.164762648311142e-06, + "loss": 0.3775, + "step": 1775 + }, + { + "epoch": 0.4226810257630749, + "grad_norm": 0.3622382328620447, + "learning_rate": 9.163695715413399e-06, + "loss": 0.3942, + "step": 1776 + }, + { + "epoch": 0.4229190218361397, + "grad_norm": 0.36992410907411066, + "learning_rate": 9.162628163687466e-06, + "loss": 0.3483, + "step": 1777 + }, + { + "epoch": 0.4231570179092045, + "grad_norm": 0.3847751360001051, + "learning_rate": 9.16155999329201e-06, + "loss": 0.3104, + "step": 1778 + }, + { + "epoch": 0.4233950139822693, + "grad_norm": 0.395246830815519, + "learning_rate": 9.160491204385786e-06, + "loss": 0.3947, + "step": 1779 + }, + { + "epoch": 0.4236330100553341, + "grad_norm": 0.38146783859148997, + "learning_rate": 9.159421797127643e-06, + "loss": 0.4296, + "step": 1780 + }, + { + "epoch": 0.4238710061283989, + "grad_norm": 0.4112992635868304, + "learning_rate": 9.158351771676523e-06, + "loss": 0.3382, + "step": 1781 + }, + { + "epoch": 0.4241090022014637, + "grad_norm": 0.36930776652809816, + "learning_rate": 9.157281128191458e-06, + "loss": 0.3593, + "step": 1782 + }, + { + "epoch": 0.4243469982745285, + "grad_norm": 0.3908100344030319, + "learning_rate": 9.156209866831568e-06, + "loss": 0.3928, + "step": 1783 + }, + { + "epoch": 0.4245849943475933, + "grad_norm": 0.36549619200757544, + "learning_rate": 9.155137987756075e-06, + "loss": 0.3465, + "step": 1784 + }, + { + "epoch": 0.4248229904206581, + "grad_norm": 0.3849426680408924, + "learning_rate": 9.154065491124284e-06, + "loss": 0.2999, + "step": 1785 + }, + { + "epoch": 0.42506098649372287, + "grad_norm": 0.3782937649081525, + "learning_rate": 9.152992377095594e-06, + "loss": 0.3892, + "step": 1786 + }, + { + "epoch": 0.42529898256678766, + "grad_norm": 0.4181419981500838, + "learning_rate": 9.151918645829495e-06, + "loss": 0.4155, + "step": 1787 + }, + { + "epoch": 0.42553697863985246, + "grad_norm": 0.40443588328082936, + "learning_rate": 9.150844297485573e-06, + "loss": 0.3487, + "step": 1788 + }, + { + "epoch": 0.42577497471291725, + "grad_norm": 0.4037548245270624, + "learning_rate": 9.149769332223502e-06, + "loss": 0.3502, + "step": 1789 + }, + { + "epoch": 0.42601297078598205, + "grad_norm": 0.37518033203980755, + "learning_rate": 9.148693750203046e-06, + "loss": 0.4035, + "step": 1790 + }, + { + "epoch": 0.42625096685904684, + "grad_norm": 0.4112116522406348, + "learning_rate": 9.147617551584066e-06, + "loss": 0.3702, + "step": 1791 + }, + { + "epoch": 0.42648896293211164, + "grad_norm": 0.403896574510639, + "learning_rate": 9.14654073652651e-06, + "loss": 0.3408, + "step": 1792 + }, + { + "epoch": 0.42672695900517643, + "grad_norm": 0.3757284298168529, + "learning_rate": 9.145463305190422e-06, + "loss": 0.3561, + "step": 1793 + }, + { + "epoch": 0.4269649550782412, + "grad_norm": 0.40025611305787007, + "learning_rate": 9.144385257735931e-06, + "loss": 0.407, + "step": 1794 + }, + { + "epoch": 0.427202951151306, + "grad_norm": 0.3825822712921448, + "learning_rate": 9.143306594323265e-06, + "loss": 0.3361, + "step": 1795 + }, + { + "epoch": 0.4274409472243708, + "grad_norm": 0.3979473641423701, + "learning_rate": 9.142227315112739e-06, + "loss": 0.3523, + "step": 1796 + }, + { + "epoch": 0.4276789432974356, + "grad_norm": 0.37047270991043285, + "learning_rate": 9.141147420264758e-06, + "loss": 0.3966, + "step": 1797 + }, + { + "epoch": 0.4279169393705004, + "grad_norm": 0.45550159986727345, + "learning_rate": 9.140066909939824e-06, + "loss": 0.3841, + "step": 1798 + }, + { + "epoch": 0.4281549354435652, + "grad_norm": 0.4044470476748523, + "learning_rate": 9.138985784298528e-06, + "loss": 0.3111, + "step": 1799 + }, + { + "epoch": 0.42839293151663, + "grad_norm": 0.41668001069625243, + "learning_rate": 9.13790404350155e-06, + "loss": 0.3681, + "step": 1800 + }, + { + "epoch": 0.4286309275896948, + "grad_norm": 0.36553533604315613, + "learning_rate": 9.136821687709664e-06, + "loss": 0.4334, + "step": 1801 + }, + { + "epoch": 0.4288689236627596, + "grad_norm": 0.37044586892206033, + "learning_rate": 9.135738717083738e-06, + "loss": 0.3309, + "step": 1802 + }, + { + "epoch": 0.4291069197358244, + "grad_norm": 0.40087470340481024, + "learning_rate": 9.134655131784723e-06, + "loss": 0.3188, + "step": 1803 + }, + { + "epoch": 0.42934491580888917, + "grad_norm": 0.35972133195166445, + "learning_rate": 9.133570931973668e-06, + "loss": 0.4269, + "step": 1804 + }, + { + "epoch": 0.42958291188195397, + "grad_norm": 0.4047830499287121, + "learning_rate": 9.132486117811715e-06, + "loss": 0.3887, + "step": 1805 + }, + { + "epoch": 0.42982090795501876, + "grad_norm": 0.372292118382629, + "learning_rate": 9.131400689460091e-06, + "loss": 0.3166, + "step": 1806 + }, + { + "epoch": 0.43005890402808356, + "grad_norm": 0.3801968581448093, + "learning_rate": 9.130314647080118e-06, + "loss": 0.3544, + "step": 1807 + }, + { + "epoch": 0.43029690010114835, + "grad_norm": 0.41390771036953183, + "learning_rate": 9.129227990833212e-06, + "loss": 0.4201, + "step": 1808 + }, + { + "epoch": 0.43053489617421314, + "grad_norm": 0.37114026857947074, + "learning_rate": 9.128140720880872e-06, + "loss": 0.3168, + "step": 1809 + }, + { + "epoch": 0.43077289224727794, + "grad_norm": 0.37943566827382635, + "learning_rate": 9.127052837384696e-06, + "loss": 0.3383, + "step": 1810 + }, + { + "epoch": 0.43101088832034273, + "grad_norm": 0.37039949864005134, + "learning_rate": 9.125964340506371e-06, + "loss": 0.3921, + "step": 1811 + }, + { + "epoch": 0.43124888439340753, + "grad_norm": 0.4009046077092876, + "learning_rate": 9.12487523040767e-06, + "loss": 0.4055, + "step": 1812 + }, + { + "epoch": 0.4314868804664723, + "grad_norm": 0.415891186146942, + "learning_rate": 9.12378550725047e-06, + "loss": 0.3205, + "step": 1813 + }, + { + "epoch": 0.4317248765395371, + "grad_norm": 0.39458867165479744, + "learning_rate": 9.122695171196724e-06, + "loss": 0.3616, + "step": 1814 + }, + { + "epoch": 0.4319628726126019, + "grad_norm": 0.36931772090998816, + "learning_rate": 9.121604222408484e-06, + "loss": 0.4181, + "step": 1815 + }, + { + "epoch": 0.4322008686856667, + "grad_norm": 0.4095231673961699, + "learning_rate": 9.120512661047895e-06, + "loss": 0.3514, + "step": 1816 + }, + { + "epoch": 0.4324388647587315, + "grad_norm": 0.4938169837089172, + "learning_rate": 9.119420487277186e-06, + "loss": 0.3169, + "step": 1817 + }, + { + "epoch": 0.4326768608317963, + "grad_norm": 0.41916255433374716, + "learning_rate": 9.118327701258685e-06, + "loss": 0.3944, + "step": 1818 + }, + { + "epoch": 0.4329148569048611, + "grad_norm": 0.40908642674760587, + "learning_rate": 9.117234303154806e-06, + "loss": 0.4355, + "step": 1819 + }, + { + "epoch": 0.4331528529779259, + "grad_norm": 0.3857608329887888, + "learning_rate": 9.11614029312805e-06, + "loss": 0.3259, + "step": 1820 + }, + { + "epoch": 0.4333908490509907, + "grad_norm": 0.4067680142939106, + "learning_rate": 9.11504567134102e-06, + "loss": 0.3557, + "step": 1821 + }, + { + "epoch": 0.4336288451240555, + "grad_norm": 0.37531284613422866, + "learning_rate": 9.113950437956403e-06, + "loss": 0.4032, + "step": 1822 + }, + { + "epoch": 0.43386684119712027, + "grad_norm": 0.39254495182058463, + "learning_rate": 9.112854593136976e-06, + "loss": 0.3923, + "step": 1823 + }, + { + "epoch": 0.43410483727018506, + "grad_norm": 0.4058157753281052, + "learning_rate": 9.111758137045609e-06, + "loss": 0.3266, + "step": 1824 + }, + { + "epoch": 0.43434283334324986, + "grad_norm": 0.36694701438635535, + "learning_rate": 9.110661069845263e-06, + "loss": 0.3684, + "step": 1825 + }, + { + "epoch": 0.43458082941631465, + "grad_norm": 0.36461266098587286, + "learning_rate": 9.10956339169899e-06, + "loss": 0.4108, + "step": 1826 + }, + { + "epoch": 0.43481882548937945, + "grad_norm": 0.39631841763000103, + "learning_rate": 9.10846510276993e-06, + "loss": 0.337, + "step": 1827 + }, + { + "epoch": 0.43505682156244424, + "grad_norm": 0.40343286423290825, + "learning_rate": 9.107366203221318e-06, + "loss": 0.344, + "step": 1828 + }, + { + "epoch": 0.43529481763550903, + "grad_norm": 0.36284461096995285, + "learning_rate": 9.106266693216477e-06, + "loss": 0.3818, + "step": 1829 + }, + { + "epoch": 0.43553281370857383, + "grad_norm": 0.42037283100640777, + "learning_rate": 9.10516657291882e-06, + "loss": 0.4134, + "step": 1830 + }, + { + "epoch": 0.4357708097816386, + "grad_norm": 0.4135556119364222, + "learning_rate": 9.104065842491854e-06, + "loss": 0.3355, + "step": 1831 + }, + { + "epoch": 0.4360088058547034, + "grad_norm": 0.5512601557394179, + "learning_rate": 9.102964502099175e-06, + "loss": 0.3644, + "step": 1832 + }, + { + "epoch": 0.4362468019277682, + "grad_norm": 0.3726460220199828, + "learning_rate": 9.101862551904467e-06, + "loss": 0.4127, + "step": 1833 + }, + { + "epoch": 0.436484798000833, + "grad_norm": 0.3889721147844331, + "learning_rate": 9.100759992071509e-06, + "loss": 0.3612, + "step": 1834 + }, + { + "epoch": 0.4367227940738978, + "grad_norm": 0.3827603318719862, + "learning_rate": 9.099656822764169e-06, + "loss": 0.2957, + "step": 1835 + }, + { + "epoch": 0.4369607901469626, + "grad_norm": 0.3637594579871201, + "learning_rate": 9.098553044146404e-06, + "loss": 0.3518, + "step": 1836 + }, + { + "epoch": 0.4371987862200274, + "grad_norm": 0.41257520091473093, + "learning_rate": 9.097448656382263e-06, + "loss": 0.4387, + "step": 1837 + }, + { + "epoch": 0.4374367822930922, + "grad_norm": 0.43864336163698847, + "learning_rate": 9.096343659635887e-06, + "loss": 0.3276, + "step": 1838 + }, + { + "epoch": 0.437674778366157, + "grad_norm": 0.3841715752329595, + "learning_rate": 9.095238054071505e-06, + "loss": 0.3416, + "step": 1839 + }, + { + "epoch": 0.4379127744392218, + "grad_norm": 0.370655417628842, + "learning_rate": 9.094131839853435e-06, + "loss": 0.3964, + "step": 1840 + }, + { + "epoch": 0.43815077051228657, + "grad_norm": 0.3513470462073168, + "learning_rate": 9.093025017146089e-06, + "loss": 0.3362, + "step": 1841 + }, + { + "epoch": 0.43838876658535136, + "grad_norm": 0.3802770131312354, + "learning_rate": 9.09191758611397e-06, + "loss": 0.3301, + "step": 1842 + }, + { + "epoch": 0.43862676265841616, + "grad_norm": 0.4226138218606067, + "learning_rate": 9.09080954692167e-06, + "loss": 0.3848, + "step": 1843 + }, + { + "epoch": 0.43886475873148095, + "grad_norm": 0.4535187719262108, + "learning_rate": 9.089700899733867e-06, + "loss": 0.4167, + "step": 1844 + }, + { + "epoch": 0.43910275480454575, + "grad_norm": 0.3702844760500393, + "learning_rate": 9.088591644715338e-06, + "loss": 0.3273, + "step": 1845 + }, + { + "epoch": 0.43934075087761054, + "grad_norm": 0.4038109651897949, + "learning_rate": 9.087481782030943e-06, + "loss": 0.3614, + "step": 1846 + }, + { + "epoch": 0.43957874695067534, + "grad_norm": 0.40604586580793806, + "learning_rate": 9.086371311845636e-06, + "loss": 0.4053, + "step": 1847 + }, + { + "epoch": 0.43981674302374013, + "grad_norm": 0.421721246391553, + "learning_rate": 9.08526023432446e-06, + "loss": 0.3641, + "step": 1848 + }, + { + "epoch": 0.4400547390968049, + "grad_norm": 0.38810442228840714, + "learning_rate": 9.084148549632547e-06, + "loss": 0.3291, + "step": 1849 + }, + { + "epoch": 0.4402927351698697, + "grad_norm": 0.4074211523419894, + "learning_rate": 9.083036257935125e-06, + "loss": 0.3818, + "step": 1850 + }, + { + "epoch": 0.4405307312429345, + "grad_norm": 0.3979947415081816, + "learning_rate": 9.081923359397504e-06, + "loss": 0.4428, + "step": 1851 + }, + { + "epoch": 0.4407687273159993, + "grad_norm": 0.4180778911918767, + "learning_rate": 9.080809854185091e-06, + "loss": 0.3094, + "step": 1852 + }, + { + "epoch": 0.4410067233890641, + "grad_norm": 0.44372997553941645, + "learning_rate": 9.07969574246338e-06, + "loss": 0.3535, + "step": 1853 + }, + { + "epoch": 0.4412447194621289, + "grad_norm": 0.41225588433239624, + "learning_rate": 9.078581024397952e-06, + "loss": 0.4067, + "step": 1854 + }, + { + "epoch": 0.4414827155351937, + "grad_norm": 0.4001859550814012, + "learning_rate": 9.077465700154487e-06, + "loss": 0.433, + "step": 1855 + }, + { + "epoch": 0.4417207116082585, + "grad_norm": 0.4167341678602161, + "learning_rate": 9.076349769898746e-06, + "loss": 0.3567, + "step": 1856 + }, + { + "epoch": 0.4419587076813233, + "grad_norm": 0.40392402299201724, + "learning_rate": 9.075233233796585e-06, + "loss": 0.3763, + "step": 1857 + }, + { + "epoch": 0.4421967037543881, + "grad_norm": 0.362779786516785, + "learning_rate": 9.074116092013952e-06, + "loss": 0.3903, + "step": 1858 + }, + { + "epoch": 0.44243469982745287, + "grad_norm": 0.3781623110186848, + "learning_rate": 9.072998344716875e-06, + "loss": 0.3343, + "step": 1859 + }, + { + "epoch": 0.44267269590051767, + "grad_norm": 0.38836103267593736, + "learning_rate": 9.071879992071484e-06, + "loss": 0.3055, + "step": 1860 + }, + { + "epoch": 0.44291069197358246, + "grad_norm": 0.37564508597971813, + "learning_rate": 9.070761034243995e-06, + "loss": 0.4162, + "step": 1861 + }, + { + "epoch": 0.44314868804664725, + "grad_norm": 0.381826072613129, + "learning_rate": 9.069641471400707e-06, + "loss": 0.3939, + "step": 1862 + }, + { + "epoch": 0.44338668411971205, + "grad_norm": 0.41067999322559867, + "learning_rate": 9.06852130370802e-06, + "loss": 0.314, + "step": 1863 + }, + { + "epoch": 0.44362468019277684, + "grad_norm": 0.4382314242050576, + "learning_rate": 9.067400531332418e-06, + "loss": 0.3397, + "step": 1864 + }, + { + "epoch": 0.44386267626584164, + "grad_norm": 0.3671784649407471, + "learning_rate": 9.066279154440474e-06, + "loss": 0.4134, + "step": 1865 + }, + { + "epoch": 0.44410067233890643, + "grad_norm": 0.38775612781990293, + "learning_rate": 9.065157173198852e-06, + "loss": 0.3365, + "step": 1866 + }, + { + "epoch": 0.4443386684119712, + "grad_norm": 0.399921745750612, + "learning_rate": 9.064034587774307e-06, + "loss": 0.3262, + "step": 1867 + }, + { + "epoch": 0.444576664485036, + "grad_norm": 0.3883120453391914, + "learning_rate": 9.062911398333682e-06, + "loss": 0.3783, + "step": 1868 + }, + { + "epoch": 0.4448146605581008, + "grad_norm": 0.42591617134823834, + "learning_rate": 9.061787605043913e-06, + "loss": 0.4329, + "step": 1869 + }, + { + "epoch": 0.4450526566311656, + "grad_norm": 0.4145202625576308, + "learning_rate": 9.060663208072022e-06, + "loss": 0.319, + "step": 1870 + }, + { + "epoch": 0.4452906527042304, + "grad_norm": 0.4256230711123469, + "learning_rate": 9.059538207585123e-06, + "loss": 0.3498, + "step": 1871 + }, + { + "epoch": 0.4455286487772952, + "grad_norm": 0.36200201893045636, + "learning_rate": 9.058412603750417e-06, + "loss": 0.4051, + "step": 1872 + }, + { + "epoch": 0.44576664485036, + "grad_norm": 0.4088077633298198, + "learning_rate": 9.057286396735198e-06, + "loss": 0.352, + "step": 1873 + }, + { + "epoch": 0.4460046409234248, + "grad_norm": 0.411767781342609, + "learning_rate": 9.056159586706847e-06, + "loss": 0.3159, + "step": 1874 + }, + { + "epoch": 0.4462426369964896, + "grad_norm": 0.38770515767105623, + "learning_rate": 9.055032173832838e-06, + "loss": 0.3435, + "step": 1875 + }, + { + "epoch": 0.4464806330695544, + "grad_norm": 0.4063020777640675, + "learning_rate": 9.053904158280731e-06, + "loss": 0.448, + "step": 1876 + }, + { + "epoch": 0.4467186291426192, + "grad_norm": 0.4108973315393291, + "learning_rate": 9.052775540218178e-06, + "loss": 0.3257, + "step": 1877 + }, + { + "epoch": 0.44695662521568397, + "grad_norm": 0.3962899590466375, + "learning_rate": 9.05164631981292e-06, + "loss": 0.3428, + "step": 1878 + }, + { + "epoch": 0.44719462128874876, + "grad_norm": 0.39154457859950453, + "learning_rate": 9.050516497232783e-06, + "loss": 0.4113, + "step": 1879 + }, + { + "epoch": 0.44743261736181356, + "grad_norm": 0.40521710300411384, + "learning_rate": 9.049386072645691e-06, + "loss": 0.3697, + "step": 1880 + }, + { + "epoch": 0.44767061343487835, + "grad_norm": 0.41448812938566526, + "learning_rate": 9.048255046219652e-06, + "loss": 0.3128, + "step": 1881 + }, + { + "epoch": 0.44790860950794315, + "grad_norm": 0.3732301223230006, + "learning_rate": 9.047123418122762e-06, + "loss": 0.3486, + "step": 1882 + }, + { + "epoch": 0.44814660558100794, + "grad_norm": 0.37645904480775877, + "learning_rate": 9.045991188523213e-06, + "loss": 0.4007, + "step": 1883 + }, + { + "epoch": 0.44838460165407273, + "grad_norm": 0.5589491366235092, + "learning_rate": 9.044858357589281e-06, + "loss": 0.3507, + "step": 1884 + }, + { + "epoch": 0.44862259772713753, + "grad_norm": 0.41677787585285414, + "learning_rate": 9.043724925489332e-06, + "loss": 0.3181, + "step": 1885 + }, + { + "epoch": 0.4488605938002023, + "grad_norm": 0.41414556643068295, + "learning_rate": 9.04259089239182e-06, + "loss": 0.4107, + "step": 1886 + }, + { + "epoch": 0.4490985898732671, + "grad_norm": 0.43327968208641654, + "learning_rate": 9.041456258465295e-06, + "loss": 0.4257, + "step": 1887 + }, + { + "epoch": 0.4493365859463319, + "grad_norm": 0.4245883014008134, + "learning_rate": 9.040321023878387e-06, + "loss": 0.3388, + "step": 1888 + }, + { + "epoch": 0.4495745820193967, + "grad_norm": 0.4421642723801354, + "learning_rate": 9.039185188799824e-06, + "loss": 0.3743, + "step": 1889 + }, + { + "epoch": 0.4498125780924615, + "grad_norm": 0.41567634181571017, + "learning_rate": 9.038048753398417e-06, + "loss": 0.4091, + "step": 1890 + }, + { + "epoch": 0.4500505741655263, + "grad_norm": 0.4133338214658035, + "learning_rate": 9.036911717843067e-06, + "loss": 0.3367, + "step": 1891 + }, + { + "epoch": 0.4502885702385911, + "grad_norm": 0.41679245676442966, + "learning_rate": 9.035774082302769e-06, + "loss": 0.3222, + "step": 1892 + }, + { + "epoch": 0.4505265663116559, + "grad_norm": 0.38537142529458723, + "learning_rate": 9.034635846946603e-06, + "loss": 0.3554, + "step": 1893 + }, + { + "epoch": 0.4507645623847207, + "grad_norm": 0.4135663142912014, + "learning_rate": 9.033497011943735e-06, + "loss": 0.4212, + "step": 1894 + }, + { + "epoch": 0.4510025584577855, + "grad_norm": 0.39185654824504884, + "learning_rate": 9.032357577463429e-06, + "loss": 0.3207, + "step": 1895 + }, + { + "epoch": 0.45124055453085027, + "grad_norm": 0.39385062485453165, + "learning_rate": 9.031217543675032e-06, + "loss": 0.33, + "step": 1896 + }, + { + "epoch": 0.45147855060391506, + "grad_norm": 0.41350024044695805, + "learning_rate": 9.03007691074798e-06, + "loss": 0.395, + "step": 1897 + }, + { + "epoch": 0.4517165466769798, + "grad_norm": 0.4048615581588889, + "learning_rate": 9.028935678851798e-06, + "loss": 0.3942, + "step": 1898 + }, + { + "epoch": 0.4519545427500446, + "grad_norm": 0.43192810625467914, + "learning_rate": 9.027793848156106e-06, + "loss": 0.2972, + "step": 1899 + }, + { + "epoch": 0.4521925388231094, + "grad_norm": 0.3638638789244606, + "learning_rate": 9.026651418830603e-06, + "loss": 0.3737, + "step": 1900 + }, + { + "epoch": 0.4524305348961742, + "grad_norm": 0.4178434063849281, + "learning_rate": 9.025508391045087e-06, + "loss": 0.4463, + "step": 1901 + }, + { + "epoch": 0.452668530969239, + "grad_norm": 0.43266874725956067, + "learning_rate": 9.024364764969435e-06, + "loss": 0.354, + "step": 1902 + }, + { + "epoch": 0.4529065270423038, + "grad_norm": 0.4386170116690222, + "learning_rate": 9.023220540773621e-06, + "loss": 0.3436, + "step": 1903 + }, + { + "epoch": 0.45314452311536857, + "grad_norm": 0.4154738684824685, + "learning_rate": 9.022075718627707e-06, + "loss": 0.3934, + "step": 1904 + }, + { + "epoch": 0.45338251918843336, + "grad_norm": 0.4336014698946849, + "learning_rate": 9.02093029870184e-06, + "loss": 0.376, + "step": 1905 + }, + { + "epoch": 0.45362051526149816, + "grad_norm": 0.4496211440899709, + "learning_rate": 9.019784281166255e-06, + "loss": 0.3017, + "step": 1906 + }, + { + "epoch": 0.45385851133456295, + "grad_norm": 0.438757831396024, + "learning_rate": 9.018637666191284e-06, + "loss": 0.3907, + "step": 1907 + }, + { + "epoch": 0.45409650740762775, + "grad_norm": 0.38217794195897165, + "learning_rate": 9.017490453947337e-06, + "loss": 0.4368, + "step": 1908 + }, + { + "epoch": 0.45433450348069254, + "grad_norm": 0.4045541745421685, + "learning_rate": 9.016342644604923e-06, + "loss": 0.3241, + "step": 1909 + }, + { + "epoch": 0.45457249955375734, + "grad_norm": 0.4749105178045131, + "learning_rate": 9.01519423833463e-06, + "loss": 0.3236, + "step": 1910 + }, + { + "epoch": 0.45481049562682213, + "grad_norm": 0.39441695061972637, + "learning_rate": 9.014045235307144e-06, + "loss": 0.3832, + "step": 1911 + }, + { + "epoch": 0.4550484916998869, + "grad_norm": 0.4006478226656663, + "learning_rate": 9.012895635693232e-06, + "loss": 0.3885, + "step": 1912 + }, + { + "epoch": 0.4552864877729517, + "grad_norm": 0.42235636070301646, + "learning_rate": 9.011745439663756e-06, + "loss": 0.3053, + "step": 1913 + }, + { + "epoch": 0.4555244838460165, + "grad_norm": 0.4330833366027642, + "learning_rate": 9.010594647389662e-06, + "loss": 0.3426, + "step": 1914 + }, + { + "epoch": 0.4557624799190813, + "grad_norm": 0.3935945827408916, + "learning_rate": 9.009443259041984e-06, + "loss": 0.438, + "step": 1915 + }, + { + "epoch": 0.4560004759921461, + "grad_norm": 0.4573862811142753, + "learning_rate": 9.008291274791849e-06, + "loss": 0.3511, + "step": 1916 + }, + { + "epoch": 0.4562384720652109, + "grad_norm": 0.4191822242792251, + "learning_rate": 9.00713869481047e-06, + "loss": 0.3424, + "step": 1917 + }, + { + "epoch": 0.4564764681382757, + "grad_norm": 0.3714154576618009, + "learning_rate": 9.005985519269151e-06, + "loss": 0.3758, + "step": 1918 + }, + { + "epoch": 0.4567144642113405, + "grad_norm": 0.3968866803688529, + "learning_rate": 9.00483174833928e-06, + "loss": 0.4399, + "step": 1919 + }, + { + "epoch": 0.4569524602844053, + "grad_norm": 0.4381863859987972, + "learning_rate": 9.003677382192337e-06, + "loss": 0.3317, + "step": 1920 + }, + { + "epoch": 0.4571904563574701, + "grad_norm": 0.3668542092189725, + "learning_rate": 9.002522420999887e-06, + "loss": 0.3302, + "step": 1921 + }, + { + "epoch": 0.45742845243053487, + "grad_norm": 0.40073715252923153, + "learning_rate": 9.00136686493359e-06, + "loss": 0.3946, + "step": 1922 + }, + { + "epoch": 0.45766644850359967, + "grad_norm": 0.422377185704182, + "learning_rate": 9.000210714165185e-06, + "loss": 0.3721, + "step": 1923 + }, + { + "epoch": 0.45790444457666446, + "grad_norm": 0.39892724787175154, + "learning_rate": 8.999053968866509e-06, + "loss": 0.3285, + "step": 1924 + }, + { + "epoch": 0.45814244064972925, + "grad_norm": 0.39420537454241245, + "learning_rate": 8.997896629209482e-06, + "loss": 0.3954, + "step": 1925 + }, + { + "epoch": 0.45838043672279405, + "grad_norm": 0.3875725349434963, + "learning_rate": 8.996738695366111e-06, + "loss": 0.4209, + "step": 1926 + }, + { + "epoch": 0.45861843279585884, + "grad_norm": 0.4320566083850361, + "learning_rate": 8.995580167508495e-06, + "loss": 0.3111, + "step": 1927 + }, + { + "epoch": 0.45885642886892364, + "grad_norm": 0.39786821137659234, + "learning_rate": 8.994421045808821e-06, + "loss": 0.3344, + "step": 1928 + }, + { + "epoch": 0.45909442494198843, + "grad_norm": 0.3864397258868153, + "learning_rate": 8.993261330439365e-06, + "loss": 0.4114, + "step": 1929 + }, + { + "epoch": 0.4593324210150532, + "grad_norm": 0.44366727708416975, + "learning_rate": 8.992101021572483e-06, + "loss": 0.3847, + "step": 1930 + }, + { + "epoch": 0.459570417088118, + "grad_norm": 0.42879150308085556, + "learning_rate": 8.990940119380632e-06, + "loss": 0.3251, + "step": 1931 + }, + { + "epoch": 0.4598084131611828, + "grad_norm": 0.4013669319373018, + "learning_rate": 8.989778624036346e-06, + "loss": 0.3751, + "step": 1932 + }, + { + "epoch": 0.4600464092342476, + "grad_norm": 0.38329016484024186, + "learning_rate": 8.988616535712255e-06, + "loss": 0.4322, + "step": 1933 + }, + { + "epoch": 0.4602844053073124, + "grad_norm": 0.4593810052561611, + "learning_rate": 8.987453854581074e-06, + "loss": 0.351, + "step": 1934 + }, + { + "epoch": 0.4605224013803772, + "grad_norm": 0.4290257106099543, + "learning_rate": 8.986290580815605e-06, + "loss": 0.2975, + "step": 1935 + }, + { + "epoch": 0.460760397453442, + "grad_norm": 0.38235899911292076, + "learning_rate": 8.985126714588739e-06, + "loss": 0.4136, + "step": 1936 + }, + { + "epoch": 0.4609983935265068, + "grad_norm": 0.38940541074163637, + "learning_rate": 8.983962256073457e-06, + "loss": 0.4249, + "step": 1937 + }, + { + "epoch": 0.4612363895995716, + "grad_norm": 0.3599022107051358, + "learning_rate": 8.982797205442823e-06, + "loss": 0.3004, + "step": 1938 + }, + { + "epoch": 0.4614743856726364, + "grad_norm": 0.3859999702393283, + "learning_rate": 8.981631562869997e-06, + "loss": 0.3493, + "step": 1939 + }, + { + "epoch": 0.4617123817457012, + "grad_norm": 0.38132124586744975, + "learning_rate": 8.98046532852822e-06, + "loss": 0.4082, + "step": 1940 + }, + { + "epoch": 0.46195037781876597, + "grad_norm": 0.40954534803570225, + "learning_rate": 8.979298502590821e-06, + "loss": 0.3792, + "step": 1941 + }, + { + "epoch": 0.46218837389183076, + "grad_norm": 0.3763058419790421, + "learning_rate": 8.978131085231223e-06, + "loss": 0.314, + "step": 1942 + }, + { + "epoch": 0.46242636996489556, + "grad_norm": 0.3801916014322026, + "learning_rate": 8.976963076622932e-06, + "loss": 0.3608, + "step": 1943 + }, + { + "epoch": 0.46266436603796035, + "grad_norm": 0.3754880817697566, + "learning_rate": 8.975794476939541e-06, + "loss": 0.4133, + "step": 1944 + }, + { + "epoch": 0.46290236211102515, + "grad_norm": 0.4458270346361261, + "learning_rate": 8.974625286354735e-06, + "loss": 0.3359, + "step": 1945 + }, + { + "epoch": 0.46314035818408994, + "grad_norm": 0.40596951932507613, + "learning_rate": 8.973455505042285e-06, + "loss": 0.36, + "step": 1946 + }, + { + "epoch": 0.46337835425715473, + "grad_norm": 0.3777591671479931, + "learning_rate": 8.972285133176047e-06, + "loss": 0.4053, + "step": 1947 + }, + { + "epoch": 0.46361635033021953, + "grad_norm": 0.381883104188769, + "learning_rate": 8.97111417092997e-06, + "loss": 0.3788, + "step": 1948 + }, + { + "epoch": 0.4638543464032843, + "grad_norm": 0.41602474134267337, + "learning_rate": 8.969942618478085e-06, + "loss": 0.3184, + "step": 1949 + }, + { + "epoch": 0.4640923424763491, + "grad_norm": 0.3838946466607663, + "learning_rate": 8.968770475994514e-06, + "loss": 0.3573, + "step": 1950 + }, + { + "epoch": 0.4643303385494139, + "grad_norm": 0.3860312821497556, + "learning_rate": 8.967597743653471e-06, + "loss": 0.4234, + "step": 1951 + }, + { + "epoch": 0.4645683346224787, + "grad_norm": 0.36977923888724, + "learning_rate": 8.966424421629247e-06, + "loss": 0.3242, + "step": 1952 + }, + { + "epoch": 0.4648063306955435, + "grad_norm": 0.3937693643527784, + "learning_rate": 8.965250510096231e-06, + "loss": 0.3117, + "step": 1953 + }, + { + "epoch": 0.4650443267686083, + "grad_norm": 0.38742510667851454, + "learning_rate": 8.964076009228892e-06, + "loss": 0.3899, + "step": 1954 + }, + { + "epoch": 0.4652823228416731, + "grad_norm": 0.381510778099069, + "learning_rate": 8.962900919201793e-06, + "loss": 0.363, + "step": 1955 + }, + { + "epoch": 0.4655203189147379, + "grad_norm": 0.4266911809851305, + "learning_rate": 8.96172524018958e-06, + "loss": 0.3424, + "step": 1956 + }, + { + "epoch": 0.4657583149878027, + "grad_norm": 0.39273456552864333, + "learning_rate": 8.960548972366987e-06, + "loss": 0.3623, + "step": 1957 + }, + { + "epoch": 0.4659963110608675, + "grad_norm": 0.39550307752848174, + "learning_rate": 8.959372115908838e-06, + "loss": 0.4073, + "step": 1958 + }, + { + "epoch": 0.46623430713393227, + "grad_norm": 0.3939424995482473, + "learning_rate": 8.958194670990043e-06, + "loss": 0.3466, + "step": 1959 + }, + { + "epoch": 0.46647230320699706, + "grad_norm": 0.4163291327720836, + "learning_rate": 8.957016637785599e-06, + "loss": 0.3167, + "step": 1960 + }, + { + "epoch": 0.46671029928006186, + "grad_norm": 0.3744209884685097, + "learning_rate": 8.95583801647059e-06, + "loss": 0.3617, + "step": 1961 + }, + { + "epoch": 0.46694829535312665, + "grad_norm": 0.36438599515039355, + "learning_rate": 8.954658807220189e-06, + "loss": 0.3979, + "step": 1962 + }, + { + "epoch": 0.46718629142619145, + "grad_norm": 0.36878304664624034, + "learning_rate": 8.953479010209655e-06, + "loss": 0.3402, + "step": 1963 + }, + { + "epoch": 0.46742428749925624, + "grad_norm": 0.38784861609124194, + "learning_rate": 8.952298625614335e-06, + "loss": 0.3465, + "step": 1964 + }, + { + "epoch": 0.46766228357232104, + "grad_norm": 0.36193075171069583, + "learning_rate": 8.951117653609666e-06, + "loss": 0.4162, + "step": 1965 + }, + { + "epoch": 0.46790027964538583, + "grad_norm": 0.4273533447324452, + "learning_rate": 8.949936094371168e-06, + "loss": 0.357, + "step": 1966 + }, + { + "epoch": 0.4681382757184506, + "grad_norm": 0.3748626683477407, + "learning_rate": 8.948753948074448e-06, + "loss": 0.3286, + "step": 1967 + }, + { + "epoch": 0.4683762717915154, + "grad_norm": 0.3615946348473227, + "learning_rate": 8.947571214895206e-06, + "loss": 0.374, + "step": 1968 + }, + { + "epoch": 0.4686142678645802, + "grad_norm": 0.40199712164294776, + "learning_rate": 8.946387895009221e-06, + "loss": 0.4383, + "step": 1969 + }, + { + "epoch": 0.468852263937645, + "grad_norm": 0.40189899947360347, + "learning_rate": 8.945203988592368e-06, + "loss": 0.3137, + "step": 1970 + }, + { + "epoch": 0.4690902600107098, + "grad_norm": 0.42304082101866364, + "learning_rate": 8.944019495820602e-06, + "loss": 0.3607, + "step": 1971 + }, + { + "epoch": 0.4693282560837746, + "grad_norm": 0.3915717145865347, + "learning_rate": 8.942834416869967e-06, + "loss": 0.4186, + "step": 1972 + }, + { + "epoch": 0.4695662521568394, + "grad_norm": 0.3699303309540853, + "learning_rate": 8.941648751916598e-06, + "loss": 0.3479, + "step": 1973 + }, + { + "epoch": 0.4698042482299042, + "grad_norm": 0.3931300817695143, + "learning_rate": 8.940462501136712e-06, + "loss": 0.3416, + "step": 1974 + }, + { + "epoch": 0.470042244302969, + "grad_norm": 0.4174108723423135, + "learning_rate": 8.939275664706618e-06, + "loss": 0.3809, + "step": 1975 + }, + { + "epoch": 0.4702802403760338, + "grad_norm": 0.4017321399357895, + "learning_rate": 8.938088242802705e-06, + "loss": 0.3999, + "step": 1976 + }, + { + "epoch": 0.47051823644909857, + "grad_norm": 0.3796367988353256, + "learning_rate": 8.936900235601456e-06, + "loss": 0.3103, + "step": 1977 + }, + { + "epoch": 0.47075623252216336, + "grad_norm": 0.4341261574035814, + "learning_rate": 8.93571164327944e-06, + "loss": 0.3357, + "step": 1978 + }, + { + "epoch": 0.47099422859522816, + "grad_norm": 0.4070316455404249, + "learning_rate": 8.934522466013305e-06, + "loss": 0.3898, + "step": 1979 + }, + { + "epoch": 0.47123222466829295, + "grad_norm": 0.3974826629955441, + "learning_rate": 8.933332703979798e-06, + "loss": 0.3888, + "step": 1980 + }, + { + "epoch": 0.47147022074135775, + "grad_norm": 0.4184039008610896, + "learning_rate": 8.932142357355747e-06, + "loss": 0.3078, + "step": 1981 + }, + { + "epoch": 0.47170821681442254, + "grad_norm": 0.41469276127770316, + "learning_rate": 8.930951426318061e-06, + "loss": 0.346, + "step": 1982 + }, + { + "epoch": 0.47194621288748734, + "grad_norm": 0.3679140205317907, + "learning_rate": 8.929759911043749e-06, + "loss": 0.4249, + "step": 1983 + }, + { + "epoch": 0.47218420896055213, + "grad_norm": 0.38233800356637293, + "learning_rate": 8.928567811709897e-06, + "loss": 0.3664, + "step": 1984 + }, + { + "epoch": 0.4724222050336169, + "grad_norm": 0.3860957949442366, + "learning_rate": 8.927375128493679e-06, + "loss": 0.3278, + "step": 1985 + }, + { + "epoch": 0.4726602011066817, + "grad_norm": 0.42016968648143477, + "learning_rate": 8.92618186157236e-06, + "loss": 0.4028, + "step": 1986 + }, + { + "epoch": 0.4728981971797465, + "grad_norm": 0.3932672809367911, + "learning_rate": 8.924988011123286e-06, + "loss": 0.4194, + "step": 1987 + }, + { + "epoch": 0.4731361932528113, + "grad_norm": 0.3914761416304697, + "learning_rate": 8.923793577323894e-06, + "loss": 0.289, + "step": 1988 + }, + { + "epoch": 0.4733741893258761, + "grad_norm": 0.41886930971894176, + "learning_rate": 8.922598560351705e-06, + "loss": 0.3386, + "step": 1989 + }, + { + "epoch": 0.4736121853989409, + "grad_norm": 0.40054887442025705, + "learning_rate": 8.92140296038433e-06, + "loss": 0.4222, + "step": 1990 + }, + { + "epoch": 0.4738501814720057, + "grad_norm": 0.37901621734635876, + "learning_rate": 8.920206777599467e-06, + "loss": 0.3587, + "step": 1991 + }, + { + "epoch": 0.4740881775450705, + "grad_norm": 0.37289856531710025, + "learning_rate": 8.919010012174894e-06, + "loss": 0.3589, + "step": 1992 + }, + { + "epoch": 0.4743261736181353, + "grad_norm": 0.37673235179772124, + "learning_rate": 8.917812664288481e-06, + "loss": 0.3493, + "step": 1993 + }, + { + "epoch": 0.4745641696912001, + "grad_norm": 0.40946971737432014, + "learning_rate": 8.916614734118184e-06, + "loss": 0.4314, + "step": 1994 + }, + { + "epoch": 0.47480216576426487, + "grad_norm": 0.3663866235640273, + "learning_rate": 8.915416221842045e-06, + "loss": 0.3188, + "step": 1995 + }, + { + "epoch": 0.47504016183732967, + "grad_norm": 0.3729349793322217, + "learning_rate": 8.914217127638194e-06, + "loss": 0.33, + "step": 1996 + }, + { + "epoch": 0.47527815791039446, + "grad_norm": 0.37819004145885465, + "learning_rate": 8.913017451684845e-06, + "loss": 0.3864, + "step": 1997 + }, + { + "epoch": 0.47551615398345926, + "grad_norm": 0.3816038357588018, + "learning_rate": 8.911817194160297e-06, + "loss": 0.3707, + "step": 1998 + }, + { + "epoch": 0.47575415005652405, + "grad_norm": 0.381594564688525, + "learning_rate": 8.910616355242943e-06, + "loss": 0.2955, + "step": 1999 + }, + { + "epoch": 0.47599214612958884, + "grad_norm": 0.4144362212607731, + "learning_rate": 8.909414935111251e-06, + "loss": 0.3695, + "step": 2000 + }, + { + "epoch": 0.47623014220265364, + "grad_norm": 0.40541513918237165, + "learning_rate": 8.908212933943788e-06, + "loss": 0.4287, + "step": 2001 + }, + { + "epoch": 0.47646813827571843, + "grad_norm": 0.4010091276525552, + "learning_rate": 8.907010351919198e-06, + "loss": 0.352, + "step": 2002 + }, + { + "epoch": 0.47670613434878323, + "grad_norm": 0.37436400202401043, + "learning_rate": 8.905807189216216e-06, + "loss": 0.3316, + "step": 2003 + }, + { + "epoch": 0.476944130421848, + "grad_norm": 0.3610601587615899, + "learning_rate": 8.90460344601366e-06, + "loss": 0.3977, + "step": 2004 + }, + { + "epoch": 0.4771821264949128, + "grad_norm": 0.40390327913148766, + "learning_rate": 8.903399122490436e-06, + "loss": 0.4071, + "step": 2005 + }, + { + "epoch": 0.4774201225679776, + "grad_norm": 0.39707419630426943, + "learning_rate": 8.902194218825537e-06, + "loss": 0.3261, + "step": 2006 + }, + { + "epoch": 0.4776581186410424, + "grad_norm": 0.4064601483285377, + "learning_rate": 8.900988735198043e-06, + "loss": 0.373, + "step": 2007 + }, + { + "epoch": 0.4778961147141072, + "grad_norm": 0.3921311614892764, + "learning_rate": 8.899782671787114e-06, + "loss": 0.402, + "step": 2008 + }, + { + "epoch": 0.478134110787172, + "grad_norm": 0.38829304720202296, + "learning_rate": 8.898576028772006e-06, + "loss": 0.3733, + "step": 2009 + }, + { + "epoch": 0.4783721068602368, + "grad_norm": 0.423689833869032, + "learning_rate": 8.897368806332053e-06, + "loss": 0.3363, + "step": 2010 + }, + { + "epoch": 0.4786101029333016, + "grad_norm": 0.40741104435951725, + "learning_rate": 8.896161004646682e-06, + "loss": 0.3923, + "step": 2011 + }, + { + "epoch": 0.4788480990063664, + "grad_norm": 0.38648441274354406, + "learning_rate": 8.894952623895396e-06, + "loss": 0.4054, + "step": 2012 + }, + { + "epoch": 0.4790860950794312, + "grad_norm": 0.3899353786856898, + "learning_rate": 8.893743664257796e-06, + "loss": 0.3402, + "step": 2013 + }, + { + "epoch": 0.47932409115249597, + "grad_norm": 0.37220325831387135, + "learning_rate": 8.892534125913558e-06, + "loss": 0.3757, + "step": 2014 + }, + { + "epoch": 0.47956208722556076, + "grad_norm": 0.37816661979461397, + "learning_rate": 8.891324009042456e-06, + "loss": 0.4359, + "step": 2015 + }, + { + "epoch": 0.47980008329862556, + "grad_norm": 0.3787006467971716, + "learning_rate": 8.890113313824339e-06, + "loss": 0.366, + "step": 2016 + }, + { + "epoch": 0.48003807937169035, + "grad_norm": 0.36458447262183397, + "learning_rate": 8.888902040439145e-06, + "loss": 0.3146, + "step": 2017 + }, + { + "epoch": 0.48027607544475515, + "grad_norm": 0.39938985399119364, + "learning_rate": 8.887690189066899e-06, + "loss": 0.3725, + "step": 2018 + }, + { + "epoch": 0.48051407151781994, + "grad_norm": 0.3591918893002723, + "learning_rate": 8.886477759887717e-06, + "loss": 0.3943, + "step": 2019 + }, + { + "epoch": 0.48075206759088474, + "grad_norm": 0.36999609920363924, + "learning_rate": 8.885264753081794e-06, + "loss": 0.3297, + "step": 2020 + }, + { + "epoch": 0.48099006366394953, + "grad_norm": 0.407098460962921, + "learning_rate": 8.884051168829409e-06, + "loss": 0.3605, + "step": 2021 + }, + { + "epoch": 0.4812280597370143, + "grad_norm": 0.3847503739932104, + "learning_rate": 8.882837007310936e-06, + "loss": 0.4087, + "step": 2022 + }, + { + "epoch": 0.4814660558100791, + "grad_norm": 0.3953664392850304, + "learning_rate": 8.881622268706825e-06, + "loss": 0.3865, + "step": 2023 + }, + { + "epoch": 0.4817040518831439, + "grad_norm": 0.4139311567463376, + "learning_rate": 8.88040695319762e-06, + "loss": 0.3024, + "step": 2024 + }, + { + "epoch": 0.4819420479562087, + "grad_norm": 0.3915339203989994, + "learning_rate": 8.879191060963943e-06, + "loss": 0.3784, + "step": 2025 + }, + { + "epoch": 0.4821800440292735, + "grad_norm": 0.3600119497807205, + "learning_rate": 8.87797459218651e-06, + "loss": 0.4668, + "step": 2026 + }, + { + "epoch": 0.4824180401023383, + "grad_norm": 0.4115620298847174, + "learning_rate": 8.876757547046116e-06, + "loss": 0.3102, + "step": 2027 + }, + { + "epoch": 0.4826560361754031, + "grad_norm": 0.4194742305434396, + "learning_rate": 8.875539925723641e-06, + "loss": 0.3474, + "step": 2028 + }, + { + "epoch": 0.4828940322484679, + "grad_norm": 0.37165614518858103, + "learning_rate": 8.874321728400059e-06, + "loss": 0.3922, + "step": 2029 + }, + { + "epoch": 0.4831320283215327, + "grad_norm": 0.37833468745165144, + "learning_rate": 8.873102955256423e-06, + "loss": 0.3875, + "step": 2030 + }, + { + "epoch": 0.4833700243945975, + "grad_norm": 0.4131846841190066, + "learning_rate": 8.871883606473871e-06, + "loss": 0.3283, + "step": 2031 + }, + { + "epoch": 0.48360802046766227, + "grad_norm": 0.38823207775132323, + "learning_rate": 8.87066368223363e-06, + "loss": 0.3778, + "step": 2032 + }, + { + "epoch": 0.48384601654072706, + "grad_norm": 0.3691014202938941, + "learning_rate": 8.869443182717009e-06, + "loss": 0.4339, + "step": 2033 + }, + { + "epoch": 0.48408401261379186, + "grad_norm": 0.39215841748503116, + "learning_rate": 8.868222108105407e-06, + "loss": 0.3159, + "step": 2034 + }, + { + "epoch": 0.48432200868685665, + "grad_norm": 0.3882981946824124, + "learning_rate": 8.867000458580302e-06, + "loss": 0.3129, + "step": 2035 + }, + { + "epoch": 0.48456000475992145, + "grad_norm": 0.3918438336109168, + "learning_rate": 8.865778234323266e-06, + "loss": 0.4061, + "step": 2036 + }, + { + "epoch": 0.48479800083298624, + "grad_norm": 0.36841655891359437, + "learning_rate": 8.864555435515949e-06, + "loss": 0.408, + "step": 2037 + }, + { + "epoch": 0.48503599690605104, + "grad_norm": 0.37655472476118107, + "learning_rate": 8.863332062340091e-06, + "loss": 0.297, + "step": 2038 + }, + { + "epoch": 0.48527399297911583, + "grad_norm": 0.3798702816409488, + "learning_rate": 8.862108114977512e-06, + "loss": 0.3575, + "step": 2039 + }, + { + "epoch": 0.4855119890521806, + "grad_norm": 0.36545896885427936, + "learning_rate": 8.860883593610126e-06, + "loss": 0.4035, + "step": 2040 + }, + { + "epoch": 0.4857499851252454, + "grad_norm": 0.35102870721288, + "learning_rate": 8.859658498419922e-06, + "loss": 0.3564, + "step": 2041 + }, + { + "epoch": 0.4859879811983102, + "grad_norm": 0.3576865709579289, + "learning_rate": 8.858432829588984e-06, + "loss": 0.296, + "step": 2042 + }, + { + "epoch": 0.486225977271375, + "grad_norm": 0.36034938518371656, + "learning_rate": 8.857206587299471e-06, + "loss": 0.3711, + "step": 2043 + }, + { + "epoch": 0.4864639733444398, + "grad_norm": 0.4065171435929195, + "learning_rate": 8.85597977173364e-06, + "loss": 0.4387, + "step": 2044 + }, + { + "epoch": 0.4867019694175046, + "grad_norm": 0.3834118830811794, + "learning_rate": 8.85475238307382e-06, + "loss": 0.2858, + "step": 2045 + }, + { + "epoch": 0.4869399654905694, + "grad_norm": 0.3943529386040461, + "learning_rate": 8.853524421502436e-06, + "loss": 0.3485, + "step": 2046 + }, + { + "epoch": 0.4871779615636342, + "grad_norm": 0.4010452585395006, + "learning_rate": 8.852295887201988e-06, + "loss": 0.4022, + "step": 2047 + }, + { + "epoch": 0.487415957636699, + "grad_norm": 0.4092837500931114, + "learning_rate": 8.851066780355074e-06, + "loss": 0.3798, + "step": 2048 + }, + { + "epoch": 0.4876539537097638, + "grad_norm": 0.41343430994676683, + "learning_rate": 8.849837101144363e-06, + "loss": 0.3158, + "step": 2049 + }, + { + "epoch": 0.48789194978282857, + "grad_norm": 0.3737441299895588, + "learning_rate": 8.84860684975262e-06, + "loss": 0.3551, + "step": 2050 + }, + { + "epoch": 0.48812994585589337, + "grad_norm": 0.3745215140069202, + "learning_rate": 8.847376026362688e-06, + "loss": 0.4366, + "step": 2051 + }, + { + "epoch": 0.48836794192895816, + "grad_norm": 0.42064142614172584, + "learning_rate": 8.8461446311575e-06, + "loss": 0.3181, + "step": 2052 + }, + { + "epoch": 0.48860593800202295, + "grad_norm": 0.37945037835962714, + "learning_rate": 8.844912664320072e-06, + "loss": 0.3606, + "step": 2053 + }, + { + "epoch": 0.48884393407508775, + "grad_norm": 0.38167249606508846, + "learning_rate": 8.8436801260335e-06, + "loss": 0.3845, + "step": 2054 + }, + { + "epoch": 0.48908193014815254, + "grad_norm": 0.3746464998144936, + "learning_rate": 8.842447016480975e-06, + "loss": 0.3902, + "step": 2055 + }, + { + "epoch": 0.48931992622121734, + "grad_norm": 0.39703764026174426, + "learning_rate": 8.841213335845767e-06, + "loss": 0.3346, + "step": 2056 + }, + { + "epoch": 0.48955792229428213, + "grad_norm": 0.391301094421723, + "learning_rate": 8.839979084311228e-06, + "loss": 0.3585, + "step": 2057 + }, + { + "epoch": 0.4897959183673469, + "grad_norm": 0.4083430548241745, + "learning_rate": 8.8387442620608e-06, + "loss": 0.4185, + "step": 2058 + }, + { + "epoch": 0.4900339144404117, + "grad_norm": 0.37140532210327554, + "learning_rate": 8.837508869278011e-06, + "loss": 0.3295, + "step": 2059 + }, + { + "epoch": 0.4902719105134765, + "grad_norm": 0.43155618842343063, + "learning_rate": 8.836272906146467e-06, + "loss": 0.3387, + "step": 2060 + }, + { + "epoch": 0.4905099065865413, + "grad_norm": 0.3865957545475916, + "learning_rate": 8.835036372849867e-06, + "loss": 0.3753, + "step": 2061 + }, + { + "epoch": 0.4907479026596061, + "grad_norm": 0.38780511078052327, + "learning_rate": 8.833799269571985e-06, + "loss": 0.397, + "step": 2062 + }, + { + "epoch": 0.4909858987326709, + "grad_norm": 0.4141760815744359, + "learning_rate": 8.832561596496689e-06, + "loss": 0.3248, + "step": 2063 + }, + { + "epoch": 0.4912238948057357, + "grad_norm": 0.4008766368624388, + "learning_rate": 8.831323353807928e-06, + "loss": 0.342, + "step": 2064 + }, + { + "epoch": 0.4914618908788005, + "grad_norm": 0.39406895654952484, + "learning_rate": 8.830084541689731e-06, + "loss": 0.4475, + "step": 2065 + }, + { + "epoch": 0.4916998869518653, + "grad_norm": 0.399495170165796, + "learning_rate": 8.828845160326222e-06, + "loss": 0.3681, + "step": 2066 + }, + { + "epoch": 0.4919378830249301, + "grad_norm": 0.3868671762065268, + "learning_rate": 8.827605209901602e-06, + "loss": 0.3054, + "step": 2067 + }, + { + "epoch": 0.4921758790979949, + "grad_norm": 0.38625669927979583, + "learning_rate": 8.826364690600155e-06, + "loss": 0.3585, + "step": 2068 + }, + { + "epoch": 0.49241387517105967, + "grad_norm": 0.4117055914580924, + "learning_rate": 8.825123602606256e-06, + "loss": 0.4328, + "step": 2069 + }, + { + "epoch": 0.49265187124412446, + "grad_norm": 0.3711886777989222, + "learning_rate": 8.82388194610436e-06, + "loss": 0.3273, + "step": 2070 + }, + { + "epoch": 0.49288986731718926, + "grad_norm": 0.4050709073259558, + "learning_rate": 8.82263972127901e-06, + "loss": 0.341, + "step": 2071 + }, + { + "epoch": 0.49312786339025405, + "grad_norm": 0.39297282044656545, + "learning_rate": 8.82139692831483e-06, + "loss": 0.3942, + "step": 2072 + }, + { + "epoch": 0.49336585946331885, + "grad_norm": 0.4006082152861021, + "learning_rate": 8.820153567396528e-06, + "loss": 0.3502, + "step": 2073 + }, + { + "epoch": 0.49360385553638364, + "grad_norm": 0.4641634293550907, + "learning_rate": 8.818909638708901e-06, + "loss": 0.3107, + "step": 2074 + }, + { + "epoch": 0.49384185160944843, + "grad_norm": 0.3822372933111529, + "learning_rate": 8.817665142436826e-06, + "loss": 0.3832, + "step": 2075 + }, + { + "epoch": 0.49407984768251323, + "grad_norm": 0.36126502781069214, + "learning_rate": 8.816420078765267e-06, + "loss": 0.4281, + "step": 2076 + }, + { + "epoch": 0.494317843755578, + "grad_norm": 0.3805792912065209, + "learning_rate": 8.81517444787927e-06, + "loss": 0.326, + "step": 2077 + }, + { + "epoch": 0.4945558398286428, + "grad_norm": 0.3938968001852603, + "learning_rate": 8.813928249963967e-06, + "loss": 0.348, + "step": 2078 + }, + { + "epoch": 0.4947938359017076, + "grad_norm": 0.38229600062789804, + "learning_rate": 8.812681485204575e-06, + "loss": 0.4155, + "step": 2079 + }, + { + "epoch": 0.4950318319747724, + "grad_norm": 0.4196935982542361, + "learning_rate": 8.811434153786392e-06, + "loss": 0.377, + "step": 2080 + }, + { + "epoch": 0.4952698280478372, + "grad_norm": 0.4143016393191996, + "learning_rate": 8.810186255894804e-06, + "loss": 0.3329, + "step": 2081 + }, + { + "epoch": 0.495507824120902, + "grad_norm": 0.3843063609766728, + "learning_rate": 8.808937791715278e-06, + "loss": 0.3756, + "step": 2082 + }, + { + "epoch": 0.4957458201939668, + "grad_norm": 0.36885161283443524, + "learning_rate": 8.807688761433369e-06, + "loss": 0.433, + "step": 2083 + }, + { + "epoch": 0.4959838162670316, + "grad_norm": 0.36443450880109934, + "learning_rate": 8.806439165234711e-06, + "loss": 0.3567, + "step": 2084 + }, + { + "epoch": 0.4962218123400964, + "grad_norm": 0.38276313847917565, + "learning_rate": 8.805189003305026e-06, + "loss": 0.3166, + "step": 2085 + }, + { + "epoch": 0.4964598084131612, + "grad_norm": 0.38135336290107213, + "learning_rate": 8.803938275830122e-06, + "loss": 0.3852, + "step": 2086 + }, + { + "epoch": 0.49669780448622597, + "grad_norm": 0.3876532972271958, + "learning_rate": 8.802686982995882e-06, + "loss": 0.394, + "step": 2087 + }, + { + "epoch": 0.49693580055929076, + "grad_norm": 0.42237733267854094, + "learning_rate": 8.801435124988284e-06, + "loss": 0.3224, + "step": 2088 + }, + { + "epoch": 0.49717379663235556, + "grad_norm": 0.36357998941592223, + "learning_rate": 8.800182701993383e-06, + "loss": 0.3489, + "step": 2089 + }, + { + "epoch": 0.49741179270542035, + "grad_norm": 0.3776997755369579, + "learning_rate": 8.798929714197321e-06, + "loss": 0.4065, + "step": 2090 + }, + { + "epoch": 0.49764978877848515, + "grad_norm": 0.39974248449207966, + "learning_rate": 8.797676161786322e-06, + "loss": 0.3716, + "step": 2091 + }, + { + "epoch": 0.49788778485154994, + "grad_norm": 0.3512586392644621, + "learning_rate": 8.796422044946697e-06, + "loss": 0.2985, + "step": 2092 + }, + { + "epoch": 0.49812578092461474, + "grad_norm": 0.37530246376072357, + "learning_rate": 8.795167363864835e-06, + "loss": 0.3765, + "step": 2093 + }, + { + "epoch": 0.49836377699767953, + "grad_norm": 0.403359701606089, + "learning_rate": 8.793912118727214e-06, + "loss": 0.4233, + "step": 2094 + }, + { + "epoch": 0.4986017730707443, + "grad_norm": 0.39277782417073076, + "learning_rate": 8.792656309720398e-06, + "loss": 0.323, + "step": 2095 + }, + { + "epoch": 0.4988397691438091, + "grad_norm": 0.3901545968666047, + "learning_rate": 8.791399937031027e-06, + "loss": 0.3246, + "step": 2096 + }, + { + "epoch": 0.4990777652168739, + "grad_norm": 0.4108640260500558, + "learning_rate": 8.790143000845832e-06, + "loss": 0.4035, + "step": 2097 + }, + { + "epoch": 0.4993157612899387, + "grad_norm": 0.43483507105211894, + "learning_rate": 8.788885501351622e-06, + "loss": 0.3644, + "step": 2098 + }, + { + "epoch": 0.4995537573630035, + "grad_norm": 0.3989358061906033, + "learning_rate": 8.787627438735295e-06, + "loss": 0.3092, + "step": 2099 + }, + { + "epoch": 0.4997917534360683, + "grad_norm": 0.39812360364772853, + "learning_rate": 8.786368813183829e-06, + "loss": 0.3558, + "step": 2100 + }, + { + "epoch": 0.5000297495091331, + "grad_norm": 0.404777603959223, + "learning_rate": 8.785109624884287e-06, + "loss": 0.4303, + "step": 2101 + }, + { + "epoch": 0.5002677455821979, + "grad_norm": 0.40005648641359376, + "learning_rate": 8.783849874023816e-06, + "loss": 0.3249, + "step": 2102 + }, + { + "epoch": 0.5005057416552627, + "grad_norm": 0.4387837922924149, + "learning_rate": 8.782589560789645e-06, + "loss": 0.3341, + "step": 2103 + }, + { + "epoch": 0.5007437377283275, + "grad_norm": 0.36939020016413393, + "learning_rate": 8.781328685369088e-06, + "loss": 0.3658, + "step": 2104 + }, + { + "epoch": 0.5009817338013923, + "grad_norm": 0.42034598447699306, + "learning_rate": 8.780067247949545e-06, + "loss": 0.4267, + "step": 2105 + }, + { + "epoch": 0.5012197298744571, + "grad_norm": 0.3744391349615731, + "learning_rate": 8.778805248718492e-06, + "loss": 0.3169, + "step": 2106 + }, + { + "epoch": 0.5014577259475219, + "grad_norm": 0.39455712888275285, + "learning_rate": 8.777542687863498e-06, + "loss": 0.385, + "step": 2107 + }, + { + "epoch": 0.5016957220205867, + "grad_norm": 0.4107559343031647, + "learning_rate": 8.776279565572208e-06, + "loss": 0.4194, + "step": 2108 + }, + { + "epoch": 0.5019337180936515, + "grad_norm": 0.376450072712637, + "learning_rate": 8.775015882032355e-06, + "loss": 0.3303, + "step": 2109 + }, + { + "epoch": 0.5021717141667162, + "grad_norm": 0.39192436430391, + "learning_rate": 8.77375163743175e-06, + "loss": 0.3186, + "step": 2110 + }, + { + "epoch": 0.5024097102397811, + "grad_norm": 0.4103061508772666, + "learning_rate": 8.772486831958293e-06, + "loss": 0.3535, + "step": 2111 + }, + { + "epoch": 0.5026477063128458, + "grad_norm": 0.4061911120479169, + "learning_rate": 8.771221465799968e-06, + "loss": 0.4365, + "step": 2112 + }, + { + "epoch": 0.5028857023859107, + "grad_norm": 0.36093912967050273, + "learning_rate": 8.769955539144839e-06, + "loss": 0.311, + "step": 2113 + }, + { + "epoch": 0.5031236984589754, + "grad_norm": 0.431661016075779, + "learning_rate": 8.768689052181051e-06, + "loss": 0.3709, + "step": 2114 + }, + { + "epoch": 0.5033616945320403, + "grad_norm": 0.37241191111075006, + "learning_rate": 8.767422005096838e-06, + "loss": 0.4208, + "step": 2115 + }, + { + "epoch": 0.503599690605105, + "grad_norm": 0.3900038040150187, + "learning_rate": 8.766154398080511e-06, + "loss": 0.3656, + "step": 2116 + }, + { + "epoch": 0.5038376866781699, + "grad_norm": 0.38734311341707733, + "learning_rate": 8.764886231320473e-06, + "loss": 0.315, + "step": 2117 + }, + { + "epoch": 0.5040756827512346, + "grad_norm": 0.39624067824100434, + "learning_rate": 8.7636175050052e-06, + "loss": 0.3679, + "step": 2118 + }, + { + "epoch": 0.5043136788242994, + "grad_norm": 0.3786762562077862, + "learning_rate": 8.76234821932326e-06, + "loss": 0.421, + "step": 2119 + }, + { + "epoch": 0.5045516748973642, + "grad_norm": 0.3658684215497542, + "learning_rate": 8.7610783744633e-06, + "loss": 0.3373, + "step": 2120 + }, + { + "epoch": 0.504789670970429, + "grad_norm": 0.371242399448406, + "learning_rate": 8.759807970614044e-06, + "loss": 0.3295, + "step": 2121 + }, + { + "epoch": 0.5050276670434938, + "grad_norm": 0.40052728933299747, + "learning_rate": 8.758537007964314e-06, + "loss": 0.4182, + "step": 2122 + }, + { + "epoch": 0.5052656631165586, + "grad_norm": 0.39743565216070303, + "learning_rate": 8.757265486703001e-06, + "loss": 0.3706, + "step": 2123 + }, + { + "epoch": 0.5055036591896234, + "grad_norm": 0.36470933499650765, + "learning_rate": 8.75599340701909e-06, + "loss": 0.2961, + "step": 2124 + }, + { + "epoch": 0.5057416552626882, + "grad_norm": 0.37485254459171946, + "learning_rate": 8.754720769101636e-06, + "loss": 0.3761, + "step": 2125 + }, + { + "epoch": 0.505979651335753, + "grad_norm": 0.37277864735486177, + "learning_rate": 8.75344757313979e-06, + "loss": 0.4196, + "step": 2126 + }, + { + "epoch": 0.5062176474088178, + "grad_norm": 0.39747924989282063, + "learning_rate": 8.75217381932278e-06, + "loss": 0.3059, + "step": 2127 + }, + { + "epoch": 0.5064556434818825, + "grad_norm": 0.3715732881453211, + "learning_rate": 8.750899507839913e-06, + "loss": 0.322, + "step": 2128 + }, + { + "epoch": 0.5066936395549474, + "grad_norm": 0.4053341652611045, + "learning_rate": 8.74962463888059e-06, + "loss": 0.4358, + "step": 2129 + }, + { + "epoch": 0.5069316356280121, + "grad_norm": 0.40118580003340715, + "learning_rate": 8.748349212634284e-06, + "loss": 0.3766, + "step": 2130 + }, + { + "epoch": 0.507169631701077, + "grad_norm": 0.4019091388621105, + "learning_rate": 8.747073229290552e-06, + "loss": 0.3041, + "step": 2131 + }, + { + "epoch": 0.5074076277741417, + "grad_norm": 0.4304176264560313, + "learning_rate": 8.745796689039043e-06, + "loss": 0.3581, + "step": 2132 + }, + { + "epoch": 0.5076456238472066, + "grad_norm": 0.4099748631186, + "learning_rate": 8.744519592069479e-06, + "loss": 0.4268, + "step": 2133 + }, + { + "epoch": 0.5078836199202713, + "grad_norm": 0.40848377251157103, + "learning_rate": 8.743241938571667e-06, + "loss": 0.3395, + "step": 2134 + }, + { + "epoch": 0.5081216159933362, + "grad_norm": 0.38784109641240894, + "learning_rate": 8.741963728735502e-06, + "loss": 0.3247, + "step": 2135 + }, + { + "epoch": 0.5083596120664009, + "grad_norm": 0.387056172080891, + "learning_rate": 8.740684962750953e-06, + "loss": 0.4079, + "step": 2136 + }, + { + "epoch": 0.5085976081394658, + "grad_norm": 0.43968856979286486, + "learning_rate": 8.73940564080808e-06, + "loss": 0.4165, + "step": 2137 + }, + { + "epoch": 0.5088356042125305, + "grad_norm": 0.4069695223863342, + "learning_rate": 8.738125763097019e-06, + "loss": 0.3109, + "step": 2138 + }, + { + "epoch": 0.5090736002855953, + "grad_norm": 0.390822454142334, + "learning_rate": 8.736845329807994e-06, + "loss": 0.3342, + "step": 2139 + }, + { + "epoch": 0.5093115963586601, + "grad_norm": 0.3791678415289624, + "learning_rate": 8.735564341131308e-06, + "loss": 0.4129, + "step": 2140 + }, + { + "epoch": 0.5095495924317249, + "grad_norm": 0.3842175369404934, + "learning_rate": 8.734282797257347e-06, + "loss": 0.3665, + "step": 2141 + }, + { + "epoch": 0.5097875885047897, + "grad_norm": 0.39913401398744197, + "learning_rate": 8.733000698376579e-06, + "loss": 0.3109, + "step": 2142 + }, + { + "epoch": 0.5100255845778545, + "grad_norm": 0.3937291920519653, + "learning_rate": 8.73171804467956e-06, + "loss": 0.3699, + "step": 2143 + }, + { + "epoch": 0.5102635806509193, + "grad_norm": 0.3710499687158456, + "learning_rate": 8.73043483635692e-06, + "loss": 0.409, + "step": 2144 + }, + { + "epoch": 0.5105015767239841, + "grad_norm": 0.4346758984087426, + "learning_rate": 8.729151073599376e-06, + "loss": 0.3292, + "step": 2145 + }, + { + "epoch": 0.5107395727970488, + "grad_norm": 0.40591530956978267, + "learning_rate": 8.72786675659773e-06, + "loss": 0.3347, + "step": 2146 + }, + { + "epoch": 0.5109775688701137, + "grad_norm": 0.37110322190450645, + "learning_rate": 8.72658188554286e-06, + "loss": 0.431, + "step": 2147 + }, + { + "epoch": 0.5112155649431784, + "grad_norm": 0.3762833501337982, + "learning_rate": 8.725296460625729e-06, + "loss": 0.3465, + "step": 2148 + }, + { + "epoch": 0.5114535610162433, + "grad_norm": 0.42166675669869735, + "learning_rate": 8.724010482037386e-06, + "loss": 0.3255, + "step": 2149 + }, + { + "epoch": 0.511691557089308, + "grad_norm": 0.4134995142541755, + "learning_rate": 8.722723949968958e-06, + "loss": 0.3746, + "step": 2150 + }, + { + "epoch": 0.5119295531623729, + "grad_norm": 0.38354990683388107, + "learning_rate": 8.721436864611653e-06, + "loss": 0.4031, + "step": 2151 + }, + { + "epoch": 0.5121675492354376, + "grad_norm": 0.4329904072710047, + "learning_rate": 8.720149226156769e-06, + "loss": 0.3331, + "step": 2152 + }, + { + "epoch": 0.5124055453085025, + "grad_norm": 0.4137030908720267, + "learning_rate": 8.718861034795677e-06, + "loss": 0.3028, + "step": 2153 + }, + { + "epoch": 0.5126435413815672, + "grad_norm": 0.3654824931248918, + "learning_rate": 8.717572290719835e-06, + "loss": 0.3846, + "step": 2154 + }, + { + "epoch": 0.512881537454632, + "grad_norm": 0.422834765499284, + "learning_rate": 8.716282994120782e-06, + "loss": 0.3844, + "step": 2155 + }, + { + "epoch": 0.5131195335276968, + "grad_norm": 0.3922197545354826, + "learning_rate": 8.71499314519014e-06, + "loss": 0.3464, + "step": 2156 + }, + { + "epoch": 0.5133575296007616, + "grad_norm": 0.38918598237414415, + "learning_rate": 8.713702744119613e-06, + "loss": 0.3361, + "step": 2157 + }, + { + "epoch": 0.5135955256738264, + "grad_norm": 0.4107866717596137, + "learning_rate": 8.712411791100983e-06, + "loss": 0.4021, + "step": 2158 + }, + { + "epoch": 0.5138335217468912, + "grad_norm": 0.3901161988399392, + "learning_rate": 8.711120286326122e-06, + "loss": 0.3225, + "step": 2159 + }, + { + "epoch": 0.514071517819956, + "grad_norm": 0.413635276999212, + "learning_rate": 8.709828229986978e-06, + "loss": 0.3094, + "step": 2160 + }, + { + "epoch": 0.5143095138930208, + "grad_norm": 0.37538329913818896, + "learning_rate": 8.708535622275581e-06, + "loss": 0.4063, + "step": 2161 + }, + { + "epoch": 0.5145475099660856, + "grad_norm": 0.37400221595282845, + "learning_rate": 8.707242463384046e-06, + "loss": 0.4596, + "step": 2162 + }, + { + "epoch": 0.5147855060391504, + "grad_norm": 0.40427219364856926, + "learning_rate": 8.705948753504569e-06, + "loss": 0.3199, + "step": 2163 + }, + { + "epoch": 0.5150235021122151, + "grad_norm": 0.3803126661404316, + "learning_rate": 8.704654492829428e-06, + "loss": 0.3466, + "step": 2164 + }, + { + "epoch": 0.51526149818528, + "grad_norm": 0.3755852465058205, + "learning_rate": 8.703359681550978e-06, + "loss": 0.3883, + "step": 2165 + }, + { + "epoch": 0.5154994942583447, + "grad_norm": 0.3908086804440207, + "learning_rate": 8.702064319861663e-06, + "loss": 0.3674, + "step": 2166 + }, + { + "epoch": 0.5157374903314096, + "grad_norm": 0.3771686738573496, + "learning_rate": 8.700768407954007e-06, + "loss": 0.315, + "step": 2167 + }, + { + "epoch": 0.5159754864044743, + "grad_norm": 0.370557512425135, + "learning_rate": 8.699471946020612e-06, + "loss": 0.3404, + "step": 2168 + }, + { + "epoch": 0.5162134824775391, + "grad_norm": 0.3752377056778907, + "learning_rate": 8.698174934254164e-06, + "loss": 0.4201, + "step": 2169 + }, + { + "epoch": 0.5164514785506039, + "grad_norm": 0.4184085294213279, + "learning_rate": 8.696877372847434e-06, + "loss": 0.3294, + "step": 2170 + }, + { + "epoch": 0.5166894746236687, + "grad_norm": 0.388912569698524, + "learning_rate": 8.69557926199327e-06, + "loss": 0.359, + "step": 2171 + }, + { + "epoch": 0.5169274706967335, + "grad_norm": 0.365614463397713, + "learning_rate": 8.694280601884603e-06, + "loss": 0.4009, + "step": 2172 + }, + { + "epoch": 0.5171654667697982, + "grad_norm": 0.42621501649673044, + "learning_rate": 8.692981392714445e-06, + "loss": 0.36, + "step": 2173 + }, + { + "epoch": 0.5174034628428631, + "grad_norm": 0.41391982376423003, + "learning_rate": 8.691681634675895e-06, + "loss": 0.3199, + "step": 2174 + }, + { + "epoch": 0.5176414589159278, + "grad_norm": 0.41365783101678866, + "learning_rate": 8.690381327962125e-06, + "loss": 0.3919, + "step": 2175 + }, + { + "epoch": 0.5178794549889927, + "grad_norm": 0.37354876476127546, + "learning_rate": 8.689080472766393e-06, + "loss": 0.4031, + "step": 2176 + }, + { + "epoch": 0.5181174510620574, + "grad_norm": 0.4203856959561625, + "learning_rate": 8.687779069282041e-06, + "loss": 0.3208, + "step": 2177 + }, + { + "epoch": 0.5183554471351223, + "grad_norm": 0.40423490700726816, + "learning_rate": 8.686477117702488e-06, + "loss": 0.3334, + "step": 2178 + }, + { + "epoch": 0.518593443208187, + "grad_norm": 0.3509157300766899, + "learning_rate": 8.685174618221235e-06, + "loss": 0.4177, + "step": 2179 + }, + { + "epoch": 0.5188314392812519, + "grad_norm": 0.40079035375275884, + "learning_rate": 8.683871571031867e-06, + "loss": 0.4139, + "step": 2180 + }, + { + "epoch": 0.5190694353543166, + "grad_norm": 0.37163196418771477, + "learning_rate": 8.68256797632805e-06, + "loss": 0.3169, + "step": 2181 + }, + { + "epoch": 0.5193074314273814, + "grad_norm": 0.4054280532520997, + "learning_rate": 8.681263834303528e-06, + "loss": 0.4119, + "step": 2182 + }, + { + "epoch": 0.5195454275004462, + "grad_norm": 0.4273379334199248, + "learning_rate": 8.67995914515213e-06, + "loss": 0.4099, + "step": 2183 + }, + { + "epoch": 0.519783423573511, + "grad_norm": 0.38570960886267935, + "learning_rate": 8.678653909067767e-06, + "loss": 0.3451, + "step": 2184 + }, + { + "epoch": 0.5200214196465758, + "grad_norm": 0.39683790687415654, + "learning_rate": 8.677348126244427e-06, + "loss": 0.316, + "step": 2185 + }, + { + "epoch": 0.5202594157196406, + "grad_norm": 0.38461925483513904, + "learning_rate": 8.676041796876183e-06, + "loss": 0.3881, + "step": 2186 + }, + { + "epoch": 0.5204974117927054, + "grad_norm": 0.4254609757724267, + "learning_rate": 8.674734921157185e-06, + "loss": 0.4075, + "step": 2187 + }, + { + "epoch": 0.5207354078657702, + "grad_norm": 0.36619327488390596, + "learning_rate": 8.67342749928167e-06, + "loss": 0.3159, + "step": 2188 + }, + { + "epoch": 0.520973403938835, + "grad_norm": 0.41853879431263746, + "learning_rate": 8.672119531443951e-06, + "loss": 0.3249, + "step": 2189 + }, + { + "epoch": 0.5212114000118998, + "grad_norm": 0.37412197275330067, + "learning_rate": 8.67081101783843e-06, + "loss": 0.4339, + "step": 2190 + }, + { + "epoch": 0.5214493960849645, + "grad_norm": 0.3840297484734256, + "learning_rate": 8.669501958659576e-06, + "loss": 0.3418, + "step": 2191 + }, + { + "epoch": 0.5216873921580294, + "grad_norm": 0.39214557075111384, + "learning_rate": 8.668192354101953e-06, + "loss": 0.3065, + "step": 2192 + }, + { + "epoch": 0.5219253882310941, + "grad_norm": 0.3979212333040952, + "learning_rate": 8.666882204360201e-06, + "loss": 0.3639, + "step": 2193 + }, + { + "epoch": 0.522163384304159, + "grad_norm": 0.43111388602684464, + "learning_rate": 8.665571509629038e-06, + "loss": 0.3999, + "step": 2194 + }, + { + "epoch": 0.5224013803772237, + "grad_norm": 0.39713249026912617, + "learning_rate": 8.664260270103265e-06, + "loss": 0.329, + "step": 2195 + }, + { + "epoch": 0.5226393764502886, + "grad_norm": 0.3899241343272046, + "learning_rate": 8.662948485977768e-06, + "loss": 0.3436, + "step": 2196 + }, + { + "epoch": 0.5228773725233533, + "grad_norm": 0.3939001732463367, + "learning_rate": 8.661636157447511e-06, + "loss": 0.4149, + "step": 2197 + }, + { + "epoch": 0.5231153685964182, + "grad_norm": 0.3957340016317853, + "learning_rate": 8.660323284707535e-06, + "loss": 0.3787, + "step": 2198 + }, + { + "epoch": 0.5233533646694829, + "grad_norm": 0.44601803382908783, + "learning_rate": 8.659009867952966e-06, + "loss": 0.2919, + "step": 2199 + }, + { + "epoch": 0.5235913607425478, + "grad_norm": 0.3932173337224158, + "learning_rate": 8.657695907379011e-06, + "loss": 0.3767, + "step": 2200 + }, + { + "epoch": 0.5238293568156125, + "grad_norm": 0.408961622615314, + "learning_rate": 8.65638140318096e-06, + "loss": 0.4167, + "step": 2201 + }, + { + "epoch": 0.5240673528886773, + "grad_norm": 0.3895863266832368, + "learning_rate": 8.655066355554175e-06, + "loss": 0.3162, + "step": 2202 + }, + { + "epoch": 0.5243053489617421, + "grad_norm": 0.385176418198331, + "learning_rate": 8.65375076469411e-06, + "loss": 0.323, + "step": 2203 + }, + { + "epoch": 0.5245433450348069, + "grad_norm": 0.36711047121100115, + "learning_rate": 8.652434630796288e-06, + "loss": 0.4063, + "step": 2204 + }, + { + "epoch": 0.5247813411078717, + "grad_norm": 0.3996610302929268, + "learning_rate": 8.651117954056325e-06, + "loss": 0.4035, + "step": 2205 + }, + { + "epoch": 0.5250193371809365, + "grad_norm": 0.38507884013978433, + "learning_rate": 8.649800734669912e-06, + "loss": 0.3275, + "step": 2206 + }, + { + "epoch": 0.5252573332540013, + "grad_norm": 0.3649799910820221, + "learning_rate": 8.648482972832815e-06, + "loss": 0.3651, + "step": 2207 + }, + { + "epoch": 0.5254953293270661, + "grad_norm": 0.37617511815362664, + "learning_rate": 8.647164668740891e-06, + "loss": 0.4303, + "step": 2208 + }, + { + "epoch": 0.5257333254001308, + "grad_norm": 0.38432072478719503, + "learning_rate": 8.64584582259007e-06, + "loss": 0.3724, + "step": 2209 + }, + { + "epoch": 0.5259713214731957, + "grad_norm": 0.42322209539590827, + "learning_rate": 8.644526434576365e-06, + "loss": 0.323, + "step": 2210 + }, + { + "epoch": 0.5262093175462604, + "grad_norm": 0.40391854416882916, + "learning_rate": 8.64320650489587e-06, + "loss": 0.3575, + "step": 2211 + }, + { + "epoch": 0.5264473136193253, + "grad_norm": 0.3835559240154108, + "learning_rate": 8.641886033744762e-06, + "loss": 0.426, + "step": 2212 + }, + { + "epoch": 0.52668530969239, + "grad_norm": 0.3936562166696423, + "learning_rate": 8.640565021319293e-06, + "loss": 0.3247, + "step": 2213 + }, + { + "epoch": 0.5269233057654549, + "grad_norm": 0.39202674206549953, + "learning_rate": 8.639243467815798e-06, + "loss": 0.3597, + "step": 2214 + }, + { + "epoch": 0.5271613018385196, + "grad_norm": 0.3783225902955775, + "learning_rate": 8.637921373430694e-06, + "loss": 0.4475, + "step": 2215 + }, + { + "epoch": 0.5273992979115845, + "grad_norm": 0.3600937320401172, + "learning_rate": 8.636598738360476e-06, + "loss": 0.3617, + "step": 2216 + }, + { + "epoch": 0.5276372939846492, + "grad_norm": 0.4160704153133727, + "learning_rate": 8.63527556280172e-06, + "loss": 0.3446, + "step": 2217 + }, + { + "epoch": 0.527875290057714, + "grad_norm": 0.3913242338353506, + "learning_rate": 8.633951846951081e-06, + "loss": 0.3822, + "step": 2218 + }, + { + "epoch": 0.5281132861307788, + "grad_norm": 0.4035152853862847, + "learning_rate": 8.6326275910053e-06, + "loss": 0.4297, + "step": 2219 + }, + { + "epoch": 0.5283512822038436, + "grad_norm": 0.409203322526621, + "learning_rate": 8.631302795161192e-06, + "loss": 0.3351, + "step": 2220 + }, + { + "epoch": 0.5285892782769084, + "grad_norm": 0.39436563517242, + "learning_rate": 8.629977459615655e-06, + "loss": 0.3404, + "step": 2221 + }, + { + "epoch": 0.5288272743499732, + "grad_norm": 0.3590194240170449, + "learning_rate": 8.628651584565665e-06, + "loss": 0.3969, + "step": 2222 + }, + { + "epoch": 0.529065270423038, + "grad_norm": 0.4019072389485261, + "learning_rate": 8.627325170208282e-06, + "loss": 0.3476, + "step": 2223 + }, + { + "epoch": 0.5293032664961028, + "grad_norm": 0.37619114031005646, + "learning_rate": 8.625998216740643e-06, + "loss": 0.3222, + "step": 2224 + }, + { + "epoch": 0.5295412625691676, + "grad_norm": 0.4034865597690215, + "learning_rate": 8.624670724359964e-06, + "loss": 0.3545, + "step": 2225 + }, + { + "epoch": 0.5297792586422324, + "grad_norm": 0.408006482551879, + "learning_rate": 8.623342693263549e-06, + "loss": 0.4144, + "step": 2226 + }, + { + "epoch": 0.5300172547152971, + "grad_norm": 0.38382326688221563, + "learning_rate": 8.62201412364877e-06, + "loss": 0.3342, + "step": 2227 + }, + { + "epoch": 0.530255250788362, + "grad_norm": 0.41258702064324476, + "learning_rate": 8.620685015713089e-06, + "loss": 0.3147, + "step": 2228 + }, + { + "epoch": 0.5304932468614267, + "grad_norm": 0.38204434060333653, + "learning_rate": 8.619355369654043e-06, + "loss": 0.3818, + "step": 2229 + }, + { + "epoch": 0.5307312429344916, + "grad_norm": 0.38216547595409855, + "learning_rate": 8.61802518566925e-06, + "loss": 0.383, + "step": 2230 + }, + { + "epoch": 0.5309692390075563, + "grad_norm": 0.4029134683361265, + "learning_rate": 8.616694463956409e-06, + "loss": 0.2774, + "step": 2231 + }, + { + "epoch": 0.5312072350806212, + "grad_norm": 0.37301434376960896, + "learning_rate": 8.615363204713299e-06, + "loss": 0.3619, + "step": 2232 + }, + { + "epoch": 0.5314452311536859, + "grad_norm": 0.38496277632142917, + "learning_rate": 8.614031408137775e-06, + "loss": 0.4241, + "step": 2233 + }, + { + "epoch": 0.5316832272267508, + "grad_norm": 0.33800484116848534, + "learning_rate": 8.612699074427777e-06, + "loss": 0.3205, + "step": 2234 + }, + { + "epoch": 0.5319212232998155, + "grad_norm": 0.38034462571684113, + "learning_rate": 8.611366203781323e-06, + "loss": 0.3176, + "step": 2235 + }, + { + "epoch": 0.5321592193728804, + "grad_norm": 0.41034018337162753, + "learning_rate": 8.610032796396513e-06, + "loss": 0.3932, + "step": 2236 + }, + { + "epoch": 0.5323972154459451, + "grad_norm": 0.3624140867849283, + "learning_rate": 8.60869885247152e-06, + "loss": 0.3799, + "step": 2237 + }, + { + "epoch": 0.5326352115190099, + "grad_norm": 0.39211384701169555, + "learning_rate": 8.607364372204602e-06, + "loss": 0.3236, + "step": 2238 + }, + { + "epoch": 0.5328732075920747, + "grad_norm": 0.3631336275865908, + "learning_rate": 8.606029355794095e-06, + "loss": 0.3626, + "step": 2239 + }, + { + "epoch": 0.5331112036651395, + "grad_norm": 0.3499747372288337, + "learning_rate": 8.604693803438418e-06, + "loss": 0.3962, + "step": 2240 + }, + { + "epoch": 0.5333491997382043, + "grad_norm": 0.3833522235637884, + "learning_rate": 8.603357715336067e-06, + "loss": 0.3912, + "step": 2241 + }, + { + "epoch": 0.5335871958112691, + "grad_norm": 0.40874050154372005, + "learning_rate": 8.602021091685615e-06, + "loss": 0.3315, + "step": 2242 + }, + { + "epoch": 0.5338251918843339, + "grad_norm": 0.40195329644632777, + "learning_rate": 8.600683932685721e-06, + "loss": 0.3904, + "step": 2243 + }, + { + "epoch": 0.5340631879573987, + "grad_norm": 0.3846575731760141, + "learning_rate": 8.599346238535118e-06, + "loss": 0.4735, + "step": 2244 + }, + { + "epoch": 0.5343011840304634, + "grad_norm": 0.36643018837011293, + "learning_rate": 8.59800800943262e-06, + "loss": 0.3197, + "step": 2245 + }, + { + "epoch": 0.5345391801035283, + "grad_norm": 0.40617951682169934, + "learning_rate": 8.596669245577119e-06, + "loss": 0.3588, + "step": 2246 + }, + { + "epoch": 0.534777176176593, + "grad_norm": 0.3698247677981035, + "learning_rate": 8.595329947167593e-06, + "loss": 0.4227, + "step": 2247 + }, + { + "epoch": 0.5350151722496579, + "grad_norm": 0.3755104238478636, + "learning_rate": 8.593990114403093e-06, + "loss": 0.3951, + "step": 2248 + }, + { + "epoch": 0.5352531683227226, + "grad_norm": 0.37805284252091315, + "learning_rate": 8.59264974748275e-06, + "loss": 0.2891, + "step": 2249 + }, + { + "epoch": 0.5354911643957875, + "grad_norm": 0.37337307896849703, + "learning_rate": 8.591308846605777e-06, + "loss": 0.3513, + "step": 2250 + }, + { + "epoch": 0.5357291604688522, + "grad_norm": 0.3460371741397967, + "learning_rate": 8.589967411971464e-06, + "loss": 0.4251, + "step": 2251 + }, + { + "epoch": 0.5359671565419171, + "grad_norm": 0.37459542754801345, + "learning_rate": 8.588625443779183e-06, + "loss": 0.3355, + "step": 2252 + }, + { + "epoch": 0.5362051526149818, + "grad_norm": 0.40230507957953726, + "learning_rate": 8.587282942228382e-06, + "loss": 0.34, + "step": 2253 + }, + { + "epoch": 0.5364431486880467, + "grad_norm": 0.372308451561018, + "learning_rate": 8.585939907518591e-06, + "loss": 0.3885, + "step": 2254 + }, + { + "epoch": 0.5366811447611114, + "grad_norm": 0.3689804359627597, + "learning_rate": 8.584596339849419e-06, + "loss": 0.3928, + "step": 2255 + }, + { + "epoch": 0.5369191408341762, + "grad_norm": 0.37560524421780966, + "learning_rate": 8.583252239420549e-06, + "loss": 0.3303, + "step": 2256 + }, + { + "epoch": 0.537157136907241, + "grad_norm": 0.3842271683641431, + "learning_rate": 8.581907606431754e-06, + "loss": 0.3536, + "step": 2257 + }, + { + "epoch": 0.5373951329803058, + "grad_norm": 0.4002665238506151, + "learning_rate": 8.580562441082876e-06, + "loss": 0.4165, + "step": 2258 + }, + { + "epoch": 0.5376331290533706, + "grad_norm": 0.38907378275117815, + "learning_rate": 8.579216743573839e-06, + "loss": 0.2978, + "step": 2259 + }, + { + "epoch": 0.5378711251264354, + "grad_norm": 0.3956313105170694, + "learning_rate": 8.577870514104651e-06, + "loss": 0.3106, + "step": 2260 + }, + { + "epoch": 0.5381091211995002, + "grad_norm": 0.39047993680689397, + "learning_rate": 8.57652375287539e-06, + "loss": 0.3575, + "step": 2261 + }, + { + "epoch": 0.538347117272565, + "grad_norm": 0.41768920949511756, + "learning_rate": 8.575176460086221e-06, + "loss": 0.4167, + "step": 2262 + }, + { + "epoch": 0.5385851133456298, + "grad_norm": 0.3870552235120542, + "learning_rate": 8.573828635937384e-06, + "loss": 0.2996, + "step": 2263 + }, + { + "epoch": 0.5388231094186946, + "grad_norm": 0.4596709809606415, + "learning_rate": 8.5724802806292e-06, + "loss": 0.373, + "step": 2264 + }, + { + "epoch": 0.5390611054917593, + "grad_norm": 0.3706967842183925, + "learning_rate": 8.571131394362069e-06, + "loss": 0.404, + "step": 2265 + }, + { + "epoch": 0.5392991015648242, + "grad_norm": 0.40181886987790577, + "learning_rate": 8.569781977336464e-06, + "loss": 0.3562, + "step": 2266 + }, + { + "epoch": 0.5395370976378889, + "grad_norm": 0.4298167654016625, + "learning_rate": 8.568432029752947e-06, + "loss": 0.3202, + "step": 2267 + }, + { + "epoch": 0.5397750937109538, + "grad_norm": 0.39862560057903823, + "learning_rate": 8.56708155181215e-06, + "loss": 0.3843, + "step": 2268 + }, + { + "epoch": 0.5400130897840185, + "grad_norm": 0.41266735819620026, + "learning_rate": 8.565730543714791e-06, + "loss": 0.4155, + "step": 2269 + }, + { + "epoch": 0.5402510858570834, + "grad_norm": 0.3974166536006753, + "learning_rate": 8.564379005661661e-06, + "loss": 0.3293, + "step": 2270 + }, + { + "epoch": 0.5404890819301481, + "grad_norm": 0.35669423871612993, + "learning_rate": 8.563026937853633e-06, + "loss": 0.3605, + "step": 2271 + }, + { + "epoch": 0.540727078003213, + "grad_norm": 0.4127273022491468, + "learning_rate": 8.561674340491656e-06, + "loss": 0.409, + "step": 2272 + }, + { + "epoch": 0.5409650740762777, + "grad_norm": 0.4680110594563534, + "learning_rate": 8.56032121377676e-06, + "loss": 0.3709, + "step": 2273 + }, + { + "epoch": 0.5412030701493425, + "grad_norm": 0.4124449173785681, + "learning_rate": 8.558967557910054e-06, + "loss": 0.2934, + "step": 2274 + }, + { + "epoch": 0.5414410662224073, + "grad_norm": 0.4639894069096805, + "learning_rate": 8.557613373092724e-06, + "loss": 0.3645, + "step": 2275 + }, + { + "epoch": 0.5416790622954721, + "grad_norm": 0.3845035156200861, + "learning_rate": 8.556258659526036e-06, + "loss": 0.4354, + "step": 2276 + }, + { + "epoch": 0.5419170583685369, + "grad_norm": 0.4096705605468606, + "learning_rate": 8.554903417411333e-06, + "loss": 0.3055, + "step": 2277 + }, + { + "epoch": 0.5421550544416017, + "grad_norm": 0.3894517106688162, + "learning_rate": 8.553547646950037e-06, + "loss": 0.3147, + "step": 2278 + }, + { + "epoch": 0.5423930505146665, + "grad_norm": 0.41144516694641703, + "learning_rate": 8.552191348343653e-06, + "loss": 0.4009, + "step": 2279 + }, + { + "epoch": 0.5426310465877313, + "grad_norm": 0.40545518632374106, + "learning_rate": 8.550834521793757e-06, + "loss": 0.3549, + "step": 2280 + }, + { + "epoch": 0.542869042660796, + "grad_norm": 0.4536869892584681, + "learning_rate": 8.549477167502006e-06, + "loss": 0.2982, + "step": 2281 + }, + { + "epoch": 0.5431070387338609, + "grad_norm": 0.36914461803860876, + "learning_rate": 8.54811928567014e-06, + "loss": 0.3515, + "step": 2282 + }, + { + "epoch": 0.5433450348069256, + "grad_norm": 0.38659861108474836, + "learning_rate": 8.546760876499968e-06, + "loss": 0.4102, + "step": 2283 + }, + { + "epoch": 0.5435830308799905, + "grad_norm": 0.40982842576957934, + "learning_rate": 8.545401940193392e-06, + "loss": 0.3362, + "step": 2284 + }, + { + "epoch": 0.5438210269530552, + "grad_norm": 0.3991434411628737, + "learning_rate": 8.544042476952377e-06, + "loss": 0.3119, + "step": 2285 + }, + { + "epoch": 0.5440590230261201, + "grad_norm": 0.3929815486547205, + "learning_rate": 8.542682486978973e-06, + "loss": 0.3739, + "step": 2286 + }, + { + "epoch": 0.5442970190991848, + "grad_norm": 0.42239476535343035, + "learning_rate": 8.541321970475312e-06, + "loss": 0.3884, + "step": 2287 + }, + { + "epoch": 0.5445350151722497, + "grad_norm": 0.4368412726721034, + "learning_rate": 8.539960927643596e-06, + "loss": 0.3054, + "step": 2288 + }, + { + "epoch": 0.5447730112453144, + "grad_norm": 0.39557679784627, + "learning_rate": 8.538599358686112e-06, + "loss": 0.353, + "step": 2289 + }, + { + "epoch": 0.5450110073183793, + "grad_norm": 0.39922321971295105, + "learning_rate": 8.537237263805225e-06, + "loss": 0.4309, + "step": 2290 + }, + { + "epoch": 0.545249003391444, + "grad_norm": 0.39924479565216386, + "learning_rate": 8.53587464320337e-06, + "loss": 0.3412, + "step": 2291 + }, + { + "epoch": 0.5454869994645088, + "grad_norm": 0.39342949827033796, + "learning_rate": 8.534511497083073e-06, + "loss": 0.2964, + "step": 2292 + }, + { + "epoch": 0.5457249955375736, + "grad_norm": 0.37825134436273367, + "learning_rate": 8.533147825646925e-06, + "loss": 0.3704, + "step": 2293 + }, + { + "epoch": 0.5459629916106384, + "grad_norm": 0.38022616425475375, + "learning_rate": 8.531783629097608e-06, + "loss": 0.405, + "step": 2294 + }, + { + "epoch": 0.5462009876837032, + "grad_norm": 0.36260140531944307, + "learning_rate": 8.530418907637868e-06, + "loss": 0.3217, + "step": 2295 + }, + { + "epoch": 0.546438983756768, + "grad_norm": 0.37353441118295333, + "learning_rate": 8.529053661470542e-06, + "loss": 0.3432, + "step": 2296 + }, + { + "epoch": 0.5466769798298328, + "grad_norm": 0.40946792741351434, + "learning_rate": 8.527687890798537e-06, + "loss": 0.4199, + "step": 2297 + }, + { + "epoch": 0.5469149759028976, + "grad_norm": 0.3726039053557827, + "learning_rate": 8.52632159582484e-06, + "loss": 0.3707, + "step": 2298 + }, + { + "epoch": 0.5471529719759624, + "grad_norm": 0.3659302138094137, + "learning_rate": 8.524954776752516e-06, + "loss": 0.338, + "step": 2299 + }, + { + "epoch": 0.5473909680490272, + "grad_norm": 0.3829909878854894, + "learning_rate": 8.52358743378471e-06, + "loss": 0.3499, + "step": 2300 + }, + { + "epoch": 0.5476289641220919, + "grad_norm": 0.3890459899862676, + "learning_rate": 8.522219567124643e-06, + "loss": 0.4132, + "step": 2301 + }, + { + "epoch": 0.5478669601951568, + "grad_norm": 0.36286174283101486, + "learning_rate": 8.520851176975612e-06, + "loss": 0.35, + "step": 2302 + }, + { + "epoch": 0.5481049562682215, + "grad_norm": 0.38768718004776836, + "learning_rate": 8.519482263540994e-06, + "loss": 0.3365, + "step": 2303 + }, + { + "epoch": 0.5483429523412864, + "grad_norm": 0.4160688543785431, + "learning_rate": 8.518112827024245e-06, + "loss": 0.4039, + "step": 2304 + }, + { + "epoch": 0.5485809484143511, + "grad_norm": 0.36929499946058547, + "learning_rate": 8.516742867628895e-06, + "loss": 0.4113, + "step": 2305 + }, + { + "epoch": 0.548818944487416, + "grad_norm": 0.4434563141145236, + "learning_rate": 8.515372385558554e-06, + "loss": 0.3342, + "step": 2306 + }, + { + "epoch": 0.5490569405604807, + "grad_norm": 0.37489035852359814, + "learning_rate": 8.514001381016912e-06, + "loss": 0.3657, + "step": 2307 + }, + { + "epoch": 0.5492949366335456, + "grad_norm": 0.35701192424068157, + "learning_rate": 8.512629854207733e-06, + "loss": 0.4158, + "step": 2308 + }, + { + "epoch": 0.5495329327066103, + "grad_norm": 0.4554656989283448, + "learning_rate": 8.511257805334859e-06, + "loss": 0.3355, + "step": 2309 + }, + { + "epoch": 0.5497709287796751, + "grad_norm": 0.3669973654578771, + "learning_rate": 8.509885234602209e-06, + "loss": 0.3158, + "step": 2310 + }, + { + "epoch": 0.5500089248527399, + "grad_norm": 0.391116918799618, + "learning_rate": 8.508512142213784e-06, + "loss": 0.3917, + "step": 2311 + }, + { + "epoch": 0.5502469209258047, + "grad_norm": 0.40840152784809935, + "learning_rate": 8.507138528373658e-06, + "loss": 0.4089, + "step": 2312 + }, + { + "epoch": 0.5504849169988695, + "grad_norm": 0.39490420844357255, + "learning_rate": 8.505764393285985e-06, + "loss": 0.3205, + "step": 2313 + }, + { + "epoch": 0.5507229130719343, + "grad_norm": 0.3845395660774653, + "learning_rate": 8.504389737154994e-06, + "loss": 0.391, + "step": 2314 + }, + { + "epoch": 0.5509609091449991, + "grad_norm": 0.3909360241094245, + "learning_rate": 8.503014560184994e-06, + "loss": 0.4145, + "step": 2315 + }, + { + "epoch": 0.5511989052180639, + "grad_norm": 0.38762183020768226, + "learning_rate": 8.50163886258037e-06, + "loss": 0.3754, + "step": 2316 + }, + { + "epoch": 0.5514369012911287, + "grad_norm": 0.3912728587608663, + "learning_rate": 8.500262644545584e-06, + "loss": 0.3224, + "step": 2317 + }, + { + "epoch": 0.5516748973641935, + "grad_norm": 0.37973815027865854, + "learning_rate": 8.498885906285177e-06, + "loss": 0.3811, + "step": 2318 + }, + { + "epoch": 0.5519128934372582, + "grad_norm": 0.3579851250106683, + "learning_rate": 8.497508648003765e-06, + "loss": 0.4085, + "step": 2319 + }, + { + "epoch": 0.5521508895103231, + "grad_norm": 0.39660478319056564, + "learning_rate": 8.496130869906046e-06, + "loss": 0.3407, + "step": 2320 + }, + { + "epoch": 0.5523888855833878, + "grad_norm": 0.47401560455711944, + "learning_rate": 8.49475257219679e-06, + "loss": 0.3387, + "step": 2321 + }, + { + "epoch": 0.5526268816564527, + "grad_norm": 0.37354092809224154, + "learning_rate": 8.493373755080843e-06, + "loss": 0.3914, + "step": 2322 + }, + { + "epoch": 0.5528648777295174, + "grad_norm": 0.35292148040523397, + "learning_rate": 8.491994418763136e-06, + "loss": 0.3585, + "step": 2323 + }, + { + "epoch": 0.5531028738025823, + "grad_norm": 0.5613791127606462, + "learning_rate": 8.49061456344867e-06, + "loss": 0.2989, + "step": 2324 + }, + { + "epoch": 0.553340869875647, + "grad_norm": 0.41661056361625665, + "learning_rate": 8.489234189342526e-06, + "loss": 0.3642, + "step": 2325 + }, + { + "epoch": 0.5535788659487119, + "grad_norm": 0.3734315892480904, + "learning_rate": 8.487853296649861e-06, + "loss": 0.4333, + "step": 2326 + }, + { + "epoch": 0.5538168620217766, + "grad_norm": 0.3674453219548101, + "learning_rate": 8.486471885575912e-06, + "loss": 0.3408, + "step": 2327 + }, + { + "epoch": 0.5540548580948415, + "grad_norm": 0.3673572223097309, + "learning_rate": 8.48508995632599e-06, + "loss": 0.3572, + "step": 2328 + }, + { + "epoch": 0.5542928541679062, + "grad_norm": 0.4046956426382339, + "learning_rate": 8.483707509105483e-06, + "loss": 0.3804, + "step": 2329 + }, + { + "epoch": 0.554530850240971, + "grad_norm": 0.35220241891020276, + "learning_rate": 8.482324544119858e-06, + "loss": 0.3549, + "step": 2330 + }, + { + "epoch": 0.5547688463140358, + "grad_norm": 0.4301526345521306, + "learning_rate": 8.480941061574656e-06, + "loss": 0.3218, + "step": 2331 + }, + { + "epoch": 0.5550068423871006, + "grad_norm": 0.38794458883354294, + "learning_rate": 8.479557061675498e-06, + "loss": 0.3864, + "step": 2332 + }, + { + "epoch": 0.5552448384601654, + "grad_norm": 0.37899037555503806, + "learning_rate": 8.478172544628082e-06, + "loss": 0.4119, + "step": 2333 + }, + { + "epoch": 0.5554828345332302, + "grad_norm": 0.4193850656712972, + "learning_rate": 8.476787510638179e-06, + "loss": 0.362, + "step": 2334 + }, + { + "epoch": 0.555720830606295, + "grad_norm": 0.37309432031469925, + "learning_rate": 8.47540195991164e-06, + "loss": 0.3088, + "step": 2335 + }, + { + "epoch": 0.5559588266793598, + "grad_norm": 0.3870361410415046, + "learning_rate": 8.474015892654394e-06, + "loss": 0.4003, + "step": 2336 + }, + { + "epoch": 0.5561968227524245, + "grad_norm": 0.4108399130708448, + "learning_rate": 8.472629309072443e-06, + "loss": 0.4116, + "step": 2337 + }, + { + "epoch": 0.5564348188254894, + "grad_norm": 0.38847211210505317, + "learning_rate": 8.471242209371867e-06, + "loss": 0.3489, + "step": 2338 + }, + { + "epoch": 0.5566728148985541, + "grad_norm": 0.3632605487642382, + "learning_rate": 8.469854593758825e-06, + "loss": 0.3579, + "step": 2339 + }, + { + "epoch": 0.556910810971619, + "grad_norm": 0.36587914178181785, + "learning_rate": 8.468466462439549e-06, + "loss": 0.4237, + "step": 2340 + }, + { + "epoch": 0.5571488070446837, + "grad_norm": 0.3955575662234394, + "learning_rate": 8.467077815620352e-06, + "loss": 0.3263, + "step": 2341 + }, + { + "epoch": 0.5573868031177486, + "grad_norm": 0.4044906304039566, + "learning_rate": 8.46568865350762e-06, + "loss": 0.3015, + "step": 2342 + }, + { + "epoch": 0.5576247991908133, + "grad_norm": 0.3719462975550843, + "learning_rate": 8.464298976307816e-06, + "loss": 0.3709, + "step": 2343 + }, + { + "epoch": 0.5578627952638782, + "grad_norm": 0.41922112217020985, + "learning_rate": 8.462908784227484e-06, + "loss": 0.4194, + "step": 2344 + }, + { + "epoch": 0.5581007913369429, + "grad_norm": 0.4271019026096489, + "learning_rate": 8.461518077473236e-06, + "loss": 0.3211, + "step": 2345 + }, + { + "epoch": 0.5583387874100078, + "grad_norm": 0.401362760699648, + "learning_rate": 8.46012685625177e-06, + "loss": 0.3347, + "step": 2346 + }, + { + "epoch": 0.5585767834830725, + "grad_norm": 0.3788197761561941, + "learning_rate": 8.458735120769853e-06, + "loss": 0.3966, + "step": 2347 + }, + { + "epoch": 0.5588147795561373, + "grad_norm": 0.37058769198361474, + "learning_rate": 8.457342871234331e-06, + "loss": 0.3761, + "step": 2348 + }, + { + "epoch": 0.5590527756292021, + "grad_norm": 0.3701385464501336, + "learning_rate": 8.455950107852127e-06, + "loss": 0.296, + "step": 2349 + }, + { + "epoch": 0.5592907717022669, + "grad_norm": 0.39037984018929806, + "learning_rate": 8.454556830830242e-06, + "loss": 0.3621, + "step": 2350 + }, + { + "epoch": 0.5595287677753317, + "grad_norm": 0.3678062640318312, + "learning_rate": 8.453163040375751e-06, + "loss": 0.4257, + "step": 2351 + }, + { + "epoch": 0.5597667638483965, + "grad_norm": 0.3905948837382555, + "learning_rate": 8.451768736695806e-06, + "loss": 0.3342, + "step": 2352 + }, + { + "epoch": 0.5600047599214613, + "grad_norm": 0.4279691685882921, + "learning_rate": 8.450373919997633e-06, + "loss": 0.3265, + "step": 2353 + }, + { + "epoch": 0.5602427559945261, + "grad_norm": 0.39184884481824495, + "learning_rate": 8.448978590488538e-06, + "loss": 0.398, + "step": 2354 + }, + { + "epoch": 0.5604807520675908, + "grad_norm": 0.3667348181679106, + "learning_rate": 8.447582748375899e-06, + "loss": 0.3963, + "step": 2355 + }, + { + "epoch": 0.5607187481406557, + "grad_norm": 0.4044466646021283, + "learning_rate": 8.446186393867175e-06, + "loss": 0.3226, + "step": 2356 + }, + { + "epoch": 0.5609567442137204, + "grad_norm": 0.42695375000264973, + "learning_rate": 8.444789527169899e-06, + "loss": 0.345, + "step": 2357 + }, + { + "epoch": 0.5611947402867853, + "grad_norm": 0.38004783330852054, + "learning_rate": 8.44339214849168e-06, + "loss": 0.3978, + "step": 2358 + }, + { + "epoch": 0.56143273635985, + "grad_norm": 0.3996997964059031, + "learning_rate": 8.441994258040202e-06, + "loss": 0.3303, + "step": 2359 + }, + { + "epoch": 0.5616707324329149, + "grad_norm": 0.39510081986039514, + "learning_rate": 8.440595856023226e-06, + "loss": 0.3358, + "step": 2360 + }, + { + "epoch": 0.5619087285059796, + "grad_norm": 0.4191208470001434, + "learning_rate": 8.439196942648589e-06, + "loss": 0.4067, + "step": 2361 + }, + { + "epoch": 0.5621467245790445, + "grad_norm": 0.4303815244296002, + "learning_rate": 8.437797518124205e-06, + "loss": 0.4148, + "step": 2362 + }, + { + "epoch": 0.5623847206521092, + "grad_norm": 0.3989720793075796, + "learning_rate": 8.436397582658062e-06, + "loss": 0.3123, + "step": 2363 + }, + { + "epoch": 0.562622716725174, + "grad_norm": 0.3880785671355131, + "learning_rate": 8.434997136458227e-06, + "loss": 0.3449, + "step": 2364 + }, + { + "epoch": 0.5628607127982388, + "grad_norm": 0.35392253974214943, + "learning_rate": 8.433596179732838e-06, + "loss": 0.4147, + "step": 2365 + }, + { + "epoch": 0.5630987088713036, + "grad_norm": 0.35980860715044094, + "learning_rate": 8.432194712690117e-06, + "loss": 0.344, + "step": 2366 + }, + { + "epoch": 0.5633367049443684, + "grad_norm": 0.3585696578371747, + "learning_rate": 8.430792735538352e-06, + "loss": 0.3067, + "step": 2367 + }, + { + "epoch": 0.5635747010174332, + "grad_norm": 0.3443163729830074, + "learning_rate": 8.429390248485911e-06, + "loss": 0.3627, + "step": 2368 + }, + { + "epoch": 0.563812697090498, + "grad_norm": 0.3649074447807937, + "learning_rate": 8.42798725174124e-06, + "loss": 0.4325, + "step": 2369 + }, + { + "epoch": 0.5640506931635628, + "grad_norm": 0.3798587786258275, + "learning_rate": 8.426583745512862e-06, + "loss": 0.3246, + "step": 2370 + }, + { + "epoch": 0.5642886892366276, + "grad_norm": 0.40308412350983575, + "learning_rate": 8.425179730009368e-06, + "loss": 0.3384, + "step": 2371 + }, + { + "epoch": 0.5645266853096924, + "grad_norm": 0.4049157079324643, + "learning_rate": 8.423775205439433e-06, + "loss": 0.4236, + "step": 2372 + }, + { + "epoch": 0.5647646813827571, + "grad_norm": 0.3975137139248059, + "learning_rate": 8.4223701720118e-06, + "loss": 0.3573, + "step": 2373 + }, + { + "epoch": 0.565002677455822, + "grad_norm": 0.40160322607563, + "learning_rate": 8.420964629935294e-06, + "loss": 0.306, + "step": 2374 + }, + { + "epoch": 0.5652406735288867, + "grad_norm": 0.37308395646020814, + "learning_rate": 8.419558579418813e-06, + "loss": 0.3918, + "step": 2375 + }, + { + "epoch": 0.5654786696019516, + "grad_norm": 0.3514852696457898, + "learning_rate": 8.418152020671335e-06, + "loss": 0.4363, + "step": 2376 + }, + { + "epoch": 0.5657166656750163, + "grad_norm": 0.6416951229578869, + "learning_rate": 8.416744953901904e-06, + "loss": 0.3119, + "step": 2377 + }, + { + "epoch": 0.5659546617480812, + "grad_norm": 0.3436478077704659, + "learning_rate": 8.415337379319645e-06, + "loss": 0.3405, + "step": 2378 + }, + { + "epoch": 0.5661926578211459, + "grad_norm": 0.38888307187061655, + "learning_rate": 8.41392929713376e-06, + "loss": 0.3862, + "step": 2379 + }, + { + "epoch": 0.5664306538942108, + "grad_norm": 0.4044080349860496, + "learning_rate": 8.412520707553527e-06, + "loss": 0.4068, + "step": 2380 + }, + { + "epoch": 0.5666686499672755, + "grad_norm": 0.3738358783673891, + "learning_rate": 8.411111610788294e-06, + "loss": 0.3103, + "step": 2381 + }, + { + "epoch": 0.5669066460403404, + "grad_norm": 0.3894619985619245, + "learning_rate": 8.40970200704749e-06, + "loss": 0.3569, + "step": 2382 + }, + { + "epoch": 0.5671446421134051, + "grad_norm": 0.37956208132491615, + "learning_rate": 8.408291896540613e-06, + "loss": 0.4404, + "step": 2383 + }, + { + "epoch": 0.56738263818647, + "grad_norm": 0.4385341684832731, + "learning_rate": 8.406881279477244e-06, + "loss": 0.3509, + "step": 2384 + }, + { + "epoch": 0.5676206342595347, + "grad_norm": 0.43016413455724534, + "learning_rate": 8.405470156067038e-06, + "loss": 0.3038, + "step": 2385 + }, + { + "epoch": 0.5678586303325995, + "grad_norm": 0.41401646258681746, + "learning_rate": 8.404058526519717e-06, + "loss": 0.3866, + "step": 2386 + }, + { + "epoch": 0.5680966264056643, + "grad_norm": 0.3977676509749422, + "learning_rate": 8.402646391045085e-06, + "loss": 0.4203, + "step": 2387 + }, + { + "epoch": 0.5683346224787291, + "grad_norm": 0.47631691258225883, + "learning_rate": 8.401233749853024e-06, + "loss": 0.3177, + "step": 2388 + }, + { + "epoch": 0.5685726185517939, + "grad_norm": 0.3878321830383998, + "learning_rate": 8.399820603153483e-06, + "loss": 0.3613, + "step": 2389 + }, + { + "epoch": 0.5688106146248587, + "grad_norm": 0.3714565178735775, + "learning_rate": 8.398406951156496e-06, + "loss": 0.426, + "step": 2390 + }, + { + "epoch": 0.5690486106979235, + "grad_norm": 0.39492294312602905, + "learning_rate": 8.396992794072162e-06, + "loss": 0.3286, + "step": 2391 + }, + { + "epoch": 0.5692866067709883, + "grad_norm": 0.4043641921314243, + "learning_rate": 8.395578132110663e-06, + "loss": 0.3344, + "step": 2392 + }, + { + "epoch": 0.569524602844053, + "grad_norm": 0.39639917678624054, + "learning_rate": 8.394162965482249e-06, + "loss": 0.3757, + "step": 2393 + }, + { + "epoch": 0.5697625989171179, + "grad_norm": 0.37351452297347937, + "learning_rate": 8.39274729439725e-06, + "loss": 0.3834, + "step": 2394 + }, + { + "epoch": 0.5700005949901826, + "grad_norm": 0.4062182778345128, + "learning_rate": 8.391331119066071e-06, + "loss": 0.304, + "step": 2395 + }, + { + "epoch": 0.5702385910632475, + "grad_norm": 0.36745005612369885, + "learning_rate": 8.389914439699191e-06, + "loss": 0.3195, + "step": 2396 + }, + { + "epoch": 0.5704765871363122, + "grad_norm": 0.3687137278697919, + "learning_rate": 8.388497256507163e-06, + "loss": 0.3726, + "step": 2397 + }, + { + "epoch": 0.5707145832093771, + "grad_norm": 0.3792259871749328, + "learning_rate": 8.387079569700615e-06, + "loss": 0.3311, + "step": 2398 + }, + { + "epoch": 0.5709525792824418, + "grad_norm": 0.40696921581904394, + "learning_rate": 8.38566137949025e-06, + "loss": 0.2977, + "step": 2399 + }, + { + "epoch": 0.5711905753555067, + "grad_norm": 0.3699322815190473, + "learning_rate": 8.384242686086848e-06, + "loss": 0.3794, + "step": 2400 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.3826635436528603, + "learning_rate": 8.382823489701262e-06, + "loss": 0.4304, + "step": 2401 + }, + { + "epoch": 0.5716665675016362, + "grad_norm": 0.38070938472969035, + "learning_rate": 8.381403790544416e-06, + "loss": 0.3305, + "step": 2402 + }, + { + "epoch": 0.571904563574701, + "grad_norm": 0.3868544933608521, + "learning_rate": 8.379983588827314e-06, + "loss": 0.3067, + "step": 2403 + }, + { + "epoch": 0.5721425596477658, + "grad_norm": 0.39043334928429774, + "learning_rate": 8.378562884761037e-06, + "loss": 0.385, + "step": 2404 + }, + { + "epoch": 0.5723805557208306, + "grad_norm": 0.3953822016500048, + "learning_rate": 8.37714167855673e-06, + "loss": 0.3968, + "step": 2405 + }, + { + "epoch": 0.5726185517938954, + "grad_norm": 0.36014258721015646, + "learning_rate": 8.375719970425626e-06, + "loss": 0.3058, + "step": 2406 + }, + { + "epoch": 0.5728565478669602, + "grad_norm": 0.38242727883148314, + "learning_rate": 8.374297760579024e-06, + "loss": 0.3667, + "step": 2407 + }, + { + "epoch": 0.573094543940025, + "grad_norm": 0.3583190778095351, + "learning_rate": 8.372875049228295e-06, + "loss": 0.448, + "step": 2408 + }, + { + "epoch": 0.5733325400130898, + "grad_norm": 0.43781235930710827, + "learning_rate": 8.371451836584894e-06, + "loss": 0.3132, + "step": 2409 + }, + { + "epoch": 0.5735705360861546, + "grad_norm": 0.3688130777579906, + "learning_rate": 8.370028122860346e-06, + "loss": 0.3088, + "step": 2410 + }, + { + "epoch": 0.5738085321592193, + "grad_norm": 0.398460180753574, + "learning_rate": 8.368603908266248e-06, + "loss": 0.3805, + "step": 2411 + }, + { + "epoch": 0.5740465282322842, + "grad_norm": 0.39559802571213665, + "learning_rate": 8.367179193014275e-06, + "loss": 0.4192, + "step": 2412 + }, + { + "epoch": 0.5742845243053489, + "grad_norm": 0.41593476553175096, + "learning_rate": 8.365753977316171e-06, + "loss": 0.3284, + "step": 2413 + }, + { + "epoch": 0.5745225203784138, + "grad_norm": 0.3718359537773827, + "learning_rate": 8.364328261383763e-06, + "loss": 0.34, + "step": 2414 + }, + { + "epoch": 0.5747605164514785, + "grad_norm": 0.3557624684742045, + "learning_rate": 8.362902045428945e-06, + "loss": 0.4296, + "step": 2415 + }, + { + "epoch": 0.5749985125245434, + "grad_norm": 0.402514906677194, + "learning_rate": 8.36147532966369e-06, + "loss": 0.3406, + "step": 2416 + }, + { + "epoch": 0.5752365085976081, + "grad_norm": 0.39462407875443245, + "learning_rate": 8.360048114300041e-06, + "loss": 0.3383, + "step": 2417 + }, + { + "epoch": 0.575474504670673, + "grad_norm": 0.41311640572509384, + "learning_rate": 8.35862039955012e-06, + "loss": 0.3611, + "step": 2418 + }, + { + "epoch": 0.5757125007437377, + "grad_norm": 0.3917287001092687, + "learning_rate": 8.357192185626118e-06, + "loss": 0.4186, + "step": 2419 + }, + { + "epoch": 0.5759504968168025, + "grad_norm": 0.3467390799072633, + "learning_rate": 8.355763472740305e-06, + "loss": 0.3304, + "step": 2420 + }, + { + "epoch": 0.5761884928898673, + "grad_norm": 0.38769794783433975, + "learning_rate": 8.354334261105023e-06, + "loss": 0.3565, + "step": 2421 + }, + { + "epoch": 0.5764264889629321, + "grad_norm": 0.4041528840850423, + "learning_rate": 8.352904550932687e-06, + "loss": 0.4234, + "step": 2422 + }, + { + "epoch": 0.5766644850359969, + "grad_norm": 0.4100086929644872, + "learning_rate": 8.351474342435786e-06, + "loss": 0.356, + "step": 2423 + }, + { + "epoch": 0.5769024811090617, + "grad_norm": 0.36794983188883607, + "learning_rate": 8.350043635826888e-06, + "loss": 0.3224, + "step": 2424 + }, + { + "epoch": 0.5771404771821265, + "grad_norm": 0.40729170540747445, + "learning_rate": 8.34861243131863e-06, + "loss": 0.3484, + "step": 2425 + }, + { + "epoch": 0.5773784732551913, + "grad_norm": 0.40251376202597505, + "learning_rate": 8.347180729123724e-06, + "loss": 0.4518, + "step": 2426 + }, + { + "epoch": 0.577616469328256, + "grad_norm": 0.3852352526845826, + "learning_rate": 8.345748529454956e-06, + "loss": 0.3455, + "step": 2427 + }, + { + "epoch": 0.5778544654013209, + "grad_norm": 0.39160730175824005, + "learning_rate": 8.344315832525187e-06, + "loss": 0.3527, + "step": 2428 + }, + { + "epoch": 0.5780924614743856, + "grad_norm": 0.38108553823066177, + "learning_rate": 8.342882638547351e-06, + "loss": 0.4142, + "step": 2429 + }, + { + "epoch": 0.5783304575474505, + "grad_norm": 0.4092273445218704, + "learning_rate": 8.341448947734454e-06, + "loss": 0.3947, + "step": 2430 + }, + { + "epoch": 0.5785684536205152, + "grad_norm": 0.40333249898205964, + "learning_rate": 8.340014760299582e-06, + "loss": 0.3326, + "step": 2431 + }, + { + "epoch": 0.5788064496935801, + "grad_norm": 0.3729044861254844, + "learning_rate": 8.33858007645589e-06, + "loss": 0.3735, + "step": 2432 + }, + { + "epoch": 0.5790444457666448, + "grad_norm": 0.3671984616197141, + "learning_rate": 8.337144896416602e-06, + "loss": 0.3966, + "step": 2433 + }, + { + "epoch": 0.5792824418397097, + "grad_norm": 0.3836501393931116, + "learning_rate": 8.335709220395029e-06, + "loss": 0.3285, + "step": 2434 + }, + { + "epoch": 0.5795204379127744, + "grad_norm": 0.35340054131259374, + "learning_rate": 8.334273048604541e-06, + "loss": 0.3307, + "step": 2435 + }, + { + "epoch": 0.5797584339858393, + "grad_norm": 0.3439612755818031, + "learning_rate": 8.332836381258596e-06, + "loss": 0.3899, + "step": 2436 + }, + { + "epoch": 0.579996430058904, + "grad_norm": 0.3569322611942811, + "learning_rate": 8.331399218570711e-06, + "loss": 0.4009, + "step": 2437 + }, + { + "epoch": 0.5802344261319689, + "grad_norm": 0.6662367868543072, + "learning_rate": 8.32996156075449e-06, + "loss": 0.3147, + "step": 2438 + }, + { + "epoch": 0.5804724222050336, + "grad_norm": 0.4227515779891762, + "learning_rate": 8.328523408023599e-06, + "loss": 0.3299, + "step": 2439 + }, + { + "epoch": 0.5807104182780984, + "grad_norm": 0.37706784919884295, + "learning_rate": 8.327084760591788e-06, + "loss": 0.4154, + "step": 2440 + }, + { + "epoch": 0.5809484143511632, + "grad_norm": 0.36959641829833456, + "learning_rate": 8.325645618672873e-06, + "loss": 0.3566, + "step": 2441 + }, + { + "epoch": 0.581186410424228, + "grad_norm": 0.38877783129131926, + "learning_rate": 8.324205982480747e-06, + "loss": 0.3094, + "step": 2442 + }, + { + "epoch": 0.5814244064972928, + "grad_norm": 0.4093291177503294, + "learning_rate": 8.322765852229373e-06, + "loss": 0.3788, + "step": 2443 + }, + { + "epoch": 0.5816624025703576, + "grad_norm": 0.35993564376286113, + "learning_rate": 8.321325228132793e-06, + "loss": 0.3971, + "step": 2444 + }, + { + "epoch": 0.5819003986434224, + "grad_norm": 0.37858365267915045, + "learning_rate": 8.31988411040512e-06, + "loss": 0.3312, + "step": 2445 + }, + { + "epoch": 0.5821383947164872, + "grad_norm": 0.36232519560188203, + "learning_rate": 8.318442499260538e-06, + "loss": 0.3209, + "step": 2446 + }, + { + "epoch": 0.582376390789552, + "grad_norm": 0.37803394847163874, + "learning_rate": 8.317000394913304e-06, + "loss": 0.4168, + "step": 2447 + }, + { + "epoch": 0.5826143868626168, + "grad_norm": 0.39433181440916604, + "learning_rate": 8.315557797577754e-06, + "loss": 0.3812, + "step": 2448 + }, + { + "epoch": 0.5828523829356815, + "grad_norm": 0.403795746000504, + "learning_rate": 8.314114707468293e-06, + "loss": 0.2969, + "step": 2449 + }, + { + "epoch": 0.5830903790087464, + "grad_norm": 0.3611167532562934, + "learning_rate": 8.312671124799398e-06, + "loss": 0.3553, + "step": 2450 + }, + { + "epoch": 0.5833283750818111, + "grad_norm": 0.38084432296705717, + "learning_rate": 8.311227049785623e-06, + "loss": 0.4011, + "step": 2451 + }, + { + "epoch": 0.583566371154876, + "grad_norm": 0.3655837949285881, + "learning_rate": 8.309782482641595e-06, + "loss": 0.3379, + "step": 2452 + }, + { + "epoch": 0.5838043672279407, + "grad_norm": 0.3913423734099842, + "learning_rate": 8.308337423582006e-06, + "loss": 0.3441, + "step": 2453 + }, + { + "epoch": 0.5840423633010056, + "grad_norm": 0.39397581704554785, + "learning_rate": 8.306891872821635e-06, + "loss": 0.3829, + "step": 2454 + }, + { + "epoch": 0.5842803593740703, + "grad_norm": 0.3843675181792697, + "learning_rate": 8.30544583057532e-06, + "loss": 0.3929, + "step": 2455 + }, + { + "epoch": 0.5845183554471352, + "grad_norm": 0.3832844659583936, + "learning_rate": 8.303999297057983e-06, + "loss": 0.3161, + "step": 2456 + }, + { + "epoch": 0.5847563515201999, + "grad_norm": 0.39763166006214656, + "learning_rate": 8.302552272484613e-06, + "loss": 0.3667, + "step": 2457 + }, + { + "epoch": 0.5849943475932647, + "grad_norm": 0.3585575011269157, + "learning_rate": 8.301104757070276e-06, + "loss": 0.4027, + "step": 2458 + }, + { + "epoch": 0.5852323436663295, + "grad_norm": 0.38820380899321066, + "learning_rate": 8.299656751030105e-06, + "loss": 0.3341, + "step": 2459 + }, + { + "epoch": 0.5854703397393943, + "grad_norm": 0.3836385113109399, + "learning_rate": 8.298208254579311e-06, + "loss": 0.3173, + "step": 2460 + }, + { + "epoch": 0.5857083358124591, + "grad_norm": 0.38574649888799817, + "learning_rate": 8.296759267933178e-06, + "loss": 0.3862, + "step": 2461 + }, + { + "epoch": 0.5859463318855239, + "grad_norm": 0.3684542994428625, + "learning_rate": 8.29530979130706e-06, + "loss": 0.3995, + "step": 2462 + }, + { + "epoch": 0.5861843279585887, + "grad_norm": 0.39106683323722646, + "learning_rate": 8.293859824916383e-06, + "loss": 0.3097, + "step": 2463 + }, + { + "epoch": 0.5864223240316535, + "grad_norm": 0.4653396505467396, + "learning_rate": 8.29240936897665e-06, + "loss": 0.3342, + "step": 2464 + }, + { + "epoch": 0.5866603201047182, + "grad_norm": 0.3895719288501651, + "learning_rate": 8.290958423703437e-06, + "loss": 0.4107, + "step": 2465 + }, + { + "epoch": 0.5868983161777831, + "grad_norm": 0.4018410614287424, + "learning_rate": 8.289506989312386e-06, + "loss": 0.3257, + "step": 2466 + }, + { + "epoch": 0.5871363122508478, + "grad_norm": 0.3944410384951886, + "learning_rate": 8.288055066019218e-06, + "loss": 0.3238, + "step": 2467 + }, + { + "epoch": 0.5873743083239127, + "grad_norm": 0.38792256065200875, + "learning_rate": 8.286602654039724e-06, + "loss": 0.3826, + "step": 2468 + }, + { + "epoch": 0.5876123043969774, + "grad_norm": 0.38502496740303777, + "learning_rate": 8.28514975358977e-06, + "loss": 0.3997, + "step": 2469 + }, + { + "epoch": 0.5878503004700423, + "grad_norm": 0.37703883930285037, + "learning_rate": 8.283696364885293e-06, + "loss": 0.3356, + "step": 2470 + }, + { + "epoch": 0.588088296543107, + "grad_norm": 0.3861610449357546, + "learning_rate": 8.282242488142299e-06, + "loss": 0.3348, + "step": 2471 + }, + { + "epoch": 0.5883262926161719, + "grad_norm": 0.3780090152003883, + "learning_rate": 8.280788123576873e-06, + "loss": 0.4045, + "step": 2472 + }, + { + "epoch": 0.5885642886892366, + "grad_norm": 0.3801772021397056, + "learning_rate": 8.279333271405171e-06, + "loss": 0.3941, + "step": 2473 + }, + { + "epoch": 0.5888022847623015, + "grad_norm": 0.3903145537175543, + "learning_rate": 8.277877931843417e-06, + "loss": 0.32, + "step": 2474 + }, + { + "epoch": 0.5890402808353662, + "grad_norm": 0.3762448521852173, + "learning_rate": 8.276422105107911e-06, + "loss": 0.3519, + "step": 2475 + }, + { + "epoch": 0.589278276908431, + "grad_norm": 0.4024863557944298, + "learning_rate": 8.274965791415026e-06, + "loss": 0.4126, + "step": 2476 + }, + { + "epoch": 0.5895162729814958, + "grad_norm": 0.3810042080891356, + "learning_rate": 8.273508990981206e-06, + "loss": 0.3461, + "step": 2477 + }, + { + "epoch": 0.5897542690545606, + "grad_norm": 0.39863438603953905, + "learning_rate": 8.272051704022965e-06, + "loss": 0.3328, + "step": 2478 + }, + { + "epoch": 0.5899922651276254, + "grad_norm": 0.3867478091098192, + "learning_rate": 8.270593930756897e-06, + "loss": 0.377, + "step": 2479 + }, + { + "epoch": 0.5902302612006902, + "grad_norm": 0.3879898329337162, + "learning_rate": 8.269135671399659e-06, + "loss": 0.3646, + "step": 2480 + }, + { + "epoch": 0.590468257273755, + "grad_norm": 0.36772292163364306, + "learning_rate": 8.267676926167986e-06, + "loss": 0.3086, + "step": 2481 + }, + { + "epoch": 0.5907062533468198, + "grad_norm": 0.3565818145381248, + "learning_rate": 8.266217695278682e-06, + "loss": 0.355, + "step": 2482 + }, + { + "epoch": 0.5909442494198845, + "grad_norm": 0.37046723786837815, + "learning_rate": 8.264757978948627e-06, + "loss": 0.4076, + "step": 2483 + }, + { + "epoch": 0.5911822454929494, + "grad_norm": 0.4249706612993194, + "learning_rate": 8.263297777394772e-06, + "loss": 0.3327, + "step": 2484 + }, + { + "epoch": 0.5914202415660141, + "grad_norm": 0.3951802875165859, + "learning_rate": 8.261837090834135e-06, + "loss": 0.3376, + "step": 2485 + }, + { + "epoch": 0.591658237639079, + "grad_norm": 0.3673787092456175, + "learning_rate": 8.260375919483812e-06, + "loss": 0.3509, + "step": 2486 + }, + { + "epoch": 0.5918962337121437, + "grad_norm": 0.3896876553874139, + "learning_rate": 8.258914263560971e-06, + "loss": 0.4089, + "step": 2487 + }, + { + "epoch": 0.5921342297852086, + "grad_norm": 0.37425201217841664, + "learning_rate": 8.257452123282847e-06, + "loss": 0.3259, + "step": 2488 + }, + { + "epoch": 0.5923722258582733, + "grad_norm": 0.3859687694225435, + "learning_rate": 8.255989498866754e-06, + "loss": 0.3243, + "step": 2489 + }, + { + "epoch": 0.5926102219313382, + "grad_norm": 0.3708026305032604, + "learning_rate": 8.254526390530071e-06, + "loss": 0.3954, + "step": 2490 + }, + { + "epoch": 0.5928482180044029, + "grad_norm": 0.3794998880457793, + "learning_rate": 8.253062798490255e-06, + "loss": 0.3535, + "step": 2491 + }, + { + "epoch": 0.5930862140774678, + "grad_norm": 0.4489229665961381, + "learning_rate": 8.251598722964828e-06, + "loss": 0.3371, + "step": 2492 + }, + { + "epoch": 0.5933242101505325, + "grad_norm": 0.3631137000424927, + "learning_rate": 8.250134164171391e-06, + "loss": 0.3718, + "step": 2493 + }, + { + "epoch": 0.5935622062235973, + "grad_norm": 0.3527479506637244, + "learning_rate": 8.248669122327612e-06, + "loss": 0.3933, + "step": 2494 + }, + { + "epoch": 0.5938002022966621, + "grad_norm": 0.3742555835211287, + "learning_rate": 8.247203597651234e-06, + "loss": 0.3274, + "step": 2495 + }, + { + "epoch": 0.5940381983697269, + "grad_norm": 0.4308062065966235, + "learning_rate": 8.24573759036007e-06, + "loss": 0.3666, + "step": 2496 + }, + { + "epoch": 0.5942761944427917, + "grad_norm": 0.3966932502889181, + "learning_rate": 8.244271100672004e-06, + "loss": 0.4066, + "step": 2497 + }, + { + "epoch": 0.5945141905158565, + "grad_norm": 0.35946849534322295, + "learning_rate": 8.242804128804993e-06, + "loss": 0.3555, + "step": 2498 + }, + { + "epoch": 0.5947521865889213, + "grad_norm": 0.37828665852870985, + "learning_rate": 8.241336674977064e-06, + "loss": 0.3126, + "step": 2499 + }, + { + "epoch": 0.5949901826619861, + "grad_norm": 0.38560613627125817, + "learning_rate": 8.23986873940632e-06, + "loss": 0.3595, + "step": 2500 + }, + { + "epoch": 0.5952281787350509, + "grad_norm": 0.3933069848593732, + "learning_rate": 8.238400322310931e-06, + "loss": 0.4599, + "step": 2501 + }, + { + "epoch": 0.5954661748081157, + "grad_norm": 0.37906655075786916, + "learning_rate": 8.23693142390914e-06, + "loss": 0.3189, + "step": 2502 + }, + { + "epoch": 0.5957041708811804, + "grad_norm": 0.3597221601036792, + "learning_rate": 8.23546204441926e-06, + "loss": 0.3001, + "step": 2503 + }, + { + "epoch": 0.5959421669542453, + "grad_norm": 0.36715814207073877, + "learning_rate": 8.233992184059681e-06, + "loss": 0.3793, + "step": 2504 + }, + { + "epoch": 0.59618016302731, + "grad_norm": 0.37986642282050925, + "learning_rate": 8.232521843048855e-06, + "loss": 0.378, + "step": 2505 + }, + { + "epoch": 0.5964181591003749, + "grad_norm": 0.38641730071730823, + "learning_rate": 8.231051021605316e-06, + "loss": 0.3321, + "step": 2506 + }, + { + "epoch": 0.5966561551734396, + "grad_norm": 0.4161340087618939, + "learning_rate": 8.229579719947664e-06, + "loss": 0.3855, + "step": 2507 + }, + { + "epoch": 0.5968941512465045, + "grad_norm": 0.3557911482780271, + "learning_rate": 8.228107938294568e-06, + "loss": 0.3944, + "step": 2508 + }, + { + "epoch": 0.5971321473195692, + "grad_norm": 0.37982110625589854, + "learning_rate": 8.226635676864774e-06, + "loss": 0.3175, + "step": 2509 + }, + { + "epoch": 0.5973701433926341, + "grad_norm": 0.37294853078019546, + "learning_rate": 8.225162935877096e-06, + "loss": 0.3159, + "step": 2510 + }, + { + "epoch": 0.5976081394656988, + "grad_norm": 0.3732432601966647, + "learning_rate": 8.223689715550417e-06, + "loss": 0.4083, + "step": 2511 + }, + { + "epoch": 0.5978461355387636, + "grad_norm": 0.37372480614911496, + "learning_rate": 8.222216016103697e-06, + "loss": 0.4262, + "step": 2512 + }, + { + "epoch": 0.5980841316118284, + "grad_norm": 0.35343278143642487, + "learning_rate": 8.220741837755964e-06, + "loss": 0.3131, + "step": 2513 + }, + { + "epoch": 0.5983221276848932, + "grad_norm": 0.36855420142012446, + "learning_rate": 8.219267180726315e-06, + "loss": 0.3692, + "step": 2514 + }, + { + "epoch": 0.598560123757958, + "grad_norm": 0.3994967531361665, + "learning_rate": 8.217792045233924e-06, + "loss": 0.3973, + "step": 2515 + }, + { + "epoch": 0.5987981198310228, + "grad_norm": 0.4108525006798571, + "learning_rate": 8.216316431498028e-06, + "loss": 0.3718, + "step": 2516 + }, + { + "epoch": 0.5990361159040876, + "grad_norm": 0.40662138405492826, + "learning_rate": 8.214840339737943e-06, + "loss": 0.2975, + "step": 2517 + }, + { + "epoch": 0.5992741119771524, + "grad_norm": 0.3728366285892858, + "learning_rate": 8.213363770173054e-06, + "loss": 0.3559, + "step": 2518 + }, + { + "epoch": 0.5995121080502172, + "grad_norm": 0.37944723614507436, + "learning_rate": 8.211886723022814e-06, + "loss": 0.4264, + "step": 2519 + }, + { + "epoch": 0.599750104123282, + "grad_norm": 0.3611408857103594, + "learning_rate": 8.210409198506748e-06, + "loss": 0.3163, + "step": 2520 + }, + { + "epoch": 0.5999881001963467, + "grad_norm": 0.3729043334589071, + "learning_rate": 8.208931196844453e-06, + "loss": 0.3315, + "step": 2521 + }, + { + "epoch": 0.6002260962694116, + "grad_norm": 0.39851502121087623, + "learning_rate": 8.207452718255597e-06, + "loss": 0.4049, + "step": 2522 + }, + { + "epoch": 0.6004640923424763, + "grad_norm": 0.402522005091446, + "learning_rate": 8.20597376295992e-06, + "loss": 0.3749, + "step": 2523 + }, + { + "epoch": 0.6007020884155412, + "grad_norm": 0.4137714780671095, + "learning_rate": 8.204494331177229e-06, + "loss": 0.3014, + "step": 2524 + }, + { + "epoch": 0.6009400844886059, + "grad_norm": 0.3901604913593569, + "learning_rate": 8.203014423127405e-06, + "loss": 0.3693, + "step": 2525 + }, + { + "epoch": 0.6011780805616708, + "grad_norm": 0.3839384353345892, + "learning_rate": 8.201534039030398e-06, + "loss": 0.4382, + "step": 2526 + }, + { + "epoch": 0.6014160766347355, + "grad_norm": 0.39209431032725645, + "learning_rate": 8.20005317910623e-06, + "loss": 0.3409, + "step": 2527 + }, + { + "epoch": 0.6016540727078004, + "grad_norm": 0.39203793684317206, + "learning_rate": 8.198571843574997e-06, + "loss": 0.3048, + "step": 2528 + }, + { + "epoch": 0.6018920687808651, + "grad_norm": 0.3696671599005409, + "learning_rate": 8.197090032656858e-06, + "loss": 0.4085, + "step": 2529 + }, + { + "epoch": 0.60213006485393, + "grad_norm": 0.36909421514297636, + "learning_rate": 8.195607746572047e-06, + "loss": 0.3601, + "step": 2530 + }, + { + "epoch": 0.6023680609269947, + "grad_norm": 0.4061715297820262, + "learning_rate": 8.19412498554087e-06, + "loss": 0.3074, + "step": 2531 + }, + { + "epoch": 0.6026060570000595, + "grad_norm": 0.42169562952568157, + "learning_rate": 8.192641749783703e-06, + "loss": 0.3869, + "step": 2532 + }, + { + "epoch": 0.6028440530731243, + "grad_norm": 0.37904313486242136, + "learning_rate": 8.191158039520986e-06, + "loss": 0.4187, + "step": 2533 + }, + { + "epoch": 0.6030820491461891, + "grad_norm": 0.3933415457883819, + "learning_rate": 8.18967385497324e-06, + "loss": 0.3349, + "step": 2534 + }, + { + "epoch": 0.6033200452192539, + "grad_norm": 0.37746188889384785, + "learning_rate": 8.188189196361052e-06, + "loss": 0.3126, + "step": 2535 + }, + { + "epoch": 0.6035580412923187, + "grad_norm": 0.37077728868779936, + "learning_rate": 8.186704063905078e-06, + "loss": 0.3702, + "step": 2536 + }, + { + "epoch": 0.6037960373653835, + "grad_norm": 0.384234124457375, + "learning_rate": 8.185218457826043e-06, + "loss": 0.4062, + "step": 2537 + }, + { + "epoch": 0.6040340334384483, + "grad_norm": 8.292846660801779, + "learning_rate": 8.183732378344747e-06, + "loss": 0.3263, + "step": 2538 + }, + { + "epoch": 0.604272029511513, + "grad_norm": 0.3985849899773034, + "learning_rate": 8.18224582568206e-06, + "loss": 0.3693, + "step": 2539 + }, + { + "epoch": 0.6045100255845779, + "grad_norm": 0.36445215405013764, + "learning_rate": 8.180758800058914e-06, + "loss": 0.3774, + "step": 2540 + }, + { + "epoch": 0.6047480216576426, + "grad_norm": 0.38537810047531207, + "learning_rate": 8.179271301696326e-06, + "loss": 0.3585, + "step": 2541 + }, + { + "epoch": 0.6049860177307075, + "grad_norm": 0.3722062094603994, + "learning_rate": 8.17778333081537e-06, + "loss": 0.3261, + "step": 2542 + }, + { + "epoch": 0.6052240138037722, + "grad_norm": 0.43085688766278635, + "learning_rate": 8.176294887637195e-06, + "loss": 0.3491, + "step": 2543 + }, + { + "epoch": 0.6054620098768371, + "grad_norm": 0.4080559076822167, + "learning_rate": 8.174805972383024e-06, + "loss": 0.3951, + "step": 2544 + }, + { + "epoch": 0.6057000059499018, + "grad_norm": 0.37312331547160477, + "learning_rate": 8.173316585274144e-06, + "loss": 0.3025, + "step": 2545 + }, + { + "epoch": 0.6059380020229667, + "grad_norm": 0.3667533329998851, + "learning_rate": 8.171826726531916e-06, + "loss": 0.3169, + "step": 2546 + }, + { + "epoch": 0.6061759980960314, + "grad_norm": 0.3893058682249162, + "learning_rate": 8.170336396377767e-06, + "loss": 0.4053, + "step": 2547 + }, + { + "epoch": 0.6064139941690962, + "grad_norm": 0.36637391718899653, + "learning_rate": 8.168845595033202e-06, + "loss": 0.3559, + "step": 2548 + }, + { + "epoch": 0.606651990242161, + "grad_norm": 0.41529994785116214, + "learning_rate": 8.167354322719785e-06, + "loss": 0.3117, + "step": 2549 + }, + { + "epoch": 0.6068899863152258, + "grad_norm": 0.38610454842175557, + "learning_rate": 8.165862579659161e-06, + "loss": 0.3633, + "step": 2550 + }, + { + "epoch": 0.6071279823882906, + "grad_norm": 0.3753416629984086, + "learning_rate": 8.164370366073038e-06, + "loss": 0.4248, + "step": 2551 + }, + { + "epoch": 0.6073659784613554, + "grad_norm": 0.394779197165333, + "learning_rate": 8.162877682183197e-06, + "loss": 0.3117, + "step": 2552 + }, + { + "epoch": 0.6076039745344202, + "grad_norm": 0.4086616702369126, + "learning_rate": 8.161384528211485e-06, + "loss": 0.3191, + "step": 2553 + }, + { + "epoch": 0.607841970607485, + "grad_norm": 0.3788225625439413, + "learning_rate": 8.159890904379823e-06, + "loss": 0.3859, + "step": 2554 + }, + { + "epoch": 0.6080799666805498, + "grad_norm": 0.38335294969113787, + "learning_rate": 8.158396810910201e-06, + "loss": 0.3708, + "step": 2555 + }, + { + "epoch": 0.6083179627536146, + "grad_norm": 0.38664017085300634, + "learning_rate": 8.156902248024678e-06, + "loss": 0.3318, + "step": 2556 + }, + { + "epoch": 0.6085559588266793, + "grad_norm": 0.38453529856278706, + "learning_rate": 8.155407215945382e-06, + "loss": 0.3584, + "step": 2557 + }, + { + "epoch": 0.6087939548997442, + "grad_norm": 0.42462216171053496, + "learning_rate": 8.153911714894513e-06, + "loss": 0.4105, + "step": 2558 + }, + { + "epoch": 0.6090319509728089, + "grad_norm": 0.39687180601172, + "learning_rate": 8.152415745094342e-06, + "loss": 0.3442, + "step": 2559 + }, + { + "epoch": 0.6092699470458738, + "grad_norm": 0.4136807644278921, + "learning_rate": 8.150919306767202e-06, + "loss": 0.3311, + "step": 2560 + }, + { + "epoch": 0.6095079431189385, + "grad_norm": 0.35777343605244677, + "learning_rate": 8.149422400135503e-06, + "loss": 0.3723, + "step": 2561 + }, + { + "epoch": 0.6097459391920034, + "grad_norm": 0.4469538872661931, + "learning_rate": 8.14792502542172e-06, + "loss": 0.3759, + "step": 2562 + }, + { + "epoch": 0.6099839352650681, + "grad_norm": 0.39114145249451315, + "learning_rate": 8.146427182848407e-06, + "loss": 0.2963, + "step": 2563 + }, + { + "epoch": 0.610221931338133, + "grad_norm": 0.39846643949056665, + "learning_rate": 8.144928872638174e-06, + "loss": 0.3149, + "step": 2564 + }, + { + "epoch": 0.6104599274111977, + "grad_norm": 0.39040261177798785, + "learning_rate": 8.143430095013706e-06, + "loss": 0.3969, + "step": 2565 + }, + { + "epoch": 0.6106979234842626, + "grad_norm": 0.3735836629849792, + "learning_rate": 8.141930850197765e-06, + "loss": 0.3382, + "step": 2566 + }, + { + "epoch": 0.6109359195573273, + "grad_norm": 0.3736061629320796, + "learning_rate": 8.14043113841317e-06, + "loss": 0.307, + "step": 2567 + }, + { + "epoch": 0.6111739156303921, + "grad_norm": 0.35064240017216614, + "learning_rate": 8.138930959882818e-06, + "loss": 0.3508, + "step": 2568 + }, + { + "epoch": 0.6114119117034569, + "grad_norm": 0.39486353842354627, + "learning_rate": 8.137430314829671e-06, + "loss": 0.4483, + "step": 2569 + }, + { + "epoch": 0.6116499077765217, + "grad_norm": 0.3759570099382165, + "learning_rate": 8.135929203476764e-06, + "loss": 0.343, + "step": 2570 + }, + { + "epoch": 0.6118879038495865, + "grad_norm": 0.376912992715495, + "learning_rate": 8.134427626047198e-06, + "loss": 0.3342, + "step": 2571 + }, + { + "epoch": 0.6121258999226513, + "grad_norm": 0.3675782252622046, + "learning_rate": 8.132925582764144e-06, + "loss": 0.4009, + "step": 2572 + }, + { + "epoch": 0.6123638959957161, + "grad_norm": 0.37894873028788706, + "learning_rate": 8.131423073850845e-06, + "loss": 0.3797, + "step": 2573 + }, + { + "epoch": 0.6126018920687809, + "grad_norm": 0.41501690842186384, + "learning_rate": 8.129920099530608e-06, + "loss": 0.3149, + "step": 2574 + }, + { + "epoch": 0.6128398881418456, + "grad_norm": 0.3901236279329649, + "learning_rate": 8.128416660026816e-06, + "loss": 0.3559, + "step": 2575 + }, + { + "epoch": 0.6130778842149105, + "grad_norm": 0.36410270538740547, + "learning_rate": 8.126912755562913e-06, + "loss": 0.4105, + "step": 2576 + }, + { + "epoch": 0.6133158802879752, + "grad_norm": 0.40445769613053906, + "learning_rate": 8.125408386362419e-06, + "loss": 0.3565, + "step": 2577 + }, + { + "epoch": 0.6135538763610401, + "grad_norm": 0.37585605678511325, + "learning_rate": 8.12390355264892e-06, + "loss": 0.299, + "step": 2578 + }, + { + "epoch": 0.6137918724341048, + "grad_norm": 0.35091890724717334, + "learning_rate": 8.122398254646071e-06, + "loss": 0.3952, + "step": 2579 + }, + { + "epoch": 0.6140298685071697, + "grad_norm": 0.3858838022367494, + "learning_rate": 8.120892492577598e-06, + "loss": 0.4277, + "step": 2580 + }, + { + "epoch": 0.6142678645802344, + "grad_norm": 0.39895986019373647, + "learning_rate": 8.119386266667292e-06, + "loss": 0.3109, + "step": 2581 + }, + { + "epoch": 0.6145058606532993, + "grad_norm": 0.4111713762728162, + "learning_rate": 8.117879577139019e-06, + "loss": 0.3601, + "step": 2582 + }, + { + "epoch": 0.614743856726364, + "grad_norm": 0.3873631664853874, + "learning_rate": 8.116372424216705e-06, + "loss": 0.4458, + "step": 2583 + }, + { + "epoch": 0.6149818527994289, + "grad_norm": 0.3810771917479411, + "learning_rate": 8.114864808124356e-06, + "loss": 0.296, + "step": 2584 + }, + { + "epoch": 0.6152198488724936, + "grad_norm": 0.39306780621427023, + "learning_rate": 8.113356729086038e-06, + "loss": 0.3077, + "step": 2585 + }, + { + "epoch": 0.6154578449455584, + "grad_norm": 0.3857981143058693, + "learning_rate": 8.111848187325889e-06, + "loss": 0.3732, + "step": 2586 + }, + { + "epoch": 0.6156958410186232, + "grad_norm": 0.37904552046137224, + "learning_rate": 8.110339183068117e-06, + "loss": 0.3811, + "step": 2587 + }, + { + "epoch": 0.615933837091688, + "grad_norm": 0.35684929770087614, + "learning_rate": 8.108829716536993e-06, + "loss": 0.3357, + "step": 2588 + }, + { + "epoch": 0.6161718331647528, + "grad_norm": 0.37821384127046404, + "learning_rate": 8.107319787956866e-06, + "loss": 0.3508, + "step": 2589 + }, + { + "epoch": 0.6164098292378176, + "grad_norm": 0.3574697327965667, + "learning_rate": 8.105809397552148e-06, + "loss": 0.4393, + "step": 2590 + }, + { + "epoch": 0.6166478253108824, + "grad_norm": 0.39258448327977785, + "learning_rate": 8.10429854554732e-06, + "loss": 0.3101, + "step": 2591 + }, + { + "epoch": 0.6168858213839472, + "grad_norm": 0.3888467385192024, + "learning_rate": 8.10278723216693e-06, + "loss": 0.3177, + "step": 2592 + }, + { + "epoch": 0.617123817457012, + "grad_norm": 0.3852959961633021, + "learning_rate": 8.101275457635601e-06, + "loss": 0.3663, + "step": 2593 + }, + { + "epoch": 0.6173618135300768, + "grad_norm": 0.40121539043640597, + "learning_rate": 8.099763222178015e-06, + "loss": 0.405, + "step": 2594 + }, + { + "epoch": 0.6175998096031415, + "grad_norm": 0.4081613317939108, + "learning_rate": 8.098250526018927e-06, + "loss": 0.3019, + "step": 2595 + }, + { + "epoch": 0.6178378056762064, + "grad_norm": 0.45909803930292775, + "learning_rate": 8.096737369383167e-06, + "loss": 0.339, + "step": 2596 + }, + { + "epoch": 0.6180758017492711, + "grad_norm": 0.39288919774336073, + "learning_rate": 8.095223752495625e-06, + "loss": 0.3849, + "step": 2597 + }, + { + "epoch": 0.618313797822336, + "grad_norm": 0.40645028794870114, + "learning_rate": 8.09370967558126e-06, + "loss": 0.3484, + "step": 2598 + }, + { + "epoch": 0.6185517938954007, + "grad_norm": 0.36773838171647716, + "learning_rate": 8.092195138865102e-06, + "loss": 0.3205, + "step": 2599 + }, + { + "epoch": 0.6187897899684656, + "grad_norm": 0.39094679473606364, + "learning_rate": 8.090680142572251e-06, + "loss": 0.3719, + "step": 2600 + }, + { + "epoch": 0.6190277860415303, + "grad_norm": 0.37593142152302145, + "learning_rate": 8.089164686927869e-06, + "loss": 0.4178, + "step": 2601 + }, + { + "epoch": 0.6192657821145952, + "grad_norm": 0.3875234390518961, + "learning_rate": 8.087648772157193e-06, + "loss": 0.3594, + "step": 2602 + }, + { + "epoch": 0.6195037781876599, + "grad_norm": 0.3937548041329991, + "learning_rate": 8.086132398485525e-06, + "loss": 0.3272, + "step": 2603 + }, + { + "epoch": 0.6197417742607247, + "grad_norm": 0.4882360341404345, + "learning_rate": 8.084615566138234e-06, + "loss": 0.3972, + "step": 2604 + }, + { + "epoch": 0.6199797703337895, + "grad_norm": 0.3644617243184282, + "learning_rate": 8.083098275340762e-06, + "loss": 0.3796, + "step": 2605 + }, + { + "epoch": 0.6202177664068543, + "grad_norm": 0.3464988959256686, + "learning_rate": 8.081580526318614e-06, + "loss": 0.3145, + "step": 2606 + }, + { + "epoch": 0.6204557624799191, + "grad_norm": 0.37402621710256423, + "learning_rate": 8.080062319297364e-06, + "loss": 0.3315, + "step": 2607 + }, + { + "epoch": 0.6206937585529839, + "grad_norm": 0.40711958296792816, + "learning_rate": 8.078543654502656e-06, + "loss": 0.4, + "step": 2608 + }, + { + "epoch": 0.6209317546260487, + "grad_norm": 0.36492939402646146, + "learning_rate": 8.077024532160202e-06, + "loss": 0.3082, + "step": 2609 + }, + { + "epoch": 0.6211697506991135, + "grad_norm": 0.37941276134920326, + "learning_rate": 8.075504952495781e-06, + "loss": 0.3377, + "step": 2610 + }, + { + "epoch": 0.6214077467721782, + "grad_norm": 0.4020215489655943, + "learning_rate": 8.07398491573524e-06, + "loss": 0.3639, + "step": 2611 + }, + { + "epoch": 0.6216457428452431, + "grad_norm": 0.4020370198246242, + "learning_rate": 8.072464422104493e-06, + "loss": 0.4031, + "step": 2612 + }, + { + "epoch": 0.6218837389183078, + "grad_norm": 0.4204655833624215, + "learning_rate": 8.070943471829524e-06, + "loss": 0.2973, + "step": 2613 + }, + { + "epoch": 0.6221217349913727, + "grad_norm": 0.4006137318758786, + "learning_rate": 8.069422065136386e-06, + "loss": 0.3611, + "step": 2614 + }, + { + "epoch": 0.6223597310644374, + "grad_norm": 0.3662514094870652, + "learning_rate": 8.067900202251191e-06, + "loss": 0.397, + "step": 2615 + }, + { + "epoch": 0.6225977271375023, + "grad_norm": 0.44999481000949004, + "learning_rate": 8.066377883400132e-06, + "loss": 0.3546, + "step": 2616 + }, + { + "epoch": 0.622835723210567, + "grad_norm": 0.364463764085942, + "learning_rate": 8.064855108809461e-06, + "loss": 0.2948, + "step": 2617 + }, + { + "epoch": 0.6230737192836319, + "grad_norm": 0.3964833253922973, + "learning_rate": 8.063331878705499e-06, + "loss": 0.3764, + "step": 2618 + }, + { + "epoch": 0.6233117153566966, + "grad_norm": 0.3514192516820972, + "learning_rate": 8.061808193314638e-06, + "loss": 0.4172, + "step": 2619 + }, + { + "epoch": 0.6235497114297615, + "grad_norm": 0.3548064383030754, + "learning_rate": 8.060284052863334e-06, + "loss": 0.3202, + "step": 2620 + }, + { + "epoch": 0.6237877075028262, + "grad_norm": 0.38064126138563015, + "learning_rate": 8.05875945757811e-06, + "loss": 0.3158, + "step": 2621 + }, + { + "epoch": 0.624025703575891, + "grad_norm": 0.3849786777328987, + "learning_rate": 8.057234407685563e-06, + "loss": 0.3875, + "step": 2622 + }, + { + "epoch": 0.6242636996489558, + "grad_norm": 0.3737306119830634, + "learning_rate": 8.05570890341235e-06, + "loss": 0.3512, + "step": 2623 + }, + { + "epoch": 0.6245016957220206, + "grad_norm": 0.4064964030178299, + "learning_rate": 8.054182944985198e-06, + "loss": 0.3145, + "step": 2624 + }, + { + "epoch": 0.6247396917950854, + "grad_norm": 0.38133945827871857, + "learning_rate": 8.052656532630905e-06, + "loss": 0.36, + "step": 2625 + }, + { + "epoch": 0.6249776878681502, + "grad_norm": 0.34831792324190414, + "learning_rate": 8.051129666576331e-06, + "loss": 0.4264, + "step": 2626 + }, + { + "epoch": 0.625215683941215, + "grad_norm": 0.38814660085579894, + "learning_rate": 8.049602347048408e-06, + "loss": 0.3214, + "step": 2627 + }, + { + "epoch": 0.6254536800142798, + "grad_norm": 0.4292810547509034, + "learning_rate": 8.048074574274132e-06, + "loss": 0.3469, + "step": 2628 + }, + { + "epoch": 0.6256916760873446, + "grad_norm": 0.36341426198547644, + "learning_rate": 8.04654634848057e-06, + "loss": 0.3922, + "step": 2629 + }, + { + "epoch": 0.6259296721604094, + "grad_norm": 0.3821109148057445, + "learning_rate": 8.045017669894851e-06, + "loss": 0.3777, + "step": 2630 + }, + { + "epoch": 0.6261676682334741, + "grad_norm": 0.37289488804653775, + "learning_rate": 8.043488538744177e-06, + "loss": 0.2943, + "step": 2631 + }, + { + "epoch": 0.626405664306539, + "grad_norm": 0.37258093559025074, + "learning_rate": 8.041958955255815e-06, + "loss": 0.342, + "step": 2632 + }, + { + "epoch": 0.6266436603796037, + "grad_norm": 0.3486184579114556, + "learning_rate": 8.040428919657095e-06, + "loss": 0.4144, + "step": 2633 + }, + { + "epoch": 0.6268816564526686, + "grad_norm": 0.3741035400050582, + "learning_rate": 8.038898432175424e-06, + "loss": 0.3246, + "step": 2634 + }, + { + "epoch": 0.6271196525257333, + "grad_norm": 0.40892825888298057, + "learning_rate": 8.037367493038265e-06, + "loss": 0.3332, + "step": 2635 + }, + { + "epoch": 0.6273576485987982, + "grad_norm": 0.36522383480533593, + "learning_rate": 8.035836102473155e-06, + "loss": 0.3566, + "step": 2636 + }, + { + "epoch": 0.6275956446718629, + "grad_norm": 0.3652591745540261, + "learning_rate": 8.0343042607077e-06, + "loss": 0.3765, + "step": 2637 + }, + { + "epoch": 0.6278336407449278, + "grad_norm": 0.35844676767075323, + "learning_rate": 8.032771967969566e-06, + "loss": 0.3158, + "step": 2638 + }, + { + "epoch": 0.6280716368179925, + "grad_norm": 0.3931558454911546, + "learning_rate": 8.03123922448649e-06, + "loss": 0.3568, + "step": 2639 + }, + { + "epoch": 0.6283096328910573, + "grad_norm": 0.36411863899036223, + "learning_rate": 8.029706030486274e-06, + "loss": 0.4586, + "step": 2640 + }, + { + "epoch": 0.6285476289641221, + "grad_norm": 0.3796092362122607, + "learning_rate": 8.028172386196794e-06, + "loss": 0.372, + "step": 2641 + }, + { + "epoch": 0.6287856250371869, + "grad_norm": 0.3761030511051456, + "learning_rate": 8.026638291845982e-06, + "loss": 0.3092, + "step": 2642 + }, + { + "epoch": 0.6290236211102517, + "grad_norm": 0.3699761519507502, + "learning_rate": 8.025103747661844e-06, + "loss": 0.3805, + "step": 2643 + }, + { + "epoch": 0.6292616171833165, + "grad_norm": 0.373140231509292, + "learning_rate": 8.023568753872453e-06, + "loss": 0.388, + "step": 2644 + }, + { + "epoch": 0.6294996132563813, + "grad_norm": 0.3637236022266386, + "learning_rate": 8.022033310705946e-06, + "loss": 0.3184, + "step": 2645 + }, + { + "epoch": 0.6297376093294461, + "grad_norm": 0.3915776794451119, + "learning_rate": 8.020497418390527e-06, + "loss": 0.3349, + "step": 2646 + }, + { + "epoch": 0.6299756054025109, + "grad_norm": 0.36532602213156506, + "learning_rate": 8.018961077154468e-06, + "loss": 0.3957, + "step": 2647 + }, + { + "epoch": 0.6302136014755757, + "grad_norm": 0.4140563457038236, + "learning_rate": 8.017424287226107e-06, + "loss": 0.415, + "step": 2648 + }, + { + "epoch": 0.6304515975486404, + "grad_norm": 0.3769113334994522, + "learning_rate": 8.01588704883385e-06, + "loss": 0.3145, + "step": 2649 + }, + { + "epoch": 0.6306895936217053, + "grad_norm": 1.5368218754503258, + "learning_rate": 8.014349362206167e-06, + "loss": 0.374, + "step": 2650 + }, + { + "epoch": 0.63092758969477, + "grad_norm": 0.38350013956172013, + "learning_rate": 8.012811227571597e-06, + "loss": 0.4003, + "step": 2651 + }, + { + "epoch": 0.6311655857678349, + "grad_norm": 0.3866132408928916, + "learning_rate": 8.011272645158747e-06, + "loss": 0.3235, + "step": 2652 + }, + { + "epoch": 0.6314035818408996, + "grad_norm": 0.40608371668229537, + "learning_rate": 8.009733615196287e-06, + "loss": 0.3244, + "step": 2653 + }, + { + "epoch": 0.6316415779139645, + "grad_norm": 0.38096062005900233, + "learning_rate": 8.008194137912955e-06, + "loss": 0.412, + "step": 2654 + }, + { + "epoch": 0.6318795739870292, + "grad_norm": 0.3862436056336853, + "learning_rate": 8.006654213537553e-06, + "loss": 0.3634, + "step": 2655 + }, + { + "epoch": 0.6321175700600941, + "grad_norm": 0.3976502335701635, + "learning_rate": 8.005113842298954e-06, + "loss": 0.3234, + "step": 2656 + }, + { + "epoch": 0.6323555661331588, + "grad_norm": 0.39127711057264086, + "learning_rate": 8.003573024426094e-06, + "loss": 0.3673, + "step": 2657 + }, + { + "epoch": 0.6325935622062236, + "grad_norm": 0.3896004400805801, + "learning_rate": 8.00203176014798e-06, + "loss": 0.4458, + "step": 2658 + }, + { + "epoch": 0.6328315582792884, + "grad_norm": 0.46398616972299095, + "learning_rate": 8.000490049693678e-06, + "loss": 0.3429, + "step": 2659 + }, + { + "epoch": 0.6330695543523532, + "grad_norm": 0.46831349058115457, + "learning_rate": 7.998947893292328e-06, + "loss": 0.3135, + "step": 2660 + }, + { + "epoch": 0.633307550425418, + "grad_norm": 0.4258334551178332, + "learning_rate": 7.99740529117313e-06, + "loss": 0.381, + "step": 2661 + }, + { + "epoch": 0.6335455464984828, + "grad_norm": 0.413976046137531, + "learning_rate": 7.995862243565352e-06, + "loss": 0.4138, + "step": 2662 + }, + { + "epoch": 0.6337835425715476, + "grad_norm": 0.43240881559747, + "learning_rate": 7.994318750698333e-06, + "loss": 0.3083, + "step": 2663 + }, + { + "epoch": 0.6340215386446124, + "grad_norm": 0.4246524529086203, + "learning_rate": 7.99277481280147e-06, + "loss": 0.3523, + "step": 2664 + }, + { + "epoch": 0.6342595347176772, + "grad_norm": 0.38578881152202554, + "learning_rate": 7.991230430104233e-06, + "loss": 0.3935, + "step": 2665 + }, + { + "epoch": 0.634497530790742, + "grad_norm": 0.7982507745714851, + "learning_rate": 7.989685602836155e-06, + "loss": 0.3685, + "step": 2666 + }, + { + "epoch": 0.6347355268638067, + "grad_norm": 0.39291885711093316, + "learning_rate": 7.988140331226835e-06, + "loss": 0.302, + "step": 2667 + }, + { + "epoch": 0.6349735229368716, + "grad_norm": 0.3774641923130378, + "learning_rate": 7.986594615505938e-06, + "loss": 0.3455, + "step": 2668 + }, + { + "epoch": 0.6352115190099363, + "grad_norm": 0.3849751604819079, + "learning_rate": 7.985048455903195e-06, + "loss": 0.3941, + "step": 2669 + }, + { + "epoch": 0.6354495150830012, + "grad_norm": 0.4028547987540656, + "learning_rate": 7.983501852648408e-06, + "loss": 0.3105, + "step": 2670 + }, + { + "epoch": 0.6356875111560659, + "grad_norm": 0.4175776893284019, + "learning_rate": 7.981954805971434e-06, + "loss": 0.3202, + "step": 2671 + }, + { + "epoch": 0.6359255072291308, + "grad_norm": 0.3948825669750941, + "learning_rate": 7.98040731610221e-06, + "loss": 0.386, + "step": 2672 + }, + { + "epoch": 0.6361635033021955, + "grad_norm": 0.37485861058760167, + "learning_rate": 7.978859383270723e-06, + "loss": 0.3553, + "step": 2673 + }, + { + "epoch": 0.6364014993752604, + "grad_norm": 0.38429792606989394, + "learning_rate": 7.97731100770704e-06, + "loss": 0.305, + "step": 2674 + }, + { + "epoch": 0.6366394954483251, + "grad_norm": 0.40243135178828765, + "learning_rate": 7.975762189641287e-06, + "loss": 0.3782, + "step": 2675 + }, + { + "epoch": 0.63687749152139, + "grad_norm": 0.3537177295193898, + "learning_rate": 7.974212929303655e-06, + "loss": 0.4115, + "step": 2676 + }, + { + "epoch": 0.6371154875944547, + "grad_norm": 0.4063244285595607, + "learning_rate": 7.972663226924404e-06, + "loss": 0.3373, + "step": 2677 + }, + { + "epoch": 0.6373534836675195, + "grad_norm": 0.7174554198990031, + "learning_rate": 7.971113082733855e-06, + "loss": 0.319, + "step": 2678 + }, + { + "epoch": 0.6375914797405843, + "grad_norm": 0.38291427234154246, + "learning_rate": 7.969562496962402e-06, + "loss": 0.4063, + "step": 2679 + }, + { + "epoch": 0.6378294758136491, + "grad_norm": 0.37205693582330984, + "learning_rate": 7.968011469840498e-06, + "loss": 0.3852, + "step": 2680 + }, + { + "epoch": 0.6380674718867139, + "grad_norm": 0.37289319443279956, + "learning_rate": 7.966460001598666e-06, + "loss": 0.3108, + "step": 2681 + }, + { + "epoch": 0.6383054679597787, + "grad_norm": 0.36566089153086934, + "learning_rate": 7.96490809246749e-06, + "loss": 0.3616, + "step": 2682 + }, + { + "epoch": 0.6385434640328435, + "grad_norm": 0.3792165993672128, + "learning_rate": 7.963355742677622e-06, + "loss": 0.3968, + "step": 2683 + }, + { + "epoch": 0.6387814601059083, + "grad_norm": 0.3798446258828693, + "learning_rate": 7.961802952459782e-06, + "loss": 0.3367, + "step": 2684 + }, + { + "epoch": 0.639019456178973, + "grad_norm": 0.3490183798631681, + "learning_rate": 7.96024972204475e-06, + "loss": 0.295, + "step": 2685 + }, + { + "epoch": 0.6392574522520379, + "grad_norm": 0.3964584193841697, + "learning_rate": 7.958696051663378e-06, + "loss": 0.3989, + "step": 2686 + }, + { + "epoch": 0.6394954483251026, + "grad_norm": 0.41876375542959604, + "learning_rate": 7.957141941546579e-06, + "loss": 0.4196, + "step": 2687 + }, + { + "epoch": 0.6397334443981675, + "grad_norm": 0.3704391565272174, + "learning_rate": 7.95558739192533e-06, + "loss": 0.3395, + "step": 2688 + }, + { + "epoch": 0.6399714404712322, + "grad_norm": 0.38705193099976026, + "learning_rate": 7.954032403030676e-06, + "loss": 0.3671, + "step": 2689 + }, + { + "epoch": 0.6402094365442971, + "grad_norm": 0.3439386120652561, + "learning_rate": 7.952476975093729e-06, + "loss": 0.3779, + "step": 2690 + }, + { + "epoch": 0.6404474326173618, + "grad_norm": 0.40968077374028966, + "learning_rate": 7.950921108345663e-06, + "loss": 0.3443, + "step": 2691 + }, + { + "epoch": 0.6406854286904267, + "grad_norm": 0.40943399377681217, + "learning_rate": 7.949364803017716e-06, + "loss": 0.3285, + "step": 2692 + }, + { + "epoch": 0.6409234247634914, + "grad_norm": 0.35009359805640566, + "learning_rate": 7.947808059341198e-06, + "loss": 0.3749, + "step": 2693 + }, + { + "epoch": 0.6411614208365563, + "grad_norm": 0.41593770811256764, + "learning_rate": 7.946250877547477e-06, + "loss": 0.4014, + "step": 2694 + }, + { + "epoch": 0.641399416909621, + "grad_norm": 0.37165753141343205, + "learning_rate": 7.944693257867988e-06, + "loss": 0.3249, + "step": 2695 + }, + { + "epoch": 0.6416374129826858, + "grad_norm": 0.36739798172367116, + "learning_rate": 7.943135200534231e-06, + "loss": 0.3509, + "step": 2696 + }, + { + "epoch": 0.6418754090557506, + "grad_norm": 0.3724656741195384, + "learning_rate": 7.941576705777775e-06, + "loss": 0.4075, + "step": 2697 + }, + { + "epoch": 0.6421134051288154, + "grad_norm": 0.36653430005871285, + "learning_rate": 7.940017773830251e-06, + "loss": 0.3887, + "step": 2698 + }, + { + "epoch": 0.6423514012018802, + "grad_norm": 0.3954242148262836, + "learning_rate": 7.93845840492335e-06, + "loss": 0.2837, + "step": 2699 + }, + { + "epoch": 0.642589397274945, + "grad_norm": 0.37217288826320916, + "learning_rate": 7.936898599288837e-06, + "loss": 0.367, + "step": 2700 + }, + { + "epoch": 0.6428273933480098, + "grad_norm": 0.3433351247909528, + "learning_rate": 7.93533835715854e-06, + "loss": 0.413, + "step": 2701 + }, + { + "epoch": 0.6430653894210746, + "grad_norm": 0.3710719613065313, + "learning_rate": 7.933777678764342e-06, + "loss": 0.3481, + "step": 2702 + }, + { + "epoch": 0.6433033854941393, + "grad_norm": 0.4051379668160816, + "learning_rate": 7.932216564338207e-06, + "loss": 0.3487, + "step": 2703 + }, + { + "epoch": 0.6435413815672042, + "grad_norm": 0.35261923047482774, + "learning_rate": 7.930655014112149e-06, + "loss": 0.3673, + "step": 2704 + }, + { + "epoch": 0.6437793776402689, + "grad_norm": 0.37561470708818495, + "learning_rate": 7.929093028318254e-06, + "loss": 0.3952, + "step": 2705 + }, + { + "epoch": 0.6440173737133338, + "grad_norm": 0.38085667134847434, + "learning_rate": 7.927530607188674e-06, + "loss": 0.2997, + "step": 2706 + }, + { + "epoch": 0.6442553697863985, + "grad_norm": 0.38309188860484494, + "learning_rate": 7.925967750955621e-06, + "loss": 0.3318, + "step": 2707 + }, + { + "epoch": 0.6444933658594634, + "grad_norm": 0.35539044141417847, + "learning_rate": 7.924404459851376e-06, + "loss": 0.4216, + "step": 2708 + }, + { + "epoch": 0.6447313619325281, + "grad_norm": 0.36545999682638586, + "learning_rate": 7.92284073410828e-06, + "loss": 0.3565, + "step": 2709 + }, + { + "epoch": 0.644969358005593, + "grad_norm": 0.3853826020640622, + "learning_rate": 7.921276573958747e-06, + "loss": 0.3193, + "step": 2710 + }, + { + "epoch": 0.6452073540786577, + "grad_norm": 0.370265598492799, + "learning_rate": 7.919711979635245e-06, + "loss": 0.3624, + "step": 2711 + }, + { + "epoch": 0.6454453501517224, + "grad_norm": 0.38428982530476763, + "learning_rate": 7.918146951370312e-06, + "loss": 0.368, + "step": 2712 + }, + { + "epoch": 0.6456833462247873, + "grad_norm": 0.3863324681379504, + "learning_rate": 7.916581489396551e-06, + "loss": 0.3382, + "step": 2713 + }, + { + "epoch": 0.645921342297852, + "grad_norm": 0.40476758870600216, + "learning_rate": 7.915015593946627e-06, + "loss": 0.3452, + "step": 2714 + }, + { + "epoch": 0.6461593383709169, + "grad_norm": 0.350755584302665, + "learning_rate": 7.913449265253272e-06, + "loss": 0.3541, + "step": 2715 + }, + { + "epoch": 0.6463973344439816, + "grad_norm": 0.3917385422279502, + "learning_rate": 7.911882503549282e-06, + "loss": 0.3615, + "step": 2716 + }, + { + "epoch": 0.6466353305170465, + "grad_norm": 0.4148508130598785, + "learning_rate": 7.910315309067515e-06, + "loss": 0.3237, + "step": 2717 + }, + { + "epoch": 0.6468733265901112, + "grad_norm": 0.3655588510807828, + "learning_rate": 7.908747682040893e-06, + "loss": 0.3717, + "step": 2718 + }, + { + "epoch": 0.6471113226631761, + "grad_norm": 0.3786662704253251, + "learning_rate": 7.907179622702409e-06, + "loss": 0.3995, + "step": 2719 + }, + { + "epoch": 0.6473493187362408, + "grad_norm": 0.41077148959741994, + "learning_rate": 7.905611131285114e-06, + "loss": 0.3225, + "step": 2720 + }, + { + "epoch": 0.6475873148093056, + "grad_norm": 0.3845293510123214, + "learning_rate": 7.904042208022121e-06, + "loss": 0.3184, + "step": 2721 + }, + { + "epoch": 0.6478253108823704, + "grad_norm": 0.38026816465993274, + "learning_rate": 7.902472853146614e-06, + "loss": 0.4047, + "step": 2722 + }, + { + "epoch": 0.6480633069554352, + "grad_norm": 0.3831104022471865, + "learning_rate": 7.90090306689184e-06, + "loss": 0.3919, + "step": 2723 + }, + { + "epoch": 0.6483013030285, + "grad_norm": 0.39343951127809484, + "learning_rate": 7.899332849491101e-06, + "loss": 0.2942, + "step": 2724 + }, + { + "epoch": 0.6485392991015648, + "grad_norm": 0.37517985298715045, + "learning_rate": 7.897762201177777e-06, + "loss": 0.3449, + "step": 2725 + }, + { + "epoch": 0.6487772951746296, + "grad_norm": 0.36054421682014914, + "learning_rate": 7.896191122185302e-06, + "loss": 0.3968, + "step": 2726 + }, + { + "epoch": 0.6490152912476944, + "grad_norm": 0.38545956806454257, + "learning_rate": 7.894619612747177e-06, + "loss": 0.3261, + "step": 2727 + }, + { + "epoch": 0.6492532873207592, + "grad_norm": 0.4049278658685888, + "learning_rate": 7.89304767309697e-06, + "loss": 0.3517, + "step": 2728 + }, + { + "epoch": 0.649491283393824, + "grad_norm": 0.377732909086466, + "learning_rate": 7.891475303468307e-06, + "loss": 0.377, + "step": 2729 + }, + { + "epoch": 0.6497292794668887, + "grad_norm": 0.35831707426820597, + "learning_rate": 7.889902504094883e-06, + "loss": 0.386, + "step": 2730 + }, + { + "epoch": 0.6499672755399536, + "grad_norm": 0.38647507461352604, + "learning_rate": 7.888329275210454e-06, + "loss": 0.3116, + "step": 2731 + }, + { + "epoch": 0.6502052716130183, + "grad_norm": 0.41222519034063054, + "learning_rate": 7.88675561704884e-06, + "loss": 0.3485, + "step": 2732 + }, + { + "epoch": 0.6504432676860832, + "grad_norm": 0.3540005364042782, + "learning_rate": 7.885181529843928e-06, + "loss": 0.3811, + "step": 2733 + }, + { + "epoch": 0.6506812637591479, + "grad_norm": 0.3889803639057776, + "learning_rate": 7.883607013829664e-06, + "loss": 0.3164, + "step": 2734 + }, + { + "epoch": 0.6509192598322128, + "grad_norm": 0.3825132936787645, + "learning_rate": 7.88203206924006e-06, + "loss": 0.3355, + "step": 2735 + }, + { + "epoch": 0.6511572559052775, + "grad_norm": 0.3600797185755466, + "learning_rate": 7.880456696309194e-06, + "loss": 0.3822, + "step": 2736 + }, + { + "epoch": 0.6513952519783424, + "grad_norm": 0.36788983076651144, + "learning_rate": 7.878880895271203e-06, + "loss": 0.4035, + "step": 2737 + }, + { + "epoch": 0.6516332480514071, + "grad_norm": 0.36838811546682065, + "learning_rate": 7.87730466636029e-06, + "loss": 0.3201, + "step": 2738 + }, + { + "epoch": 0.651871244124472, + "grad_norm": 0.4199311085900462, + "learning_rate": 7.875728009810723e-06, + "loss": 0.3354, + "step": 2739 + }, + { + "epoch": 0.6521092401975367, + "grad_norm": 0.3700461680317869, + "learning_rate": 7.874150925856832e-06, + "loss": 0.3907, + "step": 2740 + }, + { + "epoch": 0.6523472362706015, + "grad_norm": 0.3658208922433521, + "learning_rate": 7.87257341473301e-06, + "loss": 0.3436, + "step": 2741 + }, + { + "epoch": 0.6525852323436663, + "grad_norm": 0.3673071582783027, + "learning_rate": 7.870995476673716e-06, + "loss": 0.3026, + "step": 2742 + }, + { + "epoch": 0.6528232284167311, + "grad_norm": 0.36996570740087614, + "learning_rate": 7.869417111913469e-06, + "loss": 0.3494, + "step": 2743 + }, + { + "epoch": 0.6530612244897959, + "grad_norm": 0.387133905688407, + "learning_rate": 7.867838320686852e-06, + "loss": 0.4002, + "step": 2744 + }, + { + "epoch": 0.6532992205628607, + "grad_norm": 0.4136666892086636, + "learning_rate": 7.866259103228513e-06, + "loss": 0.3187, + "step": 2745 + }, + { + "epoch": 0.6535372166359255, + "grad_norm": 0.39398446442473684, + "learning_rate": 7.864679459773165e-06, + "loss": 0.3197, + "step": 2746 + }, + { + "epoch": 0.6537752127089903, + "grad_norm": 0.3856438918664348, + "learning_rate": 7.863099390555579e-06, + "loss": 0.3823, + "step": 2747 + }, + { + "epoch": 0.654013208782055, + "grad_norm": 0.3908108331472674, + "learning_rate": 7.861518895810597e-06, + "loss": 0.36, + "step": 2748 + }, + { + "epoch": 0.6542512048551199, + "grad_norm": 0.3906604552323348, + "learning_rate": 7.859937975773113e-06, + "loss": 0.3083, + "step": 2749 + }, + { + "epoch": 0.6544892009281846, + "grad_norm": 0.3644262557255445, + "learning_rate": 7.858356630678095e-06, + "loss": 0.3834, + "step": 2750 + }, + { + "epoch": 0.6547271970012495, + "grad_norm": 0.3859873524132499, + "learning_rate": 7.85677486076057e-06, + "loss": 0.4009, + "step": 2751 + }, + { + "epoch": 0.6549651930743142, + "grad_norm": 0.4012225445856221, + "learning_rate": 7.855192666255627e-06, + "loss": 0.3174, + "step": 2752 + }, + { + "epoch": 0.6552031891473791, + "grad_norm": 0.3894743297706602, + "learning_rate": 7.853610047398422e-06, + "loss": 0.3163, + "step": 2753 + }, + { + "epoch": 0.6554411852204438, + "grad_norm": 0.36865693864488674, + "learning_rate": 7.852027004424166e-06, + "loss": 0.4053, + "step": 2754 + }, + { + "epoch": 0.6556791812935087, + "grad_norm": 0.3871705949523487, + "learning_rate": 7.850443537568142e-06, + "loss": 0.3511, + "step": 2755 + }, + { + "epoch": 0.6559171773665734, + "grad_norm": 0.40039522901635954, + "learning_rate": 7.848859647065692e-06, + "loss": 0.3183, + "step": 2756 + }, + { + "epoch": 0.6561551734396383, + "grad_norm": 0.3899094558661615, + "learning_rate": 7.847275333152222e-06, + "loss": 0.3536, + "step": 2757 + }, + { + "epoch": 0.656393169512703, + "grad_norm": 0.38854298448856595, + "learning_rate": 7.845690596063198e-06, + "loss": 0.3992, + "step": 2758 + }, + { + "epoch": 0.6566311655857678, + "grad_norm": 0.37350144359617254, + "learning_rate": 7.844105436034156e-06, + "loss": 0.3449, + "step": 2759 + }, + { + "epoch": 0.6568691616588326, + "grad_norm": 0.44114084013334837, + "learning_rate": 7.842519853300683e-06, + "loss": 0.2977, + "step": 2760 + }, + { + "epoch": 0.6571071577318974, + "grad_norm": 0.35999935925172966, + "learning_rate": 7.840933848098437e-06, + "loss": 0.3916, + "step": 2761 + }, + { + "epoch": 0.6573451538049622, + "grad_norm": 0.39899431523147383, + "learning_rate": 7.839347420663143e-06, + "loss": 0.4253, + "step": 2762 + }, + { + "epoch": 0.657583149878027, + "grad_norm": 0.3511330110093876, + "learning_rate": 7.837760571230582e-06, + "loss": 0.3323, + "step": 2763 + }, + { + "epoch": 0.6578211459510918, + "grad_norm": 0.4461907551327487, + "learning_rate": 7.836173300036594e-06, + "loss": 0.3239, + "step": 2764 + }, + { + "epoch": 0.6580591420241566, + "grad_norm": 0.39054055621536493, + "learning_rate": 7.83458560731709e-06, + "loss": 0.4205, + "step": 2765 + }, + { + "epoch": 0.6582971380972213, + "grad_norm": 0.3769385946466861, + "learning_rate": 7.832997493308043e-06, + "loss": 0.3511, + "step": 2766 + }, + { + "epoch": 0.6585351341702862, + "grad_norm": 0.3986011878362815, + "learning_rate": 7.831408958245483e-06, + "loss": 0.296, + "step": 2767 + }, + { + "epoch": 0.6587731302433509, + "grad_norm": 0.3880140645019253, + "learning_rate": 7.829820002365504e-06, + "loss": 0.3771, + "step": 2768 + }, + { + "epoch": 0.6590111263164158, + "grad_norm": 0.38212257026061813, + "learning_rate": 7.828230625904269e-06, + "loss": 0.423, + "step": 2769 + }, + { + "epoch": 0.6592491223894805, + "grad_norm": 0.38852972753947673, + "learning_rate": 7.826640829097994e-06, + "loss": 0.3055, + "step": 2770 + }, + { + "epoch": 0.6594871184625454, + "grad_norm": 0.392631438114691, + "learning_rate": 7.825050612182965e-06, + "loss": 0.3385, + "step": 2771 + }, + { + "epoch": 0.6597251145356101, + "grad_norm": 0.37085742233686686, + "learning_rate": 7.823459975395527e-06, + "loss": 0.3848, + "step": 2772 + }, + { + "epoch": 0.659963110608675, + "grad_norm": 0.3834859007180493, + "learning_rate": 7.821868918972087e-06, + "loss": 0.3313, + "step": 2773 + }, + { + "epoch": 0.6602011066817397, + "grad_norm": 0.3822005143101434, + "learning_rate": 7.820277443149114e-06, + "loss": 0.3139, + "step": 2774 + }, + { + "epoch": 0.6604391027548046, + "grad_norm": 0.3751827860273947, + "learning_rate": 7.818685548163144e-06, + "loss": 0.3772, + "step": 2775 + }, + { + "epoch": 0.6606770988278693, + "grad_norm": 0.36208369988094957, + "learning_rate": 7.817093234250772e-06, + "loss": 0.4038, + "step": 2776 + }, + { + "epoch": 0.6609150949009341, + "grad_norm": 0.394839034258573, + "learning_rate": 7.815500501648654e-06, + "loss": 0.3098, + "step": 2777 + }, + { + "epoch": 0.6611530909739989, + "grad_norm": 0.37324421123913026, + "learning_rate": 7.813907350593509e-06, + "loss": 0.2987, + "step": 2778 + }, + { + "epoch": 0.6613910870470637, + "grad_norm": 0.3522307791159532, + "learning_rate": 7.812313781322119e-06, + "loss": 0.3708, + "step": 2779 + }, + { + "epoch": 0.6616290831201285, + "grad_norm": 0.38147797701046887, + "learning_rate": 7.810719794071326e-06, + "loss": 0.3826, + "step": 2780 + }, + { + "epoch": 0.6618670791931933, + "grad_norm": 0.3818587837039667, + "learning_rate": 7.809125389078038e-06, + "loss": 0.306, + "step": 2781 + }, + { + "epoch": 0.6621050752662581, + "grad_norm": 0.4383170403521546, + "learning_rate": 7.807530566579225e-06, + "loss": 0.3566, + "step": 2782 + }, + { + "epoch": 0.6623430713393229, + "grad_norm": 0.35732395462798827, + "learning_rate": 7.805935326811913e-06, + "loss": 0.4335, + "step": 2783 + }, + { + "epoch": 0.6625810674123876, + "grad_norm": 0.3504216615398166, + "learning_rate": 7.804339670013196e-06, + "loss": 0.3143, + "step": 2784 + }, + { + "epoch": 0.6628190634854525, + "grad_norm": 0.3661223386576208, + "learning_rate": 7.802743596420228e-06, + "loss": 0.3529, + "step": 2785 + }, + { + "epoch": 0.6630570595585172, + "grad_norm": 0.3723251622104599, + "learning_rate": 7.801147106270227e-06, + "loss": 0.3883, + "step": 2786 + }, + { + "epoch": 0.6632950556315821, + "grad_norm": 0.36878452917502985, + "learning_rate": 7.799550199800468e-06, + "loss": 0.421, + "step": 2787 + }, + { + "epoch": 0.6635330517046468, + "grad_norm": 0.3618774249811299, + "learning_rate": 7.797952877248289e-06, + "loss": 0.3128, + "step": 2788 + }, + { + "epoch": 0.6637710477777117, + "grad_norm": 0.4123556625672267, + "learning_rate": 7.796355138851098e-06, + "loss": 0.3594, + "step": 2789 + }, + { + "epoch": 0.6640090438507764, + "grad_norm": 0.335938751220196, + "learning_rate": 7.794756984846353e-06, + "loss": 0.4006, + "step": 2790 + }, + { + "epoch": 0.6642470399238413, + "grad_norm": 0.38401019782464624, + "learning_rate": 7.793158415471582e-06, + "loss": 0.3682, + "step": 2791 + }, + { + "epoch": 0.664485035996906, + "grad_norm": 0.35133531035355053, + "learning_rate": 7.791559430964371e-06, + "loss": 0.2978, + "step": 2792 + }, + { + "epoch": 0.6647230320699709, + "grad_norm": 0.39741836114549955, + "learning_rate": 7.789960031562368e-06, + "loss": 0.3708, + "step": 2793 + }, + { + "epoch": 0.6649610281430356, + "grad_norm": 0.3886500686427725, + "learning_rate": 7.788360217503284e-06, + "loss": 0.4344, + "step": 2794 + }, + { + "epoch": 0.6651990242161004, + "grad_norm": 0.38181964678908115, + "learning_rate": 7.786759989024891e-06, + "loss": 0.3091, + "step": 2795 + }, + { + "epoch": 0.6654370202891652, + "grad_norm": 0.3575252771082044, + "learning_rate": 7.785159346365024e-06, + "loss": 0.3533, + "step": 2796 + }, + { + "epoch": 0.66567501636223, + "grad_norm": 0.35735170640215147, + "learning_rate": 7.783558289761575e-06, + "loss": 0.4113, + "step": 2797 + }, + { + "epoch": 0.6659130124352948, + "grad_norm": 0.3802244001409177, + "learning_rate": 7.781956819452503e-06, + "loss": 0.3463, + "step": 2798 + }, + { + "epoch": 0.6661510085083596, + "grad_norm": 0.3589197801986663, + "learning_rate": 7.780354935675824e-06, + "loss": 0.3054, + "step": 2799 + }, + { + "epoch": 0.6663890045814244, + "grad_norm": 0.37241774516966863, + "learning_rate": 7.778752638669621e-06, + "loss": 0.344, + "step": 2800 + }, + { + "epoch": 0.6666270006544892, + "grad_norm": 0.38446901953737084, + "learning_rate": 7.777149928672032e-06, + "loss": 0.389, + "step": 2801 + }, + { + "epoch": 0.666864996727554, + "grad_norm": 0.3748374371731264, + "learning_rate": 7.775546805921259e-06, + "loss": 0.2864, + "step": 2802 + }, + { + "epoch": 0.6671029928006188, + "grad_norm": 0.4043541069673673, + "learning_rate": 7.773943270655568e-06, + "loss": 0.3045, + "step": 2803 + }, + { + "epoch": 0.6673409888736835, + "grad_norm": 0.4320616556631244, + "learning_rate": 7.772339323113283e-06, + "loss": 0.3828, + "step": 2804 + }, + { + "epoch": 0.6675789849467484, + "grad_norm": 0.3884296375705101, + "learning_rate": 7.770734963532791e-06, + "loss": 0.3935, + "step": 2805 + }, + { + "epoch": 0.6678169810198131, + "grad_norm": 0.3947257784010564, + "learning_rate": 7.769130192152538e-06, + "loss": 0.3206, + "step": 2806 + }, + { + "epoch": 0.668054977092878, + "grad_norm": 0.4031463873354111, + "learning_rate": 7.767525009211032e-06, + "loss": 0.3593, + "step": 2807 + }, + { + "epoch": 0.6682929731659427, + "grad_norm": 0.33474899996590485, + "learning_rate": 7.765919414946846e-06, + "loss": 0.387, + "step": 2808 + }, + { + "epoch": 0.6685309692390076, + "grad_norm": 0.3959331136357287, + "learning_rate": 7.76431340959861e-06, + "loss": 0.3433, + "step": 2809 + }, + { + "epoch": 0.6687689653120723, + "grad_norm": 0.3963425292776564, + "learning_rate": 7.762706993405014e-06, + "loss": 0.2888, + "step": 2810 + }, + { + "epoch": 0.6690069613851372, + "grad_norm": 0.37455109669088693, + "learning_rate": 7.761100166604814e-06, + "loss": 0.3744, + "step": 2811 + }, + { + "epoch": 0.6692449574582019, + "grad_norm": 0.37164146720882896, + "learning_rate": 7.759492929436821e-06, + "loss": 0.3976, + "step": 2812 + }, + { + "epoch": 0.6694829535312667, + "grad_norm": 0.37019035117029214, + "learning_rate": 7.757885282139913e-06, + "loss": 0.3273, + "step": 2813 + }, + { + "epoch": 0.6697209496043315, + "grad_norm": 0.3867936235753999, + "learning_rate": 7.756277224953027e-06, + "loss": 0.3476, + "step": 2814 + }, + { + "epoch": 0.6699589456773963, + "grad_norm": 0.37145512873107056, + "learning_rate": 7.754668758115157e-06, + "loss": 0.3941, + "step": 2815 + }, + { + "epoch": 0.6701969417504611, + "grad_norm": 0.36270472677005344, + "learning_rate": 7.753059881865361e-06, + "loss": 0.3415, + "step": 2816 + }, + { + "epoch": 0.6704349378235259, + "grad_norm": 0.4044012125764732, + "learning_rate": 7.751450596442761e-06, + "loss": 0.3126, + "step": 2817 + }, + { + "epoch": 0.6706729338965907, + "grad_norm": 0.4117053779057893, + "learning_rate": 7.749840902086534e-06, + "loss": 0.3767, + "step": 2818 + }, + { + "epoch": 0.6709109299696555, + "grad_norm": 0.3709103066934568, + "learning_rate": 7.748230799035922e-06, + "loss": 0.4183, + "step": 2819 + }, + { + "epoch": 0.6711489260427203, + "grad_norm": 0.36765736344691075, + "learning_rate": 7.746620287530224e-06, + "loss": 0.3289, + "step": 2820 + }, + { + "epoch": 0.6713869221157851, + "grad_norm": 0.39213657203388363, + "learning_rate": 7.745009367808805e-06, + "loss": 0.3612, + "step": 2821 + }, + { + "epoch": 0.6716249181888498, + "grad_norm": 0.3694609673330917, + "learning_rate": 7.743398040111085e-06, + "loss": 0.4092, + "step": 2822 + }, + { + "epoch": 0.6718629142619147, + "grad_norm": 0.36370204232238, + "learning_rate": 7.741786304676546e-06, + "loss": 0.3647, + "step": 2823 + }, + { + "epoch": 0.6721009103349794, + "grad_norm": 0.39103037281531944, + "learning_rate": 7.740174161744734e-06, + "loss": 0.3049, + "step": 2824 + }, + { + "epoch": 0.6723389064080443, + "grad_norm": 0.4294014674026649, + "learning_rate": 7.738561611555256e-06, + "loss": 0.3702, + "step": 2825 + }, + { + "epoch": 0.672576902481109, + "grad_norm": 0.378252161646541, + "learning_rate": 7.736948654347771e-06, + "loss": 0.429, + "step": 2826 + }, + { + "epoch": 0.6728148985541739, + "grad_norm": 0.3733849472381206, + "learning_rate": 7.735335290362008e-06, + "loss": 0.3083, + "step": 2827 + }, + { + "epoch": 0.6730528946272386, + "grad_norm": 0.3875790384170856, + "learning_rate": 7.733721519837751e-06, + "loss": 0.3125, + "step": 2828 + }, + { + "epoch": 0.6732908907003035, + "grad_norm": 0.357812704637255, + "learning_rate": 7.732107343014848e-06, + "loss": 0.4005, + "step": 2829 + }, + { + "epoch": 0.6735288867733682, + "grad_norm": 0.3644020205564977, + "learning_rate": 7.730492760133204e-06, + "loss": 0.4116, + "step": 2830 + }, + { + "epoch": 0.673766882846433, + "grad_norm": 0.39330628647179383, + "learning_rate": 7.728877771432787e-06, + "loss": 0.322, + "step": 2831 + }, + { + "epoch": 0.6740048789194978, + "grad_norm": 0.39515618942825725, + "learning_rate": 7.727262377153625e-06, + "loss": 0.3311, + "step": 2832 + }, + { + "epoch": 0.6742428749925626, + "grad_norm": 0.3826839966849655, + "learning_rate": 7.725646577535803e-06, + "loss": 0.4147, + "step": 2833 + }, + { + "epoch": 0.6744808710656274, + "grad_norm": 0.40336985165060796, + "learning_rate": 7.724030372819473e-06, + "loss": 0.3492, + "step": 2834 + }, + { + "epoch": 0.6747188671386922, + "grad_norm": 0.40161595680714757, + "learning_rate": 7.722413763244837e-06, + "loss": 0.3058, + "step": 2835 + }, + { + "epoch": 0.674956863211757, + "grad_norm": 0.37111456991939507, + "learning_rate": 7.720796749052169e-06, + "loss": 0.3988, + "step": 2836 + }, + { + "epoch": 0.6751948592848218, + "grad_norm": 0.4093260967053505, + "learning_rate": 7.719179330481791e-06, + "loss": 0.3985, + "step": 2837 + }, + { + "epoch": 0.6754328553578866, + "grad_norm": 0.37469388622058286, + "learning_rate": 7.7175615077741e-06, + "loss": 0.3206, + "step": 2838 + }, + { + "epoch": 0.6756708514309514, + "grad_norm": 0.36305043278624066, + "learning_rate": 7.715943281169539e-06, + "loss": 0.3477, + "step": 2839 + }, + { + "epoch": 0.6759088475040161, + "grad_norm": 0.36935236818452855, + "learning_rate": 7.714324650908615e-06, + "loss": 0.3998, + "step": 2840 + }, + { + "epoch": 0.676146843577081, + "grad_norm": 0.3948705853817137, + "learning_rate": 7.7127056172319e-06, + "loss": 0.3422, + "step": 2841 + }, + { + "epoch": 0.6763848396501457, + "grad_norm": 0.3796382485482362, + "learning_rate": 7.711086180380021e-06, + "loss": 0.3149, + "step": 2842 + }, + { + "epoch": 0.6766228357232106, + "grad_norm": 0.3616110674940236, + "learning_rate": 7.709466340593666e-06, + "loss": 0.364, + "step": 2843 + }, + { + "epoch": 0.6768608317962753, + "grad_norm": 0.3662500931977446, + "learning_rate": 7.707846098113583e-06, + "loss": 0.3967, + "step": 2844 + }, + { + "epoch": 0.6770988278693402, + "grad_norm": 0.37281181697106686, + "learning_rate": 7.706225453180583e-06, + "loss": 0.3279, + "step": 2845 + }, + { + "epoch": 0.6773368239424049, + "grad_norm": 0.3855893672663316, + "learning_rate": 7.704604406035531e-06, + "loss": 0.3496, + "step": 2846 + }, + { + "epoch": 0.6775748200154698, + "grad_norm": 0.3889507681326313, + "learning_rate": 7.702982956919356e-06, + "loss": 0.4173, + "step": 2847 + }, + { + "epoch": 0.6778128160885345, + "grad_norm": 0.3971306511498377, + "learning_rate": 7.701361106073044e-06, + "loss": 0.3444, + "step": 2848 + }, + { + "epoch": 0.6780508121615993, + "grad_norm": 0.36005635998666913, + "learning_rate": 7.699738853737646e-06, + "loss": 0.2921, + "step": 2849 + }, + { + "epoch": 0.6782888082346641, + "grad_norm": 0.38741602947105674, + "learning_rate": 7.698116200154262e-06, + "loss": 0.3202, + "step": 2850 + }, + { + "epoch": 0.6785268043077289, + "grad_norm": 0.3676779858892718, + "learning_rate": 7.696493145564065e-06, + "loss": 0.4059, + "step": 2851 + }, + { + "epoch": 0.6787648003807937, + "grad_norm": 0.3417732883784283, + "learning_rate": 7.694869690208278e-06, + "loss": 0.3071, + "step": 2852 + }, + { + "epoch": 0.6790027964538585, + "grad_norm": 0.3806094615926385, + "learning_rate": 7.693245834328186e-06, + "loss": 0.3005, + "step": 2853 + }, + { + "epoch": 0.6792407925269233, + "grad_norm": 0.37415443493343387, + "learning_rate": 7.691621578165135e-06, + "loss": 0.3775, + "step": 2854 + }, + { + "epoch": 0.6794787885999881, + "grad_norm": 0.3588357321765243, + "learning_rate": 7.689996921960533e-06, + "loss": 0.3828, + "step": 2855 + }, + { + "epoch": 0.6797167846730529, + "grad_norm": 0.3928642880638878, + "learning_rate": 7.68837186595584e-06, + "loss": 0.3388, + "step": 2856 + }, + { + "epoch": 0.6799547807461177, + "grad_norm": 0.39713877557243726, + "learning_rate": 7.686746410392579e-06, + "loss": 0.3547, + "step": 2857 + }, + { + "epoch": 0.6801927768191824, + "grad_norm": 0.37028184933204206, + "learning_rate": 7.685120555512335e-06, + "loss": 0.3991, + "step": 2858 + }, + { + "epoch": 0.6804307728922473, + "grad_norm": 0.44868723676569244, + "learning_rate": 7.68349430155675e-06, + "loss": 0.2975, + "step": 2859 + }, + { + "epoch": 0.680668768965312, + "grad_norm": 0.38969868882705583, + "learning_rate": 7.681867648767527e-06, + "loss": 0.3064, + "step": 2860 + }, + { + "epoch": 0.6809067650383769, + "grad_norm": 0.38048777908327175, + "learning_rate": 7.680240597386423e-06, + "loss": 0.3544, + "step": 2861 + }, + { + "epoch": 0.6811447611114416, + "grad_norm": 0.37492467378771765, + "learning_rate": 7.678613147655263e-06, + "loss": 0.4007, + "step": 2862 + }, + { + "epoch": 0.6813827571845065, + "grad_norm": 0.3689709476215601, + "learning_rate": 7.676985299815921e-06, + "loss": 0.3204, + "step": 2863 + }, + { + "epoch": 0.6816207532575712, + "grad_norm": 0.4036262585926096, + "learning_rate": 7.675357054110337e-06, + "loss": 0.3487, + "step": 2864 + }, + { + "epoch": 0.6818587493306361, + "grad_norm": 0.41388190325234303, + "learning_rate": 7.673728410780512e-06, + "loss": 0.3926, + "step": 2865 + }, + { + "epoch": 0.6820967454037008, + "grad_norm": 0.3896246709459209, + "learning_rate": 7.6720993700685e-06, + "loss": 0.3495, + "step": 2866 + }, + { + "epoch": 0.6823347414767656, + "grad_norm": 0.38993044742482114, + "learning_rate": 7.670469932216416e-06, + "loss": 0.2993, + "step": 2867 + }, + { + "epoch": 0.6825727375498304, + "grad_norm": 0.3709106695995631, + "learning_rate": 7.668840097466438e-06, + "loss": 0.4017, + "step": 2868 + }, + { + "epoch": 0.6828107336228952, + "grad_norm": 0.39844192153673497, + "learning_rate": 7.667209866060795e-06, + "loss": 0.4329, + "step": 2869 + }, + { + "epoch": 0.68304872969596, + "grad_norm": 0.394396694213486, + "learning_rate": 7.665579238241783e-06, + "loss": 0.3091, + "step": 2870 + }, + { + "epoch": 0.6832867257690248, + "grad_norm": 0.39853662948827007, + "learning_rate": 7.663948214251754e-06, + "loss": 0.336, + "step": 2871 + }, + { + "epoch": 0.6835247218420896, + "grad_norm": 0.3630956355542585, + "learning_rate": 7.662316794333115e-06, + "loss": 0.4052, + "step": 2872 + }, + { + "epoch": 0.6837627179151544, + "grad_norm": 0.3963082232165837, + "learning_rate": 7.660684978728341e-06, + "loss": 0.3736, + "step": 2873 + }, + { + "epoch": 0.6840007139882192, + "grad_norm": 0.3977137944295455, + "learning_rate": 7.659052767679956e-06, + "loss": 0.3113, + "step": 2874 + }, + { + "epoch": 0.684238710061284, + "grad_norm": 0.4015812355864788, + "learning_rate": 7.657420161430548e-06, + "loss": 0.3585, + "step": 2875 + }, + { + "epoch": 0.6844767061343487, + "grad_norm": 0.39210402493400554, + "learning_rate": 7.655787160222762e-06, + "loss": 0.4295, + "step": 2876 + }, + { + "epoch": 0.6847147022074136, + "grad_norm": 0.39681491199767854, + "learning_rate": 7.654153764299304e-06, + "loss": 0.2991, + "step": 2877 + }, + { + "epoch": 0.6849526982804783, + "grad_norm": 0.4163108651945542, + "learning_rate": 7.652519973902935e-06, + "loss": 0.3332, + "step": 2878 + }, + { + "epoch": 0.6851906943535432, + "grad_norm": 0.3748847281909277, + "learning_rate": 7.650885789276477e-06, + "loss": 0.3468, + "step": 2879 + }, + { + "epoch": 0.6854286904266079, + "grad_norm": 0.421568928920472, + "learning_rate": 7.649251210662812e-06, + "loss": 0.373, + "step": 2880 + }, + { + "epoch": 0.6856666864996728, + "grad_norm": 0.37882377572325376, + "learning_rate": 7.647616238304876e-06, + "loss": 0.3311, + "step": 2881 + }, + { + "epoch": 0.6859046825727375, + "grad_norm": 0.37714593364667043, + "learning_rate": 7.64598087244567e-06, + "loss": 0.3475, + "step": 2882 + }, + { + "epoch": 0.6861426786458024, + "grad_norm": 0.3855958441067824, + "learning_rate": 7.644345113328248e-06, + "loss": 0.4108, + "step": 2883 + }, + { + "epoch": 0.6863806747188671, + "grad_norm": 0.3927803834584747, + "learning_rate": 7.642708961195723e-06, + "loss": 0.3284, + "step": 2884 + }, + { + "epoch": 0.686618670791932, + "grad_norm": 0.39284438670389804, + "learning_rate": 7.641072416291271e-06, + "loss": 0.3087, + "step": 2885 + }, + { + "epoch": 0.6868566668649967, + "grad_norm": 0.38222011385116755, + "learning_rate": 7.639435478858119e-06, + "loss": 0.3611, + "step": 2886 + }, + { + "epoch": 0.6870946629380615, + "grad_norm": 0.35343748065798736, + "learning_rate": 7.637798149139559e-06, + "loss": 0.4157, + "step": 2887 + }, + { + "epoch": 0.6873326590111263, + "grad_norm": 0.4761170077236074, + "learning_rate": 7.636160427378938e-06, + "loss": 0.3342, + "step": 2888 + }, + { + "epoch": 0.6875706550841911, + "grad_norm": 0.3653390110713056, + "learning_rate": 7.634522313819664e-06, + "loss": 0.332, + "step": 2889 + }, + { + "epoch": 0.6878086511572559, + "grad_norm": 0.357610536904034, + "learning_rate": 7.632883808705196e-06, + "loss": 0.4209, + "step": 2890 + }, + { + "epoch": 0.6880466472303207, + "grad_norm": 0.35884223240749147, + "learning_rate": 7.631244912279061e-06, + "loss": 0.3103, + "step": 2891 + }, + { + "epoch": 0.6882846433033855, + "grad_norm": 0.3932088139549123, + "learning_rate": 7.629605624784839e-06, + "loss": 0.3257, + "step": 2892 + }, + { + "epoch": 0.6885226393764503, + "grad_norm": 0.3756997656933469, + "learning_rate": 7.627965946466167e-06, + "loss": 0.3667, + "step": 2893 + }, + { + "epoch": 0.688760635449515, + "grad_norm": 0.41363244284700523, + "learning_rate": 7.626325877566741e-06, + "loss": 0.421, + "step": 2894 + }, + { + "epoch": 0.6889986315225799, + "grad_norm": 0.38949498606977967, + "learning_rate": 7.624685418330319e-06, + "loss": 0.3179, + "step": 2895 + }, + { + "epoch": 0.6892366275956446, + "grad_norm": 0.37724094080660947, + "learning_rate": 7.623044569000712e-06, + "loss": 0.3282, + "step": 2896 + }, + { + "epoch": 0.6894746236687095, + "grad_norm": 0.5285214493849093, + "learning_rate": 7.621403329821792e-06, + "loss": 0.393, + "step": 2897 + }, + { + "epoch": 0.6897126197417742, + "grad_norm": 0.3726869379004517, + "learning_rate": 7.619761701037486e-06, + "loss": 0.3831, + "step": 2898 + }, + { + "epoch": 0.6899506158148391, + "grad_norm": 0.39521137312351157, + "learning_rate": 7.618119682891782e-06, + "loss": 0.3289, + "step": 2899 + }, + { + "epoch": 0.6901886118879038, + "grad_norm": 0.4132662165624947, + "learning_rate": 7.6164772756287234e-06, + "loss": 0.3614, + "step": 2900 + }, + { + "epoch": 0.6904266079609687, + "grad_norm": 0.3713002154182572, + "learning_rate": 7.614834479492413e-06, + "loss": 0.3873, + "step": 2901 + }, + { + "epoch": 0.6906646040340334, + "grad_norm": 0.4066260605116146, + "learning_rate": 7.613191294727011e-06, + "loss": 0.3409, + "step": 2902 + }, + { + "epoch": 0.6909026001070983, + "grad_norm": 0.4241202111497487, + "learning_rate": 7.611547721576738e-06, + "loss": 0.3538, + "step": 2903 + }, + { + "epoch": 0.691140596180163, + "grad_norm": 0.4686662314847007, + "learning_rate": 7.609903760285864e-06, + "loss": 0.4259, + "step": 2904 + }, + { + "epoch": 0.6913785922532278, + "grad_norm": 0.362668095436962, + "learning_rate": 7.608259411098725e-06, + "loss": 0.3968, + "step": 2905 + }, + { + "epoch": 0.6916165883262926, + "grad_norm": 0.40773309376570577, + "learning_rate": 7.606614674259714e-06, + "loss": 0.3047, + "step": 2906 + }, + { + "epoch": 0.6918545843993574, + "grad_norm": 0.35828275597855175, + "learning_rate": 7.6049695500132754e-06, + "loss": 0.3743, + "step": 2907 + }, + { + "epoch": 0.6920925804724222, + "grad_norm": 0.3603447573938342, + "learning_rate": 7.603324038603921e-06, + "loss": 0.3842, + "step": 2908 + }, + { + "epoch": 0.692330576545487, + "grad_norm": 0.40350976963719254, + "learning_rate": 7.601678140276209e-06, + "loss": 0.3228, + "step": 2909 + }, + { + "epoch": 0.6925685726185518, + "grad_norm": 0.38318140197317035, + "learning_rate": 7.600031855274764e-06, + "loss": 0.3013, + "step": 2910 + }, + { + "epoch": 0.6928065686916166, + "grad_norm": 0.3810631038611205, + "learning_rate": 7.598385183844263e-06, + "loss": 0.3572, + "step": 2911 + }, + { + "epoch": 0.6930445647646813, + "grad_norm": 0.3784857718340065, + "learning_rate": 7.5967381262294435e-06, + "loss": 0.3771, + "step": 2912 + }, + { + "epoch": 0.6932825608377462, + "grad_norm": 0.37559330533248253, + "learning_rate": 7.595090682675098e-06, + "loss": 0.3135, + "step": 2913 + }, + { + "epoch": 0.6935205569108109, + "grad_norm": 0.365340499769047, + "learning_rate": 7.593442853426077e-06, + "loss": 0.337, + "step": 2914 + }, + { + "epoch": 0.6937585529838758, + "grad_norm": 0.3680511519981457, + "learning_rate": 7.59179463872729e-06, + "loss": 0.4097, + "step": 2915 + }, + { + "epoch": 0.6939965490569405, + "grad_norm": 0.3766039547521425, + "learning_rate": 7.590146038823702e-06, + "loss": 0.332, + "step": 2916 + }, + { + "epoch": 0.6942345451300054, + "grad_norm": 0.3889666332240844, + "learning_rate": 7.588497053960335e-06, + "loss": 0.3063, + "step": 2917 + }, + { + "epoch": 0.6944725412030701, + "grad_norm": 0.377287587936367, + "learning_rate": 7.586847684382269e-06, + "loss": 0.3576, + "step": 2918 + }, + { + "epoch": 0.694710537276135, + "grad_norm": 0.3803646716074299, + "learning_rate": 7.585197930334642e-06, + "loss": 0.3992, + "step": 2919 + }, + { + "epoch": 0.6949485333491997, + "grad_norm": 0.3630382010768588, + "learning_rate": 7.5835477920626474e-06, + "loss": 0.3336, + "step": 2920 + }, + { + "epoch": 0.6951865294222646, + "grad_norm": 0.4145598312089378, + "learning_rate": 7.5818972698115375e-06, + "loss": 0.3337, + "step": 2921 + }, + { + "epoch": 0.6954245254953293, + "grad_norm": 0.40683667940888457, + "learning_rate": 7.580246363826621e-06, + "loss": 0.4267, + "step": 2922 + }, + { + "epoch": 0.6956625215683941, + "grad_norm": 0.4208382587332909, + "learning_rate": 7.578595074353262e-06, + "loss": 0.3757, + "step": 2923 + }, + { + "epoch": 0.6959005176414589, + "grad_norm": 0.35944681590578803, + "learning_rate": 7.5769434016368845e-06, + "loss": 0.3203, + "step": 2924 + }, + { + "epoch": 0.6961385137145237, + "grad_norm": 0.3599434098310551, + "learning_rate": 7.575291345922966e-06, + "loss": 0.3583, + "step": 2925 + }, + { + "epoch": 0.6963765097875885, + "grad_norm": 0.3756154404949079, + "learning_rate": 7.5736389074570425e-06, + "loss": 0.408, + "step": 2926 + }, + { + "epoch": 0.6966145058606533, + "grad_norm": 0.367210127037896, + "learning_rate": 7.571986086484711e-06, + "loss": 0.3372, + "step": 2927 + }, + { + "epoch": 0.6968525019337181, + "grad_norm": 0.3875760302017146, + "learning_rate": 7.570332883251618e-06, + "loss": 0.3293, + "step": 2928 + }, + { + "epoch": 0.6970904980067829, + "grad_norm": 0.39687946684154773, + "learning_rate": 7.568679298003472e-06, + "loss": 0.3794, + "step": 2929 + }, + { + "epoch": 0.6973284940798476, + "grad_norm": 0.3774932208465836, + "learning_rate": 7.567025330986035e-06, + "loss": 0.3996, + "step": 2930 + }, + { + "epoch": 0.6975664901529125, + "grad_norm": 0.37825723085597085, + "learning_rate": 7.565370982445131e-06, + "loss": 0.3063, + "step": 2931 + }, + { + "epoch": 0.6978044862259772, + "grad_norm": 0.3701480630486983, + "learning_rate": 7.563716252626632e-06, + "loss": 0.3392, + "step": 2932 + }, + { + "epoch": 0.6980424822990421, + "grad_norm": 0.363483092582883, + "learning_rate": 7.562061141776476e-06, + "loss": 0.4165, + "step": 2933 + }, + { + "epoch": 0.6982804783721068, + "grad_norm": 0.40095465937550545, + "learning_rate": 7.560405650140652e-06, + "loss": 0.3086, + "step": 2934 + }, + { + "epoch": 0.6985184744451717, + "grad_norm": 0.5381288739496879, + "learning_rate": 7.5587497779652065e-06, + "loss": 0.2924, + "step": 2935 + }, + { + "epoch": 0.6987564705182364, + "grad_norm": 0.3515417031170579, + "learning_rate": 7.557093525496245e-06, + "loss": 0.389, + "step": 2936 + }, + { + "epoch": 0.6989944665913013, + "grad_norm": 0.37096216086334566, + "learning_rate": 7.555436892979926e-06, + "loss": 0.421, + "step": 2937 + }, + { + "epoch": 0.699232462664366, + "grad_norm": 0.3482586299052201, + "learning_rate": 7.553779880662465e-06, + "loss": 0.2726, + "step": 2938 + }, + { + "epoch": 0.6994704587374309, + "grad_norm": 0.4102693276938636, + "learning_rate": 7.552122488790136e-06, + "loss": 0.3548, + "step": 2939 + }, + { + "epoch": 0.6997084548104956, + "grad_norm": 0.3581143681257741, + "learning_rate": 7.55046471760927e-06, + "loss": 0.379, + "step": 2940 + }, + { + "epoch": 0.6999464508835604, + "grad_norm": 0.4034626868783178, + "learning_rate": 7.548806567366251e-06, + "loss": 0.376, + "step": 2941 + }, + { + "epoch": 0.7001844469566252, + "grad_norm": 0.37940003906957404, + "learning_rate": 7.547148038307521e-06, + "loss": 0.3032, + "step": 2942 + }, + { + "epoch": 0.70042244302969, + "grad_norm": 0.37237400358199735, + "learning_rate": 7.545489130679581e-06, + "loss": 0.3705, + "step": 2943 + }, + { + "epoch": 0.7006604391027548, + "grad_norm": 0.3714730152118366, + "learning_rate": 7.543829844728983e-06, + "loss": 0.4293, + "step": 2944 + }, + { + "epoch": 0.7008984351758196, + "grad_norm": 0.35794214306124905, + "learning_rate": 7.542170180702337e-06, + "loss": 0.2971, + "step": 2945 + }, + { + "epoch": 0.7011364312488844, + "grad_norm": 0.43530263361830346, + "learning_rate": 7.540510138846313e-06, + "loss": 0.324, + "step": 2946 + }, + { + "epoch": 0.7013744273219492, + "grad_norm": 0.38306179336745, + "learning_rate": 7.538849719407632e-06, + "loss": 0.3798, + "step": 2947 + }, + { + "epoch": 0.701612423395014, + "grad_norm": 0.39934257512599863, + "learning_rate": 7.5371889226330765e-06, + "loss": 0.3281, + "step": 2948 + }, + { + "epoch": 0.7018504194680788, + "grad_norm": 0.36426837248809085, + "learning_rate": 7.53552774876948e-06, + "loss": 0.3066, + "step": 2949 + }, + { + "epoch": 0.7020884155411435, + "grad_norm": 0.4139378891941689, + "learning_rate": 7.533866198063734e-06, + "loss": 0.3819, + "step": 2950 + }, + { + "epoch": 0.7023264116142084, + "grad_norm": 0.3662560224348692, + "learning_rate": 7.532204270762786e-06, + "loss": 0.4475, + "step": 2951 + }, + { + "epoch": 0.7025644076872731, + "grad_norm": 0.3990867976425942, + "learning_rate": 7.530541967113639e-06, + "loss": 0.3468, + "step": 2952 + }, + { + "epoch": 0.702802403760338, + "grad_norm": 0.3856725854345983, + "learning_rate": 7.528879287363354e-06, + "loss": 0.3259, + "step": 2953 + }, + { + "epoch": 0.7030403998334027, + "grad_norm": 0.37688679209097037, + "learning_rate": 7.527216231759045e-06, + "loss": 0.3637, + "step": 2954 + }, + { + "epoch": 0.7032783959064676, + "grad_norm": 0.3667821119467153, + "learning_rate": 7.525552800547883e-06, + "loss": 0.3878, + "step": 2955 + }, + { + "epoch": 0.7035163919795323, + "grad_norm": 0.3753572526176831, + "learning_rate": 7.523888993977097e-06, + "loss": 0.3284, + "step": 2956 + }, + { + "epoch": 0.7037543880525972, + "grad_norm": 0.38058502140052836, + "learning_rate": 7.522224812293968e-06, + "loss": 0.3636, + "step": 2957 + }, + { + "epoch": 0.7039923841256619, + "grad_norm": 0.38834168887671905, + "learning_rate": 7.5205602557458345e-06, + "loss": 0.4213, + "step": 2958 + }, + { + "epoch": 0.7042303801987267, + "grad_norm": 0.37141054900070025, + "learning_rate": 7.518895324580091e-06, + "loss": 0.3236, + "step": 2959 + }, + { + "epoch": 0.7044683762717915, + "grad_norm": 0.39065100626003746, + "learning_rate": 7.517230019044188e-06, + "loss": 0.3049, + "step": 2960 + }, + { + "epoch": 0.7047063723448563, + "grad_norm": 0.3666524852773353, + "learning_rate": 7.51556433938563e-06, + "loss": 0.3719, + "step": 2961 + }, + { + "epoch": 0.7049443684179211, + "grad_norm": 0.3817964753531332, + "learning_rate": 7.513898285851982e-06, + "loss": 0.4357, + "step": 2962 + }, + { + "epoch": 0.7051823644909859, + "grad_norm": 0.4198232988530167, + "learning_rate": 7.512231858690856e-06, + "loss": 0.3348, + "step": 2963 + }, + { + "epoch": 0.7054203605640507, + "grad_norm": 0.3964784823438321, + "learning_rate": 7.510565058149927e-06, + "loss": 0.3519, + "step": 2964 + }, + { + "epoch": 0.7056583566371155, + "grad_norm": 0.34675732231223394, + "learning_rate": 7.508897884476921e-06, + "loss": 0.3907, + "step": 2965 + }, + { + "epoch": 0.7058963527101803, + "grad_norm": 0.3578070224969587, + "learning_rate": 7.507230337919623e-06, + "loss": 0.335, + "step": 2966 + }, + { + "epoch": 0.7061343487832451, + "grad_norm": 0.42191824768221664, + "learning_rate": 7.505562418725869e-06, + "loss": 0.296, + "step": 2967 + }, + { + "epoch": 0.7063723448563098, + "grad_norm": 0.3651458752052437, + "learning_rate": 7.503894127143558e-06, + "loss": 0.3755, + "step": 2968 + }, + { + "epoch": 0.7066103409293747, + "grad_norm": 0.4570811159042686, + "learning_rate": 7.5022254634206345e-06, + "loss": 0.3917, + "step": 2969 + }, + { + "epoch": 0.7068483370024394, + "grad_norm": 0.34715154210823396, + "learning_rate": 7.500556427805106e-06, + "loss": 0.3153, + "step": 2970 + }, + { + "epoch": 0.7070863330755043, + "grad_norm": 0.3947211815602096, + "learning_rate": 7.498887020545031e-06, + "loss": 0.3417, + "step": 2971 + }, + { + "epoch": 0.707324329148569, + "grad_norm": 0.37334961366076164, + "learning_rate": 7.497217241888525e-06, + "loss": 0.4165, + "step": 2972 + }, + { + "epoch": 0.7075623252216339, + "grad_norm": 0.3509931391175888, + "learning_rate": 7.495547092083758e-06, + "loss": 0.413, + "step": 2973 + }, + { + "epoch": 0.7078003212946986, + "grad_norm": 0.4205974782723252, + "learning_rate": 7.493876571378958e-06, + "loss": 0.2983, + "step": 2974 + }, + { + "epoch": 0.7080383173677635, + "grad_norm": 0.40308557662169947, + "learning_rate": 7.492205680022402e-06, + "loss": 0.3522, + "step": 2975 + }, + { + "epoch": 0.7082763134408282, + "grad_norm": 0.3511659283555987, + "learning_rate": 7.490534418262429e-06, + "loss": 0.4201, + "step": 2976 + }, + { + "epoch": 0.708514309513893, + "grad_norm": 0.38333575891336563, + "learning_rate": 7.488862786347428e-06, + "loss": 0.3414, + "step": 2977 + }, + { + "epoch": 0.7087523055869578, + "grad_norm": 0.38546643322989155, + "learning_rate": 7.487190784525847e-06, + "loss": 0.3206, + "step": 2978 + }, + { + "epoch": 0.7089903016600226, + "grad_norm": 0.36230466619537316, + "learning_rate": 7.485518413046185e-06, + "loss": 0.3884, + "step": 2979 + }, + { + "epoch": 0.7092282977330874, + "grad_norm": 0.36360795125731105, + "learning_rate": 7.4838456721569975e-06, + "loss": 0.3583, + "step": 2980 + }, + { + "epoch": 0.7094662938061522, + "grad_norm": 0.3836337184101506, + "learning_rate": 7.482172562106894e-06, + "loss": 0.3268, + "step": 2981 + }, + { + "epoch": 0.709704289879217, + "grad_norm": 0.4106360537544092, + "learning_rate": 7.480499083144544e-06, + "loss": 0.3347, + "step": 2982 + }, + { + "epoch": 0.7099422859522818, + "grad_norm": 0.3748286533906386, + "learning_rate": 7.478825235518665e-06, + "loss": 0.3911, + "step": 2983 + }, + { + "epoch": 0.7101802820253466, + "grad_norm": 0.42427243151949334, + "learning_rate": 7.477151019478033e-06, + "loss": 0.3345, + "step": 2984 + }, + { + "epoch": 0.7104182780984114, + "grad_norm": 0.3954736604508801, + "learning_rate": 7.4754764352714775e-06, + "loss": 0.3117, + "step": 2985 + }, + { + "epoch": 0.7106562741714761, + "grad_norm": 0.35115884372592393, + "learning_rate": 7.4738014831478825e-06, + "loss": 0.3906, + "step": 2986 + }, + { + "epoch": 0.710894270244541, + "grad_norm": 0.3945125666106555, + "learning_rate": 7.472126163356189e-06, + "loss": 0.4087, + "step": 2987 + }, + { + "epoch": 0.7111322663176057, + "grad_norm": 0.36333273981672193, + "learning_rate": 7.47045047614539e-06, + "loss": 0.3242, + "step": 2988 + }, + { + "epoch": 0.7113702623906706, + "grad_norm": 0.3914705133621617, + "learning_rate": 7.468774421764534e-06, + "loss": 0.3465, + "step": 2989 + }, + { + "epoch": 0.7116082584637353, + "grad_norm": 0.3823529571498665, + "learning_rate": 7.467098000462726e-06, + "loss": 0.3816, + "step": 2990 + }, + { + "epoch": 0.7118462545368002, + "grad_norm": 0.3548613233280991, + "learning_rate": 7.465421212489121e-06, + "loss": 0.3438, + "step": 2991 + }, + { + "epoch": 0.7120842506098649, + "grad_norm": 0.39262400367886485, + "learning_rate": 7.463744058092932e-06, + "loss": 0.352, + "step": 2992 + }, + { + "epoch": 0.7123222466829298, + "grad_norm": 0.367798301154, + "learning_rate": 7.462066537523427e-06, + "loss": 0.3347, + "step": 2993 + }, + { + "epoch": 0.7125602427559945, + "grad_norm": 0.38423044064973816, + "learning_rate": 7.460388651029925e-06, + "loss": 0.4565, + "step": 2994 + }, + { + "epoch": 0.7127982388290593, + "grad_norm": 0.3640635090647599, + "learning_rate": 7.458710398861802e-06, + "loss": 0.3073, + "step": 2995 + }, + { + "epoch": 0.7130362349021241, + "grad_norm": 0.4006113452209261, + "learning_rate": 7.457031781268488e-06, + "loss": 0.3733, + "step": 2996 + }, + { + "epoch": 0.7132742309751889, + "grad_norm": 0.3807254297958794, + "learning_rate": 7.455352798499468e-06, + "loss": 0.4349, + "step": 2997 + }, + { + "epoch": 0.7135122270482537, + "grad_norm": 0.39126757394311634, + "learning_rate": 7.453673450804279e-06, + "loss": 0.3575, + "step": 2998 + }, + { + "epoch": 0.7137502231213185, + "grad_norm": 0.4052451533875098, + "learning_rate": 7.451993738432514e-06, + "loss": 0.2873, + "step": 2999 + }, + { + "epoch": 0.7139882191943833, + "grad_norm": 0.39364111352601927, + "learning_rate": 7.450313661633821e-06, + "loss": 0.3606, + "step": 3000 + }, + { + "epoch": 0.7142262152674481, + "grad_norm": 0.40280636612509435, + "learning_rate": 7.448633220657901e-06, + "loss": 0.383, + "step": 3001 + }, + { + "epoch": 0.7144642113405129, + "grad_norm": 0.4339937070314304, + "learning_rate": 7.4469524157545055e-06, + "loss": 0.336, + "step": 3002 + }, + { + "epoch": 0.7147022074135777, + "grad_norm": 0.40489755454617965, + "learning_rate": 7.445271247173449e-06, + "loss": 0.3121, + "step": 3003 + }, + { + "epoch": 0.7149402034866424, + "grad_norm": 0.4041470391859187, + "learning_rate": 7.44358971516459e-06, + "loss": 0.396, + "step": 3004 + }, + { + "epoch": 0.7151781995597073, + "grad_norm": 0.39500293731975633, + "learning_rate": 7.441907819977849e-06, + "loss": 0.4046, + "step": 3005 + }, + { + "epoch": 0.715416195632772, + "grad_norm": 0.41370890008241495, + "learning_rate": 7.440225561863197e-06, + "loss": 0.3536, + "step": 3006 + }, + { + "epoch": 0.7156541917058369, + "grad_norm": 0.3939257241455852, + "learning_rate": 7.438542941070657e-06, + "loss": 0.3451, + "step": 3007 + }, + { + "epoch": 0.7158921877789016, + "grad_norm": 0.35452897061585553, + "learning_rate": 7.436859957850309e-06, + "loss": 0.3992, + "step": 3008 + }, + { + "epoch": 0.7161301838519665, + "grad_norm": 0.3816527448874161, + "learning_rate": 7.435176612452286e-06, + "loss": 0.3353, + "step": 3009 + }, + { + "epoch": 0.7163681799250312, + "grad_norm": 0.41454236361169766, + "learning_rate": 7.4334929051267755e-06, + "loss": 0.3166, + "step": 3010 + }, + { + "epoch": 0.7166061759980961, + "grad_norm": 0.3602376065948697, + "learning_rate": 7.431808836124018e-06, + "loss": 0.3797, + "step": 3011 + }, + { + "epoch": 0.7168441720711608, + "grad_norm": 0.39512210096179917, + "learning_rate": 7.4301244056943075e-06, + "loss": 0.4029, + "step": 3012 + }, + { + "epoch": 0.7170821681442257, + "grad_norm": 0.3827942527690594, + "learning_rate": 7.42843961408799e-06, + "loss": 0.3173, + "step": 3013 + }, + { + "epoch": 0.7173201642172904, + "grad_norm": 0.39446387386662557, + "learning_rate": 7.426754461555471e-06, + "loss": 0.3544, + "step": 3014 + }, + { + "epoch": 0.7175581602903552, + "grad_norm": 0.35472318916385837, + "learning_rate": 7.425068948347204e-06, + "loss": 0.4169, + "step": 3015 + }, + { + "epoch": 0.71779615636342, + "grad_norm": 0.38263262613927496, + "learning_rate": 7.423383074713697e-06, + "loss": 0.3714, + "step": 3016 + }, + { + "epoch": 0.7180341524364848, + "grad_norm": 0.3808280361027985, + "learning_rate": 7.421696840905515e-06, + "loss": 0.2928, + "step": 3017 + }, + { + "epoch": 0.7182721485095496, + "grad_norm": 0.36752434940666945, + "learning_rate": 7.4200102471732704e-06, + "loss": 0.3481, + "step": 3018 + }, + { + "epoch": 0.7185101445826144, + "grad_norm": 0.38919742791659145, + "learning_rate": 7.4183232937676375e-06, + "loss": 0.391, + "step": 3019 + }, + { + "epoch": 0.7187481406556792, + "grad_norm": 0.37374333756044126, + "learning_rate": 7.416635980939335e-06, + "loss": 0.348, + "step": 3020 + }, + { + "epoch": 0.718986136728744, + "grad_norm": 0.36920174596848016, + "learning_rate": 7.414948308939141e-06, + "loss": 0.3441, + "step": 3021 + }, + { + "epoch": 0.7192241328018087, + "grad_norm": 0.3899921201145649, + "learning_rate": 7.413260278017887e-06, + "loss": 0.3838, + "step": 3022 + }, + { + "epoch": 0.7194621288748736, + "grad_norm": 0.43144843112558834, + "learning_rate": 7.411571888426452e-06, + "loss": 0.3486, + "step": 3023 + }, + { + "epoch": 0.7197001249479383, + "grad_norm": 0.3911155710827359, + "learning_rate": 7.4098831404157765e-06, + "loss": 0.3315, + "step": 3024 + }, + { + "epoch": 0.7199381210210032, + "grad_norm": 0.42440357958109975, + "learning_rate": 7.408194034236849e-06, + "loss": 0.3736, + "step": 3025 + }, + { + "epoch": 0.7201761170940679, + "grad_norm": 0.3680061754320904, + "learning_rate": 7.40650457014071e-06, + "loss": 0.3912, + "step": 3026 + }, + { + "epoch": 0.7204141131671328, + "grad_norm": 0.3830697542132779, + "learning_rate": 7.404814748378461e-06, + "loss": 0.3287, + "step": 3027 + }, + { + "epoch": 0.7206521092401975, + "grad_norm": 0.37264948361322325, + "learning_rate": 7.403124569201246e-06, + "loss": 0.3231, + "step": 3028 + }, + { + "epoch": 0.7208901053132624, + "grad_norm": 0.42134581649859754, + "learning_rate": 7.4014340328602685e-06, + "loss": 0.3565, + "step": 3029 + }, + { + "epoch": 0.7211281013863271, + "grad_norm": 0.41130909439310925, + "learning_rate": 7.399743139606788e-06, + "loss": 0.4445, + "step": 3030 + }, + { + "epoch": 0.721366097459392, + "grad_norm": 0.4343180622590514, + "learning_rate": 7.398051889692108e-06, + "loss": 0.3357, + "step": 3031 + }, + { + "epoch": 0.7216040935324567, + "grad_norm": 0.39112408012774896, + "learning_rate": 7.396360283367594e-06, + "loss": 0.3501, + "step": 3032 + }, + { + "epoch": 0.7218420896055215, + "grad_norm": 0.3354333543079656, + "learning_rate": 7.394668320884658e-06, + "loss": 0.3829, + "step": 3033 + }, + { + "epoch": 0.7220800856785863, + "grad_norm": 0.34658725353941033, + "learning_rate": 7.392976002494768e-06, + "loss": 0.3088, + "step": 3034 + }, + { + "epoch": 0.7223180817516511, + "grad_norm": 0.42267425856173196, + "learning_rate": 7.391283328449445e-06, + "loss": 0.3284, + "step": 3035 + }, + { + "epoch": 0.7225560778247159, + "grad_norm": 0.3696528618085508, + "learning_rate": 7.389590299000262e-06, + "loss": 0.3727, + "step": 3036 + }, + { + "epoch": 0.7227940738977807, + "grad_norm": 0.37189620964899023, + "learning_rate": 7.387896914398845e-06, + "loss": 0.4148, + "step": 3037 + }, + { + "epoch": 0.7230320699708455, + "grad_norm": 0.4135419215724609, + "learning_rate": 7.386203174896872e-06, + "loss": 0.3078, + "step": 3038 + }, + { + "epoch": 0.7232700660439103, + "grad_norm": 0.3992204444214033, + "learning_rate": 7.384509080746076e-06, + "loss": 0.3575, + "step": 3039 + }, + { + "epoch": 0.723508062116975, + "grad_norm": 0.35922624214255505, + "learning_rate": 7.382814632198241e-06, + "loss": 0.4262, + "step": 3040 + }, + { + "epoch": 0.7237460581900399, + "grad_norm": 0.3793102673645999, + "learning_rate": 7.381119829505204e-06, + "loss": 0.365, + "step": 3041 + }, + { + "epoch": 0.7239840542631046, + "grad_norm": 0.3944308333847217, + "learning_rate": 7.379424672918853e-06, + "loss": 0.2977, + "step": 3042 + }, + { + "epoch": 0.7242220503361695, + "grad_norm": 0.36373682832023985, + "learning_rate": 7.377729162691131e-06, + "loss": 0.3509, + "step": 3043 + }, + { + "epoch": 0.7244600464092342, + "grad_norm": 0.3627646085804695, + "learning_rate": 7.376033299074035e-06, + "loss": 0.4184, + "step": 3044 + }, + { + "epoch": 0.7246980424822991, + "grad_norm": 0.3871761600279167, + "learning_rate": 7.3743370823196096e-06, + "loss": 0.3216, + "step": 3045 + }, + { + "epoch": 0.7249360385553638, + "grad_norm": 0.3798739905567255, + "learning_rate": 7.372640512679955e-06, + "loss": 0.3456, + "step": 3046 + }, + { + "epoch": 0.7251740346284287, + "grad_norm": 0.4259458917487435, + "learning_rate": 7.370943590407225e-06, + "loss": 0.4103, + "step": 3047 + }, + { + "epoch": 0.7254120307014934, + "grad_norm": 0.3626974255553493, + "learning_rate": 7.369246315753623e-06, + "loss": 0.3433, + "step": 3048 + }, + { + "epoch": 0.7256500267745583, + "grad_norm": 0.4435787230045141, + "learning_rate": 7.367548688971407e-06, + "loss": 0.2972, + "step": 3049 + }, + { + "epoch": 0.725888022847623, + "grad_norm": 0.3554006057479478, + "learning_rate": 7.365850710312883e-06, + "loss": 0.3688, + "step": 3050 + }, + { + "epoch": 0.7261260189206878, + "grad_norm": 0.36442397072863186, + "learning_rate": 7.364152380030416e-06, + "loss": 0.4098, + "step": 3051 + }, + { + "epoch": 0.7263640149937526, + "grad_norm": 0.4410087294940679, + "learning_rate": 7.3624536983764195e-06, + "loss": 0.3272, + "step": 3052 + }, + { + "epoch": 0.7266020110668174, + "grad_norm": 0.4282754810487593, + "learning_rate": 7.3607546656033594e-06, + "loss": 0.3241, + "step": 3053 + }, + { + "epoch": 0.7268400071398822, + "grad_norm": 0.4083398243189906, + "learning_rate": 7.359055281963753e-06, + "loss": 0.3797, + "step": 3054 + }, + { + "epoch": 0.727078003212947, + "grad_norm": 0.3965480234646783, + "learning_rate": 7.357355547710172e-06, + "loss": 0.3904, + "step": 3055 + }, + { + "epoch": 0.7273159992860118, + "grad_norm": 0.42026409912961626, + "learning_rate": 7.355655463095239e-06, + "loss": 0.3146, + "step": 3056 + }, + { + "epoch": 0.7275539953590766, + "grad_norm": 0.3771142416353585, + "learning_rate": 7.3539550283716265e-06, + "loss": 0.3445, + "step": 3057 + }, + { + "epoch": 0.7277919914321413, + "grad_norm": 0.3551162990197736, + "learning_rate": 7.352254243792064e-06, + "loss": 0.4232, + "step": 3058 + }, + { + "epoch": 0.7280299875052062, + "grad_norm": 0.4085035997854665, + "learning_rate": 7.350553109609329e-06, + "loss": 0.3106, + "step": 3059 + }, + { + "epoch": 0.7282679835782709, + "grad_norm": 0.3847757510606681, + "learning_rate": 7.348851626076252e-06, + "loss": 0.3063, + "step": 3060 + }, + { + "epoch": 0.7285059796513358, + "grad_norm": 0.36865435960657855, + "learning_rate": 7.347149793445715e-06, + "loss": 0.3544, + "step": 3061 + }, + { + "epoch": 0.7287439757244005, + "grad_norm": 0.3922363467897953, + "learning_rate": 7.345447611970653e-06, + "loss": 0.3822, + "step": 3062 + }, + { + "epoch": 0.7289819717974654, + "grad_norm": 0.34369944175304884, + "learning_rate": 7.3437450819040536e-06, + "loss": 0.3206, + "step": 3063 + }, + { + "epoch": 0.7292199678705301, + "grad_norm": 0.35824405322282227, + "learning_rate": 7.342042203498952e-06, + "loss": 0.3554, + "step": 3064 + }, + { + "epoch": 0.729457963943595, + "grad_norm": 0.39512318030878885, + "learning_rate": 7.34033897700844e-06, + "loss": 0.3839, + "step": 3065 + }, + { + "epoch": 0.7296959600166597, + "grad_norm": 0.38854090842538846, + "learning_rate": 7.338635402685659e-06, + "loss": 0.3378, + "step": 3066 + }, + { + "epoch": 0.7299339560897246, + "grad_norm": 0.38073840280648125, + "learning_rate": 7.336931480783801e-06, + "loss": 0.3343, + "step": 3067 + }, + { + "epoch": 0.7301719521627893, + "grad_norm": 0.3869439775508733, + "learning_rate": 7.335227211556113e-06, + "loss": 0.3474, + "step": 3068 + }, + { + "epoch": 0.7304099482358541, + "grad_norm": 0.36331918337163177, + "learning_rate": 7.3335225952558904e-06, + "loss": 0.4093, + "step": 3069 + }, + { + "epoch": 0.7306479443089189, + "grad_norm": 0.3676892295018712, + "learning_rate": 7.3318176321364835e-06, + "loss": 0.3082, + "step": 3070 + }, + { + "epoch": 0.7308859403819837, + "grad_norm": 0.37227813295867007, + "learning_rate": 7.330112322451287e-06, + "loss": 0.3266, + "step": 3071 + }, + { + "epoch": 0.7311239364550485, + "grad_norm": 0.37682897636401325, + "learning_rate": 7.328406666453757e-06, + "loss": 0.4061, + "step": 3072 + }, + { + "epoch": 0.7313619325281133, + "grad_norm": 0.43099604922414647, + "learning_rate": 7.326700664397395e-06, + "loss": 0.3787, + "step": 3073 + }, + { + "epoch": 0.7315999286011781, + "grad_norm": 0.38646511617724494, + "learning_rate": 7.324994316535753e-06, + "loss": 0.3184, + "step": 3074 + }, + { + "epoch": 0.7318379246742429, + "grad_norm": 0.3745081198422119, + "learning_rate": 7.323287623122439e-06, + "loss": 0.3465, + "step": 3075 + }, + { + "epoch": 0.7320759207473077, + "grad_norm": 0.3868492244329305, + "learning_rate": 7.321580584411108e-06, + "loss": 0.4354, + "step": 3076 + }, + { + "epoch": 0.7323139168203725, + "grad_norm": 0.3746511813825389, + "learning_rate": 7.31987320065547e-06, + "loss": 0.3515, + "step": 3077 + }, + { + "epoch": 0.7325519128934372, + "grad_norm": 0.46299638156809153, + "learning_rate": 7.318165472109282e-06, + "loss": 0.3055, + "step": 3078 + }, + { + "epoch": 0.7327899089665021, + "grad_norm": 0.4037577857446162, + "learning_rate": 7.3164573990263574e-06, + "loss": 0.3781, + "step": 3079 + }, + { + "epoch": 0.7330279050395668, + "grad_norm": 0.3996195774993303, + "learning_rate": 7.314748981660555e-06, + "loss": 0.3987, + "step": 3080 + }, + { + "epoch": 0.7332659011126317, + "grad_norm": 0.3936618750557137, + "learning_rate": 7.313040220265792e-06, + "loss": 0.3083, + "step": 3081 + }, + { + "epoch": 0.7335038971856964, + "grad_norm": 0.38121369149338524, + "learning_rate": 7.31133111509603e-06, + "loss": 0.3229, + "step": 3082 + }, + { + "epoch": 0.7337418932587613, + "grad_norm": 0.388825782352307, + "learning_rate": 7.309621666405284e-06, + "loss": 0.4128, + "step": 3083 + }, + { + "epoch": 0.733979889331826, + "grad_norm": 0.3715499437212395, + "learning_rate": 7.307911874447622e-06, + "loss": 0.3383, + "step": 3084 + }, + { + "epoch": 0.7342178854048909, + "grad_norm": 0.4232766232253306, + "learning_rate": 7.306201739477159e-06, + "loss": 0.3137, + "step": 3085 + }, + { + "epoch": 0.7344558814779556, + "grad_norm": 0.345889539039532, + "learning_rate": 7.304491261748067e-06, + "loss": 0.3484, + "step": 3086 + }, + { + "epoch": 0.7346938775510204, + "grad_norm": 0.37961353019617056, + "learning_rate": 7.302780441514561e-06, + "loss": 0.3989, + "step": 3087 + }, + { + "epoch": 0.7349318736240852, + "grad_norm": 0.3772100128276546, + "learning_rate": 7.3010692790309145e-06, + "loss": 0.3109, + "step": 3088 + }, + { + "epoch": 0.73516986969715, + "grad_norm": 0.4148043405953893, + "learning_rate": 7.2993577745514475e-06, + "loss": 0.339, + "step": 3089 + }, + { + "epoch": 0.7354078657702148, + "grad_norm": 0.4041915618136653, + "learning_rate": 7.2976459283305326e-06, + "loss": 0.3987, + "step": 3090 + }, + { + "epoch": 0.7356458618432796, + "grad_norm": 0.37070530009594416, + "learning_rate": 7.2959337406225894e-06, + "loss": 0.3478, + "step": 3091 + }, + { + "epoch": 0.7358838579163444, + "grad_norm": 0.3405354595093119, + "learning_rate": 7.294221211682096e-06, + "loss": 0.3366, + "step": 3092 + }, + { + "epoch": 0.7361218539894092, + "grad_norm": 0.36518838895931155, + "learning_rate": 7.292508341763574e-06, + "loss": 0.3426, + "step": 3093 + }, + { + "epoch": 0.736359850062474, + "grad_norm": 0.37822344257306156, + "learning_rate": 7.290795131121595e-06, + "loss": 0.3879, + "step": 3094 + }, + { + "epoch": 0.7365978461355388, + "grad_norm": 0.3730174664708697, + "learning_rate": 7.289081580010792e-06, + "loss": 0.3241, + "step": 3095 + }, + { + "epoch": 0.7368358422086035, + "grad_norm": 0.360220342804819, + "learning_rate": 7.287367688685835e-06, + "loss": 0.3325, + "step": 3096 + }, + { + "epoch": 0.7370738382816684, + "grad_norm": 0.36585884890518433, + "learning_rate": 7.285653457401453e-06, + "loss": 0.3872, + "step": 3097 + }, + { + "epoch": 0.7373118343547331, + "grad_norm": 0.5429383224292799, + "learning_rate": 7.283938886412424e-06, + "loss": 0.3578, + "step": 3098 + }, + { + "epoch": 0.737549830427798, + "grad_norm": 0.41494686957593174, + "learning_rate": 7.2822239759735735e-06, + "loss": 0.3477, + "step": 3099 + }, + { + "epoch": 0.7377878265008627, + "grad_norm": 0.3816939297486696, + "learning_rate": 7.280508726339781e-06, + "loss": 0.3684, + "step": 3100 + }, + { + "epoch": 0.7380258225739276, + "grad_norm": 0.37029991676249097, + "learning_rate": 7.278793137765976e-06, + "loss": 0.4007, + "step": 3101 + }, + { + "epoch": 0.7382638186469923, + "grad_norm": 0.4339643261053269, + "learning_rate": 7.277077210507135e-06, + "loss": 0.3341, + "step": 3102 + }, + { + "epoch": 0.7385018147200572, + "grad_norm": 0.37177092531956185, + "learning_rate": 7.2753609448182885e-06, + "loss": 0.3178, + "step": 3103 + }, + { + "epoch": 0.7387398107931219, + "grad_norm": 0.36083182065635616, + "learning_rate": 7.273644340954515e-06, + "loss": 0.3974, + "step": 3104 + }, + { + "epoch": 0.7389778068661867, + "grad_norm": 0.3742211097385228, + "learning_rate": 7.271927399170946e-06, + "loss": 0.3764, + "step": 3105 + }, + { + "epoch": 0.7392158029392515, + "grad_norm": 0.39314424003897525, + "learning_rate": 7.270210119722761e-06, + "loss": 0.3178, + "step": 3106 + }, + { + "epoch": 0.7394537990123163, + "grad_norm": 0.387246764459355, + "learning_rate": 7.2684925028651875e-06, + "loss": 0.3463, + "step": 3107 + }, + { + "epoch": 0.7396917950853811, + "grad_norm": 0.3455422305424231, + "learning_rate": 7.26677454885351e-06, + "loss": 0.4018, + "step": 3108 + }, + { + "epoch": 0.7399297911584459, + "grad_norm": 0.3680423220780498, + "learning_rate": 7.265056257943059e-06, + "loss": 0.3318, + "step": 3109 + }, + { + "epoch": 0.7401677872315107, + "grad_norm": 0.4218016718701506, + "learning_rate": 7.2633376303892115e-06, + "loss": 0.3327, + "step": 3110 + }, + { + "epoch": 0.7404057833045755, + "grad_norm": 0.4797971091280992, + "learning_rate": 7.2616186664474e-06, + "loss": 0.3781, + "step": 3111 + }, + { + "epoch": 0.7406437793776403, + "grad_norm": 0.3656564240573133, + "learning_rate": 7.259899366373105e-06, + "loss": 0.3589, + "step": 3112 + }, + { + "epoch": 0.7408817754507051, + "grad_norm": 0.39727650704719514, + "learning_rate": 7.258179730421856e-06, + "loss": 0.302, + "step": 3113 + }, + { + "epoch": 0.7411197715237698, + "grad_norm": 0.3854240175463639, + "learning_rate": 7.256459758849236e-06, + "loss": 0.3257, + "step": 3114 + }, + { + "epoch": 0.7413577675968347, + "grad_norm": 0.35574522570705447, + "learning_rate": 7.254739451910872e-06, + "loss": 0.3877, + "step": 3115 + }, + { + "epoch": 0.7415957636698994, + "grad_norm": 0.4630875140371236, + "learning_rate": 7.253018809862448e-06, + "loss": 0.3405, + "step": 3116 + }, + { + "epoch": 0.7418337597429643, + "grad_norm": 0.41595067569227445, + "learning_rate": 7.251297832959691e-06, + "loss": 0.3055, + "step": 3117 + }, + { + "epoch": 0.742071755816029, + "grad_norm": 0.35751446775150947, + "learning_rate": 7.249576521458381e-06, + "loss": 0.3468, + "step": 3118 + }, + { + "epoch": 0.7423097518890939, + "grad_norm": 0.3782939975585627, + "learning_rate": 7.247854875614348e-06, + "loss": 0.4108, + "step": 3119 + }, + { + "epoch": 0.7425477479621586, + "grad_norm": 0.36443090923466487, + "learning_rate": 7.246132895683472e-06, + "loss": 0.3252, + "step": 3120 + }, + { + "epoch": 0.7427857440352235, + "grad_norm": 0.3521241114001235, + "learning_rate": 7.244410581921679e-06, + "loss": 0.3367, + "step": 3121 + }, + { + "epoch": 0.7430237401082882, + "grad_norm": 0.40051085069078274, + "learning_rate": 7.242687934584952e-06, + "loss": 0.3951, + "step": 3122 + }, + { + "epoch": 0.743261736181353, + "grad_norm": 0.44680675909155915, + "learning_rate": 7.2409649539293155e-06, + "loss": 0.3604, + "step": 3123 + }, + { + "epoch": 0.7434997322544178, + "grad_norm": 0.41922316103507395, + "learning_rate": 7.239241640210849e-06, + "loss": 0.3254, + "step": 3124 + }, + { + "epoch": 0.7437377283274826, + "grad_norm": 0.3777661176299726, + "learning_rate": 7.2375179936856775e-06, + "loss": 0.371, + "step": 3125 + }, + { + "epoch": 0.7439757244005474, + "grad_norm": 0.37648805520020007, + "learning_rate": 7.235794014609978e-06, + "loss": 0.4046, + "step": 3126 + }, + { + "epoch": 0.7442137204736122, + "grad_norm": 0.410705623333718, + "learning_rate": 7.234069703239979e-06, + "loss": 0.3264, + "step": 3127 + }, + { + "epoch": 0.744451716546677, + "grad_norm": 0.36702681685700017, + "learning_rate": 7.2323450598319535e-06, + "loss": 0.3487, + "step": 3128 + }, + { + "epoch": 0.7446897126197418, + "grad_norm": 0.3670128177868576, + "learning_rate": 7.230620084642226e-06, + "loss": 0.3881, + "step": 3129 + }, + { + "epoch": 0.7449277086928066, + "grad_norm": 0.40207445243333073, + "learning_rate": 7.228894777927171e-06, + "loss": 0.3942, + "step": 3130 + }, + { + "epoch": 0.7451657047658714, + "grad_norm": 0.40080124947818796, + "learning_rate": 7.227169139943211e-06, + "loss": 0.3075, + "step": 3131 + }, + { + "epoch": 0.7454037008389361, + "grad_norm": 0.39628761333956425, + "learning_rate": 7.22544317094682e-06, + "loss": 0.3387, + "step": 3132 + }, + { + "epoch": 0.745641696912001, + "grad_norm": 0.37788202720786135, + "learning_rate": 7.223716871194519e-06, + "loss": 0.4307, + "step": 3133 + }, + { + "epoch": 0.7458796929850657, + "grad_norm": 0.37196320757312157, + "learning_rate": 7.221990240942878e-06, + "loss": 0.3459, + "step": 3134 + }, + { + "epoch": 0.7461176890581306, + "grad_norm": 0.37988100060665997, + "learning_rate": 7.220263280448518e-06, + "loss": 0.2937, + "step": 3135 + }, + { + "epoch": 0.7463556851311953, + "grad_norm": 0.37254514931619725, + "learning_rate": 7.21853598996811e-06, + "loss": 0.4057, + "step": 3136 + }, + { + "epoch": 0.7465936812042602, + "grad_norm": 0.3900480382097283, + "learning_rate": 7.216808369758368e-06, + "loss": 0.381, + "step": 3137 + }, + { + "epoch": 0.7468316772773249, + "grad_norm": 0.3796183272267863, + "learning_rate": 7.215080420076061e-06, + "loss": 0.3289, + "step": 3138 + }, + { + "epoch": 0.7470696733503898, + "grad_norm": 0.3806521308902775, + "learning_rate": 7.2133521411780075e-06, + "loss": 0.3483, + "step": 3139 + }, + { + "epoch": 0.7473076694234545, + "grad_norm": 0.38838395980321355, + "learning_rate": 7.211623533321067e-06, + "loss": 0.3974, + "step": 3140 + }, + { + "epoch": 0.7475456654965194, + "grad_norm": 0.41947045054774285, + "learning_rate": 7.209894596762158e-06, + "loss": 0.3447, + "step": 3141 + }, + { + "epoch": 0.7477836615695841, + "grad_norm": 0.3540371480756213, + "learning_rate": 7.2081653317582414e-06, + "loss": 0.3406, + "step": 3142 + }, + { + "epoch": 0.7480216576426489, + "grad_norm": 0.3826391985417264, + "learning_rate": 7.20643573856633e-06, + "loss": 0.3343, + "step": 3143 + }, + { + "epoch": 0.7482596537157137, + "grad_norm": 0.3538807128710463, + "learning_rate": 7.204705817443483e-06, + "loss": 0.4207, + "step": 3144 + }, + { + "epoch": 0.7484976497887785, + "grad_norm": 0.352020290864496, + "learning_rate": 7.202975568646809e-06, + "loss": 0.3251, + "step": 3145 + }, + { + "epoch": 0.7487356458618433, + "grad_norm": 0.39868157169625523, + "learning_rate": 7.201244992433466e-06, + "loss": 0.3339, + "step": 3146 + }, + { + "epoch": 0.7489736419349081, + "grad_norm": 0.4023017435952385, + "learning_rate": 7.199514089060662e-06, + "loss": 0.3981, + "step": 3147 + }, + { + "epoch": 0.7492116380079729, + "grad_norm": 0.3741576715187835, + "learning_rate": 7.19778285878565e-06, + "loss": 0.3561, + "step": 3148 + }, + { + "epoch": 0.7494496340810377, + "grad_norm": 0.4709729162057381, + "learning_rate": 7.196051301865736e-06, + "loss": 0.3399, + "step": 3149 + }, + { + "epoch": 0.7496876301541024, + "grad_norm": 0.41862196677750924, + "learning_rate": 7.19431941855827e-06, + "loss": 0.3235, + "step": 3150 + }, + { + "epoch": 0.7499256262271673, + "grad_norm": 0.36071859439792425, + "learning_rate": 7.192587209120654e-06, + "loss": 0.3893, + "step": 3151 + }, + { + "epoch": 0.750163622300232, + "grad_norm": 0.3699965349983255, + "learning_rate": 7.190854673810337e-06, + "loss": 0.3109, + "step": 3152 + }, + { + "epoch": 0.7504016183732969, + "grad_norm": 0.42095108252199576, + "learning_rate": 7.189121812884816e-06, + "loss": 0.3121, + "step": 3153 + }, + { + "epoch": 0.7506396144463616, + "grad_norm": 0.37196059546980936, + "learning_rate": 7.1873886266016365e-06, + "loss": 0.3859, + "step": 3154 + }, + { + "epoch": 0.7508776105194265, + "grad_norm": 0.3728280714448741, + "learning_rate": 7.185655115218395e-06, + "loss": 0.3766, + "step": 3155 + }, + { + "epoch": 0.7511156065924912, + "grad_norm": 0.37533698264848414, + "learning_rate": 7.183921278992731e-06, + "loss": 0.2943, + "step": 3156 + }, + { + "epoch": 0.7513536026655561, + "grad_norm": 0.3918689428403605, + "learning_rate": 7.18218711818234e-06, + "loss": 0.3503, + "step": 3157 + }, + { + "epoch": 0.7515915987386208, + "grad_norm": 0.42511294665911836, + "learning_rate": 7.180452633044958e-06, + "loss": 0.4205, + "step": 3158 + }, + { + "epoch": 0.7518295948116857, + "grad_norm": 0.39558443408700916, + "learning_rate": 7.178717823838371e-06, + "loss": 0.3445, + "step": 3159 + }, + { + "epoch": 0.7520675908847504, + "grad_norm": 0.3861088188015497, + "learning_rate": 7.176982690820418e-06, + "loss": 0.3184, + "step": 3160 + }, + { + "epoch": 0.7523055869578152, + "grad_norm": 0.3598250325709718, + "learning_rate": 7.175247234248979e-06, + "loss": 0.3543, + "step": 3161 + }, + { + "epoch": 0.75254358303088, + "grad_norm": 0.38572071033062194, + "learning_rate": 7.173511454381991e-06, + "loss": 0.3887, + "step": 3162 + }, + { + "epoch": 0.7527815791039448, + "grad_norm": 0.3597558768595933, + "learning_rate": 7.171775351477429e-06, + "loss": 0.3176, + "step": 3163 + }, + { + "epoch": 0.7530195751770096, + "grad_norm": 0.3519928195184308, + "learning_rate": 7.170038925793323e-06, + "loss": 0.3598, + "step": 3164 + }, + { + "epoch": 0.7532575712500744, + "grad_norm": 0.40143875905094994, + "learning_rate": 7.16830217758775e-06, + "loss": 0.405, + "step": 3165 + }, + { + "epoch": 0.7534955673231392, + "grad_norm": 0.39193156061116613, + "learning_rate": 7.16656510711883e-06, + "loss": 0.3403, + "step": 3166 + }, + { + "epoch": 0.753733563396204, + "grad_norm": 0.38720279747566255, + "learning_rate": 7.164827714644738e-06, + "loss": 0.292, + "step": 3167 + }, + { + "epoch": 0.7539715594692687, + "grad_norm": 0.36467759519856513, + "learning_rate": 7.163090000423691e-06, + "loss": 0.3451, + "step": 3168 + }, + { + "epoch": 0.7542095555423336, + "grad_norm": 0.41427804995589623, + "learning_rate": 7.161351964713959e-06, + "loss": 0.4082, + "step": 3169 + }, + { + "epoch": 0.7544475516153983, + "grad_norm": 0.38192749026353107, + "learning_rate": 7.159613607773857e-06, + "loss": 0.3061, + "step": 3170 + }, + { + "epoch": 0.7546855476884632, + "grad_norm": 0.43361143142318026, + "learning_rate": 7.157874929861745e-06, + "loss": 0.3096, + "step": 3171 + }, + { + "epoch": 0.7549235437615279, + "grad_norm": 0.3554010401349325, + "learning_rate": 7.156135931236034e-06, + "loss": 0.4144, + "step": 3172 + }, + { + "epoch": 0.7551615398345928, + "grad_norm": 0.39011150823679097, + "learning_rate": 7.1543966121551845e-06, + "loss": 0.3801, + "step": 3173 + }, + { + "epoch": 0.7553995359076575, + "grad_norm": 0.3817077245262734, + "learning_rate": 7.152656972877702e-06, + "loss": 0.2873, + "step": 3174 + }, + { + "epoch": 0.7556375319807224, + "grad_norm": 0.3720557616381678, + "learning_rate": 7.150917013662138e-06, + "loss": 0.3617, + "step": 3175 + }, + { + "epoch": 0.7558755280537871, + "grad_norm": 0.34769187060464707, + "learning_rate": 7.149176734767095e-06, + "loss": 0.4133, + "step": 3176 + }, + { + "epoch": 0.756113524126852, + "grad_norm": 0.3835724087378952, + "learning_rate": 7.147436136451221e-06, + "loss": 0.325, + "step": 3177 + }, + { + "epoch": 0.7563515201999167, + "grad_norm": 0.37749841830040526, + "learning_rate": 7.145695218973213e-06, + "loss": 0.3102, + "step": 3178 + }, + { + "epoch": 0.7565895162729815, + "grad_norm": 0.374520542733197, + "learning_rate": 7.143953982591813e-06, + "loss": 0.3978, + "step": 3179 + }, + { + "epoch": 0.7568275123460463, + "grad_norm": 0.4023541368854894, + "learning_rate": 7.142212427565812e-06, + "loss": 0.3807, + "step": 3180 + }, + { + "epoch": 0.7570655084191111, + "grad_norm": 0.35873098883174603, + "learning_rate": 7.140470554154048e-06, + "loss": 0.3559, + "step": 3181 + }, + { + "epoch": 0.7573035044921759, + "grad_norm": 0.3867236526972635, + "learning_rate": 7.138728362615408e-06, + "loss": 0.3331, + "step": 3182 + }, + { + "epoch": 0.7575415005652407, + "grad_norm": 0.34735983378932106, + "learning_rate": 7.136985853208824e-06, + "loss": 0.398, + "step": 3183 + }, + { + "epoch": 0.7577794966383055, + "grad_norm": 0.38443189061398053, + "learning_rate": 7.135243026193275e-06, + "loss": 0.3334, + "step": 3184 + }, + { + "epoch": 0.7580174927113703, + "grad_norm": 0.3821702158023499, + "learning_rate": 7.13349988182779e-06, + "loss": 0.3129, + "step": 3185 + }, + { + "epoch": 0.758255488784435, + "grad_norm": 0.3569641423841968, + "learning_rate": 7.131756420371441e-06, + "loss": 0.38, + "step": 3186 + }, + { + "epoch": 0.7584934848574999, + "grad_norm": 0.40219483194623007, + "learning_rate": 7.130012642083351e-06, + "loss": 0.4149, + "step": 3187 + }, + { + "epoch": 0.7587314809305646, + "grad_norm": 0.3708101360867531, + "learning_rate": 7.128268547222688e-06, + "loss": 0.3625, + "step": 3188 + }, + { + "epoch": 0.7589694770036295, + "grad_norm": 0.37603123822726897, + "learning_rate": 7.126524136048669e-06, + "loss": 0.33, + "step": 3189 + }, + { + "epoch": 0.7592074730766942, + "grad_norm": 0.3625022362919271, + "learning_rate": 7.124779408820555e-06, + "loss": 0.4318, + "step": 3190 + }, + { + "epoch": 0.7594454691497591, + "grad_norm": 0.39739967957247313, + "learning_rate": 7.123034365797657e-06, + "loss": 0.3272, + "step": 3191 + }, + { + "epoch": 0.7596834652228238, + "grad_norm": 0.3702784091648422, + "learning_rate": 7.121289007239331e-06, + "loss": 0.3268, + "step": 3192 + }, + { + "epoch": 0.7599214612958887, + "grad_norm": 0.3516224416164428, + "learning_rate": 7.119543333404981e-06, + "loss": 0.3504, + "step": 3193 + }, + { + "epoch": 0.7601594573689534, + "grad_norm": 0.3559370032742599, + "learning_rate": 7.117797344554056e-06, + "loss": 0.4026, + "step": 3194 + }, + { + "epoch": 0.7603974534420183, + "grad_norm": 0.3734187380388612, + "learning_rate": 7.116051040946053e-06, + "loss": 0.3204, + "step": 3195 + }, + { + "epoch": 0.760635449515083, + "grad_norm": 0.3579819166208651, + "learning_rate": 7.114304422840517e-06, + "loss": 0.3277, + "step": 3196 + }, + { + "epoch": 0.7608734455881478, + "grad_norm": 0.3752870062603591, + "learning_rate": 7.112557490497038e-06, + "loss": 0.3955, + "step": 3197 + }, + { + "epoch": 0.7611114416612126, + "grad_norm": 0.3707029313698444, + "learning_rate": 7.1108102441752546e-06, + "loss": 0.3666, + "step": 3198 + }, + { + "epoch": 0.7613494377342774, + "grad_norm": 0.3867018314993227, + "learning_rate": 7.109062684134851e-06, + "loss": 0.3063, + "step": 3199 + }, + { + "epoch": 0.7615874338073422, + "grad_norm": 0.40876052113414646, + "learning_rate": 7.107314810635555e-06, + "loss": 0.3568, + "step": 3200 + }, + { + "epoch": 0.761825429880407, + "grad_norm": 0.4030334876778494, + "learning_rate": 7.105566623937145e-06, + "loss": 0.4414, + "step": 3201 + }, + { + "epoch": 0.7620634259534718, + "grad_norm": 0.36214917368846855, + "learning_rate": 7.103818124299446e-06, + "loss": 0.3108, + "step": 3202 + }, + { + "epoch": 0.7623014220265366, + "grad_norm": 0.36698016620497026, + "learning_rate": 7.102069311982329e-06, + "loss": 0.3015, + "step": 3203 + }, + { + "epoch": 0.7625394180996014, + "grad_norm": 0.368871180085732, + "learning_rate": 7.100320187245711e-06, + "loss": 0.3934, + "step": 3204 + }, + { + "epoch": 0.7627774141726662, + "grad_norm": 0.39902520639387057, + "learning_rate": 7.098570750349552e-06, + "loss": 0.3871, + "step": 3205 + }, + { + "epoch": 0.7630154102457309, + "grad_norm": 0.37446900240784753, + "learning_rate": 7.096821001553863e-06, + "loss": 0.3109, + "step": 3206 + }, + { + "epoch": 0.7632534063187958, + "grad_norm": 0.33881592006301775, + "learning_rate": 7.0950709411187e-06, + "loss": 0.3471, + "step": 3207 + }, + { + "epoch": 0.7634914023918605, + "grad_norm": 0.372761489856163, + "learning_rate": 7.093320569304168e-06, + "loss": 0.4049, + "step": 3208 + }, + { + "epoch": 0.7637293984649254, + "grad_norm": 0.4017927278263893, + "learning_rate": 7.0915698863704094e-06, + "loss": 0.3293, + "step": 3209 + }, + { + "epoch": 0.7639673945379901, + "grad_norm": 0.4054778790918442, + "learning_rate": 7.089818892577625e-06, + "loss": 0.335, + "step": 3210 + }, + { + "epoch": 0.764205390611055, + "grad_norm": 0.3850085404080853, + "learning_rate": 7.088067588186053e-06, + "loss": 0.3492, + "step": 3211 + }, + { + "epoch": 0.7644433866841197, + "grad_norm": 0.37942848234718374, + "learning_rate": 7.086315973455982e-06, + "loss": 0.4199, + "step": 3212 + }, + { + "epoch": 0.7646813827571846, + "grad_norm": 0.4002793707382145, + "learning_rate": 7.084564048647742e-06, + "loss": 0.3223, + "step": 3213 + }, + { + "epoch": 0.7649193788302493, + "grad_norm": 0.3758467906510097, + "learning_rate": 7.082811814021717e-06, + "loss": 0.3411, + "step": 3214 + }, + { + "epoch": 0.7651573749033141, + "grad_norm": 0.38353128527707686, + "learning_rate": 7.08105926983833e-06, + "loss": 0.4176, + "step": 3215 + }, + { + "epoch": 0.7653953709763789, + "grad_norm": 0.41062374283228403, + "learning_rate": 7.0793064163580515e-06, + "loss": 0.3443, + "step": 3216 + }, + { + "epoch": 0.7656333670494437, + "grad_norm": 0.38600351854755544, + "learning_rate": 7.0775532538414005e-06, + "loss": 0.3275, + "step": 3217 + }, + { + "epoch": 0.7658713631225085, + "grad_norm": 0.3588188716781081, + "learning_rate": 7.0757997825489395e-06, + "loss": 0.368, + "step": 3218 + }, + { + "epoch": 0.7661093591955733, + "grad_norm": 0.3582749191881199, + "learning_rate": 7.074046002741279e-06, + "loss": 0.3938, + "step": 3219 + }, + { + "epoch": 0.7663473552686381, + "grad_norm": 0.3597466282489841, + "learning_rate": 7.072291914679072e-06, + "loss": 0.3072, + "step": 3220 + }, + { + "epoch": 0.7665853513417029, + "grad_norm": 0.3920888152123116, + "learning_rate": 7.070537518623022e-06, + "loss": 0.2984, + "step": 3221 + }, + { + "epoch": 0.7668233474147677, + "grad_norm": 0.3641437351355874, + "learning_rate": 7.068782814833872e-06, + "loss": 0.3744, + "step": 3222 + }, + { + "epoch": 0.7670613434878325, + "grad_norm": 0.37937338820809674, + "learning_rate": 7.067027803572417e-06, + "loss": 0.3666, + "step": 3223 + }, + { + "epoch": 0.7672993395608972, + "grad_norm": 0.40998865963471826, + "learning_rate": 7.065272485099496e-06, + "loss": 0.2939, + "step": 3224 + }, + { + "epoch": 0.7675373356339621, + "grad_norm": 0.4023210745015225, + "learning_rate": 7.06351685967599e-06, + "loss": 0.3818, + "step": 3225 + }, + { + "epoch": 0.7677753317070268, + "grad_norm": 0.37345454403274114, + "learning_rate": 7.061760927562831e-06, + "loss": 0.4636, + "step": 3226 + }, + { + "epoch": 0.7680133277800917, + "grad_norm": 0.4223720881715161, + "learning_rate": 7.060004689020991e-06, + "loss": 0.3236, + "step": 3227 + }, + { + "epoch": 0.7682513238531564, + "grad_norm": 0.40982795591831794, + "learning_rate": 7.058248144311493e-06, + "loss": 0.3324, + "step": 3228 + }, + { + "epoch": 0.7684893199262213, + "grad_norm": 0.3557070641164775, + "learning_rate": 7.056491293695401e-06, + "loss": 0.3833, + "step": 3229 + }, + { + "epoch": 0.768727315999286, + "grad_norm": 0.3797321346430915, + "learning_rate": 7.05473413743383e-06, + "loss": 0.3552, + "step": 3230 + }, + { + "epoch": 0.7689653120723509, + "grad_norm": 0.4115569949091365, + "learning_rate": 7.052976675787932e-06, + "loss": 0.3028, + "step": 3231 + }, + { + "epoch": 0.7692033081454156, + "grad_norm": 0.42313222429594644, + "learning_rate": 7.051218909018913e-06, + "loss": 0.36, + "step": 3232 + }, + { + "epoch": 0.7694413042184804, + "grad_norm": 0.3576077007683352, + "learning_rate": 7.04946083738802e-06, + "loss": 0.3817, + "step": 3233 + }, + { + "epoch": 0.7696793002915452, + "grad_norm": 0.4065000621443697, + "learning_rate": 7.047702461156545e-06, + "loss": 0.3447, + "step": 3234 + }, + { + "epoch": 0.76991729636461, + "grad_norm": 0.38063058594261784, + "learning_rate": 7.045943780585826e-06, + "loss": 0.2826, + "step": 3235 + }, + { + "epoch": 0.7701552924376748, + "grad_norm": 0.410395519650393, + "learning_rate": 7.044184795937248e-06, + "loss": 0.3782, + "step": 3236 + }, + { + "epoch": 0.7703932885107396, + "grad_norm": 0.38149692275265434, + "learning_rate": 7.042425507472237e-06, + "loss": 0.4149, + "step": 3237 + }, + { + "epoch": 0.7706312845838044, + "grad_norm": 0.3809955756687497, + "learning_rate": 7.040665915452269e-06, + "loss": 0.3169, + "step": 3238 + }, + { + "epoch": 0.7708692806568692, + "grad_norm": 0.3697765244878929, + "learning_rate": 7.038906020138863e-06, + "loss": 0.3496, + "step": 3239 + }, + { + "epoch": 0.771107276729934, + "grad_norm": 0.4309372907682824, + "learning_rate": 7.037145821793582e-06, + "loss": 0.3958, + "step": 3240 + }, + { + "epoch": 0.7713452728029988, + "grad_norm": 0.38099748409784767, + "learning_rate": 7.035385320678035e-06, + "loss": 0.3601, + "step": 3241 + }, + { + "epoch": 0.7715832688760635, + "grad_norm": 0.4276643539057699, + "learning_rate": 7.033624517053878e-06, + "loss": 0.3132, + "step": 3242 + }, + { + "epoch": 0.7718212649491284, + "grad_norm": 0.3518664176009511, + "learning_rate": 7.031863411182806e-06, + "loss": 0.3615, + "step": 3243 + }, + { + "epoch": 0.7720592610221931, + "grad_norm": 0.362211191847796, + "learning_rate": 7.0301020033265655e-06, + "loss": 0.4409, + "step": 3244 + }, + { + "epoch": 0.772297257095258, + "grad_norm": 0.41910572530406326, + "learning_rate": 7.0283402937469455e-06, + "loss": 0.3131, + "step": 3245 + }, + { + "epoch": 0.7725352531683227, + "grad_norm": 0.4314331090536776, + "learning_rate": 7.0265782827057804e-06, + "loss": 0.3372, + "step": 3246 + }, + { + "epoch": 0.7727732492413876, + "grad_norm": 0.3693815905880875, + "learning_rate": 7.024815970464947e-06, + "loss": 0.4088, + "step": 3247 + }, + { + "epoch": 0.7730112453144523, + "grad_norm": 0.37981415522062073, + "learning_rate": 7.023053357286366e-06, + "loss": 0.3587, + "step": 3248 + }, + { + "epoch": 0.7732492413875172, + "grad_norm": 0.408493661115404, + "learning_rate": 7.0212904434320115e-06, + "loss": 0.307, + "step": 3249 + }, + { + "epoch": 0.7734872374605819, + "grad_norm": 0.37233119228531636, + "learning_rate": 7.019527229163891e-06, + "loss": 0.3307, + "step": 3250 + }, + { + "epoch": 0.7737252335336468, + "grad_norm": 0.355235494120951, + "learning_rate": 7.0177637147440645e-06, + "loss": 0.4161, + "step": 3251 + }, + { + "epoch": 0.7739632296067115, + "grad_norm": 0.41229891884255343, + "learning_rate": 7.015999900434632e-06, + "loss": 0.3474, + "step": 3252 + }, + { + "epoch": 0.7742012256797762, + "grad_norm": 0.37602579982370005, + "learning_rate": 7.0142357864977425e-06, + "loss": 0.3039, + "step": 3253 + }, + { + "epoch": 0.7744392217528411, + "grad_norm": 0.3958312928917149, + "learning_rate": 7.012471373195584e-06, + "loss": 0.3848, + "step": 3254 + }, + { + "epoch": 0.7746772178259058, + "grad_norm": 0.4121169912150407, + "learning_rate": 7.010706660790393e-06, + "loss": 0.3912, + "step": 3255 + }, + { + "epoch": 0.7749152138989707, + "grad_norm": 0.3824825922233828, + "learning_rate": 7.0089416495444505e-06, + "loss": 0.3104, + "step": 3256 + }, + { + "epoch": 0.7751532099720354, + "grad_norm": 0.3844237240799921, + "learning_rate": 7.007176339720079e-06, + "loss": 0.3511, + "step": 3257 + }, + { + "epoch": 0.7753912060451003, + "grad_norm": 0.4050734698257971, + "learning_rate": 7.005410731579649e-06, + "loss": 0.3956, + "step": 3258 + }, + { + "epoch": 0.775629202118165, + "grad_norm": 0.38094079518404117, + "learning_rate": 7.003644825385574e-06, + "loss": 0.3168, + "step": 3259 + }, + { + "epoch": 0.7758671981912298, + "grad_norm": 0.3756921808647231, + "learning_rate": 7.001878621400309e-06, + "loss": 0.3121, + "step": 3260 + }, + { + "epoch": 0.7761051942642946, + "grad_norm": 0.3795064413693575, + "learning_rate": 7.000112119886356e-06, + "loss": 0.3722, + "step": 3261 + }, + { + "epoch": 0.7763431903373594, + "grad_norm": 0.38072119281394334, + "learning_rate": 6.998345321106264e-06, + "loss": 0.3769, + "step": 3262 + }, + { + "epoch": 0.7765811864104242, + "grad_norm": 0.3521206764121899, + "learning_rate": 6.996578225322619e-06, + "loss": 0.3074, + "step": 3263 + }, + { + "epoch": 0.776819182483489, + "grad_norm": 0.3934398383104075, + "learning_rate": 6.994810832798056e-06, + "loss": 0.329, + "step": 3264 + }, + { + "epoch": 0.7770571785565538, + "grad_norm": 0.3832785826014656, + "learning_rate": 6.993043143795255e-06, + "loss": 0.405, + "step": 3265 + }, + { + "epoch": 0.7772951746296186, + "grad_norm": 0.3642849945216878, + "learning_rate": 6.991275158576936e-06, + "loss": 0.3423, + "step": 3266 + }, + { + "epoch": 0.7775331707026834, + "grad_norm": 0.40701200067939586, + "learning_rate": 6.989506877405867e-06, + "loss": 0.3073, + "step": 3267 + }, + { + "epoch": 0.7777711667757482, + "grad_norm": 0.38472233590714217, + "learning_rate": 6.9877383005448595e-06, + "loss": 0.3395, + "step": 3268 + }, + { + "epoch": 0.7780091628488129, + "grad_norm": 0.3839123853902827, + "learning_rate": 6.9859694282567655e-06, + "loss": 0.3998, + "step": 3269 + }, + { + "epoch": 0.7782471589218778, + "grad_norm": 0.35916227359386177, + "learning_rate": 6.9842002608044844e-06, + "loss": 0.3028, + "step": 3270 + }, + { + "epoch": 0.7784851549949425, + "grad_norm": 0.37216903957857644, + "learning_rate": 6.9824307984509565e-06, + "loss": 0.3197, + "step": 3271 + }, + { + "epoch": 0.7787231510680074, + "grad_norm": 0.401474993410537, + "learning_rate": 6.98066104145917e-06, + "loss": 0.4028, + "step": 3272 + }, + { + "epoch": 0.7789611471410721, + "grad_norm": 0.4764009593683862, + "learning_rate": 6.9788909900921546e-06, + "loss": 0.3624, + "step": 3273 + }, + { + "epoch": 0.779199143214137, + "grad_norm": 0.35941136835596654, + "learning_rate": 6.977120644612981e-06, + "loss": 0.3168, + "step": 3274 + }, + { + "epoch": 0.7794371392872017, + "grad_norm": 0.47474946631582, + "learning_rate": 6.975350005284769e-06, + "loss": 0.3574, + "step": 3275 + }, + { + "epoch": 0.7796751353602666, + "grad_norm": 0.3664417092614242, + "learning_rate": 6.973579072370678e-06, + "loss": 0.4299, + "step": 3276 + }, + { + "epoch": 0.7799131314333313, + "grad_norm": 0.3583726867946201, + "learning_rate": 6.971807846133912e-06, + "loss": 0.3236, + "step": 3277 + }, + { + "epoch": 0.7801511275063961, + "grad_norm": 0.4022285999884666, + "learning_rate": 6.97003632683772e-06, + "loss": 0.3138, + "step": 3278 + }, + { + "epoch": 0.7803891235794609, + "grad_norm": 0.35817090327334505, + "learning_rate": 6.9682645147453954e-06, + "loss": 0.3843, + "step": 3279 + }, + { + "epoch": 0.7806271196525257, + "grad_norm": 0.3846970980247595, + "learning_rate": 6.966492410120269e-06, + "loss": 0.3713, + "step": 3280 + }, + { + "epoch": 0.7808651157255905, + "grad_norm": 0.366789717423642, + "learning_rate": 6.964720013225723e-06, + "loss": 0.2951, + "step": 3281 + }, + { + "epoch": 0.7811031117986553, + "grad_norm": 0.38839627264622095, + "learning_rate": 6.962947324325178e-06, + "loss": 0.3634, + "step": 3282 + }, + { + "epoch": 0.7813411078717201, + "grad_norm": 0.36282780184060254, + "learning_rate": 6.9611743436821e-06, + "loss": 0.3993, + "step": 3283 + }, + { + "epoch": 0.7815791039447849, + "grad_norm": 0.35508176540562836, + "learning_rate": 6.959401071559997e-06, + "loss": 0.3159, + "step": 3284 + }, + { + "epoch": 0.7818171000178497, + "grad_norm": 0.4152503707635735, + "learning_rate": 6.957627508222421e-06, + "loss": 0.2873, + "step": 3285 + }, + { + "epoch": 0.7820550960909145, + "grad_norm": 0.39876662297919807, + "learning_rate": 6.955853653932969e-06, + "loss": 0.3802, + "step": 3286 + }, + { + "epoch": 0.7822930921639792, + "grad_norm": 0.36696139093226415, + "learning_rate": 6.9540795089552785e-06, + "loss": 0.3693, + "step": 3287 + }, + { + "epoch": 0.7825310882370441, + "grad_norm": 0.3600324867306773, + "learning_rate": 6.952305073553031e-06, + "loss": 0.3051, + "step": 3288 + }, + { + "epoch": 0.7827690843101088, + "grad_norm": 0.3489749012167579, + "learning_rate": 6.950530347989952e-06, + "loss": 0.3532, + "step": 3289 + }, + { + "epoch": 0.7830070803831737, + "grad_norm": 0.3620983180455999, + "learning_rate": 6.9487553325298086e-06, + "loss": 0.3963, + "step": 3290 + }, + { + "epoch": 0.7832450764562384, + "grad_norm": 0.36785158602869183, + "learning_rate": 6.946980027436413e-06, + "loss": 0.343, + "step": 3291 + }, + { + "epoch": 0.7834830725293033, + "grad_norm": 0.3599934529541224, + "learning_rate": 6.94520443297362e-06, + "loss": 0.2774, + "step": 3292 + }, + { + "epoch": 0.783721068602368, + "grad_norm": 0.38715941357436445, + "learning_rate": 6.943428549405327e-06, + "loss": 0.3519, + "step": 3293 + }, + { + "epoch": 0.7839590646754329, + "grad_norm": 0.35349331453822797, + "learning_rate": 6.941652376995471e-06, + "loss": 0.3988, + "step": 3294 + }, + { + "epoch": 0.7841970607484976, + "grad_norm": 0.3955985233260755, + "learning_rate": 6.93987591600804e-06, + "loss": 0.3389, + "step": 3295 + }, + { + "epoch": 0.7844350568215624, + "grad_norm": 0.34663056196232733, + "learning_rate": 6.938099166707058e-06, + "loss": 0.314, + "step": 3296 + }, + { + "epoch": 0.7846730528946272, + "grad_norm": 0.38906902208324257, + "learning_rate": 6.936322129356592e-06, + "loss": 0.3764, + "step": 3297 + }, + { + "epoch": 0.784911048967692, + "grad_norm": 0.41156216306072313, + "learning_rate": 6.934544804220755e-06, + "loss": 0.3957, + "step": 3298 + }, + { + "epoch": 0.7851490450407568, + "grad_norm": 0.35518958539778267, + "learning_rate": 6.932767191563703e-06, + "loss": 0.282, + "step": 3299 + }, + { + "epoch": 0.7853870411138216, + "grad_norm": 0.47033631889554306, + "learning_rate": 6.9309892916496315e-06, + "loss": 0.3442, + "step": 3300 + }, + { + "epoch": 0.7856250371868864, + "grad_norm": 0.3666911616164395, + "learning_rate": 6.929211104742781e-06, + "loss": 0.4094, + "step": 3301 + }, + { + "epoch": 0.7858630332599512, + "grad_norm": 0.3577310928807679, + "learning_rate": 6.927432631107434e-06, + "loss": 0.3025, + "step": 3302 + }, + { + "epoch": 0.786101029333016, + "grad_norm": 0.37429247897055223, + "learning_rate": 6.925653871007916e-06, + "loss": 0.3226, + "step": 3303 + }, + { + "epoch": 0.7863390254060808, + "grad_norm": 0.3880037905135578, + "learning_rate": 6.923874824708594e-06, + "loss": 0.354, + "step": 3304 + }, + { + "epoch": 0.7865770214791455, + "grad_norm": 0.4433660243451845, + "learning_rate": 6.922095492473877e-06, + "loss": 0.3544, + "step": 3305 + }, + { + "epoch": 0.7868150175522104, + "grad_norm": 0.3759346004041654, + "learning_rate": 6.920315874568222e-06, + "loss": 0.2971, + "step": 3306 + }, + { + "epoch": 0.7870530136252751, + "grad_norm": 0.3785964570018848, + "learning_rate": 6.918535971256121e-06, + "loss": 0.3874, + "step": 3307 + }, + { + "epoch": 0.78729100969834, + "grad_norm": 0.3558284632586464, + "learning_rate": 6.91675578280211e-06, + "loss": 0.422, + "step": 3308 + }, + { + "epoch": 0.7875290057714047, + "grad_norm": 0.37377127129597065, + "learning_rate": 6.914975309470775e-06, + "loss": 0.3282, + "step": 3309 + }, + { + "epoch": 0.7877670018444696, + "grad_norm": 0.40012751504223365, + "learning_rate": 6.913194551526733e-06, + "loss": 0.3119, + "step": 3310 + }, + { + "epoch": 0.7880049979175343, + "grad_norm": 0.3754850496234726, + "learning_rate": 6.911413509234651e-06, + "loss": 0.3494, + "step": 3311 + }, + { + "epoch": 0.7882429939905992, + "grad_norm": 0.40489835887499337, + "learning_rate": 6.9096321828592336e-06, + "loss": 0.4102, + "step": 3312 + }, + { + "epoch": 0.7884809900636639, + "grad_norm": 0.43391003763332675, + "learning_rate": 6.9078505726652345e-06, + "loss": 0.3132, + "step": 3313 + }, + { + "epoch": 0.7887189861367288, + "grad_norm": 0.3897042525840326, + "learning_rate": 6.906068678917442e-06, + "loss": 0.3263, + "step": 3314 + }, + { + "epoch": 0.7889569822097935, + "grad_norm": 0.3983465394931579, + "learning_rate": 6.904286501880688e-06, + "loss": 0.4121, + "step": 3315 + }, + { + "epoch": 0.7891949782828583, + "grad_norm": 0.3960830865161725, + "learning_rate": 6.902504041819853e-06, + "loss": 0.3513, + "step": 3316 + }, + { + "epoch": 0.7894329743559231, + "grad_norm": 0.4092338028148718, + "learning_rate": 6.900721298999849e-06, + "loss": 0.3118, + "step": 3317 + }, + { + "epoch": 0.7896709704289879, + "grad_norm": 0.37481354646128145, + "learning_rate": 6.8989382736856405e-06, + "loss": 0.3556, + "step": 3318 + }, + { + "epoch": 0.7899089665020527, + "grad_norm": 0.4041714681573852, + "learning_rate": 6.897154966142225e-06, + "loss": 0.3973, + "step": 3319 + }, + { + "epoch": 0.7901469625751175, + "grad_norm": 0.3961472202444824, + "learning_rate": 6.89537137663465e-06, + "loss": 0.2757, + "step": 3320 + }, + { + "epoch": 0.7903849586481823, + "grad_norm": 0.3476476966030962, + "learning_rate": 6.893587505427997e-06, + "loss": 0.3343, + "step": 3321 + }, + { + "epoch": 0.7906229547212471, + "grad_norm": 0.4234915791131859, + "learning_rate": 6.891803352787396e-06, + "loss": 0.3884, + "step": 3322 + }, + { + "epoch": 0.7908609507943118, + "grad_norm": 0.3688518168774175, + "learning_rate": 6.890018918978018e-06, + "loss": 0.3443, + "step": 3323 + }, + { + "epoch": 0.7910989468673767, + "grad_norm": 0.44308279354142094, + "learning_rate": 6.888234204265071e-06, + "loss": 0.2956, + "step": 3324 + }, + { + "epoch": 0.7913369429404414, + "grad_norm": 0.3728873788461928, + "learning_rate": 6.8864492089138076e-06, + "loss": 0.3573, + "step": 3325 + }, + { + "epoch": 0.7915749390135063, + "grad_norm": 0.3691109038727998, + "learning_rate": 6.8846639331895235e-06, + "loss": 0.415, + "step": 3326 + }, + { + "epoch": 0.791812935086571, + "grad_norm": 0.40429091720127247, + "learning_rate": 6.882878377357555e-06, + "loss": 0.3342, + "step": 3327 + }, + { + "epoch": 0.7920509311596359, + "grad_norm": 0.35173869530824603, + "learning_rate": 6.881092541683279e-06, + "loss": 0.3036, + "step": 3328 + }, + { + "epoch": 0.7922889272327006, + "grad_norm": 0.3691282683210561, + "learning_rate": 6.879306426432116e-06, + "loss": 0.3957, + "step": 3329 + }, + { + "epoch": 0.7925269233057655, + "grad_norm": 0.3547986919135188, + "learning_rate": 6.877520031869527e-06, + "loss": 0.3853, + "step": 3330 + }, + { + "epoch": 0.7927649193788302, + "grad_norm": 0.41969998025380606, + "learning_rate": 6.875733358261012e-06, + "loss": 0.312, + "step": 3331 + }, + { + "epoch": 0.793002915451895, + "grad_norm": 0.3666600692656378, + "learning_rate": 6.873946405872116e-06, + "loss": 0.3293, + "step": 3332 + }, + { + "epoch": 0.7932409115249598, + "grad_norm": 0.3793195334251969, + "learning_rate": 6.872159174968427e-06, + "loss": 0.3999, + "step": 3333 + }, + { + "epoch": 0.7934789075980246, + "grad_norm": 0.3653083899549032, + "learning_rate": 6.870371665815567e-06, + "loss": 0.347, + "step": 3334 + }, + { + "epoch": 0.7937169036710894, + "grad_norm": 0.42000680796497764, + "learning_rate": 6.868583878679209e-06, + "loss": 0.3133, + "step": 3335 + }, + { + "epoch": 0.7939548997441542, + "grad_norm": 0.3798826072515094, + "learning_rate": 6.866795813825059e-06, + "loss": 0.3855, + "step": 3336 + }, + { + "epoch": 0.794192895817219, + "grad_norm": 0.4204127342603903, + "learning_rate": 6.8650074715188695e-06, + "loss": 0.4339, + "step": 3337 + }, + { + "epoch": 0.7944308918902838, + "grad_norm": 0.3987640353825363, + "learning_rate": 6.863218852026432e-06, + "loss": 0.3097, + "step": 3338 + }, + { + "epoch": 0.7946688879633486, + "grad_norm": 0.3758268701259083, + "learning_rate": 6.861429955613579e-06, + "loss": 0.3308, + "step": 3339 + }, + { + "epoch": 0.7949068840364134, + "grad_norm": 0.3465046232838342, + "learning_rate": 6.859640782546183e-06, + "loss": 0.3863, + "step": 3340 + }, + { + "epoch": 0.7951448801094781, + "grad_norm": 0.3592668570581204, + "learning_rate": 6.8578513330901645e-06, + "loss": 0.3441, + "step": 3341 + }, + { + "epoch": 0.795382876182543, + "grad_norm": 0.3714878302081704, + "learning_rate": 6.856061607511475e-06, + "loss": 0.3282, + "step": 3342 + }, + { + "epoch": 0.7956208722556077, + "grad_norm": 0.3717879012940753, + "learning_rate": 6.854271606076114e-06, + "loss": 0.3704, + "step": 3343 + }, + { + "epoch": 0.7958588683286726, + "grad_norm": 0.36524627168544693, + "learning_rate": 6.85248132905012e-06, + "loss": 0.4022, + "step": 3344 + }, + { + "epoch": 0.7960968644017373, + "grad_norm": 0.3880622674807223, + "learning_rate": 6.850690776699574e-06, + "loss": 0.3121, + "step": 3345 + }, + { + "epoch": 0.7963348604748022, + "grad_norm": 0.38708118353283516, + "learning_rate": 6.848899949290592e-06, + "loss": 0.3444, + "step": 3346 + }, + { + "epoch": 0.7965728565478669, + "grad_norm": 0.3721719168783397, + "learning_rate": 6.847108847089339e-06, + "loss": 0.3891, + "step": 3347 + }, + { + "epoch": 0.7968108526209318, + "grad_norm": 0.4005020642472224, + "learning_rate": 6.8453174703620155e-06, + "loss": 0.3959, + "step": 3348 + }, + { + "epoch": 0.7970488486939965, + "grad_norm": 0.3522265566325754, + "learning_rate": 6.843525819374866e-06, + "loss": 0.3006, + "step": 3349 + }, + { + "epoch": 0.7972868447670614, + "grad_norm": 0.359814924986342, + "learning_rate": 6.841733894394172e-06, + "loss": 0.3521, + "step": 3350 + }, + { + "epoch": 0.7975248408401261, + "grad_norm": 0.3908892005179815, + "learning_rate": 6.839941695686261e-06, + "loss": 0.4427, + "step": 3351 + }, + { + "epoch": 0.7977628369131909, + "grad_norm": 0.39295135554595395, + "learning_rate": 6.838149223517495e-06, + "loss": 0.3039, + "step": 3352 + }, + { + "epoch": 0.7980008329862557, + "grad_norm": 0.3999452447353942, + "learning_rate": 6.836356478154279e-06, + "loss": 0.3105, + "step": 3353 + }, + { + "epoch": 0.7982388290593205, + "grad_norm": 0.36113133945632986, + "learning_rate": 6.834563459863064e-06, + "loss": 0.4003, + "step": 3354 + }, + { + "epoch": 0.7984768251323853, + "grad_norm": 0.3993728396402891, + "learning_rate": 6.832770168910332e-06, + "loss": 0.3429, + "step": 3355 + }, + { + "epoch": 0.7987148212054501, + "grad_norm": 0.36265914054354226, + "learning_rate": 6.830976605562614e-06, + "loss": 0.3019, + "step": 3356 + }, + { + "epoch": 0.7989528172785149, + "grad_norm": 0.3760662642357464, + "learning_rate": 6.829182770086474e-06, + "loss": 0.4016, + "step": 3357 + }, + { + "epoch": 0.7991908133515797, + "grad_norm": 0.3558139104694944, + "learning_rate": 6.8273886627485245e-06, + "loss": 0.4189, + "step": 3358 + }, + { + "epoch": 0.7994288094246444, + "grad_norm": 0.42200701223038184, + "learning_rate": 6.825594283815411e-06, + "loss": 0.3216, + "step": 3359 + }, + { + "epoch": 0.7996668054977093, + "grad_norm": 0.3628739848145437, + "learning_rate": 6.8237996335538245e-06, + "loss": 0.3099, + "step": 3360 + }, + { + "epoch": 0.799904801570774, + "grad_norm": 0.3622850476056826, + "learning_rate": 6.822004712230493e-06, + "loss": 0.3894, + "step": 3361 + }, + { + "epoch": 0.8001427976438389, + "grad_norm": 0.37608877349141273, + "learning_rate": 6.820209520112188e-06, + "loss": 0.3764, + "step": 3362 + }, + { + "epoch": 0.8003807937169036, + "grad_norm": 0.3921518941166787, + "learning_rate": 6.8184140574657185e-06, + "loss": 0.2933, + "step": 3363 + }, + { + "epoch": 0.8006187897899685, + "grad_norm": 0.3709070132502542, + "learning_rate": 6.816618324557934e-06, + "loss": 0.3279, + "step": 3364 + }, + { + "epoch": 0.8008567858630332, + "grad_norm": 0.3457649897592504, + "learning_rate": 6.8148223216557275e-06, + "loss": 0.421, + "step": 3365 + }, + { + "epoch": 0.8010947819360981, + "grad_norm": 0.363710583757097, + "learning_rate": 6.813026049026026e-06, + "loss": 0.3431, + "step": 3366 + }, + { + "epoch": 0.8013327780091628, + "grad_norm": 0.44025195568280423, + "learning_rate": 6.8112295069358005e-06, + "loss": 0.2966, + "step": 3367 + }, + { + "epoch": 0.8015707740822277, + "grad_norm": 0.44684338780747807, + "learning_rate": 6.809432695652063e-06, + "loss": 0.3689, + "step": 3368 + }, + { + "epoch": 0.8018087701552924, + "grad_norm": 0.3866828600799754, + "learning_rate": 6.807635615441866e-06, + "loss": 0.4114, + "step": 3369 + }, + { + "epoch": 0.8020467662283572, + "grad_norm": 0.3608818486936804, + "learning_rate": 6.805838266572296e-06, + "loss": 0.3066, + "step": 3370 + }, + { + "epoch": 0.802284762301422, + "grad_norm": 0.3903835514254373, + "learning_rate": 6.804040649310485e-06, + "loss": 0.3154, + "step": 3371 + }, + { + "epoch": 0.8025227583744868, + "grad_norm": 0.4174028450269264, + "learning_rate": 6.802242763923603e-06, + "loss": 0.3932, + "step": 3372 + }, + { + "epoch": 0.8027607544475516, + "grad_norm": 0.3902998602460606, + "learning_rate": 6.800444610678862e-06, + "loss": 0.3686, + "step": 3373 + }, + { + "epoch": 0.8029987505206164, + "grad_norm": 0.37542329189902135, + "learning_rate": 6.798646189843512e-06, + "loss": 0.2835, + "step": 3374 + }, + { + "epoch": 0.8032367465936812, + "grad_norm": 0.5826616163085041, + "learning_rate": 6.796847501684839e-06, + "loss": 0.3604, + "step": 3375 + }, + { + "epoch": 0.803474742666746, + "grad_norm": 0.36773753481276283, + "learning_rate": 6.795048546470178e-06, + "loss": 0.3924, + "step": 3376 + }, + { + "epoch": 0.8037127387398108, + "grad_norm": 0.3653653462075791, + "learning_rate": 6.793249324466895e-06, + "loss": 0.3302, + "step": 3377 + }, + { + "epoch": 0.8039507348128756, + "grad_norm": 0.3674665028573087, + "learning_rate": 6.7914498359424e-06, + "loss": 0.2912, + "step": 3378 + }, + { + "epoch": 0.8041887308859403, + "grad_norm": 0.3535016420982183, + "learning_rate": 6.78965008116414e-06, + "loss": 0.3777, + "step": 3379 + }, + { + "epoch": 0.8044267269590052, + "grad_norm": 0.3809175328016834, + "learning_rate": 6.787850060399604e-06, + "loss": 0.4065, + "step": 3380 + }, + { + "epoch": 0.8046647230320699, + "grad_norm": 0.38371526792218086, + "learning_rate": 6.78604977391632e-06, + "loss": 0.301, + "step": 3381 + }, + { + "epoch": 0.8049027191051348, + "grad_norm": 0.37885740937735535, + "learning_rate": 6.784249221981856e-06, + "loss": 0.3888, + "step": 3382 + }, + { + "epoch": 0.8051407151781995, + "grad_norm": 0.37193003169198097, + "learning_rate": 6.782448404863816e-06, + "loss": 0.3764, + "step": 3383 + }, + { + "epoch": 0.8053787112512644, + "grad_norm": 0.40081137867530753, + "learning_rate": 6.780647322829849e-06, + "loss": 0.3487, + "step": 3384 + }, + { + "epoch": 0.8056167073243291, + "grad_norm": 0.38584530133036227, + "learning_rate": 6.778845976147638e-06, + "loss": 0.3073, + "step": 3385 + }, + { + "epoch": 0.805854703397394, + "grad_norm": 0.3737571701313767, + "learning_rate": 6.777044365084907e-06, + "loss": 0.389, + "step": 3386 + }, + { + "epoch": 0.8060926994704587, + "grad_norm": 0.38914805596699603, + "learning_rate": 6.775242489909423e-06, + "loss": 0.3707, + "step": 3387 + }, + { + "epoch": 0.8063306955435235, + "grad_norm": 0.3908919574280197, + "learning_rate": 6.773440350888986e-06, + "loss": 0.3293, + "step": 3388 + }, + { + "epoch": 0.8065686916165883, + "grad_norm": 0.3732810073093854, + "learning_rate": 6.771637948291441e-06, + "loss": 0.3214, + "step": 3389 + }, + { + "epoch": 0.8068066876896531, + "grad_norm": 0.40490720807254776, + "learning_rate": 6.769835282384669e-06, + "loss": 0.3995, + "step": 3390 + }, + { + "epoch": 0.8070446837627179, + "grad_norm": 0.39783899196176287, + "learning_rate": 6.768032353436591e-06, + "loss": 0.364, + "step": 3391 + }, + { + "epoch": 0.8072826798357827, + "grad_norm": 0.36646595976846763, + "learning_rate": 6.766229161715165e-06, + "loss": 0.2909, + "step": 3392 + }, + { + "epoch": 0.8075206759088475, + "grad_norm": 0.4596377496880723, + "learning_rate": 6.764425707488393e-06, + "loss": 0.3568, + "step": 3393 + }, + { + "epoch": 0.8077586719819123, + "grad_norm": 0.38919838139408713, + "learning_rate": 6.76262199102431e-06, + "loss": 0.4245, + "step": 3394 + }, + { + "epoch": 0.807996668054977, + "grad_norm": 0.3756696902040613, + "learning_rate": 6.760818012590993e-06, + "loss": 0.3169, + "step": 3395 + }, + { + "epoch": 0.8082346641280419, + "grad_norm": 0.38466137056243654, + "learning_rate": 6.75901377245656e-06, + "loss": 0.3322, + "step": 3396 + }, + { + "epoch": 0.8084726602011066, + "grad_norm": 0.372823608083243, + "learning_rate": 6.757209270889164e-06, + "loss": 0.3765, + "step": 3397 + }, + { + "epoch": 0.8087106562741715, + "grad_norm": 0.40319286820868333, + "learning_rate": 6.755404508156999e-06, + "loss": 0.3428, + "step": 3398 + }, + { + "epoch": 0.8089486523472362, + "grad_norm": 0.37211593039177365, + "learning_rate": 6.753599484528297e-06, + "loss": 0.3013, + "step": 3399 + }, + { + "epoch": 0.8091866484203011, + "grad_norm": 0.36450494947046486, + "learning_rate": 6.75179420027133e-06, + "loss": 0.3378, + "step": 3400 + }, + { + "epoch": 0.8094246444933658, + "grad_norm": 0.35084818549241026, + "learning_rate": 6.749988655654408e-06, + "loss": 0.4222, + "step": 3401 + }, + { + "epoch": 0.8096626405664307, + "grad_norm": 0.38771334806435903, + "learning_rate": 6.748182850945878e-06, + "loss": 0.3184, + "step": 3402 + }, + { + "epoch": 0.8099006366394954, + "grad_norm": 0.40751175987989763, + "learning_rate": 6.746376786414129e-06, + "loss": 0.3106, + "step": 3403 + }, + { + "epoch": 0.8101386327125603, + "grad_norm": 0.3763443373885834, + "learning_rate": 6.744570462327588e-06, + "loss": 0.4034, + "step": 3404 + }, + { + "epoch": 0.810376628785625, + "grad_norm": 0.3912057352411011, + "learning_rate": 6.742763878954716e-06, + "loss": 0.3267, + "step": 3405 + }, + { + "epoch": 0.8106146248586898, + "grad_norm": 0.38899642397811124, + "learning_rate": 6.740957036564018e-06, + "loss": 0.2985, + "step": 3406 + }, + { + "epoch": 0.8108526209317546, + "grad_norm": 0.38659168634889995, + "learning_rate": 6.739149935424036e-06, + "loss": 0.3583, + "step": 3407 + }, + { + "epoch": 0.8110906170048194, + "grad_norm": 0.38015341694039984, + "learning_rate": 6.737342575803347e-06, + "loss": 0.4034, + "step": 3408 + }, + { + "epoch": 0.8113286130778842, + "grad_norm": 0.3784853530949804, + "learning_rate": 6.735534957970573e-06, + "loss": 0.3077, + "step": 3409 + }, + { + "epoch": 0.811566609150949, + "grad_norm": 0.3715243825768867, + "learning_rate": 6.733727082194369e-06, + "loss": 0.3189, + "step": 3410 + }, + { + "epoch": 0.8118046052240138, + "grad_norm": 0.3920251893951503, + "learning_rate": 6.73191894874343e-06, + "loss": 0.3666, + "step": 3411 + }, + { + "epoch": 0.8120426012970786, + "grad_norm": 0.4047784901748257, + "learning_rate": 6.73011055788649e-06, + "loss": 0.412, + "step": 3412 + }, + { + "epoch": 0.8122805973701434, + "grad_norm": 0.34961388235995544, + "learning_rate": 6.728301909892318e-06, + "loss": 0.3028, + "step": 3413 + }, + { + "epoch": 0.8125185934432082, + "grad_norm": 0.4964751847494611, + "learning_rate": 6.72649300502973e-06, + "loss": 0.3354, + "step": 3414 + }, + { + "epoch": 0.8127565895162729, + "grad_norm": 0.3737712857277735, + "learning_rate": 6.724683843567567e-06, + "loss": 0.3834, + "step": 3415 + }, + { + "epoch": 0.8129945855893378, + "grad_norm": 0.38533893171844813, + "learning_rate": 6.7228744257747195e-06, + "loss": 0.338, + "step": 3416 + }, + { + "epoch": 0.8132325816624025, + "grad_norm": 0.39656870787598, + "learning_rate": 6.72106475192011e-06, + "loss": 0.2903, + "step": 3417 + }, + { + "epoch": 0.8134705777354674, + "grad_norm": 0.35585546765967224, + "learning_rate": 6.719254822272701e-06, + "loss": 0.3375, + "step": 3418 + }, + { + "epoch": 0.8137085738085321, + "grad_norm": 0.37372380792440973, + "learning_rate": 6.717444637101494e-06, + "loss": 0.4215, + "step": 3419 + }, + { + "epoch": 0.813946569881597, + "grad_norm": 0.38397186576664505, + "learning_rate": 6.715634196675527e-06, + "loss": 0.3081, + "step": 3420 + }, + { + "epoch": 0.8141845659546617, + "grad_norm": 0.42512101870564933, + "learning_rate": 6.713823501263874e-06, + "loss": 0.3501, + "step": 3421 + }, + { + "epoch": 0.8144225620277266, + "grad_norm": 0.39007776244652564, + "learning_rate": 6.712012551135651e-06, + "loss": 0.3927, + "step": 3422 + }, + { + "epoch": 0.8146605581007913, + "grad_norm": 0.37533224466831305, + "learning_rate": 6.71020134656001e-06, + "loss": 0.3562, + "step": 3423 + }, + { + "epoch": 0.8148985541738561, + "grad_norm": 0.3691992345183985, + "learning_rate": 6.708389887806142e-06, + "loss": 0.3062, + "step": 3424 + }, + { + "epoch": 0.8151365502469209, + "grad_norm": 0.3836542262713381, + "learning_rate": 6.706578175143271e-06, + "loss": 0.3577, + "step": 3425 + }, + { + "epoch": 0.8153745463199857, + "grad_norm": 0.34274121993433526, + "learning_rate": 6.704766208840666e-06, + "loss": 0.3955, + "step": 3426 + }, + { + "epoch": 0.8156125423930505, + "grad_norm": 0.3785837148211837, + "learning_rate": 6.702953989167627e-06, + "loss": 0.328, + "step": 3427 + }, + { + "epoch": 0.8158505384661153, + "grad_norm": 0.367311727838402, + "learning_rate": 6.701141516393497e-06, + "loss": 0.3133, + "step": 3428 + }, + { + "epoch": 0.8160885345391801, + "grad_norm": 0.38174012941670704, + "learning_rate": 6.6993287907876526e-06, + "loss": 0.3642, + "step": 3429 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 0.36620752112375166, + "learning_rate": 6.6975158126195114e-06, + "loss": 0.4239, + "step": 3430 + }, + { + "epoch": 0.8165645266853097, + "grad_norm": 0.388324045604444, + "learning_rate": 6.695702582158527e-06, + "loss": 0.3357, + "step": 3431 + }, + { + "epoch": 0.8168025227583745, + "grad_norm": 0.3707836017064938, + "learning_rate": 6.693889099674188e-06, + "loss": 0.3521, + "step": 3432 + }, + { + "epoch": 0.8170405188314392, + "grad_norm": 0.3610380265483462, + "learning_rate": 6.692075365436024e-06, + "loss": 0.3975, + "step": 3433 + }, + { + "epoch": 0.8172785149045041, + "grad_norm": 0.3848858140680098, + "learning_rate": 6.690261379713601e-06, + "loss": 0.3366, + "step": 3434 + }, + { + "epoch": 0.8175165109775688, + "grad_norm": 0.41597504909165217, + "learning_rate": 6.688447142776522e-06, + "loss": 0.3135, + "step": 3435 + }, + { + "epoch": 0.8177545070506337, + "grad_norm": 0.3631226395036403, + "learning_rate": 6.6866326548944276e-06, + "loss": 0.3642, + "step": 3436 + }, + { + "epoch": 0.8179925031236984, + "grad_norm": 0.3821455357196431, + "learning_rate": 6.684817916336994e-06, + "loss": 0.3862, + "step": 3437 + }, + { + "epoch": 0.8182304991967633, + "grad_norm": 0.3585464421983625, + "learning_rate": 6.683002927373938e-06, + "loss": 0.3127, + "step": 3438 + }, + { + "epoch": 0.818468495269828, + "grad_norm": 0.37218554816765576, + "learning_rate": 6.681187688275013e-06, + "loss": 0.3131, + "step": 3439 + }, + { + "epoch": 0.8187064913428929, + "grad_norm": 0.3627840886800982, + "learning_rate": 6.679372199310006e-06, + "loss": 0.4235, + "step": 3440 + }, + { + "epoch": 0.8189444874159576, + "grad_norm": 0.4463956845389517, + "learning_rate": 6.677556460748744e-06, + "loss": 0.3329, + "step": 3441 + }, + { + "epoch": 0.8191824834890225, + "grad_norm": 0.41940300133110525, + "learning_rate": 6.675740472861092e-06, + "loss": 0.2945, + "step": 3442 + }, + { + "epoch": 0.8194204795620872, + "grad_norm": 0.3861526446671477, + "learning_rate": 6.673924235916948e-06, + "loss": 0.3535, + "step": 3443 + }, + { + "epoch": 0.819658475635152, + "grad_norm": 0.44522995704309204, + "learning_rate": 6.672107750186255e-06, + "loss": 0.3907, + "step": 3444 + }, + { + "epoch": 0.8198964717082168, + "grad_norm": 0.43143411864941417, + "learning_rate": 6.670291015938983e-06, + "loss": 0.2998, + "step": 3445 + }, + { + "epoch": 0.8201344677812816, + "grad_norm": 0.4500423391786268, + "learning_rate": 6.6684740334451445e-06, + "loss": 0.3279, + "step": 3446 + }, + { + "epoch": 0.8203724638543464, + "grad_norm": 0.3777270892774342, + "learning_rate": 6.666656802974789e-06, + "loss": 0.361, + "step": 3447 + }, + { + "epoch": 0.8206104599274112, + "grad_norm": 0.43511204091192573, + "learning_rate": 6.664839324798002e-06, + "loss": 0.394, + "step": 3448 + }, + { + "epoch": 0.820848456000476, + "grad_norm": 0.37499179792352805, + "learning_rate": 6.663021599184904e-06, + "loss": 0.3216, + "step": 3449 + }, + { + "epoch": 0.8210864520735408, + "grad_norm": 0.42216483258775034, + "learning_rate": 6.661203626405656e-06, + "loss": 0.3902, + "step": 3450 + }, + { + "epoch": 0.8213244481466055, + "grad_norm": 0.39104996064612785, + "learning_rate": 6.659385406730452e-06, + "loss": 0.4236, + "step": 3451 + }, + { + "epoch": 0.8215624442196704, + "grad_norm": 0.36054918099443395, + "learning_rate": 6.6575669404295265e-06, + "loss": 0.3189, + "step": 3452 + }, + { + "epoch": 0.8218004402927351, + "grad_norm": 0.40811287467124596, + "learning_rate": 6.6557482277731465e-06, + "loss": 0.3119, + "step": 3453 + }, + { + "epoch": 0.8220384363658, + "grad_norm": 0.36440658348004856, + "learning_rate": 6.653929269031618e-06, + "loss": 0.3753, + "step": 3454 + }, + { + "epoch": 0.8222764324388647, + "grad_norm": 0.4096319668844619, + "learning_rate": 6.652110064475286e-06, + "loss": 0.3708, + "step": 3455 + }, + { + "epoch": 0.8225144285119296, + "grad_norm": 0.4138762726742505, + "learning_rate": 6.650290614374526e-06, + "loss": 0.2696, + "step": 3456 + }, + { + "epoch": 0.8227524245849943, + "grad_norm": 0.38274207266784216, + "learning_rate": 6.648470918999754e-06, + "loss": 0.3701, + "step": 3457 + }, + { + "epoch": 0.8229904206580592, + "grad_norm": 0.38596079001363315, + "learning_rate": 6.646650978621422e-06, + "loss": 0.3932, + "step": 3458 + }, + { + "epoch": 0.8232284167311239, + "grad_norm": 0.4033700687920439, + "learning_rate": 6.644830793510019e-06, + "loss": 0.3134, + "step": 3459 + }, + { + "epoch": 0.8234664128041888, + "grad_norm": 0.3957395882295056, + "learning_rate": 6.64301036393607e-06, + "loss": 0.2913, + "step": 3460 + }, + { + "epoch": 0.8237044088772535, + "grad_norm": 0.37332886096760914, + "learning_rate": 6.641189690170135e-06, + "loss": 0.3702, + "step": 3461 + }, + { + "epoch": 0.8239424049503183, + "grad_norm": 0.382271014403154, + "learning_rate": 6.639368772482809e-06, + "loss": 0.3848, + "step": 3462 + }, + { + "epoch": 0.8241804010233831, + "grad_norm": 0.42293254876321, + "learning_rate": 6.637547611144729e-06, + "loss": 0.3168, + "step": 3463 + }, + { + "epoch": 0.8244183970964479, + "grad_norm": 0.5064018854163713, + "learning_rate": 6.635726206426562e-06, + "loss": 0.3296, + "step": 3464 + }, + { + "epoch": 0.8246563931695127, + "grad_norm": 0.3699399246485159, + "learning_rate": 6.633904558599015e-06, + "loss": 0.379, + "step": 3465 + }, + { + "epoch": 0.8248943892425775, + "grad_norm": 0.36822069349704023, + "learning_rate": 6.63208266793283e-06, + "loss": 0.3622, + "step": 3466 + }, + { + "epoch": 0.8251323853156423, + "grad_norm": 0.37564917131168607, + "learning_rate": 6.630260534698784e-06, + "loss": 0.2927, + "step": 3467 + }, + { + "epoch": 0.8253703813887071, + "grad_norm": 0.359534156281574, + "learning_rate": 6.628438159167691e-06, + "loss": 0.3703, + "step": 3468 + }, + { + "epoch": 0.8256083774617718, + "grad_norm": 0.41724063858650523, + "learning_rate": 6.626615541610404e-06, + "loss": 0.4272, + "step": 3469 + }, + { + "epoch": 0.8258463735348367, + "grad_norm": 0.35435885568070713, + "learning_rate": 6.624792682297807e-06, + "loss": 0.2995, + "step": 3470 + }, + { + "epoch": 0.8260843696079014, + "grad_norm": 0.3924531439546138, + "learning_rate": 6.62296958150082e-06, + "loss": 0.3106, + "step": 3471 + }, + { + "epoch": 0.8263223656809663, + "grad_norm": 0.3793607220180146, + "learning_rate": 6.621146239490405e-06, + "loss": 0.3642, + "step": 3472 + }, + { + "epoch": 0.826560361754031, + "grad_norm": 0.3757765538144861, + "learning_rate": 6.619322656537552e-06, + "loss": 0.3751, + "step": 3473 + }, + { + "epoch": 0.8267983578270959, + "grad_norm": 0.379936104170262, + "learning_rate": 6.6174988329132935e-06, + "loss": 0.3125, + "step": 3474 + }, + { + "epoch": 0.8270363539001606, + "grad_norm": 0.36029253203578504, + "learning_rate": 6.615674768888693e-06, + "loss": 0.3439, + "step": 3475 + }, + { + "epoch": 0.8272743499732255, + "grad_norm": 0.400916363212535, + "learning_rate": 6.613850464734852e-06, + "loss": 0.3963, + "step": 3476 + }, + { + "epoch": 0.8275123460462902, + "grad_norm": 0.39903964241453344, + "learning_rate": 6.6120259207229074e-06, + "loss": 0.3212, + "step": 3477 + }, + { + "epoch": 0.827750342119355, + "grad_norm": 0.4140249986684947, + "learning_rate": 6.61020113712403e-06, + "loss": 0.3088, + "step": 3478 + }, + { + "epoch": 0.8279883381924198, + "grad_norm": 0.3755700342561937, + "learning_rate": 6.60837611420943e-06, + "loss": 0.3936, + "step": 3479 + }, + { + "epoch": 0.8282263342654846, + "grad_norm": 0.3669468151714262, + "learning_rate": 6.606550852250351e-06, + "loss": 0.3653, + "step": 3480 + }, + { + "epoch": 0.8284643303385494, + "grad_norm": 0.3630825299629299, + "learning_rate": 6.60472535151807e-06, + "loss": 0.2969, + "step": 3481 + }, + { + "epoch": 0.8287023264116142, + "grad_norm": 0.48144687843420203, + "learning_rate": 6.602899612283903e-06, + "loss": 0.3609, + "step": 3482 + }, + { + "epoch": 0.828940322484679, + "grad_norm": 0.3375193982835446, + "learning_rate": 6.6010736348192e-06, + "loss": 0.4184, + "step": 3483 + }, + { + "epoch": 0.8291783185577438, + "grad_norm": 0.40206568719269975, + "learning_rate": 6.599247419395346e-06, + "loss": 0.3326, + "step": 3484 + }, + { + "epoch": 0.8294163146308086, + "grad_norm": 0.35032011037790545, + "learning_rate": 6.597420966283762e-06, + "loss": 0.3043, + "step": 3485 + }, + { + "epoch": 0.8296543107038734, + "grad_norm": 0.36114199055488116, + "learning_rate": 6.595594275755905e-06, + "loss": 0.3566, + "step": 3486 + }, + { + "epoch": 0.8298923067769381, + "grad_norm": 0.40221894140390374, + "learning_rate": 6.593767348083264e-06, + "loss": 0.4178, + "step": 3487 + }, + { + "epoch": 0.830130302850003, + "grad_norm": 0.4219667697893653, + "learning_rate": 6.591940183537369e-06, + "loss": 0.301, + "step": 3488 + }, + { + "epoch": 0.8303682989230677, + "grad_norm": 0.4089831045897673, + "learning_rate": 6.590112782389779e-06, + "loss": 0.3545, + "step": 3489 + }, + { + "epoch": 0.8306062949961326, + "grad_norm": 0.3568585115767541, + "learning_rate": 6.588285144912092e-06, + "loss": 0.4073, + "step": 3490 + }, + { + "epoch": 0.8308442910691973, + "grad_norm": 0.39444918515542654, + "learning_rate": 6.58645727137594e-06, + "loss": 0.3751, + "step": 3491 + }, + { + "epoch": 0.8310822871422622, + "grad_norm": 0.40709353343588783, + "learning_rate": 6.584629162052991e-06, + "loss": 0.3463, + "step": 3492 + }, + { + "epoch": 0.8313202832153269, + "grad_norm": 0.3926505024184683, + "learning_rate": 6.582800817214947e-06, + "loss": 0.372, + "step": 3493 + }, + { + "epoch": 0.8315582792883918, + "grad_norm": 0.39799166748115355, + "learning_rate": 6.5809722371335425e-06, + "loss": 0.4018, + "step": 3494 + }, + { + "epoch": 0.8317962753614565, + "grad_norm": 0.38694896220045694, + "learning_rate": 6.579143422080555e-06, + "loss": 0.3137, + "step": 3495 + }, + { + "epoch": 0.8320342714345214, + "grad_norm": 0.3643527025654742, + "learning_rate": 6.577314372327788e-06, + "loss": 0.3131, + "step": 3496 + }, + { + "epoch": 0.8322722675075861, + "grad_norm": 0.3818835372248485, + "learning_rate": 6.575485088147085e-06, + "loss": 0.3707, + "step": 3497 + }, + { + "epoch": 0.8325102635806509, + "grad_norm": 0.40425205370807665, + "learning_rate": 6.57365556981032e-06, + "loss": 0.3499, + "step": 3498 + }, + { + "epoch": 0.8327482596537157, + "grad_norm": 0.4772831093586169, + "learning_rate": 6.571825817589409e-06, + "loss": 0.3433, + "step": 3499 + }, + { + "epoch": 0.8329862557267805, + "grad_norm": 0.37500312469382296, + "learning_rate": 6.569995831756296e-06, + "loss": 0.3384, + "step": 3500 + }, + { + "epoch": 0.8332242517998453, + "grad_norm": 0.3878483799536773, + "learning_rate": 6.568165612582963e-06, + "loss": 0.4041, + "step": 3501 + }, + { + "epoch": 0.8334622478729101, + "grad_norm": 0.39314546980554216, + "learning_rate": 6.566335160341425e-06, + "loss": 0.3057, + "step": 3502 + }, + { + "epoch": 0.8337002439459749, + "grad_norm": 0.39459765976708316, + "learning_rate": 6.564504475303732e-06, + "loss": 0.3598, + "step": 3503 + }, + { + "epoch": 0.8339382400190397, + "grad_norm": 0.37557187665586705, + "learning_rate": 6.562673557741972e-06, + "loss": 0.3976, + "step": 3504 + }, + { + "epoch": 0.8341762360921045, + "grad_norm": 0.3666761239770583, + "learning_rate": 6.560842407928261e-06, + "loss": 0.3936, + "step": 3505 + }, + { + "epoch": 0.8344142321651693, + "grad_norm": 0.43334042433611564, + "learning_rate": 6.559011026134755e-06, + "loss": 0.309, + "step": 3506 + }, + { + "epoch": 0.834652228238234, + "grad_norm": 0.3639925280228689, + "learning_rate": 6.557179412633643e-06, + "loss": 0.3225, + "step": 3507 + }, + { + "epoch": 0.8348902243112989, + "grad_norm": 0.368213342084047, + "learning_rate": 6.555347567697147e-06, + "loss": 0.4027, + "step": 3508 + }, + { + "epoch": 0.8351282203843636, + "grad_norm": 0.40280786786827166, + "learning_rate": 6.553515491597525e-06, + "loss": 0.2956, + "step": 3509 + }, + { + "epoch": 0.8353662164574285, + "grad_norm": 0.3901118662358053, + "learning_rate": 6.55168318460707e-06, + "loss": 0.3167, + "step": 3510 + }, + { + "epoch": 0.8356042125304932, + "grad_norm": 0.36744551369501394, + "learning_rate": 6.549850646998106e-06, + "loss": 0.3774, + "step": 3511 + }, + { + "epoch": 0.8358422086035581, + "grad_norm": 0.42446857731031934, + "learning_rate": 6.548017879042993e-06, + "loss": 0.3903, + "step": 3512 + }, + { + "epoch": 0.8360802046766228, + "grad_norm": 0.4358337216970375, + "learning_rate": 6.546184881014128e-06, + "loss": 0.316, + "step": 3513 + }, + { + "epoch": 0.8363182007496877, + "grad_norm": 0.37095075714460507, + "learning_rate": 6.54435165318394e-06, + "loss": 0.3294, + "step": 3514 + }, + { + "epoch": 0.8365561968227524, + "grad_norm": 0.3735370617178222, + "learning_rate": 6.54251819582489e-06, + "loss": 0.4038, + "step": 3515 + }, + { + "epoch": 0.8367941928958172, + "grad_norm": 0.3754617184639044, + "learning_rate": 6.5406845092094775e-06, + "loss": 0.3109, + "step": 3516 + }, + { + "epoch": 0.837032188968882, + "grad_norm": 0.4256438483351794, + "learning_rate": 6.5388505936102305e-06, + "loss": 0.3236, + "step": 3517 + }, + { + "epoch": 0.8372701850419468, + "grad_norm": 0.36228819168311865, + "learning_rate": 6.537016449299718e-06, + "loss": 0.3672, + "step": 3518 + }, + { + "epoch": 0.8375081811150116, + "grad_norm": 0.3567201288288663, + "learning_rate": 6.5351820765505345e-06, + "loss": 0.3876, + "step": 3519 + }, + { + "epoch": 0.8377461771880764, + "grad_norm": 0.36599561427777705, + "learning_rate": 6.533347475635316e-06, + "loss": 0.3043, + "step": 3520 + }, + { + "epoch": 0.8379841732611412, + "grad_norm": 0.4053564651011694, + "learning_rate": 6.531512646826731e-06, + "loss": 0.3253, + "step": 3521 + }, + { + "epoch": 0.838222169334206, + "grad_norm": 0.37600903870469027, + "learning_rate": 6.529677590397478e-06, + "loss": 0.4036, + "step": 3522 + }, + { + "epoch": 0.8384601654072708, + "grad_norm": 0.361124659195214, + "learning_rate": 6.527842306620294e-06, + "loss": 0.3696, + "step": 3523 + }, + { + "epoch": 0.8386981614803356, + "grad_norm": 0.3908512268752686, + "learning_rate": 6.5260067957679455e-06, + "loss": 0.2881, + "step": 3524 + }, + { + "epoch": 0.8389361575534003, + "grad_norm": 0.391451223833398, + "learning_rate": 6.524171058113236e-06, + "loss": 0.3734, + "step": 3525 + }, + { + "epoch": 0.8391741536264652, + "grad_norm": 0.34623716604454546, + "learning_rate": 6.522335093928999e-06, + "loss": 0.4447, + "step": 3526 + }, + { + "epoch": 0.8394121496995299, + "grad_norm": 0.35943956265557137, + "learning_rate": 6.520498903488108e-06, + "loss": 0.3042, + "step": 3527 + }, + { + "epoch": 0.8396501457725948, + "grad_norm": 0.39161434146649104, + "learning_rate": 6.518662487063464e-06, + "loss": 0.291, + "step": 3528 + }, + { + "epoch": 0.8398881418456595, + "grad_norm": 0.37421840622968505, + "learning_rate": 6.516825844928005e-06, + "loss": 0.3847, + "step": 3529 + }, + { + "epoch": 0.8401261379187244, + "grad_norm": 0.37611154296694504, + "learning_rate": 6.514988977354701e-06, + "loss": 0.3662, + "step": 3530 + }, + { + "epoch": 0.8403641339917891, + "grad_norm": 0.3647185189803361, + "learning_rate": 6.513151884616556e-06, + "loss": 0.3128, + "step": 3531 + }, + { + "epoch": 0.840602130064854, + "grad_norm": 0.3821277967922997, + "learning_rate": 6.511314566986608e-06, + "loss": 0.3503, + "step": 3532 + }, + { + "epoch": 0.8408401261379187, + "grad_norm": 0.35310735826505824, + "learning_rate": 6.5094770247379256e-06, + "loss": 0.3922, + "step": 3533 + }, + { + "epoch": 0.8410781222109835, + "grad_norm": 0.39211042496153803, + "learning_rate": 6.507639258143615e-06, + "loss": 0.3153, + "step": 3534 + }, + { + "epoch": 0.8413161182840483, + "grad_norm": 0.37374980390462503, + "learning_rate": 6.5058012674768136e-06, + "loss": 0.2926, + "step": 3535 + }, + { + "epoch": 0.8415541143571131, + "grad_norm": 0.3480206219661194, + "learning_rate": 6.5039630530106925e-06, + "loss": 0.3595, + "step": 3536 + }, + { + "epoch": 0.8417921104301779, + "grad_norm": 0.37436697934855795, + "learning_rate": 6.502124615018456e-06, + "loss": 0.3896, + "step": 3537 + }, + { + "epoch": 0.8420301065032427, + "grad_norm": 0.3783010027033652, + "learning_rate": 6.50028595377334e-06, + "loss": 0.3203, + "step": 3538 + }, + { + "epoch": 0.8422681025763075, + "grad_norm": 0.4307862323394335, + "learning_rate": 6.498447069548617e-06, + "loss": 0.3373, + "step": 3539 + }, + { + "epoch": 0.8425060986493723, + "grad_norm": 0.4059325152116869, + "learning_rate": 6.496607962617588e-06, + "loss": 0.3827, + "step": 3540 + }, + { + "epoch": 0.842744094722437, + "grad_norm": 0.4279651224236223, + "learning_rate": 6.494768633253593e-06, + "loss": 0.3371, + "step": 3541 + }, + { + "epoch": 0.8429820907955019, + "grad_norm": 0.3780915639053451, + "learning_rate": 6.492929081729999e-06, + "loss": 0.3081, + "step": 3542 + }, + { + "epoch": 0.8432200868685666, + "grad_norm": 0.42345332107805106, + "learning_rate": 6.491089308320212e-06, + "loss": 0.3686, + "step": 3543 + }, + { + "epoch": 0.8434580829416315, + "grad_norm": 0.3479273768259802, + "learning_rate": 6.489249313297665e-06, + "loss": 0.4429, + "step": 3544 + }, + { + "epoch": 0.8436960790146962, + "grad_norm": 0.41111841314637215, + "learning_rate": 6.487409096935828e-06, + "loss": 0.3142, + "step": 3545 + }, + { + "epoch": 0.8439340750877611, + "grad_norm": 0.41453492108831, + "learning_rate": 6.485568659508201e-06, + "loss": 0.3128, + "step": 3546 + }, + { + "epoch": 0.8441720711608258, + "grad_norm": 0.3412459944917207, + "learning_rate": 6.483728001288322e-06, + "loss": 0.3878, + "step": 3547 + }, + { + "epoch": 0.8444100672338907, + "grad_norm": 0.3580627139101501, + "learning_rate": 6.481887122549755e-06, + "loss": 0.3663, + "step": 3548 + }, + { + "epoch": 0.8446480633069554, + "grad_norm": 0.3881281457560659, + "learning_rate": 6.480046023566101e-06, + "loss": 0.3162, + "step": 3549 + }, + { + "epoch": 0.8448860593800203, + "grad_norm": 0.38842732635621374, + "learning_rate": 6.4782047046109956e-06, + "loss": 0.3745, + "step": 3550 + }, + { + "epoch": 0.845124055453085, + "grad_norm": 0.368464384136309, + "learning_rate": 6.476363165958101e-06, + "loss": 0.4105, + "step": 3551 + }, + { + "epoch": 0.8453620515261498, + "grad_norm": 0.42844621258294174, + "learning_rate": 6.474521407881116e-06, + "loss": 0.3035, + "step": 3552 + }, + { + "epoch": 0.8456000475992146, + "grad_norm": 0.37487158122933406, + "learning_rate": 6.472679430653771e-06, + "loss": 0.3028, + "step": 3553 + }, + { + "epoch": 0.8458380436722794, + "grad_norm": 0.3697192895630358, + "learning_rate": 6.470837234549831e-06, + "loss": 0.3821, + "step": 3554 + }, + { + "epoch": 0.8460760397453442, + "grad_norm": 0.39897676938087634, + "learning_rate": 6.468994819843093e-06, + "loss": 0.3787, + "step": 3555 + }, + { + "epoch": 0.846314035818409, + "grad_norm": 0.370440881250878, + "learning_rate": 6.467152186807381e-06, + "loss": 0.3008, + "step": 3556 + }, + { + "epoch": 0.8465520318914738, + "grad_norm": 0.36810682890311036, + "learning_rate": 6.4653093357165605e-06, + "loss": 0.3324, + "step": 3557 + }, + { + "epoch": 0.8467900279645386, + "grad_norm": 0.44038968365113795, + "learning_rate": 6.463466266844523e-06, + "loss": 0.4133, + "step": 3558 + }, + { + "epoch": 0.8470280240376034, + "grad_norm": 0.42146397355664883, + "learning_rate": 6.461622980465192e-06, + "loss": 0.3262, + "step": 3559 + }, + { + "epoch": 0.8472660201106682, + "grad_norm": 0.3892143606840479, + "learning_rate": 6.459779476852528e-06, + "loss": 0.2951, + "step": 3560 + }, + { + "epoch": 0.8475040161837329, + "grad_norm": 0.3606861577032436, + "learning_rate": 6.45793575628052e-06, + "loss": 0.3468, + "step": 3561 + }, + { + "epoch": 0.8477420122567978, + "grad_norm": 0.39838162217271655, + "learning_rate": 6.456091819023192e-06, + "loss": 0.3662, + "step": 3562 + }, + { + "epoch": 0.8479800083298625, + "grad_norm": 0.40545284210053256, + "learning_rate": 6.454247665354596e-06, + "loss": 0.3092, + "step": 3563 + }, + { + "epoch": 0.8482180044029274, + "grad_norm": 0.3745103557445221, + "learning_rate": 6.452403295548822e-06, + "loss": 0.3259, + "step": 3564 + }, + { + "epoch": 0.8484560004759921, + "grad_norm": 0.36452519630195096, + "learning_rate": 6.450558709879988e-06, + "loss": 0.3935, + "step": 3565 + }, + { + "epoch": 0.848693996549057, + "grad_norm": 0.3753670451371452, + "learning_rate": 6.448713908622244e-06, + "loss": 0.364, + "step": 3566 + }, + { + "epoch": 0.8489319926221217, + "grad_norm": 0.4415121020870584, + "learning_rate": 6.446868892049774e-06, + "loss": 0.2946, + "step": 3567 + }, + { + "epoch": 0.8491699886951866, + "grad_norm": 0.3586879855069214, + "learning_rate": 6.445023660436792e-06, + "loss": 0.3556, + "step": 3568 + }, + { + "epoch": 0.8494079847682513, + "grad_norm": 0.388777137976686, + "learning_rate": 6.443178214057546e-06, + "loss": 0.4276, + "step": 3569 + }, + { + "epoch": 0.8496459808413162, + "grad_norm": 0.37232395659576667, + "learning_rate": 6.441332553186317e-06, + "loss": 0.2855, + "step": 3570 + }, + { + "epoch": 0.8498839769143809, + "grad_norm": 0.36549499530200263, + "learning_rate": 6.439486678097412e-06, + "loss": 0.2883, + "step": 3571 + }, + { + "epoch": 0.8501219729874457, + "grad_norm": 0.4015119927722684, + "learning_rate": 6.437640589065175e-06, + "loss": 0.4052, + "step": 3572 + }, + { + "epoch": 0.8503599690605105, + "grad_norm": 0.39600433293408427, + "learning_rate": 6.435794286363981e-06, + "loss": 0.3315, + "step": 3573 + }, + { + "epoch": 0.8505979651335753, + "grad_norm": 0.34745554675127294, + "learning_rate": 6.4339477702682365e-06, + "loss": 0.3379, + "step": 3574 + }, + { + "epoch": 0.8508359612066401, + "grad_norm": 0.376990594604472, + "learning_rate": 6.4321010410523785e-06, + "loss": 0.3708, + "step": 3575 + }, + { + "epoch": 0.8510739572797049, + "grad_norm": 0.37580912173380343, + "learning_rate": 6.430254098990879e-06, + "loss": 0.3867, + "step": 3576 + }, + { + "epoch": 0.8513119533527697, + "grad_norm": 0.36492592391634565, + "learning_rate": 6.428406944358236e-06, + "loss": 0.3329, + "step": 3577 + }, + { + "epoch": 0.8515499494258345, + "grad_norm": 0.4561243150373411, + "learning_rate": 6.426559577428986e-06, + "loss": 0.3443, + "step": 3578 + }, + { + "epoch": 0.8517879454988992, + "grad_norm": 0.37361088859679636, + "learning_rate": 6.42471199847769e-06, + "loss": 0.3821, + "step": 3579 + }, + { + "epoch": 0.8520259415719641, + "grad_norm": 0.3858002858596615, + "learning_rate": 6.422864207778946e-06, + "loss": 0.3575, + "step": 3580 + }, + { + "epoch": 0.8522639376450288, + "grad_norm": 0.591361749000545, + "learning_rate": 6.42101620560738e-06, + "loss": 0.2932, + "step": 3581 + }, + { + "epoch": 0.8525019337180937, + "grad_norm": 0.3862018455711387, + "learning_rate": 6.4191679922376514e-06, + "loss": 0.332, + "step": 3582 + }, + { + "epoch": 0.8527399297911584, + "grad_norm": 0.3477194194120392, + "learning_rate": 6.417319567944451e-06, + "loss": 0.4039, + "step": 3583 + }, + { + "epoch": 0.8529779258642233, + "grad_norm": 0.3867671638518174, + "learning_rate": 6.4154709330025014e-06, + "loss": 0.3283, + "step": 3584 + }, + { + "epoch": 0.853215921937288, + "grad_norm": 0.3789296076013951, + "learning_rate": 6.413622087686553e-06, + "loss": 0.3197, + "step": 3585 + }, + { + "epoch": 0.8534539180103529, + "grad_norm": 0.35204003022469893, + "learning_rate": 6.411773032271391e-06, + "loss": 0.3575, + "step": 3586 + }, + { + "epoch": 0.8536919140834176, + "grad_norm": 0.4135524819567001, + "learning_rate": 6.4099237670318295e-06, + "loss": 0.3739, + "step": 3587 + }, + { + "epoch": 0.8539299101564825, + "grad_norm": 0.36478421612897816, + "learning_rate": 6.408074292242719e-06, + "loss": 0.314, + "step": 3588 + }, + { + "epoch": 0.8541679062295472, + "grad_norm": 0.3991900857385473, + "learning_rate": 6.4062246081789316e-06, + "loss": 0.3417, + "step": 3589 + }, + { + "epoch": 0.854405902302612, + "grad_norm": 0.35893254692737425, + "learning_rate": 6.40437471511538e-06, + "loss": 0.3911, + "step": 3590 + }, + { + "epoch": 0.8546438983756768, + "grad_norm": 0.35453540240239667, + "learning_rate": 6.402524613327005e-06, + "loss": 0.3469, + "step": 3591 + }, + { + "epoch": 0.8548818944487416, + "grad_norm": 0.43997392067335855, + "learning_rate": 6.400674303088774e-06, + "loss": 0.2904, + "step": 3592 + }, + { + "epoch": 0.8551198905218064, + "grad_norm": 0.3932703368817032, + "learning_rate": 6.398823784675692e-06, + "loss": 0.3453, + "step": 3593 + }, + { + "epoch": 0.8553578865948712, + "grad_norm": 0.38331111856702316, + "learning_rate": 6.3969730583627895e-06, + "loss": 0.4321, + "step": 3594 + }, + { + "epoch": 0.855595882667936, + "grad_norm": 0.36082426472750595, + "learning_rate": 6.39512212442513e-06, + "loss": 0.303, + "step": 3595 + }, + { + "epoch": 0.8558338787410008, + "grad_norm": 0.36640398807283237, + "learning_rate": 6.39327098313781e-06, + "loss": 0.3424, + "step": 3596 + }, + { + "epoch": 0.8560718748140655, + "grad_norm": 0.39345559455136353, + "learning_rate": 6.391419634775955e-06, + "loss": 0.3943, + "step": 3597 + }, + { + "epoch": 0.8563098708871304, + "grad_norm": 0.38176260350590113, + "learning_rate": 6.3895680796147195e-06, + "loss": 0.3916, + "step": 3598 + }, + { + "epoch": 0.8565478669601951, + "grad_norm": 0.3701812101754324, + "learning_rate": 6.387716317929291e-06, + "loss": 0.3091, + "step": 3599 + }, + { + "epoch": 0.85678586303326, + "grad_norm": 0.37515785937868595, + "learning_rate": 6.385864349994887e-06, + "loss": 0.3352, + "step": 3600 + }, + { + "epoch": 0.8570238591063247, + "grad_norm": 0.3811040883533929, + "learning_rate": 6.384012176086756e-06, + "loss": 0.4348, + "step": 3601 + }, + { + "epoch": 0.8572618551793896, + "grad_norm": 0.38277086910509656, + "learning_rate": 6.382159796480176e-06, + "loss": 0.3105, + "step": 3602 + }, + { + "epoch": 0.8574998512524543, + "grad_norm": 0.4049398160651801, + "learning_rate": 6.380307211450459e-06, + "loss": 0.2908, + "step": 3603 + }, + { + "epoch": 0.8577378473255192, + "grad_norm": 0.37259835586706896, + "learning_rate": 6.3784544212729425e-06, + "loss": 0.3591, + "step": 3604 + }, + { + "epoch": 0.8579758433985839, + "grad_norm": 0.39620623445763115, + "learning_rate": 6.376601426222998e-06, + "loss": 0.417, + "step": 3605 + }, + { + "epoch": 0.8582138394716488, + "grad_norm": 0.38565150714963886, + "learning_rate": 6.374748226576026e-06, + "loss": 0.3028, + "step": 3606 + }, + { + "epoch": 0.8584518355447135, + "grad_norm": 0.38396283701575223, + "learning_rate": 6.372894822607459e-06, + "loss": 0.3637, + "step": 3607 + }, + { + "epoch": 0.8586898316177783, + "grad_norm": 0.36500484893052054, + "learning_rate": 6.371041214592756e-06, + "loss": 0.3775, + "step": 3608 + }, + { + "epoch": 0.8589278276908431, + "grad_norm": 0.3661904393716665, + "learning_rate": 6.369187402807409e-06, + "loss": 0.3382, + "step": 3609 + }, + { + "epoch": 0.8591658237639079, + "grad_norm": 0.3635402605710434, + "learning_rate": 6.3673333875269435e-06, + "loss": 0.3263, + "step": 3610 + }, + { + "epoch": 0.8594038198369727, + "grad_norm": 0.34166628776145125, + "learning_rate": 6.3654791690269115e-06, + "loss": 0.3689, + "step": 3611 + }, + { + "epoch": 0.8596418159100375, + "grad_norm": 0.40702525148527946, + "learning_rate": 6.363624747582895e-06, + "loss": 0.3635, + "step": 3612 + }, + { + "epoch": 0.8598798119831023, + "grad_norm": 0.3788739157966428, + "learning_rate": 6.361770123470506e-06, + "loss": 0.3199, + "step": 3613 + }, + { + "epoch": 0.8601178080561671, + "grad_norm": 0.4010995049473161, + "learning_rate": 6.359915296965386e-06, + "loss": 0.3378, + "step": 3614 + }, + { + "epoch": 0.8603558041292318, + "grad_norm": 0.3976190901244627, + "learning_rate": 6.3580602683432114e-06, + "loss": 0.3947, + "step": 3615 + }, + { + "epoch": 0.8605938002022967, + "grad_norm": 0.41040643943188415, + "learning_rate": 6.356205037879683e-06, + "loss": 0.353, + "step": 3616 + }, + { + "epoch": 0.8608317962753614, + "grad_norm": 0.40129402029771805, + "learning_rate": 6.354349605850537e-06, + "loss": 0.3115, + "step": 3617 + }, + { + "epoch": 0.8610697923484263, + "grad_norm": 0.37800238180174833, + "learning_rate": 6.352493972531535e-06, + "loss": 0.3461, + "step": 3618 + }, + { + "epoch": 0.861307788421491, + "grad_norm": 0.35601066827900923, + "learning_rate": 6.350638138198468e-06, + "loss": 0.3927, + "step": 3619 + }, + { + "epoch": 0.8615457844945559, + "grad_norm": 0.4294833642760016, + "learning_rate": 6.348782103127161e-06, + "loss": 0.3186, + "step": 3620 + }, + { + "epoch": 0.8617837805676206, + "grad_norm": 0.39885340739339903, + "learning_rate": 6.346925867593468e-06, + "loss": 0.3481, + "step": 3621 + }, + { + "epoch": 0.8620217766406855, + "grad_norm": 0.381073935397439, + "learning_rate": 6.345069431873267e-06, + "loss": 0.3632, + "step": 3622 + }, + { + "epoch": 0.8622597727137502, + "grad_norm": 0.3716616071286771, + "learning_rate": 6.3432127962424724e-06, + "loss": 0.3735, + "step": 3623 + }, + { + "epoch": 0.8624977687868151, + "grad_norm": 0.3936052335005196, + "learning_rate": 6.341355960977029e-06, + "loss": 0.3002, + "step": 3624 + }, + { + "epoch": 0.8627357648598798, + "grad_norm": 0.35707110608872256, + "learning_rate": 6.3394989263529075e-06, + "loss": 0.3286, + "step": 3625 + }, + { + "epoch": 0.8629737609329446, + "grad_norm": 0.37387621795612574, + "learning_rate": 6.337641692646106e-06, + "loss": 0.4234, + "step": 3626 + }, + { + "epoch": 0.8632117570060094, + "grad_norm": 0.36608540038166837, + "learning_rate": 6.335784260132656e-06, + "loss": 0.3326, + "step": 3627 + }, + { + "epoch": 0.8634497530790742, + "grad_norm": 0.40136651207935303, + "learning_rate": 6.33392662908862e-06, + "loss": 0.3229, + "step": 3628 + }, + { + "epoch": 0.863687749152139, + "grad_norm": 0.38445657475293393, + "learning_rate": 6.332068799790088e-06, + "loss": 0.3608, + "step": 3629 + }, + { + "epoch": 0.8639257452252038, + "grad_norm": 0.4134034698432213, + "learning_rate": 6.330210772513179e-06, + "loss": 0.3888, + "step": 3630 + }, + { + "epoch": 0.8641637412982686, + "grad_norm": 0.3403794858258773, + "learning_rate": 6.32835254753404e-06, + "loss": 0.3299, + "step": 3631 + }, + { + "epoch": 0.8644017373713334, + "grad_norm": 0.400740147025239, + "learning_rate": 6.3264941251288524e-06, + "loss": 0.3519, + "step": 3632 + }, + { + "epoch": 0.8646397334443982, + "grad_norm": 0.3696283238042445, + "learning_rate": 6.324635505573821e-06, + "loss": 0.4068, + "step": 3633 + }, + { + "epoch": 0.864877729517463, + "grad_norm": 0.3648815050871534, + "learning_rate": 6.3227766891451834e-06, + "loss": 0.3229, + "step": 3634 + }, + { + "epoch": 0.8651157255905277, + "grad_norm": 0.3684339728207634, + "learning_rate": 6.3209176761192056e-06, + "loss": 0.2937, + "step": 3635 + }, + { + "epoch": 0.8653537216635926, + "grad_norm": 0.3412830815800425, + "learning_rate": 6.319058466772183e-06, + "loss": 0.3547, + "step": 3636 + }, + { + "epoch": 0.8655917177366573, + "grad_norm": 0.4151380109178161, + "learning_rate": 6.317199061380442e-06, + "loss": 0.4081, + "step": 3637 + }, + { + "epoch": 0.8658297138097222, + "grad_norm": 0.3807487486762807, + "learning_rate": 6.3153394602203335e-06, + "loss": 0.2962, + "step": 3638 + }, + { + "epoch": 0.8660677098827869, + "grad_norm": 0.3818685065331829, + "learning_rate": 6.313479663568241e-06, + "loss": 0.3313, + "step": 3639 + }, + { + "epoch": 0.8663057059558518, + "grad_norm": 0.39854588070720026, + "learning_rate": 6.311619671700577e-06, + "loss": 0.3949, + "step": 3640 + }, + { + "epoch": 0.8665437020289165, + "grad_norm": 0.39040880832128244, + "learning_rate": 6.309759484893781e-06, + "loss": 0.3463, + "step": 3641 + }, + { + "epoch": 0.8667816981019814, + "grad_norm": 0.3808134094910771, + "learning_rate": 6.3078991034243246e-06, + "loss": 0.3069, + "step": 3642 + }, + { + "epoch": 0.8670196941750461, + "grad_norm": 0.42611867594264174, + "learning_rate": 6.306038527568703e-06, + "loss": 0.3499, + "step": 3643 + }, + { + "epoch": 0.867257690248111, + "grad_norm": 0.38268539446297695, + "learning_rate": 6.304177757603449e-06, + "loss": 0.381, + "step": 3644 + }, + { + "epoch": 0.8674956863211757, + "grad_norm": 0.39139862991744423, + "learning_rate": 6.302316793805117e-06, + "loss": 0.311, + "step": 3645 + }, + { + "epoch": 0.8677336823942405, + "grad_norm": 0.34134245528800944, + "learning_rate": 6.300455636450291e-06, + "loss": 0.3326, + "step": 3646 + }, + { + "epoch": 0.8679716784673053, + "grad_norm": 0.3603534975781144, + "learning_rate": 6.298594285815585e-06, + "loss": 0.3793, + "step": 3647 + }, + { + "epoch": 0.8682096745403701, + "grad_norm": 0.3614007549196889, + "learning_rate": 6.296732742177644e-06, + "loss": 0.3392, + "step": 3648 + }, + { + "epoch": 0.8684476706134349, + "grad_norm": 0.4195748954343944, + "learning_rate": 6.294871005813137e-06, + "loss": 0.3134, + "step": 3649 + }, + { + "epoch": 0.8686856666864997, + "grad_norm": 0.39119705289423146, + "learning_rate": 6.293009076998763e-06, + "loss": 0.3361, + "step": 3650 + }, + { + "epoch": 0.8689236627595645, + "grad_norm": 0.4029024677261677, + "learning_rate": 6.291146956011255e-06, + "loss": 0.3868, + "step": 3651 + }, + { + "epoch": 0.8691616588326293, + "grad_norm": 0.3811799244974477, + "learning_rate": 6.289284643127367e-06, + "loss": 0.3218, + "step": 3652 + }, + { + "epoch": 0.869399654905694, + "grad_norm": 0.4261716940666211, + "learning_rate": 6.287422138623886e-06, + "loss": 0.3203, + "step": 3653 + }, + { + "epoch": 0.8696376509787589, + "grad_norm": 0.39445236227113656, + "learning_rate": 6.285559442777624e-06, + "loss": 0.4, + "step": 3654 + }, + { + "epoch": 0.8698756470518236, + "grad_norm": 0.4319574272153776, + "learning_rate": 6.283696555865429e-06, + "loss": 0.3435, + "step": 3655 + }, + { + "epoch": 0.8701136431248885, + "grad_norm": 0.4546433169970997, + "learning_rate": 6.281833478164168e-06, + "loss": 0.3051, + "step": 3656 + }, + { + "epoch": 0.8703516391979532, + "grad_norm": 0.38154852645595494, + "learning_rate": 6.279970209950738e-06, + "loss": 0.3498, + "step": 3657 + }, + { + "epoch": 0.8705896352710181, + "grad_norm": 0.3548764531740995, + "learning_rate": 6.278106751502073e-06, + "loss": 0.4182, + "step": 3658 + }, + { + "epoch": 0.8708276313440828, + "grad_norm": 0.3759251982640629, + "learning_rate": 6.2762431030951255e-06, + "loss": 0.3106, + "step": 3659 + }, + { + "epoch": 0.8710656274171477, + "grad_norm": 0.3613232709245374, + "learning_rate": 6.274379265006879e-06, + "loss": 0.2905, + "step": 3660 + }, + { + "epoch": 0.8713036234902124, + "grad_norm": 0.3751051295735669, + "learning_rate": 6.272515237514349e-06, + "loss": 0.358, + "step": 3661 + }, + { + "epoch": 0.8715416195632772, + "grad_norm": 0.36121575959400304, + "learning_rate": 6.270651020894572e-06, + "loss": 0.4264, + "step": 3662 + }, + { + "epoch": 0.871779615636342, + "grad_norm": 0.3780337870363989, + "learning_rate": 6.2687866154246204e-06, + "loss": 0.3072, + "step": 3663 + }, + { + "epoch": 0.8720176117094068, + "grad_norm": 0.4035208753610946, + "learning_rate": 6.266922021381588e-06, + "loss": 0.3603, + "step": 3664 + }, + { + "epoch": 0.8722556077824716, + "grad_norm": 0.3977305240744663, + "learning_rate": 6.265057239042602e-06, + "loss": 0.4096, + "step": 3665 + }, + { + "epoch": 0.8724936038555364, + "grad_norm": 0.3826214538741579, + "learning_rate": 6.263192268684814e-06, + "loss": 0.3427, + "step": 3666 + }, + { + "epoch": 0.8727315999286012, + "grad_norm": 0.40584625486221326, + "learning_rate": 6.2613271105854065e-06, + "loss": 0.3293, + "step": 3667 + }, + { + "epoch": 0.872969596001666, + "grad_norm": 0.6329608207460291, + "learning_rate": 6.259461765021584e-06, + "loss": 0.3824, + "step": 3668 + }, + { + "epoch": 0.8732075920747308, + "grad_norm": 0.3898204136913921, + "learning_rate": 6.257596232270587e-06, + "loss": 0.411, + "step": 3669 + }, + { + "epoch": 0.8734455881477956, + "grad_norm": 0.36641905581111767, + "learning_rate": 6.255730512609679e-06, + "loss": 0.3169, + "step": 3670 + }, + { + "epoch": 0.8736835842208603, + "grad_norm": 0.4829470116833935, + "learning_rate": 6.25386460631615e-06, + "loss": 0.3299, + "step": 3671 + }, + { + "epoch": 0.8739215802939252, + "grad_norm": 0.3562617699548361, + "learning_rate": 6.2519985136673235e-06, + "loss": 0.3818, + "step": 3672 + }, + { + "epoch": 0.8741595763669899, + "grad_norm": 0.3887628157005426, + "learning_rate": 6.250132234940543e-06, + "loss": 0.3696, + "step": 3673 + }, + { + "epoch": 0.8743975724400548, + "grad_norm": 0.39340605103932813, + "learning_rate": 6.248265770413187e-06, + "loss": 0.3048, + "step": 3674 + }, + { + "epoch": 0.8746355685131195, + "grad_norm": 0.3573200120544181, + "learning_rate": 6.2463991203626565e-06, + "loss": 0.3333, + "step": 3675 + }, + { + "epoch": 0.8748735645861844, + "grad_norm": 0.3773239240601227, + "learning_rate": 6.244532285066382e-06, + "loss": 0.4064, + "step": 3676 + }, + { + "epoch": 0.8751115606592491, + "grad_norm": 0.41082700756267004, + "learning_rate": 6.2426652648018215e-06, + "loss": 0.3286, + "step": 3677 + }, + { + "epoch": 0.875349556732314, + "grad_norm": 0.38054049215364216, + "learning_rate": 6.2407980598464615e-06, + "loss": 0.2894, + "step": 3678 + }, + { + "epoch": 0.8755875528053787, + "grad_norm": 0.3678209031947387, + "learning_rate": 6.238930670477813e-06, + "loss": 0.3526, + "step": 3679 + }, + { + "epoch": 0.8758255488784435, + "grad_norm": 0.40127333894221406, + "learning_rate": 6.237063096973418e-06, + "loss": 0.3743, + "step": 3680 + }, + { + "epoch": 0.8760635449515083, + "grad_norm": 0.3582362616157782, + "learning_rate": 6.235195339610842e-06, + "loss": 0.3023, + "step": 3681 + }, + { + "epoch": 0.8763015410245731, + "grad_norm": 0.44829796994435595, + "learning_rate": 6.233327398667682e-06, + "loss": 0.3699, + "step": 3682 + }, + { + "epoch": 0.8765395370976379, + "grad_norm": 0.3966518210291202, + "learning_rate": 6.2314592744215605e-06, + "loss": 0.4058, + "step": 3683 + }, + { + "epoch": 0.8767775331707027, + "grad_norm": 0.390658057800168, + "learning_rate": 6.229590967150124e-06, + "loss": 0.3136, + "step": 3684 + }, + { + "epoch": 0.8770155292437675, + "grad_norm": 0.4308954670330036, + "learning_rate": 6.227722477131053e-06, + "loss": 0.2851, + "step": 3685 + }, + { + "epoch": 0.8772535253168323, + "grad_norm": 0.442296326588899, + "learning_rate": 6.225853804642048e-06, + "loss": 0.3569, + "step": 3686 + }, + { + "epoch": 0.8774915213898971, + "grad_norm": 0.36301211552233426, + "learning_rate": 6.223984949960843e-06, + "loss": 0.3704, + "step": 3687 + }, + { + "epoch": 0.8777295174629619, + "grad_norm": 0.39720988569190824, + "learning_rate": 6.2221159133651946e-06, + "loss": 0.301, + "step": 3688 + }, + { + "epoch": 0.8779675135360266, + "grad_norm": 0.3772685192299192, + "learning_rate": 6.220246695132887e-06, + "loss": 0.3498, + "step": 3689 + }, + { + "epoch": 0.8782055096090915, + "grad_norm": 0.35844628342397195, + "learning_rate": 6.218377295541733e-06, + "loss": 0.3919, + "step": 3690 + }, + { + "epoch": 0.8784435056821562, + "grad_norm": 0.39072910342836975, + "learning_rate": 6.21650771486957e-06, + "loss": 0.3412, + "step": 3691 + }, + { + "epoch": 0.8786815017552211, + "grad_norm": 0.3934761558248729, + "learning_rate": 6.214637953394268e-06, + "loss": 0.3195, + "step": 3692 + }, + { + "epoch": 0.8789194978282858, + "grad_norm": 0.41802350381203424, + "learning_rate": 6.212768011393717e-06, + "loss": 0.3613, + "step": 3693 + }, + { + "epoch": 0.8791574939013507, + "grad_norm": 0.3673222836006008, + "learning_rate": 6.2108978891458374e-06, + "loss": 0.4147, + "step": 3694 + }, + { + "epoch": 0.8793954899744154, + "grad_norm": 0.3792040026583229, + "learning_rate": 6.2090275869285735e-06, + "loss": 0.3214, + "step": 3695 + }, + { + "epoch": 0.8796334860474803, + "grad_norm": 0.39223693432096834, + "learning_rate": 6.207157105019902e-06, + "loss": 0.3345, + "step": 3696 + }, + { + "epoch": 0.879871482120545, + "grad_norm": 0.36686866689029535, + "learning_rate": 6.205286443697821e-06, + "loss": 0.4232, + "step": 3697 + }, + { + "epoch": 0.8801094781936099, + "grad_norm": 0.3987162836648213, + "learning_rate": 6.2034156032403555e-06, + "loss": 0.3754, + "step": 3698 + }, + { + "epoch": 0.8803474742666746, + "grad_norm": 0.4458906881177551, + "learning_rate": 6.201544583925562e-06, + "loss": 0.3313, + "step": 3699 + }, + { + "epoch": 0.8805854703397394, + "grad_norm": 0.3774384264045408, + "learning_rate": 6.199673386031518e-06, + "loss": 0.3442, + "step": 3700 + }, + { + "epoch": 0.8808234664128042, + "grad_norm": 0.34013768584119114, + "learning_rate": 6.197802009836331e-06, + "loss": 0.4061, + "step": 3701 + }, + { + "epoch": 0.881061462485869, + "grad_norm": 0.37237424732062263, + "learning_rate": 6.195930455618132e-06, + "loss": 0.333, + "step": 3702 + }, + { + "epoch": 0.8812994585589338, + "grad_norm": 0.3960116351456216, + "learning_rate": 6.194058723655083e-06, + "loss": 0.3069, + "step": 3703 + }, + { + "epoch": 0.8815374546319986, + "grad_norm": 0.39926411331947254, + "learning_rate": 6.192186814225367e-06, + "loss": 0.3738, + "step": 3704 + }, + { + "epoch": 0.8817754507050634, + "grad_norm": 0.44748846445351975, + "learning_rate": 6.190314727607196e-06, + "loss": 0.3749, + "step": 3705 + }, + { + "epoch": 0.8820134467781282, + "grad_norm": 0.41088375205239147, + "learning_rate": 6.188442464078811e-06, + "loss": 0.29, + "step": 3706 + }, + { + "epoch": 0.882251442851193, + "grad_norm": 0.36272393711398615, + "learning_rate": 6.1865700239184755e-06, + "loss": 0.337, + "step": 3707 + }, + { + "epoch": 0.8824894389242578, + "grad_norm": 0.38722534388780844, + "learning_rate": 6.184697407404478e-06, + "loss": 0.4223, + "step": 3708 + }, + { + "epoch": 0.8827274349973225, + "grad_norm": 0.4042014513980286, + "learning_rate": 6.18282461481514e-06, + "loss": 0.3241, + "step": 3709 + }, + { + "epoch": 0.8829654310703874, + "grad_norm": 0.4002874055607258, + "learning_rate": 6.180951646428801e-06, + "loss": 0.3142, + "step": 3710 + }, + { + "epoch": 0.8832034271434521, + "grad_norm": 0.39939392997960865, + "learning_rate": 6.179078502523834e-06, + "loss": 0.3774, + "step": 3711 + }, + { + "epoch": 0.883441423216517, + "grad_norm": 0.36300470644879695, + "learning_rate": 6.177205183378629e-06, + "loss": 0.4179, + "step": 3712 + }, + { + "epoch": 0.8836794192895817, + "grad_norm": 0.36795015375713797, + "learning_rate": 6.1753316892716156e-06, + "loss": 0.3032, + "step": 3713 + }, + { + "epoch": 0.8839174153626466, + "grad_norm": 0.4120733964613225, + "learning_rate": 6.173458020481234e-06, + "loss": 0.3473, + "step": 3714 + }, + { + "epoch": 0.8841554114357113, + "grad_norm": 0.38507546779235274, + "learning_rate": 6.171584177285962e-06, + "loss": 0.4031, + "step": 3715 + }, + { + "epoch": 0.8843934075087762, + "grad_norm": 0.37436623493188265, + "learning_rate": 6.1697101599642976e-06, + "loss": 0.3287, + "step": 3716 + }, + { + "epoch": 0.8846314035818409, + "grad_norm": 0.37601841360225746, + "learning_rate": 6.167835968794766e-06, + "loss": 0.2813, + "step": 3717 + }, + { + "epoch": 0.8848693996549057, + "grad_norm": 0.36926230908935936, + "learning_rate": 6.165961604055917e-06, + "loss": 0.365, + "step": 3718 + }, + { + "epoch": 0.8851073957279705, + "grad_norm": 0.36487906680665755, + "learning_rate": 6.1640870660263295e-06, + "loss": 0.3984, + "step": 3719 + }, + { + "epoch": 0.8853453918010353, + "grad_norm": 0.37472610973420145, + "learning_rate": 6.162212354984607e-06, + "loss": 0.3098, + "step": 3720 + }, + { + "epoch": 0.8855833878741001, + "grad_norm": 0.37234322290198935, + "learning_rate": 6.160337471209377e-06, + "loss": 0.3439, + "step": 3721 + }, + { + "epoch": 0.8858213839471649, + "grad_norm": 0.3877763209468878, + "learning_rate": 6.158462414979292e-06, + "loss": 0.3869, + "step": 3722 + }, + { + "epoch": 0.8860593800202297, + "grad_norm": 0.4034026778107165, + "learning_rate": 6.156587186573033e-06, + "loss": 0.3543, + "step": 3723 + }, + { + "epoch": 0.8862973760932945, + "grad_norm": 0.3700540316755882, + "learning_rate": 6.154711786269307e-06, + "loss": 0.3153, + "step": 3724 + }, + { + "epoch": 0.8865353721663592, + "grad_norm": 0.39243565604036756, + "learning_rate": 6.152836214346843e-06, + "loss": 0.3664, + "step": 3725 + }, + { + "epoch": 0.8867733682394241, + "grad_norm": 0.36077090161516945, + "learning_rate": 6.150960471084397e-06, + "loss": 0.4108, + "step": 3726 + }, + { + "epoch": 0.8870113643124888, + "grad_norm": 0.37912503567699524, + "learning_rate": 6.149084556760753e-06, + "loss": 0.3269, + "step": 3727 + }, + { + "epoch": 0.8872493603855537, + "grad_norm": 0.4517235850955721, + "learning_rate": 6.147208471654715e-06, + "loss": 0.3127, + "step": 3728 + }, + { + "epoch": 0.8874873564586184, + "grad_norm": 0.387038812957885, + "learning_rate": 6.145332216045119e-06, + "loss": 0.3647, + "step": 3729 + }, + { + "epoch": 0.8877253525316833, + "grad_norm": 0.3779239208107375, + "learning_rate": 6.143455790210822e-06, + "loss": 0.4041, + "step": 3730 + }, + { + "epoch": 0.887963348604748, + "grad_norm": 0.3732475038469887, + "learning_rate": 6.1415791944307056e-06, + "loss": 0.3049, + "step": 3731 + }, + { + "epoch": 0.8882013446778129, + "grad_norm": 0.4145109701498706, + "learning_rate": 6.13970242898368e-06, + "loss": 0.3649, + "step": 3732 + }, + { + "epoch": 0.8884393407508776, + "grad_norm": 0.3649394561399619, + "learning_rate": 6.137825494148678e-06, + "loss": 0.4286, + "step": 3733 + }, + { + "epoch": 0.8886773368239425, + "grad_norm": 0.35279202905563867, + "learning_rate": 6.1359483902046605e-06, + "loss": 0.3256, + "step": 3734 + }, + { + "epoch": 0.8889153328970072, + "grad_norm": 0.3814399015745296, + "learning_rate": 6.134071117430609e-06, + "loss": 0.3111, + "step": 3735 + }, + { + "epoch": 0.889153328970072, + "grad_norm": 0.387544011842372, + "learning_rate": 6.132193676105533e-06, + "loss": 0.3563, + "step": 3736 + }, + { + "epoch": 0.8893913250431368, + "grad_norm": 0.357184072343786, + "learning_rate": 6.1303160665084705e-06, + "loss": 0.3986, + "step": 3737 + }, + { + "epoch": 0.8896293211162016, + "grad_norm": 0.378840936122068, + "learning_rate": 6.1284382889184756e-06, + "loss": 0.3211, + "step": 3738 + }, + { + "epoch": 0.8898673171892664, + "grad_norm": 0.36510963419962517, + "learning_rate": 6.126560343614636e-06, + "loss": 0.327, + "step": 3739 + }, + { + "epoch": 0.8901053132623312, + "grad_norm": 0.359174171499247, + "learning_rate": 6.1246822308760575e-06, + "loss": 0.4083, + "step": 3740 + }, + { + "epoch": 0.890343309335396, + "grad_norm": 0.45021032420547086, + "learning_rate": 6.122803950981878e-06, + "loss": 0.3493, + "step": 3741 + }, + { + "epoch": 0.8905813054084608, + "grad_norm": 0.35860397415514117, + "learning_rate": 6.1209255042112546e-06, + "loss": 0.2942, + "step": 3742 + }, + { + "epoch": 0.8908193014815255, + "grad_norm": 0.36924714501202766, + "learning_rate": 6.119046890843371e-06, + "loss": 0.3465, + "step": 3743 + }, + { + "epoch": 0.8910572975545904, + "grad_norm": 0.3562121440352722, + "learning_rate": 6.117168111157435e-06, + "loss": 0.4262, + "step": 3744 + }, + { + "epoch": 0.8912952936276551, + "grad_norm": 0.35320499238862774, + "learning_rate": 6.115289165432681e-06, + "loss": 0.3027, + "step": 3745 + }, + { + "epoch": 0.89153328970072, + "grad_norm": 0.3759235321559563, + "learning_rate": 6.113410053948364e-06, + "loss": 0.3309, + "step": 3746 + }, + { + "epoch": 0.8917712857737847, + "grad_norm": 0.36596971834289177, + "learning_rate": 6.111530776983771e-06, + "loss": 0.3858, + "step": 3747 + }, + { + "epoch": 0.8920092818468496, + "grad_norm": 0.3707386283118514, + "learning_rate": 6.109651334818204e-06, + "loss": 0.3554, + "step": 3748 + }, + { + "epoch": 0.8922472779199143, + "grad_norm": 0.34061647689147134, + "learning_rate": 6.1077717277309986e-06, + "loss": 0.2951, + "step": 3749 + }, + { + "epoch": 0.8924852739929792, + "grad_norm": 0.36573408045045774, + "learning_rate": 6.1058919560015106e-06, + "loss": 0.3549, + "step": 3750 + }, + { + "epoch": 0.8927232700660439, + "grad_norm": 0.3364732547422088, + "learning_rate": 6.104012019909119e-06, + "loss": 0.4163, + "step": 3751 + }, + { + "epoch": 0.8929612661391088, + "grad_norm": 0.37430490741861316, + "learning_rate": 6.102131919733229e-06, + "loss": 0.3171, + "step": 3752 + }, + { + "epoch": 0.8931992622121735, + "grad_norm": 0.3694758086583808, + "learning_rate": 6.1002516557532684e-06, + "loss": 0.3298, + "step": 3753 + }, + { + "epoch": 0.8934372582852383, + "grad_norm": 0.34593061138576553, + "learning_rate": 6.098371228248695e-06, + "loss": 0.373, + "step": 3754 + }, + { + "epoch": 0.8936752543583031, + "grad_norm": 0.37157546314792966, + "learning_rate": 6.096490637498985e-06, + "loss": 0.3566, + "step": 3755 + }, + { + "epoch": 0.8939132504313679, + "grad_norm": 0.4080002638011446, + "learning_rate": 6.09460988378364e-06, + "loss": 0.2772, + "step": 3756 + }, + { + "epoch": 0.8941512465044327, + "grad_norm": 0.39409190673689626, + "learning_rate": 6.092728967382186e-06, + "loss": 0.3462, + "step": 3757 + }, + { + "epoch": 0.8943892425774975, + "grad_norm": 0.3707276172179733, + "learning_rate": 6.090847888574176e-06, + "loss": 0.3948, + "step": 3758 + }, + { + "epoch": 0.8946272386505623, + "grad_norm": 0.3609993458216574, + "learning_rate": 6.088966647639183e-06, + "loss": 0.3002, + "step": 3759 + }, + { + "epoch": 0.8948652347236271, + "grad_norm": 0.3955638153510142, + "learning_rate": 6.087085244856805e-06, + "loss": 0.3046, + "step": 3760 + }, + { + "epoch": 0.8951032307966919, + "grad_norm": 0.3904254667786321, + "learning_rate": 6.0852036805066684e-06, + "loss": 0.3435, + "step": 3761 + }, + { + "epoch": 0.8953412268697567, + "grad_norm": 0.3708619740535345, + "learning_rate": 6.083321954868416e-06, + "loss": 0.3902, + "step": 3762 + }, + { + "epoch": 0.8955792229428214, + "grad_norm": 0.3966041731350864, + "learning_rate": 6.0814400682217236e-06, + "loss": 0.2959, + "step": 3763 + }, + { + "epoch": 0.8958172190158863, + "grad_norm": 0.4221574195250755, + "learning_rate": 6.0795580208462824e-06, + "loss": 0.3404, + "step": 3764 + }, + { + "epoch": 0.896055215088951, + "grad_norm": 0.3593405180380063, + "learning_rate": 6.077675813021812e-06, + "loss": 0.3972, + "step": 3765 + }, + { + "epoch": 0.8962932111620159, + "grad_norm": 0.39581975922524365, + "learning_rate": 6.075793445028056e-06, + "loss": 0.344, + "step": 3766 + }, + { + "epoch": 0.8965312072350806, + "grad_norm": 0.3955579182890299, + "learning_rate": 6.073910917144778e-06, + "loss": 0.3158, + "step": 3767 + }, + { + "epoch": 0.8967692033081455, + "grad_norm": 0.3797132012723383, + "learning_rate": 6.072028229651773e-06, + "loss": 0.3482, + "step": 3768 + }, + { + "epoch": 0.8970071993812102, + "grad_norm": 0.410201975103764, + "learning_rate": 6.0701453828288524e-06, + "loss": 0.4076, + "step": 3769 + }, + { + "epoch": 0.8972451954542751, + "grad_norm": 0.4137362624937579, + "learning_rate": 6.068262376955854e-06, + "loss": 0.2942, + "step": 3770 + }, + { + "epoch": 0.8974831915273398, + "grad_norm": 0.3409390585560632, + "learning_rate": 6.066379212312638e-06, + "loss": 0.3169, + "step": 3771 + }, + { + "epoch": 0.8977211876004046, + "grad_norm": 0.3649824690116523, + "learning_rate": 6.06449588917909e-06, + "loss": 0.418, + "step": 3772 + }, + { + "epoch": 0.8979591836734694, + "grad_norm": 0.4238883491721038, + "learning_rate": 6.062612407835117e-06, + "loss": 0.3662, + "step": 3773 + }, + { + "epoch": 0.8981971797465342, + "grad_norm": 0.3737577013120221, + "learning_rate": 6.060728768560654e-06, + "loss": 0.3084, + "step": 3774 + }, + { + "epoch": 0.898435175819599, + "grad_norm": 0.37762209485191345, + "learning_rate": 6.058844971635654e-06, + "loss": 0.3556, + "step": 3775 + }, + { + "epoch": 0.8986731718926638, + "grad_norm": 0.34826957290855015, + "learning_rate": 6.056961017340097e-06, + "loss": 0.3975, + "step": 3776 + }, + { + "epoch": 0.8989111679657286, + "grad_norm": 0.4161304452441926, + "learning_rate": 6.055076905953986e-06, + "loss": 0.2959, + "step": 3777 + }, + { + "epoch": 0.8991491640387934, + "grad_norm": 0.35944427744861646, + "learning_rate": 6.053192637757346e-06, + "loss": 0.2953, + "step": 3778 + }, + { + "epoch": 0.8993871601118582, + "grad_norm": 0.3534735542841618, + "learning_rate": 6.051308213030224e-06, + "loss": 0.3666, + "step": 3779 + }, + { + "epoch": 0.899625156184923, + "grad_norm": 0.4729648450816866, + "learning_rate": 6.049423632052693e-06, + "loss": 0.3592, + "step": 3780 + }, + { + "epoch": 0.8998631522579877, + "grad_norm": 0.40762763698787485, + "learning_rate": 6.0475388951048486e-06, + "loss": 0.298, + "step": 3781 + }, + { + "epoch": 0.9001011483310526, + "grad_norm": 0.3810584850880683, + "learning_rate": 6.045654002466809e-06, + "loss": 0.3393, + "step": 3782 + }, + { + "epoch": 0.9003391444041173, + "grad_norm": 0.4559084065779489, + "learning_rate": 6.043768954418719e-06, + "loss": 0.4002, + "step": 3783 + }, + { + "epoch": 0.9005771404771822, + "grad_norm": 0.43908602771497135, + "learning_rate": 6.041883751240739e-06, + "loss": 0.3226, + "step": 3784 + }, + { + "epoch": 0.9008151365502469, + "grad_norm": 0.3930802632673325, + "learning_rate": 6.03999839321306e-06, + "loss": 0.3064, + "step": 3785 + }, + { + "epoch": 0.9010531326233118, + "grad_norm": 0.33579273743850313, + "learning_rate": 6.03811288061589e-06, + "loss": 0.368, + "step": 3786 + }, + { + "epoch": 0.9012911286963765, + "grad_norm": 0.39169821582510483, + "learning_rate": 6.036227213729464e-06, + "loss": 0.3959, + "step": 3787 + }, + { + "epoch": 0.9015291247694414, + "grad_norm": 0.43853941776440175, + "learning_rate": 6.03434139283404e-06, + "loss": 0.3188, + "step": 3788 + }, + { + "epoch": 0.9017671208425061, + "grad_norm": 0.3689294208498079, + "learning_rate": 6.032455418209895e-06, + "loss": 0.351, + "step": 3789 + }, + { + "epoch": 0.902005116915571, + "grad_norm": 0.340196421020995, + "learning_rate": 6.030569290137335e-06, + "loss": 0.3988, + "step": 3790 + }, + { + "epoch": 0.9022431129886357, + "grad_norm": 0.39137456638518625, + "learning_rate": 6.028683008896683e-06, + "loss": 0.3547, + "step": 3791 + }, + { + "epoch": 0.9024811090617005, + "grad_norm": 0.43319110374923225, + "learning_rate": 6.026796574768288e-06, + "loss": 0.3019, + "step": 3792 + }, + { + "epoch": 0.9027191051347653, + "grad_norm": 0.3486367504488833, + "learning_rate": 6.024909988032519e-06, + "loss": 0.3584, + "step": 3793 + }, + { + "epoch": 0.9029571012078301, + "grad_norm": 0.36094039333364036, + "learning_rate": 6.0230232489697725e-06, + "loss": 0.412, + "step": 3794 + }, + { + "epoch": 0.9031950972808949, + "grad_norm": 0.3615716508779515, + "learning_rate": 6.021136357860461e-06, + "loss": 0.3103, + "step": 3795 + }, + { + "epoch": 0.9034330933539596, + "grad_norm": 0.3767985308127309, + "learning_rate": 6.0192493149850255e-06, + "loss": 0.3292, + "step": 3796 + }, + { + "epoch": 0.9036710894270245, + "grad_norm": 0.36959666250758044, + "learning_rate": 6.017362120623928e-06, + "loss": 0.3781, + "step": 3797 + }, + { + "epoch": 0.9039090855000892, + "grad_norm": 0.3571566018380154, + "learning_rate": 6.015474775057649e-06, + "loss": 0.3583, + "step": 3798 + }, + { + "epoch": 0.904147081573154, + "grad_norm": 0.38935644719203016, + "learning_rate": 6.013587278566698e-06, + "loss": 0.3145, + "step": 3799 + }, + { + "epoch": 0.9043850776462188, + "grad_norm": 0.3701781411050491, + "learning_rate": 6.011699631431603e-06, + "loss": 0.3197, + "step": 3800 + }, + { + "epoch": 0.9046230737192836, + "grad_norm": 0.3906408890697199, + "learning_rate": 6.0098118339329124e-06, + "loss": 0.4124, + "step": 3801 + }, + { + "epoch": 0.9048610697923484, + "grad_norm": 0.369451923410181, + "learning_rate": 6.007923886351202e-06, + "loss": 0.3269, + "step": 3802 + }, + { + "epoch": 0.9050990658654132, + "grad_norm": 0.37174979302655076, + "learning_rate": 6.00603578896707e-06, + "loss": 0.3283, + "step": 3803 + }, + { + "epoch": 0.905337061938478, + "grad_norm": 0.37058723879913413, + "learning_rate": 6.004147542061129e-06, + "loss": 0.3723, + "step": 3804 + }, + { + "epoch": 0.9055750580115428, + "grad_norm": 0.37650352346249066, + "learning_rate": 6.0022591459140235e-06, + "loss": 0.3471, + "step": 3805 + }, + { + "epoch": 0.9058130540846075, + "grad_norm": 0.37453899744942026, + "learning_rate": 6.000370600806415e-06, + "loss": 0.3212, + "step": 3806 + }, + { + "epoch": 0.9060510501576724, + "grad_norm": 0.3869447902100158, + "learning_rate": 5.9984819070189845e-06, + "loss": 0.3526, + "step": 3807 + }, + { + "epoch": 0.9062890462307371, + "grad_norm": 0.3527049994912703, + "learning_rate": 5.9965930648324425e-06, + "loss": 0.3878, + "step": 3808 + }, + { + "epoch": 0.906527042303802, + "grad_norm": 0.40048440151462644, + "learning_rate": 5.9947040745275174e-06, + "loss": 0.3268, + "step": 3809 + }, + { + "epoch": 0.9067650383768667, + "grad_norm": 0.36101747024521835, + "learning_rate": 5.992814936384958e-06, + "loss": 0.3299, + "step": 3810 + }, + { + "epoch": 0.9070030344499316, + "grad_norm": 0.3884060451475327, + "learning_rate": 5.990925650685539e-06, + "loss": 0.3604, + "step": 3811 + }, + { + "epoch": 0.9072410305229963, + "grad_norm": 0.38945955785689845, + "learning_rate": 5.9890362177100535e-06, + "loss": 0.3968, + "step": 3812 + }, + { + "epoch": 0.9074790265960612, + "grad_norm": 0.38256970785987937, + "learning_rate": 5.987146637739319e-06, + "loss": 0.285, + "step": 3813 + }, + { + "epoch": 0.9077170226691259, + "grad_norm": 0.3725122594244136, + "learning_rate": 5.985256911054171e-06, + "loss": 0.3099, + "step": 3814 + }, + { + "epoch": 0.9079550187421908, + "grad_norm": 0.3832707895396545, + "learning_rate": 5.983367037935473e-06, + "loss": 0.3996, + "step": 3815 + }, + { + "epoch": 0.9081930148152555, + "grad_norm": 0.3833096459028676, + "learning_rate": 5.9814770186641065e-06, + "loss": 0.3466, + "step": 3816 + }, + { + "epoch": 0.9084310108883203, + "grad_norm": 0.36397044311226656, + "learning_rate": 5.979586853520974e-06, + "loss": 0.2958, + "step": 3817 + }, + { + "epoch": 0.9086690069613851, + "grad_norm": 0.36891177007878145, + "learning_rate": 5.977696542787003e-06, + "loss": 0.3562, + "step": 3818 + }, + { + "epoch": 0.9089070030344499, + "grad_norm": 0.3840511476005476, + "learning_rate": 5.975806086743137e-06, + "loss": 0.3974, + "step": 3819 + }, + { + "epoch": 0.9091449991075147, + "grad_norm": 0.36836219306960205, + "learning_rate": 5.973915485670348e-06, + "loss": 0.2928, + "step": 3820 + }, + { + "epoch": 0.9093829951805795, + "grad_norm": 0.3684371158231424, + "learning_rate": 5.972024739849622e-06, + "loss": 0.3035, + "step": 3821 + }, + { + "epoch": 0.9096209912536443, + "grad_norm": 0.3623595279659431, + "learning_rate": 5.970133849561973e-06, + "loss": 0.3729, + "step": 3822 + }, + { + "epoch": 0.9098589873267091, + "grad_norm": 0.3813529115463126, + "learning_rate": 5.968242815088435e-06, + "loss": 0.3771, + "step": 3823 + }, + { + "epoch": 0.9100969833997739, + "grad_norm": 0.4428663575301828, + "learning_rate": 5.9663516367100614e-06, + "loss": 0.3069, + "step": 3824 + }, + { + "epoch": 0.9103349794728387, + "grad_norm": 0.38649551932626275, + "learning_rate": 5.964460314707928e-06, + "loss": 0.3451, + "step": 3825 + }, + { + "epoch": 0.9105729755459034, + "grad_norm": 0.35224183137486065, + "learning_rate": 5.962568849363133e-06, + "loss": 0.4268, + "step": 3826 + }, + { + "epoch": 0.9108109716189683, + "grad_norm": 0.3468956951171978, + "learning_rate": 5.960677240956792e-06, + "loss": 0.3078, + "step": 3827 + }, + { + "epoch": 0.911048967692033, + "grad_norm": 0.3542112048407511, + "learning_rate": 5.958785489770049e-06, + "loss": 0.2749, + "step": 3828 + }, + { + "epoch": 0.9112869637650979, + "grad_norm": 0.3674351382002242, + "learning_rate": 5.956893596084061e-06, + "loss": 0.3578, + "step": 3829 + }, + { + "epoch": 0.9115249598381626, + "grad_norm": 0.35983162039990363, + "learning_rate": 5.955001560180015e-06, + "loss": 0.392, + "step": 3830 + }, + { + "epoch": 0.9117629559112275, + "grad_norm": 0.36869673312884654, + "learning_rate": 5.953109382339111e-06, + "loss": 0.296, + "step": 3831 + }, + { + "epoch": 0.9120009519842922, + "grad_norm": 0.3937419732099826, + "learning_rate": 5.951217062842573e-06, + "loss": 0.3287, + "step": 3832 + }, + { + "epoch": 0.9122389480573571, + "grad_norm": 0.39019517526976494, + "learning_rate": 5.949324601971648e-06, + "loss": 0.4424, + "step": 3833 + }, + { + "epoch": 0.9124769441304218, + "grad_norm": 0.3944112522424989, + "learning_rate": 5.947432000007601e-06, + "loss": 0.3025, + "step": 3834 + }, + { + "epoch": 0.9127149402034866, + "grad_norm": 0.3906256947087767, + "learning_rate": 5.9455392572317225e-06, + "loss": 0.3067, + "step": 3835 + }, + { + "epoch": 0.9129529362765514, + "grad_norm": 0.36209911312794113, + "learning_rate": 5.9436463739253154e-06, + "loss": 0.3718, + "step": 3836 + }, + { + "epoch": 0.9131909323496162, + "grad_norm": 0.37733100352844173, + "learning_rate": 5.9417533503697155e-06, + "loss": 0.398, + "step": 3837 + }, + { + "epoch": 0.913428928422681, + "grad_norm": 0.3835158676308841, + "learning_rate": 5.939860186846269e-06, + "loss": 0.3215, + "step": 3838 + }, + { + "epoch": 0.9136669244957458, + "grad_norm": 0.367565730076443, + "learning_rate": 5.937966883636348e-06, + "loss": 0.3263, + "step": 3839 + }, + { + "epoch": 0.9139049205688106, + "grad_norm": 0.35666300012315477, + "learning_rate": 5.936073441021344e-06, + "loss": 0.3827, + "step": 3840 + }, + { + "epoch": 0.9141429166418754, + "grad_norm": 0.3638886300120447, + "learning_rate": 5.934179859282668e-06, + "loss": 0.3401, + "step": 3841 + }, + { + "epoch": 0.9143809127149402, + "grad_norm": 0.3412227559234039, + "learning_rate": 5.932286138701756e-06, + "loss": 0.3269, + "step": 3842 + }, + { + "epoch": 0.914618908788005, + "grad_norm": 0.36540816931263137, + "learning_rate": 5.930392279560059e-06, + "loss": 0.3647, + "step": 3843 + }, + { + "epoch": 0.9148569048610697, + "grad_norm": 0.346958869901011, + "learning_rate": 5.928498282139053e-06, + "loss": 0.3772, + "step": 3844 + }, + { + "epoch": 0.9150949009341346, + "grad_norm": 0.3794111378758652, + "learning_rate": 5.926604146720232e-06, + "loss": 0.3163, + "step": 3845 + }, + { + "epoch": 0.9153328970071993, + "grad_norm": 0.39132485441773784, + "learning_rate": 5.9247098735851125e-06, + "loss": 0.3245, + "step": 3846 + }, + { + "epoch": 0.9155708930802642, + "grad_norm": 0.36279235336046384, + "learning_rate": 5.922815463015229e-06, + "loss": 0.3925, + "step": 3847 + }, + { + "epoch": 0.9158088891533289, + "grad_norm": 0.380778919905592, + "learning_rate": 5.920920915292138e-06, + "loss": 0.3313, + "step": 3848 + }, + { + "epoch": 0.9160468852263938, + "grad_norm": 0.3651654131810896, + "learning_rate": 5.919026230697418e-06, + "loss": 0.2968, + "step": 3849 + }, + { + "epoch": 0.9162848812994585, + "grad_norm": 0.3670633414101187, + "learning_rate": 5.917131409512663e-06, + "loss": 0.3495, + "step": 3850 + }, + { + "epoch": 0.9165228773725234, + "grad_norm": 0.34896717104695807, + "learning_rate": 5.915236452019491e-06, + "loss": 0.3772, + "step": 3851 + }, + { + "epoch": 0.9167608734455881, + "grad_norm": 0.3780403642407919, + "learning_rate": 5.913341358499543e-06, + "loss": 0.3302, + "step": 3852 + }, + { + "epoch": 0.916998869518653, + "grad_norm": 0.36534779843005166, + "learning_rate": 5.911446129234473e-06, + "loss": 0.3154, + "step": 3853 + }, + { + "epoch": 0.9172368655917177, + "grad_norm": 0.36166515125650645, + "learning_rate": 5.909550764505959e-06, + "loss": 0.3428, + "step": 3854 + }, + { + "epoch": 0.9174748616647825, + "grad_norm": 0.36662895628646563, + "learning_rate": 5.907655264595701e-06, + "loss": 0.3726, + "step": 3855 + }, + { + "epoch": 0.9177128577378473, + "grad_norm": 0.41478425489418713, + "learning_rate": 5.905759629785417e-06, + "loss": 0.306, + "step": 3856 + }, + { + "epoch": 0.9179508538109121, + "grad_norm": 0.442141948179443, + "learning_rate": 5.903863860356843e-06, + "loss": 0.3664, + "step": 3857 + }, + { + "epoch": 0.9181888498839769, + "grad_norm": 0.3835994090422323, + "learning_rate": 5.9019679565917396e-06, + "loss": 0.4351, + "step": 3858 + }, + { + "epoch": 0.9184268459570417, + "grad_norm": 0.3916877069146898, + "learning_rate": 5.900071918771885e-06, + "loss": 0.3322, + "step": 3859 + }, + { + "epoch": 0.9186648420301065, + "grad_norm": 0.4452770801712549, + "learning_rate": 5.898175747179077e-06, + "loss": 0.3111, + "step": 3860 + }, + { + "epoch": 0.9189028381031713, + "grad_norm": 0.3810069457057913, + "learning_rate": 5.896279442095132e-06, + "loss": 0.3757, + "step": 3861 + }, + { + "epoch": 0.919140834176236, + "grad_norm": 0.42405450068532397, + "learning_rate": 5.894383003801889e-06, + "loss": 0.3957, + "step": 3862 + }, + { + "epoch": 0.9193788302493009, + "grad_norm": 0.3860106901737225, + "learning_rate": 5.892486432581206e-06, + "loss": 0.3128, + "step": 3863 + }, + { + "epoch": 0.9196168263223656, + "grad_norm": 0.35769077891896367, + "learning_rate": 5.890589728714959e-06, + "loss": 0.3283, + "step": 3864 + }, + { + "epoch": 0.9198548223954305, + "grad_norm": 0.3580135745616656, + "learning_rate": 5.8886928924850484e-06, + "loss": 0.4148, + "step": 3865 + }, + { + "epoch": 0.9200928184684952, + "grad_norm": 0.390966292937549, + "learning_rate": 5.886795924173388e-06, + "loss": 0.3515, + "step": 3866 + }, + { + "epoch": 0.9203308145415601, + "grad_norm": 0.3582229975022328, + "learning_rate": 5.884898824061914e-06, + "loss": 0.2942, + "step": 3867 + }, + { + "epoch": 0.9205688106146248, + "grad_norm": 0.383565757640599, + "learning_rate": 5.883001592432583e-06, + "loss": 0.3574, + "step": 3868 + }, + { + "epoch": 0.9208068066876897, + "grad_norm": 0.41449449325824644, + "learning_rate": 5.881104229567373e-06, + "loss": 0.3743, + "step": 3869 + }, + { + "epoch": 0.9210448027607544, + "grad_norm": 0.4022815267702927, + "learning_rate": 5.879206735748275e-06, + "loss": 0.3004, + "step": 3870 + }, + { + "epoch": 0.9212827988338192, + "grad_norm": 0.37800390773235576, + "learning_rate": 5.877309111257306e-06, + "loss": 0.3007, + "step": 3871 + }, + { + "epoch": 0.921520794906884, + "grad_norm": 0.3826886690510641, + "learning_rate": 5.8754113563765e-06, + "loss": 0.3896, + "step": 3872 + }, + { + "epoch": 0.9217587909799488, + "grad_norm": 0.39813220759183665, + "learning_rate": 5.873513471387911e-06, + "loss": 0.3316, + "step": 3873 + }, + { + "epoch": 0.9219967870530136, + "grad_norm": 0.3865193341498503, + "learning_rate": 5.871615456573608e-06, + "loss": 0.302, + "step": 3874 + }, + { + "epoch": 0.9222347831260784, + "grad_norm": 0.37725905957142175, + "learning_rate": 5.8697173122156885e-06, + "loss": 0.3364, + "step": 3875 + }, + { + "epoch": 0.9224727791991432, + "grad_norm": 0.3896841573344773, + "learning_rate": 5.8678190385962585e-06, + "loss": 0.4141, + "step": 3876 + }, + { + "epoch": 0.922710775272208, + "grad_norm": 0.3717555754024187, + "learning_rate": 5.8659206359974495e-06, + "loss": 0.3271, + "step": 3877 + }, + { + "epoch": 0.9229487713452728, + "grad_norm": 0.38884985169390957, + "learning_rate": 5.864022104701413e-06, + "loss": 0.3206, + "step": 3878 + }, + { + "epoch": 0.9231867674183376, + "grad_norm": 0.40651050344993656, + "learning_rate": 5.862123444990319e-06, + "loss": 0.3614, + "step": 3879 + }, + { + "epoch": 0.9234247634914023, + "grad_norm": 0.3699122959015574, + "learning_rate": 5.8602246571463506e-06, + "loss": 0.3953, + "step": 3880 + }, + { + "epoch": 0.9236627595644672, + "grad_norm": 0.35373619100502907, + "learning_rate": 5.858325741451718e-06, + "loss": 0.3106, + "step": 3881 + }, + { + "epoch": 0.9239007556375319, + "grad_norm": 0.3822556148266497, + "learning_rate": 5.8564266981886465e-06, + "loss": 0.3551, + "step": 3882 + }, + { + "epoch": 0.9241387517105968, + "grad_norm": 0.3367352347831046, + "learning_rate": 5.854527527639381e-06, + "loss": 0.4214, + "step": 3883 + }, + { + "epoch": 0.9243767477836615, + "grad_norm": 0.41091234068434845, + "learning_rate": 5.852628230086184e-06, + "loss": 0.3105, + "step": 3884 + }, + { + "epoch": 0.9246147438567264, + "grad_norm": 0.39373294006122483, + "learning_rate": 5.850728805811339e-06, + "loss": 0.3234, + "step": 3885 + }, + { + "epoch": 0.9248527399297911, + "grad_norm": 0.3427571664260684, + "learning_rate": 5.8488292550971485e-06, + "loss": 0.3525, + "step": 3886 + }, + { + "epoch": 0.925090736002856, + "grad_norm": 0.35439854536041354, + "learning_rate": 5.84692957822593e-06, + "loss": 0.3836, + "step": 3887 + }, + { + "epoch": 0.9253287320759207, + "grad_norm": 0.396795073329766, + "learning_rate": 5.845029775480026e-06, + "loss": 0.295, + "step": 3888 + }, + { + "epoch": 0.9255667281489856, + "grad_norm": 0.3777068010438854, + "learning_rate": 5.843129847141792e-06, + "loss": 0.3076, + "step": 3889 + }, + { + "epoch": 0.9258047242220503, + "grad_norm": 0.35763041328215195, + "learning_rate": 5.841229793493604e-06, + "loss": 0.4072, + "step": 3890 + }, + { + "epoch": 0.9260427202951151, + "grad_norm": 0.38017084454143346, + "learning_rate": 5.839329614817858e-06, + "loss": 0.3474, + "step": 3891 + }, + { + "epoch": 0.9262807163681799, + "grad_norm": 0.35675610940815455, + "learning_rate": 5.837429311396967e-06, + "loss": 0.2979, + "step": 3892 + }, + { + "epoch": 0.9265187124412447, + "grad_norm": 0.3793380743409526, + "learning_rate": 5.835528883513364e-06, + "loss": 0.3703, + "step": 3893 + }, + { + "epoch": 0.9267567085143095, + "grad_norm": 0.36281165596625997, + "learning_rate": 5.833628331449498e-06, + "loss": 0.4068, + "step": 3894 + }, + { + "epoch": 0.9269947045873743, + "grad_norm": 0.3691042976161106, + "learning_rate": 5.831727655487839e-06, + "loss": 0.3091, + "step": 3895 + }, + { + "epoch": 0.9272327006604391, + "grad_norm": 0.344360877845291, + "learning_rate": 5.829826855910875e-06, + "loss": 0.3287, + "step": 3896 + }, + { + "epoch": 0.9274706967335039, + "grad_norm": 0.3740607315809258, + "learning_rate": 5.827925933001111e-06, + "loss": 0.3885, + "step": 3897 + }, + { + "epoch": 0.9277086928065686, + "grad_norm": 0.3685370821262386, + "learning_rate": 5.826024887041071e-06, + "loss": 0.3755, + "step": 3898 + }, + { + "epoch": 0.9279466888796335, + "grad_norm": 0.3760368000609693, + "learning_rate": 5.8241237183132986e-06, + "loss": 0.289, + "step": 3899 + }, + { + "epoch": 0.9281846849526982, + "grad_norm": 0.3705899806280766, + "learning_rate": 5.822222427100354e-06, + "loss": 0.3575, + "step": 3900 + }, + { + "epoch": 0.9284226810257631, + "grad_norm": 0.39603394670641257, + "learning_rate": 5.820321013684815e-06, + "loss": 0.4147, + "step": 3901 + }, + { + "epoch": 0.9286606770988278, + "grad_norm": 0.36521058651075333, + "learning_rate": 5.818419478349281e-06, + "loss": 0.3325, + "step": 3902 + }, + { + "epoch": 0.9288986731718927, + "grad_norm": 0.3439794302882938, + "learning_rate": 5.816517821376365e-06, + "loss": 0.3189, + "step": 3903 + }, + { + "epoch": 0.9291366692449574, + "grad_norm": 0.37141750525341993, + "learning_rate": 5.814616043048702e-06, + "loss": 0.3497, + "step": 3904 + }, + { + "epoch": 0.9293746653180223, + "grad_norm": 0.38016142601364294, + "learning_rate": 5.8127141436489395e-06, + "loss": 0.3838, + "step": 3905 + }, + { + "epoch": 0.929612661391087, + "grad_norm": 0.3606163418991101, + "learning_rate": 5.810812123459753e-06, + "loss": 0.2855, + "step": 3906 + }, + { + "epoch": 0.9298506574641519, + "grad_norm": 0.36356185335551205, + "learning_rate": 5.808909982763825e-06, + "loss": 0.3472, + "step": 3907 + }, + { + "epoch": 0.9300886535372166, + "grad_norm": 0.36536907609964997, + "learning_rate": 5.807007721843862e-06, + "loss": 0.421, + "step": 3908 + }, + { + "epoch": 0.9303266496102814, + "grad_norm": 0.4097160409903546, + "learning_rate": 5.805105340982586e-06, + "loss": 0.3065, + "step": 3909 + }, + { + "epoch": 0.9305646456833462, + "grad_norm": 0.32639160343180673, + "learning_rate": 5.803202840462741e-06, + "loss": 0.2853, + "step": 3910 + }, + { + "epoch": 0.930802641756411, + "grad_norm": 0.34431820740085145, + "learning_rate": 5.801300220567083e-06, + "loss": 0.3654, + "step": 3911 + }, + { + "epoch": 0.9310406378294758, + "grad_norm": 0.3393942154458769, + "learning_rate": 5.799397481578388e-06, + "loss": 0.3814, + "step": 3912 + }, + { + "epoch": 0.9312786339025406, + "grad_norm": 0.3650148623295529, + "learning_rate": 5.797494623779453e-06, + "loss": 0.2883, + "step": 3913 + }, + { + "epoch": 0.9315166299756054, + "grad_norm": 0.40074023711292456, + "learning_rate": 5.795591647453086e-06, + "loss": 0.3447, + "step": 3914 + }, + { + "epoch": 0.9317546260486702, + "grad_norm": 0.38051255748001506, + "learning_rate": 5.79368855288212e-06, + "loss": 0.3804, + "step": 3915 + }, + { + "epoch": 0.931992622121735, + "grad_norm": 0.37149987585322053, + "learning_rate": 5.7917853403493985e-06, + "loss": 0.3397, + "step": 3916 + }, + { + "epoch": 0.9322306181947998, + "grad_norm": 0.37572329221656625, + "learning_rate": 5.7898820101377885e-06, + "loss": 0.2941, + "step": 3917 + }, + { + "epoch": 0.9324686142678645, + "grad_norm": 0.3971060021730622, + "learning_rate": 5.787978562530172e-06, + "loss": 0.3382, + "step": 3918 + }, + { + "epoch": 0.9327066103409294, + "grad_norm": 0.39918849730220163, + "learning_rate": 5.786074997809445e-06, + "loss": 0.4049, + "step": 3919 + }, + { + "epoch": 0.9329446064139941, + "grad_norm": 0.386940462855031, + "learning_rate": 5.784171316258528e-06, + "loss": 0.3133, + "step": 3920 + }, + { + "epoch": 0.933182602487059, + "grad_norm": 0.39373148829016735, + "learning_rate": 5.782267518160354e-06, + "loss": 0.3161, + "step": 3921 + }, + { + "epoch": 0.9334205985601237, + "grad_norm": 0.4027047525256417, + "learning_rate": 5.780363603797874e-06, + "loss": 0.3806, + "step": 3922 + }, + { + "epoch": 0.9336585946331886, + "grad_norm": 0.436189477052603, + "learning_rate": 5.778459573454058e-06, + "loss": 0.3396, + "step": 3923 + }, + { + "epoch": 0.9338965907062533, + "grad_norm": 0.37159344865326444, + "learning_rate": 5.776555427411891e-06, + "loss": 0.2909, + "step": 3924 + }, + { + "epoch": 0.9341345867793182, + "grad_norm": 0.35999558789407055, + "learning_rate": 5.774651165954377e-06, + "loss": 0.3357, + "step": 3925 + }, + { + "epoch": 0.9343725828523829, + "grad_norm": 0.3450624176837847, + "learning_rate": 5.772746789364534e-06, + "loss": 0.4188, + "step": 3926 + }, + { + "epoch": 0.9346105789254477, + "grad_norm": 0.375979183281526, + "learning_rate": 5.770842297925402e-06, + "loss": 0.3443, + "step": 3927 + }, + { + "epoch": 0.9348485749985125, + "grad_norm": 0.3646206735183569, + "learning_rate": 5.768937691920036e-06, + "loss": 0.2904, + "step": 3928 + }, + { + "epoch": 0.9350865710715773, + "grad_norm": 0.37338768923849774, + "learning_rate": 5.767032971631506e-06, + "loss": 0.3965, + "step": 3929 + }, + { + "epoch": 0.9353245671446421, + "grad_norm": 0.36862713467496905, + "learning_rate": 5.7651281373429e-06, + "loss": 0.3441, + "step": 3930 + }, + { + "epoch": 0.9355625632177069, + "grad_norm": 0.36754831493989937, + "learning_rate": 5.763223189337324e-06, + "loss": 0.325, + "step": 3931 + }, + { + "epoch": 0.9358005592907717, + "grad_norm": 0.37946091566468837, + "learning_rate": 5.761318127897903e-06, + "loss": 0.3237, + "step": 3932 + }, + { + "epoch": 0.9360385553638365, + "grad_norm": 0.381077983146763, + "learning_rate": 5.759412953307771e-06, + "loss": 0.3836, + "step": 3933 + }, + { + "epoch": 0.9362765514369012, + "grad_norm": 0.3797233859010859, + "learning_rate": 5.75750766585009e-06, + "loss": 0.3368, + "step": 3934 + }, + { + "epoch": 0.9365145475099661, + "grad_norm": 0.4055187658650143, + "learning_rate": 5.7556022658080276e-06, + "loss": 0.3064, + "step": 3935 + }, + { + "epoch": 0.9367525435830308, + "grad_norm": 0.3428864215245841, + "learning_rate": 5.753696753464778e-06, + "loss": 0.3785, + "step": 3936 + }, + { + "epoch": 0.9369905396560957, + "grad_norm": 0.38066009287080843, + "learning_rate": 5.751791129103545e-06, + "loss": 0.4031, + "step": 3937 + }, + { + "epoch": 0.9372285357291604, + "grad_norm": 0.38910361084548145, + "learning_rate": 5.749885393007552e-06, + "loss": 0.3196, + "step": 3938 + }, + { + "epoch": 0.9374665318022253, + "grad_norm": 0.4447865941468255, + "learning_rate": 5.747979545460036e-06, + "loss": 0.3112, + "step": 3939 + }, + { + "epoch": 0.93770452787529, + "grad_norm": 0.36280386489269756, + "learning_rate": 5.746073586744258e-06, + "loss": 0.3824, + "step": 3940 + }, + { + "epoch": 0.9379425239483549, + "grad_norm": 0.3715191063601824, + "learning_rate": 5.744167517143486e-06, + "loss": 0.351, + "step": 3941 + }, + { + "epoch": 0.9381805200214196, + "grad_norm": 0.3916498754924406, + "learning_rate": 5.742261336941013e-06, + "loss": 0.3221, + "step": 3942 + }, + { + "epoch": 0.9384185160944845, + "grad_norm": 0.38502974009058716, + "learning_rate": 5.740355046420142e-06, + "loss": 0.4041, + "step": 3943 + }, + { + "epoch": 0.9386565121675492, + "grad_norm": 0.5425497961680991, + "learning_rate": 5.738448645864195e-06, + "loss": 0.415, + "step": 3944 + }, + { + "epoch": 0.938894508240614, + "grad_norm": 0.3914605628286909, + "learning_rate": 5.736542135556512e-06, + "loss": 0.3102, + "step": 3945 + }, + { + "epoch": 0.9391325043136788, + "grad_norm": 0.3930678191979905, + "learning_rate": 5.7346355157804455e-06, + "loss": 0.3227, + "step": 3946 + }, + { + "epoch": 0.9393705003867436, + "grad_norm": 0.3660355806904165, + "learning_rate": 5.732728786819368e-06, + "loss": 0.3873, + "step": 3947 + }, + { + "epoch": 0.9396084964598084, + "grad_norm": 0.3579948522758407, + "learning_rate": 5.730821948956665e-06, + "loss": 0.366, + "step": 3948 + }, + { + "epoch": 0.9398464925328732, + "grad_norm": 0.35994213666298, + "learning_rate": 5.7289150024757415e-06, + "loss": 0.2896, + "step": 3949 + }, + { + "epoch": 0.940084488605938, + "grad_norm": 0.3660284695948094, + "learning_rate": 5.727007947660016e-06, + "loss": 0.3578, + "step": 3950 + }, + { + "epoch": 0.9403224846790028, + "grad_norm": 0.39417668974001635, + "learning_rate": 5.725100784792924e-06, + "loss": 0.3948, + "step": 3951 + }, + { + "epoch": 0.9405604807520676, + "grad_norm": 0.382207382206984, + "learning_rate": 5.723193514157918e-06, + "loss": 0.3124, + "step": 3952 + }, + { + "epoch": 0.9407984768251324, + "grad_norm": 0.4024628893156382, + "learning_rate": 5.721286136038463e-06, + "loss": 0.2865, + "step": 3953 + }, + { + "epoch": 0.9410364728981971, + "grad_norm": 0.386924814297485, + "learning_rate": 5.719378650718046e-06, + "loss": 0.364, + "step": 3954 + }, + { + "epoch": 0.941274468971262, + "grad_norm": 0.376900110000449, + "learning_rate": 5.717471058480165e-06, + "loss": 0.3686, + "step": 3955 + }, + { + "epoch": 0.9415124650443267, + "grad_norm": 0.3832996377113013, + "learning_rate": 5.7155633596083345e-06, + "loss": 0.2966, + "step": 3956 + }, + { + "epoch": 0.9417504611173916, + "grad_norm": 0.3689062510414365, + "learning_rate": 5.713655554386088e-06, + "loss": 0.3313, + "step": 3957 + }, + { + "epoch": 0.9419884571904563, + "grad_norm": 0.36517279149138915, + "learning_rate": 5.71174764309697e-06, + "loss": 0.3935, + "step": 3958 + }, + { + "epoch": 0.9422264532635212, + "grad_norm": 0.37317314065760315, + "learning_rate": 5.709839626024545e-06, + "loss": 0.2822, + "step": 3959 + }, + { + "epoch": 0.9424644493365859, + "grad_norm": 0.39141777366177366, + "learning_rate": 5.70793150345239e-06, + "loss": 0.3011, + "step": 3960 + }, + { + "epoch": 0.9427024454096508, + "grad_norm": 0.3705484724661079, + "learning_rate": 5.706023275664101e-06, + "loss": 0.3703, + "step": 3961 + }, + { + "epoch": 0.9429404414827155, + "grad_norm": 0.37822124229893755, + "learning_rate": 5.704114942943286e-06, + "loss": 0.3911, + "step": 3962 + }, + { + "epoch": 0.9431784375557803, + "grad_norm": 0.4070014262482564, + "learning_rate": 5.702206505573572e-06, + "loss": 0.3071, + "step": 3963 + }, + { + "epoch": 0.9434164336288451, + "grad_norm": 0.36329996164851147, + "learning_rate": 5.7002979638386005e-06, + "loss": 0.3356, + "step": 3964 + }, + { + "epoch": 0.9436544297019099, + "grad_norm": 0.36649962919002427, + "learning_rate": 5.698389318022026e-06, + "loss": 0.4008, + "step": 3965 + }, + { + "epoch": 0.9438924257749747, + "grad_norm": 0.3638109792735608, + "learning_rate": 5.696480568407523e-06, + "loss": 0.3156, + "step": 3966 + }, + { + "epoch": 0.9441304218480395, + "grad_norm": 0.3605646832595526, + "learning_rate": 5.694571715278775e-06, + "loss": 0.2936, + "step": 3967 + }, + { + "epoch": 0.9443684179211043, + "grad_norm": 0.3453303049010467, + "learning_rate": 5.692662758919489e-06, + "loss": 0.3573, + "step": 3968 + }, + { + "epoch": 0.9446064139941691, + "grad_norm": 0.3772126922641312, + "learning_rate": 5.690753699613382e-06, + "loss": 0.4241, + "step": 3969 + }, + { + "epoch": 0.9448444100672339, + "grad_norm": 0.445093770628227, + "learning_rate": 5.688844537644186e-06, + "loss": 0.3324, + "step": 3970 + }, + { + "epoch": 0.9450824061402987, + "grad_norm": 0.3699388236702028, + "learning_rate": 5.686935273295649e-06, + "loss": 0.3115, + "step": 3971 + }, + { + "epoch": 0.9453204022133634, + "grad_norm": 0.3897628984455295, + "learning_rate": 5.685025906851539e-06, + "loss": 0.3806, + "step": 3972 + }, + { + "epoch": 0.9455583982864283, + "grad_norm": 0.4016257052922472, + "learning_rate": 5.6831164385956314e-06, + "loss": 0.3558, + "step": 3973 + }, + { + "epoch": 0.945796394359493, + "grad_norm": 0.3631492629536495, + "learning_rate": 5.681206868811721e-06, + "loss": 0.299, + "step": 3974 + }, + { + "epoch": 0.9460343904325579, + "grad_norm": 0.3617842355634617, + "learning_rate": 5.679297197783617e-06, + "loss": 0.3351, + "step": 3975 + }, + { + "epoch": 0.9462723865056226, + "grad_norm": 0.37153756822155454, + "learning_rate": 5.677387425795146e-06, + "loss": 0.389, + "step": 3976 + }, + { + "epoch": 0.9465103825786875, + "grad_norm": 0.37351547703057986, + "learning_rate": 5.675477553130145e-06, + "loss": 0.3245, + "step": 3977 + }, + { + "epoch": 0.9467483786517522, + "grad_norm": 0.38350414771635083, + "learning_rate": 5.6735675800724695e-06, + "loss": 0.3055, + "step": 3978 + }, + { + "epoch": 0.9469863747248171, + "grad_norm": 0.42323362399946074, + "learning_rate": 5.671657506905989e-06, + "loss": 0.3664, + "step": 3979 + }, + { + "epoch": 0.9472243707978818, + "grad_norm": 0.3736129312742057, + "learning_rate": 5.669747333914586e-06, + "loss": 0.3772, + "step": 3980 + }, + { + "epoch": 0.9474623668709466, + "grad_norm": 0.3940418326123638, + "learning_rate": 5.66783706138216e-06, + "loss": 0.3301, + "step": 3981 + }, + { + "epoch": 0.9477003629440114, + "grad_norm": 0.3647257614559865, + "learning_rate": 5.665926689592626e-06, + "loss": 0.3509, + "step": 3982 + }, + { + "epoch": 0.9479383590170762, + "grad_norm": 0.35718234674396576, + "learning_rate": 5.664016218829911e-06, + "loss": 0.4159, + "step": 3983 + }, + { + "epoch": 0.948176355090141, + "grad_norm": 0.35559817666437965, + "learning_rate": 5.6621056493779605e-06, + "loss": 0.2914, + "step": 3984 + }, + { + "epoch": 0.9484143511632058, + "grad_norm": 0.4155897468492624, + "learning_rate": 5.660194981520729e-06, + "loss": 0.3059, + "step": 3985 + }, + { + "epoch": 0.9486523472362706, + "grad_norm": 0.39058764215986536, + "learning_rate": 5.658284215542191e-06, + "loss": 0.3828, + "step": 3986 + }, + { + "epoch": 0.9488903433093354, + "grad_norm": 0.36296631175689215, + "learning_rate": 5.656373351726334e-06, + "loss": 0.3917, + "step": 3987 + }, + { + "epoch": 0.9491283393824002, + "grad_norm": 0.3654376052680258, + "learning_rate": 5.654462390357159e-06, + "loss": 0.3176, + "step": 3988 + }, + { + "epoch": 0.949366335455465, + "grad_norm": 0.37351319837559777, + "learning_rate": 5.652551331718681e-06, + "loss": 0.3476, + "step": 3989 + }, + { + "epoch": 0.9496043315285297, + "grad_norm": 0.3654444319811079, + "learning_rate": 5.6506401760949335e-06, + "loss": 0.4022, + "step": 3990 + }, + { + "epoch": 0.9498423276015946, + "grad_norm": 0.37676497486164945, + "learning_rate": 5.6487289237699595e-06, + "loss": 0.3443, + "step": 3991 + }, + { + "epoch": 0.9500803236746593, + "grad_norm": 0.3816522699094411, + "learning_rate": 5.646817575027819e-06, + "loss": 0.2994, + "step": 3992 + }, + { + "epoch": 0.9503183197477242, + "grad_norm": 0.36418061297866533, + "learning_rate": 5.6449061301525845e-06, + "loss": 0.3747, + "step": 3993 + }, + { + "epoch": 0.9505563158207889, + "grad_norm": 0.3457838501298794, + "learning_rate": 5.642994589428344e-06, + "loss": 0.3956, + "step": 3994 + }, + { + "epoch": 0.9507943118938538, + "grad_norm": 0.3814926599582243, + "learning_rate": 5.641082953139201e-06, + "loss": 0.3113, + "step": 3995 + }, + { + "epoch": 0.9510323079669185, + "grad_norm": 0.38468757516147417, + "learning_rate": 5.639171221569273e-06, + "loss": 0.3233, + "step": 3996 + }, + { + "epoch": 0.9512703040399834, + "grad_norm": 0.3676820335135461, + "learning_rate": 5.637259395002688e-06, + "loss": 0.409, + "step": 3997 + }, + { + "epoch": 0.9515083001130481, + "grad_norm": 0.366933898548463, + "learning_rate": 5.635347473723592e-06, + "loss": 0.3456, + "step": 3998 + }, + { + "epoch": 0.951746296186113, + "grad_norm": 0.3470289154843928, + "learning_rate": 5.633435458016144e-06, + "loss": 0.3005, + "step": 3999 + }, + { + "epoch": 0.9519842922591777, + "grad_norm": 0.3685253402972838, + "learning_rate": 5.631523348164517e-06, + "loss": 0.3329, + "step": 4000 + }, + { + "epoch": 0.9522222883322425, + "grad_norm": 0.37326221272675375, + "learning_rate": 5.629611144452896e-06, + "loss": 0.3898, + "step": 4001 + }, + { + "epoch": 0.9524602844053073, + "grad_norm": 0.369692070324989, + "learning_rate": 5.627698847165484e-06, + "loss": 0.3069, + "step": 4002 + }, + { + "epoch": 0.9526982804783721, + "grad_norm": 0.4315390716578853, + "learning_rate": 5.625786456586493e-06, + "loss": 0.33, + "step": 4003 + }, + { + "epoch": 0.9529362765514369, + "grad_norm": 0.4310155102266023, + "learning_rate": 5.623873973000156e-06, + "loss": 0.4028, + "step": 4004 + }, + { + "epoch": 0.9531742726245017, + "grad_norm": 0.3751833494613908, + "learning_rate": 5.621961396690712e-06, + "loss": 0.3683, + "step": 4005 + }, + { + "epoch": 0.9534122686975665, + "grad_norm": 0.42057861372299066, + "learning_rate": 5.620048727942416e-06, + "loss": 0.2942, + "step": 4006 + }, + { + "epoch": 0.9536502647706313, + "grad_norm": 0.3625924700652696, + "learning_rate": 5.618135967039542e-06, + "loss": 0.3401, + "step": 4007 + }, + { + "epoch": 0.953888260843696, + "grad_norm": 0.3542760242464296, + "learning_rate": 5.616223114266369e-06, + "loss": 0.417, + "step": 4008 + }, + { + "epoch": 0.9541262569167609, + "grad_norm": 0.40855459494512736, + "learning_rate": 5.614310169907199e-06, + "loss": 0.3232, + "step": 4009 + }, + { + "epoch": 0.9543642529898256, + "grad_norm": 0.37105385201006647, + "learning_rate": 5.61239713424634e-06, + "loss": 0.2834, + "step": 4010 + }, + { + "epoch": 0.9546022490628905, + "grad_norm": 0.37855213750129024, + "learning_rate": 5.610484007568117e-06, + "loss": 0.3644, + "step": 4011 + }, + { + "epoch": 0.9548402451359552, + "grad_norm": 0.36744333500958176, + "learning_rate": 5.608570790156867e-06, + "loss": 0.3968, + "step": 4012 + }, + { + "epoch": 0.9550782412090201, + "grad_norm": 0.40177284275871733, + "learning_rate": 5.606657482296943e-06, + "loss": 0.3354, + "step": 4013 + }, + { + "epoch": 0.9553162372820848, + "grad_norm": 0.40131947230064974, + "learning_rate": 5.6047440842727075e-06, + "loss": 0.3262, + "step": 4014 + }, + { + "epoch": 0.9555542333551497, + "grad_norm": 0.35010111030696944, + "learning_rate": 5.602830596368543e-06, + "loss": 0.3737, + "step": 4015 + }, + { + "epoch": 0.9557922294282144, + "grad_norm": 0.3939903727222176, + "learning_rate": 5.600917018868835e-06, + "loss": 0.3344, + "step": 4016 + }, + { + "epoch": 0.9560302255012793, + "grad_norm": 0.4081024631555434, + "learning_rate": 5.599003352057994e-06, + "loss": 0.3028, + "step": 4017 + }, + { + "epoch": 0.956268221574344, + "grad_norm": 0.38240903753305877, + "learning_rate": 5.597089596220437e-06, + "loss": 0.3847, + "step": 4018 + }, + { + "epoch": 0.9565062176474088, + "grad_norm": 0.39578792437353716, + "learning_rate": 5.595175751640595e-06, + "loss": 0.3914, + "step": 4019 + }, + { + "epoch": 0.9567442137204736, + "grad_norm": 0.43574336288913884, + "learning_rate": 5.593261818602912e-06, + "loss": 0.33, + "step": 4020 + }, + { + "epoch": 0.9569822097935384, + "grad_norm": 0.3944916284608389, + "learning_rate": 5.5913477973918465e-06, + "loss": 0.3162, + "step": 4021 + }, + { + "epoch": 0.9572202058666032, + "grad_norm": 0.3667260519177921, + "learning_rate": 5.589433688291867e-06, + "loss": 0.3784, + "step": 4022 + }, + { + "epoch": 0.957458201939668, + "grad_norm": 0.38390699290292674, + "learning_rate": 5.5875194915874605e-06, + "loss": 0.3512, + "step": 4023 + }, + { + "epoch": 0.9576961980127328, + "grad_norm": 0.37193449103190873, + "learning_rate": 5.585605207563124e-06, + "loss": 0.2912, + "step": 4024 + }, + { + "epoch": 0.9579341940857976, + "grad_norm": 0.3699623027804539, + "learning_rate": 5.583690836503366e-06, + "loss": 0.3425, + "step": 4025 + }, + { + "epoch": 0.9581721901588623, + "grad_norm": 0.37064282931321874, + "learning_rate": 5.58177637869271e-06, + "loss": 0.3992, + "step": 4026 + }, + { + "epoch": 0.9584101862319272, + "grad_norm": 0.37635147922447526, + "learning_rate": 5.5798618344156916e-06, + "loss": 0.3117, + "step": 4027 + }, + { + "epoch": 0.9586481823049919, + "grad_norm": 0.3987637365857944, + "learning_rate": 5.577947203956858e-06, + "loss": 0.3143, + "step": 4028 + }, + { + "epoch": 0.9588861783780568, + "grad_norm": 0.4035395300731761, + "learning_rate": 5.576032487600773e-06, + "loss": 0.378, + "step": 4029 + }, + { + "epoch": 0.9591241744511215, + "grad_norm": 0.4101548946799405, + "learning_rate": 5.5741176856320105e-06, + "loss": 0.3796, + "step": 4030 + }, + { + "epoch": 0.9593621705241864, + "grad_norm": 0.3845855259199741, + "learning_rate": 5.5722027983351565e-06, + "loss": 0.3068, + "step": 4031 + }, + { + "epoch": 0.9596001665972511, + "grad_norm": 0.3609449815477264, + "learning_rate": 5.570287825994812e-06, + "loss": 0.3256, + "step": 4032 + }, + { + "epoch": 0.959838162670316, + "grad_norm": 0.3452672734583461, + "learning_rate": 5.568372768895588e-06, + "loss": 0.4226, + "step": 4033 + }, + { + "epoch": 0.9600761587433807, + "grad_norm": 0.3712187021978535, + "learning_rate": 5.566457627322109e-06, + "loss": 0.3293, + "step": 4034 + }, + { + "epoch": 0.9603141548164456, + "grad_norm": 0.35303931097547897, + "learning_rate": 5.5645424015590144e-06, + "loss": 0.28, + "step": 4035 + }, + { + "epoch": 0.9605521508895103, + "grad_norm": 0.3633441697287166, + "learning_rate": 5.562627091890951e-06, + "loss": 0.3538, + "step": 4036 + }, + { + "epoch": 0.9607901469625751, + "grad_norm": 0.41159227713226276, + "learning_rate": 5.560711698602584e-06, + "loss": 0.383, + "step": 4037 + }, + { + "epoch": 0.9610281430356399, + "grad_norm": 0.35928803089226324, + "learning_rate": 5.558796221978587e-06, + "loss": 0.2985, + "step": 4038 + }, + { + "epoch": 0.9612661391087047, + "grad_norm": 0.44144609493808085, + "learning_rate": 5.556880662303648e-06, + "loss": 0.3703, + "step": 4039 + }, + { + "epoch": 0.9615041351817695, + "grad_norm": 0.3803784240581862, + "learning_rate": 5.554965019862466e-06, + "loss": 0.3911, + "step": 4040 + }, + { + "epoch": 0.9617421312548343, + "grad_norm": 0.36481000247944867, + "learning_rate": 5.553049294939752e-06, + "loss": 0.3311, + "step": 4041 + }, + { + "epoch": 0.9619801273278991, + "grad_norm": 0.3593387405042659, + "learning_rate": 5.551133487820231e-06, + "loss": 0.2952, + "step": 4042 + }, + { + "epoch": 0.9622181234009639, + "grad_norm": 0.3625692759696533, + "learning_rate": 5.54921759878864e-06, + "loss": 0.3734, + "step": 4043 + }, + { + "epoch": 0.9624561194740286, + "grad_norm": 0.4202028000178488, + "learning_rate": 5.547301628129726e-06, + "loss": 0.3948, + "step": 4044 + }, + { + "epoch": 0.9626941155470935, + "grad_norm": 0.3783279956511722, + "learning_rate": 5.545385576128252e-06, + "loss": 0.3263, + "step": 4045 + }, + { + "epoch": 0.9629321116201582, + "grad_norm": 0.3926107017682296, + "learning_rate": 5.54346944306899e-06, + "loss": 0.3155, + "step": 4046 + }, + { + "epoch": 0.9631701076932231, + "grad_norm": 0.3926267406486093, + "learning_rate": 5.541553229236721e-06, + "loss": 0.367, + "step": 4047 + }, + { + "epoch": 0.9634081037662878, + "grad_norm": 0.38364235306865657, + "learning_rate": 5.539636934916247e-06, + "loss": 0.3193, + "step": 4048 + }, + { + "epoch": 0.9636460998393527, + "grad_norm": 0.38507849318568027, + "learning_rate": 5.537720560392373e-06, + "loss": 0.3056, + "step": 4049 + }, + { + "epoch": 0.9638840959124174, + "grad_norm": 0.37301941582141107, + "learning_rate": 5.535804105949922e-06, + "loss": 0.3405, + "step": 4050 + }, + { + "epoch": 0.9641220919854823, + "grad_norm": 0.35296914488364023, + "learning_rate": 5.533887571873725e-06, + "loss": 0.4018, + "step": 4051 + }, + { + "epoch": 0.964360088058547, + "grad_norm": 0.3588491551483761, + "learning_rate": 5.531970958448628e-06, + "loss": 0.3004, + "step": 4052 + }, + { + "epoch": 0.9645980841316119, + "grad_norm": 0.39575643827764756, + "learning_rate": 5.530054265959486e-06, + "loss": 0.2866, + "step": 4053 + }, + { + "epoch": 0.9648360802046766, + "grad_norm": 0.3466594386599281, + "learning_rate": 5.528137494691167e-06, + "loss": 0.3955, + "step": 4054 + }, + { + "epoch": 0.9650740762777414, + "grad_norm": 0.373285637369822, + "learning_rate": 5.52622064492855e-06, + "loss": 0.3484, + "step": 4055 + }, + { + "epoch": 0.9653120723508062, + "grad_norm": 0.42113681485363846, + "learning_rate": 5.524303716956528e-06, + "loss": 0.3046, + "step": 4056 + }, + { + "epoch": 0.965550068423871, + "grad_norm": 0.39214970328364873, + "learning_rate": 5.522386711060002e-06, + "loss": 0.3518, + "step": 4057 + }, + { + "epoch": 0.9657880644969358, + "grad_norm": 0.3636119541231596, + "learning_rate": 5.520469627523889e-06, + "loss": 0.3726, + "step": 4058 + }, + { + "epoch": 0.9660260605700006, + "grad_norm": 0.38531522777192306, + "learning_rate": 5.518552466633112e-06, + "loss": 0.3341, + "step": 4059 + }, + { + "epoch": 0.9662640566430654, + "grad_norm": 0.3689829393306755, + "learning_rate": 5.516635228672612e-06, + "loss": 0.3131, + "step": 4060 + }, + { + "epoch": 0.9665020527161302, + "grad_norm": 0.37131422956593985, + "learning_rate": 5.514717913927336e-06, + "loss": 0.3755, + "step": 4061 + }, + { + "epoch": 0.966740048789195, + "grad_norm": 0.36698419750198635, + "learning_rate": 5.5128005226822435e-06, + "loss": 0.4112, + "step": 4062 + }, + { + "epoch": 0.9669780448622598, + "grad_norm": 0.37767384948653765, + "learning_rate": 5.510883055222307e-06, + "loss": 0.3098, + "step": 4063 + }, + { + "epoch": 0.9672160409353245, + "grad_norm": 0.3831387096070846, + "learning_rate": 5.508965511832509e-06, + "loss": 0.3377, + "step": 4064 + }, + { + "epoch": 0.9674540370083894, + "grad_norm": 0.3413991536604678, + "learning_rate": 5.507047892797846e-06, + "loss": 0.3926, + "step": 4065 + }, + { + "epoch": 0.9676920330814541, + "grad_norm": 0.4405648042369594, + "learning_rate": 5.505130198403324e-06, + "loss": 0.3568, + "step": 4066 + }, + { + "epoch": 0.967930029154519, + "grad_norm": 0.36599437275948865, + "learning_rate": 5.503212428933956e-06, + "loss": 0.3054, + "step": 4067 + }, + { + "epoch": 0.9681680252275837, + "grad_norm": 0.4337574365319751, + "learning_rate": 5.501294584674771e-06, + "loss": 0.3363, + "step": 4068 + }, + { + "epoch": 0.9684060213006486, + "grad_norm": 0.3602978619539388, + "learning_rate": 5.499376665910812e-06, + "loss": 0.4053, + "step": 4069 + }, + { + "epoch": 0.9686440173737133, + "grad_norm": 0.3642524483127621, + "learning_rate": 5.497458672927124e-06, + "loss": 0.3262, + "step": 4070 + }, + { + "epoch": 0.9688820134467782, + "grad_norm": 0.3990749674732015, + "learning_rate": 5.495540606008772e-06, + "loss": 0.3322, + "step": 4071 + }, + { + "epoch": 0.9691200095198429, + "grad_norm": 0.3452572444028366, + "learning_rate": 5.493622465440828e-06, + "loss": 0.401, + "step": 4072 + }, + { + "epoch": 0.9693580055929077, + "grad_norm": 0.3947407561093372, + "learning_rate": 5.491704251508373e-06, + "loss": 0.3727, + "step": 4073 + }, + { + "epoch": 0.9695960016659725, + "grad_norm": 0.35872991795969333, + "learning_rate": 5.489785964496503e-06, + "loss": 0.2752, + "step": 4074 + }, + { + "epoch": 0.9698339977390373, + "grad_norm": 0.3810135107843619, + "learning_rate": 5.48786760469032e-06, + "loss": 0.3631, + "step": 4075 + }, + { + "epoch": 0.9700719938121021, + "grad_norm": 0.36736070326138315, + "learning_rate": 5.485949172374944e-06, + "loss": 0.4297, + "step": 4076 + }, + { + "epoch": 0.9703099898851669, + "grad_norm": 0.38917535599652914, + "learning_rate": 5.484030667835496e-06, + "loss": 0.3338, + "step": 4077 + }, + { + "epoch": 0.9705479859582317, + "grad_norm": 0.4162578135435146, + "learning_rate": 5.482112091357119e-06, + "loss": 0.3201, + "step": 4078 + }, + { + "epoch": 0.9707859820312965, + "grad_norm": 0.4446346807732351, + "learning_rate": 5.480193443224957e-06, + "loss": 0.3781, + "step": 4079 + }, + { + "epoch": 0.9710239781043613, + "grad_norm": 0.363773543353832, + "learning_rate": 5.478274723724172e-06, + "loss": 0.363, + "step": 4080 + }, + { + "epoch": 0.9712619741774261, + "grad_norm": 0.44939319983262227, + "learning_rate": 5.47635593313993e-06, + "loss": 0.3207, + "step": 4081 + }, + { + "epoch": 0.9714999702504908, + "grad_norm": 0.38983047666671894, + "learning_rate": 5.47443707175741e-06, + "loss": 0.3369, + "step": 4082 + }, + { + "epoch": 0.9717379663235557, + "grad_norm": 0.3609085790607123, + "learning_rate": 5.472518139861806e-06, + "loss": 0.4184, + "step": 4083 + }, + { + "epoch": 0.9719759623966204, + "grad_norm": 0.4326297185681936, + "learning_rate": 5.470599137738315e-06, + "loss": 0.3157, + "step": 4084 + }, + { + "epoch": 0.9722139584696853, + "grad_norm": 0.3863516117664841, + "learning_rate": 5.468680065672152e-06, + "loss": 0.3215, + "step": 4085 + }, + { + "epoch": 0.97245195454275, + "grad_norm": 0.37735302540928745, + "learning_rate": 5.466760923948536e-06, + "loss": 0.4003, + "step": 4086 + }, + { + "epoch": 0.9726899506158149, + "grad_norm": 0.3777977585862572, + "learning_rate": 5.464841712852701e-06, + "loss": 0.4035, + "step": 4087 + }, + { + "epoch": 0.9729279466888796, + "grad_norm": 0.3884798640154453, + "learning_rate": 5.462922432669886e-06, + "loss": 0.2875, + "step": 4088 + }, + { + "epoch": 0.9731659427619445, + "grad_norm": 0.3968066470866644, + "learning_rate": 5.461003083685346e-06, + "loss": 0.3509, + "step": 4089 + }, + { + "epoch": 0.9734039388350092, + "grad_norm": 0.37546658100029856, + "learning_rate": 5.459083666184344e-06, + "loss": 0.4117, + "step": 4090 + }, + { + "epoch": 0.973641934908074, + "grad_norm": 0.4033600486378232, + "learning_rate": 5.4571641804521505e-06, + "loss": 0.3023, + "step": 4091 + }, + { + "epoch": 0.9738799309811388, + "grad_norm": 0.366298156488569, + "learning_rate": 5.4552446267740515e-06, + "loss": 0.2914, + "step": 4092 + }, + { + "epoch": 0.9741179270542036, + "grad_norm": 0.3820600291897777, + "learning_rate": 5.453325005435337e-06, + "loss": 0.3516, + "step": 4093 + }, + { + "epoch": 0.9743559231272684, + "grad_norm": 0.3654555161312593, + "learning_rate": 5.451405316721313e-06, + "loss": 0.3877, + "step": 4094 + }, + { + "epoch": 0.9745939192003332, + "grad_norm": 0.40260654542095964, + "learning_rate": 5.449485560917291e-06, + "loss": 0.2995, + "step": 4095 + }, + { + "epoch": 0.974831915273398, + "grad_norm": 0.3968641036674078, + "learning_rate": 5.4475657383085955e-06, + "loss": 0.306, + "step": 4096 + }, + { + "epoch": 0.9750699113464628, + "grad_norm": 0.3903922997930381, + "learning_rate": 5.44564584918056e-06, + "loss": 0.3827, + "step": 4097 + }, + { + "epoch": 0.9753079074195276, + "grad_norm": 0.439867110372997, + "learning_rate": 5.443725893818524e-06, + "loss": 0.3308, + "step": 4098 + }, + { + "epoch": 0.9755459034925924, + "grad_norm": 0.4024543253971147, + "learning_rate": 5.441805872507846e-06, + "loss": 0.2899, + "step": 4099 + }, + { + "epoch": 0.9757838995656571, + "grad_norm": 0.3732267890164506, + "learning_rate": 5.439885785533884e-06, + "loss": 0.3557, + "step": 4100 + }, + { + "epoch": 0.976021895638722, + "grad_norm": 0.38784018489625477, + "learning_rate": 5.437965633182012e-06, + "loss": 0.3825, + "step": 4101 + }, + { + "epoch": 0.9762598917117867, + "grad_norm": 0.35874607034227307, + "learning_rate": 5.436045415737613e-06, + "loss": 0.2903, + "step": 4102 + }, + { + "epoch": 0.9764978877848516, + "grad_norm": 0.35448408385767743, + "learning_rate": 5.434125133486078e-06, + "loss": 0.3117, + "step": 4103 + }, + { + "epoch": 0.9767358838579163, + "grad_norm": 0.36167261584165866, + "learning_rate": 5.432204786712807e-06, + "loss": 0.3822, + "step": 4104 + }, + { + "epoch": 0.9769738799309812, + "grad_norm": 0.36236014204816985, + "learning_rate": 5.430284375703213e-06, + "loss": 0.3937, + "step": 4105 + }, + { + "epoch": 0.9772118760040459, + "grad_norm": 0.3615011221235271, + "learning_rate": 5.428363900742717e-06, + "loss": 0.2882, + "step": 4106 + }, + { + "epoch": 0.9774498720771108, + "grad_norm": 0.36358536744553127, + "learning_rate": 5.426443362116746e-06, + "loss": 0.3445, + "step": 4107 + }, + { + "epoch": 0.9776878681501755, + "grad_norm": 0.3499471450758627, + "learning_rate": 5.424522760110744e-06, + "loss": 0.3819, + "step": 4108 + }, + { + "epoch": 0.9779258642232403, + "grad_norm": 0.3783758065626236, + "learning_rate": 5.422602095010157e-06, + "loss": 0.3003, + "step": 4109 + }, + { + "epoch": 0.9781638602963051, + "grad_norm": 0.3485194689281019, + "learning_rate": 5.420681367100443e-06, + "loss": 0.3274, + "step": 4110 + }, + { + "epoch": 0.9784018563693699, + "grad_norm": 0.3560157044900479, + "learning_rate": 5.418760576667071e-06, + "loss": 0.3514, + "step": 4111 + }, + { + "epoch": 0.9786398524424347, + "grad_norm": 0.36557640443599765, + "learning_rate": 5.416839723995518e-06, + "loss": 0.3701, + "step": 4112 + }, + { + "epoch": 0.9788778485154995, + "grad_norm": 0.4127125165049302, + "learning_rate": 5.41491880937127e-06, + "loss": 0.3, + "step": 4113 + }, + { + "epoch": 0.9791158445885643, + "grad_norm": 0.3946777818047238, + "learning_rate": 5.4129978330798224e-06, + "loss": 0.3319, + "step": 4114 + }, + { + "epoch": 0.9793538406616291, + "grad_norm": 0.35875313580725443, + "learning_rate": 5.41107679540668e-06, + "loss": 0.3716, + "step": 4115 + }, + { + "epoch": 0.9795918367346939, + "grad_norm": 0.3583640218186905, + "learning_rate": 5.409155696637357e-06, + "loss": 0.3203, + "step": 4116 + }, + { + "epoch": 0.9798298328077587, + "grad_norm": 0.43393683642612224, + "learning_rate": 5.4072345370573745e-06, + "loss": 0.3299, + "step": 4117 + }, + { + "epoch": 0.9800678288808234, + "grad_norm": 0.3689652477197836, + "learning_rate": 5.405313316952265e-06, + "loss": 0.3645, + "step": 4118 + }, + { + "epoch": 0.9803058249538883, + "grad_norm": 0.38248656505142614, + "learning_rate": 5.403392036607568e-06, + "loss": 0.4105, + "step": 4119 + }, + { + "epoch": 0.980543821026953, + "grad_norm": 0.34515772392085886, + "learning_rate": 5.401470696308838e-06, + "loss": 0.3192, + "step": 4120 + }, + { + "epoch": 0.9807818171000179, + "grad_norm": 0.349261482848996, + "learning_rate": 5.399549296341629e-06, + "loss": 0.3257, + "step": 4121 + }, + { + "epoch": 0.9810198131730826, + "grad_norm": 0.3802296455540952, + "learning_rate": 5.39762783699151e-06, + "loss": 0.3968, + "step": 4122 + }, + { + "epoch": 0.9812578092461475, + "grad_norm": 0.3615234129840896, + "learning_rate": 5.395706318544059e-06, + "loss": 0.3298, + "step": 4123 + }, + { + "epoch": 0.9814958053192122, + "grad_norm": 0.4070437397173655, + "learning_rate": 5.393784741284858e-06, + "loss": 0.3196, + "step": 4124 + }, + { + "epoch": 0.9817338013922771, + "grad_norm": 0.3920234501678305, + "learning_rate": 5.391863105499505e-06, + "loss": 0.3159, + "step": 4125 + }, + { + "epoch": 0.9819717974653418, + "grad_norm": 0.3990009179740692, + "learning_rate": 5.389941411473598e-06, + "loss": 0.4174, + "step": 4126 + }, + { + "epoch": 0.9822097935384067, + "grad_norm": 0.41580376912089045, + "learning_rate": 5.3880196594927514e-06, + "loss": 0.2869, + "step": 4127 + }, + { + "epoch": 0.9824477896114714, + "grad_norm": 0.41224321601260766, + "learning_rate": 5.3860978498425845e-06, + "loss": 0.2956, + "step": 4128 + }, + { + "epoch": 0.9826857856845362, + "grad_norm": 0.37553862338123545, + "learning_rate": 5.384175982808726e-06, + "loss": 0.4063, + "step": 4129 + }, + { + "epoch": 0.982923781757601, + "grad_norm": 0.3883006441711229, + "learning_rate": 5.382254058676812e-06, + "loss": 0.3706, + "step": 4130 + }, + { + "epoch": 0.9831617778306658, + "grad_norm": 0.35991795831667905, + "learning_rate": 5.380332077732489e-06, + "loss": 0.2823, + "step": 4131 + }, + { + "epoch": 0.9833997739037306, + "grad_norm": 0.34116602859971285, + "learning_rate": 5.378410040261408e-06, + "loss": 0.3513, + "step": 4132 + }, + { + "epoch": 0.9836377699767954, + "grad_norm": 0.4029763010820559, + "learning_rate": 5.376487946549235e-06, + "loss": 0.3987, + "step": 4133 + }, + { + "epoch": 0.9838757660498602, + "grad_norm": 0.3868100898163792, + "learning_rate": 5.374565796881639e-06, + "loss": 0.2773, + "step": 4134 + }, + { + "epoch": 0.984113762122925, + "grad_norm": 0.3428976994977603, + "learning_rate": 5.3726435915442986e-06, + "loss": 0.2696, + "step": 4135 + }, + { + "epoch": 0.9843517581959897, + "grad_norm": 0.37266226267641545, + "learning_rate": 5.370721330822904e-06, + "loss": 0.3714, + "step": 4136 + }, + { + "epoch": 0.9845897542690546, + "grad_norm": 0.34598524004373593, + "learning_rate": 5.368799015003146e-06, + "loss": 0.384, + "step": 4137 + }, + { + "epoch": 0.9848277503421193, + "grad_norm": 0.4192229780499278, + "learning_rate": 5.366876644370733e-06, + "loss": 0.273, + "step": 4138 + }, + { + "epoch": 0.9850657464151842, + "grad_norm": 0.3486972025471823, + "learning_rate": 5.364954219211372e-06, + "loss": 0.337, + "step": 4139 + }, + { + "epoch": 0.9853037424882489, + "grad_norm": 0.36821440890479656, + "learning_rate": 5.363031739810787e-06, + "loss": 0.4031, + "step": 4140 + }, + { + "epoch": 0.9855417385613138, + "grad_norm": 0.39566913279734434, + "learning_rate": 5.361109206454704e-06, + "loss": 0.3342, + "step": 4141 + }, + { + "epoch": 0.9857797346343785, + "grad_norm": 0.4004101088241766, + "learning_rate": 5.359186619428861e-06, + "loss": 0.2989, + "step": 4142 + }, + { + "epoch": 0.9860177307074434, + "grad_norm": 0.41706306261687004, + "learning_rate": 5.357263979018999e-06, + "loss": 0.3715, + "step": 4143 + }, + { + "epoch": 0.9862557267805081, + "grad_norm": 0.38390634367368964, + "learning_rate": 5.355341285510872e-06, + "loss": 0.4016, + "step": 4144 + }, + { + "epoch": 0.986493722853573, + "grad_norm": 0.4119688642491026, + "learning_rate": 5.35341853919024e-06, + "loss": 0.3256, + "step": 4145 + }, + { + "epoch": 0.9867317189266377, + "grad_norm": 0.3966809599743582, + "learning_rate": 5.351495740342868e-06, + "loss": 0.3151, + "step": 4146 + }, + { + "epoch": 0.9869697149997025, + "grad_norm": 0.37458039305346175, + "learning_rate": 5.349572889254535e-06, + "loss": 0.3874, + "step": 4147 + }, + { + "epoch": 0.9872077110727673, + "grad_norm": 0.35624311527568975, + "learning_rate": 5.347649986211022e-06, + "loss": 0.3685, + "step": 4148 + }, + { + "epoch": 0.9874457071458321, + "grad_norm": 0.41927466293047233, + "learning_rate": 5.34572703149812e-06, + "loss": 0.3162, + "step": 4149 + }, + { + "epoch": 0.9876837032188969, + "grad_norm": 0.3880202059036109, + "learning_rate": 5.343804025401628e-06, + "loss": 0.3319, + "step": 4150 + }, + { + "epoch": 0.9879216992919617, + "grad_norm": 0.36850162612727255, + "learning_rate": 5.3418809682073546e-06, + "loss": 0.4004, + "step": 4151 + }, + { + "epoch": 0.9881596953650265, + "grad_norm": 0.3884277693954743, + "learning_rate": 5.339957860201111e-06, + "loss": 0.2997, + "step": 4152 + }, + { + "epoch": 0.9883976914380913, + "grad_norm": 0.3670588779035891, + "learning_rate": 5.338034701668717e-06, + "loss": 0.3034, + "step": 4153 + }, + { + "epoch": 0.988635687511156, + "grad_norm": 0.35848269208319117, + "learning_rate": 5.336111492896005e-06, + "loss": 0.3707, + "step": 4154 + }, + { + "epoch": 0.9888736835842209, + "grad_norm": 0.39501627370624287, + "learning_rate": 5.334188234168811e-06, + "loss": 0.3591, + "step": 4155 + }, + { + "epoch": 0.9891116796572856, + "grad_norm": 0.3924138317274099, + "learning_rate": 5.332264925772979e-06, + "loss": 0.3118, + "step": 4156 + }, + { + "epoch": 0.9893496757303505, + "grad_norm": 0.40183975512658543, + "learning_rate": 5.330341567994357e-06, + "loss": 0.3298, + "step": 4157 + }, + { + "epoch": 0.9895876718034152, + "grad_norm": 0.3911218716929348, + "learning_rate": 5.328418161118807e-06, + "loss": 0.3957, + "step": 4158 + }, + { + "epoch": 0.9898256678764801, + "grad_norm": 0.39644737968164145, + "learning_rate": 5.326494705432194e-06, + "loss": 0.3453, + "step": 4159 + }, + { + "epoch": 0.9900636639495448, + "grad_norm": 0.37820532138547525, + "learning_rate": 5.324571201220388e-06, + "loss": 0.303, + "step": 4160 + }, + { + "epoch": 0.9903016600226097, + "grad_norm": 0.3680837268102721, + "learning_rate": 5.322647648769275e-06, + "loss": 0.3685, + "step": 4161 + }, + { + "epoch": 0.9905396560956744, + "grad_norm": 0.398138838445303, + "learning_rate": 5.320724048364736e-06, + "loss": 0.3738, + "step": 4162 + }, + { + "epoch": 0.9907776521687393, + "grad_norm": 0.3627540114538959, + "learning_rate": 5.3188004002926715e-06, + "loss": 0.3166, + "step": 4163 + }, + { + "epoch": 0.991015648241804, + "grad_norm": 0.3694127802681357, + "learning_rate": 5.316876704838981e-06, + "loss": 0.3229, + "step": 4164 + }, + { + "epoch": 0.9912536443148688, + "grad_norm": 0.37299601489965684, + "learning_rate": 5.314952962289574e-06, + "loss": 0.3883, + "step": 4165 + }, + { + "epoch": 0.9914916403879336, + "grad_norm": 0.3572974513846591, + "learning_rate": 5.3130291729303625e-06, + "loss": 0.3368, + "step": 4166 + }, + { + "epoch": 0.9917296364609984, + "grad_norm": 0.3730121718719649, + "learning_rate": 5.311105337047273e-06, + "loss": 0.2987, + "step": 4167 + }, + { + "epoch": 0.9919676325340632, + "grad_norm": 0.34901771416498145, + "learning_rate": 5.3091814549262345e-06, + "loss": 0.3522, + "step": 4168 + }, + { + "epoch": 0.992205628607128, + "grad_norm": 0.3810404289861147, + "learning_rate": 5.3072575268531835e-06, + "loss": 0.404, + "step": 4169 + }, + { + "epoch": 0.9924436246801928, + "grad_norm": 0.35455196776033093, + "learning_rate": 5.305333553114061e-06, + "loss": 0.2767, + "step": 4170 + }, + { + "epoch": 0.9926816207532576, + "grad_norm": 0.3861290164430468, + "learning_rate": 5.303409533994821e-06, + "loss": 0.319, + "step": 4171 + }, + { + "epoch": 0.9929196168263223, + "grad_norm": 0.3494911589336109, + "learning_rate": 5.301485469781418e-06, + "loss": 0.3845, + "step": 4172 + }, + { + "epoch": 0.9931576128993872, + "grad_norm": 0.3522517568562557, + "learning_rate": 5.299561360759815e-06, + "loss": 0.3433, + "step": 4173 + }, + { + "epoch": 0.9933956089724519, + "grad_norm": 0.39675837098549627, + "learning_rate": 5.297637207215982e-06, + "loss": 0.3027, + "step": 4174 + }, + { + "epoch": 0.9936336050455168, + "grad_norm": 0.37785284486500015, + "learning_rate": 5.295713009435898e-06, + "loss": 0.3505, + "step": 4175 + }, + { + "epoch": 0.9938716011185815, + "grad_norm": 0.3903018829163007, + "learning_rate": 5.293788767705544e-06, + "loss": 0.4349, + "step": 4176 + }, + { + "epoch": 0.9941095971916464, + "grad_norm": 0.36388379671761967, + "learning_rate": 5.291864482310913e-06, + "loss": 0.3286, + "step": 4177 + }, + { + "epoch": 0.9943475932647111, + "grad_norm": 0.359114110954639, + "learning_rate": 5.289940153537999e-06, + "loss": 0.3227, + "step": 4178 + }, + { + "epoch": 0.994585589337776, + "grad_norm": 0.3408663433796706, + "learning_rate": 5.2880157816728055e-06, + "loss": 0.4037, + "step": 4179 + }, + { + "epoch": 0.9948235854108407, + "grad_norm": 0.3827699859436007, + "learning_rate": 5.2860913670013415e-06, + "loss": 0.363, + "step": 4180 + }, + { + "epoch": 0.9950615814839056, + "grad_norm": 0.34748568520852696, + "learning_rate": 5.2841669098096215e-06, + "loss": 0.2973, + "step": 4181 + }, + { + "epoch": 0.9952995775569703, + "grad_norm": 0.3573147481932218, + "learning_rate": 5.2822424103836715e-06, + "loss": 0.3185, + "step": 4182 + }, + { + "epoch": 0.9955375736300351, + "grad_norm": 0.35989079023901194, + "learning_rate": 5.280317869009514e-06, + "loss": 0.3781, + "step": 4183 + }, + { + "epoch": 0.9957755697030999, + "grad_norm": 0.43015819010371176, + "learning_rate": 5.278393285973189e-06, + "loss": 0.3283, + "step": 4184 + }, + { + "epoch": 0.9960135657761647, + "grad_norm": 0.36962744882859006, + "learning_rate": 5.276468661560733e-06, + "loss": 0.2961, + "step": 4185 + }, + { + "epoch": 0.9962515618492295, + "grad_norm": 0.351978630130564, + "learning_rate": 5.274543996058195e-06, + "loss": 0.3664, + "step": 4186 + }, + { + "epoch": 0.9964895579222943, + "grad_norm": 0.3936832601718924, + "learning_rate": 5.272619289751627e-06, + "loss": 0.4083, + "step": 4187 + }, + { + "epoch": 0.9967275539953591, + "grad_norm": 0.4103960278310374, + "learning_rate": 5.270694542927089e-06, + "loss": 0.3127, + "step": 4188 + }, + { + "epoch": 0.9969655500684239, + "grad_norm": 0.3889947229714806, + "learning_rate": 5.268769755870643e-06, + "loss": 0.3141, + "step": 4189 + }, + { + "epoch": 0.9972035461414887, + "grad_norm": 0.39726469757369814, + "learning_rate": 5.266844928868364e-06, + "loss": 0.3777, + "step": 4190 + }, + { + "epoch": 0.9974415422145535, + "grad_norm": 0.3797443444152874, + "learning_rate": 5.264920062206328e-06, + "loss": 0.3664, + "step": 4191 + }, + { + "epoch": 0.9976795382876182, + "grad_norm": 0.3731570350476907, + "learning_rate": 5.262995156170616e-06, + "loss": 0.3014, + "step": 4192 + }, + { + "epoch": 0.9979175343606831, + "grad_norm": 0.3635536818396404, + "learning_rate": 5.261070211047318e-06, + "loss": 0.335, + "step": 4193 + }, + { + "epoch": 0.9981555304337478, + "grad_norm": 0.37957844831809195, + "learning_rate": 5.2591452271225276e-06, + "loss": 0.385, + "step": 4194 + }, + { + "epoch": 0.9983935265068127, + "grad_norm": 0.4412946371353015, + "learning_rate": 5.2572202046823455e-06, + "loss": 0.333, + "step": 4195 + }, + { + "epoch": 0.9986315225798774, + "grad_norm": 0.3949114004805517, + "learning_rate": 5.255295144012877e-06, + "loss": 0.327, + "step": 4196 + }, + { + "epoch": 0.9988695186529423, + "grad_norm": 0.37599046677243475, + "learning_rate": 5.253370045400236e-06, + "loss": 0.4231, + "step": 4197 + }, + { + "epoch": 0.999107514726007, + "grad_norm": 0.36211875864414783, + "learning_rate": 5.2514449091305375e-06, + "loss": 0.3453, + "step": 4198 + }, + { + "epoch": 0.9993455107990719, + "grad_norm": 0.4080030072391759, + "learning_rate": 5.249519735489904e-06, + "loss": 0.2976, + "step": 4199 + }, + { + "epoch": 0.9995835068721366, + "grad_norm": 0.4151890843838898, + "learning_rate": 5.247594524764466e-06, + "loss": 0.3492, + "step": 4200 + }, + { + "epoch": 0.9998215029452014, + "grad_norm": 0.3667853249254919, + "learning_rate": 5.2456692772403565e-06, + "loss": 0.4018, + "step": 4201 + }, + { + "epoch": 1.0002379960730647, + "grad_norm": 0.34868915612115237, + "learning_rate": 5.243743993203715e-06, + "loss": 0.461, + "step": 4202 + }, + { + "epoch": 1.0004759921461295, + "grad_norm": 0.3543503974639351, + "learning_rate": 5.241818672940684e-06, + "loss": 0.3138, + "step": 4203 + }, + { + "epoch": 1.0007139882191944, + "grad_norm": 0.35172615371783905, + "learning_rate": 5.239893316737419e-06, + "loss": 0.3275, + "step": 4204 + }, + { + "epoch": 1.0009519842922592, + "grad_norm": 0.37691116884826764, + "learning_rate": 5.23796792488007e-06, + "loss": 0.3566, + "step": 4205 + }, + { + "epoch": 1.001189980365324, + "grad_norm": 0.3409709908376464, + "learning_rate": 5.236042497654802e-06, + "loss": 0.3944, + "step": 4206 + }, + { + "epoch": 1.0014279764383887, + "grad_norm": 0.36845210293609737, + "learning_rate": 5.2341170353477795e-06, + "loss": 0.2646, + "step": 4207 + }, + { + "epoch": 1.0016659725114536, + "grad_norm": 0.359375740170696, + "learning_rate": 5.232191538245173e-06, + "loss": 0.319, + "step": 4208 + }, + { + "epoch": 1.0019039685845184, + "grad_norm": 0.3920628761903601, + "learning_rate": 5.23026600663316e-06, + "loss": 0.3928, + "step": 4209 + }, + { + "epoch": 1.002141964657583, + "grad_norm": 0.41390620119344756, + "learning_rate": 5.2283404407979225e-06, + "loss": 0.3013, + "step": 4210 + }, + { + "epoch": 1.0023799607306478, + "grad_norm": 0.4220811528276169, + "learning_rate": 5.226414841025645e-06, + "loss": 0.2726, + "step": 4211 + }, + { + "epoch": 1.0026179568037128, + "grad_norm": 0.41557061395608813, + "learning_rate": 5.2244892076025225e-06, + "loss": 0.365, + "step": 4212 + }, + { + "epoch": 1.0028559528767775, + "grad_norm": 0.40671365818027116, + "learning_rate": 5.22256354081475e-06, + "loss": 0.3638, + "step": 4213 + }, + { + "epoch": 1.0030939489498423, + "grad_norm": 0.38827259080751864, + "learning_rate": 5.220637840948528e-06, + "loss": 0.3195, + "step": 4214 + }, + { + "epoch": 1.003331945022907, + "grad_norm": 0.46007325776629265, + "learning_rate": 5.218712108290065e-06, + "loss": 0.3423, + "step": 4215 + }, + { + "epoch": 1.003569941095972, + "grad_norm": 0.3927299977513199, + "learning_rate": 5.216786343125572e-06, + "loss": 0.3815, + "step": 4216 + }, + { + "epoch": 1.0038079371690367, + "grad_norm": 0.3777948807064563, + "learning_rate": 5.214860545741266e-06, + "loss": 0.3139, + "step": 4217 + }, + { + "epoch": 1.0040459332421015, + "grad_norm": 0.4096489630411397, + "learning_rate": 5.212934716423368e-06, + "loss": 0.2923, + "step": 4218 + }, + { + "epoch": 1.0042839293151662, + "grad_norm": 0.4428443016282393, + "learning_rate": 5.211008855458103e-06, + "loss": 0.3226, + "step": 4219 + }, + { + "epoch": 1.0045219253882312, + "grad_norm": 0.3548101722105468, + "learning_rate": 5.209082963131702e-06, + "loss": 0.3717, + "step": 4220 + }, + { + "epoch": 1.004759921461296, + "grad_norm": 0.3661098943341066, + "learning_rate": 5.2071570397303995e-06, + "loss": 0.3291, + "step": 4221 + }, + { + "epoch": 1.0049979175343606, + "grad_norm": 0.39846837704199717, + "learning_rate": 5.2052310855404356e-06, + "loss": 0.2979, + "step": 4222 + }, + { + "epoch": 1.0052359136074254, + "grad_norm": 0.44041097154948994, + "learning_rate": 5.203305100848056e-06, + "loss": 0.3587, + "step": 4223 + }, + { + "epoch": 1.0054739096804903, + "grad_norm": 0.3696154903241395, + "learning_rate": 5.20137908593951e-06, + "loss": 0.357, + "step": 4224 + }, + { + "epoch": 1.005711905753555, + "grad_norm": 0.3589543414638921, + "learning_rate": 5.1994530411010495e-06, + "loss": 0.3017, + "step": 4225 + }, + { + "epoch": 1.0059499018266198, + "grad_norm": 0.4534518628656481, + "learning_rate": 5.1975269666189325e-06, + "loss": 0.3259, + "step": 4226 + }, + { + "epoch": 1.0061878978996845, + "grad_norm": 0.37753721010737906, + "learning_rate": 5.195600862779421e-06, + "loss": 0.3984, + "step": 4227 + }, + { + "epoch": 1.0064258939727495, + "grad_norm": 0.35907800320760047, + "learning_rate": 5.193674729868781e-06, + "loss": 0.3162, + "step": 4228 + }, + { + "epoch": 1.0066638900458142, + "grad_norm": 0.38681209195212035, + "learning_rate": 5.191748568173288e-06, + "loss": 0.3052, + "step": 4229 + }, + { + "epoch": 1.006901886118879, + "grad_norm": 0.38975003350282217, + "learning_rate": 5.1898223779792125e-06, + "loss": 0.369, + "step": 4230 + }, + { + "epoch": 1.0071398821919437, + "grad_norm": 0.37451302101771794, + "learning_rate": 5.187896159572836e-06, + "loss": 0.3867, + "step": 4231 + }, + { + "epoch": 1.0073778782650087, + "grad_norm": 0.3746202575214494, + "learning_rate": 5.185969913240442e-06, + "loss": 0.3123, + "step": 4232 + }, + { + "epoch": 1.0076158743380734, + "grad_norm": 0.3399200166882226, + "learning_rate": 5.184043639268318e-06, + "loss": 0.3423, + "step": 4233 + }, + { + "epoch": 1.0078538704111382, + "grad_norm": 0.37484361667395105, + "learning_rate": 5.1821173379427566e-06, + "loss": 0.3874, + "step": 4234 + }, + { + "epoch": 1.008091866484203, + "grad_norm": 0.38264504872383276, + "learning_rate": 5.180191009550053e-06, + "loss": 0.3665, + "step": 4235 + }, + { + "epoch": 1.0083298625572679, + "grad_norm": 0.3905871905210143, + "learning_rate": 5.178264654376507e-06, + "loss": 0.3112, + "step": 4236 + }, + { + "epoch": 1.0085678586303326, + "grad_norm": 0.39456565680861116, + "learning_rate": 5.176338272708424e-06, + "loss": 0.3265, + "step": 4237 + }, + { + "epoch": 1.0088058547033973, + "grad_norm": 0.36221965002648904, + "learning_rate": 5.174411864832111e-06, + "loss": 0.3884, + "step": 4238 + }, + { + "epoch": 1.009043850776462, + "grad_norm": 0.40646971651183245, + "learning_rate": 5.172485431033882e-06, + "loss": 0.2814, + "step": 4239 + }, + { + "epoch": 1.009281846849527, + "grad_norm": 0.40606667706481386, + "learning_rate": 5.17055897160005e-06, + "loss": 0.295, + "step": 4240 + }, + { + "epoch": 1.0095198429225918, + "grad_norm": 0.4187704484376356, + "learning_rate": 5.168632486816934e-06, + "loss": 0.3786, + "step": 4241 + }, + { + "epoch": 1.0097578389956565, + "grad_norm": 0.38137336025761603, + "learning_rate": 5.1667059769708595e-06, + "loss": 0.3093, + "step": 4242 + }, + { + "epoch": 1.0099958350687213, + "grad_norm": 0.4262476491330625, + "learning_rate": 5.1647794423481516e-06, + "loss": 0.2841, + "step": 4243 + }, + { + "epoch": 1.0102338311417862, + "grad_norm": 0.42194011631004263, + "learning_rate": 5.1628528832351436e-06, + "loss": 0.3438, + "step": 4244 + }, + { + "epoch": 1.010471827214851, + "grad_norm": 0.37173229226099785, + "learning_rate": 5.160926299918167e-06, + "loss": 0.3863, + "step": 4245 + }, + { + "epoch": 1.0107098232879157, + "grad_norm": 0.3611360955387892, + "learning_rate": 5.158999692683563e-06, + "loss": 0.2911, + "step": 4246 + }, + { + "epoch": 1.0109478193609804, + "grad_norm": 0.37976056901327965, + "learning_rate": 5.15707306181767e-06, + "loss": 0.3135, + "step": 4247 + }, + { + "epoch": 1.0111858154340454, + "grad_norm": 0.40048201279174483, + "learning_rate": 5.155146407606835e-06, + "loss": 0.3757, + "step": 4248 + }, + { + "epoch": 1.0114238115071101, + "grad_norm": 0.36075054638382786, + "learning_rate": 5.153219730337406e-06, + "loss": 0.3147, + "step": 4249 + }, + { + "epoch": 1.0116618075801749, + "grad_norm": 0.3679232730274379, + "learning_rate": 5.151293030295732e-06, + "loss": 0.2865, + "step": 4250 + }, + { + "epoch": 1.0118998036532396, + "grad_norm": 0.3581758351944555, + "learning_rate": 5.149366307768173e-06, + "loss": 0.3218, + "step": 4251 + }, + { + "epoch": 1.0121377997263046, + "grad_norm": 0.3823224717214005, + "learning_rate": 5.147439563041086e-06, + "loss": 0.3987, + "step": 4252 + }, + { + "epoch": 1.0123757957993693, + "grad_norm": 0.33371251114397465, + "learning_rate": 5.145512796400831e-06, + "loss": 0.2999, + "step": 4253 + }, + { + "epoch": 1.012613791872434, + "grad_norm": 0.3823340185719183, + "learning_rate": 5.143586008133776e-06, + "loss": 0.2941, + "step": 4254 + }, + { + "epoch": 1.0128517879454988, + "grad_norm": 0.382504369490985, + "learning_rate": 5.141659198526287e-06, + "loss": 0.3752, + "step": 4255 + }, + { + "epoch": 1.0130897840185638, + "grad_norm": 0.3814481433528493, + "learning_rate": 5.139732367864736e-06, + "loss": 0.3442, + "step": 4256 + }, + { + "epoch": 1.0133277800916285, + "grad_norm": 0.3964801873649591, + "learning_rate": 5.137805516435499e-06, + "loss": 0.2744, + "step": 4257 + }, + { + "epoch": 1.0135657761646932, + "grad_norm": 0.35135187470588386, + "learning_rate": 5.135878644524953e-06, + "loss": 0.3428, + "step": 4258 + }, + { + "epoch": 1.013803772237758, + "grad_norm": 0.43647825401352464, + "learning_rate": 5.13395175241948e-06, + "loss": 0.4011, + "step": 4259 + }, + { + "epoch": 1.014041768310823, + "grad_norm": 0.48740933719008334, + "learning_rate": 5.132024840405462e-06, + "loss": 0.3116, + "step": 4260 + }, + { + "epoch": 1.0142797643838877, + "grad_norm": 0.3754855598282161, + "learning_rate": 5.130097908769287e-06, + "loss": 0.2721, + "step": 4261 + }, + { + "epoch": 1.0145177604569524, + "grad_norm": 0.37968335966178834, + "learning_rate": 5.128170957797345e-06, + "loss": 0.3164, + "step": 4262 + }, + { + "epoch": 1.0147557565300172, + "grad_norm": 0.38142166044175096, + "learning_rate": 5.126243987776026e-06, + "loss": 0.3873, + "step": 4263 + }, + { + "epoch": 1.0149937526030821, + "grad_norm": 0.374445975748934, + "learning_rate": 5.124316998991728e-06, + "loss": 0.2786, + "step": 4264 + }, + { + "epoch": 1.0152317486761469, + "grad_norm": 0.42403267311698933, + "learning_rate": 5.122389991730848e-06, + "loss": 0.3162, + "step": 4265 + }, + { + "epoch": 1.0154697447492116, + "grad_norm": 0.3964607797578093, + "learning_rate": 5.120462966279789e-06, + "loss": 0.3996, + "step": 4266 + }, + { + "epoch": 1.0157077408222763, + "grad_norm": 0.363101926710214, + "learning_rate": 5.118535922924952e-06, + "loss": 0.3341, + "step": 4267 + }, + { + "epoch": 1.0159457368953413, + "grad_norm": 0.3751042986845883, + "learning_rate": 5.1166088619527445e-06, + "loss": 0.2738, + "step": 4268 + }, + { + "epoch": 1.016183732968406, + "grad_norm": 0.36892262345896815, + "learning_rate": 5.114681783649575e-06, + "loss": 0.3194, + "step": 4269 + }, + { + "epoch": 1.0164217290414708, + "grad_norm": 0.34748547029304633, + "learning_rate": 5.112754688301855e-06, + "loss": 0.3562, + "step": 4270 + }, + { + "epoch": 1.0166597251145355, + "grad_norm": 0.38763196396366484, + "learning_rate": 5.110827576196e-06, + "loss": 0.307, + "step": 4271 + }, + { + "epoch": 1.0168977211876005, + "grad_norm": 0.37714618148409906, + "learning_rate": 5.1089004476184255e-06, + "loss": 0.3069, + "step": 4272 + }, + { + "epoch": 1.0171357172606652, + "grad_norm": 0.3771535074762055, + "learning_rate": 5.10697330285555e-06, + "loss": 0.3587, + "step": 4273 + }, + { + "epoch": 1.01737371333373, + "grad_norm": 0.3691570859798567, + "learning_rate": 5.105046142193796e-06, + "loss": 0.3495, + "step": 4274 + }, + { + "epoch": 1.0176117094067947, + "grad_norm": 0.4171393197587053, + "learning_rate": 5.103118965919586e-06, + "loss": 0.2785, + "step": 4275 + }, + { + "epoch": 1.0178497054798596, + "grad_norm": 0.3718279478108766, + "learning_rate": 5.101191774319346e-06, + "loss": 0.3278, + "step": 4276 + }, + { + "epoch": 1.0180877015529244, + "grad_norm": 0.3762497178178657, + "learning_rate": 5.099264567679505e-06, + "loss": 0.3814, + "step": 4277 + }, + { + "epoch": 1.0183256976259891, + "grad_norm": 0.3925916540388656, + "learning_rate": 5.097337346286494e-06, + "loss": 0.3096, + "step": 4278 + }, + { + "epoch": 1.0185636936990539, + "grad_norm": 0.4026406915827254, + "learning_rate": 5.095410110426746e-06, + "loss": 0.2751, + "step": 4279 + }, + { + "epoch": 1.0188016897721188, + "grad_norm": 0.3737121034084029, + "learning_rate": 5.093482860386695e-06, + "loss": 0.3442, + "step": 4280 + }, + { + "epoch": 1.0190396858451836, + "grad_norm": 0.3662525796650136, + "learning_rate": 5.091555596452777e-06, + "loss": 0.3694, + "step": 4281 + }, + { + "epoch": 1.0192776819182483, + "grad_norm": 0.37406892875546127, + "learning_rate": 5.089628318911434e-06, + "loss": 0.2785, + "step": 4282 + }, + { + "epoch": 1.019515677991313, + "grad_norm": 0.362735438961647, + "learning_rate": 5.0877010280491045e-06, + "loss": 0.3289, + "step": 4283 + }, + { + "epoch": 1.019753674064378, + "grad_norm": 0.3737293250336674, + "learning_rate": 5.085773724152232e-06, + "loss": 0.3725, + "step": 4284 + }, + { + "epoch": 1.0199916701374427, + "grad_norm": 0.37431985674627993, + "learning_rate": 5.083846407507263e-06, + "loss": 0.3119, + "step": 4285 + }, + { + "epoch": 1.0202296662105075, + "grad_norm": 0.39281524554349173, + "learning_rate": 5.0819190784006444e-06, + "loss": 0.3318, + "step": 4286 + }, + { + "epoch": 1.0204676622835722, + "grad_norm": 0.4178241632215565, + "learning_rate": 5.079991737118823e-06, + "loss": 0.3592, + "step": 4287 + }, + { + "epoch": 1.0207056583566372, + "grad_norm": 0.37007927511322464, + "learning_rate": 5.0780643839482515e-06, + "loss": 0.3696, + "step": 4288 + }, + { + "epoch": 1.020943654429702, + "grad_norm": 0.393635301072028, + "learning_rate": 5.076137019175381e-06, + "loss": 0.2891, + "step": 4289 + }, + { + "epoch": 1.0211816505027667, + "grad_norm": 0.37703876121952784, + "learning_rate": 5.074209643086666e-06, + "loss": 0.33, + "step": 4290 + }, + { + "epoch": 1.0214196465758314, + "grad_norm": 0.42049817515097926, + "learning_rate": 5.072282255968561e-06, + "loss": 0.3764, + "step": 4291 + }, + { + "epoch": 1.0216576426488964, + "grad_norm": 0.38019207344494804, + "learning_rate": 5.070354858107526e-06, + "loss": 0.31, + "step": 4292 + }, + { + "epoch": 1.021895638721961, + "grad_norm": 0.4040481113943476, + "learning_rate": 5.068427449790019e-06, + "loss": 0.2868, + "step": 4293 + }, + { + "epoch": 1.0221336347950258, + "grad_norm": 0.370686703144065, + "learning_rate": 5.0665000313024995e-06, + "loss": 0.3369, + "step": 4294 + }, + { + "epoch": 1.0223716308680906, + "grad_norm": 0.39346612528862, + "learning_rate": 5.064572602931428e-06, + "loss": 0.3823, + "step": 4295 + }, + { + "epoch": 1.0226096269411555, + "grad_norm": 0.3717514479143731, + "learning_rate": 5.0626451649632725e-06, + "loss": 0.3087, + "step": 4296 + }, + { + "epoch": 1.0228476230142203, + "grad_norm": 0.4092985098023293, + "learning_rate": 5.060717717684496e-06, + "loss": 0.3115, + "step": 4297 + }, + { + "epoch": 1.023085619087285, + "grad_norm": 0.3934618504996616, + "learning_rate": 5.058790261381563e-06, + "loss": 0.3801, + "step": 4298 + }, + { + "epoch": 1.0233236151603498, + "grad_norm": 0.3492191091572103, + "learning_rate": 5.056862796340944e-06, + "loss": 0.3489, + "step": 4299 + }, + { + "epoch": 1.0235616112334147, + "grad_norm": 0.4051063697173622, + "learning_rate": 5.054935322849107e-06, + "loss": 0.2997, + "step": 4300 + }, + { + "epoch": 1.0237996073064795, + "grad_norm": 0.3469654129695318, + "learning_rate": 5.053007841192522e-06, + "loss": 0.3036, + "step": 4301 + }, + { + "epoch": 1.0240376033795442, + "grad_norm": 0.38270686628936906, + "learning_rate": 5.05108035165766e-06, + "loss": 0.3964, + "step": 4302 + }, + { + "epoch": 1.024275599452609, + "grad_norm": 0.36001024577676244, + "learning_rate": 5.049152854530994e-06, + "loss": 0.2792, + "step": 4303 + }, + { + "epoch": 1.024513595525674, + "grad_norm": 0.37961661632016525, + "learning_rate": 5.047225350098999e-06, + "loss": 0.3038, + "step": 4304 + }, + { + "epoch": 1.0247515915987386, + "grad_norm": 0.3494836771560282, + "learning_rate": 5.045297838648145e-06, + "loss": 0.3919, + "step": 4305 + }, + { + "epoch": 1.0249895876718034, + "grad_norm": 0.36305044333379766, + "learning_rate": 5.043370320464915e-06, + "loss": 0.3636, + "step": 4306 + }, + { + "epoch": 1.025227583744868, + "grad_norm": 0.4258357687705437, + "learning_rate": 5.041442795835783e-06, + "loss": 0.2803, + "step": 4307 + }, + { + "epoch": 1.025465579817933, + "grad_norm": 0.4160868010268067, + "learning_rate": 5.039515265047224e-06, + "loss": 0.3107, + "step": 4308 + }, + { + "epoch": 1.0257035758909978, + "grad_norm": 0.4342418241767286, + "learning_rate": 5.037587728385719e-06, + "loss": 0.4278, + "step": 4309 + }, + { + "epoch": 1.0259415719640625, + "grad_norm": 0.37103949650763524, + "learning_rate": 5.035660186137749e-06, + "loss": 0.3015, + "step": 4310 + }, + { + "epoch": 1.0261795680371273, + "grad_norm": 0.38074037486983087, + "learning_rate": 5.033732638589793e-06, + "loss": 0.2781, + "step": 4311 + }, + { + "epoch": 1.0264175641101922, + "grad_norm": 0.4189038925958272, + "learning_rate": 5.0318050860283306e-06, + "loss": 0.3664, + "step": 4312 + }, + { + "epoch": 1.026655560183257, + "grad_norm": 0.3951890393473658, + "learning_rate": 5.029877528739848e-06, + "loss": 0.3446, + "step": 4313 + }, + { + "epoch": 1.0268935562563217, + "grad_norm": 0.3983002055289879, + "learning_rate": 5.0279499670108245e-06, + "loss": 0.3065, + "step": 4314 + }, + { + "epoch": 1.0271315523293865, + "grad_norm": 0.39958104133807376, + "learning_rate": 5.0260224011277445e-06, + "loss": 0.3262, + "step": 4315 + }, + { + "epoch": 1.0273695484024514, + "grad_norm": 0.4123932451298326, + "learning_rate": 5.0240948313770934e-06, + "loss": 0.3763, + "step": 4316 + }, + { + "epoch": 1.0276075444755162, + "grad_norm": 0.39306243304232114, + "learning_rate": 5.022167258045353e-06, + "loss": 0.3214, + "step": 4317 + }, + { + "epoch": 1.027845540548581, + "grad_norm": 0.3761676443591786, + "learning_rate": 5.0202396814190095e-06, + "loss": 0.2848, + "step": 4318 + }, + { + "epoch": 1.0280835366216456, + "grad_norm": 0.38223410956180287, + "learning_rate": 5.018312101784548e-06, + "loss": 0.3257, + "step": 4319 + }, + { + "epoch": 1.0283215326947106, + "grad_norm": 0.3597507159599325, + "learning_rate": 5.016384519428456e-06, + "loss": 0.3752, + "step": 4320 + }, + { + "epoch": 1.0285595287677753, + "grad_norm": 0.328025805606429, + "learning_rate": 5.0144569346372185e-06, + "loss": 0.2937, + "step": 4321 + }, + { + "epoch": 1.02879752484084, + "grad_norm": 0.4660913256009466, + "learning_rate": 5.012529347697322e-06, + "loss": 0.3239, + "step": 4322 + }, + { + "epoch": 1.0290355209139048, + "grad_norm": 0.383898516473306, + "learning_rate": 5.010601758895257e-06, + "loss": 0.3778, + "step": 4323 + }, + { + "epoch": 1.0292735169869698, + "grad_norm": 0.36767695340931256, + "learning_rate": 5.008674168517507e-06, + "loss": 0.3279, + "step": 4324 + }, + { + "epoch": 1.0295115130600345, + "grad_norm": 0.37694685767170544, + "learning_rate": 5.006746576850562e-06, + "loss": 0.3108, + "step": 4325 + }, + { + "epoch": 1.0297495091330993, + "grad_norm": 0.37325893976856545, + "learning_rate": 5.004818984180907e-06, + "loss": 0.3387, + "step": 4326 + }, + { + "epoch": 1.029987505206164, + "grad_norm": 0.38537315515694565, + "learning_rate": 5.002891390795033e-06, + "loss": 0.4258, + "step": 4327 + }, + { + "epoch": 1.030225501279229, + "grad_norm": 0.3589773496139594, + "learning_rate": 5.0009637969794255e-06, + "loss": 0.2959, + "step": 4328 + }, + { + "epoch": 1.0304634973522937, + "grad_norm": 0.37090708026971275, + "learning_rate": 4.9990362030205745e-06, + "loss": 0.2818, + "step": 4329 + }, + { + "epoch": 1.0307014934253584, + "grad_norm": 0.3798707951977427, + "learning_rate": 4.997108609204968e-06, + "loss": 0.3882, + "step": 4330 + }, + { + "epoch": 1.0309394894984232, + "grad_norm": 0.3786611077695848, + "learning_rate": 4.995181015819094e-06, + "loss": 0.3903, + "step": 4331 + }, + { + "epoch": 1.0311774855714881, + "grad_norm": 0.37672618523404194, + "learning_rate": 4.99325342314944e-06, + "loss": 0.3122, + "step": 4332 + }, + { + "epoch": 1.0314154816445529, + "grad_norm": 0.391170036332283, + "learning_rate": 4.991325831482494e-06, + "loss": 0.3309, + "step": 4333 + }, + { + "epoch": 1.0316534777176176, + "grad_norm": 0.4742272257387251, + "learning_rate": 4.989398241104745e-06, + "loss": 0.3744, + "step": 4334 + }, + { + "epoch": 1.0318914737906824, + "grad_norm": 0.381528588265086, + "learning_rate": 4.987470652302679e-06, + "loss": 0.3236, + "step": 4335 + }, + { + "epoch": 1.0321294698637473, + "grad_norm": 0.3909739709474945, + "learning_rate": 4.985543065362782e-06, + "loss": 0.3005, + "step": 4336 + }, + { + "epoch": 1.032367465936812, + "grad_norm": 0.4030798587204337, + "learning_rate": 4.983615480571546e-06, + "loss": 0.3292, + "step": 4337 + }, + { + "epoch": 1.0326054620098768, + "grad_norm": 0.39016102552178, + "learning_rate": 4.981687898215454e-06, + "loss": 0.3811, + "step": 4338 + }, + { + "epoch": 1.0328434580829415, + "grad_norm": 0.35718795349461896, + "learning_rate": 4.979760318580993e-06, + "loss": 0.297, + "step": 4339 + }, + { + "epoch": 1.0330814541560065, + "grad_norm": 1.5662065706280457, + "learning_rate": 4.97783274195465e-06, + "loss": 0.3122, + "step": 4340 + }, + { + "epoch": 1.0333194502290712, + "grad_norm": 0.4031583976642777, + "learning_rate": 4.97590516862291e-06, + "loss": 0.3742, + "step": 4341 + }, + { + "epoch": 1.033557446302136, + "grad_norm": 0.3666936736763186, + "learning_rate": 4.973977598872257e-06, + "loss": 0.3361, + "step": 4342 + }, + { + "epoch": 1.0337954423752007, + "grad_norm": 0.3870156161445879, + "learning_rate": 4.9720500329891755e-06, + "loss": 0.2907, + "step": 4343 + }, + { + "epoch": 1.0340334384482657, + "grad_norm": 0.3943973816109862, + "learning_rate": 4.9701224712601526e-06, + "loss": 0.3316, + "step": 4344 + }, + { + "epoch": 1.0342714345213304, + "grad_norm": 0.37045738296936315, + "learning_rate": 4.9681949139716686e-06, + "loss": 0.3745, + "step": 4345 + }, + { + "epoch": 1.0345094305943952, + "grad_norm": 0.3526903041685496, + "learning_rate": 4.966267361410209e-06, + "loss": 0.2671, + "step": 4346 + }, + { + "epoch": 1.03474742666746, + "grad_norm": 0.4097067620359685, + "learning_rate": 4.964339813862252e-06, + "loss": 0.3019, + "step": 4347 + }, + { + "epoch": 1.0349854227405249, + "grad_norm": 0.3907934312556052, + "learning_rate": 4.962412271614282e-06, + "loss": 0.3721, + "step": 4348 + }, + { + "epoch": 1.0352234188135896, + "grad_norm": 0.39946066906031263, + "learning_rate": 4.9604847349527775e-06, + "loss": 0.3449, + "step": 4349 + }, + { + "epoch": 1.0354614148866543, + "grad_norm": 0.35694279142285595, + "learning_rate": 4.958557204164219e-06, + "loss": 0.2945, + "step": 4350 + }, + { + "epoch": 1.035699410959719, + "grad_norm": 0.36264389505281575, + "learning_rate": 4.956629679535086e-06, + "loss": 0.3114, + "step": 4351 + }, + { + "epoch": 1.035937407032784, + "grad_norm": 0.3837297046950812, + "learning_rate": 4.954702161351856e-06, + "loss": 0.398, + "step": 4352 + }, + { + "epoch": 1.0361754031058488, + "grad_norm": 0.3723581905698087, + "learning_rate": 4.952774649901004e-06, + "loss": 0.3005, + "step": 4353 + }, + { + "epoch": 1.0364133991789135, + "grad_norm": 0.3668387139380813, + "learning_rate": 4.950847145469008e-06, + "loss": 0.3008, + "step": 4354 + }, + { + "epoch": 1.0366513952519782, + "grad_norm": 0.36162042038397063, + "learning_rate": 4.948919648342342e-06, + "loss": 0.3702, + "step": 4355 + }, + { + "epoch": 1.0368893913250432, + "grad_norm": 0.3603460188850127, + "learning_rate": 4.946992158807481e-06, + "loss": 0.3833, + "step": 4356 + }, + { + "epoch": 1.037127387398108, + "grad_norm": 0.3744254359291755, + "learning_rate": 4.945064677150893e-06, + "loss": 0.281, + "step": 4357 + }, + { + "epoch": 1.0373653834711727, + "grad_norm": 0.3842915853972375, + "learning_rate": 4.943137203659056e-06, + "loss": 0.346, + "step": 4358 + }, + { + "epoch": 1.0376033795442374, + "grad_norm": 0.3822698279560983, + "learning_rate": 4.941209738618437e-06, + "loss": 0.4112, + "step": 4359 + }, + { + "epoch": 1.0378413756173024, + "grad_norm": 0.40057285679058796, + "learning_rate": 4.939282282315505e-06, + "loss": 0.2888, + "step": 4360 + }, + { + "epoch": 1.0380793716903671, + "grad_norm": 0.382452358107358, + "learning_rate": 4.937354835036728e-06, + "loss": 0.2905, + "step": 4361 + }, + { + "epoch": 1.0383173677634319, + "grad_norm": 0.3850074270426723, + "learning_rate": 4.935427397068573e-06, + "loss": 0.3644, + "step": 4362 + }, + { + "epoch": 1.0385553638364966, + "grad_norm": 0.34238196519140435, + "learning_rate": 4.933499968697503e-06, + "loss": 0.3405, + "step": 4363 + }, + { + "epoch": 1.0387933599095616, + "grad_norm": 0.36795318780076897, + "learning_rate": 4.931572550209983e-06, + "loss": 0.2819, + "step": 4364 + }, + { + "epoch": 1.0390313559826263, + "grad_norm": 0.3974431339386504, + "learning_rate": 4.929645141892475e-06, + "loss": 0.3485, + "step": 4365 + }, + { + "epoch": 1.039269352055691, + "grad_norm": 0.3861629647354368, + "learning_rate": 4.92771774403144e-06, + "loss": 0.4002, + "step": 4366 + }, + { + "epoch": 1.0395073481287558, + "grad_norm": 0.3874473883648921, + "learning_rate": 4.925790356913337e-06, + "loss": 0.3324, + "step": 4367 + }, + { + "epoch": 1.0397453442018207, + "grad_norm": 0.39344198259301555, + "learning_rate": 4.923862980824622e-06, + "loss": 0.2779, + "step": 4368 + }, + { + "epoch": 1.0399833402748855, + "grad_norm": 0.4195324906692714, + "learning_rate": 4.921935616051751e-06, + "loss": 0.3325, + "step": 4369 + }, + { + "epoch": 1.0402213363479502, + "grad_norm": 0.3908083748851083, + "learning_rate": 4.920008262881177e-06, + "loss": 0.3987, + "step": 4370 + }, + { + "epoch": 1.040459332421015, + "grad_norm": 0.3951373300669574, + "learning_rate": 4.918080921599356e-06, + "loss": 0.2903, + "step": 4371 + }, + { + "epoch": 1.04069732849408, + "grad_norm": 0.40344550596748047, + "learning_rate": 4.9161535924927375e-06, + "loss": 0.3174, + "step": 4372 + }, + { + "epoch": 1.0409353245671447, + "grad_norm": 0.3856931317543428, + "learning_rate": 4.914226275847768e-06, + "loss": 0.3455, + "step": 4373 + }, + { + "epoch": 1.0411733206402094, + "grad_norm": 0.36331854787145207, + "learning_rate": 4.912298971950897e-06, + "loss": 0.3507, + "step": 4374 + }, + { + "epoch": 1.0414113167132741, + "grad_norm": 0.370895311159966, + "learning_rate": 4.910371681088568e-06, + "loss": 0.3087, + "step": 4375 + }, + { + "epoch": 1.041649312786339, + "grad_norm": 0.3771524826161335, + "learning_rate": 4.908444403547224e-06, + "loss": 0.3444, + "step": 4376 + }, + { + "epoch": 1.0418873088594038, + "grad_norm": 0.41144922180124954, + "learning_rate": 4.906517139613307e-06, + "loss": 0.4052, + "step": 4377 + }, + { + "epoch": 1.0421253049324686, + "grad_norm": 0.3646312683356659, + "learning_rate": 4.9045898895732555e-06, + "loss": 0.296, + "step": 4378 + }, + { + "epoch": 1.0423633010055333, + "grad_norm": 0.38904303184845496, + "learning_rate": 4.902662653713507e-06, + "loss": 0.2914, + "step": 4379 + }, + { + "epoch": 1.0426012970785983, + "grad_norm": 0.38168935291772754, + "learning_rate": 4.900735432320496e-06, + "loss": 0.3709, + "step": 4380 + }, + { + "epoch": 1.042839293151663, + "grad_norm": 0.3730634093577279, + "learning_rate": 4.898808225680656e-06, + "loss": 0.358, + "step": 4381 + }, + { + "epoch": 1.0430772892247278, + "grad_norm": 0.38559125950569123, + "learning_rate": 4.8968810340804166e-06, + "loss": 0.2736, + "step": 4382 + }, + { + "epoch": 1.0433152852977925, + "grad_norm": 0.38852084763466593, + "learning_rate": 4.894953857806207e-06, + "loss": 0.3232, + "step": 4383 + }, + { + "epoch": 1.0435532813708575, + "grad_norm": 0.3922704604899118, + "learning_rate": 4.893026697144451e-06, + "loss": 0.3953, + "step": 4384 + }, + { + "epoch": 1.0437912774439222, + "grad_norm": 0.3464747600169571, + "learning_rate": 4.891099552381575e-06, + "loss": 0.3247, + "step": 4385 + }, + { + "epoch": 1.044029273516987, + "grad_norm": 0.3814240246283631, + "learning_rate": 4.8891724238040004e-06, + "loss": 0.2907, + "step": 4386 + }, + { + "epoch": 1.0442672695900517, + "grad_norm": 0.3771293957546087, + "learning_rate": 4.887245311698146e-06, + "loss": 0.355, + "step": 4387 + }, + { + "epoch": 1.0445052656631166, + "grad_norm": 0.3868624148021604, + "learning_rate": 4.8853182163504265e-06, + "loss": 0.3455, + "step": 4388 + }, + { + "epoch": 1.0447432617361814, + "grad_norm": 0.4023587442678937, + "learning_rate": 4.883391138047258e-06, + "loss": 0.3055, + "step": 4389 + }, + { + "epoch": 1.0449812578092461, + "grad_norm": 0.3743342358321569, + "learning_rate": 4.8814640770750495e-06, + "loss": 0.3441, + "step": 4390 + }, + { + "epoch": 1.0452192538823109, + "grad_norm": 0.3908606318397311, + "learning_rate": 4.879537033720212e-06, + "loss": 0.3837, + "step": 4391 + }, + { + "epoch": 1.0454572499553758, + "grad_norm": 0.3927864891120312, + "learning_rate": 4.877610008269153e-06, + "loss": 0.3233, + "step": 4392 + }, + { + "epoch": 1.0456952460284406, + "grad_norm": 0.37891845327252005, + "learning_rate": 4.875683001008274e-06, + "loss": 0.3127, + "step": 4393 + }, + { + "epoch": 1.0459332421015053, + "grad_norm": 0.40161470551225587, + "learning_rate": 4.873756012223977e-06, + "loss": 0.3332, + "step": 4394 + }, + { + "epoch": 1.04617123817457, + "grad_norm": 0.436580051432871, + "learning_rate": 4.871829042202658e-06, + "loss": 0.3743, + "step": 4395 + }, + { + "epoch": 1.046409234247635, + "grad_norm": 0.4008401671583846, + "learning_rate": 4.8699020912307155e-06, + "loss": 0.303, + "step": 4396 + }, + { + "epoch": 1.0466472303206997, + "grad_norm": 0.39091168851981944, + "learning_rate": 4.86797515959454e-06, + "loss": 0.3068, + "step": 4397 + }, + { + "epoch": 1.0468852263937645, + "grad_norm": 0.4171630119960414, + "learning_rate": 4.866048247580521e-06, + "loss": 0.3736, + "step": 4398 + }, + { + "epoch": 1.0471232224668292, + "grad_norm": 0.3830177263756179, + "learning_rate": 4.864121355475047e-06, + "loss": 0.3746, + "step": 4399 + }, + { + "epoch": 1.0473612185398942, + "grad_norm": 0.37078719399047205, + "learning_rate": 4.862194483564501e-06, + "loss": 0.2977, + "step": 4400 + }, + { + "epoch": 1.047599214612959, + "grad_norm": 0.36764015487272417, + "learning_rate": 4.8602676321352646e-06, + "loss": 0.3353, + "step": 4401 + }, + { + "epoch": 1.0478372106860236, + "grad_norm": 0.3791879193271118, + "learning_rate": 4.858340801473715e-06, + "loss": 0.3836, + "step": 4402 + }, + { + "epoch": 1.0480752067590884, + "grad_norm": 0.34908273436476017, + "learning_rate": 4.856413991866225e-06, + "loss": 0.2991, + "step": 4403 + }, + { + "epoch": 1.0483132028321533, + "grad_norm": 0.3726740932651389, + "learning_rate": 4.85448720359917e-06, + "loss": 0.3106, + "step": 4404 + }, + { + "epoch": 1.048551198905218, + "grad_norm": 0.4036355116456374, + "learning_rate": 4.852560436958916e-06, + "loss": 0.3737, + "step": 4405 + }, + { + "epoch": 1.0487891949782828, + "grad_norm": 0.36697835588631594, + "learning_rate": 4.850633692231828e-06, + "loss": 0.3501, + "step": 4406 + }, + { + "epoch": 1.0490271910513476, + "grad_norm": 0.36549398149960405, + "learning_rate": 4.848706969704269e-06, + "loss": 0.285, + "step": 4407 + }, + { + "epoch": 1.0492651871244125, + "grad_norm": 0.38262831908848516, + "learning_rate": 4.846780269662597e-06, + "loss": 0.3424, + "step": 4408 + }, + { + "epoch": 1.0495031831974773, + "grad_norm": 0.427528345212208, + "learning_rate": 4.8448535923931675e-06, + "loss": 0.3711, + "step": 4409 + }, + { + "epoch": 1.049741179270542, + "grad_norm": 0.35527661752034534, + "learning_rate": 4.842926938182332e-06, + "loss": 0.3213, + "step": 4410 + }, + { + "epoch": 1.0499791753436067, + "grad_norm": 0.40048394169356094, + "learning_rate": 4.84100030731644e-06, + "loss": 0.2941, + "step": 4411 + }, + { + "epoch": 1.0502171714166717, + "grad_norm": 0.3892299648187431, + "learning_rate": 4.8390737000818326e-06, + "loss": 0.3235, + "step": 4412 + }, + { + "epoch": 1.0504551674897364, + "grad_norm": 0.3772914101337546, + "learning_rate": 4.837147116764857e-06, + "loss": 0.346, + "step": 4413 + }, + { + "epoch": 1.0506931635628012, + "grad_norm": 0.37840783896597235, + "learning_rate": 4.835220557651849e-06, + "loss": 0.2795, + "step": 4414 + }, + { + "epoch": 1.050931159635866, + "grad_norm": 0.38825533133807016, + "learning_rate": 4.833294023029142e-06, + "loss": 0.3024, + "step": 4415 + }, + { + "epoch": 1.0511691557089309, + "grad_norm": 0.3865655826669927, + "learning_rate": 4.831367513183068e-06, + "loss": 0.3913, + "step": 4416 + }, + { + "epoch": 1.0514071517819956, + "grad_norm": 0.35292818007412385, + "learning_rate": 4.829441028399952e-06, + "loss": 0.322, + "step": 4417 + }, + { + "epoch": 1.0516451478550604, + "grad_norm": 0.3668531742705322, + "learning_rate": 4.827514568966119e-06, + "loss": 0.298, + "step": 4418 + }, + { + "epoch": 1.051883143928125, + "grad_norm": 0.3749278980780594, + "learning_rate": 4.825588135167889e-06, + "loss": 0.3288, + "step": 4419 + }, + { + "epoch": 1.05212114000119, + "grad_norm": 0.3622324145932679, + "learning_rate": 4.823661727291577e-06, + "loss": 0.389, + "step": 4420 + }, + { + "epoch": 1.0523591360742548, + "grad_norm": 0.396752910815969, + "learning_rate": 4.821735345623494e-06, + "loss": 0.2953, + "step": 4421 + }, + { + "epoch": 1.0525971321473195, + "grad_norm": 0.3758408909876523, + "learning_rate": 4.819808990449949e-06, + "loss": 0.303, + "step": 4422 + }, + { + "epoch": 1.0528351282203843, + "grad_norm": 0.4081539657673205, + "learning_rate": 4.817882662057246e-06, + "loss": 0.3709, + "step": 4423 + }, + { + "epoch": 1.0530731242934492, + "grad_norm": 0.349815937390151, + "learning_rate": 4.815956360731684e-06, + "loss": 0.349, + "step": 4424 + }, + { + "epoch": 1.053311120366514, + "grad_norm": 0.34903612236748616, + "learning_rate": 4.814030086759561e-06, + "loss": 0.2885, + "step": 4425 + }, + { + "epoch": 1.0535491164395787, + "grad_norm": 0.39164825325025976, + "learning_rate": 4.812103840427165e-06, + "loss": 0.3512, + "step": 4426 + }, + { + "epoch": 1.0537871125126435, + "grad_norm": 0.36669475356586617, + "learning_rate": 4.810177622020788e-06, + "loss": 0.3903, + "step": 4427 + }, + { + "epoch": 1.0540251085857084, + "grad_norm": 0.3504650544959202, + "learning_rate": 4.808251431826713e-06, + "loss": 0.2901, + "step": 4428 + }, + { + "epoch": 1.0542631046587732, + "grad_norm": 0.34674534781413774, + "learning_rate": 4.806325270131219e-06, + "loss": 0.2899, + "step": 4429 + }, + { + "epoch": 1.054501100731838, + "grad_norm": 0.3777786304373956, + "learning_rate": 4.80439913722058e-06, + "loss": 0.3304, + "step": 4430 + }, + { + "epoch": 1.0547390968049026, + "grad_norm": 0.37181874792865827, + "learning_rate": 4.802473033381069e-06, + "loss": 0.4006, + "step": 4431 + }, + { + "epoch": 1.0549770928779676, + "grad_norm": 0.38052787907987795, + "learning_rate": 4.800546958898952e-06, + "loss": 0.3153, + "step": 4432 + }, + { + "epoch": 1.0552150889510323, + "grad_norm": 0.40481761082909645, + "learning_rate": 4.798620914060492e-06, + "loss": 0.3188, + "step": 4433 + }, + { + "epoch": 1.055453085024097, + "grad_norm": 0.3588854522073126, + "learning_rate": 4.7966948991519446e-06, + "loss": 0.3802, + "step": 4434 + }, + { + "epoch": 1.0556910810971618, + "grad_norm": 0.3773596458270944, + "learning_rate": 4.794768914459565e-06, + "loss": 0.3467, + "step": 4435 + }, + { + "epoch": 1.0559290771702268, + "grad_norm": 0.4156895917585443, + "learning_rate": 4.792842960269603e-06, + "loss": 0.3065, + "step": 4436 + }, + { + "epoch": 1.0561670732432915, + "grad_norm": 0.3609593155405436, + "learning_rate": 4.790917036868301e-06, + "loss": 0.3456, + "step": 4437 + }, + { + "epoch": 1.0564050693163562, + "grad_norm": 0.37963675817335796, + "learning_rate": 4.7889911445419e-06, + "loss": 0.3859, + "step": 4438 + }, + { + "epoch": 1.056643065389421, + "grad_norm": 0.38339295139402263, + "learning_rate": 4.787065283576633e-06, + "loss": 0.3096, + "step": 4439 + }, + { + "epoch": 1.056881061462486, + "grad_norm": 0.4148001262695314, + "learning_rate": 4.785139454258734e-06, + "loss": 0.3063, + "step": 4440 + }, + { + "epoch": 1.0571190575355507, + "grad_norm": 0.3849351006849719, + "learning_rate": 4.783213656874428e-06, + "loss": 0.354, + "step": 4441 + }, + { + "epoch": 1.0573570536086154, + "grad_norm": 0.3904733075529352, + "learning_rate": 4.781287891709936e-06, + "loss": 0.3307, + "step": 4442 + }, + { + "epoch": 1.0575950496816802, + "grad_norm": 0.39073356763740286, + "learning_rate": 4.779362159051474e-06, + "loss": 0.3202, + "step": 4443 + }, + { + "epoch": 1.0578330457547451, + "grad_norm": 0.382841095559944, + "learning_rate": 4.777436459185252e-06, + "loss": 0.3363, + "step": 4444 + }, + { + "epoch": 1.0580710418278099, + "grad_norm": 0.37809442113241104, + "learning_rate": 4.775510792397479e-06, + "loss": 0.365, + "step": 4445 + }, + { + "epoch": 1.0583090379008746, + "grad_norm": 0.36463101257216596, + "learning_rate": 4.773585158974356e-06, + "loss": 0.3039, + "step": 4446 + }, + { + "epoch": 1.0585470339739393, + "grad_norm": 0.4116743791608411, + "learning_rate": 4.77165955920208e-06, + "loss": 0.3284, + "step": 4447 + }, + { + "epoch": 1.0587850300470043, + "grad_norm": 0.42410933724473493, + "learning_rate": 4.769733993366842e-06, + "loss": 0.3708, + "step": 4448 + }, + { + "epoch": 1.059023026120069, + "grad_norm": 0.37205012492946415, + "learning_rate": 4.76780846175483e-06, + "loss": 0.3242, + "step": 4449 + }, + { + "epoch": 1.0592610221931338, + "grad_norm": 0.3472944005021577, + "learning_rate": 4.765882964652223e-06, + "loss": 0.2834, + "step": 4450 + }, + { + "epoch": 1.0594990182661985, + "grad_norm": 0.420700058102123, + "learning_rate": 4.7639575023452e-06, + "loss": 0.3451, + "step": 4451 + }, + { + "epoch": 1.0597370143392635, + "grad_norm": 0.3850537500639467, + "learning_rate": 4.762032075119932e-06, + "loss": 0.4028, + "step": 4452 + }, + { + "epoch": 1.0599750104123282, + "grad_norm": 0.35606255517837127, + "learning_rate": 4.760106683262582e-06, + "loss": 0.3206, + "step": 4453 + }, + { + "epoch": 1.060213006485393, + "grad_norm": 0.4207878717161147, + "learning_rate": 4.758181327059316e-06, + "loss": 0.2905, + "step": 4454 + }, + { + "epoch": 1.0604510025584577, + "grad_norm": 0.39178183992943744, + "learning_rate": 4.756256006796287e-06, + "loss": 0.3826, + "step": 4455 + }, + { + "epoch": 1.0606889986315227, + "grad_norm": 0.3829442776223385, + "learning_rate": 4.754330722759645e-06, + "loss": 0.3628, + "step": 4456 + }, + { + "epoch": 1.0609269947045874, + "grad_norm": 0.3954498442186975, + "learning_rate": 4.7524054752355345e-06, + "loss": 0.3066, + "step": 4457 + }, + { + "epoch": 1.0611649907776521, + "grad_norm": 0.393762593771785, + "learning_rate": 4.750480264510097e-06, + "loss": 0.3186, + "step": 4458 + }, + { + "epoch": 1.0614029868507169, + "grad_norm": 0.36465796218337193, + "learning_rate": 4.748555090869464e-06, + "loss": 0.3876, + "step": 4459 + }, + { + "epoch": 1.0616409829237818, + "grad_norm": 0.3664333642625829, + "learning_rate": 4.746629954599766e-06, + "loss": 0.3216, + "step": 4460 + }, + { + "epoch": 1.0618789789968466, + "grad_norm": 0.37283914856358796, + "learning_rate": 4.744704855987125e-06, + "loss": 0.2674, + "step": 4461 + }, + { + "epoch": 1.0621169750699113, + "grad_norm": 0.3929643060999283, + "learning_rate": 4.742779795317657e-06, + "loss": 0.3625, + "step": 4462 + }, + { + "epoch": 1.062354971142976, + "grad_norm": 0.3639867047560858, + "learning_rate": 4.740854772877475e-06, + "loss": 0.3875, + "step": 4463 + }, + { + "epoch": 1.062592967216041, + "grad_norm": 0.34468157426611506, + "learning_rate": 4.738929788952685e-06, + "loss": 0.3047, + "step": 4464 + }, + { + "epoch": 1.0628309632891058, + "grad_norm": 0.407693623350532, + "learning_rate": 4.737004843829387e-06, + "loss": 0.3164, + "step": 4465 + }, + { + "epoch": 1.0630689593621705, + "grad_norm": 0.3953104625961449, + "learning_rate": 4.735079937793675e-06, + "loss": 0.3848, + "step": 4466 + }, + { + "epoch": 1.0633069554352352, + "grad_norm": 0.35620972393509376, + "learning_rate": 4.733155071131636e-06, + "loss": 0.3204, + "step": 4467 + }, + { + "epoch": 1.0635449515083002, + "grad_norm": 0.37941109344701723, + "learning_rate": 4.731230244129357e-06, + "loss": 0.2926, + "step": 4468 + }, + { + "epoch": 1.063782947581365, + "grad_norm": 0.36519045880745366, + "learning_rate": 4.729305457072913e-06, + "loss": 0.3606, + "step": 4469 + }, + { + "epoch": 1.0640209436544297, + "grad_norm": 0.3575723913390525, + "learning_rate": 4.727380710248375e-06, + "loss": 0.3744, + "step": 4470 + }, + { + "epoch": 1.0642589397274944, + "grad_norm": 0.3567610268225815, + "learning_rate": 4.725456003941805e-06, + "loss": 0.2866, + "step": 4471 + }, + { + "epoch": 1.0644969358005594, + "grad_norm": 0.37128027520359136, + "learning_rate": 4.723531338439268e-06, + "loss": 0.3214, + "step": 4472 + }, + { + "epoch": 1.0647349318736241, + "grad_norm": 0.3820839576708276, + "learning_rate": 4.721606714026812e-06, + "loss": 0.3902, + "step": 4473 + }, + { + "epoch": 1.0649729279466889, + "grad_norm": 0.4089191938756884, + "learning_rate": 4.7196821309904865e-06, + "loss": 0.334, + "step": 4474 + }, + { + "epoch": 1.0652109240197536, + "grad_norm": 0.3728452645121599, + "learning_rate": 4.717757589616331e-06, + "loss": 0.2551, + "step": 4475 + }, + { + "epoch": 1.0654489200928186, + "grad_norm": 0.37511700011909915, + "learning_rate": 4.715833090190379e-06, + "loss": 0.3443, + "step": 4476 + }, + { + "epoch": 1.0656869161658833, + "grad_norm": 0.37701885649521155, + "learning_rate": 4.713908632998661e-06, + "loss": 0.3962, + "step": 4477 + }, + { + "epoch": 1.065924912238948, + "grad_norm": 0.3616386275749788, + "learning_rate": 4.711984218327197e-06, + "loss": 0.3166, + "step": 4478 + }, + { + "epoch": 1.0661629083120128, + "grad_norm": 0.38456233240401916, + "learning_rate": 4.710059846462003e-06, + "loss": 0.3029, + "step": 4479 + }, + { + "epoch": 1.0664009043850777, + "grad_norm": 0.3789566875155259, + "learning_rate": 4.708135517689088e-06, + "loss": 0.3717, + "step": 4480 + }, + { + "epoch": 1.0666389004581425, + "grad_norm": 0.3768525328415589, + "learning_rate": 4.706211232294456e-06, + "loss": 0.3438, + "step": 4481 + }, + { + "epoch": 1.0668768965312072, + "grad_norm": 0.3783650867548586, + "learning_rate": 4.704286990564103e-06, + "loss": 0.2877, + "step": 4482 + }, + { + "epoch": 1.067114892604272, + "grad_norm": 0.41564337367706855, + "learning_rate": 4.702362792784019e-06, + "loss": 0.3331, + "step": 4483 + }, + { + "epoch": 1.067352888677337, + "grad_norm": 0.43075275803922985, + "learning_rate": 4.700438639240186e-06, + "loss": 0.3629, + "step": 4484 + }, + { + "epoch": 1.0675908847504016, + "grad_norm": 0.36040007984808253, + "learning_rate": 4.698514530218584e-06, + "loss": 0.3061, + "step": 4485 + }, + { + "epoch": 1.0678288808234664, + "grad_norm": 0.404962256464901, + "learning_rate": 4.6965904660051804e-06, + "loss": 0.3088, + "step": 4486 + }, + { + "epoch": 1.0680668768965311, + "grad_norm": 0.3697507889870145, + "learning_rate": 4.6946664468859395e-06, + "loss": 0.3457, + "step": 4487 + }, + { + "epoch": 1.068304872969596, + "grad_norm": 0.4036468754442713, + "learning_rate": 4.692742473146818e-06, + "loss": 0.3603, + "step": 4488 + }, + { + "epoch": 1.0685428690426608, + "grad_norm": 0.4042242479571805, + "learning_rate": 4.690818545073767e-06, + "loss": 0.2794, + "step": 4489 + }, + { + "epoch": 1.0687808651157256, + "grad_norm": 0.4099157906703565, + "learning_rate": 4.688894662952729e-06, + "loss": 0.3127, + "step": 4490 + }, + { + "epoch": 1.0690188611887903, + "grad_norm": 0.3468763641985224, + "learning_rate": 4.686970827069639e-06, + "loss": 0.3843, + "step": 4491 + }, + { + "epoch": 1.0692568572618553, + "grad_norm": 0.37635304439254064, + "learning_rate": 4.68504703771043e-06, + "loss": 0.3453, + "step": 4492 + }, + { + "epoch": 1.06949485333492, + "grad_norm": 0.39528560108604954, + "learning_rate": 4.683123295161021e-06, + "loss": 0.2812, + "step": 4493 + }, + { + "epoch": 1.0697328494079847, + "grad_norm": 0.4283777664794825, + "learning_rate": 4.6811995997073285e-06, + "loss": 0.3331, + "step": 4494 + }, + { + "epoch": 1.0699708454810495, + "grad_norm": 0.3710567512234336, + "learning_rate": 4.679275951635264e-06, + "loss": 0.3875, + "step": 4495 + }, + { + "epoch": 1.0702088415541144, + "grad_norm": 0.36243117057229535, + "learning_rate": 4.6773523512307275e-06, + "loss": 0.2728, + "step": 4496 + }, + { + "epoch": 1.0704468376271792, + "grad_norm": 0.37892611536648607, + "learning_rate": 4.675428798779613e-06, + "loss": 0.3035, + "step": 4497 + }, + { + "epoch": 1.070684833700244, + "grad_norm": 0.36860351230116295, + "learning_rate": 4.673505294567809e-06, + "loss": 0.3669, + "step": 4498 + }, + { + "epoch": 1.0709228297733087, + "grad_norm": 0.36740997263006975, + "learning_rate": 4.6715818388811945e-06, + "loss": 0.3238, + "step": 4499 + }, + { + "epoch": 1.0711608258463736, + "grad_norm": 0.3994604434926032, + "learning_rate": 4.669658432005644e-06, + "loss": 0.296, + "step": 4500 + }, + { + "epoch": 1.0713988219194384, + "grad_norm": 0.3698668250700613, + "learning_rate": 4.667735074227024e-06, + "loss": 0.3494, + "step": 4501 + }, + { + "epoch": 1.071636817992503, + "grad_norm": 0.440292753533519, + "learning_rate": 4.66581176583119e-06, + "loss": 0.4327, + "step": 4502 + }, + { + "epoch": 1.0718748140655678, + "grad_norm": 0.3719781722435344, + "learning_rate": 4.663888507103996e-06, + "loss": 0.3222, + "step": 4503 + }, + { + "epoch": 1.0721128101386328, + "grad_norm": 0.4082136221681287, + "learning_rate": 4.6619652983312844e-06, + "loss": 0.2975, + "step": 4504 + }, + { + "epoch": 1.0723508062116975, + "grad_norm": 0.360658966423447, + "learning_rate": 4.660042139798892e-06, + "loss": 0.342, + "step": 4505 + }, + { + "epoch": 1.0725888022847623, + "grad_norm": 0.3593445634450902, + "learning_rate": 4.658119031792648e-06, + "loss": 0.3406, + "step": 4506 + }, + { + "epoch": 1.072826798357827, + "grad_norm": 0.40080585396608215, + "learning_rate": 4.6561959745983724e-06, + "loss": 0.2878, + "step": 4507 + }, + { + "epoch": 1.073064794430892, + "grad_norm": 0.38452336744074866, + "learning_rate": 4.65427296850188e-06, + "loss": 0.3205, + "step": 4508 + }, + { + "epoch": 1.0733027905039567, + "grad_norm": 0.40724996130914426, + "learning_rate": 4.652350013788979e-06, + "loss": 0.4083, + "step": 4509 + }, + { + "epoch": 1.0735407865770215, + "grad_norm": 0.4014847828369795, + "learning_rate": 4.650427110745467e-06, + "loss": 0.3173, + "step": 4510 + }, + { + "epoch": 1.0737787826500862, + "grad_norm": 0.38549635848058716, + "learning_rate": 4.648504259657132e-06, + "loss": 0.2915, + "step": 4511 + }, + { + "epoch": 1.0740167787231512, + "grad_norm": 0.3852457361423471, + "learning_rate": 4.646581460809762e-06, + "loss": 0.3348, + "step": 4512 + }, + { + "epoch": 1.074254774796216, + "grad_norm": 0.3712671658962982, + "learning_rate": 4.644658714489129e-06, + "loss": 0.3808, + "step": 4513 + }, + { + "epoch": 1.0744927708692806, + "grad_norm": 0.43356148305108094, + "learning_rate": 4.642736020981002e-06, + "loss": 0.2974, + "step": 4514 + }, + { + "epoch": 1.0747307669423454, + "grad_norm": 0.3800025715731251, + "learning_rate": 4.64081338057114e-06, + "loss": 0.3053, + "step": 4515 + }, + { + "epoch": 1.0749687630154103, + "grad_norm": 0.4199497942695732, + "learning_rate": 4.638890793545297e-06, + "loss": 0.3937, + "step": 4516 + }, + { + "epoch": 1.075206759088475, + "grad_norm": 0.35073357189494836, + "learning_rate": 4.636968260189214e-06, + "loss": 0.3195, + "step": 4517 + }, + { + "epoch": 1.0754447551615398, + "grad_norm": 0.4014979008525594, + "learning_rate": 4.635045780788629e-06, + "loss": 0.2757, + "step": 4518 + }, + { + "epoch": 1.0756827512346046, + "grad_norm": 0.4018578448257378, + "learning_rate": 4.63312335562927e-06, + "loss": 0.3202, + "step": 4519 + }, + { + "epoch": 1.0759207473076695, + "grad_norm": 0.3853979552410293, + "learning_rate": 4.6312009849968544e-06, + "loss": 0.3796, + "step": 4520 + }, + { + "epoch": 1.0761587433807343, + "grad_norm": 0.3725529344423733, + "learning_rate": 4.629278669177098e-06, + "loss": 0.3162, + "step": 4521 + }, + { + "epoch": 1.076396739453799, + "grad_norm": 0.3465817848520335, + "learning_rate": 4.627356408455701e-06, + "loss": 0.2863, + "step": 4522 + }, + { + "epoch": 1.0766347355268637, + "grad_norm": 0.3869139089298558, + "learning_rate": 4.625434203118362e-06, + "loss": 0.3826, + "step": 4523 + }, + { + "epoch": 1.0768727315999287, + "grad_norm": 0.3809180722646469, + "learning_rate": 4.623512053450767e-06, + "loss": 0.3408, + "step": 4524 + }, + { + "epoch": 1.0771107276729934, + "grad_norm": 0.3703176815754259, + "learning_rate": 4.621589959738593e-06, + "loss": 0.3108, + "step": 4525 + }, + { + "epoch": 1.0773487237460582, + "grad_norm": 0.37759518980753926, + "learning_rate": 4.619667922267514e-06, + "loss": 0.3366, + "step": 4526 + }, + { + "epoch": 1.077586719819123, + "grad_norm": 0.3898155012281468, + "learning_rate": 4.617745941323189e-06, + "loss": 0.4133, + "step": 4527 + }, + { + "epoch": 1.0778247158921879, + "grad_norm": 0.36255189396131254, + "learning_rate": 4.615824017191275e-06, + "loss": 0.3031, + "step": 4528 + }, + { + "epoch": 1.0780627119652526, + "grad_norm": 0.3691335793651894, + "learning_rate": 4.613902150157416e-06, + "loss": 0.2816, + "step": 4529 + }, + { + "epoch": 1.0783007080383173, + "grad_norm": 0.39196606245884325, + "learning_rate": 4.61198034050725e-06, + "loss": 0.3632, + "step": 4530 + }, + { + "epoch": 1.078538704111382, + "grad_norm": 0.39160631748508107, + "learning_rate": 4.610058588526404e-06, + "loss": 0.3471, + "step": 4531 + }, + { + "epoch": 1.078776700184447, + "grad_norm": 0.3624974985968683, + "learning_rate": 4.6081368945004976e-06, + "loss": 0.301, + "step": 4532 + }, + { + "epoch": 1.0790146962575118, + "grad_norm": 0.3921259171027228, + "learning_rate": 4.606215258715144e-06, + "loss": 0.3611, + "step": 4533 + }, + { + "epoch": 1.0792526923305765, + "grad_norm": 0.38270676417966176, + "learning_rate": 4.604293681455942e-06, + "loss": 0.3717, + "step": 4534 + }, + { + "epoch": 1.0794906884036413, + "grad_norm": 0.4878601599757468, + "learning_rate": 4.602372163008491e-06, + "loss": 0.3385, + "step": 4535 + }, + { + "epoch": 1.0797286844767062, + "grad_norm": 0.4258797436561885, + "learning_rate": 4.6004507036583714e-06, + "loss": 0.3118, + "step": 4536 + }, + { + "epoch": 1.079966680549771, + "grad_norm": 0.36842771403409225, + "learning_rate": 4.598529303691163e-06, + "loss": 0.354, + "step": 4537 + }, + { + "epoch": 1.0802046766228357, + "grad_norm": 0.38395032278946645, + "learning_rate": 4.596607963392431e-06, + "loss": 0.3878, + "step": 4538 + }, + { + "epoch": 1.0804426726959004, + "grad_norm": 0.37424827121079335, + "learning_rate": 4.594686683047736e-06, + "loss": 0.3127, + "step": 4539 + }, + { + "epoch": 1.0806806687689654, + "grad_norm": 0.36463462903952687, + "learning_rate": 4.592765462942627e-06, + "loss": 0.3135, + "step": 4540 + }, + { + "epoch": 1.0809186648420301, + "grad_norm": 0.4010499621374228, + "learning_rate": 4.590844303362645e-06, + "loss": 0.4033, + "step": 4541 + }, + { + "epoch": 1.0811566609150949, + "grad_norm": 0.35721252951546795, + "learning_rate": 4.5889232045933204e-06, + "loss": 0.3467, + "step": 4542 + }, + { + "epoch": 1.0813946569881596, + "grad_norm": 0.3883975133944084, + "learning_rate": 4.587002166920178e-06, + "loss": 0.2614, + "step": 4543 + }, + { + "epoch": 1.0816326530612246, + "grad_norm": 0.3644825130668685, + "learning_rate": 4.5850811906287315e-06, + "loss": 0.3605, + "step": 4544 + }, + { + "epoch": 1.0818706491342893, + "grad_norm": 0.3839532185379082, + "learning_rate": 4.583160276004483e-06, + "loss": 0.3859, + "step": 4545 + }, + { + "epoch": 1.082108645207354, + "grad_norm": 0.37149271895145186, + "learning_rate": 4.58123942333293e-06, + "loss": 0.2943, + "step": 4546 + }, + { + "epoch": 1.0823466412804188, + "grad_norm": 0.386864449700323, + "learning_rate": 4.5793186328995585e-06, + "loss": 0.3084, + "step": 4547 + }, + { + "epoch": 1.0825846373534838, + "grad_norm": 0.36399312445382515, + "learning_rate": 4.577397904989846e-06, + "loss": 0.3798, + "step": 4548 + }, + { + "epoch": 1.0828226334265485, + "grad_norm": 0.4192409273041034, + "learning_rate": 4.575477239889258e-06, + "loss": 0.359, + "step": 4549 + }, + { + "epoch": 1.0830606294996132, + "grad_norm": 0.4118535694398554, + "learning_rate": 4.5735566378832545e-06, + "loss": 0.3003, + "step": 4550 + }, + { + "epoch": 1.083298625572678, + "grad_norm": 0.39405326883837694, + "learning_rate": 4.571636099257285e-06, + "loss": 0.3287, + "step": 4551 + }, + { + "epoch": 1.083536621645743, + "grad_norm": 0.4282411602854888, + "learning_rate": 4.569715624296788e-06, + "loss": 0.3933, + "step": 4552 + }, + { + "epoch": 1.0837746177188077, + "grad_norm": 0.3646511346959913, + "learning_rate": 4.567795213287194e-06, + "loss": 0.3013, + "step": 4553 + }, + { + "epoch": 1.0840126137918724, + "grad_norm": 0.39611556493858835, + "learning_rate": 4.565874866513924e-06, + "loss": 0.3023, + "step": 4554 + }, + { + "epoch": 1.0842506098649372, + "grad_norm": 0.41873346499425335, + "learning_rate": 4.563954584262388e-06, + "loss": 0.3308, + "step": 4555 + }, + { + "epoch": 1.0844886059380021, + "grad_norm": 0.37895952135841016, + "learning_rate": 4.562034366817989e-06, + "loss": 0.3503, + "step": 4556 + }, + { + "epoch": 1.0847266020110669, + "grad_norm": 0.3755212007026155, + "learning_rate": 4.560114214466118e-06, + "loss": 0.2777, + "step": 4557 + }, + { + "epoch": 1.0849645980841316, + "grad_norm": 0.3686507710984981, + "learning_rate": 4.558194127492156e-06, + "loss": 0.3146, + "step": 4558 + }, + { + "epoch": 1.0852025941571963, + "grad_norm": 0.37532310578022504, + "learning_rate": 4.556274106181477e-06, + "loss": 0.4073, + "step": 4559 + }, + { + "epoch": 1.0854405902302613, + "grad_norm": 0.38070453470704196, + "learning_rate": 4.554354150819442e-06, + "loss": 0.3069, + "step": 4560 + }, + { + "epoch": 1.085678586303326, + "grad_norm": 0.38448824696456024, + "learning_rate": 4.552434261691405e-06, + "loss": 0.2876, + "step": 4561 + }, + { + "epoch": 1.0859165823763908, + "grad_norm": 0.4030821070519119, + "learning_rate": 4.55051443908271e-06, + "loss": 0.3773, + "step": 4562 + }, + { + "epoch": 1.0861545784494555, + "grad_norm": 0.35298371324439093, + "learning_rate": 4.5485946832786885e-06, + "loss": 0.3785, + "step": 4563 + }, + { + "epoch": 1.0863925745225205, + "grad_norm": 0.36012679234114514, + "learning_rate": 4.546674994564664e-06, + "loss": 0.3021, + "step": 4564 + }, + { + "epoch": 1.0866305705955852, + "grad_norm": 0.41175830720140366, + "learning_rate": 4.544755373225949e-06, + "loss": 0.3291, + "step": 4565 + }, + { + "epoch": 1.08686856666865, + "grad_norm": 0.3729710973827501, + "learning_rate": 4.54283581954785e-06, + "loss": 0.3917, + "step": 4566 + }, + { + "epoch": 1.0871065627417147, + "grad_norm": 0.38900125919653233, + "learning_rate": 4.540916333815658e-06, + "loss": 0.3425, + "step": 4567 + }, + { + "epoch": 1.0873445588147796, + "grad_norm": 0.4185005530339485, + "learning_rate": 4.5389969163146544e-06, + "loss": 0.3012, + "step": 4568 + }, + { + "epoch": 1.0875825548878444, + "grad_norm": 0.4088166803824769, + "learning_rate": 4.537077567330115e-06, + "loss": 0.3202, + "step": 4569 + }, + { + "epoch": 1.0878205509609091, + "grad_norm": 0.39177237999984393, + "learning_rate": 4.535158287147301e-06, + "loss": 0.3713, + "step": 4570 + }, + { + "epoch": 1.0880585470339739, + "grad_norm": 0.3853574514336472, + "learning_rate": 4.533239076051465e-06, + "loss": 0.3007, + "step": 4571 + }, + { + "epoch": 1.0882965431070388, + "grad_norm": 0.43082485652705493, + "learning_rate": 4.531319934327849e-06, + "loss": 0.3271, + "step": 4572 + }, + { + "epoch": 1.0885345391801036, + "grad_norm": 0.40152293574128467, + "learning_rate": 4.529400862261686e-06, + "loss": 0.3935, + "step": 4573 + }, + { + "epoch": 1.0887725352531683, + "grad_norm": 0.3820747241972296, + "learning_rate": 4.527481860138196e-06, + "loss": 0.3331, + "step": 4574 + }, + { + "epoch": 1.089010531326233, + "grad_norm": 0.3766358656424945, + "learning_rate": 4.525562928242592e-06, + "loss": 0.3045, + "step": 4575 + }, + { + "epoch": 1.089248527399298, + "grad_norm": 0.4237754758083936, + "learning_rate": 4.523644066860074e-06, + "loss": 0.3531, + "step": 4576 + }, + { + "epoch": 1.0894865234723627, + "grad_norm": 0.4304790203953773, + "learning_rate": 4.52172527627583e-06, + "loss": 0.3691, + "step": 4577 + }, + { + "epoch": 1.0897245195454275, + "grad_norm": 0.35559337217271153, + "learning_rate": 4.519806556775043e-06, + "loss": 0.3105, + "step": 4578 + }, + { + "epoch": 1.0899625156184922, + "grad_norm": 0.36470318443377925, + "learning_rate": 4.517887908642882e-06, + "loss": 0.2944, + "step": 4579 + }, + { + "epoch": 1.0902005116915572, + "grad_norm": 0.3917075725340582, + "learning_rate": 4.515969332164504e-06, + "loss": 0.3561, + "step": 4580 + }, + { + "epoch": 1.090438507764622, + "grad_norm": 0.3706508136005053, + "learning_rate": 4.514050827625058e-06, + "loss": 0.3862, + "step": 4581 + }, + { + "epoch": 1.0906765038376867, + "grad_norm": 0.38950829726986785, + "learning_rate": 4.512132395309681e-06, + "loss": 0.2885, + "step": 4582 + }, + { + "epoch": 1.0909144999107514, + "grad_norm": 0.3771455973737931, + "learning_rate": 4.510214035503499e-06, + "loss": 0.3464, + "step": 4583 + }, + { + "epoch": 1.0911524959838164, + "grad_norm": 0.36988427743457747, + "learning_rate": 4.508295748491628e-06, + "loss": 0.4167, + "step": 4584 + }, + { + "epoch": 1.091390492056881, + "grad_norm": 0.35658085377837206, + "learning_rate": 4.506377534559174e-06, + "loss": 0.3147, + "step": 4585 + }, + { + "epoch": 1.0916284881299458, + "grad_norm": 0.3761893075654618, + "learning_rate": 4.504459393991229e-06, + "loss": 0.2916, + "step": 4586 + }, + { + "epoch": 1.0918664842030106, + "grad_norm": 0.3778558101407558, + "learning_rate": 4.502541327072877e-06, + "loss": 0.3372, + "step": 4587 + }, + { + "epoch": 1.0921044802760755, + "grad_norm": 0.3819227496695729, + "learning_rate": 4.50062333408919e-06, + "loss": 0.4006, + "step": 4588 + }, + { + "epoch": 1.0923424763491403, + "grad_norm": 0.4723091940072908, + "learning_rate": 4.49870541532523e-06, + "loss": 0.3153, + "step": 4589 + }, + { + "epoch": 1.092580472422205, + "grad_norm": 0.45016072076415764, + "learning_rate": 4.496787571066047e-06, + "loss": 0.3088, + "step": 4590 + }, + { + "epoch": 1.0928184684952698, + "grad_norm": 0.3866045379739883, + "learning_rate": 4.494869801596679e-06, + "loss": 0.3687, + "step": 4591 + }, + { + "epoch": 1.0930564645683347, + "grad_norm": 0.3739372092733529, + "learning_rate": 4.492952107202154e-06, + "loss": 0.2934, + "step": 4592 + }, + { + "epoch": 1.0932944606413995, + "grad_norm": 0.35295097365438255, + "learning_rate": 4.49103448816749e-06, + "loss": 0.2993, + "step": 4593 + }, + { + "epoch": 1.0935324567144642, + "grad_norm": 0.3938263757181118, + "learning_rate": 4.489116944777694e-06, + "loss": 0.3359, + "step": 4594 + }, + { + "epoch": 1.093770452787529, + "grad_norm": 0.3831990383015829, + "learning_rate": 4.487199477317758e-06, + "loss": 0.3769, + "step": 4595 + }, + { + "epoch": 1.094008448860594, + "grad_norm": 0.35100904325779053, + "learning_rate": 4.485282086072666e-06, + "loss": 0.2762, + "step": 4596 + }, + { + "epoch": 1.0942464449336586, + "grad_norm": 0.38231100716355565, + "learning_rate": 4.48336477132739e-06, + "loss": 0.2986, + "step": 4597 + }, + { + "epoch": 1.0944844410067234, + "grad_norm": 0.36444180770894197, + "learning_rate": 4.4814475333668884e-06, + "loss": 0.3311, + "step": 4598 + }, + { + "epoch": 1.0947224370797881, + "grad_norm": 0.36930290531645005, + "learning_rate": 4.479530372476113e-06, + "loss": 0.3345, + "step": 4599 + }, + { + "epoch": 1.094960433152853, + "grad_norm": 0.38130542257095973, + "learning_rate": 4.477613288939999e-06, + "loss": 0.2754, + "step": 4600 + }, + { + "epoch": 1.0951984292259178, + "grad_norm": 0.371481003330978, + "learning_rate": 4.4756962830434735e-06, + "loss": 0.3172, + "step": 4601 + }, + { + "epoch": 1.0954364252989826, + "grad_norm": 0.3810788506730066, + "learning_rate": 4.4737793550714515e-06, + "loss": 0.3931, + "step": 4602 + }, + { + "epoch": 1.0956744213720473, + "grad_norm": 0.35314794218327844, + "learning_rate": 4.471862505308835e-06, + "loss": 0.2966, + "step": 4603 + }, + { + "epoch": 1.0959124174451123, + "grad_norm": 0.3836019074456394, + "learning_rate": 4.469945734040516e-06, + "loss": 0.331, + "step": 4604 + }, + { + "epoch": 1.096150413518177, + "grad_norm": 0.42836723354827033, + "learning_rate": 4.468029041551372e-06, + "loss": 0.3328, + "step": 4605 + }, + { + "epoch": 1.0963884095912417, + "grad_norm": 0.3480001600905687, + "learning_rate": 4.466112428126275e-06, + "loss": 0.3601, + "step": 4606 + }, + { + "epoch": 1.0966264056643065, + "grad_norm": 0.3885251422953546, + "learning_rate": 4.464195894050079e-06, + "loss": 0.2733, + "step": 4607 + }, + { + "epoch": 1.0968644017373714, + "grad_norm": 0.3940111256961415, + "learning_rate": 4.462279439607628e-06, + "loss": 0.3078, + "step": 4608 + }, + { + "epoch": 1.0971023978104362, + "grad_norm": 0.41464737325896495, + "learning_rate": 4.4603630650837545e-06, + "loss": 0.4018, + "step": 4609 + }, + { + "epoch": 1.097340393883501, + "grad_norm": 0.3657995338802541, + "learning_rate": 4.4584467707632804e-06, + "loss": 0.2893, + "step": 4610 + }, + { + "epoch": 1.0975783899565656, + "grad_norm": 0.3670886985872823, + "learning_rate": 4.456530556931013e-06, + "loss": 0.2717, + "step": 4611 + }, + { + "epoch": 1.0978163860296306, + "grad_norm": 0.3815190866043315, + "learning_rate": 4.454614423871749e-06, + "loss": 0.3265, + "step": 4612 + }, + { + "epoch": 1.0980543821026953, + "grad_norm": 0.3479008384560524, + "learning_rate": 4.4526983718702745e-06, + "loss": 0.3696, + "step": 4613 + }, + { + "epoch": 1.09829237817576, + "grad_norm": 0.3510433837303061, + "learning_rate": 4.450782401211362e-06, + "loss": 0.2636, + "step": 4614 + }, + { + "epoch": 1.0985303742488248, + "grad_norm": 0.3804726727656412, + "learning_rate": 4.4488665121797696e-06, + "loss": 0.3307, + "step": 4615 + }, + { + "epoch": 1.0987683703218898, + "grad_norm": 0.3614441700666874, + "learning_rate": 4.446950705060249e-06, + "loss": 0.3949, + "step": 4616 + }, + { + "epoch": 1.0990063663949545, + "grad_norm": 0.37288138794486936, + "learning_rate": 4.445034980137536e-06, + "loss": 0.2985, + "step": 4617 + }, + { + "epoch": 1.0992443624680193, + "grad_norm": 0.3609717776051928, + "learning_rate": 4.4431193376963534e-06, + "loss": 0.2821, + "step": 4618 + }, + { + "epoch": 1.099482358541084, + "grad_norm": 0.377067343268631, + "learning_rate": 4.441203778021412e-06, + "loss": 0.3213, + "step": 4619 + }, + { + "epoch": 1.099720354614149, + "grad_norm": 0.385028432731855, + "learning_rate": 4.439288301397416e-06, + "loss": 0.3769, + "step": 4620 + }, + { + "epoch": 1.0999583506872137, + "grad_norm": 0.3857984245884134, + "learning_rate": 4.437372908109049e-06, + "loss": 0.284, + "step": 4621 + }, + { + "epoch": 1.1001963467602784, + "grad_norm": 0.3804900796934547, + "learning_rate": 4.435457598440987e-06, + "loss": 0.3023, + "step": 4622 + }, + { + "epoch": 1.1004343428333432, + "grad_norm": 0.4095757121830929, + "learning_rate": 4.4335423726778914e-06, + "loss": 0.3569, + "step": 4623 + }, + { + "epoch": 1.1006723389064081, + "grad_norm": 0.3626590444928867, + "learning_rate": 4.431627231104413e-06, + "loss": 0.3528, + "step": 4624 + }, + { + "epoch": 1.1009103349794729, + "grad_norm": 0.3950093241561531, + "learning_rate": 4.429712174005189e-06, + "loss": 0.299, + "step": 4625 + }, + { + "epoch": 1.1011483310525376, + "grad_norm": 0.35813564107373463, + "learning_rate": 4.427797201664844e-06, + "loss": 0.3488, + "step": 4626 + }, + { + "epoch": 1.1013863271256024, + "grad_norm": 0.39623477898910664, + "learning_rate": 4.425882314367991e-06, + "loss": 0.3686, + "step": 4627 + }, + { + "epoch": 1.1016243231986673, + "grad_norm": 0.3536379619561069, + "learning_rate": 4.423967512399228e-06, + "loss": 0.3094, + "step": 4628 + }, + { + "epoch": 1.101862319271732, + "grad_norm": 0.361987975704904, + "learning_rate": 4.4220527960431435e-06, + "loss": 0.2832, + "step": 4629 + }, + { + "epoch": 1.1021003153447968, + "grad_norm": 0.40248707316608023, + "learning_rate": 4.420138165584311e-06, + "loss": 0.3405, + "step": 4630 + }, + { + "epoch": 1.1023383114178615, + "grad_norm": 0.37274174398754245, + "learning_rate": 4.418223621307293e-06, + "loss": 0.3721, + "step": 4631 + }, + { + "epoch": 1.1025763074909265, + "grad_norm": 0.3929251247694433, + "learning_rate": 4.416309163496635e-06, + "loss": 0.2833, + "step": 4632 + }, + { + "epoch": 1.1028143035639912, + "grad_norm": 0.41200450576439623, + "learning_rate": 4.414394792436877e-06, + "loss": 0.3211, + "step": 4633 + }, + { + "epoch": 1.103052299637056, + "grad_norm": 0.39715739923858967, + "learning_rate": 4.41248050841254e-06, + "loss": 0.3868, + "step": 4634 + }, + { + "epoch": 1.1032902957101207, + "grad_norm": 0.36901909132216004, + "learning_rate": 4.410566311708134e-06, + "loss": 0.316, + "step": 4635 + }, + { + "epoch": 1.1035282917831857, + "grad_norm": 0.3902124213524561, + "learning_rate": 4.408652202608156e-06, + "loss": 0.2951, + "step": 4636 + }, + { + "epoch": 1.1037662878562504, + "grad_norm": 0.37677094100062875, + "learning_rate": 4.40673818139709e-06, + "loss": 0.3694, + "step": 4637 + }, + { + "epoch": 1.1040042839293152, + "grad_norm": 0.4185542623961448, + "learning_rate": 4.404824248359407e-06, + "loss": 0.3527, + "step": 4638 + }, + { + "epoch": 1.10424228000238, + "grad_norm": 0.3668392732421244, + "learning_rate": 4.402910403779564e-06, + "loss": 0.273, + "step": 4639 + }, + { + "epoch": 1.1044802760754449, + "grad_norm": 0.370730106826423, + "learning_rate": 4.400996647942007e-06, + "loss": 0.3284, + "step": 4640 + }, + { + "epoch": 1.1047182721485096, + "grad_norm": 0.3592174118196979, + "learning_rate": 4.399082981131166e-06, + "loss": 0.393, + "step": 4641 + }, + { + "epoch": 1.1049562682215743, + "grad_norm": 0.36645767089673803, + "learning_rate": 4.39716940363146e-06, + "loss": 0.3077, + "step": 4642 + }, + { + "epoch": 1.105194264294639, + "grad_norm": 0.35544644161118766, + "learning_rate": 4.395255915727294e-06, + "loss": 0.278, + "step": 4643 + }, + { + "epoch": 1.105432260367704, + "grad_norm": 0.3834171151203786, + "learning_rate": 4.39334251770306e-06, + "loss": 0.3348, + "step": 4644 + }, + { + "epoch": 1.1056702564407688, + "grad_norm": 0.3825948031558651, + "learning_rate": 4.391429209843135e-06, + "loss": 0.387, + "step": 4645 + }, + { + "epoch": 1.1059082525138335, + "grad_norm": 0.35003853431694987, + "learning_rate": 4.389515992431884e-06, + "loss": 0.3046, + "step": 4646 + }, + { + "epoch": 1.1061462485868983, + "grad_norm": 0.3825762439469127, + "learning_rate": 4.387602865753661e-06, + "loss": 0.327, + "step": 4647 + }, + { + "epoch": 1.1063842446599632, + "grad_norm": 0.3975773348300809, + "learning_rate": 4.385689830092802e-06, + "loss": 0.3667, + "step": 4648 + }, + { + "epoch": 1.106622240733028, + "grad_norm": 0.4272073165238033, + "learning_rate": 4.383776885733631e-06, + "loss": 0.337, + "step": 4649 + }, + { + "epoch": 1.1068602368060927, + "grad_norm": 0.38783082489646, + "learning_rate": 4.3818640329604594e-06, + "loss": 0.3064, + "step": 4650 + }, + { + "epoch": 1.1070982328791574, + "grad_norm": 0.37650129964931706, + "learning_rate": 4.3799512720575845e-06, + "loss": 0.3432, + "step": 4651 + }, + { + "epoch": 1.1073362289522224, + "grad_norm": 0.3741163747344154, + "learning_rate": 4.3780386033092905e-06, + "loss": 0.3701, + "step": 4652 + }, + { + "epoch": 1.1075742250252871, + "grad_norm": 0.37940812325594914, + "learning_rate": 4.376126026999846e-06, + "loss": 0.3167, + "step": 4653 + }, + { + "epoch": 1.1078122210983519, + "grad_norm": 0.3866628177506814, + "learning_rate": 4.374213543413508e-06, + "loss": 0.3018, + "step": 4654 + }, + { + "epoch": 1.1080502171714166, + "grad_norm": 0.4190491021927373, + "learning_rate": 4.372301152834518e-06, + "loss": 0.3635, + "step": 4655 + }, + { + "epoch": 1.1082882132444816, + "grad_norm": 0.36038485452576885, + "learning_rate": 4.370388855547106e-06, + "loss": 0.3567, + "step": 4656 + }, + { + "epoch": 1.1085262093175463, + "grad_norm": 0.36962141459506104, + "learning_rate": 4.368476651835485e-06, + "loss": 0.2713, + "step": 4657 + }, + { + "epoch": 1.108764205390611, + "grad_norm": 0.3760349048148074, + "learning_rate": 4.366564541983858e-06, + "loss": 0.315, + "step": 4658 + }, + { + "epoch": 1.1090022014636758, + "grad_norm": 0.39751754536485306, + "learning_rate": 4.36465252627641e-06, + "loss": 0.3915, + "step": 4659 + }, + { + "epoch": 1.1092401975367407, + "grad_norm": 0.380234301198992, + "learning_rate": 4.362740604997312e-06, + "loss": 0.3096, + "step": 4660 + }, + { + "epoch": 1.1094781936098055, + "grad_norm": 0.3650822635588656, + "learning_rate": 4.360828778430728e-06, + "loss": 0.2777, + "step": 4661 + }, + { + "epoch": 1.1097161896828702, + "grad_norm": 0.38870114316759063, + "learning_rate": 4.358917046860799e-06, + "loss": 0.3885, + "step": 4662 + }, + { + "epoch": 1.109954185755935, + "grad_norm": 0.3821913095469407, + "learning_rate": 4.357005410571657e-06, + "loss": 0.3813, + "step": 4663 + }, + { + "epoch": 1.110192181829, + "grad_norm": 0.38268196285227385, + "learning_rate": 4.355093869847418e-06, + "loss": 0.2743, + "step": 4664 + }, + { + "epoch": 1.1104301779020647, + "grad_norm": 0.3950888429234955, + "learning_rate": 4.353182424972184e-06, + "loss": 0.3504, + "step": 4665 + }, + { + "epoch": 1.1106681739751294, + "grad_norm": 0.38723636956923085, + "learning_rate": 4.351271076230042e-06, + "loss": 0.4136, + "step": 4666 + }, + { + "epoch": 1.1109061700481941, + "grad_norm": 0.3745288707524673, + "learning_rate": 4.349359823905068e-06, + "loss": 0.3286, + "step": 4667 + }, + { + "epoch": 1.111144166121259, + "grad_norm": 0.35266375814406953, + "learning_rate": 4.34744866828132e-06, + "loss": 0.2839, + "step": 4668 + }, + { + "epoch": 1.1113821621943238, + "grad_norm": 0.4116398445629615, + "learning_rate": 4.345537609642843e-06, + "loss": 0.2851, + "step": 4669 + }, + { + "epoch": 1.1116201582673886, + "grad_norm": 0.4227512343669503, + "learning_rate": 4.343626648273667e-06, + "loss": 0.3862, + "step": 4670 + }, + { + "epoch": 1.1118581543404533, + "grad_norm": 0.40325487336775656, + "learning_rate": 4.34171578445781e-06, + "loss": 0.3086, + "step": 4671 + }, + { + "epoch": 1.1120961504135183, + "grad_norm": 0.41223011157726425, + "learning_rate": 4.339805018479273e-06, + "loss": 0.2965, + "step": 4672 + }, + { + "epoch": 1.112334146486583, + "grad_norm": 0.3738155892736554, + "learning_rate": 4.337894350622043e-06, + "loss": 0.371, + "step": 4673 + }, + { + "epoch": 1.1125721425596478, + "grad_norm": 0.3812405808634638, + "learning_rate": 4.335983781170089e-06, + "loss": 0.3477, + "step": 4674 + }, + { + "epoch": 1.1128101386327125, + "grad_norm": 0.4077123636219675, + "learning_rate": 4.334073310407375e-06, + "loss": 0.283, + "step": 4675 + }, + { + "epoch": 1.1130481347057772, + "grad_norm": 0.36774441909673067, + "learning_rate": 4.332162938617841e-06, + "loss": 0.3361, + "step": 4676 + }, + { + "epoch": 1.1132861307788422, + "grad_norm": 0.38011910226148504, + "learning_rate": 4.3302526660854155e-06, + "loss": 0.4107, + "step": 4677 + }, + { + "epoch": 1.113524126851907, + "grad_norm": 0.37483811120325883, + "learning_rate": 4.3283424930940135e-06, + "loss": 0.3076, + "step": 4678 + }, + { + "epoch": 1.1137621229249717, + "grad_norm": 0.4000942474412326, + "learning_rate": 4.326432419927532e-06, + "loss": 0.3053, + "step": 4679 + }, + { + "epoch": 1.1140001189980366, + "grad_norm": 0.4085016676353177, + "learning_rate": 4.324522446869856e-06, + "loss": 0.3721, + "step": 4680 + }, + { + "epoch": 1.1142381150711014, + "grad_norm": 0.38122936015480213, + "learning_rate": 4.322612574204856e-06, + "loss": 0.3561, + "step": 4681 + }, + { + "epoch": 1.1144761111441661, + "grad_norm": 0.46561579745978454, + "learning_rate": 4.320702802216384e-06, + "loss": 0.2734, + "step": 4682 + }, + { + "epoch": 1.1147141072172309, + "grad_norm": 0.38133591698111996, + "learning_rate": 4.318793131188281e-06, + "loss": 0.3366, + "step": 4683 + }, + { + "epoch": 1.1149521032902956, + "grad_norm": 0.39153072129909505, + "learning_rate": 4.316883561404371e-06, + "loss": 0.3929, + "step": 4684 + }, + { + "epoch": 1.1151900993633606, + "grad_norm": 0.39464520028456496, + "learning_rate": 4.314974093148464e-06, + "loss": 0.2922, + "step": 4685 + }, + { + "epoch": 1.1154280954364253, + "grad_norm": 0.3582950558941121, + "learning_rate": 4.313064726704352e-06, + "loss": 0.2963, + "step": 4686 + }, + { + "epoch": 1.11566609150949, + "grad_norm": 0.39622662281594123, + "learning_rate": 4.311155462355817e-06, + "loss": 0.3557, + "step": 4687 + }, + { + "epoch": 1.115904087582555, + "grad_norm": 0.38590265523350004, + "learning_rate": 4.309246300386619e-06, + "loss": 0.3763, + "step": 4688 + }, + { + "epoch": 1.1161420836556197, + "grad_norm": 0.3951607366057468, + "learning_rate": 4.3073372410805115e-06, + "loss": 0.2872, + "step": 4689 + }, + { + "epoch": 1.1163800797286845, + "grad_norm": 0.3600542205066233, + "learning_rate": 4.305428284721225e-06, + "loss": 0.3088, + "step": 4690 + }, + { + "epoch": 1.1166180758017492, + "grad_norm": 0.37643095384884356, + "learning_rate": 4.303519431592479e-06, + "loss": 0.3968, + "step": 4691 + }, + { + "epoch": 1.116856071874814, + "grad_norm": 0.36518409589510575, + "learning_rate": 4.301610681977975e-06, + "loss": 0.3333, + "step": 4692 + }, + { + "epoch": 1.117094067947879, + "grad_norm": 0.37667222528849537, + "learning_rate": 4.2997020361614e-06, + "loss": 0.2865, + "step": 4693 + }, + { + "epoch": 1.1173320640209436, + "grad_norm": 0.4077874035405591, + "learning_rate": 4.297793494426429e-06, + "loss": 0.3299, + "step": 4694 + }, + { + "epoch": 1.1175700600940084, + "grad_norm": 0.36323609120530465, + "learning_rate": 4.295885057056716e-06, + "loss": 0.4048, + "step": 4695 + }, + { + "epoch": 1.1178080561670733, + "grad_norm": 0.36319522250442954, + "learning_rate": 4.293976724335901e-06, + "loss": 0.2963, + "step": 4696 + }, + { + "epoch": 1.118046052240138, + "grad_norm": 0.4272106433249216, + "learning_rate": 4.292068496547612e-06, + "loss": 0.3082, + "step": 4697 + }, + { + "epoch": 1.1182840483132028, + "grad_norm": 0.3620957198196407, + "learning_rate": 4.290160373975457e-06, + "loss": 0.338, + "step": 4698 + }, + { + "epoch": 1.1185220443862676, + "grad_norm": 0.36206405693132543, + "learning_rate": 4.2882523569030325e-06, + "loss": 0.3244, + "step": 4699 + }, + { + "epoch": 1.1187600404593323, + "grad_norm": 0.40476613713113396, + "learning_rate": 4.286344445613914e-06, + "loss": 0.2865, + "step": 4700 + }, + { + "epoch": 1.1189980365323973, + "grad_norm": 0.4027176453027199, + "learning_rate": 4.2844366403916654e-06, + "loss": 0.3403, + "step": 4701 + }, + { + "epoch": 1.119236032605462, + "grad_norm": 0.37078100855116464, + "learning_rate": 4.282528941519836e-06, + "loss": 0.389, + "step": 4702 + }, + { + "epoch": 1.1194740286785267, + "grad_norm": 0.35909975767598074, + "learning_rate": 4.280621349281954e-06, + "loss": 0.2934, + "step": 4703 + }, + { + "epoch": 1.1197120247515917, + "grad_norm": 0.39195993134735413, + "learning_rate": 4.278713863961538e-06, + "loss": 0.2757, + "step": 4704 + }, + { + "epoch": 1.1199500208246564, + "grad_norm": 0.4917626986677953, + "learning_rate": 4.276806485842084e-06, + "loss": 0.3648, + "step": 4705 + }, + { + "epoch": 1.1201880168977212, + "grad_norm": 0.3819285748588736, + "learning_rate": 4.274899215207077e-06, + "loss": 0.3353, + "step": 4706 + }, + { + "epoch": 1.120426012970786, + "grad_norm": 0.3671927065772577, + "learning_rate": 4.272992052339986e-06, + "loss": 0.2612, + "step": 4707 + }, + { + "epoch": 1.1206640090438507, + "grad_norm": 0.44096283257819413, + "learning_rate": 4.271084997524261e-06, + "loss": 0.3343, + "step": 4708 + }, + { + "epoch": 1.1209020051169156, + "grad_norm": 0.41801562980887697, + "learning_rate": 4.269178051043336e-06, + "loss": 0.3761, + "step": 4709 + }, + { + "epoch": 1.1211400011899804, + "grad_norm": 0.3605543613421053, + "learning_rate": 4.2672712131806334e-06, + "loss": 0.2884, + "step": 4710 + }, + { + "epoch": 1.121377997263045, + "grad_norm": 0.4186671411554116, + "learning_rate": 4.265364484219556e-06, + "loss": 0.286, + "step": 4711 + }, + { + "epoch": 1.12161599333611, + "grad_norm": 0.3763168545269733, + "learning_rate": 4.263457864443491e-06, + "loss": 0.3482, + "step": 4712 + }, + { + "epoch": 1.1218539894091748, + "grad_norm": 0.3640333763753411, + "learning_rate": 4.261551354135807e-06, + "loss": 0.3576, + "step": 4713 + }, + { + "epoch": 1.1220919854822395, + "grad_norm": 0.3807965516234715, + "learning_rate": 4.259644953579861e-06, + "loss": 0.2808, + "step": 4714 + }, + { + "epoch": 1.1223299815553043, + "grad_norm": 0.38737302259739137, + "learning_rate": 4.2577386630589875e-06, + "loss": 0.3466, + "step": 4715 + }, + { + "epoch": 1.122567977628369, + "grad_norm": 0.3709834921935741, + "learning_rate": 4.255832482856514e-06, + "loss": 0.3879, + "step": 4716 + }, + { + "epoch": 1.122805973701434, + "grad_norm": 0.397332853820356, + "learning_rate": 4.253926413255743e-06, + "loss": 0.3053, + "step": 4717 + }, + { + "epoch": 1.1230439697744987, + "grad_norm": 0.37589401628573843, + "learning_rate": 4.252020454539965e-06, + "loss": 0.3125, + "step": 4718 + }, + { + "epoch": 1.1232819658475635, + "grad_norm": 0.39174326299097356, + "learning_rate": 4.250114606992451e-06, + "loss": 0.3167, + "step": 4719 + }, + { + "epoch": 1.1235199619206284, + "grad_norm": 0.3617505028367288, + "learning_rate": 4.248208870896456e-06, + "loss": 0.3868, + "step": 4720 + }, + { + "epoch": 1.1237579579936932, + "grad_norm": 0.401359881455322, + "learning_rate": 4.246303246535224e-06, + "loss": 0.2771, + "step": 4721 + }, + { + "epoch": 1.123995954066758, + "grad_norm": 0.3950415349640952, + "learning_rate": 4.244397734191973e-06, + "loss": 0.3337, + "step": 4722 + }, + { + "epoch": 1.1242339501398226, + "grad_norm": 0.40327719323124134, + "learning_rate": 4.242492334149911e-06, + "loss": 0.4014, + "step": 4723 + }, + { + "epoch": 1.1244719462128874, + "grad_norm": 0.37522594034975987, + "learning_rate": 4.2405870466922295e-06, + "loss": 0.3311, + "step": 4724 + }, + { + "epoch": 1.1247099422859523, + "grad_norm": 0.3900606525913724, + "learning_rate": 4.2386818721021e-06, + "loss": 0.2906, + "step": 4725 + }, + { + "epoch": 1.124947938359017, + "grad_norm": 0.36868405071537597, + "learning_rate": 4.236776810662677e-06, + "loss": 0.3458, + "step": 4726 + }, + { + "epoch": 1.1251859344320818, + "grad_norm": 0.3751466988447734, + "learning_rate": 4.2348718626571024e-06, + "loss": 0.3748, + "step": 4727 + }, + { + "epoch": 1.1254239305051468, + "grad_norm": 0.3635206406113209, + "learning_rate": 4.232967028368498e-06, + "loss": 0.27, + "step": 4728 + }, + { + "epoch": 1.1256619265782115, + "grad_norm": 0.3790908884920694, + "learning_rate": 4.231062308079965e-06, + "loss": 0.2931, + "step": 4729 + }, + { + "epoch": 1.1258999226512763, + "grad_norm": 0.3699061956945172, + "learning_rate": 4.229157702074598e-06, + "loss": 0.3573, + "step": 4730 + }, + { + "epoch": 1.126137918724341, + "grad_norm": 0.3541326251134355, + "learning_rate": 4.227253210635467e-06, + "loss": 0.3273, + "step": 4731 + }, + { + "epoch": 1.1263759147974057, + "grad_norm": 0.3706068751730151, + "learning_rate": 4.225348834045625e-06, + "loss": 0.3089, + "step": 4732 + }, + { + "epoch": 1.1266139108704707, + "grad_norm": 0.36912447443811713, + "learning_rate": 4.223444572588111e-06, + "loss": 0.3112, + "step": 4733 + }, + { + "epoch": 1.1268519069435354, + "grad_norm": 0.38272894471100344, + "learning_rate": 4.221540426545943e-06, + "loss": 0.3732, + "step": 4734 + }, + { + "epoch": 1.1270899030166002, + "grad_norm": 0.3659819182595056, + "learning_rate": 4.219636396202127e-06, + "loss": 0.3125, + "step": 4735 + }, + { + "epoch": 1.1273278990896651, + "grad_norm": 0.4029280672903165, + "learning_rate": 4.217732481839647e-06, + "loss": 0.2993, + "step": 4736 + }, + { + "epoch": 1.1275658951627299, + "grad_norm": 0.3471586840183988, + "learning_rate": 4.215828683741473e-06, + "loss": 0.3264, + "step": 4737 + }, + { + "epoch": 1.1278038912357946, + "grad_norm": 0.40496516884275047, + "learning_rate": 4.2139250021905564e-06, + "loss": 0.3911, + "step": 4738 + }, + { + "epoch": 1.1280418873088593, + "grad_norm": 0.3604417470905285, + "learning_rate": 4.212021437469831e-06, + "loss": 0.2733, + "step": 4739 + }, + { + "epoch": 1.128279883381924, + "grad_norm": 0.4045782091345007, + "learning_rate": 4.210117989862213e-06, + "loss": 0.3166, + "step": 4740 + }, + { + "epoch": 1.128517879454989, + "grad_norm": 0.42245205724517865, + "learning_rate": 4.208214659650603e-06, + "loss": 0.3806, + "step": 4741 + }, + { + "epoch": 1.1287558755280538, + "grad_norm": 0.37772806607684767, + "learning_rate": 4.206311447117883e-06, + "loss": 0.3229, + "step": 4742 + }, + { + "epoch": 1.1289938716011185, + "grad_norm": 0.45042461811956686, + "learning_rate": 4.204408352546914e-06, + "loss": 0.2872, + "step": 4743 + }, + { + "epoch": 1.1292318676741835, + "grad_norm": 0.41146418750242547, + "learning_rate": 4.202505376220548e-06, + "loss": 0.3595, + "step": 4744 + }, + { + "epoch": 1.1294698637472482, + "grad_norm": 0.37141506761563525, + "learning_rate": 4.2006025184216124e-06, + "loss": 0.387, + "step": 4745 + }, + { + "epoch": 1.129707859820313, + "grad_norm": 0.4033866279526767, + "learning_rate": 4.198699779432918e-06, + "loss": 0.2798, + "step": 4746 + }, + { + "epoch": 1.1299458558933777, + "grad_norm": 0.39808863942798545, + "learning_rate": 4.19679715953726e-06, + "loss": 0.3075, + "step": 4747 + }, + { + "epoch": 1.1301838519664424, + "grad_norm": 0.35618340589269987, + "learning_rate": 4.194894659017415e-06, + "loss": 0.3694, + "step": 4748 + }, + { + "epoch": 1.1304218480395074, + "grad_norm": 0.3632549519200726, + "learning_rate": 4.192992278156141e-06, + "loss": 0.3462, + "step": 4749 + }, + { + "epoch": 1.1306598441125721, + "grad_norm": 0.4018427851047768, + "learning_rate": 4.191090017236177e-06, + "loss": 0.2608, + "step": 4750 + }, + { + "epoch": 1.1308978401856369, + "grad_norm": 0.3889863500503954, + "learning_rate": 4.189187876540249e-06, + "loss": 0.3352, + "step": 4751 + }, + { + "epoch": 1.1311358362587018, + "grad_norm": 0.3495459311201199, + "learning_rate": 4.187285856351061e-06, + "loss": 0.3973, + "step": 4752 + }, + { + "epoch": 1.1313738323317666, + "grad_norm": 0.3678770041190173, + "learning_rate": 4.1853839569513015e-06, + "loss": 0.295, + "step": 4753 + }, + { + "epoch": 1.1316118284048313, + "grad_norm": 0.4160690563278992, + "learning_rate": 4.1834821786236375e-06, + "loss": 0.3, + "step": 4754 + }, + { + "epoch": 1.131849824477896, + "grad_norm": 0.3814427610026273, + "learning_rate": 4.181580521650722e-06, + "loss": 0.3727, + "step": 4755 + }, + { + "epoch": 1.1320878205509608, + "grad_norm": 0.35113445654594827, + "learning_rate": 4.179678986315185e-06, + "loss": 0.317, + "step": 4756 + }, + { + "epoch": 1.1323258166240258, + "grad_norm": 0.38027676840048824, + "learning_rate": 4.177777572899647e-06, + "loss": 0.2754, + "step": 4757 + }, + { + "epoch": 1.1325638126970905, + "grad_norm": 0.3539448580155885, + "learning_rate": 4.175876281686702e-06, + "loss": 0.3154, + "step": 4758 + }, + { + "epoch": 1.1328018087701552, + "grad_norm": 0.3741958790250975, + "learning_rate": 4.17397511295893e-06, + "loss": 0.4037, + "step": 4759 + }, + { + "epoch": 1.1330398048432202, + "grad_norm": 0.35759297486340763, + "learning_rate": 4.17207406699889e-06, + "loss": 0.2913, + "step": 4760 + }, + { + "epoch": 1.133277800916285, + "grad_norm": 0.41006094302343415, + "learning_rate": 4.170173144089127e-06, + "loss": 0.3066, + "step": 4761 + }, + { + "epoch": 1.1335157969893497, + "grad_norm": 0.3764612848174041, + "learning_rate": 4.168272344512163e-06, + "loss": 0.3449, + "step": 4762 + }, + { + "epoch": 1.1337537930624144, + "grad_norm": 0.3676606468861143, + "learning_rate": 4.1663716685505026e-06, + "loss": 0.353, + "step": 4763 + }, + { + "epoch": 1.1339917891354792, + "grad_norm": 0.43991654949679093, + "learning_rate": 4.164471116486638e-06, + "loss": 0.3147, + "step": 4764 + }, + { + "epoch": 1.1342297852085441, + "grad_norm": 0.389125286946846, + "learning_rate": 4.162570688603035e-06, + "loss": 0.2964, + "step": 4765 + }, + { + "epoch": 1.1344677812816089, + "grad_norm": 0.3953023155670322, + "learning_rate": 4.160670385182144e-06, + "loss": 0.3646, + "step": 4766 + }, + { + "epoch": 1.1347057773546736, + "grad_norm": 0.3803124764959841, + "learning_rate": 4.158770206506398e-06, + "loss": 0.3455, + "step": 4767 + }, + { + "epoch": 1.1349437734277386, + "grad_norm": 0.3932564435551, + "learning_rate": 4.1568701528582105e-06, + "loss": 0.3009, + "step": 4768 + }, + { + "epoch": 1.1351817695008033, + "grad_norm": 0.37337164388298766, + "learning_rate": 4.154970224519976e-06, + "loss": 0.3482, + "step": 4769 + }, + { + "epoch": 1.135419765573868, + "grad_norm": 0.3899160813781137, + "learning_rate": 4.15307042177407e-06, + "loss": 0.4065, + "step": 4770 + }, + { + "epoch": 1.1356577616469328, + "grad_norm": 0.35579178905990316, + "learning_rate": 4.151170744902852e-06, + "loss": 0.3079, + "step": 4771 + }, + { + "epoch": 1.1358957577199975, + "grad_norm": 0.37945121820654776, + "learning_rate": 4.149271194188662e-06, + "loss": 0.3105, + "step": 4772 + }, + { + "epoch": 1.1361337537930625, + "grad_norm": 0.4165547341391943, + "learning_rate": 4.147371769913817e-06, + "loss": 0.3696, + "step": 4773 + }, + { + "epoch": 1.1363717498661272, + "grad_norm": 0.376295683219943, + "learning_rate": 4.145472472360621e-06, + "loss": 0.3747, + "step": 4774 + }, + { + "epoch": 1.136609745939192, + "grad_norm": 0.3886386600985121, + "learning_rate": 4.143573301811355e-06, + "loss": 0.2783, + "step": 4775 + }, + { + "epoch": 1.136847742012257, + "grad_norm": 0.40835168558501084, + "learning_rate": 4.141674258548284e-06, + "loss": 0.3384, + "step": 4776 + }, + { + "epoch": 1.1370857380853217, + "grad_norm": 0.371459747825813, + "learning_rate": 4.13977534285365e-06, + "loss": 0.383, + "step": 4777 + }, + { + "epoch": 1.1373237341583864, + "grad_norm": 0.3944390627645514, + "learning_rate": 4.137876555009684e-06, + "loss": 0.3055, + "step": 4778 + }, + { + "epoch": 1.1375617302314511, + "grad_norm": 0.41245043405705123, + "learning_rate": 4.135977895298588e-06, + "loss": 0.3053, + "step": 4779 + }, + { + "epoch": 1.1377997263045159, + "grad_norm": 0.3846823535751807, + "learning_rate": 4.134079364002552e-06, + "loss": 0.3395, + "step": 4780 + }, + { + "epoch": 1.1380377223775808, + "grad_norm": 0.39696399724800313, + "learning_rate": 4.132180961403744e-06, + "loss": 0.3519, + "step": 4781 + }, + { + "epoch": 1.1382757184506456, + "grad_norm": 0.39325849294481, + "learning_rate": 4.130282687784315e-06, + "loss": 0.2742, + "step": 4782 + }, + { + "epoch": 1.1385137145237103, + "grad_norm": 0.40331167287236297, + "learning_rate": 4.1283845434263935e-06, + "loss": 0.3036, + "step": 4783 + }, + { + "epoch": 1.1387517105967753, + "grad_norm": 0.4029530512494824, + "learning_rate": 4.12648652861209e-06, + "loss": 0.3796, + "step": 4784 + }, + { + "epoch": 1.13898970666984, + "grad_norm": 0.3698197939582225, + "learning_rate": 4.1245886436235e-06, + "loss": 0.3043, + "step": 4785 + }, + { + "epoch": 1.1392277027429047, + "grad_norm": 0.37697533259800486, + "learning_rate": 4.122690888742694e-06, + "loss": 0.2891, + "step": 4786 + }, + { + "epoch": 1.1394656988159695, + "grad_norm": 0.39557375149649615, + "learning_rate": 4.120793264251726e-06, + "loss": 0.3232, + "step": 4787 + }, + { + "epoch": 1.1397036948890342, + "grad_norm": 0.364829176455308, + "learning_rate": 4.118895770432629e-06, + "loss": 0.3722, + "step": 4788 + }, + { + "epoch": 1.1399416909620992, + "grad_norm": 0.3778167261258978, + "learning_rate": 4.1169984075674184e-06, + "loss": 0.2911, + "step": 4789 + }, + { + "epoch": 1.140179687035164, + "grad_norm": 0.44220258438829385, + "learning_rate": 4.115101175938088e-06, + "loss": 0.3305, + "step": 4790 + }, + { + "epoch": 1.1404176831082287, + "grad_norm": 0.36291623619174723, + "learning_rate": 4.113204075826614e-06, + "loss": 0.3847, + "step": 4791 + }, + { + "epoch": 1.1406556791812936, + "grad_norm": 0.36770447829250785, + "learning_rate": 4.111307107514953e-06, + "loss": 0.3274, + "step": 4792 + }, + { + "epoch": 1.1408936752543584, + "grad_norm": 0.35815092054120873, + "learning_rate": 4.109410271285042e-06, + "loss": 0.2641, + "step": 4793 + }, + { + "epoch": 1.141131671327423, + "grad_norm": 0.36035533257546154, + "learning_rate": 4.107513567418796e-06, + "loss": 0.3437, + "step": 4794 + }, + { + "epoch": 1.1413696674004878, + "grad_norm": 0.3963850334927369, + "learning_rate": 4.105616996198113e-06, + "loss": 0.3981, + "step": 4795 + }, + { + "epoch": 1.1416076634735526, + "grad_norm": 0.3367518459228409, + "learning_rate": 4.10372055790487e-06, + "loss": 0.3095, + "step": 4796 + }, + { + "epoch": 1.1418456595466175, + "grad_norm": 0.38016156127534406, + "learning_rate": 4.101824252820926e-06, + "loss": 0.3133, + "step": 4797 + }, + { + "epoch": 1.1420836556196823, + "grad_norm": 0.3912648891515814, + "learning_rate": 4.099928081228115e-06, + "loss": 0.3705, + "step": 4798 + }, + { + "epoch": 1.142321651692747, + "grad_norm": 0.37387297859975754, + "learning_rate": 4.09803204340826e-06, + "loss": 0.3113, + "step": 4799 + }, + { + "epoch": 1.142559647765812, + "grad_norm": 0.3845831972941408, + "learning_rate": 4.096136139643158e-06, + "loss": 0.3138, + "step": 4800 + }, + { + "epoch": 1.1427976438388767, + "grad_norm": 0.3742150865042121, + "learning_rate": 4.094240370214585e-06, + "loss": 0.3117, + "step": 4801 + }, + { + "epoch": 1.1430356399119415, + "grad_norm": 0.39531922526195684, + "learning_rate": 4.0923447354043e-06, + "loss": 0.3866, + "step": 4802 + }, + { + "epoch": 1.1432736359850062, + "grad_norm": 0.36395182823896827, + "learning_rate": 4.090449235494043e-06, + "loss": 0.2797, + "step": 4803 + }, + { + "epoch": 1.143511632058071, + "grad_norm": 0.36855555988204036, + "learning_rate": 4.088553870765529e-06, + "loss": 0.2781, + "step": 4804 + }, + { + "epoch": 1.143749628131136, + "grad_norm": 0.37855067257091296, + "learning_rate": 4.086658641500458e-06, + "loss": 0.3791, + "step": 4805 + }, + { + "epoch": 1.1439876242042006, + "grad_norm": 0.3872640810576243, + "learning_rate": 4.0847635479805095e-06, + "loss": 0.3539, + "step": 4806 + }, + { + "epoch": 1.1442256202772654, + "grad_norm": 0.47053726164577175, + "learning_rate": 4.082868590487339e-06, + "loss": 0.2798, + "step": 4807 + }, + { + "epoch": 1.1444636163503303, + "grad_norm": 0.36992308839073973, + "learning_rate": 4.080973769302584e-06, + "loss": 0.3663, + "step": 4808 + }, + { + "epoch": 1.144701612423395, + "grad_norm": 0.3820408245781083, + "learning_rate": 4.079079084707864e-06, + "loss": 0.3765, + "step": 4809 + }, + { + "epoch": 1.1449396084964598, + "grad_norm": 0.36508657238373526, + "learning_rate": 4.077184536984773e-06, + "loss": 0.2741, + "step": 4810 + }, + { + "epoch": 1.1451776045695246, + "grad_norm": 0.3625603860149019, + "learning_rate": 4.07529012641489e-06, + "loss": 0.2954, + "step": 4811 + }, + { + "epoch": 1.1454156006425893, + "grad_norm": 0.3529289410508482, + "learning_rate": 4.073395853279768e-06, + "loss": 0.3098, + "step": 4812 + }, + { + "epoch": 1.1456535967156543, + "grad_norm": 0.3698173348748085, + "learning_rate": 4.071501717860947e-06, + "loss": 0.387, + "step": 4813 + }, + { + "epoch": 1.145891592788719, + "grad_norm": 0.36193766430504687, + "learning_rate": 4.069607720439942e-06, + "loss": 0.269, + "step": 4814 + }, + { + "epoch": 1.1461295888617837, + "grad_norm": 0.39085304104980867, + "learning_rate": 4.067713861298246e-06, + "loss": 0.3279, + "step": 4815 + }, + { + "epoch": 1.1463675849348487, + "grad_norm": 0.3703336609246225, + "learning_rate": 4.0658201407173335e-06, + "loss": 0.3719, + "step": 4816 + }, + { + "epoch": 1.1466055810079134, + "grad_norm": 0.3642397937157319, + "learning_rate": 4.063926558978657e-06, + "loss": 0.3411, + "step": 4817 + }, + { + "epoch": 1.1468435770809782, + "grad_norm": 0.3570075003209498, + "learning_rate": 4.062033116363653e-06, + "loss": 0.2743, + "step": 4818 + }, + { + "epoch": 1.147081573154043, + "grad_norm": 0.3926987214457056, + "learning_rate": 4.060139813153732e-06, + "loss": 0.338, + "step": 4819 + }, + { + "epoch": 1.1473195692271076, + "grad_norm": 0.36780611870947877, + "learning_rate": 4.058246649630286e-06, + "loss": 0.3727, + "step": 4820 + }, + { + "epoch": 1.1475575653001726, + "grad_norm": 0.3501727672950386, + "learning_rate": 4.056353626074685e-06, + "loss": 0.2804, + "step": 4821 + }, + { + "epoch": 1.1477955613732373, + "grad_norm": 0.4059745265643401, + "learning_rate": 4.05446074276828e-06, + "loss": 0.2883, + "step": 4822 + }, + { + "epoch": 1.148033557446302, + "grad_norm": 0.4002543534646528, + "learning_rate": 4.0525679999924e-06, + "loss": 0.3783, + "step": 4823 + }, + { + "epoch": 1.148271553519367, + "grad_norm": 0.37811094840803156, + "learning_rate": 4.050675398028354e-06, + "loss": 0.3345, + "step": 4824 + }, + { + "epoch": 1.1485095495924318, + "grad_norm": 0.39558594949102993, + "learning_rate": 4.048782937157427e-06, + "loss": 0.2961, + "step": 4825 + }, + { + "epoch": 1.1487475456654965, + "grad_norm": 0.39775175118133455, + "learning_rate": 4.04689061766089e-06, + "loss": 0.3206, + "step": 4826 + }, + { + "epoch": 1.1489855417385613, + "grad_norm": 0.37445723489979743, + "learning_rate": 4.044998439819986e-06, + "loss": 0.3986, + "step": 4827 + }, + { + "epoch": 1.149223537811626, + "grad_norm": 0.3739855904232471, + "learning_rate": 4.043106403915938e-06, + "loss": 0.2836, + "step": 4828 + }, + { + "epoch": 1.149461533884691, + "grad_norm": 0.4283283093929408, + "learning_rate": 4.041214510229952e-06, + "loss": 0.3011, + "step": 4829 + }, + { + "epoch": 1.1496995299577557, + "grad_norm": 0.40090894213533174, + "learning_rate": 4.0393227590432085e-06, + "loss": 0.3815, + "step": 4830 + }, + { + "epoch": 1.1499375260308204, + "grad_norm": 0.3617108318222815, + "learning_rate": 4.037431150636868e-06, + "loss": 0.3683, + "step": 4831 + }, + { + "epoch": 1.1501755221038854, + "grad_norm": 0.405107436573071, + "learning_rate": 4.0355396852920735e-06, + "loss": 0.2949, + "step": 4832 + }, + { + "epoch": 1.1504135181769501, + "grad_norm": 0.4091009038760894, + "learning_rate": 4.03364836328994e-06, + "loss": 0.2922, + "step": 4833 + }, + { + "epoch": 1.1506515142500149, + "grad_norm": 0.3723010060588749, + "learning_rate": 4.0317571849115665e-06, + "loss": 0.3649, + "step": 4834 + }, + { + "epoch": 1.1508895103230796, + "grad_norm": 0.3486410506502387, + "learning_rate": 4.029866150438029e-06, + "loss": 0.2946, + "step": 4835 + }, + { + "epoch": 1.1511275063961444, + "grad_norm": 0.40140072119081, + "learning_rate": 4.02797526015038e-06, + "loss": 0.2545, + "step": 4836 + }, + { + "epoch": 1.1513655024692093, + "grad_norm": 0.36966831779652004, + "learning_rate": 4.026084514329656e-06, + "loss": 0.3484, + "step": 4837 + }, + { + "epoch": 1.151603498542274, + "grad_norm": 0.3744156869841182, + "learning_rate": 4.024193913256865e-06, + "loss": 0.3852, + "step": 4838 + }, + { + "epoch": 1.1518414946153388, + "grad_norm": 0.3862547285091858, + "learning_rate": 4.022303457212998e-06, + "loss": 0.2861, + "step": 4839 + }, + { + "epoch": 1.1520794906884038, + "grad_norm": 0.3873257265237817, + "learning_rate": 4.020413146479026e-06, + "loss": 0.3121, + "step": 4840 + }, + { + "epoch": 1.1523174867614685, + "grad_norm": 0.4522160266820443, + "learning_rate": 4.018522981335894e-06, + "loss": 0.3734, + "step": 4841 + }, + { + "epoch": 1.1525554828345332, + "grad_norm": 0.3643381212709832, + "learning_rate": 4.0166329620645275e-06, + "loss": 0.3121, + "step": 4842 + }, + { + "epoch": 1.152793478907598, + "grad_norm": 0.40744044121578465, + "learning_rate": 4.01474308894583e-06, + "loss": 0.302, + "step": 4843 + }, + { + "epoch": 1.1530314749806627, + "grad_norm": 0.3799665007235963, + "learning_rate": 4.012853362260683e-06, + "loss": 0.325, + "step": 4844 + }, + { + "epoch": 1.1532694710537277, + "grad_norm": 0.40864748898009884, + "learning_rate": 4.010963782289948e-06, + "loss": 0.3931, + "step": 4845 + }, + { + "epoch": 1.1535074671267924, + "grad_norm": 0.39417564672662864, + "learning_rate": 4.009074349314462e-06, + "loss": 0.3143, + "step": 4846 + }, + { + "epoch": 1.1537454631998572, + "grad_norm": 0.42660463794138176, + "learning_rate": 4.007185063615043e-06, + "loss": 0.2997, + "step": 4847 + }, + { + "epoch": 1.1539834592729221, + "grad_norm": 0.45151329355721237, + "learning_rate": 4.005295925472484e-06, + "loss": 0.3525, + "step": 4848 + }, + { + "epoch": 1.1542214553459869, + "grad_norm": 0.3534689223030634, + "learning_rate": 4.003406935167558e-06, + "loss": 0.3348, + "step": 4849 + }, + { + "epoch": 1.1544594514190516, + "grad_norm": 0.3741669360663959, + "learning_rate": 4.001518092981017e-06, + "loss": 0.2907, + "step": 4850 + }, + { + "epoch": 1.1546974474921163, + "grad_norm": 0.3795133464854993, + "learning_rate": 3.999629399193589e-06, + "loss": 0.3346, + "step": 4851 + }, + { + "epoch": 1.154935443565181, + "grad_norm": 0.4063978193238941, + "learning_rate": 3.997740854085979e-06, + "loss": 0.369, + "step": 4852 + }, + { + "epoch": 1.155173439638246, + "grad_norm": 0.35078236527156986, + "learning_rate": 3.995852457938871e-06, + "loss": 0.3191, + "step": 4853 + }, + { + "epoch": 1.1554114357113108, + "grad_norm": 0.38716927785738015, + "learning_rate": 3.993964211032931e-06, + "loss": 0.2937, + "step": 4854 + }, + { + "epoch": 1.1556494317843755, + "grad_norm": 0.41552238495054705, + "learning_rate": 3.992076113648797e-06, + "loss": 0.3745, + "step": 4855 + }, + { + "epoch": 1.1558874278574405, + "grad_norm": 0.3909495828147318, + "learning_rate": 3.990188166067088e-06, + "loss": 0.3445, + "step": 4856 + }, + { + "epoch": 1.1561254239305052, + "grad_norm": 0.3773512559043716, + "learning_rate": 3.9883003685684e-06, + "loss": 0.2753, + "step": 4857 + }, + { + "epoch": 1.15636342000357, + "grad_norm": 0.3815969618990709, + "learning_rate": 3.9864127214333035e-06, + "loss": 0.3294, + "step": 4858 + }, + { + "epoch": 1.1566014160766347, + "grad_norm": 0.3682808970267619, + "learning_rate": 3.984525224942352e-06, + "loss": 0.3877, + "step": 4859 + }, + { + "epoch": 1.1568394121496994, + "grad_norm": 0.38593825650163915, + "learning_rate": 3.982637879376075e-06, + "loss": 0.3136, + "step": 4860 + }, + { + "epoch": 1.1570774082227644, + "grad_norm": 0.4087706279356301, + "learning_rate": 3.980750685014975e-06, + "loss": 0.2707, + "step": 4861 + }, + { + "epoch": 1.1573154042958291, + "grad_norm": 0.36407650716725964, + "learning_rate": 3.978863642139541e-06, + "loss": 0.3644, + "step": 4862 + }, + { + "epoch": 1.1575534003688939, + "grad_norm": 0.391041663164892, + "learning_rate": 3.97697675103023e-06, + "loss": 0.3702, + "step": 4863 + }, + { + "epoch": 1.1577913964419588, + "grad_norm": 0.36440676261441685, + "learning_rate": 3.975090011967483e-06, + "loss": 0.2939, + "step": 4864 + }, + { + "epoch": 1.1580293925150236, + "grad_norm": 0.3944390664922029, + "learning_rate": 3.973203425231715e-06, + "loss": 0.3345, + "step": 4865 + }, + { + "epoch": 1.1582673885880883, + "grad_norm": 0.37043202122893515, + "learning_rate": 3.971316991103319e-06, + "loss": 0.3966, + "step": 4866 + }, + { + "epoch": 1.158505384661153, + "grad_norm": 0.3596933682030246, + "learning_rate": 3.969430709862665e-06, + "loss": 0.3262, + "step": 4867 + }, + { + "epoch": 1.1587433807342178, + "grad_norm": 0.415916141016302, + "learning_rate": 3.967544581790105e-06, + "loss": 0.2955, + "step": 4868 + }, + { + "epoch": 1.1589813768072827, + "grad_norm": 0.364949249649275, + "learning_rate": 3.965658607165961e-06, + "loss": 0.3108, + "step": 4869 + }, + { + "epoch": 1.1592193728803475, + "grad_norm": 0.4038748930409246, + "learning_rate": 3.9637727862705375e-06, + "loss": 0.3811, + "step": 4870 + }, + { + "epoch": 1.1594573689534122, + "grad_norm": 0.37034415569974766, + "learning_rate": 3.961887119384111e-06, + "loss": 0.2626, + "step": 4871 + }, + { + "epoch": 1.1596953650264772, + "grad_norm": 0.37614882290717155, + "learning_rate": 3.960001606786942e-06, + "loss": 0.2926, + "step": 4872 + }, + { + "epoch": 1.159933361099542, + "grad_norm": 0.6816306788345337, + "learning_rate": 3.958116248759262e-06, + "loss": 0.3556, + "step": 4873 + }, + { + "epoch": 1.1601713571726067, + "grad_norm": 0.3601925064970214, + "learning_rate": 3.9562310455812825e-06, + "loss": 0.3386, + "step": 4874 + }, + { + "epoch": 1.1604093532456714, + "grad_norm": 0.37272600080408436, + "learning_rate": 3.9543459975331914e-06, + "loss": 0.2821, + "step": 4875 + }, + { + "epoch": 1.1606473493187361, + "grad_norm": 0.37365911547388664, + "learning_rate": 3.952461104895153e-06, + "loss": 0.304, + "step": 4876 + }, + { + "epoch": 1.160885345391801, + "grad_norm": 0.4220848745812085, + "learning_rate": 3.95057636794731e-06, + "loss": 0.3819, + "step": 4877 + }, + { + "epoch": 1.1611233414648658, + "grad_norm": 0.36664046373827575, + "learning_rate": 3.9486917869697795e-06, + "loss": 0.3174, + "step": 4878 + }, + { + "epoch": 1.1613613375379306, + "grad_norm": 0.36434472833349824, + "learning_rate": 3.9468073622426574e-06, + "loss": 0.3156, + "step": 4879 + }, + { + "epoch": 1.1615993336109953, + "grad_norm": 0.39980314450070703, + "learning_rate": 3.944923094046016e-06, + "loss": 0.3815, + "step": 4880 + }, + { + "epoch": 1.1618373296840603, + "grad_norm": 0.3825456910870827, + "learning_rate": 3.9430389826599026e-06, + "loss": 0.3578, + "step": 4881 + }, + { + "epoch": 1.162075325757125, + "grad_norm": 0.3901463596231784, + "learning_rate": 3.9411550283643465e-06, + "loss": 0.2817, + "step": 4882 + }, + { + "epoch": 1.1623133218301898, + "grad_norm": 0.4095148078840206, + "learning_rate": 3.939271231439348e-06, + "loss": 0.3111, + "step": 4883 + }, + { + "epoch": 1.1625513179032545, + "grad_norm": 0.40126458626689326, + "learning_rate": 3.937387592164884e-06, + "loss": 0.4164, + "step": 4884 + }, + { + "epoch": 1.1627893139763195, + "grad_norm": 0.40524146316425336, + "learning_rate": 3.935504110820912e-06, + "loss": 0.3031, + "step": 4885 + }, + { + "epoch": 1.1630273100493842, + "grad_norm": 0.41484043466115905, + "learning_rate": 3.933620787687365e-06, + "loss": 0.2683, + "step": 4886 + }, + { + "epoch": 1.163265306122449, + "grad_norm": 0.42894419912735265, + "learning_rate": 3.931737623044149e-06, + "loss": 0.3446, + "step": 4887 + }, + { + "epoch": 1.1635033021955137, + "grad_norm": 0.37569764295018193, + "learning_rate": 3.929854617171149e-06, + "loss": 0.3875, + "step": 4888 + }, + { + "epoch": 1.1637412982685786, + "grad_norm": 0.38169828943839607, + "learning_rate": 3.927971770348228e-06, + "loss": 0.2981, + "step": 4889 + }, + { + "epoch": 1.1639792943416434, + "grad_norm": 0.4326907379906986, + "learning_rate": 3.9260890828552225e-06, + "loss": 0.321, + "step": 4890 + }, + { + "epoch": 1.1642172904147081, + "grad_norm": 0.37216325656330895, + "learning_rate": 3.924206554971947e-06, + "loss": 0.3721, + "step": 4891 + }, + { + "epoch": 1.1644552864877729, + "grad_norm": 0.38522417456420815, + "learning_rate": 3.92232418697819e-06, + "loss": 0.3308, + "step": 4892 + }, + { + "epoch": 1.1646932825608378, + "grad_norm": 0.40316171495665337, + "learning_rate": 3.920441979153721e-06, + "loss": 0.2768, + "step": 4893 + }, + { + "epoch": 1.1649312786339026, + "grad_norm": 0.37987776460120803, + "learning_rate": 3.918559931778277e-06, + "loss": 0.3307, + "step": 4894 + }, + { + "epoch": 1.1651692747069673, + "grad_norm": 0.3813096868034741, + "learning_rate": 3.916678045131584e-06, + "loss": 0.3603, + "step": 4895 + }, + { + "epoch": 1.165407270780032, + "grad_norm": 0.351364277393361, + "learning_rate": 3.914796319493333e-06, + "loss": 0.2873, + "step": 4896 + }, + { + "epoch": 1.165645266853097, + "grad_norm": 0.3926597142637336, + "learning_rate": 3.912914755143196e-06, + "loss": 0.3285, + "step": 4897 + }, + { + "epoch": 1.1658832629261617, + "grad_norm": 0.3905874686541914, + "learning_rate": 3.911033352360818e-06, + "loss": 0.3833, + "step": 4898 + }, + { + "epoch": 1.1661212589992265, + "grad_norm": 0.35936466590080296, + "learning_rate": 3.909152111425825e-06, + "loss": 0.3381, + "step": 4899 + }, + { + "epoch": 1.1663592550722912, + "grad_norm": 0.3725108986050583, + "learning_rate": 3.907271032617815e-06, + "loss": 0.2671, + "step": 4900 + }, + { + "epoch": 1.1665972511453562, + "grad_norm": 0.3835107673363921, + "learning_rate": 3.905390116216362e-06, + "loss": 0.3108, + "step": 4901 + }, + { + "epoch": 1.166835247218421, + "grad_norm": 0.3562168456730705, + "learning_rate": 3.9035093625010164e-06, + "loss": 0.379, + "step": 4902 + }, + { + "epoch": 1.1670732432914857, + "grad_norm": 0.34281015797938996, + "learning_rate": 3.901628771751306e-06, + "loss": 0.3064, + "step": 4903 + }, + { + "epoch": 1.1673112393645504, + "grad_norm": 0.40233559709709515, + "learning_rate": 3.899748344246732e-06, + "loss": 0.3005, + "step": 4904 + }, + { + "epoch": 1.1675492354376154, + "grad_norm": 0.422059793045895, + "learning_rate": 3.897868080266774e-06, + "loss": 0.3804, + "step": 4905 + }, + { + "epoch": 1.16778723151068, + "grad_norm": 0.4089575743526671, + "learning_rate": 3.895987980090884e-06, + "loss": 0.332, + "step": 4906 + }, + { + "epoch": 1.1680252275837448, + "grad_norm": 0.3718624537861076, + "learning_rate": 3.894108043998492e-06, + "loss": 0.2945, + "step": 4907 + }, + { + "epoch": 1.1682632236568096, + "grad_norm": 0.4028263680996291, + "learning_rate": 3.8922282722690006e-06, + "loss": 0.3229, + "step": 4908 + }, + { + "epoch": 1.1685012197298745, + "grad_norm": 0.45190473479232407, + "learning_rate": 3.890348665181796e-06, + "loss": 0.3799, + "step": 4909 + }, + { + "epoch": 1.1687392158029393, + "grad_norm": 0.37951311105400887, + "learning_rate": 3.888469223016231e-06, + "loss": 0.3311, + "step": 4910 + }, + { + "epoch": 1.168977211876004, + "grad_norm": 0.398817375650266, + "learning_rate": 3.886589946051637e-06, + "loss": 0.3005, + "step": 4911 + }, + { + "epoch": 1.1692152079490687, + "grad_norm": 0.3788321557640797, + "learning_rate": 3.884710834567321e-06, + "loss": 0.3668, + "step": 4912 + }, + { + "epoch": 1.1694532040221337, + "grad_norm": 0.3679850869739202, + "learning_rate": 3.882831888842566e-06, + "loss": 0.4159, + "step": 4913 + }, + { + "epoch": 1.1696912000951984, + "grad_norm": 0.38977532532858084, + "learning_rate": 3.880953109156631e-06, + "loss": 0.2969, + "step": 4914 + }, + { + "epoch": 1.1699291961682632, + "grad_norm": 0.3673051411020341, + "learning_rate": 3.879074495788746e-06, + "loss": 0.3382, + "step": 4915 + }, + { + "epoch": 1.170167192241328, + "grad_norm": 0.40474641214394, + "learning_rate": 3.8771960490181226e-06, + "loss": 0.3647, + "step": 4916 + }, + { + "epoch": 1.1704051883143929, + "grad_norm": 0.3677584145288695, + "learning_rate": 3.875317769123943e-06, + "loss": 0.3355, + "step": 4917 + }, + { + "epoch": 1.1706431843874576, + "grad_norm": 0.38928549279284, + "learning_rate": 3.873439656385367e-06, + "loss": 0.2852, + "step": 4918 + }, + { + "epoch": 1.1708811804605224, + "grad_norm": 0.40919373160433137, + "learning_rate": 3.871561711081526e-06, + "loss": 0.3662, + "step": 4919 + }, + { + "epoch": 1.171119176533587, + "grad_norm": 0.3911851721735495, + "learning_rate": 3.869683933491533e-06, + "loss": 0.3833, + "step": 4920 + }, + { + "epoch": 1.171357172606652, + "grad_norm": 0.382574197170867, + "learning_rate": 3.8678063238944674e-06, + "loss": 0.31, + "step": 4921 + }, + { + "epoch": 1.1715951686797168, + "grad_norm": 0.37949281565255283, + "learning_rate": 3.865928882569392e-06, + "loss": 0.3181, + "step": 4922 + }, + { + "epoch": 1.1718331647527815, + "grad_norm": 0.38403965708871085, + "learning_rate": 3.86405160979534e-06, + "loss": 0.3739, + "step": 4923 + }, + { + "epoch": 1.1720711608258463, + "grad_norm": 0.38512569570032895, + "learning_rate": 3.8621745058513225e-06, + "loss": 0.3229, + "step": 4924 + }, + { + "epoch": 1.1723091568989112, + "grad_norm": 0.3957163771288543, + "learning_rate": 3.8602975710163205e-06, + "loss": 0.2844, + "step": 4925 + }, + { + "epoch": 1.172547152971976, + "grad_norm": 0.39701056868163465, + "learning_rate": 3.858420805569295e-06, + "loss": 0.338, + "step": 4926 + }, + { + "epoch": 1.1727851490450407, + "grad_norm": 0.41141027404117647, + "learning_rate": 3.856544209789179e-06, + "loss": 0.4213, + "step": 4927 + }, + { + "epoch": 1.1730231451181055, + "grad_norm": 0.356743827281772, + "learning_rate": 3.854667783954882e-06, + "loss": 0.3054, + "step": 4928 + }, + { + "epoch": 1.1732611411911704, + "grad_norm": 0.36020835139169016, + "learning_rate": 3.852791528345286e-06, + "loss": 0.2547, + "step": 4929 + }, + { + "epoch": 1.1734991372642352, + "grad_norm": 0.39353940700048456, + "learning_rate": 3.85091544323925e-06, + "loss": 0.3495, + "step": 4930 + }, + { + "epoch": 1.1737371333373, + "grad_norm": 0.3739662096404696, + "learning_rate": 3.849039528915605e-06, + "loss": 0.3416, + "step": 4931 + }, + { + "epoch": 1.1739751294103646, + "grad_norm": 0.4021690846442395, + "learning_rate": 3.847163785653159e-06, + "loss": 0.2926, + "step": 4932 + }, + { + "epoch": 1.1742131254834296, + "grad_norm": 0.35310815717245986, + "learning_rate": 3.845288213730695e-06, + "loss": 0.3188, + "step": 4933 + }, + { + "epoch": 1.1744511215564943, + "grad_norm": 0.37742127338548703, + "learning_rate": 3.843412813426967e-06, + "loss": 0.3857, + "step": 4934 + }, + { + "epoch": 1.174689117629559, + "grad_norm": 0.36700195619184406, + "learning_rate": 3.84153758502071e-06, + "loss": 0.3065, + "step": 4935 + }, + { + "epoch": 1.1749271137026238, + "grad_norm": 0.37862390020379844, + "learning_rate": 3.839662528790625e-06, + "loss": 0.2822, + "step": 4936 + }, + { + "epoch": 1.1751651097756888, + "grad_norm": 0.37919008323525716, + "learning_rate": 3.837787645015395e-06, + "loss": 0.3459, + "step": 4937 + }, + { + "epoch": 1.1754031058487535, + "grad_norm": 0.3805425537038319, + "learning_rate": 3.835912933973671e-06, + "loss": 0.4043, + "step": 4938 + }, + { + "epoch": 1.1756411019218183, + "grad_norm": 0.3812253553869978, + "learning_rate": 3.834038395944084e-06, + "loss": 0.2991, + "step": 4939 + }, + { + "epoch": 1.175879097994883, + "grad_norm": 0.3885217222382259, + "learning_rate": 3.832164031205237e-06, + "loss": 0.3469, + "step": 4940 + }, + { + "epoch": 1.176117094067948, + "grad_norm": 0.3496306031416618, + "learning_rate": 3.830289840035705e-06, + "loss": 0.39, + "step": 4941 + }, + { + "epoch": 1.1763550901410127, + "grad_norm": 0.3725490258794375, + "learning_rate": 3.82841582271404e-06, + "loss": 0.3009, + "step": 4942 + }, + { + "epoch": 1.1765930862140774, + "grad_norm": 0.3961940913795775, + "learning_rate": 3.8265419795187675e-06, + "loss": 0.3007, + "step": 4943 + }, + { + "epoch": 1.1768310822871422, + "grad_norm": 0.40324342500628063, + "learning_rate": 3.824668310728387e-06, + "loss": 0.3132, + "step": 4944 + }, + { + "epoch": 1.1770690783602071, + "grad_norm": 0.36079064416493883, + "learning_rate": 3.822794816621371e-06, + "loss": 0.4038, + "step": 4945 + }, + { + "epoch": 1.1773070744332719, + "grad_norm": 0.40154958109483985, + "learning_rate": 3.8209214974761685e-06, + "loss": 0.2703, + "step": 4946 + }, + { + "epoch": 1.1775450705063366, + "grad_norm": 0.40470924949834836, + "learning_rate": 3.819048353571201e-06, + "loss": 0.3144, + "step": 4947 + }, + { + "epoch": 1.1777830665794013, + "grad_norm": 0.4251133239570864, + "learning_rate": 3.817175385184861e-06, + "loss": 0.3686, + "step": 4948 + }, + { + "epoch": 1.1780210626524663, + "grad_norm": 0.3824740738540186, + "learning_rate": 3.815302592595522e-06, + "loss": 0.3313, + "step": 4949 + }, + { + "epoch": 1.178259058725531, + "grad_norm": 0.43168535718039347, + "learning_rate": 3.813429976081526e-06, + "loss": 0.2672, + "step": 4950 + }, + { + "epoch": 1.1784970547985958, + "grad_norm": 0.3844314375612966, + "learning_rate": 3.8115575359211905e-06, + "loss": 0.3406, + "step": 4951 + }, + { + "epoch": 1.1787350508716605, + "grad_norm": 0.38263513845378433, + "learning_rate": 3.809685272392804e-06, + "loss": 0.3849, + "step": 4952 + }, + { + "epoch": 1.1789730469447255, + "grad_norm": 0.3613570750787706, + "learning_rate": 3.8078131857746346e-06, + "loss": 0.2964, + "step": 4953 + }, + { + "epoch": 1.1792110430177902, + "grad_norm": 0.38485653076788723, + "learning_rate": 3.8059412763449187e-06, + "loss": 0.3033, + "step": 4954 + }, + { + "epoch": 1.179449039090855, + "grad_norm": 0.3490639146215179, + "learning_rate": 3.804069544381869e-06, + "loss": 0.3481, + "step": 4955 + }, + { + "epoch": 1.1796870351639197, + "grad_norm": 0.3856291558218839, + "learning_rate": 3.802197990163671e-06, + "loss": 0.3471, + "step": 4956 + }, + { + "epoch": 1.1799250312369847, + "grad_norm": 0.39550282927113356, + "learning_rate": 3.8003266139684832e-06, + "loss": 0.2598, + "step": 4957 + }, + { + "epoch": 1.1801630273100494, + "grad_norm": 0.3862155603983133, + "learning_rate": 3.798455416074439e-06, + "loss": 0.3378, + "step": 4958 + }, + { + "epoch": 1.1804010233831141, + "grad_norm": 0.3839437629883702, + "learning_rate": 3.7965843967596453e-06, + "loss": 0.3982, + "step": 4959 + }, + { + "epoch": 1.1806390194561789, + "grad_norm": 0.4078427772915639, + "learning_rate": 3.7947135563021814e-06, + "loss": 0.2971, + "step": 4960 + }, + { + "epoch": 1.1808770155292438, + "grad_norm": 0.35307107716990815, + "learning_rate": 3.7928428949800996e-06, + "loss": 0.2988, + "step": 4961 + }, + { + "epoch": 1.1811150116023086, + "grad_norm": 0.3992799282030228, + "learning_rate": 3.7909724130714277e-06, + "loss": 0.3522, + "step": 4962 + }, + { + "epoch": 1.1813530076753733, + "grad_norm": 0.381054998500261, + "learning_rate": 3.7891021108541642e-06, + "loss": 0.3713, + "step": 4963 + }, + { + "epoch": 1.181591003748438, + "grad_norm": 0.40058461166635667, + "learning_rate": 3.787231988606284e-06, + "loss": 0.3104, + "step": 4964 + }, + { + "epoch": 1.181828999821503, + "grad_norm": 0.36686697436707344, + "learning_rate": 3.785362046605732e-06, + "loss": 0.3072, + "step": 4965 + }, + { + "epoch": 1.1820669958945678, + "grad_norm": 0.4149878680339965, + "learning_rate": 3.7834922851304297e-06, + "loss": 0.4061, + "step": 4966 + }, + { + "epoch": 1.1823049919676325, + "grad_norm": 0.3695370875718639, + "learning_rate": 3.7816227044582687e-06, + "loss": 0.3289, + "step": 4967 + }, + { + "epoch": 1.1825429880406972, + "grad_norm": 0.3596942240313325, + "learning_rate": 3.7797533048671146e-06, + "loss": 0.2954, + "step": 4968 + }, + { + "epoch": 1.1827809841137622, + "grad_norm": 0.39500031375483247, + "learning_rate": 3.7778840866348075e-06, + "loss": 0.3261, + "step": 4969 + }, + { + "epoch": 1.183018980186827, + "grad_norm": 0.41374777726632095, + "learning_rate": 3.7760150500391584e-06, + "loss": 0.3944, + "step": 4970 + }, + { + "epoch": 1.1832569762598917, + "grad_norm": 0.3537235485993441, + "learning_rate": 3.7741461953579527e-06, + "loss": 0.2971, + "step": 4971 + }, + { + "epoch": 1.1834949723329564, + "grad_norm": 0.3748434274707176, + "learning_rate": 3.772277522868949e-06, + "loss": 0.3244, + "step": 4972 + }, + { + "epoch": 1.1837329684060214, + "grad_norm": 0.38166375878188813, + "learning_rate": 3.770409032849878e-06, + "loss": 0.3598, + "step": 4973 + }, + { + "epoch": 1.1839709644790861, + "grad_norm": 0.3922577890411974, + "learning_rate": 3.7685407255784424e-06, + "loss": 0.3348, + "step": 4974 + }, + { + "epoch": 1.1842089605521509, + "grad_norm": 0.3845300676159576, + "learning_rate": 3.766672601332319e-06, + "loss": 0.2835, + "step": 4975 + }, + { + "epoch": 1.1844469566252156, + "grad_norm": 0.3864290171184251, + "learning_rate": 3.76480466038916e-06, + "loss": 0.3286, + "step": 4976 + }, + { + "epoch": 1.1846849526982806, + "grad_norm": 0.40164860660518437, + "learning_rate": 3.7629369030265834e-06, + "loss": 0.4308, + "step": 4977 + }, + { + "epoch": 1.1849229487713453, + "grad_norm": 0.38101030100174693, + "learning_rate": 3.7610693295221885e-06, + "loss": 0.278, + "step": 4978 + }, + { + "epoch": 1.18516094484441, + "grad_norm": 0.3827769312151021, + "learning_rate": 3.7592019401535397e-06, + "loss": 0.2915, + "step": 4979 + }, + { + "epoch": 1.1853989409174748, + "grad_norm": 0.39348943008486564, + "learning_rate": 3.7573347351981785e-06, + "loss": 0.3532, + "step": 4980 + }, + { + "epoch": 1.1856369369905397, + "grad_norm": 0.35366811317076075, + "learning_rate": 3.7554677149336186e-06, + "loss": 0.355, + "step": 4981 + }, + { + "epoch": 1.1858749330636045, + "grad_norm": 0.3730839491699928, + "learning_rate": 3.7536008796373447e-06, + "loss": 0.2738, + "step": 4982 + }, + { + "epoch": 1.1861129291366692, + "grad_norm": 0.36612366371157234, + "learning_rate": 3.7517342295868142e-06, + "loss": 0.3053, + "step": 4983 + }, + { + "epoch": 1.186350925209734, + "grad_norm": 0.3823419705705663, + "learning_rate": 3.7498677650594585e-06, + "loss": 0.3915, + "step": 4984 + }, + { + "epoch": 1.186588921282799, + "grad_norm": 0.36531157147493043, + "learning_rate": 3.7480014863326786e-06, + "loss": 0.2942, + "step": 4985 + }, + { + "epoch": 1.1868269173558637, + "grad_norm": 0.401505598688106, + "learning_rate": 3.746135393683851e-06, + "loss": 0.2788, + "step": 4986 + }, + { + "epoch": 1.1870649134289284, + "grad_norm": 0.38093798207572305, + "learning_rate": 3.7442694873903236e-06, + "loss": 0.3348, + "step": 4987 + }, + { + "epoch": 1.1873029095019931, + "grad_norm": 0.38132115350268836, + "learning_rate": 3.742403767729414e-06, + "loss": 0.3639, + "step": 4988 + }, + { + "epoch": 1.187540905575058, + "grad_norm": 0.39346856748575865, + "learning_rate": 3.740538234978417e-06, + "loss": 0.2893, + "step": 4989 + }, + { + "epoch": 1.1877789016481228, + "grad_norm": 0.3613470905129032, + "learning_rate": 3.7386728894145965e-06, + "loss": 0.3158, + "step": 4990 + }, + { + "epoch": 1.1880168977211876, + "grad_norm": 0.3715632977391001, + "learning_rate": 3.7368077313151866e-06, + "loss": 0.4092, + "step": 4991 + }, + { + "epoch": 1.1882548937942523, + "grad_norm": 0.43216241244974535, + "learning_rate": 3.7349427609573985e-06, + "loss": 0.3197, + "step": 4992 + }, + { + "epoch": 1.1884928898673173, + "grad_norm": 0.38373124295883176, + "learning_rate": 3.7330779786184122e-06, + "loss": 0.3042, + "step": 4993 + }, + { + "epoch": 1.188730885940382, + "grad_norm": 0.36301878619729605, + "learning_rate": 3.731213384575381e-06, + "loss": 0.3551, + "step": 4994 + }, + { + "epoch": 1.1889688820134467, + "grad_norm": 0.3633370905617256, + "learning_rate": 3.7293489791054293e-06, + "loss": 0.381, + "step": 4995 + }, + { + "epoch": 1.1892068780865115, + "grad_norm": 0.36132719462940693, + "learning_rate": 3.727484762485653e-06, + "loss": 0.2943, + "step": 4996 + }, + { + "epoch": 1.1894448741595764, + "grad_norm": 0.3626513454326024, + "learning_rate": 3.7256207349931216e-06, + "loss": 0.3158, + "step": 4997 + }, + { + "epoch": 1.1896828702326412, + "grad_norm": 0.39265659273339876, + "learning_rate": 3.7237568969048766e-06, + "loss": 0.3878, + "step": 4998 + }, + { + "epoch": 1.189920866305706, + "grad_norm": 0.35277130854248157, + "learning_rate": 3.7218932484979287e-06, + "loss": 0.338, + "step": 4999 + }, + { + "epoch": 1.1901588623787707, + "grad_norm": 0.39473980233793865, + "learning_rate": 3.7200297900492632e-06, + "loss": 0.2937, + "step": 5000 + }, + { + "epoch": 1.1903968584518356, + "grad_norm": 0.3618280922673368, + "learning_rate": 3.7181665218358354e-06, + "loss": 0.3374, + "step": 5001 + }, + { + "epoch": 1.1906348545249004, + "grad_norm": 0.39899118970440267, + "learning_rate": 3.7163034441345725e-06, + "loss": 0.3833, + "step": 5002 + }, + { + "epoch": 1.190872850597965, + "grad_norm": 0.36708868631705965, + "learning_rate": 3.7144405572223762e-06, + "loss": 0.3217, + "step": 5003 + }, + { + "epoch": 1.1911108466710298, + "grad_norm": 0.36066225989423434, + "learning_rate": 3.7125778613761164e-06, + "loss": 0.3079, + "step": 5004 + }, + { + "epoch": 1.1913488427440948, + "grad_norm": 0.4031849072858775, + "learning_rate": 3.710715356872634e-06, + "loss": 0.3519, + "step": 5005 + }, + { + "epoch": 1.1915868388171595, + "grad_norm": 0.39768511346134466, + "learning_rate": 3.708853043988746e-06, + "loss": 0.3612, + "step": 5006 + }, + { + "epoch": 1.1918248348902243, + "grad_norm": 0.3746114220221446, + "learning_rate": 3.7069909230012376e-06, + "loss": 0.3005, + "step": 5007 + }, + { + "epoch": 1.192062830963289, + "grad_norm": 0.3935100573355243, + "learning_rate": 3.705128994186865e-06, + "loss": 0.3071, + "step": 5008 + }, + { + "epoch": 1.192300827036354, + "grad_norm": 0.3544256856276741, + "learning_rate": 3.7032672578223583e-06, + "loss": 0.3868, + "step": 5009 + }, + { + "epoch": 1.1925388231094187, + "grad_norm": 0.37426633165695083, + "learning_rate": 3.701405714184416e-06, + "loss": 0.2953, + "step": 5010 + }, + { + "epoch": 1.1927768191824835, + "grad_norm": 0.42034731649414875, + "learning_rate": 3.699544363549711e-06, + "loss": 0.3005, + "step": 5011 + }, + { + "epoch": 1.1930148152555482, + "grad_norm": 0.4082503637885382, + "learning_rate": 3.6976832061948845e-06, + "loss": 0.3477, + "step": 5012 + }, + { + "epoch": 1.1932528113286132, + "grad_norm": 0.8416333609801906, + "learning_rate": 3.695822242396552e-06, + "loss": 0.3687, + "step": 5013 + }, + { + "epoch": 1.193490807401678, + "grad_norm": 0.37912063370018584, + "learning_rate": 3.693961472431298e-06, + "loss": 0.2979, + "step": 5014 + }, + { + "epoch": 1.1937288034747426, + "grad_norm": 0.401423257824879, + "learning_rate": 3.6921008965756775e-06, + "loss": 0.3199, + "step": 5015 + }, + { + "epoch": 1.1939667995478074, + "grad_norm": 0.3800891163492014, + "learning_rate": 3.690240515106221e-06, + "loss": 0.3765, + "step": 5016 + }, + { + "epoch": 1.1942047956208723, + "grad_norm": 0.3669803455039553, + "learning_rate": 3.6883803282994256e-06, + "loss": 0.3246, + "step": 5017 + }, + { + "epoch": 1.194442791693937, + "grad_norm": 0.4200743670629511, + "learning_rate": 3.6865203364317605e-06, + "loss": 0.2893, + "step": 5018 + }, + { + "epoch": 1.1946807877670018, + "grad_norm": 0.37720436817760933, + "learning_rate": 3.6846605397796677e-06, + "loss": 0.3496, + "step": 5019 + }, + { + "epoch": 1.1949187838400666, + "grad_norm": 0.38267177297360994, + "learning_rate": 3.6828009386195592e-06, + "loss": 0.3647, + "step": 5020 + }, + { + "epoch": 1.1951567799131315, + "grad_norm": 0.3855032217341476, + "learning_rate": 3.680941533227817e-06, + "loss": 0.2893, + "step": 5021 + }, + { + "epoch": 1.1953947759861963, + "grad_norm": 0.41661785772795806, + "learning_rate": 3.679082323880795e-06, + "loss": 0.2966, + "step": 5022 + }, + { + "epoch": 1.195632772059261, + "grad_norm": 0.38996824463256474, + "learning_rate": 3.6772233108548182e-06, + "loss": 0.3682, + "step": 5023 + }, + { + "epoch": 1.1958707681323257, + "grad_norm": 0.3579485476112444, + "learning_rate": 3.6753644944261806e-06, + "loss": 0.3461, + "step": 5024 + }, + { + "epoch": 1.1961087642053907, + "grad_norm": 0.38986130206284325, + "learning_rate": 3.6735058748711492e-06, + "loss": 0.2968, + "step": 5025 + }, + { + "epoch": 1.1963467602784554, + "grad_norm": 0.3703854439689402, + "learning_rate": 3.6716474524659608e-06, + "loss": 0.3479, + "step": 5026 + }, + { + "epoch": 1.1965847563515202, + "grad_norm": 0.4016888133573442, + "learning_rate": 3.669789227486823e-06, + "loss": 0.3763, + "step": 5027 + }, + { + "epoch": 1.196822752424585, + "grad_norm": 0.3767473882222918, + "learning_rate": 3.667931200209913e-06, + "loss": 0.2947, + "step": 5028 + }, + { + "epoch": 1.1970607484976499, + "grad_norm": 0.39627616648735325, + "learning_rate": 3.6660733709113805e-06, + "loss": 0.2915, + "step": 5029 + }, + { + "epoch": 1.1972987445707146, + "grad_norm": 0.3824120166417069, + "learning_rate": 3.664215739867345e-06, + "loss": 0.3831, + "step": 5030 + }, + { + "epoch": 1.1975367406437794, + "grad_norm": 0.3796189625607475, + "learning_rate": 3.662358307353897e-06, + "loss": 0.3594, + "step": 5031 + }, + { + "epoch": 1.197774736716844, + "grad_norm": 0.4057709147757602, + "learning_rate": 3.6605010736470945e-06, + "loss": 0.3109, + "step": 5032 + }, + { + "epoch": 1.198012732789909, + "grad_norm": 0.38925059478299473, + "learning_rate": 3.6586440390229705e-06, + "loss": 0.3187, + "step": 5033 + }, + { + "epoch": 1.1982507288629738, + "grad_norm": 0.3795304534109489, + "learning_rate": 3.656787203757527e-06, + "loss": 0.4042, + "step": 5034 + }, + { + "epoch": 1.1984887249360385, + "grad_norm": 0.33530850356764125, + "learning_rate": 3.654930568126734e-06, + "loss": 0.2998, + "step": 5035 + }, + { + "epoch": 1.1987267210091033, + "grad_norm": 0.43587793237382827, + "learning_rate": 3.6530741324065343e-06, + "loss": 0.3011, + "step": 5036 + }, + { + "epoch": 1.1989647170821682, + "grad_norm": 0.392590497330817, + "learning_rate": 3.65121789687284e-06, + "loss": 0.3432, + "step": 5037 + }, + { + "epoch": 1.199202713155233, + "grad_norm": 0.4238986802151149, + "learning_rate": 3.6493618618015335e-06, + "loss": 0.3859, + "step": 5038 + }, + { + "epoch": 1.1994407092282977, + "grad_norm": 0.3718207166623785, + "learning_rate": 3.647506027468467e-06, + "loss": 0.2891, + "step": 5039 + }, + { + "epoch": 1.1996787053013624, + "grad_norm": 0.39411079054515186, + "learning_rate": 3.645650394149465e-06, + "loss": 0.3328, + "step": 5040 + }, + { + "epoch": 1.1999167013744274, + "grad_norm": 0.36581910992126304, + "learning_rate": 3.6437949621203184e-06, + "loss": 0.3807, + "step": 5041 + }, + { + "epoch": 1.2001546974474921, + "grad_norm": 0.3661757831955536, + "learning_rate": 3.6419397316567902e-06, + "loss": 0.3113, + "step": 5042 + }, + { + "epoch": 1.2003926935205569, + "grad_norm": 0.45796517113773294, + "learning_rate": 3.640084703034616e-06, + "loss": 0.3018, + "step": 5043 + }, + { + "epoch": 1.2006306895936216, + "grad_norm": 0.35520419192547475, + "learning_rate": 3.6382298765294978e-06, + "loss": 0.3468, + "step": 5044 + }, + { + "epoch": 1.2008686856666866, + "grad_norm": 0.3770376085730245, + "learning_rate": 3.6363752524171083e-06, + "loss": 0.3914, + "step": 5045 + }, + { + "epoch": 1.2011066817397513, + "grad_norm": 0.37954607416403163, + "learning_rate": 3.6345208309730885e-06, + "loss": 0.3111, + "step": 5046 + }, + { + "epoch": 1.201344677812816, + "grad_norm": 0.3863432844778005, + "learning_rate": 3.632666612473056e-06, + "loss": 0.319, + "step": 5047 + }, + { + "epoch": 1.2015826738858808, + "grad_norm": 0.4500532032678951, + "learning_rate": 3.630812597192591e-06, + "loss": 0.3771, + "step": 5048 + }, + { + "epoch": 1.2018206699589458, + "grad_norm": 0.37110161136850156, + "learning_rate": 3.628958785407246e-06, + "loss": 0.3539, + "step": 5049 + }, + { + "epoch": 1.2020586660320105, + "grad_norm": 0.38550398130650293, + "learning_rate": 3.6271051773925434e-06, + "loss": 0.2796, + "step": 5050 + }, + { + "epoch": 1.2022966621050752, + "grad_norm": 0.41164229002791736, + "learning_rate": 3.6252517734239757e-06, + "loss": 0.3169, + "step": 5051 + }, + { + "epoch": 1.20253465817814, + "grad_norm": 0.4273076385043179, + "learning_rate": 3.6233985737770034e-06, + "loss": 0.3748, + "step": 5052 + }, + { + "epoch": 1.202772654251205, + "grad_norm": 0.38342810948189754, + "learning_rate": 3.6215455787270587e-06, + "loss": 0.3031, + "step": 5053 + }, + { + "epoch": 1.2030106503242697, + "grad_norm": 0.3815063828509451, + "learning_rate": 3.6196927885495426e-06, + "loss": 0.3084, + "step": 5054 + }, + { + "epoch": 1.2032486463973344, + "grad_norm": 0.39701942080159974, + "learning_rate": 3.617840203519825e-06, + "loss": 0.3449, + "step": 5055 + }, + { + "epoch": 1.2034866424703992, + "grad_norm": 0.3826077784262676, + "learning_rate": 3.6159878239132453e-06, + "loss": 0.3861, + "step": 5056 + }, + { + "epoch": 1.2037246385434641, + "grad_norm": 0.3761935038939176, + "learning_rate": 3.614135650005115e-06, + "loss": 0.2809, + "step": 5057 + }, + { + "epoch": 1.2039626346165289, + "grad_norm": 0.38813967758028695, + "learning_rate": 3.6122836820707107e-06, + "loss": 0.3292, + "step": 5058 + }, + { + "epoch": 1.2042006306895936, + "grad_norm": 0.4094613511832653, + "learning_rate": 3.6104319203852826e-06, + "loss": 0.3954, + "step": 5059 + }, + { + "epoch": 1.2044386267626583, + "grad_norm": 0.35131103373017636, + "learning_rate": 3.608580365224045e-06, + "loss": 0.3157, + "step": 5060 + }, + { + "epoch": 1.2046766228357233, + "grad_norm": 0.3779833656079706, + "learning_rate": 3.60672901686219e-06, + "loss": 0.3155, + "step": 5061 + }, + { + "epoch": 1.204914618908788, + "grad_norm": 0.3801342639897726, + "learning_rate": 3.60487787557487e-06, + "loss": 0.3412, + "step": 5062 + }, + { + "epoch": 1.2051526149818528, + "grad_norm": 0.3661980162834189, + "learning_rate": 3.603026941637212e-06, + "loss": 0.3519, + "step": 5063 + }, + { + "epoch": 1.2053906110549175, + "grad_norm": 0.3839709384547976, + "learning_rate": 3.6011762153243096e-06, + "loss": 0.3023, + "step": 5064 + }, + { + "epoch": 1.2056286071279825, + "grad_norm": 0.39407579614935423, + "learning_rate": 3.5993256969112266e-06, + "loss": 0.3062, + "step": 5065 + }, + { + "epoch": 1.2058666032010472, + "grad_norm": 0.3593214219415961, + "learning_rate": 3.5974753866729966e-06, + "loss": 0.4125, + "step": 5066 + }, + { + "epoch": 1.206104599274112, + "grad_norm": 0.3672574458694017, + "learning_rate": 3.5956252848846206e-06, + "loss": 0.3438, + "step": 5067 + }, + { + "epoch": 1.2063425953471767, + "grad_norm": 0.3723709621082384, + "learning_rate": 3.5937753918210705e-06, + "loss": 0.2843, + "step": 5068 + }, + { + "epoch": 1.2065805914202417, + "grad_norm": 0.3901256291535116, + "learning_rate": 3.5919257077572835e-06, + "loss": 0.3146, + "step": 5069 + }, + { + "epoch": 1.2068185874933064, + "grad_norm": 0.37899864532094263, + "learning_rate": 3.5900762329681717e-06, + "loss": 0.3923, + "step": 5070 + }, + { + "epoch": 1.2070565835663711, + "grad_norm": 0.35151540606068965, + "learning_rate": 3.5882269677286117e-06, + "loss": 0.2781, + "step": 5071 + }, + { + "epoch": 1.2072945796394359, + "grad_norm": 0.3834887945303703, + "learning_rate": 3.58637791231345e-06, + "loss": 0.3403, + "step": 5072 + }, + { + "epoch": 1.2075325757125008, + "grad_norm": 0.3971997277785191, + "learning_rate": 3.5845290669975015e-06, + "loss": 0.3797, + "step": 5073 + }, + { + "epoch": 1.2077705717855656, + "grad_norm": 0.36240585843749556, + "learning_rate": 3.5826804320555486e-06, + "loss": 0.3157, + "step": 5074 + }, + { + "epoch": 1.2080085678586303, + "grad_norm": 0.3767347339492281, + "learning_rate": 3.5808320077623485e-06, + "loss": 0.2653, + "step": 5075 + }, + { + "epoch": 1.208246563931695, + "grad_norm": 0.38456131915052816, + "learning_rate": 3.5789837943926208e-06, + "loss": 0.3516, + "step": 5076 + }, + { + "epoch": 1.20848456000476, + "grad_norm": 0.4128907315601859, + "learning_rate": 3.5771357922210555e-06, + "loss": 0.3727, + "step": 5077 + }, + { + "epoch": 1.2087225560778248, + "grad_norm": 0.3723204263394461, + "learning_rate": 3.5752880015223113e-06, + "loss": 0.2884, + "step": 5078 + }, + { + "epoch": 1.2089605521508895, + "grad_norm": 0.3637169168437252, + "learning_rate": 3.5734404225710157e-06, + "loss": 0.2807, + "step": 5079 + }, + { + "epoch": 1.2091985482239542, + "grad_norm": 0.38764628752736946, + "learning_rate": 3.5715930556417644e-06, + "loss": 0.3578, + "step": 5080 + }, + { + "epoch": 1.2094365442970192, + "grad_norm": 0.3450780601550473, + "learning_rate": 3.569745901009123e-06, + "loss": 0.3605, + "step": 5081 + }, + { + "epoch": 1.209674540370084, + "grad_norm": 0.3581139717614697, + "learning_rate": 3.5678989589476228e-06, + "loss": 0.2712, + "step": 5082 + }, + { + "epoch": 1.2099125364431487, + "grad_norm": 0.34988502902927887, + "learning_rate": 3.5660522297317648e-06, + "loss": 0.3645, + "step": 5083 + }, + { + "epoch": 1.2101505325162134, + "grad_norm": 0.3708416415228171, + "learning_rate": 3.5642057136360205e-06, + "loss": 0.3858, + "step": 5084 + }, + { + "epoch": 1.2103885285892784, + "grad_norm": 0.38942129205042847, + "learning_rate": 3.562359410934827e-06, + "loss": 0.3324, + "step": 5085 + }, + { + "epoch": 1.210626524662343, + "grad_norm": 0.3806323119990266, + "learning_rate": 3.560513321902591e-06, + "loss": 0.29, + "step": 5086 + }, + { + "epoch": 1.2108645207354078, + "grad_norm": 0.3700249289986975, + "learning_rate": 3.5586674468136838e-06, + "loss": 0.3722, + "step": 5087 + }, + { + "epoch": 1.2111025168084726, + "grad_norm": 0.3719331028186071, + "learning_rate": 3.5568217859424535e-06, + "loss": 0.3775, + "step": 5088 + }, + { + "epoch": 1.2113405128815375, + "grad_norm": 0.36400596249820083, + "learning_rate": 3.554976339563209e-06, + "loss": 0.3116, + "step": 5089 + }, + { + "epoch": 1.2115785089546023, + "grad_norm": 0.38994792469616396, + "learning_rate": 3.553131107950227e-06, + "loss": 0.3026, + "step": 5090 + }, + { + "epoch": 1.211816505027667, + "grad_norm": 0.3941320652284189, + "learning_rate": 3.551286091377757e-06, + "loss": 0.3698, + "step": 5091 + }, + { + "epoch": 1.2120545011007318, + "grad_norm": 0.4046036167301338, + "learning_rate": 3.549441290120013e-06, + "loss": 0.3097, + "step": 5092 + }, + { + "epoch": 1.2122924971737967, + "grad_norm": 0.4039353108910549, + "learning_rate": 3.547596704451179e-06, + "loss": 0.2699, + "step": 5093 + }, + { + "epoch": 1.2125304932468615, + "grad_norm": 0.4363727370434713, + "learning_rate": 3.545752334645405e-06, + "loss": 0.3303, + "step": 5094 + }, + { + "epoch": 1.2127684893199262, + "grad_norm": 0.3840952200590034, + "learning_rate": 3.5439081809768103e-06, + "loss": 0.3837, + "step": 5095 + }, + { + "epoch": 1.213006485392991, + "grad_norm": 0.37439745222911935, + "learning_rate": 3.5420642437194807e-06, + "loss": 0.3031, + "step": 5096 + }, + { + "epoch": 1.213244481466056, + "grad_norm": 0.40481232665765876, + "learning_rate": 3.540220523147474e-06, + "loss": 0.3129, + "step": 5097 + }, + { + "epoch": 1.2134824775391206, + "grad_norm": 0.3706994690208066, + "learning_rate": 3.53837701953481e-06, + "loss": 0.3574, + "step": 5098 + }, + { + "epoch": 1.2137204736121854, + "grad_norm": 0.36982293775644515, + "learning_rate": 3.53653373315548e-06, + "loss": 0.3361, + "step": 5099 + }, + { + "epoch": 1.2139584696852501, + "grad_norm": 0.44289417728869535, + "learning_rate": 3.534690664283441e-06, + "loss": 0.2839, + "step": 5100 + }, + { + "epoch": 1.214196465758315, + "grad_norm": 0.39709885081786933, + "learning_rate": 3.5328478131926182e-06, + "loss": 0.3385, + "step": 5101 + }, + { + "epoch": 1.2144344618313798, + "grad_norm": 0.386735844300816, + "learning_rate": 3.5310051801569077e-06, + "loss": 0.3861, + "step": 5102 + }, + { + "epoch": 1.2146724579044446, + "grad_norm": 0.38596849207457884, + "learning_rate": 3.5291627654501683e-06, + "loss": 0.3018, + "step": 5103 + }, + { + "epoch": 1.2149104539775093, + "grad_norm": 0.3772909341628557, + "learning_rate": 3.5273205693462294e-06, + "loss": 0.276, + "step": 5104 + }, + { + "epoch": 1.2151484500505743, + "grad_norm": 0.40038122463925, + "learning_rate": 3.5254785921188855e-06, + "loss": 0.3544, + "step": 5105 + }, + { + "epoch": 1.215386446123639, + "grad_norm": 0.394545199953025, + "learning_rate": 3.5236368340419015e-06, + "loss": 0.3795, + "step": 5106 + }, + { + "epoch": 1.2156244421967037, + "grad_norm": 0.37930731017600633, + "learning_rate": 3.5217952953890065e-06, + "loss": 0.2946, + "step": 5107 + }, + { + "epoch": 1.2158624382697685, + "grad_norm": 0.3801188952039268, + "learning_rate": 3.5199539764338995e-06, + "loss": 0.3532, + "step": 5108 + }, + { + "epoch": 1.2161004343428334, + "grad_norm": 0.3760239490498582, + "learning_rate": 3.518112877450247e-06, + "loss": 0.3925, + "step": 5109 + }, + { + "epoch": 1.2163384304158982, + "grad_norm": 0.34044640736378307, + "learning_rate": 3.51627199871168e-06, + "loss": 0.2874, + "step": 5110 + }, + { + "epoch": 1.216576426488963, + "grad_norm": 0.4167896879648559, + "learning_rate": 3.5144313404918e-06, + "loss": 0.3038, + "step": 5111 + }, + { + "epoch": 1.2168144225620277, + "grad_norm": 0.375356242770381, + "learning_rate": 3.512590903064175e-06, + "loss": 0.3309, + "step": 5112 + }, + { + "epoch": 1.2170524186350926, + "grad_norm": 0.37061202523878695, + "learning_rate": 3.5107506867023377e-06, + "loss": 0.373, + "step": 5113 + }, + { + "epoch": 1.2172904147081574, + "grad_norm": 0.36705161824794263, + "learning_rate": 3.508910691679791e-06, + "loss": 0.3056, + "step": 5114 + }, + { + "epoch": 1.217528410781222, + "grad_norm": 0.3939317720079383, + "learning_rate": 3.5070709182700007e-06, + "loss": 0.3024, + "step": 5115 + }, + { + "epoch": 1.2177664068542868, + "grad_norm": 0.4054053311852339, + "learning_rate": 3.5052313667464075e-06, + "loss": 0.3854, + "step": 5116 + }, + { + "epoch": 1.2180044029273518, + "grad_norm": 0.33993185418916133, + "learning_rate": 3.5033920373824125e-06, + "loss": 0.3238, + "step": 5117 + }, + { + "epoch": 1.2182423990004165, + "grad_norm": 0.3434027285290983, + "learning_rate": 3.5015529304513845e-06, + "loss": 0.2779, + "step": 5118 + }, + { + "epoch": 1.2184803950734813, + "grad_norm": 0.39857723088957964, + "learning_rate": 3.499714046226661e-06, + "loss": 0.3658, + "step": 5119 + }, + { + "epoch": 1.218718391146546, + "grad_norm": 0.3991383073000782, + "learning_rate": 3.4978753849815457e-06, + "loss": 0.3682, + "step": 5120 + }, + { + "epoch": 1.218956387219611, + "grad_norm": 0.3685347968394098, + "learning_rate": 3.4960369469893087e-06, + "loss": 0.2886, + "step": 5121 + }, + { + "epoch": 1.2191943832926757, + "grad_norm": 0.3833481773800204, + "learning_rate": 3.4941987325231873e-06, + "loss": 0.3273, + "step": 5122 + }, + { + "epoch": 1.2194323793657404, + "grad_norm": 0.3827498561469543, + "learning_rate": 3.4923607418563855e-06, + "loss": 0.3803, + "step": 5123 + }, + { + "epoch": 1.2196703754388052, + "grad_norm": 0.3628595561239951, + "learning_rate": 3.490522975262076e-06, + "loss": 0.3226, + "step": 5124 + }, + { + "epoch": 1.2199083715118701, + "grad_norm": 0.3427380203291989, + "learning_rate": 3.4886854330133947e-06, + "loss": 0.2499, + "step": 5125 + }, + { + "epoch": 1.2201463675849349, + "grad_norm": 0.3902097175795296, + "learning_rate": 3.4868481153834454e-06, + "loss": 0.328, + "step": 5126 + }, + { + "epoch": 1.2203843636579996, + "grad_norm": 0.3953893414153582, + "learning_rate": 3.485011022645301e-06, + "loss": 0.4049, + "step": 5127 + }, + { + "epoch": 1.2206223597310644, + "grad_norm": 0.36238449295552555, + "learning_rate": 3.4831741550719964e-06, + "loss": 0.3147, + "step": 5128 + }, + { + "epoch": 1.2208603558041293, + "grad_norm": 0.3978994446444037, + "learning_rate": 3.4813375129365357e-06, + "loss": 0.2917, + "step": 5129 + }, + { + "epoch": 1.221098351877194, + "grad_norm": 0.3669821823525373, + "learning_rate": 3.4795010965118926e-06, + "loss": 0.3574, + "step": 5130 + }, + { + "epoch": 1.2213363479502588, + "grad_norm": 0.3692786501186213, + "learning_rate": 3.477664906071001e-06, + "loss": 0.3685, + "step": 5131 + }, + { + "epoch": 1.2215743440233235, + "grad_norm": 0.3751050152516497, + "learning_rate": 3.4758289418867665e-06, + "loss": 0.2776, + "step": 5132 + }, + { + "epoch": 1.2218123400963885, + "grad_norm": 0.3598859162181419, + "learning_rate": 3.473993204232056e-06, + "loss": 0.3547, + "step": 5133 + }, + { + "epoch": 1.2220503361694532, + "grad_norm": 0.36721640794062704, + "learning_rate": 3.4721576933797072e-06, + "loss": 0.3836, + "step": 5134 + }, + { + "epoch": 1.222288332242518, + "grad_norm": 0.37753613271798336, + "learning_rate": 3.470322409602523e-06, + "loss": 0.3061, + "step": 5135 + }, + { + "epoch": 1.2225263283155827, + "grad_norm": 0.36033685671415805, + "learning_rate": 3.4684873531732704e-06, + "loss": 0.2989, + "step": 5136 + }, + { + "epoch": 1.2227643243886477, + "grad_norm": 0.4190827209156023, + "learning_rate": 3.4666525243646845e-06, + "loss": 0.3299, + "step": 5137 + }, + { + "epoch": 1.2230023204617124, + "grad_norm": 0.41563040679604657, + "learning_rate": 3.464817923449467e-06, + "loss": 0.3757, + "step": 5138 + }, + { + "epoch": 1.2232403165347772, + "grad_norm": 0.4017327361796809, + "learning_rate": 3.4629835507002853e-06, + "loss": 0.2852, + "step": 5139 + }, + { + "epoch": 1.223478312607842, + "grad_norm": 0.39000305457626766, + "learning_rate": 3.461149406389771e-06, + "loss": 0.306, + "step": 5140 + }, + { + "epoch": 1.2237163086809069, + "grad_norm": 0.3870106967773287, + "learning_rate": 3.4593154907905246e-06, + "loss": 0.3645, + "step": 5141 + }, + { + "epoch": 1.2239543047539716, + "grad_norm": 0.38791895997961046, + "learning_rate": 3.4574818041751113e-06, + "loss": 0.3239, + "step": 5142 + }, + { + "epoch": 1.2241923008270363, + "grad_norm": 0.3959723469109777, + "learning_rate": 3.45564834681606e-06, + "loss": 0.2933, + "step": 5143 + }, + { + "epoch": 1.224430296900101, + "grad_norm": 0.36905913364646237, + "learning_rate": 3.4538151189858717e-06, + "loss": 0.3453, + "step": 5144 + }, + { + "epoch": 1.224668292973166, + "grad_norm": 0.3837400246280125, + "learning_rate": 3.451982120957007e-06, + "loss": 0.3988, + "step": 5145 + }, + { + "epoch": 1.2249062890462308, + "grad_norm": 0.35298513550582294, + "learning_rate": 3.450149353001896e-06, + "loss": 0.3025, + "step": 5146 + }, + { + "epoch": 1.2251442851192955, + "grad_norm": 0.372652851908798, + "learning_rate": 3.4483168153929324e-06, + "loss": 0.2998, + "step": 5147 + }, + { + "epoch": 1.2253822811923603, + "grad_norm": 0.3571143797030416, + "learning_rate": 3.4464845084024767e-06, + "loss": 0.3579, + "step": 5148 + }, + { + "epoch": 1.2256202772654252, + "grad_norm": 0.3669338824803904, + "learning_rate": 3.444652432302855e-06, + "loss": 0.3227, + "step": 5149 + }, + { + "epoch": 1.22585827333849, + "grad_norm": 0.38715345328480644, + "learning_rate": 3.4428205873663584e-06, + "loss": 0.2854, + "step": 5150 + }, + { + "epoch": 1.2260962694115547, + "grad_norm": 0.3667959879928667, + "learning_rate": 3.440988973865246e-06, + "loss": 0.3279, + "step": 5151 + }, + { + "epoch": 1.2263342654846194, + "grad_norm": 0.39136089925706374, + "learning_rate": 3.4391575920717407e-06, + "loss": 0.4016, + "step": 5152 + }, + { + "epoch": 1.2265722615576844, + "grad_norm": 0.38499224408384597, + "learning_rate": 3.4373264422580305e-06, + "loss": 0.2915, + "step": 5153 + }, + { + "epoch": 1.2268102576307491, + "grad_norm": 0.3893740742863143, + "learning_rate": 3.4354955246962694e-06, + "loss": 0.2866, + "step": 5154 + }, + { + "epoch": 1.2270482537038139, + "grad_norm": 0.38314543939198864, + "learning_rate": 3.4336648396585777e-06, + "loss": 0.3695, + "step": 5155 + }, + { + "epoch": 1.2272862497768786, + "grad_norm": 0.3553545241693441, + "learning_rate": 3.4318343874170378e-06, + "loss": 0.3653, + "step": 5156 + }, + { + "epoch": 1.2275242458499434, + "grad_norm": 0.3721824699789484, + "learning_rate": 3.4300041682437046e-06, + "loss": 0.3055, + "step": 5157 + }, + { + "epoch": 1.2277622419230083, + "grad_norm": 0.34746845955767125, + "learning_rate": 3.428174182410592e-06, + "loss": 0.3031, + "step": 5158 + }, + { + "epoch": 1.228000237996073, + "grad_norm": 0.39404533678367354, + "learning_rate": 3.4263444301896805e-06, + "loss": 0.3667, + "step": 5159 + }, + { + "epoch": 1.2282382340691378, + "grad_norm": 0.3725519922822452, + "learning_rate": 3.424514911852917e-06, + "loss": 0.2965, + "step": 5160 + }, + { + "epoch": 1.2284762301422028, + "grad_norm": 0.38747366594584076, + "learning_rate": 3.4226856276722133e-06, + "loss": 0.2872, + "step": 5161 + }, + { + "epoch": 1.2287142262152675, + "grad_norm": 0.38316424613293093, + "learning_rate": 3.4208565779194467e-06, + "loss": 0.3519, + "step": 5162 + }, + { + "epoch": 1.2289522222883322, + "grad_norm": 0.38446797539419886, + "learning_rate": 3.4190277628664583e-06, + "loss": 0.3526, + "step": 5163 + }, + { + "epoch": 1.229190218361397, + "grad_norm": 0.3794454331861941, + "learning_rate": 3.417199182785055e-06, + "loss": 0.2841, + "step": 5164 + }, + { + "epoch": 1.2294282144344617, + "grad_norm": 0.3878874853038905, + "learning_rate": 3.4153708379470107e-06, + "loss": 0.3286, + "step": 5165 + }, + { + "epoch": 1.2296662105075267, + "grad_norm": 0.40528158013767834, + "learning_rate": 3.4135427286240613e-06, + "loss": 0.3699, + "step": 5166 + }, + { + "epoch": 1.2299042065805914, + "grad_norm": 0.3725084616775476, + "learning_rate": 3.41171485508791e-06, + "loss": 0.3351, + "step": 5167 + }, + { + "epoch": 1.2301422026536561, + "grad_norm": 0.3539613656037255, + "learning_rate": 3.409887217610223e-06, + "loss": 0.2858, + "step": 5168 + }, + { + "epoch": 1.230380198726721, + "grad_norm": 0.3923246643319525, + "learning_rate": 3.4080598164626333e-06, + "loss": 0.3168, + "step": 5169 + }, + { + "epoch": 1.2306181947997858, + "grad_norm": 0.3783741339958614, + "learning_rate": 3.4062326519167354e-06, + "loss": 0.3958, + "step": 5170 + }, + { + "epoch": 1.2308561908728506, + "grad_norm": 0.34812779553971557, + "learning_rate": 3.4044057242440954e-06, + "loss": 0.2827, + "step": 5171 + }, + { + "epoch": 1.2310941869459153, + "grad_norm": 0.3862300590706353, + "learning_rate": 3.402579033716238e-06, + "loss": 0.3068, + "step": 5172 + }, + { + "epoch": 1.23133218301898, + "grad_norm": 0.41044439207246175, + "learning_rate": 3.400752580604655e-06, + "loss": 0.3579, + "step": 5173 + }, + { + "epoch": 1.231570179092045, + "grad_norm": 0.35576711138677686, + "learning_rate": 3.3989263651808013e-06, + "loss": 0.3522, + "step": 5174 + }, + { + "epoch": 1.2318081751651098, + "grad_norm": 0.37907648138016425, + "learning_rate": 3.397100387716098e-06, + "loss": 0.2591, + "step": 5175 + }, + { + "epoch": 1.2320461712381745, + "grad_norm": 0.4047380890985508, + "learning_rate": 3.395274648481932e-06, + "loss": 0.3778, + "step": 5176 + }, + { + "epoch": 1.2322841673112395, + "grad_norm": 0.3767213089270945, + "learning_rate": 3.39344914774965e-06, + "loss": 0.3612, + "step": 5177 + }, + { + "epoch": 1.2325221633843042, + "grad_norm": 0.3618213335927789, + "learning_rate": 3.391623885790571e-06, + "loss": 0.2956, + "step": 5178 + }, + { + "epoch": 1.232760159457369, + "grad_norm": 0.37715805015205855, + "learning_rate": 3.3897988628759714e-06, + "loss": 0.3076, + "step": 5179 + }, + { + "epoch": 1.2329981555304337, + "grad_norm": 0.36354382791582407, + "learning_rate": 3.387974079277095e-06, + "loss": 0.3843, + "step": 5180 + }, + { + "epoch": 1.2332361516034984, + "grad_norm": 0.3828085727627118, + "learning_rate": 3.3861495352651504e-06, + "loss": 0.3464, + "step": 5181 + }, + { + "epoch": 1.2334741476765634, + "grad_norm": 0.40137098436265456, + "learning_rate": 3.3843252311113095e-06, + "loss": 0.2923, + "step": 5182 + }, + { + "epoch": 1.2337121437496281, + "grad_norm": 0.4180250525495085, + "learning_rate": 3.3825011670867086e-06, + "loss": 0.3647, + "step": 5183 + }, + { + "epoch": 1.2339501398226929, + "grad_norm": 0.3742865593493896, + "learning_rate": 3.3806773434624475e-06, + "loss": 0.3939, + "step": 5184 + }, + { + "epoch": 1.2341881358957578, + "grad_norm": 0.3799380469090617, + "learning_rate": 3.3788537605095957e-06, + "loss": 0.3187, + "step": 5185 + }, + { + "epoch": 1.2344261319688226, + "grad_norm": 0.3845177336003192, + "learning_rate": 3.37703041849918e-06, + "loss": 0.2822, + "step": 5186 + }, + { + "epoch": 1.2346641280418873, + "grad_norm": 0.38597150172129496, + "learning_rate": 3.375207317702194e-06, + "loss": 0.3609, + "step": 5187 + }, + { + "epoch": 1.234902124114952, + "grad_norm": 0.3806826446535519, + "learning_rate": 3.373384458389597e-06, + "loss": 0.3542, + "step": 5188 + }, + { + "epoch": 1.2351401201880168, + "grad_norm": 0.3664699692758197, + "learning_rate": 3.371561840832309e-06, + "loss": 0.2939, + "step": 5189 + }, + { + "epoch": 1.2353781162610817, + "grad_norm": 0.3855337340082918, + "learning_rate": 3.3697394653012185e-06, + "loss": 0.3241, + "step": 5190 + }, + { + "epoch": 1.2356161123341465, + "grad_norm": 0.3949003682130631, + "learning_rate": 3.367917332067172e-06, + "loss": 0.3747, + "step": 5191 + }, + { + "epoch": 1.2358541084072112, + "grad_norm": 0.3916469796647736, + "learning_rate": 3.3660954414009872e-06, + "loss": 0.2907, + "step": 5192 + }, + { + "epoch": 1.2360921044802762, + "grad_norm": 0.41103529668836525, + "learning_rate": 3.3642737935734403e-06, + "loss": 0.266, + "step": 5193 + }, + { + "epoch": 1.236330100553341, + "grad_norm": 0.4135491847771145, + "learning_rate": 3.3624523888552734e-06, + "loss": 0.3251, + "step": 5194 + }, + { + "epoch": 1.2365680966264057, + "grad_norm": 0.41366473528894604, + "learning_rate": 3.3606312275171928e-06, + "loss": 0.3663, + "step": 5195 + }, + { + "epoch": 1.2368060926994704, + "grad_norm": 0.3584651473821333, + "learning_rate": 3.358810309829868e-06, + "loss": 0.2973, + "step": 5196 + }, + { + "epoch": 1.2370440887725351, + "grad_norm": 0.44390275619305986, + "learning_rate": 3.356989636063932e-06, + "loss": 0.3176, + "step": 5197 + }, + { + "epoch": 1.2372820848456, + "grad_norm": 0.408274658661175, + "learning_rate": 3.3551692064899806e-06, + "loss": 0.388, + "step": 5198 + }, + { + "epoch": 1.2375200809186648, + "grad_norm": 0.3676982361906048, + "learning_rate": 3.353349021378578e-06, + "loss": 0.3239, + "step": 5199 + }, + { + "epoch": 1.2377580769917296, + "grad_norm": 0.38263348645456147, + "learning_rate": 3.3515290810002464e-06, + "loss": 0.3018, + "step": 5200 + }, + { + "epoch": 1.2379960730647945, + "grad_norm": 0.37234902877710807, + "learning_rate": 3.3497093856254757e-06, + "loss": 0.3607, + "step": 5201 + }, + { + "epoch": 1.2382340691378593, + "grad_norm": 0.396040248592026, + "learning_rate": 3.347889935524716e-06, + "loss": 0.4182, + "step": 5202 + }, + { + "epoch": 1.238472065210924, + "grad_norm": 0.3999889104147471, + "learning_rate": 3.3460707309683826e-06, + "loss": 0.304, + "step": 5203 + }, + { + "epoch": 1.2387100612839888, + "grad_norm": 0.40354368811724745, + "learning_rate": 3.3442517722268543e-06, + "loss": 0.2784, + "step": 5204 + }, + { + "epoch": 1.2389480573570535, + "grad_norm": 0.4088049571035965, + "learning_rate": 3.342433059570475e-06, + "loss": 0.3927, + "step": 5205 + }, + { + "epoch": 1.2391860534301185, + "grad_norm": 0.3784733136722843, + "learning_rate": 3.340614593269549e-06, + "loss": 0.3523, + "step": 5206 + }, + { + "epoch": 1.2394240495031832, + "grad_norm": 0.6754063547956777, + "learning_rate": 3.338796373594346e-06, + "loss": 0.2846, + "step": 5207 + }, + { + "epoch": 1.239662045576248, + "grad_norm": 0.37479104141833847, + "learning_rate": 3.336978400815098e-06, + "loss": 0.3278, + "step": 5208 + }, + { + "epoch": 1.239900041649313, + "grad_norm": 0.40369056554447125, + "learning_rate": 3.3351606752020004e-06, + "loss": 0.4141, + "step": 5209 + }, + { + "epoch": 1.2401380377223776, + "grad_norm": 0.3592088638519854, + "learning_rate": 3.333343197025213e-06, + "loss": 0.3133, + "step": 5210 + }, + { + "epoch": 1.2403760337954424, + "grad_norm": 0.4150892541339133, + "learning_rate": 3.3315259665548554e-06, + "loss": 0.2712, + "step": 5211 + }, + { + "epoch": 1.240614029868507, + "grad_norm": 0.3717429857265201, + "learning_rate": 3.3297089840610173e-06, + "loss": 0.3615, + "step": 5212 + }, + { + "epoch": 1.2408520259415718, + "grad_norm": 0.37643019244681736, + "learning_rate": 3.3278922498137455e-06, + "loss": 0.376, + "step": 5213 + }, + { + "epoch": 1.2410900220146368, + "grad_norm": 0.3735812988579749, + "learning_rate": 3.326075764083051e-06, + "loss": 0.2832, + "step": 5214 + }, + { + "epoch": 1.2413280180877015, + "grad_norm": 0.4247489715555179, + "learning_rate": 3.3242595271389087e-06, + "loss": 0.3459, + "step": 5215 + }, + { + "epoch": 1.2415660141607663, + "grad_norm": 0.3782694036825223, + "learning_rate": 3.3224435392512565e-06, + "loss": 0.3574, + "step": 5216 + }, + { + "epoch": 1.2418040102338312, + "grad_norm": 0.3593379490467785, + "learning_rate": 3.320627800689996e-06, + "loss": 0.3358, + "step": 5217 + }, + { + "epoch": 1.242042006306896, + "grad_norm": 0.38439219808924074, + "learning_rate": 3.3188123117249884e-06, + "loss": 0.2689, + "step": 5218 + }, + { + "epoch": 1.2422800023799607, + "grad_norm": 0.40058679929549906, + "learning_rate": 3.3169970726260625e-06, + "loss": 0.3514, + "step": 5219 + }, + { + "epoch": 1.2425179984530255, + "grad_norm": 0.3765460507646511, + "learning_rate": 3.3151820836630074e-06, + "loss": 0.3873, + "step": 5220 + }, + { + "epoch": 1.2427559945260902, + "grad_norm": 0.4315150126863899, + "learning_rate": 3.313367345105575e-06, + "loss": 0.274, + "step": 5221 + }, + { + "epoch": 1.2429939905991552, + "grad_norm": 0.3785008936826988, + "learning_rate": 3.31155285722348e-06, + "loss": 0.3163, + "step": 5222 + }, + { + "epoch": 1.24323198667222, + "grad_norm": 0.3967888293663607, + "learning_rate": 3.309738620286401e-06, + "loss": 0.3529, + "step": 5223 + }, + { + "epoch": 1.2434699827452846, + "grad_norm": 0.3861441173261868, + "learning_rate": 3.307924634563978e-06, + "loss": 0.3399, + "step": 5224 + }, + { + "epoch": 1.2437079788183496, + "grad_norm": 0.3750655532874109, + "learning_rate": 3.306110900325813e-06, + "loss": 0.3173, + "step": 5225 + }, + { + "epoch": 1.2439459748914143, + "grad_norm": 0.3925378776235632, + "learning_rate": 3.304297417841474e-06, + "loss": 0.3065, + "step": 5226 + }, + { + "epoch": 1.244183970964479, + "grad_norm": 0.39518178594914705, + "learning_rate": 3.3024841873804885e-06, + "loss": 0.3659, + "step": 5227 + }, + { + "epoch": 1.2444219670375438, + "grad_norm": 0.37902705321028035, + "learning_rate": 3.300671209212347e-06, + "loss": 0.3182, + "step": 5228 + }, + { + "epoch": 1.2446599631106086, + "grad_norm": 0.3720334846385885, + "learning_rate": 3.298858483606504e-06, + "loss": 0.2705, + "step": 5229 + }, + { + "epoch": 1.2448979591836735, + "grad_norm": 0.3733999529266795, + "learning_rate": 3.2970460108323744e-06, + "loss": 0.3387, + "step": 5230 + }, + { + "epoch": 1.2451359552567383, + "grad_norm": 0.3772749257271583, + "learning_rate": 3.295233791159336e-06, + "loss": 0.3739, + "step": 5231 + }, + { + "epoch": 1.245373951329803, + "grad_norm": 0.3695783892022936, + "learning_rate": 3.2934218248567294e-06, + "loss": 0.283, + "step": 5232 + }, + { + "epoch": 1.245611947402868, + "grad_norm": 0.4014905086469289, + "learning_rate": 3.29161011219386e-06, + "loss": 0.3269, + "step": 5233 + }, + { + "epoch": 1.2458499434759327, + "grad_norm": 0.4628150231356265, + "learning_rate": 3.2897986534399908e-06, + "loss": 0.39, + "step": 5234 + }, + { + "epoch": 1.2460879395489974, + "grad_norm": 0.35110622672123937, + "learning_rate": 3.2879874488643504e-06, + "loss": 0.3049, + "step": 5235 + }, + { + "epoch": 1.2463259356220622, + "grad_norm": 0.39624400624138423, + "learning_rate": 3.2861764987361276e-06, + "loss": 0.2845, + "step": 5236 + }, + { + "epoch": 1.246563931695127, + "grad_norm": 0.3548580217470856, + "learning_rate": 3.284365803324476e-06, + "loss": 0.3565, + "step": 5237 + }, + { + "epoch": 1.2468019277681919, + "grad_norm": 0.3795495986882351, + "learning_rate": 3.2825553628985078e-06, + "loss": 0.3632, + "step": 5238 + }, + { + "epoch": 1.2470399238412566, + "grad_norm": 0.4043537341500232, + "learning_rate": 3.280745177727299e-06, + "loss": 0.3075, + "step": 5239 + }, + { + "epoch": 1.2472779199143214, + "grad_norm": 0.40712561384193874, + "learning_rate": 3.278935248079891e-06, + "loss": 0.2944, + "step": 5240 + }, + { + "epoch": 1.2475159159873863, + "grad_norm": 0.3557393968961097, + "learning_rate": 3.2771255742252817e-06, + "loss": 0.3703, + "step": 5241 + }, + { + "epoch": 1.247753912060451, + "grad_norm": 0.360568352135061, + "learning_rate": 3.2753161564324344e-06, + "loss": 0.3443, + "step": 5242 + }, + { + "epoch": 1.2479919081335158, + "grad_norm": 0.3773441969979045, + "learning_rate": 3.2735069949702723e-06, + "loss": 0.3008, + "step": 5243 + }, + { + "epoch": 1.2482299042065805, + "grad_norm": 0.35789698002368536, + "learning_rate": 3.271698090107682e-06, + "loss": 0.3428, + "step": 5244 + }, + { + "epoch": 1.2484679002796453, + "grad_norm": 0.3799994603077194, + "learning_rate": 3.269889442113512e-06, + "loss": 0.3857, + "step": 5245 + }, + { + "epoch": 1.2487058963527102, + "grad_norm": 0.3732392024366841, + "learning_rate": 3.2680810512565714e-06, + "loss": 0.2917, + "step": 5246 + }, + { + "epoch": 1.248943892425775, + "grad_norm": 0.40137116851946253, + "learning_rate": 3.266272917805633e-06, + "loss": 0.273, + "step": 5247 + }, + { + "epoch": 1.2491818884988397, + "grad_norm": 0.36249398408452443, + "learning_rate": 3.2644650420294288e-06, + "loss": 0.3821, + "step": 5248 + }, + { + "epoch": 1.2494198845719047, + "grad_norm": 0.38050458261938763, + "learning_rate": 3.2626574241966546e-06, + "loss": 0.3256, + "step": 5249 + }, + { + "epoch": 1.2496578806449694, + "grad_norm": 0.3726179289456404, + "learning_rate": 3.2608500645759673e-06, + "loss": 0.2807, + "step": 5250 + }, + { + "epoch": 1.2498958767180341, + "grad_norm": 0.35938712373080584, + "learning_rate": 3.2590429634359845e-06, + "loss": 0.3395, + "step": 5251 + }, + { + "epoch": 1.2501338727910989, + "grad_norm": 0.4040569210815416, + "learning_rate": 3.257236121045286e-06, + "loss": 0.386, + "step": 5252 + }, + { + "epoch": 1.2503718688641636, + "grad_norm": 0.3844777853293955, + "learning_rate": 3.2554295376724134e-06, + "loss": 0.2991, + "step": 5253 + }, + { + "epoch": 1.2506098649372286, + "grad_norm": 0.4040805562877999, + "learning_rate": 3.253623213585871e-06, + "loss": 0.2617, + "step": 5254 + }, + { + "epoch": 1.2508478610102933, + "grad_norm": 0.3728383275268288, + "learning_rate": 3.2518171490541222e-06, + "loss": 0.3587, + "step": 5255 + }, + { + "epoch": 1.251085857083358, + "grad_norm": 0.37161586537379476, + "learning_rate": 3.250011344345594e-06, + "loss": 0.351, + "step": 5256 + }, + { + "epoch": 1.251323853156423, + "grad_norm": 0.3923833529850557, + "learning_rate": 3.2482057997286716e-06, + "loss": 0.2907, + "step": 5257 + }, + { + "epoch": 1.2515618492294878, + "grad_norm": 0.46418423651277113, + "learning_rate": 3.2464005154717036e-06, + "loss": 0.3159, + "step": 5258 + }, + { + "epoch": 1.2517998453025525, + "grad_norm": 0.391141738921714, + "learning_rate": 3.244595491843003e-06, + "loss": 0.3681, + "step": 5259 + }, + { + "epoch": 1.2520378413756172, + "grad_norm": 0.3998036064404465, + "learning_rate": 3.2427907291108383e-06, + "loss": 0.3235, + "step": 5260 + }, + { + "epoch": 1.252275837448682, + "grad_norm": 0.3895934764387763, + "learning_rate": 3.2409862275434425e-06, + "loss": 0.2746, + "step": 5261 + }, + { + "epoch": 1.252513833521747, + "grad_norm": 0.3704415894740105, + "learning_rate": 3.239181987409009e-06, + "loss": 0.3457, + "step": 5262 + }, + { + "epoch": 1.2527518295948117, + "grad_norm": 0.37132868730200563, + "learning_rate": 3.2373780089756934e-06, + "loss": 0.3927, + "step": 5263 + }, + { + "epoch": 1.2529898256678764, + "grad_norm": 0.3434397334645843, + "learning_rate": 3.2355742925116103e-06, + "loss": 0.2817, + "step": 5264 + }, + { + "epoch": 1.2532278217409414, + "grad_norm": 0.4033979941592838, + "learning_rate": 3.233770838284837e-06, + "loss": 0.3391, + "step": 5265 + }, + { + "epoch": 1.2534658178140061, + "grad_norm": 0.4863421240520152, + "learning_rate": 3.231967646563412e-06, + "loss": 0.3836, + "step": 5266 + }, + { + "epoch": 1.2537038138870709, + "grad_norm": 0.3863926541257452, + "learning_rate": 3.230164717615331e-06, + "loss": 0.3292, + "step": 5267 + }, + { + "epoch": 1.2539418099601356, + "grad_norm": 0.35788876853240303, + "learning_rate": 3.228362051708559e-06, + "loss": 0.2755, + "step": 5268 + }, + { + "epoch": 1.2541798060332003, + "grad_norm": 0.3946622156577876, + "learning_rate": 3.2265596491110145e-06, + "loss": 0.3349, + "step": 5269 + }, + { + "epoch": 1.2544178021062653, + "grad_norm": 0.3862161899983044, + "learning_rate": 3.2247575100905794e-06, + "loss": 0.4002, + "step": 5270 + }, + { + "epoch": 1.25465579817933, + "grad_norm": 0.3970524252747173, + "learning_rate": 3.2229556349150947e-06, + "loss": 0.2879, + "step": 5271 + }, + { + "epoch": 1.2548937942523948, + "grad_norm": 0.38366178715626265, + "learning_rate": 3.221154023852364e-06, + "loss": 0.3337, + "step": 5272 + }, + { + "epoch": 1.2551317903254597, + "grad_norm": 0.37924724516016045, + "learning_rate": 3.2193526771701526e-06, + "loss": 0.3732, + "step": 5273 + }, + { + "epoch": 1.2553697863985245, + "grad_norm": 0.3720944559532506, + "learning_rate": 3.2175515951361844e-06, + "loss": 0.3384, + "step": 5274 + }, + { + "epoch": 1.2556077824715892, + "grad_norm": 0.37916136226673736, + "learning_rate": 3.2157507780181457e-06, + "loss": 0.2804, + "step": 5275 + }, + { + "epoch": 1.255845778544654, + "grad_norm": 0.37290254799955885, + "learning_rate": 3.2139502260836806e-06, + "loss": 0.3394, + "step": 5276 + }, + { + "epoch": 1.2560837746177187, + "grad_norm": 0.4120937160799173, + "learning_rate": 3.2121499396003974e-06, + "loss": 0.3896, + "step": 5277 + }, + { + "epoch": 1.2563217706907837, + "grad_norm": 0.3631101334404687, + "learning_rate": 3.2103499188358623e-06, + "loss": 0.3017, + "step": 5278 + }, + { + "epoch": 1.2565597667638484, + "grad_norm": 0.38219111111630644, + "learning_rate": 3.2085501640576035e-06, + "loss": 0.2705, + "step": 5279 + }, + { + "epoch": 1.2567977628369131, + "grad_norm": 0.4099100476999923, + "learning_rate": 3.206750675533106e-06, + "loss": 0.368, + "step": 5280 + }, + { + "epoch": 1.257035758909978, + "grad_norm": 0.37534175826439325, + "learning_rate": 3.2049514535298227e-06, + "loss": 0.333, + "step": 5281 + }, + { + "epoch": 1.2572737549830428, + "grad_norm": 0.3958938531308809, + "learning_rate": 3.2031524983151606e-06, + "loss": 0.3224, + "step": 5282 + }, + { + "epoch": 1.2575117510561076, + "grad_norm": 0.3755193173874838, + "learning_rate": 3.20135381015649e-06, + "loss": 0.3333, + "step": 5283 + }, + { + "epoch": 1.2577497471291723, + "grad_norm": 0.405190126827234, + "learning_rate": 3.1995553893211395e-06, + "loss": 0.3538, + "step": 5284 + }, + { + "epoch": 1.257987743202237, + "grad_norm": 0.3554947984040851, + "learning_rate": 3.1977572360763977e-06, + "loss": 0.3086, + "step": 5285 + }, + { + "epoch": 1.258225739275302, + "grad_norm": 0.36944294131448824, + "learning_rate": 3.195959350689517e-06, + "loss": 0.3169, + "step": 5286 + }, + { + "epoch": 1.2584637353483668, + "grad_norm": 0.38302961275351355, + "learning_rate": 3.1941617334277066e-06, + "loss": 0.3534, + "step": 5287 + }, + { + "epoch": 1.2587017314214315, + "grad_norm": 0.34830251003999074, + "learning_rate": 3.1923643845581364e-06, + "loss": 0.3458, + "step": 5288 + }, + { + "epoch": 1.2589397274944965, + "grad_norm": 0.4007008524746251, + "learning_rate": 3.190567304347938e-06, + "loss": 0.2737, + "step": 5289 + }, + { + "epoch": 1.2591777235675612, + "grad_norm": 0.3743627078891674, + "learning_rate": 3.1887704930642016e-06, + "loss": 0.3119, + "step": 5290 + }, + { + "epoch": 1.259415719640626, + "grad_norm": 0.39475666902200535, + "learning_rate": 3.1869739509739772e-06, + "loss": 0.3684, + "step": 5291 + }, + { + "epoch": 1.2596537157136907, + "grad_norm": 0.3507214620255833, + "learning_rate": 3.1851776783442754e-06, + "loss": 0.3523, + "step": 5292 + }, + { + "epoch": 1.2598917117867554, + "grad_norm": 0.4174143666004117, + "learning_rate": 3.1833816754420678e-06, + "loss": 0.2806, + "step": 5293 + }, + { + "epoch": 1.2601297078598204, + "grad_norm": 0.3626773824316747, + "learning_rate": 3.181585942534282e-06, + "loss": 0.3449, + "step": 5294 + }, + { + "epoch": 1.260367703932885, + "grad_norm": 0.3746333463473584, + "learning_rate": 3.179790479887812e-06, + "loss": 0.3698, + "step": 5295 + }, + { + "epoch": 1.2606057000059498, + "grad_norm": 0.39419946032729236, + "learning_rate": 3.1779952877695077e-06, + "loss": 0.2622, + "step": 5296 + }, + { + "epoch": 1.2608436960790148, + "grad_norm": 0.3665818295311167, + "learning_rate": 3.176200366446177e-06, + "loss": 0.2874, + "step": 5297 + }, + { + "epoch": 1.2610816921520795, + "grad_norm": 0.41342588700331717, + "learning_rate": 3.174405716184591e-06, + "loss": 0.3516, + "step": 5298 + }, + { + "epoch": 1.2613196882251443, + "grad_norm": 0.35359012506797194, + "learning_rate": 3.1726113372514767e-06, + "loss": 0.3293, + "step": 5299 + }, + { + "epoch": 1.261557684298209, + "grad_norm": 0.40608712887436516, + "learning_rate": 3.1708172299135266e-06, + "loss": 0.2651, + "step": 5300 + }, + { + "epoch": 1.2617956803712738, + "grad_norm": 0.39078363662670146, + "learning_rate": 3.1690233944373883e-06, + "loss": 0.3475, + "step": 5301 + }, + { + "epoch": 1.2620336764443387, + "grad_norm": 0.39442057764959976, + "learning_rate": 3.1672298310896693e-06, + "loss": 0.448, + "step": 5302 + }, + { + "epoch": 1.2622716725174035, + "grad_norm": 0.3870252703040663, + "learning_rate": 3.165436540136938e-06, + "loss": 0.3393, + "step": 5303 + }, + { + "epoch": 1.2625096685904682, + "grad_norm": 0.35385684836054004, + "learning_rate": 3.1636435218457216e-06, + "loss": 0.2895, + "step": 5304 + }, + { + "epoch": 1.2627476646635332, + "grad_norm": 0.38830448667646283, + "learning_rate": 3.161850776482508e-06, + "loss": 0.3622, + "step": 5305 + }, + { + "epoch": 1.262985660736598, + "grad_norm": 0.3724314662342735, + "learning_rate": 3.160058304313742e-06, + "loss": 0.3318, + "step": 5306 + }, + { + "epoch": 1.2632236568096626, + "grad_norm": 0.4014989473946267, + "learning_rate": 3.1582661056058294e-06, + "loss": 0.2739, + "step": 5307 + }, + { + "epoch": 1.2634616528827274, + "grad_norm": 0.37792045811506125, + "learning_rate": 3.156474180625134e-06, + "loss": 0.3098, + "step": 5308 + }, + { + "epoch": 1.2636996489557921, + "grad_norm": 0.3923195999280906, + "learning_rate": 3.1546825296379845e-06, + "loss": 0.3802, + "step": 5309 + }, + { + "epoch": 1.263937645028857, + "grad_norm": 0.36757724848018525, + "learning_rate": 3.152891152910662e-06, + "loss": 0.3026, + "step": 5310 + }, + { + "epoch": 1.2641756411019218, + "grad_norm": 0.3689543106637064, + "learning_rate": 3.1511000507094097e-06, + "loss": 0.3073, + "step": 5311 + }, + { + "epoch": 1.2644136371749866, + "grad_norm": 0.36026728365035765, + "learning_rate": 3.149309223300428e-06, + "loss": 0.3464, + "step": 5312 + }, + { + "epoch": 1.2646516332480515, + "grad_norm": 0.3666885097703029, + "learning_rate": 3.1475186709498806e-06, + "loss": 0.397, + "step": 5313 + }, + { + "epoch": 1.2648896293211163, + "grad_norm": 0.35535524090766163, + "learning_rate": 3.145728393923887e-06, + "loss": 0.2911, + "step": 5314 + }, + { + "epoch": 1.265127625394181, + "grad_norm": 0.3699552572065724, + "learning_rate": 3.143938392488527e-06, + "loss": 0.3085, + "step": 5315 + }, + { + "epoch": 1.2653656214672457, + "grad_norm": 0.371881166524415, + "learning_rate": 3.1421486669098376e-06, + "loss": 0.3925, + "step": 5316 + }, + { + "epoch": 1.2656036175403105, + "grad_norm": 0.3979148251780867, + "learning_rate": 3.1403592174538176e-06, + "loss": 0.3259, + "step": 5317 + }, + { + "epoch": 1.2658416136133754, + "grad_norm": 0.35728138026407263, + "learning_rate": 3.138570044386424e-06, + "loss": 0.2795, + "step": 5318 + }, + { + "epoch": 1.2660796096864402, + "grad_norm": 0.40000777660528697, + "learning_rate": 3.136781147973571e-06, + "loss": 0.318, + "step": 5319 + }, + { + "epoch": 1.266317605759505, + "grad_norm": 0.37847837862392103, + "learning_rate": 3.134992528481133e-06, + "loss": 0.3983, + "step": 5320 + }, + { + "epoch": 1.2665556018325699, + "grad_norm": 0.36014200615492253, + "learning_rate": 3.133204186174942e-06, + "loss": 0.3218, + "step": 5321 + }, + { + "epoch": 1.2667935979056346, + "grad_norm": 0.3823696624234763, + "learning_rate": 3.131416121320792e-06, + "loss": 0.2918, + "step": 5322 + }, + { + "epoch": 1.2670315939786994, + "grad_norm": 0.3722726581374576, + "learning_rate": 3.129628334184433e-06, + "loss": 0.3727, + "step": 5323 + }, + { + "epoch": 1.267269590051764, + "grad_norm": 0.3659346216629232, + "learning_rate": 3.127840825031575e-06, + "loss": 0.3274, + "step": 5324 + }, + { + "epoch": 1.2675075861248288, + "grad_norm": 0.38942693904105447, + "learning_rate": 3.126053594127885e-06, + "loss": 0.3065, + "step": 5325 + }, + { + "epoch": 1.2677455821978938, + "grad_norm": 0.37772681354730653, + "learning_rate": 3.1242666417389896e-06, + "loss": 0.3058, + "step": 5326 + }, + { + "epoch": 1.2679835782709585, + "grad_norm": 0.38030394224311925, + "learning_rate": 3.122479968130475e-06, + "loss": 0.3953, + "step": 5327 + }, + { + "epoch": 1.2682215743440233, + "grad_norm": 0.38391257852782784, + "learning_rate": 3.1206935735678855e-06, + "loss": 0.2879, + "step": 5328 + }, + { + "epoch": 1.2684595704170882, + "grad_norm": 0.3731989200180512, + "learning_rate": 3.118907458316722e-06, + "loss": 0.2993, + "step": 5329 + }, + { + "epoch": 1.268697566490153, + "grad_norm": 0.6492017938323883, + "learning_rate": 3.1171216226424466e-06, + "loss": 0.359, + "step": 5330 + }, + { + "epoch": 1.2689355625632177, + "grad_norm": 0.389677366563528, + "learning_rate": 3.115336066810478e-06, + "loss": 0.3899, + "step": 5331 + }, + { + "epoch": 1.2691735586362825, + "grad_norm": 0.38739411432567555, + "learning_rate": 3.113550791086195e-06, + "loss": 0.2738, + "step": 5332 + }, + { + "epoch": 1.2694115547093472, + "grad_norm": 0.3697993858168483, + "learning_rate": 3.1117657957349325e-06, + "loss": 0.3336, + "step": 5333 + }, + { + "epoch": 1.2696495507824122, + "grad_norm": 0.3851139381884033, + "learning_rate": 3.109981081021985e-06, + "loss": 0.4039, + "step": 5334 + }, + { + "epoch": 1.269887546855477, + "grad_norm": 0.3663302911358832, + "learning_rate": 3.108196647212605e-06, + "loss": 0.3209, + "step": 5335 + }, + { + "epoch": 1.2701255429285416, + "grad_norm": 0.41294325410097843, + "learning_rate": 3.106412494572004e-06, + "loss": 0.2828, + "step": 5336 + }, + { + "epoch": 1.2703635390016066, + "grad_norm": 0.35880515050491946, + "learning_rate": 3.104628623365352e-06, + "loss": 0.3626, + "step": 5337 + }, + { + "epoch": 1.2706015350746713, + "grad_norm": 0.4004043215019132, + "learning_rate": 3.1028450338577767e-06, + "loss": 0.353, + "step": 5338 + }, + { + "epoch": 1.270839531147736, + "grad_norm": 0.3743970698217683, + "learning_rate": 3.1010617263143616e-06, + "loss": 0.2735, + "step": 5339 + }, + { + "epoch": 1.2710775272208008, + "grad_norm": 0.4001724562613586, + "learning_rate": 3.099278701000152e-06, + "loss": 0.3294, + "step": 5340 + }, + { + "epoch": 1.2713155232938655, + "grad_norm": 0.3606181579759712, + "learning_rate": 3.097495958180149e-06, + "loss": 0.3885, + "step": 5341 + }, + { + "epoch": 1.2715535193669305, + "grad_norm": 0.3757180444725599, + "learning_rate": 3.0957134981193125e-06, + "loss": 0.2869, + "step": 5342 + }, + { + "epoch": 1.2717915154399952, + "grad_norm": 0.37476881387920113, + "learning_rate": 3.09393132108256e-06, + "loss": 0.2862, + "step": 5343 + }, + { + "epoch": 1.27202951151306, + "grad_norm": 0.36713464862325923, + "learning_rate": 3.0921494273347676e-06, + "loss": 0.3074, + "step": 5344 + }, + { + "epoch": 1.272267507586125, + "grad_norm": 0.3747964323798518, + "learning_rate": 3.090367817140767e-06, + "loss": 0.4127, + "step": 5345 + }, + { + "epoch": 1.2725055036591897, + "grad_norm": 0.39120167273162276, + "learning_rate": 3.0885864907653517e-06, + "loss": 0.34, + "step": 5346 + }, + { + "epoch": 1.2727434997322544, + "grad_norm": 0.3851975033032519, + "learning_rate": 3.0868054484732696e-06, + "loss": 0.3095, + "step": 5347 + }, + { + "epoch": 1.2729814958053192, + "grad_norm": 0.4511302802067385, + "learning_rate": 3.085024690529227e-06, + "loss": 0.3931, + "step": 5348 + }, + { + "epoch": 1.273219491878384, + "grad_norm": 0.41851016514587525, + "learning_rate": 3.0832442171978895e-06, + "loss": 0.3374, + "step": 5349 + }, + { + "epoch": 1.2734574879514489, + "grad_norm": 0.3763563477399065, + "learning_rate": 3.0814640287438813e-06, + "loss": 0.2577, + "step": 5350 + }, + { + "epoch": 1.2736954840245136, + "grad_norm": 0.39491317152340816, + "learning_rate": 3.07968412543178e-06, + "loss": 0.3377, + "step": 5351 + }, + { + "epoch": 1.2739334800975783, + "grad_norm": 0.42386611214893466, + "learning_rate": 3.0779045075261237e-06, + "loss": 0.3715, + "step": 5352 + }, + { + "epoch": 1.2741714761706433, + "grad_norm": 0.35087910158459273, + "learning_rate": 3.0761251752914077e-06, + "loss": 0.3094, + "step": 5353 + }, + { + "epoch": 1.274409472243708, + "grad_norm": 0.347152988488092, + "learning_rate": 3.074346128992086e-06, + "loss": 0.2898, + "step": 5354 + }, + { + "epoch": 1.2746474683167728, + "grad_norm": 0.37229430513448064, + "learning_rate": 3.072567368892567e-06, + "loss": 0.3509, + "step": 5355 + }, + { + "epoch": 1.2748854643898375, + "grad_norm": 0.38027963988847857, + "learning_rate": 3.0707888952572206e-06, + "loss": 0.3587, + "step": 5356 + }, + { + "epoch": 1.2751234604629023, + "grad_norm": 0.3518662269310278, + "learning_rate": 3.06901070835037e-06, + "loss": 0.2739, + "step": 5357 + }, + { + "epoch": 1.2753614565359672, + "grad_norm": 0.371700159550814, + "learning_rate": 3.067232808436299e-06, + "loss": 0.3261, + "step": 5358 + }, + { + "epoch": 1.275599452609032, + "grad_norm": 0.36480653010967556, + "learning_rate": 3.0654551957792465e-06, + "loss": 0.3602, + "step": 5359 + }, + { + "epoch": 1.2758374486820967, + "grad_norm": 0.3755693780387078, + "learning_rate": 3.0636778706434104e-06, + "loss": 0.3067, + "step": 5360 + }, + { + "epoch": 1.2760754447551617, + "grad_norm": 0.4104166990367289, + "learning_rate": 3.061900833292945e-06, + "loss": 0.288, + "step": 5361 + }, + { + "epoch": 1.2763134408282264, + "grad_norm": 0.3860546262996206, + "learning_rate": 3.060124083991961e-06, + "loss": 0.3287, + "step": 5362 + }, + { + "epoch": 1.2765514369012911, + "grad_norm": 0.41369962174697755, + "learning_rate": 3.0583476230045283e-06, + "loss": 0.3812, + "step": 5363 + }, + { + "epoch": 1.2767894329743559, + "grad_norm": 0.36361020540067207, + "learning_rate": 3.0565714505946744e-06, + "loss": 0.2886, + "step": 5364 + }, + { + "epoch": 1.2770274290474206, + "grad_norm": 0.39741880740743823, + "learning_rate": 3.0547955670263805e-06, + "loss": 0.301, + "step": 5365 + }, + { + "epoch": 1.2772654251204856, + "grad_norm": 0.450792782481189, + "learning_rate": 3.0530199725635868e-06, + "loss": 0.3703, + "step": 5366 + }, + { + "epoch": 1.2775034211935503, + "grad_norm": 0.3897194119813881, + "learning_rate": 3.0512446674701923e-06, + "loss": 0.3263, + "step": 5367 + }, + { + "epoch": 1.277741417266615, + "grad_norm": 0.38367874389511963, + "learning_rate": 3.04946965201005e-06, + "loss": 0.2804, + "step": 5368 + }, + { + "epoch": 1.27797941333968, + "grad_norm": 0.39417790396645735, + "learning_rate": 3.047694926446971e-06, + "loss": 0.3121, + "step": 5369 + }, + { + "epoch": 1.2782174094127448, + "grad_norm": 0.3857971707373409, + "learning_rate": 3.0459204910447236e-06, + "loss": 0.39, + "step": 5370 + }, + { + "epoch": 1.2784554054858095, + "grad_norm": 0.3471166269383708, + "learning_rate": 3.044146346067033e-06, + "loss": 0.2831, + "step": 5371 + }, + { + "epoch": 1.2786934015588742, + "grad_norm": 0.42964523510831726, + "learning_rate": 3.0423724917775806e-06, + "loss": 0.3029, + "step": 5372 + }, + { + "epoch": 1.278931397631939, + "grad_norm": 0.39624435945594666, + "learning_rate": 3.040598928440005e-06, + "loss": 0.3685, + "step": 5373 + }, + { + "epoch": 1.279169393705004, + "grad_norm": 0.400417141293227, + "learning_rate": 3.0388256563179024e-06, + "loss": 0.3483, + "step": 5374 + }, + { + "epoch": 1.2794073897780687, + "grad_norm": 0.41684013264207087, + "learning_rate": 3.037052675674823e-06, + "loss": 0.2727, + "step": 5375 + }, + { + "epoch": 1.2796453858511334, + "grad_norm": 0.3675883946338833, + "learning_rate": 3.0352799867742788e-06, + "loss": 0.3194, + "step": 5376 + }, + { + "epoch": 1.2798833819241984, + "grad_norm": 0.35438374829565183, + "learning_rate": 3.0335075898797315e-06, + "loss": 0.3911, + "step": 5377 + }, + { + "epoch": 1.280121377997263, + "grad_norm": 0.40473636844510946, + "learning_rate": 3.0317354852546067e-06, + "loss": 0.3176, + "step": 5378 + }, + { + "epoch": 1.2803593740703278, + "grad_norm": 0.3699615041682992, + "learning_rate": 3.02996367316228e-06, + "loss": 0.2816, + "step": 5379 + }, + { + "epoch": 1.2805973701433926, + "grad_norm": 0.3870198992613291, + "learning_rate": 3.0281921538660885e-06, + "loss": 0.3498, + "step": 5380 + }, + { + "epoch": 1.2808353662164573, + "grad_norm": 0.39893170072412276, + "learning_rate": 3.026420927629323e-06, + "loss": 0.3506, + "step": 5381 + }, + { + "epoch": 1.2810733622895223, + "grad_norm": 0.39285551881065733, + "learning_rate": 3.024649994715233e-06, + "loss": 0.2825, + "step": 5382 + }, + { + "epoch": 1.281311358362587, + "grad_norm": 0.4255415402851682, + "learning_rate": 3.0228793553870204e-06, + "loss": 0.3282, + "step": 5383 + }, + { + "epoch": 1.2815493544356518, + "grad_norm": 0.37137193613780684, + "learning_rate": 3.0211090099078475e-06, + "loss": 0.3586, + "step": 5384 + }, + { + "epoch": 1.2817873505087167, + "grad_norm": 0.3897265976197935, + "learning_rate": 3.019338958540831e-06, + "loss": 0.314, + "step": 5385 + }, + { + "epoch": 1.2820253465817815, + "grad_norm": 0.43026997209773205, + "learning_rate": 3.0175692015490443e-06, + "loss": 0.2804, + "step": 5386 + }, + { + "epoch": 1.2822633426548462, + "grad_norm": 0.39280510289196574, + "learning_rate": 3.0157997391955172e-06, + "loss": 0.3495, + "step": 5387 + }, + { + "epoch": 1.282501338727911, + "grad_norm": 0.3808724957489551, + "learning_rate": 3.014030571743236e-06, + "loss": 0.384, + "step": 5388 + }, + { + "epoch": 1.2827393348009757, + "grad_norm": 0.4445927409783158, + "learning_rate": 3.0122616994551413e-06, + "loss": 0.2838, + "step": 5389 + }, + { + "epoch": 1.2829773308740406, + "grad_norm": 0.3877029737610317, + "learning_rate": 3.0104931225941335e-06, + "loss": 0.3451, + "step": 5390 + }, + { + "epoch": 1.2832153269471054, + "grad_norm": 0.40529519211735904, + "learning_rate": 3.008724841423064e-06, + "loss": 0.3857, + "step": 5391 + }, + { + "epoch": 1.2834533230201701, + "grad_norm": 0.36044362418622544, + "learning_rate": 3.006956856204747e-06, + "loss": 0.2969, + "step": 5392 + }, + { + "epoch": 1.283691319093235, + "grad_norm": 0.4635471433725574, + "learning_rate": 3.005189167201945e-06, + "loss": 0.3053, + "step": 5393 + }, + { + "epoch": 1.2839293151662998, + "grad_norm": 0.3878593945568825, + "learning_rate": 3.003421774677383e-06, + "loss": 0.3243, + "step": 5394 + }, + { + "epoch": 1.2841673112393646, + "grad_norm": 0.4323624105558046, + "learning_rate": 3.0016546788937385e-06, + "loss": 0.3795, + "step": 5395 + }, + { + "epoch": 1.2844053073124293, + "grad_norm": 0.37258405194326444, + "learning_rate": 2.9998878801136442e-06, + "loss": 0.2922, + "step": 5396 + }, + { + "epoch": 1.284643303385494, + "grad_norm": 0.3984352432024578, + "learning_rate": 2.9981213785996925e-06, + "loss": 0.3179, + "step": 5397 + }, + { + "epoch": 1.284881299458559, + "grad_norm": 0.3451845968169443, + "learning_rate": 2.996355174614428e-06, + "loss": 0.3662, + "step": 5398 + }, + { + "epoch": 1.2851192955316237, + "grad_norm": 0.3786688602776222, + "learning_rate": 2.994589268420352e-06, + "loss": 0.3069, + "step": 5399 + }, + { + "epoch": 1.2853572916046885, + "grad_norm": 0.41067641786263726, + "learning_rate": 2.992823660279922e-06, + "loss": 0.2652, + "step": 5400 + }, + { + "epoch": 1.2855952876777534, + "grad_norm": 0.3922515926989098, + "learning_rate": 2.9910583504555516e-06, + "loss": 0.3191, + "step": 5401 + }, + { + "epoch": 1.2858332837508182, + "grad_norm": 0.38153653409480226, + "learning_rate": 2.989293339209608e-06, + "loss": 0.3894, + "step": 5402 + }, + { + "epoch": 1.286071279823883, + "grad_norm": 0.37037061821185474, + "learning_rate": 2.987528626804418e-06, + "loss": 0.2799, + "step": 5403 + }, + { + "epoch": 1.2863092758969477, + "grad_norm": 0.39954728304549386, + "learning_rate": 2.98576421350226e-06, + "loss": 0.2984, + "step": 5404 + }, + { + "epoch": 1.2865472719700124, + "grad_norm": 0.42121203353357217, + "learning_rate": 2.9840000995653684e-06, + "loss": 0.3576, + "step": 5405 + }, + { + "epoch": 1.2867852680430774, + "grad_norm": 0.378600084659916, + "learning_rate": 2.9822362852559363e-06, + "loss": 0.3538, + "step": 5406 + }, + { + "epoch": 1.287023264116142, + "grad_norm": 0.37558983767345905, + "learning_rate": 2.9804727708361094e-06, + "loss": 0.288, + "step": 5407 + }, + { + "epoch": 1.2872612601892068, + "grad_norm": 0.4097981060709528, + "learning_rate": 2.9787095565679893e-06, + "loss": 0.3418, + "step": 5408 + }, + { + "epoch": 1.2874992562622718, + "grad_norm": 0.3880613286304227, + "learning_rate": 2.976946642713634e-06, + "loss": 0.3919, + "step": 5409 + }, + { + "epoch": 1.2877372523353365, + "grad_norm": 0.3576965119071218, + "learning_rate": 2.9751840295350554e-06, + "loss": 0.3108, + "step": 5410 + }, + { + "epoch": 1.2879752484084013, + "grad_norm": 0.386930113400703, + "learning_rate": 2.9734217172942216e-06, + "loss": 0.2923, + "step": 5411 + }, + { + "epoch": 1.288213244481466, + "grad_norm": 0.37377948379351883, + "learning_rate": 2.971659706253055e-06, + "loss": 0.3518, + "step": 5412 + }, + { + "epoch": 1.2884512405545308, + "grad_norm": 0.3907730616845053, + "learning_rate": 2.9698979966734353e-06, + "loss": 0.3765, + "step": 5413 + }, + { + "epoch": 1.2886892366275957, + "grad_norm": 0.35233568134416837, + "learning_rate": 2.968136588817196e-06, + "loss": 0.2804, + "step": 5414 + }, + { + "epoch": 1.2889272327006605, + "grad_norm": 0.3695879561285849, + "learning_rate": 2.966375482946125e-06, + "loss": 0.3208, + "step": 5415 + }, + { + "epoch": 1.2891652287737252, + "grad_norm": 0.383905634810903, + "learning_rate": 2.964614679321966e-06, + "loss": 0.3657, + "step": 5416 + }, + { + "epoch": 1.2894032248467902, + "grad_norm": 0.36362432939576694, + "learning_rate": 2.962854178206419e-06, + "loss": 0.3283, + "step": 5417 + }, + { + "epoch": 1.289641220919855, + "grad_norm": 0.37983820596988416, + "learning_rate": 2.961093979861137e-06, + "loss": 0.2758, + "step": 5418 + }, + { + "epoch": 1.2898792169929196, + "grad_norm": 0.3719877524530933, + "learning_rate": 2.9593340845477315e-06, + "loss": 0.3312, + "step": 5419 + }, + { + "epoch": 1.2901172130659844, + "grad_norm": 0.39592619284538855, + "learning_rate": 2.9575744925277626e-06, + "loss": 0.3998, + "step": 5420 + }, + { + "epoch": 1.290355209139049, + "grad_norm": 0.36564015551941653, + "learning_rate": 2.955815204062753e-06, + "loss": 0.2775, + "step": 5421 + }, + { + "epoch": 1.290593205212114, + "grad_norm": 0.3883816093497894, + "learning_rate": 2.954056219414174e-06, + "loss": 0.3132, + "step": 5422 + }, + { + "epoch": 1.2908312012851788, + "grad_norm": 0.3830741599542019, + "learning_rate": 2.952297538843456e-06, + "loss": 0.3843, + "step": 5423 + }, + { + "epoch": 1.2910691973582435, + "grad_norm": 0.3735082644605664, + "learning_rate": 2.9505391626119804e-06, + "loss": 0.3218, + "step": 5424 + }, + { + "epoch": 1.2913071934313085, + "grad_norm": 0.39360349279204443, + "learning_rate": 2.9487810909810876e-06, + "loss": 0.2838, + "step": 5425 + }, + { + "epoch": 1.2915451895043732, + "grad_norm": 0.39199734550206006, + "learning_rate": 2.947023324212069e-06, + "loss": 0.3363, + "step": 5426 + }, + { + "epoch": 1.291783185577438, + "grad_norm": 0.3946282023306039, + "learning_rate": 2.945265862566172e-06, + "loss": 0.3958, + "step": 5427 + }, + { + "epoch": 1.2920211816505027, + "grad_norm": 0.3833972335654247, + "learning_rate": 2.9435087063045997e-06, + "loss": 0.3168, + "step": 5428 + }, + { + "epoch": 1.2922591777235675, + "grad_norm": 0.36324852808260755, + "learning_rate": 2.9417518556885085e-06, + "loss": 0.3138, + "step": 5429 + }, + { + "epoch": 1.2924971737966324, + "grad_norm": 0.4118882053851258, + "learning_rate": 2.9399953109790104e-06, + "loss": 0.364, + "step": 5430 + }, + { + "epoch": 1.2927351698696972, + "grad_norm": 0.4094387139947238, + "learning_rate": 2.938239072437171e-06, + "loss": 0.358, + "step": 5431 + }, + { + "epoch": 1.292973165942762, + "grad_norm": 0.3746366818760013, + "learning_rate": 2.936483140324011e-06, + "loss": 0.3152, + "step": 5432 + }, + { + "epoch": 1.2932111620158269, + "grad_norm": 0.3905453061521183, + "learning_rate": 2.9347275149005046e-06, + "loss": 0.3501, + "step": 5433 + }, + { + "epoch": 1.2934491580888916, + "grad_norm": 0.4541439412535773, + "learning_rate": 2.9329721964275827e-06, + "loss": 0.401, + "step": 5434 + }, + { + "epoch": 1.2936871541619563, + "grad_norm": 0.398933342343559, + "learning_rate": 2.9312171851661285e-06, + "loss": 0.3134, + "step": 5435 + }, + { + "epoch": 1.293925150235021, + "grad_norm": 0.42307597168640104, + "learning_rate": 2.9294624813769795e-06, + "loss": 0.2636, + "step": 5436 + }, + { + "epoch": 1.2941631463080858, + "grad_norm": 0.4037706037517579, + "learning_rate": 2.9277080853209284e-06, + "loss": 0.3405, + "step": 5437 + }, + { + "epoch": 1.2944011423811508, + "grad_norm": 0.3812267379791281, + "learning_rate": 2.9259539972587227e-06, + "loss": 0.382, + "step": 5438 + }, + { + "epoch": 1.2946391384542155, + "grad_norm": 0.364830365933242, + "learning_rate": 2.9242002174510613e-06, + "loss": 0.2903, + "step": 5439 + }, + { + "epoch": 1.2948771345272803, + "grad_norm": 0.3897401469920776, + "learning_rate": 2.922446746158601e-06, + "loss": 0.3097, + "step": 5440 + }, + { + "epoch": 1.2951151306003452, + "grad_norm": 0.44911236787123654, + "learning_rate": 2.92069358364195e-06, + "loss": 0.4018, + "step": 5441 + }, + { + "epoch": 1.29535312667341, + "grad_norm": 0.37167943879371046, + "learning_rate": 2.918940730161672e-06, + "loss": 0.3206, + "step": 5442 + }, + { + "epoch": 1.2955911227464747, + "grad_norm": 0.35873134087334885, + "learning_rate": 2.9171881859782854e-06, + "loss": 0.2722, + "step": 5443 + }, + { + "epoch": 1.2958291188195394, + "grad_norm": 0.37574331573192493, + "learning_rate": 2.91543595135226e-06, + "loss": 0.3285, + "step": 5444 + }, + { + "epoch": 1.2960671148926042, + "grad_norm": 0.37774497295843434, + "learning_rate": 2.9136840265440213e-06, + "loss": 0.3934, + "step": 5445 + }, + { + "epoch": 1.2963051109656691, + "grad_norm": 0.37035405909980235, + "learning_rate": 2.9119324118139482e-06, + "loss": 0.2877, + "step": 5446 + }, + { + "epoch": 1.2965431070387339, + "grad_norm": 0.3719053824299425, + "learning_rate": 2.9101811074223762e-06, + "loss": 0.3049, + "step": 5447 + }, + { + "epoch": 1.2967811031117986, + "grad_norm": 0.38418916465237035, + "learning_rate": 2.9084301136295922e-06, + "loss": 0.3589, + "step": 5448 + }, + { + "epoch": 1.2970190991848636, + "grad_norm": 0.38154835318426467, + "learning_rate": 2.9066794306958356e-06, + "loss": 0.3518, + "step": 5449 + }, + { + "epoch": 1.2972570952579283, + "grad_norm": 0.37098490527058375, + "learning_rate": 2.904929058881302e-06, + "loss": 0.2862, + "step": 5450 + }, + { + "epoch": 1.297495091330993, + "grad_norm": 0.35922712949248786, + "learning_rate": 2.90317899844614e-06, + "loss": 0.3084, + "step": 5451 + }, + { + "epoch": 1.2977330874040578, + "grad_norm": 0.3752626421355902, + "learning_rate": 2.9014292496504493e-06, + "loss": 0.3723, + "step": 5452 + }, + { + "epoch": 1.2979710834771225, + "grad_norm": 0.3660609605786775, + "learning_rate": 2.89967981275429e-06, + "loss": 0.3029, + "step": 5453 + }, + { + "epoch": 1.2982090795501875, + "grad_norm": 0.383302664535824, + "learning_rate": 2.8979306880176706e-06, + "loss": 0.3021, + "step": 5454 + }, + { + "epoch": 1.2984470756232522, + "grad_norm": 0.3994458439573466, + "learning_rate": 2.8961818757005533e-06, + "loss": 0.3402, + "step": 5455 + }, + { + "epoch": 1.298685071696317, + "grad_norm": 0.3971982374786896, + "learning_rate": 2.894433376062855e-06, + "loss": 0.3528, + "step": 5456 + }, + { + "epoch": 1.298923067769382, + "grad_norm": 0.3882804894172824, + "learning_rate": 2.892685189364447e-06, + "loss": 0.2716, + "step": 5457 + }, + { + "epoch": 1.2991610638424467, + "grad_norm": 0.3899579933994844, + "learning_rate": 2.8909373158651523e-06, + "loss": 0.3196, + "step": 5458 + }, + { + "epoch": 1.2993990599155114, + "grad_norm": 0.3855353642705331, + "learning_rate": 2.889189755824747e-06, + "loss": 0.3789, + "step": 5459 + }, + { + "epoch": 1.2996370559885762, + "grad_norm": 0.3622462412263447, + "learning_rate": 2.887442509502962e-06, + "loss": 0.2928, + "step": 5460 + }, + { + "epoch": 1.299875052061641, + "grad_norm": 0.38805537438834, + "learning_rate": 2.885695577159484e-06, + "loss": 0.2817, + "step": 5461 + }, + { + "epoch": 1.3001130481347059, + "grad_norm": 0.4137674453573019, + "learning_rate": 2.8839489590539482e-06, + "loss": 0.3426, + "step": 5462 + }, + { + "epoch": 1.3003510442077706, + "grad_norm": 0.37778518714576936, + "learning_rate": 2.882202655445946e-06, + "loss": 0.3454, + "step": 5463 + }, + { + "epoch": 1.3005890402808353, + "grad_norm": 0.34689911133155854, + "learning_rate": 2.8804566665950207e-06, + "loss": 0.2948, + "step": 5464 + }, + { + "epoch": 1.3008270363539003, + "grad_norm": 0.3881888255918, + "learning_rate": 2.8787109927606704e-06, + "loss": 0.3178, + "step": 5465 + }, + { + "epoch": 1.301065032426965, + "grad_norm": 0.3689773814104203, + "learning_rate": 2.8769656342023445e-06, + "loss": 0.3571, + "step": 5466 + }, + { + "epoch": 1.3013030285000298, + "grad_norm": 0.37850711600818543, + "learning_rate": 2.8752205911794463e-06, + "loss": 0.3288, + "step": 5467 + }, + { + "epoch": 1.3015410245730945, + "grad_norm": 0.3848381260854274, + "learning_rate": 2.8734758639513327e-06, + "loss": 0.2831, + "step": 5468 + }, + { + "epoch": 1.3017790206461592, + "grad_norm": 0.3517844887890128, + "learning_rate": 2.8717314527773134e-06, + "loss": 0.3123, + "step": 5469 + }, + { + "epoch": 1.3020170167192242, + "grad_norm": 0.380584564601741, + "learning_rate": 2.8699873579166517e-06, + "loss": 0.3761, + "step": 5470 + }, + { + "epoch": 1.302255012792289, + "grad_norm": 0.3577811945107307, + "learning_rate": 2.8682435796285617e-06, + "loss": 0.285, + "step": 5471 + }, + { + "epoch": 1.3024930088653537, + "grad_norm": 0.3819440878979895, + "learning_rate": 2.8665001181722134e-06, + "loss": 0.2917, + "step": 5472 + }, + { + "epoch": 1.3027310049384186, + "grad_norm": 0.38014911424877107, + "learning_rate": 2.8647569738067262e-06, + "loss": 0.3719, + "step": 5473 + }, + { + "epoch": 1.3029690010114834, + "grad_norm": 0.38123121571081403, + "learning_rate": 2.8630141467911777e-06, + "loss": 0.3411, + "step": 5474 + }, + { + "epoch": 1.3032069970845481, + "grad_norm": 0.3930220532783937, + "learning_rate": 2.8612716373845927e-06, + "loss": 0.2891, + "step": 5475 + }, + { + "epoch": 1.3034449931576129, + "grad_norm": 0.4003883784524193, + "learning_rate": 2.859529445845953e-06, + "loss": 0.3191, + "step": 5476 + }, + { + "epoch": 1.3036829892306776, + "grad_norm": 0.39728799061286724, + "learning_rate": 2.8577875724341897e-06, + "loss": 0.3656, + "step": 5477 + }, + { + "epoch": 1.3039209853037426, + "grad_norm": 0.36276882877846156, + "learning_rate": 2.8560460174081896e-06, + "loss": 0.3009, + "step": 5478 + }, + { + "epoch": 1.3041589813768073, + "grad_norm": 0.34834226783278643, + "learning_rate": 2.8543047810267876e-06, + "loss": 0.2964, + "step": 5479 + }, + { + "epoch": 1.304396977449872, + "grad_norm": 0.37989807168764456, + "learning_rate": 2.852563863548779e-06, + "loss": 0.3486, + "step": 5480 + }, + { + "epoch": 1.304634973522937, + "grad_norm": 0.36552520765488317, + "learning_rate": 2.850823265232906e-06, + "loss": 0.3672, + "step": 5481 + }, + { + "epoch": 1.3048729695960017, + "grad_norm": 0.36751382218934403, + "learning_rate": 2.849082986337863e-06, + "loss": 0.251, + "step": 5482 + }, + { + "epoch": 1.3051109656690665, + "grad_norm": 0.40093768891686205, + "learning_rate": 2.8473430271222994e-06, + "loss": 0.3348, + "step": 5483 + }, + { + "epoch": 1.3053489617421312, + "grad_norm": 0.3780458249964564, + "learning_rate": 2.845603387844817e-06, + "loss": 0.3953, + "step": 5484 + }, + { + "epoch": 1.305586957815196, + "grad_norm": 0.3807478074730611, + "learning_rate": 2.8438640687639675e-06, + "loss": 0.3076, + "step": 5485 + }, + { + "epoch": 1.305824953888261, + "grad_norm": 0.3703942594376492, + "learning_rate": 2.842125070138258e-06, + "loss": 0.2822, + "step": 5486 + }, + { + "epoch": 1.3060629499613257, + "grad_norm": 0.3600318688567636, + "learning_rate": 2.8403863922261444e-06, + "loss": 0.3293, + "step": 5487 + }, + { + "epoch": 1.3063009460343904, + "grad_norm": 0.3609650481015529, + "learning_rate": 2.8386480352860414e-06, + "loss": 0.3507, + "step": 5488 + }, + { + "epoch": 1.3065389421074554, + "grad_norm": 0.36174141337750676, + "learning_rate": 2.8369099995763088e-06, + "loss": 0.2888, + "step": 5489 + }, + { + "epoch": 1.30677693818052, + "grad_norm": 0.3641547780000098, + "learning_rate": 2.835172285355263e-06, + "loss": 0.2959, + "step": 5490 + }, + { + "epoch": 1.3070149342535848, + "grad_norm": 0.35270388989031864, + "learning_rate": 2.833434892881171e-06, + "loss": 0.3901, + "step": 5491 + }, + { + "epoch": 1.3072529303266496, + "grad_norm": 0.41231868386069426, + "learning_rate": 2.831697822412252e-06, + "loss": 0.3232, + "step": 5492 + }, + { + "epoch": 1.3074909263997143, + "grad_norm": 0.4084581682764854, + "learning_rate": 2.8299610742066778e-06, + "loss": 0.2764, + "step": 5493 + }, + { + "epoch": 1.3077289224727793, + "grad_norm": 0.391038761585091, + "learning_rate": 2.8282246485225722e-06, + "loss": 0.3169, + "step": 5494 + }, + { + "epoch": 1.307966918545844, + "grad_norm": 0.392419286404586, + "learning_rate": 2.826488545618011e-06, + "loss": 0.41, + "step": 5495 + }, + { + "epoch": 1.3082049146189088, + "grad_norm": 0.36111528650542957, + "learning_rate": 2.824752765751022e-06, + "loss": 0.2922, + "step": 5496 + }, + { + "epoch": 1.3084429106919737, + "grad_norm": 0.40433967227880035, + "learning_rate": 2.8230173091795853e-06, + "loss": 0.312, + "step": 5497 + }, + { + "epoch": 1.3086809067650385, + "grad_norm": 0.4150889724294196, + "learning_rate": 2.8212821761616314e-06, + "loss": 0.3416, + "step": 5498 + }, + { + "epoch": 1.3089189028381032, + "grad_norm": 0.3784389771780397, + "learning_rate": 2.819547366955046e-06, + "loss": 0.3283, + "step": 5499 + }, + { + "epoch": 1.309156898911168, + "grad_norm": 0.39020777536186346, + "learning_rate": 2.817812881817663e-06, + "loss": 0.3039, + "step": 5500 + }, + { + "epoch": 1.3093948949842327, + "grad_norm": 0.38412700599020266, + "learning_rate": 2.8160787210072695e-06, + "loss": 0.335, + "step": 5501 + }, + { + "epoch": 1.3096328910572976, + "grad_norm": 0.37696789257027463, + "learning_rate": 2.814344884781607e-06, + "loss": 0.3842, + "step": 5502 + }, + { + "epoch": 1.3098708871303624, + "grad_norm": 0.3592980808212725, + "learning_rate": 2.812611373398365e-06, + "loss": 0.2864, + "step": 5503 + }, + { + "epoch": 1.310108883203427, + "grad_norm": 0.3932282988991156, + "learning_rate": 2.8108781871151866e-06, + "loss": 0.2911, + "step": 5504 + }, + { + "epoch": 1.310346879276492, + "grad_norm": 0.366771028303434, + "learning_rate": 2.8091453261896657e-06, + "loss": 0.3637, + "step": 5505 + }, + { + "epoch": 1.3105848753495568, + "grad_norm": 0.3612811617097267, + "learning_rate": 2.8074127908793464e-06, + "loss": 0.3796, + "step": 5506 + }, + { + "epoch": 1.3108228714226215, + "grad_norm": 0.3522436714920477, + "learning_rate": 2.8056805814417305e-06, + "loss": 0.2732, + "step": 5507 + }, + { + "epoch": 1.3110608674956863, + "grad_norm": 0.40918249461660394, + "learning_rate": 2.8039486981342647e-06, + "loss": 0.3357, + "step": 5508 + }, + { + "epoch": 1.311298863568751, + "grad_norm": 0.4104488620009276, + "learning_rate": 2.8022171412143504e-06, + "loss": 0.3646, + "step": 5509 + }, + { + "epoch": 1.311536859641816, + "grad_norm": 0.3643401976041939, + "learning_rate": 2.800485910939339e-06, + "loss": 0.3133, + "step": 5510 + }, + { + "epoch": 1.3117748557148807, + "grad_norm": 0.3894095595689637, + "learning_rate": 2.7987550075665356e-06, + "loss": 0.2957, + "step": 5511 + }, + { + "epoch": 1.3120128517879455, + "grad_norm": 0.38028414239770225, + "learning_rate": 2.7970244313531935e-06, + "loss": 0.3483, + "step": 5512 + }, + { + "epoch": 1.3122508478610104, + "grad_norm": 0.383503362557972, + "learning_rate": 2.7952941825565193e-06, + "loss": 0.3826, + "step": 5513 + }, + { + "epoch": 1.3124888439340752, + "grad_norm": 0.3753460629109302, + "learning_rate": 2.793564261433672e-06, + "loss": 0.2777, + "step": 5514 + }, + { + "epoch": 1.31272684000714, + "grad_norm": 0.38982091819158365, + "learning_rate": 2.7918346682417585e-06, + "loss": 0.3217, + "step": 5515 + }, + { + "epoch": 1.3129648360802046, + "grad_norm": 0.39383363481041805, + "learning_rate": 2.7901054032378426e-06, + "loss": 0.3899, + "step": 5516 + }, + { + "epoch": 1.3132028321532694, + "grad_norm": 0.43566741874589754, + "learning_rate": 2.7883764666789336e-06, + "loss": 0.2971, + "step": 5517 + }, + { + "epoch": 1.3134408282263343, + "grad_norm": 0.38349493003039725, + "learning_rate": 2.7866478588219945e-06, + "loss": 0.2766, + "step": 5518 + }, + { + "epoch": 1.313678824299399, + "grad_norm": 0.3767513973073563, + "learning_rate": 2.784919579923939e-06, + "loss": 0.3182, + "step": 5519 + }, + { + "epoch": 1.3139168203724638, + "grad_norm": 0.37130947209320847, + "learning_rate": 2.783191630241633e-06, + "loss": 0.3982, + "step": 5520 + }, + { + "epoch": 1.3141548164455288, + "grad_norm": 0.36376573221358194, + "learning_rate": 2.7814640100318917e-06, + "loss": 0.2772, + "step": 5521 + }, + { + "epoch": 1.3143928125185935, + "grad_norm": 0.504440580572112, + "learning_rate": 2.7797367195514825e-06, + "loss": 0.3145, + "step": 5522 + }, + { + "epoch": 1.3146308085916583, + "grad_norm": 0.3834097822594875, + "learning_rate": 2.778009759057123e-06, + "loss": 0.3448, + "step": 5523 + }, + { + "epoch": 1.314868804664723, + "grad_norm": 0.3670665934349199, + "learning_rate": 2.7762831288054836e-06, + "loss": 0.3409, + "step": 5524 + }, + { + "epoch": 1.3151068007377877, + "grad_norm": 0.37382482389619465, + "learning_rate": 2.7745568290531827e-06, + "loss": 0.2827, + "step": 5525 + }, + { + "epoch": 1.3153447968108527, + "grad_norm": 0.3685314429800074, + "learning_rate": 2.772830860056792e-06, + "loss": 0.3201, + "step": 5526 + }, + { + "epoch": 1.3155827928839174, + "grad_norm": 0.36043420452079505, + "learning_rate": 2.771105222072833e-06, + "loss": 0.426, + "step": 5527 + }, + { + "epoch": 1.3158207889569822, + "grad_norm": 0.36717830372844196, + "learning_rate": 2.769379915357776e-06, + "loss": 0.2985, + "step": 5528 + }, + { + "epoch": 1.3160587850300471, + "grad_norm": 0.37554243694911554, + "learning_rate": 2.7676549401680486e-06, + "loss": 0.2885, + "step": 5529 + }, + { + "epoch": 1.3162967811031119, + "grad_norm": 0.3741544659657053, + "learning_rate": 2.7659302967600226e-06, + "loss": 0.3648, + "step": 5530 + }, + { + "epoch": 1.3165347771761766, + "grad_norm": 0.38217417099471307, + "learning_rate": 2.764205985390023e-06, + "loss": 0.3696, + "step": 5531 + }, + { + "epoch": 1.3167727732492414, + "grad_norm": 0.3587235824649817, + "learning_rate": 2.762482006314324e-06, + "loss": 0.2703, + "step": 5532 + }, + { + "epoch": 1.317010769322306, + "grad_norm": 0.37109675762063177, + "learning_rate": 2.760758359789151e-06, + "loss": 0.3151, + "step": 5533 + }, + { + "epoch": 1.317248765395371, + "grad_norm": 0.4298879106634171, + "learning_rate": 2.7590350460706845e-06, + "loss": 0.3901, + "step": 5534 + }, + { + "epoch": 1.3174867614684358, + "grad_norm": 0.3555539814561159, + "learning_rate": 2.757312065415048e-06, + "loss": 0.2981, + "step": 5535 + }, + { + "epoch": 1.3177247575415005, + "grad_norm": 0.3717826964329484, + "learning_rate": 2.7555894180783203e-06, + "loss": 0.3002, + "step": 5536 + }, + { + "epoch": 1.3179627536145655, + "grad_norm": 0.41601130773589784, + "learning_rate": 2.7538671043165295e-06, + "loss": 0.3482, + "step": 5537 + }, + { + "epoch": 1.3182007496876302, + "grad_norm": 0.3901057217046057, + "learning_rate": 2.7521451243856533e-06, + "loss": 0.3883, + "step": 5538 + }, + { + "epoch": 1.318438745760695, + "grad_norm": 0.4228120609797883, + "learning_rate": 2.7504234785416206e-06, + "loss": 0.3078, + "step": 5539 + }, + { + "epoch": 1.3186767418337597, + "grad_norm": 0.3842260920241478, + "learning_rate": 2.7487021670403115e-06, + "loss": 0.315, + "step": 5540 + }, + { + "epoch": 1.3189147379068245, + "grad_norm": 0.41311984790557893, + "learning_rate": 2.746981190137554e-06, + "loss": 0.3725, + "step": 5541 + }, + { + "epoch": 1.3191527339798894, + "grad_norm": 0.3577865374099851, + "learning_rate": 2.7452605480891276e-06, + "loss": 0.3359, + "step": 5542 + }, + { + "epoch": 1.3193907300529542, + "grad_norm": 0.3845593269549925, + "learning_rate": 2.743540241150765e-06, + "loss": 0.2928, + "step": 5543 + }, + { + "epoch": 1.319628726126019, + "grad_norm": 0.3988246990305431, + "learning_rate": 2.7418202695781443e-06, + "loss": 0.3563, + "step": 5544 + }, + { + "epoch": 1.3198667221990839, + "grad_norm": 0.3926174352866639, + "learning_rate": 2.7401006336268966e-06, + "loss": 0.3319, + "step": 5545 + }, + { + "epoch": 1.3201047182721486, + "grad_norm": 0.3877729503505814, + "learning_rate": 2.738381333552601e-06, + "loss": 0.2998, + "step": 5546 + }, + { + "epoch": 1.3203427143452133, + "grad_norm": 0.40291838283798237, + "learning_rate": 2.73666236961079e-06, + "loss": 0.3096, + "step": 5547 + }, + { + "epoch": 1.320580710418278, + "grad_norm": 0.36068628191130936, + "learning_rate": 2.734943742056943e-06, + "loss": 0.3678, + "step": 5548 + }, + { + "epoch": 1.3208187064913428, + "grad_norm": 0.3709054992297036, + "learning_rate": 2.7332254511464906e-06, + "loss": 0.3333, + "step": 5549 + }, + { + "epoch": 1.3210567025644078, + "grad_norm": 0.3673136445373127, + "learning_rate": 2.7315074971348133e-06, + "loss": 0.303, + "step": 5550 + }, + { + "epoch": 1.3212946986374725, + "grad_norm": 0.3626370946230582, + "learning_rate": 2.729789880277242e-06, + "loss": 0.3252, + "step": 5551 + }, + { + "epoch": 1.3215326947105372, + "grad_norm": 0.3761315640459379, + "learning_rate": 2.728072600829057e-06, + "loss": 0.4076, + "step": 5552 + }, + { + "epoch": 1.3217706907836022, + "grad_norm": 0.38806642072912284, + "learning_rate": 2.7263556590454874e-06, + "loss": 0.3095, + "step": 5553 + }, + { + "epoch": 1.322008686856667, + "grad_norm": 0.3827363695116688, + "learning_rate": 2.724639055181715e-06, + "loss": 0.3012, + "step": 5554 + }, + { + "epoch": 1.3222466829297317, + "grad_norm": 0.406266926754827, + "learning_rate": 2.7229227894928666e-06, + "loss": 0.3563, + "step": 5555 + }, + { + "epoch": 1.3224846790027964, + "grad_norm": 0.3601509616480053, + "learning_rate": 2.721206862234026e-06, + "loss": 0.3359, + "step": 5556 + }, + { + "epoch": 1.3227226750758612, + "grad_norm": 0.39622194453014564, + "learning_rate": 2.71949127366022e-06, + "loss": 0.3009, + "step": 5557 + }, + { + "epoch": 1.322960671148926, + "grad_norm": 0.40428113767002444, + "learning_rate": 2.7177760240264273e-06, + "loss": 0.3141, + "step": 5558 + }, + { + "epoch": 1.3231986672219909, + "grad_norm": 0.39692065016659434, + "learning_rate": 2.7160611135875774e-06, + "loss": 0.3779, + "step": 5559 + }, + { + "epoch": 1.3234366632950556, + "grad_norm": 0.3546448081276393, + "learning_rate": 2.714346542598546e-06, + "loss": 0.3072, + "step": 5560 + }, + { + "epoch": 1.3236746593681203, + "grad_norm": 0.40729641787879445, + "learning_rate": 2.712632311314165e-06, + "loss": 0.2691, + "step": 5561 + }, + { + "epoch": 1.3239126554411853, + "grad_norm": 0.374238994477962, + "learning_rate": 2.7109184199892093e-06, + "loss": 0.3453, + "step": 5562 + }, + { + "epoch": 1.32415065151425, + "grad_norm": 0.3684026547826237, + "learning_rate": 2.7092048688784046e-06, + "loss": 0.3535, + "step": 5563 + }, + { + "epoch": 1.3243886475873148, + "grad_norm": 0.3957287753476546, + "learning_rate": 2.7074916582364284e-06, + "loss": 0.2692, + "step": 5564 + }, + { + "epoch": 1.3246266436603795, + "grad_norm": 0.3807784927555232, + "learning_rate": 2.705778788317906e-06, + "loss": 0.3471, + "step": 5565 + }, + { + "epoch": 1.3248646397334443, + "grad_norm": 0.36315758859429115, + "learning_rate": 2.7040662593774114e-06, + "loss": 0.384, + "step": 5566 + }, + { + "epoch": 1.3251026358065092, + "grad_norm": 0.3570108097887906, + "learning_rate": 2.70235407166947e-06, + "loss": 0.3291, + "step": 5567 + }, + { + "epoch": 1.325340631879574, + "grad_norm": 0.38719520188943835, + "learning_rate": 2.700642225448554e-06, + "loss": 0.3024, + "step": 5568 + }, + { + "epoch": 1.3255786279526387, + "grad_norm": 0.3896705832913739, + "learning_rate": 2.698930720969087e-06, + "loss": 0.3169, + "step": 5569 + }, + { + "epoch": 1.3258166240257037, + "grad_norm": 0.3888507607782142, + "learning_rate": 2.697219558485439e-06, + "loss": 0.3841, + "step": 5570 + }, + { + "epoch": 1.3260546200987684, + "grad_norm": 0.3670049338221428, + "learning_rate": 2.695508738251934e-06, + "loss": 0.2712, + "step": 5571 + }, + { + "epoch": 1.3262926161718331, + "grad_norm": 0.39842737745653384, + "learning_rate": 2.693798260522841e-06, + "loss": 0.2985, + "step": 5572 + }, + { + "epoch": 1.3265306122448979, + "grad_norm": 0.37727527288531104, + "learning_rate": 2.692088125552379e-06, + "loss": 0.3754, + "step": 5573 + }, + { + "epoch": 1.3267686083179626, + "grad_norm": 0.43871454005089827, + "learning_rate": 2.690378333594717e-06, + "loss": 0.3219, + "step": 5574 + }, + { + "epoch": 1.3270066043910276, + "grad_norm": 0.39318467207824387, + "learning_rate": 2.6886688849039717e-06, + "loss": 0.2756, + "step": 5575 + }, + { + "epoch": 1.3272446004640923, + "grad_norm": 0.3994623481607067, + "learning_rate": 2.6869597797342096e-06, + "loss": 0.3249, + "step": 5576 + }, + { + "epoch": 1.327482596537157, + "grad_norm": 0.44155749097845365, + "learning_rate": 2.685251018339446e-06, + "loss": 0.3763, + "step": 5577 + }, + { + "epoch": 1.327720592610222, + "grad_norm": 0.3665656601981612, + "learning_rate": 2.6835426009736455e-06, + "loss": 0.3239, + "step": 5578 + }, + { + "epoch": 1.3279585886832868, + "grad_norm": 0.4022642181119508, + "learning_rate": 2.681834527890721e-06, + "loss": 0.2761, + "step": 5579 + }, + { + "epoch": 1.3281965847563515, + "grad_norm": 0.3788048689911649, + "learning_rate": 2.6801267993445335e-06, + "loss": 0.3626, + "step": 5580 + }, + { + "epoch": 1.3284345808294162, + "grad_norm": 0.399964703157908, + "learning_rate": 2.6784194155888953e-06, + "loss": 0.364, + "step": 5581 + }, + { + "epoch": 1.328672576902481, + "grad_norm": 0.46248131169930756, + "learning_rate": 2.6767123768775627e-06, + "loss": 0.3001, + "step": 5582 + }, + { + "epoch": 1.328910572975546, + "grad_norm": 0.3790430115317474, + "learning_rate": 2.675005683464248e-06, + "loss": 0.3368, + "step": 5583 + }, + { + "epoch": 1.3291485690486107, + "grad_norm": 0.42296015749472615, + "learning_rate": 2.6732993356026073e-06, + "loss": 0.3964, + "step": 5584 + }, + { + "epoch": 1.3293865651216754, + "grad_norm": 0.37767108037602987, + "learning_rate": 2.671593333546244e-06, + "loss": 0.3186, + "step": 5585 + }, + { + "epoch": 1.3296245611947404, + "grad_norm": 0.3556314352660897, + "learning_rate": 2.669887677548712e-06, + "loss": 0.2928, + "step": 5586 + }, + { + "epoch": 1.3298625572678051, + "grad_norm": 0.3646511968328883, + "learning_rate": 2.6681823678635177e-06, + "loss": 0.3345, + "step": 5587 + }, + { + "epoch": 1.3301005533408699, + "grad_norm": 0.38411250506226, + "learning_rate": 2.6664774047441087e-06, + "loss": 0.3821, + "step": 5588 + }, + { + "epoch": 1.3303385494139346, + "grad_norm": 0.3780727074870984, + "learning_rate": 2.6647727884438866e-06, + "loss": 0.287, + "step": 5589 + }, + { + "epoch": 1.3305765454869993, + "grad_norm": 0.3595186487413398, + "learning_rate": 2.6630685192161995e-06, + "loss": 0.3187, + "step": 5590 + }, + { + "epoch": 1.3308145415600643, + "grad_norm": 0.352373082985822, + "learning_rate": 2.6613645973143427e-06, + "loss": 0.3637, + "step": 5591 + }, + { + "epoch": 1.331052537633129, + "grad_norm": 0.3611789272199898, + "learning_rate": 2.659661022991562e-06, + "loss": 0.3405, + "step": 5592 + }, + { + "epoch": 1.3312905337061938, + "grad_norm": 0.4065306276115284, + "learning_rate": 2.65795779650105e-06, + "loss": 0.293, + "step": 5593 + }, + { + "epoch": 1.3315285297792587, + "grad_norm": 0.3987698770035065, + "learning_rate": 2.656254918095949e-06, + "loss": 0.3428, + "step": 5594 + }, + { + "epoch": 1.3317665258523235, + "grad_norm": 0.3943243833309216, + "learning_rate": 2.654552388029349e-06, + "loss": 0.3822, + "step": 5595 + }, + { + "epoch": 1.3320045219253882, + "grad_norm": 0.36509611859681235, + "learning_rate": 2.652850206554287e-06, + "loss": 0.2925, + "step": 5596 + }, + { + "epoch": 1.332242517998453, + "grad_norm": 0.38035780331790836, + "learning_rate": 2.6511483739237508e-06, + "loss": 0.3156, + "step": 5597 + }, + { + "epoch": 1.3324805140715177, + "grad_norm": 0.37332659121072254, + "learning_rate": 2.649446890390671e-06, + "loss": 0.3465, + "step": 5598 + }, + { + "epoch": 1.3327185101445826, + "grad_norm": 0.3665673689467114, + "learning_rate": 2.647745756207937e-06, + "loss": 0.3658, + "step": 5599 + }, + { + "epoch": 1.3329565062176474, + "grad_norm": 0.3820284830332031, + "learning_rate": 2.646044971628374e-06, + "loss": 0.2949, + "step": 5600 + }, + { + "epoch": 1.3331945022907121, + "grad_norm": 0.37188294924637955, + "learning_rate": 2.6443445369047625e-06, + "loss": 0.3224, + "step": 5601 + }, + { + "epoch": 1.333432498363777, + "grad_norm": 0.3735185472412557, + "learning_rate": 2.6426444522898286e-06, + "loss": 0.3703, + "step": 5602 + }, + { + "epoch": 1.3336704944368418, + "grad_norm": 0.35518226222360766, + "learning_rate": 2.640944718036248e-06, + "loss": 0.3015, + "step": 5603 + }, + { + "epoch": 1.3339084905099066, + "grad_norm": 0.389450698522127, + "learning_rate": 2.6392453343966422e-06, + "loss": 0.2587, + "step": 5604 + }, + { + "epoch": 1.3341464865829713, + "grad_norm": 0.3971444862417287, + "learning_rate": 2.6375463016235826e-06, + "loss": 0.3954, + "step": 5605 + }, + { + "epoch": 1.334384482656036, + "grad_norm": 0.3502692387454883, + "learning_rate": 2.6358476199695858e-06, + "loss": 0.3411, + "step": 5606 + }, + { + "epoch": 1.334622478729101, + "grad_norm": 0.36914607170334995, + "learning_rate": 2.6341492896871198e-06, + "loss": 0.2936, + "step": 5607 + }, + { + "epoch": 1.3348604748021657, + "grad_norm": 0.4088428743135435, + "learning_rate": 2.632451311028598e-06, + "loss": 0.3102, + "step": 5608 + }, + { + "epoch": 1.3350984708752305, + "grad_norm": 0.37074985008059036, + "learning_rate": 2.630753684246378e-06, + "loss": 0.3687, + "step": 5609 + }, + { + "epoch": 1.3353364669482954, + "grad_norm": 0.35775847771800834, + "learning_rate": 2.6290564095927762e-06, + "loss": 0.3196, + "step": 5610 + }, + { + "epoch": 1.3355744630213602, + "grad_norm": 0.3834131260856632, + "learning_rate": 2.627359487320046e-06, + "loss": 0.2913, + "step": 5611 + }, + { + "epoch": 1.335812459094425, + "grad_norm": 0.37855664847349857, + "learning_rate": 2.6256629176803925e-06, + "loss": 0.3398, + "step": 5612 + }, + { + "epoch": 1.3360504551674897, + "grad_norm": 0.3915920080407146, + "learning_rate": 2.623966700925965e-06, + "loss": 0.3756, + "step": 5613 + }, + { + "epoch": 1.3362884512405544, + "grad_norm": 0.3936719541417686, + "learning_rate": 2.622270837308869e-06, + "loss": 0.2686, + "step": 5614 + }, + { + "epoch": 1.3365264473136194, + "grad_norm": 0.37541566356978545, + "learning_rate": 2.620575327081148e-06, + "loss": 0.3409, + "step": 5615 + }, + { + "epoch": 1.336764443386684, + "grad_norm": 0.39242038775201626, + "learning_rate": 2.6188801704947976e-06, + "loss": 0.4255, + "step": 5616 + }, + { + "epoch": 1.3370024394597488, + "grad_norm": 0.3738300495004616, + "learning_rate": 2.61718536780176e-06, + "loss": 0.3243, + "step": 5617 + }, + { + "epoch": 1.3372404355328138, + "grad_norm": 0.37526710784159806, + "learning_rate": 2.6154909192539248e-06, + "loss": 0.2954, + "step": 5618 + }, + { + "epoch": 1.3374784316058785, + "grad_norm": 0.3831635215027392, + "learning_rate": 2.613796825103129e-06, + "loss": 0.3692, + "step": 5619 + }, + { + "epoch": 1.3377164276789433, + "grad_norm": 0.3928912025057844, + "learning_rate": 2.6121030856011562e-06, + "loss": 0.4076, + "step": 5620 + }, + { + "epoch": 1.337954423752008, + "grad_norm": 0.3747902265725176, + "learning_rate": 2.61040970099974e-06, + "loss": 0.3122, + "step": 5621 + }, + { + "epoch": 1.3381924198250728, + "grad_norm": 0.3852536780569142, + "learning_rate": 2.6087166715505563e-06, + "loss": 0.3281, + "step": 5622 + }, + { + "epoch": 1.3384304158981377, + "grad_norm": 0.4203309500020824, + "learning_rate": 2.6070239975052334e-06, + "loss": 0.3748, + "step": 5623 + }, + { + "epoch": 1.3386684119712025, + "grad_norm": 0.37552479294518815, + "learning_rate": 2.605331679115344e-06, + "loss": 0.3544, + "step": 5624 + }, + { + "epoch": 1.3389064080442672, + "grad_norm": 0.3796552948357275, + "learning_rate": 2.6036397166324062e-06, + "loss": 0.257, + "step": 5625 + }, + { + "epoch": 1.3391444041173322, + "grad_norm": 0.36585880934173215, + "learning_rate": 2.6019481103078912e-06, + "loss": 0.3224, + "step": 5626 + }, + { + "epoch": 1.339382400190397, + "grad_norm": 0.3784318209352287, + "learning_rate": 2.6002568603932127e-06, + "loss": 0.4231, + "step": 5627 + }, + { + "epoch": 1.3396203962634616, + "grad_norm": 0.3581778615226102, + "learning_rate": 2.598565967139731e-06, + "loss": 0.269, + "step": 5628 + }, + { + "epoch": 1.3398583923365264, + "grad_norm": 0.3613572419065405, + "learning_rate": 2.5968754307987556e-06, + "loss": 0.3059, + "step": 5629 + }, + { + "epoch": 1.340096388409591, + "grad_norm": 0.41685880452742957, + "learning_rate": 2.5951852516215415e-06, + "loss": 0.3547, + "step": 5630 + }, + { + "epoch": 1.340334384482656, + "grad_norm": 0.3476538727617446, + "learning_rate": 2.593495429859291e-06, + "loss": 0.3522, + "step": 5631 + }, + { + "epoch": 1.3405723805557208, + "grad_norm": 0.38372719904096886, + "learning_rate": 2.5918059657631532e-06, + "loss": 0.2839, + "step": 5632 + }, + { + "epoch": 1.3408103766287855, + "grad_norm": 0.4168082237276136, + "learning_rate": 2.5901168595842256e-06, + "loss": 0.3166, + "step": 5633 + }, + { + "epoch": 1.3410483727018505, + "grad_norm": 0.3990911280449388, + "learning_rate": 2.58842811157355e-06, + "loss": 0.3948, + "step": 5634 + }, + { + "epoch": 1.3412863687749152, + "grad_norm": 0.3619981264622829, + "learning_rate": 2.5867397219821166e-06, + "loss": 0.3115, + "step": 5635 + }, + { + "epoch": 1.34152436484798, + "grad_norm": 0.37183438540217123, + "learning_rate": 2.5850516910608596e-06, + "loss": 0.299, + "step": 5636 + }, + { + "epoch": 1.3417623609210447, + "grad_norm": 0.3821320851640882, + "learning_rate": 2.5833640190606663e-06, + "loss": 0.3677, + "step": 5637 + }, + { + "epoch": 1.3420003569941095, + "grad_norm": 0.4340280479794182, + "learning_rate": 2.5816767062323646e-06, + "loss": 0.3653, + "step": 5638 + }, + { + "epoch": 1.3422383530671744, + "grad_norm": 0.3935957665861045, + "learning_rate": 2.5799897528267304e-06, + "loss": 0.3125, + "step": 5639 + }, + { + "epoch": 1.3424763491402392, + "grad_norm": 0.3551306900169191, + "learning_rate": 2.5783031590944853e-06, + "loss": 0.3321, + "step": 5640 + }, + { + "epoch": 1.342714345213304, + "grad_norm": 0.39027530241074593, + "learning_rate": 2.5766169252863026e-06, + "loss": 0.3719, + "step": 5641 + }, + { + "epoch": 1.3429523412863689, + "grad_norm": 0.3669775255066402, + "learning_rate": 2.574931051652796e-06, + "loss": 0.3106, + "step": 5642 + }, + { + "epoch": 1.3431903373594336, + "grad_norm": 0.36467746278063334, + "learning_rate": 2.573245538444529e-06, + "loss": 0.2812, + "step": 5643 + }, + { + "epoch": 1.3434283334324983, + "grad_norm": 0.44665190837644525, + "learning_rate": 2.5715603859120095e-06, + "loss": 0.3282, + "step": 5644 + }, + { + "epoch": 1.343666329505563, + "grad_norm": 0.36150659846601496, + "learning_rate": 2.569875594305694e-06, + "loss": 0.3898, + "step": 5645 + }, + { + "epoch": 1.3439043255786278, + "grad_norm": 0.36749221425362993, + "learning_rate": 2.5681911638759837e-06, + "loss": 0.2971, + "step": 5646 + }, + { + "epoch": 1.3441423216516928, + "grad_norm": 0.37962067008941414, + "learning_rate": 2.5665070948732258e-06, + "loss": 0.2935, + "step": 5647 + }, + { + "epoch": 1.3443803177247575, + "grad_norm": 0.4350753894556899, + "learning_rate": 2.564823387547716e-06, + "loss": 0.3688, + "step": 5648 + }, + { + "epoch": 1.3446183137978223, + "grad_norm": 0.36978176336633634, + "learning_rate": 2.5631400421496934e-06, + "loss": 0.3408, + "step": 5649 + }, + { + "epoch": 1.3448563098708872, + "grad_norm": 0.3672540601703578, + "learning_rate": 2.5614570589293457e-06, + "loss": 0.3011, + "step": 5650 + }, + { + "epoch": 1.345094305943952, + "grad_norm": 0.3767220367127715, + "learning_rate": 2.5597744381368063e-06, + "loss": 0.3395, + "step": 5651 + }, + { + "epoch": 1.3453323020170167, + "grad_norm": 0.38145663607283764, + "learning_rate": 2.558092180022153e-06, + "loss": 0.3896, + "step": 5652 + }, + { + "epoch": 1.3455702980900814, + "grad_norm": 0.36950737323434774, + "learning_rate": 2.5564102848354098e-06, + "loss": 0.3138, + "step": 5653 + }, + { + "epoch": 1.3458082941631462, + "grad_norm": 0.3764771683321259, + "learning_rate": 2.5547287528265517e-06, + "loss": 0.282, + "step": 5654 + }, + { + "epoch": 1.3460462902362111, + "grad_norm": 0.4283336061272312, + "learning_rate": 2.553047584245495e-06, + "loss": 0.372, + "step": 5655 + }, + { + "epoch": 1.3462842863092759, + "grad_norm": 0.3676206486750502, + "learning_rate": 2.5513667793421002e-06, + "loss": 0.3684, + "step": 5656 + }, + { + "epoch": 1.3465222823823406, + "grad_norm": 0.40021565693988703, + "learning_rate": 2.5496863383661797e-06, + "loss": 0.2643, + "step": 5657 + }, + { + "epoch": 1.3467602784554056, + "grad_norm": 0.3777923710780302, + "learning_rate": 2.548006261567487e-06, + "loss": 0.3046, + "step": 5658 + }, + { + "epoch": 1.3469982745284703, + "grad_norm": 0.38725995517731304, + "learning_rate": 2.5463265491957224e-06, + "loss": 0.3831, + "step": 5659 + }, + { + "epoch": 1.347236270601535, + "grad_norm": 0.3576244257854606, + "learning_rate": 2.544647201500534e-06, + "loss": 0.3197, + "step": 5660 + }, + { + "epoch": 1.3474742666745998, + "grad_norm": 0.36464513110800595, + "learning_rate": 2.542968218731514e-06, + "loss": 0.2718, + "step": 5661 + }, + { + "epoch": 1.3477122627476645, + "grad_norm": 0.3904440981846269, + "learning_rate": 2.541289601138201e-06, + "loss": 0.3488, + "step": 5662 + }, + { + "epoch": 1.3479502588207295, + "grad_norm": 0.3894407299674647, + "learning_rate": 2.539611348970077e-06, + "loss": 0.3785, + "step": 5663 + }, + { + "epoch": 1.3481882548937942, + "grad_norm": 0.36018047764775457, + "learning_rate": 2.537933462476575e-06, + "loss": 0.294, + "step": 5664 + }, + { + "epoch": 1.348426250966859, + "grad_norm": 0.45032023637446106, + "learning_rate": 2.5362559419070693e-06, + "loss": 0.3058, + "step": 5665 + }, + { + "epoch": 1.348664247039924, + "grad_norm": 0.3520211254562473, + "learning_rate": 2.534578787510881e-06, + "loss": 0.3629, + "step": 5666 + }, + { + "epoch": 1.3489022431129887, + "grad_norm": 0.35778323923398264, + "learning_rate": 2.532901999537274e-06, + "loss": 0.3121, + "step": 5667 + }, + { + "epoch": 1.3491402391860534, + "grad_norm": 0.40768614895424304, + "learning_rate": 2.531225578235465e-06, + "loss": 0.2942, + "step": 5668 + }, + { + "epoch": 1.3493782352591182, + "grad_norm": 0.40587873605164554, + "learning_rate": 2.52954952385461e-06, + "loss": 0.3128, + "step": 5669 + }, + { + "epoch": 1.349616231332183, + "grad_norm": 0.3665695548104563, + "learning_rate": 2.527873836643811e-06, + "loss": 0.3682, + "step": 5670 + }, + { + "epoch": 1.3498542274052479, + "grad_norm": 0.3757527838138373, + "learning_rate": 2.5261985168521174e-06, + "loss": 0.2842, + "step": 5671 + }, + { + "epoch": 1.3500922234783126, + "grad_norm": 0.38913613595462837, + "learning_rate": 2.5245235647285238e-06, + "loss": 0.3045, + "step": 5672 + }, + { + "epoch": 1.3503302195513773, + "grad_norm": 0.39284781780088013, + "learning_rate": 2.5228489805219684e-06, + "loss": 0.3733, + "step": 5673 + }, + { + "epoch": 1.3505682156244423, + "grad_norm": 0.39264836647519236, + "learning_rate": 2.5211747644813367e-06, + "loss": 0.323, + "step": 5674 + }, + { + "epoch": 1.350806211697507, + "grad_norm": 0.38369466339855723, + "learning_rate": 2.5195009168554572e-06, + "loss": 0.2919, + "step": 5675 + }, + { + "epoch": 1.3510442077705718, + "grad_norm": 0.37067590315834653, + "learning_rate": 2.517827437893107e-06, + "loss": 0.2875, + "step": 5676 + }, + { + "epoch": 1.3512822038436365, + "grad_norm": 0.38302813218504755, + "learning_rate": 2.5161543278430055e-06, + "loss": 0.3684, + "step": 5677 + }, + { + "epoch": 1.3515201999167012, + "grad_norm": 0.3605795300765205, + "learning_rate": 2.5144815869538177e-06, + "loss": 0.2855, + "step": 5678 + }, + { + "epoch": 1.3517581959897662, + "grad_norm": 0.36372350908656326, + "learning_rate": 2.512809215474155e-06, + "loss": 0.2996, + "step": 5679 + }, + { + "epoch": 1.351996192062831, + "grad_norm": 0.36974222068701523, + "learning_rate": 2.5111372136525713e-06, + "loss": 0.3311, + "step": 5680 + }, + { + "epoch": 1.3522341881358957, + "grad_norm": 0.36626523624003543, + "learning_rate": 2.509465581737571e-06, + "loss": 0.3287, + "step": 5681 + }, + { + "epoch": 1.3524721842089606, + "grad_norm": 0.3940460629565776, + "learning_rate": 2.5077943199775978e-06, + "loss": 0.29, + "step": 5682 + }, + { + "epoch": 1.3527101802820254, + "grad_norm": 0.38718589993001984, + "learning_rate": 2.5061234286210436e-06, + "loss": 0.2943, + "step": 5683 + }, + { + "epoch": 1.3529481763550901, + "grad_norm": 0.4250135569344329, + "learning_rate": 2.5044529079162426e-06, + "loss": 0.3941, + "step": 5684 + }, + { + "epoch": 1.3531861724281549, + "grad_norm": 0.395297953846377, + "learning_rate": 2.502782758111477e-06, + "loss": 0.2984, + "step": 5685 + }, + { + "epoch": 1.3534241685012196, + "grad_norm": 0.3758290085462567, + "learning_rate": 2.5011129794549717e-06, + "loss": 0.258, + "step": 5686 + }, + { + "epoch": 1.3536621645742846, + "grad_norm": 0.3900626080187409, + "learning_rate": 2.4994435721948966e-06, + "loss": 0.3284, + "step": 5687 + }, + { + "epoch": 1.3539001606473493, + "grad_norm": 0.38881872385050364, + "learning_rate": 2.4977745365793676e-06, + "loss": 0.3925, + "step": 5688 + }, + { + "epoch": 1.354138156720414, + "grad_norm": 0.37508295198986963, + "learning_rate": 2.4961058728564446e-06, + "loss": 0.2848, + "step": 5689 + }, + { + "epoch": 1.354376152793479, + "grad_norm": 0.4969361288451411, + "learning_rate": 2.4944375812741304e-06, + "loss": 0.3224, + "step": 5690 + }, + { + "epoch": 1.3546141488665437, + "grad_norm": 0.4191683283997192, + "learning_rate": 2.4927696620803783e-06, + "loss": 0.3844, + "step": 5691 + }, + { + "epoch": 1.3548521449396085, + "grad_norm": 0.3632365871635604, + "learning_rate": 2.4911021155230804e-06, + "loss": 0.3277, + "step": 5692 + }, + { + "epoch": 1.3550901410126732, + "grad_norm": 0.36797859774918557, + "learning_rate": 2.4894349418500745e-06, + "loss": 0.2917, + "step": 5693 + }, + { + "epoch": 1.355328137085738, + "grad_norm": 0.36534359001275923, + "learning_rate": 2.487768141309144e-06, + "loss": 0.3252, + "step": 5694 + }, + { + "epoch": 1.355566133158803, + "grad_norm": 0.41390949501152025, + "learning_rate": 2.486101714148018e-06, + "loss": 0.3776, + "step": 5695 + }, + { + "epoch": 1.3558041292318677, + "grad_norm": 0.3596335904706074, + "learning_rate": 2.4844356606143687e-06, + "loss": 0.3014, + "step": 5696 + }, + { + "epoch": 1.3560421253049324, + "grad_norm": 0.3738157482380636, + "learning_rate": 2.482769980955812e-06, + "loss": 0.3108, + "step": 5697 + }, + { + "epoch": 1.3562801213779974, + "grad_norm": 0.3712839660896256, + "learning_rate": 2.4811046754199098e-06, + "loss": 0.3732, + "step": 5698 + }, + { + "epoch": 1.356518117451062, + "grad_norm": 0.3879484832908067, + "learning_rate": 2.479439744254167e-06, + "loss": 0.3083, + "step": 5699 + }, + { + "epoch": 1.3567561135241268, + "grad_norm": 0.4129860781327451, + "learning_rate": 2.4777751877060343e-06, + "loss": 0.2727, + "step": 5700 + }, + { + "epoch": 1.3569941095971916, + "grad_norm": 0.3695216250582914, + "learning_rate": 2.476111006022905e-06, + "loss": 0.3, + "step": 5701 + }, + { + "epoch": 1.3572321056702563, + "grad_norm": 0.38079989838460043, + "learning_rate": 2.4744471994521184e-06, + "loss": 0.387, + "step": 5702 + }, + { + "epoch": 1.3574701017433213, + "grad_norm": 0.3851371719901647, + "learning_rate": 2.4727837682409574e-06, + "loss": 0.2918, + "step": 5703 + }, + { + "epoch": 1.357708097816386, + "grad_norm": 0.4030470108804071, + "learning_rate": 2.4711207126366483e-06, + "loss": 0.2774, + "step": 5704 + }, + { + "epoch": 1.3579460938894508, + "grad_norm": 0.4043361359872621, + "learning_rate": 2.4694580328863633e-06, + "loss": 0.3715, + "step": 5705 + }, + { + "epoch": 1.3581840899625157, + "grad_norm": 0.36653113787008357, + "learning_rate": 2.4677957292372166e-06, + "loss": 0.3627, + "step": 5706 + }, + { + "epoch": 1.3584220860355805, + "grad_norm": 0.3850700049312921, + "learning_rate": 2.4661338019362684e-06, + "loss": 0.2887, + "step": 5707 + }, + { + "epoch": 1.3586600821086452, + "grad_norm": 0.38782380526495214, + "learning_rate": 2.46447225123052e-06, + "loss": 0.35, + "step": 5708 + }, + { + "epoch": 1.35889807818171, + "grad_norm": 0.3875168580573579, + "learning_rate": 2.4628110773669235e-06, + "loss": 0.3909, + "step": 5709 + }, + { + "epoch": 1.3591360742547747, + "grad_norm": 0.3706961560048635, + "learning_rate": 2.4611502805923677e-06, + "loss": 0.3001, + "step": 5710 + }, + { + "epoch": 1.3593740703278396, + "grad_norm": 0.41022795283685426, + "learning_rate": 2.459489861153688e-06, + "loss": 0.2858, + "step": 5711 + }, + { + "epoch": 1.3596120664009044, + "grad_norm": 0.3507488121743938, + "learning_rate": 2.4578298192976646e-06, + "loss": 0.3481, + "step": 5712 + }, + { + "epoch": 1.3598500624739691, + "grad_norm": 0.3882156595432714, + "learning_rate": 2.4561701552710198e-06, + "loss": 0.3727, + "step": 5713 + }, + { + "epoch": 1.360088058547034, + "grad_norm": 0.41147742183986585, + "learning_rate": 2.454510869320422e-06, + "loss": 0.2848, + "step": 5714 + }, + { + "epoch": 1.3603260546200988, + "grad_norm": 0.3932347054914899, + "learning_rate": 2.4528519616924807e-06, + "loss": 0.319, + "step": 5715 + }, + { + "epoch": 1.3605640506931636, + "grad_norm": 0.3962467933758855, + "learning_rate": 2.4511934326337516e-06, + "loss": 0.3792, + "step": 5716 + }, + { + "epoch": 1.3608020467662283, + "grad_norm": 0.38954161787628644, + "learning_rate": 2.449535282390731e-06, + "loss": 0.3375, + "step": 5717 + }, + { + "epoch": 1.361040042839293, + "grad_norm": 0.3926513205788529, + "learning_rate": 2.4478775112098644e-06, + "loss": 0.271, + "step": 5718 + }, + { + "epoch": 1.361278038912358, + "grad_norm": 0.4009062810034733, + "learning_rate": 2.446220119337536e-06, + "loss": 0.3599, + "step": 5719 + }, + { + "epoch": 1.3615160349854227, + "grad_norm": 0.367410786702826, + "learning_rate": 2.444563107020076e-06, + "loss": 0.3562, + "step": 5720 + }, + { + "epoch": 1.3617540310584875, + "grad_norm": 0.37374300698130064, + "learning_rate": 2.4429064745037562e-06, + "loss": 0.2799, + "step": 5721 + }, + { + "epoch": 1.3619920271315524, + "grad_norm": 0.39466179262858975, + "learning_rate": 2.441250222034792e-06, + "loss": 0.3144, + "step": 5722 + }, + { + "epoch": 1.3622300232046172, + "grad_norm": 0.4045572620870956, + "learning_rate": 2.4395943498593476e-06, + "loss": 0.3638, + "step": 5723 + }, + { + "epoch": 1.362468019277682, + "grad_norm": 0.35753443357854653, + "learning_rate": 2.4379388582235236e-06, + "loss": 0.3324, + "step": 5724 + }, + { + "epoch": 1.3627060153507466, + "grad_norm": 0.3827497837460359, + "learning_rate": 2.436283747373368e-06, + "loss": 0.2728, + "step": 5725 + }, + { + "epoch": 1.3629440114238114, + "grad_norm": 0.369331720445923, + "learning_rate": 2.43462901755487e-06, + "loss": 0.3226, + "step": 5726 + }, + { + "epoch": 1.3631820074968763, + "grad_norm": 0.3868589649921183, + "learning_rate": 2.4329746690139656e-06, + "loss": 0.4188, + "step": 5727 + }, + { + "epoch": 1.363420003569941, + "grad_norm": 0.5212121670082667, + "learning_rate": 2.4313207019965295e-06, + "loss": 0.3194, + "step": 5728 + }, + { + "epoch": 1.3636579996430058, + "grad_norm": 0.380202951444516, + "learning_rate": 2.429667116748383e-06, + "loss": 0.2932, + "step": 5729 + }, + { + "epoch": 1.3638959957160708, + "grad_norm": 0.3965822543317094, + "learning_rate": 2.4280139135152906e-06, + "loss": 0.3701, + "step": 5730 + }, + { + "epoch": 1.3641339917891355, + "grad_norm": 0.37353099551658364, + "learning_rate": 2.4263610925429588e-06, + "loss": 0.3374, + "step": 5731 + }, + { + "epoch": 1.3643719878622003, + "grad_norm": 0.3661423191951257, + "learning_rate": 2.4247086540770365e-06, + "loss": 0.2938, + "step": 5732 + }, + { + "epoch": 1.364609983935265, + "grad_norm": 0.3647519555796, + "learning_rate": 2.4230565983631184e-06, + "loss": 0.3204, + "step": 5733 + }, + { + "epoch": 1.3648479800083297, + "grad_norm": 0.43532671681837853, + "learning_rate": 2.42140492564674e-06, + "loss": 0.3777, + "step": 5734 + }, + { + "epoch": 1.3650859760813947, + "grad_norm": 0.38492642963237705, + "learning_rate": 2.4197536361733792e-06, + "loss": 0.3075, + "step": 5735 + }, + { + "epoch": 1.3653239721544594, + "grad_norm": 0.3814772631917648, + "learning_rate": 2.418102730188462e-06, + "loss": 0.2729, + "step": 5736 + }, + { + "epoch": 1.3655619682275242, + "grad_norm": 0.39209652486984, + "learning_rate": 2.4164522079373525e-06, + "loss": 0.344, + "step": 5737 + }, + { + "epoch": 1.3657999643005891, + "grad_norm": 0.3664720899560753, + "learning_rate": 2.4148020696653583e-06, + "loss": 0.3635, + "step": 5738 + }, + { + "epoch": 1.3660379603736539, + "grad_norm": 0.3534142149903722, + "learning_rate": 2.413152315617732e-06, + "loss": 0.2905, + "step": 5739 + }, + { + "epoch": 1.3662759564467186, + "grad_norm": 0.3734213259184978, + "learning_rate": 2.4115029460396666e-06, + "loss": 0.2853, + "step": 5740 + }, + { + "epoch": 1.3665139525197834, + "grad_norm": 0.45252181922728846, + "learning_rate": 2.4098539611763e-06, + "loss": 0.3878, + "step": 5741 + }, + { + "epoch": 1.366751948592848, + "grad_norm": 0.4147075627953894, + "learning_rate": 2.408205361272712e-06, + "loss": 0.3316, + "step": 5742 + }, + { + "epoch": 1.366989944665913, + "grad_norm": 0.355835782980996, + "learning_rate": 2.4065571465739247e-06, + "loss": 0.2902, + "step": 5743 + }, + { + "epoch": 1.3672279407389778, + "grad_norm": 0.40862200982116076, + "learning_rate": 2.4049093173249026e-06, + "loss": 0.3258, + "step": 5744 + }, + { + "epoch": 1.3674659368120425, + "grad_norm": 0.3792126875199552, + "learning_rate": 2.403261873770557e-06, + "loss": 0.3966, + "step": 5745 + }, + { + "epoch": 1.3677039328851075, + "grad_norm": 0.39370562835081924, + "learning_rate": 2.401614816155737e-06, + "loss": 0.2929, + "step": 5746 + }, + { + "epoch": 1.3679419289581722, + "grad_norm": 0.3985119119940615, + "learning_rate": 2.399968144725237e-06, + "loss": 0.3155, + "step": 5747 + }, + { + "epoch": 1.368179925031237, + "grad_norm": 0.4030281878578675, + "learning_rate": 2.3983218597237924e-06, + "loss": 0.3834, + "step": 5748 + }, + { + "epoch": 1.3684179211043017, + "grad_norm": 0.3690031365226529, + "learning_rate": 2.3966759613960796e-06, + "loss": 0.3083, + "step": 5749 + }, + { + "epoch": 1.3686559171773665, + "grad_norm": 0.38052926649872104, + "learning_rate": 2.3950304499867237e-06, + "loss": 0.2933, + "step": 5750 + }, + { + "epoch": 1.3688939132504314, + "grad_norm": 0.3970644105901736, + "learning_rate": 2.393385325740287e-06, + "loss": 0.3469, + "step": 5751 + }, + { + "epoch": 1.3691319093234962, + "grad_norm": 0.38735698530577334, + "learning_rate": 2.391740588901276e-06, + "loss": 0.4109, + "step": 5752 + }, + { + "epoch": 1.369369905396561, + "grad_norm": 0.37818783783560916, + "learning_rate": 2.3900962397141375e-06, + "loss": 0.2838, + "step": 5753 + }, + { + "epoch": 1.3696079014696259, + "grad_norm": 0.3963292077588658, + "learning_rate": 2.3884522784232645e-06, + "loss": 0.2984, + "step": 5754 + }, + { + "epoch": 1.3698458975426906, + "grad_norm": 0.3783349832246415, + "learning_rate": 2.3868087052729893e-06, + "loss": 0.3633, + "step": 5755 + }, + { + "epoch": 1.3700838936157553, + "grad_norm": 0.3663177261232768, + "learning_rate": 2.385165520507588e-06, + "loss": 0.357, + "step": 5756 + }, + { + "epoch": 1.37032188968882, + "grad_norm": 0.40376407001395825, + "learning_rate": 2.383522724371278e-06, + "loss": 0.3001, + "step": 5757 + }, + { + "epoch": 1.3705598857618848, + "grad_norm": 0.3888540867491295, + "learning_rate": 2.38188031710822e-06, + "loss": 0.3109, + "step": 5758 + }, + { + "epoch": 1.3707978818349498, + "grad_norm": 0.3823516378443356, + "learning_rate": 2.3802382989625156e-06, + "loss": 0.374, + "step": 5759 + }, + { + "epoch": 1.3710358779080145, + "grad_norm": 0.38773823298396787, + "learning_rate": 2.3785966701782098e-06, + "loss": 0.3037, + "step": 5760 + }, + { + "epoch": 1.3712738739810792, + "grad_norm": 0.4285193328439413, + "learning_rate": 2.3769554309992894e-06, + "loss": 0.2735, + "step": 5761 + }, + { + "epoch": 1.3715118700541442, + "grad_norm": 0.36656643054155086, + "learning_rate": 2.3753145816696827e-06, + "loss": 0.3558, + "step": 5762 + }, + { + "epoch": 1.371749866127209, + "grad_norm": 0.3731902128515065, + "learning_rate": 2.3736741224332592e-06, + "loss": 0.3605, + "step": 5763 + }, + { + "epoch": 1.3719878622002737, + "grad_norm": 0.3743570952391234, + "learning_rate": 2.372034053533835e-06, + "loss": 0.279, + "step": 5764 + }, + { + "epoch": 1.3722258582733384, + "grad_norm": 0.36798469228190406, + "learning_rate": 2.370394375215163e-06, + "loss": 0.3228, + "step": 5765 + }, + { + "epoch": 1.3724638543464032, + "grad_norm": 0.40727564860454385, + "learning_rate": 2.36875508772094e-06, + "loss": 0.3691, + "step": 5766 + }, + { + "epoch": 1.3727018504194681, + "grad_norm": 0.379396729169711, + "learning_rate": 2.367116191294806e-06, + "loss": 0.3249, + "step": 5767 + }, + { + "epoch": 1.3729398464925329, + "grad_norm": 0.4026927710670588, + "learning_rate": 2.365477686180339e-06, + "loss": 0.2866, + "step": 5768 + }, + { + "epoch": 1.3731778425655976, + "grad_norm": 0.37305525839630027, + "learning_rate": 2.3638395726210633e-06, + "loss": 0.3318, + "step": 5769 + }, + { + "epoch": 1.3734158386386626, + "grad_norm": 0.39186058761949977, + "learning_rate": 2.362201850860443e-06, + "loss": 0.3883, + "step": 5770 + }, + { + "epoch": 1.3736538347117273, + "grad_norm": 0.3880251678929161, + "learning_rate": 2.3605645211418817e-06, + "loss": 0.3171, + "step": 5771 + }, + { + "epoch": 1.373891830784792, + "grad_norm": 0.38767893993417474, + "learning_rate": 2.35892758370873e-06, + "loss": 0.3205, + "step": 5772 + }, + { + "epoch": 1.3741298268578568, + "grad_norm": 0.40297474576829545, + "learning_rate": 2.357291038804277e-06, + "loss": 0.3814, + "step": 5773 + }, + { + "epoch": 1.3743678229309215, + "grad_norm": 0.3831088465518714, + "learning_rate": 2.3556548866717532e-06, + "loss": 0.3225, + "step": 5774 + }, + { + "epoch": 1.3746058190039865, + "grad_norm": 0.4134284378393506, + "learning_rate": 2.3540191275543313e-06, + "loss": 0.2801, + "step": 5775 + }, + { + "epoch": 1.3748438150770512, + "grad_norm": 0.42059665325132306, + "learning_rate": 2.352383761695125e-06, + "loss": 0.3214, + "step": 5776 + }, + { + "epoch": 1.375081811150116, + "grad_norm": 0.39143587678806474, + "learning_rate": 2.350748789337189e-06, + "loss": 0.3738, + "step": 5777 + }, + { + "epoch": 1.375319807223181, + "grad_norm": 0.349803339437346, + "learning_rate": 2.349114210723524e-06, + "loss": 0.2689, + "step": 5778 + }, + { + "epoch": 1.3755578032962457, + "grad_norm": 0.3951646071549478, + "learning_rate": 2.3474800260970663e-06, + "loss": 0.3007, + "step": 5779 + }, + { + "epoch": 1.3757957993693104, + "grad_norm": 0.3898501872812015, + "learning_rate": 2.345846235700698e-06, + "loss": 0.3639, + "step": 5780 + }, + { + "epoch": 1.3760337954423751, + "grad_norm": 0.38912512748823874, + "learning_rate": 2.3442128397772396e-06, + "loss": 0.3618, + "step": 5781 + }, + { + "epoch": 1.3762717915154399, + "grad_norm": 0.38556905651285667, + "learning_rate": 2.3425798385694536e-06, + "loss": 0.26, + "step": 5782 + }, + { + "epoch": 1.3765097875885048, + "grad_norm": 0.4281982976966381, + "learning_rate": 2.3409472323200456e-06, + "loss": 0.3449, + "step": 5783 + }, + { + "epoch": 1.3767477836615696, + "grad_norm": 0.39046675152439464, + "learning_rate": 2.3393150212716604e-06, + "loss": 0.3875, + "step": 5784 + }, + { + "epoch": 1.3769857797346343, + "grad_norm": 0.3544336457158023, + "learning_rate": 2.337683205666885e-06, + "loss": 0.3233, + "step": 5785 + }, + { + "epoch": 1.3772237758076993, + "grad_norm": 0.36474388790592926, + "learning_rate": 2.336051785748248e-06, + "loss": 0.285, + "step": 5786 + }, + { + "epoch": 1.377461771880764, + "grad_norm": 0.3585335598025468, + "learning_rate": 2.334420761758219e-06, + "loss": 0.3508, + "step": 5787 + }, + { + "epoch": 1.3776997679538288, + "grad_norm": 0.395098339023238, + "learning_rate": 2.332790133939207e-06, + "loss": 0.4056, + "step": 5788 + }, + { + "epoch": 1.3779377640268935, + "grad_norm": 0.3916655159612242, + "learning_rate": 2.3311599025335654e-06, + "loss": 0.3117, + "step": 5789 + }, + { + "epoch": 1.3781757600999582, + "grad_norm": 0.39352579920052977, + "learning_rate": 2.3295300677835857e-06, + "loss": 0.3329, + "step": 5790 + }, + { + "epoch": 1.3784137561730232, + "grad_norm": 0.3949421237210347, + "learning_rate": 2.327900629931501e-06, + "loss": 0.4006, + "step": 5791 + }, + { + "epoch": 1.378651752246088, + "grad_norm": 0.3930410571926244, + "learning_rate": 2.3262715892194885e-06, + "loss": 0.3191, + "step": 5792 + }, + { + "epoch": 1.3788897483191527, + "grad_norm": 0.3767521685835201, + "learning_rate": 2.3246429458896637e-06, + "loss": 0.2756, + "step": 5793 + }, + { + "epoch": 1.3791277443922176, + "grad_norm": 0.4252278529678899, + "learning_rate": 2.3230147001840814e-06, + "loss": 0.3496, + "step": 5794 + }, + { + "epoch": 1.3793657404652824, + "grad_norm": 0.40177327803867513, + "learning_rate": 2.3213868523447404e-06, + "loss": 0.3888, + "step": 5795 + }, + { + "epoch": 1.3796037365383471, + "grad_norm": 0.3991978634595426, + "learning_rate": 2.3197594026135785e-06, + "loss": 0.2992, + "step": 5796 + }, + { + "epoch": 1.3798417326114119, + "grad_norm": 0.3820122275069646, + "learning_rate": 2.3181323512324754e-06, + "loss": 0.3062, + "step": 5797 + }, + { + "epoch": 1.3800797286844766, + "grad_norm": 0.394013127202058, + "learning_rate": 2.3165056984432493e-06, + "loss": 0.3725, + "step": 5798 + }, + { + "epoch": 1.3803177247575416, + "grad_norm": 0.3865299916518418, + "learning_rate": 2.314879444487665e-06, + "loss": 0.308, + "step": 5799 + }, + { + "epoch": 1.3805557208306063, + "grad_norm": 0.3606035448903917, + "learning_rate": 2.3132535896074213e-06, + "loss": 0.2737, + "step": 5800 + }, + { + "epoch": 1.380793716903671, + "grad_norm": 0.39435874636986457, + "learning_rate": 2.3116281340441616e-06, + "loss": 0.3243, + "step": 5801 + }, + { + "epoch": 1.381031712976736, + "grad_norm": 0.3944881659871505, + "learning_rate": 2.310003078039468e-06, + "loss": 0.3845, + "step": 5802 + }, + { + "epoch": 1.3812697090498007, + "grad_norm": 0.3739664004902333, + "learning_rate": 2.3083784218348646e-06, + "loss": 0.3068, + "step": 5803 + }, + { + "epoch": 1.3815077051228655, + "grad_norm": 0.3784435268416916, + "learning_rate": 2.3067541656718133e-06, + "loss": 0.2978, + "step": 5804 + }, + { + "epoch": 1.3817457011959302, + "grad_norm": 0.35412487368158974, + "learning_rate": 2.305130309791723e-06, + "loss": 0.3379, + "step": 5805 + }, + { + "epoch": 1.381983697268995, + "grad_norm": 0.408945279084268, + "learning_rate": 2.303506854435936e-06, + "loss": 0.3552, + "step": 5806 + }, + { + "epoch": 1.38222169334206, + "grad_norm": 0.3753005006752039, + "learning_rate": 2.3018837998457384e-06, + "loss": 0.2929, + "step": 5807 + }, + { + "epoch": 1.3824596894151246, + "grad_norm": 0.36459640658547277, + "learning_rate": 2.3002611462623563e-06, + "loss": 0.3338, + "step": 5808 + }, + { + "epoch": 1.3826976854881894, + "grad_norm": 0.3665121988763295, + "learning_rate": 2.2986388939269567e-06, + "loss": 0.3665, + "step": 5809 + }, + { + "epoch": 1.3829356815612543, + "grad_norm": 0.3517995162869186, + "learning_rate": 2.2970170430806455e-06, + "loss": 0.2952, + "step": 5810 + }, + { + "epoch": 1.383173677634319, + "grad_norm": 0.39818503253324855, + "learning_rate": 2.29539559396447e-06, + "loss": 0.2816, + "step": 5811 + }, + { + "epoch": 1.3834116737073838, + "grad_norm": 0.3883965107678621, + "learning_rate": 2.2937745468194186e-06, + "loss": 0.3767, + "step": 5812 + }, + { + "epoch": 1.3836496697804486, + "grad_norm": 0.34808683822000047, + "learning_rate": 2.2921539018864177e-06, + "loss": 0.3601, + "step": 5813 + }, + { + "epoch": 1.3838876658535133, + "grad_norm": 0.3785593295383857, + "learning_rate": 2.290533659406336e-06, + "loss": 0.3008, + "step": 5814 + }, + { + "epoch": 1.3841256619265783, + "grad_norm": 0.38995875252493734, + "learning_rate": 2.2889138196199816e-06, + "loss": 0.3139, + "step": 5815 + }, + { + "epoch": 1.384363657999643, + "grad_norm": 0.37852442117564816, + "learning_rate": 2.287294382768103e-06, + "loss": 0.3529, + "step": 5816 + }, + { + "epoch": 1.3846016540727077, + "grad_norm": 0.37788122292949944, + "learning_rate": 2.285675349091388e-06, + "loss": 0.3376, + "step": 5817 + }, + { + "epoch": 1.3848396501457727, + "grad_norm": 0.42130823797471484, + "learning_rate": 2.284056718830463e-06, + "loss": 0.2815, + "step": 5818 + }, + { + "epoch": 1.3850776462188374, + "grad_norm": 0.4103925080704719, + "learning_rate": 2.2824384922259005e-06, + "loss": 0.3086, + "step": 5819 + }, + { + "epoch": 1.3853156422919022, + "grad_norm": 0.42671753513437, + "learning_rate": 2.280820669518208e-06, + "loss": 0.4045, + "step": 5820 + }, + { + "epoch": 1.385553638364967, + "grad_norm": 0.39262412594080603, + "learning_rate": 2.2792032509478335e-06, + "loss": 0.2985, + "step": 5821 + }, + { + "epoch": 1.3857916344380317, + "grad_norm": 0.3889466132697115, + "learning_rate": 2.2775862367551642e-06, + "loss": 0.3125, + "step": 5822 + }, + { + "epoch": 1.3860296305110966, + "grad_norm": 0.3873197747905789, + "learning_rate": 2.2759696271805298e-06, + "loss": 0.3508, + "step": 5823 + }, + { + "epoch": 1.3862676265841614, + "grad_norm": 0.38959234217638355, + "learning_rate": 2.274353422464198e-06, + "loss": 0.327, + "step": 5824 + }, + { + "epoch": 1.386505622657226, + "grad_norm": 0.49119856662683686, + "learning_rate": 2.2727376228463754e-06, + "loss": 0.2518, + "step": 5825 + }, + { + "epoch": 1.386743618730291, + "grad_norm": 0.3592727494766306, + "learning_rate": 2.271122228567213e-06, + "loss": 0.3103, + "step": 5826 + }, + { + "epoch": 1.3869816148033558, + "grad_norm": 0.40339073061591146, + "learning_rate": 2.2695072398667965e-06, + "loss": 0.4171, + "step": 5827 + }, + { + "epoch": 1.3872196108764205, + "grad_norm": 0.35219893549707026, + "learning_rate": 2.2678926569851535e-06, + "loss": 0.3219, + "step": 5828 + }, + { + "epoch": 1.3874576069494853, + "grad_norm": 0.3964161658508986, + "learning_rate": 2.26627848016225e-06, + "loss": 0.2777, + "step": 5829 + }, + { + "epoch": 1.38769560302255, + "grad_norm": 0.38942352677332387, + "learning_rate": 2.2646647096379944e-06, + "loss": 0.3602, + "step": 5830 + }, + { + "epoch": 1.387933599095615, + "grad_norm": 0.42284500435613087, + "learning_rate": 2.2630513456522315e-06, + "loss": 0.3794, + "step": 5831 + }, + { + "epoch": 1.3881715951686797, + "grad_norm": 0.3975269422746937, + "learning_rate": 2.261438388444745e-06, + "loss": 0.2931, + "step": 5832 + }, + { + "epoch": 1.3884095912417445, + "grad_norm": 0.3911429557797404, + "learning_rate": 2.259825838255265e-06, + "loss": 0.3317, + "step": 5833 + }, + { + "epoch": 1.3886475873148094, + "grad_norm": 0.40487348869777584, + "learning_rate": 2.2582136953234544e-06, + "loss": 0.3936, + "step": 5834 + }, + { + "epoch": 1.3888855833878742, + "grad_norm": 0.3675010804576965, + "learning_rate": 2.2566019598889168e-06, + "loss": 0.301, + "step": 5835 + }, + { + "epoch": 1.389123579460939, + "grad_norm": 0.3755478510541879, + "learning_rate": 2.254990632191197e-06, + "loss": 0.2646, + "step": 5836 + }, + { + "epoch": 1.3893615755340036, + "grad_norm": 0.3717516657625129, + "learning_rate": 2.2533797124697763e-06, + "loss": 0.3402, + "step": 5837 + }, + { + "epoch": 1.3895995716070684, + "grad_norm": 0.37873677832015795, + "learning_rate": 2.2517692009640796e-06, + "loss": 0.3532, + "step": 5838 + }, + { + "epoch": 1.3898375676801333, + "grad_norm": 0.3812765678125732, + "learning_rate": 2.250159097913467e-06, + "loss": 0.2933, + "step": 5839 + }, + { + "epoch": 1.390075563753198, + "grad_norm": 0.3833578003169235, + "learning_rate": 2.2485494035572404e-06, + "loss": 0.2927, + "step": 5840 + }, + { + "epoch": 1.3903135598262628, + "grad_norm": 0.37497476870990365, + "learning_rate": 2.24694011813464e-06, + "loss": 0.3708, + "step": 5841 + }, + { + "epoch": 1.3905515558993278, + "grad_norm": 0.35418256159670797, + "learning_rate": 2.2453312418848454e-06, + "loss": 0.318, + "step": 5842 + }, + { + "epoch": 1.3907895519723925, + "grad_norm": 0.3892901098428216, + "learning_rate": 2.243722775046976e-06, + "loss": 0.3176, + "step": 5843 + }, + { + "epoch": 1.3910275480454573, + "grad_norm": 0.37876686309642, + "learning_rate": 2.2421147178600883e-06, + "loss": 0.2907, + "step": 5844 + }, + { + "epoch": 1.391265544118522, + "grad_norm": 0.3847753143788955, + "learning_rate": 2.2405070705631808e-06, + "loss": 0.3906, + "step": 5845 + }, + { + "epoch": 1.3915035401915867, + "grad_norm": 0.3638169205713526, + "learning_rate": 2.238899833395188e-06, + "loss": 0.2749, + "step": 5846 + }, + { + "epoch": 1.3917415362646517, + "grad_norm": 0.40407484545756533, + "learning_rate": 2.2372930065949873e-06, + "loss": 0.3018, + "step": 5847 + }, + { + "epoch": 1.3919795323377164, + "grad_norm": 0.37766478316846785, + "learning_rate": 2.235686590401392e-06, + "loss": 0.3687, + "step": 5848 + }, + { + "epoch": 1.3922175284107812, + "grad_norm": 0.34719174148467286, + "learning_rate": 2.234080585053155e-06, + "loss": 0.3441, + "step": 5849 + }, + { + "epoch": 1.3924555244838461, + "grad_norm": 0.37713121646339276, + "learning_rate": 2.232474990788969e-06, + "loss": 0.2669, + "step": 5850 + }, + { + "epoch": 1.3926935205569109, + "grad_norm": 0.40478233582795153, + "learning_rate": 2.2308698078474645e-06, + "loss": 0.3362, + "step": 5851 + }, + { + "epoch": 1.3929315166299756, + "grad_norm": 0.4118362847556917, + "learning_rate": 2.2292650364672096e-06, + "loss": 0.3946, + "step": 5852 + }, + { + "epoch": 1.3931695127030403, + "grad_norm": 0.3558802597442529, + "learning_rate": 2.2276606768867172e-06, + "loss": 0.2805, + "step": 5853 + }, + { + "epoch": 1.393407508776105, + "grad_norm": 0.4691025727643871, + "learning_rate": 2.226056729344432e-06, + "loss": 0.2816, + "step": 5854 + }, + { + "epoch": 1.39364550484917, + "grad_norm": 0.4030556981141061, + "learning_rate": 2.2244531940787413e-06, + "loss": 0.3574, + "step": 5855 + }, + { + "epoch": 1.3938835009222348, + "grad_norm": 0.3809802135059961, + "learning_rate": 2.22285007132797e-06, + "loss": 0.3576, + "step": 5856 + }, + { + "epoch": 1.3941214969952995, + "grad_norm": 0.35544990461765996, + "learning_rate": 2.2212473613303807e-06, + "loss": 0.2642, + "step": 5857 + }, + { + "epoch": 1.3943594930683645, + "grad_norm": 0.3692152888228544, + "learning_rate": 2.2196450643241768e-06, + "loss": 0.3212, + "step": 5858 + }, + { + "epoch": 1.3945974891414292, + "grad_norm": 0.4034006295018707, + "learning_rate": 2.218043180547499e-06, + "loss": 0.3548, + "step": 5859 + }, + { + "epoch": 1.394835485214494, + "grad_norm": 0.3750782883132117, + "learning_rate": 2.216441710238425e-06, + "loss": 0.2913, + "step": 5860 + }, + { + "epoch": 1.3950734812875587, + "grad_norm": 0.4040307357444328, + "learning_rate": 2.214840653634977e-06, + "loss": 0.2816, + "step": 5861 + }, + { + "epoch": 1.3953114773606234, + "grad_norm": 0.3673705781593482, + "learning_rate": 2.213240010975109e-06, + "loss": 0.3495, + "step": 5862 + }, + { + "epoch": 1.3955494734336884, + "grad_norm": 0.3704834222648198, + "learning_rate": 2.211639782496717e-06, + "loss": 0.3757, + "step": 5863 + }, + { + "epoch": 1.3957874695067531, + "grad_norm": 0.37384507283033885, + "learning_rate": 2.2100399684376333e-06, + "loss": 0.2976, + "step": 5864 + }, + { + "epoch": 1.3960254655798179, + "grad_norm": 0.38977052889591346, + "learning_rate": 2.208440569035631e-06, + "loss": 0.3131, + "step": 5865 + }, + { + "epoch": 1.3962634616528828, + "grad_norm": 0.3803082135668662, + "learning_rate": 2.2068415845284197e-06, + "loss": 0.3629, + "step": 5866 + }, + { + "epoch": 1.3965014577259476, + "grad_norm": 0.3428155698284938, + "learning_rate": 2.2052430151536488e-06, + "loss": 0.306, + "step": 5867 + }, + { + "epoch": 1.3967394537990123, + "grad_norm": 0.4200376862637538, + "learning_rate": 2.203644861148904e-06, + "loss": 0.3033, + "step": 5868 + }, + { + "epoch": 1.396977449872077, + "grad_norm": 0.41707618321368245, + "learning_rate": 2.202047122751712e-06, + "loss": 0.3557, + "step": 5869 + }, + { + "epoch": 1.3972154459451418, + "grad_norm": 0.39243821595567624, + "learning_rate": 2.2004498001995355e-06, + "loss": 0.3608, + "step": 5870 + }, + { + "epoch": 1.3974534420182068, + "grad_norm": 0.3533215441822008, + "learning_rate": 2.1988528937297764e-06, + "loss": 0.2832, + "step": 5871 + }, + { + "epoch": 1.3976914380912715, + "grad_norm": 0.37292525643353835, + "learning_rate": 2.1972564035797738e-06, + "loss": 0.313, + "step": 5872 + }, + { + "epoch": 1.3979294341643362, + "grad_norm": 0.41160282830855566, + "learning_rate": 2.1956603299868052e-06, + "loss": 0.3649, + "step": 5873 + }, + { + "epoch": 1.3981674302374012, + "grad_norm": 0.3789548998028757, + "learning_rate": 2.1940646731880887e-06, + "loss": 0.3277, + "step": 5874 + }, + { + "epoch": 1.398405426310466, + "grad_norm": 0.40108158442899056, + "learning_rate": 2.1924694334207773e-06, + "loss": 0.2789, + "step": 5875 + }, + { + "epoch": 1.3986434223835307, + "grad_norm": 0.38479323451240566, + "learning_rate": 2.1908746109219633e-06, + "loss": 0.3232, + "step": 5876 + }, + { + "epoch": 1.3988814184565954, + "grad_norm": 0.39277475179443505, + "learning_rate": 2.189280205928676e-06, + "loss": 0.4036, + "step": 5877 + }, + { + "epoch": 1.3991194145296602, + "grad_norm": 0.3629626558888327, + "learning_rate": 2.1876862186778847e-06, + "loss": 0.292, + "step": 5878 + }, + { + "epoch": 1.3993574106027251, + "grad_norm": 0.39386803443255086, + "learning_rate": 2.186092649406492e-06, + "loss": 0.2813, + "step": 5879 + }, + { + "epoch": 1.3995954066757899, + "grad_norm": 0.4045307535442236, + "learning_rate": 2.184499498351347e-06, + "loss": 0.3579, + "step": 5880 + }, + { + "epoch": 1.3998334027488546, + "grad_norm": 0.38562044486044106, + "learning_rate": 2.182906765749228e-06, + "loss": 0.3259, + "step": 5881 + }, + { + "epoch": 1.4000713988219196, + "grad_norm": 0.3722054840630449, + "learning_rate": 2.1813144518368556e-06, + "loss": 0.2682, + "step": 5882 + }, + { + "epoch": 1.4003093948949843, + "grad_norm": 0.4853537819335595, + "learning_rate": 2.1797225568508863e-06, + "loss": 0.3586, + "step": 5883 + }, + { + "epoch": 1.400547390968049, + "grad_norm": 0.3917377497499103, + "learning_rate": 2.1781310810279156e-06, + "loss": 0.3889, + "step": 5884 + }, + { + "epoch": 1.4007853870411138, + "grad_norm": 0.3820969906527712, + "learning_rate": 2.1765400246044755e-06, + "loss": 0.286, + "step": 5885 + }, + { + "epoch": 1.4010233831141785, + "grad_norm": 0.3901672408734566, + "learning_rate": 2.1749493878170368e-06, + "loss": 0.268, + "step": 5886 + }, + { + "epoch": 1.4012613791872435, + "grad_norm": 0.5012229103908235, + "learning_rate": 2.173359170902006e-06, + "loss": 0.3502, + "step": 5887 + }, + { + "epoch": 1.4014993752603082, + "grad_norm": 0.4163979879875083, + "learning_rate": 2.171769374095732e-06, + "loss": 0.3584, + "step": 5888 + }, + { + "epoch": 1.401737371333373, + "grad_norm": 0.3825230668134233, + "learning_rate": 2.1701799976344956e-06, + "loss": 0.2952, + "step": 5889 + }, + { + "epoch": 1.401975367406438, + "grad_norm": 0.3692912522268186, + "learning_rate": 2.168591041754518e-06, + "loss": 0.3206, + "step": 5890 + }, + { + "epoch": 1.4022133634795027, + "grad_norm": 0.4020957675464499, + "learning_rate": 2.1670025066919575e-06, + "loss": 0.3645, + "step": 5891 + }, + { + "epoch": 1.4024513595525674, + "grad_norm": 0.3652682950372471, + "learning_rate": 2.1654143926829095e-06, + "loss": 0.3088, + "step": 5892 + }, + { + "epoch": 1.4026893556256321, + "grad_norm": 0.39357600720054553, + "learning_rate": 2.163826699963407e-06, + "loss": 0.3002, + "step": 5893 + }, + { + "epoch": 1.4029273516986969, + "grad_norm": 0.39176835973179347, + "learning_rate": 2.1622394287694203e-06, + "loss": 0.3524, + "step": 5894 + }, + { + "epoch": 1.4031653477717618, + "grad_norm": 0.4030630624539821, + "learning_rate": 2.1606525793368578e-06, + "loss": 0.4, + "step": 5895 + }, + { + "epoch": 1.4034033438448266, + "grad_norm": 0.3581093728731858, + "learning_rate": 2.159066151901563e-06, + "loss": 0.3042, + "step": 5896 + }, + { + "epoch": 1.4036413399178913, + "grad_norm": 0.3958734156159978, + "learning_rate": 2.1574801466993204e-06, + "loss": 0.2925, + "step": 5897 + }, + { + "epoch": 1.4038793359909563, + "grad_norm": 0.44017456212239137, + "learning_rate": 2.155894563965848e-06, + "loss": 0.3305, + "step": 5898 + }, + { + "epoch": 1.404117332064021, + "grad_norm": 0.36546554037443685, + "learning_rate": 2.1543094039368034e-06, + "loss": 0.3465, + "step": 5899 + }, + { + "epoch": 1.4043553281370857, + "grad_norm": 0.4261135558150708, + "learning_rate": 2.15272466684778e-06, + "loss": 0.2733, + "step": 5900 + }, + { + "epoch": 1.4045933242101505, + "grad_norm": 0.42031143455545217, + "learning_rate": 2.151140352934308e-06, + "loss": 0.3398, + "step": 5901 + }, + { + "epoch": 1.4048313202832152, + "grad_norm": 0.3751387002028948, + "learning_rate": 2.149556462431859e-06, + "loss": 0.3863, + "step": 5902 + }, + { + "epoch": 1.4050693163562802, + "grad_norm": 0.3438466812731875, + "learning_rate": 2.1479729955758354e-06, + "loss": 0.3185, + "step": 5903 + }, + { + "epoch": 1.405307312429345, + "grad_norm": 0.3693116279880859, + "learning_rate": 2.146389952601581e-06, + "loss": 0.2928, + "step": 5904 + }, + { + "epoch": 1.4055453085024097, + "grad_norm": 0.39195086077989794, + "learning_rate": 2.1448073337443743e-06, + "loss": 0.3798, + "step": 5905 + }, + { + "epoch": 1.4057833045754746, + "grad_norm": 0.379901101442375, + "learning_rate": 2.1432251392394303e-06, + "loss": 0.3315, + "step": 5906 + }, + { + "epoch": 1.4060213006485394, + "grad_norm": 0.3827424931752997, + "learning_rate": 2.141643369321905e-06, + "loss": 0.2723, + "step": 5907 + }, + { + "epoch": 1.406259296721604, + "grad_norm": 0.3816989292493151, + "learning_rate": 2.1400620242268883e-06, + "loss": 0.3374, + "step": 5908 + }, + { + "epoch": 1.4064972927946688, + "grad_norm": 0.38230399719450614, + "learning_rate": 2.1384811041894055e-06, + "loss": 0.387, + "step": 5909 + }, + { + "epoch": 1.4067352888677336, + "grad_norm": 0.3725751003550838, + "learning_rate": 2.1369006094444215e-06, + "loss": 0.2997, + "step": 5910 + }, + { + "epoch": 1.4069732849407985, + "grad_norm": 0.385430704556038, + "learning_rate": 2.1353205402268368e-06, + "loss": 0.3085, + "step": 5911 + }, + { + "epoch": 1.4072112810138633, + "grad_norm": 0.376139109287831, + "learning_rate": 2.1337408967714883e-06, + "loss": 0.3395, + "step": 5912 + }, + { + "epoch": 1.407449277086928, + "grad_norm": 0.3715517206789897, + "learning_rate": 2.1321616793131507e-06, + "loss": 0.3763, + "step": 5913 + }, + { + "epoch": 1.407687273159993, + "grad_norm": 0.39429770665412, + "learning_rate": 2.130582888086534e-06, + "loss": 0.2859, + "step": 5914 + }, + { + "epoch": 1.4079252692330577, + "grad_norm": 0.3733749256636019, + "learning_rate": 2.129004523326284e-06, + "loss": 0.2936, + "step": 5915 + }, + { + "epoch": 1.4081632653061225, + "grad_norm": 0.8974813867363939, + "learning_rate": 2.1274265852669894e-06, + "loss": 0.3692, + "step": 5916 + }, + { + "epoch": 1.4084012613791872, + "grad_norm": 0.3681033154441353, + "learning_rate": 2.125849074143168e-06, + "loss": 0.3132, + "step": 5917 + }, + { + "epoch": 1.408639257452252, + "grad_norm": 0.3871816903644566, + "learning_rate": 2.124271990189277e-06, + "loss": 0.2816, + "step": 5918 + }, + { + "epoch": 1.408877253525317, + "grad_norm": 0.3733497998587013, + "learning_rate": 2.1226953336397105e-06, + "loss": 0.3519, + "step": 5919 + }, + { + "epoch": 1.4091152495983816, + "grad_norm": 0.3581335746536029, + "learning_rate": 2.1211191047287988e-06, + "loss": 0.3848, + "step": 5920 + }, + { + "epoch": 1.4093532456714464, + "grad_norm": 0.35647096310686627, + "learning_rate": 2.119543303690808e-06, + "loss": 0.2912, + "step": 5921 + }, + { + "epoch": 1.4095912417445113, + "grad_norm": 0.3783590852785142, + "learning_rate": 2.117967930759941e-06, + "loss": 0.3103, + "step": 5922 + }, + { + "epoch": 1.409829237817576, + "grad_norm": 0.39200620885506376, + "learning_rate": 2.1163929861703383e-06, + "loss": 0.3547, + "step": 5923 + }, + { + "epoch": 1.4100672338906408, + "grad_norm": 0.3607566196572999, + "learning_rate": 2.1148184701560742e-06, + "loss": 0.3402, + "step": 5924 + }, + { + "epoch": 1.4103052299637056, + "grad_norm": 0.36895223669749566, + "learning_rate": 2.113244382951162e-06, + "loss": 0.275, + "step": 5925 + }, + { + "epoch": 1.4105432260367703, + "grad_norm": 0.3459412598350246, + "learning_rate": 2.1116707247895484e-06, + "loss": 0.3434, + "step": 5926 + }, + { + "epoch": 1.4107812221098353, + "grad_norm": 0.3767275620938957, + "learning_rate": 2.1100974959051198e-06, + "loss": 0.38, + "step": 5927 + }, + { + "epoch": 1.4110192181829, + "grad_norm": 0.39657367634889606, + "learning_rate": 2.1085246965316936e-06, + "loss": 0.3327, + "step": 5928 + }, + { + "epoch": 1.4112572142559647, + "grad_norm": 0.39804610232153587, + "learning_rate": 2.106952326903031e-06, + "loss": 0.2669, + "step": 5929 + }, + { + "epoch": 1.4114952103290297, + "grad_norm": 0.42512925166801646, + "learning_rate": 2.105380387252824e-06, + "loss": 0.3799, + "step": 5930 + }, + { + "epoch": 1.4117332064020944, + "grad_norm": 0.367576966472368, + "learning_rate": 2.1038088778147004e-06, + "loss": 0.3648, + "step": 5931 + }, + { + "epoch": 1.4119712024751592, + "grad_norm": 0.4071023704661706, + "learning_rate": 2.1022377988222255e-06, + "loss": 0.2493, + "step": 5932 + }, + { + "epoch": 1.412209198548224, + "grad_norm": 0.3517027129740413, + "learning_rate": 2.100667150508899e-06, + "loss": 0.3263, + "step": 5933 + }, + { + "epoch": 1.4124471946212886, + "grad_norm": 0.3630304666957127, + "learning_rate": 2.099096933108163e-06, + "loss": 0.3633, + "step": 5934 + }, + { + "epoch": 1.4126851906943536, + "grad_norm": 0.36827148716226255, + "learning_rate": 2.0975271468533864e-06, + "loss": 0.3073, + "step": 5935 + }, + { + "epoch": 1.4129231867674183, + "grad_norm": 0.39577401190423284, + "learning_rate": 2.0959577919778803e-06, + "loss": 0.2861, + "step": 5936 + }, + { + "epoch": 1.413161182840483, + "grad_norm": 0.40173519198012886, + "learning_rate": 2.0943888687148883e-06, + "loss": 0.3431, + "step": 5937 + }, + { + "epoch": 1.413399178913548, + "grad_norm": 0.3992484721549741, + "learning_rate": 2.0928203772975917e-06, + "loss": 0.3809, + "step": 5938 + }, + { + "epoch": 1.4136371749866128, + "grad_norm": 0.39599927216720754, + "learning_rate": 2.0912523179591076e-06, + "loss": 0.3008, + "step": 5939 + }, + { + "epoch": 1.4138751710596775, + "grad_norm": 0.4242280603241625, + "learning_rate": 2.0896846909324874e-06, + "loss": 0.3229, + "step": 5940 + }, + { + "epoch": 1.4141131671327423, + "grad_norm": 0.3664045587336667, + "learning_rate": 2.0881174964507205e-06, + "loss": 0.4031, + "step": 5941 + }, + { + "epoch": 1.414351163205807, + "grad_norm": 0.36607982694161056, + "learning_rate": 2.086550734746728e-06, + "loss": 0.3328, + "step": 5942 + }, + { + "epoch": 1.414589159278872, + "grad_norm": 0.3924516864004525, + "learning_rate": 2.0849844060533736e-06, + "loss": 0.2976, + "step": 5943 + }, + { + "epoch": 1.4148271553519367, + "grad_norm": 0.4057578371930518, + "learning_rate": 2.0834185106034503e-06, + "loss": 0.3015, + "step": 5944 + }, + { + "epoch": 1.4150651514250014, + "grad_norm": 0.3805032627968153, + "learning_rate": 2.081853048629689e-06, + "loss": 0.3876, + "step": 5945 + }, + { + "epoch": 1.4153031474980664, + "grad_norm": 0.3698394601043139, + "learning_rate": 2.0802880203647565e-06, + "loss": 0.2778, + "step": 5946 + }, + { + "epoch": 1.4155411435711311, + "grad_norm": 0.3828459136639372, + "learning_rate": 2.078723426041254e-06, + "loss": 0.3119, + "step": 5947 + }, + { + "epoch": 1.4157791396441959, + "grad_norm": 0.38957935834327895, + "learning_rate": 2.0771592658917196e-06, + "loss": 0.3794, + "step": 5948 + }, + { + "epoch": 1.4160171357172606, + "grad_norm": 0.3851845261585521, + "learning_rate": 2.0755955401486255e-06, + "loss": 0.3443, + "step": 5949 + }, + { + "epoch": 1.4162551317903254, + "grad_norm": 0.3546131940340063, + "learning_rate": 2.0740322490443802e-06, + "loss": 0.3024, + "step": 5950 + }, + { + "epoch": 1.4164931278633903, + "grad_norm": 0.4166580686412631, + "learning_rate": 2.072469392811329e-06, + "loss": 0.3346, + "step": 5951 + }, + { + "epoch": 1.416731123936455, + "grad_norm": 0.40477461909600554, + "learning_rate": 2.070906971681748e-06, + "loss": 0.3986, + "step": 5952 + }, + { + "epoch": 1.4169691200095198, + "grad_norm": 0.3863431875261796, + "learning_rate": 2.0693449858878543e-06, + "loss": 0.2625, + "step": 5953 + }, + { + "epoch": 1.4172071160825848, + "grad_norm": 0.42378362283530085, + "learning_rate": 2.0677834356617967e-06, + "loss": 0.2718, + "step": 5954 + }, + { + "epoch": 1.4174451121556495, + "grad_norm": 0.37534140974542646, + "learning_rate": 2.066222321235659e-06, + "loss": 0.3537, + "step": 5955 + }, + { + "epoch": 1.4176831082287142, + "grad_norm": 0.3421669011162348, + "learning_rate": 2.064661642841462e-06, + "loss": 0.3523, + "step": 5956 + }, + { + "epoch": 1.417921104301779, + "grad_norm": 0.37943922157081356, + "learning_rate": 2.0631014007111627e-06, + "loss": 0.2818, + "step": 5957 + }, + { + "epoch": 1.4181591003748437, + "grad_norm": 0.37246301648402214, + "learning_rate": 2.0615415950766504e-06, + "loss": 0.3343, + "step": 5958 + }, + { + "epoch": 1.4183970964479087, + "grad_norm": 0.38400127878264756, + "learning_rate": 2.0599822261697516e-06, + "loss": 0.3702, + "step": 5959 + }, + { + "epoch": 1.4186350925209734, + "grad_norm": 0.39219554424930037, + "learning_rate": 2.0584232942222247e-06, + "loss": 0.2835, + "step": 5960 + }, + { + "epoch": 1.4188730885940382, + "grad_norm": 0.39117355067041304, + "learning_rate": 2.056864799465769e-06, + "loss": 0.2966, + "step": 5961 + }, + { + "epoch": 1.4191110846671031, + "grad_norm": 0.39451400558685296, + "learning_rate": 2.055306742132014e-06, + "loss": 0.337, + "step": 5962 + }, + { + "epoch": 1.4193490807401679, + "grad_norm": 0.36635780928223005, + "learning_rate": 2.053749122452525e-06, + "loss": 0.362, + "step": 5963 + }, + { + "epoch": 1.4195870768132326, + "grad_norm": 0.3873092005359913, + "learning_rate": 2.052191940658803e-06, + "loss": 0.3079, + "step": 5964 + }, + { + "epoch": 1.4198250728862973, + "grad_norm": 0.3836880319475697, + "learning_rate": 2.050635196982284e-06, + "loss": 0.3435, + "step": 5965 + }, + { + "epoch": 1.420063068959362, + "grad_norm": 0.44296005657411364, + "learning_rate": 2.049078891654339e-06, + "loss": 0.37, + "step": 5966 + }, + { + "epoch": 1.420301065032427, + "grad_norm": 0.35016323703149027, + "learning_rate": 2.0475230249062727e-06, + "loss": 0.315, + "step": 5967 + }, + { + "epoch": 1.4205390611054918, + "grad_norm": 0.3950620054962618, + "learning_rate": 2.0459675969693256e-06, + "loss": 0.3173, + "step": 5968 + }, + { + "epoch": 1.4207770571785565, + "grad_norm": 0.3952176517038452, + "learning_rate": 2.044412608074672e-06, + "loss": 0.3234, + "step": 5969 + }, + { + "epoch": 1.4210150532516215, + "grad_norm": 0.38307975634819236, + "learning_rate": 2.042858058453422e-06, + "loss": 0.405, + "step": 5970 + }, + { + "epoch": 1.4212530493246862, + "grad_norm": 0.3364899399185918, + "learning_rate": 2.041303948336622e-06, + "loss": 0.3013, + "step": 5971 + }, + { + "epoch": 1.421491045397751, + "grad_norm": 0.38894109353307277, + "learning_rate": 2.0397502779552498e-06, + "loss": 0.2979, + "step": 5972 + }, + { + "epoch": 1.4217290414708157, + "grad_norm": 0.3510083872107239, + "learning_rate": 2.0381970475402196e-06, + "loss": 0.3435, + "step": 5973 + }, + { + "epoch": 1.4219670375438804, + "grad_norm": 0.3947818997277785, + "learning_rate": 2.0366442573223795e-06, + "loss": 0.3896, + "step": 5974 + }, + { + "epoch": 1.4222050336169454, + "grad_norm": 0.39395505014453025, + "learning_rate": 2.0350919075325124e-06, + "loss": 0.2891, + "step": 5975 + }, + { + "epoch": 1.4224430296900101, + "grad_norm": 0.3978783535558053, + "learning_rate": 2.0335399984013366e-06, + "loss": 0.3043, + "step": 5976 + }, + { + "epoch": 1.4226810257630749, + "grad_norm": 0.3682896039882474, + "learning_rate": 2.0319885301595034e-06, + "loss": 0.4002, + "step": 5977 + }, + { + "epoch": 1.4229190218361398, + "grad_norm": 0.37550375575321043, + "learning_rate": 2.0304375030375996e-06, + "loss": 0.2694, + "step": 5978 + }, + { + "epoch": 1.4231570179092046, + "grad_norm": 0.38790715040650325, + "learning_rate": 2.0288869172661463e-06, + "loss": 0.3062, + "step": 5979 + }, + { + "epoch": 1.4233950139822693, + "grad_norm": 0.39968408903014346, + "learning_rate": 2.0273367730755993e-06, + "loss": 0.3375, + "step": 5980 + }, + { + "epoch": 1.423633010055334, + "grad_norm": 0.3878895341440517, + "learning_rate": 2.025787070696348e-06, + "loss": 0.3548, + "step": 5981 + }, + { + "epoch": 1.4238710061283988, + "grad_norm": 0.39123351222051045, + "learning_rate": 2.0242378103587157e-06, + "loss": 0.2858, + "step": 5982 + }, + { + "epoch": 1.4241090022014637, + "grad_norm": 0.37665471025750874, + "learning_rate": 2.0226889922929603e-06, + "loss": 0.3093, + "step": 5983 + }, + { + "epoch": 1.4243469982745285, + "grad_norm": 0.39559868934355824, + "learning_rate": 2.0211406167292775e-06, + "loss": 0.3772, + "step": 5984 + }, + { + "epoch": 1.4245849943475932, + "grad_norm": 0.3851654558218671, + "learning_rate": 2.0195926838977926e-06, + "loss": 0.2878, + "step": 5985 + }, + { + "epoch": 1.4248229904206582, + "grad_norm": 0.37919297006292657, + "learning_rate": 2.018045194028567e-06, + "loss": 0.2562, + "step": 5986 + }, + { + "epoch": 1.425060986493723, + "grad_norm": 0.40761397021940626, + "learning_rate": 2.0164981473515926e-06, + "loss": 0.3191, + "step": 5987 + }, + { + "epoch": 1.4252989825667877, + "grad_norm": 0.4197443032408188, + "learning_rate": 2.014951544096804e-06, + "loss": 0.3657, + "step": 5988 + }, + { + "epoch": 1.4255369786398524, + "grad_norm": 0.37473340700791113, + "learning_rate": 2.013405384494063e-06, + "loss": 0.2811, + "step": 5989 + }, + { + "epoch": 1.4257749747129171, + "grad_norm": 0.40715204280059647, + "learning_rate": 2.0118596687731666e-06, + "loss": 0.3151, + "step": 5990 + }, + { + "epoch": 1.426012970785982, + "grad_norm": 0.3737319670986343, + "learning_rate": 2.0103143971638463e-06, + "loss": 0.3727, + "step": 5991 + }, + { + "epoch": 1.4262509668590468, + "grad_norm": 0.3698219377663196, + "learning_rate": 2.0087695698957676e-06, + "loss": 0.2946, + "step": 5992 + }, + { + "epoch": 1.4264889629321116, + "grad_norm": 0.37300313864826995, + "learning_rate": 2.0072251871985306e-06, + "loss": 0.2735, + "step": 5993 + }, + { + "epoch": 1.4267269590051765, + "grad_norm": 0.3944808909836751, + "learning_rate": 2.0056812493016684e-06, + "loss": 0.3603, + "step": 5994 + }, + { + "epoch": 1.4269649550782413, + "grad_norm": 0.3741150627911366, + "learning_rate": 2.0041377564346484e-06, + "loss": 0.3969, + "step": 5995 + }, + { + "epoch": 1.427202951151306, + "grad_norm": 0.36618783349147516, + "learning_rate": 2.0025947088268714e-06, + "loss": 0.3102, + "step": 5996 + }, + { + "epoch": 1.4274409472243708, + "grad_norm": 0.4085121504761507, + "learning_rate": 2.001052106707672e-06, + "loss": 0.3215, + "step": 5997 + }, + { + "epoch": 1.4276789432974355, + "grad_norm": 0.42374164985590546, + "learning_rate": 1.9995099503063214e-06, + "loss": 0.361, + "step": 5998 + }, + { + "epoch": 1.4279169393705005, + "grad_norm": 0.36596013003548744, + "learning_rate": 1.9979682398520205e-06, + "loss": 0.3197, + "step": 5999 + }, + { + "epoch": 1.4281549354435652, + "grad_norm": 0.38401848047343784, + "learning_rate": 1.9964269755739057e-06, + "loss": 0.2791, + "step": 6000 + }, + { + "epoch": 1.42839293151663, + "grad_norm": 0.3988267552673252, + "learning_rate": 1.9948861577010475e-06, + "loss": 0.313, + "step": 6001 + }, + { + "epoch": 1.428630927589695, + "grad_norm": 0.38693890781588514, + "learning_rate": 1.993345786462449e-06, + "loss": 0.391, + "step": 6002 + }, + { + "epoch": 1.4288689236627596, + "grad_norm": 0.36685062755023384, + "learning_rate": 1.991805862087048e-06, + "loss": 0.2948, + "step": 6003 + }, + { + "epoch": 1.4291069197358244, + "grad_norm": 0.5848360061934482, + "learning_rate": 1.9902663848037147e-06, + "loss": 0.278, + "step": 6004 + }, + { + "epoch": 1.4293449158088891, + "grad_norm": 0.405283771817613, + "learning_rate": 1.988727354841254e-06, + "loss": 0.3247, + "step": 6005 + }, + { + "epoch": 1.4295829118819539, + "grad_norm": 0.38737497620202277, + "learning_rate": 1.987188772428403e-06, + "loss": 0.3388, + "step": 6006 + }, + { + "epoch": 1.4298209079550188, + "grad_norm": 0.3778534408788696, + "learning_rate": 1.985650637793835e-06, + "loss": 0.2796, + "step": 6007 + }, + { + "epoch": 1.4300589040280836, + "grad_norm": 0.3917808141958219, + "learning_rate": 1.9841129511661526e-06, + "loss": 0.3428, + "step": 6008 + }, + { + "epoch": 1.4302969001011483, + "grad_norm": 0.39975809188737405, + "learning_rate": 1.9825757127738957e-06, + "loss": 0.396, + "step": 6009 + }, + { + "epoch": 1.4305348961742133, + "grad_norm": 0.37502506922397805, + "learning_rate": 1.9810389228455334e-06, + "loss": 0.2975, + "step": 6010 + }, + { + "epoch": 1.430772892247278, + "grad_norm": 0.38726845931918774, + "learning_rate": 1.9795025816094747e-06, + "loss": 0.273, + "step": 6011 + }, + { + "epoch": 1.4310108883203427, + "grad_norm": 0.3632876751872661, + "learning_rate": 1.9779666892940557e-06, + "loss": 0.3556, + "step": 6012 + }, + { + "epoch": 1.4312488843934075, + "grad_norm": 0.38274179138386594, + "learning_rate": 1.9764312461275482e-06, + "loss": 0.3783, + "step": 6013 + }, + { + "epoch": 1.4314868804664722, + "grad_norm": 0.3696767725810876, + "learning_rate": 1.974896252338155e-06, + "loss": 0.277, + "step": 6014 + }, + { + "epoch": 1.4317248765395372, + "grad_norm": 0.35996212200903566, + "learning_rate": 1.973361708154018e-06, + "loss": 0.3241, + "step": 6015 + }, + { + "epoch": 1.431962872612602, + "grad_norm": 0.3746809277181633, + "learning_rate": 1.9718276138032066e-06, + "loss": 0.4065, + "step": 6016 + }, + { + "epoch": 1.4322008686856667, + "grad_norm": 0.3662515570773327, + "learning_rate": 1.970293969513725e-06, + "loss": 0.3321, + "step": 6017 + }, + { + "epoch": 1.4324388647587316, + "grad_norm": 0.3893301791415084, + "learning_rate": 1.9687607755135114e-06, + "loss": 0.269, + "step": 6018 + }, + { + "epoch": 1.4326768608317964, + "grad_norm": 0.38494260892368, + "learning_rate": 1.9672280320304356e-06, + "loss": 0.3051, + "step": 6019 + }, + { + "epoch": 1.432914856904861, + "grad_norm": 0.38863096717866963, + "learning_rate": 1.965695739292301e-06, + "loss": 0.4005, + "step": 6020 + }, + { + "epoch": 1.4331528529779258, + "grad_norm": 0.3906075944195601, + "learning_rate": 1.964163897526845e-06, + "loss": 0.3081, + "step": 6021 + }, + { + "epoch": 1.4333908490509906, + "grad_norm": 0.4134133532934077, + "learning_rate": 1.9626325069617365e-06, + "loss": 0.3055, + "step": 6022 + }, + { + "epoch": 1.4336288451240555, + "grad_norm": 0.4136513446069149, + "learning_rate": 1.9611015678245786e-06, + "loss": 0.3594, + "step": 6023 + }, + { + "epoch": 1.4338668411971203, + "grad_norm": 0.3878063222204458, + "learning_rate": 1.9595710803429064e-06, + "loss": 0.3438, + "step": 6024 + }, + { + "epoch": 1.434104837270185, + "grad_norm": 0.3940209179695724, + "learning_rate": 1.958041044744186e-06, + "loss": 0.2871, + "step": 6025 + }, + { + "epoch": 1.43434283334325, + "grad_norm": 0.3824890901587333, + "learning_rate": 1.9565114612558232e-06, + "loss": 0.34, + "step": 6026 + }, + { + "epoch": 1.4345808294163147, + "grad_norm": 0.38272339908598974, + "learning_rate": 1.95498233010515e-06, + "loss": 0.3798, + "step": 6027 + }, + { + "epoch": 1.4348188254893794, + "grad_norm": 0.3610519966627758, + "learning_rate": 1.9534536515194312e-06, + "loss": 0.3026, + "step": 6028 + }, + { + "epoch": 1.4350568215624442, + "grad_norm": 0.3922292942167091, + "learning_rate": 1.9519254257258684e-06, + "loss": 0.2929, + "step": 6029 + }, + { + "epoch": 1.435294817635509, + "grad_norm": 0.4055572921879517, + "learning_rate": 1.950397652951593e-06, + "loss": 0.3556, + "step": 6030 + }, + { + "epoch": 1.4355328137085739, + "grad_norm": 0.41972072883290057, + "learning_rate": 1.94887033342367e-06, + "loss": 0.347, + "step": 6031 + }, + { + "epoch": 1.4357708097816386, + "grad_norm": 0.3783613808548642, + "learning_rate": 1.9473434673690974e-06, + "loss": 0.2675, + "step": 6032 + }, + { + "epoch": 1.4360088058547034, + "grad_norm": 0.374012703303672, + "learning_rate": 1.945817055014804e-06, + "loss": 0.3121, + "step": 6033 + }, + { + "epoch": 1.4362468019277683, + "grad_norm": 0.37927442883741, + "learning_rate": 1.9442910965876533e-06, + "loss": 0.3758, + "step": 6034 + }, + { + "epoch": 1.436484798000833, + "grad_norm": 0.38190378180904344, + "learning_rate": 1.94276559231444e-06, + "loss": 0.292, + "step": 6035 + }, + { + "epoch": 1.4367227940738978, + "grad_norm": 0.37836022258000657, + "learning_rate": 1.9412405424218915e-06, + "loss": 0.2698, + "step": 6036 + }, + { + "epoch": 1.4369607901469625, + "grad_norm": 0.38901824469541524, + "learning_rate": 1.9397159471366677e-06, + "loss": 0.3496, + "step": 6037 + }, + { + "epoch": 1.4371987862200273, + "grad_norm": 0.37161186422002496, + "learning_rate": 1.9381918066853632e-06, + "loss": 0.3693, + "step": 6038 + }, + { + "epoch": 1.4374367822930922, + "grad_norm": 0.38662177675097864, + "learning_rate": 1.9366681212945014e-06, + "loss": 0.2883, + "step": 6039 + }, + { + "epoch": 1.437674778366157, + "grad_norm": 0.3647394695355797, + "learning_rate": 1.9351448911905407e-06, + "loss": 0.2887, + "step": 6040 + }, + { + "epoch": 1.4379127744392217, + "grad_norm": 0.364210263276713, + "learning_rate": 1.933622116599868e-06, + "loss": 0.3901, + "step": 6041 + }, + { + "epoch": 1.4381507705122867, + "grad_norm": 0.4151429490488636, + "learning_rate": 1.9320997977488086e-06, + "loss": 0.3212, + "step": 6042 + }, + { + "epoch": 1.4383887665853514, + "grad_norm": 0.4095987714243739, + "learning_rate": 1.930577934863616e-06, + "loss": 0.2889, + "step": 6043 + }, + { + "epoch": 1.4386267626584162, + "grad_norm": 0.36732368664961057, + "learning_rate": 1.929056528170476e-06, + "loss": 0.324, + "step": 6044 + }, + { + "epoch": 1.438864758731481, + "grad_norm": 0.36769386073709703, + "learning_rate": 1.9275355778955073e-06, + "loss": 0.3886, + "step": 6045 + }, + { + "epoch": 1.4391027548045456, + "grad_norm": 0.3914105235529593, + "learning_rate": 1.926015084264761e-06, + "loss": 0.27, + "step": 6046 + }, + { + "epoch": 1.4393407508776106, + "grad_norm": 0.35201976589044254, + "learning_rate": 1.9244950475042195e-06, + "loss": 0.3152, + "step": 6047 + }, + { + "epoch": 1.4395787469506753, + "grad_norm": 0.37403165542953687, + "learning_rate": 1.922975467839799e-06, + "loss": 0.3402, + "step": 6048 + }, + { + "epoch": 1.43981674302374, + "grad_norm": 0.3704566985118877, + "learning_rate": 1.9214563454973452e-06, + "loss": 0.3232, + "step": 6049 + }, + { + "epoch": 1.440054739096805, + "grad_norm": 0.3591762003021762, + "learning_rate": 1.9199376807026383e-06, + "loss": 0.3001, + "step": 6050 + }, + { + "epoch": 1.4402927351698698, + "grad_norm": 0.3710069533511562, + "learning_rate": 1.918419473681389e-06, + "loss": 0.3465, + "step": 6051 + }, + { + "epoch": 1.4405307312429345, + "grad_norm": 0.40999024447887217, + "learning_rate": 1.9169017246592404e-06, + "loss": 0.4, + "step": 6052 + }, + { + "epoch": 1.4407687273159993, + "grad_norm": 0.3683061673886724, + "learning_rate": 1.915384433861766e-06, + "loss": 0.3081, + "step": 6053 + }, + { + "epoch": 1.441006723389064, + "grad_norm": 0.3822641283219383, + "learning_rate": 1.9138676015144765e-06, + "loss": 0.2872, + "step": 6054 + }, + { + "epoch": 1.441244719462129, + "grad_norm": 0.46115476067766986, + "learning_rate": 1.912351227842808e-06, + "loss": 0.3579, + "step": 6055 + }, + { + "epoch": 1.4414827155351937, + "grad_norm": 0.3663936685499609, + "learning_rate": 1.9108353130721326e-06, + "loss": 0.3425, + "step": 6056 + }, + { + "epoch": 1.4417207116082584, + "grad_norm": 0.3549319540090481, + "learning_rate": 1.9093198574277516e-06, + "loss": 0.2929, + "step": 6057 + }, + { + "epoch": 1.4419587076813234, + "grad_norm": 0.42037911174742915, + "learning_rate": 1.9078048611348992e-06, + "loss": 0.3327, + "step": 6058 + }, + { + "epoch": 1.4421967037543881, + "grad_norm": 0.3914790764090726, + "learning_rate": 1.9062903244187419e-06, + "loss": 0.3751, + "step": 6059 + }, + { + "epoch": 1.4424346998274529, + "grad_norm": 0.39191069864519656, + "learning_rate": 1.9047762475043775e-06, + "loss": 0.2849, + "step": 6060 + }, + { + "epoch": 1.4426726959005176, + "grad_norm": 0.5035295970708908, + "learning_rate": 1.9032626306168344e-06, + "loss": 0.2822, + "step": 6061 + }, + { + "epoch": 1.4429106919735823, + "grad_norm": 0.3617034512387851, + "learning_rate": 1.9017494739810737e-06, + "loss": 0.3608, + "step": 6062 + }, + { + "epoch": 1.4431486880466473, + "grad_norm": 0.3801209303019163, + "learning_rate": 1.9002367778219889e-06, + "loss": 0.3888, + "step": 6063 + }, + { + "epoch": 1.443386684119712, + "grad_norm": 0.38415720552229266, + "learning_rate": 1.8987245423644012e-06, + "loss": 0.3067, + "step": 6064 + }, + { + "epoch": 1.4436246801927768, + "grad_norm": 0.35618939435471303, + "learning_rate": 1.8972127678330703e-06, + "loss": 0.3021, + "step": 6065 + }, + { + "epoch": 1.4438626762658417, + "grad_norm": 0.39214870024334303, + "learning_rate": 1.8957014544526808e-06, + "loss": 0.3576, + "step": 6066 + }, + { + "epoch": 1.4441006723389065, + "grad_norm": 0.36937785124750994, + "learning_rate": 1.8941906024478524e-06, + "loss": 0.315, + "step": 6067 + }, + { + "epoch": 1.4443386684119712, + "grad_norm": 0.37155671677042273, + "learning_rate": 1.8926802120431325e-06, + "loss": 0.3286, + "step": 6068 + }, + { + "epoch": 1.444576664485036, + "grad_norm": 0.4021046642008438, + "learning_rate": 1.8911702834630063e-06, + "loss": 0.3203, + "step": 6069 + }, + { + "epoch": 1.4448146605581007, + "grad_norm": 0.4017487540968996, + "learning_rate": 1.8896608169318847e-06, + "loss": 0.375, + "step": 6070 + }, + { + "epoch": 1.4450526566311657, + "grad_norm": 0.36752083749151027, + "learning_rate": 1.8881518126741121e-06, + "loss": 0.2926, + "step": 6071 + }, + { + "epoch": 1.4452906527042304, + "grad_norm": 0.36577529364212397, + "learning_rate": 1.886643270913963e-06, + "loss": 0.2998, + "step": 6072 + }, + { + "epoch": 1.4455286487772951, + "grad_norm": 0.38891808139085265, + "learning_rate": 1.885135191875645e-06, + "loss": 0.3527, + "step": 6073 + }, + { + "epoch": 1.44576664485036, + "grad_norm": 0.3858655286851012, + "learning_rate": 1.8836275757832957e-06, + "loss": 0.3269, + "step": 6074 + }, + { + "epoch": 1.4460046409234248, + "grad_norm": 0.4342531875991616, + "learning_rate": 1.8821204228609835e-06, + "loss": 0.2787, + "step": 6075 + }, + { + "epoch": 1.4462426369964896, + "grad_norm": 0.38696089076302526, + "learning_rate": 1.880613733332709e-06, + "loss": 0.3401, + "step": 6076 + }, + { + "epoch": 1.4464806330695543, + "grad_norm": 0.3949331561518244, + "learning_rate": 1.879107507422404e-06, + "loss": 0.3802, + "step": 6077 + }, + { + "epoch": 1.446718629142619, + "grad_norm": 0.405406854410338, + "learning_rate": 1.8776017453539307e-06, + "loss": 0.3093, + "step": 6078 + }, + { + "epoch": 1.446956625215684, + "grad_norm": 0.37618600362179, + "learning_rate": 1.8760964473510823e-06, + "loss": 0.2896, + "step": 6079 + }, + { + "epoch": 1.4471946212887488, + "grad_norm": 0.4006013598116971, + "learning_rate": 1.8745916136375814e-06, + "loss": 0.3919, + "step": 6080 + }, + { + "epoch": 1.4474326173618135, + "grad_norm": 0.38408656575398153, + "learning_rate": 1.8730872444370874e-06, + "loss": 0.366, + "step": 6081 + }, + { + "epoch": 1.4476706134348785, + "grad_norm": 0.3948058469025689, + "learning_rate": 1.8715833399731854e-06, + "loss": 0.2908, + "step": 6082 + }, + { + "epoch": 1.4479086095079432, + "grad_norm": 0.3849530328785906, + "learning_rate": 1.870079900469392e-06, + "loss": 0.2974, + "step": 6083 + }, + { + "epoch": 1.448146605581008, + "grad_norm": 0.4378511368770222, + "learning_rate": 1.868576926149156e-06, + "loss": 0.3846, + "step": 6084 + }, + { + "epoch": 1.4483846016540727, + "grad_norm": 0.3862944035305655, + "learning_rate": 1.8670744172358563e-06, + "loss": 0.3343, + "step": 6085 + }, + { + "epoch": 1.4486225977271374, + "grad_norm": 0.4279048030239408, + "learning_rate": 1.865572373952803e-06, + "loss": 0.3044, + "step": 6086 + }, + { + "epoch": 1.4488605938002024, + "grad_norm": 0.45420594269270115, + "learning_rate": 1.8640707965232375e-06, + "loss": 0.337, + "step": 6087 + }, + { + "epoch": 1.4490985898732671, + "grad_norm": 0.38069002954177594, + "learning_rate": 1.86256968517033e-06, + "loss": 0.3705, + "step": 6088 + }, + { + "epoch": 1.4493365859463319, + "grad_norm": 0.41715068301428293, + "learning_rate": 1.8610690401171845e-06, + "loss": 0.2696, + "step": 6089 + }, + { + "epoch": 1.4495745820193968, + "grad_norm": 0.41094165450892295, + "learning_rate": 1.8595688615868324e-06, + "loss": 0.3237, + "step": 6090 + }, + { + "epoch": 1.4498125780924616, + "grad_norm": 0.40909914025067795, + "learning_rate": 1.8580691498022364e-06, + "loss": 0.3643, + "step": 6091 + }, + { + "epoch": 1.4500505741655263, + "grad_norm": 0.39406224435785253, + "learning_rate": 1.8565699049862939e-06, + "loss": 0.3093, + "step": 6092 + }, + { + "epoch": 1.450288570238591, + "grad_norm": 0.4265470858706377, + "learning_rate": 1.8550711273618283e-06, + "loss": 0.2583, + "step": 6093 + }, + { + "epoch": 1.4505265663116558, + "grad_norm": 0.4557324973621654, + "learning_rate": 1.8535728171515949e-06, + "loss": 0.3407, + "step": 6094 + }, + { + "epoch": 1.4507645623847207, + "grad_norm": 0.39977705869048036, + "learning_rate": 1.8520749745782784e-06, + "loss": 0.3649, + "step": 6095 + }, + { + "epoch": 1.4510025584577855, + "grad_norm": 0.37280847284038304, + "learning_rate": 1.8505775998644982e-06, + "loss": 0.3052, + "step": 6096 + }, + { + "epoch": 1.4512405545308502, + "grad_norm": 0.39792431548064744, + "learning_rate": 1.8490806932327993e-06, + "loss": 0.3123, + "step": 6097 + }, + { + "epoch": 1.4514785506039152, + "grad_norm": 0.42229729780036634, + "learning_rate": 1.8475842549056594e-06, + "loss": 0.3582, + "step": 6098 + }, + { + "epoch": 1.45171654667698, + "grad_norm": 0.3864779005126171, + "learning_rate": 1.8460882851054867e-06, + "loss": 0.3315, + "step": 6099 + }, + { + "epoch": 1.4519545427500447, + "grad_norm": 0.3953598919130047, + "learning_rate": 1.8445927840546186e-06, + "loss": 0.2967, + "step": 6100 + }, + { + "epoch": 1.4521925388231094, + "grad_norm": 0.38578775492268436, + "learning_rate": 1.8430977519753235e-06, + "loss": 0.3217, + "step": 6101 + }, + { + "epoch": 1.4524305348961741, + "grad_norm": 0.44987910181185625, + "learning_rate": 1.8416031890898006e-06, + "loss": 0.3781, + "step": 6102 + }, + { + "epoch": 1.4526685309692389, + "grad_norm": 0.3912127016231833, + "learning_rate": 1.840109095620179e-06, + "loss": 0.2981, + "step": 6103 + }, + { + "epoch": 1.4529065270423038, + "grad_norm": 0.4302943166340117, + "learning_rate": 1.838615471788518e-06, + "loss": 0.2725, + "step": 6104 + }, + { + "epoch": 1.4531445231153686, + "grad_norm": 0.43080639650484565, + "learning_rate": 1.8371223178168063e-06, + "loss": 0.3699, + "step": 6105 + }, + { + "epoch": 1.4533825191884333, + "grad_norm": 0.4082319577973362, + "learning_rate": 1.8356296339269635e-06, + "loss": 0.3588, + "step": 6106 + }, + { + "epoch": 1.4536205152614983, + "grad_norm": 0.36934438218455085, + "learning_rate": 1.8341374203408407e-06, + "loss": 0.2699, + "step": 6107 + }, + { + "epoch": 1.453858511334563, + "grad_norm": 0.3802597728212302, + "learning_rate": 1.8326456772802148e-06, + "loss": 0.3429, + "step": 6108 + }, + { + "epoch": 1.4540965074076277, + "grad_norm": 0.3935734960848936, + "learning_rate": 1.8311544049668001e-06, + "loss": 0.4122, + "step": 6109 + }, + { + "epoch": 1.4543345034806925, + "grad_norm": 0.4192793931557721, + "learning_rate": 1.8296636036222338e-06, + "loss": 0.2778, + "step": 6110 + }, + { + "epoch": 1.4545724995537572, + "grad_norm": 0.383099877153263, + "learning_rate": 1.8281732734680863e-06, + "loss": 0.2909, + "step": 6111 + }, + { + "epoch": 1.4548104956268222, + "grad_norm": 0.5516280401082402, + "learning_rate": 1.8266834147258577e-06, + "loss": 0.3406, + "step": 6112 + }, + { + "epoch": 1.455048491699887, + "grad_norm": 0.3873783424168311, + "learning_rate": 1.8251940276169777e-06, + "loss": 0.3955, + "step": 6113 + }, + { + "epoch": 1.4552864877729517, + "grad_norm": 0.38380201175264, + "learning_rate": 1.8237051123628057e-06, + "loss": 0.2968, + "step": 6114 + }, + { + "epoch": 1.4555244838460166, + "grad_norm": 0.39275514580382104, + "learning_rate": 1.8222166691846321e-06, + "loss": 0.2943, + "step": 6115 + }, + { + "epoch": 1.4557624799190814, + "grad_norm": 0.40228420689226446, + "learning_rate": 1.8207286983036765e-06, + "loss": 0.3527, + "step": 6116 + }, + { + "epoch": 1.456000475992146, + "grad_norm": 0.37618752702614194, + "learning_rate": 1.819241199941087e-06, + "loss": 0.3262, + "step": 6117 + }, + { + "epoch": 1.4562384720652108, + "grad_norm": 0.37547398305167284, + "learning_rate": 1.8177541743179423e-06, + "loss": 0.2852, + "step": 6118 + }, + { + "epoch": 1.4564764681382756, + "grad_norm": 0.41032362127204985, + "learning_rate": 1.8162676216552533e-06, + "loss": 0.3307, + "step": 6119 + }, + { + "epoch": 1.4567144642113405, + "grad_norm": 0.4347197449287449, + "learning_rate": 1.8147815421739578e-06, + "loss": 0.3799, + "step": 6120 + }, + { + "epoch": 1.4569524602844053, + "grad_norm": 0.3563768966230964, + "learning_rate": 1.8132959360949237e-06, + "loss": 0.2712, + "step": 6121 + }, + { + "epoch": 1.45719045635747, + "grad_norm": 0.38101919344310853, + "learning_rate": 1.811810803638947e-06, + "loss": 0.279, + "step": 6122 + }, + { + "epoch": 1.457428452430535, + "grad_norm": 0.418475439520036, + "learning_rate": 1.810326145026759e-06, + "loss": 0.3644, + "step": 6123 + }, + { + "epoch": 1.4576664485035997, + "grad_norm": 0.34723819719324317, + "learning_rate": 1.8088419604790135e-06, + "loss": 0.313, + "step": 6124 + }, + { + "epoch": 1.4579044445766645, + "grad_norm": 0.40905964751759843, + "learning_rate": 1.807358250216299e-06, + "loss": 0.2901, + "step": 6125 + }, + { + "epoch": 1.4581424406497292, + "grad_norm": 0.39190988886703376, + "learning_rate": 1.8058750144591308e-06, + "loss": 0.3643, + "step": 6126 + }, + { + "epoch": 1.458380436722794, + "grad_norm": 0.39412625081899383, + "learning_rate": 1.804392253427954e-06, + "loss": 0.4072, + "step": 6127 + }, + { + "epoch": 1.458618432795859, + "grad_norm": 0.38625582558289245, + "learning_rate": 1.8029099673431438e-06, + "loss": 0.3247, + "step": 6128 + }, + { + "epoch": 1.4588564288689236, + "grad_norm": 0.37374977195081105, + "learning_rate": 1.8014281564250046e-06, + "loss": 0.2899, + "step": 6129 + }, + { + "epoch": 1.4590944249419884, + "grad_norm": 0.4022256905018728, + "learning_rate": 1.7999468208937698e-06, + "loss": 0.3349, + "step": 6130 + }, + { + "epoch": 1.4593324210150533, + "grad_norm": 0.3782717385013846, + "learning_rate": 1.7984659609696037e-06, + "loss": 0.3659, + "step": 6131 + }, + { + "epoch": 1.459570417088118, + "grad_norm": 0.42371774399832857, + "learning_rate": 1.7969855768725973e-06, + "loss": 0.2867, + "step": 6132 + }, + { + "epoch": 1.4598084131611828, + "grad_norm": 0.4828644226623123, + "learning_rate": 1.7955056688227735e-06, + "loss": 0.3071, + "step": 6133 + }, + { + "epoch": 1.4600464092342476, + "grad_norm": 0.3711693938332572, + "learning_rate": 1.7940262370400823e-06, + "loss": 0.3749, + "step": 6134 + }, + { + "epoch": 1.4602844053073123, + "grad_norm": 0.38061515908607924, + "learning_rate": 1.792547281744403e-06, + "loss": 0.2909, + "step": 6135 + }, + { + "epoch": 1.4605224013803773, + "grad_norm": 0.39151517351766074, + "learning_rate": 1.7910688031555473e-06, + "loss": 0.2842, + "step": 6136 + }, + { + "epoch": 1.460760397453442, + "grad_norm": 0.41546568095239644, + "learning_rate": 1.7895908014932529e-06, + "loss": 0.3413, + "step": 6137 + }, + { + "epoch": 1.4609983935265067, + "grad_norm": 0.40057642081942424, + "learning_rate": 1.788113276977187e-06, + "loss": 0.3942, + "step": 6138 + }, + { + "epoch": 1.4612363895995717, + "grad_norm": 0.4025786155182079, + "learning_rate": 1.7866362298269468e-06, + "loss": 0.287, + "step": 6139 + }, + { + "epoch": 1.4614743856726364, + "grad_norm": 0.36206986714191153, + "learning_rate": 1.7851596602620568e-06, + "loss": 0.3139, + "step": 6140 + }, + { + "epoch": 1.4617123817457012, + "grad_norm": 0.4078012441791886, + "learning_rate": 1.7836835685019732e-06, + "loss": 0.37, + "step": 6141 + }, + { + "epoch": 1.461950377818766, + "grad_norm": 0.36946996756792294, + "learning_rate": 1.7822079547660792e-06, + "loss": 0.3086, + "step": 6142 + }, + { + "epoch": 1.4621883738918307, + "grad_norm": 0.3817427648802179, + "learning_rate": 1.7807328192736872e-06, + "loss": 0.2633, + "step": 6143 + }, + { + "epoch": 1.4624263699648956, + "grad_norm": 0.38265305398727073, + "learning_rate": 1.7792581622440392e-06, + "loss": 0.3307, + "step": 6144 + }, + { + "epoch": 1.4626643660379604, + "grad_norm": 0.5748853333539239, + "learning_rate": 1.777783983896304e-06, + "loss": 0.3755, + "step": 6145 + }, + { + "epoch": 1.462902362111025, + "grad_norm": 0.39571210226781933, + "learning_rate": 1.7763102844495838e-06, + "loss": 0.3066, + "step": 6146 + }, + { + "epoch": 1.46314035818409, + "grad_norm": 0.39702192397444136, + "learning_rate": 1.7748370641229063e-06, + "loss": 0.2969, + "step": 6147 + }, + { + "epoch": 1.4633783542571548, + "grad_norm": 0.3736958018420523, + "learning_rate": 1.773364323135227e-06, + "loss": 0.3624, + "step": 6148 + }, + { + "epoch": 1.4636163503302195, + "grad_norm": 0.3853561768534024, + "learning_rate": 1.7718920617054313e-06, + "loss": 0.359, + "step": 6149 + }, + { + "epoch": 1.4638543464032843, + "grad_norm": 0.3882989341472717, + "learning_rate": 1.7704202800523362e-06, + "loss": 0.267, + "step": 6150 + }, + { + "epoch": 1.464092342476349, + "grad_norm": 0.40235971375885304, + "learning_rate": 1.768948978394684e-06, + "loss": 0.3269, + "step": 6151 + }, + { + "epoch": 1.464330338549414, + "grad_norm": 0.41400386307178666, + "learning_rate": 1.7674781569511451e-06, + "loss": 0.3767, + "step": 6152 + }, + { + "epoch": 1.4645683346224787, + "grad_norm": 0.4015051161422788, + "learning_rate": 1.766007815940321e-06, + "loss": 0.2967, + "step": 6153 + }, + { + "epoch": 1.4648063306955434, + "grad_norm": 0.39178112671184995, + "learning_rate": 1.7645379555807408e-06, + "loss": 0.276, + "step": 6154 + }, + { + "epoch": 1.4650443267686084, + "grad_norm": 0.4115792252224875, + "learning_rate": 1.7630685760908623e-06, + "loss": 0.3404, + "step": 6155 + }, + { + "epoch": 1.4652823228416731, + "grad_norm": 0.38556483981697237, + "learning_rate": 1.7615996776890704e-06, + "loss": 0.3528, + "step": 6156 + }, + { + "epoch": 1.4655203189147379, + "grad_norm": 0.37944105872349776, + "learning_rate": 1.760131260593681e-06, + "loss": 0.2946, + "step": 6157 + }, + { + "epoch": 1.4657583149878026, + "grad_norm": 0.3902900142096629, + "learning_rate": 1.7586633250229368e-06, + "loss": 0.3307, + "step": 6158 + }, + { + "epoch": 1.4659963110608674, + "grad_norm": 0.3814146994358725, + "learning_rate": 1.7571958711950088e-06, + "loss": 0.3736, + "step": 6159 + }, + { + "epoch": 1.4662343071339323, + "grad_norm": 0.37047847418626345, + "learning_rate": 1.7557288993279981e-06, + "loss": 0.3168, + "step": 6160 + }, + { + "epoch": 1.466472303206997, + "grad_norm": 0.38122549662040817, + "learning_rate": 1.754262409639932e-06, + "loss": 0.2881, + "step": 6161 + }, + { + "epoch": 1.4667102992800618, + "grad_norm": 0.3933736686295024, + "learning_rate": 1.7527964023487676e-06, + "loss": 0.3072, + "step": 6162 + }, + { + "epoch": 1.4669482953531268, + "grad_norm": 0.4116114307272517, + "learning_rate": 1.751330877672388e-06, + "loss": 0.3761, + "step": 6163 + }, + { + "epoch": 1.4671862914261915, + "grad_norm": 0.36687857623003073, + "learning_rate": 1.7498658358286098e-06, + "loss": 0.285, + "step": 6164 + }, + { + "epoch": 1.4674242874992562, + "grad_norm": 0.35502119768687834, + "learning_rate": 1.7484012770351732e-06, + "loss": 0.327, + "step": 6165 + }, + { + "epoch": 1.467662283572321, + "grad_norm": 0.38055825453365044, + "learning_rate": 1.7469372015097469e-06, + "loss": 0.4053, + "step": 6166 + }, + { + "epoch": 1.4679002796453857, + "grad_norm": 0.37110944722737155, + "learning_rate": 1.7454736094699298e-06, + "loss": 0.2759, + "step": 6167 + }, + { + "epoch": 1.4681382757184507, + "grad_norm": 0.4369470705323925, + "learning_rate": 1.7440105011332476e-06, + "loss": 0.2671, + "step": 6168 + }, + { + "epoch": 1.4683762717915154, + "grad_norm": 0.5041602732328805, + "learning_rate": 1.7425478767171539e-06, + "loss": 0.3163, + "step": 6169 + }, + { + "epoch": 1.4686142678645802, + "grad_norm": 0.38125323959910473, + "learning_rate": 1.741085736439031e-06, + "loss": 0.3758, + "step": 6170 + }, + { + "epoch": 1.4688522639376451, + "grad_norm": 0.4598903890600633, + "learning_rate": 1.7396240805161896e-06, + "loss": 0.3144, + "step": 6171 + }, + { + "epoch": 1.4690902600107099, + "grad_norm": 0.38337527005033567, + "learning_rate": 1.7381629091658664e-06, + "loss": 0.288, + "step": 6172 + }, + { + "epoch": 1.4693282560837746, + "grad_norm": 0.37602121389981175, + "learning_rate": 1.7367022226052299e-06, + "loss": 0.3685, + "step": 6173 + }, + { + "epoch": 1.4695662521568393, + "grad_norm": 0.3515901765862798, + "learning_rate": 1.7352420210513732e-06, + "loss": 0.3525, + "step": 6174 + }, + { + "epoch": 1.469804248229904, + "grad_norm": 0.37679635447891247, + "learning_rate": 1.7337823047213186e-06, + "loss": 0.269, + "step": 6175 + }, + { + "epoch": 1.470042244302969, + "grad_norm": 0.421007003539755, + "learning_rate": 1.7323230738320162e-06, + "loss": 0.3184, + "step": 6176 + }, + { + "epoch": 1.4702802403760338, + "grad_norm": 0.3953495895644738, + "learning_rate": 1.7308643286003412e-06, + "loss": 0.4181, + "step": 6177 + }, + { + "epoch": 1.4705182364490985, + "grad_norm": 0.3904183739455049, + "learning_rate": 1.7294060692431035e-06, + "loss": 0.3224, + "step": 6178 + }, + { + "epoch": 1.4707562325221635, + "grad_norm": 0.3958574641622456, + "learning_rate": 1.7279482959770345e-06, + "loss": 0.281, + "step": 6179 + }, + { + "epoch": 1.4709942285952282, + "grad_norm": 0.3657347135001612, + "learning_rate": 1.7264910090187952e-06, + "loss": 0.3256, + "step": 6180 + }, + { + "epoch": 1.471232224668293, + "grad_norm": 0.36459178634963607, + "learning_rate": 1.7250342085849747e-06, + "loss": 0.3699, + "step": 6181 + }, + { + "epoch": 1.4714702207413577, + "grad_norm": 0.35909114336445935, + "learning_rate": 1.72357789489209e-06, + "loss": 0.2993, + "step": 6182 + }, + { + "epoch": 1.4717082168144224, + "grad_norm": 0.35376404383488347, + "learning_rate": 1.7221220681565842e-06, + "loss": 0.3323, + "step": 6183 + }, + { + "epoch": 1.4719462128874874, + "grad_norm": 0.39465896136944995, + "learning_rate": 1.7206667285948303e-06, + "loss": 0.3739, + "step": 6184 + }, + { + "epoch": 1.4721842089605521, + "grad_norm": 0.365373583943615, + "learning_rate": 1.7192118764231276e-06, + "loss": 0.2824, + "step": 6185 + }, + { + "epoch": 1.4724222050336169, + "grad_norm": 0.3970412374830905, + "learning_rate": 1.7177575118577022e-06, + "loss": 0.2851, + "step": 6186 + }, + { + "epoch": 1.4726602011066818, + "grad_norm": 0.38867388441314077, + "learning_rate": 1.7163036351147094e-06, + "loss": 0.368, + "step": 6187 + }, + { + "epoch": 1.4728981971797466, + "grad_norm": 0.3726608425142561, + "learning_rate": 1.7148502464102312e-06, + "loss": 0.3686, + "step": 6188 + }, + { + "epoch": 1.4731361932528113, + "grad_norm": 0.3895210104408985, + "learning_rate": 1.7133973459602776e-06, + "loss": 0.2775, + "step": 6189 + }, + { + "epoch": 1.473374189325876, + "grad_norm": 0.4279705014335874, + "learning_rate": 1.7119449339807825e-06, + "loss": 0.3314, + "step": 6190 + }, + { + "epoch": 1.4736121853989408, + "grad_norm": 0.376456866460005, + "learning_rate": 1.710493010687615e-06, + "loss": 0.3556, + "step": 6191 + }, + { + "epoch": 1.4738501814720057, + "grad_norm": 0.3605199356919443, + "learning_rate": 1.7090415762965646e-06, + "loss": 0.3045, + "step": 6192 + }, + { + "epoch": 1.4740881775450705, + "grad_norm": 0.44055911897196376, + "learning_rate": 1.7075906310233503e-06, + "loss": 0.292, + "step": 6193 + }, + { + "epoch": 1.4743261736181352, + "grad_norm": 0.3901396740042231, + "learning_rate": 1.7061401750836182e-06, + "loss": 0.3442, + "step": 6194 + }, + { + "epoch": 1.4745641696912002, + "grad_norm": 0.3987599004644487, + "learning_rate": 1.7046902086929428e-06, + "loss": 0.4148, + "step": 6195 + }, + { + "epoch": 1.474802165764265, + "grad_norm": 0.37091763674215245, + "learning_rate": 1.7032407320668243e-06, + "loss": 0.2732, + "step": 6196 + }, + { + "epoch": 1.4750401618373297, + "grad_norm": 0.347095443573286, + "learning_rate": 1.7017917454206905e-06, + "loss": 0.2876, + "step": 6197 + }, + { + "epoch": 1.4752781579103944, + "grad_norm": 0.3759182793149618, + "learning_rate": 1.7003432489698974e-06, + "loss": 0.3734, + "step": 6198 + }, + { + "epoch": 1.4755161539834591, + "grad_norm": 0.4063498607269111, + "learning_rate": 1.698895242929725e-06, + "loss": 0.3454, + "step": 6199 + }, + { + "epoch": 1.475754150056524, + "grad_norm": 0.3813713594888871, + "learning_rate": 1.6974477275153872e-06, + "loss": 0.2854, + "step": 6200 + }, + { + "epoch": 1.4759921461295888, + "grad_norm": 0.35289582689259275, + "learning_rate": 1.696000702942018e-06, + "loss": 0.3151, + "step": 6201 + }, + { + "epoch": 1.4762301422026536, + "grad_norm": 0.38603206935555157, + "learning_rate": 1.6945541694246809e-06, + "loss": 0.3949, + "step": 6202 + }, + { + "epoch": 1.4764681382757185, + "grad_norm": 0.35892990933497454, + "learning_rate": 1.6931081271783679e-06, + "loss": 0.2862, + "step": 6203 + }, + { + "epoch": 1.4767061343487833, + "grad_norm": 0.3831559472250615, + "learning_rate": 1.6916625764179934e-06, + "loss": 0.2722, + "step": 6204 + }, + { + "epoch": 1.476944130421848, + "grad_norm": 0.40027720342137013, + "learning_rate": 1.6902175173584062e-06, + "loss": 0.3442, + "step": 6205 + }, + { + "epoch": 1.4771821264949128, + "grad_norm": 0.383928566968621, + "learning_rate": 1.6887729502143762e-06, + "loss": 0.3227, + "step": 6206 + }, + { + "epoch": 1.4774201225679775, + "grad_norm": 0.4035104381552861, + "learning_rate": 1.6873288752006013e-06, + "loss": 0.2994, + "step": 6207 + }, + { + "epoch": 1.4776581186410425, + "grad_norm": 0.37119371517468214, + "learning_rate": 1.685885292531707e-06, + "loss": 0.333, + "step": 6208 + }, + { + "epoch": 1.4778961147141072, + "grad_norm": 0.3897446214278183, + "learning_rate": 1.6844422024222462e-06, + "loss": 0.3858, + "step": 6209 + }, + { + "epoch": 1.478134110787172, + "grad_norm": 0.36090692256055146, + "learning_rate": 1.6829996050866965e-06, + "loss": 0.2898, + "step": 6210 + }, + { + "epoch": 1.478372106860237, + "grad_norm": 0.3903554712089631, + "learning_rate": 1.6815575007394641e-06, + "loss": 0.2851, + "step": 6211 + }, + { + "epoch": 1.4786101029333016, + "grad_norm": 0.38668090625918494, + "learning_rate": 1.6801158895948816e-06, + "loss": 0.3617, + "step": 6212 + }, + { + "epoch": 1.4788480990063664, + "grad_norm": 0.3934627308345188, + "learning_rate": 1.6786747718672076e-06, + "loss": 0.3588, + "step": 6213 + }, + { + "epoch": 1.4790860950794311, + "grad_norm": 0.43569448317897996, + "learning_rate": 1.6772341477706284e-06, + "loss": 0.2777, + "step": 6214 + }, + { + "epoch": 1.4793240911524959, + "grad_norm": 0.4089467347728445, + "learning_rate": 1.675794017519256e-06, + "loss": 0.3008, + "step": 6215 + }, + { + "epoch": 1.4795620872255608, + "grad_norm": 0.3882878487464565, + "learning_rate": 1.6743543813271296e-06, + "loss": 0.3329, + "step": 6216 + }, + { + "epoch": 1.4798000832986256, + "grad_norm": 0.37303839591336113, + "learning_rate": 1.6729152394082144e-06, + "loss": 0.302, + "step": 6217 + }, + { + "epoch": 1.4800380793716903, + "grad_norm": 0.38100751920843007, + "learning_rate": 1.6714765919764015e-06, + "loss": 0.2819, + "step": 6218 + }, + { + "epoch": 1.4802760754447553, + "grad_norm": 0.3845807803382066, + "learning_rate": 1.6700384392455122e-06, + "loss": 0.3177, + "step": 6219 + }, + { + "epoch": 1.48051407151782, + "grad_norm": 0.34937828068409094, + "learning_rate": 1.6686007814292898e-06, + "loss": 0.371, + "step": 6220 + }, + { + "epoch": 1.4807520675908847, + "grad_norm": 0.3465525601796606, + "learning_rate": 1.6671636187414065e-06, + "loss": 0.2809, + "step": 6221 + }, + { + "epoch": 1.4809900636639495, + "grad_norm": 0.37505512040887995, + "learning_rate": 1.66572695139546e-06, + "loss": 0.3043, + "step": 6222 + }, + { + "epoch": 1.4812280597370142, + "grad_norm": 0.3849866579968192, + "learning_rate": 1.664290779604974e-06, + "loss": 0.3555, + "step": 6223 + }, + { + "epoch": 1.4814660558100792, + "grad_norm": 0.38249173168240747, + "learning_rate": 1.6628551035833995e-06, + "loss": 0.3256, + "step": 6224 + }, + { + "epoch": 1.481704051883144, + "grad_norm": 0.4133019140360971, + "learning_rate": 1.6614199235441141e-06, + "loss": 0.2854, + "step": 6225 + }, + { + "epoch": 1.4819420479562087, + "grad_norm": 0.3804351372256457, + "learning_rate": 1.6599852397004184e-06, + "loss": 0.3361, + "step": 6226 + }, + { + "epoch": 1.4821800440292736, + "grad_norm": 0.38603923989851835, + "learning_rate": 1.6585510522655463e-06, + "loss": 0.4, + "step": 6227 + }, + { + "epoch": 1.4824180401023384, + "grad_norm": 0.3518414478466677, + "learning_rate": 1.657117361452651e-06, + "loss": 0.2999, + "step": 6228 + }, + { + "epoch": 1.482656036175403, + "grad_norm": 0.4067824455417079, + "learning_rate": 1.6556841674748148e-06, + "loss": 0.2751, + "step": 6229 + }, + { + "epoch": 1.4828940322484678, + "grad_norm": 0.3888377616545728, + "learning_rate": 1.6542514705450453e-06, + "loss": 0.3871, + "step": 6230 + }, + { + "epoch": 1.4831320283215326, + "grad_norm": 0.37104379881621813, + "learning_rate": 1.6528192708762775e-06, + "loss": 0.3358, + "step": 6231 + }, + { + "epoch": 1.4833700243945975, + "grad_norm": 0.37321076058841374, + "learning_rate": 1.6513875686813696e-06, + "loss": 0.2893, + "step": 6232 + }, + { + "epoch": 1.4836080204676623, + "grad_norm": 0.40249229900762473, + "learning_rate": 1.6499563641731115e-06, + "loss": 0.3353, + "step": 6233 + }, + { + "epoch": 1.483846016540727, + "grad_norm": 0.39795621117297714, + "learning_rate": 1.6485256575642133e-06, + "loss": 0.4186, + "step": 6234 + }, + { + "epoch": 1.484084012613792, + "grad_norm": 0.3719720131348448, + "learning_rate": 1.6470954490673141e-06, + "loss": 0.3079, + "step": 6235 + }, + { + "epoch": 1.4843220086868567, + "grad_norm": 0.4065939918654111, + "learning_rate": 1.6456657388949782e-06, + "loss": 0.2705, + "step": 6236 + }, + { + "epoch": 1.4845600047599214, + "grad_norm": 0.38543792010700656, + "learning_rate": 1.6442365272596955e-06, + "loss": 0.3343, + "step": 6237 + }, + { + "epoch": 1.4847980008329862, + "grad_norm": 0.364965468199743, + "learning_rate": 1.6428078143738828e-06, + "loss": 0.3702, + "step": 6238 + }, + { + "epoch": 1.485035996906051, + "grad_norm": 0.39025088197310315, + "learning_rate": 1.6413796004498816e-06, + "loss": 0.2871, + "step": 6239 + }, + { + "epoch": 1.4852739929791159, + "grad_norm": 0.38934407019438205, + "learning_rate": 1.6399518856999597e-06, + "loss": 0.3311, + "step": 6240 + }, + { + "epoch": 1.4855119890521806, + "grad_norm": 0.4045158386339914, + "learning_rate": 1.6385246703363117e-06, + "loss": 0.3784, + "step": 6241 + }, + { + "epoch": 1.4857499851252454, + "grad_norm": 0.36882815626153603, + "learning_rate": 1.6370979545710564e-06, + "loss": 0.3222, + "step": 6242 + }, + { + "epoch": 1.4859879811983103, + "grad_norm": 0.38031964694752574, + "learning_rate": 1.6356717386162392e-06, + "loss": 0.297, + "step": 6243 + }, + { + "epoch": 1.486225977271375, + "grad_norm": 0.3535105134819767, + "learning_rate": 1.634246022683831e-06, + "loss": 0.3139, + "step": 6244 + }, + { + "epoch": 1.4864639733444398, + "grad_norm": 0.45655045384221404, + "learning_rate": 1.6328208069857288e-06, + "loss": 0.4003, + "step": 6245 + }, + { + "epoch": 1.4867019694175045, + "grad_norm": 0.358001681330835, + "learning_rate": 1.631396091733753e-06, + "loss": 0.2827, + "step": 6246 + }, + { + "epoch": 1.4869399654905693, + "grad_norm": 0.39972769951910697, + "learning_rate": 1.6299718771396544e-06, + "loss": 0.32, + "step": 6247 + }, + { + "epoch": 1.4871779615636342, + "grad_norm": 0.37405419443502563, + "learning_rate": 1.6285481634151057e-06, + "loss": 0.3557, + "step": 6248 + }, + { + "epoch": 1.487415957636699, + "grad_norm": 0.36391998904722717, + "learning_rate": 1.6271249507717058e-06, + "loss": 0.3482, + "step": 6249 + }, + { + "epoch": 1.4876539537097637, + "grad_norm": 0.36597440907805356, + "learning_rate": 1.6257022394209787e-06, + "loss": 0.2612, + "step": 6250 + }, + { + "epoch": 1.4878919497828287, + "grad_norm": 0.37262756229567356, + "learning_rate": 1.6242800295743755e-06, + "loss": 0.3512, + "step": 6251 + }, + { + "epoch": 1.4881299458558934, + "grad_norm": 0.393961403942417, + "learning_rate": 1.6228583214432708e-06, + "loss": 0.3568, + "step": 6252 + }, + { + "epoch": 1.4883679419289582, + "grad_norm": 0.3800387502035991, + "learning_rate": 1.6214371152389646e-06, + "loss": 0.2948, + "step": 6253 + }, + { + "epoch": 1.488605938002023, + "grad_norm": 0.4078216630673394, + "learning_rate": 1.6200164111726857e-06, + "loss": 0.2829, + "step": 6254 + }, + { + "epoch": 1.4888439340750876, + "grad_norm": 0.4060042960242336, + "learning_rate": 1.6185962094555857e-06, + "loss": 0.335, + "step": 6255 + }, + { + "epoch": 1.4890819301481526, + "grad_norm": 0.37150604082715344, + "learning_rate": 1.6171765102987401e-06, + "loss": 0.3452, + "step": 6256 + }, + { + "epoch": 1.4893199262212173, + "grad_norm": 0.4109956321194422, + "learning_rate": 1.6157573139131527e-06, + "loss": 0.2697, + "step": 6257 + }, + { + "epoch": 1.489557922294282, + "grad_norm": 0.3579171265280181, + "learning_rate": 1.6143386205097506e-06, + "loss": 0.3006, + "step": 6258 + }, + { + "epoch": 1.489795918367347, + "grad_norm": 0.4053809117913336, + "learning_rate": 1.6129204302993845e-06, + "loss": 0.3799, + "step": 6259 + }, + { + "epoch": 1.4900339144404118, + "grad_norm": 0.3824402887929443, + "learning_rate": 1.611502743492837e-06, + "loss": 0.3014, + "step": 6260 + }, + { + "epoch": 1.4902719105134765, + "grad_norm": 0.37518205183540226, + "learning_rate": 1.6100855603008087e-06, + "loss": 0.3104, + "step": 6261 + }, + { + "epoch": 1.4905099065865413, + "grad_norm": 0.4420001474069447, + "learning_rate": 1.6086688809339291e-06, + "loss": 0.3232, + "step": 6262 + }, + { + "epoch": 1.490747902659606, + "grad_norm": 0.3790341363906722, + "learning_rate": 1.6072527056027509e-06, + "loss": 0.4002, + "step": 6263 + }, + { + "epoch": 1.490985898732671, + "grad_norm": 0.3871942676533004, + "learning_rate": 1.6058370345177531e-06, + "loss": 0.2823, + "step": 6264 + }, + { + "epoch": 1.4912238948057357, + "grad_norm": 0.39208286818203664, + "learning_rate": 1.6044218678893398e-06, + "loss": 0.3084, + "step": 6265 + }, + { + "epoch": 1.4914618908788004, + "grad_norm": 0.4084685412861594, + "learning_rate": 1.6030072059278396e-06, + "loss": 0.382, + "step": 6266 + }, + { + "epoch": 1.4916998869518654, + "grad_norm": 0.399630334749065, + "learning_rate": 1.6015930488435055e-06, + "loss": 0.3122, + "step": 6267 + }, + { + "epoch": 1.4919378830249301, + "grad_norm": 0.3744742890214211, + "learning_rate": 1.6001793968465173e-06, + "loss": 0.2994, + "step": 6268 + }, + { + "epoch": 1.4921758790979949, + "grad_norm": 0.4074445031836467, + "learning_rate": 1.5987662501469787e-06, + "loss": 0.3392, + "step": 6269 + }, + { + "epoch": 1.4924138751710596, + "grad_norm": 0.3701010877824389, + "learning_rate": 1.5973536089549174e-06, + "loss": 0.3836, + "step": 6270 + }, + { + "epoch": 1.4926518712441244, + "grad_norm": 0.34247020644927706, + "learning_rate": 1.595941473480287e-06, + "loss": 0.2838, + "step": 6271 + }, + { + "epoch": 1.4928898673171893, + "grad_norm": 0.4149376790144497, + "learning_rate": 1.594529843932966e-06, + "loss": 0.2893, + "step": 6272 + }, + { + "epoch": 1.493127863390254, + "grad_norm": 0.37778290292253097, + "learning_rate": 1.593118720522756e-06, + "loss": 0.3544, + "step": 6273 + }, + { + "epoch": 1.4933658594633188, + "grad_norm": 0.38515242574627634, + "learning_rate": 1.591708103459388e-06, + "loss": 0.3389, + "step": 6274 + }, + { + "epoch": 1.4936038555363838, + "grad_norm": 0.392490049369985, + "learning_rate": 1.590297992952513e-06, + "loss": 0.2979, + "step": 6275 + }, + { + "epoch": 1.4938418516094485, + "grad_norm": 0.4109028404933468, + "learning_rate": 1.588888389211708e-06, + "loss": 0.3001, + "step": 6276 + }, + { + "epoch": 1.4940798476825132, + "grad_norm": 0.37566084684050377, + "learning_rate": 1.587479292446475e-06, + "loss": 0.3806, + "step": 6277 + }, + { + "epoch": 1.494317843755578, + "grad_norm": 0.3770457435748563, + "learning_rate": 1.5860707028662415e-06, + "loss": 0.2925, + "step": 6278 + }, + { + "epoch": 1.4945558398286427, + "grad_norm": 0.40003813495891827, + "learning_rate": 1.5846626206803572e-06, + "loss": 0.317, + "step": 6279 + }, + { + "epoch": 1.4947938359017077, + "grad_norm": 0.38923435537963624, + "learning_rate": 1.5832550460980978e-06, + "loss": 0.3441, + "step": 6280 + }, + { + "epoch": 1.4950318319747724, + "grad_norm": 0.3630845762326767, + "learning_rate": 1.5818479793286663e-06, + "loss": 0.3697, + "step": 6281 + }, + { + "epoch": 1.4952698280478371, + "grad_norm": 0.38811524683296966, + "learning_rate": 1.5804414205811864e-06, + "loss": 0.2647, + "step": 6282 + }, + { + "epoch": 1.495507824120902, + "grad_norm": 0.36922187210672575, + "learning_rate": 1.5790353700647066e-06, + "loss": 0.3284, + "step": 6283 + }, + { + "epoch": 1.4957458201939668, + "grad_norm": 0.400379309159919, + "learning_rate": 1.5776298279882018e-06, + "loss": 0.3806, + "step": 6284 + }, + { + "epoch": 1.4959838162670316, + "grad_norm": 0.38009298047601064, + "learning_rate": 1.5762247945605696e-06, + "loss": 0.2978, + "step": 6285 + }, + { + "epoch": 1.4962218123400963, + "grad_norm": 0.3884850826229188, + "learning_rate": 1.5748202699906335e-06, + "loss": 0.2626, + "step": 6286 + }, + { + "epoch": 1.496459808413161, + "grad_norm": 0.43122694564171327, + "learning_rate": 1.5734162544871379e-06, + "loss": 0.349, + "step": 6287 + }, + { + "epoch": 1.496697804486226, + "grad_norm": 0.37117494480049085, + "learning_rate": 1.5720127482587582e-06, + "loss": 0.3717, + "step": 6288 + }, + { + "epoch": 1.4969358005592908, + "grad_norm": 0.6961660701599369, + "learning_rate": 1.5706097515140888e-06, + "loss": 0.2725, + "step": 6289 + }, + { + "epoch": 1.4971737966323555, + "grad_norm": 0.39992955711015266, + "learning_rate": 1.5692072644616497e-06, + "loss": 0.3149, + "step": 6290 + }, + { + "epoch": 1.4974117927054205, + "grad_norm": 0.3844509561461545, + "learning_rate": 1.5678052873098843e-06, + "loss": 0.3416, + "step": 6291 + }, + { + "epoch": 1.4976497887784852, + "grad_norm": 0.4114588280934326, + "learning_rate": 1.5664038202671616e-06, + "loss": 0.2867, + "step": 6292 + }, + { + "epoch": 1.49788778485155, + "grad_norm": 0.3838890205158945, + "learning_rate": 1.565002863541774e-06, + "loss": 0.2827, + "step": 6293 + }, + { + "epoch": 1.4981257809246147, + "grad_norm": 0.4048312301464947, + "learning_rate": 1.5636024173419389e-06, + "loss": 0.3504, + "step": 6294 + }, + { + "epoch": 1.4983637769976794, + "grad_norm": 0.3751861323780018, + "learning_rate": 1.562202481875797e-06, + "loss": 0.362, + "step": 6295 + }, + { + "epoch": 1.4986017730707444, + "grad_norm": 0.37261348585719195, + "learning_rate": 1.5608030573514131e-06, + "loss": 0.2872, + "step": 6296 + }, + { + "epoch": 1.4988397691438091, + "grad_norm": 0.6348899760877189, + "learning_rate": 1.5594041439767772e-06, + "loss": 0.2771, + "step": 6297 + }, + { + "epoch": 1.4990777652168739, + "grad_norm": 0.38641372515323824, + "learning_rate": 1.5580057419598011e-06, + "loss": 0.352, + "step": 6298 + }, + { + "epoch": 1.4993157612899388, + "grad_norm": 0.39465392512377706, + "learning_rate": 1.5566078515083227e-06, + "loss": 0.3409, + "step": 6299 + }, + { + "epoch": 1.4995537573630036, + "grad_norm": 0.3627033946670975, + "learning_rate": 1.5552104728301031e-06, + "loss": 0.2711, + "step": 6300 + }, + { + "epoch": 1.4997917534360683, + "grad_norm": 0.35357386216875875, + "learning_rate": 1.5538136061328256e-06, + "loss": 0.3098, + "step": 6301 + }, + { + "epoch": 1.500029749509133, + "grad_norm": 0.401206280542586, + "learning_rate": 1.552417251624102e-06, + "loss": 0.3832, + "step": 6302 + }, + { + "epoch": 1.5002677455821978, + "grad_norm": 0.35052024645889873, + "learning_rate": 1.5510214095114644e-06, + "loss": 0.2957, + "step": 6303 + }, + { + "epoch": 1.5005057416552627, + "grad_norm": 0.4270002745679526, + "learning_rate": 1.5496260800023688e-06, + "loss": 0.3093, + "step": 6304 + }, + { + "epoch": 1.5007437377283275, + "grad_norm": 0.39472985597101196, + "learning_rate": 1.5482312633041956e-06, + "loss": 0.3537, + "step": 6305 + }, + { + "epoch": 1.5009817338013924, + "grad_norm": 0.3586093492162438, + "learning_rate": 1.54683695962425e-06, + "loss": 0.3202, + "step": 6306 + }, + { + "epoch": 1.5012197298744572, + "grad_norm": 0.4005505431292118, + "learning_rate": 1.5454431691697575e-06, + "loss": 0.2976, + "step": 6307 + }, + { + "epoch": 1.501457725947522, + "grad_norm": 0.35924720446659664, + "learning_rate": 1.5440498921478726e-06, + "loss": 0.2957, + "step": 6308 + }, + { + "epoch": 1.5016957220205867, + "grad_norm": 0.396240213306404, + "learning_rate": 1.5426571287656705e-06, + "loss": 0.3937, + "step": 6309 + }, + { + "epoch": 1.5019337180936514, + "grad_norm": 0.44063755653514214, + "learning_rate": 1.5412648792301494e-06, + "loss": 0.3058, + "step": 6310 + }, + { + "epoch": 1.5021717141667161, + "grad_norm": 0.39238660452322954, + "learning_rate": 1.5398731437482322e-06, + "loss": 0.283, + "step": 6311 + }, + { + "epoch": 1.502409710239781, + "grad_norm": 0.3882700159407384, + "learning_rate": 1.538481922526765e-06, + "loss": 0.3765, + "step": 6312 + }, + { + "epoch": 1.5026477063128458, + "grad_norm": 0.39130343929171285, + "learning_rate": 1.5370912157725183e-06, + "loss": 0.3601, + "step": 6313 + }, + { + "epoch": 1.5028857023859108, + "grad_norm": 0.3727457835348889, + "learning_rate": 1.5357010236921853e-06, + "loss": 0.308, + "step": 6314 + }, + { + "epoch": 1.5031236984589755, + "grad_norm": 0.38525499619958226, + "learning_rate": 1.5343113464923808e-06, + "loss": 0.3076, + "step": 6315 + }, + { + "epoch": 1.5033616945320403, + "grad_norm": 0.3731034146144273, + "learning_rate": 1.5329221843796492e-06, + "loss": 0.3616, + "step": 6316 + }, + { + "epoch": 1.503599690605105, + "grad_norm": 0.38290221481731973, + "learning_rate": 1.5315335375604523e-06, + "loss": 0.2957, + "step": 6317 + }, + { + "epoch": 1.5038376866781697, + "grad_norm": 0.3851034270150811, + "learning_rate": 1.5301454062411769e-06, + "loss": 0.3072, + "step": 6318 + }, + { + "epoch": 1.5040756827512345, + "grad_norm": 0.4023568178915069, + "learning_rate": 1.5287577906281347e-06, + "loss": 0.3343, + "step": 6319 + }, + { + "epoch": 1.5043136788242994, + "grad_norm": 0.47865734964101286, + "learning_rate": 1.5273706909275593e-06, + "loss": 0.3684, + "step": 6320 + }, + { + "epoch": 1.5045516748973642, + "grad_norm": 0.36513450339658127, + "learning_rate": 1.5259841073456078e-06, + "loss": 0.3044, + "step": 6321 + }, + { + "epoch": 1.5047896709704291, + "grad_norm": 0.37595196535771713, + "learning_rate": 1.5245980400883609e-06, + "loss": 0.3099, + "step": 6322 + }, + { + "epoch": 1.5050276670434939, + "grad_norm": 0.386737994428304, + "learning_rate": 1.5232124893618228e-06, + "loss": 0.3566, + "step": 6323 + }, + { + "epoch": 1.5052656631165586, + "grad_norm": 0.341571020509951, + "learning_rate": 1.5218274553719198e-06, + "loss": 0.3182, + "step": 6324 + }, + { + "epoch": 1.5055036591896234, + "grad_norm": 0.3790719754813889, + "learning_rate": 1.5204429383245034e-06, + "loss": 0.2723, + "step": 6325 + }, + { + "epoch": 1.505741655262688, + "grad_norm": 0.3997178109826318, + "learning_rate": 1.5190589384253458e-06, + "loss": 0.2983, + "step": 6326 + }, + { + "epoch": 1.5059796513357528, + "grad_norm": 0.36641685229169446, + "learning_rate": 1.5176754558801448e-06, + "loss": 0.4068, + "step": 6327 + }, + { + "epoch": 1.5062176474088178, + "grad_norm": 0.34991820225784404, + "learning_rate": 1.516292490894518e-06, + "loss": 0.2743, + "step": 6328 + }, + { + "epoch": 1.5064556434818825, + "grad_norm": 0.35331131328827264, + "learning_rate": 1.514910043674011e-06, + "loss": 0.2919, + "step": 6329 + }, + { + "epoch": 1.5066936395549475, + "grad_norm": 0.37730212152272385, + "learning_rate": 1.5135281144240888e-06, + "loss": 0.3632, + "step": 6330 + }, + { + "epoch": 1.5069316356280122, + "grad_norm": 0.3634842374888263, + "learning_rate": 1.5121467033501403e-06, + "loss": 0.3557, + "step": 6331 + }, + { + "epoch": 1.507169631701077, + "grad_norm": 0.37734949621666297, + "learning_rate": 1.5107658106574764e-06, + "loss": 0.2589, + "step": 6332 + }, + { + "epoch": 1.5074076277741417, + "grad_norm": 0.3625755897347857, + "learning_rate": 1.5093854365513327e-06, + "loss": 0.3306, + "step": 6333 + }, + { + "epoch": 1.5076456238472065, + "grad_norm": 0.3709257192300279, + "learning_rate": 1.5080055812368654e-06, + "loss": 0.3503, + "step": 6334 + }, + { + "epoch": 1.5078836199202712, + "grad_norm": 0.3697093486108935, + "learning_rate": 1.5066262449191576e-06, + "loss": 0.2987, + "step": 6335 + }, + { + "epoch": 1.5081216159933362, + "grad_norm": 0.3975396214243995, + "learning_rate": 1.5052474278032125e-06, + "loss": 0.2942, + "step": 6336 + }, + { + "epoch": 1.508359612066401, + "grad_norm": 0.36248809899098255, + "learning_rate": 1.5038691300939552e-06, + "loss": 0.3452, + "step": 6337 + }, + { + "epoch": 1.5085976081394659, + "grad_norm": 0.37957957692819194, + "learning_rate": 1.5024913519962353e-06, + "loss": 0.38, + "step": 6338 + }, + { + "epoch": 1.5088356042125306, + "grad_norm": 0.36619032508426047, + "learning_rate": 1.5011140937148243e-06, + "loss": 0.2854, + "step": 6339 + }, + { + "epoch": 1.5090736002855953, + "grad_norm": 0.36939842494901753, + "learning_rate": 1.4997373554544176e-06, + "loss": 0.2817, + "step": 6340 + }, + { + "epoch": 1.50931159635866, + "grad_norm": 0.3996275043396052, + "learning_rate": 1.4983611374196323e-06, + "loss": 0.3731, + "step": 6341 + }, + { + "epoch": 1.5095495924317248, + "grad_norm": 0.37179348365865195, + "learning_rate": 1.4969854398150069e-06, + "loss": 0.3166, + "step": 6342 + }, + { + "epoch": 1.5097875885047896, + "grad_norm": 0.4072043189526779, + "learning_rate": 1.4956102628450065e-06, + "loss": 0.2694, + "step": 6343 + }, + { + "epoch": 1.5100255845778545, + "grad_norm": 0.41059367411920217, + "learning_rate": 1.4942356067140162e-06, + "loss": 0.327, + "step": 6344 + }, + { + "epoch": 1.5102635806509193, + "grad_norm": 0.3933303176194039, + "learning_rate": 1.4928614716263428e-06, + "loss": 0.3873, + "step": 6345 + }, + { + "epoch": 1.5105015767239842, + "grad_norm": 0.38198672911717163, + "learning_rate": 1.4914878577862168e-06, + "loss": 0.2796, + "step": 6346 + }, + { + "epoch": 1.510739572797049, + "grad_norm": 0.5557986276562655, + "learning_rate": 1.490114765397792e-06, + "loss": 0.3061, + "step": 6347 + }, + { + "epoch": 1.5109775688701137, + "grad_norm": 0.40633469354717716, + "learning_rate": 1.4887421946651436e-06, + "loss": 0.3823, + "step": 6348 + }, + { + "epoch": 1.5112155649431784, + "grad_norm": 0.40308923354303, + "learning_rate": 1.487370145792269e-06, + "loss": 0.3266, + "step": 6349 + }, + { + "epoch": 1.5114535610162432, + "grad_norm": 0.38544791649854926, + "learning_rate": 1.4859986189830894e-06, + "loss": 0.2822, + "step": 6350 + }, + { + "epoch": 1.511691557089308, + "grad_norm": 0.3987203797284687, + "learning_rate": 1.4846276144414468e-06, + "loss": 0.3075, + "step": 6351 + }, + { + "epoch": 1.5119295531623729, + "grad_norm": 0.6071459510037844, + "learning_rate": 1.483257132371107e-06, + "loss": 0.4008, + "step": 6352 + }, + { + "epoch": 1.5121675492354376, + "grad_norm": 0.3813619772349361, + "learning_rate": 1.4818871729757572e-06, + "loss": 0.277, + "step": 6353 + }, + { + "epoch": 1.5124055453085026, + "grad_norm": 0.39614277089555516, + "learning_rate": 1.4805177364590078e-06, + "loss": 0.3156, + "step": 6354 + }, + { + "epoch": 1.5126435413815673, + "grad_norm": 0.360231458449255, + "learning_rate": 1.4791488230243907e-06, + "loss": 0.3506, + "step": 6355 + }, + { + "epoch": 1.512881537454632, + "grad_norm": 0.3694141952246499, + "learning_rate": 1.4777804328753582e-06, + "loss": 0.3909, + "step": 6356 + }, + { + "epoch": 1.5131195335276968, + "grad_norm": 0.3556081977766239, + "learning_rate": 1.47641256621529e-06, + "loss": 0.2827, + "step": 6357 + }, + { + "epoch": 1.5133575296007615, + "grad_norm": 0.38860260107924, + "learning_rate": 1.4750452232474843e-06, + "loss": 0.3373, + "step": 6358 + }, + { + "epoch": 1.5135955256738263, + "grad_norm": 0.38185389209509235, + "learning_rate": 1.4736784041751617e-06, + "loss": 0.4104, + "step": 6359 + }, + { + "epoch": 1.5138335217468912, + "grad_norm": 0.35687668113180804, + "learning_rate": 1.4723121092014654e-06, + "loss": 0.2966, + "step": 6360 + }, + { + "epoch": 1.514071517819956, + "grad_norm": 0.3771926399784858, + "learning_rate": 1.4709463385294586e-06, + "loss": 0.294, + "step": 6361 + }, + { + "epoch": 1.514309513893021, + "grad_norm": 0.39400534017538535, + "learning_rate": 1.4695810923621323e-06, + "loss": 0.3273, + "step": 6362 + }, + { + "epoch": 1.5145475099660857, + "grad_norm": 0.36830931162825825, + "learning_rate": 1.4682163709023934e-06, + "loss": 0.389, + "step": 6363 + }, + { + "epoch": 1.5147855060391504, + "grad_norm": 0.3829512181500474, + "learning_rate": 1.4668521743530745e-06, + "loss": 0.3118, + "step": 6364 + }, + { + "epoch": 1.5150235021122151, + "grad_norm": 0.38703933279454056, + "learning_rate": 1.465488502916928e-06, + "loss": 0.3094, + "step": 6365 + }, + { + "epoch": 1.5152614981852799, + "grad_norm": 0.3730048469300558, + "learning_rate": 1.4641253567966302e-06, + "loss": 0.38, + "step": 6366 + }, + { + "epoch": 1.5154994942583446, + "grad_norm": 0.3774200036872977, + "learning_rate": 1.462762736194777e-06, + "loss": 0.3355, + "step": 6367 + }, + { + "epoch": 1.5157374903314096, + "grad_norm": 0.4263601312122989, + "learning_rate": 1.4614006413138882e-06, + "loss": 0.2627, + "step": 6368 + }, + { + "epoch": 1.5159754864044743, + "grad_norm": 0.45738040178098543, + "learning_rate": 1.460039072356405e-06, + "loss": 0.3157, + "step": 6369 + }, + { + "epoch": 1.516213482477539, + "grad_norm": 0.3827099888179909, + "learning_rate": 1.4586780295246888e-06, + "loss": 0.3668, + "step": 6370 + }, + { + "epoch": 1.516451478550604, + "grad_norm": 0.3839304092663823, + "learning_rate": 1.457317513021027e-06, + "loss": 0.2815, + "step": 6371 + }, + { + "epoch": 1.5166894746236688, + "grad_norm": 0.3874574711568108, + "learning_rate": 1.455957523047624e-06, + "loss": 0.292, + "step": 6372 + }, + { + "epoch": 1.5169274706967335, + "grad_norm": 0.365985707188615, + "learning_rate": 1.454598059806609e-06, + "loss": 0.368, + "step": 6373 + }, + { + "epoch": 1.5171654667697982, + "grad_norm": 0.3664109124443364, + "learning_rate": 1.4532391235000316e-06, + "loss": 0.339, + "step": 6374 + }, + { + "epoch": 1.517403462842863, + "grad_norm": 0.3873829121823865, + "learning_rate": 1.4518807143298625e-06, + "loss": 0.2742, + "step": 6375 + }, + { + "epoch": 1.5176414589159277, + "grad_norm": 0.36086287426960706, + "learning_rate": 1.4505228324979954e-06, + "loss": 0.3181, + "step": 6376 + }, + { + "epoch": 1.5178794549889927, + "grad_norm": 0.3800615640914873, + "learning_rate": 1.449165478206246e-06, + "loss": 0.3744, + "step": 6377 + }, + { + "epoch": 1.5181174510620574, + "grad_norm": 0.34268205116914047, + "learning_rate": 1.447808651656349e-06, + "loss": 0.3002, + "step": 6378 + }, + { + "epoch": 1.5183554471351224, + "grad_norm": 0.4034392600733182, + "learning_rate": 1.4464523530499636e-06, + "loss": 0.2793, + "step": 6379 + }, + { + "epoch": 1.5185934432081871, + "grad_norm": 0.3778878775280351, + "learning_rate": 1.4450965825886693e-06, + "loss": 0.3479, + "step": 6380 + }, + { + "epoch": 1.5188314392812519, + "grad_norm": 0.3622997863722037, + "learning_rate": 1.4437413404739669e-06, + "loss": 0.3375, + "step": 6381 + }, + { + "epoch": 1.5190694353543166, + "grad_norm": 0.3806285094389011, + "learning_rate": 1.442386626907279e-06, + "loss": 0.2532, + "step": 6382 + }, + { + "epoch": 1.5193074314273813, + "grad_norm": 0.3702090170524379, + "learning_rate": 1.4410324420899475e-06, + "loss": 0.3195, + "step": 6383 + }, + { + "epoch": 1.519545427500446, + "grad_norm": 0.3715020539789907, + "learning_rate": 1.4396787862232413e-06, + "loss": 0.3655, + "step": 6384 + }, + { + "epoch": 1.519783423573511, + "grad_norm": 0.3683352719161142, + "learning_rate": 1.438325659508346e-06, + "loss": 0.2953, + "step": 6385 + }, + { + "epoch": 1.5200214196465758, + "grad_norm": 0.3796246049843194, + "learning_rate": 1.436973062146369e-06, + "loss": 0.2857, + "step": 6386 + }, + { + "epoch": 1.5202594157196407, + "grad_norm": 0.37688288858107755, + "learning_rate": 1.4356209943383386e-06, + "loss": 0.3469, + "step": 6387 + }, + { + "epoch": 1.5204974117927055, + "grad_norm": 0.37028336062287065, + "learning_rate": 1.4342694562852084e-06, + "loss": 0.3181, + "step": 6388 + }, + { + "epoch": 1.5207354078657702, + "grad_norm": 0.39109691442662825, + "learning_rate": 1.432918448187849e-06, + "loss": 0.2942, + "step": 6389 + }, + { + "epoch": 1.520973403938835, + "grad_norm": 0.44854038551840963, + "learning_rate": 1.4315679702470537e-06, + "loss": 0.3434, + "step": 6390 + }, + { + "epoch": 1.5212114000118997, + "grad_norm": 0.41884702455758605, + "learning_rate": 1.430218022663536e-06, + "loss": 0.3763, + "step": 6391 + }, + { + "epoch": 1.5214493960849644, + "grad_norm": 0.3594762787773168, + "learning_rate": 1.428868605637933e-06, + "loss": 0.325, + "step": 6392 + }, + { + "epoch": 1.5216873921580294, + "grad_norm": 0.4335927443090939, + "learning_rate": 1.4275197193708007e-06, + "loss": 0.2827, + "step": 6393 + }, + { + "epoch": 1.5219253882310941, + "grad_norm": 0.39709453363546243, + "learning_rate": 1.4261713640626168e-06, + "loss": 0.3365, + "step": 6394 + }, + { + "epoch": 1.522163384304159, + "grad_norm": 0.3747147101012874, + "learning_rate": 1.4248235399137805e-06, + "loss": 0.3943, + "step": 6395 + }, + { + "epoch": 1.5224013803772238, + "grad_norm": 0.48060494905748885, + "learning_rate": 1.4234762471246116e-06, + "loss": 0.3053, + "step": 6396 + }, + { + "epoch": 1.5226393764502886, + "grad_norm": 0.3908845745235385, + "learning_rate": 1.4221294858953499e-06, + "loss": 0.3409, + "step": 6397 + }, + { + "epoch": 1.5228773725233533, + "grad_norm": 0.3658107584700326, + "learning_rate": 1.4207832564261603e-06, + "loss": 0.3652, + "step": 6398 + }, + { + "epoch": 1.523115368596418, + "grad_norm": 0.3781266672514177, + "learning_rate": 1.419437558917125e-06, + "loss": 0.334, + "step": 6399 + }, + { + "epoch": 1.5233533646694828, + "grad_norm": 0.3934768910170049, + "learning_rate": 1.4180923935682467e-06, + "loss": 0.2721, + "step": 6400 + }, + { + "epoch": 1.5235913607425478, + "grad_norm": 0.42820638390694693, + "learning_rate": 1.4167477605794505e-06, + "loss": 0.2985, + "step": 6401 + }, + { + "epoch": 1.5238293568156125, + "grad_norm": 0.40845089003089985, + "learning_rate": 1.4154036601505834e-06, + "loss": 0.3593, + "step": 6402 + }, + { + "epoch": 1.5240673528886775, + "grad_norm": 0.39510953166289925, + "learning_rate": 1.4140600924814101e-06, + "loss": 0.3068, + "step": 6403 + }, + { + "epoch": 1.5243053489617422, + "grad_norm": 0.39306339830594605, + "learning_rate": 1.4127170577716193e-06, + "loss": 0.2853, + "step": 6404 + }, + { + "epoch": 1.524543345034807, + "grad_norm": 0.3633843389560684, + "learning_rate": 1.4113745562208191e-06, + "loss": 0.368, + "step": 6405 + }, + { + "epoch": 1.5247813411078717, + "grad_norm": 0.3681166528899588, + "learning_rate": 1.4100325880285381e-06, + "loss": 0.3689, + "step": 6406 + }, + { + "epoch": 1.5250193371809364, + "grad_norm": 0.3597587077669697, + "learning_rate": 1.4086911533942254e-06, + "loss": 0.2705, + "step": 6407 + }, + { + "epoch": 1.5252573332540011, + "grad_norm": 0.3678376723681916, + "learning_rate": 1.4073502525172528e-06, + "loss": 0.302, + "step": 6408 + }, + { + "epoch": 1.525495329327066, + "grad_norm": 0.3967644755270021, + "learning_rate": 1.4060098855969102e-06, + "loss": 0.3791, + "step": 6409 + }, + { + "epoch": 1.5257333254001308, + "grad_norm": 0.3765325293672744, + "learning_rate": 1.4046700528324082e-06, + "loss": 0.2981, + "step": 6410 + }, + { + "epoch": 1.5259713214731958, + "grad_norm": 0.36491847328820715, + "learning_rate": 1.403330754422882e-06, + "loss": 0.2766, + "step": 6411 + }, + { + "epoch": 1.5262093175462605, + "grad_norm": 0.38299479011732535, + "learning_rate": 1.401991990567383e-06, + "loss": 0.3489, + "step": 6412 + }, + { + "epoch": 1.5264473136193253, + "grad_norm": 0.38079523523420805, + "learning_rate": 1.4006537614648846e-06, + "loss": 0.3457, + "step": 6413 + }, + { + "epoch": 1.52668530969239, + "grad_norm": 0.3893545607621366, + "learning_rate": 1.3993160673142791e-06, + "loss": 0.2929, + "step": 6414 + }, + { + "epoch": 1.5269233057654548, + "grad_norm": 0.3650551166262176, + "learning_rate": 1.3979789083143847e-06, + "loss": 0.3031, + "step": 6415 + }, + { + "epoch": 1.5271613018385195, + "grad_norm": 0.370501273331385, + "learning_rate": 1.3966422846639338e-06, + "loss": 0.3539, + "step": 6416 + }, + { + "epoch": 1.5273992979115845, + "grad_norm": 0.37724908299456045, + "learning_rate": 1.3953061965615822e-06, + "loss": 0.3347, + "step": 6417 + }, + { + "epoch": 1.5276372939846492, + "grad_norm": 0.4060598296506883, + "learning_rate": 1.3939706442059054e-06, + "loss": 0.253, + "step": 6418 + }, + { + "epoch": 1.5278752900577142, + "grad_norm": 0.435691084317172, + "learning_rate": 1.3926356277954001e-06, + "loss": 0.3471, + "step": 6419 + }, + { + "epoch": 1.528113286130779, + "grad_norm": 0.38356363994640225, + "learning_rate": 1.391301147528482e-06, + "loss": 0.3906, + "step": 6420 + }, + { + "epoch": 1.5283512822038436, + "grad_norm": 0.3809047541613839, + "learning_rate": 1.389967203603489e-06, + "loss": 0.3001, + "step": 6421 + }, + { + "epoch": 1.5285892782769084, + "grad_norm": 0.40431301095479877, + "learning_rate": 1.388633796218677e-06, + "loss": 0.2865, + "step": 6422 + }, + { + "epoch": 1.5288272743499731, + "grad_norm": 0.3637907972443383, + "learning_rate": 1.3873009255722236e-06, + "loss": 0.3513, + "step": 6423 + }, + { + "epoch": 1.5290652704230379, + "grad_norm": 0.36132364029306596, + "learning_rate": 1.3859685918622269e-06, + "loss": 0.3441, + "step": 6424 + }, + { + "epoch": 1.5293032664961028, + "grad_norm": 0.3913993625800555, + "learning_rate": 1.3846367952867025e-06, + "loss": 0.2711, + "step": 6425 + }, + { + "epoch": 1.5295412625691676, + "grad_norm": 0.3707148982517722, + "learning_rate": 1.3833055360435916e-06, + "loss": 0.308, + "step": 6426 + }, + { + "epoch": 1.5297792586422325, + "grad_norm": 0.3729561418753125, + "learning_rate": 1.3819748143307509e-06, + "loss": 0.3893, + "step": 6427 + }, + { + "epoch": 1.5300172547152973, + "grad_norm": 0.4422339804101922, + "learning_rate": 1.3806446303459586e-06, + "loss": 0.2895, + "step": 6428 + }, + { + "epoch": 1.530255250788362, + "grad_norm": 0.3578722524768168, + "learning_rate": 1.3793149842869125e-06, + "loss": 0.3015, + "step": 6429 + }, + { + "epoch": 1.5304932468614267, + "grad_norm": 0.39833175086792916, + "learning_rate": 1.377985876351231e-06, + "loss": 0.3476, + "step": 6430 + }, + { + "epoch": 1.5307312429344915, + "grad_norm": 0.3670466020585641, + "learning_rate": 1.376657306736453e-06, + "loss": 0.3009, + "step": 6431 + }, + { + "epoch": 1.5309692390075562, + "grad_norm": 0.3668993481646055, + "learning_rate": 1.375329275640036e-06, + "loss": 0.2879, + "step": 6432 + }, + { + "epoch": 1.5312072350806212, + "grad_norm": 0.3760947174886567, + "learning_rate": 1.3740017832593588e-06, + "loss": 0.3278, + "step": 6433 + }, + { + "epoch": 1.531445231153686, + "grad_norm": 0.42040674402359185, + "learning_rate": 1.3726748297917196e-06, + "loss": 0.4024, + "step": 6434 + }, + { + "epoch": 1.5316832272267509, + "grad_norm": 0.35338872181053815, + "learning_rate": 1.3713484154343366e-06, + "loss": 0.3243, + "step": 6435 + }, + { + "epoch": 1.5319212232998156, + "grad_norm": 0.36677960202123283, + "learning_rate": 1.370022540384347e-06, + "loss": 0.2743, + "step": 6436 + }, + { + "epoch": 1.5321592193728804, + "grad_norm": 0.39245535596015746, + "learning_rate": 1.368697204838808e-06, + "loss": 0.3666, + "step": 6437 + }, + { + "epoch": 1.532397215445945, + "grad_norm": 0.4062599071898869, + "learning_rate": 1.3673724089947005e-06, + "loss": 0.3457, + "step": 6438 + }, + { + "epoch": 1.5326352115190098, + "grad_norm": 0.406451461182181, + "learning_rate": 1.366048153048919e-06, + "loss": 0.2789, + "step": 6439 + }, + { + "epoch": 1.5328732075920746, + "grad_norm": 0.41876146445384105, + "learning_rate": 1.364724437198282e-06, + "loss": 0.2907, + "step": 6440 + }, + { + "epoch": 1.5331112036651395, + "grad_norm": 0.3781185900759409, + "learning_rate": 1.3634012616395249e-06, + "loss": 0.3733, + "step": 6441 + }, + { + "epoch": 1.5333491997382043, + "grad_norm": 0.355167285944409, + "learning_rate": 1.3620786265693065e-06, + "loss": 0.2928, + "step": 6442 + }, + { + "epoch": 1.5335871958112692, + "grad_norm": 0.37322147215613916, + "learning_rate": 1.360756532184202e-06, + "loss": 0.2858, + "step": 6443 + }, + { + "epoch": 1.533825191884334, + "grad_norm": 0.3630088334381094, + "learning_rate": 1.3594349786807075e-06, + "loss": 0.3354, + "step": 6444 + }, + { + "epoch": 1.5340631879573987, + "grad_norm": 0.37666334694230696, + "learning_rate": 1.3581139662552384e-06, + "loss": 0.3768, + "step": 6445 + }, + { + "epoch": 1.5343011840304634, + "grad_norm": 0.3646570924309373, + "learning_rate": 1.3567934951041295e-06, + "loss": 0.3052, + "step": 6446 + }, + { + "epoch": 1.5345391801035282, + "grad_norm": 0.40326392321455273, + "learning_rate": 1.3554735654236362e-06, + "loss": 0.3308, + "step": 6447 + }, + { + "epoch": 1.534777176176593, + "grad_norm": 0.3811017852408969, + "learning_rate": 1.354154177409932e-06, + "loss": 0.3621, + "step": 6448 + }, + { + "epoch": 1.5350151722496579, + "grad_norm": 0.3847249344728611, + "learning_rate": 1.3528353312591113e-06, + "loss": 0.3481, + "step": 6449 + }, + { + "epoch": 1.5352531683227226, + "grad_norm": 0.40310178526788526, + "learning_rate": 1.3515170271671869e-06, + "loss": 0.2535, + "step": 6450 + }, + { + "epoch": 1.5354911643957876, + "grad_norm": 0.4142458710891034, + "learning_rate": 1.350199265330091e-06, + "loss": 0.3009, + "step": 6451 + }, + { + "epoch": 1.5357291604688523, + "grad_norm": 0.41762456749772203, + "learning_rate": 1.3488820459436746e-06, + "loss": 0.3848, + "step": 6452 + }, + { + "epoch": 1.535967156541917, + "grad_norm": 0.3546073693357267, + "learning_rate": 1.3475653692037121e-06, + "loss": 0.2921, + "step": 6453 + }, + { + "epoch": 1.5362051526149818, + "grad_norm": 0.40900895467204246, + "learning_rate": 1.3462492353058925e-06, + "loss": 0.2866, + "step": 6454 + }, + { + "epoch": 1.5364431486880465, + "grad_norm": 0.3839566221518419, + "learning_rate": 1.3449336444458262e-06, + "loss": 0.351, + "step": 6455 + }, + { + "epoch": 1.5366811447611113, + "grad_norm": 0.39130970196546844, + "learning_rate": 1.3436185968190424e-06, + "loss": 0.3163, + "step": 6456 + }, + { + "epoch": 1.5369191408341762, + "grad_norm": 0.3741648167638271, + "learning_rate": 1.3423040926209891e-06, + "loss": 0.2968, + "step": 6457 + }, + { + "epoch": 1.537157136907241, + "grad_norm": 0.37882647569434985, + "learning_rate": 1.3409901320470353e-06, + "loss": 0.3218, + "step": 6458 + }, + { + "epoch": 1.537395132980306, + "grad_norm": 0.40274124899921193, + "learning_rate": 1.3396767152924667e-06, + "loss": 0.3784, + "step": 6459 + }, + { + "epoch": 1.5376331290533707, + "grad_norm": 0.3978813236044458, + "learning_rate": 1.3383638425524909e-06, + "loss": 0.3023, + "step": 6460 + }, + { + "epoch": 1.5378711251264354, + "grad_norm": 0.38122434614979217, + "learning_rate": 1.3370515140222322e-06, + "loss": 0.2824, + "step": 6461 + }, + { + "epoch": 1.5381091211995002, + "grad_norm": 0.3763755588799363, + "learning_rate": 1.3357397298967356e-06, + "loss": 0.358, + "step": 6462 + }, + { + "epoch": 1.538347117272565, + "grad_norm": 0.37502057107131376, + "learning_rate": 1.3344284903709647e-06, + "loss": 0.3488, + "step": 6463 + }, + { + "epoch": 1.5385851133456296, + "grad_norm": 0.3851990727799265, + "learning_rate": 1.3331177956398007e-06, + "loss": 0.2791, + "step": 6464 + }, + { + "epoch": 1.5388231094186946, + "grad_norm": 0.4611773475919031, + "learning_rate": 1.3318076458980472e-06, + "loss": 0.3228, + "step": 6465 + }, + { + "epoch": 1.5390611054917593, + "grad_norm": 0.4023609502927905, + "learning_rate": 1.330498041340425e-06, + "loss": 0.3756, + "step": 6466 + }, + { + "epoch": 1.5392991015648243, + "grad_norm": 0.36953198203324916, + "learning_rate": 1.3291889821615728e-06, + "loss": 0.3158, + "step": 6467 + }, + { + "epoch": 1.539537097637889, + "grad_norm": 0.39443173542625404, + "learning_rate": 1.3278804685560476e-06, + "loss": 0.2908, + "step": 6468 + }, + { + "epoch": 1.5397750937109538, + "grad_norm": 0.3713126115765714, + "learning_rate": 1.3265725007183306e-06, + "loss": 0.3272, + "step": 6469 + }, + { + "epoch": 1.5400130897840185, + "grad_norm": 0.38677506574041237, + "learning_rate": 1.3252650788428156e-06, + "loss": 0.3867, + "step": 6470 + }, + { + "epoch": 1.5402510858570833, + "grad_norm": 0.3478181826041587, + "learning_rate": 1.3239582031238191e-06, + "loss": 0.2743, + "step": 6471 + }, + { + "epoch": 1.540489081930148, + "grad_norm": 0.3406479266488313, + "learning_rate": 1.3226518737555744e-06, + "loss": 0.3027, + "step": 6472 + }, + { + "epoch": 1.540727078003213, + "grad_norm": 0.39762042117594537, + "learning_rate": 1.321346090932234e-06, + "loss": 0.3722, + "step": 6473 + }, + { + "epoch": 1.5409650740762777, + "grad_norm": 0.37089894103258814, + "learning_rate": 1.3200408548478705e-06, + "loss": 0.3375, + "step": 6474 + }, + { + "epoch": 1.5412030701493427, + "grad_norm": 0.3965123393190166, + "learning_rate": 1.318736165696473e-06, + "loss": 0.2923, + "step": 6475 + }, + { + "epoch": 1.5414410662224074, + "grad_norm": 0.39001895537688247, + "learning_rate": 1.3174320236719524e-06, + "loss": 0.3209, + "step": 6476 + }, + { + "epoch": 1.5416790622954721, + "grad_norm": 0.39041593273975445, + "learning_rate": 1.3161284289681347e-06, + "loss": 0.4009, + "step": 6477 + }, + { + "epoch": 1.5419170583685369, + "grad_norm": 0.3757834666815788, + "learning_rate": 1.3148253817787671e-06, + "loss": 0.3065, + "step": 6478 + }, + { + "epoch": 1.5421550544416016, + "grad_norm": 0.3850973543387022, + "learning_rate": 1.3135228822975148e-06, + "loss": 0.2928, + "step": 6479 + }, + { + "epoch": 1.5423930505146664, + "grad_norm": 0.38795859626153806, + "learning_rate": 1.3122209307179602e-06, + "loss": 0.3493, + "step": 6480 + }, + { + "epoch": 1.5426310465877313, + "grad_norm": 0.3781000254192901, + "learning_rate": 1.3109195272336073e-06, + "loss": 0.3487, + "step": 6481 + }, + { + "epoch": 1.542869042660796, + "grad_norm": 0.42706004699561867, + "learning_rate": 1.3096186720378762e-06, + "loss": 0.2718, + "step": 6482 + }, + { + "epoch": 1.543107038733861, + "grad_norm": 0.4346631528907676, + "learning_rate": 1.308318365324106e-06, + "loss": 0.3182, + "step": 6483 + }, + { + "epoch": 1.5433450348069258, + "grad_norm": 0.3671857753557718, + "learning_rate": 1.3070186072855551e-06, + "loss": 0.3902, + "step": 6484 + }, + { + "epoch": 1.5435830308799905, + "grad_norm": 0.3756868316351957, + "learning_rate": 1.3057193981153986e-06, + "loss": 0.3181, + "step": 6485 + }, + { + "epoch": 1.5438210269530552, + "grad_norm": 0.3921792506523292, + "learning_rate": 1.304420738006732e-06, + "loss": 0.2774, + "step": 6486 + }, + { + "epoch": 1.54405902302612, + "grad_norm": 0.3998042980004656, + "learning_rate": 1.3031226271525677e-06, + "loss": 0.3231, + "step": 6487 + }, + { + "epoch": 1.5442970190991847, + "grad_norm": 0.40991845946955335, + "learning_rate": 1.3018250657458377e-06, + "loss": 0.3567, + "step": 6488 + }, + { + "epoch": 1.5445350151722497, + "grad_norm": 0.3688970315279264, + "learning_rate": 1.3005280539793908e-06, + "loss": 0.2933, + "step": 6489 + }, + { + "epoch": 1.5447730112453144, + "grad_norm": 0.37617632189473976, + "learning_rate": 1.2992315920459958e-06, + "loss": 0.3308, + "step": 6490 + }, + { + "epoch": 1.5450110073183794, + "grad_norm": 0.3884799402239865, + "learning_rate": 1.2979356801383376e-06, + "loss": 0.3745, + "step": 6491 + }, + { + "epoch": 1.545249003391444, + "grad_norm": 0.3545462251747847, + "learning_rate": 1.296640318449023e-06, + "loss": 0.3218, + "step": 6492 + }, + { + "epoch": 1.5454869994645088, + "grad_norm": 0.3910225423977597, + "learning_rate": 1.295345507170574e-06, + "loss": 0.2844, + "step": 6493 + }, + { + "epoch": 1.5457249955375736, + "grad_norm": 0.39920924863182033, + "learning_rate": 1.2940512464954313e-06, + "loss": 0.3485, + "step": 6494 + }, + { + "epoch": 1.5459629916106383, + "grad_norm": 0.3725385615151038, + "learning_rate": 1.2927575366159529e-06, + "loss": 0.3894, + "step": 6495 + }, + { + "epoch": 1.546200987683703, + "grad_norm": 0.35822929977999596, + "learning_rate": 1.291464377724419e-06, + "loss": 0.3083, + "step": 6496 + }, + { + "epoch": 1.546438983756768, + "grad_norm": 0.394890044687265, + "learning_rate": 1.290171770013023e-06, + "loss": 0.3128, + "step": 6497 + }, + { + "epoch": 1.5466769798298328, + "grad_norm": 0.36206101450795275, + "learning_rate": 1.2888797136738785e-06, + "loss": 0.3694, + "step": 6498 + }, + { + "epoch": 1.5469149759028977, + "grad_norm": 0.35891273995138384, + "learning_rate": 1.2875882088990172e-06, + "loss": 0.3243, + "step": 6499 + }, + { + "epoch": 1.5471529719759625, + "grad_norm": 0.39198989856107563, + "learning_rate": 1.2862972558803893e-06, + "loss": 0.2723, + "step": 6500 + }, + { + "epoch": 1.5473909680490272, + "grad_norm": 0.4058267955634325, + "learning_rate": 1.285006854809861e-06, + "loss": 0.3337, + "step": 6501 + }, + { + "epoch": 1.547628964122092, + "grad_norm": 0.38126495691581086, + "learning_rate": 1.2837170058792192e-06, + "loss": 0.374, + "step": 6502 + }, + { + "epoch": 1.5478669601951567, + "grad_norm": 0.37306157896006814, + "learning_rate": 1.2824277092801663e-06, + "loss": 0.2942, + "step": 6503 + }, + { + "epoch": 1.5481049562682214, + "grad_norm": 0.3821763777566332, + "learning_rate": 1.2811389652043244e-06, + "loss": 0.2898, + "step": 6504 + }, + { + "epoch": 1.5483429523412864, + "grad_norm": 0.38236792478815307, + "learning_rate": 1.2798507738432326e-06, + "loss": 0.3272, + "step": 6505 + }, + { + "epoch": 1.5485809484143511, + "grad_norm": 0.3844001205504733, + "learning_rate": 1.2785631353883476e-06, + "loss": 0.3261, + "step": 6506 + }, + { + "epoch": 1.548818944487416, + "grad_norm": 0.3847625058188862, + "learning_rate": 1.2772760500310444e-06, + "loss": 0.3074, + "step": 6507 + }, + { + "epoch": 1.5490569405604808, + "grad_norm": 0.4101461881227883, + "learning_rate": 1.2759895179626147e-06, + "loss": 0.3132, + "step": 6508 + }, + { + "epoch": 1.5492949366335456, + "grad_norm": 0.3736053815442064, + "learning_rate": 1.2747035393742718e-06, + "loss": 0.3962, + "step": 6509 + }, + { + "epoch": 1.5495329327066103, + "grad_norm": 0.36963876676347174, + "learning_rate": 1.2734181144571422e-06, + "loss": 0.2847, + "step": 6510 + }, + { + "epoch": 1.549770928779675, + "grad_norm": 0.42277471628828484, + "learning_rate": 1.272133243402272e-06, + "loss": 0.2858, + "step": 6511 + }, + { + "epoch": 1.5500089248527398, + "grad_norm": 0.35688249609379674, + "learning_rate": 1.2708489264006246e-06, + "loss": 0.3236, + "step": 6512 + }, + { + "epoch": 1.5502469209258047, + "grad_norm": 0.3855474832519827, + "learning_rate": 1.2695651636430816e-06, + "loss": 0.3387, + "step": 6513 + }, + { + "epoch": 1.5504849169988695, + "grad_norm": 0.36968276951753154, + "learning_rate": 1.2682819553204424e-06, + "loss": 0.2947, + "step": 6514 + }, + { + "epoch": 1.5507229130719344, + "grad_norm": 0.399145007942629, + "learning_rate": 1.2669993016234222e-06, + "loss": 0.3258, + "step": 6515 + }, + { + "epoch": 1.5509609091449992, + "grad_norm": 0.39491923210379837, + "learning_rate": 1.2657172027426556e-06, + "loss": 0.3859, + "step": 6516 + }, + { + "epoch": 1.551198905218064, + "grad_norm": 0.3614434517449473, + "learning_rate": 1.264435658868695e-06, + "loss": 0.2889, + "step": 6517 + }, + { + "epoch": 1.5514369012911287, + "grad_norm": 0.3977718991319386, + "learning_rate": 1.2631546701920073e-06, + "loss": 0.2863, + "step": 6518 + }, + { + "epoch": 1.5516748973641934, + "grad_norm": 0.493812725827731, + "learning_rate": 1.2618742369029819e-06, + "loss": 0.3471, + "step": 6519 + }, + { + "epoch": 1.5519128934372581, + "grad_norm": 0.396858346322872, + "learning_rate": 1.2605943591919218e-06, + "loss": 0.3643, + "step": 6520 + }, + { + "epoch": 1.552150889510323, + "grad_norm": 0.4006368823356171, + "learning_rate": 1.2593150372490482e-06, + "loss": 0.2958, + "step": 6521 + }, + { + "epoch": 1.5523888855833878, + "grad_norm": 0.3651666797447085, + "learning_rate": 1.258036271264499e-06, + "loss": 0.2965, + "step": 6522 + }, + { + "epoch": 1.5526268816564528, + "grad_norm": 0.3609444928045324, + "learning_rate": 1.2567580614283331e-06, + "loss": 0.3453, + "step": 6523 + }, + { + "epoch": 1.5528648777295175, + "grad_norm": 0.35866895527736053, + "learning_rate": 1.255480407930522e-06, + "loss": 0.3647, + "step": 6524 + }, + { + "epoch": 1.5531028738025823, + "grad_norm": 0.3792035303974508, + "learning_rate": 1.254203310960958e-06, + "loss": 0.3127, + "step": 6525 + }, + { + "epoch": 1.553340869875647, + "grad_norm": 0.38922874090947, + "learning_rate": 1.2529267707094483e-06, + "loss": 0.3213, + "step": 6526 + }, + { + "epoch": 1.5535788659487118, + "grad_norm": 0.38138547100943027, + "learning_rate": 1.2516507873657184e-06, + "loss": 0.3742, + "step": 6527 + }, + { + "epoch": 1.5538168620217765, + "grad_norm": 0.3818945794940767, + "learning_rate": 1.2503753611194113e-06, + "loss": 0.2942, + "step": 6528 + }, + { + "epoch": 1.5540548580948415, + "grad_norm": 0.3825385830982494, + "learning_rate": 1.2491004921600868e-06, + "loss": 0.2755, + "step": 6529 + }, + { + "epoch": 1.5542928541679062, + "grad_norm": 0.3998069859296653, + "learning_rate": 1.247826180677222e-06, + "loss": 0.3644, + "step": 6530 + }, + { + "epoch": 1.5545308502409712, + "grad_norm": 0.35741830458922036, + "learning_rate": 1.2465524268602107e-06, + "loss": 0.3762, + "step": 6531 + }, + { + "epoch": 1.554768846314036, + "grad_norm": 0.3777877889719805, + "learning_rate": 1.2452792308983646e-06, + "loss": 0.2921, + "step": 6532 + }, + { + "epoch": 1.5550068423871006, + "grad_norm": 0.3803043899122809, + "learning_rate": 1.2440065929809125e-06, + "loss": 0.3158, + "step": 6533 + }, + { + "epoch": 1.5552448384601654, + "grad_norm": 0.36524440591889934, + "learning_rate": 1.2427345132969997e-06, + "loss": 0.3631, + "step": 6534 + }, + { + "epoch": 1.55548283453323, + "grad_norm": 0.38177834729615784, + "learning_rate": 1.2414629920356862e-06, + "loss": 0.2983, + "step": 6535 + }, + { + "epoch": 1.5557208306062948, + "grad_norm": 0.3903386365033605, + "learning_rate": 1.2401920293859558e-06, + "loss": 0.2908, + "step": 6536 + }, + { + "epoch": 1.5559588266793598, + "grad_norm": 0.37546580053400613, + "learning_rate": 1.238921625536703e-06, + "loss": 0.3295, + "step": 6537 + }, + { + "epoch": 1.5561968227524245, + "grad_norm": 0.37598661605642897, + "learning_rate": 1.2376517806767412e-06, + "loss": 0.377, + "step": 6538 + }, + { + "epoch": 1.5564348188254895, + "grad_norm": 0.38383478247802943, + "learning_rate": 1.2363824949948012e-06, + "loss": 0.2775, + "step": 6539 + }, + { + "epoch": 1.5566728148985542, + "grad_norm": 0.3763981357885576, + "learning_rate": 1.2351137686795294e-06, + "loss": 0.3129, + "step": 6540 + }, + { + "epoch": 1.556910810971619, + "grad_norm": 0.40852446358280264, + "learning_rate": 1.2338456019194906e-06, + "loss": 0.3695, + "step": 6541 + }, + { + "epoch": 1.5571488070446837, + "grad_norm": 0.3695119241962966, + "learning_rate": 1.2325779949031653e-06, + "loss": 0.3202, + "step": 6542 + }, + { + "epoch": 1.5573868031177485, + "grad_norm": 0.44568416123855525, + "learning_rate": 1.231310947818951e-06, + "loss": 0.2751, + "step": 6543 + }, + { + "epoch": 1.5576247991908132, + "grad_norm": 0.39409919509614555, + "learning_rate": 1.2300444608551636e-06, + "loss": 0.3013, + "step": 6544 + }, + { + "epoch": 1.5578627952638782, + "grad_norm": 0.3742902470173008, + "learning_rate": 1.228778534200032e-06, + "loss": 0.381, + "step": 6545 + }, + { + "epoch": 1.558100791336943, + "grad_norm": 0.32924222634354267, + "learning_rate": 1.2275131680417069e-06, + "loss": 0.3009, + "step": 6546 + }, + { + "epoch": 1.5583387874100079, + "grad_norm": 0.3667969007920045, + "learning_rate": 1.2262483625682514e-06, + "loss": 0.2832, + "step": 6547 + }, + { + "epoch": 1.5585767834830726, + "grad_norm": 0.3894529324837924, + "learning_rate": 1.2249841179676481e-06, + "loss": 0.4056, + "step": 6548 + }, + { + "epoch": 1.5588147795561373, + "grad_norm": 0.36782423847690526, + "learning_rate": 1.2237204344277925e-06, + "loss": 0.3222, + "step": 6549 + }, + { + "epoch": 1.559052775629202, + "grad_norm": 0.3724914266382319, + "learning_rate": 1.2224573121365024e-06, + "loss": 0.2706, + "step": 6550 + }, + { + "epoch": 1.5592907717022668, + "grad_norm": 0.418089388727327, + "learning_rate": 1.2211947512815076e-06, + "loss": 0.3172, + "step": 6551 + }, + { + "epoch": 1.5595287677753316, + "grad_norm": 0.402369301328807, + "learning_rate": 1.2199327520504562e-06, + "loss": 0.378, + "step": 6552 + }, + { + "epoch": 1.5597667638483965, + "grad_norm": 0.42301913940674296, + "learning_rate": 1.218671314630912e-06, + "loss": 0.3409, + "step": 6553 + }, + { + "epoch": 1.5600047599214613, + "grad_norm": 0.38961962135190426, + "learning_rate": 1.2174104392103558e-06, + "loss": 0.2681, + "step": 6554 + }, + { + "epoch": 1.5602427559945262, + "grad_norm": 0.4054494324727994, + "learning_rate": 1.2161501259761855e-06, + "loss": 0.3628, + "step": 6555 + }, + { + "epoch": 1.560480752067591, + "grad_norm": 0.3373988273851839, + "learning_rate": 1.2148903751157144e-06, + "loss": 0.3361, + "step": 6556 + }, + { + "epoch": 1.5607187481406557, + "grad_norm": 0.40537913952657784, + "learning_rate": 1.213631186816172e-06, + "loss": 0.2743, + "step": 6557 + }, + { + "epoch": 1.5609567442137204, + "grad_norm": 0.38794190280853064, + "learning_rate": 1.2123725612647064e-06, + "loss": 0.327, + "step": 6558 + }, + { + "epoch": 1.5611947402867852, + "grad_norm": 0.38511311663261943, + "learning_rate": 1.2111144986483792e-06, + "loss": 0.3744, + "step": 6559 + }, + { + "epoch": 1.56143273635985, + "grad_norm": 0.3674215063731254, + "learning_rate": 1.2098569991541698e-06, + "loss": 0.3067, + "step": 6560 + }, + { + "epoch": 1.5616707324329149, + "grad_norm": 0.40312804407214897, + "learning_rate": 1.2086000629689743e-06, + "loss": 0.3093, + "step": 6561 + }, + { + "epoch": 1.5619087285059796, + "grad_norm": 0.3762916570398983, + "learning_rate": 1.2073436902796038e-06, + "loss": 0.3063, + "step": 6562 + }, + { + "epoch": 1.5621467245790446, + "grad_norm": 0.36072105739082216, + "learning_rate": 1.2060878812727854e-06, + "loss": 0.3568, + "step": 6563 + }, + { + "epoch": 1.5623847206521093, + "grad_norm": 0.354835431200888, + "learning_rate": 1.2048326361351658e-06, + "loss": 0.2666, + "step": 6564 + }, + { + "epoch": 1.562622716725174, + "grad_norm": 0.4104035256897569, + "learning_rate": 1.203577955053305e-06, + "loss": 0.3111, + "step": 6565 + }, + { + "epoch": 1.5628607127982388, + "grad_norm": 0.3512367007914632, + "learning_rate": 1.2023238382136787e-06, + "loss": 0.3713, + "step": 6566 + }, + { + "epoch": 1.5630987088713035, + "grad_norm": 0.3646761606995802, + "learning_rate": 1.20107028580268e-06, + "loss": 0.2961, + "step": 6567 + }, + { + "epoch": 1.5633367049443683, + "grad_norm": 0.35406751256411023, + "learning_rate": 1.199817298006618e-06, + "loss": 0.2711, + "step": 6568 + }, + { + "epoch": 1.5635747010174332, + "grad_norm": 0.4043510614910653, + "learning_rate": 1.1985648750117179e-06, + "loss": 0.3151, + "step": 6569 + }, + { + "epoch": 1.563812697090498, + "grad_norm": 0.3791673629759354, + "learning_rate": 1.1973130170041198e-06, + "loss": 0.4031, + "step": 6570 + }, + { + "epoch": 1.564050693163563, + "grad_norm": 0.3842915787993255, + "learning_rate": 1.1960617241698813e-06, + "loss": 0.2776, + "step": 6571 + }, + { + "epoch": 1.5642886892366277, + "grad_norm": 0.4119663617052321, + "learning_rate": 1.1948109966949745e-06, + "loss": 0.3098, + "step": 6572 + }, + { + "epoch": 1.5645266853096924, + "grad_norm": 0.3592669784242863, + "learning_rate": 1.1935608347652906e-06, + "loss": 0.3729, + "step": 6573 + }, + { + "epoch": 1.5647646813827571, + "grad_norm": 0.373131383360226, + "learning_rate": 1.192311238566633e-06, + "loss": 0.3484, + "step": 6574 + }, + { + "epoch": 1.5650026774558219, + "grad_norm": 0.3833981711887363, + "learning_rate": 1.1910622082847234e-06, + "loss": 0.2392, + "step": 6575 + }, + { + "epoch": 1.5652406735288866, + "grad_norm": 0.34671002508415816, + "learning_rate": 1.1898137441051982e-06, + "loss": 0.3245, + "step": 6576 + }, + { + "epoch": 1.5654786696019516, + "grad_norm": 0.3632990316751966, + "learning_rate": 1.1885658462136085e-06, + "loss": 0.4118, + "step": 6577 + }, + { + "epoch": 1.5657166656750163, + "grad_norm": 0.35796363702341216, + "learning_rate": 1.1873185147954263e-06, + "loss": 0.2738, + "step": 6578 + }, + { + "epoch": 1.5659546617480813, + "grad_norm": 0.3527329899780401, + "learning_rate": 1.1860717500360336e-06, + "loss": 0.2714, + "step": 6579 + }, + { + "epoch": 1.566192657821146, + "grad_norm": 0.41834053159356094, + "learning_rate": 1.1848255521207308e-06, + "loss": 0.3636, + "step": 6580 + }, + { + "epoch": 1.5664306538942108, + "grad_norm": 0.39616933163013157, + "learning_rate": 1.1835799212347343e-06, + "loss": 0.3605, + "step": 6581 + }, + { + "epoch": 1.5666686499672755, + "grad_norm": 0.37466230103769954, + "learning_rate": 1.1823348575631749e-06, + "loss": 0.2658, + "step": 6582 + }, + { + "epoch": 1.5669066460403402, + "grad_norm": 0.39963898406246245, + "learning_rate": 1.1810903612911001e-06, + "loss": 0.3117, + "step": 6583 + }, + { + "epoch": 1.567144642113405, + "grad_norm": 0.3910180293713378, + "learning_rate": 1.179846432603473e-06, + "loss": 0.3908, + "step": 6584 + }, + { + "epoch": 1.56738263818647, + "grad_norm": 0.3978347412470743, + "learning_rate": 1.178603071685172e-06, + "loss": 0.3303, + "step": 6585 + }, + { + "epoch": 1.5676206342595347, + "grad_norm": 0.4194169551330718, + "learning_rate": 1.1773602787209915e-06, + "loss": 0.2969, + "step": 6586 + }, + { + "epoch": 1.5678586303325996, + "grad_norm": 0.4026400405007115, + "learning_rate": 1.1761180538956408e-06, + "loss": 0.3348, + "step": 6587 + }, + { + "epoch": 1.5680966264056644, + "grad_norm": 0.3774639635057926, + "learning_rate": 1.1748763973937454e-06, + "loss": 0.3725, + "step": 6588 + }, + { + "epoch": 1.5683346224787291, + "grad_norm": 0.37626958019160905, + "learning_rate": 1.1736353093998471e-06, + "loss": 0.3056, + "step": 6589 + }, + { + "epoch": 1.5685726185517939, + "grad_norm": 0.3938142965328492, + "learning_rate": 1.1723947900983995e-06, + "loss": 0.2885, + "step": 6590 + }, + { + "epoch": 1.5688106146248586, + "grad_norm": 0.3988797825778564, + "learning_rate": 1.1711548396737782e-06, + "loss": 0.3832, + "step": 6591 + }, + { + "epoch": 1.5690486106979233, + "grad_norm": 0.3627863918712503, + "learning_rate": 1.1699154583102684e-06, + "loss": 0.2937, + "step": 6592 + }, + { + "epoch": 1.5692866067709883, + "grad_norm": 0.37559495224162986, + "learning_rate": 1.1686766461920735e-06, + "loss": 0.3193, + "step": 6593 + }, + { + "epoch": 1.569524602844053, + "grad_norm": 0.3864117859142998, + "learning_rate": 1.1674384035033115e-06, + "loss": 0.3263, + "step": 6594 + }, + { + "epoch": 1.569762598917118, + "grad_norm": 0.3796716007438366, + "learning_rate": 1.1662007304280153e-06, + "loss": 0.3871, + "step": 6595 + }, + { + "epoch": 1.5700005949901827, + "grad_norm": 0.3728167400610051, + "learning_rate": 1.1649636271501346e-06, + "loss": 0.2965, + "step": 6596 + }, + { + "epoch": 1.5702385910632475, + "grad_norm": 0.3719186929240788, + "learning_rate": 1.1637270938535334e-06, + "loss": 0.298, + "step": 6597 + }, + { + "epoch": 1.5704765871363122, + "grad_norm": 0.37701153994508685, + "learning_rate": 1.1624911307219904e-06, + "loss": 0.353, + "step": 6598 + }, + { + "epoch": 1.570714583209377, + "grad_norm": 0.37159186789859183, + "learning_rate": 1.1612557379391993e-06, + "loss": 0.3372, + "step": 6599 + }, + { + "epoch": 1.5709525792824417, + "grad_norm": 0.3868110826031627, + "learning_rate": 1.1600209156887732e-06, + "loss": 0.2635, + "step": 6600 + }, + { + "epoch": 1.5711905753555067, + "grad_norm": 0.3807624942737973, + "learning_rate": 1.158786664154235e-06, + "loss": 0.3153, + "step": 6601 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.409345535932182, + "learning_rate": 1.1575529835190264e-06, + "loss": 0.3898, + "step": 6602 + }, + { + "epoch": 1.5716665675016364, + "grad_norm": 0.3879086278028145, + "learning_rate": 1.1563198739665017e-06, + "loss": 0.2732, + "step": 6603 + }, + { + "epoch": 1.571904563574701, + "grad_norm": 0.40901261697805064, + "learning_rate": 1.1550873356799297e-06, + "loss": 0.2728, + "step": 6604 + }, + { + "epoch": 1.5721425596477658, + "grad_norm": 0.3791446730046189, + "learning_rate": 1.1538553688425002e-06, + "loss": 0.365, + "step": 6605 + }, + { + "epoch": 1.5723805557208306, + "grad_norm": 0.3662473834687694, + "learning_rate": 1.1526239736373118e-06, + "loss": 0.3693, + "step": 6606 + }, + { + "epoch": 1.5726185517938953, + "grad_norm": 0.3745667664354613, + "learning_rate": 1.1513931502473807e-06, + "loss": 0.2745, + "step": 6607 + }, + { + "epoch": 1.57285654786696, + "grad_norm": 0.3758851474104438, + "learning_rate": 1.1501628988556368e-06, + "loss": 0.3148, + "step": 6608 + }, + { + "epoch": 1.573094543940025, + "grad_norm": 0.40537121108015156, + "learning_rate": 1.1489332196449271e-06, + "loss": 0.3888, + "step": 6609 + }, + { + "epoch": 1.5733325400130898, + "grad_norm": 0.3957681771676492, + "learning_rate": 1.1477041127980115e-06, + "loss": 0.3328, + "step": 6610 + }, + { + "epoch": 1.5735705360861547, + "grad_norm": 0.4123620794488522, + "learning_rate": 1.146475578497566e-06, + "loss": 0.2634, + "step": 6611 + }, + { + "epoch": 1.5738085321592195, + "grad_norm": 0.4057487023110117, + "learning_rate": 1.1452476169261812e-06, + "loss": 0.341, + "step": 6612 + }, + { + "epoch": 1.5740465282322842, + "grad_norm": 0.39959292930319834, + "learning_rate": 1.144020228266362e-06, + "loss": 0.3469, + "step": 6613 + }, + { + "epoch": 1.574284524305349, + "grad_norm": 0.34870011465820333, + "learning_rate": 1.1427934127005296e-06, + "loss": 0.284, + "step": 6614 + }, + { + "epoch": 1.5745225203784137, + "grad_norm": 0.41846723355624454, + "learning_rate": 1.1415671704110193e-06, + "loss": 0.2993, + "step": 6615 + }, + { + "epoch": 1.5747605164514784, + "grad_norm": 0.4035291904243755, + "learning_rate": 1.1403415015800801e-06, + "loss": 0.3623, + "step": 6616 + }, + { + "epoch": 1.5749985125245434, + "grad_norm": 0.3751084543135697, + "learning_rate": 1.1391164063898768e-06, + "loss": 0.307, + "step": 6617 + }, + { + "epoch": 1.575236508597608, + "grad_norm": 0.3848206656891993, + "learning_rate": 1.137891885022488e-06, + "loss": 0.2872, + "step": 6618 + }, + { + "epoch": 1.575474504670673, + "grad_norm": 0.36097442997024354, + "learning_rate": 1.1366679376599104e-06, + "loss": 0.3278, + "step": 6619 + }, + { + "epoch": 1.5757125007437378, + "grad_norm": 0.3676488968525861, + "learning_rate": 1.1354445644840516e-06, + "loss": 0.3784, + "step": 6620 + }, + { + "epoch": 1.5759504968168025, + "grad_norm": 0.3757067541931903, + "learning_rate": 1.1342217656767352e-06, + "loss": 0.294, + "step": 6621 + }, + { + "epoch": 1.5761884928898673, + "grad_norm": 0.4248595314868793, + "learning_rate": 1.1329995414196986e-06, + "loss": 0.3104, + "step": 6622 + }, + { + "epoch": 1.576426488962932, + "grad_norm": 0.39022645581178395, + "learning_rate": 1.1317778918945954e-06, + "loss": 0.3596, + "step": 6623 + }, + { + "epoch": 1.5766644850359968, + "grad_norm": 0.37146786462426146, + "learning_rate": 1.1305568172829928e-06, + "loss": 0.3432, + "step": 6624 + }, + { + "epoch": 1.5769024811090617, + "grad_norm": 0.3957415682588482, + "learning_rate": 1.1293363177663724e-06, + "loss": 0.2897, + "step": 6625 + }, + { + "epoch": 1.5771404771821265, + "grad_norm": 0.37407424529688554, + "learning_rate": 1.1281163935261298e-06, + "loss": 0.3319, + "step": 6626 + }, + { + "epoch": 1.5773784732551914, + "grad_norm": 0.5040169043587555, + "learning_rate": 1.1268970447435772e-06, + "loss": 0.4073, + "step": 6627 + }, + { + "epoch": 1.5776164693282562, + "grad_norm": 0.3494340725959216, + "learning_rate": 1.1256782715999409e-06, + "loss": 0.2889, + "step": 6628 + }, + { + "epoch": 1.577854465401321, + "grad_norm": 0.3592534966406634, + "learning_rate": 1.124460074276359e-06, + "loss": 0.2832, + "step": 6629 + }, + { + "epoch": 1.5780924614743856, + "grad_norm": 0.4051953408007675, + "learning_rate": 1.1232424529538865e-06, + "loss": 0.3587, + "step": 6630 + }, + { + "epoch": 1.5783304575474504, + "grad_norm": 0.3796932548423937, + "learning_rate": 1.1220254078134919e-06, + "loss": 0.3441, + "step": 6631 + }, + { + "epoch": 1.5785684536205151, + "grad_norm": 0.38402120878401813, + "learning_rate": 1.1208089390360566e-06, + "loss": 0.2712, + "step": 6632 + }, + { + "epoch": 1.57880644969358, + "grad_norm": 0.3928063655919691, + "learning_rate": 1.1195930468023808e-06, + "loss": 0.3301, + "step": 6633 + }, + { + "epoch": 1.5790444457666448, + "grad_norm": 0.3791650075605935, + "learning_rate": 1.1183777312931748e-06, + "loss": 0.3712, + "step": 6634 + }, + { + "epoch": 1.5792824418397098, + "grad_norm": 0.38088863826024594, + "learning_rate": 1.1171629926890648e-06, + "loss": 0.3067, + "step": 6635 + }, + { + "epoch": 1.5795204379127745, + "grad_norm": 0.4035315968898168, + "learning_rate": 1.1159488311705914e-06, + "loss": 0.2967, + "step": 6636 + }, + { + "epoch": 1.5797584339858393, + "grad_norm": 0.39150229184884205, + "learning_rate": 1.1147352469182077e-06, + "loss": 0.3563, + "step": 6637 + }, + { + "epoch": 1.579996430058904, + "grad_norm": 0.3945787408533381, + "learning_rate": 1.1135222401122835e-06, + "loss": 0.3706, + "step": 6638 + }, + { + "epoch": 1.5802344261319687, + "grad_norm": 0.3824380546742477, + "learning_rate": 1.112309810933101e-06, + "loss": 0.3121, + "step": 6639 + }, + { + "epoch": 1.5804724222050335, + "grad_norm": 0.38647053605937204, + "learning_rate": 1.111097959560858e-06, + "loss": 0.2656, + "step": 6640 + }, + { + "epoch": 1.5807104182780984, + "grad_norm": 0.39387059548636993, + "learning_rate": 1.109886686175664e-06, + "loss": 0.3632, + "step": 6641 + }, + { + "epoch": 1.5809484143511632, + "grad_norm": 0.38920881826537457, + "learning_rate": 1.1086759909575462e-06, + "loss": 0.2883, + "step": 6642 + }, + { + "epoch": 1.5811864104242281, + "grad_norm": 0.39014824674716264, + "learning_rate": 1.1074658740864425e-06, + "loss": 0.2812, + "step": 6643 + }, + { + "epoch": 1.5814244064972929, + "grad_norm": 0.4100290719394353, + "learning_rate": 1.1062563357422063e-06, + "loss": 0.3405, + "step": 6644 + }, + { + "epoch": 1.5816624025703576, + "grad_norm": 0.379281769403233, + "learning_rate": 1.1050473761046038e-06, + "loss": 0.3978, + "step": 6645 + }, + { + "epoch": 1.5819003986434224, + "grad_norm": 0.3843586382669758, + "learning_rate": 1.1038389953533197e-06, + "loss": 0.2815, + "step": 6646 + }, + { + "epoch": 1.582138394716487, + "grad_norm": 0.37029427889320665, + "learning_rate": 1.102631193667947e-06, + "loss": 0.2894, + "step": 6647 + }, + { + "epoch": 1.5823763907895518, + "grad_norm": 0.363151439990877, + "learning_rate": 1.1014239712279946e-06, + "loss": 0.3463, + "step": 6648 + }, + { + "epoch": 1.5826143868626168, + "grad_norm": 0.3499094972795553, + "learning_rate": 1.100217328212887e-06, + "loss": 0.3472, + "step": 6649 + }, + { + "epoch": 1.5828523829356815, + "grad_norm": 0.37693181662588054, + "learning_rate": 1.0990112648019595e-06, + "loss": 0.2838, + "step": 6650 + }, + { + "epoch": 1.5830903790087465, + "grad_norm": 0.4029662389391535, + "learning_rate": 1.0978057811744646e-06, + "loss": 0.3276, + "step": 6651 + }, + { + "epoch": 1.5833283750818112, + "grad_norm": 0.4147641724374556, + "learning_rate": 1.096600877509566e-06, + "loss": 0.3633, + "step": 6652 + }, + { + "epoch": 1.583566371154876, + "grad_norm": 0.3672722330153522, + "learning_rate": 1.0953965539863409e-06, + "loss": 0.2932, + "step": 6653 + }, + { + "epoch": 1.5838043672279407, + "grad_norm": 0.3893345967844479, + "learning_rate": 1.0941928107837852e-06, + "loss": 0.2635, + "step": 6654 + }, + { + "epoch": 1.5840423633010055, + "grad_norm": 0.39379673310513574, + "learning_rate": 1.0929896480808022e-06, + "loss": 0.372, + "step": 6655 + }, + { + "epoch": 1.5842803593740702, + "grad_norm": 0.3763842161032918, + "learning_rate": 1.0917870660562125e-06, + "loss": 0.3338, + "step": 6656 + }, + { + "epoch": 1.5845183554471352, + "grad_norm": 0.42973871300326605, + "learning_rate": 1.090585064888749e-06, + "loss": 0.2549, + "step": 6657 + }, + { + "epoch": 1.5847563515202, + "grad_norm": 0.3692281668495606, + "learning_rate": 1.0893836447570595e-06, + "loss": 0.3091, + "step": 6658 + }, + { + "epoch": 1.5849943475932649, + "grad_norm": 0.4105528991354411, + "learning_rate": 1.0881828058397025e-06, + "loss": 0.3591, + "step": 6659 + }, + { + "epoch": 1.5852323436663296, + "grad_norm": 0.3506145732459315, + "learning_rate": 1.0869825483151563e-06, + "loss": 0.3442, + "step": 6660 + }, + { + "epoch": 1.5854703397393943, + "grad_norm": 0.38609627382705203, + "learning_rate": 1.0857828723618063e-06, + "loss": 0.2698, + "step": 6661 + }, + { + "epoch": 1.585708335812459, + "grad_norm": 0.39582571031621394, + "learning_rate": 1.084583778157955e-06, + "loss": 0.3305, + "step": 6662 + }, + { + "epoch": 1.5859463318855238, + "grad_norm": 0.4773499221532177, + "learning_rate": 1.0833852658818167e-06, + "loss": 0.3659, + "step": 6663 + }, + { + "epoch": 1.5861843279585885, + "grad_norm": 0.387549231721086, + "learning_rate": 1.0821873357115203e-06, + "loss": 0.2837, + "step": 6664 + }, + { + "epoch": 1.5864223240316535, + "grad_norm": 0.3684761502468937, + "learning_rate": 1.0809899878251078e-06, + "loss": 0.3226, + "step": 6665 + }, + { + "epoch": 1.5866603201047182, + "grad_norm": 0.4189733561178494, + "learning_rate": 1.0797932224005348e-06, + "loss": 0.3902, + "step": 6666 + }, + { + "epoch": 1.5868983161777832, + "grad_norm": 0.3603259475366153, + "learning_rate": 1.0785970396156698e-06, + "loss": 0.3178, + "step": 6667 + }, + { + "epoch": 1.587136312250848, + "grad_norm": 0.37689639588625584, + "learning_rate": 1.0774014396482962e-06, + "loss": 0.2916, + "step": 6668 + }, + { + "epoch": 1.5873743083239127, + "grad_norm": 0.38804261264274786, + "learning_rate": 1.0762064226761087e-06, + "loss": 0.3446, + "step": 6669 + }, + { + "epoch": 1.5876123043969774, + "grad_norm": 0.36867385287936777, + "learning_rate": 1.075011988876717e-06, + "loss": 0.3833, + "step": 6670 + }, + { + "epoch": 1.5878503004700422, + "grad_norm": 0.7076069958920685, + "learning_rate": 1.073818138427643e-06, + "loss": 0.3034, + "step": 6671 + }, + { + "epoch": 1.588088296543107, + "grad_norm": 0.37858638589930405, + "learning_rate": 1.0726248715063226e-06, + "loss": 0.2957, + "step": 6672 + }, + { + "epoch": 1.5883262926161719, + "grad_norm": 0.3866414028696196, + "learning_rate": 1.0714321882901036e-06, + "loss": 0.3701, + "step": 6673 + }, + { + "epoch": 1.5885642886892366, + "grad_norm": 0.39497519093426026, + "learning_rate": 1.070240088956251e-06, + "loss": 0.3026, + "step": 6674 + }, + { + "epoch": 1.5888022847623016, + "grad_norm": 0.37372560276384903, + "learning_rate": 1.069048573681939e-06, + "loss": 0.2698, + "step": 6675 + }, + { + "epoch": 1.5890402808353663, + "grad_norm": 0.4009991162457296, + "learning_rate": 1.0678576426442554e-06, + "loss": 0.3123, + "step": 6676 + }, + { + "epoch": 1.589278276908431, + "grad_norm": 0.39506909405987567, + "learning_rate": 1.0666672960202028e-06, + "loss": 0.3798, + "step": 6677 + }, + { + "epoch": 1.5895162729814958, + "grad_norm": 0.3524389433062458, + "learning_rate": 1.0654775339866963e-06, + "loss": 0.2702, + "step": 6678 + }, + { + "epoch": 1.5897542690545605, + "grad_norm": 0.38272669069986726, + "learning_rate": 1.0642883567205635e-06, + "loss": 0.28, + "step": 6679 + }, + { + "epoch": 1.5899922651276253, + "grad_norm": 0.40293966435907647, + "learning_rate": 1.0630997643985447e-06, + "loss": 0.3393, + "step": 6680 + }, + { + "epoch": 1.5902302612006902, + "grad_norm": 0.3587703813764963, + "learning_rate": 1.061911757197296e-06, + "loss": 0.3498, + "step": 6681 + }, + { + "epoch": 1.590468257273755, + "grad_norm": 0.38543802046712716, + "learning_rate": 1.0607243352933838e-06, + "loss": 0.2903, + "step": 6682 + }, + { + "epoch": 1.59070625334682, + "grad_norm": 0.4301083842782692, + "learning_rate": 1.0595374988632884e-06, + "loss": 0.3182, + "step": 6683 + }, + { + "epoch": 1.5909442494198847, + "grad_norm": 0.40034262172534774, + "learning_rate": 1.058351248083403e-06, + "loss": 0.3613, + "step": 6684 + }, + { + "epoch": 1.5911822454929494, + "grad_norm": 0.382673941851363, + "learning_rate": 1.0571655831300342e-06, + "loss": 0.326, + "step": 6685 + }, + { + "epoch": 1.5914202415660141, + "grad_norm": 0.41204904076387927, + "learning_rate": 1.0559805041794002e-06, + "loss": 0.2805, + "step": 6686 + }, + { + "epoch": 1.5916582376390789, + "grad_norm": 0.38921959287793867, + "learning_rate": 1.0547960114076328e-06, + "loss": 0.3514, + "step": 6687 + }, + { + "epoch": 1.5918962337121436, + "grad_norm": 0.40306688375436, + "learning_rate": 1.053612104990779e-06, + "loss": 0.3668, + "step": 6688 + }, + { + "epoch": 1.5921342297852086, + "grad_norm": 0.4133630297518753, + "learning_rate": 1.052428785104795e-06, + "loss": 0.2802, + "step": 6689 + }, + { + "epoch": 1.5923722258582733, + "grad_norm": 0.3962023255094279, + "learning_rate": 1.051246051925552e-06, + "loss": 0.316, + "step": 6690 + }, + { + "epoch": 1.5926102219313383, + "grad_norm": 0.41861934380314547, + "learning_rate": 1.0500639056288325e-06, + "loss": 0.3799, + "step": 6691 + }, + { + "epoch": 1.592848218004403, + "grad_norm": 0.3766382477058585, + "learning_rate": 1.0488823463903341e-06, + "loss": 0.3612, + "step": 6692 + }, + { + "epoch": 1.5930862140774678, + "grad_norm": 0.3881147115564832, + "learning_rate": 1.0477013743856652e-06, + "loss": 0.274, + "step": 6693 + }, + { + "epoch": 1.5933242101505325, + "grad_norm": 0.3621383112707997, + "learning_rate": 1.0465209897903466e-06, + "loss": 0.3322, + "step": 6694 + }, + { + "epoch": 1.5935622062235972, + "grad_norm": 0.3636848700106123, + "learning_rate": 1.0453411927798136e-06, + "loss": 0.3889, + "step": 6695 + }, + { + "epoch": 1.593800202296662, + "grad_norm": 0.37473055964276525, + "learning_rate": 1.0441619835294125e-06, + "loss": 0.2783, + "step": 6696 + }, + { + "epoch": 1.594038198369727, + "grad_norm": 0.37206515088751896, + "learning_rate": 1.0429833622144037e-06, + "loss": 0.2903, + "step": 6697 + }, + { + "epoch": 1.5942761944427917, + "grad_norm": 0.39577591456358313, + "learning_rate": 1.0418053290099589e-06, + "loss": 0.3918, + "step": 6698 + }, + { + "epoch": 1.5945141905158566, + "grad_norm": 0.37165220358822043, + "learning_rate": 1.0406278840911632e-06, + "loss": 0.3808, + "step": 6699 + }, + { + "epoch": 1.5947521865889214, + "grad_norm": 0.3930220555144249, + "learning_rate": 1.0394510276330145e-06, + "loss": 0.2849, + "step": 6700 + }, + { + "epoch": 1.5949901826619861, + "grad_norm": 0.4069004575367408, + "learning_rate": 1.038274759810421e-06, + "loss": 0.3189, + "step": 6701 + }, + { + "epoch": 1.5952281787350509, + "grad_norm": 0.39687917691298413, + "learning_rate": 1.0370990807982073e-06, + "loss": 0.3705, + "step": 6702 + }, + { + "epoch": 1.5954661748081156, + "grad_norm": 0.3910720271352218, + "learning_rate": 1.0359239907711082e-06, + "loss": 0.2893, + "step": 6703 + }, + { + "epoch": 1.5957041708811803, + "grad_norm": 0.37980335830006934, + "learning_rate": 1.0347494899037703e-06, + "loss": 0.2815, + "step": 6704 + }, + { + "epoch": 1.5959421669542453, + "grad_norm": 0.39726445474059047, + "learning_rate": 1.033575578370754e-06, + "loss": 0.3508, + "step": 6705 + }, + { + "epoch": 1.59618016302731, + "grad_norm": 0.37676377876897194, + "learning_rate": 1.0324022563465307e-06, + "loss": 0.3562, + "step": 6706 + }, + { + "epoch": 1.596418159100375, + "grad_norm": 0.3593315132381354, + "learning_rate": 1.0312295240054853e-06, + "loss": 0.2756, + "step": 6707 + }, + { + "epoch": 1.5966561551734397, + "grad_norm": 0.37080438768334384, + "learning_rate": 1.030057381521916e-06, + "loss": 0.3206, + "step": 6708 + }, + { + "epoch": 1.5968941512465045, + "grad_norm": 0.37873106808738827, + "learning_rate": 1.028885829070032e-06, + "loss": 0.3971, + "step": 6709 + }, + { + "epoch": 1.5971321473195692, + "grad_norm": 0.3659434667845778, + "learning_rate": 1.027714866823954e-06, + "loss": 0.283, + "step": 6710 + }, + { + "epoch": 1.597370143392634, + "grad_norm": 0.3867687273653711, + "learning_rate": 1.0265444949577163e-06, + "loss": 0.2797, + "step": 6711 + }, + { + "epoch": 1.5976081394656987, + "grad_norm": 0.4072148084054436, + "learning_rate": 1.0253747136452657e-06, + "loss": 0.3506, + "step": 6712 + }, + { + "epoch": 1.5978461355387636, + "grad_norm": 0.3901721585652958, + "learning_rate": 1.0242055230604598e-06, + "loss": 0.3852, + "step": 6713 + }, + { + "epoch": 1.5980841316118284, + "grad_norm": 0.38077201173399056, + "learning_rate": 1.0230369233770688e-06, + "loss": 0.2825, + "step": 6714 + }, + { + "epoch": 1.5983221276848933, + "grad_norm": 0.34750351350090636, + "learning_rate": 1.0218689147687766e-06, + "loss": 0.318, + "step": 6715 + }, + { + "epoch": 1.598560123757958, + "grad_norm": 0.37793674402799227, + "learning_rate": 1.0207014974091788e-06, + "loss": 0.373, + "step": 6716 + }, + { + "epoch": 1.5987981198310228, + "grad_norm": 0.37210043250443864, + "learning_rate": 1.0195346714717813e-06, + "loss": 0.3131, + "step": 6717 + }, + { + "epoch": 1.5990361159040876, + "grad_norm": 0.397386731456454, + "learning_rate": 1.0183684371300035e-06, + "loss": 0.2892, + "step": 6718 + }, + { + "epoch": 1.5992741119771523, + "grad_norm": 0.3857199576264192, + "learning_rate": 1.0172027945571765e-06, + "loss": 0.3093, + "step": 6719 + }, + { + "epoch": 1.599512108050217, + "grad_norm": 0.3652664433334774, + "learning_rate": 1.0160377439265445e-06, + "loss": 0.3803, + "step": 6720 + }, + { + "epoch": 1.599750104123282, + "grad_norm": 0.39011567367020755, + "learning_rate": 1.014873285411262e-06, + "loss": 0.2906, + "step": 6721 + }, + { + "epoch": 1.5999881001963467, + "grad_norm": 0.4067204738621554, + "learning_rate": 1.013709419184396e-06, + "loss": 0.2948, + "step": 6722 + }, + { + "epoch": 1.6002260962694117, + "grad_norm": 0.38555422364472247, + "learning_rate": 1.0125461454189273e-06, + "loss": 0.3671, + "step": 6723 + }, + { + "epoch": 1.6004640923424764, + "grad_norm": 0.38611908239425363, + "learning_rate": 1.0113834642877457e-06, + "loss": 0.3296, + "step": 6724 + }, + { + "epoch": 1.6007020884155412, + "grad_norm": 0.4058794539643264, + "learning_rate": 1.0102213759636548e-06, + "loss": 0.2817, + "step": 6725 + }, + { + "epoch": 1.600940084488606, + "grad_norm": 0.38734559534856855, + "learning_rate": 1.0090598806193701e-06, + "loss": 0.33, + "step": 6726 + }, + { + "epoch": 1.6011780805616707, + "grad_norm": 0.393666212501604, + "learning_rate": 1.0078989784275183e-06, + "loss": 0.3874, + "step": 6727 + }, + { + "epoch": 1.6014160766347354, + "grad_norm": 0.36559390636596556, + "learning_rate": 1.0067386695606367e-06, + "loss": 0.27, + "step": 6728 + }, + { + "epoch": 1.6016540727078004, + "grad_norm": 0.409681509162357, + "learning_rate": 1.0055789541911788e-06, + "loss": 0.275, + "step": 6729 + }, + { + "epoch": 1.601892068780865, + "grad_norm": 0.4138944613457098, + "learning_rate": 1.004419832491505e-06, + "loss": 0.343, + "step": 6730 + }, + { + "epoch": 1.60213006485393, + "grad_norm": 0.3955901179511804, + "learning_rate": 1.0032613046338907e-06, + "loss": 0.3837, + "step": 6731 + }, + { + "epoch": 1.6023680609269948, + "grad_norm": 0.3840912144760111, + "learning_rate": 1.0021033707905202e-06, + "loss": 0.2799, + "step": 6732 + }, + { + "epoch": 1.6026060570000595, + "grad_norm": 0.3907610707868771, + "learning_rate": 1.0009460311334928e-06, + "loss": 0.319, + "step": 6733 + }, + { + "epoch": 1.6028440530731243, + "grad_norm": 0.38100921827655265, + "learning_rate": 9.99789285834815e-07, + "loss": 0.3672, + "step": 6734 + }, + { + "epoch": 1.603082049146189, + "grad_norm": 0.4088419649010157, + "learning_rate": 9.98633135066412e-07, + "loss": 0.3222, + "step": 6735 + }, + { + "epoch": 1.6033200452192538, + "grad_norm": 0.3801150461937333, + "learning_rate": 9.974775790001134e-07, + "loss": 0.2863, + "step": 6736 + }, + { + "epoch": 1.6035580412923187, + "grad_norm": 0.3695771436542104, + "learning_rate": 9.963226178076646e-07, + "loss": 0.3493, + "step": 6737 + }, + { + "epoch": 1.6037960373653835, + "grad_norm": 0.4646018757207895, + "learning_rate": 9.951682516607208e-07, + "loss": 0.3526, + "step": 6738 + }, + { + "epoch": 1.6040340334384484, + "grad_norm": 0.3739693404848989, + "learning_rate": 9.940144807308494e-07, + "loss": 0.2835, + "step": 6739 + }, + { + "epoch": 1.6042720295115132, + "grad_norm": 0.38989411473136254, + "learning_rate": 9.928613051895297e-07, + "loss": 0.3219, + "step": 6740 + }, + { + "epoch": 1.604510025584578, + "grad_norm": 0.39146856372760747, + "learning_rate": 9.91708725208152e-07, + "loss": 0.3897, + "step": 6741 + }, + { + "epoch": 1.6047480216576426, + "grad_norm": 0.35756410647769543, + "learning_rate": 9.90556740958017e-07, + "loss": 0.3037, + "step": 6742 + }, + { + "epoch": 1.6049860177307074, + "grad_norm": 0.41503456281085804, + "learning_rate": 9.894053526103397e-07, + "loss": 0.2837, + "step": 6743 + }, + { + "epoch": 1.605224013803772, + "grad_norm": 0.37911206000367126, + "learning_rate": 9.882545603362448e-07, + "loss": 0.3649, + "step": 6744 + }, + { + "epoch": 1.605462009876837, + "grad_norm": 0.39287333843325145, + "learning_rate": 9.87104364306768e-07, + "loss": 0.3827, + "step": 6745 + }, + { + "epoch": 1.6057000059499018, + "grad_norm": 0.3623004181552304, + "learning_rate": 9.859547646928568e-07, + "loss": 0.2571, + "step": 6746 + }, + { + "epoch": 1.6059380020229668, + "grad_norm": 0.3729145089679708, + "learning_rate": 9.848057616653705e-07, + "loss": 0.3106, + "step": 6747 + }, + { + "epoch": 1.6061759980960315, + "grad_norm": 0.39121154395488567, + "learning_rate": 9.83657355395079e-07, + "loss": 0.3548, + "step": 6748 + }, + { + "epoch": 1.6064139941690962, + "grad_norm": 0.3649057165068434, + "learning_rate": 9.82509546052664e-07, + "loss": 0.3453, + "step": 6749 + }, + { + "epoch": 1.606651990242161, + "grad_norm": 0.38056432571485566, + "learning_rate": 9.813623338087181e-07, + "loss": 0.2907, + "step": 6750 + }, + { + "epoch": 1.6068899863152257, + "grad_norm": 0.3773297281289237, + "learning_rate": 9.802157188337464e-07, + "loss": 0.3433, + "step": 6751 + }, + { + "epoch": 1.6071279823882905, + "grad_norm": 0.37897532559459307, + "learning_rate": 9.790697012981632e-07, + "loss": 0.3886, + "step": 6752 + }, + { + "epoch": 1.6073659784613554, + "grad_norm": 0.36839416215381326, + "learning_rate": 9.77924281372295e-07, + "loss": 0.2888, + "step": 6753 + }, + { + "epoch": 1.6076039745344202, + "grad_norm": 0.40572797254422144, + "learning_rate": 9.767794592263802e-07, + "loss": 0.2758, + "step": 6754 + }, + { + "epoch": 1.6078419706074851, + "grad_norm": 0.38690003338447304, + "learning_rate": 9.756352350305676e-07, + "loss": 0.3507, + "step": 6755 + }, + { + "epoch": 1.6080799666805499, + "grad_norm": 0.3633536684182977, + "learning_rate": 9.744916089549155e-07, + "loss": 0.3725, + "step": 6756 + }, + { + "epoch": 1.6083179627536146, + "grad_norm": 0.3430540268517774, + "learning_rate": 9.733485811693983e-07, + "loss": 0.2831, + "step": 6757 + }, + { + "epoch": 1.6085559588266793, + "grad_norm": 0.3967941777054526, + "learning_rate": 9.722061518438963e-07, + "loss": 0.3197, + "step": 6758 + }, + { + "epoch": 1.608793954899744, + "grad_norm": 0.3769154445985261, + "learning_rate": 9.71064321148203e-07, + "loss": 0.3843, + "step": 6759 + }, + { + "epoch": 1.6090319509728088, + "grad_norm": 0.36745266921746866, + "learning_rate": 9.699230892520222e-07, + "loss": 0.2952, + "step": 6760 + }, + { + "epoch": 1.6092699470458738, + "grad_norm": 0.3719289532826721, + "learning_rate": 9.687824563249687e-07, + "loss": 0.2952, + "step": 6761 + }, + { + "epoch": 1.6095079431189385, + "grad_norm": 0.3850579375106196, + "learning_rate": 9.67642422536571e-07, + "loss": 0.3314, + "step": 6762 + }, + { + "epoch": 1.6097459391920035, + "grad_norm": 0.36942355814013683, + "learning_rate": 9.665029880562655e-07, + "loss": 0.355, + "step": 6763 + }, + { + "epoch": 1.6099839352650682, + "grad_norm": 0.37525710772280524, + "learning_rate": 9.65364153053399e-07, + "loss": 0.2903, + "step": 6764 + }, + { + "epoch": 1.610221931338133, + "grad_norm": 0.37197742941684636, + "learning_rate": 9.64225917697232e-07, + "loss": 0.3024, + "step": 6765 + }, + { + "epoch": 1.6104599274111977, + "grad_norm": 0.3768595495711356, + "learning_rate": 9.630882821569338e-07, + "loss": 0.3852, + "step": 6766 + }, + { + "epoch": 1.6106979234842624, + "grad_norm": 0.35867128131233944, + "learning_rate": 9.61951246601585e-07, + "loss": 0.3093, + "step": 6767 + }, + { + "epoch": 1.6109359195573272, + "grad_norm": 0.40435567311806875, + "learning_rate": 9.608148112001774e-07, + "loss": 0.262, + "step": 6768 + }, + { + "epoch": 1.6111739156303921, + "grad_norm": 0.3786650690585147, + "learning_rate": 9.59678976121614e-07, + "loss": 0.319, + "step": 6769 + }, + { + "epoch": 1.6114119117034569, + "grad_norm": 0.3873536911631197, + "learning_rate": 9.58543741534706e-07, + "loss": 0.367, + "step": 6770 + }, + { + "epoch": 1.6116499077765218, + "grad_norm": 0.36824419357607824, + "learning_rate": 9.574091076081799e-07, + "loss": 0.2774, + "step": 6771 + }, + { + "epoch": 1.6118879038495866, + "grad_norm": 0.3891611443567881, + "learning_rate": 9.562750745106697e-07, + "loss": 0.2913, + "step": 6772 + }, + { + "epoch": 1.6121258999226513, + "grad_norm": 0.3995297635418241, + "learning_rate": 9.551416424107202e-07, + "loss": 0.3454, + "step": 6773 + }, + { + "epoch": 1.612363895995716, + "grad_norm": 0.41453803457475424, + "learning_rate": 9.54008811476787e-07, + "loss": 0.3433, + "step": 6774 + }, + { + "epoch": 1.6126018920687808, + "grad_norm": 0.39579544768901054, + "learning_rate": 9.528765818772379e-07, + "loss": 0.2877, + "step": 6775 + }, + { + "epoch": 1.6128398881418455, + "grad_norm": 0.3488915743792545, + "learning_rate": 9.517449537803497e-07, + "loss": 0.3048, + "step": 6776 + }, + { + "epoch": 1.6130778842149105, + "grad_norm": 0.38162237869505217, + "learning_rate": 9.506139273543108e-07, + "loss": 0.3919, + "step": 6777 + }, + { + "epoch": 1.6133158802879752, + "grad_norm": 0.40658809756542225, + "learning_rate": 9.494835027672189e-07, + "loss": 0.2859, + "step": 6778 + }, + { + "epoch": 1.6135538763610402, + "grad_norm": 0.4090597053962956, + "learning_rate": 9.483536801870835e-07, + "loss": 0.2859, + "step": 6779 + }, + { + "epoch": 1.613791872434105, + "grad_norm": 0.4162769342871607, + "learning_rate": 9.472244597818236e-07, + "loss": 0.3559, + "step": 6780 + }, + { + "epoch": 1.6140298685071697, + "grad_norm": 0.4127306882903097, + "learning_rate": 9.460958417192706e-07, + "loss": 0.3618, + "step": 6781 + }, + { + "epoch": 1.6142678645802344, + "grad_norm": 0.3705494652288955, + "learning_rate": 9.449678261671636e-07, + "loss": 0.2495, + "step": 6782 + }, + { + "epoch": 1.6145058606532992, + "grad_norm": 0.38615110793723045, + "learning_rate": 9.438404132931534e-07, + "loss": 0.3189, + "step": 6783 + }, + { + "epoch": 1.614743856726364, + "grad_norm": 0.4247897491555896, + "learning_rate": 9.427136032648038e-07, + "loss": 0.3723, + "step": 6784 + }, + { + "epoch": 1.6149818527994289, + "grad_norm": 0.38613015342079937, + "learning_rate": 9.415873962495847e-07, + "loss": 0.3214, + "step": 6785 + }, + { + "epoch": 1.6152198488724936, + "grad_norm": 0.38838589089115877, + "learning_rate": 9.40461792414879e-07, + "loss": 0.3093, + "step": 6786 + }, + { + "epoch": 1.6154578449455586, + "grad_norm": 0.4134809114242341, + "learning_rate": 9.393367919279794e-07, + "loss": 0.3258, + "step": 6787 + }, + { + "epoch": 1.6156958410186233, + "grad_norm": 0.3657710453341731, + "learning_rate": 9.382123949560868e-07, + "loss": 0.3751, + "step": 6788 + }, + { + "epoch": 1.615933837091688, + "grad_norm": 0.367777502716404, + "learning_rate": 9.370886016663178e-07, + "loss": 0.284, + "step": 6789 + }, + { + "epoch": 1.6161718331647528, + "grad_norm": 0.37224518047407673, + "learning_rate": 9.35965412225694e-07, + "loss": 0.3233, + "step": 6790 + }, + { + "epoch": 1.6164098292378175, + "grad_norm": 0.3679948065415524, + "learning_rate": 9.348428268011489e-07, + "loss": 0.38, + "step": 6791 + }, + { + "epoch": 1.6166478253108822, + "grad_norm": 0.368226197018974, + "learning_rate": 9.337208455595271e-07, + "loss": 0.3483, + "step": 6792 + }, + { + "epoch": 1.6168858213839472, + "grad_norm": 0.38742952029982053, + "learning_rate": 9.325994686675826e-07, + "loss": 0.2773, + "step": 6793 + }, + { + "epoch": 1.617123817457012, + "grad_norm": 0.4010018284261614, + "learning_rate": 9.314786962919798e-07, + "loss": 0.3711, + "step": 6794 + }, + { + "epoch": 1.617361813530077, + "grad_norm": 0.39828012262925994, + "learning_rate": 9.303585285992933e-07, + "loss": 0.3747, + "step": 6795 + }, + { + "epoch": 1.6175998096031416, + "grad_norm": 0.41099074455886697, + "learning_rate": 9.292389657560069e-07, + "loss": 0.2781, + "step": 6796 + }, + { + "epoch": 1.6178378056762064, + "grad_norm": 0.3833507644875006, + "learning_rate": 9.281200079285152e-07, + "loss": 0.2849, + "step": 6797 + }, + { + "epoch": 1.6180758017492711, + "grad_norm": 0.38415300774108235, + "learning_rate": 9.270016552831252e-07, + "loss": 0.3648, + "step": 6798 + }, + { + "epoch": 1.6183137978223359, + "grad_norm": 0.37049113228296326, + "learning_rate": 9.258839079860499e-07, + "loss": 0.3186, + "step": 6799 + }, + { + "epoch": 1.6185517938954006, + "grad_norm": 0.3725759851299388, + "learning_rate": 9.247667662034149e-07, + "loss": 0.2857, + "step": 6800 + }, + { + "epoch": 1.6187897899684656, + "grad_norm": 0.3974032467656143, + "learning_rate": 9.236502301012546e-07, + "loss": 0.3328, + "step": 6801 + }, + { + "epoch": 1.6190277860415303, + "grad_norm": 0.37422414801531034, + "learning_rate": 9.225342998455145e-07, + "loss": 0.3822, + "step": 6802 + }, + { + "epoch": 1.6192657821145953, + "grad_norm": 0.34722153079424223, + "learning_rate": 9.214189756020487e-07, + "loss": 0.2668, + "step": 6803 + }, + { + "epoch": 1.61950377818766, + "grad_norm": 0.3888234856151576, + "learning_rate": 9.203042575366228e-07, + "loss": 0.2795, + "step": 6804 + }, + { + "epoch": 1.6197417742607247, + "grad_norm": 0.40283307764457343, + "learning_rate": 9.191901458149106e-07, + "loss": 0.3836, + "step": 6805 + }, + { + "epoch": 1.6199797703337895, + "grad_norm": 0.38472144664022295, + "learning_rate": 9.180766406024971e-07, + "loss": 0.3505, + "step": 6806 + }, + { + "epoch": 1.6202177664068542, + "grad_norm": 0.37583986626555843, + "learning_rate": 9.16963742064877e-07, + "loss": 0.2628, + "step": 6807 + }, + { + "epoch": 1.620455762479919, + "grad_norm": 0.362264134804524, + "learning_rate": 9.158514503674543e-07, + "loss": 0.3411, + "step": 6808 + }, + { + "epoch": 1.620693758552984, + "grad_norm": 0.3769911946362758, + "learning_rate": 9.14739765675543e-07, + "loss": 0.371, + "step": 6809 + }, + { + "epoch": 1.6209317546260487, + "grad_norm": 0.3853498348483161, + "learning_rate": 9.136286881543666e-07, + "loss": 0.314, + "step": 6810 + }, + { + "epoch": 1.6211697506991136, + "grad_norm": 0.3983780077597233, + "learning_rate": 9.125182179690584e-07, + "loss": 0.2673, + "step": 6811 + }, + { + "epoch": 1.6214077467721784, + "grad_norm": 0.37571986287733705, + "learning_rate": 9.114083552846636e-07, + "loss": 0.3361, + "step": 6812 + }, + { + "epoch": 1.621645742845243, + "grad_norm": 0.3904265606928051, + "learning_rate": 9.102991002661337e-07, + "loss": 0.367, + "step": 6813 + }, + { + "epoch": 1.6218837389183078, + "grad_norm": 0.4223581365461809, + "learning_rate": 9.091904530783319e-07, + "loss": 0.3114, + "step": 6814 + }, + { + "epoch": 1.6221217349913726, + "grad_norm": 0.39388763487496037, + "learning_rate": 9.080824138860295e-07, + "loss": 0.3303, + "step": 6815 + }, + { + "epoch": 1.6223597310644373, + "grad_norm": 0.39770354399393776, + "learning_rate": 9.069749828539109e-07, + "loss": 0.3909, + "step": 6816 + }, + { + "epoch": 1.6225977271375023, + "grad_norm": 0.37675287316781486, + "learning_rate": 9.058681601465663e-07, + "loss": 0.3139, + "step": 6817 + }, + { + "epoch": 1.622835723210567, + "grad_norm": 0.359202858527697, + "learning_rate": 9.047619459284968e-07, + "loss": 0.2548, + "step": 6818 + }, + { + "epoch": 1.623073719283632, + "grad_norm": 0.3666753439305403, + "learning_rate": 9.036563403641136e-07, + "loss": 0.2944, + "step": 6819 + }, + { + "epoch": 1.6233117153566967, + "grad_norm": 0.48209117748060715, + "learning_rate": 9.025513436177368e-07, + "loss": 0.3864, + "step": 6820 + }, + { + "epoch": 1.6235497114297615, + "grad_norm": 0.3719293692788079, + "learning_rate": 9.014469558535965e-07, + "loss": 0.278, + "step": 6821 + }, + { + "epoch": 1.6237877075028262, + "grad_norm": 0.3969227125064937, + "learning_rate": 9.003431772358323e-07, + "loss": 0.306, + "step": 6822 + }, + { + "epoch": 1.624025703575891, + "grad_norm": 0.3951142656857, + "learning_rate": 8.992400079284919e-07, + "loss": 0.3504, + "step": 6823 + }, + { + "epoch": 1.6242636996489557, + "grad_norm": 0.3481746671653846, + "learning_rate": 8.981374480955347e-07, + "loss": 0.3283, + "step": 6824 + }, + { + "epoch": 1.6245016957220206, + "grad_norm": 0.3891762848601158, + "learning_rate": 8.970354979008261e-07, + "loss": 0.281, + "step": 6825 + }, + { + "epoch": 1.6247396917950854, + "grad_norm": 0.43069665710742855, + "learning_rate": 8.959341575081465e-07, + "loss": 0.3199, + "step": 6826 + }, + { + "epoch": 1.6249776878681503, + "grad_norm": 0.37051390449080696, + "learning_rate": 8.948334270811809e-07, + "loss": 0.3871, + "step": 6827 + }, + { + "epoch": 1.625215683941215, + "grad_norm": 0.3880703377585865, + "learning_rate": 8.937333067835247e-07, + "loss": 0.2861, + "step": 6828 + }, + { + "epoch": 1.6254536800142798, + "grad_norm": 0.384396753893017, + "learning_rate": 8.926337967786836e-07, + "loss": 0.2741, + "step": 6829 + }, + { + "epoch": 1.6256916760873446, + "grad_norm": 0.3569238940042476, + "learning_rate": 8.915348972300713e-07, + "loss": 0.357, + "step": 6830 + }, + { + "epoch": 1.6259296721604093, + "grad_norm": 0.3499062660519206, + "learning_rate": 8.904366083010119e-07, + "loss": 0.3204, + "step": 6831 + }, + { + "epoch": 1.626167668233474, + "grad_norm": 0.38069461747113986, + "learning_rate": 8.893389301547384e-07, + "loss": 0.2716, + "step": 6832 + }, + { + "epoch": 1.626405664306539, + "grad_norm": 0.35807608092297644, + "learning_rate": 8.882418629543926e-07, + "loss": 0.3028, + "step": 6833 + }, + { + "epoch": 1.6266436603796037, + "grad_norm": 0.38443325525551314, + "learning_rate": 8.871454068630259e-07, + "loss": 0.3946, + "step": 6834 + }, + { + "epoch": 1.6268816564526687, + "grad_norm": 0.357934918939066, + "learning_rate": 8.860495620435989e-07, + "loss": 0.3355, + "step": 6835 + }, + { + "epoch": 1.6271196525257334, + "grad_norm": 0.36332940789321744, + "learning_rate": 8.849543286589813e-07, + "loss": 0.297, + "step": 6836 + }, + { + "epoch": 1.6273576485987982, + "grad_norm": 0.39032305116436367, + "learning_rate": 8.838597068719518e-07, + "loss": 0.3483, + "step": 6837 + }, + { + "epoch": 1.627595644671863, + "grad_norm": 0.37557326175473943, + "learning_rate": 8.82765696845197e-07, + "loss": 0.3611, + "step": 6838 + }, + { + "epoch": 1.6278336407449276, + "grad_norm": 0.3693511552822045, + "learning_rate": 8.816722987413162e-07, + "loss": 0.2734, + "step": 6839 + }, + { + "epoch": 1.6280716368179924, + "grad_norm": 0.3718337287364868, + "learning_rate": 8.805795127228145e-07, + "loss": 0.305, + "step": 6840 + }, + { + "epoch": 1.6283096328910573, + "grad_norm": 0.3904012424314939, + "learning_rate": 8.794873389521069e-07, + "loss": 0.3579, + "step": 6841 + }, + { + "epoch": 1.628547628964122, + "grad_norm": 0.3643275958018295, + "learning_rate": 8.783957775915159e-07, + "loss": 0.3106, + "step": 6842 + }, + { + "epoch": 1.628785625037187, + "grad_norm": 0.38787892776494476, + "learning_rate": 8.77304828803277e-07, + "loss": 0.2696, + "step": 6843 + }, + { + "epoch": 1.6290236211102518, + "grad_norm": 0.3660591486708151, + "learning_rate": 8.762144927495309e-07, + "loss": 0.3095, + "step": 6844 + }, + { + "epoch": 1.6292616171833165, + "grad_norm": 0.39222908624194175, + "learning_rate": 8.751247695923292e-07, + "loss": 0.4224, + "step": 6845 + }, + { + "epoch": 1.6294996132563813, + "grad_norm": 0.42296916733578066, + "learning_rate": 8.740356594936311e-07, + "loss": 0.2812, + "step": 6846 + }, + { + "epoch": 1.629737609329446, + "grad_norm": 0.40203065491885415, + "learning_rate": 8.729471626153052e-07, + "loss": 0.345, + "step": 6847 + }, + { + "epoch": 1.6299756054025107, + "grad_norm": 0.3806358325222802, + "learning_rate": 8.7185927911913e-07, + "loss": 0.3598, + "step": 6848 + }, + { + "epoch": 1.6302136014755757, + "grad_norm": 0.3826490418036862, + "learning_rate": 8.707720091667904e-07, + "loss": 0.3363, + "step": 6849 + }, + { + "epoch": 1.6304515975486404, + "grad_norm": 0.41820535006387766, + "learning_rate": 8.696853529198829e-07, + "loss": 0.2485, + "step": 6850 + }, + { + "epoch": 1.6306895936217054, + "grad_norm": 0.3883852616080865, + "learning_rate": 8.685993105399104e-07, + "loss": 0.324, + "step": 6851 + }, + { + "epoch": 1.6309275896947701, + "grad_norm": 0.3924806045692454, + "learning_rate": 8.675138821882856e-07, + "loss": 0.3843, + "step": 6852 + }, + { + "epoch": 1.6311655857678349, + "grad_norm": 0.3886779181476492, + "learning_rate": 8.664290680263321e-07, + "loss": 0.2817, + "step": 6853 + }, + { + "epoch": 1.6314035818408996, + "grad_norm": 0.41445337145235195, + "learning_rate": 8.653448682152782e-07, + "loss": 0.2835, + "step": 6854 + }, + { + "epoch": 1.6316415779139644, + "grad_norm": 0.35876208864526293, + "learning_rate": 8.642612829162639e-07, + "loss": 0.3438, + "step": 6855 + }, + { + "epoch": 1.631879573987029, + "grad_norm": 0.3470504035205238, + "learning_rate": 8.631783122903353e-07, + "loss": 0.3624, + "step": 6856 + }, + { + "epoch": 1.632117570060094, + "grad_norm": 0.373620998827096, + "learning_rate": 8.620959564984504e-07, + "loss": 0.2853, + "step": 6857 + }, + { + "epoch": 1.6323555661331588, + "grad_norm": 0.40447652300735565, + "learning_rate": 8.610142157014728e-07, + "loss": 0.3606, + "step": 6858 + }, + { + "epoch": 1.6325935622062238, + "grad_norm": 0.39593981996363464, + "learning_rate": 8.599330900601766e-07, + "loss": 0.3431, + "step": 6859 + }, + { + "epoch": 1.6328315582792885, + "grad_norm": 0.3592935150194725, + "learning_rate": 8.588525797352432e-07, + "loss": 0.3056, + "step": 6860 + }, + { + "epoch": 1.6330695543523532, + "grad_norm": 0.37888268748077286, + "learning_rate": 8.577726848872636e-07, + "loss": 0.2669, + "step": 6861 + }, + { + "epoch": 1.633307550425418, + "grad_norm": 0.3640350475107053, + "learning_rate": 8.566934056767368e-07, + "loss": 0.3408, + "step": 6862 + }, + { + "epoch": 1.6335455464984827, + "grad_norm": 0.385536304300356, + "learning_rate": 8.556147422640704e-07, + "loss": 0.3648, + "step": 6863 + }, + { + "epoch": 1.6337835425715475, + "grad_norm": 0.3924071742555982, + "learning_rate": 8.545366948095802e-07, + "loss": 0.2619, + "step": 6864 + }, + { + "epoch": 1.6340215386446124, + "grad_norm": 0.40696056451988827, + "learning_rate": 8.5345926347349e-07, + "loss": 0.323, + "step": 6865 + }, + { + "epoch": 1.6342595347176772, + "grad_norm": 0.42407201911219133, + "learning_rate": 8.523824484159348e-07, + "loss": 0.3849, + "step": 6866 + }, + { + "epoch": 1.6344975307907421, + "grad_norm": 0.36284614007552857, + "learning_rate": 8.513062497969554e-07, + "loss": 0.337, + "step": 6867 + }, + { + "epoch": 1.6347355268638069, + "grad_norm": 0.3887082396171208, + "learning_rate": 8.502306677765004e-07, + "loss": 0.2736, + "step": 6868 + }, + { + "epoch": 1.6349735229368716, + "grad_norm": 0.38345494649062606, + "learning_rate": 8.491557025144276e-07, + "loss": 0.3312, + "step": 6869 + }, + { + "epoch": 1.6352115190099363, + "grad_norm": 0.38028798193743946, + "learning_rate": 8.480813541705057e-07, + "loss": 0.375, + "step": 6870 + }, + { + "epoch": 1.635449515083001, + "grad_norm": 0.38125998205133177, + "learning_rate": 8.470076229044077e-07, + "loss": 0.3121, + "step": 6871 + }, + { + "epoch": 1.6356875111560658, + "grad_norm": 0.3801307871037892, + "learning_rate": 8.45934508875717e-07, + "loss": 0.3276, + "step": 6872 + }, + { + "epoch": 1.6359255072291308, + "grad_norm": 0.4262493108275734, + "learning_rate": 8.448620122439255e-07, + "loss": 0.384, + "step": 6873 + }, + { + "epoch": 1.6361635033021955, + "grad_norm": 0.3533409555026496, + "learning_rate": 8.437901331684317e-07, + "loss": 0.3331, + "step": 6874 + }, + { + "epoch": 1.6364014993752605, + "grad_norm": 0.3740778241308746, + "learning_rate": 8.427188718085438e-07, + "loss": 0.2702, + "step": 6875 + }, + { + "epoch": 1.6366394954483252, + "grad_norm": 0.3899836491292678, + "learning_rate": 8.416482283234778e-07, + "loss": 0.3263, + "step": 6876 + }, + { + "epoch": 1.63687749152139, + "grad_norm": 0.37954634736543486, + "learning_rate": 8.405782028723575e-07, + "loss": 0.38, + "step": 6877 + }, + { + "epoch": 1.6371154875944547, + "grad_norm": 0.3537922473440547, + "learning_rate": 8.395087956142156e-07, + "loss": 0.2528, + "step": 6878 + }, + { + "epoch": 1.6373534836675194, + "grad_norm": 0.3919326904956134, + "learning_rate": 8.384400067079923e-07, + "loss": 0.2933, + "step": 6879 + }, + { + "epoch": 1.6375914797405842, + "grad_norm": 0.3740391889328203, + "learning_rate": 8.37371836312535e-07, + "loss": 0.3497, + "step": 6880 + }, + { + "epoch": 1.6378294758136491, + "grad_norm": 0.37761989502751364, + "learning_rate": 8.363042845866021e-07, + "loss": 0.3382, + "step": 6881 + }, + { + "epoch": 1.6380674718867139, + "grad_norm": 0.38783064891752894, + "learning_rate": 8.352373516888573e-07, + "loss": 0.2931, + "step": 6882 + }, + { + "epoch": 1.6383054679597788, + "grad_norm": 0.37480525199164966, + "learning_rate": 8.341710377778739e-07, + "loss": 0.3041, + "step": 6883 + }, + { + "epoch": 1.6385434640328436, + "grad_norm": 0.37392017588457593, + "learning_rate": 8.331053430121317e-07, + "loss": 0.3653, + "step": 6884 + }, + { + "epoch": 1.6387814601059083, + "grad_norm": 0.37701145032799044, + "learning_rate": 8.320402675500195e-07, + "loss": 0.2937, + "step": 6885 + }, + { + "epoch": 1.639019456178973, + "grad_norm": 0.3811824330700152, + "learning_rate": 8.309758115498334e-07, + "loss": 0.2808, + "step": 6886 + }, + { + "epoch": 1.6392574522520378, + "grad_norm": 0.3643999529788084, + "learning_rate": 8.299119751697788e-07, + "loss": 0.3547, + "step": 6887 + }, + { + "epoch": 1.6394954483251025, + "grad_norm": 0.36502990685126363, + "learning_rate": 8.288487585679677e-07, + "loss": 0.3857, + "step": 6888 + }, + { + "epoch": 1.6397334443981675, + "grad_norm": 0.3658225172262794, + "learning_rate": 8.277861619024208e-07, + "loss": 0.2652, + "step": 6889 + }, + { + "epoch": 1.6399714404712322, + "grad_norm": 0.38230360243140593, + "learning_rate": 8.267241853310654e-07, + "loss": 0.3224, + "step": 6890 + }, + { + "epoch": 1.6402094365442972, + "grad_norm": 0.38637764499463456, + "learning_rate": 8.25662829011738e-07, + "loss": 0.3791, + "step": 6891 + }, + { + "epoch": 1.640447432617362, + "grad_norm": 0.38029656509707016, + "learning_rate": 8.246020931021808e-07, + "loss": 0.2924, + "step": 6892 + }, + { + "epoch": 1.6406854286904267, + "grad_norm": 0.3747338210302809, + "learning_rate": 8.235419777600484e-07, + "loss": 0.2697, + "step": 6893 + }, + { + "epoch": 1.6409234247634914, + "grad_norm": 0.3600089909702746, + "learning_rate": 8.224824831428991e-07, + "loss": 0.3374, + "step": 6894 + }, + { + "epoch": 1.6411614208365561, + "grad_norm": 0.38204493116592925, + "learning_rate": 8.21423609408199e-07, + "loss": 0.3695, + "step": 6895 + }, + { + "epoch": 1.6413994169096209, + "grad_norm": 0.3865945626227688, + "learning_rate": 8.203653567133224e-07, + "loss": 0.2677, + "step": 6896 + }, + { + "epoch": 1.6416374129826858, + "grad_norm": 0.37313722469830596, + "learning_rate": 8.193077252155545e-07, + "loss": 0.2893, + "step": 6897 + }, + { + "epoch": 1.6418754090557506, + "grad_norm": 0.3817816737768635, + "learning_rate": 8.182507150720837e-07, + "loss": 0.3498, + "step": 6898 + }, + { + "epoch": 1.6421134051288155, + "grad_norm": 0.36328532420628223, + "learning_rate": 8.171943264400084e-07, + "loss": 0.3091, + "step": 6899 + }, + { + "epoch": 1.6423514012018803, + "grad_norm": 0.40910572921600497, + "learning_rate": 8.161385594763338e-07, + "loss": 0.2732, + "step": 6900 + }, + { + "epoch": 1.642589397274945, + "grad_norm": 0.4173927289371764, + "learning_rate": 8.150834143379726e-07, + "loss": 0.3009, + "step": 6901 + }, + { + "epoch": 1.6428273933480098, + "grad_norm": 0.4031792044847936, + "learning_rate": 8.140288911817462e-07, + "loss": 0.3839, + "step": 6902 + }, + { + "epoch": 1.6430653894210745, + "grad_norm": 0.36972889214424126, + "learning_rate": 8.129749901643824e-07, + "loss": 0.314, + "step": 6903 + }, + { + "epoch": 1.6433033854941392, + "grad_norm": 0.3875768012510301, + "learning_rate": 8.119217114425171e-07, + "loss": 0.2766, + "step": 6904 + }, + { + "epoch": 1.6435413815672042, + "grad_norm": 0.36865187596941734, + "learning_rate": 8.108690551726938e-07, + "loss": 0.3303, + "step": 6905 + }, + { + "epoch": 1.643779377640269, + "grad_norm": 0.3707979238870622, + "learning_rate": 8.098170215113627e-07, + "loss": 0.3094, + "step": 6906 + }, + { + "epoch": 1.644017373713334, + "grad_norm": 0.3961111362306213, + "learning_rate": 8.087656106148811e-07, + "loss": 0.262, + "step": 6907 + }, + { + "epoch": 1.6442553697863986, + "grad_norm": 0.37905376446342726, + "learning_rate": 8.077148226395171e-07, + "loss": 0.2942, + "step": 6908 + }, + { + "epoch": 1.6444933658594634, + "grad_norm": 0.3891989136340424, + "learning_rate": 8.066646577414427e-07, + "loss": 0.3634, + "step": 6909 + }, + { + "epoch": 1.6447313619325281, + "grad_norm": 0.3635988958220919, + "learning_rate": 8.056151160767384e-07, + "loss": 0.3054, + "step": 6910 + }, + { + "epoch": 1.6449693580055929, + "grad_norm": 0.3976482528225413, + "learning_rate": 8.045661978013919e-07, + "loss": 0.2771, + "step": 6911 + }, + { + "epoch": 1.6452073540786576, + "grad_norm": 0.36402867904746283, + "learning_rate": 8.035179030712981e-07, + "loss": 0.343, + "step": 6912 + }, + { + "epoch": 1.6454453501517223, + "grad_norm": 0.41373321397240326, + "learning_rate": 8.0247023204226e-07, + "loss": 0.3902, + "step": 6913 + }, + { + "epoch": 1.6456833462247873, + "grad_norm": 0.36420015222611274, + "learning_rate": 8.014231848699877e-07, + "loss": 0.2814, + "step": 6914 + }, + { + "epoch": 1.645921342297852, + "grad_norm": 0.3831807168567994, + "learning_rate": 8.003767617100977e-07, + "loss": 0.2808, + "step": 6915 + }, + { + "epoch": 1.646159338370917, + "grad_norm": 0.5084315028646742, + "learning_rate": 7.993309627181145e-07, + "loss": 0.3751, + "step": 6916 + }, + { + "epoch": 1.6463973344439817, + "grad_norm": 0.37628750772345293, + "learning_rate": 7.982857880494699e-07, + "loss": 0.3378, + "step": 6917 + }, + { + "epoch": 1.6466353305170465, + "grad_norm": 0.47297885018084573, + "learning_rate": 7.972412378595024e-07, + "loss": 0.2825, + "step": 6918 + }, + { + "epoch": 1.6468733265901112, + "grad_norm": 0.40733630305637625, + "learning_rate": 7.961973123034572e-07, + "loss": 0.3292, + "step": 6919 + }, + { + "epoch": 1.647111322663176, + "grad_norm": 0.3636765632920138, + "learning_rate": 7.951540115364892e-07, + "loss": 0.3736, + "step": 6920 + }, + { + "epoch": 1.6473493187362407, + "grad_norm": 0.3599660837275332, + "learning_rate": 7.941113357136587e-07, + "loss": 0.2892, + "step": 6921 + }, + { + "epoch": 1.6475873148093056, + "grad_norm": 0.3781156441865842, + "learning_rate": 7.930692849899319e-07, + "loss": 0.2934, + "step": 6922 + }, + { + "epoch": 1.6478253108823704, + "grad_norm": 0.39701363259275885, + "learning_rate": 7.920278595201825e-07, + "loss": 0.369, + "step": 6923 + }, + { + "epoch": 1.6480633069554353, + "grad_norm": 0.37156701594123176, + "learning_rate": 7.909870594591951e-07, + "loss": 0.3413, + "step": 6924 + }, + { + "epoch": 1.6483013030285, + "grad_norm": 0.41917032707799795, + "learning_rate": 7.89946884961656e-07, + "loss": 0.2803, + "step": 6925 + }, + { + "epoch": 1.6485392991015648, + "grad_norm": 0.4065671522305405, + "learning_rate": 7.88907336182162e-07, + "loss": 0.3162, + "step": 6926 + }, + { + "epoch": 1.6487772951746296, + "grad_norm": 0.3790168785337112, + "learning_rate": 7.878684132752152e-07, + "loss": 0.3934, + "step": 6927 + }, + { + "epoch": 1.6490152912476943, + "grad_norm": 0.3708736912100001, + "learning_rate": 7.868301163952253e-07, + "loss": 0.2832, + "step": 6928 + }, + { + "epoch": 1.649253287320759, + "grad_norm": 0.3935121315020632, + "learning_rate": 7.857924456965083e-07, + "loss": 0.3096, + "step": 6929 + }, + { + "epoch": 1.649491283393824, + "grad_norm": 0.3818251147778019, + "learning_rate": 7.847554013332892e-07, + "loss": 0.3547, + "step": 6930 + }, + { + "epoch": 1.6497292794668887, + "grad_norm": 0.38974578717813124, + "learning_rate": 7.83718983459697e-07, + "loss": 0.3627, + "step": 6931 + }, + { + "epoch": 1.6499672755399537, + "grad_norm": 0.3757552764551271, + "learning_rate": 7.8268319222977e-07, + "loss": 0.3034, + "step": 6932 + }, + { + "epoch": 1.6502052716130184, + "grad_norm": 0.384816533453232, + "learning_rate": 7.81648027797452e-07, + "loss": 0.3372, + "step": 6933 + }, + { + "epoch": 1.6504432676860832, + "grad_norm": 0.3667467367151375, + "learning_rate": 7.806134903165935e-07, + "loss": 0.3632, + "step": 6934 + }, + { + "epoch": 1.650681263759148, + "grad_norm": 0.3586707968143236, + "learning_rate": 7.795795799409522e-07, + "loss": 0.301, + "step": 6935 + }, + { + "epoch": 1.6509192598322127, + "grad_norm": 0.40472475833277655, + "learning_rate": 7.785462968241947e-07, + "loss": 0.2845, + "step": 6936 + }, + { + "epoch": 1.6511572559052774, + "grad_norm": 0.47875293593979357, + "learning_rate": 7.775136411198914e-07, + "loss": 0.3537, + "step": 6937 + }, + { + "epoch": 1.6513952519783424, + "grad_norm": 0.3744577504265213, + "learning_rate": 7.764816129815201e-07, + "loss": 0.3607, + "step": 6938 + }, + { + "epoch": 1.651633248051407, + "grad_norm": 0.37901981094401166, + "learning_rate": 7.754502125624658e-07, + "loss": 0.2718, + "step": 6939 + }, + { + "epoch": 1.651871244124472, + "grad_norm": 0.373755700776998, + "learning_rate": 7.744194400160204e-07, + "loss": 0.2972, + "step": 6940 + }, + { + "epoch": 1.6521092401975368, + "grad_norm": 0.39240847812458995, + "learning_rate": 7.73389295495382e-07, + "loss": 0.3436, + "step": 6941 + }, + { + "epoch": 1.6523472362706015, + "grad_norm": 0.39288015487594496, + "learning_rate": 7.723597791536553e-07, + "loss": 0.3164, + "step": 6942 + }, + { + "epoch": 1.6525852323436663, + "grad_norm": 0.3862794779553621, + "learning_rate": 7.713308911438527e-07, + "loss": 0.3161, + "step": 6943 + }, + { + "epoch": 1.652823228416731, + "grad_norm": 0.4601597581604055, + "learning_rate": 7.703026316188916e-07, + "loss": 0.3288, + "step": 6944 + }, + { + "epoch": 1.6530612244897958, + "grad_norm": 0.38403595804884394, + "learning_rate": 7.692750007315969e-07, + "loss": 0.4199, + "step": 6945 + }, + { + "epoch": 1.6532992205628607, + "grad_norm": 0.3689442026310426, + "learning_rate": 7.682479986346996e-07, + "loss": 0.2934, + "step": 6946 + }, + { + "epoch": 1.6535372166359255, + "grad_norm": 0.364745434398555, + "learning_rate": 7.67221625480839e-07, + "loss": 0.2731, + "step": 6947 + }, + { + "epoch": 1.6537752127089904, + "grad_norm": 0.438406775655767, + "learning_rate": 7.66195881422559e-07, + "loss": 0.3691, + "step": 6948 + }, + { + "epoch": 1.6540132087820552, + "grad_norm": 0.3721049405974233, + "learning_rate": 7.651707666123098e-07, + "loss": 0.3326, + "step": 6949 + }, + { + "epoch": 1.65425120485512, + "grad_norm": 0.3894342319600942, + "learning_rate": 7.641462812024486e-07, + "loss": 0.2969, + "step": 6950 + }, + { + "epoch": 1.6544892009281846, + "grad_norm": 0.3805823643582205, + "learning_rate": 7.631224253452408e-07, + "loss": 0.3233, + "step": 6951 + }, + { + "epoch": 1.6547271970012494, + "grad_norm": 0.42081777893965966, + "learning_rate": 7.620991991928561e-07, + "loss": 0.3882, + "step": 6952 + }, + { + "epoch": 1.6549651930743141, + "grad_norm": 0.37868479763553864, + "learning_rate": 7.61076602897371e-07, + "loss": 0.3121, + "step": 6953 + }, + { + "epoch": 1.655203189147379, + "grad_norm": 0.4244582354427234, + "learning_rate": 7.600546366107686e-07, + "loss": 0.2845, + "step": 6954 + }, + { + "epoch": 1.6554411852204438, + "grad_norm": 0.47303463276050134, + "learning_rate": 7.590333004849387e-07, + "loss": 0.377, + "step": 6955 + }, + { + "epoch": 1.6556791812935088, + "grad_norm": 0.3665165569695061, + "learning_rate": 7.580125946716765e-07, + "loss": 0.3637, + "step": 6956 + }, + { + "epoch": 1.6559171773665735, + "grad_norm": 0.379471968924663, + "learning_rate": 7.569925193226846e-07, + "loss": 0.2889, + "step": 6957 + }, + { + "epoch": 1.6561551734396383, + "grad_norm": 0.38456329877923556, + "learning_rate": 7.55973074589571e-07, + "loss": 0.3245, + "step": 6958 + }, + { + "epoch": 1.656393169512703, + "grad_norm": 0.3852464205929202, + "learning_rate": 7.549542606238508e-07, + "loss": 0.388, + "step": 6959 + }, + { + "epoch": 1.6566311655857677, + "grad_norm": 0.35614680443106633, + "learning_rate": 7.53936077576945e-07, + "loss": 0.3175, + "step": 6960 + }, + { + "epoch": 1.6568691616588325, + "grad_norm": 0.3892713822547582, + "learning_rate": 7.529185256001803e-07, + "loss": 0.2797, + "step": 6961 + }, + { + "epoch": 1.6571071577318974, + "grad_norm": 0.3748547210585344, + "learning_rate": 7.519016048447908e-07, + "loss": 0.3167, + "step": 6962 + }, + { + "epoch": 1.6573451538049622, + "grad_norm": 0.37044782848474667, + "learning_rate": 7.508853154619145e-07, + "loss": 0.3462, + "step": 6963 + }, + { + "epoch": 1.6575831498780271, + "grad_norm": 0.3581984373631393, + "learning_rate": 7.498696576025993e-07, + "loss": 0.2675, + "step": 6964 + }, + { + "epoch": 1.6578211459510919, + "grad_norm": 0.3814135531783868, + "learning_rate": 7.488546314177964e-07, + "loss": 0.3111, + "step": 6965 + }, + { + "epoch": 1.6580591420241566, + "grad_norm": 0.3994529997922176, + "learning_rate": 7.478402370583631e-07, + "loss": 0.3391, + "step": 6966 + }, + { + "epoch": 1.6582971380972213, + "grad_norm": 0.35098177887130955, + "learning_rate": 7.468264746750642e-07, + "loss": 0.2783, + "step": 6967 + }, + { + "epoch": 1.658535134170286, + "grad_norm": 0.3932454998383334, + "learning_rate": 7.458133444185694e-07, + "loss": 0.2877, + "step": 6968 + }, + { + "epoch": 1.6587731302433508, + "grad_norm": 0.4124862836963177, + "learning_rate": 7.448008464394557e-07, + "loss": 0.3172, + "step": 6969 + }, + { + "epoch": 1.6590111263164158, + "grad_norm": 0.4156043963623505, + "learning_rate": 7.437889808882043e-07, + "loss": 0.3898, + "step": 6970 + }, + { + "epoch": 1.6592491223894805, + "grad_norm": 0.3664091808631948, + "learning_rate": 7.427777479152043e-07, + "loss": 0.2909, + "step": 6971 + }, + { + "epoch": 1.6594871184625455, + "grad_norm": 0.3897124758944131, + "learning_rate": 7.417671476707489e-07, + "loss": 0.3297, + "step": 6972 + }, + { + "epoch": 1.6597251145356102, + "grad_norm": 0.3893820163964789, + "learning_rate": 7.407571803050384e-07, + "loss": 0.3597, + "step": 6973 + }, + { + "epoch": 1.659963110608675, + "grad_norm": 0.3695387069491869, + "learning_rate": 7.397478459681806e-07, + "loss": 0.3694, + "step": 6974 + }, + { + "epoch": 1.6602011066817397, + "grad_norm": 0.39522818795728604, + "learning_rate": 7.387391448101861e-07, + "loss": 0.282, + "step": 6975 + }, + { + "epoch": 1.6604391027548044, + "grad_norm": 0.40266534305717183, + "learning_rate": 7.377310769809736e-07, + "loss": 0.3463, + "step": 6976 + }, + { + "epoch": 1.6606770988278692, + "grad_norm": 0.3563962402516382, + "learning_rate": 7.367236426303653e-07, + "loss": 0.3897, + "step": 6977 + }, + { + "epoch": 1.6609150949009341, + "grad_norm": 0.35469458517661445, + "learning_rate": 7.357168419080929e-07, + "loss": 0.2726, + "step": 6978 + }, + { + "epoch": 1.6611530909739989, + "grad_norm": 0.38451642874884295, + "learning_rate": 7.347106749637911e-07, + "loss": 0.3007, + "step": 6979 + }, + { + "epoch": 1.6613910870470638, + "grad_norm": 0.3867505955294661, + "learning_rate": 7.337051419470014e-07, + "loss": 0.3803, + "step": 6980 + }, + { + "epoch": 1.6616290831201286, + "grad_norm": 0.36921539064006076, + "learning_rate": 7.327002430071706e-07, + "loss": 0.3494, + "step": 6981 + }, + { + "epoch": 1.6618670791931933, + "grad_norm": 0.3865817459852862, + "learning_rate": 7.316959782936516e-07, + "loss": 0.2724, + "step": 6982 + }, + { + "epoch": 1.662105075266258, + "grad_norm": 0.38384403692006913, + "learning_rate": 7.306923479557032e-07, + "loss": 0.3179, + "step": 6983 + }, + { + "epoch": 1.6623430713393228, + "grad_norm": 0.3753237383050048, + "learning_rate": 7.296893521424891e-07, + "loss": 0.3753, + "step": 6984 + }, + { + "epoch": 1.6625810674123875, + "grad_norm": 0.37042483156179623, + "learning_rate": 7.286869910030797e-07, + "loss": 0.307, + "step": 6985 + }, + { + "epoch": 1.6628190634854525, + "grad_norm": 0.3789501748283688, + "learning_rate": 7.276852646864507e-07, + "loss": 0.3027, + "step": 6986 + }, + { + "epoch": 1.6630570595585172, + "grad_norm": 0.4135444826893321, + "learning_rate": 7.266841733414837e-07, + "loss": 0.3552, + "step": 6987 + }, + { + "epoch": 1.6632950556315822, + "grad_norm": 0.39788532592242914, + "learning_rate": 7.256837171169651e-07, + "loss": 0.3437, + "step": 6988 + }, + { + "epoch": 1.663533051704647, + "grad_norm": 0.3593638442974499, + "learning_rate": 7.246838961615877e-07, + "loss": 0.3051, + "step": 6989 + }, + { + "epoch": 1.6637710477777117, + "grad_norm": 0.4159858649381969, + "learning_rate": 7.23684710623948e-07, + "loss": 0.3217, + "step": 6990 + }, + { + "epoch": 1.6640090438507764, + "grad_norm": 0.3672974524927207, + "learning_rate": 7.226861606525526e-07, + "loss": 0.3805, + "step": 6991 + }, + { + "epoch": 1.6642470399238412, + "grad_norm": 0.3787693493770054, + "learning_rate": 7.216882463958091e-07, + "loss": 0.3179, + "step": 6992 + }, + { + "epoch": 1.664485035996906, + "grad_norm": 0.3796098325253126, + "learning_rate": 7.20690968002033e-07, + "loss": 0.2713, + "step": 6993 + }, + { + "epoch": 1.6647230320699709, + "grad_norm": 0.38051154387730174, + "learning_rate": 7.196943256194439e-07, + "loss": 0.2955, + "step": 6994 + }, + { + "epoch": 1.6649610281430356, + "grad_norm": 0.38260035859588687, + "learning_rate": 7.186983193961677e-07, + "loss": 0.3802, + "step": 6995 + }, + { + "epoch": 1.6651990242161006, + "grad_norm": 0.3560984048327247, + "learning_rate": 7.177029494802351e-07, + "loss": 0.3178, + "step": 6996 + }, + { + "epoch": 1.6654370202891653, + "grad_norm": 0.38924386944904715, + "learning_rate": 7.167082160195837e-07, + "loss": 0.2741, + "step": 6997 + }, + { + "epoch": 1.66567501636223, + "grad_norm": 0.4039947533316631, + "learning_rate": 7.157141191620548e-07, + "loss": 0.3479, + "step": 6998 + }, + { + "epoch": 1.6659130124352948, + "grad_norm": 0.38643709485362443, + "learning_rate": 7.147206590553956e-07, + "loss": 0.3369, + "step": 6999 + }, + { + "epoch": 1.6661510085083595, + "grad_norm": 0.40129045045890266, + "learning_rate": 7.137278358472583e-07, + "loss": 0.2818, + "step": 7000 + }, + { + "epoch": 1.6663890045814242, + "grad_norm": 0.4499031615543838, + "learning_rate": 7.127356496852029e-07, + "loss": 0.3035, + "step": 7001 + }, + { + "epoch": 1.6666270006544892, + "grad_norm": 0.3748939127556976, + "learning_rate": 7.117441007166919e-07, + "loss": 0.3819, + "step": 7002 + }, + { + "epoch": 1.666864996727554, + "grad_norm": 0.35881304656890556, + "learning_rate": 7.10753189089094e-07, + "loss": 0.2835, + "step": 7003 + }, + { + "epoch": 1.667102992800619, + "grad_norm": 0.41355655256875934, + "learning_rate": 7.097629149496815e-07, + "loss": 0.2988, + "step": 7004 + }, + { + "epoch": 1.6673409888736836, + "grad_norm": 0.6309979385551239, + "learning_rate": 7.087732784456369e-07, + "loss": 0.3538, + "step": 7005 + }, + { + "epoch": 1.6675789849467484, + "grad_norm": 0.37545921085712725, + "learning_rate": 7.077842797240426e-07, + "loss": 0.3416, + "step": 7006 + }, + { + "epoch": 1.6678169810198131, + "grad_norm": 0.365443297712444, + "learning_rate": 7.067959189318885e-07, + "loss": 0.2977, + "step": 7007 + }, + { + "epoch": 1.6680549770928779, + "grad_norm": 0.37039435287713296, + "learning_rate": 7.058081962160696e-07, + "loss": 0.3164, + "step": 7008 + }, + { + "epoch": 1.6682929731659426, + "grad_norm": 0.37427937679786044, + "learning_rate": 7.048211117233861e-07, + "loss": 0.3486, + "step": 7009 + }, + { + "epoch": 1.6685309692390076, + "grad_norm": 0.3729855514273312, + "learning_rate": 7.038346656005429e-07, + "loss": 0.2989, + "step": 7010 + }, + { + "epoch": 1.6687689653120723, + "grad_norm": 0.374090406829846, + "learning_rate": 7.028488579941506e-07, + "loss": 0.2792, + "step": 7011 + }, + { + "epoch": 1.6690069613851373, + "grad_norm": 0.39515663629452236, + "learning_rate": 7.018636890507241e-07, + "loss": 0.3659, + "step": 7012 + }, + { + "epoch": 1.669244957458202, + "grad_norm": 0.35372595983883504, + "learning_rate": 7.008791589166847e-07, + "loss": 0.3607, + "step": 7013 + }, + { + "epoch": 1.6694829535312667, + "grad_norm": 0.3533302020072786, + "learning_rate": 6.998952677383569e-07, + "loss": 0.276, + "step": 7014 + }, + { + "epoch": 1.6697209496043315, + "grad_norm": 0.42475735192235214, + "learning_rate": 6.989120156619717e-07, + "loss": 0.3186, + "step": 7015 + }, + { + "epoch": 1.6699589456773962, + "grad_norm": 0.38764230011040096, + "learning_rate": 6.979294028336652e-07, + "loss": 0.3573, + "step": 7016 + }, + { + "epoch": 1.670196941750461, + "grad_norm": 0.3611283372327025, + "learning_rate": 6.969474293994771e-07, + "loss": 0.3349, + "step": 7017 + }, + { + "epoch": 1.670434937823526, + "grad_norm": 0.36366689855170664, + "learning_rate": 6.959660955053527e-07, + "loss": 0.296, + "step": 7018 + }, + { + "epoch": 1.6706729338965907, + "grad_norm": 0.5422835131958719, + "learning_rate": 6.949854012971441e-07, + "loss": 0.3383, + "step": 7019 + }, + { + "epoch": 1.6709109299696556, + "grad_norm": 0.35747903910606177, + "learning_rate": 6.94005346920606e-07, + "loss": 0.4007, + "step": 7020 + }, + { + "epoch": 1.6711489260427204, + "grad_norm": 0.36742844091875815, + "learning_rate": 6.930259325213978e-07, + "loss": 0.2987, + "step": 7021 + }, + { + "epoch": 1.671386922115785, + "grad_norm": 0.40350636301013826, + "learning_rate": 6.920471582450861e-07, + "loss": 0.3304, + "step": 7022 + }, + { + "epoch": 1.6716249181888498, + "grad_norm": 0.40006906898219396, + "learning_rate": 6.910690242371404e-07, + "loss": 0.3446, + "step": 7023 + }, + { + "epoch": 1.6718629142619146, + "grad_norm": 0.35727853106570195, + "learning_rate": 6.900915306429351e-07, + "loss": 0.3153, + "step": 7024 + }, + { + "epoch": 1.6721009103349793, + "grad_norm": 0.382181007945197, + "learning_rate": 6.891146776077507e-07, + "loss": 0.2882, + "step": 7025 + }, + { + "epoch": 1.6723389064080443, + "grad_norm": 0.4060235764358048, + "learning_rate": 6.881384652767709e-07, + "loss": 0.3207, + "step": 7026 + }, + { + "epoch": 1.672576902481109, + "grad_norm": 0.37097326742176884, + "learning_rate": 6.871628937950848e-07, + "loss": 0.3922, + "step": 7027 + }, + { + "epoch": 1.672814898554174, + "grad_norm": 0.40285375164369536, + "learning_rate": 6.861879633076878e-07, + "loss": 0.2923, + "step": 7028 + }, + { + "epoch": 1.6730528946272387, + "grad_norm": 0.37801910915821857, + "learning_rate": 6.852136739594783e-07, + "loss": 0.2882, + "step": 7029 + }, + { + "epoch": 1.6732908907003035, + "grad_norm": 0.36728422194745036, + "learning_rate": 6.842400258952597e-07, + "loss": 0.3977, + "step": 7030 + }, + { + "epoch": 1.6735288867733682, + "grad_norm": 0.39561139457941275, + "learning_rate": 6.832670192597396e-07, + "loss": 0.3719, + "step": 7031 + }, + { + "epoch": 1.673766882846433, + "grad_norm": 0.37538581127020354, + "learning_rate": 6.822946541975306e-07, + "loss": 0.2774, + "step": 7032 + }, + { + "epoch": 1.6740048789194977, + "grad_norm": 0.38177066214722755, + "learning_rate": 6.813229308531516e-07, + "loss": 0.3272, + "step": 7033 + }, + { + "epoch": 1.6742428749925626, + "grad_norm": 0.38578694289159443, + "learning_rate": 6.803518493710243e-07, + "loss": 0.4126, + "step": 7034 + }, + { + "epoch": 1.6744808710656274, + "grad_norm": 0.36307309359597634, + "learning_rate": 6.793814098954749e-07, + "loss": 0.331, + "step": 7035 + }, + { + "epoch": 1.6747188671386923, + "grad_norm": 0.4000089226486024, + "learning_rate": 6.784116125707352e-07, + "loss": 0.267, + "step": 7036 + }, + { + "epoch": 1.674956863211757, + "grad_norm": 0.36487357951661303, + "learning_rate": 6.774424575409405e-07, + "loss": 0.3127, + "step": 7037 + }, + { + "epoch": 1.6751948592848218, + "grad_norm": 0.35391638388560076, + "learning_rate": 6.764739449501317e-07, + "loss": 0.3769, + "step": 7038 + }, + { + "epoch": 1.6754328553578866, + "grad_norm": 0.35186947926475576, + "learning_rate": 6.755060749422537e-07, + "loss": 0.2923, + "step": 7039 + }, + { + "epoch": 1.6756708514309513, + "grad_norm": 0.44135361184534583, + "learning_rate": 6.745388476611553e-07, + "loss": 0.2981, + "step": 7040 + }, + { + "epoch": 1.675908847504016, + "grad_norm": 0.45872802613072794, + "learning_rate": 6.735722632505915e-07, + "loss": 0.37, + "step": 7041 + }, + { + "epoch": 1.676146843577081, + "grad_norm": 0.3580957612187762, + "learning_rate": 6.726063218542195e-07, + "loss": 0.314, + "step": 7042 + }, + { + "epoch": 1.6763848396501457, + "grad_norm": 0.35669432760875613, + "learning_rate": 6.716410236156029e-07, + "loss": 0.3014, + "step": 7043 + }, + { + "epoch": 1.6766228357232107, + "grad_norm": 0.38931922786060297, + "learning_rate": 6.706763686782086e-07, + "loss": 0.3144, + "step": 7044 + }, + { + "epoch": 1.6768608317962754, + "grad_norm": 0.40124294635503877, + "learning_rate": 6.697123571854075e-07, + "loss": 0.3697, + "step": 7045 + }, + { + "epoch": 1.6770988278693402, + "grad_norm": 0.4969890421919046, + "learning_rate": 6.687489892804766e-07, + "loss": 0.3089, + "step": 7046 + }, + { + "epoch": 1.677336823942405, + "grad_norm": 0.3566446713995905, + "learning_rate": 6.677862651065964e-07, + "loss": 0.2827, + "step": 7047 + }, + { + "epoch": 1.6775748200154696, + "grad_norm": 0.38848733751656256, + "learning_rate": 6.668241848068507e-07, + "loss": 0.3592, + "step": 7048 + }, + { + "epoch": 1.6778128160885344, + "grad_norm": 0.5950856594781679, + "learning_rate": 6.658627485242291e-07, + "loss": 0.3444, + "step": 7049 + }, + { + "epoch": 1.6780508121615993, + "grad_norm": 0.36830146545513864, + "learning_rate": 6.649019564016246e-07, + "loss": 0.2709, + "step": 7050 + }, + { + "epoch": 1.678288808234664, + "grad_norm": 0.37103816586070554, + "learning_rate": 6.639418085818339e-07, + "loss": 0.3196, + "step": 7051 + }, + { + "epoch": 1.678526804307729, + "grad_norm": 0.38203373085536374, + "learning_rate": 6.629823052075602e-07, + "loss": 0.402, + "step": 7052 + }, + { + "epoch": 1.6787648003807938, + "grad_norm": 0.3476339385449799, + "learning_rate": 6.620234464214076e-07, + "loss": 0.3066, + "step": 7053 + }, + { + "epoch": 1.6790027964538585, + "grad_norm": 0.3792905804858758, + "learning_rate": 6.610652323658867e-07, + "loss": 0.2825, + "step": 7054 + }, + { + "epoch": 1.6792407925269233, + "grad_norm": 0.43917028015511345, + "learning_rate": 6.601076631834135e-07, + "loss": 0.3524, + "step": 7055 + }, + { + "epoch": 1.679478788599988, + "grad_norm": 0.35999526768311224, + "learning_rate": 6.591507390163049e-07, + "loss": 0.3362, + "step": 7056 + }, + { + "epoch": 1.6797167846730527, + "grad_norm": 0.3958931440700458, + "learning_rate": 6.581944600067847e-07, + "loss": 0.299, + "step": 7057 + }, + { + "epoch": 1.6799547807461177, + "grad_norm": 0.40723904451700604, + "learning_rate": 6.572388262969781e-07, + "loss": 0.3382, + "step": 7058 + }, + { + "epoch": 1.6801927768191824, + "grad_norm": 0.37213316814749897, + "learning_rate": 6.562838380289155e-07, + "loss": 0.3613, + "step": 7059 + }, + { + "epoch": 1.6804307728922474, + "grad_norm": 0.3775949694068757, + "learning_rate": 6.553294953445344e-07, + "loss": 0.3037, + "step": 7060 + }, + { + "epoch": 1.6806687689653121, + "grad_norm": 0.4302521600801045, + "learning_rate": 6.543757983856724e-07, + "loss": 0.2837, + "step": 7061 + }, + { + "epoch": 1.6809067650383769, + "grad_norm": 0.3744153721393427, + "learning_rate": 6.534227472940718e-07, + "loss": 0.3633, + "step": 7062 + }, + { + "epoch": 1.6811447611114416, + "grad_norm": 0.3601321543703086, + "learning_rate": 6.524703422113803e-07, + "loss": 0.3534, + "step": 7063 + }, + { + "epoch": 1.6813827571845064, + "grad_norm": 0.3978043541470634, + "learning_rate": 6.515185832791493e-07, + "loss": 0.2833, + "step": 7064 + }, + { + "epoch": 1.681620753257571, + "grad_norm": 0.3650335732859001, + "learning_rate": 6.50567470638832e-07, + "loss": 0.3149, + "step": 7065 + }, + { + "epoch": 1.681858749330636, + "grad_norm": 0.38338937802115375, + "learning_rate": 6.49617004431789e-07, + "loss": 0.3667, + "step": 7066 + }, + { + "epoch": 1.6820967454037008, + "grad_norm": 0.40126636400041343, + "learning_rate": 6.486671847992826e-07, + "loss": 0.3275, + "step": 7067 + }, + { + "epoch": 1.6823347414767658, + "grad_norm": 0.4648961323218715, + "learning_rate": 6.477180118824788e-07, + "loss": 0.2984, + "step": 7068 + }, + { + "epoch": 1.6825727375498305, + "grad_norm": 0.3776079096365654, + "learning_rate": 6.467694858224488e-07, + "loss": 0.3161, + "step": 7069 + }, + { + "epoch": 1.6828107336228952, + "grad_norm": 0.3654293611333949, + "learning_rate": 6.458216067601669e-07, + "loss": 0.3563, + "step": 7070 + }, + { + "epoch": 1.68304872969596, + "grad_norm": 0.3938912402346958, + "learning_rate": 6.448743748365116e-07, + "loss": 0.2843, + "step": 7071 + }, + { + "epoch": 1.6832867257690247, + "grad_norm": 0.40857518732879755, + "learning_rate": 6.439277901922647e-07, + "loss": 0.2857, + "step": 7072 + }, + { + "epoch": 1.6835247218420895, + "grad_norm": 0.37369837115535326, + "learning_rate": 6.429818529681115e-07, + "loss": 0.3408, + "step": 7073 + }, + { + "epoch": 1.6837627179151544, + "grad_norm": 0.566489154234049, + "learning_rate": 6.420365633046433e-07, + "loss": 0.3525, + "step": 7074 + }, + { + "epoch": 1.6840007139882192, + "grad_norm": 0.40096527952286826, + "learning_rate": 6.410919213423522e-07, + "loss": 0.2766, + "step": 7075 + }, + { + "epoch": 1.6842387100612841, + "grad_norm": 0.3835017576074419, + "learning_rate": 6.40147927221636e-07, + "loss": 0.3331, + "step": 7076 + }, + { + "epoch": 1.6844767061343489, + "grad_norm": 0.4193203253283927, + "learning_rate": 6.392045810827957e-07, + "loss": 0.3989, + "step": 7077 + }, + { + "epoch": 1.6847147022074136, + "grad_norm": 0.3714016805455296, + "learning_rate": 6.382618830660353e-07, + "loss": 0.2883, + "step": 7078 + }, + { + "epoch": 1.6849526982804783, + "grad_norm": 0.381171748269651, + "learning_rate": 6.373198333114633e-07, + "loss": 0.2767, + "step": 7079 + }, + { + "epoch": 1.685190694353543, + "grad_norm": 0.39675713102514104, + "learning_rate": 6.363784319590916e-07, + "loss": 0.3425, + "step": 7080 + }, + { + "epoch": 1.6854286904266078, + "grad_norm": 0.4002931733011814, + "learning_rate": 6.354376791488343e-07, + "loss": 0.365, + "step": 7081 + }, + { + "epoch": 1.6856666864996728, + "grad_norm": 0.37518227406606824, + "learning_rate": 6.344975750205129e-07, + "loss": 0.2542, + "step": 7082 + }, + { + "epoch": 1.6859046825727375, + "grad_norm": 0.3638772212144532, + "learning_rate": 6.335581197138496e-07, + "loss": 0.3343, + "step": 7083 + }, + { + "epoch": 1.6861426786458025, + "grad_norm": 0.3890298957493163, + "learning_rate": 6.326193133684705e-07, + "loss": 0.3964, + "step": 7084 + }, + { + "epoch": 1.6863806747188672, + "grad_norm": 0.36557376045651974, + "learning_rate": 6.31681156123905e-07, + "loss": 0.2866, + "step": 7085 + }, + { + "epoch": 1.686618670791932, + "grad_norm": 0.43312850934266955, + "learning_rate": 6.307436481195866e-07, + "loss": 0.2639, + "step": 7086 + }, + { + "epoch": 1.6868566668649967, + "grad_norm": 0.3528390333914725, + "learning_rate": 6.298067894948512e-07, + "loss": 0.3374, + "step": 7087 + }, + { + "epoch": 1.6870946629380614, + "grad_norm": 0.34344204575606535, + "learning_rate": 6.288705803889411e-07, + "loss": 0.3649, + "step": 7088 + }, + { + "epoch": 1.6873326590111262, + "grad_norm": 0.38507748165524075, + "learning_rate": 6.279350209409995e-07, + "loss": 0.2789, + "step": 7089 + }, + { + "epoch": 1.6875706550841911, + "grad_norm": 0.3803975185088582, + "learning_rate": 6.270001112900736e-07, + "loss": 0.3158, + "step": 7090 + }, + { + "epoch": 1.6878086511572559, + "grad_norm": 0.3872558923912277, + "learning_rate": 6.260658515751139e-07, + "loss": 0.3569, + "step": 7091 + }, + { + "epoch": 1.6880466472303208, + "grad_norm": 0.3794470357203257, + "learning_rate": 6.251322419349748e-07, + "loss": 0.2971, + "step": 7092 + }, + { + "epoch": 1.6882846433033856, + "grad_norm": 0.38546650413181716, + "learning_rate": 6.241992825084131e-07, + "loss": 0.2802, + "step": 7093 + }, + { + "epoch": 1.6885226393764503, + "grad_norm": 0.37651470887537414, + "learning_rate": 6.232669734340907e-07, + "loss": 0.2954, + "step": 7094 + }, + { + "epoch": 1.688760635449515, + "grad_norm": 0.3885988985334594, + "learning_rate": 6.223353148505706e-07, + "loss": 0.3914, + "step": 7095 + }, + { + "epoch": 1.6889986315225798, + "grad_norm": 0.3511153525219931, + "learning_rate": 6.21404306896321e-07, + "loss": 0.2711, + "step": 7096 + }, + { + "epoch": 1.6892366275956445, + "grad_norm": 0.37884092771461375, + "learning_rate": 6.204739497097129e-07, + "loss": 0.2851, + "step": 7097 + }, + { + "epoch": 1.6894746236687095, + "grad_norm": 0.387337614231269, + "learning_rate": 6.1954424342902e-07, + "loss": 0.3698, + "step": 7098 + }, + { + "epoch": 1.6897126197417742, + "grad_norm": 0.3546161227406718, + "learning_rate": 6.186151881924202e-07, + "loss": 0.3163, + "step": 7099 + }, + { + "epoch": 1.6899506158148392, + "grad_norm": 0.3730093557742399, + "learning_rate": 6.176867841379919e-07, + "loss": 0.2882, + "step": 7100 + }, + { + "epoch": 1.690188611887904, + "grad_norm": 0.38800933074459065, + "learning_rate": 6.16759031403722e-07, + "loss": 0.3274, + "step": 7101 + }, + { + "epoch": 1.6904266079609687, + "grad_norm": 0.39567528441392813, + "learning_rate": 6.158319301274962e-07, + "loss": 0.3702, + "step": 7102 + }, + { + "epoch": 1.6906646040340334, + "grad_norm": 0.3813083923210492, + "learning_rate": 6.14905480447105e-07, + "loss": 0.2895, + "step": 7103 + }, + { + "epoch": 1.6909026001070981, + "grad_norm": 0.383759332023671, + "learning_rate": 6.139796825002409e-07, + "loss": 0.2933, + "step": 7104 + }, + { + "epoch": 1.6911405961801629, + "grad_norm": 0.3886124154209392, + "learning_rate": 6.130545364245011e-07, + "loss": 0.3486, + "step": 7105 + }, + { + "epoch": 1.6913785922532278, + "grad_norm": 0.3865197625654805, + "learning_rate": 6.121300423573851e-07, + "loss": 0.3864, + "step": 7106 + }, + { + "epoch": 1.6916165883262926, + "grad_norm": 0.4342380259704541, + "learning_rate": 6.112062004362957e-07, + "loss": 0.2811, + "step": 7107 + }, + { + "epoch": 1.6918545843993575, + "grad_norm": 0.38137795050806866, + "learning_rate": 6.102830107985369e-07, + "loss": 0.3386, + "step": 7108 + }, + { + "epoch": 1.6920925804724223, + "grad_norm": 0.39290592099891664, + "learning_rate": 6.093604735813202e-07, + "loss": 0.3844, + "step": 7109 + }, + { + "epoch": 1.692330576545487, + "grad_norm": 0.36869581745026214, + "learning_rate": 6.084385889217565e-07, + "loss": 0.3033, + "step": 7110 + }, + { + "epoch": 1.6925685726185518, + "grad_norm": 0.3781454086182421, + "learning_rate": 6.075173569568605e-07, + "loss": 0.2898, + "step": 7111 + }, + { + "epoch": 1.6928065686916165, + "grad_norm": 0.404850762268356, + "learning_rate": 6.065967778235499e-07, + "loss": 0.3371, + "step": 7112 + }, + { + "epoch": 1.6930445647646812, + "grad_norm": 0.41163273024333985, + "learning_rate": 6.056768516586453e-07, + "loss": 0.3546, + "step": 7113 + }, + { + "epoch": 1.6932825608377462, + "grad_norm": 0.3498312393278164, + "learning_rate": 6.047575785988702e-07, + "loss": 0.3086, + "step": 7114 + }, + { + "epoch": 1.693520556910811, + "grad_norm": 0.39282228454424806, + "learning_rate": 6.038389587808535e-07, + "loss": 0.2948, + "step": 7115 + }, + { + "epoch": 1.693758552983876, + "grad_norm": 0.3988205195809829, + "learning_rate": 6.029209923411228e-07, + "loss": 0.3773, + "step": 7116 + }, + { + "epoch": 1.6939965490569406, + "grad_norm": 0.3666168567052365, + "learning_rate": 6.02003679416111e-07, + "loss": 0.3061, + "step": 7117 + }, + { + "epoch": 1.6942345451300054, + "grad_norm": 0.4069498736744767, + "learning_rate": 6.010870201421537e-07, + "loss": 0.2989, + "step": 7118 + }, + { + "epoch": 1.6944725412030701, + "grad_norm": 0.3967722726787557, + "learning_rate": 6.001710146554896e-07, + "loss": 0.3228, + "step": 7119 + }, + { + "epoch": 1.6947105372761349, + "grad_norm": 0.36380296589170585, + "learning_rate": 5.992556630922585e-07, + "loss": 0.3656, + "step": 7120 + }, + { + "epoch": 1.6949485333491996, + "grad_norm": 0.3690082479582151, + "learning_rate": 5.983409655885053e-07, + "loss": 0.3001, + "step": 7121 + }, + { + "epoch": 1.6951865294222646, + "grad_norm": 0.506841855343455, + "learning_rate": 5.974269222801765e-07, + "loss": 0.2926, + "step": 7122 + }, + { + "epoch": 1.6954245254953293, + "grad_norm": 0.4096814364673883, + "learning_rate": 5.965135333031213e-07, + "loss": 0.3593, + "step": 7123 + }, + { + "epoch": 1.6956625215683943, + "grad_norm": 0.38736545080244283, + "learning_rate": 5.956007987930923e-07, + "loss": 0.3661, + "step": 7124 + }, + { + "epoch": 1.695900517641459, + "grad_norm": 0.3611883206027303, + "learning_rate": 5.946887188857442e-07, + "loss": 0.2677, + "step": 7125 + }, + { + "epoch": 1.6961385137145237, + "grad_norm": 0.38872165541623244, + "learning_rate": 5.937772937166342e-07, + "loss": 0.3316, + "step": 7126 + }, + { + "epoch": 1.6963765097875885, + "grad_norm": 0.36656583624158623, + "learning_rate": 5.928665234212233e-07, + "loss": 0.4022, + "step": 7127 + }, + { + "epoch": 1.6966145058606532, + "grad_norm": 0.35861126589804326, + "learning_rate": 5.919564081348733e-07, + "loss": 0.2982, + "step": 7128 + }, + { + "epoch": 1.696852501933718, + "grad_norm": 0.36794026503347826, + "learning_rate": 5.910469479928521e-07, + "loss": 0.3044, + "step": 7129 + }, + { + "epoch": 1.697090498006783, + "grad_norm": 0.3868057111195459, + "learning_rate": 5.90138143130326e-07, + "loss": 0.3525, + "step": 7130 + }, + { + "epoch": 1.6973284940798476, + "grad_norm": 0.3694887390714355, + "learning_rate": 5.89229993682367e-07, + "loss": 0.3685, + "step": 7131 + }, + { + "epoch": 1.6975664901529126, + "grad_norm": 0.3863367650360045, + "learning_rate": 5.883224997839482e-07, + "loss": 0.2786, + "step": 7132 + }, + { + "epoch": 1.6978044862259773, + "grad_norm": 0.40288142462099663, + "learning_rate": 5.874156615699455e-07, + "loss": 0.3231, + "step": 7133 + }, + { + "epoch": 1.698042482299042, + "grad_norm": 0.39261984171695463, + "learning_rate": 5.865094791751375e-07, + "loss": 0.3858, + "step": 7134 + }, + { + "epoch": 1.6982804783721068, + "grad_norm": 0.38986076248425855, + "learning_rate": 5.856039527342044e-07, + "loss": 0.3153, + "step": 7135 + }, + { + "epoch": 1.6985184744451716, + "grad_norm": 0.3869365959701934, + "learning_rate": 5.846990823817316e-07, + "loss": 0.2779, + "step": 7136 + }, + { + "epoch": 1.6987564705182363, + "grad_norm": 0.38638500561242545, + "learning_rate": 5.837948682522048e-07, + "loss": 0.3351, + "step": 7137 + }, + { + "epoch": 1.6989944665913013, + "grad_norm": 0.36095089742651565, + "learning_rate": 5.828913104800121e-07, + "loss": 0.3707, + "step": 7138 + }, + { + "epoch": 1.699232462664366, + "grad_norm": 0.37138461340811796, + "learning_rate": 5.819884091994444e-07, + "loss": 0.2501, + "step": 7139 + }, + { + "epoch": 1.699470458737431, + "grad_norm": 0.3601206372592032, + "learning_rate": 5.810861645446958e-07, + "loss": 0.2903, + "step": 7140 + }, + { + "epoch": 1.6997084548104957, + "grad_norm": 0.3929879463329381, + "learning_rate": 5.801845766498615e-07, + "loss": 0.3893, + "step": 7141 + }, + { + "epoch": 1.6999464508835604, + "grad_norm": 0.36103944808905125, + "learning_rate": 5.792836456489392e-07, + "loss": 0.3077, + "step": 7142 + }, + { + "epoch": 1.7001844469566252, + "grad_norm": 0.41181030157128806, + "learning_rate": 5.783833716758314e-07, + "loss": 0.2777, + "step": 7143 + }, + { + "epoch": 1.70042244302969, + "grad_norm": 0.36292079451006326, + "learning_rate": 5.774837548643403e-07, + "loss": 0.3314, + "step": 7144 + }, + { + "epoch": 1.7006604391027547, + "grad_norm": 0.38043247398479435, + "learning_rate": 5.765847953481707e-07, + "loss": 0.3875, + "step": 7145 + }, + { + "epoch": 1.7008984351758196, + "grad_norm": 0.38907488954363156, + "learning_rate": 5.756864932609307e-07, + "loss": 0.2598, + "step": 7146 + }, + { + "epoch": 1.7011364312488844, + "grad_norm": 0.4014263611690131, + "learning_rate": 5.747888487361303e-07, + "loss": 0.2803, + "step": 7147 + }, + { + "epoch": 1.7013744273219493, + "grad_norm": 0.37583669721002044, + "learning_rate": 5.738918619071809e-07, + "loss": 0.3621, + "step": 7148 + }, + { + "epoch": 1.701612423395014, + "grad_norm": 0.3633761512343576, + "learning_rate": 5.729955329073978e-07, + "loss": 0.3107, + "step": 7149 + }, + { + "epoch": 1.7018504194680788, + "grad_norm": 0.3922899931026979, + "learning_rate": 5.720998618699974e-07, + "loss": 0.2716, + "step": 7150 + }, + { + "epoch": 1.7020884155411435, + "grad_norm": 0.3822926529873602, + "learning_rate": 5.712048489280981e-07, + "loss": 0.3296, + "step": 7151 + }, + { + "epoch": 1.7023264116142083, + "grad_norm": 0.3755985930742893, + "learning_rate": 5.703104942147214e-07, + "loss": 0.3824, + "step": 7152 + }, + { + "epoch": 1.702564407687273, + "grad_norm": 0.36980830385212704, + "learning_rate": 5.694167978627907e-07, + "loss": 0.312, + "step": 7153 + }, + { + "epoch": 1.702802403760338, + "grad_norm": 0.432008104893982, + "learning_rate": 5.685237600051314e-07, + "loss": 0.2991, + "step": 7154 + }, + { + "epoch": 1.7030403998334027, + "grad_norm": 0.3870322232137778, + "learning_rate": 5.676313807744705e-07, + "loss": 0.3511, + "step": 7155 + }, + { + "epoch": 1.7032783959064677, + "grad_norm": 0.3975439565591834, + "learning_rate": 5.667396603034369e-07, + "loss": 0.3807, + "step": 7156 + }, + { + "epoch": 1.7035163919795324, + "grad_norm": 0.3536210783757654, + "learning_rate": 5.658485987245648e-07, + "loss": 0.2829, + "step": 7157 + }, + { + "epoch": 1.7037543880525972, + "grad_norm": 0.38718378271877263, + "learning_rate": 5.64958196170286e-07, + "loss": 0.3196, + "step": 7158 + }, + { + "epoch": 1.703992384125662, + "grad_norm": 0.40914299947041166, + "learning_rate": 5.640684527729373e-07, + "loss": 0.3649, + "step": 7159 + }, + { + "epoch": 1.7042303801987266, + "grad_norm": 0.37188576462750905, + "learning_rate": 5.631793686647558e-07, + "loss": 0.316, + "step": 7160 + }, + { + "epoch": 1.7044683762717914, + "grad_norm": 0.37633208910101296, + "learning_rate": 5.622909439778817e-07, + "loss": 0.2676, + "step": 7161 + }, + { + "epoch": 1.7047063723448563, + "grad_norm": 0.38454873470389395, + "learning_rate": 5.614031788443563e-07, + "loss": 0.3507, + "step": 7162 + }, + { + "epoch": 1.704944368417921, + "grad_norm": 0.3977198853767724, + "learning_rate": 5.605160733961252e-07, + "loss": 0.3885, + "step": 7163 + }, + { + "epoch": 1.705182364490986, + "grad_norm": 0.3933331704312476, + "learning_rate": 5.596296277650332e-07, + "loss": 0.2696, + "step": 7164 + }, + { + "epoch": 1.7054203605640508, + "grad_norm": 0.3822921368220508, + "learning_rate": 5.587438420828273e-07, + "loss": 0.3277, + "step": 7165 + }, + { + "epoch": 1.7056583566371155, + "grad_norm": 0.3861803935557107, + "learning_rate": 5.578587164811583e-07, + "loss": 0.3735, + "step": 7166 + }, + { + "epoch": 1.7058963527101803, + "grad_norm": 0.34946179290548746, + "learning_rate": 5.569742510915776e-07, + "loss": 0.292, + "step": 7167 + }, + { + "epoch": 1.706134348783245, + "grad_norm": 0.3777091948818582, + "learning_rate": 5.56090446045538e-07, + "loss": 0.2919, + "step": 7168 + }, + { + "epoch": 1.7063723448563097, + "grad_norm": 0.39415615795103837, + "learning_rate": 5.552073014743942e-07, + "loss": 0.3222, + "step": 7169 + }, + { + "epoch": 1.7066103409293747, + "grad_norm": 0.38807660041300635, + "learning_rate": 5.543248175094051e-07, + "loss": 0.3784, + "step": 7170 + }, + { + "epoch": 1.7068483370024394, + "grad_norm": 0.36815447326497414, + "learning_rate": 5.534429942817293e-07, + "loss": 0.2682, + "step": 7171 + }, + { + "epoch": 1.7070863330755044, + "grad_norm": 0.3782786257896689, + "learning_rate": 5.525618319224269e-07, + "loss": 0.3099, + "step": 7172 + }, + { + "epoch": 1.7073243291485691, + "grad_norm": 0.3774779346032418, + "learning_rate": 5.516813305624602e-07, + "loss": 0.3569, + "step": 7173 + }, + { + "epoch": 1.7075623252216339, + "grad_norm": 0.3825672141737441, + "learning_rate": 5.508014903326941e-07, + "loss": 0.339, + "step": 7174 + }, + { + "epoch": 1.7078003212946986, + "grad_norm": 0.38678799012825976, + "learning_rate": 5.499223113638946e-07, + "loss": 0.2895, + "step": 7175 + }, + { + "epoch": 1.7080383173677633, + "grad_norm": 0.45675241641075215, + "learning_rate": 5.490437937867287e-07, + "loss": 0.3156, + "step": 7176 + }, + { + "epoch": 1.708276313440828, + "grad_norm": 0.3990012412168306, + "learning_rate": 5.481659377317672e-07, + "loss": 0.4195, + "step": 7177 + }, + { + "epoch": 1.708514309513893, + "grad_norm": 0.361980501450529, + "learning_rate": 5.472887433294799e-07, + "loss": 0.2905, + "step": 7178 + }, + { + "epoch": 1.7087523055869578, + "grad_norm": 0.3934064270709845, + "learning_rate": 5.464122107102399e-07, + "loss": 0.2835, + "step": 7179 + }, + { + "epoch": 1.7089903016600227, + "grad_norm": 0.39868212939298053, + "learning_rate": 5.455363400043223e-07, + "loss": 0.3682, + "step": 7180 + }, + { + "epoch": 1.7092282977330875, + "grad_norm": 0.37266869092332056, + "learning_rate": 5.446611313419026e-07, + "loss": 0.3462, + "step": 7181 + }, + { + "epoch": 1.7094662938061522, + "grad_norm": 0.3622657497062413, + "learning_rate": 5.437865848530588e-07, + "loss": 0.2699, + "step": 7182 + }, + { + "epoch": 1.709704289879217, + "grad_norm": 0.37818014208202183, + "learning_rate": 5.429127006677681e-07, + "loss": 0.3102, + "step": 7183 + }, + { + "epoch": 1.7099422859522817, + "grad_norm": 0.4190339066632442, + "learning_rate": 5.420394789159151e-07, + "loss": 0.3571, + "step": 7184 + }, + { + "epoch": 1.7101802820253464, + "grad_norm": 0.4294561415916188, + "learning_rate": 5.411669197272795e-07, + "loss": 0.2779, + "step": 7185 + }, + { + "epoch": 1.7104182780984114, + "grad_norm": 0.3737799198142464, + "learning_rate": 5.402950232315457e-07, + "loss": 0.2662, + "step": 7186 + }, + { + "epoch": 1.7106562741714761, + "grad_norm": 0.3802806165041149, + "learning_rate": 5.394237895582999e-07, + "loss": 0.359, + "step": 7187 + }, + { + "epoch": 1.710894270244541, + "grad_norm": 0.3824886968859595, + "learning_rate": 5.385532188370279e-07, + "loss": 0.401, + "step": 7188 + }, + { + "epoch": 1.7111322663176058, + "grad_norm": 0.3682750615164672, + "learning_rate": 5.376833111971175e-07, + "loss": 0.2592, + "step": 7189 + }, + { + "epoch": 1.7113702623906706, + "grad_norm": 0.4080980675820481, + "learning_rate": 5.368140667678607e-07, + "loss": 0.3052, + "step": 7190 + }, + { + "epoch": 1.7116082584637353, + "grad_norm": 0.4315797792499158, + "learning_rate": 5.359454856784469e-07, + "loss": 0.3828, + "step": 7191 + }, + { + "epoch": 1.7118462545368, + "grad_norm": 0.3873206456391449, + "learning_rate": 5.350775680579695e-07, + "loss": 0.2887, + "step": 7192 + }, + { + "epoch": 1.7120842506098648, + "grad_norm": 0.39557005651691873, + "learning_rate": 5.342103140354226e-07, + "loss": 0.2943, + "step": 7193 + }, + { + "epoch": 1.7123222466829298, + "grad_norm": 0.3875924022674923, + "learning_rate": 5.333437237397015e-07, + "loss": 0.3258, + "step": 7194 + }, + { + "epoch": 1.7125602427559945, + "grad_norm": 0.39590844421878507, + "learning_rate": 5.324777972996026e-07, + "loss": 0.3854, + "step": 7195 + }, + { + "epoch": 1.7127982388290595, + "grad_norm": 0.37400174892709304, + "learning_rate": 5.316125348438239e-07, + "loss": 0.2619, + "step": 7196 + }, + { + "epoch": 1.7130362349021242, + "grad_norm": 0.40507111453694483, + "learning_rate": 5.307479365009644e-07, + "loss": 0.2943, + "step": 7197 + }, + { + "epoch": 1.713274230975189, + "grad_norm": 0.41809861423219297, + "learning_rate": 5.298840023995267e-07, + "loss": 0.3762, + "step": 7198 + }, + { + "epoch": 1.7135122270482537, + "grad_norm": 0.3901582679717628, + "learning_rate": 5.290207326679109e-07, + "loss": 0.3525, + "step": 7199 + }, + { + "epoch": 1.7137502231213184, + "grad_norm": 0.37330950872565466, + "learning_rate": 5.281581274344216e-07, + "loss": 0.2854, + "step": 7200 + }, + { + "epoch": 1.7139882191943832, + "grad_norm": 0.3781668431340482, + "learning_rate": 5.272961868272625e-07, + "loss": 0.3256, + "step": 7201 + }, + { + "epoch": 1.7142262152674481, + "grad_norm": 0.38276965275384967, + "learning_rate": 5.264349109745392e-07, + "loss": 0.3804, + "step": 7202 + }, + { + "epoch": 1.7144642113405129, + "grad_norm": 0.37802592125250717, + "learning_rate": 5.25574300004259e-07, + "loss": 0.2817, + "step": 7203 + }, + { + "epoch": 1.7147022074135778, + "grad_norm": 0.39422912395070164, + "learning_rate": 5.247143540443295e-07, + "loss": 0.3247, + "step": 7204 + }, + { + "epoch": 1.7149402034866426, + "grad_norm": 0.35708805870299815, + "learning_rate": 5.2385507322256e-07, + "loss": 0.3755, + "step": 7205 + }, + { + "epoch": 1.7151781995597073, + "grad_norm": 0.3733848792164133, + "learning_rate": 5.229964576666618e-07, + "loss": 0.3567, + "step": 7206 + }, + { + "epoch": 1.715416195632772, + "grad_norm": 0.4043891988399861, + "learning_rate": 5.221385075042451e-07, + "loss": 0.2927, + "step": 7207 + }, + { + "epoch": 1.7156541917058368, + "grad_norm": 0.36868782474294576, + "learning_rate": 5.212812228628234e-07, + "loss": 0.3259, + "step": 7208 + }, + { + "epoch": 1.7158921877789015, + "grad_norm": 0.38185612816752407, + "learning_rate": 5.204246038698102e-07, + "loss": 0.39, + "step": 7209 + }, + { + "epoch": 1.7161301838519665, + "grad_norm": 0.3691193320372495, + "learning_rate": 5.195686506525205e-07, + "loss": 0.306, + "step": 7210 + }, + { + "epoch": 1.7163681799250312, + "grad_norm": 0.4010679948783104, + "learning_rate": 5.187133633381686e-07, + "loss": 0.2615, + "step": 7211 + }, + { + "epoch": 1.7166061759980962, + "grad_norm": 0.3691780944853636, + "learning_rate": 5.178587420538733e-07, + "loss": 0.3175, + "step": 7212 + }, + { + "epoch": 1.716844172071161, + "grad_norm": 0.3869003239326718, + "learning_rate": 5.17004786926652e-07, + "loss": 0.3637, + "step": 7213 + }, + { + "epoch": 1.7170821681442257, + "grad_norm": 0.39395271592154035, + "learning_rate": 5.161514980834232e-07, + "loss": 0.2891, + "step": 7214 + }, + { + "epoch": 1.7173201642172904, + "grad_norm": 0.3683495140171302, + "learning_rate": 5.152988756510063e-07, + "loss": 0.3021, + "step": 7215 + }, + { + "epoch": 1.7175581602903551, + "grad_norm": 0.37064038864613685, + "learning_rate": 5.144469197561231e-07, + "loss": 0.3637, + "step": 7216 + }, + { + "epoch": 1.7177961563634199, + "grad_norm": 0.387245884205873, + "learning_rate": 5.135956305253953e-07, + "loss": 0.3318, + "step": 7217 + }, + { + "epoch": 1.7180341524364848, + "grad_norm": 0.3648749289239642, + "learning_rate": 5.127450080853447e-07, + "loss": 0.2687, + "step": 7218 + }, + { + "epoch": 1.7182721485095496, + "grad_norm": 0.3709832312749167, + "learning_rate": 5.118950525623955e-07, + "loss": 0.315, + "step": 7219 + }, + { + "epoch": 1.7185101445826145, + "grad_norm": 0.3771433733813478, + "learning_rate": 5.110457640828714e-07, + "loss": 0.3693, + "step": 7220 + }, + { + "epoch": 1.7187481406556793, + "grad_norm": 0.37194166595478395, + "learning_rate": 5.101971427729985e-07, + "loss": 0.306, + "step": 7221 + }, + { + "epoch": 1.718986136728744, + "grad_norm": 0.4198291358677723, + "learning_rate": 5.09349188758903e-07, + "loss": 0.2858, + "step": 7222 + }, + { + "epoch": 1.7192241328018087, + "grad_norm": 0.4062451136719214, + "learning_rate": 5.085019021666104e-07, + "loss": 0.3829, + "step": 7223 + }, + { + "epoch": 1.7194621288748735, + "grad_norm": 0.38423840200623954, + "learning_rate": 5.076552831220505e-07, + "loss": 0.3337, + "step": 7224 + }, + { + "epoch": 1.7197001249479382, + "grad_norm": 0.3799743269319547, + "learning_rate": 5.068093317510492e-07, + "loss": 0.2786, + "step": 7225 + }, + { + "epoch": 1.7199381210210032, + "grad_norm": 0.378826385850787, + "learning_rate": 5.059640481793382e-07, + "loss": 0.3434, + "step": 7226 + }, + { + "epoch": 1.720176117094068, + "grad_norm": 0.38264535191194776, + "learning_rate": 5.05119432532547e-07, + "loss": 0.3617, + "step": 7227 + }, + { + "epoch": 1.7204141131671329, + "grad_norm": 0.4302946941716753, + "learning_rate": 5.042754849362063e-07, + "loss": 0.2898, + "step": 7228 + }, + { + "epoch": 1.7206521092401976, + "grad_norm": 0.35178529312525003, + "learning_rate": 5.03432205515747e-07, + "loss": 0.2742, + "step": 7229 + }, + { + "epoch": 1.7208901053132624, + "grad_norm": 0.3644446480635659, + "learning_rate": 5.025895943965021e-07, + "loss": 0.3651, + "step": 7230 + }, + { + "epoch": 1.721128101386327, + "grad_norm": 0.35818553869400055, + "learning_rate": 5.01747651703704e-07, + "loss": 0.3395, + "step": 7231 + }, + { + "epoch": 1.7213660974593918, + "grad_norm": 0.3840129788680463, + "learning_rate": 5.009063775624857e-07, + "loss": 0.2691, + "step": 7232 + }, + { + "epoch": 1.7216040935324566, + "grad_norm": 0.3982821339866859, + "learning_rate": 5.000657720978824e-07, + "loss": 0.3457, + "step": 7233 + }, + { + "epoch": 1.7218420896055215, + "grad_norm": 0.40131118490637396, + "learning_rate": 4.992258354348284e-07, + "loss": 0.3598, + "step": 7234 + }, + { + "epoch": 1.7220800856785863, + "grad_norm": 0.38055567769870746, + "learning_rate": 4.983865676981586e-07, + "loss": 0.2975, + "step": 7235 + }, + { + "epoch": 1.7223180817516512, + "grad_norm": 0.41080399456136174, + "learning_rate": 4.9754796901261e-07, + "loss": 0.2739, + "step": 7236 + }, + { + "epoch": 1.722556077824716, + "grad_norm": 0.3662126709880048, + "learning_rate": 4.96710039502818e-07, + "loss": 0.3467, + "step": 7237 + }, + { + "epoch": 1.7227940738977807, + "grad_norm": 0.3938043911249227, + "learning_rate": 4.958727792933194e-07, + "loss": 0.3637, + "step": 7238 + }, + { + "epoch": 1.7230320699708455, + "grad_norm": 0.4031184138478153, + "learning_rate": 4.950361885085536e-07, + "loss": 0.2873, + "step": 7239 + }, + { + "epoch": 1.7232700660439102, + "grad_norm": 0.4025403942805333, + "learning_rate": 4.942002672728575e-07, + "loss": 0.2969, + "step": 7240 + }, + { + "epoch": 1.723508062116975, + "grad_norm": 0.3959523479792395, + "learning_rate": 4.933650157104697e-07, + "loss": 0.3719, + "step": 7241 + }, + { + "epoch": 1.72374605819004, + "grad_norm": 0.36786887087342524, + "learning_rate": 4.925304339455289e-07, + "loss": 0.3423, + "step": 7242 + }, + { + "epoch": 1.7239840542631046, + "grad_norm": 0.3874665309151205, + "learning_rate": 4.916965221020753e-07, + "loss": 0.2973, + "step": 7243 + }, + { + "epoch": 1.7242220503361696, + "grad_norm": 0.3880295711150305, + "learning_rate": 4.908632803040492e-07, + "loss": 0.3129, + "step": 7244 + }, + { + "epoch": 1.7244600464092343, + "grad_norm": 0.3748397703051552, + "learning_rate": 4.900307086752898e-07, + "loss": 0.3819, + "step": 7245 + }, + { + "epoch": 1.724698042482299, + "grad_norm": 0.359732975677239, + "learning_rate": 4.891988073395382e-07, + "loss": 0.2676, + "step": 7246 + }, + { + "epoch": 1.7249360385553638, + "grad_norm": 0.35344522976400145, + "learning_rate": 4.88367576420436e-07, + "loss": 0.286, + "step": 7247 + }, + { + "epoch": 1.7251740346284286, + "grad_norm": 0.3568660899520547, + "learning_rate": 4.875370160415243e-07, + "loss": 0.3399, + "step": 7248 + }, + { + "epoch": 1.7254120307014933, + "grad_norm": 0.371131074798834, + "learning_rate": 4.867071263262452e-07, + "loss": 0.3362, + "step": 7249 + }, + { + "epoch": 1.7256500267745583, + "grad_norm": 0.3910514202327229, + "learning_rate": 4.858779073979408e-07, + "loss": 0.2673, + "step": 7250 + }, + { + "epoch": 1.725888022847623, + "grad_norm": 0.3545398248925455, + "learning_rate": 4.850493593798528e-07, + "loss": 0.3272, + "step": 7251 + }, + { + "epoch": 1.726126018920688, + "grad_norm": 0.3979049249797813, + "learning_rate": 4.842214823951236e-07, + "loss": 0.359, + "step": 7252 + }, + { + "epoch": 1.7263640149937527, + "grad_norm": 0.3517488882321978, + "learning_rate": 4.833942765667981e-07, + "loss": 0.2883, + "step": 7253 + }, + { + "epoch": 1.7266020110668174, + "grad_norm": 0.37339923080107096, + "learning_rate": 4.825677420178187e-07, + "loss": 0.2837, + "step": 7254 + }, + { + "epoch": 1.7268400071398822, + "grad_norm": 0.3888091649358466, + "learning_rate": 4.817418788710287e-07, + "loss": 0.3544, + "step": 7255 + }, + { + "epoch": 1.727078003212947, + "grad_norm": 0.37811647136667986, + "learning_rate": 4.809166872491716e-07, + "loss": 0.3118, + "step": 7256 + }, + { + "epoch": 1.7273159992860116, + "grad_norm": 0.3737143324650708, + "learning_rate": 4.800921672748921e-07, + "loss": 0.267, + "step": 7257 + }, + { + "epoch": 1.7275539953590766, + "grad_norm": 0.48902821392635987, + "learning_rate": 4.792683190707331e-07, + "loss": 0.3132, + "step": 7258 + }, + { + "epoch": 1.7277919914321413, + "grad_norm": 0.3593715757632593, + "learning_rate": 4.784451427591396e-07, + "loss": 0.3794, + "step": 7259 + }, + { + "epoch": 1.7280299875052063, + "grad_norm": 0.37488004076926507, + "learning_rate": 4.776226384624555e-07, + "loss": 0.2982, + "step": 7260 + }, + { + "epoch": 1.728267983578271, + "grad_norm": 0.4022945491362904, + "learning_rate": 4.7680080630292613e-07, + "loss": 0.3157, + "step": 7261 + }, + { + "epoch": 1.7285059796513358, + "grad_norm": 0.39397197065419187, + "learning_rate": 4.75979646402695e-07, + "loss": 0.3123, + "step": 7262 + }, + { + "epoch": 1.7287439757244005, + "grad_norm": 0.40175779262681727, + "learning_rate": 4.7515915888380724e-07, + "loss": 0.3971, + "step": 7263 + }, + { + "epoch": 1.7289819717974653, + "grad_norm": 0.4001343316717571, + "learning_rate": 4.7433934386820813e-07, + "loss": 0.2903, + "step": 7264 + }, + { + "epoch": 1.72921996787053, + "grad_norm": 0.39178481360532014, + "learning_rate": 4.7352020147774067e-07, + "loss": 0.3215, + "step": 7265 + }, + { + "epoch": 1.729457963943595, + "grad_norm": 0.3668638438326148, + "learning_rate": 4.7270173183415203e-07, + "loss": 0.3626, + "step": 7266 + }, + { + "epoch": 1.7296959600166597, + "grad_norm": 0.381515302995729, + "learning_rate": 4.7188393505908594e-07, + "loss": 0.3434, + "step": 7267 + }, + { + "epoch": 1.7299339560897247, + "grad_norm": 0.38224529914825683, + "learning_rate": 4.710668112740874e-07, + "loss": 0.2686, + "step": 7268 + }, + { + "epoch": 1.7301719521627894, + "grad_norm": 0.39868968626428386, + "learning_rate": 4.7025036060059983e-07, + "loss": 0.3514, + "step": 7269 + }, + { + "epoch": 1.7304099482358541, + "grad_norm": 0.3877292058096328, + "learning_rate": 4.694345831599706e-07, + "loss": 0.373, + "step": 7270 + }, + { + "epoch": 1.7306479443089189, + "grad_norm": 0.364088046457619, + "learning_rate": 4.686194790734427e-07, + "loss": 0.2926, + "step": 7271 + }, + { + "epoch": 1.7308859403819836, + "grad_norm": 0.3655121541245187, + "learning_rate": 4.6780504846216155e-07, + "loss": 0.3324, + "step": 7272 + }, + { + "epoch": 1.7311239364550484, + "grad_norm": 0.37380101505935226, + "learning_rate": 4.6699129144717135e-07, + "loss": 0.3662, + "step": 7273 + }, + { + "epoch": 1.7313619325281133, + "grad_norm": 0.35389840940426953, + "learning_rate": 4.6617820814941594e-07, + "loss": 0.3308, + "step": 7274 + }, + { + "epoch": 1.731599928601178, + "grad_norm": 0.37982947460360994, + "learning_rate": 4.6536579868974083e-07, + "loss": 0.2662, + "step": 7275 + }, + { + "epoch": 1.731837924674243, + "grad_norm": 0.38660456006967564, + "learning_rate": 4.6455406318888896e-07, + "loss": 0.3559, + "step": 7276 + }, + { + "epoch": 1.7320759207473078, + "grad_norm": 0.42338649966136893, + "learning_rate": 4.6374300176750484e-07, + "loss": 0.3795, + "step": 7277 + }, + { + "epoch": 1.7323139168203725, + "grad_norm": 0.364628505385949, + "learning_rate": 4.629326145461327e-07, + "loss": 0.2946, + "step": 7278 + }, + { + "epoch": 1.7325519128934372, + "grad_norm": 0.36740380393685235, + "learning_rate": 4.6212290164521554e-07, + "loss": 0.3445, + "step": 7279 + }, + { + "epoch": 1.732789908966502, + "grad_norm": 0.40395669425071407, + "learning_rate": 4.613138631850955e-07, + "loss": 0.3394, + "step": 7280 + }, + { + "epoch": 1.7330279050395667, + "grad_norm": 0.3954298208790007, + "learning_rate": 4.6050549928601864e-07, + "loss": 0.3488, + "step": 7281 + }, + { + "epoch": 1.7332659011126317, + "grad_norm": 0.3667594695117182, + "learning_rate": 4.59697810068126e-07, + "loss": 0.3099, + "step": 7282 + }, + { + "epoch": 1.7335038971856964, + "grad_norm": 0.37536423714685246, + "learning_rate": 4.588907956514599e-07, + "loss": 0.3318, + "step": 7283 + }, + { + "epoch": 1.7337418932587614, + "grad_norm": 0.4090601125824329, + "learning_rate": 4.5808445615596386e-07, + "loss": 0.3491, + "step": 7284 + }, + { + "epoch": 1.7339798893318261, + "grad_norm": 0.40724673701790437, + "learning_rate": 4.5727879170147927e-07, + "loss": 0.2832, + "step": 7285 + }, + { + "epoch": 1.7342178854048909, + "grad_norm": 0.4192280684140144, + "learning_rate": 4.564738024077475e-07, + "loss": 0.2829, + "step": 7286 + }, + { + "epoch": 1.7344558814779556, + "grad_norm": 0.3683379060762051, + "learning_rate": 4.5566948839441014e-07, + "loss": 0.3196, + "step": 7287 + }, + { + "epoch": 1.7346938775510203, + "grad_norm": 0.37268180835647635, + "learning_rate": 4.5486584978100766e-07, + "loss": 0.3712, + "step": 7288 + }, + { + "epoch": 1.734931873624085, + "grad_norm": 0.3896434055889146, + "learning_rate": 4.5406288668698175e-07, + "loss": 0.2722, + "step": 7289 + }, + { + "epoch": 1.73516986969715, + "grad_norm": 0.37283077254852875, + "learning_rate": 4.5326059923167185e-07, + "loss": 0.2979, + "step": 7290 + }, + { + "epoch": 1.7354078657702148, + "grad_norm": 0.40147117472261845, + "learning_rate": 4.524589875343177e-07, + "loss": 0.3637, + "step": 7291 + }, + { + "epoch": 1.7356458618432797, + "grad_norm": 0.3635087671699396, + "learning_rate": 4.5165805171405786e-07, + "loss": 0.3019, + "step": 7292 + }, + { + "epoch": 1.7358838579163445, + "grad_norm": 0.359877509083195, + "learning_rate": 4.508577918899326e-07, + "loss": 0.2844, + "step": 7293 + }, + { + "epoch": 1.7361218539894092, + "grad_norm": 0.3989712129889001, + "learning_rate": 4.500582081808802e-07, + "loss": 0.3003, + "step": 7294 + }, + { + "epoch": 1.736359850062474, + "grad_norm": 0.43670169269521797, + "learning_rate": 4.492593007057383e-07, + "loss": 0.3806, + "step": 7295 + }, + { + "epoch": 1.7365978461355387, + "grad_norm": 0.4151227878379709, + "learning_rate": 4.484610695832431e-07, + "loss": 0.2829, + "step": 7296 + }, + { + "epoch": 1.7368358422086034, + "grad_norm": 0.39427740496318525, + "learning_rate": 4.476635149320341e-07, + "loss": 0.2821, + "step": 7297 + }, + { + "epoch": 1.7370738382816684, + "grad_norm": 0.39483412534077605, + "learning_rate": 4.4686663687064537e-07, + "loss": 0.3707, + "step": 7298 + }, + { + "epoch": 1.7373118343547331, + "grad_norm": 0.3811903403162823, + "learning_rate": 4.4607043551751385e-07, + "loss": 0.3131, + "step": 7299 + }, + { + "epoch": 1.737549830427798, + "grad_norm": 0.3787340344910432, + "learning_rate": 4.452749109909743e-07, + "loss": 0.2882, + "step": 7300 + }, + { + "epoch": 1.7377878265008628, + "grad_norm": 0.3977977827225253, + "learning_rate": 4.4448006340926163e-07, + "loss": 0.2914, + "step": 7301 + }, + { + "epoch": 1.7380258225739276, + "grad_norm": 0.3471686985062694, + "learning_rate": 4.4368589289050966e-07, + "loss": 0.3736, + "step": 7302 + }, + { + "epoch": 1.7382638186469923, + "grad_norm": 0.3567461740090027, + "learning_rate": 4.428923995527512e-07, + "loss": 0.2677, + "step": 7303 + }, + { + "epoch": 1.738501814720057, + "grad_norm": 0.3895718708014465, + "learning_rate": 4.4209958351392024e-07, + "loss": 0.2761, + "step": 7304 + }, + { + "epoch": 1.7387398107931218, + "grad_norm": 0.4146289851718209, + "learning_rate": 4.4130744489184805e-07, + "loss": 0.336, + "step": 7305 + }, + { + "epoch": 1.7389778068661867, + "grad_norm": 0.36115931059450657, + "learning_rate": 4.4051598380426606e-07, + "loss": 0.3318, + "step": 7306 + }, + { + "epoch": 1.7392158029392515, + "grad_norm": 0.3901592985931496, + "learning_rate": 4.3972520036880406e-07, + "loss": 0.2939, + "step": 7307 + }, + { + "epoch": 1.7394537990123164, + "grad_norm": 0.3861549670050002, + "learning_rate": 4.389350947029941e-07, + "loss": 0.3264, + "step": 7308 + }, + { + "epoch": 1.7396917950853812, + "grad_norm": 0.39366268242787633, + "learning_rate": 4.381456669242645e-07, + "loss": 0.3827, + "step": 7309 + }, + { + "epoch": 1.739929791158446, + "grad_norm": 0.4147331435398097, + "learning_rate": 4.3735691714994366e-07, + "loss": 0.3196, + "step": 7310 + }, + { + "epoch": 1.7401677872315107, + "grad_norm": 0.4033560437583594, + "learning_rate": 4.365688454972589e-07, + "loss": 0.284, + "step": 7311 + }, + { + "epoch": 1.7404057833045754, + "grad_norm": 0.36568041838421067, + "learning_rate": 4.357814520833381e-07, + "loss": 0.3219, + "step": 7312 + }, + { + "epoch": 1.7406437793776401, + "grad_norm": 0.33395093697829437, + "learning_rate": 4.349947370252067e-07, + "loss": 0.3806, + "step": 7313 + }, + { + "epoch": 1.740881775450705, + "grad_norm": 0.37826064768858925, + "learning_rate": 4.342087004397899e-07, + "loss": 0.3458, + "step": 7314 + }, + { + "epoch": 1.7411197715237698, + "grad_norm": 0.36042290741386074, + "learning_rate": 4.3342334244391215e-07, + "loss": 0.2922, + "step": 7315 + }, + { + "epoch": 1.7413577675968348, + "grad_norm": 0.3742173084173869, + "learning_rate": 4.326386631542978e-07, + "loss": 0.3492, + "step": 7316 + }, + { + "epoch": 1.7415957636698995, + "grad_norm": 0.3611783561237634, + "learning_rate": 4.3185466268756916e-07, + "loss": 0.3284, + "step": 7317 + }, + { + "epoch": 1.7418337597429643, + "grad_norm": 0.36444937618317436, + "learning_rate": 4.310713411602485e-07, + "loss": 0.284, + "step": 7318 + }, + { + "epoch": 1.742071755816029, + "grad_norm": 0.3619558848348502, + "learning_rate": 4.30288698688755e-07, + "loss": 0.3108, + "step": 7319 + }, + { + "epoch": 1.7423097518890938, + "grad_norm": 0.40390658897210363, + "learning_rate": 4.295067353894111e-07, + "loss": 0.3621, + "step": 7320 + }, + { + "epoch": 1.7425477479621585, + "grad_norm": 0.40363738867672744, + "learning_rate": 4.28725451378435e-07, + "loss": 0.2972, + "step": 7321 + }, + { + "epoch": 1.7427857440352235, + "grad_norm": 0.3606197822662685, + "learning_rate": 4.279448467719444e-07, + "loss": 0.2996, + "step": 7322 + }, + { + "epoch": 1.7430237401082882, + "grad_norm": 0.37968170265939494, + "learning_rate": 4.271649216859558e-07, + "loss": 0.3488, + "step": 7323 + }, + { + "epoch": 1.7432617361813532, + "grad_norm": 0.3581004782699803, + "learning_rate": 4.263856762363877e-07, + "loss": 0.3275, + "step": 7324 + }, + { + "epoch": 1.743499732254418, + "grad_norm": 0.36888664733014503, + "learning_rate": 4.256071105390536e-07, + "loss": 0.2782, + "step": 7325 + }, + { + "epoch": 1.7437377283274826, + "grad_norm": 0.3736404487957657, + "learning_rate": 4.2482922470966804e-07, + "loss": 0.3517, + "step": 7326 + }, + { + "epoch": 1.7439757244005474, + "grad_norm": 0.3838130180426928, + "learning_rate": 4.2405201886384364e-07, + "loss": 0.3888, + "step": 7327 + }, + { + "epoch": 1.7442137204736121, + "grad_norm": 0.39228239050902514, + "learning_rate": 4.232754931170929e-07, + "loss": 0.2859, + "step": 7328 + }, + { + "epoch": 1.7444517165466769, + "grad_norm": 0.38813077534276835, + "learning_rate": 4.22499647584827e-07, + "loss": 0.2981, + "step": 7329 + }, + { + "epoch": 1.7446897126197418, + "grad_norm": 0.37696497911373844, + "learning_rate": 4.2172448238235464e-07, + "loss": 0.3654, + "step": 7330 + }, + { + "epoch": 1.7449277086928066, + "grad_norm": 0.39042415238351036, + "learning_rate": 4.2094999762488597e-07, + "loss": 0.3358, + "step": 7331 + }, + { + "epoch": 1.7451657047658715, + "grad_norm": 0.3811058522780075, + "learning_rate": 4.2017619342752723e-07, + "loss": 0.2515, + "step": 7332 + }, + { + "epoch": 1.7454037008389363, + "grad_norm": 0.3830369971256063, + "learning_rate": 4.194030699052859e-07, + "loss": 0.3463, + "step": 7333 + }, + { + "epoch": 1.745641696912001, + "grad_norm": 0.3863649827385546, + "learning_rate": 4.1863062717306724e-07, + "loss": 0.3765, + "step": 7334 + }, + { + "epoch": 1.7458796929850657, + "grad_norm": 0.3740836674286429, + "learning_rate": 4.178588653456733e-07, + "loss": 0.2836, + "step": 7335 + }, + { + "epoch": 1.7461176890581305, + "grad_norm": 0.40533187846971, + "learning_rate": 4.1708778453781017e-07, + "loss": 0.285, + "step": 7336 + }, + { + "epoch": 1.7463556851311952, + "grad_norm": 0.37887619317785987, + "learning_rate": 4.163173848640778e-07, + "loss": 0.356, + "step": 7337 + }, + { + "epoch": 1.7465936812042602, + "grad_norm": 0.3617723264544318, + "learning_rate": 4.155476664389768e-07, + "loss": 0.3547, + "step": 7338 + }, + { + "epoch": 1.746831677277325, + "grad_norm": 0.37992103116699155, + "learning_rate": 4.147786293769068e-07, + "loss": 0.3065, + "step": 7339 + }, + { + "epoch": 1.7470696733503899, + "grad_norm": 0.3848821730711763, + "learning_rate": 4.140102737921653e-07, + "loss": 0.2986, + "step": 7340 + }, + { + "epoch": 1.7473076694234546, + "grad_norm": 0.37504711339234986, + "learning_rate": 4.1324259979894865e-07, + "loss": 0.3887, + "step": 7341 + }, + { + "epoch": 1.7475456654965194, + "grad_norm": 0.38275904514715237, + "learning_rate": 4.1247560751135283e-07, + "loss": 0.3301, + "step": 7342 + }, + { + "epoch": 1.747783661569584, + "grad_norm": 0.38728504743609043, + "learning_rate": 4.117092970433717e-07, + "loss": 0.2825, + "step": 7343 + }, + { + "epoch": 1.7480216576426488, + "grad_norm": 0.3869386086927362, + "learning_rate": 4.10943668508898e-07, + "loss": 0.3368, + "step": 7344 + }, + { + "epoch": 1.7482596537157136, + "grad_norm": 0.41661324253240994, + "learning_rate": 4.101787220217229e-07, + "loss": 0.3815, + "step": 7345 + }, + { + "epoch": 1.7484976497887785, + "grad_norm": 0.34535425484790955, + "learning_rate": 4.09414457695535e-07, + "loss": 0.2993, + "step": 7346 + }, + { + "epoch": 1.7487356458618433, + "grad_norm": 0.3806804219160961, + "learning_rate": 4.0865087564392556e-07, + "loss": 0.3206, + "step": 7347 + }, + { + "epoch": 1.7489736419349082, + "grad_norm": 0.39462354482109935, + "learning_rate": 4.0788797598038054e-07, + "loss": 0.3489, + "step": 7348 + }, + { + "epoch": 1.749211638007973, + "grad_norm": 0.3527473981630956, + "learning_rate": 4.0712575881828585e-07, + "loss": 0.3032, + "step": 7349 + }, + { + "epoch": 1.7494496340810377, + "grad_norm": 0.37882523971462584, + "learning_rate": 4.0636422427092483e-07, + "loss": 0.2507, + "step": 7350 + }, + { + "epoch": 1.7496876301541024, + "grad_norm": 0.4076311890215, + "learning_rate": 4.056033724514813e-07, + "loss": 0.3176, + "step": 7351 + }, + { + "epoch": 1.7499256262271672, + "grad_norm": 0.38737649623817405, + "learning_rate": 4.048432034730371e-07, + "loss": 0.3973, + "step": 7352 + }, + { + "epoch": 1.750163622300232, + "grad_norm": 0.36019762556095697, + "learning_rate": 4.040837174485718e-07, + "loss": 0.2926, + "step": 7353 + }, + { + "epoch": 1.7504016183732969, + "grad_norm": 0.3757932232128984, + "learning_rate": 4.033249144909629e-07, + "loss": 0.288, + "step": 7354 + }, + { + "epoch": 1.7506396144463616, + "grad_norm": 0.3784418578541022, + "learning_rate": 4.0256679471298856e-07, + "loss": 0.3387, + "step": 7355 + }, + { + "epoch": 1.7508776105194266, + "grad_norm": 0.36293048107545833, + "learning_rate": 4.01809358227323e-07, + "loss": 0.3649, + "step": 7356 + }, + { + "epoch": 1.7511156065924913, + "grad_norm": 0.37609039329825666, + "learning_rate": 4.0105260514654076e-07, + "loss": 0.3159, + "step": 7357 + }, + { + "epoch": 1.751353602665556, + "grad_norm": 0.4037114023056679, + "learning_rate": 4.00296535583114e-07, + "loss": 0.3318, + "step": 7358 + }, + { + "epoch": 1.7515915987386208, + "grad_norm": 0.42110395329000605, + "learning_rate": 3.9954114964941336e-07, + "loss": 0.3875, + "step": 7359 + }, + { + "epoch": 1.7518295948116855, + "grad_norm": 0.39591006197133266, + "learning_rate": 3.9878644745770745e-07, + "loss": 0.3029, + "step": 7360 + }, + { + "epoch": 1.7520675908847503, + "grad_norm": 0.395799256886029, + "learning_rate": 3.9803242912016427e-07, + "loss": 0.288, + "step": 7361 + }, + { + "epoch": 1.7523055869578152, + "grad_norm": 0.397500164340299, + "learning_rate": 3.972790947488481e-07, + "loss": 0.3269, + "step": 7362 + }, + { + "epoch": 1.75254358303088, + "grad_norm": 0.40196952358276516, + "learning_rate": 3.965264444557254e-07, + "loss": 0.3584, + "step": 7363 + }, + { + "epoch": 1.752781579103945, + "grad_norm": 0.37722120669417086, + "learning_rate": 3.9577447835265734e-07, + "loss": 0.2634, + "step": 7364 + }, + { + "epoch": 1.7530195751770097, + "grad_norm": 0.40416706874239217, + "learning_rate": 3.950231965514051e-07, + "loss": 0.3041, + "step": 7365 + }, + { + "epoch": 1.7532575712500744, + "grad_norm": 0.3524829672281888, + "learning_rate": 3.94272599163627e-07, + "loss": 0.3895, + "step": 7366 + }, + { + "epoch": 1.7534955673231392, + "grad_norm": 0.40869397302371735, + "learning_rate": 3.935226863008812e-07, + "loss": 0.2801, + "step": 7367 + }, + { + "epoch": 1.753733563396204, + "grad_norm": 0.3844052458895688, + "learning_rate": 3.9277345807462285e-07, + "loss": 0.2741, + "step": 7368 + }, + { + "epoch": 1.7539715594692686, + "grad_norm": 0.3940308898609167, + "learning_rate": 3.920249145962063e-07, + "loss": 0.3044, + "step": 7369 + }, + { + "epoch": 1.7542095555423336, + "grad_norm": 0.37516037241099764, + "learning_rate": 3.9127705597688305e-07, + "loss": 0.3628, + "step": 7370 + }, + { + "epoch": 1.7544475516153983, + "grad_norm": 0.36404405464445827, + "learning_rate": 3.9052988232780364e-07, + "loss": 0.2845, + "step": 7371 + }, + { + "epoch": 1.7546855476884633, + "grad_norm": 0.37799405975146255, + "learning_rate": 3.8978339376001594e-07, + "loss": 0.2724, + "step": 7372 + }, + { + "epoch": 1.754923543761528, + "grad_norm": 0.3851431947005174, + "learning_rate": 3.8903759038446676e-07, + "loss": 0.3726, + "step": 7373 + }, + { + "epoch": 1.7551615398345928, + "grad_norm": 0.35648046802246963, + "learning_rate": 3.882924723120024e-07, + "loss": 0.3407, + "step": 7374 + }, + { + "epoch": 1.7553995359076575, + "grad_norm": 0.3805853248880385, + "learning_rate": 3.875480396533648e-07, + "loss": 0.276, + "step": 7375 + }, + { + "epoch": 1.7556375319807223, + "grad_norm": 0.43047194201246963, + "learning_rate": 3.8680429251919504e-07, + "loss": 0.3141, + "step": 7376 + }, + { + "epoch": 1.755875528053787, + "grad_norm": 0.3672797811079228, + "learning_rate": 3.8606123102003124e-07, + "loss": 0.3954, + "step": 7377 + }, + { + "epoch": 1.756113524126852, + "grad_norm": 0.3655093853884352, + "learning_rate": 3.8531885526631286e-07, + "loss": 0.266, + "step": 7378 + }, + { + "epoch": 1.7563515201999167, + "grad_norm": 0.4008961738434252, + "learning_rate": 3.845771653683744e-07, + "loss": 0.2754, + "step": 7379 + }, + { + "epoch": 1.7565895162729817, + "grad_norm": 0.3754989479985082, + "learning_rate": 3.8383616143644884e-07, + "loss": 0.3759, + "step": 7380 + }, + { + "epoch": 1.7568275123460464, + "grad_norm": 0.3998697477070844, + "learning_rate": 3.8309584358066866e-07, + "loss": 0.3457, + "step": 7381 + }, + { + "epoch": 1.7570655084191111, + "grad_norm": 0.3805072843335442, + "learning_rate": 3.823562119110624e-07, + "loss": 0.305, + "step": 7382 + }, + { + "epoch": 1.7573035044921759, + "grad_norm": 0.38537738584226183, + "learning_rate": 3.816172665375584e-07, + "loss": 0.3079, + "step": 7383 + }, + { + "epoch": 1.7575415005652406, + "grad_norm": 0.37422390675327777, + "learning_rate": 3.808790075699814e-07, + "loss": 0.3951, + "step": 7384 + }, + { + "epoch": 1.7577794966383053, + "grad_norm": 0.3857626430504062, + "learning_rate": 3.80141435118056e-07, + "loss": 0.2866, + "step": 7385 + }, + { + "epoch": 1.7580174927113703, + "grad_norm": 0.3848693921935248, + "learning_rate": 3.794045492914028e-07, + "loss": 0.2799, + "step": 7386 + }, + { + "epoch": 1.758255488784435, + "grad_norm": 0.37773875751263675, + "learning_rate": 3.786683501995414e-07, + "loss": 0.3419, + "step": 7387 + }, + { + "epoch": 1.7584934848575, + "grad_norm": 0.39284310713498916, + "learning_rate": 3.779328379518898e-07, + "loss": 0.3779, + "step": 7388 + }, + { + "epoch": 1.7587314809305648, + "grad_norm": 0.38708082552128015, + "learning_rate": 3.771980126577629e-07, + "loss": 0.2952, + "step": 7389 + }, + { + "epoch": 1.7589694770036295, + "grad_norm": 0.3840167669470753, + "learning_rate": 3.7646387442637266e-07, + "loss": 0.3068, + "step": 7390 + }, + { + "epoch": 1.7592074730766942, + "grad_norm": 0.37577064742826666, + "learning_rate": 3.757304233668324e-07, + "loss": 0.3682, + "step": 7391 + }, + { + "epoch": 1.759445469149759, + "grad_norm": 0.37383506429697316, + "learning_rate": 3.749976595881505e-07, + "loss": 0.2946, + "step": 7392 + }, + { + "epoch": 1.7596834652228237, + "grad_norm": 0.3938798316973006, + "learning_rate": 3.742655831992331e-07, + "loss": 0.2602, + "step": 7393 + }, + { + "epoch": 1.7599214612958887, + "grad_norm": 0.382277877373375, + "learning_rate": 3.735341943088855e-07, + "loss": 0.3399, + "step": 7394 + }, + { + "epoch": 1.7601594573689534, + "grad_norm": 0.4438405373662082, + "learning_rate": 3.7280349302580954e-07, + "loss": 0.4037, + "step": 7395 + }, + { + "epoch": 1.7603974534420184, + "grad_norm": 0.38449221017426694, + "learning_rate": 3.720734794586062e-07, + "loss": 0.303, + "step": 7396 + }, + { + "epoch": 1.760635449515083, + "grad_norm": 0.37868139806924034, + "learning_rate": 3.7134415371577303e-07, + "loss": 0.2986, + "step": 7397 + }, + { + "epoch": 1.7608734455881478, + "grad_norm": 0.3943383369047064, + "learning_rate": 3.7061551590570565e-07, + "loss": 0.3536, + "step": 7398 + }, + { + "epoch": 1.7611114416612126, + "grad_norm": 0.36959772575231714, + "learning_rate": 3.698875661366985e-07, + "loss": 0.2997, + "step": 7399 + }, + { + "epoch": 1.7613494377342773, + "grad_norm": 0.40020141349107846, + "learning_rate": 3.691603045169417e-07, + "loss": 0.2694, + "step": 7400 + }, + { + "epoch": 1.761587433807342, + "grad_norm": 0.37634863654754686, + "learning_rate": 3.684337311545261e-07, + "loss": 0.3371, + "step": 7401 + }, + { + "epoch": 1.761825429880407, + "grad_norm": 0.38956164399358506, + "learning_rate": 3.677078461574368e-07, + "loss": 0.3591, + "step": 7402 + }, + { + "epoch": 1.7620634259534718, + "grad_norm": 0.3602522289158944, + "learning_rate": 3.6698264963355936e-07, + "loss": 0.2936, + "step": 7403 + }, + { + "epoch": 1.7623014220265367, + "grad_norm": 0.3761002128092343, + "learning_rate": 3.662581416906746e-07, + "loss": 0.3009, + "step": 7404 + }, + { + "epoch": 1.7625394180996015, + "grad_norm": 0.4007846365643332, + "learning_rate": 3.6553432243646435e-07, + "loss": 0.3523, + "step": 7405 + }, + { + "epoch": 1.7627774141726662, + "grad_norm": 0.35494662132278193, + "learning_rate": 3.6481119197850466e-07, + "loss": 0.3274, + "step": 7406 + }, + { + "epoch": 1.763015410245731, + "grad_norm": 0.3790695232145245, + "learning_rate": 3.640887504242707e-07, + "loss": 0.2875, + "step": 7407 + }, + { + "epoch": 1.7632534063187957, + "grad_norm": 0.4084614475813453, + "learning_rate": 3.6336699788113605e-07, + "loss": 0.3147, + "step": 7408 + }, + { + "epoch": 1.7634914023918604, + "grad_norm": 0.37225641815520183, + "learning_rate": 3.626459344563693e-07, + "loss": 0.384, + "step": 7409 + }, + { + "epoch": 1.7637293984649254, + "grad_norm": 0.36150847703005184, + "learning_rate": 3.619255602571403e-07, + "loss": 0.3092, + "step": 7410 + }, + { + "epoch": 1.7639673945379901, + "grad_norm": 0.3522846334053475, + "learning_rate": 3.612058753905129e-07, + "loss": 0.2549, + "step": 7411 + }, + { + "epoch": 1.764205390611055, + "grad_norm": 0.36618028447488987, + "learning_rate": 3.604868799634509e-07, + "loss": 0.3265, + "step": 7412 + }, + { + "epoch": 1.7644433866841198, + "grad_norm": 0.3566376234452962, + "learning_rate": 3.597685740828144e-07, + "loss": 0.3793, + "step": 7413 + }, + { + "epoch": 1.7646813827571846, + "grad_norm": 0.36692847158051417, + "learning_rate": 3.5905095785536135e-07, + "loss": 0.2872, + "step": 7414 + }, + { + "epoch": 1.7649193788302493, + "grad_norm": 0.37257656439382925, + "learning_rate": 3.5833403138774756e-07, + "loss": 0.3225, + "step": 7415 + }, + { + "epoch": 1.765157374903314, + "grad_norm": 0.3749686078408963, + "learning_rate": 3.5761779478652614e-07, + "loss": 0.3883, + "step": 7416 + }, + { + "epoch": 1.7653953709763788, + "grad_norm": 0.3583625843405949, + "learning_rate": 3.56902248158148e-07, + "loss": 0.2977, + "step": 7417 + }, + { + "epoch": 1.7656333670494437, + "grad_norm": 0.38524990025382316, + "learning_rate": 3.5618739160895864e-07, + "loss": 0.2877, + "step": 7418 + }, + { + "epoch": 1.7658713631225085, + "grad_norm": 0.37604720661475144, + "learning_rate": 3.55473225245207e-07, + "loss": 0.3219, + "step": 7419 + }, + { + "epoch": 1.7661093591955734, + "grad_norm": 0.3846604314430845, + "learning_rate": 3.5475974917303366e-07, + "loss": 0.3566, + "step": 7420 + }, + { + "epoch": 1.7663473552686382, + "grad_norm": 0.3680527964727019, + "learning_rate": 3.5404696349847944e-07, + "loss": 0.2627, + "step": 7421 + }, + { + "epoch": 1.766585351341703, + "grad_norm": 0.3991689202195992, + "learning_rate": 3.5333486832748176e-07, + "loss": 0.2867, + "step": 7422 + }, + { + "epoch": 1.7668233474147677, + "grad_norm": 0.3999787206661557, + "learning_rate": 3.5262346376587544e-07, + "loss": 0.3645, + "step": 7423 + }, + { + "epoch": 1.7670613434878324, + "grad_norm": 0.3757038581644989, + "learning_rate": 3.5191274991939306e-07, + "loss": 0.3368, + "step": 7424 + }, + { + "epoch": 1.7672993395608971, + "grad_norm": 0.382050405949238, + "learning_rate": 3.512027268936641e-07, + "loss": 0.2718, + "step": 7425 + }, + { + "epoch": 1.767537335633962, + "grad_norm": 0.38279994990044025, + "learning_rate": 3.504933947942157e-07, + "loss": 0.3359, + "step": 7426 + }, + { + "epoch": 1.7677753317070268, + "grad_norm": 0.38419625146613884, + "learning_rate": 3.4978475372647145e-07, + "loss": 0.3792, + "step": 7427 + }, + { + "epoch": 1.7680133277800918, + "grad_norm": 0.3984627542168908, + "learning_rate": 3.4907680379575426e-07, + "loss": 0.2835, + "step": 7428 + }, + { + "epoch": 1.7682513238531565, + "grad_norm": 0.37364849406584333, + "learning_rate": 3.4836954510728215e-07, + "loss": 0.2711, + "step": 7429 + }, + { + "epoch": 1.7684893199262213, + "grad_norm": 0.42900751703465184, + "learning_rate": 3.476629777661716e-07, + "loss": 0.3473, + "step": 7430 + }, + { + "epoch": 1.768727315999286, + "grad_norm": 0.3800418075868142, + "learning_rate": 3.469571018774348e-07, + "loss": 0.3456, + "step": 7431 + }, + { + "epoch": 1.7689653120723507, + "grad_norm": 0.43203134753145855, + "learning_rate": 3.462519175459844e-07, + "loss": 0.2851, + "step": 7432 + }, + { + "epoch": 1.7692033081454155, + "grad_norm": 0.40985553269034813, + "learning_rate": 3.4554742487662716e-07, + "loss": 0.3091, + "step": 7433 + }, + { + "epoch": 1.7694413042184804, + "grad_norm": 0.38887010959356444, + "learning_rate": 3.448436239740682e-07, + "loss": 0.3986, + "step": 7434 + }, + { + "epoch": 1.7696793002915452, + "grad_norm": 0.36915654756950994, + "learning_rate": 3.4414051494291e-07, + "loss": 0.2855, + "step": 7435 + }, + { + "epoch": 1.7699172963646101, + "grad_norm": 0.386681678539916, + "learning_rate": 3.434380978876517e-07, + "loss": 0.3074, + "step": 7436 + }, + { + "epoch": 1.7701552924376749, + "grad_norm": 0.37941576584294967, + "learning_rate": 3.4273637291268926e-07, + "loss": 0.3312, + "step": 7437 + }, + { + "epoch": 1.7703932885107396, + "grad_norm": 0.3603145085548946, + "learning_rate": 3.4203534012231753e-07, + "loss": 0.3694, + "step": 7438 + }, + { + "epoch": 1.7706312845838044, + "grad_norm": 0.38237676060980935, + "learning_rate": 3.413349996207266e-07, + "loss": 0.2842, + "step": 7439 + }, + { + "epoch": 1.770869280656869, + "grad_norm": 0.39329602306550704, + "learning_rate": 3.4063535151200424e-07, + "loss": 0.3089, + "step": 7440 + }, + { + "epoch": 1.7711072767299338, + "grad_norm": 0.3520088960926671, + "learning_rate": 3.3993639590013615e-07, + "loss": 0.367, + "step": 7441 + }, + { + "epoch": 1.7713452728029988, + "grad_norm": 0.3549709470925406, + "learning_rate": 3.3923813288900376e-07, + "loss": 0.303, + "step": 7442 + }, + { + "epoch": 1.7715832688760635, + "grad_norm": 0.3698693474185596, + "learning_rate": 3.3854056258238675e-07, + "loss": 0.2703, + "step": 7443 + }, + { + "epoch": 1.7718212649491285, + "grad_norm": 0.3744847290334765, + "learning_rate": 3.378436850839612e-07, + "loss": 0.3261, + "step": 7444 + }, + { + "epoch": 1.7720592610221932, + "grad_norm": 0.3841273561075149, + "learning_rate": 3.3714750049729903e-07, + "loss": 0.3532, + "step": 7445 + }, + { + "epoch": 1.772297257095258, + "grad_norm": 0.3673423324794477, + "learning_rate": 3.364520089258727e-07, + "loss": 0.2856, + "step": 7446 + }, + { + "epoch": 1.7725352531683227, + "grad_norm": 0.39543112396129604, + "learning_rate": 3.3575721047304887e-07, + "loss": 0.3264, + "step": 7447 + }, + { + "epoch": 1.7727732492413875, + "grad_norm": 0.38069356140983673, + "learning_rate": 3.350631052420911e-07, + "loss": 0.3384, + "step": 7448 + }, + { + "epoch": 1.7730112453144522, + "grad_norm": 0.39721194025902407, + "learning_rate": 3.3436969333616064e-07, + "loss": 0.3504, + "step": 7449 + }, + { + "epoch": 1.7732492413875172, + "grad_norm": 0.41294146060598996, + "learning_rate": 3.3367697485831573e-07, + "loss": 0.309, + "step": 7450 + }, + { + "epoch": 1.773487237460582, + "grad_norm": 0.4004419253097933, + "learning_rate": 3.3298494991151234e-07, + "loss": 0.3409, + "step": 7451 + }, + { + "epoch": 1.7737252335336469, + "grad_norm": 0.3787521305790956, + "learning_rate": 3.322936185986009e-07, + "loss": 0.4034, + "step": 7452 + }, + { + "epoch": 1.7739632296067116, + "grad_norm": 0.36357233108658643, + "learning_rate": 3.31602981022332e-07, + "loss": 0.2957, + "step": 7453 + }, + { + "epoch": 1.7742012256797763, + "grad_norm": 0.3702141009628778, + "learning_rate": 3.309130372853492e-07, + "loss": 0.2788, + "step": 7454 + }, + { + "epoch": 1.774439221752841, + "grad_norm": 0.41114542834457846, + "learning_rate": 3.302237874901981e-07, + "loss": 0.3846, + "step": 7455 + }, + { + "epoch": 1.7746772178259058, + "grad_norm": 0.35691967017327136, + "learning_rate": 3.2953523173931633e-07, + "loss": 0.353, + "step": 7456 + }, + { + "epoch": 1.7749152138989706, + "grad_norm": 0.3949364534978649, + "learning_rate": 3.2884737013504143e-07, + "loss": 0.2854, + "step": 7457 + }, + { + "epoch": 1.7751532099720353, + "grad_norm": 0.38350872066977443, + "learning_rate": 3.2816020277960604e-07, + "loss": 0.3232, + "step": 7458 + }, + { + "epoch": 1.7753912060451003, + "grad_norm": 0.4021154125147116, + "learning_rate": 3.2747372977513905e-07, + "loss": 0.3592, + "step": 7459 + }, + { + "epoch": 1.775629202118165, + "grad_norm": 0.3879804363704007, + "learning_rate": 3.2678795122366933e-07, + "loss": 0.2945, + "step": 7460 + }, + { + "epoch": 1.77586719819123, + "grad_norm": 0.4237627394310612, + "learning_rate": 3.2610286722711993e-07, + "loss": 0.2732, + "step": 7461 + }, + { + "epoch": 1.7761051942642947, + "grad_norm": 0.3924917993001975, + "learning_rate": 3.2541847788731153e-07, + "loss": 0.3461, + "step": 7462 + }, + { + "epoch": 1.7763431903373594, + "grad_norm": 0.3627340154994653, + "learning_rate": 3.2473478330595996e-07, + "loss": 0.3414, + "step": 7463 + }, + { + "epoch": 1.7765811864104242, + "grad_norm": 0.3879803415205633, + "learning_rate": 3.240517835846807e-07, + "loss": 0.2865, + "step": 7464 + }, + { + "epoch": 1.776819182483489, + "grad_norm": 0.36721296027414413, + "learning_rate": 3.233694788249836e-07, + "loss": 0.3411, + "step": 7465 + }, + { + "epoch": 1.7770571785565537, + "grad_norm": 0.37560707117027603, + "learning_rate": 3.2268786912827645e-07, + "loss": 0.3818, + "step": 7466 + }, + { + "epoch": 1.7772951746296186, + "grad_norm": 0.37942928089496036, + "learning_rate": 3.220069545958632e-07, + "loss": 0.3262, + "step": 7467 + }, + { + "epoch": 1.7775331707026834, + "grad_norm": 0.35866524083415724, + "learning_rate": 3.2132673532894397e-07, + "loss": 0.2738, + "step": 7468 + }, + { + "epoch": 1.7777711667757483, + "grad_norm": 0.4579563149174641, + "learning_rate": 3.206472114286169e-07, + "loss": 0.3463, + "step": 7469 + }, + { + "epoch": 1.778009162848813, + "grad_norm": 0.374846770314416, + "learning_rate": 3.1996838299587604e-07, + "loss": 0.369, + "step": 7470 + }, + { + "epoch": 1.7782471589218778, + "grad_norm": 0.3575595417175333, + "learning_rate": 3.192902501316114e-07, + "loss": 0.322, + "step": 7471 + }, + { + "epoch": 1.7784851549949425, + "grad_norm": 0.3793283868853329, + "learning_rate": 3.186128129366112e-07, + "loss": 0.2936, + "step": 7472 + }, + { + "epoch": 1.7787231510680073, + "grad_norm": 0.4086026355069854, + "learning_rate": 3.179360715115576e-07, + "loss": 0.355, + "step": 7473 + }, + { + "epoch": 1.778961147141072, + "grad_norm": 0.35603960063048445, + "learning_rate": 3.172600259570335e-07, + "loss": 0.3446, + "step": 7474 + }, + { + "epoch": 1.779199143214137, + "grad_norm": 0.3594124787406389, + "learning_rate": 3.165846763735153e-07, + "loss": 0.2763, + "step": 7475 + }, + { + "epoch": 1.7794371392872017, + "grad_norm": 0.35661873853503745, + "learning_rate": 3.1591002286137597e-07, + "loss": 0.3205, + "step": 7476 + }, + { + "epoch": 1.7796751353602667, + "grad_norm": 0.37799283767267144, + "learning_rate": 3.152360655208864e-07, + "loss": 0.3899, + "step": 7477 + }, + { + "epoch": 1.7799131314333314, + "grad_norm": 0.3943463609437691, + "learning_rate": 3.1456280445221256e-07, + "loss": 0.3142, + "step": 7478 + }, + { + "epoch": 1.7801511275063961, + "grad_norm": 0.3930479973881174, + "learning_rate": 3.138902397554183e-07, + "loss": 0.2794, + "step": 7479 + }, + { + "epoch": 1.7803891235794609, + "grad_norm": 0.37084479307111856, + "learning_rate": 3.132183715304632e-07, + "loss": 0.3584, + "step": 7480 + }, + { + "epoch": 1.7806271196525256, + "grad_norm": 0.3640689537300512, + "learning_rate": 3.125471998772023e-07, + "loss": 0.3562, + "step": 7481 + }, + { + "epoch": 1.7808651157255904, + "grad_norm": 0.3798227659828128, + "learning_rate": 3.118767248953908e-07, + "loss": 0.2747, + "step": 7482 + }, + { + "epoch": 1.7811031117986553, + "grad_norm": 0.39461973353159785, + "learning_rate": 3.112069466846762e-07, + "loss": 0.3147, + "step": 7483 + }, + { + "epoch": 1.78134110787172, + "grad_norm": 0.39876904022229337, + "learning_rate": 3.105378653446045e-07, + "loss": 0.4053, + "step": 7484 + }, + { + "epoch": 1.781579103944785, + "grad_norm": 0.363220255107949, + "learning_rate": 3.098694809746183e-07, + "loss": 0.3208, + "step": 7485 + }, + { + "epoch": 1.7818171000178498, + "grad_norm": 0.4052186140806434, + "learning_rate": 3.092017936740549e-07, + "loss": 0.2928, + "step": 7486 + }, + { + "epoch": 1.7820550960909145, + "grad_norm": 0.3622874186549825, + "learning_rate": 3.085348035421487e-07, + "loss": 0.3231, + "step": 7487 + }, + { + "epoch": 1.7822930921639792, + "grad_norm": 0.39892992338775146, + "learning_rate": 3.0786851067803326e-07, + "loss": 0.3581, + "step": 7488 + }, + { + "epoch": 1.782531088237044, + "grad_norm": 0.4289428994215822, + "learning_rate": 3.0720291518073485e-07, + "loss": 0.2747, + "step": 7489 + }, + { + "epoch": 1.7827690843101087, + "grad_norm": 0.4117645952035017, + "learning_rate": 3.065380171491772e-07, + "loss": 0.3095, + "step": 7490 + }, + { + "epoch": 1.7830070803831737, + "grad_norm": 0.3635531180905779, + "learning_rate": 3.0587381668218117e-07, + "loss": 0.3608, + "step": 7491 + }, + { + "epoch": 1.7832450764562384, + "grad_norm": 0.3678683816926744, + "learning_rate": 3.052103138784629e-07, + "loss": 0.3101, + "step": 7492 + }, + { + "epoch": 1.7834830725293034, + "grad_norm": 0.3893055958544627, + "learning_rate": 3.045475088366351e-07, + "loss": 0.2619, + "step": 7493 + }, + { + "epoch": 1.7837210686023681, + "grad_norm": 0.3745763428587718, + "learning_rate": 3.038854016552079e-07, + "loss": 0.3315, + "step": 7494 + }, + { + "epoch": 1.7839590646754329, + "grad_norm": 0.3797581457610855, + "learning_rate": 3.0322399243258583e-07, + "loss": 0.3711, + "step": 7495 + }, + { + "epoch": 1.7841970607484976, + "grad_norm": 0.39193763190558745, + "learning_rate": 3.0256328126707147e-07, + "loss": 0.3174, + "step": 7496 + }, + { + "epoch": 1.7844350568215623, + "grad_norm": 0.3855942012320072, + "learning_rate": 3.0190326825686234e-07, + "loss": 0.3247, + "step": 7497 + }, + { + "epoch": 1.784673052894627, + "grad_norm": 0.3719708597803907, + "learning_rate": 3.012439535000533e-07, + "loss": 0.3487, + "step": 7498 + }, + { + "epoch": 1.784911048967692, + "grad_norm": 0.3389285568386847, + "learning_rate": 3.005853370946338e-07, + "loss": 0.3018, + "step": 7499 + }, + { + "epoch": 1.7851490450407568, + "grad_norm": 0.3686403079627146, + "learning_rate": 2.9992741913849044e-07, + "loss": 0.2694, + "step": 7500 + }, + { + "epoch": 1.7853870411138217, + "grad_norm": 0.3739646758479323, + "learning_rate": 2.9927019972940785e-07, + "loss": 0.3262, + "step": 7501 + }, + { + "epoch": 1.7856250371868865, + "grad_norm": 0.3733801667577494, + "learning_rate": 2.9861367896506397e-07, + "loss": 0.3706, + "step": 7502 + }, + { + "epoch": 1.7858630332599512, + "grad_norm": 0.3896825661654586, + "learning_rate": 2.9795785694303413e-07, + "loss": 0.3129, + "step": 7503 + }, + { + "epoch": 1.786101029333016, + "grad_norm": 0.41296329329174347, + "learning_rate": 2.9730273376078923e-07, + "loss": 0.2796, + "step": 7504 + }, + { + "epoch": 1.7863390254060807, + "grad_norm": 0.3719130039198455, + "learning_rate": 2.9664830951569743e-07, + "loss": 0.3465, + "step": 7505 + }, + { + "epoch": 1.7865770214791454, + "grad_norm": 0.3562312311470605, + "learning_rate": 2.959945843050227e-07, + "loss": 0.3219, + "step": 7506 + }, + { + "epoch": 1.7868150175522104, + "grad_norm": 0.38540030460210417, + "learning_rate": 2.9534155822592336e-07, + "loss": 0.2877, + "step": 7507 + }, + { + "epoch": 1.7870530136252751, + "grad_norm": 0.378036461217732, + "learning_rate": 2.9468923137545626e-07, + "loss": 0.3406, + "step": 7508 + }, + { + "epoch": 1.78729100969834, + "grad_norm": 0.402519853904251, + "learning_rate": 2.940376038505732e-07, + "loss": 0.3822, + "step": 7509 + }, + { + "epoch": 1.7875290057714048, + "grad_norm": 0.37522413264398013, + "learning_rate": 2.933866757481224e-07, + "loss": 0.3143, + "step": 7510 + }, + { + "epoch": 1.7877670018444696, + "grad_norm": 0.3791804024356112, + "learning_rate": 2.9273644716484753e-07, + "loss": 0.2764, + "step": 7511 + }, + { + "epoch": 1.7880049979175343, + "grad_norm": 0.40095271597833637, + "learning_rate": 2.9208691819738844e-07, + "loss": 0.3297, + "step": 7512 + }, + { + "epoch": 1.788242993990599, + "grad_norm": 0.39865047915199964, + "learning_rate": 2.914380889422819e-07, + "loss": 0.3583, + "step": 7513 + }, + { + "epoch": 1.7884809900636638, + "grad_norm": 0.35173957915577636, + "learning_rate": 2.9078995949595847e-07, + "loss": 0.289, + "step": 7514 + }, + { + "epoch": 1.7887189861367288, + "grad_norm": 0.3817357829174564, + "learning_rate": 2.901425299547483e-07, + "loss": 0.3104, + "step": 7515 + }, + { + "epoch": 1.7889569822097935, + "grad_norm": 0.37478155473085156, + "learning_rate": 2.8949580041487457e-07, + "loss": 0.391, + "step": 7516 + }, + { + "epoch": 1.7891949782828585, + "grad_norm": 0.3725958003240469, + "learning_rate": 2.8884977097245694e-07, + "loss": 0.3156, + "step": 7517 + }, + { + "epoch": 1.7894329743559232, + "grad_norm": 0.3801605189515548, + "learning_rate": 2.8820444172351137e-07, + "loss": 0.281, + "step": 7518 + }, + { + "epoch": 1.789670970428988, + "grad_norm": 0.3623288254644941, + "learning_rate": 2.8755981276395005e-07, + "loss": 0.3224, + "step": 7519 + }, + { + "epoch": 1.7899089665020527, + "grad_norm": 0.3776619119078162, + "learning_rate": 2.869158841895808e-07, + "loss": 0.3689, + "step": 7520 + }, + { + "epoch": 1.7901469625751174, + "grad_norm": 0.35832837886741215, + "learning_rate": 2.862726560961071e-07, + "loss": 0.2744, + "step": 7521 + }, + { + "epoch": 1.7903849586481821, + "grad_norm": 0.38760652888202674, + "learning_rate": 2.8563012857912906e-07, + "loss": 0.303, + "step": 7522 + }, + { + "epoch": 1.790622954721247, + "grad_norm": 0.3754292098719095, + "learning_rate": 2.849883017341415e-07, + "loss": 0.3525, + "step": 7523 + }, + { + "epoch": 1.7908609507943118, + "grad_norm": 0.3492865594689972, + "learning_rate": 2.8434717565653635e-07, + "loss": 0.337, + "step": 7524 + }, + { + "epoch": 1.7910989468673768, + "grad_norm": 0.3666136541523488, + "learning_rate": 2.837067504416002e-07, + "loss": 0.3009, + "step": 7525 + }, + { + "epoch": 1.7913369429404415, + "grad_norm": 0.37743641568773106, + "learning_rate": 2.830670261845164e-07, + "loss": 0.3058, + "step": 7526 + }, + { + "epoch": 1.7915749390135063, + "grad_norm": 0.4041150603093485, + "learning_rate": 2.8242800298036443e-07, + "loss": 0.3588, + "step": 7527 + }, + { + "epoch": 1.791812935086571, + "grad_norm": 0.3678697803590038, + "learning_rate": 2.8178968092411717e-07, + "loss": 0.2898, + "step": 7528 + }, + { + "epoch": 1.7920509311596358, + "grad_norm": 0.40176844938042455, + "learning_rate": 2.8115206011064655e-07, + "loss": 0.2558, + "step": 7529 + }, + { + "epoch": 1.7922889272327005, + "grad_norm": 0.3917439000088701, + "learning_rate": 2.80515140634719e-07, + "loss": 0.3533, + "step": 7530 + }, + { + "epoch": 1.7925269233057655, + "grad_norm": 0.36028134434707115, + "learning_rate": 2.798789225909959e-07, + "loss": 0.3531, + "step": 7531 + }, + { + "epoch": 1.7927649193788302, + "grad_norm": 0.37200867528731646, + "learning_rate": 2.792434060740351e-07, + "loss": 0.2834, + "step": 7532 + }, + { + "epoch": 1.7930029154518952, + "grad_norm": 0.40052191062969406, + "learning_rate": 2.7860859117828985e-07, + "loss": 0.3288, + "step": 7533 + }, + { + "epoch": 1.79324091152496, + "grad_norm": 0.36692664516075085, + "learning_rate": 2.779744779981097e-07, + "loss": 0.4008, + "step": 7534 + }, + { + "epoch": 1.7934789075980246, + "grad_norm": 0.37862040939536806, + "learning_rate": 2.773410666277382e-07, + "loss": 0.3141, + "step": 7535 + }, + { + "epoch": 1.7937169036710894, + "grad_norm": 0.4291885543813882, + "learning_rate": 2.767083571613183e-07, + "loss": 0.2753, + "step": 7536 + }, + { + "epoch": 1.7939548997441541, + "grad_norm": 0.3603867639177607, + "learning_rate": 2.7607634969288535e-07, + "loss": 0.3465, + "step": 7537 + }, + { + "epoch": 1.7941928958172189, + "grad_norm": 0.3909700151084424, + "learning_rate": 2.7544504431637085e-07, + "loss": 0.4081, + "step": 7538 + }, + { + "epoch": 1.7944308918902838, + "grad_norm": 0.37728790668800677, + "learning_rate": 2.748144411256026e-07, + "loss": 0.2785, + "step": 7539 + }, + { + "epoch": 1.7946688879633486, + "grad_norm": 0.36294912722499145, + "learning_rate": 2.74184540214304e-07, + "loss": 0.3051, + "step": 7540 + }, + { + "epoch": 1.7949068840364135, + "grad_norm": 0.35721883529172505, + "learning_rate": 2.7355534167609334e-07, + "loss": 0.3684, + "step": 7541 + }, + { + "epoch": 1.7951448801094783, + "grad_norm": 0.3554032641789872, + "learning_rate": 2.7292684560448537e-07, + "loss": 0.2988, + "step": 7542 + }, + { + "epoch": 1.795382876182543, + "grad_norm": 0.43060025594932566, + "learning_rate": 2.722990520928903e-07, + "loss": 0.3254, + "step": 7543 + }, + { + "epoch": 1.7956208722556077, + "grad_norm": 0.380455936842252, + "learning_rate": 2.716719612346147e-07, + "loss": 0.3326, + "step": 7544 + }, + { + "epoch": 1.7958588683286725, + "grad_norm": 0.4177701326166782, + "learning_rate": 2.7104557312285786e-07, + "loss": 0.3889, + "step": 7545 + }, + { + "epoch": 1.7960968644017372, + "grad_norm": 0.3649041520875534, + "learning_rate": 2.7041988785071804e-07, + "loss": 0.2986, + "step": 7546 + }, + { + "epoch": 1.7963348604748022, + "grad_norm": 0.39864708497247475, + "learning_rate": 2.697949055111876e-07, + "loss": 0.2909, + "step": 7547 + }, + { + "epoch": 1.796572856547867, + "grad_norm": 0.380155366498838, + "learning_rate": 2.691706261971533e-07, + "loss": 0.363, + "step": 7548 + }, + { + "epoch": 1.7968108526209319, + "grad_norm": 0.3973304332381195, + "learning_rate": 2.685470500013987e-07, + "loss": 0.3397, + "step": 7549 + }, + { + "epoch": 1.7970488486939966, + "grad_norm": 0.3718159670234423, + "learning_rate": 2.679241770166036e-07, + "loss": 0.2915, + "step": 7550 + }, + { + "epoch": 1.7972868447670614, + "grad_norm": 0.43325048967411206, + "learning_rate": 2.673020073353411e-07, + "loss": 0.3344, + "step": 7551 + }, + { + "epoch": 1.797524840840126, + "grad_norm": 0.4179971224778607, + "learning_rate": 2.666805410500822e-07, + "loss": 0.4318, + "step": 7552 + }, + { + "epoch": 1.7977628369131908, + "grad_norm": 0.37436449309549835, + "learning_rate": 2.6605977825319094e-07, + "loss": 0.301, + "step": 7553 + }, + { + "epoch": 1.7980008329862556, + "grad_norm": 0.407183373159236, + "learning_rate": 2.6543971903692954e-07, + "loss": 0.2842, + "step": 7554 + }, + { + "epoch": 1.7982388290593205, + "grad_norm": 0.47155428570797986, + "learning_rate": 2.6482036349345265e-07, + "loss": 0.3289, + "step": 7555 + }, + { + "epoch": 1.7984768251323853, + "grad_norm": 0.3675704624500037, + "learning_rate": 2.642017117148116e-07, + "loss": 0.3144, + "step": 7556 + }, + { + "epoch": 1.7987148212054502, + "grad_norm": 0.3654773621312083, + "learning_rate": 2.63583763792955e-07, + "loss": 0.2829, + "step": 7557 + }, + { + "epoch": 1.798952817278515, + "grad_norm": 0.38467306352137903, + "learning_rate": 2.629665198197251e-07, + "loss": 0.3096, + "step": 7558 + }, + { + "epoch": 1.7991908133515797, + "grad_norm": 0.3771099539638736, + "learning_rate": 2.623499798868584e-07, + "loss": 0.3867, + "step": 7559 + }, + { + "epoch": 1.7994288094246444, + "grad_norm": 0.38421855757957385, + "learning_rate": 2.617341440859883e-07, + "loss": 0.2757, + "step": 7560 + }, + { + "epoch": 1.7996668054977092, + "grad_norm": 0.36935340044670584, + "learning_rate": 2.6111901250864325e-07, + "loss": 0.267, + "step": 7561 + }, + { + "epoch": 1.799904801570774, + "grad_norm": 0.3520631923142841, + "learning_rate": 2.6050458524624735e-07, + "loss": 0.3649, + "step": 7562 + }, + { + "epoch": 1.8001427976438389, + "grad_norm": 0.3642419618021161, + "learning_rate": 2.5989086239011975e-07, + "loss": 0.3463, + "step": 7563 + }, + { + "epoch": 1.8003807937169036, + "grad_norm": 0.3716992913649046, + "learning_rate": 2.5927784403147473e-07, + "loss": 0.2595, + "step": 7564 + }, + { + "epoch": 1.8006187897899686, + "grad_norm": 0.37707110768373664, + "learning_rate": 2.586655302614216e-07, + "loss": 0.3036, + "step": 7565 + }, + { + "epoch": 1.8008567858630333, + "grad_norm": 0.3722945299659591, + "learning_rate": 2.5805392117096597e-07, + "loss": 0.3856, + "step": 7566 + }, + { + "epoch": 1.801094781936098, + "grad_norm": 0.37881870157045106, + "learning_rate": 2.5744301685100727e-07, + "loss": 0.3017, + "step": 7567 + }, + { + "epoch": 1.8013327780091628, + "grad_norm": 0.3840903049457169, + "learning_rate": 2.5683281739234233e-07, + "loss": 0.2878, + "step": 7568 + }, + { + "epoch": 1.8015707740822275, + "grad_norm": 0.3737271689190538, + "learning_rate": 2.5622332288565975e-07, + "loss": 0.3099, + "step": 7569 + }, + { + "epoch": 1.8018087701552923, + "grad_norm": 0.410673409572434, + "learning_rate": 2.5561453342154763e-07, + "loss": 0.3553, + "step": 7570 + }, + { + "epoch": 1.8020467662283572, + "grad_norm": 0.3897577800417089, + "learning_rate": 2.5500644909048577e-07, + "loss": 0.2892, + "step": 7571 + }, + { + "epoch": 1.802284762301422, + "grad_norm": 0.3795777307796096, + "learning_rate": 2.543990699828519e-07, + "loss": 0.304, + "step": 7572 + }, + { + "epoch": 1.802522758374487, + "grad_norm": 0.3926970531258347, + "learning_rate": 2.5379239618891604e-07, + "loss": 0.3838, + "step": 7573 + }, + { + "epoch": 1.8027607544475517, + "grad_norm": 0.35843032279557285, + "learning_rate": 2.5318642779884605e-07, + "loss": 0.3366, + "step": 7574 + }, + { + "epoch": 1.8029987505206164, + "grad_norm": 0.3687701099994784, + "learning_rate": 2.525811649027032e-07, + "loss": 0.3038, + "step": 7575 + }, + { + "epoch": 1.8032367465936812, + "grad_norm": 0.3906979385647688, + "learning_rate": 2.5197660759044505e-07, + "loss": 0.3117, + "step": 7576 + }, + { + "epoch": 1.803474742666746, + "grad_norm": 0.434483976268019, + "learning_rate": 2.51372755951923e-07, + "loss": 0.382, + "step": 7577 + }, + { + "epoch": 1.8037127387398106, + "grad_norm": 0.3842828684371924, + "learning_rate": 2.5076961007688526e-07, + "loss": 0.2864, + "step": 7578 + }, + { + "epoch": 1.8039507348128756, + "grad_norm": 0.42535367755859443, + "learning_rate": 2.5016717005497347e-07, + "loss": 0.2755, + "step": 7579 + }, + { + "epoch": 1.8041887308859403, + "grad_norm": 0.376588372683319, + "learning_rate": 2.4956543597572546e-07, + "loss": 0.3643, + "step": 7580 + }, + { + "epoch": 1.8044267269590053, + "grad_norm": 0.3587861059882926, + "learning_rate": 2.4896440792857355e-07, + "loss": 0.3672, + "step": 7581 + }, + { + "epoch": 1.80466472303207, + "grad_norm": 0.40501079901398007, + "learning_rate": 2.483640860028458e-07, + "loss": 0.2751, + "step": 7582 + }, + { + "epoch": 1.8049027191051348, + "grad_norm": 0.3908238593281349, + "learning_rate": 2.4776447028776404e-07, + "loss": 0.3438, + "step": 7583 + }, + { + "epoch": 1.8051407151781995, + "grad_norm": 0.4008384905708165, + "learning_rate": 2.4716556087244716e-07, + "loss": 0.3841, + "step": 7584 + }, + { + "epoch": 1.8053787112512643, + "grad_norm": 0.363703944202256, + "learning_rate": 2.465673578459077e-07, + "loss": 0.2752, + "step": 7585 + }, + { + "epoch": 1.805616707324329, + "grad_norm": 0.40343185473148424, + "learning_rate": 2.459698612970529e-07, + "loss": 0.2837, + "step": 7586 + }, + { + "epoch": 1.805854703397394, + "grad_norm": 0.35789641416139295, + "learning_rate": 2.4537307131468566e-07, + "loss": 0.3523, + "step": 7587 + }, + { + "epoch": 1.8060926994704587, + "grad_norm": 0.3926977742362301, + "learning_rate": 2.447769879875039e-07, + "loss": 0.4072, + "step": 7588 + }, + { + "epoch": 1.8063306955435237, + "grad_norm": 0.3757795347751587, + "learning_rate": 2.441816114040996e-07, + "loss": 0.2978, + "step": 7589 + }, + { + "epoch": 1.8065686916165884, + "grad_norm": 0.3829503186037233, + "learning_rate": 2.435869416529618e-07, + "loss": 0.3131, + "step": 7590 + }, + { + "epoch": 1.8068066876896531, + "grad_norm": 0.41211846502557, + "learning_rate": 2.429929788224722e-07, + "loss": 0.3494, + "step": 7591 + }, + { + "epoch": 1.8070446837627179, + "grad_norm": 0.37268683629691657, + "learning_rate": 2.4239972300090897e-07, + "loss": 0.2897, + "step": 7592 + }, + { + "epoch": 1.8072826798357826, + "grad_norm": 0.3841386899279831, + "learning_rate": 2.418071742764444e-07, + "loss": 0.2877, + "step": 7593 + }, + { + "epoch": 1.8075206759088474, + "grad_norm": 0.36351385879987314, + "learning_rate": 2.4121533273714524e-07, + "loss": 0.3227, + "step": 7594 + }, + { + "epoch": 1.8077586719819123, + "grad_norm": 0.37194110953788245, + "learning_rate": 2.4062419847097507e-07, + "loss": 0.3593, + "step": 7595 + }, + { + "epoch": 1.807996668054977, + "grad_norm": 0.3905909596183342, + "learning_rate": 2.4003377156578967e-07, + "loss": 0.294, + "step": 7596 + }, + { + "epoch": 1.808234664128042, + "grad_norm": 0.37292298295629817, + "learning_rate": 2.3944405210934106e-07, + "loss": 0.3196, + "step": 7597 + }, + { + "epoch": 1.8084726602011068, + "grad_norm": 0.3933254447982632, + "learning_rate": 2.388550401892775e-07, + "loss": 0.3323, + "step": 7598 + }, + { + "epoch": 1.8087106562741715, + "grad_norm": 0.3710230013969392, + "learning_rate": 2.382667358931401e-07, + "loss": 0.3092, + "step": 7599 + }, + { + "epoch": 1.8089486523472362, + "grad_norm": 0.3914008246491944, + "learning_rate": 2.3767913930836552e-07, + "loss": 0.2787, + "step": 7600 + }, + { + "epoch": 1.809186648420301, + "grad_norm": 0.402510376468686, + "learning_rate": 2.37092250522285e-07, + "loss": 0.3047, + "step": 7601 + }, + { + "epoch": 1.8094246444933657, + "grad_norm": 0.3923964917222654, + "learning_rate": 2.3650606962212442e-07, + "loss": 0.3777, + "step": 7602 + }, + { + "epoch": 1.8096626405664307, + "grad_norm": 0.390152017290336, + "learning_rate": 2.3592059669500512e-07, + "loss": 0.2928, + "step": 7603 + }, + { + "epoch": 1.8099006366394954, + "grad_norm": 0.39619044275128396, + "learning_rate": 2.353358318279425e-07, + "loss": 0.2758, + "step": 7604 + }, + { + "epoch": 1.8101386327125604, + "grad_norm": 0.3778971109266139, + "learning_rate": 2.347517751078482e-07, + "loss": 0.373, + "step": 7605 + }, + { + "epoch": 1.810376628785625, + "grad_norm": 0.41970787322500813, + "learning_rate": 2.3416842662152606e-07, + "loss": 0.3298, + "step": 7606 + }, + { + "epoch": 1.8106146248586898, + "grad_norm": 0.37056563319434105, + "learning_rate": 2.3358578645567676e-07, + "loss": 0.276, + "step": 7607 + }, + { + "epoch": 1.8108526209317546, + "grad_norm": 0.4059835978022746, + "learning_rate": 2.3300385469689491e-07, + "loss": 0.32, + "step": 7608 + }, + { + "epoch": 1.8110906170048193, + "grad_norm": 0.3951261910417104, + "learning_rate": 2.324226314316702e-07, + "loss": 0.3878, + "step": 7609 + }, + { + "epoch": 1.811328613077884, + "grad_norm": 0.3525934295874246, + "learning_rate": 2.318421167463869e-07, + "loss": 0.2804, + "step": 7610 + }, + { + "epoch": 1.811566609150949, + "grad_norm": 0.4037839992260597, + "learning_rate": 2.3126231072732264e-07, + "loss": 0.3023, + "step": 7611 + }, + { + "epoch": 1.8118046052240138, + "grad_norm": 0.3608118803223138, + "learning_rate": 2.3068321346065236e-07, + "loss": 0.3508, + "step": 7612 + }, + { + "epoch": 1.8120426012970787, + "grad_norm": 0.3707655208185674, + "learning_rate": 2.3010482503244447e-07, + "loss": 0.3564, + "step": 7613 + }, + { + "epoch": 1.8122805973701435, + "grad_norm": 0.3741317976188311, + "learning_rate": 2.2952714552866017e-07, + "loss": 0.2794, + "step": 7614 + }, + { + "epoch": 1.8125185934432082, + "grad_norm": 0.39603813001981003, + "learning_rate": 2.2895017503515859e-07, + "loss": 0.2905, + "step": 7615 + }, + { + "epoch": 1.812756589516273, + "grad_norm": 0.3998027621001329, + "learning_rate": 2.2837391363769e-07, + "loss": 0.3547, + "step": 7616 + }, + { + "epoch": 1.8129945855893377, + "grad_norm": 0.37531093259849246, + "learning_rate": 2.2779836142190314e-07, + "loss": 0.3318, + "step": 7617 + }, + { + "epoch": 1.8132325816624024, + "grad_norm": 0.3743872139458294, + "learning_rate": 2.2722351847333844e-07, + "loss": 0.2864, + "step": 7618 + }, + { + "epoch": 1.8134705777354674, + "grad_norm": 0.3736656227074288, + "learning_rate": 2.266493848774315e-07, + "loss": 0.3149, + "step": 7619 + }, + { + "epoch": 1.8137085738085321, + "grad_norm": 0.3917461855932985, + "learning_rate": 2.2607596071951288e-07, + "loss": 0.3824, + "step": 7620 + }, + { + "epoch": 1.813946569881597, + "grad_norm": 0.38346753454473853, + "learning_rate": 2.255032460848078e-07, + "loss": 0.2818, + "step": 7621 + }, + { + "epoch": 1.8141845659546618, + "grad_norm": 0.38249166251681005, + "learning_rate": 2.2493124105843534e-07, + "loss": 0.2997, + "step": 7622 + }, + { + "epoch": 1.8144225620277266, + "grad_norm": 0.39871403476731015, + "learning_rate": 2.243599457254103e-07, + "loss": 0.3529, + "step": 7623 + }, + { + "epoch": 1.8146605581007913, + "grad_norm": 0.38237212611132454, + "learning_rate": 2.2378936017064035e-07, + "loss": 0.3541, + "step": 7624 + }, + { + "epoch": 1.814898554173856, + "grad_norm": 0.4176523388972532, + "learning_rate": 2.2321948447892984e-07, + "loss": 0.2793, + "step": 7625 + }, + { + "epoch": 1.8151365502469208, + "grad_norm": 0.3765532620043632, + "learning_rate": 2.22650318734976e-07, + "loss": 0.3217, + "step": 7626 + }, + { + "epoch": 1.8153745463199857, + "grad_norm": 0.5502986454158951, + "learning_rate": 2.2208186302337064e-07, + "loss": 0.363, + "step": 7627 + }, + { + "epoch": 1.8156125423930505, + "grad_norm": 0.3691080359982276, + "learning_rate": 2.2151411742860008e-07, + "loss": 0.3043, + "step": 7628 + }, + { + "epoch": 1.8158505384661154, + "grad_norm": 0.4185986270949634, + "learning_rate": 2.2094708203504623e-07, + "loss": 0.3095, + "step": 7629 + }, + { + "epoch": 1.8160885345391802, + "grad_norm": 0.39746628135216966, + "learning_rate": 2.2038075692698392e-07, + "loss": 0.3555, + "step": 7630 + }, + { + "epoch": 1.816326530612245, + "grad_norm": 0.38678877865572725, + "learning_rate": 2.1981514218858302e-07, + "loss": 0.3517, + "step": 7631 + }, + { + "epoch": 1.8165645266853097, + "grad_norm": 0.3839615523300972, + "learning_rate": 2.1925023790390797e-07, + "loss": 0.2845, + "step": 7632 + }, + { + "epoch": 1.8168025227583744, + "grad_norm": 0.3969919578339475, + "learning_rate": 2.1868604415691775e-07, + "loss": 0.3163, + "step": 7633 + }, + { + "epoch": 1.8170405188314391, + "grad_norm": 0.40042063644372505, + "learning_rate": 2.1812256103146523e-07, + "loss": 0.3821, + "step": 7634 + }, + { + "epoch": 1.817278514904504, + "grad_norm": 0.3595786143418067, + "learning_rate": 2.1755978861129846e-07, + "loss": 0.2926, + "step": 7635 + }, + { + "epoch": 1.8175165109775688, + "grad_norm": 0.4000391772571078, + "learning_rate": 2.1699772698005884e-07, + "loss": 0.2917, + "step": 7636 + }, + { + "epoch": 1.8177545070506338, + "grad_norm": 0.39209491326042045, + "learning_rate": 2.164363762212829e-07, + "loss": 0.3798, + "step": 7637 + }, + { + "epoch": 1.8179925031236985, + "grad_norm": 0.36009581505557847, + "learning_rate": 2.1587573641839999e-07, + "loss": 0.3549, + "step": 7638 + }, + { + "epoch": 1.8182304991967633, + "grad_norm": 0.3891544552915791, + "learning_rate": 2.1531580765473737e-07, + "loss": 0.2979, + "step": 7639 + }, + { + "epoch": 1.818468495269828, + "grad_norm": 0.34833762751724157, + "learning_rate": 2.147565900135129e-07, + "loss": 0.3132, + "step": 7640 + }, + { + "epoch": 1.8187064913428928, + "grad_norm": 0.4102701100568015, + "learning_rate": 2.141980835778401e-07, + "loss": 0.3732, + "step": 7641 + }, + { + "epoch": 1.8189444874159575, + "grad_norm": 0.374891996884048, + "learning_rate": 2.136402884307276e-07, + "loss": 0.3313, + "step": 7642 + }, + { + "epoch": 1.8191824834890225, + "grad_norm": 0.36912237819842353, + "learning_rate": 2.130832046550757e-07, + "loss": 0.3006, + "step": 7643 + }, + { + "epoch": 1.8194204795620872, + "grad_norm": 0.4056734928284316, + "learning_rate": 2.1252683233368377e-07, + "loss": 0.3093, + "step": 7644 + }, + { + "epoch": 1.8196584756351522, + "grad_norm": 0.37038662624715685, + "learning_rate": 2.1197117154924006e-07, + "loss": 0.3657, + "step": 7645 + }, + { + "epoch": 1.819896471708217, + "grad_norm": 0.3710751679952538, + "learning_rate": 2.114162223843308e-07, + "loss": 0.2794, + "step": 7646 + }, + { + "epoch": 1.8201344677812816, + "grad_norm": 0.3778232887871572, + "learning_rate": 2.1086198492143494e-07, + "loss": 0.2793, + "step": 7647 + }, + { + "epoch": 1.8203724638543464, + "grad_norm": 0.41311741726892776, + "learning_rate": 2.1030845924292553e-07, + "loss": 0.3549, + "step": 7648 + }, + { + "epoch": 1.820610459927411, + "grad_norm": 0.3539492859309911, + "learning_rate": 2.0975564543107007e-07, + "loss": 0.3133, + "step": 7649 + }, + { + "epoch": 1.8208484560004758, + "grad_norm": 0.3918727984124256, + "learning_rate": 2.0920354356803118e-07, + "loss": 0.282, + "step": 7650 + }, + { + "epoch": 1.8210864520735408, + "grad_norm": 0.3829591506819489, + "learning_rate": 2.0865215373586377e-07, + "loss": 0.3235, + "step": 7651 + }, + { + "epoch": 1.8213244481466055, + "grad_norm": 0.3848423040694204, + "learning_rate": 2.081014760165184e-07, + "loss": 0.3783, + "step": 7652 + }, + { + "epoch": 1.8215624442196705, + "grad_norm": 0.3682491320220431, + "learning_rate": 2.0755151049183963e-07, + "loss": 0.2923, + "step": 7653 + }, + { + "epoch": 1.8218004402927352, + "grad_norm": 0.3710419857253826, + "learning_rate": 2.070022572435665e-07, + "loss": 0.2838, + "step": 7654 + }, + { + "epoch": 1.8220384363658, + "grad_norm": 0.3940817637494536, + "learning_rate": 2.0645371635333032e-07, + "loss": 0.3577, + "step": 7655 + }, + { + "epoch": 1.8222764324388647, + "grad_norm": 0.34803178778167626, + "learning_rate": 2.0590588790265874e-07, + "loss": 0.3054, + "step": 7656 + }, + { + "epoch": 1.8225144285119295, + "grad_norm": 0.3559915393103781, + "learning_rate": 2.0535877197297271e-07, + "loss": 0.285, + "step": 7657 + }, + { + "epoch": 1.8227524245849942, + "grad_norm": 0.3952700916510172, + "learning_rate": 2.0481236864558663e-07, + "loss": 0.3013, + "step": 7658 + }, + { + "epoch": 1.8229904206580592, + "grad_norm": 0.37451269015209115, + "learning_rate": 2.0426667800170996e-07, + "loss": 0.3538, + "step": 7659 + }, + { + "epoch": 1.823228416731124, + "grad_norm": 0.37289422275429907, + "learning_rate": 2.0372170012244563e-07, + "loss": 0.3015, + "step": 7660 + }, + { + "epoch": 1.8234664128041889, + "grad_norm": 0.3849495025851358, + "learning_rate": 2.031774350887905e-07, + "loss": 0.2954, + "step": 7661 + }, + { + "epoch": 1.8237044088772536, + "grad_norm": 0.3772895053852847, + "learning_rate": 2.0263388298163655e-07, + "loss": 0.3223, + "step": 7662 + }, + { + "epoch": 1.8239424049503183, + "grad_norm": 0.35964348698217047, + "learning_rate": 2.0209104388176858e-07, + "loss": 0.3607, + "step": 7663 + }, + { + "epoch": 1.824180401023383, + "grad_norm": 0.37541695568154915, + "learning_rate": 2.0154891786986595e-07, + "loss": 0.2967, + "step": 7664 + }, + { + "epoch": 1.8244183970964478, + "grad_norm": 0.3958066722757771, + "learning_rate": 2.0100750502650258e-07, + "loss": 0.3289, + "step": 7665 + }, + { + "epoch": 1.8246563931695126, + "grad_norm": 0.4131768602497947, + "learning_rate": 2.0046680543214403e-07, + "loss": 0.3372, + "step": 7666 + }, + { + "epoch": 1.8248943892425775, + "grad_norm": 0.42405294941130334, + "learning_rate": 1.9992681916715385e-07, + "loss": 0.3229, + "step": 7667 + }, + { + "epoch": 1.8251323853156423, + "grad_norm": 0.40593441902537913, + "learning_rate": 1.993875463117867e-07, + "loss": 0.2836, + "step": 7668 + }, + { + "epoch": 1.8253703813887072, + "grad_norm": 0.37653545873556477, + "learning_rate": 1.9884898694619127e-07, + "loss": 0.3173, + "step": 7669 + }, + { + "epoch": 1.825608377461772, + "grad_norm": 0.41304477293098335, + "learning_rate": 1.9831114115041017e-07, + "loss": 0.3672, + "step": 7670 + }, + { + "epoch": 1.8258463735348367, + "grad_norm": 0.33869784993849245, + "learning_rate": 1.9777400900438283e-07, + "loss": 0.2859, + "step": 7671 + }, + { + "epoch": 1.8260843696079014, + "grad_norm": 0.3714603948136559, + "learning_rate": 1.9723759058793868e-07, + "loss": 0.2998, + "step": 7672 + }, + { + "epoch": 1.8263223656809662, + "grad_norm": 0.40178937677479326, + "learning_rate": 1.9670188598080342e-07, + "loss": 0.3435, + "step": 7673 + }, + { + "epoch": 1.826560361754031, + "grad_norm": 0.3755897088267994, + "learning_rate": 1.9616689526259557e-07, + "loss": 0.3184, + "step": 7674 + }, + { + "epoch": 1.8267983578270959, + "grad_norm": 0.3412859979077663, + "learning_rate": 1.9563261851282822e-07, + "loss": 0.2572, + "step": 7675 + }, + { + "epoch": 1.8270363539001606, + "grad_norm": 0.369227042248507, + "learning_rate": 1.9509905581090837e-07, + "loss": 0.3358, + "step": 7676 + }, + { + "epoch": 1.8272743499732256, + "grad_norm": 0.37802690397166483, + "learning_rate": 1.9456620723613596e-07, + "loss": 0.4107, + "step": 7677 + }, + { + "epoch": 1.8275123460462903, + "grad_norm": 1.2815424892667338, + "learning_rate": 1.9403407286770592e-07, + "loss": 0.2987, + "step": 7678 + }, + { + "epoch": 1.827750342119355, + "grad_norm": 0.37575903236434194, + "learning_rate": 1.935026527847067e-07, + "loss": 0.2836, + "step": 7679 + }, + { + "epoch": 1.8279883381924198, + "grad_norm": 0.37225786292995694, + "learning_rate": 1.9297194706612012e-07, + "loss": 0.3522, + "step": 7680 + }, + { + "epoch": 1.8282263342654845, + "grad_norm": 0.4069520217778423, + "learning_rate": 1.9244195579082193e-07, + "loss": 0.3215, + "step": 7681 + }, + { + "epoch": 1.8284643303385493, + "grad_norm": 0.3777178114060692, + "learning_rate": 1.9191267903758304e-07, + "loss": 0.2767, + "step": 7682 + }, + { + "epoch": 1.8287023264116142, + "grad_norm": 0.3802098683973281, + "learning_rate": 1.913841168850661e-07, + "loss": 0.308, + "step": 7683 + }, + { + "epoch": 1.828940322484679, + "grad_norm": 0.49271704172627867, + "learning_rate": 1.9085626941182932e-07, + "loss": 0.3787, + "step": 7684 + }, + { + "epoch": 1.829178318557744, + "grad_norm": 0.3772677332419929, + "learning_rate": 1.903291366963228e-07, + "loss": 0.2788, + "step": 7685 + }, + { + "epoch": 1.8294163146308087, + "grad_norm": 0.36065380388280177, + "learning_rate": 1.8980271881689216e-07, + "loss": 0.2706, + "step": 7686 + }, + { + "epoch": 1.8296543107038734, + "grad_norm": 0.377496168662484, + "learning_rate": 1.892770158517765e-07, + "loss": 0.3517, + "step": 7687 + }, + { + "epoch": 1.8298923067769381, + "grad_norm": 0.35698841434630135, + "learning_rate": 1.8875202787910774e-07, + "loss": 0.3576, + "step": 7688 + }, + { + "epoch": 1.8301303028500029, + "grad_norm": 0.366279133501771, + "learning_rate": 1.882277549769118e-07, + "loss": 0.2936, + "step": 7689 + }, + { + "epoch": 1.8303682989230676, + "grad_norm": 0.3754152778664585, + "learning_rate": 1.8770419722310916e-07, + "loss": 0.2969, + "step": 7690 + }, + { + "epoch": 1.8306062949961326, + "grad_norm": 0.3887220576270878, + "learning_rate": 1.8718135469551313e-07, + "loss": 0.3747, + "step": 7691 + }, + { + "epoch": 1.8308442910691973, + "grad_norm": 0.38990633714055073, + "learning_rate": 1.866592274718315e-07, + "loss": 0.3552, + "step": 7692 + }, + { + "epoch": 1.8310822871422623, + "grad_norm": 0.37677144527387463, + "learning_rate": 1.8613781562966392e-07, + "loss": 0.2833, + "step": 7693 + }, + { + "epoch": 1.831320283215327, + "grad_norm": 0.425287347061427, + "learning_rate": 1.8561711924650728e-07, + "loss": 0.3178, + "step": 7694 + }, + { + "epoch": 1.8315582792883918, + "grad_norm": 0.38942860989951505, + "learning_rate": 1.8509713839974852e-07, + "loss": 0.3685, + "step": 7695 + }, + { + "epoch": 1.8317962753614565, + "grad_norm": 0.3639559057337997, + "learning_rate": 1.8457787316667032e-07, + "loss": 0.2576, + "step": 7696 + }, + { + "epoch": 1.8320342714345212, + "grad_norm": 0.3541221788742815, + "learning_rate": 1.84059323624447e-07, + "loss": 0.2868, + "step": 7697 + }, + { + "epoch": 1.832272267507586, + "grad_norm": 0.3872637366190305, + "learning_rate": 1.835414898501492e-07, + "loss": 0.3573, + "step": 7698 + }, + { + "epoch": 1.832510263580651, + "grad_norm": 0.3678827746800215, + "learning_rate": 1.8302437192073975e-07, + "loss": 0.3303, + "step": 7699 + }, + { + "epoch": 1.8327482596537157, + "grad_norm": 0.36454295345870347, + "learning_rate": 1.8250796991307494e-07, + "loss": 0.296, + "step": 7700 + }, + { + "epoch": 1.8329862557267806, + "grad_norm": 0.37266982455556436, + "learning_rate": 1.8199228390390457e-07, + "loss": 0.3295, + "step": 7701 + }, + { + "epoch": 1.8332242517998454, + "grad_norm": 0.4376692678565518, + "learning_rate": 1.814773139698728e-07, + "loss": 0.3667, + "step": 7702 + }, + { + "epoch": 1.8334622478729101, + "grad_norm": 0.3683739228990514, + "learning_rate": 1.8096306018751675e-07, + "loss": 0.2748, + "step": 7703 + }, + { + "epoch": 1.8337002439459749, + "grad_norm": 0.4073240247250818, + "learning_rate": 1.80449522633267e-07, + "loss": 0.2995, + "step": 7704 + }, + { + "epoch": 1.8339382400190396, + "grad_norm": 0.39393736737366614, + "learning_rate": 1.7993670138344798e-07, + "loss": 0.3601, + "step": 7705 + }, + { + "epoch": 1.8341762360921043, + "grad_norm": 0.3767167773666181, + "learning_rate": 1.7942459651427825e-07, + "loss": 0.3498, + "step": 7706 + }, + { + "epoch": 1.8344142321651693, + "grad_norm": 0.4089511356922392, + "learning_rate": 1.789132081018674e-07, + "loss": 0.3199, + "step": 7707 + }, + { + "epoch": 1.834652228238234, + "grad_norm": 0.3812109106624504, + "learning_rate": 1.7840253622222303e-07, + "loss": 0.3192, + "step": 7708 + }, + { + "epoch": 1.834890224311299, + "grad_norm": 0.36263457008589506, + "learning_rate": 1.7789258095124217e-07, + "loss": 0.3782, + "step": 7709 + }, + { + "epoch": 1.8351282203843637, + "grad_norm": 0.3724803988209711, + "learning_rate": 1.77383342364717e-07, + "loss": 0.2963, + "step": 7710 + }, + { + "epoch": 1.8353662164574285, + "grad_norm": 0.3765829889635161, + "learning_rate": 1.7687482053833304e-07, + "loss": 0.2866, + "step": 7711 + }, + { + "epoch": 1.8356042125304932, + "grad_norm": 0.3644840472067164, + "learning_rate": 1.7636701554766877e-07, + "loss": 0.3348, + "step": 7712 + }, + { + "epoch": 1.835842208603558, + "grad_norm": 0.4078076416765129, + "learning_rate": 1.7585992746819713e-07, + "loss": 0.3769, + "step": 7713 + }, + { + "epoch": 1.8360802046766227, + "grad_norm": 0.37270746494698037, + "learning_rate": 1.753535563752834e-07, + "loss": 0.2779, + "step": 7714 + }, + { + "epoch": 1.8363182007496877, + "grad_norm": 0.38980061433134444, + "learning_rate": 1.7484790234418791e-07, + "loss": 0.3094, + "step": 7715 + }, + { + "epoch": 1.8365561968227524, + "grad_norm": 0.3952094967807275, + "learning_rate": 1.7434296545006224e-07, + "loss": 0.3964, + "step": 7716 + }, + { + "epoch": 1.8367941928958174, + "grad_norm": 0.3729540471305891, + "learning_rate": 1.73838745767953e-07, + "loss": 0.3146, + "step": 7717 + }, + { + "epoch": 1.837032188968882, + "grad_norm": 0.40875312184567286, + "learning_rate": 1.7333524337279918e-07, + "loss": 0.3322, + "step": 7718 + }, + { + "epoch": 1.8372701850419468, + "grad_norm": 0.40437790963447295, + "learning_rate": 1.7283245833943473e-07, + "loss": 0.327, + "step": 7719 + }, + { + "epoch": 1.8375081811150116, + "grad_norm": 0.37395892343895687, + "learning_rate": 1.723303907425844e-07, + "loss": 0.3752, + "step": 7720 + }, + { + "epoch": 1.8377461771880763, + "grad_norm": 0.35037991077375436, + "learning_rate": 1.7182904065686956e-07, + "loss": 0.289, + "step": 7721 + }, + { + "epoch": 1.837984173261141, + "grad_norm": 0.39297451358371965, + "learning_rate": 1.713284081568023e-07, + "loss": 0.3139, + "step": 7722 + }, + { + "epoch": 1.838222169334206, + "grad_norm": 0.4465414484001565, + "learning_rate": 1.708284933167892e-07, + "loss": 0.3373, + "step": 7723 + }, + { + "epoch": 1.8384601654072708, + "grad_norm": 0.3641160770057071, + "learning_rate": 1.703292962111297e-07, + "loss": 0.3339, + "step": 7724 + }, + { + "epoch": 1.8386981614803357, + "grad_norm": 0.37173673768425497, + "learning_rate": 1.6983081691401727e-07, + "loss": 0.2737, + "step": 7725 + }, + { + "epoch": 1.8389361575534005, + "grad_norm": 0.37865859937279667, + "learning_rate": 1.6933305549953817e-07, + "loss": 0.3234, + "step": 7726 + }, + { + "epoch": 1.8391741536264652, + "grad_norm": 0.3858905080470261, + "learning_rate": 1.688360120416721e-07, + "loss": 0.3567, + "step": 7727 + }, + { + "epoch": 1.83941214969953, + "grad_norm": 0.36393548918786084, + "learning_rate": 1.6833968661429168e-07, + "loss": 0.2794, + "step": 7728 + }, + { + "epoch": 1.8396501457725947, + "grad_norm": 0.38271700713219603, + "learning_rate": 1.6784407929116342e-07, + "loss": 0.2761, + "step": 7729 + }, + { + "epoch": 1.8398881418456594, + "grad_norm": 0.4049677071792201, + "learning_rate": 1.6734919014594674e-07, + "loss": 0.3441, + "step": 7730 + }, + { + "epoch": 1.8401261379187244, + "grad_norm": 0.36928102997215106, + "learning_rate": 1.668550192521945e-07, + "loss": 0.3377, + "step": 7731 + }, + { + "epoch": 1.840364133991789, + "grad_norm": 0.37686053081429005, + "learning_rate": 1.6636156668335236e-07, + "loss": 0.2798, + "step": 7732 + }, + { + "epoch": 1.840602130064854, + "grad_norm": 0.37607937056638047, + "learning_rate": 1.6586883251275998e-07, + "loss": 0.3279, + "step": 7733 + }, + { + "epoch": 1.8408401261379188, + "grad_norm": 0.4077890202353418, + "learning_rate": 1.6537681681364993e-07, + "loss": 0.3957, + "step": 7734 + }, + { + "epoch": 1.8410781222109835, + "grad_norm": 0.3665125487760819, + "learning_rate": 1.64885519659147e-07, + "loss": 0.2984, + "step": 7735 + }, + { + "epoch": 1.8413161182840483, + "grad_norm": 0.3856906973870745, + "learning_rate": 1.6439494112227173e-07, + "loss": 0.2753, + "step": 7736 + }, + { + "epoch": 1.841554114357113, + "grad_norm": 0.370985815282774, + "learning_rate": 1.6390508127593463e-07, + "loss": 0.3377, + "step": 7737 + }, + { + "epoch": 1.8417921104301778, + "grad_norm": 0.3622514504955658, + "learning_rate": 1.634159401929425e-07, + "loss": 0.3836, + "step": 7738 + }, + { + "epoch": 1.8420301065032427, + "grad_norm": 0.3860158792125223, + "learning_rate": 1.6292751794599216e-07, + "loss": 0.2713, + "step": 7739 + }, + { + "epoch": 1.8422681025763075, + "grad_norm": 0.3973498950561496, + "learning_rate": 1.6243981460767666e-07, + "loss": 0.3111, + "step": 7740 + }, + { + "epoch": 1.8425060986493724, + "grad_norm": 0.3638871826673044, + "learning_rate": 1.619528302504797e-07, + "loss": 0.3788, + "step": 7741 + }, + { + "epoch": 1.8427440947224372, + "grad_norm": 0.3705555624225468, + "learning_rate": 1.6146656494678003e-07, + "loss": 0.3012, + "step": 7742 + }, + { + "epoch": 1.842982090795502, + "grad_norm": 0.3584047014227033, + "learning_rate": 1.609810187688482e-07, + "loss": 0.2983, + "step": 7743 + }, + { + "epoch": 1.8432200868685666, + "grad_norm": 0.3877701919251873, + "learning_rate": 1.6049619178884868e-07, + "loss": 0.3207, + "step": 7744 + }, + { + "epoch": 1.8434580829416314, + "grad_norm": 0.3813895710119214, + "learning_rate": 1.6001208407883884e-07, + "loss": 0.3819, + "step": 7745 + }, + { + "epoch": 1.8436960790146961, + "grad_norm": 0.376270806834782, + "learning_rate": 1.5952869571076835e-07, + "loss": 0.2711, + "step": 7746 + }, + { + "epoch": 1.843934075087761, + "grad_norm": 0.3843012907616639, + "learning_rate": 1.5904602675648083e-07, + "loss": 0.2944, + "step": 7747 + }, + { + "epoch": 1.8441720711608258, + "grad_norm": 0.3918616144624181, + "learning_rate": 1.5856407728771394e-07, + "loss": 0.3551, + "step": 7748 + }, + { + "epoch": 1.8444100672338908, + "grad_norm": 0.3541860195426503, + "learning_rate": 1.5808284737609592e-07, + "loss": 0.3476, + "step": 7749 + }, + { + "epoch": 1.8446480633069555, + "grad_norm": 0.4021959688745201, + "learning_rate": 1.5760233709315064e-07, + "loss": 0.2617, + "step": 7750 + }, + { + "epoch": 1.8448860593800203, + "grad_norm": 0.4117631142432679, + "learning_rate": 1.571225465102927e-07, + "loss": 0.3208, + "step": 7751 + }, + { + "epoch": 1.845124055453085, + "grad_norm": 0.41626948663412294, + "learning_rate": 1.566434756988311e-07, + "loss": 0.3668, + "step": 7752 + }, + { + "epoch": 1.8453620515261497, + "grad_norm": 0.3503906065492793, + "learning_rate": 1.561651247299689e-07, + "loss": 0.2722, + "step": 7753 + }, + { + "epoch": 1.8456000475992145, + "grad_norm": 0.3878284628086832, + "learning_rate": 1.556874936747993e-07, + "loss": 0.3053, + "step": 7754 + }, + { + "epoch": 1.8458380436722794, + "grad_norm": 0.3664751216986663, + "learning_rate": 1.5521058260431043e-07, + "loss": 0.3567, + "step": 7755 + }, + { + "epoch": 1.8460760397453442, + "grad_norm": 0.38503978655873494, + "learning_rate": 1.5473439158938398e-07, + "loss": 0.3633, + "step": 7756 + }, + { + "epoch": 1.8463140358184091, + "grad_norm": 0.37539732052773167, + "learning_rate": 1.5425892070079274e-07, + "loss": 0.2755, + "step": 7757 + }, + { + "epoch": 1.8465520318914739, + "grad_norm": 0.3905485468486571, + "learning_rate": 1.5378417000920355e-07, + "loss": 0.3062, + "step": 7758 + }, + { + "epoch": 1.8467900279645386, + "grad_norm": 0.3717300860222802, + "learning_rate": 1.5331013958517604e-07, + "loss": 0.3826, + "step": 7759 + }, + { + "epoch": 1.8470280240376034, + "grad_norm": 0.3825543677637209, + "learning_rate": 1.528368294991639e-07, + "loss": 0.2877, + "step": 7760 + }, + { + "epoch": 1.847266020110668, + "grad_norm": 0.38491103167235174, + "learning_rate": 1.523642398215114e-07, + "loss": 0.2753, + "step": 7761 + }, + { + "epoch": 1.8475040161837328, + "grad_norm": 0.385893795694505, + "learning_rate": 1.5189237062245732e-07, + "loss": 0.3507, + "step": 7762 + }, + { + "epoch": 1.8477420122567978, + "grad_norm": 0.36863061120979945, + "learning_rate": 1.5142122197213338e-07, + "loss": 0.3815, + "step": 7763 + }, + { + "epoch": 1.8479800083298625, + "grad_norm": 0.371943151815035, + "learning_rate": 1.5095079394056466e-07, + "loss": 0.2779, + "step": 7764 + }, + { + "epoch": 1.8482180044029275, + "grad_norm": 0.41031142975223894, + "learning_rate": 1.5048108659766693e-07, + "loss": 0.2951, + "step": 7765 + }, + { + "epoch": 1.8484560004759922, + "grad_norm": 0.36678805969636596, + "learning_rate": 1.500121000132515e-07, + "loss": 0.3523, + "step": 7766 + }, + { + "epoch": 1.848693996549057, + "grad_norm": 0.3656310119733287, + "learning_rate": 1.4954383425702102e-07, + "loss": 0.3059, + "step": 7767 + }, + { + "epoch": 1.8489319926221217, + "grad_norm": 0.4742087761434599, + "learning_rate": 1.4907628939857087e-07, + "loss": 0.3005, + "step": 7768 + }, + { + "epoch": 1.8491699886951865, + "grad_norm": 0.3832956933946555, + "learning_rate": 1.4860946550739052e-07, + "loss": 0.3152, + "step": 7769 + }, + { + "epoch": 1.8494079847682512, + "grad_norm": 0.3707364198788508, + "learning_rate": 1.4814336265286112e-07, + "loss": 0.3801, + "step": 7770 + }, + { + "epoch": 1.8496459808413162, + "grad_norm": 0.3794533583364384, + "learning_rate": 1.476779809042572e-07, + "loss": 0.2977, + "step": 7771 + }, + { + "epoch": 1.849883976914381, + "grad_norm": 0.4142293643990621, + "learning_rate": 1.4721332033074575e-07, + "loss": 0.31, + "step": 7772 + }, + { + "epoch": 1.8501219729874459, + "grad_norm": 0.364668075185516, + "learning_rate": 1.467493810013876e-07, + "loss": 0.3325, + "step": 7773 + }, + { + "epoch": 1.8503599690605106, + "grad_norm": 0.36286815622733415, + "learning_rate": 1.462861629851342e-07, + "loss": 0.3554, + "step": 7774 + }, + { + "epoch": 1.8505979651335753, + "grad_norm": 0.3942139877708106, + "learning_rate": 1.4582366635083223e-07, + "loss": 0.2767, + "step": 7775 + }, + { + "epoch": 1.85083596120664, + "grad_norm": 0.37049438808023333, + "learning_rate": 1.4536189116722056e-07, + "loss": 0.2968, + "step": 7776 + }, + { + "epoch": 1.8510739572797048, + "grad_norm": 0.42757648044300495, + "learning_rate": 1.4490083750292984e-07, + "loss": 0.3868, + "step": 7777 + }, + { + "epoch": 1.8513119533527695, + "grad_norm": 0.3666344543082101, + "learning_rate": 1.4444050542648302e-07, + "loss": 0.289, + "step": 7778 + }, + { + "epoch": 1.8515499494258345, + "grad_norm": 0.4136576161910318, + "learning_rate": 1.4398089500629874e-07, + "loss": 0.2883, + "step": 7779 + }, + { + "epoch": 1.8517879454988992, + "grad_norm": 0.38875371128162894, + "learning_rate": 1.4352200631068515e-07, + "loss": 0.3883, + "step": 7780 + }, + { + "epoch": 1.8520259415719642, + "grad_norm": 0.37524678870829536, + "learning_rate": 1.430638394078454e-07, + "loss": 0.3115, + "step": 7781 + }, + { + "epoch": 1.852263937645029, + "grad_norm": 0.3915337777058191, + "learning_rate": 1.4260639436587398e-07, + "loss": 0.2911, + "step": 7782 + }, + { + "epoch": 1.8525019337180937, + "grad_norm": 0.4065274951070914, + "learning_rate": 1.4214967125275814e-07, + "loss": 0.3267, + "step": 7783 + }, + { + "epoch": 1.8527399297911584, + "grad_norm": 0.3753625153872455, + "learning_rate": 1.4169367013637857e-07, + "loss": 0.374, + "step": 7784 + }, + { + "epoch": 1.8529779258642232, + "grad_norm": 0.39450510369704206, + "learning_rate": 1.4123839108450832e-07, + "loss": 0.3065, + "step": 7785 + }, + { + "epoch": 1.853215921937288, + "grad_norm": 0.38706116936482216, + "learning_rate": 1.4078383416481321e-07, + "loss": 0.273, + "step": 7786 + }, + { + "epoch": 1.8534539180103529, + "grad_norm": 0.3993104090511267, + "learning_rate": 1.4032999944485203e-07, + "loss": 0.3124, + "step": 7787 + }, + { + "epoch": 1.8536919140834176, + "grad_norm": 0.389522809964699, + "learning_rate": 1.398768869920747e-07, + "loss": 0.3647, + "step": 7788 + }, + { + "epoch": 1.8539299101564826, + "grad_norm": 0.4039304741367637, + "learning_rate": 1.3942449687382565e-07, + "loss": 0.2761, + "step": 7789 + }, + { + "epoch": 1.8541679062295473, + "grad_norm": 0.42136351566968644, + "learning_rate": 1.389728291573411e-07, + "loss": 0.328, + "step": 7790 + }, + { + "epoch": 1.854405902302612, + "grad_norm": 0.4212557672996947, + "learning_rate": 1.3852188390975073e-07, + "loss": 0.3918, + "step": 7791 + }, + { + "epoch": 1.8546438983756768, + "grad_norm": 0.3603764523291382, + "learning_rate": 1.380716611980748e-07, + "loss": 0.3351, + "step": 7792 + }, + { + "epoch": 1.8548818944487415, + "grad_norm": 0.35966797992735267, + "learning_rate": 1.3762216108922922e-07, + "loss": 0.2732, + "step": 7793 + }, + { + "epoch": 1.8551198905218063, + "grad_norm": 0.36333670692392717, + "learning_rate": 1.3717338365001943e-07, + "loss": 0.329, + "step": 7794 + }, + { + "epoch": 1.8553578865948712, + "grad_norm": 0.38329227054003784, + "learning_rate": 1.367253289471454e-07, + "loss": 0.3802, + "step": 7795 + }, + { + "epoch": 1.855595882667936, + "grad_norm": 0.3514869841098985, + "learning_rate": 1.3627799704719947e-07, + "loss": 0.2712, + "step": 7796 + }, + { + "epoch": 1.855833878741001, + "grad_norm": 0.381197916996613, + "learning_rate": 1.358313880166656e-07, + "loss": 0.302, + "step": 7797 + }, + { + "epoch": 1.8560718748140657, + "grad_norm": 0.402462868821984, + "learning_rate": 1.3538550192192078e-07, + "loss": 0.3535, + "step": 7798 + }, + { + "epoch": 1.8563098708871304, + "grad_norm": 0.3514030830100683, + "learning_rate": 1.3494033882923586e-07, + "loss": 0.3455, + "step": 7799 + }, + { + "epoch": 1.8565478669601951, + "grad_norm": 0.3781598067445243, + "learning_rate": 1.3449589880477176e-07, + "loss": 0.2882, + "step": 7800 + }, + { + "epoch": 1.8567858630332599, + "grad_norm": 0.38189268313397884, + "learning_rate": 1.3405218191458402e-07, + "loss": 0.3177, + "step": 7801 + }, + { + "epoch": 1.8570238591063246, + "grad_norm": 0.3890034707887386, + "learning_rate": 1.3360918822461989e-07, + "loss": 0.3846, + "step": 7802 + }, + { + "epoch": 1.8572618551793896, + "grad_norm": 0.354555606174368, + "learning_rate": 1.331669178007189e-07, + "loss": 0.2828, + "step": 7803 + }, + { + "epoch": 1.8574998512524543, + "grad_norm": 0.39468523966196106, + "learning_rate": 1.32725370708614e-07, + "loss": 0.2641, + "step": 7804 + }, + { + "epoch": 1.8577378473255193, + "grad_norm": 0.4158745671593587, + "learning_rate": 1.322845470139289e-07, + "loss": 0.3666, + "step": 7805 + }, + { + "epoch": 1.857975843398584, + "grad_norm": 0.38138886762707447, + "learning_rate": 1.3184444678218223e-07, + "loss": 0.3276, + "step": 7806 + }, + { + "epoch": 1.8582138394716488, + "grad_norm": 0.35114076586398596, + "learning_rate": 1.314050700787828e-07, + "loss": 0.2949, + "step": 7807 + }, + { + "epoch": 1.8584518355447135, + "grad_norm": 0.36724157814202074, + "learning_rate": 1.3096641696903334e-07, + "loss": 0.3347, + "step": 7808 + }, + { + "epoch": 1.8586898316177782, + "grad_norm": 0.38853816925181384, + "learning_rate": 1.3052848751812842e-07, + "loss": 0.3861, + "step": 7809 + }, + { + "epoch": 1.858927827690843, + "grad_norm": 0.3903142761173342, + "learning_rate": 1.3009128179115539e-07, + "loss": 0.3111, + "step": 7810 + }, + { + "epoch": 1.859165823763908, + "grad_norm": 0.38876795886287885, + "learning_rate": 1.2965479985309338e-07, + "loss": 0.2738, + "step": 7811 + }, + { + "epoch": 1.8594038198369727, + "grad_norm": 0.4101960252962491, + "learning_rate": 1.2921904176881494e-07, + "loss": 0.3681, + "step": 7812 + }, + { + "epoch": 1.8596418159100376, + "grad_norm": 0.36520915728705955, + "learning_rate": 1.2878400760308385e-07, + "loss": 0.3571, + "step": 7813 + }, + { + "epoch": 1.8598798119831024, + "grad_norm": 0.40612725661487625, + "learning_rate": 1.2834969742055725e-07, + "loss": 0.2713, + "step": 7814 + }, + { + "epoch": 1.860117808056167, + "grad_norm": 0.6769806659333333, + "learning_rate": 1.2791611128578463e-07, + "loss": 0.3343, + "step": 7815 + }, + { + "epoch": 1.8603558041292318, + "grad_norm": 0.40138777376249124, + "learning_rate": 1.2748324926320777e-07, + "loss": 0.3592, + "step": 7816 + }, + { + "epoch": 1.8605938002022966, + "grad_norm": 0.37763740906507537, + "learning_rate": 1.270511114171591e-07, + "loss": 0.3313, + "step": 7817 + }, + { + "epoch": 1.8608317962753613, + "grad_norm": 0.4238056700691326, + "learning_rate": 1.2661969781186723e-07, + "loss": 0.2498, + "step": 7818 + }, + { + "epoch": 1.8610697923484263, + "grad_norm": 0.37963559080128195, + "learning_rate": 1.2618900851144976e-07, + "loss": 0.3321, + "step": 7819 + }, + { + "epoch": 1.861307788421491, + "grad_norm": 0.3885643564174667, + "learning_rate": 1.2575904357991775e-07, + "loss": 0.3975, + "step": 7820 + }, + { + "epoch": 1.861545784494556, + "grad_norm": 0.34011787013626105, + "learning_rate": 1.2532980308117503e-07, + "loss": 0.272, + "step": 7821 + }, + { + "epoch": 1.8617837805676207, + "grad_norm": 0.3891036676630237, + "learning_rate": 1.2490128707901727e-07, + "loss": 0.293, + "step": 7822 + }, + { + "epoch": 1.8620217766406855, + "grad_norm": 0.38193328581420083, + "learning_rate": 1.2447349563713186e-07, + "loss": 0.3707, + "step": 7823 + }, + { + "epoch": 1.8622597727137502, + "grad_norm": 0.3658925256499234, + "learning_rate": 1.2404642881910012e-07, + "loss": 0.3051, + "step": 7824 + }, + { + "epoch": 1.862497768786815, + "grad_norm": 0.3801598075071461, + "learning_rate": 1.236200866883941e-07, + "loss": 0.2614, + "step": 7825 + }, + { + "epoch": 1.8627357648598797, + "grad_norm": 0.36720137768282524, + "learning_rate": 1.231944693083792e-07, + "loss": 0.3171, + "step": 7826 + }, + { + "epoch": 1.8629737609329446, + "grad_norm": 0.4198297803663176, + "learning_rate": 1.2276957674231204e-07, + "loss": 0.3851, + "step": 7827 + }, + { + "epoch": 1.8632117570060094, + "grad_norm": 0.35023106406242704, + "learning_rate": 1.223454090533427e-07, + "loss": 0.2795, + "step": 7828 + }, + { + "epoch": 1.8634497530790743, + "grad_norm": 0.34830108173573165, + "learning_rate": 1.219219663045129e-07, + "loss": 0.2655, + "step": 7829 + }, + { + "epoch": 1.863687749152139, + "grad_norm": 0.40557375679941676, + "learning_rate": 1.2149924855875737e-07, + "loss": 0.386, + "step": 7830 + }, + { + "epoch": 1.8639257452252038, + "grad_norm": 0.38135226051859966, + "learning_rate": 1.210772558789014e-07, + "loss": 0.3438, + "step": 7831 + }, + { + "epoch": 1.8641637412982686, + "grad_norm": 0.3573950365210543, + "learning_rate": 1.2065598832766369e-07, + "loss": 0.2979, + "step": 7832 + }, + { + "epoch": 1.8644017373713333, + "grad_norm": 0.35921086327394625, + "learning_rate": 1.202354459676558e-07, + "loss": 0.2962, + "step": 7833 + }, + { + "epoch": 1.864639733444398, + "grad_norm": 0.3826967666594862, + "learning_rate": 1.1981562886137998e-07, + "loss": 0.375, + "step": 7834 + }, + { + "epoch": 1.864877729517463, + "grad_norm": 0.3639494907850754, + "learning_rate": 1.1939653707123132e-07, + "loss": 0.281, + "step": 7835 + }, + { + "epoch": 1.8651157255905277, + "grad_norm": 0.37370185714825943, + "learning_rate": 1.1897817065949835e-07, + "loss": 0.2973, + "step": 7836 + }, + { + "epoch": 1.8653537216635927, + "grad_norm": 0.5048736432794805, + "learning_rate": 1.1856052968835907e-07, + "loss": 0.3218, + "step": 7837 + }, + { + "epoch": 1.8655917177366574, + "grad_norm": 0.36966116531403587, + "learning_rate": 1.1814361421988662e-07, + "loss": 0.3956, + "step": 7838 + }, + { + "epoch": 1.8658297138097222, + "grad_norm": 0.3560493828010198, + "learning_rate": 1.1772742431604423e-07, + "loss": 0.2868, + "step": 7839 + }, + { + "epoch": 1.866067709882787, + "grad_norm": 0.5370977594652094, + "learning_rate": 1.1731196003868794e-07, + "loss": 0.3322, + "step": 7840 + }, + { + "epoch": 1.8663057059558517, + "grad_norm": 0.3922980128293181, + "learning_rate": 1.1689722144956672e-07, + "loss": 0.3587, + "step": 7841 + }, + { + "epoch": 1.8665437020289164, + "grad_norm": 0.32486417759838265, + "learning_rate": 1.164832086103207e-07, + "loss": 0.3013, + "step": 7842 + }, + { + "epoch": 1.8667816981019814, + "grad_norm": 0.38729519267969026, + "learning_rate": 1.1606992158248177e-07, + "loss": 0.2793, + "step": 7843 + }, + { + "epoch": 1.867019694175046, + "grad_norm": 0.37956208767813593, + "learning_rate": 1.1565736042747522e-07, + "loss": 0.3162, + "step": 7844 + }, + { + "epoch": 1.867257690248111, + "grad_norm": 0.4116795086325041, + "learning_rate": 1.1524552520661702e-07, + "loss": 0.3733, + "step": 7845 + }, + { + "epoch": 1.8674956863211758, + "grad_norm": 0.39437952445062263, + "learning_rate": 1.1483441598111766e-07, + "loss": 0.3036, + "step": 7846 + }, + { + "epoch": 1.8677336823942405, + "grad_norm": 0.39085643388306573, + "learning_rate": 1.1442403281207714e-07, + "loss": 0.3244, + "step": 7847 + }, + { + "epoch": 1.8679716784673053, + "grad_norm": 0.4014816625744995, + "learning_rate": 1.140143757604889e-07, + "loss": 0.3738, + "step": 7848 + }, + { + "epoch": 1.86820967454037, + "grad_norm": 0.3769060375261056, + "learning_rate": 1.1360544488723756e-07, + "loss": 0.3044, + "step": 7849 + }, + { + "epoch": 1.8684476706134348, + "grad_norm": 0.3904744507763302, + "learning_rate": 1.1319724025310063e-07, + "loss": 0.2969, + "step": 7850 + }, + { + "epoch": 1.8686856666864997, + "grad_norm": 0.3773232634572751, + "learning_rate": 1.1278976191874735e-07, + "loss": 0.3294, + "step": 7851 + }, + { + "epoch": 1.8689236627595645, + "grad_norm": 0.3775912202758717, + "learning_rate": 1.1238300994473983e-07, + "loss": 0.3817, + "step": 7852 + }, + { + "epoch": 1.8691616588326294, + "grad_norm": 0.36177147793921166, + "learning_rate": 1.1197698439153027e-07, + "loss": 0.2878, + "step": 7853 + }, + { + "epoch": 1.8693996549056942, + "grad_norm": 0.3942909323087775, + "learning_rate": 1.1157168531946483e-07, + "loss": 0.2825, + "step": 7854 + }, + { + "epoch": 1.869637650978759, + "grad_norm": 0.38479407162289014, + "learning_rate": 1.1116711278878034e-07, + "loss": 0.3565, + "step": 7855 + }, + { + "epoch": 1.8698756470518236, + "grad_norm": 0.37536080195437777, + "learning_rate": 1.1076326685960758e-07, + "loss": 0.3585, + "step": 7856 + }, + { + "epoch": 1.8701136431248884, + "grad_norm": 0.4072733768621761, + "learning_rate": 1.1036014759196689e-07, + "loss": 0.2605, + "step": 7857 + }, + { + "epoch": 1.870351639197953, + "grad_norm": 0.3794776719624951, + "learning_rate": 1.09957755045772e-07, + "loss": 0.3195, + "step": 7858 + }, + { + "epoch": 1.870589635271018, + "grad_norm": 0.3781656516512505, + "learning_rate": 1.095560892808284e-07, + "loss": 0.3695, + "step": 7859 + }, + { + "epoch": 1.8708276313440828, + "grad_norm": 0.3477470340626564, + "learning_rate": 1.0915515035683444e-07, + "loss": 0.2976, + "step": 7860 + }, + { + "epoch": 1.8710656274171478, + "grad_norm": 0.39868562095433935, + "learning_rate": 1.0875493833337803e-07, + "loss": 0.2701, + "step": 7861 + }, + { + "epoch": 1.8713036234902125, + "grad_norm": 0.4054869894626742, + "learning_rate": 1.0835545326994213e-07, + "loss": 0.3525, + "step": 7862 + }, + { + "epoch": 1.8715416195632772, + "grad_norm": 0.36521275528859204, + "learning_rate": 1.079566952258987e-07, + "loss": 0.3625, + "step": 7863 + }, + { + "epoch": 1.871779615636342, + "grad_norm": 0.3635342656993625, + "learning_rate": 1.075586642605142e-07, + "loss": 0.268, + "step": 7864 + }, + { + "epoch": 1.8720176117094067, + "grad_norm": 0.37356788241263744, + "learning_rate": 1.0716136043294468e-07, + "loss": 0.2874, + "step": 7865 + }, + { + "epoch": 1.8722556077824715, + "grad_norm": 0.3961313481479438, + "learning_rate": 1.0676478380224065e-07, + "loss": 0.3772, + "step": 7866 + }, + { + "epoch": 1.8724936038555364, + "grad_norm": 0.37248128983442935, + "learning_rate": 1.063689344273422e-07, + "loss": 0.291, + "step": 7867 + }, + { + "epoch": 1.8727315999286012, + "grad_norm": 0.42564199958608884, + "learning_rate": 1.0597381236708282e-07, + "loss": 0.2925, + "step": 7868 + }, + { + "epoch": 1.8729695960016661, + "grad_norm": 0.3945675089261431, + "learning_rate": 1.055794176801872e-07, + "loss": 0.3273, + "step": 7869 + }, + { + "epoch": 1.8732075920747309, + "grad_norm": 0.36858121380582176, + "learning_rate": 1.0518575042527235e-07, + "loss": 0.425, + "step": 7870 + }, + { + "epoch": 1.8734455881477956, + "grad_norm": 0.4008228433529883, + "learning_rate": 1.0479281066084701e-07, + "loss": 0.2753, + "step": 7871 + }, + { + "epoch": 1.8736835842208603, + "grad_norm": 0.3970329160462698, + "learning_rate": 1.044005984453117e-07, + "loss": 0.3065, + "step": 7872 + }, + { + "epoch": 1.873921580293925, + "grad_norm": 0.3752973477487869, + "learning_rate": 1.0400911383695756e-07, + "loss": 0.3393, + "step": 7873 + }, + { + "epoch": 1.8741595763669898, + "grad_norm": 0.34507839687636127, + "learning_rate": 1.0361835689397137e-07, + "loss": 0.3171, + "step": 7874 + }, + { + "epoch": 1.8743975724400548, + "grad_norm": 0.3956732220335401, + "learning_rate": 1.0322832767442726e-07, + "loss": 0.2712, + "step": 7875 + }, + { + "epoch": 1.8746355685131195, + "grad_norm": 0.3478429638128932, + "learning_rate": 1.0283902623629439e-07, + "loss": 0.3163, + "step": 7876 + }, + { + "epoch": 1.8748735645861845, + "grad_norm": 0.43937488881046505, + "learning_rate": 1.0245045263743203e-07, + "loss": 0.3866, + "step": 7877 + }, + { + "epoch": 1.8751115606592492, + "grad_norm": 0.3712312999960514, + "learning_rate": 1.0206260693559234e-07, + "loss": 0.2936, + "step": 7878 + }, + { + "epoch": 1.875349556732314, + "grad_norm": 0.3667219720522898, + "learning_rate": 1.0167548918841752e-07, + "loss": 0.2841, + "step": 7879 + }, + { + "epoch": 1.8755875528053787, + "grad_norm": 0.3758275879637338, + "learning_rate": 1.0128909945344433e-07, + "loss": 0.3117, + "step": 7880 + }, + { + "epoch": 1.8758255488784434, + "grad_norm": 0.3693311113464016, + "learning_rate": 1.0090343778809908e-07, + "loss": 0.3607, + "step": 7881 + }, + { + "epoch": 1.8760635449515082, + "grad_norm": 0.3689248776098616, + "learning_rate": 1.0051850424970034e-07, + "loss": 0.287, + "step": 7882 + }, + { + "epoch": 1.8763015410245731, + "grad_norm": 0.3877904699869412, + "learning_rate": 1.0013429889546011e-07, + "loss": 0.3327, + "step": 7883 + }, + { + "epoch": 1.8765395370976379, + "grad_norm": 0.3813592994481828, + "learning_rate": 9.975082178247942e-08, + "loss": 0.3817, + "step": 7884 + }, + { + "epoch": 1.8767775331707028, + "grad_norm": 0.39447485339729055, + "learning_rate": 9.936807296775264e-08, + "loss": 0.3179, + "step": 7885 + }, + { + "epoch": 1.8770155292437676, + "grad_norm": 0.36838965887067854, + "learning_rate": 9.898605250816596e-08, + "loss": 0.2767, + "step": 7886 + }, + { + "epoch": 1.8772535253168323, + "grad_norm": 0.40695179381212354, + "learning_rate": 9.860476046049783e-08, + "loss": 0.3116, + "step": 7887 + }, + { + "epoch": 1.877491521389897, + "grad_norm": 0.37922684546428304, + "learning_rate": 9.822419688141627e-08, + "loss": 0.371, + "step": 7888 + }, + { + "epoch": 1.8777295174629618, + "grad_norm": 0.3605426474272245, + "learning_rate": 9.784436182748381e-08, + "loss": 0.3157, + "step": 7889 + }, + { + "epoch": 1.8779675135360265, + "grad_norm": 0.3855508493344283, + "learning_rate": 9.74652553551525e-08, + "loss": 0.3126, + "step": 7890 + }, + { + "epoch": 1.8782055096090915, + "grad_norm": 0.36654027571515835, + "learning_rate": 9.708687752076673e-08, + "loss": 0.3796, + "step": 7891 + }, + { + "epoch": 1.8784435056821562, + "grad_norm": 0.37798898644072126, + "learning_rate": 9.670922838056374e-08, + "loss": 0.3268, + "step": 7892 + }, + { + "epoch": 1.8786815017552212, + "grad_norm": 0.37273754822520255, + "learning_rate": 9.633230799067084e-08, + "loss": 0.2805, + "step": 7893 + }, + { + "epoch": 1.878919497828286, + "grad_norm": 0.38458823706391126, + "learning_rate": 9.595611640710767e-08, + "loss": 0.3494, + "step": 7894 + }, + { + "epoch": 1.8791574939013507, + "grad_norm": 0.39167494539281444, + "learning_rate": 9.558065368578561e-08, + "loss": 0.3506, + "step": 7895 + }, + { + "epoch": 1.8793954899744154, + "grad_norm": 0.3574825980828279, + "learning_rate": 9.520591988250838e-08, + "loss": 0.2689, + "step": 7896 + }, + { + "epoch": 1.8796334860474802, + "grad_norm": 0.37303769358161776, + "learning_rate": 9.483191505296974e-08, + "loss": 0.3235, + "step": 7897 + }, + { + "epoch": 1.879871482120545, + "grad_norm": 0.3651354155087522, + "learning_rate": 9.44586392527569e-08, + "loss": 0.3755, + "step": 7898 + }, + { + "epoch": 1.8801094781936099, + "grad_norm": 0.39194478737773597, + "learning_rate": 9.408609253734713e-08, + "loss": 0.3255, + "step": 7899 + }, + { + "epoch": 1.8803474742666746, + "grad_norm": 0.3713897973414568, + "learning_rate": 9.371427496211061e-08, + "loss": 0.2869, + "step": 7900 + }, + { + "epoch": 1.8805854703397396, + "grad_norm": 0.36090648920272766, + "learning_rate": 9.334318658230867e-08, + "loss": 0.3191, + "step": 7901 + }, + { + "epoch": 1.8808234664128043, + "grad_norm": 0.41318377703708387, + "learning_rate": 9.297282745309389e-08, + "loss": 0.3773, + "step": 7902 + }, + { + "epoch": 1.881061462485869, + "grad_norm": 0.3713769438671181, + "learning_rate": 9.260319762951109e-08, + "loss": 0.3186, + "step": 7903 + }, + { + "epoch": 1.8812994585589338, + "grad_norm": 0.3858895133643765, + "learning_rate": 9.223429716649634e-08, + "loss": 0.2755, + "step": 7904 + }, + { + "epoch": 1.8815374546319985, + "grad_norm": 0.422907620593951, + "learning_rate": 9.186612611887691e-08, + "loss": 0.3659, + "step": 7905 + }, + { + "epoch": 1.8817754507050632, + "grad_norm": 0.4024148982753146, + "learning_rate": 9.14986845413729e-08, + "loss": 0.3693, + "step": 7906 + }, + { + "epoch": 1.8820134467781282, + "grad_norm": 0.3803493722755298, + "learning_rate": 9.113197248859451e-08, + "loss": 0.2724, + "step": 7907 + }, + { + "epoch": 1.882251442851193, + "grad_norm": 0.4118593592302728, + "learning_rate": 9.076599001504482e-08, + "loss": 0.3282, + "step": 7908 + }, + { + "epoch": 1.882489438924258, + "grad_norm": 0.3953308094670974, + "learning_rate": 9.0400737175117e-08, + "loss": 0.3764, + "step": 7909 + }, + { + "epoch": 1.8827274349973226, + "grad_norm": 0.3696749681639606, + "learning_rate": 9.003621402309815e-08, + "loss": 0.277, + "step": 7910 + }, + { + "epoch": 1.8829654310703874, + "grad_norm": 0.3615369062856125, + "learning_rate": 8.96724206131644e-08, + "loss": 0.2998, + "step": 7911 + }, + { + "epoch": 1.8832034271434521, + "grad_norm": 0.41428190444047164, + "learning_rate": 8.930935699938415e-08, + "loss": 0.3354, + "step": 7912 + }, + { + "epoch": 1.8834414232165169, + "grad_norm": 0.3929646777015546, + "learning_rate": 8.89470232357187e-08, + "loss": 0.3684, + "step": 7913 + }, + { + "epoch": 1.8836794192895816, + "grad_norm": 0.3589833395799497, + "learning_rate": 8.858541937601827e-08, + "loss": 0.2739, + "step": 7914 + }, + { + "epoch": 1.8839174153626466, + "grad_norm": 0.3889496001692624, + "learning_rate": 8.82245454740277e-08, + "loss": 0.2876, + "step": 7915 + }, + { + "epoch": 1.8841554114357113, + "grad_norm": 0.42247196753368266, + "learning_rate": 8.786440158338072e-08, + "loss": 0.3592, + "step": 7916 + }, + { + "epoch": 1.8843934075087763, + "grad_norm": 0.36693957355643053, + "learning_rate": 8.750498775760453e-08, + "loss": 0.3229, + "step": 7917 + }, + { + "epoch": 1.884631403581841, + "grad_norm": 0.37575006564385804, + "learning_rate": 8.714630405011637e-08, + "loss": 0.2745, + "step": 7918 + }, + { + "epoch": 1.8848693996549057, + "grad_norm": 0.3699668444902597, + "learning_rate": 8.678835051422585e-08, + "loss": 0.3479, + "step": 7919 + }, + { + "epoch": 1.8851073957279705, + "grad_norm": 0.37951849531963455, + "learning_rate": 8.643112720313262e-08, + "loss": 0.3748, + "step": 7920 + }, + { + "epoch": 1.8853453918010352, + "grad_norm": 0.401706429927378, + "learning_rate": 8.607463416993034e-08, + "loss": 0.2929, + "step": 7921 + }, + { + "epoch": 1.8855833878741, + "grad_norm": 0.40506962687618053, + "learning_rate": 8.571887146760217e-08, + "loss": 0.3326, + "step": 7922 + }, + { + "epoch": 1.885821383947165, + "grad_norm": 0.38167411063561124, + "learning_rate": 8.536383914902301e-08, + "loss": 0.3743, + "step": 7923 + }, + { + "epoch": 1.8860593800202297, + "grad_norm": 0.37951337247623673, + "learning_rate": 8.500953726695959e-08, + "loss": 0.3368, + "step": 7924 + }, + { + "epoch": 1.8862973760932946, + "grad_norm": 0.3807047055623607, + "learning_rate": 8.465596587406977e-08, + "loss": 0.2633, + "step": 7925 + }, + { + "epoch": 1.8865353721663594, + "grad_norm": 0.37255448111092837, + "learning_rate": 8.430312502290316e-08, + "loss": 0.3475, + "step": 7926 + }, + { + "epoch": 1.886773368239424, + "grad_norm": 0.3904240340365388, + "learning_rate": 8.395101476590062e-08, + "loss": 0.3999, + "step": 7927 + }, + { + "epoch": 1.8870113643124888, + "grad_norm": 0.3590248194841884, + "learning_rate": 8.359963515539416e-08, + "loss": 0.2924, + "step": 7928 + }, + { + "epoch": 1.8872493603855536, + "grad_norm": 0.38702468210446556, + "learning_rate": 8.324898624360867e-08, + "loss": 0.2733, + "step": 7929 + }, + { + "epoch": 1.8874873564586183, + "grad_norm": 0.3814942319217911, + "learning_rate": 8.289906808265746e-08, + "loss": 0.3281, + "step": 7930 + }, + { + "epoch": 1.8877253525316833, + "grad_norm": 0.40004094569529125, + "learning_rate": 8.254988072454895e-08, + "loss": 0.3284, + "step": 7931 + }, + { + "epoch": 1.887963348604748, + "grad_norm": 0.3723689654618086, + "learning_rate": 8.220142422117939e-08, + "loss": 0.2681, + "step": 7932 + }, + { + "epoch": 1.888201344677813, + "grad_norm": 0.44726403333459647, + "learning_rate": 8.185369862433845e-08, + "loss": 0.3056, + "step": 7933 + }, + { + "epoch": 1.8884393407508777, + "grad_norm": 0.3806982139299245, + "learning_rate": 8.150670398570759e-08, + "loss": 0.3877, + "step": 7934 + }, + { + "epoch": 1.8886773368239425, + "grad_norm": 0.35216108572000016, + "learning_rate": 8.116044035685777e-08, + "loss": 0.2884, + "step": 7935 + }, + { + "epoch": 1.8889153328970072, + "grad_norm": 0.39643635386909837, + "learning_rate": 8.081490778925283e-08, + "loss": 0.2902, + "step": 7936 + }, + { + "epoch": 1.889153328970072, + "grad_norm": 0.4923610296056875, + "learning_rate": 8.047010633424723e-08, + "loss": 0.3359, + "step": 7937 + }, + { + "epoch": 1.8893913250431367, + "grad_norm": 0.379519284347565, + "learning_rate": 8.012603604308721e-08, + "loss": 0.3535, + "step": 7938 + }, + { + "epoch": 1.8896293211162016, + "grad_norm": 0.3635457197510201, + "learning_rate": 7.978269696691021e-08, + "loss": 0.2947, + "step": 7939 + }, + { + "epoch": 1.8898673171892664, + "grad_norm": 0.4033859742678164, + "learning_rate": 7.944008915674484e-08, + "loss": 0.311, + "step": 7940 + }, + { + "epoch": 1.8901053132623313, + "grad_norm": 0.3982200170533466, + "learning_rate": 7.909821266351092e-08, + "loss": 0.3703, + "step": 7941 + }, + { + "epoch": 1.890343309335396, + "grad_norm": 0.35373099489430676, + "learning_rate": 7.875706753801949e-08, + "loss": 0.3103, + "step": 7942 + }, + { + "epoch": 1.8905813054084608, + "grad_norm": 0.36747336775173683, + "learning_rate": 7.841665383097386e-08, + "loss": 0.2825, + "step": 7943 + }, + { + "epoch": 1.8908193014815255, + "grad_norm": 0.3650587611745797, + "learning_rate": 7.807697159296746e-08, + "loss": 0.3499, + "step": 7944 + }, + { + "epoch": 1.8910572975545903, + "grad_norm": 0.40187256746234257, + "learning_rate": 7.773802087448545e-08, + "loss": 0.356, + "step": 7945 + }, + { + "epoch": 1.891295293627655, + "grad_norm": 0.37435669326276105, + "learning_rate": 7.739980172590477e-08, + "loss": 0.275, + "step": 7946 + }, + { + "epoch": 1.89153328970072, + "grad_norm": 0.36212200367835184, + "learning_rate": 7.706231419749243e-08, + "loss": 0.3142, + "step": 7947 + }, + { + "epoch": 1.8917712857737847, + "grad_norm": 0.39951418029554553, + "learning_rate": 7.672555833940832e-08, + "loss": 0.3621, + "step": 7948 + }, + { + "epoch": 1.8920092818468497, + "grad_norm": 0.3786672956664201, + "learning_rate": 7.638953420170181e-08, + "loss": 0.3187, + "step": 7949 + }, + { + "epoch": 1.8922472779199144, + "grad_norm": 0.3923921053093728, + "learning_rate": 7.605424183431464e-08, + "loss": 0.2786, + "step": 7950 + }, + { + "epoch": 1.8924852739929792, + "grad_norm": 0.38868628053542476, + "learning_rate": 7.57196812870803e-08, + "loss": 0.3226, + "step": 7951 + }, + { + "epoch": 1.892723270066044, + "grad_norm": 0.3985157429933502, + "learning_rate": 7.538585260972175e-08, + "loss": 0.365, + "step": 7952 + }, + { + "epoch": 1.8929612661391086, + "grad_norm": 0.41970802109407324, + "learning_rate": 7.505275585185434e-08, + "loss": 0.2964, + "step": 7953 + }, + { + "epoch": 1.8931992622121734, + "grad_norm": 0.38733261918545264, + "learning_rate": 7.472039106298512e-08, + "loss": 0.3075, + "step": 7954 + }, + { + "epoch": 1.8934372582852383, + "grad_norm": 0.4141672520453948, + "learning_rate": 7.438875829251069e-08, + "loss": 0.3563, + "step": 7955 + }, + { + "epoch": 1.893675254358303, + "grad_norm": 0.3763459702365932, + "learning_rate": 7.405785758972106e-08, + "loss": 0.3256, + "step": 7956 + }, + { + "epoch": 1.893913250431368, + "grad_norm": 0.41340503131702716, + "learning_rate": 7.372768900379579e-08, + "loss": 0.276, + "step": 7957 + }, + { + "epoch": 1.8941512465044328, + "grad_norm": 0.3690483636794343, + "learning_rate": 7.339825258380618e-08, + "loss": 0.3253, + "step": 7958 + }, + { + "epoch": 1.8943892425774975, + "grad_norm": 0.3890667108974889, + "learning_rate": 7.306954837871415e-08, + "loss": 0.3858, + "step": 7959 + }, + { + "epoch": 1.8946272386505623, + "grad_norm": 0.3686373090623398, + "learning_rate": 7.2741576437374e-08, + "loss": 0.3156, + "step": 7960 + }, + { + "epoch": 1.894865234723627, + "grad_norm": 0.3662856876936937, + "learning_rate": 7.241433680852949e-08, + "loss": 0.2893, + "step": 7961 + }, + { + "epoch": 1.8951032307966917, + "grad_norm": 0.4337210767045377, + "learning_rate": 7.208782954081784e-08, + "loss": 0.347, + "step": 7962 + }, + { + "epoch": 1.8953412268697567, + "grad_norm": 0.37458324393332965, + "learning_rate": 7.176205468276465e-08, + "loss": 0.395, + "step": 7963 + }, + { + "epoch": 1.8955792229428214, + "grad_norm": 0.36173109464150077, + "learning_rate": 7.143701228278899e-08, + "loss": 0.2887, + "step": 7964 + }, + { + "epoch": 1.8958172190158864, + "grad_norm": 0.38262759512926825, + "learning_rate": 7.111270238920054e-08, + "loss": 0.2915, + "step": 7965 + }, + { + "epoch": 1.8960552150889511, + "grad_norm": 0.3827036517273416, + "learning_rate": 7.078912505019908e-08, + "loss": 0.3438, + "step": 7966 + }, + { + "epoch": 1.8962932111620159, + "grad_norm": 0.3848814530905264, + "learning_rate": 7.046628031387615e-08, + "loss": 0.301, + "step": 7967 + }, + { + "epoch": 1.8965312072350806, + "grad_norm": 0.4067525298074377, + "learning_rate": 7.014416822821557e-08, + "loss": 0.2432, + "step": 7968 + }, + { + "epoch": 1.8967692033081454, + "grad_norm": 0.3812327537956296, + "learning_rate": 6.982278884108907e-08, + "loss": 0.3234, + "step": 7969 + }, + { + "epoch": 1.89700719938121, + "grad_norm": 0.3605852849420692, + "learning_rate": 6.950214220026397e-08, + "loss": 0.4051, + "step": 7970 + }, + { + "epoch": 1.897245195454275, + "grad_norm": 0.3958665551698103, + "learning_rate": 6.918222835339438e-08, + "loss": 0.2686, + "step": 7971 + }, + { + "epoch": 1.8974831915273398, + "grad_norm": 0.38258672492708956, + "learning_rate": 6.886304734802896e-08, + "loss": 0.3141, + "step": 7972 + }, + { + "epoch": 1.8977211876004048, + "grad_norm": 0.3981035167445995, + "learning_rate": 6.854459923160472e-08, + "loss": 0.3582, + "step": 7973 + }, + { + "epoch": 1.8979591836734695, + "grad_norm": 0.36696157556372344, + "learning_rate": 6.822688405145161e-08, + "loss": 0.3243, + "step": 7974 + }, + { + "epoch": 1.8981971797465342, + "grad_norm": 0.3713373387211765, + "learning_rate": 6.790990185478963e-08, + "loss": 0.2546, + "step": 7975 + }, + { + "epoch": 1.898435175819599, + "grad_norm": 0.37672668107775037, + "learning_rate": 6.759365268872997e-08, + "loss": 0.3332, + "step": 7976 + }, + { + "epoch": 1.8986731718926637, + "grad_norm": 0.359979308125644, + "learning_rate": 6.727813660027616e-08, + "loss": 0.3621, + "step": 7977 + }, + { + "epoch": 1.8989111679657285, + "grad_norm": 0.3608027535934777, + "learning_rate": 6.696335363632012e-08, + "loss": 0.2963, + "step": 7978 + }, + { + "epoch": 1.8991491640387934, + "grad_norm": 0.37376170686319327, + "learning_rate": 6.66493038436472e-08, + "loss": 0.2775, + "step": 7979 + }, + { + "epoch": 1.8993871601118582, + "grad_norm": 0.38424326860289765, + "learning_rate": 6.633598726893342e-08, + "loss": 0.3589, + "step": 7980 + }, + { + "epoch": 1.8996251561849231, + "grad_norm": 0.394059524418179, + "learning_rate": 6.602340395874484e-08, + "loss": 0.3771, + "step": 7981 + }, + { + "epoch": 1.8998631522579879, + "grad_norm": 0.37382562755513166, + "learning_rate": 6.571155395953877e-08, + "loss": 0.2874, + "step": 7982 + }, + { + "epoch": 1.9001011483310526, + "grad_norm": 0.3874100204834066, + "learning_rate": 6.540043731766421e-08, + "loss": 0.3195, + "step": 7983 + }, + { + "epoch": 1.9003391444041173, + "grad_norm": 0.48921624228327715, + "learning_rate": 6.509005407936087e-08, + "loss": 0.3603, + "step": 7984 + }, + { + "epoch": 1.900577140477182, + "grad_norm": 0.35858308962539986, + "learning_rate": 6.478040429075961e-08, + "loss": 0.2859, + "step": 7985 + }, + { + "epoch": 1.9008151365502468, + "grad_norm": 0.3932336834908693, + "learning_rate": 6.447148799788139e-08, + "loss": 0.2843, + "step": 7986 + }, + { + "epoch": 1.9010531326233118, + "grad_norm": 0.36226032388387175, + "learning_rate": 6.416330524663895e-08, + "loss": 0.3366, + "step": 7987 + }, + { + "epoch": 1.9012911286963765, + "grad_norm": 0.41269859656532076, + "learning_rate": 6.385585608283673e-08, + "loss": 0.3668, + "step": 7988 + }, + { + "epoch": 1.9015291247694415, + "grad_norm": 0.3845523305990824, + "learning_rate": 6.35491405521682e-08, + "loss": 0.2758, + "step": 7989 + }, + { + "epoch": 1.9017671208425062, + "grad_norm": 0.40301323636395514, + "learning_rate": 6.324315870021858e-08, + "loss": 0.3034, + "step": 7990 + }, + { + "epoch": 1.902005116915571, + "grad_norm": 0.3907642073015502, + "learning_rate": 6.29379105724659e-08, + "loss": 0.3739, + "step": 7991 + }, + { + "epoch": 1.9022431129886357, + "grad_norm": 0.3606625473176197, + "learning_rate": 6.263339621427666e-08, + "loss": 0.3433, + "step": 7992 + }, + { + "epoch": 1.9024811090617004, + "grad_norm": 0.4090491853767471, + "learning_rate": 6.232961567090912e-08, + "loss": 0.2904, + "step": 7993 + }, + { + "epoch": 1.9027191051347652, + "grad_norm": 0.3812316669315538, + "learning_rate": 6.202656898751324e-08, + "loss": 0.3236, + "step": 7994 + }, + { + "epoch": 1.9029571012078301, + "grad_norm": 0.37564697095435645, + "learning_rate": 6.172425620912859e-08, + "loss": 0.3895, + "step": 7995 + }, + { + "epoch": 1.9031950972808949, + "grad_norm": 0.38971270053161045, + "learning_rate": 6.142267738068641e-08, + "loss": 0.2818, + "step": 7996 + }, + { + "epoch": 1.9034330933539596, + "grad_norm": 0.4139644761266531, + "learning_rate": 6.112183254700866e-08, + "loss": 0.3051, + "step": 7997 + }, + { + "epoch": 1.9036710894270246, + "grad_norm": 0.4000612669015111, + "learning_rate": 6.082172175280843e-08, + "loss": 0.3858, + "step": 7998 + }, + { + "epoch": 1.9039090855000893, + "grad_norm": 0.39138000482768825, + "learning_rate": 6.052234504269006e-08, + "loss": 0.2829, + "step": 7999 + }, + { + "epoch": 1.904147081573154, + "grad_norm": 0.4111965153470907, + "learning_rate": 6.022370246114795e-08, + "loss": 0.2599, + "step": 8000 + }, + { + "epoch": 1.9043850776462188, + "grad_norm": 0.38739615879574596, + "learning_rate": 5.992579405256826e-08, + "loss": 0.3336, + "step": 8001 + }, + { + "epoch": 1.9046230737192835, + "grad_norm": 0.35399850972575364, + "learning_rate": 5.96286198612267e-08, + "loss": 0.4036, + "step": 8002 + }, + { + "epoch": 1.9048610697923483, + "grad_norm": 0.3501272012525231, + "learning_rate": 5.933217993129126e-08, + "loss": 0.2928, + "step": 8003 + }, + { + "epoch": 1.9050990658654132, + "grad_norm": 0.3940432999472092, + "learning_rate": 5.903647430682002e-08, + "loss": 0.2813, + "step": 8004 + }, + { + "epoch": 1.905337061938478, + "grad_norm": 0.37153990539467646, + "learning_rate": 5.8741503031762294e-08, + "loss": 0.3495, + "step": 8005 + }, + { + "epoch": 1.905575058011543, + "grad_norm": 0.3650091291465742, + "learning_rate": 5.844726614995799e-08, + "loss": 0.3493, + "step": 8006 + }, + { + "epoch": 1.9058130540846077, + "grad_norm": 0.35578280548382063, + "learning_rate": 5.815376370513825e-08, + "loss": 0.2911, + "step": 8007 + }, + { + "epoch": 1.9060510501576724, + "grad_norm": 0.4008983674611723, + "learning_rate": 5.7860995740924296e-08, + "loss": 0.3143, + "step": 8008 + }, + { + "epoch": 1.9062890462307371, + "grad_norm": 0.38432833244657477, + "learning_rate": 5.756896230082909e-08, + "loss": 0.3776, + "step": 8009 + }, + { + "epoch": 1.9065270423038019, + "grad_norm": 0.3632804695720169, + "learning_rate": 5.7277663428256245e-08, + "loss": 0.2985, + "step": 8010 + }, + { + "epoch": 1.9067650383768666, + "grad_norm": 0.3906672244339681, + "learning_rate": 5.698709916649892e-08, + "loss": 0.2724, + "step": 8011 + }, + { + "epoch": 1.9070030344499316, + "grad_norm": 0.38051344490028755, + "learning_rate": 5.669726955874366e-08, + "loss": 0.3162, + "step": 8012 + }, + { + "epoch": 1.9072410305229963, + "grad_norm": 0.35282134896325107, + "learning_rate": 5.640817464806547e-08, + "loss": 0.3467, + "step": 8013 + }, + { + "epoch": 1.9074790265960613, + "grad_norm": 0.38263778679767824, + "learning_rate": 5.611981447743109e-08, + "loss": 0.2883, + "step": 8014 + }, + { + "epoch": 1.907717022669126, + "grad_norm": 0.37534374522335623, + "learning_rate": 5.5832189089697895e-08, + "loss": 0.3081, + "step": 8015 + }, + { + "epoch": 1.9079550187421908, + "grad_norm": 0.3824203414300469, + "learning_rate": 5.554529852761337e-08, + "loss": 0.3706, + "step": 8016 + }, + { + "epoch": 1.9081930148152555, + "grad_norm": 0.37799082292639585, + "learning_rate": 5.525914283381839e-08, + "loss": 0.3255, + "step": 8017 + }, + { + "epoch": 1.9084310108883202, + "grad_norm": 0.3920668628878149, + "learning_rate": 5.497372205084173e-08, + "loss": 0.2966, + "step": 8018 + }, + { + "epoch": 1.908669006961385, + "grad_norm": 0.34537104511664757, + "learning_rate": 5.468903622110389e-08, + "loss": 0.3038, + "step": 8019 + }, + { + "epoch": 1.90890700303445, + "grad_norm": 0.3841678485218043, + "learning_rate": 5.440508538691658e-08, + "loss": 0.3513, + "step": 8020 + }, + { + "epoch": 1.9091449991075147, + "grad_norm": 0.34656903960442637, + "learning_rate": 5.412186959048105e-08, + "loss": 0.2848, + "step": 8021 + }, + { + "epoch": 1.9093829951805796, + "grad_norm": 0.3940028400776739, + "learning_rate": 5.38393888738914e-08, + "loss": 0.2805, + "step": 8022 + }, + { + "epoch": 1.9096209912536444, + "grad_norm": 0.3819034767089527, + "learning_rate": 5.355764327913071e-08, + "loss": 0.3605, + "step": 8023 + }, + { + "epoch": 1.9098589873267091, + "grad_norm": 0.4050020295830134, + "learning_rate": 5.3276632848072716e-08, + "loss": 0.3406, + "step": 8024 + }, + { + "epoch": 1.9100969833997739, + "grad_norm": 0.3711188459584683, + "learning_rate": 5.299635762248345e-08, + "loss": 0.2649, + "step": 8025 + }, + { + "epoch": 1.9103349794728386, + "grad_norm": 0.3825043059653578, + "learning_rate": 5.271681764401848e-08, + "loss": 0.297, + "step": 8026 + }, + { + "epoch": 1.9105729755459033, + "grad_norm": 0.3872078261635392, + "learning_rate": 5.243801295422457e-08, + "loss": 0.3961, + "step": 8027 + }, + { + "epoch": 1.9108109716189683, + "grad_norm": 0.3675581463780866, + "learning_rate": 5.215994359453858e-08, + "loss": 0.2728, + "step": 8028 + }, + { + "epoch": 1.911048967692033, + "grad_norm": 0.39794626943566946, + "learning_rate": 5.188260960628855e-08, + "loss": 0.2939, + "step": 8029 + }, + { + "epoch": 1.911286963765098, + "grad_norm": 0.3881930755761182, + "learning_rate": 5.1606011030693184e-08, + "loss": 0.3508, + "step": 8030 + }, + { + "epoch": 1.9115249598381627, + "grad_norm": 0.34966274001131864, + "learning_rate": 5.1330147908861814e-08, + "loss": 0.3254, + "step": 8031 + }, + { + "epoch": 1.9117629559112275, + "grad_norm": 0.3810722565822783, + "learning_rate": 5.1055020281794987e-08, + "loss": 0.2676, + "step": 8032 + }, + { + "epoch": 1.9120009519842922, + "grad_norm": 0.3913423806435706, + "learning_rate": 5.078062819038332e-08, + "loss": 0.3274, + "step": 8033 + }, + { + "epoch": 1.912238948057357, + "grad_norm": 0.3957900932077373, + "learning_rate": 5.0506971675407526e-08, + "loss": 0.3859, + "step": 8034 + }, + { + "epoch": 1.9124769441304217, + "grad_norm": 0.41296040287870084, + "learning_rate": 5.0234050777540625e-08, + "loss": 0.2761, + "step": 8035 + }, + { + "epoch": 1.9127149402034866, + "grad_norm": 0.3617049666384471, + "learning_rate": 4.996186553734517e-08, + "loss": 0.258, + "step": 8036 + }, + { + "epoch": 1.9129529362765514, + "grad_norm": 0.4056471829864505, + "learning_rate": 4.9690415995274354e-08, + "loss": 0.3416, + "step": 8037 + }, + { + "epoch": 1.9131909323496163, + "grad_norm": 0.3575725862341246, + "learning_rate": 4.941970219167203e-08, + "loss": 0.3405, + "step": 8038 + }, + { + "epoch": 1.913428928422681, + "grad_norm": 0.3654418605929413, + "learning_rate": 4.914972416677433e-08, + "loss": 0.2714, + "step": 8039 + }, + { + "epoch": 1.9136669244957458, + "grad_norm": 0.3928718133203763, + "learning_rate": 4.8880481960705274e-08, + "loss": 0.3084, + "step": 8040 + }, + { + "epoch": 1.9139049205688106, + "grad_norm": 0.3860998147396003, + "learning_rate": 4.861197561348119e-08, + "loss": 0.3449, + "step": 8041 + }, + { + "epoch": 1.9141429166418753, + "grad_norm": 0.3828325589358172, + "learning_rate": 4.83442051650096e-08, + "loss": 0.3055, + "step": 8042 + }, + { + "epoch": 1.91438091271494, + "grad_norm": 0.39352418664155536, + "learning_rate": 4.8077170655086436e-08, + "loss": 0.2848, + "step": 8043 + }, + { + "epoch": 1.914618908788005, + "grad_norm": 0.4101375769165912, + "learning_rate": 4.781087212340052e-08, + "loss": 0.3486, + "step": 8044 + }, + { + "epoch": 1.9148569048610697, + "grad_norm": 0.39533684005769876, + "learning_rate": 4.754530960953074e-08, + "loss": 0.3536, + "step": 8045 + }, + { + "epoch": 1.9150949009341347, + "grad_norm": 0.37662010337344887, + "learning_rate": 4.728048315294553e-08, + "loss": 0.315, + "step": 8046 + }, + { + "epoch": 1.9153328970071994, + "grad_norm": 0.3934959783113809, + "learning_rate": 4.701639279300507e-08, + "loss": 0.3085, + "step": 8047 + }, + { + "epoch": 1.9155708930802642, + "grad_norm": 0.37377434092719386, + "learning_rate": 4.675303856895907e-08, + "loss": 0.381, + "step": 8048 + }, + { + "epoch": 1.915808889153329, + "grad_norm": 0.37694394724451996, + "learning_rate": 4.649042051994956e-08, + "loss": 0.3324, + "step": 8049 + }, + { + "epoch": 1.9160468852263937, + "grad_norm": 0.37595294353982567, + "learning_rate": 4.622853868500699e-08, + "loss": 0.2523, + "step": 8050 + }, + { + "epoch": 1.9162848812994584, + "grad_norm": 0.40725905564623294, + "learning_rate": 4.596739310305409e-08, + "loss": 0.3108, + "step": 8051 + }, + { + "epoch": 1.9165228773725234, + "grad_norm": 0.38543823318432485, + "learning_rate": 4.570698381290317e-08, + "loss": 0.3867, + "step": 8052 + }, + { + "epoch": 1.916760873445588, + "grad_norm": 0.3693394614353527, + "learning_rate": 4.5447310853258265e-08, + "loss": 0.306, + "step": 8053 + }, + { + "epoch": 1.916998869518653, + "grad_norm": 0.374918757902991, + "learning_rate": 4.5188374262712385e-08, + "loss": 0.2909, + "step": 8054 + }, + { + "epoch": 1.9172368655917178, + "grad_norm": 0.4268585385107588, + "learning_rate": 4.493017407975087e-08, + "loss": 0.3706, + "step": 8055 + }, + { + "epoch": 1.9174748616647825, + "grad_norm": 0.3550622241647807, + "learning_rate": 4.467271034274745e-08, + "loss": 0.3147, + "step": 8056 + }, + { + "epoch": 1.9177128577378473, + "grad_norm": 0.3673246840410985, + "learning_rate": 4.44159830899682e-08, + "loss": 0.2714, + "step": 8057 + }, + { + "epoch": 1.917950853810912, + "grad_norm": 0.38166747800870954, + "learning_rate": 4.415999235956925e-08, + "loss": 0.3132, + "step": 8058 + }, + { + "epoch": 1.9181888498839768, + "grad_norm": 0.37082445385628915, + "learning_rate": 4.390473818959684e-08, + "loss": 0.3427, + "step": 8059 + }, + { + "epoch": 1.9184268459570417, + "grad_norm": 0.3502851478207926, + "learning_rate": 4.3650220617988404e-08, + "loss": 0.3055, + "step": 8060 + }, + { + "epoch": 1.9186648420301065, + "grad_norm": 0.38376398396770767, + "learning_rate": 4.3396439682570904e-08, + "loss": 0.2819, + "step": 8061 + }, + { + "epoch": 1.9189028381031714, + "grad_norm": 0.37104295492217804, + "learning_rate": 4.3143395421063607e-08, + "loss": 0.3136, + "step": 8062 + }, + { + "epoch": 1.9191408341762362, + "grad_norm": 0.39064486463386716, + "learning_rate": 4.2891087871073656e-08, + "loss": 0.3645, + "step": 8063 + }, + { + "epoch": 1.919378830249301, + "grad_norm": 0.3959614629937633, + "learning_rate": 4.263951707010161e-08, + "loss": 0.2889, + "step": 8064 + }, + { + "epoch": 1.9196168263223656, + "grad_norm": 0.3976404851904098, + "learning_rate": 4.238868305553645e-08, + "loss": 0.2977, + "step": 8065 + }, + { + "epoch": 1.9198548223954304, + "grad_norm": 0.4187234518989236, + "learning_rate": 4.2138585864658354e-08, + "loss": 0.3897, + "step": 8066 + }, + { + "epoch": 1.920092818468495, + "grad_norm": 0.37755488054016684, + "learning_rate": 4.188922553463759e-08, + "loss": 0.3252, + "step": 8067 + }, + { + "epoch": 1.92033081454156, + "grad_norm": 0.3915281355437284, + "learning_rate": 4.164060210253618e-08, + "loss": 0.2707, + "step": 8068 + }, + { + "epoch": 1.9205688106146248, + "grad_norm": 0.39725191077466404, + "learning_rate": 4.1392715605305114e-08, + "loss": 0.346, + "step": 8069 + }, + { + "epoch": 1.9208068066876898, + "grad_norm": 0.3714723524123798, + "learning_rate": 4.1145566079786034e-08, + "loss": 0.3478, + "step": 8070 + }, + { + "epoch": 1.9210448027607545, + "grad_norm": 0.3707101045708407, + "learning_rate": 4.089915356271234e-08, + "loss": 0.3095, + "step": 8071 + }, + { + "epoch": 1.9212827988338192, + "grad_norm": 0.4055723864611074, + "learning_rate": 4.0653478090706965e-08, + "loss": 0.2812, + "step": 8072 + }, + { + "epoch": 1.921520794906884, + "grad_norm": 0.3801871133061815, + "learning_rate": 4.040853970028291e-08, + "loss": 0.3632, + "step": 8073 + }, + { + "epoch": 1.9217587909799487, + "grad_norm": 0.3539168169356299, + "learning_rate": 4.01643384278444e-08, + "loss": 0.3052, + "step": 8074 + }, + { + "epoch": 1.9219967870530135, + "grad_norm": 0.3611686215376351, + "learning_rate": 3.992087430968516e-08, + "loss": 0.2806, + "step": 8075 + }, + { + "epoch": 1.9222347831260784, + "grad_norm": 0.3913596751920842, + "learning_rate": 3.967814738199072e-08, + "loss": 0.3105, + "step": 8076 + }, + { + "epoch": 1.9224727791991432, + "grad_norm": 0.39705193175327785, + "learning_rate": 3.943615768083609e-08, + "loss": 0.3657, + "step": 8077 + }, + { + "epoch": 1.9227107752722081, + "grad_norm": 0.35680102901673433, + "learning_rate": 3.919490524218694e-08, + "loss": 0.2939, + "step": 8078 + }, + { + "epoch": 1.9229487713452729, + "grad_norm": 0.3936950953927909, + "learning_rate": 3.89543901018985e-08, + "loss": 0.2862, + "step": 8079 + }, + { + "epoch": 1.9231867674183376, + "grad_norm": 0.41880114246300687, + "learning_rate": 3.8714612295718824e-08, + "loss": 0.3537, + "step": 8080 + }, + { + "epoch": 1.9234247634914023, + "grad_norm": 0.3639233905209205, + "learning_rate": 3.8475571859283855e-08, + "loss": 0.3672, + "step": 8081 + }, + { + "epoch": 1.923662759564467, + "grad_norm": 0.3903343153212714, + "learning_rate": 3.823726882812129e-08, + "loss": 0.2758, + "step": 8082 + }, + { + "epoch": 1.9239007556375318, + "grad_norm": 0.3900369810611726, + "learning_rate": 3.7999703237648346e-08, + "loss": 0.3241, + "step": 8083 + }, + { + "epoch": 1.9241387517105968, + "grad_norm": 0.39117092554001687, + "learning_rate": 3.776287512317345e-08, + "loss": 0.3595, + "step": 8084 + }, + { + "epoch": 1.9243767477836615, + "grad_norm": 0.36654968786594255, + "learning_rate": 3.752678451989567e-08, + "loss": 0.2833, + "step": 8085 + }, + { + "epoch": 1.9246147438567265, + "grad_norm": 0.3809066672279184, + "learning_rate": 3.729143146290304e-08, + "loss": 0.2862, + "step": 8086 + }, + { + "epoch": 1.9248527399297912, + "grad_norm": 0.3490062069699892, + "learning_rate": 3.705681598717481e-08, + "loss": 0.3525, + "step": 8087 + }, + { + "epoch": 1.925090736002856, + "grad_norm": 0.379879949063031, + "learning_rate": 3.682293812758142e-08, + "loss": 0.3721, + "step": 8088 + }, + { + "epoch": 1.9253287320759207, + "grad_norm": 0.3778359839016277, + "learning_rate": 3.658979791888284e-08, + "loss": 0.3119, + "step": 8089 + }, + { + "epoch": 1.9255667281489854, + "grad_norm": 0.36956910970266094, + "learning_rate": 3.635739539572858e-08, + "loss": 0.3345, + "step": 8090 + }, + { + "epoch": 1.9258047242220502, + "grad_norm": 0.41589648566449683, + "learning_rate": 3.6125730592660445e-08, + "loss": 0.4026, + "step": 8091 + }, + { + "epoch": 1.9260427202951151, + "grad_norm": 0.37675858564217196, + "learning_rate": 3.589480354410868e-08, + "loss": 0.3182, + "step": 8092 + }, + { + "epoch": 1.9262807163681799, + "grad_norm": 0.3885940611238435, + "learning_rate": 3.5664614284395274e-08, + "loss": 0.2901, + "step": 8093 + }, + { + "epoch": 1.9265187124412448, + "grad_norm": 0.4267648886909655, + "learning_rate": 3.543516284773174e-08, + "loss": 0.3237, + "step": 8094 + }, + { + "epoch": 1.9267567085143096, + "grad_norm": 0.37790981095161896, + "learning_rate": 3.520644926822081e-08, + "loss": 0.4189, + "step": 8095 + }, + { + "epoch": 1.9269947045873743, + "grad_norm": 0.35608236467137666, + "learning_rate": 3.497847357985418e-08, + "loss": 0.3035, + "step": 8096 + }, + { + "epoch": 1.927232700660439, + "grad_norm": 0.35125924042990425, + "learning_rate": 3.475123581651529e-08, + "loss": 0.2894, + "step": 8097 + }, + { + "epoch": 1.9274706967335038, + "grad_norm": 0.3867965627570178, + "learning_rate": 3.452473601197659e-08, + "loss": 0.3634, + "step": 8098 + }, + { + "epoch": 1.9277086928065685, + "grad_norm": 0.35092993273506357, + "learning_rate": 3.42989741999028e-08, + "loss": 0.3621, + "step": 8099 + }, + { + "epoch": 1.9279466888796335, + "grad_norm": 0.37737964092644094, + "learning_rate": 3.4073950413846536e-08, + "loss": 0.2531, + "step": 8100 + }, + { + "epoch": 1.9281846849526982, + "grad_norm": 0.37016176061370865, + "learning_rate": 3.38496646872527e-08, + "loss": 0.3185, + "step": 8101 + }, + { + "epoch": 1.9284226810257632, + "grad_norm": 0.41856577365765557, + "learning_rate": 3.36261170534552e-08, + "loss": 0.3914, + "step": 8102 + }, + { + "epoch": 1.928660677098828, + "grad_norm": 0.350550972272308, + "learning_rate": 3.340330754567911e-08, + "loss": 0.295, + "step": 8103 + }, + { + "epoch": 1.9288986731718927, + "grad_norm": 0.37139534698154514, + "learning_rate": 3.3181236197038505e-08, + "loss": 0.295, + "step": 8104 + }, + { + "epoch": 1.9291366692449574, + "grad_norm": 0.4026861513587683, + "learning_rate": 3.295990304054031e-08, + "loss": 0.374, + "step": 8105 + }, + { + "epoch": 1.9293746653180222, + "grad_norm": 0.36621879141961106, + "learning_rate": 3.2739308109078215e-08, + "loss": 0.3791, + "step": 8106 + }, + { + "epoch": 1.929612661391087, + "grad_norm": 0.37842728624086314, + "learning_rate": 3.251945143543933e-08, + "loss": 0.2516, + "step": 8107 + }, + { + "epoch": 1.9298506574641519, + "grad_norm": 0.4177544855172531, + "learning_rate": 3.230033305229974e-08, + "loss": 0.3046, + "step": 8108 + }, + { + "epoch": 1.9300886535372166, + "grad_norm": 0.37909632728985976, + "learning_rate": 3.2081952992225605e-08, + "loss": 0.374, + "step": 8109 + }, + { + "epoch": 1.9303266496102816, + "grad_norm": 0.4361384086133121, + "learning_rate": 3.186431128767375e-08, + "loss": 0.3021, + "step": 8110 + }, + { + "epoch": 1.9305646456833463, + "grad_norm": 0.370480970702569, + "learning_rate": 3.164740797099053e-08, + "loss": 0.2905, + "step": 8111 + }, + { + "epoch": 1.930802641756411, + "grad_norm": 0.41765762330060385, + "learning_rate": 3.143124307441403e-08, + "loss": 0.3399, + "step": 8112 + }, + { + "epoch": 1.9310406378294758, + "grad_norm": 0.36824830434988537, + "learning_rate": 3.121581663007134e-08, + "loss": 0.3557, + "step": 8113 + }, + { + "epoch": 1.9312786339025405, + "grad_norm": 0.3842531168711303, + "learning_rate": 3.100112866997962e-08, + "loss": 0.2953, + "step": 8114 + }, + { + "epoch": 1.9315166299756052, + "grad_norm": 0.3817089022291876, + "learning_rate": 3.078717922604779e-08, + "loss": 0.3135, + "step": 8115 + }, + { + "epoch": 1.9317546260486702, + "grad_norm": 0.3710256940967274, + "learning_rate": 3.05739683300732e-08, + "loss": 0.3639, + "step": 8116 + }, + { + "epoch": 1.931992622121735, + "grad_norm": 0.39056691671635796, + "learning_rate": 3.036149601374494e-08, + "loss": 0.3277, + "step": 8117 + }, + { + "epoch": 1.9322306181948, + "grad_norm": 0.48173619690845865, + "learning_rate": 3.0149762308641083e-08, + "loss": 0.2814, + "step": 8118 + }, + { + "epoch": 1.9324686142678646, + "grad_norm": 0.3992688487992293, + "learning_rate": 2.993876724623035e-08, + "loss": 0.3308, + "step": 8119 + }, + { + "epoch": 1.9327066103409294, + "grad_norm": 0.3846765879945684, + "learning_rate": 2.972851085787265e-08, + "loss": 0.3655, + "step": 8120 + }, + { + "epoch": 1.9329446064139941, + "grad_norm": 0.335030815815972, + "learning_rate": 2.9518993174816323e-08, + "loss": 0.2838, + "step": 8121 + }, + { + "epoch": 1.9331826024870589, + "grad_norm": 0.39220164580944505, + "learning_rate": 2.9310214228202016e-08, + "loss": 0.2987, + "step": 8122 + }, + { + "epoch": 1.9334205985601236, + "grad_norm": 0.38126990911469083, + "learning_rate": 2.9102174049058796e-08, + "loss": 0.3555, + "step": 8123 + }, + { + "epoch": 1.9336585946331886, + "grad_norm": 0.37534376126487273, + "learning_rate": 2.8894872668305816e-08, + "loss": 0.3167, + "step": 8124 + }, + { + "epoch": 1.9338965907062533, + "grad_norm": 0.3782285028819716, + "learning_rate": 2.8688310116754546e-08, + "loss": 0.2798, + "step": 8125 + }, + { + "epoch": 1.9341345867793183, + "grad_norm": 0.4017596879571187, + "learning_rate": 2.848248642510487e-08, + "loss": 0.3332, + "step": 8126 + }, + { + "epoch": 1.934372582852383, + "grad_norm": 0.385095717269012, + "learning_rate": 2.8277401623946764e-08, + "loss": 0.3786, + "step": 8127 + }, + { + "epoch": 1.9346105789254477, + "grad_norm": 0.3681786253505828, + "learning_rate": 2.8073055743761956e-08, + "loss": 0.2888, + "step": 8128 + }, + { + "epoch": 1.9348485749985125, + "grad_norm": 0.40494290707801794, + "learning_rate": 2.7869448814920042e-08, + "loss": 0.2698, + "step": 8129 + }, + { + "epoch": 1.9350865710715772, + "grad_norm": 0.39678215471443756, + "learning_rate": 2.766658086768237e-08, + "loss": 0.3654, + "step": 8130 + }, + { + "epoch": 1.935324567144642, + "grad_norm": 0.38096391433914467, + "learning_rate": 2.746445193220093e-08, + "loss": 0.3331, + "step": 8131 + }, + { + "epoch": 1.935562563217707, + "grad_norm": 0.4207033973400047, + "learning_rate": 2.726306203851614e-08, + "loss": 0.291, + "step": 8132 + }, + { + "epoch": 1.9358005592907717, + "grad_norm": 0.35674390767561825, + "learning_rate": 2.706241121656017e-08, + "loss": 0.2914, + "step": 8133 + }, + { + "epoch": 1.9360385553638366, + "grad_norm": 0.37966258077198645, + "learning_rate": 2.6862499496154713e-08, + "loss": 0.4079, + "step": 8134 + }, + { + "epoch": 1.9362765514369014, + "grad_norm": 0.366878622762437, + "learning_rate": 2.6663326907010457e-08, + "loss": 0.2792, + "step": 8135 + }, + { + "epoch": 1.936514547509966, + "grad_norm": 0.3694716880277891, + "learning_rate": 2.6464893478730936e-08, + "loss": 0.2941, + "step": 8136 + }, + { + "epoch": 1.9367525435830308, + "grad_norm": 0.36316133821050267, + "learning_rate": 2.6267199240807563e-08, + "loss": 0.3586, + "step": 8137 + }, + { + "epoch": 1.9369905396560956, + "grad_norm": 0.37714358252792873, + "learning_rate": 2.6070244222622387e-08, + "loss": 0.3521, + "step": 8138 + }, + { + "epoch": 1.9372285357291603, + "grad_norm": 0.3846436958910601, + "learning_rate": 2.5874028453448106e-08, + "loss": 0.2677, + "step": 8139 + }, + { + "epoch": 1.9374665318022253, + "grad_norm": 0.3714862068184088, + "learning_rate": 2.567855196244695e-08, + "loss": 0.2874, + "step": 8140 + }, + { + "epoch": 1.93770452787529, + "grad_norm": 0.3857626420570937, + "learning_rate": 2.548381477867179e-08, + "loss": 0.3467, + "step": 8141 + }, + { + "epoch": 1.937942523948355, + "grad_norm": 0.3698968880808057, + "learning_rate": 2.528981693106558e-08, + "loss": 0.3375, + "step": 8142 + }, + { + "epoch": 1.9381805200214197, + "grad_norm": 0.3576794108662344, + "learning_rate": 2.5096558448460817e-08, + "loss": 0.2723, + "step": 8143 + }, + { + "epoch": 1.9384185160944845, + "grad_norm": 0.3902217274757619, + "learning_rate": 2.4904039359580635e-08, + "loss": 0.3334, + "step": 8144 + }, + { + "epoch": 1.9386565121675492, + "grad_norm": 0.3916377148051716, + "learning_rate": 2.4712259693038254e-08, + "loss": 0.3588, + "step": 8145 + }, + { + "epoch": 1.938894508240614, + "grad_norm": 0.3658288720995766, + "learning_rate": 2.452121947733699e-08, + "loss": 0.2968, + "step": 8146 + }, + { + "epoch": 1.9391325043136787, + "grad_norm": 0.4357430764134552, + "learning_rate": 2.4330918740869125e-08, + "loss": 0.314, + "step": 8147 + }, + { + "epoch": 1.9393705003867436, + "grad_norm": 0.40149441715579426, + "learning_rate": 2.414135751191926e-08, + "loss": 0.374, + "step": 8148 + }, + { + "epoch": 1.9396084964598084, + "grad_norm": 0.3611645193048641, + "learning_rate": 2.395253581866097e-08, + "loss": 0.3328, + "step": 8149 + }, + { + "epoch": 1.9398464925328733, + "grad_norm": 0.39467006084967055, + "learning_rate": 2.3764453689156808e-08, + "loss": 0.2535, + "step": 8150 + }, + { + "epoch": 1.940084488605938, + "grad_norm": 0.4124421719142737, + "learning_rate": 2.3577111151361078e-08, + "loss": 0.3229, + "step": 8151 + }, + { + "epoch": 1.9403224846790028, + "grad_norm": 0.3578248332226124, + "learning_rate": 2.3390508233117615e-08, + "loss": 0.4014, + "step": 8152 + }, + { + "epoch": 1.9405604807520676, + "grad_norm": 0.35974756105539724, + "learning_rate": 2.3204644962159793e-08, + "loss": 0.2963, + "step": 8153 + }, + { + "epoch": 1.9407984768251323, + "grad_norm": 0.3930501887956103, + "learning_rate": 2.301952136611163e-08, + "loss": 0.255, + "step": 8154 + }, + { + "epoch": 1.941036472898197, + "grad_norm": 0.39726170086743057, + "learning_rate": 2.2835137472487223e-08, + "loss": 0.3411, + "step": 8155 + }, + { + "epoch": 1.941274468971262, + "grad_norm": 0.36986727713553336, + "learning_rate": 2.2651493308690765e-08, + "loss": 0.3355, + "step": 8156 + }, + { + "epoch": 1.9415124650443267, + "grad_norm": 0.35051444026980566, + "learning_rate": 2.2468588902015975e-08, + "loss": 0.2846, + "step": 8157 + }, + { + "epoch": 1.9417504611173917, + "grad_norm": 0.37221414208027037, + "learning_rate": 2.2286424279646668e-08, + "loss": 0.3196, + "step": 8158 + }, + { + "epoch": 1.9419884571904564, + "grad_norm": 0.38381167717462056, + "learning_rate": 2.2104999468657852e-08, + "loss": 0.3715, + "step": 8159 + }, + { + "epoch": 1.9422264532635212, + "grad_norm": 0.3787505872883276, + "learning_rate": 2.192431449601351e-08, + "loss": 0.301, + "step": 8160 + }, + { + "epoch": 1.942464449336586, + "grad_norm": 0.36125888810575163, + "learning_rate": 2.1744369388567167e-08, + "loss": 0.2484, + "step": 8161 + }, + { + "epoch": 1.9427024454096506, + "grad_norm": 0.4252944882802151, + "learning_rate": 2.1565164173063536e-08, + "loss": 0.343, + "step": 8162 + }, + { + "epoch": 1.9429404414827154, + "grad_norm": 0.38454550235674245, + "learning_rate": 2.1386698876137424e-08, + "loss": 0.3642, + "step": 8163 + }, + { + "epoch": 1.9431784375557803, + "grad_norm": 0.3722701639410283, + "learning_rate": 2.1208973524312616e-08, + "loss": 0.3339, + "step": 8164 + }, + { + "epoch": 1.943416433628845, + "grad_norm": 0.39979583818697223, + "learning_rate": 2.103198814400409e-08, + "loss": 0.2935, + "step": 8165 + }, + { + "epoch": 1.94365442970191, + "grad_norm": 0.3753339389528479, + "learning_rate": 2.0855742761515808e-08, + "loss": 0.3629, + "step": 8166 + }, + { + "epoch": 1.9438924257749748, + "grad_norm": 0.36576216964926017, + "learning_rate": 2.0680237403041815e-08, + "loss": 0.3144, + "step": 8167 + }, + { + "epoch": 1.9441304218480395, + "grad_norm": 0.40044108633786357, + "learning_rate": 2.0505472094667356e-08, + "loss": 0.3043, + "step": 8168 + }, + { + "epoch": 1.9443684179211043, + "grad_norm": 0.39121889961558187, + "learning_rate": 2.0331446862366098e-08, + "loss": 0.3215, + "step": 8169 + }, + { + "epoch": 1.944606413994169, + "grad_norm": 0.3966140516297892, + "learning_rate": 2.0158161732003467e-08, + "loss": 0.3878, + "step": 8170 + }, + { + "epoch": 1.9448444100672337, + "grad_norm": 0.36014975036534674, + "learning_rate": 1.9985616729332747e-08, + "loss": 0.2826, + "step": 8171 + }, + { + "epoch": 1.9450824061402987, + "grad_norm": 0.3712453532736723, + "learning_rate": 1.9813811879999533e-08, + "loss": 0.2923, + "step": 8172 + }, + { + "epoch": 1.9453204022133634, + "grad_norm": 0.4017756063096361, + "learning_rate": 1.9642747209537295e-08, + "loss": 0.338, + "step": 8173 + }, + { + "epoch": 1.9455583982864284, + "grad_norm": 0.37378197959060855, + "learning_rate": 1.9472422743371245e-08, + "loss": 0.3247, + "step": 8174 + }, + { + "epoch": 1.9457963943594931, + "grad_norm": 0.4124391596209017, + "learning_rate": 1.9302838506815026e-08, + "loss": 0.2884, + "step": 8175 + }, + { + "epoch": 1.9460343904325579, + "grad_norm": 0.4033106524212199, + "learning_rate": 1.913399452507403e-08, + "loss": 0.3301, + "step": 8176 + }, + { + "epoch": 1.9462723865056226, + "grad_norm": 0.3966965084126736, + "learning_rate": 1.8965890823242072e-08, + "loss": 0.3688, + "step": 8177 + }, + { + "epoch": 1.9465103825786874, + "grad_norm": 0.33755930518081106, + "learning_rate": 1.879852742630306e-08, + "loss": 0.2721, + "step": 8178 + }, + { + "epoch": 1.946748378651752, + "grad_norm": 0.36588329404854725, + "learning_rate": 1.8631904359132646e-08, + "loss": 0.2917, + "step": 8179 + }, + { + "epoch": 1.946986374724817, + "grad_norm": 0.3587494911753443, + "learning_rate": 1.8466021646493802e-08, + "loss": 0.343, + "step": 8180 + }, + { + "epoch": 1.9472243707978818, + "grad_norm": 0.38139976856694574, + "learning_rate": 1.830087931304181e-08, + "loss": 0.3703, + "step": 8181 + }, + { + "epoch": 1.9474623668709468, + "grad_norm": 0.4031204694527784, + "learning_rate": 1.8136477383319805e-08, + "loss": 0.2817, + "step": 8182 + }, + { + "epoch": 1.9477003629440115, + "grad_norm": 0.3888962919598794, + "learning_rate": 1.797281588176325e-08, + "loss": 0.299, + "step": 8183 + }, + { + "epoch": 1.9479383590170762, + "grad_norm": 0.4082574436849942, + "learning_rate": 1.7809894832695463e-08, + "loss": 0.3874, + "step": 8184 + }, + { + "epoch": 1.948176355090141, + "grad_norm": 0.3623742069248171, + "learning_rate": 1.7647714260330407e-08, + "loss": 0.3092, + "step": 8185 + }, + { + "epoch": 1.9484143511632057, + "grad_norm": 0.39366833031118265, + "learning_rate": 1.748627418877269e-08, + "loss": 0.2897, + "step": 8186 + }, + { + "epoch": 1.9486523472362705, + "grad_norm": 0.3874663652919054, + "learning_rate": 1.7325574642016453e-08, + "loss": 0.3632, + "step": 8187 + }, + { + "epoch": 1.9488903433093354, + "grad_norm": 0.4022386962255987, + "learning_rate": 1.716561564394481e-08, + "loss": 0.3729, + "step": 8188 + }, + { + "epoch": 1.9491283393824002, + "grad_norm": 0.3760810663300212, + "learning_rate": 1.7006397218332084e-08, + "loss": 0.2598, + "step": 8189 + }, + { + "epoch": 1.9493663354554651, + "grad_norm": 0.3838950169229048, + "learning_rate": 1.6847919388842115e-08, + "loss": 0.3023, + "step": 8190 + }, + { + "epoch": 1.9496043315285299, + "grad_norm": 0.3762323973720483, + "learning_rate": 1.6690182179028845e-08, + "loss": 0.358, + "step": 8191 + }, + { + "epoch": 1.9498423276015946, + "grad_norm": 0.40170151586759195, + "learning_rate": 1.6533185612335188e-08, + "loss": 0.2977, + "step": 8192 + }, + { + "epoch": 1.9500803236746593, + "grad_norm": 0.38021345018959474, + "learning_rate": 1.6376929712095813e-08, + "loss": 0.2929, + "step": 8193 + }, + { + "epoch": 1.950318319747724, + "grad_norm": 0.3951132465369512, + "learning_rate": 1.6221414501532694e-08, + "loss": 0.3475, + "step": 8194 + }, + { + "epoch": 1.9505563158207888, + "grad_norm": 0.38568162349813606, + "learning_rate": 1.606664000376068e-08, + "loss": 0.3766, + "step": 8195 + }, + { + "epoch": 1.9507943118938538, + "grad_norm": 0.3780659119974086, + "learning_rate": 1.591260624178248e-08, + "loss": 0.2771, + "step": 8196 + }, + { + "epoch": 1.9510323079669185, + "grad_norm": 0.3958482547520693, + "learning_rate": 1.5759313238491447e-08, + "loss": 0.2964, + "step": 8197 + }, + { + "epoch": 1.9512703040399835, + "grad_norm": 0.39144688735676086, + "learning_rate": 1.5606761016670467e-08, + "loss": 0.3604, + "step": 8198 + }, + { + "epoch": 1.9515083001130482, + "grad_norm": 0.4028520449978215, + "learning_rate": 1.5454949598993075e-08, + "loss": 0.3177, + "step": 8199 + }, + { + "epoch": 1.951746296186113, + "grad_norm": 0.3749353165874801, + "learning_rate": 1.5303879008021773e-08, + "loss": 0.2832, + "step": 8200 + }, + { + "epoch": 1.9519842922591777, + "grad_norm": 0.39758890288042986, + "learning_rate": 1.5153549266209154e-08, + "loss": 0.3128, + "step": 8201 + }, + { + "epoch": 1.9522222883322424, + "grad_norm": 0.3508156781517721, + "learning_rate": 1.5003960395898465e-08, + "loss": 0.4, + "step": 8202 + }, + { + "epoch": 1.9524602844053072, + "grad_norm": 0.35738164902463865, + "learning_rate": 1.4855112419321916e-08, + "loss": 0.3172, + "step": 8203 + }, + { + "epoch": 1.9526982804783721, + "grad_norm": 0.3828648741919708, + "learning_rate": 1.4707005358602367e-08, + "loss": 0.2917, + "step": 8204 + }, + { + "epoch": 1.9529362765514369, + "grad_norm": 0.4386268966748521, + "learning_rate": 1.4559639235751654e-08, + "loss": 0.3149, + "step": 8205 + }, + { + "epoch": 1.9531742726245018, + "grad_norm": 0.3676937839979994, + "learning_rate": 1.4413014072672816e-08, + "loss": 0.3793, + "step": 8206 + }, + { + "epoch": 1.9534122686975666, + "grad_norm": 0.3692252990914482, + "learning_rate": 1.426712989115786e-08, + "loss": 0.2811, + "step": 8207 + }, + { + "epoch": 1.9536502647706313, + "grad_norm": 0.37336901880476925, + "learning_rate": 1.4121986712888336e-08, + "loss": 0.316, + "step": 8208 + }, + { + "epoch": 1.953888260843696, + "grad_norm": 0.40798571701364533, + "learning_rate": 1.3977584559435874e-08, + "loss": 0.374, + "step": 8209 + }, + { + "epoch": 1.9541262569167608, + "grad_norm": 0.36868651161847604, + "learning_rate": 1.3833923452262754e-08, + "loss": 0.3055, + "step": 8210 + }, + { + "epoch": 1.9543642529898255, + "grad_norm": 0.38007255786831645, + "learning_rate": 1.3691003412720783e-08, + "loss": 0.2946, + "step": 8211 + }, + { + "epoch": 1.9546022490628905, + "grad_norm": 0.35949632759077443, + "learning_rate": 1.3548824462050747e-08, + "loss": 0.3456, + "step": 8212 + }, + { + "epoch": 1.9548402451359552, + "grad_norm": 0.37479143142903726, + "learning_rate": 1.3407386621384078e-08, + "loss": 0.4028, + "step": 8213 + }, + { + "epoch": 1.9550782412090202, + "grad_norm": 0.3666312123014584, + "learning_rate": 1.3266689911742291e-08, + "loss": 0.2829, + "step": 8214 + }, + { + "epoch": 1.955316237282085, + "grad_norm": 0.4020518839752495, + "learning_rate": 1.312673435403644e-08, + "loss": 0.3137, + "step": 8215 + }, + { + "epoch": 1.9555542333551497, + "grad_norm": 0.4198314847703424, + "learning_rate": 1.2987519969067109e-08, + "loss": 0.3647, + "step": 8216 + }, + { + "epoch": 1.9557922294282144, + "grad_norm": 0.37449752005583575, + "learning_rate": 1.284904677752441e-08, + "loss": 0.3045, + "step": 8217 + }, + { + "epoch": 1.9560302255012791, + "grad_norm": 0.3788052759823918, + "learning_rate": 1.2711314799990216e-08, + "loss": 0.2877, + "step": 8218 + }, + { + "epoch": 1.9562682215743439, + "grad_norm": 0.3865156770622441, + "learning_rate": 1.2574324056934262e-08, + "loss": 0.3561, + "step": 8219 + }, + { + "epoch": 1.9565062176474088, + "grad_norm": 0.3781434327446371, + "learning_rate": 1.2438074568716374e-08, + "loss": 0.3798, + "step": 8220 + }, + { + "epoch": 1.9567442137204736, + "grad_norm": 0.36790890617104577, + "learning_rate": 1.230256635558702e-08, + "loss": 0.2838, + "step": 8221 + }, + { + "epoch": 1.9569822097935385, + "grad_norm": 0.39260234101874714, + "learning_rate": 1.21677994376862e-08, + "loss": 0.3047, + "step": 8222 + }, + { + "epoch": 1.9572202058666033, + "grad_norm": 0.3574524348159013, + "learning_rate": 1.2033773835042894e-08, + "loss": 0.3746, + "step": 8223 + }, + { + "epoch": 1.957458201939668, + "grad_norm": 0.3612325904669084, + "learning_rate": 1.1900489567577277e-08, + "loss": 0.3464, + "step": 8224 + }, + { + "epoch": 1.9576961980127328, + "grad_norm": 0.38077468030121714, + "learning_rate": 1.1767946655099061e-08, + "loss": 0.2666, + "step": 8225 + }, + { + "epoch": 1.9579341940857975, + "grad_norm": 0.368563482866289, + "learning_rate": 1.1636145117306374e-08, + "loss": 0.3406, + "step": 8226 + }, + { + "epoch": 1.9581721901588622, + "grad_norm": 0.42477667717805123, + "learning_rate": 1.1505084973789105e-08, + "loss": 0.3935, + "step": 8227 + }, + { + "epoch": 1.9584101862319272, + "grad_norm": 0.3438577540547306, + "learning_rate": 1.1374766244025003e-08, + "loss": 0.2929, + "step": 8228 + }, + { + "epoch": 1.958648182304992, + "grad_norm": 0.3782776548245008, + "learning_rate": 1.1245188947384133e-08, + "loss": 0.2818, + "step": 8229 + }, + { + "epoch": 1.958886178378057, + "grad_norm": 0.36863077536612854, + "learning_rate": 1.1116353103123312e-08, + "loss": 0.3458, + "step": 8230 + }, + { + "epoch": 1.9591241744511216, + "grad_norm": 0.3659543529085472, + "learning_rate": 1.0988258730391665e-08, + "loss": 0.3483, + "step": 8231 + }, + { + "epoch": 1.9593621705241864, + "grad_norm": 0.38260543409474423, + "learning_rate": 1.0860905848227298e-08, + "loss": 0.2871, + "step": 8232 + }, + { + "epoch": 1.9596001665972511, + "grad_norm": 0.41483183809961477, + "learning_rate": 1.0734294475557294e-08, + "loss": 0.3166, + "step": 8233 + }, + { + "epoch": 1.9598381626703159, + "grad_norm": 0.39222204343126627, + "learning_rate": 1.0608424631199376e-08, + "loss": 0.3949, + "step": 8234 + }, + { + "epoch": 1.9600761587433806, + "grad_norm": 0.3711141699280681, + "learning_rate": 1.0483296333861914e-08, + "loss": 0.3006, + "step": 8235 + }, + { + "epoch": 1.9603141548164456, + "grad_norm": 0.3763503542360473, + "learning_rate": 1.0358909602140588e-08, + "loss": 0.3047, + "step": 8236 + }, + { + "epoch": 1.9605521508895103, + "grad_norm": 0.3969121772442897, + "learning_rate": 1.0235264454523385e-08, + "loss": 0.3416, + "step": 8237 + }, + { + "epoch": 1.9607901469625753, + "grad_norm": 0.3816122834946716, + "learning_rate": 1.011236090938672e-08, + "loss": 0.3718, + "step": 8238 + }, + { + "epoch": 1.96102814303564, + "grad_norm": 0.38396544116333275, + "learning_rate": 9.99019898499709e-09, + "loss": 0.2717, + "step": 8239 + }, + { + "epoch": 1.9612661391087047, + "grad_norm": 0.44666178093194486, + "learning_rate": 9.868778699511083e-09, + "loss": 0.3074, + "step": 8240 + }, + { + "epoch": 1.9615041351817695, + "grad_norm": 0.4143851166339734, + "learning_rate": 9.748100070974265e-09, + "loss": 0.3759, + "step": 8241 + }, + { + "epoch": 1.9617421312548342, + "grad_norm": 0.3921730498280631, + "learning_rate": 9.628163117322286e-09, + "loss": 0.3331, + "step": 8242 + }, + { + "epoch": 1.961980127327899, + "grad_norm": 0.3726163829999817, + "learning_rate": 9.508967856381445e-09, + "loss": 0.2988, + "step": 8243 + }, + { + "epoch": 1.962218123400964, + "grad_norm": 0.39354313502014604, + "learning_rate": 9.390514305867015e-09, + "loss": 0.3275, + "step": 8244 + }, + { + "epoch": 1.9624561194740286, + "grad_norm": 0.37253432454149743, + "learning_rate": 9.272802483383248e-09, + "loss": 0.3857, + "step": 8245 + }, + { + "epoch": 1.9626941155470936, + "grad_norm": 0.3981252148153215, + "learning_rate": 9.155832406426147e-09, + "loss": 0.265, + "step": 8246 + }, + { + "epoch": 1.9629321116201583, + "grad_norm": 0.39768413263890057, + "learning_rate": 9.039604092379583e-09, + "loss": 0.2868, + "step": 8247 + }, + { + "epoch": 1.963170107693223, + "grad_norm": 0.4075926684645318, + "learning_rate": 8.92411755851863e-09, + "loss": 0.3498, + "step": 8248 + }, + { + "epoch": 1.9634081037662878, + "grad_norm": 0.3715207251676553, + "learning_rate": 8.809372822006779e-09, + "loss": 0.3479, + "step": 8249 + }, + { + "epoch": 1.9636460998393526, + "grad_norm": 0.3869938419383704, + "learning_rate": 8.69536989989872e-09, + "loss": 0.2767, + "step": 8250 + }, + { + "epoch": 1.9638840959124173, + "grad_norm": 0.37358902937459904, + "learning_rate": 8.582108809137013e-09, + "loss": 0.3084, + "step": 8251 + }, + { + "epoch": 1.9641220919854823, + "grad_norm": 0.4092803779218533, + "learning_rate": 8.469589566555968e-09, + "loss": 0.3679, + "step": 8252 + }, + { + "epoch": 1.964360088058547, + "grad_norm": 0.36047460363307426, + "learning_rate": 8.357812188878323e-09, + "loss": 0.2806, + "step": 8253 + }, + { + "epoch": 1.964598084131612, + "grad_norm": 0.3678747427954382, + "learning_rate": 8.246776692716896e-09, + "loss": 0.2568, + "step": 8254 + }, + { + "epoch": 1.9648360802046767, + "grad_norm": 0.40832629901247686, + "learning_rate": 8.1364830945746e-09, + "loss": 0.333, + "step": 8255 + }, + { + "epoch": 1.9650740762777414, + "grad_norm": 0.37115745740832373, + "learning_rate": 8.026931410843874e-09, + "loss": 0.3326, + "step": 8256 + }, + { + "epoch": 1.9653120723508062, + "grad_norm": 0.37423048974217404, + "learning_rate": 7.918121657806699e-09, + "loss": 0.2807, + "step": 8257 + }, + { + "epoch": 1.965550068423871, + "grad_norm": 0.3864398760686447, + "learning_rate": 7.81005385163458e-09, + "loss": 0.3276, + "step": 8258 + }, + { + "epoch": 1.9657880644969357, + "grad_norm": 0.388715422162384, + "learning_rate": 7.702728008389116e-09, + "loss": 0.3942, + "step": 8259 + }, + { + "epoch": 1.9660260605700006, + "grad_norm": 0.3429967987431223, + "learning_rate": 7.596144144021988e-09, + "loss": 0.2696, + "step": 8260 + }, + { + "epoch": 1.9662640566430654, + "grad_norm": 0.3884832865704859, + "learning_rate": 7.490302274373862e-09, + "loss": 0.2719, + "step": 8261 + }, + { + "epoch": 1.9665020527161303, + "grad_norm": 0.3804734222913664, + "learning_rate": 7.385202415175485e-09, + "loss": 0.3317, + "step": 8262 + }, + { + "epoch": 1.966740048789195, + "grad_norm": 0.36816873312551346, + "learning_rate": 7.280844582047142e-09, + "loss": 0.3705, + "step": 8263 + }, + { + "epoch": 1.9669780448622598, + "grad_norm": 0.36983723483013164, + "learning_rate": 7.1772287904997575e-09, + "loss": 0.3021, + "step": 8264 + }, + { + "epoch": 1.9672160409353245, + "grad_norm": 0.38603048071401885, + "learning_rate": 7.07435505593268e-09, + "loss": 0.2876, + "step": 8265 + }, + { + "epoch": 1.9674540370083893, + "grad_norm": 0.4003003944120439, + "learning_rate": 6.972223393634792e-09, + "loss": 0.3849, + "step": 8266 + }, + { + "epoch": 1.967692033081454, + "grad_norm": 0.3602611770065517, + "learning_rate": 6.870833818786727e-09, + "loss": 0.2768, + "step": 8267 + }, + { + "epoch": 1.967930029154519, + "grad_norm": 0.3884374384546158, + "learning_rate": 6.770186346456431e-09, + "loss": 0.2914, + "step": 8268 + }, + { + "epoch": 1.9681680252275837, + "grad_norm": 0.3744232480095091, + "learning_rate": 6.670280991603606e-09, + "loss": 0.3281, + "step": 8269 + }, + { + "epoch": 1.9684060213006487, + "grad_norm": 0.42375613579156163, + "learning_rate": 6.571117769075264e-09, + "loss": 0.3981, + "step": 8270 + }, + { + "epoch": 1.9686440173737134, + "grad_norm": 0.35315666764990467, + "learning_rate": 6.472696693610725e-09, + "loss": 0.2874, + "step": 8271 + }, + { + "epoch": 1.9688820134467782, + "grad_norm": 0.3452396601565405, + "learning_rate": 6.375017779837178e-09, + "loss": 0.2947, + "step": 8272 + }, + { + "epoch": 1.969120009519843, + "grad_norm": 0.3557979924222308, + "learning_rate": 6.278081042272455e-09, + "loss": 0.3548, + "step": 8273 + }, + { + "epoch": 1.9693580055929076, + "grad_norm": 0.3864839080975044, + "learning_rate": 6.181886495323364e-09, + "loss": 0.3309, + "step": 8274 + }, + { + "epoch": 1.9695960016659724, + "grad_norm": 0.4527567996253969, + "learning_rate": 6.086434153287357e-09, + "loss": 0.2887, + "step": 8275 + }, + { + "epoch": 1.9698339977390373, + "grad_norm": 0.4701596918796658, + "learning_rate": 5.991724030350865e-09, + "loss": 0.2977, + "step": 8276 + }, + { + "epoch": 1.970071993812102, + "grad_norm": 0.40077106587480205, + "learning_rate": 5.8977561405898496e-09, + "loss": 0.37, + "step": 8277 + }, + { + "epoch": 1.970309989885167, + "grad_norm": 0.36798667561511195, + "learning_rate": 5.804530497970362e-09, + "loss": 0.3101, + "step": 8278 + }, + { + "epoch": 1.9705479859582318, + "grad_norm": 0.4143172864878241, + "learning_rate": 5.712047116347985e-09, + "loss": 0.2745, + "step": 8279 + }, + { + "epoch": 1.9707859820312965, + "grad_norm": 0.3852482915444085, + "learning_rate": 5.620306009467835e-09, + "loss": 0.3629, + "step": 8280 + }, + { + "epoch": 1.9710239781043613, + "grad_norm": 0.37496502608000093, + "learning_rate": 5.529307190965671e-09, + "loss": 0.344, + "step": 8281 + }, + { + "epoch": 1.971261974177426, + "grad_norm": 0.3832082730157154, + "learning_rate": 5.439050674365676e-09, + "loss": 0.2708, + "step": 8282 + }, + { + "epoch": 1.9714999702504907, + "grad_norm": 0.40313323861297806, + "learning_rate": 5.349536473082118e-09, + "loss": 0.3132, + "step": 8283 + }, + { + "epoch": 1.9717379663235557, + "grad_norm": 0.3707010760106356, + "learning_rate": 5.260764600419354e-09, + "loss": 0.4018, + "step": 8284 + }, + { + "epoch": 1.9719759623966204, + "grad_norm": 0.43019076768807374, + "learning_rate": 5.172735069570722e-09, + "loss": 0.3143, + "step": 8285 + }, + { + "epoch": 1.9722139584696854, + "grad_norm": 0.46022861405119286, + "learning_rate": 5.0854478936190884e-09, + "loss": 0.2626, + "step": 8286 + }, + { + "epoch": 1.9724519545427501, + "grad_norm": 0.3912662063723855, + "learning_rate": 4.998903085539075e-09, + "loss": 0.319, + "step": 8287 + }, + { + "epoch": 1.9726899506158149, + "grad_norm": 0.3793180333217797, + "learning_rate": 4.913100658192061e-09, + "loss": 0.3523, + "step": 8288 + }, + { + "epoch": 1.9729279466888796, + "grad_norm": 0.3865661630275148, + "learning_rate": 4.828040624330621e-09, + "loss": 0.2683, + "step": 8289 + }, + { + "epoch": 1.9731659427619443, + "grad_norm": 0.37603160286322296, + "learning_rate": 4.743722996597422e-09, + "loss": 0.3024, + "step": 8290 + }, + { + "epoch": 1.973403938835009, + "grad_norm": 0.3751677087026661, + "learning_rate": 4.6601477875235505e-09, + "loss": 0.34, + "step": 8291 + }, + { + "epoch": 1.973641934908074, + "grad_norm": 0.376225703087134, + "learning_rate": 4.577315009530181e-09, + "loss": 0.3369, + "step": 8292 + }, + { + "epoch": 1.9738799309811388, + "grad_norm": 0.3927228549545841, + "learning_rate": 4.495224674928578e-09, + "loss": 0.2813, + "step": 8293 + }, + { + "epoch": 1.9741179270542037, + "grad_norm": 0.3827330662097907, + "learning_rate": 4.413876795919536e-09, + "loss": 0.3303, + "step": 8294 + }, + { + "epoch": 1.9743559231272685, + "grad_norm": 0.3862347082915683, + "learning_rate": 4.333271384593385e-09, + "loss": 0.3676, + "step": 8295 + }, + { + "epoch": 1.9745939192003332, + "grad_norm": 0.40932286583844174, + "learning_rate": 4.253408452929986e-09, + "loss": 0.27, + "step": 8296 + }, + { + "epoch": 1.974831915273398, + "grad_norm": 0.4000770232069227, + "learning_rate": 4.174288012798733e-09, + "loss": 0.3247, + "step": 8297 + }, + { + "epoch": 1.9750699113464627, + "grad_norm": 0.36945290953037635, + "learning_rate": 4.095910075959108e-09, + "loss": 0.3289, + "step": 8298 + }, + { + "epoch": 1.9753079074195274, + "grad_norm": 0.4060885406421163, + "learning_rate": 4.018274654059573e-09, + "loss": 0.3047, + "step": 8299 + }, + { + "epoch": 1.9755459034925924, + "grad_norm": 0.4119247754609785, + "learning_rate": 3.941381758639784e-09, + "loss": 0.2876, + "step": 8300 + }, + { + "epoch": 1.9757838995656571, + "grad_norm": 0.39855318475174173, + "learning_rate": 3.865231401126712e-09, + "loss": 0.3215, + "step": 8301 + }, + { + "epoch": 1.976021895638722, + "grad_norm": 0.3834493602365561, + "learning_rate": 3.789823592838526e-09, + "loss": 0.381, + "step": 8302 + }, + { + "epoch": 1.9762598917117868, + "grad_norm": 0.39370603464488824, + "learning_rate": 3.7151583449834826e-09, + "loss": 0.2854, + "step": 8303 + }, + { + "epoch": 1.9764978877848516, + "grad_norm": 0.38092804030560223, + "learning_rate": 3.6412356686577056e-09, + "loss": 0.2866, + "step": 8304 + }, + { + "epoch": 1.9767358838579163, + "grad_norm": 0.359154440466524, + "learning_rate": 3.5680555748479617e-09, + "loss": 0.3367, + "step": 8305 + }, + { + "epoch": 1.976973879930981, + "grad_norm": 0.3866822121685151, + "learning_rate": 3.4956180744311063e-09, + "loss": 0.3345, + "step": 8306 + }, + { + "epoch": 1.9772118760040458, + "grad_norm": 0.3877651597061357, + "learning_rate": 3.423923178172972e-09, + "loss": 0.2589, + "step": 8307 + }, + { + "epoch": 1.9774498720771108, + "grad_norm": 0.3888359989652597, + "learning_rate": 3.3529708967294794e-09, + "loss": 0.3375, + "step": 8308 + }, + { + "epoch": 1.9776878681501755, + "grad_norm": 0.38304911568033706, + "learning_rate": 3.282761240645527e-09, + "loss": 0.3554, + "step": 8309 + }, + { + "epoch": 1.9779258642232405, + "grad_norm": 0.3470736673254614, + "learning_rate": 3.213294220355545e-09, + "loss": 0.3175, + "step": 8310 + }, + { + "epoch": 1.9781638602963052, + "grad_norm": 0.377926105048321, + "learning_rate": 3.1445698461851638e-09, + "loss": 0.2487, + "step": 8311 + }, + { + "epoch": 1.97840185636937, + "grad_norm": 0.3806616330829527, + "learning_rate": 3.0765881283478794e-09, + "loss": 0.3329, + "step": 8312 + }, + { + "epoch": 1.9786398524424347, + "grad_norm": 0.3620860977402568, + "learning_rate": 3.0093490769472765e-09, + "loss": 0.3399, + "step": 8313 + }, + { + "epoch": 1.9788778485154994, + "grad_norm": 0.38013912104937325, + "learning_rate": 2.942852701977028e-09, + "loss": 0.2705, + "step": 8314 + }, + { + "epoch": 1.9791158445885642, + "grad_norm": 0.3818113771924823, + "learning_rate": 2.8770990133203392e-09, + "loss": 0.3248, + "step": 8315 + }, + { + "epoch": 1.9793538406616291, + "grad_norm": 0.4113225030445951, + "learning_rate": 2.8120880207493928e-09, + "loss": 0.3487, + "step": 8316 + }, + { + "epoch": 1.9795918367346939, + "grad_norm": 0.3703805736030292, + "learning_rate": 2.747819733927015e-09, + "loss": 0.3121, + "step": 8317 + }, + { + "epoch": 1.9798298328077588, + "grad_norm": 0.368702788372816, + "learning_rate": 2.6842941624044548e-09, + "loss": 0.2654, + "step": 8318 + }, + { + "epoch": 1.9800678288808236, + "grad_norm": 0.39480635521831586, + "learning_rate": 2.6215113156230487e-09, + "loss": 0.3279, + "step": 8319 + }, + { + "epoch": 1.9803058249538883, + "grad_norm": 0.35852264603642603, + "learning_rate": 2.559471202914776e-09, + "loss": 0.393, + "step": 8320 + }, + { + "epoch": 1.980543821026953, + "grad_norm": 0.3594864841251863, + "learning_rate": 2.498173833499484e-09, + "loss": 0.2659, + "step": 8321 + }, + { + "epoch": 1.9807818171000178, + "grad_norm": 0.40546718269932125, + "learning_rate": 2.4376192164882183e-09, + "loss": 0.3409, + "step": 8322 + }, + { + "epoch": 1.9810198131730825, + "grad_norm": 0.5532056344379126, + "learning_rate": 2.3778073608798914e-09, + "loss": 0.358, + "step": 8323 + }, + { + "epoch": 1.9812578092461475, + "grad_norm": 0.3546557183922418, + "learning_rate": 2.3187382755651687e-09, + "loss": 0.3298, + "step": 8324 + }, + { + "epoch": 1.9814958053192122, + "grad_norm": 0.37203732480489365, + "learning_rate": 2.2604119693220295e-09, + "loss": 0.2895, + "step": 8325 + }, + { + "epoch": 1.9817338013922772, + "grad_norm": 0.4172958208934213, + "learning_rate": 2.202828450820205e-09, + "loss": 0.3164, + "step": 8326 + }, + { + "epoch": 1.981971797465342, + "grad_norm": 0.3625690030083054, + "learning_rate": 2.1459877286172935e-09, + "loss": 0.3785, + "step": 8327 + }, + { + "epoch": 1.9822097935384067, + "grad_norm": 0.3603665358371541, + "learning_rate": 2.0898898111620935e-09, + "loss": 0.2816, + "step": 8328 + }, + { + "epoch": 1.9824477896114714, + "grad_norm": 0.3780300709578647, + "learning_rate": 2.034534706791269e-09, + "loss": 0.2825, + "step": 8329 + }, + { + "epoch": 1.9826857856845361, + "grad_norm": 0.37028301830691185, + "learning_rate": 1.979922423732128e-09, + "loss": 0.3403, + "step": 8330 + }, + { + "epoch": 1.9829237817576009, + "grad_norm": 0.3638771091784381, + "learning_rate": 1.9260529701015105e-09, + "loss": 0.3491, + "step": 8331 + }, + { + "epoch": 1.9831617778306658, + "grad_norm": 0.41702450685086956, + "learning_rate": 1.8729263539063457e-09, + "loss": 0.2864, + "step": 8332 + }, + { + "epoch": 1.9833997739037306, + "grad_norm": 0.37868852344723125, + "learning_rate": 1.820542583041429e-09, + "loss": 0.3317, + "step": 8333 + }, + { + "epoch": 1.9836377699767955, + "grad_norm": 0.3666826086769097, + "learning_rate": 1.76890166529331e-09, + "loss": 0.3906, + "step": 8334 + }, + { + "epoch": 1.9838757660498603, + "grad_norm": 0.35122611004045284, + "learning_rate": 1.718003608336405e-09, + "loss": 0.2839, + "step": 8335 + }, + { + "epoch": 1.984113762122925, + "grad_norm": 0.4202455413793847, + "learning_rate": 1.6678484197357737e-09, + "loss": 0.262, + "step": 8336 + }, + { + "epoch": 1.9843517581959897, + "grad_norm": 0.37863495269066555, + "learning_rate": 1.6184361069460085e-09, + "loss": 0.3527, + "step": 8337 + }, + { + "epoch": 1.9845897542690545, + "grad_norm": 0.35319699077317435, + "learning_rate": 1.569766677310125e-09, + "loss": 0.3451, + "step": 8338 + }, + { + "epoch": 1.9848277503421192, + "grad_norm": 0.34742580202122714, + "learning_rate": 1.521840138062336e-09, + "loss": 0.2781, + "step": 8339 + }, + { + "epoch": 1.9850657464151842, + "grad_norm": 0.38670204572295525, + "learning_rate": 1.4746564963258325e-09, + "loss": 0.2925, + "step": 8340 + }, + { + "epoch": 1.985303742488249, + "grad_norm": 0.3666529795353334, + "learning_rate": 1.4282157591122282e-09, + "loss": 0.3708, + "step": 8341 + }, + { + "epoch": 1.9855417385613139, + "grad_norm": 0.3696854659046601, + "learning_rate": 1.3825179333248895e-09, + "loss": 0.2828, + "step": 8342 + }, + { + "epoch": 1.9857797346343786, + "grad_norm": 0.406404518542554, + "learning_rate": 1.3375630257550509e-09, + "loss": 0.3009, + "step": 8343 + }, + { + "epoch": 1.9860177307074434, + "grad_norm": 0.385912160739807, + "learning_rate": 1.2933510430845898e-09, + "loss": 0.3693, + "step": 8344 + }, + { + "epoch": 1.986255726780508, + "grad_norm": 0.3925971908766997, + "learning_rate": 1.2498819918843609e-09, + "loss": 0.4039, + "step": 8345 + }, + { + "epoch": 1.9864937228535728, + "grad_norm": 0.3652691608712696, + "learning_rate": 1.2071558786141969e-09, + "loss": 0.273, + "step": 8346 + }, + { + "epoch": 1.9867317189266376, + "grad_norm": 0.38597466303071754, + "learning_rate": 1.1651727096251287e-09, + "loss": 0.3049, + "step": 8347 + }, + { + "epoch": 1.9869697149997025, + "grad_norm": 0.3810587229670749, + "learning_rate": 1.1239324911566096e-09, + "loss": 0.3677, + "step": 8348 + }, + { + "epoch": 1.9872077110727673, + "grad_norm": 0.40007821179401476, + "learning_rate": 1.083435229338181e-09, + "loss": 0.3497, + "step": 8349 + }, + { + "epoch": 1.9874457071458322, + "grad_norm": 0.40580202241849406, + "learning_rate": 1.043680930187807e-09, + "loss": 0.2742, + "step": 8350 + }, + { + "epoch": 1.987683703218897, + "grad_norm": 0.37923487971766323, + "learning_rate": 1.0046695996152046e-09, + "loss": 0.3465, + "step": 8351 + }, + { + "epoch": 1.9879216992919617, + "grad_norm": 0.37215554302402476, + "learning_rate": 9.66401243417958e-10, + "loss": 0.3798, + "step": 8352 + }, + { + "epoch": 1.9881596953650265, + "grad_norm": 0.39321133558205074, + "learning_rate": 9.288758672837406e-10, + "loss": 0.3067, + "step": 8353 + }, + { + "epoch": 1.9883976914380912, + "grad_norm": 0.3748563987454188, + "learning_rate": 8.92093476789202e-10, + "loss": 0.2742, + "step": 8354 + }, + { + "epoch": 1.988635687511156, + "grad_norm": 0.3891255633326672, + "learning_rate": 8.560540774016357e-10, + "loss": 0.3438, + "step": 8355 + }, + { + "epoch": 1.988873683584221, + "grad_norm": 0.3646078625631843, + "learning_rate": 8.207576744773127e-10, + "loss": 0.3308, + "step": 8356 + }, + { + "epoch": 1.9891116796572856, + "grad_norm": 0.36793355963820623, + "learning_rate": 7.862042732620367e-10, + "loss": 0.2491, + "step": 8357 + }, + { + "epoch": 1.9893496757303506, + "grad_norm": 0.3672753281223139, + "learning_rate": 7.523938788916996e-10, + "loss": 0.3369, + "step": 8358 + }, + { + "epoch": 1.9895876718034153, + "grad_norm": 0.39245638391380494, + "learning_rate": 7.193264963911706e-10, + "loss": 0.4073, + "step": 8359 + }, + { + "epoch": 1.98982566787648, + "grad_norm": 0.3865165837782479, + "learning_rate": 6.870021306742968e-10, + "loss": 0.2962, + "step": 8360 + }, + { + "epoch": 1.9900636639495448, + "grad_norm": 0.41219133327793983, + "learning_rate": 6.554207865466788e-10, + "loss": 0.302, + "step": 8361 + }, + { + "epoch": 1.9903016600226096, + "grad_norm": 0.4223513362715057, + "learning_rate": 6.245824687006741e-10, + "loss": 0.3742, + "step": 8362 + }, + { + "epoch": 1.9905396560956743, + "grad_norm": 0.3839303730562907, + "learning_rate": 5.944871817209486e-10, + "loss": 0.383, + "step": 8363 + }, + { + "epoch": 1.9907776521687393, + "grad_norm": 0.3926286842555442, + "learning_rate": 5.651349300794806e-10, + "loss": 0.292, + "step": 8364 + }, + { + "epoch": 1.991015648241804, + "grad_norm": 0.397668403394273, + "learning_rate": 5.365257181388917e-10, + "loss": 0.2986, + "step": 8365 + }, + { + "epoch": 1.991253644314869, + "grad_norm": 0.391019416533082, + "learning_rate": 5.086595501513358e-10, + "loss": 0.3498, + "step": 8366 + }, + { + "epoch": 1.9914916403879337, + "grad_norm": 0.42914431554738636, + "learning_rate": 4.815364302590553e-10, + "loss": 0.339, + "step": 8367 + }, + { + "epoch": 1.9917296364609984, + "grad_norm": 0.37889014824362105, + "learning_rate": 4.5515636249160446e-10, + "loss": 0.2723, + "step": 8368 + }, + { + "epoch": 1.9919676325340632, + "grad_norm": 0.38046794389103084, + "learning_rate": 4.2951935077140127e-10, + "loss": 0.3101, + "step": 8369 + }, + { + "epoch": 1.992205628607128, + "grad_norm": 0.40863630016035024, + "learning_rate": 4.04625398907621e-10, + "loss": 0.3595, + "step": 8370 + }, + { + "epoch": 1.9924436246801926, + "grad_norm": 0.34145231287271066, + "learning_rate": 3.80474510601192e-10, + "loss": 0.3063, + "step": 8371 + }, + { + "epoch": 1.9926816207532576, + "grad_norm": 0.37706309665477467, + "learning_rate": 3.5706668944035517e-10, + "loss": 0.3212, + "step": 8372 + }, + { + "epoch": 1.9929196168263223, + "grad_norm": 0.4166841421991216, + "learning_rate": 3.344019389045494e-10, + "loss": 0.3757, + "step": 8373 + }, + { + "epoch": 1.9931576128993873, + "grad_norm": 0.36453047466675503, + "learning_rate": 3.124802623627465e-10, + "loss": 0.318, + "step": 8374 + }, + { + "epoch": 1.993395608972452, + "grad_norm": 0.40191372694668726, + "learning_rate": 2.913016630723409e-10, + "loss": 0.2785, + "step": 8375 + }, + { + "epoch": 1.9936336050455168, + "grad_norm": 0.38151201580835575, + "learning_rate": 2.708661441813698e-10, + "loss": 0.3213, + "step": 8376 + }, + { + "epoch": 1.9938716011185815, + "grad_norm": 0.38965306740860267, + "learning_rate": 2.5117370872684843e-10, + "loss": 0.3845, + "step": 8377 + }, + { + "epoch": 1.9941095971916463, + "grad_norm": 0.3625652747058628, + "learning_rate": 2.3222435963643485e-10, + "loss": 0.298, + "step": 8378 + }, + { + "epoch": 1.994347593264711, + "grad_norm": 0.3795900323777344, + "learning_rate": 2.1401809972509957e-10, + "loss": 0.2865, + "step": 8379 + }, + { + "epoch": 1.994585589337776, + "grad_norm": 0.3930027956116569, + "learning_rate": 1.965549316995663e-10, + "loss": 0.3304, + "step": 8380 + }, + { + "epoch": 1.9948235854108407, + "grad_norm": 0.37631234555304693, + "learning_rate": 1.7983485815553646e-10, + "loss": 0.3261, + "step": 8381 + }, + { + "epoch": 1.9950615814839057, + "grad_norm": 0.4000269616966013, + "learning_rate": 1.6385788157713413e-10, + "loss": 0.2903, + "step": 8382 + }, + { + "epoch": 1.9952995775569704, + "grad_norm": 0.3941543276970109, + "learning_rate": 1.486240043396814e-10, + "loss": 0.3109, + "step": 8383 + }, + { + "epoch": 1.9955375736300351, + "grad_norm": 0.36557779503672044, + "learning_rate": 1.3413322870692304e-10, + "loss": 0.3941, + "step": 8384 + }, + { + "epoch": 1.9957755697030999, + "grad_norm": 0.3717543720800664, + "learning_rate": 1.203855568326917e-10, + "loss": 0.3063, + "step": 8385 + }, + { + "epoch": 1.9960135657761646, + "grad_norm": 0.37253382680661573, + "learning_rate": 1.0738099076035291e-10, + "loss": 0.3121, + "step": 8386 + }, + { + "epoch": 1.9962515618492294, + "grad_norm": 0.3692513186862969, + "learning_rate": 9.511953242280492e-11, + "loss": 0.3249, + "step": 8387 + }, + { + "epoch": 1.9964895579222943, + "grad_norm": 0.42548108491286396, + "learning_rate": 8.360118364192371e-11, + "loss": 0.3672, + "step": 8388 + }, + { + "epoch": 1.996727553995359, + "grad_norm": 0.381706175323407, + "learning_rate": 7.28259461296732e-11, + "loss": 0.2803, + "step": 8389 + }, + { + "epoch": 1.996965550068424, + "grad_norm": 0.3901616736475803, + "learning_rate": 6.27938214881052e-11, + "loss": 0.3152, + "step": 8390 + }, + { + "epoch": 1.9972035461414888, + "grad_norm": 0.36504854093795047, + "learning_rate": 5.350481120769413e-11, + "loss": 0.3544, + "step": 8391 + }, + { + "epoch": 1.9974415422145535, + "grad_norm": 0.37140042079834173, + "learning_rate": 4.4958916669002315e-11, + "loss": 0.3046, + "step": 8392 + }, + { + "epoch": 1.9976795382876182, + "grad_norm": 0.42614737328351837, + "learning_rate": 3.7156139142680014e-11, + "loss": 0.2803, + "step": 8393 + }, + { + "epoch": 1.997917534360683, + "grad_norm": 0.4103735541875691, + "learning_rate": 3.009647978780006e-11, + "loss": 0.3432, + "step": 8394 + }, + { + "epoch": 1.9981555304337477, + "grad_norm": 0.4593503351998582, + "learning_rate": 2.377993965407832e-11, + "loss": 0.3484, + "step": 8395 + }, + { + "epoch": 1.9983935265068127, + "grad_norm": 0.38126825431046657, + "learning_rate": 1.8206519680208368e-11, + "loss": 0.2837, + "step": 8396 + }, + { + "epoch": 1.9986315225798774, + "grad_norm": 0.3896583084295616, + "learning_rate": 1.3376220694416575e-11, + "loss": 0.2981, + "step": 8397 + }, + { + "epoch": 1.9988695186529424, + "grad_norm": 0.3774261304077494, + "learning_rate": 9.289043414462128e-12, + "loss": 0.3539, + "step": 8398 + }, + { + "epoch": 1.9991075147260071, + "grad_norm": 0.3547871077621748, + "learning_rate": 5.944988447637024e-12, + "loss": 0.3137, + "step": 8399 + }, + { + "epoch": 1.9993455107990719, + "grad_norm": 0.37445511560704847, + "learning_rate": 3.3440562918762855e-12, + "loss": 0.2957, + "step": 8400 + }, + { + "epoch": 1.9995835068721366, + "grad_norm": 0.37305187911281695, + "learning_rate": 1.4862473329824156e-12, + "loss": 0.3171, + "step": 8401 + }, + { + "epoch": 1.9998215029452013, + "grad_norm": 0.3725046051286145, + "learning_rate": 3.7156184684583597e-13, + "loss": 0.3945, + "step": 8402 + }, + { + "epoch": 1.9998215029452013, + "step": 8402, + "total_flos": 1.2470400040881357e+17, + "train_loss": 0.3506650565916145, + "train_runtime": 79311.4461, + "train_samples_per_second": 108.495, + "train_steps_per_second": 0.106 + } + ], + "logging_steps": 1.0, + "max_steps": 8402, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.2470400040881357e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}