diff --git "a/checkpoint-1044/trainer_state.json" "b/checkpoint-1044/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1044/trainer_state.json" @@ -0,0 +1,7358 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6010362694300518, + "eval_steps": 869, + "global_step": 1044, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005757052389176742, + "grad_norm": 2.557003974914551, + "learning_rate": 0.0, + "loss": 5.4277, + "step": 1 + }, + { + "epoch": 0.0005757052389176742, + "eval_loss": 5.319709300994873, + "eval_runtime": 1026.7022, + "eval_samples_per_second": 2.496, + "eval_steps_per_second": 2.496, + "step": 1 + }, + { + "epoch": 0.0011514104778353484, + "grad_norm": 2.985229969024658, + "learning_rate": 4.0000000000000003e-07, + "loss": 5.7019, + "step": 2 + }, + { + "epoch": 0.0017271157167530224, + "grad_norm": 3.0353081226348877, + "learning_rate": 8.000000000000001e-07, + "loss": 6.1934, + "step": 3 + }, + { + "epoch": 0.002302820955670697, + "grad_norm": 3.724905490875244, + "learning_rate": 1.2000000000000002e-06, + "loss": 5.4617, + "step": 4 + }, + { + "epoch": 0.0028785261945883708, + "grad_norm": 2.6505627632141113, + "learning_rate": 1.6000000000000001e-06, + "loss": 5.4285, + "step": 5 + }, + { + "epoch": 0.0034542314335060447, + "grad_norm": 2.7363409996032715, + "learning_rate": 2.0000000000000003e-06, + "loss": 5.8634, + "step": 6 + }, + { + "epoch": 0.004029936672423719, + "grad_norm": 3.082538366317749, + "learning_rate": 2.4000000000000003e-06, + "loss": 4.7461, + "step": 7 + }, + { + "epoch": 0.004605641911341394, + "grad_norm": 9.095250129699707, + "learning_rate": 2.8000000000000003e-06, + "loss": 7.5703, + "step": 8 + }, + { + "epoch": 0.0051813471502590676, + "grad_norm": 2.2597923278808594, + "learning_rate": 3.2000000000000003e-06, + "loss": 5.3631, + "step": 9 + }, + { + "epoch": 0.0057570523891767415, + "grad_norm": 5.053525924682617, + "learning_rate": 3.6e-06, + "loss": 6.0132, + "step": 10 + }, + { + "epoch": 0.0063327576280944155, + "grad_norm": 2.7407820224761963, + "learning_rate": 4.000000000000001e-06, + "loss": 5.9776, + "step": 11 + }, + { + "epoch": 0.0069084628670120895, + "grad_norm": 2.4892263412475586, + "learning_rate": 4.4e-06, + "loss": 5.524, + "step": 12 + }, + { + "epoch": 0.007484168105929764, + "grad_norm": 2.5302274227142334, + "learning_rate": 4.800000000000001e-06, + "loss": 5.8044, + "step": 13 + }, + { + "epoch": 0.008059873344847437, + "grad_norm": 2.992504358291626, + "learning_rate": 5.2e-06, + "loss": 6.0307, + "step": 14 + }, + { + "epoch": 0.008635578583765112, + "grad_norm": 4.081608295440674, + "learning_rate": 5.600000000000001e-06, + "loss": 4.6732, + "step": 15 + }, + { + "epoch": 0.009211283822682787, + "grad_norm": 2.33296799659729, + "learning_rate": 6e-06, + "loss": 4.6356, + "step": 16 + }, + { + "epoch": 0.00978698906160046, + "grad_norm": 2.798452854156494, + "learning_rate": 6.4000000000000006e-06, + "loss": 5.2941, + "step": 17 + }, + { + "epoch": 0.010362694300518135, + "grad_norm": 2.290029525756836, + "learning_rate": 6.800000000000001e-06, + "loss": 4.9405, + "step": 18 + }, + { + "epoch": 0.010938399539435808, + "grad_norm": 3.2164740562438965, + "learning_rate": 7.2e-06, + "loss": 5.6711, + "step": 19 + }, + { + "epoch": 0.011514104778353483, + "grad_norm": 2.4481987953186035, + "learning_rate": 7.6e-06, + "loss": 5.0366, + "step": 20 + }, + { + "epoch": 0.012089810017271158, + "grad_norm": 3.398063898086548, + "learning_rate": 8.000000000000001e-06, + "loss": 5.9377, + "step": 21 + }, + { + "epoch": 0.012665515256188831, + "grad_norm": 2.3936686515808105, + "learning_rate": 8.400000000000001e-06, + "loss": 5.4237, + "step": 22 + }, + { + "epoch": 0.013241220495106506, + "grad_norm": 2.7233810424804688, + "learning_rate": 8.8e-06, + "loss": 5.6551, + "step": 23 + }, + { + "epoch": 0.013816925734024179, + "grad_norm": 2.9957566261291504, + "learning_rate": 9.2e-06, + "loss": 4.7701, + "step": 24 + }, + { + "epoch": 0.014392630972941854, + "grad_norm": 6.397132396697998, + "learning_rate": 9.600000000000001e-06, + "loss": 6.4459, + "step": 25 + }, + { + "epoch": 0.014968336211859529, + "grad_norm": 3.0593409538269043, + "learning_rate": 1e-05, + "loss": 5.2758, + "step": 26 + }, + { + "epoch": 0.015544041450777202, + "grad_norm": 2.9723803997039795, + "learning_rate": 1.04e-05, + "loss": 5.6136, + "step": 27 + }, + { + "epoch": 0.016119746689694875, + "grad_norm": 2.03314471244812, + "learning_rate": 1.08e-05, + "loss": 5.3556, + "step": 28 + }, + { + "epoch": 0.01669545192861255, + "grad_norm": 1.777107834815979, + "learning_rate": 1.1200000000000001e-05, + "loss": 5.1061, + "step": 29 + }, + { + "epoch": 0.017271157167530225, + "grad_norm": 3.2192044258117676, + "learning_rate": 1.16e-05, + "loss": 5.2414, + "step": 30 + }, + { + "epoch": 0.017846862406447898, + "grad_norm": 3.924452066421509, + "learning_rate": 1.2e-05, + "loss": 5.2754, + "step": 31 + }, + { + "epoch": 0.018422567645365574, + "grad_norm": 3.5611093044281006, + "learning_rate": 1.24e-05, + "loss": 5.2817, + "step": 32 + }, + { + "epoch": 0.018998272884283247, + "grad_norm": 2.5194263458251953, + "learning_rate": 1.2800000000000001e-05, + "loss": 5.9063, + "step": 33 + }, + { + "epoch": 0.01957397812320092, + "grad_norm": 2.403895854949951, + "learning_rate": 1.32e-05, + "loss": 5.1161, + "step": 34 + }, + { + "epoch": 0.020149683362118594, + "grad_norm": 2.496400833129883, + "learning_rate": 1.3600000000000002e-05, + "loss": 5.3049, + "step": 35 + }, + { + "epoch": 0.02072538860103627, + "grad_norm": 3.0970828533172607, + "learning_rate": 1.4000000000000001e-05, + "loss": 5.5807, + "step": 36 + }, + { + "epoch": 0.021301093839953943, + "grad_norm": 3.941403388977051, + "learning_rate": 1.44e-05, + "loss": 6.0418, + "step": 37 + }, + { + "epoch": 0.021876799078871616, + "grad_norm": 2.291431188583374, + "learning_rate": 1.48e-05, + "loss": 4.3686, + "step": 38 + }, + { + "epoch": 0.022452504317789293, + "grad_norm": 2.783054828643799, + "learning_rate": 1.52e-05, + "loss": 5.15, + "step": 39 + }, + { + "epoch": 0.023028209556706966, + "grad_norm": 3.579267978668213, + "learning_rate": 1.56e-05, + "loss": 5.7507, + "step": 40 + }, + { + "epoch": 0.02360391479562464, + "grad_norm": 3.5277323722839355, + "learning_rate": 1.6000000000000003e-05, + "loss": 6.112, + "step": 41 + }, + { + "epoch": 0.024179620034542316, + "grad_norm": 2.5100817680358887, + "learning_rate": 1.6400000000000002e-05, + "loss": 5.2133, + "step": 42 + }, + { + "epoch": 0.02475532527345999, + "grad_norm": 2.3821561336517334, + "learning_rate": 1.6800000000000002e-05, + "loss": 6.0345, + "step": 43 + }, + { + "epoch": 0.025331030512377662, + "grad_norm": 3.0675108432769775, + "learning_rate": 1.7199999999999998e-05, + "loss": 5.2294, + "step": 44 + }, + { + "epoch": 0.025906735751295335, + "grad_norm": 2.8790383338928223, + "learning_rate": 1.76e-05, + "loss": 5.6393, + "step": 45 + }, + { + "epoch": 0.02648244099021301, + "grad_norm": 3.3649141788482666, + "learning_rate": 1.8e-05, + "loss": 6.014, + "step": 46 + }, + { + "epoch": 0.027058146229130685, + "grad_norm": 3.4695286750793457, + "learning_rate": 1.84e-05, + "loss": 5.3457, + "step": 47 + }, + { + "epoch": 0.027633851468048358, + "grad_norm": 3.303622245788574, + "learning_rate": 1.88e-05, + "loss": 5.593, + "step": 48 + }, + { + "epoch": 0.028209556706966035, + "grad_norm": 2.481895923614502, + "learning_rate": 1.9200000000000003e-05, + "loss": 5.1439, + "step": 49 + }, + { + "epoch": 0.028785261945883708, + "grad_norm": 2.888579845428467, + "learning_rate": 1.9600000000000002e-05, + "loss": 4.6318, + "step": 50 + }, + { + "epoch": 0.02936096718480138, + "grad_norm": 3.4528300762176514, + "learning_rate": 2e-05, + "loss": 5.0376, + "step": 51 + }, + { + "epoch": 0.029936672423719057, + "grad_norm": 3.6751370429992676, + "learning_rate": 2.04e-05, + "loss": 4.9183, + "step": 52 + }, + { + "epoch": 0.03051237766263673, + "grad_norm": 3.382035970687866, + "learning_rate": 2.08e-05, + "loss": 5.499, + "step": 53 + }, + { + "epoch": 0.031088082901554404, + "grad_norm": 2.8802406787872314, + "learning_rate": 2.12e-05, + "loss": 5.3177, + "step": 54 + }, + { + "epoch": 0.03166378814047208, + "grad_norm": 6.158539772033691, + "learning_rate": 2.16e-05, + "loss": 6.2133, + "step": 55 + }, + { + "epoch": 0.03223949337938975, + "grad_norm": 2.599864959716797, + "learning_rate": 2.2000000000000003e-05, + "loss": 5.3691, + "step": 56 + }, + { + "epoch": 0.03281519861830743, + "grad_norm": 3.4526188373565674, + "learning_rate": 2.2400000000000002e-05, + "loss": 5.3801, + "step": 57 + }, + { + "epoch": 0.0333909038572251, + "grad_norm": 9.494807243347168, + "learning_rate": 2.2800000000000002e-05, + "loss": 7.3116, + "step": 58 + }, + { + "epoch": 0.033966609096142776, + "grad_norm": 4.3456130027771, + "learning_rate": 2.32e-05, + "loss": 4.7467, + "step": 59 + }, + { + "epoch": 0.03454231433506045, + "grad_norm": 3.8471431732177734, + "learning_rate": 2.36e-05, + "loss": 5.2742, + "step": 60 + }, + { + "epoch": 0.03511801957397812, + "grad_norm": 3.985994815826416, + "learning_rate": 2.4e-05, + "loss": 5.4615, + "step": 61 + }, + { + "epoch": 0.035693724812895795, + "grad_norm": 9.588626861572266, + "learning_rate": 2.44e-05, + "loss": 6.8261, + "step": 62 + }, + { + "epoch": 0.03626943005181347, + "grad_norm": 5.3343915939331055, + "learning_rate": 2.48e-05, + "loss": 6.0899, + "step": 63 + }, + { + "epoch": 0.03684513529073115, + "grad_norm": 5.611617088317871, + "learning_rate": 2.5200000000000003e-05, + "loss": 6.4523, + "step": 64 + }, + { + "epoch": 0.03742084052964882, + "grad_norm": 4.497012615203857, + "learning_rate": 2.5600000000000002e-05, + "loss": 4.787, + "step": 65 + }, + { + "epoch": 0.037996545768566495, + "grad_norm": 5.032821178436279, + "learning_rate": 2.6000000000000002e-05, + "loss": 5.6337, + "step": 66 + }, + { + "epoch": 0.03857225100748417, + "grad_norm": 3.732733726501465, + "learning_rate": 2.64e-05, + "loss": 5.5212, + "step": 67 + }, + { + "epoch": 0.03914795624640184, + "grad_norm": 4.3597517013549805, + "learning_rate": 2.6800000000000004e-05, + "loss": 4.647, + "step": 68 + }, + { + "epoch": 0.039723661485319514, + "grad_norm": 5.359225273132324, + "learning_rate": 2.7200000000000004e-05, + "loss": 5.7052, + "step": 69 + }, + { + "epoch": 0.04029936672423719, + "grad_norm": 4.9161601066589355, + "learning_rate": 2.7600000000000003e-05, + "loss": 5.3191, + "step": 70 + }, + { + "epoch": 0.04087507196315487, + "grad_norm": 4.137385368347168, + "learning_rate": 2.8000000000000003e-05, + "loss": 5.1797, + "step": 71 + }, + { + "epoch": 0.04145077720207254, + "grad_norm": 4.728359699249268, + "learning_rate": 2.84e-05, + "loss": 5.1125, + "step": 72 + }, + { + "epoch": 0.042026482440990214, + "grad_norm": 4.568793773651123, + "learning_rate": 2.88e-05, + "loss": 5.7705, + "step": 73 + }, + { + "epoch": 0.04260218767990789, + "grad_norm": 4.931026935577393, + "learning_rate": 2.9199999999999998e-05, + "loss": 5.1052, + "step": 74 + }, + { + "epoch": 0.04317789291882556, + "grad_norm": 4.697461128234863, + "learning_rate": 2.96e-05, + "loss": 5.1404, + "step": 75 + }, + { + "epoch": 0.04375359815774323, + "grad_norm": 6.393320083618164, + "learning_rate": 3e-05, + "loss": 6.2212, + "step": 76 + }, + { + "epoch": 0.04432930339666091, + "grad_norm": 5.876922607421875, + "learning_rate": 3.04e-05, + "loss": 5.7775, + "step": 77 + }, + { + "epoch": 0.044905008635578586, + "grad_norm": 4.749701499938965, + "learning_rate": 3.08e-05, + "loss": 4.7321, + "step": 78 + }, + { + "epoch": 0.04548071387449626, + "grad_norm": 4.894115447998047, + "learning_rate": 3.12e-05, + "loss": 5.2017, + "step": 79 + }, + { + "epoch": 0.04605641911341393, + "grad_norm": 5.125804424285889, + "learning_rate": 3.16e-05, + "loss": 5.1661, + "step": 80 + }, + { + "epoch": 0.046632124352331605, + "grad_norm": 7.571075439453125, + "learning_rate": 3.2000000000000005e-05, + "loss": 6.1439, + "step": 81 + }, + { + "epoch": 0.04720782959124928, + "grad_norm": 4.469061374664307, + "learning_rate": 3.24e-05, + "loss": 5.1732, + "step": 82 + }, + { + "epoch": 0.04778353483016695, + "grad_norm": 4.565371513366699, + "learning_rate": 3.2800000000000004e-05, + "loss": 5.4892, + "step": 83 + }, + { + "epoch": 0.04835924006908463, + "grad_norm": 5.844489097595215, + "learning_rate": 3.32e-05, + "loss": 5.875, + "step": 84 + }, + { + "epoch": 0.048934945308002305, + "grad_norm": 10.564720153808594, + "learning_rate": 3.3600000000000004e-05, + "loss": 5.9008, + "step": 85 + }, + { + "epoch": 0.04951065054691998, + "grad_norm": 6.923472881317139, + "learning_rate": 3.4000000000000007e-05, + "loss": 5.4949, + "step": 86 + }, + { + "epoch": 0.05008635578583765, + "grad_norm": 6.902386665344238, + "learning_rate": 3.4399999999999996e-05, + "loss": 4.9801, + "step": 87 + }, + { + "epoch": 0.050662061024755324, + "grad_norm": 8.239148139953613, + "learning_rate": 3.48e-05, + "loss": 5.6578, + "step": 88 + }, + { + "epoch": 0.051237766263673, + "grad_norm": 6.162630081176758, + "learning_rate": 3.52e-05, + "loss": 4.9911, + "step": 89 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 7.2612433433532715, + "learning_rate": 3.56e-05, + "loss": 5.7976, + "step": 90 + }, + { + "epoch": 0.05238917674150835, + "grad_norm": 6.149419784545898, + "learning_rate": 3.6e-05, + "loss": 4.9756, + "step": 91 + }, + { + "epoch": 0.05296488198042602, + "grad_norm": 7.4116106033325195, + "learning_rate": 3.6400000000000004e-05, + "loss": 5.5805, + "step": 92 + }, + { + "epoch": 0.0535405872193437, + "grad_norm": 5.512300491333008, + "learning_rate": 3.68e-05, + "loss": 4.5575, + "step": 93 + }, + { + "epoch": 0.05411629245826137, + "grad_norm": 14.799551963806152, + "learning_rate": 3.72e-05, + "loss": 5.2244, + "step": 94 + }, + { + "epoch": 0.05469199769717904, + "grad_norm": 9.756938934326172, + "learning_rate": 3.76e-05, + "loss": 4.8444, + "step": 95 + }, + { + "epoch": 0.055267702936096716, + "grad_norm": 6.400147914886475, + "learning_rate": 3.8e-05, + "loss": 5.5091, + "step": 96 + }, + { + "epoch": 0.055843408175014396, + "grad_norm": 8.406181335449219, + "learning_rate": 3.8400000000000005e-05, + "loss": 5.2641, + "step": 97 + }, + { + "epoch": 0.05641911341393207, + "grad_norm": 6.860042572021484, + "learning_rate": 3.88e-05, + "loss": 5.2917, + "step": 98 + }, + { + "epoch": 0.05699481865284974, + "grad_norm": 7.542653560638428, + "learning_rate": 3.9200000000000004e-05, + "loss": 5.1584, + "step": 99 + }, + { + "epoch": 0.057570523891767415, + "grad_norm": 8.149137496948242, + "learning_rate": 3.960000000000001e-05, + "loss": 5.5326, + "step": 100 + }, + { + "epoch": 0.05814622913068509, + "grad_norm": 5.590121269226074, + "learning_rate": 4e-05, + "loss": 5.2789, + "step": 101 + }, + { + "epoch": 0.05872193436960276, + "grad_norm": 7.877676010131836, + "learning_rate": 4.0400000000000006e-05, + "loss": 4.8526, + "step": 102 + }, + { + "epoch": 0.059297639608520435, + "grad_norm": 5.773808479309082, + "learning_rate": 4.08e-05, + "loss": 5.033, + "step": 103 + }, + { + "epoch": 0.059873344847438115, + "grad_norm": 6.092824935913086, + "learning_rate": 4.12e-05, + "loss": 4.8936, + "step": 104 + }, + { + "epoch": 0.06044905008635579, + "grad_norm": 5.934675693511963, + "learning_rate": 4.16e-05, + "loss": 4.4764, + "step": 105 + }, + { + "epoch": 0.06102475532527346, + "grad_norm": 5.622652530670166, + "learning_rate": 4.2e-05, + "loss": 5.1344, + "step": 106 + }, + { + "epoch": 0.061600460564191134, + "grad_norm": 7.697418212890625, + "learning_rate": 4.24e-05, + "loss": 5.2087, + "step": 107 + }, + { + "epoch": 0.06217616580310881, + "grad_norm": 5.204082489013672, + "learning_rate": 4.2800000000000004e-05, + "loss": 4.6294, + "step": 108 + }, + { + "epoch": 0.06275187104202648, + "grad_norm": 6.288537979125977, + "learning_rate": 4.32e-05, + "loss": 5.3009, + "step": 109 + }, + { + "epoch": 0.06332757628094415, + "grad_norm": 6.717288017272949, + "learning_rate": 4.36e-05, + "loss": 5.5392, + "step": 110 + }, + { + "epoch": 0.06390328151986183, + "grad_norm": 5.432399272918701, + "learning_rate": 4.4000000000000006e-05, + "loss": 4.3602, + "step": 111 + }, + { + "epoch": 0.0644789867587795, + "grad_norm": 6.823062896728516, + "learning_rate": 4.44e-05, + "loss": 5.7343, + "step": 112 + }, + { + "epoch": 0.06505469199769717, + "grad_norm": 6.532074928283691, + "learning_rate": 4.4800000000000005e-05, + "loss": 5.0605, + "step": 113 + }, + { + "epoch": 0.06563039723661486, + "grad_norm": 5.982126712799072, + "learning_rate": 4.52e-05, + "loss": 5.2182, + "step": 114 + }, + { + "epoch": 0.06620610247553253, + "grad_norm": 5.759943962097168, + "learning_rate": 4.5600000000000004e-05, + "loss": 4.9098, + "step": 115 + }, + { + "epoch": 0.0667818077144502, + "grad_norm": 5.147834300994873, + "learning_rate": 4.600000000000001e-05, + "loss": 4.8671, + "step": 116 + }, + { + "epoch": 0.06735751295336788, + "grad_norm": 8.015042304992676, + "learning_rate": 4.64e-05, + "loss": 5.7445, + "step": 117 + }, + { + "epoch": 0.06793321819228555, + "grad_norm": 7.161843299865723, + "learning_rate": 4.6800000000000006e-05, + "loss": 5.9092, + "step": 118 + }, + { + "epoch": 0.06850892343120323, + "grad_norm": 9.394163131713867, + "learning_rate": 4.72e-05, + "loss": 4.7243, + "step": 119 + }, + { + "epoch": 0.0690846286701209, + "grad_norm": 4.96219539642334, + "learning_rate": 4.76e-05, + "loss": 4.7233, + "step": 120 + }, + { + "epoch": 0.06966033390903857, + "grad_norm": 6.473387241363525, + "learning_rate": 4.8e-05, + "loss": 5.1295, + "step": 121 + }, + { + "epoch": 0.07023603914795624, + "grad_norm": 6.797422885894775, + "learning_rate": 4.8400000000000004e-05, + "loss": 4.7697, + "step": 122 + }, + { + "epoch": 0.07081174438687392, + "grad_norm": 6.656020641326904, + "learning_rate": 4.88e-05, + "loss": 5.2377, + "step": 123 + }, + { + "epoch": 0.07138744962579159, + "grad_norm": 5.552718639373779, + "learning_rate": 4.92e-05, + "loss": 4.4741, + "step": 124 + }, + { + "epoch": 0.07196315486470926, + "grad_norm": 6.101820468902588, + "learning_rate": 4.96e-05, + "loss": 4.4192, + "step": 125 + }, + { + "epoch": 0.07253886010362694, + "grad_norm": 7.695935249328613, + "learning_rate": 5e-05, + "loss": 5.4128, + "step": 126 + }, + { + "epoch": 0.07311456534254462, + "grad_norm": 6.9946208000183105, + "learning_rate": 5.0400000000000005e-05, + "loss": 5.4829, + "step": 127 + }, + { + "epoch": 0.0736902705814623, + "grad_norm": 16.10480308532715, + "learning_rate": 5.08e-05, + "loss": 4.6945, + "step": 128 + }, + { + "epoch": 0.07426597582037997, + "grad_norm": 5.313148021697998, + "learning_rate": 5.1200000000000004e-05, + "loss": 4.2429, + "step": 129 + }, + { + "epoch": 0.07484168105929764, + "grad_norm": 5.506260871887207, + "learning_rate": 5.16e-05, + "loss": 4.7241, + "step": 130 + }, + { + "epoch": 0.07541738629821532, + "grad_norm": 5.655925273895264, + "learning_rate": 5.2000000000000004e-05, + "loss": 5.4156, + "step": 131 + }, + { + "epoch": 0.07599309153713299, + "grad_norm": 6.528857231140137, + "learning_rate": 5.2400000000000007e-05, + "loss": 5.3606, + "step": 132 + }, + { + "epoch": 0.07656879677605066, + "grad_norm": 5.360299110412598, + "learning_rate": 5.28e-05, + "loss": 5.0686, + "step": 133 + }, + { + "epoch": 0.07714450201496834, + "grad_norm": 5.301785945892334, + "learning_rate": 5.3200000000000006e-05, + "loss": 4.845, + "step": 134 + }, + { + "epoch": 0.07772020725388601, + "grad_norm": 4.986385345458984, + "learning_rate": 5.360000000000001e-05, + "loss": 5.1493, + "step": 135 + }, + { + "epoch": 0.07829591249280368, + "grad_norm": 5.200460433959961, + "learning_rate": 5.4000000000000005e-05, + "loss": 4.781, + "step": 136 + }, + { + "epoch": 0.07887161773172136, + "grad_norm": 7.154032230377197, + "learning_rate": 5.440000000000001e-05, + "loss": 5.8801, + "step": 137 + }, + { + "epoch": 0.07944732297063903, + "grad_norm": 4.641168117523193, + "learning_rate": 5.4800000000000004e-05, + "loss": 5.1929, + "step": 138 + }, + { + "epoch": 0.0800230282095567, + "grad_norm": 4.8809123039245605, + "learning_rate": 5.520000000000001e-05, + "loss": 5.0221, + "step": 139 + }, + { + "epoch": 0.08059873344847437, + "grad_norm": 5.0507402420043945, + "learning_rate": 5.560000000000001e-05, + "loss": 4.8543, + "step": 140 + }, + { + "epoch": 0.08117443868739206, + "grad_norm": 6.459733963012695, + "learning_rate": 5.6000000000000006e-05, + "loss": 5.051, + "step": 141 + }, + { + "epoch": 0.08175014392630973, + "grad_norm": 6.107847690582275, + "learning_rate": 5.6399999999999995e-05, + "loss": 4.8338, + "step": 142 + }, + { + "epoch": 0.08232584916522741, + "grad_norm": 6.28361701965332, + "learning_rate": 5.68e-05, + "loss": 5.1373, + "step": 143 + }, + { + "epoch": 0.08290155440414508, + "grad_norm": 4.957414627075195, + "learning_rate": 5.72e-05, + "loss": 4.8154, + "step": 144 + }, + { + "epoch": 0.08347725964306275, + "grad_norm": 4.774332046508789, + "learning_rate": 5.76e-05, + "loss": 4.7262, + "step": 145 + }, + { + "epoch": 0.08405296488198043, + "grad_norm": 7.41762113571167, + "learning_rate": 5.8e-05, + "loss": 5.5137, + "step": 146 + }, + { + "epoch": 0.0846286701208981, + "grad_norm": 7.484424591064453, + "learning_rate": 5.8399999999999997e-05, + "loss": 5.766, + "step": 147 + }, + { + "epoch": 0.08520437535981577, + "grad_norm": 4.917182922363281, + "learning_rate": 5.88e-05, + "loss": 5.0193, + "step": 148 + }, + { + "epoch": 0.08578008059873345, + "grad_norm": 4.608645915985107, + "learning_rate": 5.92e-05, + "loss": 5.0873, + "step": 149 + }, + { + "epoch": 0.08635578583765112, + "grad_norm": 6.5947794914245605, + "learning_rate": 5.96e-05, + "loss": 4.9855, + "step": 150 + }, + { + "epoch": 0.08693149107656879, + "grad_norm": 3.8302507400512695, + "learning_rate": 6e-05, + "loss": 3.7953, + "step": 151 + }, + { + "epoch": 0.08750719631548647, + "grad_norm": 3.6352171897888184, + "learning_rate": 6.04e-05, + "loss": 4.1647, + "step": 152 + }, + { + "epoch": 0.08808290155440414, + "grad_norm": 4.818563461303711, + "learning_rate": 6.08e-05, + "loss": 4.2128, + "step": 153 + }, + { + "epoch": 0.08865860679332183, + "grad_norm": 7.7323503494262695, + "learning_rate": 6.12e-05, + "loss": 5.4562, + "step": 154 + }, + { + "epoch": 0.0892343120322395, + "grad_norm": 5.785284996032715, + "learning_rate": 6.16e-05, + "loss": 4.8956, + "step": 155 + }, + { + "epoch": 0.08981001727115717, + "grad_norm": 6.181385040283203, + "learning_rate": 6.2e-05, + "loss": 5.2373, + "step": 156 + }, + { + "epoch": 0.09038572251007485, + "grad_norm": 6.015028476715088, + "learning_rate": 6.24e-05, + "loss": 4.3663, + "step": 157 + }, + { + "epoch": 0.09096142774899252, + "grad_norm": 4.41657829284668, + "learning_rate": 6.280000000000001e-05, + "loss": 4.5991, + "step": 158 + }, + { + "epoch": 0.09153713298791019, + "grad_norm": 6.5107622146606445, + "learning_rate": 6.32e-05, + "loss": 4.8784, + "step": 159 + }, + { + "epoch": 0.09211283822682786, + "grad_norm": 4.11070442199707, + "learning_rate": 6.36e-05, + "loss": 4.6766, + "step": 160 + }, + { + "epoch": 0.09268854346574554, + "grad_norm": 8.204343795776367, + "learning_rate": 6.400000000000001e-05, + "loss": 5.5088, + "step": 161 + }, + { + "epoch": 0.09326424870466321, + "grad_norm": 3.9389288425445557, + "learning_rate": 6.440000000000001e-05, + "loss": 4.3476, + "step": 162 + }, + { + "epoch": 0.09383995394358088, + "grad_norm": 5.597643852233887, + "learning_rate": 6.48e-05, + "loss": 4.9976, + "step": 163 + }, + { + "epoch": 0.09441565918249856, + "grad_norm": 8.994287490844727, + "learning_rate": 6.52e-05, + "loss": 5.5959, + "step": 164 + }, + { + "epoch": 0.09499136442141623, + "grad_norm": 5.60779333114624, + "learning_rate": 6.560000000000001e-05, + "loss": 4.6283, + "step": 165 + }, + { + "epoch": 0.0955670696603339, + "grad_norm": 4.319982528686523, + "learning_rate": 6.6e-05, + "loss": 4.041, + "step": 166 + }, + { + "epoch": 0.09614277489925158, + "grad_norm": 5.684337615966797, + "learning_rate": 6.64e-05, + "loss": 4.8941, + "step": 167 + }, + { + "epoch": 0.09671848013816926, + "grad_norm": 3.872518539428711, + "learning_rate": 6.680000000000001e-05, + "loss": 4.2242, + "step": 168 + }, + { + "epoch": 0.09729418537708694, + "grad_norm": 4.826557636260986, + "learning_rate": 6.720000000000001e-05, + "loss": 4.8546, + "step": 169 + }, + { + "epoch": 0.09786989061600461, + "grad_norm": 4.660156726837158, + "learning_rate": 6.76e-05, + "loss": 4.3797, + "step": 170 + }, + { + "epoch": 0.09844559585492228, + "grad_norm": 4.616059303283691, + "learning_rate": 6.800000000000001e-05, + "loss": 4.7293, + "step": 171 + }, + { + "epoch": 0.09902130109383996, + "grad_norm": 7.685507774353027, + "learning_rate": 6.840000000000001e-05, + "loss": 5.6251, + "step": 172 + }, + { + "epoch": 0.09959700633275763, + "grad_norm": 7.424576282501221, + "learning_rate": 6.879999999999999e-05, + "loss": 4.8253, + "step": 173 + }, + { + "epoch": 0.1001727115716753, + "grad_norm": 4.379521369934082, + "learning_rate": 6.92e-05, + "loss": 4.5287, + "step": 174 + }, + { + "epoch": 0.10074841681059298, + "grad_norm": 4.753964424133301, + "learning_rate": 6.96e-05, + "loss": 4.5554, + "step": 175 + }, + { + "epoch": 0.10132412204951065, + "grad_norm": 4.559609413146973, + "learning_rate": 7e-05, + "loss": 4.5615, + "step": 176 + }, + { + "epoch": 0.10189982728842832, + "grad_norm": 5.178406238555908, + "learning_rate": 7.04e-05, + "loss": 4.6344, + "step": 177 + }, + { + "epoch": 0.102475532527346, + "grad_norm": 7.4183526039123535, + "learning_rate": 7.08e-05, + "loss": 4.5451, + "step": 178 + }, + { + "epoch": 0.10305123776626367, + "grad_norm": 5.832037448883057, + "learning_rate": 7.12e-05, + "loss": 4.7097, + "step": 179 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 4.9681925773620605, + "learning_rate": 7.16e-05, + "loss": 4.6288, + "step": 180 + }, + { + "epoch": 0.10420264824409903, + "grad_norm": 4.886664867401123, + "learning_rate": 7.2e-05, + "loss": 4.7019, + "step": 181 + }, + { + "epoch": 0.1047783534830167, + "grad_norm": 4.668741226196289, + "learning_rate": 7.24e-05, + "loss": 4.4534, + "step": 182 + }, + { + "epoch": 0.10535405872193437, + "grad_norm": 7.459389686584473, + "learning_rate": 7.280000000000001e-05, + "loss": 5.4758, + "step": 183 + }, + { + "epoch": 0.10592976396085205, + "grad_norm": 31.545869827270508, + "learning_rate": 7.32e-05, + "loss": 6.179, + "step": 184 + }, + { + "epoch": 0.10650546919976972, + "grad_norm": 9.739182472229004, + "learning_rate": 7.36e-05, + "loss": 4.9662, + "step": 185 + }, + { + "epoch": 0.1070811744386874, + "grad_norm": 4.12076997756958, + "learning_rate": 7.4e-05, + "loss": 3.88, + "step": 186 + }, + { + "epoch": 0.10765687967760507, + "grad_norm": 5.808717727661133, + "learning_rate": 7.44e-05, + "loss": 4.6157, + "step": 187 + }, + { + "epoch": 0.10823258491652274, + "grad_norm": 3.6208741664886475, + "learning_rate": 7.48e-05, + "loss": 3.9156, + "step": 188 + }, + { + "epoch": 0.10880829015544041, + "grad_norm": 4.674955368041992, + "learning_rate": 7.52e-05, + "loss": 4.4751, + "step": 189 + }, + { + "epoch": 0.10938399539435809, + "grad_norm": 5.331599235534668, + "learning_rate": 7.560000000000001e-05, + "loss": 4.3887, + "step": 190 + }, + { + "epoch": 0.10995970063327576, + "grad_norm": 5.1405534744262695, + "learning_rate": 7.6e-05, + "loss": 4.9114, + "step": 191 + }, + { + "epoch": 0.11053540587219343, + "grad_norm": 3.7066593170166016, + "learning_rate": 7.64e-05, + "loss": 3.8948, + "step": 192 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 5.185431003570557, + "learning_rate": 7.680000000000001e-05, + "loss": 4.232, + "step": 193 + }, + { + "epoch": 0.11168681635002879, + "grad_norm": 4.900607585906982, + "learning_rate": 7.72e-05, + "loss": 4.667, + "step": 194 + }, + { + "epoch": 0.11226252158894647, + "grad_norm": 5.091091632843018, + "learning_rate": 7.76e-05, + "loss": 4.3946, + "step": 195 + }, + { + "epoch": 0.11283822682786414, + "grad_norm": 4.859619617462158, + "learning_rate": 7.800000000000001e-05, + "loss": 4.6306, + "step": 196 + }, + { + "epoch": 0.11341393206678181, + "grad_norm": 3.544200897216797, + "learning_rate": 7.840000000000001e-05, + "loss": 4.2118, + "step": 197 + }, + { + "epoch": 0.11398963730569948, + "grad_norm": 8.28862190246582, + "learning_rate": 7.88e-05, + "loss": 4.4431, + "step": 198 + }, + { + "epoch": 0.11456534254461716, + "grad_norm": 6.373688220977783, + "learning_rate": 7.920000000000001e-05, + "loss": 4.7554, + "step": 199 + }, + { + "epoch": 0.11514104778353483, + "grad_norm": 6.8544392585754395, + "learning_rate": 7.960000000000001e-05, + "loss": 4.8723, + "step": 200 + }, + { + "epoch": 0.1157167530224525, + "grad_norm": 7.207869052886963, + "learning_rate": 8e-05, + "loss": 4.1096, + "step": 201 + }, + { + "epoch": 0.11629245826137018, + "grad_norm": 4.9073333740234375, + "learning_rate": 8.04e-05, + "loss": 3.6834, + "step": 202 + }, + { + "epoch": 0.11686816350028785, + "grad_norm": 6.523554801940918, + "learning_rate": 8.080000000000001e-05, + "loss": 4.4934, + "step": 203 + }, + { + "epoch": 0.11744386873920552, + "grad_norm": 9.581537246704102, + "learning_rate": 8.120000000000001e-05, + "loss": 4.8199, + "step": 204 + }, + { + "epoch": 0.1180195739781232, + "grad_norm": 5.319664001464844, + "learning_rate": 8.16e-05, + "loss": 4.0881, + "step": 205 + }, + { + "epoch": 0.11859527921704087, + "grad_norm": 7.609442710876465, + "learning_rate": 8.2e-05, + "loss": 5.1011, + "step": 206 + }, + { + "epoch": 0.11917098445595854, + "grad_norm": 5.437283515930176, + "learning_rate": 8.24e-05, + "loss": 4.7683, + "step": 207 + }, + { + "epoch": 0.11974668969487623, + "grad_norm": 9.015962600708008, + "learning_rate": 8.28e-05, + "loss": 5.1197, + "step": 208 + }, + { + "epoch": 0.1203223949337939, + "grad_norm": 5.41486120223999, + "learning_rate": 8.32e-05, + "loss": 4.2228, + "step": 209 + }, + { + "epoch": 0.12089810017271158, + "grad_norm": 4.068630218505859, + "learning_rate": 8.36e-05, + "loss": 3.9683, + "step": 210 + }, + { + "epoch": 0.12147380541162925, + "grad_norm": 4.818974494934082, + "learning_rate": 8.4e-05, + "loss": 4.3969, + "step": 211 + }, + { + "epoch": 0.12204951065054692, + "grad_norm": 8.309637069702148, + "learning_rate": 8.44e-05, + "loss": 4.8983, + "step": 212 + }, + { + "epoch": 0.1226252158894646, + "grad_norm": 5.997379302978516, + "learning_rate": 8.48e-05, + "loss": 4.6983, + "step": 213 + }, + { + "epoch": 0.12320092112838227, + "grad_norm": 6.416568279266357, + "learning_rate": 8.52e-05, + "loss": 4.6, + "step": 214 + }, + { + "epoch": 0.12377662636729994, + "grad_norm": 5.038214206695557, + "learning_rate": 8.560000000000001e-05, + "loss": 4.1803, + "step": 215 + }, + { + "epoch": 0.12435233160621761, + "grad_norm": 5.035988807678223, + "learning_rate": 8.6e-05, + "loss": 4.1585, + "step": 216 + }, + { + "epoch": 0.12492803684513529, + "grad_norm": 6.7663726806640625, + "learning_rate": 8.64e-05, + "loss": 4.4256, + "step": 217 + }, + { + "epoch": 0.12550374208405296, + "grad_norm": 5.394269943237305, + "learning_rate": 8.680000000000001e-05, + "loss": 3.9008, + "step": 218 + }, + { + "epoch": 0.12607944732297063, + "grad_norm": 5.4501800537109375, + "learning_rate": 8.72e-05, + "loss": 3.9869, + "step": 219 + }, + { + "epoch": 0.1266551525618883, + "grad_norm": 4.7380170822143555, + "learning_rate": 8.76e-05, + "loss": 4.0876, + "step": 220 + }, + { + "epoch": 0.12723085780080598, + "grad_norm": 6.059116840362549, + "learning_rate": 8.800000000000001e-05, + "loss": 4.147, + "step": 221 + }, + { + "epoch": 0.12780656303972365, + "grad_norm": 5.5021586418151855, + "learning_rate": 8.840000000000001e-05, + "loss": 4.4547, + "step": 222 + }, + { + "epoch": 0.12838226827864133, + "grad_norm": 4.760106563568115, + "learning_rate": 8.88e-05, + "loss": 4.075, + "step": 223 + }, + { + "epoch": 0.128957973517559, + "grad_norm": 7.5847649574279785, + "learning_rate": 8.92e-05, + "loss": 4.6163, + "step": 224 + }, + { + "epoch": 0.12953367875647667, + "grad_norm": 6.257955074310303, + "learning_rate": 8.960000000000001e-05, + "loss": 4.6043, + "step": 225 + }, + { + "epoch": 0.13010938399539435, + "grad_norm": 7.368046283721924, + "learning_rate": 9e-05, + "loss": 4.7961, + "step": 226 + }, + { + "epoch": 0.13068508923431202, + "grad_norm": 4.385096549987793, + "learning_rate": 9.04e-05, + "loss": 4.1968, + "step": 227 + }, + { + "epoch": 0.13126079447322972, + "grad_norm": 6.34293794631958, + "learning_rate": 9.080000000000001e-05, + "loss": 4.3076, + "step": 228 + }, + { + "epoch": 0.1318364997121474, + "grad_norm": 6.403743267059326, + "learning_rate": 9.120000000000001e-05, + "loss": 3.8917, + "step": 229 + }, + { + "epoch": 0.13241220495106507, + "grad_norm": 6.792156219482422, + "learning_rate": 9.16e-05, + "loss": 3.9843, + "step": 230 + }, + { + "epoch": 0.13298791018998274, + "grad_norm": 8.062408447265625, + "learning_rate": 9.200000000000001e-05, + "loss": 4.2562, + "step": 231 + }, + { + "epoch": 0.1335636154289004, + "grad_norm": 8.513936042785645, + "learning_rate": 9.240000000000001e-05, + "loss": 4.6536, + "step": 232 + }, + { + "epoch": 0.13413932066781808, + "grad_norm": 5.92789363861084, + "learning_rate": 9.28e-05, + "loss": 4.104, + "step": 233 + }, + { + "epoch": 0.13471502590673576, + "grad_norm": 44.009300231933594, + "learning_rate": 9.320000000000002e-05, + "loss": 4.8297, + "step": 234 + }, + { + "epoch": 0.13529073114565343, + "grad_norm": 5.342921257019043, + "learning_rate": 9.360000000000001e-05, + "loss": 4.0662, + "step": 235 + }, + { + "epoch": 0.1358664363845711, + "grad_norm": 5.618771076202393, + "learning_rate": 9.4e-05, + "loss": 4.1692, + "step": 236 + }, + { + "epoch": 0.13644214162348878, + "grad_norm": 6.6655473709106445, + "learning_rate": 9.44e-05, + "loss": 4.2759, + "step": 237 + }, + { + "epoch": 0.13701784686240645, + "grad_norm": 6.415508270263672, + "learning_rate": 9.48e-05, + "loss": 4.025, + "step": 238 + }, + { + "epoch": 0.13759355210132412, + "grad_norm": 62.65280532836914, + "learning_rate": 9.52e-05, + "loss": 5.3187, + "step": 239 + }, + { + "epoch": 0.1381692573402418, + "grad_norm": 5.9870147705078125, + "learning_rate": 9.56e-05, + "loss": 4.3549, + "step": 240 + }, + { + "epoch": 0.13874496257915947, + "grad_norm": 6.323814868927002, + "learning_rate": 9.6e-05, + "loss": 4.0618, + "step": 241 + }, + { + "epoch": 0.13932066781807714, + "grad_norm": 7.25873327255249, + "learning_rate": 9.64e-05, + "loss": 4.6113, + "step": 242 + }, + { + "epoch": 0.13989637305699482, + "grad_norm": 6.708962440490723, + "learning_rate": 9.680000000000001e-05, + "loss": 4.2734, + "step": 243 + }, + { + "epoch": 0.1404720782959125, + "grad_norm": 6.766256332397461, + "learning_rate": 9.72e-05, + "loss": 3.8169, + "step": 244 + }, + { + "epoch": 0.14104778353483016, + "grad_norm": 9.25779914855957, + "learning_rate": 9.76e-05, + "loss": 4.0823, + "step": 245 + }, + { + "epoch": 0.14162348877374784, + "grad_norm": 6.24402379989624, + "learning_rate": 9.8e-05, + "loss": 3.9761, + "step": 246 + }, + { + "epoch": 0.1421991940126655, + "grad_norm": 4.627258777618408, + "learning_rate": 9.84e-05, + "loss": 3.3376, + "step": 247 + }, + { + "epoch": 0.14277489925158318, + "grad_norm": 6.5364766120910645, + "learning_rate": 9.88e-05, + "loss": 3.9101, + "step": 248 + }, + { + "epoch": 0.14335060449050085, + "grad_norm": 6.722381591796875, + "learning_rate": 9.92e-05, + "loss": 4.2916, + "step": 249 + }, + { + "epoch": 0.14392630972941853, + "grad_norm": 7.2800493240356445, + "learning_rate": 9.960000000000001e-05, + "loss": 4.1714, + "step": 250 + }, + { + "epoch": 0.1445020149683362, + "grad_norm": 9.137832641601562, + "learning_rate": 0.0001, + "loss": 3.9733, + "step": 251 + }, + { + "epoch": 0.14507772020725387, + "grad_norm": 5.290084362030029, + "learning_rate": 0.0001004, + "loss": 3.8465, + "step": 252 + }, + { + "epoch": 0.14565342544617155, + "grad_norm": 7.146475791931152, + "learning_rate": 0.00010080000000000001, + "loss": 4.154, + "step": 253 + }, + { + "epoch": 0.14622913068508925, + "grad_norm": 5.462000370025635, + "learning_rate": 0.00010120000000000001, + "loss": 3.8403, + "step": 254 + }, + { + "epoch": 0.14680483592400692, + "grad_norm": 8.053996086120605, + "learning_rate": 0.0001016, + "loss": 4.224, + "step": 255 + }, + { + "epoch": 0.1473805411629246, + "grad_norm": 56.904518127441406, + "learning_rate": 0.00010200000000000001, + "loss": 5.3512, + "step": 256 + }, + { + "epoch": 0.14795624640184227, + "grad_norm": 67.7396469116211, + "learning_rate": 0.00010240000000000001, + "loss": 4.136, + "step": 257 + }, + { + "epoch": 0.14853195164075994, + "grad_norm": 5.19423770904541, + "learning_rate": 0.0001028, + "loss": 3.6272, + "step": 258 + }, + { + "epoch": 0.1491076568796776, + "grad_norm": 6.946446418762207, + "learning_rate": 0.0001032, + "loss": 3.7617, + "step": 259 + }, + { + "epoch": 0.1496833621185953, + "grad_norm": 6.839754104614258, + "learning_rate": 0.00010360000000000001, + "loss": 4.2895, + "step": 260 + }, + { + "epoch": 0.15025906735751296, + "grad_norm": 7.3253254890441895, + "learning_rate": 0.00010400000000000001, + "loss": 4.0997, + "step": 261 + }, + { + "epoch": 0.15083477259643063, + "grad_norm": 6.981521129608154, + "learning_rate": 0.0001044, + "loss": 3.4663, + "step": 262 + }, + { + "epoch": 0.1514104778353483, + "grad_norm": 6.424066543579102, + "learning_rate": 0.00010480000000000001, + "loss": 4.0914, + "step": 263 + }, + { + "epoch": 0.15198618307426598, + "grad_norm": 6.7790398597717285, + "learning_rate": 0.00010520000000000001, + "loss": 4.0818, + "step": 264 + }, + { + "epoch": 0.15256188831318365, + "grad_norm": 7.887113094329834, + "learning_rate": 0.0001056, + "loss": 4.3784, + "step": 265 + }, + { + "epoch": 0.15313759355210133, + "grad_norm": 8.3016939163208, + "learning_rate": 0.00010600000000000002, + "loss": 3.7843, + "step": 266 + }, + { + "epoch": 0.153713298791019, + "grad_norm": 10.073237419128418, + "learning_rate": 0.00010640000000000001, + "loss": 4.0118, + "step": 267 + }, + { + "epoch": 0.15428900402993667, + "grad_norm": 6.9664106369018555, + "learning_rate": 0.00010680000000000001, + "loss": 3.8644, + "step": 268 + }, + { + "epoch": 0.15486470926885434, + "grad_norm": 8.479534149169922, + "learning_rate": 0.00010720000000000002, + "loss": 3.7009, + "step": 269 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 8.317602157592773, + "learning_rate": 0.00010760000000000001, + "loss": 3.7018, + "step": 270 + }, + { + "epoch": 0.1560161197466897, + "grad_norm": 6.020889759063721, + "learning_rate": 0.00010800000000000001, + "loss": 3.656, + "step": 271 + }, + { + "epoch": 0.15659182498560736, + "grad_norm": 7.147673606872559, + "learning_rate": 0.00010840000000000002, + "loss": 3.9216, + "step": 272 + }, + { + "epoch": 0.15716753022452504, + "grad_norm": 5.485556125640869, + "learning_rate": 0.00010880000000000002, + "loss": 3.4732, + "step": 273 + }, + { + "epoch": 0.1577432354634427, + "grad_norm": 7.432086944580078, + "learning_rate": 0.00010920000000000001, + "loss": 3.423, + "step": 274 + }, + { + "epoch": 0.15831894070236038, + "grad_norm": 6.897833824157715, + "learning_rate": 0.00010960000000000001, + "loss": 3.6169, + "step": 275 + }, + { + "epoch": 0.15889464594127806, + "grad_norm": 7.707437992095947, + "learning_rate": 0.00011000000000000002, + "loss": 3.6883, + "step": 276 + }, + { + "epoch": 0.15947035118019573, + "grad_norm": 5.546234607696533, + "learning_rate": 0.00011040000000000001, + "loss": 3.8388, + "step": 277 + }, + { + "epoch": 0.1600460564191134, + "grad_norm": 10.001431465148926, + "learning_rate": 0.00011080000000000001, + "loss": 3.372, + "step": 278 + }, + { + "epoch": 0.16062176165803108, + "grad_norm": 8.793180465698242, + "learning_rate": 0.00011120000000000002, + "loss": 3.7929, + "step": 279 + }, + { + "epoch": 0.16119746689694875, + "grad_norm": 8.189177513122559, + "learning_rate": 0.00011160000000000002, + "loss": 4.0091, + "step": 280 + }, + { + "epoch": 0.16177317213586645, + "grad_norm": 6.998697280883789, + "learning_rate": 0.00011200000000000001, + "loss": 3.648, + "step": 281 + }, + { + "epoch": 0.16234887737478412, + "grad_norm": 8.115317344665527, + "learning_rate": 0.00011240000000000002, + "loss": 4.0327, + "step": 282 + }, + { + "epoch": 0.1629245826137018, + "grad_norm": 7.597106456756592, + "learning_rate": 0.00011279999999999999, + "loss": 3.7811, + "step": 283 + }, + { + "epoch": 0.16350028785261947, + "grad_norm": 6.518374443054199, + "learning_rate": 0.0001132, + "loss": 3.3359, + "step": 284 + }, + { + "epoch": 0.16407599309153714, + "grad_norm": 6.962795257568359, + "learning_rate": 0.0001136, + "loss": 3.3726, + "step": 285 + }, + { + "epoch": 0.16465169833045482, + "grad_norm": 8.1845703125, + "learning_rate": 0.00011399999999999999, + "loss": 4.0042, + "step": 286 + }, + { + "epoch": 0.1652274035693725, + "grad_norm": 6.869271755218506, + "learning_rate": 0.0001144, + "loss": 3.4989, + "step": 287 + }, + { + "epoch": 0.16580310880829016, + "grad_norm": 12.261098861694336, + "learning_rate": 0.0001148, + "loss": 4.1045, + "step": 288 + }, + { + "epoch": 0.16637881404720783, + "grad_norm": 6.912962913513184, + "learning_rate": 0.0001152, + "loss": 3.6853, + "step": 289 + }, + { + "epoch": 0.1669545192861255, + "grad_norm": 8.545379638671875, + "learning_rate": 0.00011559999999999999, + "loss": 3.8903, + "step": 290 + }, + { + "epoch": 0.16753022452504318, + "grad_norm": 15.040228843688965, + "learning_rate": 0.000116, + "loss": 3.4079, + "step": 291 + }, + { + "epoch": 0.16810592976396085, + "grad_norm": 7.038132667541504, + "learning_rate": 0.0001164, + "loss": 3.7119, + "step": 292 + }, + { + "epoch": 0.16868163500287853, + "grad_norm": 6.259817123413086, + "learning_rate": 0.00011679999999999999, + "loss": 3.4931, + "step": 293 + }, + { + "epoch": 0.1692573402417962, + "grad_norm": 6.947351455688477, + "learning_rate": 0.0001172, + "loss": 3.677, + "step": 294 + }, + { + "epoch": 0.16983304548071387, + "grad_norm": 14.260014533996582, + "learning_rate": 0.0001176, + "loss": 3.9591, + "step": 295 + }, + { + "epoch": 0.17040875071963155, + "grad_norm": 6.70070743560791, + "learning_rate": 0.000118, + "loss": 3.2433, + "step": 296 + }, + { + "epoch": 0.17098445595854922, + "grad_norm": 11.697699546813965, + "learning_rate": 0.0001184, + "loss": 4.0909, + "step": 297 + }, + { + "epoch": 0.1715601611974669, + "grad_norm": 10.029029846191406, + "learning_rate": 0.0001188, + "loss": 3.5743, + "step": 298 + }, + { + "epoch": 0.17213586643638457, + "grad_norm": 6.6930365562438965, + "learning_rate": 0.0001192, + "loss": 3.2007, + "step": 299 + }, + { + "epoch": 0.17271157167530224, + "grad_norm": 21.772619247436523, + "learning_rate": 0.00011960000000000001, + "loss": 3.8505, + "step": 300 + }, + { + "epoch": 0.1732872769142199, + "grad_norm": 9.126256942749023, + "learning_rate": 0.00012, + "loss": 3.5777, + "step": 301 + }, + { + "epoch": 0.17386298215313759, + "grad_norm": 7.574469566345215, + "learning_rate": 0.0001204, + "loss": 3.5329, + "step": 302 + }, + { + "epoch": 0.17443868739205526, + "grad_norm": 6.436075687408447, + "learning_rate": 0.0001208, + "loss": 3.279, + "step": 303 + }, + { + "epoch": 0.17501439263097293, + "grad_norm": 5.945929527282715, + "learning_rate": 0.0001212, + "loss": 3.4338, + "step": 304 + }, + { + "epoch": 0.1755900978698906, + "grad_norm": 5.7057785987854, + "learning_rate": 0.0001216, + "loss": 3.2369, + "step": 305 + }, + { + "epoch": 0.17616580310880828, + "grad_norm": 9.411810874938965, + "learning_rate": 0.000122, + "loss": 3.5364, + "step": 306 + }, + { + "epoch": 0.17674150834772595, + "grad_norm": 8.872260093688965, + "learning_rate": 0.0001224, + "loss": 3.7803, + "step": 307 + }, + { + "epoch": 0.17731721358664365, + "grad_norm": 46.1115837097168, + "learning_rate": 0.0001228, + "loss": 3.7188, + "step": 308 + }, + { + "epoch": 0.17789291882556132, + "grad_norm": 48.33805465698242, + "learning_rate": 0.0001232, + "loss": 3.7491, + "step": 309 + }, + { + "epoch": 0.178468624064479, + "grad_norm": 7.272097587585449, + "learning_rate": 0.0001236, + "loss": 3.559, + "step": 310 + }, + { + "epoch": 0.17904432930339667, + "grad_norm": 7.471408367156982, + "learning_rate": 0.000124, + "loss": 3.6014, + "step": 311 + }, + { + "epoch": 0.17962003454231434, + "grad_norm": 11.095893859863281, + "learning_rate": 0.00012440000000000002, + "loss": 3.5741, + "step": 312 + }, + { + "epoch": 0.18019573978123202, + "grad_norm": 8.782601356506348, + "learning_rate": 0.0001248, + "loss": 3.2475, + "step": 313 + }, + { + "epoch": 0.1807714450201497, + "grad_norm": 7.485610485076904, + "learning_rate": 0.0001252, + "loss": 3.0304, + "step": 314 + }, + { + "epoch": 0.18134715025906736, + "grad_norm": 7.794425964355469, + "learning_rate": 0.00012560000000000002, + "loss": 2.9428, + "step": 315 + }, + { + "epoch": 0.18192285549798504, + "grad_norm": 6.470662593841553, + "learning_rate": 0.000126, + "loss": 3.4341, + "step": 316 + }, + { + "epoch": 0.1824985607369027, + "grad_norm": 10.054426193237305, + "learning_rate": 0.0001264, + "loss": 2.941, + "step": 317 + }, + { + "epoch": 0.18307426597582038, + "grad_norm": 93.38629150390625, + "learning_rate": 0.00012680000000000002, + "loss": 4.2291, + "step": 318 + }, + { + "epoch": 0.18364997121473806, + "grad_norm": 9.805968284606934, + "learning_rate": 0.0001272, + "loss": 3.0641, + "step": 319 + }, + { + "epoch": 0.18422567645365573, + "grad_norm": 6.104334831237793, + "learning_rate": 0.0001276, + "loss": 3.0856, + "step": 320 + }, + { + "epoch": 0.1848013816925734, + "grad_norm": 8.24195384979248, + "learning_rate": 0.00012800000000000002, + "loss": 3.0774, + "step": 321 + }, + { + "epoch": 0.18537708693149108, + "grad_norm": 6.327628135681152, + "learning_rate": 0.0001284, + "loss": 3.0826, + "step": 322 + }, + { + "epoch": 0.18595279217040875, + "grad_norm": 11.529990196228027, + "learning_rate": 0.00012880000000000001, + "loss": 3.7882, + "step": 323 + }, + { + "epoch": 0.18652849740932642, + "grad_norm": 9.700762748718262, + "learning_rate": 0.00012920000000000002, + "loss": 3.4958, + "step": 324 + }, + { + "epoch": 0.1871042026482441, + "grad_norm": 10.289152145385742, + "learning_rate": 0.0001296, + "loss": 3.3652, + "step": 325 + }, + { + "epoch": 0.18767990788716177, + "grad_norm": 6.888269901275635, + "learning_rate": 0.00013000000000000002, + "loss": 3.1086, + "step": 326 + }, + { + "epoch": 0.18825561312607944, + "grad_norm": 9.220719337463379, + "learning_rate": 0.0001304, + "loss": 3.5314, + "step": 327 + }, + { + "epoch": 0.1888313183649971, + "grad_norm": 9.044048309326172, + "learning_rate": 0.0001308, + "loss": 2.943, + "step": 328 + }, + { + "epoch": 0.1894070236039148, + "grad_norm": 11.338268280029297, + "learning_rate": 0.00013120000000000002, + "loss": 3.4617, + "step": 329 + }, + { + "epoch": 0.18998272884283246, + "grad_norm": 5.949525833129883, + "learning_rate": 0.0001316, + "loss": 2.8324, + "step": 330 + }, + { + "epoch": 0.19055843408175013, + "grad_norm": 9.158703804016113, + "learning_rate": 0.000132, + "loss": 3.1961, + "step": 331 + }, + { + "epoch": 0.1911341393206678, + "grad_norm": 8.708706855773926, + "learning_rate": 0.00013240000000000002, + "loss": 3.1941, + "step": 332 + }, + { + "epoch": 0.19170984455958548, + "grad_norm": 10.610583305358887, + "learning_rate": 0.0001328, + "loss": 3.3617, + "step": 333 + }, + { + "epoch": 0.19228554979850315, + "grad_norm": 8.023892402648926, + "learning_rate": 0.0001332, + "loss": 3.1775, + "step": 334 + }, + { + "epoch": 0.19286125503742085, + "grad_norm": 7.895623683929443, + "learning_rate": 0.00013360000000000002, + "loss": 3.1033, + "step": 335 + }, + { + "epoch": 0.19343696027633853, + "grad_norm": 6.376975059509277, + "learning_rate": 0.000134, + "loss": 2.808, + "step": 336 + }, + { + "epoch": 0.1940126655152562, + "grad_norm": 5.185142993927002, + "learning_rate": 0.00013440000000000001, + "loss": 2.8337, + "step": 337 + }, + { + "epoch": 0.19458837075417387, + "grad_norm": 6.408693790435791, + "learning_rate": 0.00013480000000000002, + "loss": 3.0604, + "step": 338 + }, + { + "epoch": 0.19516407599309155, + "grad_norm": 21.610239028930664, + "learning_rate": 0.0001352, + "loss": 3.431, + "step": 339 + }, + { + "epoch": 0.19573978123200922, + "grad_norm": 9.485398292541504, + "learning_rate": 0.00013560000000000002, + "loss": 3.2208, + "step": 340 + }, + { + "epoch": 0.1963154864709269, + "grad_norm": 6.460340976715088, + "learning_rate": 0.00013600000000000003, + "loss": 2.793, + "step": 341 + }, + { + "epoch": 0.19689119170984457, + "grad_norm": 5.64215612411499, + "learning_rate": 0.0001364, + "loss": 2.8589, + "step": 342 + }, + { + "epoch": 0.19746689694876224, + "grad_norm": 6.9033427238464355, + "learning_rate": 0.00013680000000000002, + "loss": 3.1031, + "step": 343 + }, + { + "epoch": 0.1980426021876799, + "grad_norm": 5.724493980407715, + "learning_rate": 0.00013720000000000003, + "loss": 2.8605, + "step": 344 + }, + { + "epoch": 0.19861830742659758, + "grad_norm": 15.779448509216309, + "learning_rate": 0.00013759999999999998, + "loss": 3.2151, + "step": 345 + }, + { + "epoch": 0.19919401266551526, + "grad_norm": 6.960752964019775, + "learning_rate": 0.000138, + "loss": 2.8537, + "step": 346 + }, + { + "epoch": 0.19976971790443293, + "grad_norm": 8.871850967407227, + "learning_rate": 0.0001384, + "loss": 2.7536, + "step": 347 + }, + { + "epoch": 0.2003454231433506, + "grad_norm": 6.670348644256592, + "learning_rate": 0.00013879999999999999, + "loss": 2.9525, + "step": 348 + }, + { + "epoch": 0.20092112838226828, + "grad_norm": 9.574007034301758, + "learning_rate": 0.0001392, + "loss": 2.7996, + "step": 349 + }, + { + "epoch": 0.20149683362118595, + "grad_norm": 5.3862223625183105, + "learning_rate": 0.0001396, + "loss": 2.662, + "step": 350 + }, + { + "epoch": 0.20207253886010362, + "grad_norm": 11.832735061645508, + "learning_rate": 0.00014, + "loss": 3.1706, + "step": 351 + }, + { + "epoch": 0.2026482440990213, + "grad_norm": 8.553043365478516, + "learning_rate": 0.0001404, + "loss": 2.8034, + "step": 352 + }, + { + "epoch": 0.20322394933793897, + "grad_norm": 17.231216430664062, + "learning_rate": 0.0001408, + "loss": 2.8267, + "step": 353 + }, + { + "epoch": 0.20379965457685664, + "grad_norm": 10.80978012084961, + "learning_rate": 0.0001412, + "loss": 2.7008, + "step": 354 + }, + { + "epoch": 0.20437535981577432, + "grad_norm": 7.117002010345459, + "learning_rate": 0.0001416, + "loss": 2.5399, + "step": 355 + }, + { + "epoch": 0.204951065054692, + "grad_norm": 5.009802341461182, + "learning_rate": 0.000142, + "loss": 2.7215, + "step": 356 + }, + { + "epoch": 0.20552677029360966, + "grad_norm": 16.786869049072266, + "learning_rate": 0.0001424, + "loss": 2.9873, + "step": 357 + }, + { + "epoch": 0.20610247553252733, + "grad_norm": 7.779325008392334, + "learning_rate": 0.0001428, + "loss": 2.892, + "step": 358 + }, + { + "epoch": 0.206678180771445, + "grad_norm": 9.354433059692383, + "learning_rate": 0.0001432, + "loss": 2.7065, + "step": 359 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 13.15522575378418, + "learning_rate": 0.0001436, + "loss": 2.8061, + "step": 360 + }, + { + "epoch": 0.20782959124928038, + "grad_norm": 6.927896976470947, + "learning_rate": 0.000144, + "loss": 2.8687, + "step": 361 + }, + { + "epoch": 0.20840529648819806, + "grad_norm": 8.532772064208984, + "learning_rate": 0.0001444, + "loss": 2.9418, + "step": 362 + }, + { + "epoch": 0.20898100172711573, + "grad_norm": 8.618231773376465, + "learning_rate": 0.0001448, + "loss": 2.588, + "step": 363 + }, + { + "epoch": 0.2095567069660334, + "grad_norm": 4.94150447845459, + "learning_rate": 0.0001452, + "loss": 2.5464, + "step": 364 + }, + { + "epoch": 0.21013241220495107, + "grad_norm": 5.547298431396484, + "learning_rate": 0.00014560000000000002, + "loss": 2.755, + "step": 365 + }, + { + "epoch": 0.21070811744386875, + "grad_norm": 8.270822525024414, + "learning_rate": 0.000146, + "loss": 2.8345, + "step": 366 + }, + { + "epoch": 0.21128382268278642, + "grad_norm": 6.572064399719238, + "learning_rate": 0.0001464, + "loss": 2.6624, + "step": 367 + }, + { + "epoch": 0.2118595279217041, + "grad_norm": 8.243054389953613, + "learning_rate": 0.00014680000000000002, + "loss": 2.7102, + "step": 368 + }, + { + "epoch": 0.21243523316062177, + "grad_norm": 6.671678066253662, + "learning_rate": 0.0001472, + "loss": 2.4775, + "step": 369 + }, + { + "epoch": 0.21301093839953944, + "grad_norm": 5.922910690307617, + "learning_rate": 0.0001476, + "loss": 2.919, + "step": 370 + }, + { + "epoch": 0.2135866436384571, + "grad_norm": 12.84566593170166, + "learning_rate": 0.000148, + "loss": 2.5189, + "step": 371 + }, + { + "epoch": 0.2141623488773748, + "grad_norm": 7.342642307281494, + "learning_rate": 0.0001484, + "loss": 2.8968, + "step": 372 + }, + { + "epoch": 0.21473805411629246, + "grad_norm": 14.625147819519043, + "learning_rate": 0.0001488, + "loss": 2.6793, + "step": 373 + }, + { + "epoch": 0.21531375935521013, + "grad_norm": 6.683467388153076, + "learning_rate": 0.0001492, + "loss": 2.3975, + "step": 374 + }, + { + "epoch": 0.2158894645941278, + "grad_norm": 12.186212539672852, + "learning_rate": 0.0001496, + "loss": 2.856, + "step": 375 + }, + { + "epoch": 0.21646516983304548, + "grad_norm": 8.417567253112793, + "learning_rate": 0.00015000000000000001, + "loss": 2.6326, + "step": 376 + }, + { + "epoch": 0.21704087507196315, + "grad_norm": 5.414144992828369, + "learning_rate": 0.0001504, + "loss": 2.7417, + "step": 377 + }, + { + "epoch": 0.21761658031088082, + "grad_norm": 13.388712882995605, + "learning_rate": 0.0001508, + "loss": 2.8813, + "step": 378 + }, + { + "epoch": 0.2181922855497985, + "grad_norm": 6.375700950622559, + "learning_rate": 0.00015120000000000002, + "loss": 2.7187, + "step": 379 + }, + { + "epoch": 0.21876799078871617, + "grad_norm": 9.897554397583008, + "learning_rate": 0.0001516, + "loss": 2.6278, + "step": 380 + }, + { + "epoch": 0.21934369602763384, + "grad_norm": 10.079334259033203, + "learning_rate": 0.000152, + "loss": 2.4861, + "step": 381 + }, + { + "epoch": 0.21991940126655152, + "grad_norm": 10.082268714904785, + "learning_rate": 0.00015240000000000002, + "loss": 2.6516, + "step": 382 + }, + { + "epoch": 0.2204951065054692, + "grad_norm": 9.192161560058594, + "learning_rate": 0.0001528, + "loss": 2.3307, + "step": 383 + }, + { + "epoch": 0.22107081174438686, + "grad_norm": 8.085034370422363, + "learning_rate": 0.0001532, + "loss": 2.3445, + "step": 384 + }, + { + "epoch": 0.22164651698330454, + "grad_norm": 5.418321132659912, + "learning_rate": 0.00015360000000000002, + "loss": 2.7119, + "step": 385 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 16.515369415283203, + "learning_rate": 0.000154, + "loss": 2.6647, + "step": 386 + }, + { + "epoch": 0.22279792746113988, + "grad_norm": 11.138907432556152, + "learning_rate": 0.0001544, + "loss": 2.6742, + "step": 387 + }, + { + "epoch": 0.22337363270005758, + "grad_norm": 20.75733184814453, + "learning_rate": 0.00015480000000000002, + "loss": 2.8834, + "step": 388 + }, + { + "epoch": 0.22394933793897526, + "grad_norm": 8.349270820617676, + "learning_rate": 0.0001552, + "loss": 2.6376, + "step": 389 + }, + { + "epoch": 0.22452504317789293, + "grad_norm": 6.902172088623047, + "learning_rate": 0.00015560000000000001, + "loss": 2.6186, + "step": 390 + }, + { + "epoch": 0.2251007484168106, + "grad_norm": 14.718120574951172, + "learning_rate": 0.00015600000000000002, + "loss": 2.7649, + "step": 391 + }, + { + "epoch": 0.22567645365572828, + "grad_norm": 5.805610656738281, + "learning_rate": 0.0001564, + "loss": 2.6221, + "step": 392 + }, + { + "epoch": 0.22625215889464595, + "grad_norm": 6.138345718383789, + "learning_rate": 0.00015680000000000002, + "loss": 2.5751, + "step": 393 + }, + { + "epoch": 0.22682786413356362, + "grad_norm": 29.98923683166504, + "learning_rate": 0.00015720000000000003, + "loss": 2.824, + "step": 394 + }, + { + "epoch": 0.2274035693724813, + "grad_norm": 31.91318702697754, + "learning_rate": 0.0001576, + "loss": 2.6134, + "step": 395 + }, + { + "epoch": 0.22797927461139897, + "grad_norm": 10.812357902526855, + "learning_rate": 0.00015800000000000002, + "loss": 2.4594, + "step": 396 + }, + { + "epoch": 0.22855497985031664, + "grad_norm": 7.6294755935668945, + "learning_rate": 0.00015840000000000003, + "loss": 2.5259, + "step": 397 + }, + { + "epoch": 0.22913068508923431, + "grad_norm": 5.666753768920898, + "learning_rate": 0.0001588, + "loss": 2.6108, + "step": 398 + }, + { + "epoch": 0.229706390328152, + "grad_norm": 6.732410907745361, + "learning_rate": 0.00015920000000000002, + "loss": 2.5352, + "step": 399 + }, + { + "epoch": 0.23028209556706966, + "grad_norm": 6.749885082244873, + "learning_rate": 0.0001596, + "loss": 2.5103, + "step": 400 + }, + { + "epoch": 0.23085780080598733, + "grad_norm": 5.389144420623779, + "learning_rate": 0.00016, + "loss": 2.5803, + "step": 401 + }, + { + "epoch": 0.231433506044905, + "grad_norm": 6.996800422668457, + "learning_rate": 0.00016040000000000002, + "loss": 2.7746, + "step": 402 + }, + { + "epoch": 0.23200921128382268, + "grad_norm": 22.8950138092041, + "learning_rate": 0.0001608, + "loss": 2.5619, + "step": 403 + }, + { + "epoch": 0.23258491652274035, + "grad_norm": 11.477226257324219, + "learning_rate": 0.00016120000000000002, + "loss": 2.4898, + "step": 404 + }, + { + "epoch": 0.23316062176165803, + "grad_norm": 8.584878921508789, + "learning_rate": 0.00016160000000000002, + "loss": 2.4191, + "step": 405 + }, + { + "epoch": 0.2337363270005757, + "grad_norm": 6.987226963043213, + "learning_rate": 0.000162, + "loss": 2.4045, + "step": 406 + }, + { + "epoch": 0.23431203223949337, + "grad_norm": 12.917460441589355, + "learning_rate": 0.00016240000000000002, + "loss": 2.656, + "step": 407 + }, + { + "epoch": 0.23488773747841105, + "grad_norm": 13.053242683410645, + "learning_rate": 0.0001628, + "loss": 2.5026, + "step": 408 + }, + { + "epoch": 0.23546344271732872, + "grad_norm": 6.013350486755371, + "learning_rate": 0.0001632, + "loss": 2.4027, + "step": 409 + }, + { + "epoch": 0.2360391479562464, + "grad_norm": 21.95798110961914, + "learning_rate": 0.0001636, + "loss": 2.4155, + "step": 410 + }, + { + "epoch": 0.23661485319516407, + "grad_norm": 6.197417259216309, + "learning_rate": 0.000164, + "loss": 2.2512, + "step": 411 + }, + { + "epoch": 0.23719055843408174, + "grad_norm": 5.798823356628418, + "learning_rate": 0.0001644, + "loss": 2.5775, + "step": 412 + }, + { + "epoch": 0.2377662636729994, + "grad_norm": 12.58922290802002, + "learning_rate": 0.0001648, + "loss": 2.2999, + "step": 413 + }, + { + "epoch": 0.23834196891191708, + "grad_norm": 6.5375213623046875, + "learning_rate": 0.0001652, + "loss": 2.5818, + "step": 414 + }, + { + "epoch": 0.23891767415083479, + "grad_norm": 8.916410446166992, + "learning_rate": 0.0001656, + "loss": 2.3589, + "step": 415 + }, + { + "epoch": 0.23949337938975246, + "grad_norm": 7.457561492919922, + "learning_rate": 0.000166, + "loss": 2.5489, + "step": 416 + }, + { + "epoch": 0.24006908462867013, + "grad_norm": 18.522987365722656, + "learning_rate": 0.0001664, + "loss": 2.6065, + "step": 417 + }, + { + "epoch": 0.2406447898675878, + "grad_norm": 64.20520782470703, + "learning_rate": 0.0001668, + "loss": 2.7258, + "step": 418 + }, + { + "epoch": 0.24122049510650548, + "grad_norm": 40.07137680053711, + "learning_rate": 0.0001672, + "loss": 2.6834, + "step": 419 + }, + { + "epoch": 0.24179620034542315, + "grad_norm": 6.103574752807617, + "learning_rate": 0.0001676, + "loss": 2.4909, + "step": 420 + }, + { + "epoch": 0.24237190558434082, + "grad_norm": 6.48091983795166, + "learning_rate": 0.000168, + "loss": 2.5598, + "step": 421 + }, + { + "epoch": 0.2429476108232585, + "grad_norm": 6.65122127532959, + "learning_rate": 0.0001684, + "loss": 2.0797, + "step": 422 + }, + { + "epoch": 0.24352331606217617, + "grad_norm": 7.160250663757324, + "learning_rate": 0.0001688, + "loss": 2.4701, + "step": 423 + }, + { + "epoch": 0.24409902130109384, + "grad_norm": 5.73784875869751, + "learning_rate": 0.0001692, + "loss": 2.333, + "step": 424 + }, + { + "epoch": 0.24467472654001152, + "grad_norm": 21.651309967041016, + "learning_rate": 0.0001696, + "loss": 2.5034, + "step": 425 + }, + { + "epoch": 0.2452504317789292, + "grad_norm": 17.80324935913086, + "learning_rate": 0.00017, + "loss": 2.4943, + "step": 426 + }, + { + "epoch": 0.24582613701784686, + "grad_norm": 6.137923240661621, + "learning_rate": 0.0001704, + "loss": 2.4143, + "step": 427 + }, + { + "epoch": 0.24640184225676454, + "grad_norm": 5.833311080932617, + "learning_rate": 0.0001708, + "loss": 2.4598, + "step": 428 + }, + { + "epoch": 0.2469775474956822, + "grad_norm": 20.596446990966797, + "learning_rate": 0.00017120000000000001, + "loss": 2.4136, + "step": 429 + }, + { + "epoch": 0.24755325273459988, + "grad_norm": 5.577768802642822, + "learning_rate": 0.0001716, + "loss": 2.4349, + "step": 430 + }, + { + "epoch": 0.24812895797351756, + "grad_norm": 6.16340446472168, + "learning_rate": 0.000172, + "loss": 2.5712, + "step": 431 + }, + { + "epoch": 0.24870466321243523, + "grad_norm": 5.587292671203613, + "learning_rate": 0.00017240000000000002, + "loss": 2.522, + "step": 432 + }, + { + "epoch": 0.2492803684513529, + "grad_norm": 7.1100945472717285, + "learning_rate": 0.0001728, + "loss": 2.2343, + "step": 433 + }, + { + "epoch": 0.24985607369027057, + "grad_norm": 6.089508056640625, + "learning_rate": 0.0001732, + "loss": 2.5291, + "step": 434 + }, + { + "epoch": 0.2504317789291883, + "grad_norm": 12.8109769821167, + "learning_rate": 0.00017360000000000002, + "loss": 2.4944, + "step": 435 + }, + { + "epoch": 0.2510074841681059, + "grad_norm": 9.722925186157227, + "learning_rate": 0.000174, + "loss": 2.2176, + "step": 436 + }, + { + "epoch": 0.2515831894070236, + "grad_norm": 13.540785789489746, + "learning_rate": 0.0001744, + "loss": 2.3636, + "step": 437 + }, + { + "epoch": 0.25215889464594127, + "grad_norm": 22.12358856201172, + "learning_rate": 0.00017480000000000002, + "loss": 2.4757, + "step": 438 + }, + { + "epoch": 0.25273459988485897, + "grad_norm": 8.760823249816895, + "learning_rate": 0.0001752, + "loss": 2.256, + "step": 439 + }, + { + "epoch": 0.2533103051237766, + "grad_norm": 7.311398506164551, + "learning_rate": 0.0001756, + "loss": 2.3274, + "step": 440 + }, + { + "epoch": 0.2538860103626943, + "grad_norm": 9.8610200881958, + "learning_rate": 0.00017600000000000002, + "loss": 2.4749, + "step": 441 + }, + { + "epoch": 0.25446171560161196, + "grad_norm": 7.475802898406982, + "learning_rate": 0.0001764, + "loss": 2.4547, + "step": 442 + }, + { + "epoch": 0.25503742084052966, + "grad_norm": 13.036137580871582, + "learning_rate": 0.00017680000000000001, + "loss": 2.1679, + "step": 443 + }, + { + "epoch": 0.2556131260794473, + "grad_norm": 11.247735977172852, + "learning_rate": 0.0001772, + "loss": 2.5446, + "step": 444 + }, + { + "epoch": 0.256188831318365, + "grad_norm": 7.0622124671936035, + "learning_rate": 0.0001776, + "loss": 2.3196, + "step": 445 + }, + { + "epoch": 0.25676453655728265, + "grad_norm": 5.404714107513428, + "learning_rate": 0.00017800000000000002, + "loss": 2.2713, + "step": 446 + }, + { + "epoch": 0.25734024179620035, + "grad_norm": 44.592891693115234, + "learning_rate": 0.0001784, + "loss": 2.2287, + "step": 447 + }, + { + "epoch": 0.257915947035118, + "grad_norm": 30.109132766723633, + "learning_rate": 0.0001788, + "loss": 2.3153, + "step": 448 + }, + { + "epoch": 0.2584916522740357, + "grad_norm": 15.7490873336792, + "learning_rate": 0.00017920000000000002, + "loss": 2.3081, + "step": 449 + }, + { + "epoch": 0.25906735751295334, + "grad_norm": 13.772661209106445, + "learning_rate": 0.0001796, + "loss": 2.2548, + "step": 450 + }, + { + "epoch": 0.25964306275187105, + "grad_norm": 6.858334064483643, + "learning_rate": 0.00018, + "loss": 2.4804, + "step": 451 + }, + { + "epoch": 0.2602187679907887, + "grad_norm": 6.23155403137207, + "learning_rate": 0.00018040000000000002, + "loss": 2.4281, + "step": 452 + }, + { + "epoch": 0.2607944732297064, + "grad_norm": 5.4447150230407715, + "learning_rate": 0.0001808, + "loss": 2.342, + "step": 453 + }, + { + "epoch": 0.26137017846862404, + "grad_norm": 11.79716682434082, + "learning_rate": 0.0001812, + "loss": 2.2528, + "step": 454 + }, + { + "epoch": 0.26194588370754174, + "grad_norm": 10.708625793457031, + "learning_rate": 0.00018160000000000002, + "loss": 2.0204, + "step": 455 + }, + { + "epoch": 0.26252158894645944, + "grad_norm": 21.41659164428711, + "learning_rate": 0.000182, + "loss": 2.2593, + "step": 456 + }, + { + "epoch": 0.2630972941853771, + "grad_norm": 5.636983394622803, + "learning_rate": 0.00018240000000000002, + "loss": 2.1412, + "step": 457 + }, + { + "epoch": 0.2636729994242948, + "grad_norm": 9.639352798461914, + "learning_rate": 0.00018280000000000003, + "loss": 2.1103, + "step": 458 + }, + { + "epoch": 0.26424870466321243, + "grad_norm": 4.263064384460449, + "learning_rate": 0.0001832, + "loss": 2.2493, + "step": 459 + }, + { + "epoch": 0.26482440990213013, + "grad_norm": 8.983839988708496, + "learning_rate": 0.00018360000000000002, + "loss": 2.3782, + "step": 460 + }, + { + "epoch": 0.2654001151410478, + "grad_norm": 9.911988258361816, + "learning_rate": 0.00018400000000000003, + "loss": 2.2117, + "step": 461 + }, + { + "epoch": 0.2659758203799655, + "grad_norm": 8.42939567565918, + "learning_rate": 0.0001844, + "loss": 2.0942, + "step": 462 + }, + { + "epoch": 0.2665515256188831, + "grad_norm": 9.866201400756836, + "learning_rate": 0.00018480000000000002, + "loss": 2.2761, + "step": 463 + }, + { + "epoch": 0.2671272308578008, + "grad_norm": 4.9825758934021, + "learning_rate": 0.00018520000000000003, + "loss": 2.1351, + "step": 464 + }, + { + "epoch": 0.26770293609671847, + "grad_norm": 3.4520153999328613, + "learning_rate": 0.0001856, + "loss": 2.234, + "step": 465 + }, + { + "epoch": 0.26827864133563617, + "grad_norm": 6.94691276550293, + "learning_rate": 0.00018600000000000002, + "loss": 2.1368, + "step": 466 + }, + { + "epoch": 0.2688543465745538, + "grad_norm": 19.923587799072266, + "learning_rate": 0.00018640000000000003, + "loss": 2.4301, + "step": 467 + }, + { + "epoch": 0.2694300518134715, + "grad_norm": 24.741535186767578, + "learning_rate": 0.00018680000000000001, + "loss": 2.256, + "step": 468 + }, + { + "epoch": 0.27000575705238916, + "grad_norm": 9.313246726989746, + "learning_rate": 0.00018720000000000002, + "loss": 2.6483, + "step": 469 + }, + { + "epoch": 0.27058146229130686, + "grad_norm": 10.217698097229004, + "learning_rate": 0.0001876, + "loss": 1.8293, + "step": 470 + }, + { + "epoch": 0.2711571675302245, + "grad_norm": 28.85066032409668, + "learning_rate": 0.000188, + "loss": 2.3165, + "step": 471 + }, + { + "epoch": 0.2717328727691422, + "grad_norm": 5.764794826507568, + "learning_rate": 0.0001884, + "loss": 2.6914, + "step": 472 + }, + { + "epoch": 0.27230857800805985, + "grad_norm": 8.115283966064453, + "learning_rate": 0.0001888, + "loss": 2.5605, + "step": 473 + }, + { + "epoch": 0.27288428324697755, + "grad_norm": 11.941910743713379, + "learning_rate": 0.0001892, + "loss": 1.9626, + "step": 474 + }, + { + "epoch": 0.2734599884858952, + "grad_norm": 11.117420196533203, + "learning_rate": 0.0001896, + "loss": 2.4614, + "step": 475 + }, + { + "epoch": 0.2740356937248129, + "grad_norm": 6.908642292022705, + "learning_rate": 0.00019, + "loss": 2.3911, + "step": 476 + }, + { + "epoch": 0.27461139896373055, + "grad_norm": 10.433818817138672, + "learning_rate": 0.0001904, + "loss": 2.3747, + "step": 477 + }, + { + "epoch": 0.27518710420264825, + "grad_norm": 8.546224594116211, + "learning_rate": 0.0001908, + "loss": 2.2947, + "step": 478 + }, + { + "epoch": 0.2757628094415659, + "grad_norm": 5.434266090393066, + "learning_rate": 0.0001912, + "loss": 2.2115, + "step": 479 + }, + { + "epoch": 0.2763385146804836, + "grad_norm": 9.27397346496582, + "learning_rate": 0.0001916, + "loss": 2.2165, + "step": 480 + }, + { + "epoch": 0.27691421991940124, + "grad_norm": 4.052639484405518, + "learning_rate": 0.000192, + "loss": 2.1148, + "step": 481 + }, + { + "epoch": 0.27748992515831894, + "grad_norm": 7.541112422943115, + "learning_rate": 0.00019240000000000001, + "loss": 2.2489, + "step": 482 + }, + { + "epoch": 0.27806563039723664, + "grad_norm": 20.005165100097656, + "learning_rate": 0.0001928, + "loss": 2.3552, + "step": 483 + }, + { + "epoch": 0.2786413356361543, + "grad_norm": 6.74354362487793, + "learning_rate": 0.0001932, + "loss": 2.0027, + "step": 484 + }, + { + "epoch": 0.279217040875072, + "grad_norm": 4.244668960571289, + "learning_rate": 0.00019360000000000002, + "loss": 2.466, + "step": 485 + }, + { + "epoch": 0.27979274611398963, + "grad_norm": 32.92999267578125, + "learning_rate": 0.000194, + "loss": 2.0058, + "step": 486 + }, + { + "epoch": 0.28036845135290733, + "grad_norm": 5.099974155426025, + "learning_rate": 0.0001944, + "loss": 2.1534, + "step": 487 + }, + { + "epoch": 0.280944156591825, + "grad_norm": 8.950968742370605, + "learning_rate": 0.0001948, + "loss": 2.3272, + "step": 488 + }, + { + "epoch": 0.2815198618307427, + "grad_norm": 29.126623153686523, + "learning_rate": 0.0001952, + "loss": 2.0942, + "step": 489 + }, + { + "epoch": 0.2820955670696603, + "grad_norm": 26.04970932006836, + "learning_rate": 0.0001956, + "loss": 2.3703, + "step": 490 + }, + { + "epoch": 0.282671272308578, + "grad_norm": 7.4286370277404785, + "learning_rate": 0.000196, + "loss": 1.8691, + "step": 491 + }, + { + "epoch": 0.28324697754749567, + "grad_norm": 6.331235408782959, + "learning_rate": 0.0001964, + "loss": 2.2338, + "step": 492 + }, + { + "epoch": 0.28382268278641337, + "grad_norm": 4.98259162902832, + "learning_rate": 0.0001968, + "loss": 1.9059, + "step": 493 + }, + { + "epoch": 0.284398388025331, + "grad_norm": 12.111970901489258, + "learning_rate": 0.0001972, + "loss": 2.0567, + "step": 494 + }, + { + "epoch": 0.2849740932642487, + "grad_norm": 4.433606147766113, + "learning_rate": 0.0001976, + "loss": 2.061, + "step": 495 + }, + { + "epoch": 0.28554979850316636, + "grad_norm": 9.483826637268066, + "learning_rate": 0.00019800000000000002, + "loss": 2.1855, + "step": 496 + }, + { + "epoch": 0.28612550374208406, + "grad_norm": 8.829517364501953, + "learning_rate": 0.0001984, + "loss": 2.0813, + "step": 497 + }, + { + "epoch": 0.2867012089810017, + "grad_norm": 5.547176361083984, + "learning_rate": 0.0001988, + "loss": 2.1782, + "step": 498 + }, + { + "epoch": 0.2872769142199194, + "grad_norm": 13.865377426147461, + "learning_rate": 0.00019920000000000002, + "loss": 1.9131, + "step": 499 + }, + { + "epoch": 0.28785261945883706, + "grad_norm": 13.441047668457031, + "learning_rate": 0.0001996, + "loss": 2.1865, + "step": 500 + }, + { + "epoch": 0.28842832469775476, + "grad_norm": 4.224601745605469, + "learning_rate": 0.0002, + "loss": 2.4949, + "step": 501 + }, + { + "epoch": 0.2890040299366724, + "grad_norm": 4.024444580078125, + "learning_rate": 0.000199999709749734, + "loss": 2.2281, + "step": 502 + }, + { + "epoch": 0.2895797351755901, + "grad_norm": 6.911625862121582, + "learning_rate": 0.000199998839000808, + "loss": 2.0534, + "step": 503 + }, + { + "epoch": 0.29015544041450775, + "grad_norm": 15.578252792358398, + "learning_rate": 0.00019999738775883837, + "loss": 2.1315, + "step": 504 + }, + { + "epoch": 0.29073114565342545, + "grad_norm": 14.918317794799805, + "learning_rate": 0.00019999535603318567, + "loss": 2.1605, + "step": 505 + }, + { + "epoch": 0.2913068508923431, + "grad_norm": 3.6653409004211426, + "learning_rate": 0.0001999927438369545, + "loss": 2.3675, + "step": 506 + }, + { + "epoch": 0.2918825561312608, + "grad_norm": 9.457073211669922, + "learning_rate": 0.0001999895511869936, + "loss": 2.2067, + "step": 507 + }, + { + "epoch": 0.2924582613701785, + "grad_norm": 16.254053115844727, + "learning_rate": 0.00019998577810389551, + "loss": 1.8262, + "step": 508 + }, + { + "epoch": 0.29303396660909614, + "grad_norm": 12.8787260055542, + "learning_rate": 0.00019998142461199664, + "loss": 2.1758, + "step": 509 + }, + { + "epoch": 0.29360967184801384, + "grad_norm": 7.122046947479248, + "learning_rate": 0.00019997649073937707, + "loss": 2.1842, + "step": 510 + }, + { + "epoch": 0.2941853770869315, + "grad_norm": 7.713693618774414, + "learning_rate": 0.00019997097651786033, + "loss": 2.1556, + "step": 511 + }, + { + "epoch": 0.2947610823258492, + "grad_norm": 5.447865962982178, + "learning_rate": 0.00019996488198301314, + "loss": 2.2058, + "step": 512 + }, + { + "epoch": 0.29533678756476683, + "grad_norm": 10.775145530700684, + "learning_rate": 0.0001999582071741453, + "loss": 2.365, + "step": 513 + }, + { + "epoch": 0.29591249280368453, + "grad_norm": 15.842108726501465, + "learning_rate": 0.00019995095213430937, + "loss": 2.1598, + "step": 514 + }, + { + "epoch": 0.2964881980426022, + "grad_norm": 27.204334259033203, + "learning_rate": 0.00019994311691030038, + "loss": 2.3135, + "step": 515 + }, + { + "epoch": 0.2970639032815199, + "grad_norm": 17.095380783081055, + "learning_rate": 0.0001999347015526556, + "loss": 2.1369, + "step": 516 + }, + { + "epoch": 0.2976396085204375, + "grad_norm": 5.58231258392334, + "learning_rate": 0.0001999257061156541, + "loss": 2.1524, + "step": 517 + }, + { + "epoch": 0.2982153137593552, + "grad_norm": 16.57658576965332, + "learning_rate": 0.00019991613065731652, + "loss": 2.2354, + "step": 518 + }, + { + "epoch": 0.2987910189982729, + "grad_norm": 10.48273754119873, + "learning_rate": 0.00019990597523940467, + "loss": 1.9177, + "step": 519 + }, + { + "epoch": 0.2993667242371906, + "grad_norm": 11.657440185546875, + "learning_rate": 0.00019989523992742096, + "loss": 2.1773, + "step": 520 + }, + { + "epoch": 0.2999424294761082, + "grad_norm": 8.428756713867188, + "learning_rate": 0.00019988392479060828, + "loss": 2.0253, + "step": 521 + }, + { + "epoch": 0.3005181347150259, + "grad_norm": 23.292964935302734, + "learning_rate": 0.00019987202990194938, + "loss": 2.1664, + "step": 522 + }, + { + "epoch": 0.30109383995394357, + "grad_norm": 11.076370239257812, + "learning_rate": 0.00019985955533816623, + "loss": 2.1639, + "step": 523 + }, + { + "epoch": 0.30166954519286127, + "grad_norm": 5.960238933563232, + "learning_rate": 0.00019984650117971993, + "loss": 2.28, + "step": 524 + }, + { + "epoch": 0.3022452504317789, + "grad_norm": 5.293085098266602, + "learning_rate": 0.00019983286751080984, + "loss": 2.0797, + "step": 525 + }, + { + "epoch": 0.3028209556706966, + "grad_norm": 6.787145614624023, + "learning_rate": 0.00019981865441937326, + "loss": 2.1623, + "step": 526 + }, + { + "epoch": 0.30339666090961426, + "grad_norm": 4.942052841186523, + "learning_rate": 0.00019980386199708468, + "loss": 1.9676, + "step": 527 + }, + { + "epoch": 0.30397236614853196, + "grad_norm": 3.7597603797912598, + "learning_rate": 0.0001997884903393553, + "loss": 2.0058, + "step": 528 + }, + { + "epoch": 0.3045480713874496, + "grad_norm": 4.015875339508057, + "learning_rate": 0.00019977253954533243, + "loss": 2.4905, + "step": 529 + }, + { + "epoch": 0.3051237766263673, + "grad_norm": 71.47596740722656, + "learning_rate": 0.00019975600971789873, + "loss": 2.2579, + "step": 530 + }, + { + "epoch": 0.30569948186528495, + "grad_norm": 5.182095050811768, + "learning_rate": 0.00019973890096367173, + "loss": 2.0343, + "step": 531 + }, + { + "epoch": 0.30627518710420265, + "grad_norm": 11.30048656463623, + "learning_rate": 0.000199721213393003, + "loss": 1.709, + "step": 532 + }, + { + "epoch": 0.3068508923431203, + "grad_norm": 7.1390700340271, + "learning_rate": 0.00019970294711997745, + "loss": 1.9582, + "step": 533 + }, + { + "epoch": 0.307426597582038, + "grad_norm": 4.473127841949463, + "learning_rate": 0.0001996841022624127, + "loss": 2.0572, + "step": 534 + }, + { + "epoch": 0.3080023028209557, + "grad_norm": 15.681950569152832, + "learning_rate": 0.00019966467894185812, + "loss": 2.0069, + "step": 535 + }, + { + "epoch": 0.30857800805987334, + "grad_norm": 3.4989001750946045, + "learning_rate": 0.0001996446772835943, + "loss": 2.3378, + "step": 536 + }, + { + "epoch": 0.30915371329879104, + "grad_norm": 6.156066417694092, + "learning_rate": 0.00019962409741663202, + "loss": 2.1491, + "step": 537 + }, + { + "epoch": 0.3097294185377087, + "grad_norm": 3.5091023445129395, + "learning_rate": 0.00019960293947371153, + "loss": 2.2707, + "step": 538 + }, + { + "epoch": 0.3103051237766264, + "grad_norm": 11.965502738952637, + "learning_rate": 0.00019958120359130178, + "loss": 1.9268, + "step": 539 + }, + { + "epoch": 0.31088082901554404, + "grad_norm": 23.450349807739258, + "learning_rate": 0.0001995588899095992, + "loss": 1.7938, + "step": 540 + }, + { + "epoch": 0.31145653425446174, + "grad_norm": 38.560482025146484, + "learning_rate": 0.00019953599857252733, + "loss": 2.2267, + "step": 541 + }, + { + "epoch": 0.3120322394933794, + "grad_norm": 15.327620506286621, + "learning_rate": 0.00019951252972773525, + "loss": 2.3168, + "step": 542 + }, + { + "epoch": 0.3126079447322971, + "grad_norm": 4.420673370361328, + "learning_rate": 0.0001994884835265973, + "loss": 2.0744, + "step": 543 + }, + { + "epoch": 0.31318364997121473, + "grad_norm": 19.81424903869629, + "learning_rate": 0.00019946386012421153, + "loss": 1.9736, + "step": 544 + }, + { + "epoch": 0.31375935521013243, + "grad_norm": 4.647876739501953, + "learning_rate": 0.00019943865967939908, + "loss": 2.1716, + "step": 545 + }, + { + "epoch": 0.3143350604490501, + "grad_norm": 5.088565349578857, + "learning_rate": 0.00019941288235470291, + "loss": 1.9915, + "step": 546 + }, + { + "epoch": 0.3149107656879678, + "grad_norm": 6.194237232208252, + "learning_rate": 0.00019938652831638697, + "loss": 1.9701, + "step": 547 + }, + { + "epoch": 0.3154864709268854, + "grad_norm": 5.4519429206848145, + "learning_rate": 0.00019935959773443497, + "loss": 2.2597, + "step": 548 + }, + { + "epoch": 0.3160621761658031, + "grad_norm": 7.437872409820557, + "learning_rate": 0.0001993320907825493, + "loss": 2.3016, + "step": 549 + }, + { + "epoch": 0.31663788140472077, + "grad_norm": 4.233456134796143, + "learning_rate": 0.00019930400763814993, + "loss": 1.8935, + "step": 550 + }, + { + "epoch": 0.31721358664363847, + "grad_norm": 5.772792339324951, + "learning_rate": 0.00019927534848237336, + "loss": 1.6373, + "step": 551 + }, + { + "epoch": 0.3177892918825561, + "grad_norm": 7.545225143432617, + "learning_rate": 0.0001992461135000713, + "loss": 1.9868, + "step": 552 + }, + { + "epoch": 0.3183649971214738, + "grad_norm": 5.72635555267334, + "learning_rate": 0.00019921630287980956, + "loss": 1.7728, + "step": 553 + }, + { + "epoch": 0.31894070236039146, + "grad_norm": 5.739555358886719, + "learning_rate": 0.0001991859168138668, + "loss": 1.8478, + "step": 554 + }, + { + "epoch": 0.31951640759930916, + "grad_norm": 3.295530319213867, + "learning_rate": 0.0001991549554982333, + "loss": 2.1454, + "step": 555 + }, + { + "epoch": 0.3200921128382268, + "grad_norm": 10.391168594360352, + "learning_rate": 0.0001991234191326098, + "loss": 2.2763, + "step": 556 + }, + { + "epoch": 0.3206678180771445, + "grad_norm": 14.846756935119629, + "learning_rate": 0.00019909130792040598, + "loss": 1.9783, + "step": 557 + }, + { + "epoch": 0.32124352331606215, + "grad_norm": 4.79947566986084, + "learning_rate": 0.0001990586220687394, + "loss": 2.1489, + "step": 558 + }, + { + "epoch": 0.32181922855497985, + "grad_norm": 4.786315441131592, + "learning_rate": 0.00019902536178843395, + "loss": 2.0194, + "step": 559 + }, + { + "epoch": 0.3223949337938975, + "grad_norm": 11.64875602722168, + "learning_rate": 0.00019899152729401868, + "loss": 1.8983, + "step": 560 + }, + { + "epoch": 0.3229706390328152, + "grad_norm": 9.477057456970215, + "learning_rate": 0.00019895711880372628, + "loss": 1.9139, + "step": 561 + }, + { + "epoch": 0.3235463442717329, + "grad_norm": 7.5819993019104, + "learning_rate": 0.00019892213653949166, + "loss": 1.843, + "step": 562 + }, + { + "epoch": 0.32412204951065055, + "grad_norm": 4.9545207023620605, + "learning_rate": 0.00019888658072695066, + "loss": 2.1052, + "step": 563 + }, + { + "epoch": 0.32469775474956825, + "grad_norm": 4.684484958648682, + "learning_rate": 0.0001988504515954385, + "loss": 1.7933, + "step": 564 + }, + { + "epoch": 0.3252734599884859, + "grad_norm": 8.41274356842041, + "learning_rate": 0.00019881374937798826, + "loss": 2.0737, + "step": 565 + }, + { + "epoch": 0.3258491652274036, + "grad_norm": 20.587425231933594, + "learning_rate": 0.00019877647431132948, + "loss": 1.6823, + "step": 566 + }, + { + "epoch": 0.32642487046632124, + "grad_norm": 12.793438911437988, + "learning_rate": 0.00019873862663588658, + "loss": 2.1764, + "step": 567 + }, + { + "epoch": 0.32700057570523894, + "grad_norm": 3.9023592472076416, + "learning_rate": 0.00019870020659577725, + "loss": 2.3804, + "step": 568 + }, + { + "epoch": 0.3275762809441566, + "grad_norm": 5.434683799743652, + "learning_rate": 0.000198661214438811, + "loss": 2.162, + "step": 569 + }, + { + "epoch": 0.3281519861830743, + "grad_norm": 5.3589019775390625, + "learning_rate": 0.00019862165041648744, + "loss": 2.2068, + "step": 570 + }, + { + "epoch": 0.32872769142199193, + "grad_norm": 5.979888439178467, + "learning_rate": 0.00019858151478399478, + "loss": 1.9811, + "step": 571 + }, + { + "epoch": 0.32930339666090963, + "grad_norm": 10.225967407226562, + "learning_rate": 0.0001985408078002081, + "loss": 1.7766, + "step": 572 + }, + { + "epoch": 0.3298791018998273, + "grad_norm": 7.599292278289795, + "learning_rate": 0.00019849952972768767, + "loss": 1.851, + "step": 573 + }, + { + "epoch": 0.330454807138745, + "grad_norm": 3.455409049987793, + "learning_rate": 0.0001984576808326773, + "loss": 1.9795, + "step": 574 + }, + { + "epoch": 0.3310305123776626, + "grad_norm": 4.577173709869385, + "learning_rate": 0.00019841526138510257, + "loss": 2.1139, + "step": 575 + }, + { + "epoch": 0.3316062176165803, + "grad_norm": 3.0964651107788086, + "learning_rate": 0.00019837227165856922, + "loss": 2.2629, + "step": 576 + }, + { + "epoch": 0.33218192285549797, + "grad_norm": 5.796529293060303, + "learning_rate": 0.0001983287119303612, + "loss": 2.1801, + "step": 577 + }, + { + "epoch": 0.33275762809441567, + "grad_norm": 13.77210521697998, + "learning_rate": 0.00019828458248143913, + "loss": 1.9382, + "step": 578 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 67.12165832519531, + "learning_rate": 0.00019823988359643805, + "loss": 2.0376, + "step": 579 + }, + { + "epoch": 0.333909038572251, + "grad_norm": 18.68467140197754, + "learning_rate": 0.00019819461556366615, + "loss": 2.1364, + "step": 580 + }, + { + "epoch": 0.33448474381116866, + "grad_norm": 3.4618403911590576, + "learning_rate": 0.00019814877867510244, + "loss": 2.4019, + "step": 581 + }, + { + "epoch": 0.33506044905008636, + "grad_norm": 2.982158899307251, + "learning_rate": 0.00019810237322639518, + "loss": 2.2236, + "step": 582 + }, + { + "epoch": 0.335636154289004, + "grad_norm": 18.654964447021484, + "learning_rate": 0.00019805539951685974, + "loss": 2.1278, + "step": 583 + }, + { + "epoch": 0.3362118595279217, + "grad_norm": 16.564912796020508, + "learning_rate": 0.00019800785784947683, + "loss": 1.8014, + "step": 584 + }, + { + "epoch": 0.33678756476683935, + "grad_norm": 31.10694122314453, + "learning_rate": 0.00019795974853089053, + "loss": 1.9206, + "step": 585 + }, + { + "epoch": 0.33736327000575705, + "grad_norm": 22.873422622680664, + "learning_rate": 0.00019791107187140618, + "loss": 2.0762, + "step": 586 + }, + { + "epoch": 0.3379389752446747, + "grad_norm": 4.631661415100098, + "learning_rate": 0.00019786182818498852, + "loss": 1.9247, + "step": 587 + }, + { + "epoch": 0.3385146804835924, + "grad_norm": 8.853531837463379, + "learning_rate": 0.00019781201778925969, + "loss": 1.7691, + "step": 588 + }, + { + "epoch": 0.3390903857225101, + "grad_norm": 10.22801685333252, + "learning_rate": 0.00019776164100549694, + "loss": 2.1087, + "step": 589 + }, + { + "epoch": 0.33966609096142775, + "grad_norm": 11.78855037689209, + "learning_rate": 0.0001977106981586309, + "loss": 2.2109, + "step": 590 + }, + { + "epoch": 0.34024179620034545, + "grad_norm": 9.323369026184082, + "learning_rate": 0.00019765918957724319, + "loss": 2.104, + "step": 591 + }, + { + "epoch": 0.3408175014392631, + "grad_norm": 10.116569519042969, + "learning_rate": 0.00019760711559356449, + "loss": 2.0949, + "step": 592 + }, + { + "epoch": 0.3413932066781808, + "grad_norm": 9.48695182800293, + "learning_rate": 0.00019755447654347226, + "loss": 2.1322, + "step": 593 + }, + { + "epoch": 0.34196891191709844, + "grad_norm": 11.74067497253418, + "learning_rate": 0.00019750127276648872, + "loss": 2.0404, + "step": 594 + }, + { + "epoch": 0.34254461715601614, + "grad_norm": 11.347387313842773, + "learning_rate": 0.00019744750460577856, + "loss": 1.6953, + "step": 595 + }, + { + "epoch": 0.3431203223949338, + "grad_norm": 3.8460686206817627, + "learning_rate": 0.00019739317240814668, + "loss": 2.1369, + "step": 596 + }, + { + "epoch": 0.3436960276338515, + "grad_norm": 3.9272382259368896, + "learning_rate": 0.00019733827652403615, + "loss": 2.1408, + "step": 597 + }, + { + "epoch": 0.34427173287276913, + "grad_norm": 6.900027751922607, + "learning_rate": 0.00019728281730752568, + "loss": 2.1793, + "step": 598 + }, + { + "epoch": 0.34484743811168683, + "grad_norm": 14.802238464355469, + "learning_rate": 0.00019722679511632757, + "loss": 2.0497, + "step": 599 + }, + { + "epoch": 0.3454231433506045, + "grad_norm": 12.431273460388184, + "learning_rate": 0.00019717021031178528, + "loss": 2.1025, + "step": 600 + }, + { + "epoch": 0.3459988485895222, + "grad_norm": 26.413490295410156, + "learning_rate": 0.00019711306325887116, + "loss": 2.0722, + "step": 601 + }, + { + "epoch": 0.3465745538284398, + "grad_norm": 15.21047592163086, + "learning_rate": 0.000197055354326184, + "loss": 1.8285, + "step": 602 + }, + { + "epoch": 0.3471502590673575, + "grad_norm": 4.017702579498291, + "learning_rate": 0.0001969970838859468, + "loss": 2.3358, + "step": 603 + }, + { + "epoch": 0.34772596430627517, + "grad_norm": 4.556800842285156, + "learning_rate": 0.00019693825231400423, + "loss": 2.1526, + "step": 604 + }, + { + "epoch": 0.34830166954519287, + "grad_norm": 8.760571479797363, + "learning_rate": 0.0001968788599898202, + "loss": 2.3013, + "step": 605 + }, + { + "epoch": 0.3488773747841105, + "grad_norm": 4.910373210906982, + "learning_rate": 0.0001968189072964757, + "loss": 2.038, + "step": 606 + }, + { + "epoch": 0.3494530800230282, + "grad_norm": 14.5215425491333, + "learning_rate": 0.00019675839462066582, + "loss": 2.2494, + "step": 607 + }, + { + "epoch": 0.35002878526194586, + "grad_norm": 7.343470573425293, + "learning_rate": 0.00019669732235269775, + "loss": 1.8103, + "step": 608 + }, + { + "epoch": 0.35060449050086356, + "grad_norm": 6.75926399230957, + "learning_rate": 0.00019663569088648796, + "loss": 2.0837, + "step": 609 + }, + { + "epoch": 0.3511801957397812, + "grad_norm": 4.556105136871338, + "learning_rate": 0.0001965735006195598, + "loss": 2.111, + "step": 610 + }, + { + "epoch": 0.3517559009786989, + "grad_norm": 7.004334449768066, + "learning_rate": 0.0001965107519530408, + "loss": 2.1351, + "step": 611 + }, + { + "epoch": 0.35233160621761656, + "grad_norm": 3.7574639320373535, + "learning_rate": 0.00019644744529166025, + "loss": 1.9899, + "step": 612 + }, + { + "epoch": 0.35290731145653426, + "grad_norm": 7.275668144226074, + "learning_rate": 0.0001963835810437465, + "loss": 1.7074, + "step": 613 + }, + { + "epoch": 0.3534830166954519, + "grad_norm": 4.05908727645874, + "learning_rate": 0.00019631915962122436, + "loss": 1.7602, + "step": 614 + }, + { + "epoch": 0.3540587219343696, + "grad_norm": 3.8614981174468994, + "learning_rate": 0.00019625418143961234, + "loss": 1.878, + "step": 615 + }, + { + "epoch": 0.3546344271732873, + "grad_norm": 3.8152644634246826, + "learning_rate": 0.00019618864691802013, + "loss": 2.1187, + "step": 616 + }, + { + "epoch": 0.35521013241220495, + "grad_norm": 7.1591668128967285, + "learning_rate": 0.00019612255647914574, + "loss": 1.889, + "step": 617 + }, + { + "epoch": 0.35578583765112265, + "grad_norm": 13.686311721801758, + "learning_rate": 0.00019605591054927294, + "loss": 1.8415, + "step": 618 + }, + { + "epoch": 0.3563615428900403, + "grad_norm": 4.854591369628906, + "learning_rate": 0.00019598870955826828, + "loss": 2.2113, + "step": 619 + }, + { + "epoch": 0.356937248128958, + "grad_norm": 8.912299156188965, + "learning_rate": 0.00019592095393957868, + "loss": 1.7633, + "step": 620 + }, + { + "epoch": 0.35751295336787564, + "grad_norm": 3.053098678588867, + "learning_rate": 0.00019585264413022818, + "loss": 1.9866, + "step": 621 + }, + { + "epoch": 0.35808865860679334, + "grad_norm": 22.903722763061523, + "learning_rate": 0.0001957837805708155, + "loss": 2.0566, + "step": 622 + }, + { + "epoch": 0.358664363845711, + "grad_norm": 2.5533032417297363, + "learning_rate": 0.000195714363705511, + "loss": 2.0484, + "step": 623 + }, + { + "epoch": 0.3592400690846287, + "grad_norm": 4.685166358947754, + "learning_rate": 0.00019564439398205388, + "loss": 1.9809, + "step": 624 + }, + { + "epoch": 0.35981577432354633, + "grad_norm": 6.896650314331055, + "learning_rate": 0.00019557387185174924, + "loss": 1.9147, + "step": 625 + }, + { + "epoch": 0.36039147956246403, + "grad_norm": 3.927499532699585, + "learning_rate": 0.00019550279776946525, + "loss": 1.8356, + "step": 626 + }, + { + "epoch": 0.3609671848013817, + "grad_norm": 2.7331018447875977, + "learning_rate": 0.00019543117219363016, + "loss": 1.9191, + "step": 627 + }, + { + "epoch": 0.3615428900402994, + "grad_norm": 6.016903400421143, + "learning_rate": 0.0001953589955862294, + "loss": 2.0919, + "step": 628 + }, + { + "epoch": 0.362118595279217, + "grad_norm": 5.431999683380127, + "learning_rate": 0.00019528626841280246, + "loss": 1.5794, + "step": 629 + }, + { + "epoch": 0.3626943005181347, + "grad_norm": 4.026440620422363, + "learning_rate": 0.00019521299114244004, + "loss": 2.2844, + "step": 630 + }, + { + "epoch": 0.36327000575705237, + "grad_norm": 7.0267863273620605, + "learning_rate": 0.00019513916424778097, + "loss": 1.8249, + "step": 631 + }, + { + "epoch": 0.3638457109959701, + "grad_norm": 4.048052787780762, + "learning_rate": 0.00019506478820500918, + "loss": 2.1139, + "step": 632 + }, + { + "epoch": 0.3644214162348877, + "grad_norm": 3.6997714042663574, + "learning_rate": 0.0001949898634938506, + "loss": 1.6588, + "step": 633 + }, + { + "epoch": 0.3649971214738054, + "grad_norm": 2.560260772705078, + "learning_rate": 0.00019491439059757002, + "loss": 1.9762, + "step": 634 + }, + { + "epoch": 0.36557282671272306, + "grad_norm": 17.04377555847168, + "learning_rate": 0.00019483837000296806, + "loss": 1.7949, + "step": 635 + }, + { + "epoch": 0.36614853195164077, + "grad_norm": 2.873385190963745, + "learning_rate": 0.00019476180220037807, + "loss": 1.9637, + "step": 636 + }, + { + "epoch": 0.3667242371905584, + "grad_norm": 7.77009391784668, + "learning_rate": 0.00019468468768366276, + "loss": 1.8636, + "step": 637 + }, + { + "epoch": 0.3672999424294761, + "grad_norm": 6.612690448760986, + "learning_rate": 0.00019460702695021123, + "loss": 1.734, + "step": 638 + }, + { + "epoch": 0.36787564766839376, + "grad_norm": 4.485565185546875, + "learning_rate": 0.0001945288205009357, + "loss": 1.6968, + "step": 639 + }, + { + "epoch": 0.36845135290731146, + "grad_norm": 2.9968698024749756, + "learning_rate": 0.0001944500688402682, + "loss": 1.8785, + "step": 640 + }, + { + "epoch": 0.3690270581462291, + "grad_norm": 4.952812194824219, + "learning_rate": 0.00019437077247615747, + "loss": 1.7285, + "step": 641 + }, + { + "epoch": 0.3696027633851468, + "grad_norm": 5.491662502288818, + "learning_rate": 0.00019429093192006543, + "loss": 1.6328, + "step": 642 + }, + { + "epoch": 0.3701784686240645, + "grad_norm": 3.8749518394470215, + "learning_rate": 0.00019421054768696422, + "loss": 2.2014, + "step": 643 + }, + { + "epoch": 0.37075417386298215, + "grad_norm": 6.7762627601623535, + "learning_rate": 0.0001941296202953326, + "loss": 2.0592, + "step": 644 + }, + { + "epoch": 0.37132987910189985, + "grad_norm": 3.307373046875, + "learning_rate": 0.00019404815026715267, + "loss": 2.3178, + "step": 645 + }, + { + "epoch": 0.3719055843408175, + "grad_norm": 3.4404947757720947, + "learning_rate": 0.00019396613812790666, + "loss": 2.3772, + "step": 646 + }, + { + "epoch": 0.3724812895797352, + "grad_norm": 4.453017711639404, + "learning_rate": 0.00019388358440657332, + "loss": 1.8855, + "step": 647 + }, + { + "epoch": 0.37305699481865284, + "grad_norm": 3.175896406173706, + "learning_rate": 0.00019380048963562466, + "loss": 1.8799, + "step": 648 + }, + { + "epoch": 0.37363270005757054, + "grad_norm": 22.680187225341797, + "learning_rate": 0.0001937168543510224, + "loss": 1.4755, + "step": 649 + }, + { + "epoch": 0.3742084052964882, + "grad_norm": 14.353453636169434, + "learning_rate": 0.00019363267909221468, + "loss": 1.9199, + "step": 650 + }, + { + "epoch": 0.3747841105354059, + "grad_norm": 3.4111273288726807, + "learning_rate": 0.00019354796440213237, + "loss": 2.0271, + "step": 651 + }, + { + "epoch": 0.37535981577432354, + "grad_norm": 2.5413737297058105, + "learning_rate": 0.00019346271082718575, + "loss": 2.0859, + "step": 652 + }, + { + "epoch": 0.37593552101324124, + "grad_norm": 2.3607568740844727, + "learning_rate": 0.00019337691891726087, + "loss": 2.1843, + "step": 653 + }, + { + "epoch": 0.3765112262521589, + "grad_norm": 6.652899265289307, + "learning_rate": 0.00019329058922571608, + "loss": 2.2823, + "step": 654 + }, + { + "epoch": 0.3770869314910766, + "grad_norm": 6.970069885253906, + "learning_rate": 0.00019320372230937835, + "loss": 2.0684, + "step": 655 + }, + { + "epoch": 0.3776626367299942, + "grad_norm": 6.668658256530762, + "learning_rate": 0.00019311631872853983, + "loss": 1.4474, + "step": 656 + }, + { + "epoch": 0.37823834196891193, + "grad_norm": 16.344221115112305, + "learning_rate": 0.00019302837904695418, + "loss": 1.958, + "step": 657 + }, + { + "epoch": 0.3788140472078296, + "grad_norm": 11.7747163772583, + "learning_rate": 0.00019293990383183277, + "loss": 1.9691, + "step": 658 + }, + { + "epoch": 0.3793897524467473, + "grad_norm": 3.1886119842529297, + "learning_rate": 0.00019285089365384138, + "loss": 2.0175, + "step": 659 + }, + { + "epoch": 0.3799654576856649, + "grad_norm": 7.979596138000488, + "learning_rate": 0.00019276134908709607, + "loss": 1.8705, + "step": 660 + }, + { + "epoch": 0.3805411629245826, + "grad_norm": 3.5675251483917236, + "learning_rate": 0.0001926712707091599, + "loss": 1.754, + "step": 661 + }, + { + "epoch": 0.38111686816350027, + "grad_norm": 2.696560859680176, + "learning_rate": 0.00019258065910103886, + "loss": 2.0815, + "step": 662 + }, + { + "epoch": 0.38169257340241797, + "grad_norm": 3.2219901084899902, + "learning_rate": 0.0001924895148471785, + "loss": 1.9866, + "step": 663 + }, + { + "epoch": 0.3822682786413356, + "grad_norm": 4.226520538330078, + "learning_rate": 0.00019239783853545962, + "loss": 1.848, + "step": 664 + }, + { + "epoch": 0.3828439838802533, + "grad_norm": 3.089921474456787, + "learning_rate": 0.00019230563075719513, + "loss": 2.3039, + "step": 665 + }, + { + "epoch": 0.38341968911917096, + "grad_norm": 3.959472179412842, + "learning_rate": 0.00019221289210712562, + "loss": 1.902, + "step": 666 + }, + { + "epoch": 0.38399539435808866, + "grad_norm": 9.597257614135742, + "learning_rate": 0.000192119623183416, + "loss": 1.9769, + "step": 667 + }, + { + "epoch": 0.3845710995970063, + "grad_norm": 15.12406063079834, + "learning_rate": 0.00019202582458765138, + "loss": 1.985, + "step": 668 + }, + { + "epoch": 0.385146804835924, + "grad_norm": 3.456636428833008, + "learning_rate": 0.00019193149692483326, + "loss": 1.6099, + "step": 669 + }, + { + "epoch": 0.3857225100748417, + "grad_norm": 2.650001287460327, + "learning_rate": 0.00019183664080337556, + "loss": 2.4192, + "step": 670 + }, + { + "epoch": 0.38629821531375935, + "grad_norm": 2.6241097450256348, + "learning_rate": 0.00019174125683510092, + "loss": 2.3614, + "step": 671 + }, + { + "epoch": 0.38687392055267705, + "grad_norm": 5.434971332550049, + "learning_rate": 0.00019164534563523641, + "loss": 1.782, + "step": 672 + }, + { + "epoch": 0.3874496257915947, + "grad_norm": 2.1434805393218994, + "learning_rate": 0.0001915489078224099, + "loss": 2.0285, + "step": 673 + }, + { + "epoch": 0.3880253310305124, + "grad_norm": 3.9473743438720703, + "learning_rate": 0.00019145194401864581, + "loss": 2.137, + "step": 674 + }, + { + "epoch": 0.38860103626943004, + "grad_norm": 3.913604259490967, + "learning_rate": 0.00019135445484936127, + "loss": 1.7514, + "step": 675 + }, + { + "epoch": 0.38917674150834775, + "grad_norm": 4.43229341506958, + "learning_rate": 0.000191256440943362, + "loss": 1.6698, + "step": 676 + }, + { + "epoch": 0.3897524467472654, + "grad_norm": 3.1151657104492188, + "learning_rate": 0.00019115790293283827, + "loss": 2.2421, + "step": 677 + }, + { + "epoch": 0.3903281519861831, + "grad_norm": 3.7743000984191895, + "learning_rate": 0.00019105884145336085, + "loss": 1.8634, + "step": 678 + }, + { + "epoch": 0.39090385722510074, + "grad_norm": 2.8315064907073975, + "learning_rate": 0.00019095925714387682, + "loss": 1.6003, + "step": 679 + }, + { + "epoch": 0.39147956246401844, + "grad_norm": 13.85084342956543, + "learning_rate": 0.00019085915064670557, + "loss": 1.9885, + "step": 680 + }, + { + "epoch": 0.3920552677029361, + "grad_norm": 8.601191520690918, + "learning_rate": 0.00019075852260753463, + "loss": 1.8575, + "step": 681 + }, + { + "epoch": 0.3926309729418538, + "grad_norm": 2.420215606689453, + "learning_rate": 0.00019065737367541545, + "loss": 2.0188, + "step": 682 + }, + { + "epoch": 0.39320667818077143, + "grad_norm": 9.427199363708496, + "learning_rate": 0.0001905557045027592, + "loss": 1.8087, + "step": 683 + }, + { + "epoch": 0.39378238341968913, + "grad_norm": 2.8770227432250977, + "learning_rate": 0.00019045351574533274, + "loss": 1.9231, + "step": 684 + }, + { + "epoch": 0.3943580886586068, + "grad_norm": 5.514732837677002, + "learning_rate": 0.00019035080806225404, + "loss": 1.8629, + "step": 685 + }, + { + "epoch": 0.3949337938975245, + "grad_norm": 5.505938529968262, + "learning_rate": 0.00019024758211598833, + "loss": 2.0178, + "step": 686 + }, + { + "epoch": 0.3955094991364421, + "grad_norm": 4.23834753036499, + "learning_rate": 0.00019014383857234355, + "loss": 1.5748, + "step": 687 + }, + { + "epoch": 0.3960852043753598, + "grad_norm": 4.225786209106445, + "learning_rate": 0.00019003957810046615, + "loss": 1.8404, + "step": 688 + }, + { + "epoch": 0.39666090961427747, + "grad_norm": 10.373225212097168, + "learning_rate": 0.00018993480137283685, + "loss": 1.9054, + "step": 689 + }, + { + "epoch": 0.39723661485319517, + "grad_norm": 5.083978652954102, + "learning_rate": 0.00018982950906526615, + "loss": 1.8938, + "step": 690 + }, + { + "epoch": 0.3978123200921128, + "grad_norm": 4.478074073791504, + "learning_rate": 0.00018972370185689, + "loss": 1.9073, + "step": 691 + }, + { + "epoch": 0.3983880253310305, + "grad_norm": 3.557939052581787, + "learning_rate": 0.00018961738043016556, + "loss": 1.847, + "step": 692 + }, + { + "epoch": 0.39896373056994816, + "grad_norm": 4.705951690673828, + "learning_rate": 0.00018951054547086666, + "loss": 1.7451, + "step": 693 + }, + { + "epoch": 0.39953943580886586, + "grad_norm": 2.8145923614501953, + "learning_rate": 0.00018940319766807943, + "loss": 2.054, + "step": 694 + }, + { + "epoch": 0.40011514104778356, + "grad_norm": 10.403020858764648, + "learning_rate": 0.00018929533771419783, + "loss": 1.8062, + "step": 695 + }, + { + "epoch": 0.4006908462867012, + "grad_norm": 5.947159290313721, + "learning_rate": 0.00018918696630491915, + "loss": 1.8459, + "step": 696 + }, + { + "epoch": 0.4012665515256189, + "grad_norm": 3.2348601818084717, + "learning_rate": 0.00018907808413923968, + "loss": 1.815, + "step": 697 + }, + { + "epoch": 0.40184225676453655, + "grad_norm": 8.500833511352539, + "learning_rate": 0.00018896869191945, + "loss": 1.535, + "step": 698 + }, + { + "epoch": 0.40241796200345425, + "grad_norm": 5.008488178253174, + "learning_rate": 0.0001888587903511306, + "loss": 2.3482, + "step": 699 + }, + { + "epoch": 0.4029936672423719, + "grad_norm": 2.739643096923828, + "learning_rate": 0.00018874838014314724, + "loss": 2.284, + "step": 700 + }, + { + "epoch": 0.4035693724812896, + "grad_norm": 10.596121788024902, + "learning_rate": 0.0001886374620076464, + "loss": 1.6078, + "step": 701 + }, + { + "epoch": 0.40414507772020725, + "grad_norm": 2.6627824306488037, + "learning_rate": 0.00018852603666005073, + "loss": 1.7632, + "step": 702 + }, + { + "epoch": 0.40472078295912495, + "grad_norm": 4.91156530380249, + "learning_rate": 0.00018841410481905434, + "loss": 1.8726, + "step": 703 + }, + { + "epoch": 0.4052964881980426, + "grad_norm": 3.1350510120391846, + "learning_rate": 0.0001883016672066183, + "loss": 2.0387, + "step": 704 + }, + { + "epoch": 0.4058721934369603, + "grad_norm": 16.92197036743164, + "learning_rate": 0.0001881887245479659, + "loss": 2.1191, + "step": 705 + }, + { + "epoch": 0.40644789867587794, + "grad_norm": 3.093526840209961, + "learning_rate": 0.00018807527757157787, + "loss": 2.0547, + "step": 706 + }, + { + "epoch": 0.40702360391479564, + "grad_norm": 4.2745680809021, + "learning_rate": 0.00018796132700918793, + "loss": 1.8785, + "step": 707 + }, + { + "epoch": 0.4075993091537133, + "grad_norm": 3.898991107940674, + "learning_rate": 0.00018784687359577791, + "loss": 1.6114, + "step": 708 + }, + { + "epoch": 0.408175014392631, + "grad_norm": 2.8433330059051514, + "learning_rate": 0.00018773191806957298, + "loss": 2.0685, + "step": 709 + }, + { + "epoch": 0.40875071963154863, + "grad_norm": 16.428258895874023, + "learning_rate": 0.00018761646117203696, + "loss": 1.8752, + "step": 710 + }, + { + "epoch": 0.40932642487046633, + "grad_norm": 4.984866142272949, + "learning_rate": 0.0001875005036478675, + "loss": 2.0355, + "step": 711 + }, + { + "epoch": 0.409902130109384, + "grad_norm": 3.2980496883392334, + "learning_rate": 0.00018738404624499136, + "loss": 1.8788, + "step": 712 + }, + { + "epoch": 0.4104778353483017, + "grad_norm": 2.58404541015625, + "learning_rate": 0.00018726708971455945, + "loss": 2.0856, + "step": 713 + }, + { + "epoch": 0.4110535405872193, + "grad_norm": 22.611204147338867, + "learning_rate": 0.00018714963481094207, + "loss": 1.7451, + "step": 714 + }, + { + "epoch": 0.411629245826137, + "grad_norm": 3.256192445755005, + "learning_rate": 0.0001870316822917241, + "loss": 1.7017, + "step": 715 + }, + { + "epoch": 0.41220495106505467, + "grad_norm": 3.7287187576293945, + "learning_rate": 0.00018691323291769992, + "loss": 1.5585, + "step": 716 + }, + { + "epoch": 0.41278065630397237, + "grad_norm": 4.698144912719727, + "learning_rate": 0.00018679428745286872, + "loss": 1.8391, + "step": 717 + }, + { + "epoch": 0.41335636154289, + "grad_norm": 8.514996528625488, + "learning_rate": 0.00018667484666442944, + "loss": 1.7789, + "step": 718 + }, + { + "epoch": 0.4139320667818077, + "grad_norm": 7.140060901641846, + "learning_rate": 0.00018655491132277589, + "loss": 1.894, + "step": 719 + }, + { + "epoch": 0.41450777202072536, + "grad_norm": 3.8144354820251465, + "learning_rate": 0.00018643448220149173, + "loss": 1.7206, + "step": 720 + }, + { + "epoch": 0.41508347725964306, + "grad_norm": 18.383852005004883, + "learning_rate": 0.0001863135600773455, + "loss": 1.9508, + "step": 721 + }, + { + "epoch": 0.41565918249856076, + "grad_norm": 4.58454704284668, + "learning_rate": 0.00018619214573028562, + "loss": 1.9023, + "step": 722 + }, + { + "epoch": 0.4162348877374784, + "grad_norm": 2.33950138092041, + "learning_rate": 0.00018607023994343533, + "loss": 1.9343, + "step": 723 + }, + { + "epoch": 0.4168105929763961, + "grad_norm": 3.2266829013824463, + "learning_rate": 0.0001859478435030877, + "loss": 2.0175, + "step": 724 + }, + { + "epoch": 0.41738629821531376, + "grad_norm": 2.7907965183258057, + "learning_rate": 0.00018582495719870047, + "loss": 1.9857, + "step": 725 + }, + { + "epoch": 0.41796200345423146, + "grad_norm": 9.727225303649902, + "learning_rate": 0.00018570158182289103, + "loss": 1.7811, + "step": 726 + }, + { + "epoch": 0.4185377086931491, + "grad_norm": 8.029142379760742, + "learning_rate": 0.00018557771817143132, + "loss": 2.1232, + "step": 727 + }, + { + "epoch": 0.4191134139320668, + "grad_norm": 17.49789810180664, + "learning_rate": 0.0001854533670432426, + "loss": 1.8807, + "step": 728 + }, + { + "epoch": 0.41968911917098445, + "grad_norm": 2.434882402420044, + "learning_rate": 0.00018532852924039035, + "loss": 1.839, + "step": 729 + }, + { + "epoch": 0.42026482440990215, + "grad_norm": 4.167110919952393, + "learning_rate": 0.0001852032055680792, + "loss": 1.803, + "step": 730 + }, + { + "epoch": 0.4208405296488198, + "grad_norm": 2.675874948501587, + "learning_rate": 0.00018507739683464752, + "loss": 2.3544, + "step": 731 + }, + { + "epoch": 0.4214162348877375, + "grad_norm": 3.3444297313690186, + "learning_rate": 0.00018495110385156237, + "loss": 2.0448, + "step": 732 + }, + { + "epoch": 0.42199194012665514, + "grad_norm": 4.82088041305542, + "learning_rate": 0.00018482432743341433, + "loss": 2.0651, + "step": 733 + }, + { + "epoch": 0.42256764536557284, + "grad_norm": 2.519653081893921, + "learning_rate": 0.000184697068397912, + "loss": 2.0949, + "step": 734 + }, + { + "epoch": 0.4231433506044905, + "grad_norm": 5.350189685821533, + "learning_rate": 0.0001845693275658769, + "loss": 1.7065, + "step": 735 + }, + { + "epoch": 0.4237190558434082, + "grad_norm": 3.690128803253174, + "learning_rate": 0.00018444110576123812, + "loss": 1.6012, + "step": 736 + }, + { + "epoch": 0.42429476108232583, + "grad_norm": 3.2592711448669434, + "learning_rate": 0.00018431240381102713, + "loss": 1.956, + "step": 737 + }, + { + "epoch": 0.42487046632124353, + "grad_norm": 8.19218921661377, + "learning_rate": 0.0001841832225453722, + "loss": 1.6416, + "step": 738 + }, + { + "epoch": 0.4254461715601612, + "grad_norm": 2.3770790100097656, + "learning_rate": 0.0001840535627974933, + "loss": 2.1405, + "step": 739 + }, + { + "epoch": 0.4260218767990789, + "grad_norm": 4.077558994293213, + "learning_rate": 0.00018392342540369657, + "loss": 1.5653, + "step": 740 + }, + { + "epoch": 0.4265975820379965, + "grad_norm": 2.6370084285736084, + "learning_rate": 0.00018379281120336897, + "loss": 1.768, + "step": 741 + }, + { + "epoch": 0.4271732872769142, + "grad_norm": 2.450295925140381, + "learning_rate": 0.00018366172103897283, + "loss": 1.7358, + "step": 742 + }, + { + "epoch": 0.42774899251583187, + "grad_norm": 5.39324951171875, + "learning_rate": 0.00018353015575604052, + "loss": 2.0319, + "step": 743 + }, + { + "epoch": 0.4283246977547496, + "grad_norm": 2.238994836807251, + "learning_rate": 0.0001833981162031689, + "loss": 2.1476, + "step": 744 + }, + { + "epoch": 0.4289004029936672, + "grad_norm": 2.7961604595184326, + "learning_rate": 0.00018326560323201382, + "loss": 1.937, + "step": 745 + }, + { + "epoch": 0.4294761082325849, + "grad_norm": 3.0251359939575195, + "learning_rate": 0.00018313261769728478, + "loss": 1.387, + "step": 746 + }, + { + "epoch": 0.43005181347150256, + "grad_norm": 12.313629150390625, + "learning_rate": 0.00018299916045673922, + "loss": 1.858, + "step": 747 + }, + { + "epoch": 0.43062751871042027, + "grad_norm": 2.7766458988189697, + "learning_rate": 0.00018286523237117717, + "loss": 1.7907, + "step": 748 + }, + { + "epoch": 0.43120322394933797, + "grad_norm": 4.888232707977295, + "learning_rate": 0.00018273083430443555, + "loss": 2.0781, + "step": 749 + }, + { + "epoch": 0.4317789291882556, + "grad_norm": 3.5016186237335205, + "learning_rate": 0.00018259596712338268, + "loss": 1.9422, + "step": 750 + }, + { + "epoch": 0.4323546344271733, + "grad_norm": 2.471903085708618, + "learning_rate": 0.00018246063169791269, + "loss": 1.4743, + "step": 751 + }, + { + "epoch": 0.43293033966609096, + "grad_norm": 5.355947494506836, + "learning_rate": 0.0001823248289009399, + "loss": 1.7892, + "step": 752 + }, + { + "epoch": 0.43350604490500866, + "grad_norm": 30.411474227905273, + "learning_rate": 0.00018218855960839308, + "loss": 2.1997, + "step": 753 + }, + { + "epoch": 0.4340817501439263, + "grad_norm": 22.769906997680664, + "learning_rate": 0.00018205182469921001, + "loss": 2.2114, + "step": 754 + }, + { + "epoch": 0.434657455382844, + "grad_norm": 2.85595703125, + "learning_rate": 0.00018191462505533172, + "loss": 1.6781, + "step": 755 + }, + { + "epoch": 0.43523316062176165, + "grad_norm": 13.515556335449219, + "learning_rate": 0.00018177696156169664, + "loss": 1.8627, + "step": 756 + }, + { + "epoch": 0.43580886586067935, + "grad_norm": 2.449556589126587, + "learning_rate": 0.00018163883510623514, + "loss": 1.9787, + "step": 757 + }, + { + "epoch": 0.436384571099597, + "grad_norm": 3.347914457321167, + "learning_rate": 0.00018150024657986373, + "loss": 1.6179, + "step": 758 + }, + { + "epoch": 0.4369602763385147, + "grad_norm": 2.167959451675415, + "learning_rate": 0.00018136119687647912, + "loss": 2.2641, + "step": 759 + }, + { + "epoch": 0.43753598157743234, + "grad_norm": 1.9680399894714355, + "learning_rate": 0.00018122168689295283, + "loss": 2.1557, + "step": 760 + }, + { + "epoch": 0.43811168681635004, + "grad_norm": 1.9507166147232056, + "learning_rate": 0.000181081717529125, + "loss": 1.9336, + "step": 761 + }, + { + "epoch": 0.4386873920552677, + "grad_norm": 10.30239200592041, + "learning_rate": 0.0001809412896877989, + "loss": 1.716, + "step": 762 + }, + { + "epoch": 0.4392630972941854, + "grad_norm": 17.331459045410156, + "learning_rate": 0.0001808004042747349, + "loss": 1.7709, + "step": 763 + }, + { + "epoch": 0.43983880253310303, + "grad_norm": 8.791406631469727, + "learning_rate": 0.00018065906219864476, + "loss": 1.8645, + "step": 764 + }, + { + "epoch": 0.44041450777202074, + "grad_norm": 14.691398620605469, + "learning_rate": 0.0001805172643711857, + "loss": 1.732, + "step": 765 + }, + { + "epoch": 0.4409902130109384, + "grad_norm": 8.738691329956055, + "learning_rate": 0.00018037501170695459, + "loss": 1.8119, + "step": 766 + }, + { + "epoch": 0.4415659182498561, + "grad_norm": 3.6133551597595215, + "learning_rate": 0.00018023230512348193, + "loss": 2.2517, + "step": 767 + }, + { + "epoch": 0.4421416234887737, + "grad_norm": 4.219852447509766, + "learning_rate": 0.00018008914554122597, + "loss": 1.9683, + "step": 768 + }, + { + "epoch": 0.44271732872769143, + "grad_norm": 6.014896869659424, + "learning_rate": 0.00017994553388356695, + "loss": 1.6415, + "step": 769 + }, + { + "epoch": 0.4432930339666091, + "grad_norm": 5.801370620727539, + "learning_rate": 0.00017980147107680083, + "loss": 1.8638, + "step": 770 + }, + { + "epoch": 0.4438687392055268, + "grad_norm": 3.9522225856781006, + "learning_rate": 0.00017965695805013365, + "loss": 2.1045, + "step": 771 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 2.9866466522216797, + "learning_rate": 0.00017951199573567524, + "loss": 1.9505, + "step": 772 + }, + { + "epoch": 0.4450201496833621, + "grad_norm": 3.0734667778015137, + "learning_rate": 0.00017936658506843335, + "loss": 2.2168, + "step": 773 + }, + { + "epoch": 0.44559585492227977, + "grad_norm": 3.1300957202911377, + "learning_rate": 0.00017922072698630772, + "loss": 1.8417, + "step": 774 + }, + { + "epoch": 0.44617156016119747, + "grad_norm": 2.1265501976013184, + "learning_rate": 0.00017907442243008382, + "loss": 2.0039, + "step": 775 + }, + { + "epoch": 0.44674726540011517, + "grad_norm": 2.7996795177459717, + "learning_rate": 0.00017892767234342684, + "loss": 1.8003, + "step": 776 + }, + { + "epoch": 0.4473229706390328, + "grad_norm": 6.149320602416992, + "learning_rate": 0.00017878047767287577, + "loss": 1.7551, + "step": 777 + }, + { + "epoch": 0.4478986758779505, + "grad_norm": 2.3379878997802734, + "learning_rate": 0.00017863283936783708, + "loss": 1.9938, + "step": 778 + }, + { + "epoch": 0.44847438111686816, + "grad_norm": 3.4820477962493896, + "learning_rate": 0.00017848475838057873, + "loss": 1.5153, + "step": 779 + }, + { + "epoch": 0.44905008635578586, + "grad_norm": 3.582059860229492, + "learning_rate": 0.00017833623566622397, + "loss": 1.9166, + "step": 780 + }, + { + "epoch": 0.4496257915947035, + "grad_norm": 3.9701995849609375, + "learning_rate": 0.00017818727218274513, + "loss": 1.4762, + "step": 781 + }, + { + "epoch": 0.4502014968336212, + "grad_norm": 2.0633654594421387, + "learning_rate": 0.00017803786889095764, + "loss": 2.1013, + "step": 782 + }, + { + "epoch": 0.45077720207253885, + "grad_norm": 2.459294080734253, + "learning_rate": 0.00017788802675451352, + "loss": 1.8575, + "step": 783 + }, + { + "epoch": 0.45135290731145655, + "grad_norm": 5.101840496063232, + "learning_rate": 0.00017773774673989553, + "loss": 1.4693, + "step": 784 + }, + { + "epoch": 0.4519286125503742, + "grad_norm": 2.827829360961914, + "learning_rate": 0.0001775870298164106, + "loss": 2.1377, + "step": 785 + }, + { + "epoch": 0.4525043177892919, + "grad_norm": 2.8199522495269775, + "learning_rate": 0.0001774358769561838, + "loss": 1.9839, + "step": 786 + }, + { + "epoch": 0.45308002302820954, + "grad_norm": 3.852107286453247, + "learning_rate": 0.00017728428913415192, + "loss": 1.7629, + "step": 787 + }, + { + "epoch": 0.45365572826712725, + "grad_norm": 5.424905776977539, + "learning_rate": 0.00017713226732805738, + "loss": 1.8626, + "step": 788 + }, + { + "epoch": 0.4542314335060449, + "grad_norm": 8.507397651672363, + "learning_rate": 0.0001769798125184417, + "loss": 2.0023, + "step": 789 + }, + { + "epoch": 0.4548071387449626, + "grad_norm": 2.348231792449951, + "learning_rate": 0.00017682692568863926, + "loss": 1.8687, + "step": 790 + }, + { + "epoch": 0.45538284398388024, + "grad_norm": 16.692827224731445, + "learning_rate": 0.00017667360782477106, + "loss": 2.0102, + "step": 791 + }, + { + "epoch": 0.45595854922279794, + "grad_norm": 3.0447139739990234, + "learning_rate": 0.00017651985991573826, + "loss": 1.867, + "step": 792 + }, + { + "epoch": 0.4565342544617156, + "grad_norm": 2.9347379207611084, + "learning_rate": 0.00017636568295321573, + "loss": 1.6961, + "step": 793 + }, + { + "epoch": 0.4571099597006333, + "grad_norm": 6.137345790863037, + "learning_rate": 0.00017621107793164582, + "loss": 1.5024, + "step": 794 + }, + { + "epoch": 0.45768566493955093, + "grad_norm": 1.9799631834030151, + "learning_rate": 0.0001760560458482318, + "loss": 1.9756, + "step": 795 + }, + { + "epoch": 0.45826137017846863, + "grad_norm": 10.146332740783691, + "learning_rate": 0.00017590058770293156, + "loss": 1.8236, + "step": 796 + }, + { + "epoch": 0.4588370754173863, + "grad_norm": 2.7647223472595215, + "learning_rate": 0.00017574470449845103, + "loss": 1.9874, + "step": 797 + }, + { + "epoch": 0.459412780656304, + "grad_norm": 2.6900711059570312, + "learning_rate": 0.00017558839724023781, + "loss": 1.9816, + "step": 798 + }, + { + "epoch": 0.4599884858952216, + "grad_norm": 2.0507872104644775, + "learning_rate": 0.00017543166693647467, + "loss": 2.1267, + "step": 799 + }, + { + "epoch": 0.4605641911341393, + "grad_norm": 5.527276515960693, + "learning_rate": 0.00017527451459807292, + "loss": 1.9468, + "step": 800 + }, + { + "epoch": 0.46113989637305697, + "grad_norm": 8.115386009216309, + "learning_rate": 0.00017511694123866615, + "loss": 1.6804, + "step": 801 + }, + { + "epoch": 0.46171560161197467, + "grad_norm": 2.108691692352295, + "learning_rate": 0.0001749589478746034, + "loss": 1.5739, + "step": 802 + }, + { + "epoch": 0.46229130685089237, + "grad_norm": 15.620509147644043, + "learning_rate": 0.00017480053552494288, + "loss": 1.7336, + "step": 803 + }, + { + "epoch": 0.46286701208981, + "grad_norm": 3.442645788192749, + "learning_rate": 0.00017464170521144508, + "loss": 1.7112, + "step": 804 + }, + { + "epoch": 0.4634427173287277, + "grad_norm": 7.99544095993042, + "learning_rate": 0.0001744824579585665, + "loss": 1.447, + "step": 805 + }, + { + "epoch": 0.46401842256764536, + "grad_norm": 7.0831217765808105, + "learning_rate": 0.0001743227947934529, + "loss": 1.8631, + "step": 806 + }, + { + "epoch": 0.46459412780656306, + "grad_norm": 6.996182441711426, + "learning_rate": 0.0001741627167459326, + "loss": 1.6913, + "step": 807 + }, + { + "epoch": 0.4651698330454807, + "grad_norm": 2.5042760372161865, + "learning_rate": 0.00017400222484851001, + "loss": 1.8691, + "step": 808 + }, + { + "epoch": 0.4657455382843984, + "grad_norm": 3.7604565620422363, + "learning_rate": 0.00017384132013635874, + "loss": 2.0776, + "step": 809 + }, + { + "epoch": 0.46632124352331605, + "grad_norm": 4.427332878112793, + "learning_rate": 0.00017368000364731517, + "loss": 1.879, + "step": 810 + }, + { + "epoch": 0.46689694876223375, + "grad_norm": 3.7155957221984863, + "learning_rate": 0.0001735182764218716, + "loss": 1.9187, + "step": 811 + }, + { + "epoch": 0.4674726540011514, + "grad_norm": 3.7291319370269775, + "learning_rate": 0.00017335613950316962, + "loss": 1.5446, + "step": 812 + }, + { + "epoch": 0.4680483592400691, + "grad_norm": 5.578220367431641, + "learning_rate": 0.0001731935939369933, + "loss": 1.9119, + "step": 813 + }, + { + "epoch": 0.46862406447898675, + "grad_norm": 3.377821207046509, + "learning_rate": 0.00017303064077176246, + "loss": 2.0695, + "step": 814 + }, + { + "epoch": 0.46919976971790445, + "grad_norm": 2.6164028644561768, + "learning_rate": 0.000172867281058526, + "loss": 1.8049, + "step": 815 + }, + { + "epoch": 0.4697754749568221, + "grad_norm": 13.39267635345459, + "learning_rate": 0.00017270351585095507, + "loss": 1.45, + "step": 816 + }, + { + "epoch": 0.4703511801957398, + "grad_norm": 2.8776485919952393, + "learning_rate": 0.00017253934620533625, + "loss": 1.9204, + "step": 817 + }, + { + "epoch": 0.47092688543465744, + "grad_norm": 3.7855734825134277, + "learning_rate": 0.00017237477318056462, + "loss": 1.2213, + "step": 818 + }, + { + "epoch": 0.47150259067357514, + "grad_norm": 2.4604861736297607, + "learning_rate": 0.00017220979783813724, + "loss": 1.6834, + "step": 819 + }, + { + "epoch": 0.4720782959124928, + "grad_norm": 3.9738218784332275, + "learning_rate": 0.00017204442124214603, + "loss": 2.0102, + "step": 820 + }, + { + "epoch": 0.4726540011514105, + "grad_norm": 53.47602462768555, + "learning_rate": 0.00017187864445927103, + "loss": 1.9759, + "step": 821 + }, + { + "epoch": 0.47322970639032813, + "grad_norm": 9.791850090026855, + "learning_rate": 0.0001717124685587734, + "loss": 1.5549, + "step": 822 + }, + { + "epoch": 0.47380541162924583, + "grad_norm": 3.8684937953948975, + "learning_rate": 0.00017154589461248877, + "loss": 1.5908, + "step": 823 + }, + { + "epoch": 0.4743811168681635, + "grad_norm": 24.494279861450195, + "learning_rate": 0.00017137892369482004, + "loss": 1.6402, + "step": 824 + }, + { + "epoch": 0.4749568221070812, + "grad_norm": 3.5959551334381104, + "learning_rate": 0.00017121155688273057, + "loss": 1.6951, + "step": 825 + }, + { + "epoch": 0.4755325273459988, + "grad_norm": 5.078423976898193, + "learning_rate": 0.00017104379525573738, + "loss": 1.6469, + "step": 826 + }, + { + "epoch": 0.4761082325849165, + "grad_norm": 2.160147190093994, + "learning_rate": 0.00017087563989590386, + "loss": 1.756, + "step": 827 + }, + { + "epoch": 0.47668393782383417, + "grad_norm": 3.555968999862671, + "learning_rate": 0.00017070709188783318, + "loss": 1.9666, + "step": 828 + }, + { + "epoch": 0.47725964306275187, + "grad_norm": 2.406215190887451, + "learning_rate": 0.00017053815231866088, + "loss": 2.0264, + "step": 829 + }, + { + "epoch": 0.47783534830166957, + "grad_norm": 3.1693837642669678, + "learning_rate": 0.00017036882227804826, + "loss": 1.6301, + "step": 830 + }, + { + "epoch": 0.4784110535405872, + "grad_norm": 2.4648032188415527, + "learning_rate": 0.00017019910285817505, + "loss": 2.1071, + "step": 831 + }, + { + "epoch": 0.4789867587795049, + "grad_norm": 2.020519733428955, + "learning_rate": 0.00017002899515373252, + "loss": 2.0168, + "step": 832 + }, + { + "epoch": 0.47956246401842256, + "grad_norm": 8.258744239807129, + "learning_rate": 0.00016985850026191634, + "loss": 1.738, + "step": 833 + }, + { + "epoch": 0.48013816925734026, + "grad_norm": 2.6535940170288086, + "learning_rate": 0.0001696876192824196, + "loss": 2.0153, + "step": 834 + }, + { + "epoch": 0.4807138744962579, + "grad_norm": 2.494950294494629, + "learning_rate": 0.00016951635331742564, + "loss": 1.8319, + "step": 835 + }, + { + "epoch": 0.4812895797351756, + "grad_norm": 13.977831840515137, + "learning_rate": 0.0001693447034716009, + "loss": 1.6622, + "step": 836 + }, + { + "epoch": 0.48186528497409326, + "grad_norm": 5.264831066131592, + "learning_rate": 0.00016917267085208798, + "loss": 1.7066, + "step": 837 + }, + { + "epoch": 0.48244099021301096, + "grad_norm": 3.8573200702667236, + "learning_rate": 0.0001690002565684982, + "loss": 1.7634, + "step": 838 + }, + { + "epoch": 0.4830166954519286, + "grad_norm": 4.011209487915039, + "learning_rate": 0.0001688274617329048, + "loss": 1.6426, + "step": 839 + }, + { + "epoch": 0.4835924006908463, + "grad_norm": 4.699135780334473, + "learning_rate": 0.00016865428745983538, + "loss": 1.9267, + "step": 840 + }, + { + "epoch": 0.48416810592976395, + "grad_norm": 4.957156181335449, + "learning_rate": 0.0001684807348662651, + "loss": 1.8474, + "step": 841 + }, + { + "epoch": 0.48474381116868165, + "grad_norm": 2.934382677078247, + "learning_rate": 0.00016830680507160924, + "loss": 1.7862, + "step": 842 + }, + { + "epoch": 0.4853195164075993, + "grad_norm": 2.630108594894409, + "learning_rate": 0.00016813249919771592, + "loss": 1.8025, + "step": 843 + }, + { + "epoch": 0.485895221646517, + "grad_norm": 3.7840278148651123, + "learning_rate": 0.00016795781836885913, + "loss": 1.5822, + "step": 844 + }, + { + "epoch": 0.48647092688543464, + "grad_norm": 4.31719446182251, + "learning_rate": 0.00016778276371173123, + "loss": 1.4343, + "step": 845 + }, + { + "epoch": 0.48704663212435234, + "grad_norm": 11.9985933303833, + "learning_rate": 0.00016760733635543578, + "loss": 1.8163, + "step": 846 + }, + { + "epoch": 0.48762233736327, + "grad_norm": 3.754255533218384, + "learning_rate": 0.00016743153743148024, + "loss": 1.9249, + "step": 847 + }, + { + "epoch": 0.4881980426021877, + "grad_norm": 2.8391778469085693, + "learning_rate": 0.00016725536807376873, + "loss": 1.6164, + "step": 848 + }, + { + "epoch": 0.48877374784110533, + "grad_norm": 2.3037109375, + "learning_rate": 0.0001670788294185947, + "loss": 1.7085, + "step": 849 + }, + { + "epoch": 0.48934945308002303, + "grad_norm": 30.62044334411621, + "learning_rate": 0.00016690192260463346, + "loss": 1.8193, + "step": 850 + }, + { + "epoch": 0.4899251583189407, + "grad_norm": 9.023576736450195, + "learning_rate": 0.00016672464877293504, + "loss": 1.7972, + "step": 851 + }, + { + "epoch": 0.4905008635578584, + "grad_norm": 4.591246128082275, + "learning_rate": 0.00016654700906691664, + "loss": 1.5173, + "step": 852 + }, + { + "epoch": 0.491076568796776, + "grad_norm": 4.623873710632324, + "learning_rate": 0.00016636900463235549, + "loss": 1.6689, + "step": 853 + }, + { + "epoch": 0.4916522740356937, + "grad_norm": 2.144906759262085, + "learning_rate": 0.00016619063661738124, + "loss": 2.0085, + "step": 854 + }, + { + "epoch": 0.49222797927461137, + "grad_norm": 2.73002290725708, + "learning_rate": 0.00016601190617246858, + "loss": 1.5989, + "step": 855 + }, + { + "epoch": 0.49280368451352907, + "grad_norm": 9.77154541015625, + "learning_rate": 0.00016583281445042998, + "loss": 1.9491, + "step": 856 + }, + { + "epoch": 0.4933793897524468, + "grad_norm": 5.815699577331543, + "learning_rate": 0.00016565336260640812, + "loss": 2.0127, + "step": 857 + }, + { + "epoch": 0.4939550949913644, + "grad_norm": 3.791182041168213, + "learning_rate": 0.00016547355179786838, + "loss": 1.865, + "step": 858 + }, + { + "epoch": 0.4945308002302821, + "grad_norm": 3.301182508468628, + "learning_rate": 0.00016529338318459165, + "loss": 1.7362, + "step": 859 + }, + { + "epoch": 0.49510650546919976, + "grad_norm": 3.233245849609375, + "learning_rate": 0.00016511285792866648, + "loss": 1.5621, + "step": 860 + }, + { + "epoch": 0.49568221070811747, + "grad_norm": 4.960054397583008, + "learning_rate": 0.00016493197719448182, + "loss": 1.739, + "step": 861 + }, + { + "epoch": 0.4962579159470351, + "grad_norm": 1.8711457252502441, + "learning_rate": 0.00016475074214871953, + "loss": 1.8269, + "step": 862 + }, + { + "epoch": 0.4968336211859528, + "grad_norm": 2.714691638946533, + "learning_rate": 0.00016456915396034666, + "loss": 2.1049, + "step": 863 + }, + { + "epoch": 0.49740932642487046, + "grad_norm": 4.092164993286133, + "learning_rate": 0.0001643872138006082, + "loss": 1.8893, + "step": 864 + }, + { + "epoch": 0.49798503166378816, + "grad_norm": 3.0131890773773193, + "learning_rate": 0.00016420492284301917, + "loss": 1.9821, + "step": 865 + }, + { + "epoch": 0.4985607369027058, + "grad_norm": 10.343781471252441, + "learning_rate": 0.00016402228226335735, + "loss": 2.1446, + "step": 866 + }, + { + "epoch": 0.4991364421416235, + "grad_norm": 4.245277404785156, + "learning_rate": 0.00016383929323965555, + "loss": 2.072, + "step": 867 + }, + { + "epoch": 0.49971214738054115, + "grad_norm": 11.761308670043945, + "learning_rate": 0.0001636559569521941, + "loss": 1.3602, + "step": 868 + }, + { + "epoch": 0.5002878526194589, + "grad_norm": 9.922039985656738, + "learning_rate": 0.00016347227458349302, + "loss": 1.7432, + "step": 869 + }, + { + "epoch": 0.5002878526194589, + "eval_loss": 1.1380085945129395, + "eval_runtime": 1021.271, + "eval_samples_per_second": 2.51, + "eval_steps_per_second": 2.51, + "step": 869 + }, + { + "epoch": 0.5008635578583766, + "grad_norm": 16.10755157470703, + "learning_rate": 0.00016328824731830482, + "loss": 1.9049, + "step": 870 + }, + { + "epoch": 0.5014392630972941, + "grad_norm": 4.447638511657715, + "learning_rate": 0.00016310387634360638, + "loss": 1.7126, + "step": 871 + }, + { + "epoch": 0.5020149683362118, + "grad_norm": 17.62114715576172, + "learning_rate": 0.00016291916284859155, + "loss": 1.5879, + "step": 872 + }, + { + "epoch": 0.5025906735751295, + "grad_norm": 3.3077991008758545, + "learning_rate": 0.00016273410802466353, + "loss": 1.5913, + "step": 873 + }, + { + "epoch": 0.5031663788140472, + "grad_norm": 3.9533162117004395, + "learning_rate": 0.00016254871306542695, + "loss": 1.5365, + "step": 874 + }, + { + "epoch": 0.5037420840529648, + "grad_norm": 5.701480865478516, + "learning_rate": 0.00016236297916668045, + "loss": 1.5129, + "step": 875 + }, + { + "epoch": 0.5043177892918825, + "grad_norm": 12.35987377166748, + "learning_rate": 0.0001621769075264088, + "loss": 1.9525, + "step": 876 + }, + { + "epoch": 0.5048934945308002, + "grad_norm": 2.3410115242004395, + "learning_rate": 0.0001619904993447751, + "loss": 1.9946, + "step": 877 + }, + { + "epoch": 0.5054691997697179, + "grad_norm": 3.4730076789855957, + "learning_rate": 0.00016180375582411328, + "loss": 1.2603, + "step": 878 + }, + { + "epoch": 0.5060449050086355, + "grad_norm": 10.503929138183594, + "learning_rate": 0.00016161667816892012, + "loss": 1.7077, + "step": 879 + }, + { + "epoch": 0.5066206102475532, + "grad_norm": 7.053747177124023, + "learning_rate": 0.00016142926758584767, + "loss": 1.5856, + "step": 880 + }, + { + "epoch": 0.5071963154864709, + "grad_norm": 4.179106712341309, + "learning_rate": 0.00016124152528369519, + "loss": 1.6933, + "step": 881 + }, + { + "epoch": 0.5077720207253886, + "grad_norm": 2.949862241744995, + "learning_rate": 0.00016105345247340171, + "loss": 2.1307, + "step": 882 + }, + { + "epoch": 0.5083477259643063, + "grad_norm": 2.3588945865631104, + "learning_rate": 0.000160865050368038, + "loss": 2.2399, + "step": 883 + }, + { + "epoch": 0.5089234312032239, + "grad_norm": 3.3859925270080566, + "learning_rate": 0.00016067632018279865, + "loss": 1.9435, + "step": 884 + }, + { + "epoch": 0.5094991364421416, + "grad_norm": 3.3930764198303223, + "learning_rate": 0.00016048726313499457, + "loss": 1.9658, + "step": 885 + }, + { + "epoch": 0.5100748416810593, + "grad_norm": 3.163694381713867, + "learning_rate": 0.00016029788044404477, + "loss": 2.0123, + "step": 886 + }, + { + "epoch": 0.510650546919977, + "grad_norm": 2.4561283588409424, + "learning_rate": 0.00016010817333146876, + "loss": 1.8384, + "step": 887 + }, + { + "epoch": 0.5112262521588946, + "grad_norm": 3.2786359786987305, + "learning_rate": 0.00015991814302087853, + "loss": 1.54, + "step": 888 + }, + { + "epoch": 0.5118019573978123, + "grad_norm": 9.279755592346191, + "learning_rate": 0.0001597277907379707, + "loss": 1.3626, + "step": 889 + }, + { + "epoch": 0.51237766263673, + "grad_norm": 1.9229642152786255, + "learning_rate": 0.0001595371177105186, + "loss": 2.2197, + "step": 890 + }, + { + "epoch": 0.5129533678756477, + "grad_norm": 2.7186543941497803, + "learning_rate": 0.00015934612516836446, + "loss": 1.9476, + "step": 891 + }, + { + "epoch": 0.5135290731145653, + "grad_norm": 2.1631014347076416, + "learning_rate": 0.00015915481434341123, + "loss": 1.7848, + "step": 892 + }, + { + "epoch": 0.514104778353483, + "grad_norm": 11.677098274230957, + "learning_rate": 0.0001589631864696149, + "loss": 1.8612, + "step": 893 + }, + { + "epoch": 0.5146804835924007, + "grad_norm": 3.0613834857940674, + "learning_rate": 0.00015877124278297636, + "loss": 1.6289, + "step": 894 + }, + { + "epoch": 0.5152561888313184, + "grad_norm": 2.7426950931549072, + "learning_rate": 0.00015857898452153354, + "loss": 2.2257, + "step": 895 + }, + { + "epoch": 0.515831894070236, + "grad_norm": 3.3551688194274902, + "learning_rate": 0.00015838641292535339, + "loss": 1.9517, + "step": 896 + }, + { + "epoch": 0.5164075993091537, + "grad_norm": 9.800053596496582, + "learning_rate": 0.0001581935292365238, + "loss": 1.6841, + "step": 897 + }, + { + "epoch": 0.5169833045480714, + "grad_norm": 2.620741844177246, + "learning_rate": 0.00015800033469914572, + "loss": 1.9933, + "step": 898 + }, + { + "epoch": 0.5175590097869891, + "grad_norm": 8.128425598144531, + "learning_rate": 0.00015780683055932504, + "loss": 1.9019, + "step": 899 + }, + { + "epoch": 0.5181347150259067, + "grad_norm": 2.946974754333496, + "learning_rate": 0.00015761301806516468, + "loss": 1.7235, + "step": 900 + }, + { + "epoch": 0.5187104202648244, + "grad_norm": 10.84506607055664, + "learning_rate": 0.00015741889846675625, + "loss": 1.6609, + "step": 901 + }, + { + "epoch": 0.5192861255037421, + "grad_norm": 4.477931499481201, + "learning_rate": 0.00015722447301617237, + "loss": 2.0412, + "step": 902 + }, + { + "epoch": 0.5198618307426598, + "grad_norm": 13.774998664855957, + "learning_rate": 0.00015702974296745843, + "loss": 1.5644, + "step": 903 + }, + { + "epoch": 0.5204375359815774, + "grad_norm": 4.497225284576416, + "learning_rate": 0.00015683470957662425, + "loss": 1.7644, + "step": 904 + }, + { + "epoch": 0.5210132412204951, + "grad_norm": 3.543210983276367, + "learning_rate": 0.00015663937410163644, + "loss": 1.6813, + "step": 905 + }, + { + "epoch": 0.5215889464594128, + "grad_norm": 4.446226119995117, + "learning_rate": 0.00015644373780240994, + "loss": 1.5355, + "step": 906 + }, + { + "epoch": 0.5221646516983305, + "grad_norm": 2.871023416519165, + "learning_rate": 0.00015624780194080004, + "loss": 1.8574, + "step": 907 + }, + { + "epoch": 0.5227403569372481, + "grad_norm": 3.543043851852417, + "learning_rate": 0.00015605156778059426, + "loss": 1.6743, + "step": 908 + }, + { + "epoch": 0.5233160621761658, + "grad_norm": 3.302360773086548, + "learning_rate": 0.00015585503658750399, + "loss": 2.004, + "step": 909 + }, + { + "epoch": 0.5238917674150835, + "grad_norm": 3.226323127746582, + "learning_rate": 0.00015565820962915668, + "loss": 1.8727, + "step": 910 + }, + { + "epoch": 0.5244674726540012, + "grad_norm": 2.56103777885437, + "learning_rate": 0.0001554610881750873, + "loss": 1.5572, + "step": 911 + }, + { + "epoch": 0.5250431778929189, + "grad_norm": 2.682222366333008, + "learning_rate": 0.00015526367349673044, + "loss": 1.8991, + "step": 912 + }, + { + "epoch": 0.5256188831318365, + "grad_norm": 2.8308794498443604, + "learning_rate": 0.00015506596686741192, + "loss": 1.895, + "step": 913 + }, + { + "epoch": 0.5261945883707542, + "grad_norm": 2.992013931274414, + "learning_rate": 0.0001548679695623407, + "loss": 1.7397, + "step": 914 + }, + { + "epoch": 0.5267702936096719, + "grad_norm": 3.145866870880127, + "learning_rate": 0.00015466968285860055, + "loss": 1.7754, + "step": 915 + }, + { + "epoch": 0.5273459988485896, + "grad_norm": 3.8366172313690186, + "learning_rate": 0.00015447110803514186, + "loss": 1.4537, + "step": 916 + }, + { + "epoch": 0.5279217040875072, + "grad_norm": 3.056429624557495, + "learning_rate": 0.00015427224637277348, + "loss": 1.9531, + "step": 917 + }, + { + "epoch": 0.5284974093264249, + "grad_norm": 23.298696517944336, + "learning_rate": 0.00015407309915415425, + "loss": 1.6287, + "step": 918 + }, + { + "epoch": 0.5290731145653426, + "grad_norm": 2.360717535018921, + "learning_rate": 0.0001538736676637849, + "loss": 2.492, + "step": 919 + }, + { + "epoch": 0.5296488198042603, + "grad_norm": 3.8945038318634033, + "learning_rate": 0.00015367395318799973, + "loss": 1.8311, + "step": 920 + }, + { + "epoch": 0.5302245250431779, + "grad_norm": 2.1640625, + "learning_rate": 0.00015347395701495833, + "loss": 1.9314, + "step": 921 + }, + { + "epoch": 0.5308002302820956, + "grad_norm": 2.0766513347625732, + "learning_rate": 0.00015327368043463718, + "loss": 2.1496, + "step": 922 + }, + { + "epoch": 0.5313759355210133, + "grad_norm": 2.49560809135437, + "learning_rate": 0.00015307312473882137, + "loss": 1.6021, + "step": 923 + }, + { + "epoch": 0.531951640759931, + "grad_norm": 3.279759645462036, + "learning_rate": 0.00015287229122109633, + "loss": 1.7603, + "step": 924 + }, + { + "epoch": 0.5325273459988485, + "grad_norm": 2.0442922115325928, + "learning_rate": 0.0001526711811768395, + "loss": 2.0437, + "step": 925 + }, + { + "epoch": 0.5331030512377662, + "grad_norm": 2.3960659503936768, + "learning_rate": 0.0001524697959032118, + "loss": 1.4901, + "step": 926 + }, + { + "epoch": 0.533678756476684, + "grad_norm": 2.597766160964966, + "learning_rate": 0.00015226813669914948, + "loss": 1.6113, + "step": 927 + }, + { + "epoch": 0.5342544617156016, + "grad_norm": 12.007303237915039, + "learning_rate": 0.00015206620486535552, + "loss": 1.6338, + "step": 928 + }, + { + "epoch": 0.5348301669545192, + "grad_norm": 2.297700881958008, + "learning_rate": 0.0001518640017042915, + "loss": 1.5866, + "step": 929 + }, + { + "epoch": 0.5354058721934369, + "grad_norm": 4.744719505310059, + "learning_rate": 0.00015166152852016902, + "loss": 2.1694, + "step": 930 + }, + { + "epoch": 0.5359815774323546, + "grad_norm": 2.273824453353882, + "learning_rate": 0.00015145878661894125, + "loss": 1.9095, + "step": 931 + }, + { + "epoch": 0.5365572826712723, + "grad_norm": 3.70763897895813, + "learning_rate": 0.00015125577730829473, + "loss": 1.7984, + "step": 932 + }, + { + "epoch": 0.5371329879101899, + "grad_norm": 4.088851451873779, + "learning_rate": 0.00015105250189764063, + "loss": 1.8663, + "step": 933 + }, + { + "epoch": 0.5377086931491076, + "grad_norm": 1.9229241609573364, + "learning_rate": 0.0001508489616981066, + "loss": 2.0268, + "step": 934 + }, + { + "epoch": 0.5382843983880253, + "grad_norm": 6.149089336395264, + "learning_rate": 0.00015064515802252817, + "loss": 1.5163, + "step": 935 + }, + { + "epoch": 0.538860103626943, + "grad_norm": 3.5841498374938965, + "learning_rate": 0.00015044109218544015, + "loss": 1.6879, + "step": 936 + }, + { + "epoch": 0.5394358088658607, + "grad_norm": 5.613948345184326, + "learning_rate": 0.00015023676550306848, + "loss": 1.4269, + "step": 937 + }, + { + "epoch": 0.5400115141047783, + "grad_norm": 5.511398792266846, + "learning_rate": 0.00015003217929332143, + "loss": 1.5413, + "step": 938 + }, + { + "epoch": 0.540587219343696, + "grad_norm": 5.14990758895874, + "learning_rate": 0.00014982733487578127, + "loss": 1.6226, + "step": 939 + }, + { + "epoch": 0.5411629245826137, + "grad_norm": 2.902458667755127, + "learning_rate": 0.0001496222335716957, + "loss": 1.6545, + "step": 940 + }, + { + "epoch": 0.5417386298215314, + "grad_norm": 4.643445014953613, + "learning_rate": 0.00014941687670396938, + "loss": 2.0121, + "step": 941 + }, + { + "epoch": 0.542314335060449, + "grad_norm": 3.1277265548706055, + "learning_rate": 0.00014921126559715528, + "loss": 1.6763, + "step": 942 + }, + { + "epoch": 0.5428900402993667, + "grad_norm": 1.9732697010040283, + "learning_rate": 0.00014900540157744625, + "loss": 1.8311, + "step": 943 + }, + { + "epoch": 0.5434657455382844, + "grad_norm": 7.364120006561279, + "learning_rate": 0.00014879928597266644, + "loss": 1.3101, + "step": 944 + }, + { + "epoch": 0.5440414507772021, + "grad_norm": 4.716955661773682, + "learning_rate": 0.0001485929201122628, + "loss": 1.8418, + "step": 945 + }, + { + "epoch": 0.5446171560161197, + "grad_norm": 2.276949167251587, + "learning_rate": 0.0001483863053272962, + "loss": 1.8196, + "step": 946 + }, + { + "epoch": 0.5451928612550374, + "grad_norm": 2.367461681365967, + "learning_rate": 0.00014817944295043332, + "loss": 1.952, + "step": 947 + }, + { + "epoch": 0.5457685664939551, + "grad_norm": 8.040431022644043, + "learning_rate": 0.0001479723343159377, + "loss": 1.9765, + "step": 948 + }, + { + "epoch": 0.5463442717328728, + "grad_norm": 1.7760546207427979, + "learning_rate": 0.0001477649807596613, + "loss": 1.8163, + "step": 949 + }, + { + "epoch": 0.5469199769717904, + "grad_norm": 6.472499370574951, + "learning_rate": 0.00014755738361903566, + "loss": 1.9802, + "step": 950 + }, + { + "epoch": 0.5474956822107081, + "grad_norm": 16.233253479003906, + "learning_rate": 0.00014734954423306371, + "loss": 1.8819, + "step": 951 + }, + { + "epoch": 0.5480713874496258, + "grad_norm": 2.759955644607544, + "learning_rate": 0.00014714146394231061, + "loss": 1.849, + "step": 952 + }, + { + "epoch": 0.5486470926885435, + "grad_norm": 2.3619799613952637, + "learning_rate": 0.00014693314408889554, + "loss": 1.9287, + "step": 953 + }, + { + "epoch": 0.5492227979274611, + "grad_norm": 7.854180335998535, + "learning_rate": 0.00014672458601648272, + "loss": 1.6121, + "step": 954 + }, + { + "epoch": 0.5497985031663788, + "grad_norm": 3.9259274005889893, + "learning_rate": 0.000146515791070273, + "loss": 1.4874, + "step": 955 + }, + { + "epoch": 0.5503742084052965, + "grad_norm": 3.472259521484375, + "learning_rate": 0.000146306760596995, + "loss": 1.7002, + "step": 956 + }, + { + "epoch": 0.5509499136442142, + "grad_norm": 9.12783145904541, + "learning_rate": 0.0001460974959448965, + "loss": 1.668, + "step": 957 + }, + { + "epoch": 0.5515256188831318, + "grad_norm": 3.1406407356262207, + "learning_rate": 0.00014588799846373574, + "loss": 1.6456, + "step": 958 + }, + { + "epoch": 0.5521013241220495, + "grad_norm": 2.9766335487365723, + "learning_rate": 0.00014567826950477277, + "loss": 1.5559, + "step": 959 + }, + { + "epoch": 0.5526770293609672, + "grad_norm": 3.6111319065093994, + "learning_rate": 0.00014546831042076052, + "loss": 1.7752, + "step": 960 + }, + { + "epoch": 0.5532527345998849, + "grad_norm": 4.001034259796143, + "learning_rate": 0.00014525812256593637, + "loss": 1.9761, + "step": 961 + }, + { + "epoch": 0.5538284398388025, + "grad_norm": 3.3448774814605713, + "learning_rate": 0.00014504770729601327, + "loss": 1.7559, + "step": 962 + }, + { + "epoch": 0.5544041450777202, + "grad_norm": 4.9256157875061035, + "learning_rate": 0.0001448370659681709, + "loss": 1.3841, + "step": 963 + }, + { + "epoch": 0.5549798503166379, + "grad_norm": 2.9494550228118896, + "learning_rate": 0.00014462619994104706, + "loss": 2.1265, + "step": 964 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 4.05312967300415, + "learning_rate": 0.00014441511057472893, + "loss": 1.3376, + "step": 965 + }, + { + "epoch": 0.5561312607944733, + "grad_norm": 4.632201194763184, + "learning_rate": 0.0001442037992307441, + "loss": 1.7425, + "step": 966 + }, + { + "epoch": 0.5567069660333909, + "grad_norm": 2.694762706756592, + "learning_rate": 0.00014399226727205205, + "loss": 2.0141, + "step": 967 + }, + { + "epoch": 0.5572826712723086, + "grad_norm": 2.672029495239258, + "learning_rate": 0.00014378051606303512, + "loss": 1.7991, + "step": 968 + }, + { + "epoch": 0.5578583765112263, + "grad_norm": 4.377752304077148, + "learning_rate": 0.00014356854696948986, + "loss": 1.5728, + "step": 969 + }, + { + "epoch": 0.558434081750144, + "grad_norm": 2.543248414993286, + "learning_rate": 0.00014335636135861824, + "loss": 2.063, + "step": 970 + }, + { + "epoch": 0.5590097869890616, + "grad_norm": 2.466642141342163, + "learning_rate": 0.00014314396059901863, + "loss": 1.7082, + "step": 971 + }, + { + "epoch": 0.5595854922279793, + "grad_norm": 4.953126430511475, + "learning_rate": 0.00014293134606067722, + "loss": 1.5022, + "step": 972 + }, + { + "epoch": 0.560161197466897, + "grad_norm": 5.41407585144043, + "learning_rate": 0.000142718519114959, + "loss": 1.7407, + "step": 973 + }, + { + "epoch": 0.5607369027058147, + "grad_norm": 2.768073797225952, + "learning_rate": 0.00014250548113459909, + "loss": 1.8574, + "step": 974 + }, + { + "epoch": 0.5613126079447323, + "grad_norm": 2.6404266357421875, + "learning_rate": 0.00014229223349369373, + "loss": 1.5253, + "step": 975 + }, + { + "epoch": 0.56188831318365, + "grad_norm": 2.4457454681396484, + "learning_rate": 0.00014207877756769138, + "loss": 1.9335, + "step": 976 + }, + { + "epoch": 0.5624640184225677, + "grad_norm": 12.25162410736084, + "learning_rate": 0.0001418651147333841, + "loss": 1.5656, + "step": 977 + }, + { + "epoch": 0.5630397236614854, + "grad_norm": 2.4453790187835693, + "learning_rate": 0.00014165124636889836, + "loss": 1.6927, + "step": 978 + }, + { + "epoch": 0.563615428900403, + "grad_norm": 2.6654469966888428, + "learning_rate": 0.0001414371738536865, + "loss": 1.5248, + "step": 979 + }, + { + "epoch": 0.5641911341393206, + "grad_norm": 2.3234498500823975, + "learning_rate": 0.00014122289856851735, + "loss": 1.7526, + "step": 980 + }, + { + "epoch": 0.5647668393782384, + "grad_norm": 2.1236610412597656, + "learning_rate": 0.0001410084218954679, + "loss": 1.81, + "step": 981 + }, + { + "epoch": 0.565342544617156, + "grad_norm": 2.213833808898926, + "learning_rate": 0.00014079374521791389, + "loss": 1.825, + "step": 982 + }, + { + "epoch": 0.5659182498560736, + "grad_norm": 5.118985176086426, + "learning_rate": 0.00014057886992052115, + "loss": 1.7381, + "step": 983 + }, + { + "epoch": 0.5664939550949913, + "grad_norm": 26.596086502075195, + "learning_rate": 0.00014036379738923668, + "loss": 1.4509, + "step": 984 + }, + { + "epoch": 0.567069660333909, + "grad_norm": 2.0581257343292236, + "learning_rate": 0.00014014852901127954, + "loss": 1.8644, + "step": 985 + }, + { + "epoch": 0.5676453655728267, + "grad_norm": 2.2645015716552734, + "learning_rate": 0.00013993306617513204, + "loss": 1.7144, + "step": 986 + }, + { + "epoch": 0.5682210708117443, + "grad_norm": 2.477935791015625, + "learning_rate": 0.00013971741027053071, + "loss": 1.6473, + "step": 987 + }, + { + "epoch": 0.568796776050662, + "grad_norm": 3.0278351306915283, + "learning_rate": 0.00013950156268845748, + "loss": 1.9054, + "step": 988 + }, + { + "epoch": 0.5693724812895797, + "grad_norm": 3.1719486713409424, + "learning_rate": 0.00013928552482113054, + "loss": 1.8626, + "step": 989 + }, + { + "epoch": 0.5699481865284974, + "grad_norm": 2.6236214637756348, + "learning_rate": 0.0001390692980619953, + "loss": 1.7584, + "step": 990 + }, + { + "epoch": 0.5705238917674151, + "grad_norm": 2.0949866771698, + "learning_rate": 0.00013885288380571575, + "loss": 1.8523, + "step": 991 + }, + { + "epoch": 0.5710995970063327, + "grad_norm": 3.192641496658325, + "learning_rate": 0.00013863628344816506, + "loss": 1.8358, + "step": 992 + }, + { + "epoch": 0.5716753022452504, + "grad_norm": 2.1230075359344482, + "learning_rate": 0.00013841949838641683, + "loss": 1.6974, + "step": 993 + }, + { + "epoch": 0.5722510074841681, + "grad_norm": 2.634091854095459, + "learning_rate": 0.00013820253001873602, + "loss": 1.8269, + "step": 994 + }, + { + "epoch": 0.5728267127230858, + "grad_norm": 2.884781837463379, + "learning_rate": 0.00013798537974456983, + "loss": 1.3469, + "step": 995 + }, + { + "epoch": 0.5734024179620034, + "grad_norm": 2.4384100437164307, + "learning_rate": 0.0001377680489645389, + "loss": 1.5845, + "step": 996 + }, + { + "epoch": 0.5739781232009211, + "grad_norm": 2.9231410026550293, + "learning_rate": 0.00013755053908042793, + "loss": 1.7073, + "step": 997 + }, + { + "epoch": 0.5745538284398388, + "grad_norm": 3.0704684257507324, + "learning_rate": 0.000137332851495177, + "loss": 1.4623, + "step": 998 + }, + { + "epoch": 0.5751295336787565, + "grad_norm": 2.402618885040283, + "learning_rate": 0.0001371149876128724, + "loss": 2.1455, + "step": 999 + }, + { + "epoch": 0.5757052389176741, + "grad_norm": 2.723708152770996, + "learning_rate": 0.00013689694883873733, + "loss": 1.5748, + "step": 1000 + }, + { + "epoch": 0.5762809441565918, + "grad_norm": 1.9962085485458374, + "learning_rate": 0.00013667873657912332, + "loss": 1.9438, + "step": 1001 + }, + { + "epoch": 0.5768566493955095, + "grad_norm": 5.691616058349609, + "learning_rate": 0.0001364603522415006, + "loss": 2.3176, + "step": 1002 + }, + { + "epoch": 0.5774323546344272, + "grad_norm": 2.1746699810028076, + "learning_rate": 0.00013624179723444952, + "loss": 1.8675, + "step": 1003 + }, + { + "epoch": 0.5780080598733448, + "grad_norm": 4.523983478546143, + "learning_rate": 0.00013602307296765108, + "loss": 1.5397, + "step": 1004 + }, + { + "epoch": 0.5785837651122625, + "grad_norm": 29.249481201171875, + "learning_rate": 0.0001358041808518782, + "loss": 1.7436, + "step": 1005 + }, + { + "epoch": 0.5791594703511802, + "grad_norm": 2.0091192722320557, + "learning_rate": 0.00013558512229898628, + "loss": 1.9335, + "step": 1006 + }, + { + "epoch": 0.5797351755900979, + "grad_norm": 2.7708356380462646, + "learning_rate": 0.00013536589872190425, + "loss": 1.7912, + "step": 1007 + }, + { + "epoch": 0.5803108808290155, + "grad_norm": 2.417196273803711, + "learning_rate": 0.00013514651153462555, + "loss": 1.8221, + "step": 1008 + }, + { + "epoch": 0.5808865860679332, + "grad_norm": 2.3912861347198486, + "learning_rate": 0.00013492696215219874, + "loss": 1.526, + "step": 1009 + }, + { + "epoch": 0.5814622913068509, + "grad_norm": 1.8041644096374512, + "learning_rate": 0.00013470725199071868, + "loss": 1.5973, + "step": 1010 + }, + { + "epoch": 0.5820379965457686, + "grad_norm": 3.6920905113220215, + "learning_rate": 0.00013448738246731723, + "loss": 1.8089, + "step": 1011 + }, + { + "epoch": 0.5826137017846862, + "grad_norm": 1.7696315050125122, + "learning_rate": 0.00013426735500015412, + "loss": 1.8036, + "step": 1012 + }, + { + "epoch": 0.5831894070236039, + "grad_norm": 2.644516706466675, + "learning_rate": 0.00013404717100840775, + "loss": 1.7614, + "step": 1013 + }, + { + "epoch": 0.5837651122625216, + "grad_norm": 6.610838413238525, + "learning_rate": 0.00013382683191226626, + "loss": 1.8272, + "step": 1014 + }, + { + "epoch": 0.5843408175014393, + "grad_norm": 25.5416202545166, + "learning_rate": 0.00013360633913291805, + "loss": 1.5991, + "step": 1015 + }, + { + "epoch": 0.584916522740357, + "grad_norm": 6.063877105712891, + "learning_rate": 0.00013338569409254285, + "loss": 1.9859, + "step": 1016 + }, + { + "epoch": 0.5854922279792746, + "grad_norm": 10.058717727661133, + "learning_rate": 0.00013316489821430257, + "loss": 1.5081, + "step": 1017 + }, + { + "epoch": 0.5860679332181923, + "grad_norm": 3.7417914867401123, + "learning_rate": 0.00013294395292233179, + "loss": 1.4591, + "step": 1018 + }, + { + "epoch": 0.58664363845711, + "grad_norm": 2.1001293659210205, + "learning_rate": 0.00013272285964172905, + "loss": 1.4471, + "step": 1019 + }, + { + "epoch": 0.5872193436960277, + "grad_norm": 4.2659454345703125, + "learning_rate": 0.00013250161979854727, + "loss": 1.6232, + "step": 1020 + }, + { + "epoch": 0.5877950489349453, + "grad_norm": 3.3600850105285645, + "learning_rate": 0.00013228023481978477, + "loss": 2.0188, + "step": 1021 + }, + { + "epoch": 0.588370754173863, + "grad_norm": 4.816986560821533, + "learning_rate": 0.00013205870613337598, + "loss": 1.883, + "step": 1022 + }, + { + "epoch": 0.5889464594127807, + "grad_norm": 7.538907527923584, + "learning_rate": 0.00013183703516818221, + "loss": 1.6426, + "step": 1023 + }, + { + "epoch": 0.5895221646516984, + "grad_norm": 3.869892120361328, + "learning_rate": 0.00013161522335398252, + "loss": 1.5805, + "step": 1024 + }, + { + "epoch": 0.590097869890616, + "grad_norm": 2.9215962886810303, + "learning_rate": 0.00013139327212146438, + "loss": 2.0736, + "step": 1025 + }, + { + "epoch": 0.5906735751295337, + "grad_norm": 3.044528007507324, + "learning_rate": 0.0001311711829022146, + "loss": 1.773, + "step": 1026 + }, + { + "epoch": 0.5912492803684514, + "grad_norm": 2.4565680027008057, + "learning_rate": 0.00013094895712870993, + "loss": 1.8526, + "step": 1027 + }, + { + "epoch": 0.5918249856073691, + "grad_norm": 2.8523366451263428, + "learning_rate": 0.00013072659623430797, + "loss": 1.6659, + "step": 1028 + }, + { + "epoch": 0.5924006908462867, + "grad_norm": 3.7679712772369385, + "learning_rate": 0.0001305041016532377, + "loss": 1.6689, + "step": 1029 + }, + { + "epoch": 0.5929763960852044, + "grad_norm": 2.0870449542999268, + "learning_rate": 0.0001302814748205906, + "loss": 1.955, + "step": 1030 + }, + { + "epoch": 0.5935521013241221, + "grad_norm": 4.419449329376221, + "learning_rate": 0.000130058717172311, + "loss": 2.0321, + "step": 1031 + }, + { + "epoch": 0.5941278065630398, + "grad_norm": 3.5084879398345947, + "learning_rate": 0.00012983583014518704, + "loss": 1.8397, + "step": 1032 + }, + { + "epoch": 0.5947035118019574, + "grad_norm": 5.1162943840026855, + "learning_rate": 0.00012961281517684137, + "loss": 1.3311, + "step": 1033 + }, + { + "epoch": 0.595279217040875, + "grad_norm": 3.4573514461517334, + "learning_rate": 0.00012938967370572187, + "loss": 1.7966, + "step": 1034 + }, + { + "epoch": 0.5958549222797928, + "grad_norm": 3.372185468673706, + "learning_rate": 0.00012916640717109234, + "loss": 1.8347, + "step": 1035 + }, + { + "epoch": 0.5964306275187105, + "grad_norm": 5.885191440582275, + "learning_rate": 0.00012894301701302325, + "loss": 1.5451, + "step": 1036 + }, + { + "epoch": 0.597006332757628, + "grad_norm": 3.0024893283843994, + "learning_rate": 0.00012871950467238243, + "loss": 1.614, + "step": 1037 + }, + { + "epoch": 0.5975820379965457, + "grad_norm": 3.6248486042022705, + "learning_rate": 0.0001284958715908258, + "loss": 1.9764, + "step": 1038 + }, + { + "epoch": 0.5981577432354634, + "grad_norm": 2.173245668411255, + "learning_rate": 0.00012827211921078807, + "loss": 2.005, + "step": 1039 + }, + { + "epoch": 0.5987334484743811, + "grad_norm": 3.430807590484619, + "learning_rate": 0.00012804824897547342, + "loss": 1.4051, + "step": 1040 + }, + { + "epoch": 0.5993091537132987, + "grad_norm": 2.929710865020752, + "learning_rate": 0.00012782426232884616, + "loss": 1.6919, + "step": 1041 + }, + { + "epoch": 0.5998848589522164, + "grad_norm": 10.41037654876709, + "learning_rate": 0.00012760016071562154, + "loss": 1.6745, + "step": 1042 + }, + { + "epoch": 0.6004605641911341, + "grad_norm": 2.4413540363311768, + "learning_rate": 0.00012737594558125622, + "loss": 1.7962, + "step": 1043 + }, + { + "epoch": 0.6010362694300518, + "grad_norm": 1.9357279539108276, + "learning_rate": 0.00012715161837193917, + "loss": 1.7911, + "step": 1044 + } + ], + "logging_steps": 1, + "max_steps": 1737, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 348, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2067894585976685e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}