{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4006908462867012, "eval_steps": 869, "global_step": 696, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005757052389176742, "grad_norm": 2.557003974914551, "learning_rate": 0.0, "loss": 5.4277, "step": 1 }, { "epoch": 0.0005757052389176742, "eval_loss": 5.319709300994873, "eval_runtime": 1026.7022, "eval_samples_per_second": 2.496, "eval_steps_per_second": 2.496, "step": 1 }, { "epoch": 0.0011514104778353484, "grad_norm": 2.985229969024658, "learning_rate": 4.0000000000000003e-07, "loss": 5.7019, "step": 2 }, { "epoch": 0.0017271157167530224, "grad_norm": 3.0353081226348877, "learning_rate": 8.000000000000001e-07, "loss": 6.1934, "step": 3 }, { "epoch": 0.002302820955670697, "grad_norm": 3.724905490875244, "learning_rate": 1.2000000000000002e-06, "loss": 5.4617, "step": 4 }, { "epoch": 0.0028785261945883708, "grad_norm": 2.6505627632141113, "learning_rate": 1.6000000000000001e-06, "loss": 5.4285, "step": 5 }, { "epoch": 0.0034542314335060447, "grad_norm": 2.7363409996032715, "learning_rate": 2.0000000000000003e-06, "loss": 5.8634, "step": 6 }, { "epoch": 0.004029936672423719, "grad_norm": 3.082538366317749, "learning_rate": 2.4000000000000003e-06, "loss": 4.7461, "step": 7 }, { "epoch": 0.004605641911341394, "grad_norm": 9.095250129699707, "learning_rate": 2.8000000000000003e-06, "loss": 7.5703, "step": 8 }, { "epoch": 0.0051813471502590676, "grad_norm": 2.2597923278808594, "learning_rate": 3.2000000000000003e-06, "loss": 5.3631, "step": 9 }, { "epoch": 0.0057570523891767415, "grad_norm": 5.053525924682617, "learning_rate": 3.6e-06, "loss": 6.0132, "step": 10 }, { "epoch": 0.0063327576280944155, "grad_norm": 2.7407820224761963, "learning_rate": 4.000000000000001e-06, "loss": 5.9776, "step": 11 }, { "epoch": 0.0069084628670120895, "grad_norm": 2.4892263412475586, "learning_rate": 4.4e-06, "loss": 5.524, "step": 12 }, { "epoch": 0.007484168105929764, "grad_norm": 2.5302274227142334, "learning_rate": 4.800000000000001e-06, "loss": 5.8044, "step": 13 }, { "epoch": 0.008059873344847437, "grad_norm": 2.992504358291626, "learning_rate": 5.2e-06, "loss": 6.0307, "step": 14 }, { "epoch": 0.008635578583765112, "grad_norm": 4.081608295440674, "learning_rate": 5.600000000000001e-06, "loss": 4.6732, "step": 15 }, { "epoch": 0.009211283822682787, "grad_norm": 2.33296799659729, "learning_rate": 6e-06, "loss": 4.6356, "step": 16 }, { "epoch": 0.00978698906160046, "grad_norm": 2.798452854156494, "learning_rate": 6.4000000000000006e-06, "loss": 5.2941, "step": 17 }, { "epoch": 0.010362694300518135, "grad_norm": 2.290029525756836, "learning_rate": 6.800000000000001e-06, "loss": 4.9405, "step": 18 }, { "epoch": 0.010938399539435808, "grad_norm": 3.2164740562438965, "learning_rate": 7.2e-06, "loss": 5.6711, "step": 19 }, { "epoch": 0.011514104778353483, "grad_norm": 2.4481987953186035, "learning_rate": 7.6e-06, "loss": 5.0366, "step": 20 }, { "epoch": 0.012089810017271158, "grad_norm": 3.398063898086548, "learning_rate": 8.000000000000001e-06, "loss": 5.9377, "step": 21 }, { "epoch": 0.012665515256188831, "grad_norm": 2.3936686515808105, "learning_rate": 8.400000000000001e-06, "loss": 5.4237, "step": 22 }, { "epoch": 0.013241220495106506, "grad_norm": 2.7233810424804688, "learning_rate": 8.8e-06, "loss": 5.6551, "step": 23 }, { "epoch": 0.013816925734024179, "grad_norm": 2.9957566261291504, "learning_rate": 9.2e-06, "loss": 4.7701, "step": 24 }, { "epoch": 0.014392630972941854, "grad_norm": 6.397132396697998, "learning_rate": 9.600000000000001e-06, "loss": 6.4459, "step": 25 }, { "epoch": 0.014968336211859529, "grad_norm": 3.0593409538269043, "learning_rate": 1e-05, "loss": 5.2758, "step": 26 }, { "epoch": 0.015544041450777202, "grad_norm": 2.9723803997039795, "learning_rate": 1.04e-05, "loss": 5.6136, "step": 27 }, { "epoch": 0.016119746689694875, "grad_norm": 2.03314471244812, "learning_rate": 1.08e-05, "loss": 5.3556, "step": 28 }, { "epoch": 0.01669545192861255, "grad_norm": 1.777107834815979, "learning_rate": 1.1200000000000001e-05, "loss": 5.1061, "step": 29 }, { "epoch": 0.017271157167530225, "grad_norm": 3.2192044258117676, "learning_rate": 1.16e-05, "loss": 5.2414, "step": 30 }, { "epoch": 0.017846862406447898, "grad_norm": 3.924452066421509, "learning_rate": 1.2e-05, "loss": 5.2754, "step": 31 }, { "epoch": 0.018422567645365574, "grad_norm": 3.5611093044281006, "learning_rate": 1.24e-05, "loss": 5.2817, "step": 32 }, { "epoch": 0.018998272884283247, "grad_norm": 2.5194263458251953, "learning_rate": 1.2800000000000001e-05, "loss": 5.9063, "step": 33 }, { "epoch": 0.01957397812320092, "grad_norm": 2.403895854949951, "learning_rate": 1.32e-05, "loss": 5.1161, "step": 34 }, { "epoch": 0.020149683362118594, "grad_norm": 2.496400833129883, "learning_rate": 1.3600000000000002e-05, "loss": 5.3049, "step": 35 }, { "epoch": 0.02072538860103627, "grad_norm": 3.0970828533172607, "learning_rate": 1.4000000000000001e-05, "loss": 5.5807, "step": 36 }, { "epoch": 0.021301093839953943, "grad_norm": 3.941403388977051, "learning_rate": 1.44e-05, "loss": 6.0418, "step": 37 }, { "epoch": 0.021876799078871616, "grad_norm": 2.291431188583374, "learning_rate": 1.48e-05, "loss": 4.3686, "step": 38 }, { "epoch": 0.022452504317789293, "grad_norm": 2.783054828643799, "learning_rate": 1.52e-05, "loss": 5.15, "step": 39 }, { "epoch": 0.023028209556706966, "grad_norm": 3.579267978668213, "learning_rate": 1.56e-05, "loss": 5.7507, "step": 40 }, { "epoch": 0.02360391479562464, "grad_norm": 3.5277323722839355, "learning_rate": 1.6000000000000003e-05, "loss": 6.112, "step": 41 }, { "epoch": 0.024179620034542316, "grad_norm": 2.5100817680358887, "learning_rate": 1.6400000000000002e-05, "loss": 5.2133, "step": 42 }, { "epoch": 0.02475532527345999, "grad_norm": 2.3821561336517334, "learning_rate": 1.6800000000000002e-05, "loss": 6.0345, "step": 43 }, { "epoch": 0.025331030512377662, "grad_norm": 3.0675108432769775, "learning_rate": 1.7199999999999998e-05, "loss": 5.2294, "step": 44 }, { "epoch": 0.025906735751295335, "grad_norm": 2.8790383338928223, "learning_rate": 1.76e-05, "loss": 5.6393, "step": 45 }, { "epoch": 0.02648244099021301, "grad_norm": 3.3649141788482666, "learning_rate": 1.8e-05, "loss": 6.014, "step": 46 }, { "epoch": 0.027058146229130685, "grad_norm": 3.4695286750793457, "learning_rate": 1.84e-05, "loss": 5.3457, "step": 47 }, { "epoch": 0.027633851468048358, "grad_norm": 3.303622245788574, "learning_rate": 1.88e-05, "loss": 5.593, "step": 48 }, { "epoch": 0.028209556706966035, "grad_norm": 2.481895923614502, "learning_rate": 1.9200000000000003e-05, "loss": 5.1439, "step": 49 }, { "epoch": 0.028785261945883708, "grad_norm": 2.888579845428467, "learning_rate": 1.9600000000000002e-05, "loss": 4.6318, "step": 50 }, { "epoch": 0.02936096718480138, "grad_norm": 3.4528300762176514, "learning_rate": 2e-05, "loss": 5.0376, "step": 51 }, { "epoch": 0.029936672423719057, "grad_norm": 3.6751370429992676, "learning_rate": 2.04e-05, "loss": 4.9183, "step": 52 }, { "epoch": 0.03051237766263673, "grad_norm": 3.382035970687866, "learning_rate": 2.08e-05, "loss": 5.499, "step": 53 }, { "epoch": 0.031088082901554404, "grad_norm": 2.8802406787872314, "learning_rate": 2.12e-05, "loss": 5.3177, "step": 54 }, { "epoch": 0.03166378814047208, "grad_norm": 6.158539772033691, "learning_rate": 2.16e-05, "loss": 6.2133, "step": 55 }, { "epoch": 0.03223949337938975, "grad_norm": 2.599864959716797, "learning_rate": 2.2000000000000003e-05, "loss": 5.3691, "step": 56 }, { "epoch": 0.03281519861830743, "grad_norm": 3.4526188373565674, "learning_rate": 2.2400000000000002e-05, "loss": 5.3801, "step": 57 }, { "epoch": 0.0333909038572251, "grad_norm": 9.494807243347168, "learning_rate": 2.2800000000000002e-05, "loss": 7.3116, "step": 58 }, { "epoch": 0.033966609096142776, "grad_norm": 4.3456130027771, "learning_rate": 2.32e-05, "loss": 4.7467, "step": 59 }, { "epoch": 0.03454231433506045, "grad_norm": 3.8471431732177734, "learning_rate": 2.36e-05, "loss": 5.2742, "step": 60 }, { "epoch": 0.03511801957397812, "grad_norm": 3.985994815826416, "learning_rate": 2.4e-05, "loss": 5.4615, "step": 61 }, { "epoch": 0.035693724812895795, "grad_norm": 9.588626861572266, "learning_rate": 2.44e-05, "loss": 6.8261, "step": 62 }, { "epoch": 0.03626943005181347, "grad_norm": 5.3343915939331055, "learning_rate": 2.48e-05, "loss": 6.0899, "step": 63 }, { "epoch": 0.03684513529073115, "grad_norm": 5.611617088317871, "learning_rate": 2.5200000000000003e-05, "loss": 6.4523, "step": 64 }, { "epoch": 0.03742084052964882, "grad_norm": 4.497012615203857, "learning_rate": 2.5600000000000002e-05, "loss": 4.787, "step": 65 }, { "epoch": 0.037996545768566495, "grad_norm": 5.032821178436279, "learning_rate": 2.6000000000000002e-05, "loss": 5.6337, "step": 66 }, { "epoch": 0.03857225100748417, "grad_norm": 3.732733726501465, "learning_rate": 2.64e-05, "loss": 5.5212, "step": 67 }, { "epoch": 0.03914795624640184, "grad_norm": 4.3597517013549805, "learning_rate": 2.6800000000000004e-05, "loss": 4.647, "step": 68 }, { "epoch": 0.039723661485319514, "grad_norm": 5.359225273132324, "learning_rate": 2.7200000000000004e-05, "loss": 5.7052, "step": 69 }, { "epoch": 0.04029936672423719, "grad_norm": 4.9161601066589355, "learning_rate": 2.7600000000000003e-05, "loss": 5.3191, "step": 70 }, { "epoch": 0.04087507196315487, "grad_norm": 4.137385368347168, "learning_rate": 2.8000000000000003e-05, "loss": 5.1797, "step": 71 }, { "epoch": 0.04145077720207254, "grad_norm": 4.728359699249268, "learning_rate": 2.84e-05, "loss": 5.1125, "step": 72 }, { "epoch": 0.042026482440990214, "grad_norm": 4.568793773651123, "learning_rate": 2.88e-05, "loss": 5.7705, "step": 73 }, { "epoch": 0.04260218767990789, "grad_norm": 4.931026935577393, "learning_rate": 2.9199999999999998e-05, "loss": 5.1052, "step": 74 }, { "epoch": 0.04317789291882556, "grad_norm": 4.697461128234863, "learning_rate": 2.96e-05, "loss": 5.1404, "step": 75 }, { "epoch": 0.04375359815774323, "grad_norm": 6.393320083618164, "learning_rate": 3e-05, "loss": 6.2212, "step": 76 }, { "epoch": 0.04432930339666091, "grad_norm": 5.876922607421875, "learning_rate": 3.04e-05, "loss": 5.7775, "step": 77 }, { "epoch": 0.044905008635578586, "grad_norm": 4.749701499938965, "learning_rate": 3.08e-05, "loss": 4.7321, "step": 78 }, { "epoch": 0.04548071387449626, "grad_norm": 4.894115447998047, "learning_rate": 3.12e-05, "loss": 5.2017, "step": 79 }, { "epoch": 0.04605641911341393, "grad_norm": 5.125804424285889, "learning_rate": 3.16e-05, "loss": 5.1661, "step": 80 }, { "epoch": 0.046632124352331605, "grad_norm": 7.571075439453125, "learning_rate": 3.2000000000000005e-05, "loss": 6.1439, "step": 81 }, { "epoch": 0.04720782959124928, "grad_norm": 4.469061374664307, "learning_rate": 3.24e-05, "loss": 5.1732, "step": 82 }, { "epoch": 0.04778353483016695, "grad_norm": 4.565371513366699, "learning_rate": 3.2800000000000004e-05, "loss": 5.4892, "step": 83 }, { "epoch": 0.04835924006908463, "grad_norm": 5.844489097595215, "learning_rate": 3.32e-05, "loss": 5.875, "step": 84 }, { "epoch": 0.048934945308002305, "grad_norm": 10.564720153808594, "learning_rate": 3.3600000000000004e-05, "loss": 5.9008, "step": 85 }, { "epoch": 0.04951065054691998, "grad_norm": 6.923472881317139, "learning_rate": 3.4000000000000007e-05, "loss": 5.4949, "step": 86 }, { "epoch": 0.05008635578583765, "grad_norm": 6.902386665344238, "learning_rate": 3.4399999999999996e-05, "loss": 4.9801, "step": 87 }, { "epoch": 0.050662061024755324, "grad_norm": 8.239148139953613, "learning_rate": 3.48e-05, "loss": 5.6578, "step": 88 }, { "epoch": 0.051237766263673, "grad_norm": 6.162630081176758, "learning_rate": 3.52e-05, "loss": 4.9911, "step": 89 }, { "epoch": 0.05181347150259067, "grad_norm": 7.2612433433532715, "learning_rate": 3.56e-05, "loss": 5.7976, "step": 90 }, { "epoch": 0.05238917674150835, "grad_norm": 6.149419784545898, "learning_rate": 3.6e-05, "loss": 4.9756, "step": 91 }, { "epoch": 0.05296488198042602, "grad_norm": 7.4116106033325195, "learning_rate": 3.6400000000000004e-05, "loss": 5.5805, "step": 92 }, { "epoch": 0.0535405872193437, "grad_norm": 5.512300491333008, "learning_rate": 3.68e-05, "loss": 4.5575, "step": 93 }, { "epoch": 0.05411629245826137, "grad_norm": 14.799551963806152, "learning_rate": 3.72e-05, "loss": 5.2244, "step": 94 }, { "epoch": 0.05469199769717904, "grad_norm": 9.756938934326172, "learning_rate": 3.76e-05, "loss": 4.8444, "step": 95 }, { "epoch": 0.055267702936096716, "grad_norm": 6.400147914886475, "learning_rate": 3.8e-05, "loss": 5.5091, "step": 96 }, { "epoch": 0.055843408175014396, "grad_norm": 8.406181335449219, "learning_rate": 3.8400000000000005e-05, "loss": 5.2641, "step": 97 }, { "epoch": 0.05641911341393207, "grad_norm": 6.860042572021484, "learning_rate": 3.88e-05, "loss": 5.2917, "step": 98 }, { "epoch": 0.05699481865284974, "grad_norm": 7.542653560638428, "learning_rate": 3.9200000000000004e-05, "loss": 5.1584, "step": 99 }, { "epoch": 0.057570523891767415, "grad_norm": 8.149137496948242, "learning_rate": 3.960000000000001e-05, "loss": 5.5326, "step": 100 }, { "epoch": 0.05814622913068509, "grad_norm": 5.590121269226074, "learning_rate": 4e-05, "loss": 5.2789, "step": 101 }, { "epoch": 0.05872193436960276, "grad_norm": 7.877676010131836, "learning_rate": 4.0400000000000006e-05, "loss": 4.8526, "step": 102 }, { "epoch": 0.059297639608520435, "grad_norm": 5.773808479309082, "learning_rate": 4.08e-05, "loss": 5.033, "step": 103 }, { "epoch": 0.059873344847438115, "grad_norm": 6.092824935913086, "learning_rate": 4.12e-05, "loss": 4.8936, "step": 104 }, { "epoch": 0.06044905008635579, "grad_norm": 5.934675693511963, "learning_rate": 4.16e-05, "loss": 4.4764, "step": 105 }, { "epoch": 0.06102475532527346, "grad_norm": 5.622652530670166, "learning_rate": 4.2e-05, "loss": 5.1344, "step": 106 }, { "epoch": 0.061600460564191134, "grad_norm": 7.697418212890625, "learning_rate": 4.24e-05, "loss": 5.2087, "step": 107 }, { "epoch": 0.06217616580310881, "grad_norm": 5.204082489013672, "learning_rate": 4.2800000000000004e-05, "loss": 4.6294, "step": 108 }, { "epoch": 0.06275187104202648, "grad_norm": 6.288537979125977, "learning_rate": 4.32e-05, "loss": 5.3009, "step": 109 }, { "epoch": 0.06332757628094415, "grad_norm": 6.717288017272949, "learning_rate": 4.36e-05, "loss": 5.5392, "step": 110 }, { "epoch": 0.06390328151986183, "grad_norm": 5.432399272918701, "learning_rate": 4.4000000000000006e-05, "loss": 4.3602, "step": 111 }, { "epoch": 0.0644789867587795, "grad_norm": 6.823062896728516, "learning_rate": 4.44e-05, "loss": 5.7343, "step": 112 }, { "epoch": 0.06505469199769717, "grad_norm": 6.532074928283691, "learning_rate": 4.4800000000000005e-05, "loss": 5.0605, "step": 113 }, { "epoch": 0.06563039723661486, "grad_norm": 5.982126712799072, "learning_rate": 4.52e-05, "loss": 5.2182, "step": 114 }, { "epoch": 0.06620610247553253, "grad_norm": 5.759943962097168, "learning_rate": 4.5600000000000004e-05, "loss": 4.9098, "step": 115 }, { "epoch": 0.0667818077144502, "grad_norm": 5.147834300994873, "learning_rate": 4.600000000000001e-05, "loss": 4.8671, "step": 116 }, { "epoch": 0.06735751295336788, "grad_norm": 8.015042304992676, "learning_rate": 4.64e-05, "loss": 5.7445, "step": 117 }, { "epoch": 0.06793321819228555, "grad_norm": 7.161843299865723, "learning_rate": 4.6800000000000006e-05, "loss": 5.9092, "step": 118 }, { "epoch": 0.06850892343120323, "grad_norm": 9.394163131713867, "learning_rate": 4.72e-05, "loss": 4.7243, "step": 119 }, { "epoch": 0.0690846286701209, "grad_norm": 4.96219539642334, "learning_rate": 4.76e-05, "loss": 4.7233, "step": 120 }, { "epoch": 0.06966033390903857, "grad_norm": 6.473387241363525, "learning_rate": 4.8e-05, "loss": 5.1295, "step": 121 }, { "epoch": 0.07023603914795624, "grad_norm": 6.797422885894775, "learning_rate": 4.8400000000000004e-05, "loss": 4.7697, "step": 122 }, { "epoch": 0.07081174438687392, "grad_norm": 6.656020641326904, "learning_rate": 4.88e-05, "loss": 5.2377, "step": 123 }, { "epoch": 0.07138744962579159, "grad_norm": 5.552718639373779, "learning_rate": 4.92e-05, "loss": 4.4741, "step": 124 }, { "epoch": 0.07196315486470926, "grad_norm": 6.101820468902588, "learning_rate": 4.96e-05, "loss": 4.4192, "step": 125 }, { "epoch": 0.07253886010362694, "grad_norm": 7.695935249328613, "learning_rate": 5e-05, "loss": 5.4128, "step": 126 }, { "epoch": 0.07311456534254462, "grad_norm": 6.9946208000183105, "learning_rate": 5.0400000000000005e-05, "loss": 5.4829, "step": 127 }, { "epoch": 0.0736902705814623, "grad_norm": 16.10480308532715, "learning_rate": 5.08e-05, "loss": 4.6945, "step": 128 }, { "epoch": 0.07426597582037997, "grad_norm": 5.313148021697998, "learning_rate": 5.1200000000000004e-05, "loss": 4.2429, "step": 129 }, { "epoch": 0.07484168105929764, "grad_norm": 5.506260871887207, "learning_rate": 5.16e-05, "loss": 4.7241, "step": 130 }, { "epoch": 0.07541738629821532, "grad_norm": 5.655925273895264, "learning_rate": 5.2000000000000004e-05, "loss": 5.4156, "step": 131 }, { "epoch": 0.07599309153713299, "grad_norm": 6.528857231140137, "learning_rate": 5.2400000000000007e-05, "loss": 5.3606, "step": 132 }, { "epoch": 0.07656879677605066, "grad_norm": 5.360299110412598, "learning_rate": 5.28e-05, "loss": 5.0686, "step": 133 }, { "epoch": 0.07714450201496834, "grad_norm": 5.301785945892334, "learning_rate": 5.3200000000000006e-05, "loss": 4.845, "step": 134 }, { "epoch": 0.07772020725388601, "grad_norm": 4.986385345458984, "learning_rate": 5.360000000000001e-05, "loss": 5.1493, "step": 135 }, { "epoch": 0.07829591249280368, "grad_norm": 5.200460433959961, "learning_rate": 5.4000000000000005e-05, "loss": 4.781, "step": 136 }, { "epoch": 0.07887161773172136, "grad_norm": 7.154032230377197, "learning_rate": 5.440000000000001e-05, "loss": 5.8801, "step": 137 }, { "epoch": 0.07944732297063903, "grad_norm": 4.641168117523193, "learning_rate": 5.4800000000000004e-05, "loss": 5.1929, "step": 138 }, { "epoch": 0.0800230282095567, "grad_norm": 4.8809123039245605, "learning_rate": 5.520000000000001e-05, "loss": 5.0221, "step": 139 }, { "epoch": 0.08059873344847437, "grad_norm": 5.0507402420043945, "learning_rate": 5.560000000000001e-05, "loss": 4.8543, "step": 140 }, { "epoch": 0.08117443868739206, "grad_norm": 6.459733963012695, "learning_rate": 5.6000000000000006e-05, "loss": 5.051, "step": 141 }, { "epoch": 0.08175014392630973, "grad_norm": 6.107847690582275, "learning_rate": 5.6399999999999995e-05, "loss": 4.8338, "step": 142 }, { "epoch": 0.08232584916522741, "grad_norm": 6.28361701965332, "learning_rate": 5.68e-05, "loss": 5.1373, "step": 143 }, { "epoch": 0.08290155440414508, "grad_norm": 4.957414627075195, "learning_rate": 5.72e-05, "loss": 4.8154, "step": 144 }, { "epoch": 0.08347725964306275, "grad_norm": 4.774332046508789, "learning_rate": 5.76e-05, "loss": 4.7262, "step": 145 }, { "epoch": 0.08405296488198043, "grad_norm": 7.41762113571167, "learning_rate": 5.8e-05, "loss": 5.5137, "step": 146 }, { "epoch": 0.0846286701208981, "grad_norm": 7.484424591064453, "learning_rate": 5.8399999999999997e-05, "loss": 5.766, "step": 147 }, { "epoch": 0.08520437535981577, "grad_norm": 4.917182922363281, "learning_rate": 5.88e-05, "loss": 5.0193, "step": 148 }, { "epoch": 0.08578008059873345, "grad_norm": 4.608645915985107, "learning_rate": 5.92e-05, "loss": 5.0873, "step": 149 }, { "epoch": 0.08635578583765112, "grad_norm": 6.5947794914245605, "learning_rate": 5.96e-05, "loss": 4.9855, "step": 150 }, { "epoch": 0.08693149107656879, "grad_norm": 3.8302507400512695, "learning_rate": 6e-05, "loss": 3.7953, "step": 151 }, { "epoch": 0.08750719631548647, "grad_norm": 3.6352171897888184, "learning_rate": 6.04e-05, "loss": 4.1647, "step": 152 }, { "epoch": 0.08808290155440414, "grad_norm": 4.818563461303711, "learning_rate": 6.08e-05, "loss": 4.2128, "step": 153 }, { "epoch": 0.08865860679332183, "grad_norm": 7.7323503494262695, "learning_rate": 6.12e-05, "loss": 5.4562, "step": 154 }, { "epoch": 0.0892343120322395, "grad_norm": 5.785284996032715, "learning_rate": 6.16e-05, "loss": 4.8956, "step": 155 }, { "epoch": 0.08981001727115717, "grad_norm": 6.181385040283203, "learning_rate": 6.2e-05, "loss": 5.2373, "step": 156 }, { "epoch": 0.09038572251007485, "grad_norm": 6.015028476715088, "learning_rate": 6.24e-05, "loss": 4.3663, "step": 157 }, { "epoch": 0.09096142774899252, "grad_norm": 4.41657829284668, "learning_rate": 6.280000000000001e-05, "loss": 4.5991, "step": 158 }, { "epoch": 0.09153713298791019, "grad_norm": 6.5107622146606445, "learning_rate": 6.32e-05, "loss": 4.8784, "step": 159 }, { "epoch": 0.09211283822682786, "grad_norm": 4.11070442199707, "learning_rate": 6.36e-05, "loss": 4.6766, "step": 160 }, { "epoch": 0.09268854346574554, "grad_norm": 8.204343795776367, "learning_rate": 6.400000000000001e-05, "loss": 5.5088, "step": 161 }, { "epoch": 0.09326424870466321, "grad_norm": 3.9389288425445557, "learning_rate": 6.440000000000001e-05, "loss": 4.3476, "step": 162 }, { "epoch": 0.09383995394358088, "grad_norm": 5.597643852233887, "learning_rate": 6.48e-05, "loss": 4.9976, "step": 163 }, { "epoch": 0.09441565918249856, "grad_norm": 8.994287490844727, "learning_rate": 6.52e-05, "loss": 5.5959, "step": 164 }, { "epoch": 0.09499136442141623, "grad_norm": 5.60779333114624, "learning_rate": 6.560000000000001e-05, "loss": 4.6283, "step": 165 }, { "epoch": 0.0955670696603339, "grad_norm": 4.319982528686523, "learning_rate": 6.6e-05, "loss": 4.041, "step": 166 }, { "epoch": 0.09614277489925158, "grad_norm": 5.684337615966797, "learning_rate": 6.64e-05, "loss": 4.8941, "step": 167 }, { "epoch": 0.09671848013816926, "grad_norm": 3.872518539428711, "learning_rate": 6.680000000000001e-05, "loss": 4.2242, "step": 168 }, { "epoch": 0.09729418537708694, "grad_norm": 4.826557636260986, "learning_rate": 6.720000000000001e-05, "loss": 4.8546, "step": 169 }, { "epoch": 0.09786989061600461, "grad_norm": 4.660156726837158, "learning_rate": 6.76e-05, "loss": 4.3797, "step": 170 }, { "epoch": 0.09844559585492228, "grad_norm": 4.616059303283691, "learning_rate": 6.800000000000001e-05, "loss": 4.7293, "step": 171 }, { "epoch": 0.09902130109383996, "grad_norm": 7.685507774353027, "learning_rate": 6.840000000000001e-05, "loss": 5.6251, "step": 172 }, { "epoch": 0.09959700633275763, "grad_norm": 7.424576282501221, "learning_rate": 6.879999999999999e-05, "loss": 4.8253, "step": 173 }, { "epoch": 0.1001727115716753, "grad_norm": 4.379521369934082, "learning_rate": 6.92e-05, "loss": 4.5287, "step": 174 }, { "epoch": 0.10074841681059298, "grad_norm": 4.753964424133301, "learning_rate": 6.96e-05, "loss": 4.5554, "step": 175 }, { "epoch": 0.10132412204951065, "grad_norm": 4.559609413146973, "learning_rate": 7e-05, "loss": 4.5615, "step": 176 }, { "epoch": 0.10189982728842832, "grad_norm": 5.178406238555908, "learning_rate": 7.04e-05, "loss": 4.6344, "step": 177 }, { "epoch": 0.102475532527346, "grad_norm": 7.4183526039123535, "learning_rate": 7.08e-05, "loss": 4.5451, "step": 178 }, { "epoch": 0.10305123776626367, "grad_norm": 5.832037448883057, "learning_rate": 7.12e-05, "loss": 4.7097, "step": 179 }, { "epoch": 0.10362694300518134, "grad_norm": 4.9681925773620605, "learning_rate": 7.16e-05, "loss": 4.6288, "step": 180 }, { "epoch": 0.10420264824409903, "grad_norm": 4.886664867401123, "learning_rate": 7.2e-05, "loss": 4.7019, "step": 181 }, { "epoch": 0.1047783534830167, "grad_norm": 4.668741226196289, "learning_rate": 7.24e-05, "loss": 4.4534, "step": 182 }, { "epoch": 0.10535405872193437, "grad_norm": 7.459389686584473, "learning_rate": 7.280000000000001e-05, "loss": 5.4758, "step": 183 }, { "epoch": 0.10592976396085205, "grad_norm": 31.545869827270508, "learning_rate": 7.32e-05, "loss": 6.179, "step": 184 }, { "epoch": 0.10650546919976972, "grad_norm": 9.739182472229004, "learning_rate": 7.36e-05, "loss": 4.9662, "step": 185 }, { "epoch": 0.1070811744386874, "grad_norm": 4.12076997756958, "learning_rate": 7.4e-05, "loss": 3.88, "step": 186 }, { "epoch": 0.10765687967760507, "grad_norm": 5.808717727661133, "learning_rate": 7.44e-05, "loss": 4.6157, "step": 187 }, { "epoch": 0.10823258491652274, "grad_norm": 3.6208741664886475, "learning_rate": 7.48e-05, "loss": 3.9156, "step": 188 }, { "epoch": 0.10880829015544041, "grad_norm": 4.674955368041992, "learning_rate": 7.52e-05, "loss": 4.4751, "step": 189 }, { "epoch": 0.10938399539435809, "grad_norm": 5.331599235534668, "learning_rate": 7.560000000000001e-05, "loss": 4.3887, "step": 190 }, { "epoch": 0.10995970063327576, "grad_norm": 5.1405534744262695, "learning_rate": 7.6e-05, "loss": 4.9114, "step": 191 }, { "epoch": 0.11053540587219343, "grad_norm": 3.7066593170166016, "learning_rate": 7.64e-05, "loss": 3.8948, "step": 192 }, { "epoch": 0.1111111111111111, "grad_norm": 5.185431003570557, "learning_rate": 7.680000000000001e-05, "loss": 4.232, "step": 193 }, { "epoch": 0.11168681635002879, "grad_norm": 4.900607585906982, "learning_rate": 7.72e-05, "loss": 4.667, "step": 194 }, { "epoch": 0.11226252158894647, "grad_norm": 5.091091632843018, "learning_rate": 7.76e-05, "loss": 4.3946, "step": 195 }, { "epoch": 0.11283822682786414, "grad_norm": 4.859619617462158, "learning_rate": 7.800000000000001e-05, "loss": 4.6306, "step": 196 }, { "epoch": 0.11341393206678181, "grad_norm": 3.544200897216797, "learning_rate": 7.840000000000001e-05, "loss": 4.2118, "step": 197 }, { "epoch": 0.11398963730569948, "grad_norm": 8.28862190246582, "learning_rate": 7.88e-05, "loss": 4.4431, "step": 198 }, { "epoch": 0.11456534254461716, "grad_norm": 6.373688220977783, "learning_rate": 7.920000000000001e-05, "loss": 4.7554, "step": 199 }, { "epoch": 0.11514104778353483, "grad_norm": 6.8544392585754395, "learning_rate": 7.960000000000001e-05, "loss": 4.8723, "step": 200 }, { "epoch": 0.1157167530224525, "grad_norm": 7.207869052886963, "learning_rate": 8e-05, "loss": 4.1096, "step": 201 }, { "epoch": 0.11629245826137018, "grad_norm": 4.9073333740234375, "learning_rate": 8.04e-05, "loss": 3.6834, "step": 202 }, { "epoch": 0.11686816350028785, "grad_norm": 6.523554801940918, "learning_rate": 8.080000000000001e-05, "loss": 4.4934, "step": 203 }, { "epoch": 0.11744386873920552, "grad_norm": 9.581537246704102, "learning_rate": 8.120000000000001e-05, "loss": 4.8199, "step": 204 }, { "epoch": 0.1180195739781232, "grad_norm": 5.319664001464844, "learning_rate": 8.16e-05, "loss": 4.0881, "step": 205 }, { "epoch": 0.11859527921704087, "grad_norm": 7.609442710876465, "learning_rate": 8.2e-05, "loss": 5.1011, "step": 206 }, { "epoch": 0.11917098445595854, "grad_norm": 5.437283515930176, "learning_rate": 8.24e-05, "loss": 4.7683, "step": 207 }, { "epoch": 0.11974668969487623, "grad_norm": 9.015962600708008, "learning_rate": 8.28e-05, "loss": 5.1197, "step": 208 }, { "epoch": 0.1203223949337939, "grad_norm": 5.41486120223999, "learning_rate": 8.32e-05, "loss": 4.2228, "step": 209 }, { "epoch": 0.12089810017271158, "grad_norm": 4.068630218505859, "learning_rate": 8.36e-05, "loss": 3.9683, "step": 210 }, { "epoch": 0.12147380541162925, "grad_norm": 4.818974494934082, "learning_rate": 8.4e-05, "loss": 4.3969, "step": 211 }, { "epoch": 0.12204951065054692, "grad_norm": 8.309637069702148, "learning_rate": 8.44e-05, "loss": 4.8983, "step": 212 }, { "epoch": 0.1226252158894646, "grad_norm": 5.997379302978516, "learning_rate": 8.48e-05, "loss": 4.6983, "step": 213 }, { "epoch": 0.12320092112838227, "grad_norm": 6.416568279266357, "learning_rate": 8.52e-05, "loss": 4.6, "step": 214 }, { "epoch": 0.12377662636729994, "grad_norm": 5.038214206695557, "learning_rate": 8.560000000000001e-05, "loss": 4.1803, "step": 215 }, { "epoch": 0.12435233160621761, "grad_norm": 5.035988807678223, "learning_rate": 8.6e-05, "loss": 4.1585, "step": 216 }, { "epoch": 0.12492803684513529, "grad_norm": 6.7663726806640625, "learning_rate": 8.64e-05, "loss": 4.4256, "step": 217 }, { "epoch": 0.12550374208405296, "grad_norm": 5.394269943237305, "learning_rate": 8.680000000000001e-05, "loss": 3.9008, "step": 218 }, { "epoch": 0.12607944732297063, "grad_norm": 5.4501800537109375, "learning_rate": 8.72e-05, "loss": 3.9869, "step": 219 }, { "epoch": 0.1266551525618883, "grad_norm": 4.7380170822143555, "learning_rate": 8.76e-05, "loss": 4.0876, "step": 220 }, { "epoch": 0.12723085780080598, "grad_norm": 6.059116840362549, "learning_rate": 8.800000000000001e-05, "loss": 4.147, "step": 221 }, { "epoch": 0.12780656303972365, "grad_norm": 5.5021586418151855, "learning_rate": 8.840000000000001e-05, "loss": 4.4547, "step": 222 }, { "epoch": 0.12838226827864133, "grad_norm": 4.760106563568115, "learning_rate": 8.88e-05, "loss": 4.075, "step": 223 }, { "epoch": 0.128957973517559, "grad_norm": 7.5847649574279785, "learning_rate": 8.92e-05, "loss": 4.6163, "step": 224 }, { "epoch": 0.12953367875647667, "grad_norm": 6.257955074310303, "learning_rate": 8.960000000000001e-05, "loss": 4.6043, "step": 225 }, { "epoch": 0.13010938399539435, "grad_norm": 7.368046283721924, "learning_rate": 9e-05, "loss": 4.7961, "step": 226 }, { "epoch": 0.13068508923431202, "grad_norm": 4.385096549987793, "learning_rate": 9.04e-05, "loss": 4.1968, "step": 227 }, { "epoch": 0.13126079447322972, "grad_norm": 6.34293794631958, "learning_rate": 9.080000000000001e-05, "loss": 4.3076, "step": 228 }, { "epoch": 0.1318364997121474, "grad_norm": 6.403743267059326, "learning_rate": 9.120000000000001e-05, "loss": 3.8917, "step": 229 }, { "epoch": 0.13241220495106507, "grad_norm": 6.792156219482422, "learning_rate": 9.16e-05, "loss": 3.9843, "step": 230 }, { "epoch": 0.13298791018998274, "grad_norm": 8.062408447265625, "learning_rate": 9.200000000000001e-05, "loss": 4.2562, "step": 231 }, { "epoch": 0.1335636154289004, "grad_norm": 8.513936042785645, "learning_rate": 9.240000000000001e-05, "loss": 4.6536, "step": 232 }, { "epoch": 0.13413932066781808, "grad_norm": 5.92789363861084, "learning_rate": 9.28e-05, "loss": 4.104, "step": 233 }, { "epoch": 0.13471502590673576, "grad_norm": 44.009300231933594, "learning_rate": 9.320000000000002e-05, "loss": 4.8297, "step": 234 }, { "epoch": 0.13529073114565343, "grad_norm": 5.342921257019043, "learning_rate": 9.360000000000001e-05, "loss": 4.0662, "step": 235 }, { "epoch": 0.1358664363845711, "grad_norm": 5.618771076202393, "learning_rate": 9.4e-05, "loss": 4.1692, "step": 236 }, { "epoch": 0.13644214162348878, "grad_norm": 6.6655473709106445, "learning_rate": 9.44e-05, "loss": 4.2759, "step": 237 }, { "epoch": 0.13701784686240645, "grad_norm": 6.415508270263672, "learning_rate": 9.48e-05, "loss": 4.025, "step": 238 }, { "epoch": 0.13759355210132412, "grad_norm": 62.65280532836914, "learning_rate": 9.52e-05, "loss": 5.3187, "step": 239 }, { "epoch": 0.1381692573402418, "grad_norm": 5.9870147705078125, "learning_rate": 9.56e-05, "loss": 4.3549, "step": 240 }, { "epoch": 0.13874496257915947, "grad_norm": 6.323814868927002, "learning_rate": 9.6e-05, "loss": 4.0618, "step": 241 }, { "epoch": 0.13932066781807714, "grad_norm": 7.25873327255249, "learning_rate": 9.64e-05, "loss": 4.6113, "step": 242 }, { "epoch": 0.13989637305699482, "grad_norm": 6.708962440490723, "learning_rate": 9.680000000000001e-05, "loss": 4.2734, "step": 243 }, { "epoch": 0.1404720782959125, "grad_norm": 6.766256332397461, "learning_rate": 9.72e-05, "loss": 3.8169, "step": 244 }, { "epoch": 0.14104778353483016, "grad_norm": 9.25779914855957, "learning_rate": 9.76e-05, "loss": 4.0823, "step": 245 }, { "epoch": 0.14162348877374784, "grad_norm": 6.24402379989624, "learning_rate": 9.8e-05, "loss": 3.9761, "step": 246 }, { "epoch": 0.1421991940126655, "grad_norm": 4.627258777618408, "learning_rate": 9.84e-05, "loss": 3.3376, "step": 247 }, { "epoch": 0.14277489925158318, "grad_norm": 6.5364766120910645, "learning_rate": 9.88e-05, "loss": 3.9101, "step": 248 }, { "epoch": 0.14335060449050085, "grad_norm": 6.722381591796875, "learning_rate": 9.92e-05, "loss": 4.2916, "step": 249 }, { "epoch": 0.14392630972941853, "grad_norm": 7.2800493240356445, "learning_rate": 9.960000000000001e-05, "loss": 4.1714, "step": 250 }, { "epoch": 0.1445020149683362, "grad_norm": 9.137832641601562, "learning_rate": 0.0001, "loss": 3.9733, "step": 251 }, { "epoch": 0.14507772020725387, "grad_norm": 5.290084362030029, "learning_rate": 0.0001004, "loss": 3.8465, "step": 252 }, { "epoch": 0.14565342544617155, "grad_norm": 7.146475791931152, "learning_rate": 0.00010080000000000001, "loss": 4.154, "step": 253 }, { "epoch": 0.14622913068508925, "grad_norm": 5.462000370025635, "learning_rate": 0.00010120000000000001, "loss": 3.8403, "step": 254 }, { "epoch": 0.14680483592400692, "grad_norm": 8.053996086120605, "learning_rate": 0.0001016, "loss": 4.224, "step": 255 }, { "epoch": 0.1473805411629246, "grad_norm": 56.904518127441406, "learning_rate": 0.00010200000000000001, "loss": 5.3512, "step": 256 }, { "epoch": 0.14795624640184227, "grad_norm": 67.7396469116211, "learning_rate": 0.00010240000000000001, "loss": 4.136, "step": 257 }, { "epoch": 0.14853195164075994, "grad_norm": 5.19423770904541, "learning_rate": 0.0001028, "loss": 3.6272, "step": 258 }, { "epoch": 0.1491076568796776, "grad_norm": 6.946446418762207, "learning_rate": 0.0001032, "loss": 3.7617, "step": 259 }, { "epoch": 0.1496833621185953, "grad_norm": 6.839754104614258, "learning_rate": 0.00010360000000000001, "loss": 4.2895, "step": 260 }, { "epoch": 0.15025906735751296, "grad_norm": 7.3253254890441895, "learning_rate": 0.00010400000000000001, "loss": 4.0997, "step": 261 }, { "epoch": 0.15083477259643063, "grad_norm": 6.981521129608154, "learning_rate": 0.0001044, "loss": 3.4663, "step": 262 }, { "epoch": 0.1514104778353483, "grad_norm": 6.424066543579102, "learning_rate": 0.00010480000000000001, "loss": 4.0914, "step": 263 }, { "epoch": 0.15198618307426598, "grad_norm": 6.7790398597717285, "learning_rate": 0.00010520000000000001, "loss": 4.0818, "step": 264 }, { "epoch": 0.15256188831318365, "grad_norm": 7.887113094329834, "learning_rate": 0.0001056, "loss": 4.3784, "step": 265 }, { "epoch": 0.15313759355210133, "grad_norm": 8.3016939163208, "learning_rate": 0.00010600000000000002, "loss": 3.7843, "step": 266 }, { "epoch": 0.153713298791019, "grad_norm": 10.073237419128418, "learning_rate": 0.00010640000000000001, "loss": 4.0118, "step": 267 }, { "epoch": 0.15428900402993667, "grad_norm": 6.9664106369018555, "learning_rate": 0.00010680000000000001, "loss": 3.8644, "step": 268 }, { "epoch": 0.15486470926885434, "grad_norm": 8.479534149169922, "learning_rate": 0.00010720000000000002, "loss": 3.7009, "step": 269 }, { "epoch": 0.15544041450777202, "grad_norm": 8.317602157592773, "learning_rate": 0.00010760000000000001, "loss": 3.7018, "step": 270 }, { "epoch": 0.1560161197466897, "grad_norm": 6.020889759063721, "learning_rate": 0.00010800000000000001, "loss": 3.656, "step": 271 }, { "epoch": 0.15659182498560736, "grad_norm": 7.147673606872559, "learning_rate": 0.00010840000000000002, "loss": 3.9216, "step": 272 }, { "epoch": 0.15716753022452504, "grad_norm": 5.485556125640869, "learning_rate": 0.00010880000000000002, "loss": 3.4732, "step": 273 }, { "epoch": 0.1577432354634427, "grad_norm": 7.432086944580078, "learning_rate": 0.00010920000000000001, "loss": 3.423, "step": 274 }, { "epoch": 0.15831894070236038, "grad_norm": 6.897833824157715, "learning_rate": 0.00010960000000000001, "loss": 3.6169, "step": 275 }, { "epoch": 0.15889464594127806, "grad_norm": 7.707437992095947, "learning_rate": 0.00011000000000000002, "loss": 3.6883, "step": 276 }, { "epoch": 0.15947035118019573, "grad_norm": 5.546234607696533, "learning_rate": 0.00011040000000000001, "loss": 3.8388, "step": 277 }, { "epoch": 0.1600460564191134, "grad_norm": 10.001431465148926, "learning_rate": 0.00011080000000000001, "loss": 3.372, "step": 278 }, { "epoch": 0.16062176165803108, "grad_norm": 8.793180465698242, "learning_rate": 0.00011120000000000002, "loss": 3.7929, "step": 279 }, { "epoch": 0.16119746689694875, "grad_norm": 8.189177513122559, "learning_rate": 0.00011160000000000002, "loss": 4.0091, "step": 280 }, { "epoch": 0.16177317213586645, "grad_norm": 6.998697280883789, "learning_rate": 0.00011200000000000001, "loss": 3.648, "step": 281 }, { "epoch": 0.16234887737478412, "grad_norm": 8.115317344665527, "learning_rate": 0.00011240000000000002, "loss": 4.0327, "step": 282 }, { "epoch": 0.1629245826137018, "grad_norm": 7.597106456756592, "learning_rate": 0.00011279999999999999, "loss": 3.7811, "step": 283 }, { "epoch": 0.16350028785261947, "grad_norm": 6.518374443054199, "learning_rate": 0.0001132, "loss": 3.3359, "step": 284 }, { "epoch": 0.16407599309153714, "grad_norm": 6.962795257568359, "learning_rate": 0.0001136, "loss": 3.3726, "step": 285 }, { "epoch": 0.16465169833045482, "grad_norm": 8.1845703125, "learning_rate": 0.00011399999999999999, "loss": 4.0042, "step": 286 }, { "epoch": 0.1652274035693725, "grad_norm": 6.869271755218506, "learning_rate": 0.0001144, "loss": 3.4989, "step": 287 }, { "epoch": 0.16580310880829016, "grad_norm": 12.261098861694336, "learning_rate": 0.0001148, "loss": 4.1045, "step": 288 }, { "epoch": 0.16637881404720783, "grad_norm": 6.912962913513184, "learning_rate": 0.0001152, "loss": 3.6853, "step": 289 }, { "epoch": 0.1669545192861255, "grad_norm": 8.545379638671875, "learning_rate": 0.00011559999999999999, "loss": 3.8903, "step": 290 }, { "epoch": 0.16753022452504318, "grad_norm": 15.040228843688965, "learning_rate": 0.000116, "loss": 3.4079, "step": 291 }, { "epoch": 0.16810592976396085, "grad_norm": 7.038132667541504, "learning_rate": 0.0001164, "loss": 3.7119, "step": 292 }, { "epoch": 0.16868163500287853, "grad_norm": 6.259817123413086, "learning_rate": 0.00011679999999999999, "loss": 3.4931, "step": 293 }, { "epoch": 0.1692573402417962, "grad_norm": 6.947351455688477, "learning_rate": 0.0001172, "loss": 3.677, "step": 294 }, { "epoch": 0.16983304548071387, "grad_norm": 14.260014533996582, "learning_rate": 0.0001176, "loss": 3.9591, "step": 295 }, { "epoch": 0.17040875071963155, "grad_norm": 6.70070743560791, "learning_rate": 0.000118, "loss": 3.2433, "step": 296 }, { "epoch": 0.17098445595854922, "grad_norm": 11.697699546813965, "learning_rate": 0.0001184, "loss": 4.0909, "step": 297 }, { "epoch": 0.1715601611974669, "grad_norm": 10.029029846191406, "learning_rate": 0.0001188, "loss": 3.5743, "step": 298 }, { "epoch": 0.17213586643638457, "grad_norm": 6.6930365562438965, "learning_rate": 0.0001192, "loss": 3.2007, "step": 299 }, { "epoch": 0.17271157167530224, "grad_norm": 21.772619247436523, "learning_rate": 0.00011960000000000001, "loss": 3.8505, "step": 300 }, { "epoch": 0.1732872769142199, "grad_norm": 9.126256942749023, "learning_rate": 0.00012, "loss": 3.5777, "step": 301 }, { "epoch": 0.17386298215313759, "grad_norm": 7.574469566345215, "learning_rate": 0.0001204, "loss": 3.5329, "step": 302 }, { "epoch": 0.17443868739205526, "grad_norm": 6.436075687408447, "learning_rate": 0.0001208, "loss": 3.279, "step": 303 }, { "epoch": 0.17501439263097293, "grad_norm": 5.945929527282715, "learning_rate": 0.0001212, "loss": 3.4338, "step": 304 }, { "epoch": 0.1755900978698906, "grad_norm": 5.7057785987854, "learning_rate": 0.0001216, "loss": 3.2369, "step": 305 }, { "epoch": 0.17616580310880828, "grad_norm": 9.411810874938965, "learning_rate": 0.000122, "loss": 3.5364, "step": 306 }, { "epoch": 0.17674150834772595, "grad_norm": 8.872260093688965, "learning_rate": 0.0001224, "loss": 3.7803, "step": 307 }, { "epoch": 0.17731721358664365, "grad_norm": 46.1115837097168, "learning_rate": 0.0001228, "loss": 3.7188, "step": 308 }, { "epoch": 0.17789291882556132, "grad_norm": 48.33805465698242, "learning_rate": 0.0001232, "loss": 3.7491, "step": 309 }, { "epoch": 0.178468624064479, "grad_norm": 7.272097587585449, "learning_rate": 0.0001236, "loss": 3.559, "step": 310 }, { "epoch": 0.17904432930339667, "grad_norm": 7.471408367156982, "learning_rate": 0.000124, "loss": 3.6014, "step": 311 }, { "epoch": 0.17962003454231434, "grad_norm": 11.095893859863281, "learning_rate": 0.00012440000000000002, "loss": 3.5741, "step": 312 }, { "epoch": 0.18019573978123202, "grad_norm": 8.782601356506348, "learning_rate": 0.0001248, "loss": 3.2475, "step": 313 }, { "epoch": 0.1807714450201497, "grad_norm": 7.485610485076904, "learning_rate": 0.0001252, "loss": 3.0304, "step": 314 }, { "epoch": 0.18134715025906736, "grad_norm": 7.794425964355469, "learning_rate": 0.00012560000000000002, "loss": 2.9428, "step": 315 }, { "epoch": 0.18192285549798504, "grad_norm": 6.470662593841553, "learning_rate": 0.000126, "loss": 3.4341, "step": 316 }, { "epoch": 0.1824985607369027, "grad_norm": 10.054426193237305, "learning_rate": 0.0001264, "loss": 2.941, "step": 317 }, { "epoch": 0.18307426597582038, "grad_norm": 93.38629150390625, "learning_rate": 0.00012680000000000002, "loss": 4.2291, "step": 318 }, { "epoch": 0.18364997121473806, "grad_norm": 9.805968284606934, "learning_rate": 0.0001272, "loss": 3.0641, "step": 319 }, { "epoch": 0.18422567645365573, "grad_norm": 6.104334831237793, "learning_rate": 0.0001276, "loss": 3.0856, "step": 320 }, { "epoch": 0.1848013816925734, "grad_norm": 8.24195384979248, "learning_rate": 0.00012800000000000002, "loss": 3.0774, "step": 321 }, { "epoch": 0.18537708693149108, "grad_norm": 6.327628135681152, "learning_rate": 0.0001284, "loss": 3.0826, "step": 322 }, { "epoch": 0.18595279217040875, "grad_norm": 11.529990196228027, "learning_rate": 0.00012880000000000001, "loss": 3.7882, "step": 323 }, { "epoch": 0.18652849740932642, "grad_norm": 9.700762748718262, "learning_rate": 0.00012920000000000002, "loss": 3.4958, "step": 324 }, { "epoch": 0.1871042026482441, "grad_norm": 10.289152145385742, "learning_rate": 0.0001296, "loss": 3.3652, "step": 325 }, { "epoch": 0.18767990788716177, "grad_norm": 6.888269901275635, "learning_rate": 0.00013000000000000002, "loss": 3.1086, "step": 326 }, { "epoch": 0.18825561312607944, "grad_norm": 9.220719337463379, "learning_rate": 0.0001304, "loss": 3.5314, "step": 327 }, { "epoch": 0.1888313183649971, "grad_norm": 9.044048309326172, "learning_rate": 0.0001308, "loss": 2.943, "step": 328 }, { "epoch": 0.1894070236039148, "grad_norm": 11.338268280029297, "learning_rate": 0.00013120000000000002, "loss": 3.4617, "step": 329 }, { "epoch": 0.18998272884283246, "grad_norm": 5.949525833129883, "learning_rate": 0.0001316, "loss": 2.8324, "step": 330 }, { "epoch": 0.19055843408175013, "grad_norm": 9.158703804016113, "learning_rate": 0.000132, "loss": 3.1961, "step": 331 }, { "epoch": 0.1911341393206678, "grad_norm": 8.708706855773926, "learning_rate": 0.00013240000000000002, "loss": 3.1941, "step": 332 }, { "epoch": 0.19170984455958548, "grad_norm": 10.610583305358887, "learning_rate": 0.0001328, "loss": 3.3617, "step": 333 }, { "epoch": 0.19228554979850315, "grad_norm": 8.023892402648926, "learning_rate": 0.0001332, "loss": 3.1775, "step": 334 }, { "epoch": 0.19286125503742085, "grad_norm": 7.895623683929443, "learning_rate": 0.00013360000000000002, "loss": 3.1033, "step": 335 }, { "epoch": 0.19343696027633853, "grad_norm": 6.376975059509277, "learning_rate": 0.000134, "loss": 2.808, "step": 336 }, { "epoch": 0.1940126655152562, "grad_norm": 5.185142993927002, "learning_rate": 0.00013440000000000001, "loss": 2.8337, "step": 337 }, { "epoch": 0.19458837075417387, "grad_norm": 6.408693790435791, "learning_rate": 0.00013480000000000002, "loss": 3.0604, "step": 338 }, { "epoch": 0.19516407599309155, "grad_norm": 21.610239028930664, "learning_rate": 0.0001352, "loss": 3.431, "step": 339 }, { "epoch": 0.19573978123200922, "grad_norm": 9.485398292541504, "learning_rate": 0.00013560000000000002, "loss": 3.2208, "step": 340 }, { "epoch": 0.1963154864709269, "grad_norm": 6.460340976715088, "learning_rate": 0.00013600000000000003, "loss": 2.793, "step": 341 }, { "epoch": 0.19689119170984457, "grad_norm": 5.64215612411499, "learning_rate": 0.0001364, "loss": 2.8589, "step": 342 }, { "epoch": 0.19746689694876224, "grad_norm": 6.9033427238464355, "learning_rate": 0.00013680000000000002, "loss": 3.1031, "step": 343 }, { "epoch": 0.1980426021876799, "grad_norm": 5.724493980407715, "learning_rate": 0.00013720000000000003, "loss": 2.8605, "step": 344 }, { "epoch": 0.19861830742659758, "grad_norm": 15.779448509216309, "learning_rate": 0.00013759999999999998, "loss": 3.2151, "step": 345 }, { "epoch": 0.19919401266551526, "grad_norm": 6.960752964019775, "learning_rate": 0.000138, "loss": 2.8537, "step": 346 }, { "epoch": 0.19976971790443293, "grad_norm": 8.871850967407227, "learning_rate": 0.0001384, "loss": 2.7536, "step": 347 }, { "epoch": 0.2003454231433506, "grad_norm": 6.670348644256592, "learning_rate": 0.00013879999999999999, "loss": 2.9525, "step": 348 }, { "epoch": 0.20092112838226828, "grad_norm": 9.574007034301758, "learning_rate": 0.0001392, "loss": 2.7996, "step": 349 }, { "epoch": 0.20149683362118595, "grad_norm": 5.3862223625183105, "learning_rate": 0.0001396, "loss": 2.662, "step": 350 }, { "epoch": 0.20207253886010362, "grad_norm": 11.832735061645508, "learning_rate": 0.00014, "loss": 3.1706, "step": 351 }, { "epoch": 0.2026482440990213, "grad_norm": 8.553043365478516, "learning_rate": 0.0001404, "loss": 2.8034, "step": 352 }, { "epoch": 0.20322394933793897, "grad_norm": 17.231216430664062, "learning_rate": 0.0001408, "loss": 2.8267, "step": 353 }, { "epoch": 0.20379965457685664, "grad_norm": 10.80978012084961, "learning_rate": 0.0001412, "loss": 2.7008, "step": 354 }, { "epoch": 0.20437535981577432, "grad_norm": 7.117002010345459, "learning_rate": 0.0001416, "loss": 2.5399, "step": 355 }, { "epoch": 0.204951065054692, "grad_norm": 5.009802341461182, "learning_rate": 0.000142, "loss": 2.7215, "step": 356 }, { "epoch": 0.20552677029360966, "grad_norm": 16.786869049072266, "learning_rate": 0.0001424, "loss": 2.9873, "step": 357 }, { "epoch": 0.20610247553252733, "grad_norm": 7.779325008392334, "learning_rate": 0.0001428, "loss": 2.892, "step": 358 }, { "epoch": 0.206678180771445, "grad_norm": 9.354433059692383, "learning_rate": 0.0001432, "loss": 2.7065, "step": 359 }, { "epoch": 0.20725388601036268, "grad_norm": 13.15522575378418, "learning_rate": 0.0001436, "loss": 2.8061, "step": 360 }, { "epoch": 0.20782959124928038, "grad_norm": 6.927896976470947, "learning_rate": 0.000144, "loss": 2.8687, "step": 361 }, { "epoch": 0.20840529648819806, "grad_norm": 8.532772064208984, "learning_rate": 0.0001444, "loss": 2.9418, "step": 362 }, { "epoch": 0.20898100172711573, "grad_norm": 8.618231773376465, "learning_rate": 0.0001448, "loss": 2.588, "step": 363 }, { "epoch": 0.2095567069660334, "grad_norm": 4.94150447845459, "learning_rate": 0.0001452, "loss": 2.5464, "step": 364 }, { "epoch": 0.21013241220495107, "grad_norm": 5.547298431396484, "learning_rate": 0.00014560000000000002, "loss": 2.755, "step": 365 }, { "epoch": 0.21070811744386875, "grad_norm": 8.270822525024414, "learning_rate": 0.000146, "loss": 2.8345, "step": 366 }, { "epoch": 0.21128382268278642, "grad_norm": 6.572064399719238, "learning_rate": 0.0001464, "loss": 2.6624, "step": 367 }, { "epoch": 0.2118595279217041, "grad_norm": 8.243054389953613, "learning_rate": 0.00014680000000000002, "loss": 2.7102, "step": 368 }, { "epoch": 0.21243523316062177, "grad_norm": 6.671678066253662, "learning_rate": 0.0001472, "loss": 2.4775, "step": 369 }, { "epoch": 0.21301093839953944, "grad_norm": 5.922910690307617, "learning_rate": 0.0001476, "loss": 2.919, "step": 370 }, { "epoch": 0.2135866436384571, "grad_norm": 12.84566593170166, "learning_rate": 0.000148, "loss": 2.5189, "step": 371 }, { "epoch": 0.2141623488773748, "grad_norm": 7.342642307281494, "learning_rate": 0.0001484, "loss": 2.8968, "step": 372 }, { "epoch": 0.21473805411629246, "grad_norm": 14.625147819519043, "learning_rate": 0.0001488, "loss": 2.6793, "step": 373 }, { "epoch": 0.21531375935521013, "grad_norm": 6.683467388153076, "learning_rate": 0.0001492, "loss": 2.3975, "step": 374 }, { "epoch": 0.2158894645941278, "grad_norm": 12.186212539672852, "learning_rate": 0.0001496, "loss": 2.856, "step": 375 }, { "epoch": 0.21646516983304548, "grad_norm": 8.417567253112793, "learning_rate": 0.00015000000000000001, "loss": 2.6326, "step": 376 }, { "epoch": 0.21704087507196315, "grad_norm": 5.414144992828369, "learning_rate": 0.0001504, "loss": 2.7417, "step": 377 }, { "epoch": 0.21761658031088082, "grad_norm": 13.388712882995605, "learning_rate": 0.0001508, "loss": 2.8813, "step": 378 }, { "epoch": 0.2181922855497985, "grad_norm": 6.375700950622559, "learning_rate": 0.00015120000000000002, "loss": 2.7187, "step": 379 }, { "epoch": 0.21876799078871617, "grad_norm": 9.897554397583008, "learning_rate": 0.0001516, "loss": 2.6278, "step": 380 }, { "epoch": 0.21934369602763384, "grad_norm": 10.079334259033203, "learning_rate": 0.000152, "loss": 2.4861, "step": 381 }, { "epoch": 0.21991940126655152, "grad_norm": 10.082268714904785, "learning_rate": 0.00015240000000000002, "loss": 2.6516, "step": 382 }, { "epoch": 0.2204951065054692, "grad_norm": 9.192161560058594, "learning_rate": 0.0001528, "loss": 2.3307, "step": 383 }, { "epoch": 0.22107081174438686, "grad_norm": 8.085034370422363, "learning_rate": 0.0001532, "loss": 2.3445, "step": 384 }, { "epoch": 0.22164651698330454, "grad_norm": 5.418321132659912, "learning_rate": 0.00015360000000000002, "loss": 2.7119, "step": 385 }, { "epoch": 0.2222222222222222, "grad_norm": 16.515369415283203, "learning_rate": 0.000154, "loss": 2.6647, "step": 386 }, { "epoch": 0.22279792746113988, "grad_norm": 11.138907432556152, "learning_rate": 0.0001544, "loss": 2.6742, "step": 387 }, { "epoch": 0.22337363270005758, "grad_norm": 20.75733184814453, "learning_rate": 0.00015480000000000002, "loss": 2.8834, "step": 388 }, { "epoch": 0.22394933793897526, "grad_norm": 8.349270820617676, "learning_rate": 0.0001552, "loss": 2.6376, "step": 389 }, { "epoch": 0.22452504317789293, "grad_norm": 6.902172088623047, "learning_rate": 0.00015560000000000001, "loss": 2.6186, "step": 390 }, { "epoch": 0.2251007484168106, "grad_norm": 14.718120574951172, "learning_rate": 0.00015600000000000002, "loss": 2.7649, "step": 391 }, { "epoch": 0.22567645365572828, "grad_norm": 5.805610656738281, "learning_rate": 0.0001564, "loss": 2.6221, "step": 392 }, { "epoch": 0.22625215889464595, "grad_norm": 6.138345718383789, "learning_rate": 0.00015680000000000002, "loss": 2.5751, "step": 393 }, { "epoch": 0.22682786413356362, "grad_norm": 29.98923683166504, "learning_rate": 0.00015720000000000003, "loss": 2.824, "step": 394 }, { "epoch": 0.2274035693724813, "grad_norm": 31.91318702697754, "learning_rate": 0.0001576, "loss": 2.6134, "step": 395 }, { "epoch": 0.22797927461139897, "grad_norm": 10.812357902526855, "learning_rate": 0.00015800000000000002, "loss": 2.4594, "step": 396 }, { "epoch": 0.22855497985031664, "grad_norm": 7.6294755935668945, "learning_rate": 0.00015840000000000003, "loss": 2.5259, "step": 397 }, { "epoch": 0.22913068508923431, "grad_norm": 5.666753768920898, "learning_rate": 0.0001588, "loss": 2.6108, "step": 398 }, { "epoch": 0.229706390328152, "grad_norm": 6.732410907745361, "learning_rate": 0.00015920000000000002, "loss": 2.5352, "step": 399 }, { "epoch": 0.23028209556706966, "grad_norm": 6.749885082244873, "learning_rate": 0.0001596, "loss": 2.5103, "step": 400 }, { "epoch": 0.23085780080598733, "grad_norm": 5.389144420623779, "learning_rate": 0.00016, "loss": 2.5803, "step": 401 }, { "epoch": 0.231433506044905, "grad_norm": 6.996800422668457, "learning_rate": 0.00016040000000000002, "loss": 2.7746, "step": 402 }, { "epoch": 0.23200921128382268, "grad_norm": 22.8950138092041, "learning_rate": 0.0001608, "loss": 2.5619, "step": 403 }, { "epoch": 0.23258491652274035, "grad_norm": 11.477226257324219, "learning_rate": 0.00016120000000000002, "loss": 2.4898, "step": 404 }, { "epoch": 0.23316062176165803, "grad_norm": 8.584878921508789, "learning_rate": 0.00016160000000000002, "loss": 2.4191, "step": 405 }, { "epoch": 0.2337363270005757, "grad_norm": 6.987226963043213, "learning_rate": 0.000162, "loss": 2.4045, "step": 406 }, { "epoch": 0.23431203223949337, "grad_norm": 12.917460441589355, "learning_rate": 0.00016240000000000002, "loss": 2.656, "step": 407 }, { "epoch": 0.23488773747841105, "grad_norm": 13.053242683410645, "learning_rate": 0.0001628, "loss": 2.5026, "step": 408 }, { "epoch": 0.23546344271732872, "grad_norm": 6.013350486755371, "learning_rate": 0.0001632, "loss": 2.4027, "step": 409 }, { "epoch": 0.2360391479562464, "grad_norm": 21.95798110961914, "learning_rate": 0.0001636, "loss": 2.4155, "step": 410 }, { "epoch": 0.23661485319516407, "grad_norm": 6.197417259216309, "learning_rate": 0.000164, "loss": 2.2512, "step": 411 }, { "epoch": 0.23719055843408174, "grad_norm": 5.798823356628418, "learning_rate": 0.0001644, "loss": 2.5775, "step": 412 }, { "epoch": 0.2377662636729994, "grad_norm": 12.58922290802002, "learning_rate": 0.0001648, "loss": 2.2999, "step": 413 }, { "epoch": 0.23834196891191708, "grad_norm": 6.5375213623046875, "learning_rate": 0.0001652, "loss": 2.5818, "step": 414 }, { "epoch": 0.23891767415083479, "grad_norm": 8.916410446166992, "learning_rate": 0.0001656, "loss": 2.3589, "step": 415 }, { "epoch": 0.23949337938975246, "grad_norm": 7.457561492919922, "learning_rate": 0.000166, "loss": 2.5489, "step": 416 }, { "epoch": 0.24006908462867013, "grad_norm": 18.522987365722656, "learning_rate": 0.0001664, "loss": 2.6065, "step": 417 }, { "epoch": 0.2406447898675878, "grad_norm": 64.20520782470703, "learning_rate": 0.0001668, "loss": 2.7258, "step": 418 }, { "epoch": 0.24122049510650548, "grad_norm": 40.07137680053711, "learning_rate": 0.0001672, "loss": 2.6834, "step": 419 }, { "epoch": 0.24179620034542315, "grad_norm": 6.103574752807617, "learning_rate": 0.0001676, "loss": 2.4909, "step": 420 }, { "epoch": 0.24237190558434082, "grad_norm": 6.48091983795166, "learning_rate": 0.000168, "loss": 2.5598, "step": 421 }, { "epoch": 0.2429476108232585, "grad_norm": 6.65122127532959, "learning_rate": 0.0001684, "loss": 2.0797, "step": 422 }, { "epoch": 0.24352331606217617, "grad_norm": 7.160250663757324, "learning_rate": 0.0001688, "loss": 2.4701, "step": 423 }, { "epoch": 0.24409902130109384, "grad_norm": 5.73784875869751, "learning_rate": 0.0001692, "loss": 2.333, "step": 424 }, { "epoch": 0.24467472654001152, "grad_norm": 21.651309967041016, "learning_rate": 0.0001696, "loss": 2.5034, "step": 425 }, { "epoch": 0.2452504317789292, "grad_norm": 17.80324935913086, "learning_rate": 0.00017, "loss": 2.4943, "step": 426 }, { "epoch": 0.24582613701784686, "grad_norm": 6.137923240661621, "learning_rate": 0.0001704, "loss": 2.4143, "step": 427 }, { "epoch": 0.24640184225676454, "grad_norm": 5.833311080932617, "learning_rate": 0.0001708, "loss": 2.4598, "step": 428 }, { "epoch": 0.2469775474956822, "grad_norm": 20.596446990966797, "learning_rate": 0.00017120000000000001, "loss": 2.4136, "step": 429 }, { "epoch": 0.24755325273459988, "grad_norm": 5.577768802642822, "learning_rate": 0.0001716, "loss": 2.4349, "step": 430 }, { "epoch": 0.24812895797351756, "grad_norm": 6.16340446472168, "learning_rate": 0.000172, "loss": 2.5712, "step": 431 }, { "epoch": 0.24870466321243523, "grad_norm": 5.587292671203613, "learning_rate": 0.00017240000000000002, "loss": 2.522, "step": 432 }, { "epoch": 0.2492803684513529, "grad_norm": 7.1100945472717285, "learning_rate": 0.0001728, "loss": 2.2343, "step": 433 }, { "epoch": 0.24985607369027057, "grad_norm": 6.089508056640625, "learning_rate": 0.0001732, "loss": 2.5291, "step": 434 }, { "epoch": 0.2504317789291883, "grad_norm": 12.8109769821167, "learning_rate": 0.00017360000000000002, "loss": 2.4944, "step": 435 }, { "epoch": 0.2510074841681059, "grad_norm": 9.722925186157227, "learning_rate": 0.000174, "loss": 2.2176, "step": 436 }, { "epoch": 0.2515831894070236, "grad_norm": 13.540785789489746, "learning_rate": 0.0001744, "loss": 2.3636, "step": 437 }, { "epoch": 0.25215889464594127, "grad_norm": 22.12358856201172, "learning_rate": 0.00017480000000000002, "loss": 2.4757, "step": 438 }, { "epoch": 0.25273459988485897, "grad_norm": 8.760823249816895, "learning_rate": 0.0001752, "loss": 2.256, "step": 439 }, { "epoch": 0.2533103051237766, "grad_norm": 7.311398506164551, "learning_rate": 0.0001756, "loss": 2.3274, "step": 440 }, { "epoch": 0.2538860103626943, "grad_norm": 9.8610200881958, "learning_rate": 0.00017600000000000002, "loss": 2.4749, "step": 441 }, { "epoch": 0.25446171560161196, "grad_norm": 7.475802898406982, "learning_rate": 0.0001764, "loss": 2.4547, "step": 442 }, { "epoch": 0.25503742084052966, "grad_norm": 13.036137580871582, "learning_rate": 0.00017680000000000001, "loss": 2.1679, "step": 443 }, { "epoch": 0.2556131260794473, "grad_norm": 11.247735977172852, "learning_rate": 0.0001772, "loss": 2.5446, "step": 444 }, { "epoch": 0.256188831318365, "grad_norm": 7.0622124671936035, "learning_rate": 0.0001776, "loss": 2.3196, "step": 445 }, { "epoch": 0.25676453655728265, "grad_norm": 5.404714107513428, "learning_rate": 0.00017800000000000002, "loss": 2.2713, "step": 446 }, { "epoch": 0.25734024179620035, "grad_norm": 44.592891693115234, "learning_rate": 0.0001784, "loss": 2.2287, "step": 447 }, { "epoch": 0.257915947035118, "grad_norm": 30.109132766723633, "learning_rate": 0.0001788, "loss": 2.3153, "step": 448 }, { "epoch": 0.2584916522740357, "grad_norm": 15.7490873336792, "learning_rate": 0.00017920000000000002, "loss": 2.3081, "step": 449 }, { "epoch": 0.25906735751295334, "grad_norm": 13.772661209106445, "learning_rate": 0.0001796, "loss": 2.2548, "step": 450 }, { "epoch": 0.25964306275187105, "grad_norm": 6.858334064483643, "learning_rate": 0.00018, "loss": 2.4804, "step": 451 }, { "epoch": 0.2602187679907887, "grad_norm": 6.23155403137207, "learning_rate": 0.00018040000000000002, "loss": 2.4281, "step": 452 }, { "epoch": 0.2607944732297064, "grad_norm": 5.4447150230407715, "learning_rate": 0.0001808, "loss": 2.342, "step": 453 }, { "epoch": 0.26137017846862404, "grad_norm": 11.79716682434082, "learning_rate": 0.0001812, "loss": 2.2528, "step": 454 }, { "epoch": 0.26194588370754174, "grad_norm": 10.708625793457031, "learning_rate": 0.00018160000000000002, "loss": 2.0204, "step": 455 }, { "epoch": 0.26252158894645944, "grad_norm": 21.41659164428711, "learning_rate": 0.000182, "loss": 2.2593, "step": 456 }, { "epoch": 0.2630972941853771, "grad_norm": 5.636983394622803, "learning_rate": 0.00018240000000000002, "loss": 2.1412, "step": 457 }, { "epoch": 0.2636729994242948, "grad_norm": 9.639352798461914, "learning_rate": 0.00018280000000000003, "loss": 2.1103, "step": 458 }, { "epoch": 0.26424870466321243, "grad_norm": 4.263064384460449, "learning_rate": 0.0001832, "loss": 2.2493, "step": 459 }, { "epoch": 0.26482440990213013, "grad_norm": 8.983839988708496, "learning_rate": 0.00018360000000000002, "loss": 2.3782, "step": 460 }, { "epoch": 0.2654001151410478, "grad_norm": 9.911988258361816, "learning_rate": 0.00018400000000000003, "loss": 2.2117, "step": 461 }, { "epoch": 0.2659758203799655, "grad_norm": 8.42939567565918, "learning_rate": 0.0001844, "loss": 2.0942, "step": 462 }, { "epoch": 0.2665515256188831, "grad_norm": 9.866201400756836, "learning_rate": 0.00018480000000000002, "loss": 2.2761, "step": 463 }, { "epoch": 0.2671272308578008, "grad_norm": 4.9825758934021, "learning_rate": 0.00018520000000000003, "loss": 2.1351, "step": 464 }, { "epoch": 0.26770293609671847, "grad_norm": 3.4520153999328613, "learning_rate": 0.0001856, "loss": 2.234, "step": 465 }, { "epoch": 0.26827864133563617, "grad_norm": 6.94691276550293, "learning_rate": 0.00018600000000000002, "loss": 2.1368, "step": 466 }, { "epoch": 0.2688543465745538, "grad_norm": 19.923587799072266, "learning_rate": 0.00018640000000000003, "loss": 2.4301, "step": 467 }, { "epoch": 0.2694300518134715, "grad_norm": 24.741535186767578, "learning_rate": 0.00018680000000000001, "loss": 2.256, "step": 468 }, { "epoch": 0.27000575705238916, "grad_norm": 9.313246726989746, "learning_rate": 0.00018720000000000002, "loss": 2.6483, "step": 469 }, { "epoch": 0.27058146229130686, "grad_norm": 10.217698097229004, "learning_rate": 0.0001876, "loss": 1.8293, "step": 470 }, { "epoch": 0.2711571675302245, "grad_norm": 28.85066032409668, "learning_rate": 0.000188, "loss": 2.3165, "step": 471 }, { "epoch": 0.2717328727691422, "grad_norm": 5.764794826507568, "learning_rate": 0.0001884, "loss": 2.6914, "step": 472 }, { "epoch": 0.27230857800805985, "grad_norm": 8.115283966064453, "learning_rate": 0.0001888, "loss": 2.5605, "step": 473 }, { "epoch": 0.27288428324697755, "grad_norm": 11.941910743713379, "learning_rate": 0.0001892, "loss": 1.9626, "step": 474 }, { "epoch": 0.2734599884858952, "grad_norm": 11.117420196533203, "learning_rate": 0.0001896, "loss": 2.4614, "step": 475 }, { "epoch": 0.2740356937248129, "grad_norm": 6.908642292022705, "learning_rate": 0.00019, "loss": 2.3911, "step": 476 }, { "epoch": 0.27461139896373055, "grad_norm": 10.433818817138672, "learning_rate": 0.0001904, "loss": 2.3747, "step": 477 }, { "epoch": 0.27518710420264825, "grad_norm": 8.546224594116211, "learning_rate": 0.0001908, "loss": 2.2947, "step": 478 }, { "epoch": 0.2757628094415659, "grad_norm": 5.434266090393066, "learning_rate": 0.0001912, "loss": 2.2115, "step": 479 }, { "epoch": 0.2763385146804836, "grad_norm": 9.27397346496582, "learning_rate": 0.0001916, "loss": 2.2165, "step": 480 }, { "epoch": 0.27691421991940124, "grad_norm": 4.052639484405518, "learning_rate": 0.000192, "loss": 2.1148, "step": 481 }, { "epoch": 0.27748992515831894, "grad_norm": 7.541112422943115, "learning_rate": 0.00019240000000000001, "loss": 2.2489, "step": 482 }, { "epoch": 0.27806563039723664, "grad_norm": 20.005165100097656, "learning_rate": 0.0001928, "loss": 2.3552, "step": 483 }, { "epoch": 0.2786413356361543, "grad_norm": 6.74354362487793, "learning_rate": 0.0001932, "loss": 2.0027, "step": 484 }, { "epoch": 0.279217040875072, "grad_norm": 4.244668960571289, "learning_rate": 0.00019360000000000002, "loss": 2.466, "step": 485 }, { "epoch": 0.27979274611398963, "grad_norm": 32.92999267578125, "learning_rate": 0.000194, "loss": 2.0058, "step": 486 }, { "epoch": 0.28036845135290733, "grad_norm": 5.099974155426025, "learning_rate": 0.0001944, "loss": 2.1534, "step": 487 }, { "epoch": 0.280944156591825, "grad_norm": 8.950968742370605, "learning_rate": 0.0001948, "loss": 2.3272, "step": 488 }, { "epoch": 0.2815198618307427, "grad_norm": 29.126623153686523, "learning_rate": 0.0001952, "loss": 2.0942, "step": 489 }, { "epoch": 0.2820955670696603, "grad_norm": 26.04970932006836, "learning_rate": 0.0001956, "loss": 2.3703, "step": 490 }, { "epoch": 0.282671272308578, "grad_norm": 7.4286370277404785, "learning_rate": 0.000196, "loss": 1.8691, "step": 491 }, { "epoch": 0.28324697754749567, "grad_norm": 6.331235408782959, "learning_rate": 0.0001964, "loss": 2.2338, "step": 492 }, { "epoch": 0.28382268278641337, "grad_norm": 4.98259162902832, "learning_rate": 0.0001968, "loss": 1.9059, "step": 493 }, { "epoch": 0.284398388025331, "grad_norm": 12.111970901489258, "learning_rate": 0.0001972, "loss": 2.0567, "step": 494 }, { "epoch": 0.2849740932642487, "grad_norm": 4.433606147766113, "learning_rate": 0.0001976, "loss": 2.061, "step": 495 }, { "epoch": 0.28554979850316636, "grad_norm": 9.483826637268066, "learning_rate": 0.00019800000000000002, "loss": 2.1855, "step": 496 }, { "epoch": 0.28612550374208406, "grad_norm": 8.829517364501953, "learning_rate": 0.0001984, "loss": 2.0813, "step": 497 }, { "epoch": 0.2867012089810017, "grad_norm": 5.547176361083984, "learning_rate": 0.0001988, "loss": 2.1782, "step": 498 }, { "epoch": 0.2872769142199194, "grad_norm": 13.865377426147461, "learning_rate": 0.00019920000000000002, "loss": 1.9131, "step": 499 }, { "epoch": 0.28785261945883706, "grad_norm": 13.441047668457031, "learning_rate": 0.0001996, "loss": 2.1865, "step": 500 }, { "epoch": 0.28842832469775476, "grad_norm": 4.224601745605469, "learning_rate": 0.0002, "loss": 2.4949, "step": 501 }, { "epoch": 0.2890040299366724, "grad_norm": 4.024444580078125, "learning_rate": 0.000199999709749734, "loss": 2.2281, "step": 502 }, { "epoch": 0.2895797351755901, "grad_norm": 6.911625862121582, "learning_rate": 0.000199998839000808, "loss": 2.0534, "step": 503 }, { "epoch": 0.29015544041450775, "grad_norm": 15.578252792358398, "learning_rate": 0.00019999738775883837, "loss": 2.1315, "step": 504 }, { "epoch": 0.29073114565342545, "grad_norm": 14.918317794799805, "learning_rate": 0.00019999535603318567, "loss": 2.1605, "step": 505 }, { "epoch": 0.2913068508923431, "grad_norm": 3.6653409004211426, "learning_rate": 0.0001999927438369545, "loss": 2.3675, "step": 506 }, { "epoch": 0.2918825561312608, "grad_norm": 9.457073211669922, "learning_rate": 0.0001999895511869936, "loss": 2.2067, "step": 507 }, { "epoch": 0.2924582613701785, "grad_norm": 16.254053115844727, "learning_rate": 0.00019998577810389551, "loss": 1.8262, "step": 508 }, { "epoch": 0.29303396660909614, "grad_norm": 12.8787260055542, "learning_rate": 0.00019998142461199664, "loss": 2.1758, "step": 509 }, { "epoch": 0.29360967184801384, "grad_norm": 7.122046947479248, "learning_rate": 0.00019997649073937707, "loss": 2.1842, "step": 510 }, { "epoch": 0.2941853770869315, "grad_norm": 7.713693618774414, "learning_rate": 0.00019997097651786033, "loss": 2.1556, "step": 511 }, { "epoch": 0.2947610823258492, "grad_norm": 5.447865962982178, "learning_rate": 0.00019996488198301314, "loss": 2.2058, "step": 512 }, { "epoch": 0.29533678756476683, "grad_norm": 10.775145530700684, "learning_rate": 0.0001999582071741453, "loss": 2.365, "step": 513 }, { "epoch": 0.29591249280368453, "grad_norm": 15.842108726501465, "learning_rate": 0.00019995095213430937, "loss": 2.1598, "step": 514 }, { "epoch": 0.2964881980426022, "grad_norm": 27.204334259033203, "learning_rate": 0.00019994311691030038, "loss": 2.3135, "step": 515 }, { "epoch": 0.2970639032815199, "grad_norm": 17.095380783081055, "learning_rate": 0.0001999347015526556, "loss": 2.1369, "step": 516 }, { "epoch": 0.2976396085204375, "grad_norm": 5.58231258392334, "learning_rate": 0.0001999257061156541, "loss": 2.1524, "step": 517 }, { "epoch": 0.2982153137593552, "grad_norm": 16.57658576965332, "learning_rate": 0.00019991613065731652, "loss": 2.2354, "step": 518 }, { "epoch": 0.2987910189982729, "grad_norm": 10.48273754119873, "learning_rate": 0.00019990597523940467, "loss": 1.9177, "step": 519 }, { "epoch": 0.2993667242371906, "grad_norm": 11.657440185546875, "learning_rate": 0.00019989523992742096, "loss": 2.1773, "step": 520 }, { "epoch": 0.2999424294761082, "grad_norm": 8.428756713867188, "learning_rate": 0.00019988392479060828, "loss": 2.0253, "step": 521 }, { "epoch": 0.3005181347150259, "grad_norm": 23.292964935302734, "learning_rate": 0.00019987202990194938, "loss": 2.1664, "step": 522 }, { "epoch": 0.30109383995394357, "grad_norm": 11.076370239257812, "learning_rate": 0.00019985955533816623, "loss": 2.1639, "step": 523 }, { "epoch": 0.30166954519286127, "grad_norm": 5.960238933563232, "learning_rate": 0.00019984650117971993, "loss": 2.28, "step": 524 }, { "epoch": 0.3022452504317789, "grad_norm": 5.293085098266602, "learning_rate": 0.00019983286751080984, "loss": 2.0797, "step": 525 }, { "epoch": 0.3028209556706966, "grad_norm": 6.787145614624023, "learning_rate": 0.00019981865441937326, "loss": 2.1623, "step": 526 }, { "epoch": 0.30339666090961426, "grad_norm": 4.942052841186523, "learning_rate": 0.00019980386199708468, "loss": 1.9676, "step": 527 }, { "epoch": 0.30397236614853196, "grad_norm": 3.7597603797912598, "learning_rate": 0.0001997884903393553, "loss": 2.0058, "step": 528 }, { "epoch": 0.3045480713874496, "grad_norm": 4.015875339508057, "learning_rate": 0.00019977253954533243, "loss": 2.4905, "step": 529 }, { "epoch": 0.3051237766263673, "grad_norm": 71.47596740722656, "learning_rate": 0.00019975600971789873, "loss": 2.2579, "step": 530 }, { "epoch": 0.30569948186528495, "grad_norm": 5.182095050811768, "learning_rate": 0.00019973890096367173, "loss": 2.0343, "step": 531 }, { "epoch": 0.30627518710420265, "grad_norm": 11.30048656463623, "learning_rate": 0.000199721213393003, "loss": 1.709, "step": 532 }, { "epoch": 0.3068508923431203, "grad_norm": 7.1390700340271, "learning_rate": 0.00019970294711997745, "loss": 1.9582, "step": 533 }, { "epoch": 0.307426597582038, "grad_norm": 4.473127841949463, "learning_rate": 0.0001996841022624127, "loss": 2.0572, "step": 534 }, { "epoch": 0.3080023028209557, "grad_norm": 15.681950569152832, "learning_rate": 0.00019966467894185812, "loss": 2.0069, "step": 535 }, { "epoch": 0.30857800805987334, "grad_norm": 3.4989001750946045, "learning_rate": 0.0001996446772835943, "loss": 2.3378, "step": 536 }, { "epoch": 0.30915371329879104, "grad_norm": 6.156066417694092, "learning_rate": 0.00019962409741663202, "loss": 2.1491, "step": 537 }, { "epoch": 0.3097294185377087, "grad_norm": 3.5091023445129395, "learning_rate": 0.00019960293947371153, "loss": 2.2707, "step": 538 }, { "epoch": 0.3103051237766264, "grad_norm": 11.965502738952637, "learning_rate": 0.00019958120359130178, "loss": 1.9268, "step": 539 }, { "epoch": 0.31088082901554404, "grad_norm": 23.450349807739258, "learning_rate": 0.0001995588899095992, "loss": 1.7938, "step": 540 }, { "epoch": 0.31145653425446174, "grad_norm": 38.560482025146484, "learning_rate": 0.00019953599857252733, "loss": 2.2267, "step": 541 }, { "epoch": 0.3120322394933794, "grad_norm": 15.327620506286621, "learning_rate": 0.00019951252972773525, "loss": 2.3168, "step": 542 }, { "epoch": 0.3126079447322971, "grad_norm": 4.420673370361328, "learning_rate": 0.0001994884835265973, "loss": 2.0744, "step": 543 }, { "epoch": 0.31318364997121473, "grad_norm": 19.81424903869629, "learning_rate": 0.00019946386012421153, "loss": 1.9736, "step": 544 }, { "epoch": 0.31375935521013243, "grad_norm": 4.647876739501953, "learning_rate": 0.00019943865967939908, "loss": 2.1716, "step": 545 }, { "epoch": 0.3143350604490501, "grad_norm": 5.088565349578857, "learning_rate": 0.00019941288235470291, "loss": 1.9915, "step": 546 }, { "epoch": 0.3149107656879678, "grad_norm": 6.194237232208252, "learning_rate": 0.00019938652831638697, "loss": 1.9701, "step": 547 }, { "epoch": 0.3154864709268854, "grad_norm": 5.4519429206848145, "learning_rate": 0.00019935959773443497, "loss": 2.2597, "step": 548 }, { "epoch": 0.3160621761658031, "grad_norm": 7.437872409820557, "learning_rate": 0.0001993320907825493, "loss": 2.3016, "step": 549 }, { "epoch": 0.31663788140472077, "grad_norm": 4.233456134796143, "learning_rate": 0.00019930400763814993, "loss": 1.8935, "step": 550 }, { "epoch": 0.31721358664363847, "grad_norm": 5.772792339324951, "learning_rate": 0.00019927534848237336, "loss": 1.6373, "step": 551 }, { "epoch": 0.3177892918825561, "grad_norm": 7.545225143432617, "learning_rate": 0.0001992461135000713, "loss": 1.9868, "step": 552 }, { "epoch": 0.3183649971214738, "grad_norm": 5.72635555267334, "learning_rate": 0.00019921630287980956, "loss": 1.7728, "step": 553 }, { "epoch": 0.31894070236039146, "grad_norm": 5.739555358886719, "learning_rate": 0.0001991859168138668, "loss": 1.8478, "step": 554 }, { "epoch": 0.31951640759930916, "grad_norm": 3.295530319213867, "learning_rate": 0.0001991549554982333, "loss": 2.1454, "step": 555 }, { "epoch": 0.3200921128382268, "grad_norm": 10.391168594360352, "learning_rate": 0.0001991234191326098, "loss": 2.2763, "step": 556 }, { "epoch": 0.3206678180771445, "grad_norm": 14.846756935119629, "learning_rate": 0.00019909130792040598, "loss": 1.9783, "step": 557 }, { "epoch": 0.32124352331606215, "grad_norm": 4.79947566986084, "learning_rate": 0.0001990586220687394, "loss": 2.1489, "step": 558 }, { "epoch": 0.32181922855497985, "grad_norm": 4.786315441131592, "learning_rate": 0.00019902536178843395, "loss": 2.0194, "step": 559 }, { "epoch": 0.3223949337938975, "grad_norm": 11.64875602722168, "learning_rate": 0.00019899152729401868, "loss": 1.8983, "step": 560 }, { "epoch": 0.3229706390328152, "grad_norm": 9.477057456970215, "learning_rate": 0.00019895711880372628, "loss": 1.9139, "step": 561 }, { "epoch": 0.3235463442717329, "grad_norm": 7.5819993019104, "learning_rate": 0.00019892213653949166, "loss": 1.843, "step": 562 }, { "epoch": 0.32412204951065055, "grad_norm": 4.9545207023620605, "learning_rate": 0.00019888658072695066, "loss": 2.1052, "step": 563 }, { "epoch": 0.32469775474956825, "grad_norm": 4.684484958648682, "learning_rate": 0.0001988504515954385, "loss": 1.7933, "step": 564 }, { "epoch": 0.3252734599884859, "grad_norm": 8.41274356842041, "learning_rate": 0.00019881374937798826, "loss": 2.0737, "step": 565 }, { "epoch": 0.3258491652274036, "grad_norm": 20.587425231933594, "learning_rate": 0.00019877647431132948, "loss": 1.6823, "step": 566 }, { "epoch": 0.32642487046632124, "grad_norm": 12.793438911437988, "learning_rate": 0.00019873862663588658, "loss": 2.1764, "step": 567 }, { "epoch": 0.32700057570523894, "grad_norm": 3.9023592472076416, "learning_rate": 0.00019870020659577725, "loss": 2.3804, "step": 568 }, { "epoch": 0.3275762809441566, "grad_norm": 5.434683799743652, "learning_rate": 0.000198661214438811, "loss": 2.162, "step": 569 }, { "epoch": 0.3281519861830743, "grad_norm": 5.3589019775390625, "learning_rate": 0.00019862165041648744, "loss": 2.2068, "step": 570 }, { "epoch": 0.32872769142199193, "grad_norm": 5.979888439178467, "learning_rate": 0.00019858151478399478, "loss": 1.9811, "step": 571 }, { "epoch": 0.32930339666090963, "grad_norm": 10.225967407226562, "learning_rate": 0.0001985408078002081, "loss": 1.7766, "step": 572 }, { "epoch": 0.3298791018998273, "grad_norm": 7.599292278289795, "learning_rate": 0.00019849952972768767, "loss": 1.851, "step": 573 }, { "epoch": 0.330454807138745, "grad_norm": 3.455409049987793, "learning_rate": 0.0001984576808326773, "loss": 1.9795, "step": 574 }, { "epoch": 0.3310305123776626, "grad_norm": 4.577173709869385, "learning_rate": 0.00019841526138510257, "loss": 2.1139, "step": 575 }, { "epoch": 0.3316062176165803, "grad_norm": 3.0964651107788086, "learning_rate": 0.00019837227165856922, "loss": 2.2629, "step": 576 }, { "epoch": 0.33218192285549797, "grad_norm": 5.796529293060303, "learning_rate": 0.0001983287119303612, "loss": 2.1801, "step": 577 }, { "epoch": 0.33275762809441567, "grad_norm": 13.77210521697998, "learning_rate": 0.00019828458248143913, "loss": 1.9382, "step": 578 }, { "epoch": 0.3333333333333333, "grad_norm": 67.12165832519531, "learning_rate": 0.00019823988359643805, "loss": 2.0376, "step": 579 }, { "epoch": 0.333909038572251, "grad_norm": 18.68467140197754, "learning_rate": 0.00019819461556366615, "loss": 2.1364, "step": 580 }, { "epoch": 0.33448474381116866, "grad_norm": 3.4618403911590576, "learning_rate": 0.00019814877867510244, "loss": 2.4019, "step": 581 }, { "epoch": 0.33506044905008636, "grad_norm": 2.982158899307251, "learning_rate": 0.00019810237322639518, "loss": 2.2236, "step": 582 }, { "epoch": 0.335636154289004, "grad_norm": 18.654964447021484, "learning_rate": 0.00019805539951685974, "loss": 2.1278, "step": 583 }, { "epoch": 0.3362118595279217, "grad_norm": 16.564912796020508, "learning_rate": 0.00019800785784947683, "loss": 1.8014, "step": 584 }, { "epoch": 0.33678756476683935, "grad_norm": 31.10694122314453, "learning_rate": 0.00019795974853089053, "loss": 1.9206, "step": 585 }, { "epoch": 0.33736327000575705, "grad_norm": 22.873422622680664, "learning_rate": 0.00019791107187140618, "loss": 2.0762, "step": 586 }, { "epoch": 0.3379389752446747, "grad_norm": 4.631661415100098, "learning_rate": 0.00019786182818498852, "loss": 1.9247, "step": 587 }, { "epoch": 0.3385146804835924, "grad_norm": 8.853531837463379, "learning_rate": 0.00019781201778925969, "loss": 1.7691, "step": 588 }, { "epoch": 0.3390903857225101, "grad_norm": 10.22801685333252, "learning_rate": 0.00019776164100549694, "loss": 2.1087, "step": 589 }, { "epoch": 0.33966609096142775, "grad_norm": 11.78855037689209, "learning_rate": 0.0001977106981586309, "loss": 2.2109, "step": 590 }, { "epoch": 0.34024179620034545, "grad_norm": 9.323369026184082, "learning_rate": 0.00019765918957724319, "loss": 2.104, "step": 591 }, { "epoch": 0.3408175014392631, "grad_norm": 10.116569519042969, "learning_rate": 0.00019760711559356449, "loss": 2.0949, "step": 592 }, { "epoch": 0.3413932066781808, "grad_norm": 9.48695182800293, "learning_rate": 0.00019755447654347226, "loss": 2.1322, "step": 593 }, { "epoch": 0.34196891191709844, "grad_norm": 11.74067497253418, "learning_rate": 0.00019750127276648872, "loss": 2.0404, "step": 594 }, { "epoch": 0.34254461715601614, "grad_norm": 11.347387313842773, "learning_rate": 0.00019744750460577856, "loss": 1.6953, "step": 595 }, { "epoch": 0.3431203223949338, "grad_norm": 3.8460686206817627, "learning_rate": 0.00019739317240814668, "loss": 2.1369, "step": 596 }, { "epoch": 0.3436960276338515, "grad_norm": 3.9272382259368896, "learning_rate": 0.00019733827652403615, "loss": 2.1408, "step": 597 }, { "epoch": 0.34427173287276913, "grad_norm": 6.900027751922607, "learning_rate": 0.00019728281730752568, "loss": 2.1793, "step": 598 }, { "epoch": 0.34484743811168683, "grad_norm": 14.802238464355469, "learning_rate": 0.00019722679511632757, "loss": 2.0497, "step": 599 }, { "epoch": 0.3454231433506045, "grad_norm": 12.431273460388184, "learning_rate": 0.00019717021031178528, "loss": 2.1025, "step": 600 }, { "epoch": 0.3459988485895222, "grad_norm": 26.413490295410156, "learning_rate": 0.00019711306325887116, "loss": 2.0722, "step": 601 }, { "epoch": 0.3465745538284398, "grad_norm": 15.21047592163086, "learning_rate": 0.000197055354326184, "loss": 1.8285, "step": 602 }, { "epoch": 0.3471502590673575, "grad_norm": 4.017702579498291, "learning_rate": 0.0001969970838859468, "loss": 2.3358, "step": 603 }, { "epoch": 0.34772596430627517, "grad_norm": 4.556800842285156, "learning_rate": 0.00019693825231400423, "loss": 2.1526, "step": 604 }, { "epoch": 0.34830166954519287, "grad_norm": 8.760571479797363, "learning_rate": 0.0001968788599898202, "loss": 2.3013, "step": 605 }, { "epoch": 0.3488773747841105, "grad_norm": 4.910373210906982, "learning_rate": 0.0001968189072964757, "loss": 2.038, "step": 606 }, { "epoch": 0.3494530800230282, "grad_norm": 14.5215425491333, "learning_rate": 0.00019675839462066582, "loss": 2.2494, "step": 607 }, { "epoch": 0.35002878526194586, "grad_norm": 7.343470573425293, "learning_rate": 0.00019669732235269775, "loss": 1.8103, "step": 608 }, { "epoch": 0.35060449050086356, "grad_norm": 6.75926399230957, "learning_rate": 0.00019663569088648796, "loss": 2.0837, "step": 609 }, { "epoch": 0.3511801957397812, "grad_norm": 4.556105136871338, "learning_rate": 0.0001965735006195598, "loss": 2.111, "step": 610 }, { "epoch": 0.3517559009786989, "grad_norm": 7.004334449768066, "learning_rate": 0.0001965107519530408, "loss": 2.1351, "step": 611 }, { "epoch": 0.35233160621761656, "grad_norm": 3.7574639320373535, "learning_rate": 0.00019644744529166025, "loss": 1.9899, "step": 612 }, { "epoch": 0.35290731145653426, "grad_norm": 7.275668144226074, "learning_rate": 0.0001963835810437465, "loss": 1.7074, "step": 613 }, { "epoch": 0.3534830166954519, "grad_norm": 4.05908727645874, "learning_rate": 0.00019631915962122436, "loss": 1.7602, "step": 614 }, { "epoch": 0.3540587219343696, "grad_norm": 3.8614981174468994, "learning_rate": 0.00019625418143961234, "loss": 1.878, "step": 615 }, { "epoch": 0.3546344271732873, "grad_norm": 3.8152644634246826, "learning_rate": 0.00019618864691802013, "loss": 2.1187, "step": 616 }, { "epoch": 0.35521013241220495, "grad_norm": 7.1591668128967285, "learning_rate": 0.00019612255647914574, "loss": 1.889, "step": 617 }, { "epoch": 0.35578583765112265, "grad_norm": 13.686311721801758, "learning_rate": 0.00019605591054927294, "loss": 1.8415, "step": 618 }, { "epoch": 0.3563615428900403, "grad_norm": 4.854591369628906, "learning_rate": 0.00019598870955826828, "loss": 2.2113, "step": 619 }, { "epoch": 0.356937248128958, "grad_norm": 8.912299156188965, "learning_rate": 0.00019592095393957868, "loss": 1.7633, "step": 620 }, { "epoch": 0.35751295336787564, "grad_norm": 3.053098678588867, "learning_rate": 0.00019585264413022818, "loss": 1.9866, "step": 621 }, { "epoch": 0.35808865860679334, "grad_norm": 22.903722763061523, "learning_rate": 0.0001957837805708155, "loss": 2.0566, "step": 622 }, { "epoch": 0.358664363845711, "grad_norm": 2.5533032417297363, "learning_rate": 0.000195714363705511, "loss": 2.0484, "step": 623 }, { "epoch": 0.3592400690846287, "grad_norm": 4.685166358947754, "learning_rate": 0.00019564439398205388, "loss": 1.9809, "step": 624 }, { "epoch": 0.35981577432354633, "grad_norm": 6.896650314331055, "learning_rate": 0.00019557387185174924, "loss": 1.9147, "step": 625 }, { "epoch": 0.36039147956246403, "grad_norm": 3.927499532699585, "learning_rate": 0.00019550279776946525, "loss": 1.8356, "step": 626 }, { "epoch": 0.3609671848013817, "grad_norm": 2.7331018447875977, "learning_rate": 0.00019543117219363016, "loss": 1.9191, "step": 627 }, { "epoch": 0.3615428900402994, "grad_norm": 6.016903400421143, "learning_rate": 0.0001953589955862294, "loss": 2.0919, "step": 628 }, { "epoch": 0.362118595279217, "grad_norm": 5.431999683380127, "learning_rate": 0.00019528626841280246, "loss": 1.5794, "step": 629 }, { "epoch": 0.3626943005181347, "grad_norm": 4.026440620422363, "learning_rate": 0.00019521299114244004, "loss": 2.2844, "step": 630 }, { "epoch": 0.36327000575705237, "grad_norm": 7.0267863273620605, "learning_rate": 0.00019513916424778097, "loss": 1.8249, "step": 631 }, { "epoch": 0.3638457109959701, "grad_norm": 4.048052787780762, "learning_rate": 0.00019506478820500918, "loss": 2.1139, "step": 632 }, { "epoch": 0.3644214162348877, "grad_norm": 3.6997714042663574, "learning_rate": 0.0001949898634938506, "loss": 1.6588, "step": 633 }, { "epoch": 0.3649971214738054, "grad_norm": 2.560260772705078, "learning_rate": 0.00019491439059757002, "loss": 1.9762, "step": 634 }, { "epoch": 0.36557282671272306, "grad_norm": 17.04377555847168, "learning_rate": 0.00019483837000296806, "loss": 1.7949, "step": 635 }, { "epoch": 0.36614853195164077, "grad_norm": 2.873385190963745, "learning_rate": 0.00019476180220037807, "loss": 1.9637, "step": 636 }, { "epoch": 0.3667242371905584, "grad_norm": 7.77009391784668, "learning_rate": 0.00019468468768366276, "loss": 1.8636, "step": 637 }, { "epoch": 0.3672999424294761, "grad_norm": 6.612690448760986, "learning_rate": 0.00019460702695021123, "loss": 1.734, "step": 638 }, { "epoch": 0.36787564766839376, "grad_norm": 4.485565185546875, "learning_rate": 0.0001945288205009357, "loss": 1.6968, "step": 639 }, { "epoch": 0.36845135290731146, "grad_norm": 2.9968698024749756, "learning_rate": 0.0001944500688402682, "loss": 1.8785, "step": 640 }, { "epoch": 0.3690270581462291, "grad_norm": 4.952812194824219, "learning_rate": 0.00019437077247615747, "loss": 1.7285, "step": 641 }, { "epoch": 0.3696027633851468, "grad_norm": 5.491662502288818, "learning_rate": 0.00019429093192006543, "loss": 1.6328, "step": 642 }, { "epoch": 0.3701784686240645, "grad_norm": 3.8749518394470215, "learning_rate": 0.00019421054768696422, "loss": 2.2014, "step": 643 }, { "epoch": 0.37075417386298215, "grad_norm": 6.7762627601623535, "learning_rate": 0.0001941296202953326, "loss": 2.0592, "step": 644 }, { "epoch": 0.37132987910189985, "grad_norm": 3.307373046875, "learning_rate": 0.00019404815026715267, "loss": 2.3178, "step": 645 }, { "epoch": 0.3719055843408175, "grad_norm": 3.4404947757720947, "learning_rate": 0.00019396613812790666, "loss": 2.3772, "step": 646 }, { "epoch": 0.3724812895797352, "grad_norm": 4.453017711639404, "learning_rate": 0.00019388358440657332, "loss": 1.8855, "step": 647 }, { "epoch": 0.37305699481865284, "grad_norm": 3.175896406173706, "learning_rate": 0.00019380048963562466, "loss": 1.8799, "step": 648 }, { "epoch": 0.37363270005757054, "grad_norm": 22.680187225341797, "learning_rate": 0.0001937168543510224, "loss": 1.4755, "step": 649 }, { "epoch": 0.3742084052964882, "grad_norm": 14.353453636169434, "learning_rate": 0.00019363267909221468, "loss": 1.9199, "step": 650 }, { "epoch": 0.3747841105354059, "grad_norm": 3.4111273288726807, "learning_rate": 0.00019354796440213237, "loss": 2.0271, "step": 651 }, { "epoch": 0.37535981577432354, "grad_norm": 2.5413737297058105, "learning_rate": 0.00019346271082718575, "loss": 2.0859, "step": 652 }, { "epoch": 0.37593552101324124, "grad_norm": 2.3607568740844727, "learning_rate": 0.00019337691891726087, "loss": 2.1843, "step": 653 }, { "epoch": 0.3765112262521589, "grad_norm": 6.652899265289307, "learning_rate": 0.00019329058922571608, "loss": 2.2823, "step": 654 }, { "epoch": 0.3770869314910766, "grad_norm": 6.970069885253906, "learning_rate": 0.00019320372230937835, "loss": 2.0684, "step": 655 }, { "epoch": 0.3776626367299942, "grad_norm": 6.668658256530762, "learning_rate": 0.00019311631872853983, "loss": 1.4474, "step": 656 }, { "epoch": 0.37823834196891193, "grad_norm": 16.344221115112305, "learning_rate": 0.00019302837904695418, "loss": 1.958, "step": 657 }, { "epoch": 0.3788140472078296, "grad_norm": 11.7747163772583, "learning_rate": 0.00019293990383183277, "loss": 1.9691, "step": 658 }, { "epoch": 0.3793897524467473, "grad_norm": 3.1886119842529297, "learning_rate": 0.00019285089365384138, "loss": 2.0175, "step": 659 }, { "epoch": 0.3799654576856649, "grad_norm": 7.979596138000488, "learning_rate": 0.00019276134908709607, "loss": 1.8705, "step": 660 }, { "epoch": 0.3805411629245826, "grad_norm": 3.5675251483917236, "learning_rate": 0.0001926712707091599, "loss": 1.754, "step": 661 }, { "epoch": 0.38111686816350027, "grad_norm": 2.696560859680176, "learning_rate": 0.00019258065910103886, "loss": 2.0815, "step": 662 }, { "epoch": 0.38169257340241797, "grad_norm": 3.2219901084899902, "learning_rate": 0.0001924895148471785, "loss": 1.9866, "step": 663 }, { "epoch": 0.3822682786413356, "grad_norm": 4.226520538330078, "learning_rate": 0.00019239783853545962, "loss": 1.848, "step": 664 }, { "epoch": 0.3828439838802533, "grad_norm": 3.089921474456787, "learning_rate": 0.00019230563075719513, "loss": 2.3039, "step": 665 }, { "epoch": 0.38341968911917096, "grad_norm": 3.959472179412842, "learning_rate": 0.00019221289210712562, "loss": 1.902, "step": 666 }, { "epoch": 0.38399539435808866, "grad_norm": 9.597257614135742, "learning_rate": 0.000192119623183416, "loss": 1.9769, "step": 667 }, { "epoch": 0.3845710995970063, "grad_norm": 15.12406063079834, "learning_rate": 0.00019202582458765138, "loss": 1.985, "step": 668 }, { "epoch": 0.385146804835924, "grad_norm": 3.456636428833008, "learning_rate": 0.00019193149692483326, "loss": 1.6099, "step": 669 }, { "epoch": 0.3857225100748417, "grad_norm": 2.650001287460327, "learning_rate": 0.00019183664080337556, "loss": 2.4192, "step": 670 }, { "epoch": 0.38629821531375935, "grad_norm": 2.6241097450256348, "learning_rate": 0.00019174125683510092, "loss": 2.3614, "step": 671 }, { "epoch": 0.38687392055267705, "grad_norm": 5.434971332550049, "learning_rate": 0.00019164534563523641, "loss": 1.782, "step": 672 }, { "epoch": 0.3874496257915947, "grad_norm": 2.1434805393218994, "learning_rate": 0.0001915489078224099, "loss": 2.0285, "step": 673 }, { "epoch": 0.3880253310305124, "grad_norm": 3.9473743438720703, "learning_rate": 0.00019145194401864581, "loss": 2.137, "step": 674 }, { "epoch": 0.38860103626943004, "grad_norm": 3.913604259490967, "learning_rate": 0.00019135445484936127, "loss": 1.7514, "step": 675 }, { "epoch": 0.38917674150834775, "grad_norm": 4.43229341506958, "learning_rate": 0.000191256440943362, "loss": 1.6698, "step": 676 }, { "epoch": 0.3897524467472654, "grad_norm": 3.1151657104492188, "learning_rate": 0.00019115790293283827, "loss": 2.2421, "step": 677 }, { "epoch": 0.3903281519861831, "grad_norm": 3.7743000984191895, "learning_rate": 0.00019105884145336085, "loss": 1.8634, "step": 678 }, { "epoch": 0.39090385722510074, "grad_norm": 2.8315064907073975, "learning_rate": 0.00019095925714387682, "loss": 1.6003, "step": 679 }, { "epoch": 0.39147956246401844, "grad_norm": 13.85084342956543, "learning_rate": 0.00019085915064670557, "loss": 1.9885, "step": 680 }, { "epoch": 0.3920552677029361, "grad_norm": 8.601191520690918, "learning_rate": 0.00019075852260753463, "loss": 1.8575, "step": 681 }, { "epoch": 0.3926309729418538, "grad_norm": 2.420215606689453, "learning_rate": 0.00019065737367541545, "loss": 2.0188, "step": 682 }, { "epoch": 0.39320667818077143, "grad_norm": 9.427199363708496, "learning_rate": 0.0001905557045027592, "loss": 1.8087, "step": 683 }, { "epoch": 0.39378238341968913, "grad_norm": 2.8770227432250977, "learning_rate": 0.00019045351574533274, "loss": 1.9231, "step": 684 }, { "epoch": 0.3943580886586068, "grad_norm": 5.514732837677002, "learning_rate": 0.00019035080806225404, "loss": 1.8629, "step": 685 }, { "epoch": 0.3949337938975245, "grad_norm": 5.505938529968262, "learning_rate": 0.00019024758211598833, "loss": 2.0178, "step": 686 }, { "epoch": 0.3955094991364421, "grad_norm": 4.23834753036499, "learning_rate": 0.00019014383857234355, "loss": 1.5748, "step": 687 }, { "epoch": 0.3960852043753598, "grad_norm": 4.225786209106445, "learning_rate": 0.00019003957810046615, "loss": 1.8404, "step": 688 }, { "epoch": 0.39666090961427747, "grad_norm": 10.373225212097168, "learning_rate": 0.00018993480137283685, "loss": 1.9054, "step": 689 }, { "epoch": 0.39723661485319517, "grad_norm": 5.083978652954102, "learning_rate": 0.00018982950906526615, "loss": 1.8938, "step": 690 }, { "epoch": 0.3978123200921128, "grad_norm": 4.478074073791504, "learning_rate": 0.00018972370185689, "loss": 1.9073, "step": 691 }, { "epoch": 0.3983880253310305, "grad_norm": 3.557939052581787, "learning_rate": 0.00018961738043016556, "loss": 1.847, "step": 692 }, { "epoch": 0.39896373056994816, "grad_norm": 4.705951690673828, "learning_rate": 0.00018951054547086666, "loss": 1.7451, "step": 693 }, { "epoch": 0.39953943580886586, "grad_norm": 2.8145923614501953, "learning_rate": 0.00018940319766807943, "loss": 2.054, "step": 694 }, { "epoch": 0.40011514104778356, "grad_norm": 10.403020858764648, "learning_rate": 0.00018929533771419783, "loss": 1.8062, "step": 695 }, { "epoch": 0.4006908462867012, "grad_norm": 5.947159290313721, "learning_rate": 0.00018918696630491915, "loss": 1.8459, "step": 696 } ], "logging_steps": 1, "max_steps": 1737, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 348, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.04526305731779e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }