{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2003454231433506, "eval_steps": 869, "global_step": 348, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005757052389176742, "grad_norm": 2.557003974914551, "learning_rate": 0.0, "loss": 5.4277, "step": 1 }, { "epoch": 0.0005757052389176742, "eval_loss": 5.319709300994873, "eval_runtime": 1026.7022, "eval_samples_per_second": 2.496, "eval_steps_per_second": 2.496, "step": 1 }, { "epoch": 0.0011514104778353484, "grad_norm": 2.985229969024658, "learning_rate": 4.0000000000000003e-07, "loss": 5.7019, "step": 2 }, { "epoch": 0.0017271157167530224, "grad_norm": 3.0353081226348877, "learning_rate": 8.000000000000001e-07, "loss": 6.1934, "step": 3 }, { "epoch": 0.002302820955670697, "grad_norm": 3.724905490875244, "learning_rate": 1.2000000000000002e-06, "loss": 5.4617, "step": 4 }, { "epoch": 0.0028785261945883708, "grad_norm": 2.6505627632141113, "learning_rate": 1.6000000000000001e-06, "loss": 5.4285, "step": 5 }, { "epoch": 0.0034542314335060447, "grad_norm": 2.7363409996032715, "learning_rate": 2.0000000000000003e-06, "loss": 5.8634, "step": 6 }, { "epoch": 0.004029936672423719, "grad_norm": 3.082538366317749, "learning_rate": 2.4000000000000003e-06, "loss": 4.7461, "step": 7 }, { "epoch": 0.004605641911341394, "grad_norm": 9.095250129699707, "learning_rate": 2.8000000000000003e-06, "loss": 7.5703, "step": 8 }, { "epoch": 0.0051813471502590676, "grad_norm": 2.2597923278808594, "learning_rate": 3.2000000000000003e-06, "loss": 5.3631, "step": 9 }, { "epoch": 0.0057570523891767415, "grad_norm": 5.053525924682617, "learning_rate": 3.6e-06, "loss": 6.0132, "step": 10 }, { "epoch": 0.0063327576280944155, "grad_norm": 2.7407820224761963, "learning_rate": 4.000000000000001e-06, "loss": 5.9776, "step": 11 }, { "epoch": 0.0069084628670120895, "grad_norm": 2.4892263412475586, "learning_rate": 4.4e-06, "loss": 5.524, "step": 12 }, { "epoch": 0.007484168105929764, "grad_norm": 2.5302274227142334, "learning_rate": 4.800000000000001e-06, "loss": 5.8044, "step": 13 }, { "epoch": 0.008059873344847437, "grad_norm": 2.992504358291626, "learning_rate": 5.2e-06, "loss": 6.0307, "step": 14 }, { "epoch": 0.008635578583765112, "grad_norm": 4.081608295440674, "learning_rate": 5.600000000000001e-06, "loss": 4.6732, "step": 15 }, { "epoch": 0.009211283822682787, "grad_norm": 2.33296799659729, "learning_rate": 6e-06, "loss": 4.6356, "step": 16 }, { "epoch": 0.00978698906160046, "grad_norm": 2.798452854156494, "learning_rate": 6.4000000000000006e-06, "loss": 5.2941, "step": 17 }, { "epoch": 0.010362694300518135, "grad_norm": 2.290029525756836, "learning_rate": 6.800000000000001e-06, "loss": 4.9405, "step": 18 }, { "epoch": 0.010938399539435808, "grad_norm": 3.2164740562438965, "learning_rate": 7.2e-06, "loss": 5.6711, "step": 19 }, { "epoch": 0.011514104778353483, "grad_norm": 2.4481987953186035, "learning_rate": 7.6e-06, "loss": 5.0366, "step": 20 }, { "epoch": 0.012089810017271158, "grad_norm": 3.398063898086548, "learning_rate": 8.000000000000001e-06, "loss": 5.9377, "step": 21 }, { "epoch": 0.012665515256188831, "grad_norm": 2.3936686515808105, "learning_rate": 8.400000000000001e-06, "loss": 5.4237, "step": 22 }, { "epoch": 0.013241220495106506, "grad_norm": 2.7233810424804688, "learning_rate": 8.8e-06, "loss": 5.6551, "step": 23 }, { "epoch": 0.013816925734024179, "grad_norm": 2.9957566261291504, "learning_rate": 9.2e-06, "loss": 4.7701, "step": 24 }, { "epoch": 0.014392630972941854, "grad_norm": 6.397132396697998, "learning_rate": 9.600000000000001e-06, "loss": 6.4459, "step": 25 }, { "epoch": 0.014968336211859529, "grad_norm": 3.0593409538269043, "learning_rate": 1e-05, "loss": 5.2758, "step": 26 }, { "epoch": 0.015544041450777202, "grad_norm": 2.9723803997039795, "learning_rate": 1.04e-05, "loss": 5.6136, "step": 27 }, { "epoch": 0.016119746689694875, "grad_norm": 2.03314471244812, "learning_rate": 1.08e-05, "loss": 5.3556, "step": 28 }, { "epoch": 0.01669545192861255, "grad_norm": 1.777107834815979, "learning_rate": 1.1200000000000001e-05, "loss": 5.1061, "step": 29 }, { "epoch": 0.017271157167530225, "grad_norm": 3.2192044258117676, "learning_rate": 1.16e-05, "loss": 5.2414, "step": 30 }, { "epoch": 0.017846862406447898, "grad_norm": 3.924452066421509, "learning_rate": 1.2e-05, "loss": 5.2754, "step": 31 }, { "epoch": 0.018422567645365574, "grad_norm": 3.5611093044281006, "learning_rate": 1.24e-05, "loss": 5.2817, "step": 32 }, { "epoch": 0.018998272884283247, "grad_norm": 2.5194263458251953, "learning_rate": 1.2800000000000001e-05, "loss": 5.9063, "step": 33 }, { "epoch": 0.01957397812320092, "grad_norm": 2.403895854949951, "learning_rate": 1.32e-05, "loss": 5.1161, "step": 34 }, { "epoch": 0.020149683362118594, "grad_norm": 2.496400833129883, "learning_rate": 1.3600000000000002e-05, "loss": 5.3049, "step": 35 }, { "epoch": 0.02072538860103627, "grad_norm": 3.0970828533172607, "learning_rate": 1.4000000000000001e-05, "loss": 5.5807, "step": 36 }, { "epoch": 0.021301093839953943, "grad_norm": 3.941403388977051, "learning_rate": 1.44e-05, "loss": 6.0418, "step": 37 }, { "epoch": 0.021876799078871616, "grad_norm": 2.291431188583374, "learning_rate": 1.48e-05, "loss": 4.3686, "step": 38 }, { "epoch": 0.022452504317789293, "grad_norm": 2.783054828643799, "learning_rate": 1.52e-05, "loss": 5.15, "step": 39 }, { "epoch": 0.023028209556706966, "grad_norm": 3.579267978668213, "learning_rate": 1.56e-05, "loss": 5.7507, "step": 40 }, { "epoch": 0.02360391479562464, "grad_norm": 3.5277323722839355, "learning_rate": 1.6000000000000003e-05, "loss": 6.112, "step": 41 }, { "epoch": 0.024179620034542316, "grad_norm": 2.5100817680358887, "learning_rate": 1.6400000000000002e-05, "loss": 5.2133, "step": 42 }, { "epoch": 0.02475532527345999, "grad_norm": 2.3821561336517334, "learning_rate": 1.6800000000000002e-05, "loss": 6.0345, "step": 43 }, { "epoch": 0.025331030512377662, "grad_norm": 3.0675108432769775, "learning_rate": 1.7199999999999998e-05, "loss": 5.2294, "step": 44 }, { "epoch": 0.025906735751295335, "grad_norm": 2.8790383338928223, "learning_rate": 1.76e-05, "loss": 5.6393, "step": 45 }, { "epoch": 0.02648244099021301, "grad_norm": 3.3649141788482666, "learning_rate": 1.8e-05, "loss": 6.014, "step": 46 }, { "epoch": 0.027058146229130685, "grad_norm": 3.4695286750793457, "learning_rate": 1.84e-05, "loss": 5.3457, "step": 47 }, { "epoch": 0.027633851468048358, "grad_norm": 3.303622245788574, "learning_rate": 1.88e-05, "loss": 5.593, "step": 48 }, { "epoch": 0.028209556706966035, "grad_norm": 2.481895923614502, "learning_rate": 1.9200000000000003e-05, "loss": 5.1439, "step": 49 }, { "epoch": 0.028785261945883708, "grad_norm": 2.888579845428467, "learning_rate": 1.9600000000000002e-05, "loss": 4.6318, "step": 50 }, { "epoch": 0.02936096718480138, "grad_norm": 3.4528300762176514, "learning_rate": 2e-05, "loss": 5.0376, "step": 51 }, { "epoch": 0.029936672423719057, "grad_norm": 3.6751370429992676, "learning_rate": 2.04e-05, "loss": 4.9183, "step": 52 }, { "epoch": 0.03051237766263673, "grad_norm": 3.382035970687866, "learning_rate": 2.08e-05, "loss": 5.499, "step": 53 }, { "epoch": 0.031088082901554404, "grad_norm": 2.8802406787872314, "learning_rate": 2.12e-05, "loss": 5.3177, "step": 54 }, { "epoch": 0.03166378814047208, "grad_norm": 6.158539772033691, "learning_rate": 2.16e-05, "loss": 6.2133, "step": 55 }, { "epoch": 0.03223949337938975, "grad_norm": 2.599864959716797, "learning_rate": 2.2000000000000003e-05, "loss": 5.3691, "step": 56 }, { "epoch": 0.03281519861830743, "grad_norm": 3.4526188373565674, "learning_rate": 2.2400000000000002e-05, "loss": 5.3801, "step": 57 }, { "epoch": 0.0333909038572251, "grad_norm": 9.494807243347168, "learning_rate": 2.2800000000000002e-05, "loss": 7.3116, "step": 58 }, { "epoch": 0.033966609096142776, "grad_norm": 4.3456130027771, "learning_rate": 2.32e-05, "loss": 4.7467, "step": 59 }, { "epoch": 0.03454231433506045, "grad_norm": 3.8471431732177734, "learning_rate": 2.36e-05, "loss": 5.2742, "step": 60 }, { "epoch": 0.03511801957397812, "grad_norm": 3.985994815826416, "learning_rate": 2.4e-05, "loss": 5.4615, "step": 61 }, { "epoch": 0.035693724812895795, "grad_norm": 9.588626861572266, "learning_rate": 2.44e-05, "loss": 6.8261, "step": 62 }, { "epoch": 0.03626943005181347, "grad_norm": 5.3343915939331055, "learning_rate": 2.48e-05, "loss": 6.0899, "step": 63 }, { "epoch": 0.03684513529073115, "grad_norm": 5.611617088317871, "learning_rate": 2.5200000000000003e-05, "loss": 6.4523, "step": 64 }, { "epoch": 0.03742084052964882, "grad_norm": 4.497012615203857, "learning_rate": 2.5600000000000002e-05, "loss": 4.787, "step": 65 }, { "epoch": 0.037996545768566495, "grad_norm": 5.032821178436279, "learning_rate": 2.6000000000000002e-05, "loss": 5.6337, "step": 66 }, { "epoch": 0.03857225100748417, "grad_norm": 3.732733726501465, "learning_rate": 2.64e-05, "loss": 5.5212, "step": 67 }, { "epoch": 0.03914795624640184, "grad_norm": 4.3597517013549805, "learning_rate": 2.6800000000000004e-05, "loss": 4.647, "step": 68 }, { "epoch": 0.039723661485319514, "grad_norm": 5.359225273132324, "learning_rate": 2.7200000000000004e-05, "loss": 5.7052, "step": 69 }, { "epoch": 0.04029936672423719, "grad_norm": 4.9161601066589355, "learning_rate": 2.7600000000000003e-05, "loss": 5.3191, "step": 70 }, { "epoch": 0.04087507196315487, "grad_norm": 4.137385368347168, "learning_rate": 2.8000000000000003e-05, "loss": 5.1797, "step": 71 }, { "epoch": 0.04145077720207254, "grad_norm": 4.728359699249268, "learning_rate": 2.84e-05, "loss": 5.1125, "step": 72 }, { "epoch": 0.042026482440990214, "grad_norm": 4.568793773651123, "learning_rate": 2.88e-05, "loss": 5.7705, "step": 73 }, { "epoch": 0.04260218767990789, "grad_norm": 4.931026935577393, "learning_rate": 2.9199999999999998e-05, "loss": 5.1052, "step": 74 }, { "epoch": 0.04317789291882556, "grad_norm": 4.697461128234863, "learning_rate": 2.96e-05, "loss": 5.1404, "step": 75 }, { "epoch": 0.04375359815774323, "grad_norm": 6.393320083618164, "learning_rate": 3e-05, "loss": 6.2212, "step": 76 }, { "epoch": 0.04432930339666091, "grad_norm": 5.876922607421875, "learning_rate": 3.04e-05, "loss": 5.7775, "step": 77 }, { "epoch": 0.044905008635578586, "grad_norm": 4.749701499938965, "learning_rate": 3.08e-05, "loss": 4.7321, "step": 78 }, { "epoch": 0.04548071387449626, "grad_norm": 4.894115447998047, "learning_rate": 3.12e-05, "loss": 5.2017, "step": 79 }, { "epoch": 0.04605641911341393, "grad_norm": 5.125804424285889, "learning_rate": 3.16e-05, "loss": 5.1661, "step": 80 }, { "epoch": 0.046632124352331605, "grad_norm": 7.571075439453125, "learning_rate": 3.2000000000000005e-05, "loss": 6.1439, "step": 81 }, { "epoch": 0.04720782959124928, "grad_norm": 4.469061374664307, "learning_rate": 3.24e-05, "loss": 5.1732, "step": 82 }, { "epoch": 0.04778353483016695, "grad_norm": 4.565371513366699, "learning_rate": 3.2800000000000004e-05, "loss": 5.4892, "step": 83 }, { "epoch": 0.04835924006908463, "grad_norm": 5.844489097595215, "learning_rate": 3.32e-05, "loss": 5.875, "step": 84 }, { "epoch": 0.048934945308002305, "grad_norm": 10.564720153808594, "learning_rate": 3.3600000000000004e-05, "loss": 5.9008, "step": 85 }, { "epoch": 0.04951065054691998, "grad_norm": 6.923472881317139, "learning_rate": 3.4000000000000007e-05, "loss": 5.4949, "step": 86 }, { "epoch": 0.05008635578583765, "grad_norm": 6.902386665344238, "learning_rate": 3.4399999999999996e-05, "loss": 4.9801, "step": 87 }, { "epoch": 0.050662061024755324, "grad_norm": 8.239148139953613, "learning_rate": 3.48e-05, "loss": 5.6578, "step": 88 }, { "epoch": 0.051237766263673, "grad_norm": 6.162630081176758, "learning_rate": 3.52e-05, "loss": 4.9911, "step": 89 }, { "epoch": 0.05181347150259067, "grad_norm": 7.2612433433532715, "learning_rate": 3.56e-05, "loss": 5.7976, "step": 90 }, { "epoch": 0.05238917674150835, "grad_norm": 6.149419784545898, "learning_rate": 3.6e-05, "loss": 4.9756, "step": 91 }, { "epoch": 0.05296488198042602, "grad_norm": 7.4116106033325195, "learning_rate": 3.6400000000000004e-05, "loss": 5.5805, "step": 92 }, { "epoch": 0.0535405872193437, "grad_norm": 5.512300491333008, "learning_rate": 3.68e-05, "loss": 4.5575, "step": 93 }, { "epoch": 0.05411629245826137, "grad_norm": 14.799551963806152, "learning_rate": 3.72e-05, "loss": 5.2244, "step": 94 }, { "epoch": 0.05469199769717904, "grad_norm": 9.756938934326172, "learning_rate": 3.76e-05, "loss": 4.8444, "step": 95 }, { "epoch": 0.055267702936096716, "grad_norm": 6.400147914886475, "learning_rate": 3.8e-05, "loss": 5.5091, "step": 96 }, { "epoch": 0.055843408175014396, "grad_norm": 8.406181335449219, "learning_rate": 3.8400000000000005e-05, "loss": 5.2641, "step": 97 }, { "epoch": 0.05641911341393207, "grad_norm": 6.860042572021484, "learning_rate": 3.88e-05, "loss": 5.2917, "step": 98 }, { "epoch": 0.05699481865284974, "grad_norm": 7.542653560638428, "learning_rate": 3.9200000000000004e-05, "loss": 5.1584, "step": 99 }, { "epoch": 0.057570523891767415, "grad_norm": 8.149137496948242, "learning_rate": 3.960000000000001e-05, "loss": 5.5326, "step": 100 }, { "epoch": 0.05814622913068509, "grad_norm": 5.590121269226074, "learning_rate": 4e-05, "loss": 5.2789, "step": 101 }, { "epoch": 0.05872193436960276, "grad_norm": 7.877676010131836, "learning_rate": 4.0400000000000006e-05, "loss": 4.8526, "step": 102 }, { "epoch": 0.059297639608520435, "grad_norm": 5.773808479309082, "learning_rate": 4.08e-05, "loss": 5.033, "step": 103 }, { "epoch": 0.059873344847438115, "grad_norm": 6.092824935913086, "learning_rate": 4.12e-05, "loss": 4.8936, "step": 104 }, { "epoch": 0.06044905008635579, "grad_norm": 5.934675693511963, "learning_rate": 4.16e-05, "loss": 4.4764, "step": 105 }, { "epoch": 0.06102475532527346, "grad_norm": 5.622652530670166, "learning_rate": 4.2e-05, "loss": 5.1344, "step": 106 }, { "epoch": 0.061600460564191134, "grad_norm": 7.697418212890625, "learning_rate": 4.24e-05, "loss": 5.2087, "step": 107 }, { "epoch": 0.06217616580310881, "grad_norm": 5.204082489013672, "learning_rate": 4.2800000000000004e-05, "loss": 4.6294, "step": 108 }, { "epoch": 0.06275187104202648, "grad_norm": 6.288537979125977, "learning_rate": 4.32e-05, "loss": 5.3009, "step": 109 }, { "epoch": 0.06332757628094415, "grad_norm": 6.717288017272949, "learning_rate": 4.36e-05, "loss": 5.5392, "step": 110 }, { "epoch": 0.06390328151986183, "grad_norm": 5.432399272918701, "learning_rate": 4.4000000000000006e-05, "loss": 4.3602, "step": 111 }, { "epoch": 0.0644789867587795, "grad_norm": 6.823062896728516, "learning_rate": 4.44e-05, "loss": 5.7343, "step": 112 }, { "epoch": 0.06505469199769717, "grad_norm": 6.532074928283691, "learning_rate": 4.4800000000000005e-05, "loss": 5.0605, "step": 113 }, { "epoch": 0.06563039723661486, "grad_norm": 5.982126712799072, "learning_rate": 4.52e-05, "loss": 5.2182, "step": 114 }, { "epoch": 0.06620610247553253, "grad_norm": 5.759943962097168, "learning_rate": 4.5600000000000004e-05, "loss": 4.9098, "step": 115 }, { "epoch": 0.0667818077144502, "grad_norm": 5.147834300994873, "learning_rate": 4.600000000000001e-05, "loss": 4.8671, "step": 116 }, { "epoch": 0.06735751295336788, "grad_norm": 8.015042304992676, "learning_rate": 4.64e-05, "loss": 5.7445, "step": 117 }, { "epoch": 0.06793321819228555, "grad_norm": 7.161843299865723, "learning_rate": 4.6800000000000006e-05, "loss": 5.9092, "step": 118 }, { "epoch": 0.06850892343120323, "grad_norm": 9.394163131713867, "learning_rate": 4.72e-05, "loss": 4.7243, "step": 119 }, { "epoch": 0.0690846286701209, "grad_norm": 4.96219539642334, "learning_rate": 4.76e-05, "loss": 4.7233, "step": 120 }, { "epoch": 0.06966033390903857, "grad_norm": 6.473387241363525, "learning_rate": 4.8e-05, "loss": 5.1295, "step": 121 }, { "epoch": 0.07023603914795624, "grad_norm": 6.797422885894775, "learning_rate": 4.8400000000000004e-05, "loss": 4.7697, "step": 122 }, { "epoch": 0.07081174438687392, "grad_norm": 6.656020641326904, "learning_rate": 4.88e-05, "loss": 5.2377, "step": 123 }, { "epoch": 0.07138744962579159, "grad_norm": 5.552718639373779, "learning_rate": 4.92e-05, "loss": 4.4741, "step": 124 }, { "epoch": 0.07196315486470926, "grad_norm": 6.101820468902588, "learning_rate": 4.96e-05, "loss": 4.4192, "step": 125 }, { "epoch": 0.07253886010362694, "grad_norm": 7.695935249328613, "learning_rate": 5e-05, "loss": 5.4128, "step": 126 }, { "epoch": 0.07311456534254462, "grad_norm": 6.9946208000183105, "learning_rate": 5.0400000000000005e-05, "loss": 5.4829, "step": 127 }, { "epoch": 0.0736902705814623, "grad_norm": 16.10480308532715, "learning_rate": 5.08e-05, "loss": 4.6945, "step": 128 }, { "epoch": 0.07426597582037997, "grad_norm": 5.313148021697998, "learning_rate": 5.1200000000000004e-05, "loss": 4.2429, "step": 129 }, { "epoch": 0.07484168105929764, "grad_norm": 5.506260871887207, "learning_rate": 5.16e-05, "loss": 4.7241, "step": 130 }, { "epoch": 0.07541738629821532, "grad_norm": 5.655925273895264, "learning_rate": 5.2000000000000004e-05, "loss": 5.4156, "step": 131 }, { "epoch": 0.07599309153713299, "grad_norm": 6.528857231140137, "learning_rate": 5.2400000000000007e-05, "loss": 5.3606, "step": 132 }, { "epoch": 0.07656879677605066, "grad_norm": 5.360299110412598, "learning_rate": 5.28e-05, "loss": 5.0686, "step": 133 }, { "epoch": 0.07714450201496834, "grad_norm": 5.301785945892334, "learning_rate": 5.3200000000000006e-05, "loss": 4.845, "step": 134 }, { "epoch": 0.07772020725388601, "grad_norm": 4.986385345458984, "learning_rate": 5.360000000000001e-05, "loss": 5.1493, "step": 135 }, { "epoch": 0.07829591249280368, "grad_norm": 5.200460433959961, "learning_rate": 5.4000000000000005e-05, "loss": 4.781, "step": 136 }, { "epoch": 0.07887161773172136, "grad_norm": 7.154032230377197, "learning_rate": 5.440000000000001e-05, "loss": 5.8801, "step": 137 }, { "epoch": 0.07944732297063903, "grad_norm": 4.641168117523193, "learning_rate": 5.4800000000000004e-05, "loss": 5.1929, "step": 138 }, { "epoch": 0.0800230282095567, "grad_norm": 4.8809123039245605, "learning_rate": 5.520000000000001e-05, "loss": 5.0221, "step": 139 }, { "epoch": 0.08059873344847437, "grad_norm": 5.0507402420043945, "learning_rate": 5.560000000000001e-05, "loss": 4.8543, "step": 140 }, { "epoch": 0.08117443868739206, "grad_norm": 6.459733963012695, "learning_rate": 5.6000000000000006e-05, "loss": 5.051, "step": 141 }, { "epoch": 0.08175014392630973, "grad_norm": 6.107847690582275, "learning_rate": 5.6399999999999995e-05, "loss": 4.8338, "step": 142 }, { "epoch": 0.08232584916522741, "grad_norm": 6.28361701965332, "learning_rate": 5.68e-05, "loss": 5.1373, "step": 143 }, { "epoch": 0.08290155440414508, "grad_norm": 4.957414627075195, "learning_rate": 5.72e-05, "loss": 4.8154, "step": 144 }, { "epoch": 0.08347725964306275, "grad_norm": 4.774332046508789, "learning_rate": 5.76e-05, "loss": 4.7262, "step": 145 }, { "epoch": 0.08405296488198043, "grad_norm": 7.41762113571167, "learning_rate": 5.8e-05, "loss": 5.5137, "step": 146 }, { "epoch": 0.0846286701208981, "grad_norm": 7.484424591064453, "learning_rate": 5.8399999999999997e-05, "loss": 5.766, "step": 147 }, { "epoch": 0.08520437535981577, "grad_norm": 4.917182922363281, "learning_rate": 5.88e-05, "loss": 5.0193, "step": 148 }, { "epoch": 0.08578008059873345, "grad_norm": 4.608645915985107, "learning_rate": 5.92e-05, "loss": 5.0873, "step": 149 }, { "epoch": 0.08635578583765112, "grad_norm": 6.5947794914245605, "learning_rate": 5.96e-05, "loss": 4.9855, "step": 150 }, { "epoch": 0.08693149107656879, "grad_norm": 3.8302507400512695, "learning_rate": 6e-05, "loss": 3.7953, "step": 151 }, { "epoch": 0.08750719631548647, "grad_norm": 3.6352171897888184, "learning_rate": 6.04e-05, "loss": 4.1647, "step": 152 }, { "epoch": 0.08808290155440414, "grad_norm": 4.818563461303711, "learning_rate": 6.08e-05, "loss": 4.2128, "step": 153 }, { "epoch": 0.08865860679332183, "grad_norm": 7.7323503494262695, "learning_rate": 6.12e-05, "loss": 5.4562, "step": 154 }, { "epoch": 0.0892343120322395, "grad_norm": 5.785284996032715, "learning_rate": 6.16e-05, "loss": 4.8956, "step": 155 }, { "epoch": 0.08981001727115717, "grad_norm": 6.181385040283203, "learning_rate": 6.2e-05, "loss": 5.2373, "step": 156 }, { "epoch": 0.09038572251007485, "grad_norm": 6.015028476715088, "learning_rate": 6.24e-05, "loss": 4.3663, "step": 157 }, { "epoch": 0.09096142774899252, "grad_norm": 4.41657829284668, "learning_rate": 6.280000000000001e-05, "loss": 4.5991, "step": 158 }, { "epoch": 0.09153713298791019, "grad_norm": 6.5107622146606445, "learning_rate": 6.32e-05, "loss": 4.8784, "step": 159 }, { "epoch": 0.09211283822682786, "grad_norm": 4.11070442199707, "learning_rate": 6.36e-05, "loss": 4.6766, "step": 160 }, { "epoch": 0.09268854346574554, "grad_norm": 8.204343795776367, "learning_rate": 6.400000000000001e-05, "loss": 5.5088, "step": 161 }, { "epoch": 0.09326424870466321, "grad_norm": 3.9389288425445557, "learning_rate": 6.440000000000001e-05, "loss": 4.3476, "step": 162 }, { "epoch": 0.09383995394358088, "grad_norm": 5.597643852233887, "learning_rate": 6.48e-05, "loss": 4.9976, "step": 163 }, { "epoch": 0.09441565918249856, "grad_norm": 8.994287490844727, "learning_rate": 6.52e-05, "loss": 5.5959, "step": 164 }, { "epoch": 0.09499136442141623, "grad_norm": 5.60779333114624, "learning_rate": 6.560000000000001e-05, "loss": 4.6283, "step": 165 }, { "epoch": 0.0955670696603339, "grad_norm": 4.319982528686523, "learning_rate": 6.6e-05, "loss": 4.041, "step": 166 }, { "epoch": 0.09614277489925158, "grad_norm": 5.684337615966797, "learning_rate": 6.64e-05, "loss": 4.8941, "step": 167 }, { "epoch": 0.09671848013816926, "grad_norm": 3.872518539428711, "learning_rate": 6.680000000000001e-05, "loss": 4.2242, "step": 168 }, { "epoch": 0.09729418537708694, "grad_norm": 4.826557636260986, "learning_rate": 6.720000000000001e-05, "loss": 4.8546, "step": 169 }, { "epoch": 0.09786989061600461, "grad_norm": 4.660156726837158, "learning_rate": 6.76e-05, "loss": 4.3797, "step": 170 }, { "epoch": 0.09844559585492228, "grad_norm": 4.616059303283691, "learning_rate": 6.800000000000001e-05, "loss": 4.7293, "step": 171 }, { "epoch": 0.09902130109383996, "grad_norm": 7.685507774353027, "learning_rate": 6.840000000000001e-05, "loss": 5.6251, "step": 172 }, { "epoch": 0.09959700633275763, "grad_norm": 7.424576282501221, "learning_rate": 6.879999999999999e-05, "loss": 4.8253, "step": 173 }, { "epoch": 0.1001727115716753, "grad_norm": 4.379521369934082, "learning_rate": 6.92e-05, "loss": 4.5287, "step": 174 }, { "epoch": 0.10074841681059298, "grad_norm": 4.753964424133301, "learning_rate": 6.96e-05, "loss": 4.5554, "step": 175 }, { "epoch": 0.10132412204951065, "grad_norm": 4.559609413146973, "learning_rate": 7e-05, "loss": 4.5615, "step": 176 }, { "epoch": 0.10189982728842832, "grad_norm": 5.178406238555908, "learning_rate": 7.04e-05, "loss": 4.6344, "step": 177 }, { "epoch": 0.102475532527346, "grad_norm": 7.4183526039123535, "learning_rate": 7.08e-05, "loss": 4.5451, "step": 178 }, { "epoch": 0.10305123776626367, "grad_norm": 5.832037448883057, "learning_rate": 7.12e-05, "loss": 4.7097, "step": 179 }, { "epoch": 0.10362694300518134, "grad_norm": 4.9681925773620605, "learning_rate": 7.16e-05, "loss": 4.6288, "step": 180 }, { "epoch": 0.10420264824409903, "grad_norm": 4.886664867401123, "learning_rate": 7.2e-05, "loss": 4.7019, "step": 181 }, { "epoch": 0.1047783534830167, "grad_norm": 4.668741226196289, "learning_rate": 7.24e-05, "loss": 4.4534, "step": 182 }, { "epoch": 0.10535405872193437, "grad_norm": 7.459389686584473, "learning_rate": 7.280000000000001e-05, "loss": 5.4758, "step": 183 }, { "epoch": 0.10592976396085205, "grad_norm": 31.545869827270508, "learning_rate": 7.32e-05, "loss": 6.179, "step": 184 }, { "epoch": 0.10650546919976972, "grad_norm": 9.739182472229004, "learning_rate": 7.36e-05, "loss": 4.9662, "step": 185 }, { "epoch": 0.1070811744386874, "grad_norm": 4.12076997756958, "learning_rate": 7.4e-05, "loss": 3.88, "step": 186 }, { "epoch": 0.10765687967760507, "grad_norm": 5.808717727661133, "learning_rate": 7.44e-05, "loss": 4.6157, "step": 187 }, { "epoch": 0.10823258491652274, "grad_norm": 3.6208741664886475, "learning_rate": 7.48e-05, "loss": 3.9156, "step": 188 }, { "epoch": 0.10880829015544041, "grad_norm": 4.674955368041992, "learning_rate": 7.52e-05, "loss": 4.4751, "step": 189 }, { "epoch": 0.10938399539435809, "grad_norm": 5.331599235534668, "learning_rate": 7.560000000000001e-05, "loss": 4.3887, "step": 190 }, { "epoch": 0.10995970063327576, "grad_norm": 5.1405534744262695, "learning_rate": 7.6e-05, "loss": 4.9114, "step": 191 }, { "epoch": 0.11053540587219343, "grad_norm": 3.7066593170166016, "learning_rate": 7.64e-05, "loss": 3.8948, "step": 192 }, { "epoch": 0.1111111111111111, "grad_norm": 5.185431003570557, "learning_rate": 7.680000000000001e-05, "loss": 4.232, "step": 193 }, { "epoch": 0.11168681635002879, "grad_norm": 4.900607585906982, "learning_rate": 7.72e-05, "loss": 4.667, "step": 194 }, { "epoch": 0.11226252158894647, "grad_norm": 5.091091632843018, "learning_rate": 7.76e-05, "loss": 4.3946, "step": 195 }, { "epoch": 0.11283822682786414, "grad_norm": 4.859619617462158, "learning_rate": 7.800000000000001e-05, "loss": 4.6306, "step": 196 }, { "epoch": 0.11341393206678181, "grad_norm": 3.544200897216797, "learning_rate": 7.840000000000001e-05, "loss": 4.2118, "step": 197 }, { "epoch": 0.11398963730569948, "grad_norm": 8.28862190246582, "learning_rate": 7.88e-05, "loss": 4.4431, "step": 198 }, { "epoch": 0.11456534254461716, "grad_norm": 6.373688220977783, "learning_rate": 7.920000000000001e-05, "loss": 4.7554, "step": 199 }, { "epoch": 0.11514104778353483, "grad_norm": 6.8544392585754395, "learning_rate": 7.960000000000001e-05, "loss": 4.8723, "step": 200 }, { "epoch": 0.1157167530224525, "grad_norm": 7.207869052886963, "learning_rate": 8e-05, "loss": 4.1096, "step": 201 }, { "epoch": 0.11629245826137018, "grad_norm": 4.9073333740234375, "learning_rate": 8.04e-05, "loss": 3.6834, "step": 202 }, { "epoch": 0.11686816350028785, "grad_norm": 6.523554801940918, "learning_rate": 8.080000000000001e-05, "loss": 4.4934, "step": 203 }, { "epoch": 0.11744386873920552, "grad_norm": 9.581537246704102, "learning_rate": 8.120000000000001e-05, "loss": 4.8199, "step": 204 }, { "epoch": 0.1180195739781232, "grad_norm": 5.319664001464844, "learning_rate": 8.16e-05, "loss": 4.0881, "step": 205 }, { "epoch": 0.11859527921704087, "grad_norm": 7.609442710876465, "learning_rate": 8.2e-05, "loss": 5.1011, "step": 206 }, { "epoch": 0.11917098445595854, "grad_norm": 5.437283515930176, "learning_rate": 8.24e-05, "loss": 4.7683, "step": 207 }, { "epoch": 0.11974668969487623, "grad_norm": 9.015962600708008, "learning_rate": 8.28e-05, "loss": 5.1197, "step": 208 }, { "epoch": 0.1203223949337939, "grad_norm": 5.41486120223999, "learning_rate": 8.32e-05, "loss": 4.2228, "step": 209 }, { "epoch": 0.12089810017271158, "grad_norm": 4.068630218505859, "learning_rate": 8.36e-05, "loss": 3.9683, "step": 210 }, { "epoch": 0.12147380541162925, "grad_norm": 4.818974494934082, "learning_rate": 8.4e-05, "loss": 4.3969, "step": 211 }, { "epoch": 0.12204951065054692, "grad_norm": 8.309637069702148, "learning_rate": 8.44e-05, "loss": 4.8983, "step": 212 }, { "epoch": 0.1226252158894646, "grad_norm": 5.997379302978516, "learning_rate": 8.48e-05, "loss": 4.6983, "step": 213 }, { "epoch": 0.12320092112838227, "grad_norm": 6.416568279266357, "learning_rate": 8.52e-05, "loss": 4.6, "step": 214 }, { "epoch": 0.12377662636729994, "grad_norm": 5.038214206695557, "learning_rate": 8.560000000000001e-05, "loss": 4.1803, "step": 215 }, { "epoch": 0.12435233160621761, "grad_norm": 5.035988807678223, "learning_rate": 8.6e-05, "loss": 4.1585, "step": 216 }, { "epoch": 0.12492803684513529, "grad_norm": 6.7663726806640625, "learning_rate": 8.64e-05, "loss": 4.4256, "step": 217 }, { "epoch": 0.12550374208405296, "grad_norm": 5.394269943237305, "learning_rate": 8.680000000000001e-05, "loss": 3.9008, "step": 218 }, { "epoch": 0.12607944732297063, "grad_norm": 5.4501800537109375, "learning_rate": 8.72e-05, "loss": 3.9869, "step": 219 }, { "epoch": 0.1266551525618883, "grad_norm": 4.7380170822143555, "learning_rate": 8.76e-05, "loss": 4.0876, "step": 220 }, { "epoch": 0.12723085780080598, "grad_norm": 6.059116840362549, "learning_rate": 8.800000000000001e-05, "loss": 4.147, "step": 221 }, { "epoch": 0.12780656303972365, "grad_norm": 5.5021586418151855, "learning_rate": 8.840000000000001e-05, "loss": 4.4547, "step": 222 }, { "epoch": 0.12838226827864133, "grad_norm": 4.760106563568115, "learning_rate": 8.88e-05, "loss": 4.075, "step": 223 }, { "epoch": 0.128957973517559, "grad_norm": 7.5847649574279785, "learning_rate": 8.92e-05, "loss": 4.6163, "step": 224 }, { "epoch": 0.12953367875647667, "grad_norm": 6.257955074310303, "learning_rate": 8.960000000000001e-05, "loss": 4.6043, "step": 225 }, { "epoch": 0.13010938399539435, "grad_norm": 7.368046283721924, "learning_rate": 9e-05, "loss": 4.7961, "step": 226 }, { "epoch": 0.13068508923431202, "grad_norm": 4.385096549987793, "learning_rate": 9.04e-05, "loss": 4.1968, "step": 227 }, { "epoch": 0.13126079447322972, "grad_norm": 6.34293794631958, "learning_rate": 9.080000000000001e-05, "loss": 4.3076, "step": 228 }, { "epoch": 0.1318364997121474, "grad_norm": 6.403743267059326, "learning_rate": 9.120000000000001e-05, "loss": 3.8917, "step": 229 }, { "epoch": 0.13241220495106507, "grad_norm": 6.792156219482422, "learning_rate": 9.16e-05, "loss": 3.9843, "step": 230 }, { "epoch": 0.13298791018998274, "grad_norm": 8.062408447265625, "learning_rate": 9.200000000000001e-05, "loss": 4.2562, "step": 231 }, { "epoch": 0.1335636154289004, "grad_norm": 8.513936042785645, "learning_rate": 9.240000000000001e-05, "loss": 4.6536, "step": 232 }, { "epoch": 0.13413932066781808, "grad_norm": 5.92789363861084, "learning_rate": 9.28e-05, "loss": 4.104, "step": 233 }, { "epoch": 0.13471502590673576, "grad_norm": 44.009300231933594, "learning_rate": 9.320000000000002e-05, "loss": 4.8297, "step": 234 }, { "epoch": 0.13529073114565343, "grad_norm": 5.342921257019043, "learning_rate": 9.360000000000001e-05, "loss": 4.0662, "step": 235 }, { "epoch": 0.1358664363845711, "grad_norm": 5.618771076202393, "learning_rate": 9.4e-05, "loss": 4.1692, "step": 236 }, { "epoch": 0.13644214162348878, "grad_norm": 6.6655473709106445, "learning_rate": 9.44e-05, "loss": 4.2759, "step": 237 }, { "epoch": 0.13701784686240645, "grad_norm": 6.415508270263672, "learning_rate": 9.48e-05, "loss": 4.025, "step": 238 }, { "epoch": 0.13759355210132412, "grad_norm": 62.65280532836914, "learning_rate": 9.52e-05, "loss": 5.3187, "step": 239 }, { "epoch": 0.1381692573402418, "grad_norm": 5.9870147705078125, "learning_rate": 9.56e-05, "loss": 4.3549, "step": 240 }, { "epoch": 0.13874496257915947, "grad_norm": 6.323814868927002, "learning_rate": 9.6e-05, "loss": 4.0618, "step": 241 }, { "epoch": 0.13932066781807714, "grad_norm": 7.25873327255249, "learning_rate": 9.64e-05, "loss": 4.6113, "step": 242 }, { "epoch": 0.13989637305699482, "grad_norm": 6.708962440490723, "learning_rate": 9.680000000000001e-05, "loss": 4.2734, "step": 243 }, { "epoch": 0.1404720782959125, "grad_norm": 6.766256332397461, "learning_rate": 9.72e-05, "loss": 3.8169, "step": 244 }, { "epoch": 0.14104778353483016, "grad_norm": 9.25779914855957, "learning_rate": 9.76e-05, "loss": 4.0823, "step": 245 }, { "epoch": 0.14162348877374784, "grad_norm": 6.24402379989624, "learning_rate": 9.8e-05, "loss": 3.9761, "step": 246 }, { "epoch": 0.1421991940126655, "grad_norm": 4.627258777618408, "learning_rate": 9.84e-05, "loss": 3.3376, "step": 247 }, { "epoch": 0.14277489925158318, "grad_norm": 6.5364766120910645, "learning_rate": 9.88e-05, "loss": 3.9101, "step": 248 }, { "epoch": 0.14335060449050085, "grad_norm": 6.722381591796875, "learning_rate": 9.92e-05, "loss": 4.2916, "step": 249 }, { "epoch": 0.14392630972941853, "grad_norm": 7.2800493240356445, "learning_rate": 9.960000000000001e-05, "loss": 4.1714, "step": 250 }, { "epoch": 0.1445020149683362, "grad_norm": 9.137832641601562, "learning_rate": 0.0001, "loss": 3.9733, "step": 251 }, { "epoch": 0.14507772020725387, "grad_norm": 5.290084362030029, "learning_rate": 0.0001004, "loss": 3.8465, "step": 252 }, { "epoch": 0.14565342544617155, "grad_norm": 7.146475791931152, "learning_rate": 0.00010080000000000001, "loss": 4.154, "step": 253 }, { "epoch": 0.14622913068508925, "grad_norm": 5.462000370025635, "learning_rate": 0.00010120000000000001, "loss": 3.8403, "step": 254 }, { "epoch": 0.14680483592400692, "grad_norm": 8.053996086120605, "learning_rate": 0.0001016, "loss": 4.224, "step": 255 }, { "epoch": 0.1473805411629246, "grad_norm": 56.904518127441406, "learning_rate": 0.00010200000000000001, "loss": 5.3512, "step": 256 }, { "epoch": 0.14795624640184227, "grad_norm": 67.7396469116211, "learning_rate": 0.00010240000000000001, "loss": 4.136, "step": 257 }, { "epoch": 0.14853195164075994, "grad_norm": 5.19423770904541, "learning_rate": 0.0001028, "loss": 3.6272, "step": 258 }, { "epoch": 0.1491076568796776, "grad_norm": 6.946446418762207, "learning_rate": 0.0001032, "loss": 3.7617, "step": 259 }, { "epoch": 0.1496833621185953, "grad_norm": 6.839754104614258, "learning_rate": 0.00010360000000000001, "loss": 4.2895, "step": 260 }, { "epoch": 0.15025906735751296, "grad_norm": 7.3253254890441895, "learning_rate": 0.00010400000000000001, "loss": 4.0997, "step": 261 }, { "epoch": 0.15083477259643063, "grad_norm": 6.981521129608154, "learning_rate": 0.0001044, "loss": 3.4663, "step": 262 }, { "epoch": 0.1514104778353483, "grad_norm": 6.424066543579102, "learning_rate": 0.00010480000000000001, "loss": 4.0914, "step": 263 }, { "epoch": 0.15198618307426598, "grad_norm": 6.7790398597717285, "learning_rate": 0.00010520000000000001, "loss": 4.0818, "step": 264 }, { "epoch": 0.15256188831318365, "grad_norm": 7.887113094329834, "learning_rate": 0.0001056, "loss": 4.3784, "step": 265 }, { "epoch": 0.15313759355210133, "grad_norm": 8.3016939163208, "learning_rate": 0.00010600000000000002, "loss": 3.7843, "step": 266 }, { "epoch": 0.153713298791019, "grad_norm": 10.073237419128418, "learning_rate": 0.00010640000000000001, "loss": 4.0118, "step": 267 }, { "epoch": 0.15428900402993667, "grad_norm": 6.9664106369018555, "learning_rate": 0.00010680000000000001, "loss": 3.8644, "step": 268 }, { "epoch": 0.15486470926885434, "grad_norm": 8.479534149169922, "learning_rate": 0.00010720000000000002, "loss": 3.7009, "step": 269 }, { "epoch": 0.15544041450777202, "grad_norm": 8.317602157592773, "learning_rate": 0.00010760000000000001, "loss": 3.7018, "step": 270 }, { "epoch": 0.1560161197466897, "grad_norm": 6.020889759063721, "learning_rate": 0.00010800000000000001, "loss": 3.656, "step": 271 }, { "epoch": 0.15659182498560736, "grad_norm": 7.147673606872559, "learning_rate": 0.00010840000000000002, "loss": 3.9216, "step": 272 }, { "epoch": 0.15716753022452504, "grad_norm": 5.485556125640869, "learning_rate": 0.00010880000000000002, "loss": 3.4732, "step": 273 }, { "epoch": 0.1577432354634427, "grad_norm": 7.432086944580078, "learning_rate": 0.00010920000000000001, "loss": 3.423, "step": 274 }, { "epoch": 0.15831894070236038, "grad_norm": 6.897833824157715, "learning_rate": 0.00010960000000000001, "loss": 3.6169, "step": 275 }, { "epoch": 0.15889464594127806, "grad_norm": 7.707437992095947, "learning_rate": 0.00011000000000000002, "loss": 3.6883, "step": 276 }, { "epoch": 0.15947035118019573, "grad_norm": 5.546234607696533, "learning_rate": 0.00011040000000000001, "loss": 3.8388, "step": 277 }, { "epoch": 0.1600460564191134, "grad_norm": 10.001431465148926, "learning_rate": 0.00011080000000000001, "loss": 3.372, "step": 278 }, { "epoch": 0.16062176165803108, "grad_norm": 8.793180465698242, "learning_rate": 0.00011120000000000002, "loss": 3.7929, "step": 279 }, { "epoch": 0.16119746689694875, "grad_norm": 8.189177513122559, "learning_rate": 0.00011160000000000002, "loss": 4.0091, "step": 280 }, { "epoch": 0.16177317213586645, "grad_norm": 6.998697280883789, "learning_rate": 0.00011200000000000001, "loss": 3.648, "step": 281 }, { "epoch": 0.16234887737478412, "grad_norm": 8.115317344665527, "learning_rate": 0.00011240000000000002, "loss": 4.0327, "step": 282 }, { "epoch": 0.1629245826137018, "grad_norm": 7.597106456756592, "learning_rate": 0.00011279999999999999, "loss": 3.7811, "step": 283 }, { "epoch": 0.16350028785261947, "grad_norm": 6.518374443054199, "learning_rate": 0.0001132, "loss": 3.3359, "step": 284 }, { "epoch": 0.16407599309153714, "grad_norm": 6.962795257568359, "learning_rate": 0.0001136, "loss": 3.3726, "step": 285 }, { "epoch": 0.16465169833045482, "grad_norm": 8.1845703125, "learning_rate": 0.00011399999999999999, "loss": 4.0042, "step": 286 }, { "epoch": 0.1652274035693725, "grad_norm": 6.869271755218506, "learning_rate": 0.0001144, "loss": 3.4989, "step": 287 }, { "epoch": 0.16580310880829016, "grad_norm": 12.261098861694336, "learning_rate": 0.0001148, "loss": 4.1045, "step": 288 }, { "epoch": 0.16637881404720783, "grad_norm": 6.912962913513184, "learning_rate": 0.0001152, "loss": 3.6853, "step": 289 }, { "epoch": 0.1669545192861255, "grad_norm": 8.545379638671875, "learning_rate": 0.00011559999999999999, "loss": 3.8903, "step": 290 }, { "epoch": 0.16753022452504318, "grad_norm": 15.040228843688965, "learning_rate": 0.000116, "loss": 3.4079, "step": 291 }, { "epoch": 0.16810592976396085, "grad_norm": 7.038132667541504, "learning_rate": 0.0001164, "loss": 3.7119, "step": 292 }, { "epoch": 0.16868163500287853, "grad_norm": 6.259817123413086, "learning_rate": 0.00011679999999999999, "loss": 3.4931, "step": 293 }, { "epoch": 0.1692573402417962, "grad_norm": 6.947351455688477, "learning_rate": 0.0001172, "loss": 3.677, "step": 294 }, { "epoch": 0.16983304548071387, "grad_norm": 14.260014533996582, "learning_rate": 0.0001176, "loss": 3.9591, "step": 295 }, { "epoch": 0.17040875071963155, "grad_norm": 6.70070743560791, "learning_rate": 0.000118, "loss": 3.2433, "step": 296 }, { "epoch": 0.17098445595854922, "grad_norm": 11.697699546813965, "learning_rate": 0.0001184, "loss": 4.0909, "step": 297 }, { "epoch": 0.1715601611974669, "grad_norm": 10.029029846191406, "learning_rate": 0.0001188, "loss": 3.5743, "step": 298 }, { "epoch": 0.17213586643638457, "grad_norm": 6.6930365562438965, "learning_rate": 0.0001192, "loss": 3.2007, "step": 299 }, { "epoch": 0.17271157167530224, "grad_norm": 21.772619247436523, "learning_rate": 0.00011960000000000001, "loss": 3.8505, "step": 300 }, { "epoch": 0.1732872769142199, "grad_norm": 9.126256942749023, "learning_rate": 0.00012, "loss": 3.5777, "step": 301 }, { "epoch": 0.17386298215313759, "grad_norm": 7.574469566345215, "learning_rate": 0.0001204, "loss": 3.5329, "step": 302 }, { "epoch": 0.17443868739205526, "grad_norm": 6.436075687408447, "learning_rate": 0.0001208, "loss": 3.279, "step": 303 }, { "epoch": 0.17501439263097293, "grad_norm": 5.945929527282715, "learning_rate": 0.0001212, "loss": 3.4338, "step": 304 }, { "epoch": 0.1755900978698906, "grad_norm": 5.7057785987854, "learning_rate": 0.0001216, "loss": 3.2369, "step": 305 }, { "epoch": 0.17616580310880828, "grad_norm": 9.411810874938965, "learning_rate": 0.000122, "loss": 3.5364, "step": 306 }, { "epoch": 0.17674150834772595, "grad_norm": 8.872260093688965, "learning_rate": 0.0001224, "loss": 3.7803, "step": 307 }, { "epoch": 0.17731721358664365, "grad_norm": 46.1115837097168, "learning_rate": 0.0001228, "loss": 3.7188, "step": 308 }, { "epoch": 0.17789291882556132, "grad_norm": 48.33805465698242, "learning_rate": 0.0001232, "loss": 3.7491, "step": 309 }, { "epoch": 0.178468624064479, "grad_norm": 7.272097587585449, "learning_rate": 0.0001236, "loss": 3.559, "step": 310 }, { "epoch": 0.17904432930339667, "grad_norm": 7.471408367156982, "learning_rate": 0.000124, "loss": 3.6014, "step": 311 }, { "epoch": 0.17962003454231434, "grad_norm": 11.095893859863281, "learning_rate": 0.00012440000000000002, "loss": 3.5741, "step": 312 }, { "epoch": 0.18019573978123202, "grad_norm": 8.782601356506348, "learning_rate": 0.0001248, "loss": 3.2475, "step": 313 }, { "epoch": 0.1807714450201497, "grad_norm": 7.485610485076904, "learning_rate": 0.0001252, "loss": 3.0304, "step": 314 }, { "epoch": 0.18134715025906736, "grad_norm": 7.794425964355469, "learning_rate": 0.00012560000000000002, "loss": 2.9428, "step": 315 }, { "epoch": 0.18192285549798504, "grad_norm": 6.470662593841553, "learning_rate": 0.000126, "loss": 3.4341, "step": 316 }, { "epoch": 0.1824985607369027, "grad_norm": 10.054426193237305, "learning_rate": 0.0001264, "loss": 2.941, "step": 317 }, { "epoch": 0.18307426597582038, "grad_norm": 93.38629150390625, "learning_rate": 0.00012680000000000002, "loss": 4.2291, "step": 318 }, { "epoch": 0.18364997121473806, "grad_norm": 9.805968284606934, "learning_rate": 0.0001272, "loss": 3.0641, "step": 319 }, { "epoch": 0.18422567645365573, "grad_norm": 6.104334831237793, "learning_rate": 0.0001276, "loss": 3.0856, "step": 320 }, { "epoch": 0.1848013816925734, "grad_norm": 8.24195384979248, "learning_rate": 0.00012800000000000002, "loss": 3.0774, "step": 321 }, { "epoch": 0.18537708693149108, "grad_norm": 6.327628135681152, "learning_rate": 0.0001284, "loss": 3.0826, "step": 322 }, { "epoch": 0.18595279217040875, "grad_norm": 11.529990196228027, "learning_rate": 0.00012880000000000001, "loss": 3.7882, "step": 323 }, { "epoch": 0.18652849740932642, "grad_norm": 9.700762748718262, "learning_rate": 0.00012920000000000002, "loss": 3.4958, "step": 324 }, { "epoch": 0.1871042026482441, "grad_norm": 10.289152145385742, "learning_rate": 0.0001296, "loss": 3.3652, "step": 325 }, { "epoch": 0.18767990788716177, "grad_norm": 6.888269901275635, "learning_rate": 0.00013000000000000002, "loss": 3.1086, "step": 326 }, { "epoch": 0.18825561312607944, "grad_norm": 9.220719337463379, "learning_rate": 0.0001304, "loss": 3.5314, "step": 327 }, { "epoch": 0.1888313183649971, "grad_norm": 9.044048309326172, "learning_rate": 0.0001308, "loss": 2.943, "step": 328 }, { "epoch": 0.1894070236039148, "grad_norm": 11.338268280029297, "learning_rate": 0.00013120000000000002, "loss": 3.4617, "step": 329 }, { "epoch": 0.18998272884283246, "grad_norm": 5.949525833129883, "learning_rate": 0.0001316, "loss": 2.8324, "step": 330 }, { "epoch": 0.19055843408175013, "grad_norm": 9.158703804016113, "learning_rate": 0.000132, "loss": 3.1961, "step": 331 }, { "epoch": 0.1911341393206678, "grad_norm": 8.708706855773926, "learning_rate": 0.00013240000000000002, "loss": 3.1941, "step": 332 }, { "epoch": 0.19170984455958548, "grad_norm": 10.610583305358887, "learning_rate": 0.0001328, "loss": 3.3617, "step": 333 }, { "epoch": 0.19228554979850315, "grad_norm": 8.023892402648926, "learning_rate": 0.0001332, "loss": 3.1775, "step": 334 }, { "epoch": 0.19286125503742085, "grad_norm": 7.895623683929443, "learning_rate": 0.00013360000000000002, "loss": 3.1033, "step": 335 }, { "epoch": 0.19343696027633853, "grad_norm": 6.376975059509277, "learning_rate": 0.000134, "loss": 2.808, "step": 336 }, { "epoch": 0.1940126655152562, "grad_norm": 5.185142993927002, "learning_rate": 0.00013440000000000001, "loss": 2.8337, "step": 337 }, { "epoch": 0.19458837075417387, "grad_norm": 6.408693790435791, "learning_rate": 0.00013480000000000002, "loss": 3.0604, "step": 338 }, { "epoch": 0.19516407599309155, "grad_norm": 21.610239028930664, "learning_rate": 0.0001352, "loss": 3.431, "step": 339 }, { "epoch": 0.19573978123200922, "grad_norm": 9.485398292541504, "learning_rate": 0.00013560000000000002, "loss": 3.2208, "step": 340 }, { "epoch": 0.1963154864709269, "grad_norm": 6.460340976715088, "learning_rate": 0.00013600000000000003, "loss": 2.793, "step": 341 }, { "epoch": 0.19689119170984457, "grad_norm": 5.64215612411499, "learning_rate": 0.0001364, "loss": 2.8589, "step": 342 }, { "epoch": 0.19746689694876224, "grad_norm": 6.9033427238464355, "learning_rate": 0.00013680000000000002, "loss": 3.1031, "step": 343 }, { "epoch": 0.1980426021876799, "grad_norm": 5.724493980407715, "learning_rate": 0.00013720000000000003, "loss": 2.8605, "step": 344 }, { "epoch": 0.19861830742659758, "grad_norm": 15.779448509216309, "learning_rate": 0.00013759999999999998, "loss": 3.2151, "step": 345 }, { "epoch": 0.19919401266551526, "grad_norm": 6.960752964019775, "learning_rate": 0.000138, "loss": 2.8537, "step": 346 }, { "epoch": 0.19976971790443293, "grad_norm": 8.871850967407227, "learning_rate": 0.0001384, "loss": 2.7536, "step": 347 }, { "epoch": 0.2003454231433506, "grad_norm": 6.670348644256592, "learning_rate": 0.00013879999999999999, "loss": 2.9525, "step": 348 } ], "logging_steps": 1, "max_steps": 1737, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 348, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.022631528658895e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }