|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.2003454231433506, |
|
"eval_steps": 869, |
|
"global_step": 348, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0005757052389176742, |
|
"grad_norm": 2.557003974914551, |
|
"learning_rate": 0.0, |
|
"loss": 5.4277, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0005757052389176742, |
|
"eval_loss": 5.319709300994873, |
|
"eval_runtime": 1026.7022, |
|
"eval_samples_per_second": 2.496, |
|
"eval_steps_per_second": 2.496, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0011514104778353484, |
|
"grad_norm": 2.985229969024658, |
|
"learning_rate": 4.0000000000000003e-07, |
|
"loss": 5.7019, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0017271157167530224, |
|
"grad_norm": 3.0353081226348877, |
|
"learning_rate": 8.000000000000001e-07, |
|
"loss": 6.1934, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.002302820955670697, |
|
"grad_norm": 3.724905490875244, |
|
"learning_rate": 1.2000000000000002e-06, |
|
"loss": 5.4617, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0028785261945883708, |
|
"grad_norm": 2.6505627632141113, |
|
"learning_rate": 1.6000000000000001e-06, |
|
"loss": 5.4285, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0034542314335060447, |
|
"grad_norm": 2.7363409996032715, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 5.8634, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.004029936672423719, |
|
"grad_norm": 3.082538366317749, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 4.7461, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.004605641911341394, |
|
"grad_norm": 9.095250129699707, |
|
"learning_rate": 2.8000000000000003e-06, |
|
"loss": 7.5703, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0051813471502590676, |
|
"grad_norm": 2.2597923278808594, |
|
"learning_rate": 3.2000000000000003e-06, |
|
"loss": 5.3631, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0057570523891767415, |
|
"grad_norm": 5.053525924682617, |
|
"learning_rate": 3.6e-06, |
|
"loss": 6.0132, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0063327576280944155, |
|
"grad_norm": 2.7407820224761963, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 5.9776, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.0069084628670120895, |
|
"grad_norm": 2.4892263412475586, |
|
"learning_rate": 4.4e-06, |
|
"loss": 5.524, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.007484168105929764, |
|
"grad_norm": 2.5302274227142334, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 5.8044, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.008059873344847437, |
|
"grad_norm": 2.992504358291626, |
|
"learning_rate": 5.2e-06, |
|
"loss": 6.0307, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.008635578583765112, |
|
"grad_norm": 4.081608295440674, |
|
"learning_rate": 5.600000000000001e-06, |
|
"loss": 4.6732, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.009211283822682787, |
|
"grad_norm": 2.33296799659729, |
|
"learning_rate": 6e-06, |
|
"loss": 4.6356, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.00978698906160046, |
|
"grad_norm": 2.798452854156494, |
|
"learning_rate": 6.4000000000000006e-06, |
|
"loss": 5.2941, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.010362694300518135, |
|
"grad_norm": 2.290029525756836, |
|
"learning_rate": 6.800000000000001e-06, |
|
"loss": 4.9405, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.010938399539435808, |
|
"grad_norm": 3.2164740562438965, |
|
"learning_rate": 7.2e-06, |
|
"loss": 5.6711, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.011514104778353483, |
|
"grad_norm": 2.4481987953186035, |
|
"learning_rate": 7.6e-06, |
|
"loss": 5.0366, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.012089810017271158, |
|
"grad_norm": 3.398063898086548, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 5.9377, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.012665515256188831, |
|
"grad_norm": 2.3936686515808105, |
|
"learning_rate": 8.400000000000001e-06, |
|
"loss": 5.4237, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.013241220495106506, |
|
"grad_norm": 2.7233810424804688, |
|
"learning_rate": 8.8e-06, |
|
"loss": 5.6551, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.013816925734024179, |
|
"grad_norm": 2.9957566261291504, |
|
"learning_rate": 9.2e-06, |
|
"loss": 4.7701, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.014392630972941854, |
|
"grad_norm": 6.397132396697998, |
|
"learning_rate": 9.600000000000001e-06, |
|
"loss": 6.4459, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.014968336211859529, |
|
"grad_norm": 3.0593409538269043, |
|
"learning_rate": 1e-05, |
|
"loss": 5.2758, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.015544041450777202, |
|
"grad_norm": 2.9723803997039795, |
|
"learning_rate": 1.04e-05, |
|
"loss": 5.6136, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.016119746689694875, |
|
"grad_norm": 2.03314471244812, |
|
"learning_rate": 1.08e-05, |
|
"loss": 5.3556, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.01669545192861255, |
|
"grad_norm": 1.777107834815979, |
|
"learning_rate": 1.1200000000000001e-05, |
|
"loss": 5.1061, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.017271157167530225, |
|
"grad_norm": 3.2192044258117676, |
|
"learning_rate": 1.16e-05, |
|
"loss": 5.2414, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.017846862406447898, |
|
"grad_norm": 3.924452066421509, |
|
"learning_rate": 1.2e-05, |
|
"loss": 5.2754, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.018422567645365574, |
|
"grad_norm": 3.5611093044281006, |
|
"learning_rate": 1.24e-05, |
|
"loss": 5.2817, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.018998272884283247, |
|
"grad_norm": 2.5194263458251953, |
|
"learning_rate": 1.2800000000000001e-05, |
|
"loss": 5.9063, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.01957397812320092, |
|
"grad_norm": 2.403895854949951, |
|
"learning_rate": 1.32e-05, |
|
"loss": 5.1161, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.020149683362118594, |
|
"grad_norm": 2.496400833129883, |
|
"learning_rate": 1.3600000000000002e-05, |
|
"loss": 5.3049, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.02072538860103627, |
|
"grad_norm": 3.0970828533172607, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 5.5807, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.021301093839953943, |
|
"grad_norm": 3.941403388977051, |
|
"learning_rate": 1.44e-05, |
|
"loss": 6.0418, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.021876799078871616, |
|
"grad_norm": 2.291431188583374, |
|
"learning_rate": 1.48e-05, |
|
"loss": 4.3686, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.022452504317789293, |
|
"grad_norm": 2.783054828643799, |
|
"learning_rate": 1.52e-05, |
|
"loss": 5.15, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.023028209556706966, |
|
"grad_norm": 3.579267978668213, |
|
"learning_rate": 1.56e-05, |
|
"loss": 5.7507, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02360391479562464, |
|
"grad_norm": 3.5277323722839355, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 6.112, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.024179620034542316, |
|
"grad_norm": 2.5100817680358887, |
|
"learning_rate": 1.6400000000000002e-05, |
|
"loss": 5.2133, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.02475532527345999, |
|
"grad_norm": 2.3821561336517334, |
|
"learning_rate": 1.6800000000000002e-05, |
|
"loss": 6.0345, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.025331030512377662, |
|
"grad_norm": 3.0675108432769775, |
|
"learning_rate": 1.7199999999999998e-05, |
|
"loss": 5.2294, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.025906735751295335, |
|
"grad_norm": 2.8790383338928223, |
|
"learning_rate": 1.76e-05, |
|
"loss": 5.6393, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.02648244099021301, |
|
"grad_norm": 3.3649141788482666, |
|
"learning_rate": 1.8e-05, |
|
"loss": 6.014, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.027058146229130685, |
|
"grad_norm": 3.4695286750793457, |
|
"learning_rate": 1.84e-05, |
|
"loss": 5.3457, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.027633851468048358, |
|
"grad_norm": 3.303622245788574, |
|
"learning_rate": 1.88e-05, |
|
"loss": 5.593, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.028209556706966035, |
|
"grad_norm": 2.481895923614502, |
|
"learning_rate": 1.9200000000000003e-05, |
|
"loss": 5.1439, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.028785261945883708, |
|
"grad_norm": 2.888579845428467, |
|
"learning_rate": 1.9600000000000002e-05, |
|
"loss": 4.6318, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02936096718480138, |
|
"grad_norm": 3.4528300762176514, |
|
"learning_rate": 2e-05, |
|
"loss": 5.0376, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.029936672423719057, |
|
"grad_norm": 3.6751370429992676, |
|
"learning_rate": 2.04e-05, |
|
"loss": 4.9183, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.03051237766263673, |
|
"grad_norm": 3.382035970687866, |
|
"learning_rate": 2.08e-05, |
|
"loss": 5.499, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.031088082901554404, |
|
"grad_norm": 2.8802406787872314, |
|
"learning_rate": 2.12e-05, |
|
"loss": 5.3177, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.03166378814047208, |
|
"grad_norm": 6.158539772033691, |
|
"learning_rate": 2.16e-05, |
|
"loss": 6.2133, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.03223949337938975, |
|
"grad_norm": 2.599864959716797, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 5.3691, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.03281519861830743, |
|
"grad_norm": 3.4526188373565674, |
|
"learning_rate": 2.2400000000000002e-05, |
|
"loss": 5.3801, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.0333909038572251, |
|
"grad_norm": 9.494807243347168, |
|
"learning_rate": 2.2800000000000002e-05, |
|
"loss": 7.3116, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.033966609096142776, |
|
"grad_norm": 4.3456130027771, |
|
"learning_rate": 2.32e-05, |
|
"loss": 4.7467, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.03454231433506045, |
|
"grad_norm": 3.8471431732177734, |
|
"learning_rate": 2.36e-05, |
|
"loss": 5.2742, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03511801957397812, |
|
"grad_norm": 3.985994815826416, |
|
"learning_rate": 2.4e-05, |
|
"loss": 5.4615, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.035693724812895795, |
|
"grad_norm": 9.588626861572266, |
|
"learning_rate": 2.44e-05, |
|
"loss": 6.8261, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.03626943005181347, |
|
"grad_norm": 5.3343915939331055, |
|
"learning_rate": 2.48e-05, |
|
"loss": 6.0899, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.03684513529073115, |
|
"grad_norm": 5.611617088317871, |
|
"learning_rate": 2.5200000000000003e-05, |
|
"loss": 6.4523, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.03742084052964882, |
|
"grad_norm": 4.497012615203857, |
|
"learning_rate": 2.5600000000000002e-05, |
|
"loss": 4.787, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.037996545768566495, |
|
"grad_norm": 5.032821178436279, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 5.6337, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.03857225100748417, |
|
"grad_norm": 3.732733726501465, |
|
"learning_rate": 2.64e-05, |
|
"loss": 5.5212, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.03914795624640184, |
|
"grad_norm": 4.3597517013549805, |
|
"learning_rate": 2.6800000000000004e-05, |
|
"loss": 4.647, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.039723661485319514, |
|
"grad_norm": 5.359225273132324, |
|
"learning_rate": 2.7200000000000004e-05, |
|
"loss": 5.7052, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.04029936672423719, |
|
"grad_norm": 4.9161601066589355, |
|
"learning_rate": 2.7600000000000003e-05, |
|
"loss": 5.3191, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.04087507196315487, |
|
"grad_norm": 4.137385368347168, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 5.1797, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.04145077720207254, |
|
"grad_norm": 4.728359699249268, |
|
"learning_rate": 2.84e-05, |
|
"loss": 5.1125, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.042026482440990214, |
|
"grad_norm": 4.568793773651123, |
|
"learning_rate": 2.88e-05, |
|
"loss": 5.7705, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.04260218767990789, |
|
"grad_norm": 4.931026935577393, |
|
"learning_rate": 2.9199999999999998e-05, |
|
"loss": 5.1052, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.04317789291882556, |
|
"grad_norm": 4.697461128234863, |
|
"learning_rate": 2.96e-05, |
|
"loss": 5.1404, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.04375359815774323, |
|
"grad_norm": 6.393320083618164, |
|
"learning_rate": 3e-05, |
|
"loss": 6.2212, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.04432930339666091, |
|
"grad_norm": 5.876922607421875, |
|
"learning_rate": 3.04e-05, |
|
"loss": 5.7775, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.044905008635578586, |
|
"grad_norm": 4.749701499938965, |
|
"learning_rate": 3.08e-05, |
|
"loss": 4.7321, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.04548071387449626, |
|
"grad_norm": 4.894115447998047, |
|
"learning_rate": 3.12e-05, |
|
"loss": 5.2017, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.04605641911341393, |
|
"grad_norm": 5.125804424285889, |
|
"learning_rate": 3.16e-05, |
|
"loss": 5.1661, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.046632124352331605, |
|
"grad_norm": 7.571075439453125, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 6.1439, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.04720782959124928, |
|
"grad_norm": 4.469061374664307, |
|
"learning_rate": 3.24e-05, |
|
"loss": 5.1732, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.04778353483016695, |
|
"grad_norm": 4.565371513366699, |
|
"learning_rate": 3.2800000000000004e-05, |
|
"loss": 5.4892, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.04835924006908463, |
|
"grad_norm": 5.844489097595215, |
|
"learning_rate": 3.32e-05, |
|
"loss": 5.875, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.048934945308002305, |
|
"grad_norm": 10.564720153808594, |
|
"learning_rate": 3.3600000000000004e-05, |
|
"loss": 5.9008, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.04951065054691998, |
|
"grad_norm": 6.923472881317139, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 5.4949, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.05008635578583765, |
|
"grad_norm": 6.902386665344238, |
|
"learning_rate": 3.4399999999999996e-05, |
|
"loss": 4.9801, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.050662061024755324, |
|
"grad_norm": 8.239148139953613, |
|
"learning_rate": 3.48e-05, |
|
"loss": 5.6578, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.051237766263673, |
|
"grad_norm": 6.162630081176758, |
|
"learning_rate": 3.52e-05, |
|
"loss": 4.9911, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.05181347150259067, |
|
"grad_norm": 7.2612433433532715, |
|
"learning_rate": 3.56e-05, |
|
"loss": 5.7976, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.05238917674150835, |
|
"grad_norm": 6.149419784545898, |
|
"learning_rate": 3.6e-05, |
|
"loss": 4.9756, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.05296488198042602, |
|
"grad_norm": 7.4116106033325195, |
|
"learning_rate": 3.6400000000000004e-05, |
|
"loss": 5.5805, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.0535405872193437, |
|
"grad_norm": 5.512300491333008, |
|
"learning_rate": 3.68e-05, |
|
"loss": 4.5575, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.05411629245826137, |
|
"grad_norm": 14.799551963806152, |
|
"learning_rate": 3.72e-05, |
|
"loss": 5.2244, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.05469199769717904, |
|
"grad_norm": 9.756938934326172, |
|
"learning_rate": 3.76e-05, |
|
"loss": 4.8444, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.055267702936096716, |
|
"grad_norm": 6.400147914886475, |
|
"learning_rate": 3.8e-05, |
|
"loss": 5.5091, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.055843408175014396, |
|
"grad_norm": 8.406181335449219, |
|
"learning_rate": 3.8400000000000005e-05, |
|
"loss": 5.2641, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.05641911341393207, |
|
"grad_norm": 6.860042572021484, |
|
"learning_rate": 3.88e-05, |
|
"loss": 5.2917, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.05699481865284974, |
|
"grad_norm": 7.542653560638428, |
|
"learning_rate": 3.9200000000000004e-05, |
|
"loss": 5.1584, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.057570523891767415, |
|
"grad_norm": 8.149137496948242, |
|
"learning_rate": 3.960000000000001e-05, |
|
"loss": 5.5326, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05814622913068509, |
|
"grad_norm": 5.590121269226074, |
|
"learning_rate": 4e-05, |
|
"loss": 5.2789, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.05872193436960276, |
|
"grad_norm": 7.877676010131836, |
|
"learning_rate": 4.0400000000000006e-05, |
|
"loss": 4.8526, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.059297639608520435, |
|
"grad_norm": 5.773808479309082, |
|
"learning_rate": 4.08e-05, |
|
"loss": 5.033, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.059873344847438115, |
|
"grad_norm": 6.092824935913086, |
|
"learning_rate": 4.12e-05, |
|
"loss": 4.8936, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.06044905008635579, |
|
"grad_norm": 5.934675693511963, |
|
"learning_rate": 4.16e-05, |
|
"loss": 4.4764, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.06102475532527346, |
|
"grad_norm": 5.622652530670166, |
|
"learning_rate": 4.2e-05, |
|
"loss": 5.1344, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.061600460564191134, |
|
"grad_norm": 7.697418212890625, |
|
"learning_rate": 4.24e-05, |
|
"loss": 5.2087, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.06217616580310881, |
|
"grad_norm": 5.204082489013672, |
|
"learning_rate": 4.2800000000000004e-05, |
|
"loss": 4.6294, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.06275187104202648, |
|
"grad_norm": 6.288537979125977, |
|
"learning_rate": 4.32e-05, |
|
"loss": 5.3009, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.06332757628094415, |
|
"grad_norm": 6.717288017272949, |
|
"learning_rate": 4.36e-05, |
|
"loss": 5.5392, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.06390328151986183, |
|
"grad_norm": 5.432399272918701, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 4.3602, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.0644789867587795, |
|
"grad_norm": 6.823062896728516, |
|
"learning_rate": 4.44e-05, |
|
"loss": 5.7343, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.06505469199769717, |
|
"grad_norm": 6.532074928283691, |
|
"learning_rate": 4.4800000000000005e-05, |
|
"loss": 5.0605, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.06563039723661486, |
|
"grad_norm": 5.982126712799072, |
|
"learning_rate": 4.52e-05, |
|
"loss": 5.2182, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.06620610247553253, |
|
"grad_norm": 5.759943962097168, |
|
"learning_rate": 4.5600000000000004e-05, |
|
"loss": 4.9098, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.0667818077144502, |
|
"grad_norm": 5.147834300994873, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 4.8671, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.06735751295336788, |
|
"grad_norm": 8.015042304992676, |
|
"learning_rate": 4.64e-05, |
|
"loss": 5.7445, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.06793321819228555, |
|
"grad_norm": 7.161843299865723, |
|
"learning_rate": 4.6800000000000006e-05, |
|
"loss": 5.9092, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.06850892343120323, |
|
"grad_norm": 9.394163131713867, |
|
"learning_rate": 4.72e-05, |
|
"loss": 4.7243, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.0690846286701209, |
|
"grad_norm": 4.96219539642334, |
|
"learning_rate": 4.76e-05, |
|
"loss": 4.7233, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.06966033390903857, |
|
"grad_norm": 6.473387241363525, |
|
"learning_rate": 4.8e-05, |
|
"loss": 5.1295, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.07023603914795624, |
|
"grad_norm": 6.797422885894775, |
|
"learning_rate": 4.8400000000000004e-05, |
|
"loss": 4.7697, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.07081174438687392, |
|
"grad_norm": 6.656020641326904, |
|
"learning_rate": 4.88e-05, |
|
"loss": 5.2377, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.07138744962579159, |
|
"grad_norm": 5.552718639373779, |
|
"learning_rate": 4.92e-05, |
|
"loss": 4.4741, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.07196315486470926, |
|
"grad_norm": 6.101820468902588, |
|
"learning_rate": 4.96e-05, |
|
"loss": 4.4192, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.07253886010362694, |
|
"grad_norm": 7.695935249328613, |
|
"learning_rate": 5e-05, |
|
"loss": 5.4128, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.07311456534254462, |
|
"grad_norm": 6.9946208000183105, |
|
"learning_rate": 5.0400000000000005e-05, |
|
"loss": 5.4829, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.0736902705814623, |
|
"grad_norm": 16.10480308532715, |
|
"learning_rate": 5.08e-05, |
|
"loss": 4.6945, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.07426597582037997, |
|
"grad_norm": 5.313148021697998, |
|
"learning_rate": 5.1200000000000004e-05, |
|
"loss": 4.2429, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.07484168105929764, |
|
"grad_norm": 5.506260871887207, |
|
"learning_rate": 5.16e-05, |
|
"loss": 4.7241, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.07541738629821532, |
|
"grad_norm": 5.655925273895264, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 5.4156, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.07599309153713299, |
|
"grad_norm": 6.528857231140137, |
|
"learning_rate": 5.2400000000000007e-05, |
|
"loss": 5.3606, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.07656879677605066, |
|
"grad_norm": 5.360299110412598, |
|
"learning_rate": 5.28e-05, |
|
"loss": 5.0686, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.07714450201496834, |
|
"grad_norm": 5.301785945892334, |
|
"learning_rate": 5.3200000000000006e-05, |
|
"loss": 4.845, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.07772020725388601, |
|
"grad_norm": 4.986385345458984, |
|
"learning_rate": 5.360000000000001e-05, |
|
"loss": 5.1493, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.07829591249280368, |
|
"grad_norm": 5.200460433959961, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 4.781, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.07887161773172136, |
|
"grad_norm": 7.154032230377197, |
|
"learning_rate": 5.440000000000001e-05, |
|
"loss": 5.8801, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.07944732297063903, |
|
"grad_norm": 4.641168117523193, |
|
"learning_rate": 5.4800000000000004e-05, |
|
"loss": 5.1929, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.0800230282095567, |
|
"grad_norm": 4.8809123039245605, |
|
"learning_rate": 5.520000000000001e-05, |
|
"loss": 5.0221, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.08059873344847437, |
|
"grad_norm": 5.0507402420043945, |
|
"learning_rate": 5.560000000000001e-05, |
|
"loss": 4.8543, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.08117443868739206, |
|
"grad_norm": 6.459733963012695, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 5.051, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.08175014392630973, |
|
"grad_norm": 6.107847690582275, |
|
"learning_rate": 5.6399999999999995e-05, |
|
"loss": 4.8338, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.08232584916522741, |
|
"grad_norm": 6.28361701965332, |
|
"learning_rate": 5.68e-05, |
|
"loss": 5.1373, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.08290155440414508, |
|
"grad_norm": 4.957414627075195, |
|
"learning_rate": 5.72e-05, |
|
"loss": 4.8154, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.08347725964306275, |
|
"grad_norm": 4.774332046508789, |
|
"learning_rate": 5.76e-05, |
|
"loss": 4.7262, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.08405296488198043, |
|
"grad_norm": 7.41762113571167, |
|
"learning_rate": 5.8e-05, |
|
"loss": 5.5137, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.0846286701208981, |
|
"grad_norm": 7.484424591064453, |
|
"learning_rate": 5.8399999999999997e-05, |
|
"loss": 5.766, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.08520437535981577, |
|
"grad_norm": 4.917182922363281, |
|
"learning_rate": 5.88e-05, |
|
"loss": 5.0193, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.08578008059873345, |
|
"grad_norm": 4.608645915985107, |
|
"learning_rate": 5.92e-05, |
|
"loss": 5.0873, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.08635578583765112, |
|
"grad_norm": 6.5947794914245605, |
|
"learning_rate": 5.96e-05, |
|
"loss": 4.9855, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.08693149107656879, |
|
"grad_norm": 3.8302507400512695, |
|
"learning_rate": 6e-05, |
|
"loss": 3.7953, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.08750719631548647, |
|
"grad_norm": 3.6352171897888184, |
|
"learning_rate": 6.04e-05, |
|
"loss": 4.1647, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.08808290155440414, |
|
"grad_norm": 4.818563461303711, |
|
"learning_rate": 6.08e-05, |
|
"loss": 4.2128, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.08865860679332183, |
|
"grad_norm": 7.7323503494262695, |
|
"learning_rate": 6.12e-05, |
|
"loss": 5.4562, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.0892343120322395, |
|
"grad_norm": 5.785284996032715, |
|
"learning_rate": 6.16e-05, |
|
"loss": 4.8956, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.08981001727115717, |
|
"grad_norm": 6.181385040283203, |
|
"learning_rate": 6.2e-05, |
|
"loss": 5.2373, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.09038572251007485, |
|
"grad_norm": 6.015028476715088, |
|
"learning_rate": 6.24e-05, |
|
"loss": 4.3663, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.09096142774899252, |
|
"grad_norm": 4.41657829284668, |
|
"learning_rate": 6.280000000000001e-05, |
|
"loss": 4.5991, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.09153713298791019, |
|
"grad_norm": 6.5107622146606445, |
|
"learning_rate": 6.32e-05, |
|
"loss": 4.8784, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.09211283822682786, |
|
"grad_norm": 4.11070442199707, |
|
"learning_rate": 6.36e-05, |
|
"loss": 4.6766, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.09268854346574554, |
|
"grad_norm": 8.204343795776367, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 5.5088, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.09326424870466321, |
|
"grad_norm": 3.9389288425445557, |
|
"learning_rate": 6.440000000000001e-05, |
|
"loss": 4.3476, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.09383995394358088, |
|
"grad_norm": 5.597643852233887, |
|
"learning_rate": 6.48e-05, |
|
"loss": 4.9976, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.09441565918249856, |
|
"grad_norm": 8.994287490844727, |
|
"learning_rate": 6.52e-05, |
|
"loss": 5.5959, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.09499136442141623, |
|
"grad_norm": 5.60779333114624, |
|
"learning_rate": 6.560000000000001e-05, |
|
"loss": 4.6283, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.0955670696603339, |
|
"grad_norm": 4.319982528686523, |
|
"learning_rate": 6.6e-05, |
|
"loss": 4.041, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.09614277489925158, |
|
"grad_norm": 5.684337615966797, |
|
"learning_rate": 6.64e-05, |
|
"loss": 4.8941, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.09671848013816926, |
|
"grad_norm": 3.872518539428711, |
|
"learning_rate": 6.680000000000001e-05, |
|
"loss": 4.2242, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.09729418537708694, |
|
"grad_norm": 4.826557636260986, |
|
"learning_rate": 6.720000000000001e-05, |
|
"loss": 4.8546, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.09786989061600461, |
|
"grad_norm": 4.660156726837158, |
|
"learning_rate": 6.76e-05, |
|
"loss": 4.3797, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.09844559585492228, |
|
"grad_norm": 4.616059303283691, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 4.7293, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.09902130109383996, |
|
"grad_norm": 7.685507774353027, |
|
"learning_rate": 6.840000000000001e-05, |
|
"loss": 5.6251, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.09959700633275763, |
|
"grad_norm": 7.424576282501221, |
|
"learning_rate": 6.879999999999999e-05, |
|
"loss": 4.8253, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.1001727115716753, |
|
"grad_norm": 4.379521369934082, |
|
"learning_rate": 6.92e-05, |
|
"loss": 4.5287, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.10074841681059298, |
|
"grad_norm": 4.753964424133301, |
|
"learning_rate": 6.96e-05, |
|
"loss": 4.5554, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.10132412204951065, |
|
"grad_norm": 4.559609413146973, |
|
"learning_rate": 7e-05, |
|
"loss": 4.5615, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.10189982728842832, |
|
"grad_norm": 5.178406238555908, |
|
"learning_rate": 7.04e-05, |
|
"loss": 4.6344, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.102475532527346, |
|
"grad_norm": 7.4183526039123535, |
|
"learning_rate": 7.08e-05, |
|
"loss": 4.5451, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.10305123776626367, |
|
"grad_norm": 5.832037448883057, |
|
"learning_rate": 7.12e-05, |
|
"loss": 4.7097, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.10362694300518134, |
|
"grad_norm": 4.9681925773620605, |
|
"learning_rate": 7.16e-05, |
|
"loss": 4.6288, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.10420264824409903, |
|
"grad_norm": 4.886664867401123, |
|
"learning_rate": 7.2e-05, |
|
"loss": 4.7019, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.1047783534830167, |
|
"grad_norm": 4.668741226196289, |
|
"learning_rate": 7.24e-05, |
|
"loss": 4.4534, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.10535405872193437, |
|
"grad_norm": 7.459389686584473, |
|
"learning_rate": 7.280000000000001e-05, |
|
"loss": 5.4758, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.10592976396085205, |
|
"grad_norm": 31.545869827270508, |
|
"learning_rate": 7.32e-05, |
|
"loss": 6.179, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.10650546919976972, |
|
"grad_norm": 9.739182472229004, |
|
"learning_rate": 7.36e-05, |
|
"loss": 4.9662, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.1070811744386874, |
|
"grad_norm": 4.12076997756958, |
|
"learning_rate": 7.4e-05, |
|
"loss": 3.88, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.10765687967760507, |
|
"grad_norm": 5.808717727661133, |
|
"learning_rate": 7.44e-05, |
|
"loss": 4.6157, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.10823258491652274, |
|
"grad_norm": 3.6208741664886475, |
|
"learning_rate": 7.48e-05, |
|
"loss": 3.9156, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.10880829015544041, |
|
"grad_norm": 4.674955368041992, |
|
"learning_rate": 7.52e-05, |
|
"loss": 4.4751, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.10938399539435809, |
|
"grad_norm": 5.331599235534668, |
|
"learning_rate": 7.560000000000001e-05, |
|
"loss": 4.3887, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.10995970063327576, |
|
"grad_norm": 5.1405534744262695, |
|
"learning_rate": 7.6e-05, |
|
"loss": 4.9114, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.11053540587219343, |
|
"grad_norm": 3.7066593170166016, |
|
"learning_rate": 7.64e-05, |
|
"loss": 3.8948, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.1111111111111111, |
|
"grad_norm": 5.185431003570557, |
|
"learning_rate": 7.680000000000001e-05, |
|
"loss": 4.232, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.11168681635002879, |
|
"grad_norm": 4.900607585906982, |
|
"learning_rate": 7.72e-05, |
|
"loss": 4.667, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.11226252158894647, |
|
"grad_norm": 5.091091632843018, |
|
"learning_rate": 7.76e-05, |
|
"loss": 4.3946, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.11283822682786414, |
|
"grad_norm": 4.859619617462158, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 4.6306, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.11341393206678181, |
|
"grad_norm": 3.544200897216797, |
|
"learning_rate": 7.840000000000001e-05, |
|
"loss": 4.2118, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.11398963730569948, |
|
"grad_norm": 8.28862190246582, |
|
"learning_rate": 7.88e-05, |
|
"loss": 4.4431, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.11456534254461716, |
|
"grad_norm": 6.373688220977783, |
|
"learning_rate": 7.920000000000001e-05, |
|
"loss": 4.7554, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.11514104778353483, |
|
"grad_norm": 6.8544392585754395, |
|
"learning_rate": 7.960000000000001e-05, |
|
"loss": 4.8723, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1157167530224525, |
|
"grad_norm": 7.207869052886963, |
|
"learning_rate": 8e-05, |
|
"loss": 4.1096, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.11629245826137018, |
|
"grad_norm": 4.9073333740234375, |
|
"learning_rate": 8.04e-05, |
|
"loss": 3.6834, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.11686816350028785, |
|
"grad_norm": 6.523554801940918, |
|
"learning_rate": 8.080000000000001e-05, |
|
"loss": 4.4934, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.11744386873920552, |
|
"grad_norm": 9.581537246704102, |
|
"learning_rate": 8.120000000000001e-05, |
|
"loss": 4.8199, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.1180195739781232, |
|
"grad_norm": 5.319664001464844, |
|
"learning_rate": 8.16e-05, |
|
"loss": 4.0881, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.11859527921704087, |
|
"grad_norm": 7.609442710876465, |
|
"learning_rate": 8.2e-05, |
|
"loss": 5.1011, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.11917098445595854, |
|
"grad_norm": 5.437283515930176, |
|
"learning_rate": 8.24e-05, |
|
"loss": 4.7683, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.11974668969487623, |
|
"grad_norm": 9.015962600708008, |
|
"learning_rate": 8.28e-05, |
|
"loss": 5.1197, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.1203223949337939, |
|
"grad_norm": 5.41486120223999, |
|
"learning_rate": 8.32e-05, |
|
"loss": 4.2228, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.12089810017271158, |
|
"grad_norm": 4.068630218505859, |
|
"learning_rate": 8.36e-05, |
|
"loss": 3.9683, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.12147380541162925, |
|
"grad_norm": 4.818974494934082, |
|
"learning_rate": 8.4e-05, |
|
"loss": 4.3969, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.12204951065054692, |
|
"grad_norm": 8.309637069702148, |
|
"learning_rate": 8.44e-05, |
|
"loss": 4.8983, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.1226252158894646, |
|
"grad_norm": 5.997379302978516, |
|
"learning_rate": 8.48e-05, |
|
"loss": 4.6983, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.12320092112838227, |
|
"grad_norm": 6.416568279266357, |
|
"learning_rate": 8.52e-05, |
|
"loss": 4.6, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.12377662636729994, |
|
"grad_norm": 5.038214206695557, |
|
"learning_rate": 8.560000000000001e-05, |
|
"loss": 4.1803, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.12435233160621761, |
|
"grad_norm": 5.035988807678223, |
|
"learning_rate": 8.6e-05, |
|
"loss": 4.1585, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.12492803684513529, |
|
"grad_norm": 6.7663726806640625, |
|
"learning_rate": 8.64e-05, |
|
"loss": 4.4256, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.12550374208405296, |
|
"grad_norm": 5.394269943237305, |
|
"learning_rate": 8.680000000000001e-05, |
|
"loss": 3.9008, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.12607944732297063, |
|
"grad_norm": 5.4501800537109375, |
|
"learning_rate": 8.72e-05, |
|
"loss": 3.9869, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.1266551525618883, |
|
"grad_norm": 4.7380170822143555, |
|
"learning_rate": 8.76e-05, |
|
"loss": 4.0876, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.12723085780080598, |
|
"grad_norm": 6.059116840362549, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 4.147, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.12780656303972365, |
|
"grad_norm": 5.5021586418151855, |
|
"learning_rate": 8.840000000000001e-05, |
|
"loss": 4.4547, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.12838226827864133, |
|
"grad_norm": 4.760106563568115, |
|
"learning_rate": 8.88e-05, |
|
"loss": 4.075, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.128957973517559, |
|
"grad_norm": 7.5847649574279785, |
|
"learning_rate": 8.92e-05, |
|
"loss": 4.6163, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.12953367875647667, |
|
"grad_norm": 6.257955074310303, |
|
"learning_rate": 8.960000000000001e-05, |
|
"loss": 4.6043, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.13010938399539435, |
|
"grad_norm": 7.368046283721924, |
|
"learning_rate": 9e-05, |
|
"loss": 4.7961, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.13068508923431202, |
|
"grad_norm": 4.385096549987793, |
|
"learning_rate": 9.04e-05, |
|
"loss": 4.1968, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.13126079447322972, |
|
"grad_norm": 6.34293794631958, |
|
"learning_rate": 9.080000000000001e-05, |
|
"loss": 4.3076, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.1318364997121474, |
|
"grad_norm": 6.403743267059326, |
|
"learning_rate": 9.120000000000001e-05, |
|
"loss": 3.8917, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.13241220495106507, |
|
"grad_norm": 6.792156219482422, |
|
"learning_rate": 9.16e-05, |
|
"loss": 3.9843, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.13298791018998274, |
|
"grad_norm": 8.062408447265625, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 4.2562, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.1335636154289004, |
|
"grad_norm": 8.513936042785645, |
|
"learning_rate": 9.240000000000001e-05, |
|
"loss": 4.6536, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.13413932066781808, |
|
"grad_norm": 5.92789363861084, |
|
"learning_rate": 9.28e-05, |
|
"loss": 4.104, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.13471502590673576, |
|
"grad_norm": 44.009300231933594, |
|
"learning_rate": 9.320000000000002e-05, |
|
"loss": 4.8297, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.13529073114565343, |
|
"grad_norm": 5.342921257019043, |
|
"learning_rate": 9.360000000000001e-05, |
|
"loss": 4.0662, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.1358664363845711, |
|
"grad_norm": 5.618771076202393, |
|
"learning_rate": 9.4e-05, |
|
"loss": 4.1692, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.13644214162348878, |
|
"grad_norm": 6.6655473709106445, |
|
"learning_rate": 9.44e-05, |
|
"loss": 4.2759, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.13701784686240645, |
|
"grad_norm": 6.415508270263672, |
|
"learning_rate": 9.48e-05, |
|
"loss": 4.025, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.13759355210132412, |
|
"grad_norm": 62.65280532836914, |
|
"learning_rate": 9.52e-05, |
|
"loss": 5.3187, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.1381692573402418, |
|
"grad_norm": 5.9870147705078125, |
|
"learning_rate": 9.56e-05, |
|
"loss": 4.3549, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.13874496257915947, |
|
"grad_norm": 6.323814868927002, |
|
"learning_rate": 9.6e-05, |
|
"loss": 4.0618, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.13932066781807714, |
|
"grad_norm": 7.25873327255249, |
|
"learning_rate": 9.64e-05, |
|
"loss": 4.6113, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.13989637305699482, |
|
"grad_norm": 6.708962440490723, |
|
"learning_rate": 9.680000000000001e-05, |
|
"loss": 4.2734, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.1404720782959125, |
|
"grad_norm": 6.766256332397461, |
|
"learning_rate": 9.72e-05, |
|
"loss": 3.8169, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.14104778353483016, |
|
"grad_norm": 9.25779914855957, |
|
"learning_rate": 9.76e-05, |
|
"loss": 4.0823, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.14162348877374784, |
|
"grad_norm": 6.24402379989624, |
|
"learning_rate": 9.8e-05, |
|
"loss": 3.9761, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.1421991940126655, |
|
"grad_norm": 4.627258777618408, |
|
"learning_rate": 9.84e-05, |
|
"loss": 3.3376, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.14277489925158318, |
|
"grad_norm": 6.5364766120910645, |
|
"learning_rate": 9.88e-05, |
|
"loss": 3.9101, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.14335060449050085, |
|
"grad_norm": 6.722381591796875, |
|
"learning_rate": 9.92e-05, |
|
"loss": 4.2916, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.14392630972941853, |
|
"grad_norm": 7.2800493240356445, |
|
"learning_rate": 9.960000000000001e-05, |
|
"loss": 4.1714, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.1445020149683362, |
|
"grad_norm": 9.137832641601562, |
|
"learning_rate": 0.0001, |
|
"loss": 3.9733, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.14507772020725387, |
|
"grad_norm": 5.290084362030029, |
|
"learning_rate": 0.0001004, |
|
"loss": 3.8465, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.14565342544617155, |
|
"grad_norm": 7.146475791931152, |
|
"learning_rate": 0.00010080000000000001, |
|
"loss": 4.154, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.14622913068508925, |
|
"grad_norm": 5.462000370025635, |
|
"learning_rate": 0.00010120000000000001, |
|
"loss": 3.8403, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.14680483592400692, |
|
"grad_norm": 8.053996086120605, |
|
"learning_rate": 0.0001016, |
|
"loss": 4.224, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.1473805411629246, |
|
"grad_norm": 56.904518127441406, |
|
"learning_rate": 0.00010200000000000001, |
|
"loss": 5.3512, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.14795624640184227, |
|
"grad_norm": 67.7396469116211, |
|
"learning_rate": 0.00010240000000000001, |
|
"loss": 4.136, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.14853195164075994, |
|
"grad_norm": 5.19423770904541, |
|
"learning_rate": 0.0001028, |
|
"loss": 3.6272, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.1491076568796776, |
|
"grad_norm": 6.946446418762207, |
|
"learning_rate": 0.0001032, |
|
"loss": 3.7617, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.1496833621185953, |
|
"grad_norm": 6.839754104614258, |
|
"learning_rate": 0.00010360000000000001, |
|
"loss": 4.2895, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.15025906735751296, |
|
"grad_norm": 7.3253254890441895, |
|
"learning_rate": 0.00010400000000000001, |
|
"loss": 4.0997, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.15083477259643063, |
|
"grad_norm": 6.981521129608154, |
|
"learning_rate": 0.0001044, |
|
"loss": 3.4663, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.1514104778353483, |
|
"grad_norm": 6.424066543579102, |
|
"learning_rate": 0.00010480000000000001, |
|
"loss": 4.0914, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.15198618307426598, |
|
"grad_norm": 6.7790398597717285, |
|
"learning_rate": 0.00010520000000000001, |
|
"loss": 4.0818, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.15256188831318365, |
|
"grad_norm": 7.887113094329834, |
|
"learning_rate": 0.0001056, |
|
"loss": 4.3784, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.15313759355210133, |
|
"grad_norm": 8.3016939163208, |
|
"learning_rate": 0.00010600000000000002, |
|
"loss": 3.7843, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.153713298791019, |
|
"grad_norm": 10.073237419128418, |
|
"learning_rate": 0.00010640000000000001, |
|
"loss": 4.0118, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.15428900402993667, |
|
"grad_norm": 6.9664106369018555, |
|
"learning_rate": 0.00010680000000000001, |
|
"loss": 3.8644, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.15486470926885434, |
|
"grad_norm": 8.479534149169922, |
|
"learning_rate": 0.00010720000000000002, |
|
"loss": 3.7009, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.15544041450777202, |
|
"grad_norm": 8.317602157592773, |
|
"learning_rate": 0.00010760000000000001, |
|
"loss": 3.7018, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.1560161197466897, |
|
"grad_norm": 6.020889759063721, |
|
"learning_rate": 0.00010800000000000001, |
|
"loss": 3.656, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.15659182498560736, |
|
"grad_norm": 7.147673606872559, |
|
"learning_rate": 0.00010840000000000002, |
|
"loss": 3.9216, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.15716753022452504, |
|
"grad_norm": 5.485556125640869, |
|
"learning_rate": 0.00010880000000000002, |
|
"loss": 3.4732, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.1577432354634427, |
|
"grad_norm": 7.432086944580078, |
|
"learning_rate": 0.00010920000000000001, |
|
"loss": 3.423, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.15831894070236038, |
|
"grad_norm": 6.897833824157715, |
|
"learning_rate": 0.00010960000000000001, |
|
"loss": 3.6169, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.15889464594127806, |
|
"grad_norm": 7.707437992095947, |
|
"learning_rate": 0.00011000000000000002, |
|
"loss": 3.6883, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.15947035118019573, |
|
"grad_norm": 5.546234607696533, |
|
"learning_rate": 0.00011040000000000001, |
|
"loss": 3.8388, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.1600460564191134, |
|
"grad_norm": 10.001431465148926, |
|
"learning_rate": 0.00011080000000000001, |
|
"loss": 3.372, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.16062176165803108, |
|
"grad_norm": 8.793180465698242, |
|
"learning_rate": 0.00011120000000000002, |
|
"loss": 3.7929, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.16119746689694875, |
|
"grad_norm": 8.189177513122559, |
|
"learning_rate": 0.00011160000000000002, |
|
"loss": 4.0091, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.16177317213586645, |
|
"grad_norm": 6.998697280883789, |
|
"learning_rate": 0.00011200000000000001, |
|
"loss": 3.648, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.16234887737478412, |
|
"grad_norm": 8.115317344665527, |
|
"learning_rate": 0.00011240000000000002, |
|
"loss": 4.0327, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.1629245826137018, |
|
"grad_norm": 7.597106456756592, |
|
"learning_rate": 0.00011279999999999999, |
|
"loss": 3.7811, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.16350028785261947, |
|
"grad_norm": 6.518374443054199, |
|
"learning_rate": 0.0001132, |
|
"loss": 3.3359, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.16407599309153714, |
|
"grad_norm": 6.962795257568359, |
|
"learning_rate": 0.0001136, |
|
"loss": 3.3726, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.16465169833045482, |
|
"grad_norm": 8.1845703125, |
|
"learning_rate": 0.00011399999999999999, |
|
"loss": 4.0042, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.1652274035693725, |
|
"grad_norm": 6.869271755218506, |
|
"learning_rate": 0.0001144, |
|
"loss": 3.4989, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.16580310880829016, |
|
"grad_norm": 12.261098861694336, |
|
"learning_rate": 0.0001148, |
|
"loss": 4.1045, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.16637881404720783, |
|
"grad_norm": 6.912962913513184, |
|
"learning_rate": 0.0001152, |
|
"loss": 3.6853, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.1669545192861255, |
|
"grad_norm": 8.545379638671875, |
|
"learning_rate": 0.00011559999999999999, |
|
"loss": 3.8903, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.16753022452504318, |
|
"grad_norm": 15.040228843688965, |
|
"learning_rate": 0.000116, |
|
"loss": 3.4079, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.16810592976396085, |
|
"grad_norm": 7.038132667541504, |
|
"learning_rate": 0.0001164, |
|
"loss": 3.7119, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.16868163500287853, |
|
"grad_norm": 6.259817123413086, |
|
"learning_rate": 0.00011679999999999999, |
|
"loss": 3.4931, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.1692573402417962, |
|
"grad_norm": 6.947351455688477, |
|
"learning_rate": 0.0001172, |
|
"loss": 3.677, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.16983304548071387, |
|
"grad_norm": 14.260014533996582, |
|
"learning_rate": 0.0001176, |
|
"loss": 3.9591, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.17040875071963155, |
|
"grad_norm": 6.70070743560791, |
|
"learning_rate": 0.000118, |
|
"loss": 3.2433, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.17098445595854922, |
|
"grad_norm": 11.697699546813965, |
|
"learning_rate": 0.0001184, |
|
"loss": 4.0909, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.1715601611974669, |
|
"grad_norm": 10.029029846191406, |
|
"learning_rate": 0.0001188, |
|
"loss": 3.5743, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.17213586643638457, |
|
"grad_norm": 6.6930365562438965, |
|
"learning_rate": 0.0001192, |
|
"loss": 3.2007, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.17271157167530224, |
|
"grad_norm": 21.772619247436523, |
|
"learning_rate": 0.00011960000000000001, |
|
"loss": 3.8505, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1732872769142199, |
|
"grad_norm": 9.126256942749023, |
|
"learning_rate": 0.00012, |
|
"loss": 3.5777, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.17386298215313759, |
|
"grad_norm": 7.574469566345215, |
|
"learning_rate": 0.0001204, |
|
"loss": 3.5329, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.17443868739205526, |
|
"grad_norm": 6.436075687408447, |
|
"learning_rate": 0.0001208, |
|
"loss": 3.279, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.17501439263097293, |
|
"grad_norm": 5.945929527282715, |
|
"learning_rate": 0.0001212, |
|
"loss": 3.4338, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.1755900978698906, |
|
"grad_norm": 5.7057785987854, |
|
"learning_rate": 0.0001216, |
|
"loss": 3.2369, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.17616580310880828, |
|
"grad_norm": 9.411810874938965, |
|
"learning_rate": 0.000122, |
|
"loss": 3.5364, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.17674150834772595, |
|
"grad_norm": 8.872260093688965, |
|
"learning_rate": 0.0001224, |
|
"loss": 3.7803, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.17731721358664365, |
|
"grad_norm": 46.1115837097168, |
|
"learning_rate": 0.0001228, |
|
"loss": 3.7188, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.17789291882556132, |
|
"grad_norm": 48.33805465698242, |
|
"learning_rate": 0.0001232, |
|
"loss": 3.7491, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.178468624064479, |
|
"grad_norm": 7.272097587585449, |
|
"learning_rate": 0.0001236, |
|
"loss": 3.559, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.17904432930339667, |
|
"grad_norm": 7.471408367156982, |
|
"learning_rate": 0.000124, |
|
"loss": 3.6014, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.17962003454231434, |
|
"grad_norm": 11.095893859863281, |
|
"learning_rate": 0.00012440000000000002, |
|
"loss": 3.5741, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.18019573978123202, |
|
"grad_norm": 8.782601356506348, |
|
"learning_rate": 0.0001248, |
|
"loss": 3.2475, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.1807714450201497, |
|
"grad_norm": 7.485610485076904, |
|
"learning_rate": 0.0001252, |
|
"loss": 3.0304, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.18134715025906736, |
|
"grad_norm": 7.794425964355469, |
|
"learning_rate": 0.00012560000000000002, |
|
"loss": 2.9428, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.18192285549798504, |
|
"grad_norm": 6.470662593841553, |
|
"learning_rate": 0.000126, |
|
"loss": 3.4341, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.1824985607369027, |
|
"grad_norm": 10.054426193237305, |
|
"learning_rate": 0.0001264, |
|
"loss": 2.941, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.18307426597582038, |
|
"grad_norm": 93.38629150390625, |
|
"learning_rate": 0.00012680000000000002, |
|
"loss": 4.2291, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.18364997121473806, |
|
"grad_norm": 9.805968284606934, |
|
"learning_rate": 0.0001272, |
|
"loss": 3.0641, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.18422567645365573, |
|
"grad_norm": 6.104334831237793, |
|
"learning_rate": 0.0001276, |
|
"loss": 3.0856, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.1848013816925734, |
|
"grad_norm": 8.24195384979248, |
|
"learning_rate": 0.00012800000000000002, |
|
"loss": 3.0774, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.18537708693149108, |
|
"grad_norm": 6.327628135681152, |
|
"learning_rate": 0.0001284, |
|
"loss": 3.0826, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.18595279217040875, |
|
"grad_norm": 11.529990196228027, |
|
"learning_rate": 0.00012880000000000001, |
|
"loss": 3.7882, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.18652849740932642, |
|
"grad_norm": 9.700762748718262, |
|
"learning_rate": 0.00012920000000000002, |
|
"loss": 3.4958, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.1871042026482441, |
|
"grad_norm": 10.289152145385742, |
|
"learning_rate": 0.0001296, |
|
"loss": 3.3652, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.18767990788716177, |
|
"grad_norm": 6.888269901275635, |
|
"learning_rate": 0.00013000000000000002, |
|
"loss": 3.1086, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.18825561312607944, |
|
"grad_norm": 9.220719337463379, |
|
"learning_rate": 0.0001304, |
|
"loss": 3.5314, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.1888313183649971, |
|
"grad_norm": 9.044048309326172, |
|
"learning_rate": 0.0001308, |
|
"loss": 2.943, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.1894070236039148, |
|
"grad_norm": 11.338268280029297, |
|
"learning_rate": 0.00013120000000000002, |
|
"loss": 3.4617, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.18998272884283246, |
|
"grad_norm": 5.949525833129883, |
|
"learning_rate": 0.0001316, |
|
"loss": 2.8324, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.19055843408175013, |
|
"grad_norm": 9.158703804016113, |
|
"learning_rate": 0.000132, |
|
"loss": 3.1961, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.1911341393206678, |
|
"grad_norm": 8.708706855773926, |
|
"learning_rate": 0.00013240000000000002, |
|
"loss": 3.1941, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.19170984455958548, |
|
"grad_norm": 10.610583305358887, |
|
"learning_rate": 0.0001328, |
|
"loss": 3.3617, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.19228554979850315, |
|
"grad_norm": 8.023892402648926, |
|
"learning_rate": 0.0001332, |
|
"loss": 3.1775, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.19286125503742085, |
|
"grad_norm": 7.895623683929443, |
|
"learning_rate": 0.00013360000000000002, |
|
"loss": 3.1033, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.19343696027633853, |
|
"grad_norm": 6.376975059509277, |
|
"learning_rate": 0.000134, |
|
"loss": 2.808, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.1940126655152562, |
|
"grad_norm": 5.185142993927002, |
|
"learning_rate": 0.00013440000000000001, |
|
"loss": 2.8337, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.19458837075417387, |
|
"grad_norm": 6.408693790435791, |
|
"learning_rate": 0.00013480000000000002, |
|
"loss": 3.0604, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.19516407599309155, |
|
"grad_norm": 21.610239028930664, |
|
"learning_rate": 0.0001352, |
|
"loss": 3.431, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.19573978123200922, |
|
"grad_norm": 9.485398292541504, |
|
"learning_rate": 0.00013560000000000002, |
|
"loss": 3.2208, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.1963154864709269, |
|
"grad_norm": 6.460340976715088, |
|
"learning_rate": 0.00013600000000000003, |
|
"loss": 2.793, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.19689119170984457, |
|
"grad_norm": 5.64215612411499, |
|
"learning_rate": 0.0001364, |
|
"loss": 2.8589, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.19746689694876224, |
|
"grad_norm": 6.9033427238464355, |
|
"learning_rate": 0.00013680000000000002, |
|
"loss": 3.1031, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.1980426021876799, |
|
"grad_norm": 5.724493980407715, |
|
"learning_rate": 0.00013720000000000003, |
|
"loss": 2.8605, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.19861830742659758, |
|
"grad_norm": 15.779448509216309, |
|
"learning_rate": 0.00013759999999999998, |
|
"loss": 3.2151, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.19919401266551526, |
|
"grad_norm": 6.960752964019775, |
|
"learning_rate": 0.000138, |
|
"loss": 2.8537, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.19976971790443293, |
|
"grad_norm": 8.871850967407227, |
|
"learning_rate": 0.0001384, |
|
"loss": 2.7536, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.2003454231433506, |
|
"grad_norm": 6.670348644256592, |
|
"learning_rate": 0.00013879999999999999, |
|
"loss": 2.9525, |
|
"step": 348 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1737, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 348, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.022631528658895e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|