|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 42.73504273504273, |
|
"eval_steps": 500, |
|
"global_step": 20000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.10683760683760683, |
|
"grad_norm": 17.50655133370389, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 12.5513, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.21367521367521367, |
|
"grad_norm": 3.9805514071551547, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 6.0084, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.32051282051282054, |
|
"grad_norm": 4.788411878788504, |
|
"learning_rate": 2.4e-05, |
|
"loss": 5.3805, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.42735042735042733, |
|
"grad_norm": 5.333725120141322, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 5.2249, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5341880341880342, |
|
"grad_norm": 15.609303150641697, |
|
"learning_rate": 4e-05, |
|
"loss": 4.9602, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6410256410256411, |
|
"grad_norm": 7.144767244257112, |
|
"learning_rate": 4.8e-05, |
|
"loss": 4.6148, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7478632478632479, |
|
"grad_norm": 25.080573283918373, |
|
"learning_rate": 5.6e-05, |
|
"loss": 4.4841, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.8547008547008547, |
|
"grad_norm": 3.8773812391539053, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 4.3781, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9615384615384616, |
|
"grad_norm": 3.1869326815909034, |
|
"learning_rate": 7.2e-05, |
|
"loss": 4.3205, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.0683760683760684, |
|
"grad_norm": 5.6076616736212594, |
|
"learning_rate": 8e-05, |
|
"loss": 4.2551, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.1752136752136753, |
|
"grad_norm": 3.645026039480543, |
|
"learning_rate": 7.999878333936981e-05, |
|
"loss": 4.2133, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.282051282051282, |
|
"grad_norm": 2.9794225693992953, |
|
"learning_rate": 7.999513343642656e-05, |
|
"loss": 4.151, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.3888888888888888, |
|
"grad_norm": 3.38064005034008, |
|
"learning_rate": 7.998905052800726e-05, |
|
"loss": 4.1177, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.4957264957264957, |
|
"grad_norm": 2.67282703236483, |
|
"learning_rate": 7.998053500882307e-05, |
|
"loss": 4.0967, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.6025641025641026, |
|
"grad_norm": 2.42080044442278, |
|
"learning_rate": 7.996958743143387e-05, |
|
"loss": 4.0583, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.7094017094017095, |
|
"grad_norm": 2.295466399827455, |
|
"learning_rate": 7.995620850621226e-05, |
|
"loss": 4.0284, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.8162393162393162, |
|
"grad_norm": 2.402563069574593, |
|
"learning_rate": 7.994039910129751e-05, |
|
"loss": 4.0166, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.9230769230769231, |
|
"grad_norm": 2.29603312108278, |
|
"learning_rate": 7.992216024253926e-05, |
|
"loss": 3.9985, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.02991452991453, |
|
"grad_norm": 3.248293648782872, |
|
"learning_rate": 7.990149311343093e-05, |
|
"loss": 3.9787, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.1367521367521367, |
|
"grad_norm": 3.134410726356944, |
|
"learning_rate": 7.987839905503288e-05, |
|
"loss": 3.9654, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.2435897435897436, |
|
"grad_norm": 1.9375432532620036, |
|
"learning_rate": 7.985287956588549e-05, |
|
"loss": 3.9518, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.3504273504273505, |
|
"grad_norm": 2.3186207778993406, |
|
"learning_rate": 7.982493630191183e-05, |
|
"loss": 3.9365, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.4572649572649574, |
|
"grad_norm": 6.541119724742696, |
|
"learning_rate": 7.979457107631025e-05, |
|
"loss": 3.9315, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.564102564102564, |
|
"grad_norm": 2.0067233088832968, |
|
"learning_rate": 7.976178585943674e-05, |
|
"loss": 3.9206, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.6709401709401708, |
|
"grad_norm": 4.556924836426837, |
|
"learning_rate": 7.972658277867703e-05, |
|
"loss": 3.9179, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.7777777777777777, |
|
"grad_norm": 2.0720526525590044, |
|
"learning_rate": 7.968896411830859e-05, |
|
"loss": 3.9087, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.8846153846153846, |
|
"grad_norm": 2.1508177732338107, |
|
"learning_rate": 7.964893231935236e-05, |
|
"loss": 3.8873, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.9914529914529915, |
|
"grad_norm": 2.3292099645567546, |
|
"learning_rate": 7.960648997941442e-05, |
|
"loss": 3.8856, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.0982905982905984, |
|
"grad_norm": 2.0007587424147673, |
|
"learning_rate": 7.95616398525174e-05, |
|
"loss": 3.8742, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 3.2051282051282053, |
|
"grad_norm": 1.846720032879114, |
|
"learning_rate": 7.951438484892173e-05, |
|
"loss": 3.87, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.3119658119658117, |
|
"grad_norm": 1.9472394231718875, |
|
"learning_rate": 7.946472803493691e-05, |
|
"loss": 3.865, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 3.4188034188034186, |
|
"grad_norm": 1.78425859382252, |
|
"learning_rate": 7.941267263272242e-05, |
|
"loss": 3.8535, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.5256410256410255, |
|
"grad_norm": 4.031312927228092, |
|
"learning_rate": 7.935822202007872e-05, |
|
"loss": 3.8422, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 3.6324786324786325, |
|
"grad_norm": 1.8872813142317872, |
|
"learning_rate": 7.930137973022801e-05, |
|
"loss": 3.8451, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.7393162393162394, |
|
"grad_norm": 1.9797333625472995, |
|
"learning_rate": 7.924214945158504e-05, |
|
"loss": 3.8427, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 3.8461538461538463, |
|
"grad_norm": 1.7965847278493243, |
|
"learning_rate": 7.918053502751772e-05, |
|
"loss": 3.8368, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.952991452991453, |
|
"grad_norm": 2.011018560831316, |
|
"learning_rate": 7.911654045609771e-05, |
|
"loss": 3.8374, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 4.05982905982906, |
|
"grad_norm": 1.6786645657635046, |
|
"learning_rate": 7.905016988984106e-05, |
|
"loss": 3.8245, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 4.166666666666667, |
|
"grad_norm": 2.2311798939523557, |
|
"learning_rate": 7.898142763543867e-05, |
|
"loss": 3.8165, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 4.273504273504273, |
|
"grad_norm": 1.7909392129453563, |
|
"learning_rate": 7.891031815347695e-05, |
|
"loss": 3.8081, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.380341880341881, |
|
"grad_norm": 2.0523668535469204, |
|
"learning_rate": 7.883684605814828e-05, |
|
"loss": 3.8063, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 4.487179487179487, |
|
"grad_norm": 1.874013704927665, |
|
"learning_rate": 7.876101611695162e-05, |
|
"loss": 3.7996, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 4.594017094017094, |
|
"grad_norm": 1.8879502978248355, |
|
"learning_rate": 7.868283325038319e-05, |
|
"loss": 3.811, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 4.700854700854701, |
|
"grad_norm": 1.7754414533299234, |
|
"learning_rate": 7.860230253161715e-05, |
|
"loss": 3.7997, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.8076923076923075, |
|
"grad_norm": 1.8465961536252966, |
|
"learning_rate": 7.851942918617646e-05, |
|
"loss": 3.7968, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 4.914529914529915, |
|
"grad_norm": 1.7798643913475303, |
|
"learning_rate": 7.843421859159373e-05, |
|
"loss": 3.7995, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 5.021367521367521, |
|
"grad_norm": 1.6909531477579731, |
|
"learning_rate": 7.834667627706234e-05, |
|
"loss": 3.7938, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 5.128205128205128, |
|
"grad_norm": 1.8289702979878133, |
|
"learning_rate": 7.825680792307762e-05, |
|
"loss": 3.7808, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 5.235042735042735, |
|
"grad_norm": 1.6910478242983937, |
|
"learning_rate": 7.816461936106827e-05, |
|
"loss": 3.7769, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 5.3418803418803416, |
|
"grad_norm": 1.8114942851638463, |
|
"learning_rate": 7.807011657301797e-05, |
|
"loss": 3.7716, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 5.448717948717949, |
|
"grad_norm": 1.6319961332247308, |
|
"learning_rate": 7.797330569107721e-05, |
|
"loss": 3.7716, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 5.555555555555555, |
|
"grad_norm": 1.7220662808660416, |
|
"learning_rate": 7.787419299716536e-05, |
|
"loss": 3.769, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 5.662393162393163, |
|
"grad_norm": 1.8873247077840911, |
|
"learning_rate": 7.777278492256316e-05, |
|
"loss": 3.7706, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 5.769230769230769, |
|
"grad_norm": 1.6976382567518187, |
|
"learning_rate": 7.766908804749518e-05, |
|
"loss": 3.7624, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 5.8760683760683765, |
|
"grad_norm": 1.8676652496639343, |
|
"learning_rate": 7.756310910070307e-05, |
|
"loss": 3.77, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 5.982905982905983, |
|
"grad_norm": 1.712178516145055, |
|
"learning_rate": 7.74548549590088e-05, |
|
"loss": 3.7639, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 6.089743589743589, |
|
"grad_norm": 1.679182535606097, |
|
"learning_rate": 7.73443326468685e-05, |
|
"loss": 3.7478, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 6.196581196581197, |
|
"grad_norm": 1.574526470389441, |
|
"learning_rate": 7.723154933591662e-05, |
|
"loss": 3.7542, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 6.303418803418803, |
|
"grad_norm": 1.6785375872855937, |
|
"learning_rate": 7.711651234450059e-05, |
|
"loss": 3.7438, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 6.410256410256411, |
|
"grad_norm": 1.7945514417084472, |
|
"learning_rate": 7.699922913720591e-05, |
|
"loss": 3.7441, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 6.517094017094017, |
|
"grad_norm": 1.6372117640617172, |
|
"learning_rate": 7.687970732437185e-05, |
|
"loss": 3.7459, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 6.6239316239316235, |
|
"grad_norm": 1.6732118912179677, |
|
"learning_rate": 7.675795466159753e-05, |
|
"loss": 3.7477, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 6.730769230769231, |
|
"grad_norm": 1.6015289303122424, |
|
"learning_rate": 7.66339790492388e-05, |
|
"loss": 3.738, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 6.837606837606837, |
|
"grad_norm": 1.679231377704378, |
|
"learning_rate": 7.650778853189546e-05, |
|
"loss": 3.7412, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 6.944444444444445, |
|
"grad_norm": 1.6926843338267605, |
|
"learning_rate": 7.637939129788935e-05, |
|
"loss": 3.7447, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 7.051282051282051, |
|
"grad_norm": 1.7053205005415435, |
|
"learning_rate": 7.624879567873296e-05, |
|
"loss": 3.7282, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 7.1581196581196584, |
|
"grad_norm": 1.7400948311186981, |
|
"learning_rate": 7.611601014858889e-05, |
|
"loss": 3.7215, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 7.264957264957265, |
|
"grad_norm": 2.528264958264508, |
|
"learning_rate": 7.598104332371991e-05, |
|
"loss": 3.7196, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 7.371794871794872, |
|
"grad_norm": 1.5849521375824331, |
|
"learning_rate": 7.584390396192986e-05, |
|
"loss": 3.7221, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 7.478632478632479, |
|
"grad_norm": 1.7924005277052726, |
|
"learning_rate": 7.570460096199537e-05, |
|
"loss": 3.7236, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 7.585470085470085, |
|
"grad_norm": 1.7867674787154022, |
|
"learning_rate": 7.556314336308853e-05, |
|
"loss": 3.7173, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 7.6923076923076925, |
|
"grad_norm": 1.6452085862488215, |
|
"learning_rate": 7.541954034419019e-05, |
|
"loss": 3.715, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 7.799145299145299, |
|
"grad_norm": 2.100545190250131, |
|
"learning_rate": 7.527380122349443e-05, |
|
"loss": 3.7039, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 7.905982905982906, |
|
"grad_norm": 1.6770874481466609, |
|
"learning_rate": 7.512593545780399e-05, |
|
"loss": 3.7224, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 8.012820512820513, |
|
"grad_norm": 1.7001506486226698, |
|
"learning_rate": 7.497595264191646e-05, |
|
"loss": 3.7162, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 8.11965811965812, |
|
"grad_norm": 1.6817162615240098, |
|
"learning_rate": 7.482386250800185e-05, |
|
"loss": 3.7046, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 8.226495726495726, |
|
"grad_norm": 1.719195360223785, |
|
"learning_rate": 7.4669674924971e-05, |
|
"loss": 3.695, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 8.333333333333334, |
|
"grad_norm": 1.6285706684402375, |
|
"learning_rate": 7.451339989783522e-05, |
|
"loss": 3.6942, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 8.44017094017094, |
|
"grad_norm": 1.7752159712591726, |
|
"learning_rate": 7.435504756705705e-05, |
|
"loss": 3.692, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 8.547008547008547, |
|
"grad_norm": 1.7150172534664192, |
|
"learning_rate": 7.419462820789231e-05, |
|
"loss": 3.6984, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 8.653846153846153, |
|
"grad_norm": 1.5396911576453884, |
|
"learning_rate": 7.403215222972335e-05, |
|
"loss": 3.6963, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 8.760683760683762, |
|
"grad_norm": 1.5283160224829966, |
|
"learning_rate": 7.38676301753835e-05, |
|
"loss": 3.6999, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 8.867521367521368, |
|
"grad_norm": 1.5551933347468931, |
|
"learning_rate": 7.370107272047316e-05, |
|
"loss": 3.6933, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 8.974358974358974, |
|
"grad_norm": 1.7511071381440466, |
|
"learning_rate": 7.353249067266687e-05, |
|
"loss": 3.6964, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 9.081196581196581, |
|
"grad_norm": 1.4171376380434266, |
|
"learning_rate": 7.336189497101211e-05, |
|
"loss": 3.6785, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 9.188034188034187, |
|
"grad_norm": 1.5981957007543055, |
|
"learning_rate": 7.318929668521952e-05, |
|
"loss": 3.6899, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 9.294871794871796, |
|
"grad_norm": 3.0421520699356517, |
|
"learning_rate": 7.301470701494449e-05, |
|
"loss": 3.6823, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 9.401709401709402, |
|
"grad_norm": 1.6095054351392535, |
|
"learning_rate": 7.283813728906054e-05, |
|
"loss": 3.6824, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 9.508547008547009, |
|
"grad_norm": 1.7626031317872317, |
|
"learning_rate": 7.265959896492414e-05, |
|
"loss": 3.6732, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 9.615384615384615, |
|
"grad_norm": 1.6601170434249126, |
|
"learning_rate": 7.24791036276313e-05, |
|
"loss": 3.6731, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 9.722222222222221, |
|
"grad_norm": 1.6596574101192159, |
|
"learning_rate": 7.22966629892658e-05, |
|
"loss": 3.6719, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 9.82905982905983, |
|
"grad_norm": 10.220820132678703, |
|
"learning_rate": 7.211228888813919e-05, |
|
"loss": 3.6815, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 9.935897435897436, |
|
"grad_norm": 1.474086077471036, |
|
"learning_rate": 7.19259932880227e-05, |
|
"loss": 3.6809, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 10.042735042735043, |
|
"grad_norm": 1.5361596630778747, |
|
"learning_rate": 7.173778827737085e-05, |
|
"loss": 3.6613, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 10.149572649572649, |
|
"grad_norm": 1.5694651428853257, |
|
"learning_rate": 7.154768606853705e-05, |
|
"loss": 3.6593, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 10.256410256410255, |
|
"grad_norm": 1.529138627518509, |
|
"learning_rate": 7.135569899698122e-05, |
|
"loss": 3.655, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 10.363247863247864, |
|
"grad_norm": 1.7020816932150489, |
|
"learning_rate": 7.116183952046932e-05, |
|
"loss": 3.667, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 10.47008547008547, |
|
"grad_norm": 1.6247700748638545, |
|
"learning_rate": 7.096612021826493e-05, |
|
"loss": 3.6613, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 10.576923076923077, |
|
"grad_norm": 1.5047790372740069, |
|
"learning_rate": 7.076855379031314e-05, |
|
"loss": 3.6588, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 10.683760683760683, |
|
"grad_norm": 2.084505311018084, |
|
"learning_rate": 7.05691530564163e-05, |
|
"loss": 3.6553, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 10.790598290598291, |
|
"grad_norm": 1.546832536827408, |
|
"learning_rate": 7.036793095540228e-05, |
|
"loss": 3.6604, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 10.897435897435898, |
|
"grad_norm": 1.604609052751935, |
|
"learning_rate": 7.01649005442849e-05, |
|
"loss": 3.6584, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 11.004273504273504, |
|
"grad_norm": 1.485629705887829, |
|
"learning_rate": 6.996007499741652e-05, |
|
"loss": 3.6538, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 11.11111111111111, |
|
"grad_norm": 1.8424475644024803, |
|
"learning_rate": 6.975346760563337e-05, |
|
"loss": 3.6424, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 11.217948717948717, |
|
"grad_norm": 1.6286932716825095, |
|
"learning_rate": 6.954509177539306e-05, |
|
"loss": 3.6373, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 11.324786324786325, |
|
"grad_norm": 1.5150525299886066, |
|
"learning_rate": 6.933496102790455e-05, |
|
"loss": 3.6433, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 11.431623931623932, |
|
"grad_norm": 1.4628099488038697, |
|
"learning_rate": 6.912308899825094e-05, |
|
"loss": 3.6347, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 11.538461538461538, |
|
"grad_norm": 1.5758824797735944, |
|
"learning_rate": 6.890948943450462e-05, |
|
"loss": 3.6372, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 11.645299145299145, |
|
"grad_norm": 1.5210645926590014, |
|
"learning_rate": 6.869417619683515e-05, |
|
"loss": 3.6398, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 11.752136752136753, |
|
"grad_norm": 1.4454661519624443, |
|
"learning_rate": 6.847716325660999e-05, |
|
"loss": 3.6382, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 11.85897435897436, |
|
"grad_norm": 1.7686322014608322, |
|
"learning_rate": 6.825846469548783e-05, |
|
"loss": 3.645, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 11.965811965811966, |
|
"grad_norm": 1.6215030750877166, |
|
"learning_rate": 6.80380947045049e-05, |
|
"loss": 3.6488, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 12.072649572649572, |
|
"grad_norm": 1.5454385426401656, |
|
"learning_rate": 6.781606758315411e-05, |
|
"loss": 3.6292, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 12.179487179487179, |
|
"grad_norm": 1.5219417956848793, |
|
"learning_rate": 6.759239773845719e-05, |
|
"loss": 3.6307, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 12.286324786324787, |
|
"grad_norm": 1.5542005226736204, |
|
"learning_rate": 6.736709968402982e-05, |
|
"loss": 3.6293, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 12.393162393162394, |
|
"grad_norm": 1.623986735247479, |
|
"learning_rate": 6.714018803913991e-05, |
|
"loss": 3.6219, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 12.5, |
|
"grad_norm": 1.5195039486822781, |
|
"learning_rate": 6.69116775277589e-05, |
|
"loss": 3.6295, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 12.606837606837606, |
|
"grad_norm": 1.594705930764426, |
|
"learning_rate": 6.66815829776064e-05, |
|
"loss": 3.6336, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 12.713675213675213, |
|
"grad_norm": 1.480108373256865, |
|
"learning_rate": 6.644991931918805e-05, |
|
"loss": 3.6184, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 12.820512820512821, |
|
"grad_norm": 1.5002000993141473, |
|
"learning_rate": 6.621670158482664e-05, |
|
"loss": 3.6275, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 12.927350427350428, |
|
"grad_norm": 1.5712595498850146, |
|
"learning_rate": 6.598194490768677e-05, |
|
"loss": 3.6197, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 13.034188034188034, |
|
"grad_norm": 1.5018502585873035, |
|
"learning_rate": 6.574566452079276e-05, |
|
"loss": 3.6143, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 13.14102564102564, |
|
"grad_norm": 1.49259152360227, |
|
"learning_rate": 6.550787575604034e-05, |
|
"loss": 3.6135, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 13.247863247863247, |
|
"grad_norm": 1.5699547507210274, |
|
"learning_rate": 6.526859404320169e-05, |
|
"loss": 3.6129, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 13.354700854700855, |
|
"grad_norm": 1.5110194439447986, |
|
"learning_rate": 6.502783490892421e-05, |
|
"loss": 3.6102, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 13.461538461538462, |
|
"grad_norm": 1.5863106575168338, |
|
"learning_rate": 6.478561397572317e-05, |
|
"loss": 3.6108, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 13.568376068376068, |
|
"grad_norm": 1.4631757267400136, |
|
"learning_rate": 6.454194696096775e-05, |
|
"loss": 3.6127, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 13.675213675213675, |
|
"grad_norm": 1.4900676589043844, |
|
"learning_rate": 6.429684967586138e-05, |
|
"loss": 3.6073, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 13.782051282051283, |
|
"grad_norm": 2.079377678794041, |
|
"learning_rate": 6.405033802441565e-05, |
|
"loss": 3.6039, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 13.88888888888889, |
|
"grad_norm": 1.5029050194176434, |
|
"learning_rate": 6.380242800241835e-05, |
|
"loss": 3.6211, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 13.995726495726496, |
|
"grad_norm": 1.518341540130534, |
|
"learning_rate": 6.355313569639555e-05, |
|
"loss": 3.6128, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 14.102564102564102, |
|
"grad_norm": 1.5345576626262458, |
|
"learning_rate": 6.330247728256771e-05, |
|
"loss": 3.5815, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 14.209401709401709, |
|
"grad_norm": 1.4705033084244592, |
|
"learning_rate": 6.305046902580013e-05, |
|
"loss": 3.5873, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 14.316239316239317, |
|
"grad_norm": 1.5238057009473032, |
|
"learning_rate": 6.279712727854743e-05, |
|
"loss": 3.5935, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 14.423076923076923, |
|
"grad_norm": 1.5620920793205895, |
|
"learning_rate": 6.254246847979254e-05, |
|
"loss": 3.6014, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 14.52991452991453, |
|
"grad_norm": 1.547710770903625, |
|
"learning_rate": 6.228650915397998e-05, |
|
"loss": 3.5993, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 14.636752136752136, |
|
"grad_norm": 1.5496045315673772, |
|
"learning_rate": 6.20292659099436e-05, |
|
"loss": 3.5986, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 14.743589743589745, |
|
"grad_norm": 1.5740526741210388, |
|
"learning_rate": 6.177075543982883e-05, |
|
"loss": 3.5988, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 14.850427350427351, |
|
"grad_norm": 1.5398680616808194, |
|
"learning_rate": 6.151099451800965e-05, |
|
"loss": 3.5886, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 14.957264957264957, |
|
"grad_norm": 1.6259731166677076, |
|
"learning_rate": 6.125000000000001e-05, |
|
"loss": 3.591, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 15.064102564102564, |
|
"grad_norm": 1.4806043416840873, |
|
"learning_rate": 6.0987788821360164e-05, |
|
"loss": 3.5731, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 15.17094017094017, |
|
"grad_norm": 1.5939703396343463, |
|
"learning_rate": 6.0724377996597723e-05, |
|
"loss": 3.5764, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 15.277777777777779, |
|
"grad_norm": 1.4202060336422229, |
|
"learning_rate": 6.045978461806366e-05, |
|
"loss": 3.5845, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 15.384615384615385, |
|
"grad_norm": 1.4775589098861128, |
|
"learning_rate": 6.019402585484308e-05, |
|
"loss": 3.571, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 15.491452991452991, |
|
"grad_norm": 1.5630724811844643, |
|
"learning_rate": 5.992711895164133e-05, |
|
"loss": 3.5808, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 15.598290598290598, |
|
"grad_norm": 1.5543909604764383, |
|
"learning_rate": 5.965908122766483e-05, |
|
"loss": 3.5753, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 15.705128205128204, |
|
"grad_norm": 1.5465008832335148, |
|
"learning_rate": 5.938993007549739e-05, |
|
"loss": 3.5723, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 15.811965811965813, |
|
"grad_norm": 1.4743809807346027, |
|
"learning_rate": 5.9119682959971563e-05, |
|
"loss": 3.578, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 15.918803418803419, |
|
"grad_norm": 1.4493286420178282, |
|
"learning_rate": 5.884835741703538e-05, |
|
"loss": 3.5727, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 16.025641025641026, |
|
"grad_norm": 1.436645336671343, |
|
"learning_rate": 5.857597105261453e-05, |
|
"loss": 3.571, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 16.132478632478634, |
|
"grad_norm": 1.5383062683209867, |
|
"learning_rate": 5.8302541541469855e-05, |
|
"loss": 3.5543, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 16.23931623931624, |
|
"grad_norm": 1.5681746624668993, |
|
"learning_rate": 5.8028086626050457e-05, |
|
"loss": 3.5595, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 16.346153846153847, |
|
"grad_norm": 1.5410354284215502, |
|
"learning_rate": 5.775262411534252e-05, |
|
"loss": 3.5588, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 16.45299145299145, |
|
"grad_norm": 1.5203964006378836, |
|
"learning_rate": 5.747617188371358e-05, |
|
"loss": 3.5569, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 16.55982905982906, |
|
"grad_norm": 1.4817981236989008, |
|
"learning_rate": 5.7198747869752824e-05, |
|
"loss": 3.5629, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 16.666666666666668, |
|
"grad_norm": 1.500308799747035, |
|
"learning_rate": 5.692037007510691e-05, |
|
"loss": 3.5596, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 16.773504273504273, |
|
"grad_norm": 1.7695298352772697, |
|
"learning_rate": 5.6641056563312026e-05, |
|
"loss": 3.5527, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 16.88034188034188, |
|
"grad_norm": 1.5340102094238413, |
|
"learning_rate": 5.6360825458621665e-05, |
|
"loss": 3.5622, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 16.98717948717949, |
|
"grad_norm": 1.5412854247341883, |
|
"learning_rate": 5.6079694944830634e-05, |
|
"loss": 3.5621, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 17.094017094017094, |
|
"grad_norm": 1.5584998998853048, |
|
"learning_rate": 5.579768326409509e-05, |
|
"loss": 3.54, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 17.200854700854702, |
|
"grad_norm": 1.4673715169312684, |
|
"learning_rate": 5.5514808715748866e-05, |
|
"loss": 3.5478, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 17.307692307692307, |
|
"grad_norm": 1.8376102051157188, |
|
"learning_rate": 5.523108965511602e-05, |
|
"loss": 3.5396, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 17.414529914529915, |
|
"grad_norm": 1.4166786903918271, |
|
"learning_rate": 5.494654449231982e-05, |
|
"loss": 3.5426, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 17.521367521367523, |
|
"grad_norm": 1.46334786902459, |
|
"learning_rate": 5.466119169108811e-05, |
|
"loss": 3.5411, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 17.628205128205128, |
|
"grad_norm": 1.484842407521401, |
|
"learning_rate": 5.437504976755523e-05, |
|
"loss": 3.5372, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 17.735042735042736, |
|
"grad_norm": 1.464526739786086, |
|
"learning_rate": 5.408813728906053e-05, |
|
"loss": 3.5437, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 17.84188034188034, |
|
"grad_norm": 1.4706425749332082, |
|
"learning_rate": 5.380047287294361e-05, |
|
"loss": 3.5462, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 17.94871794871795, |
|
"grad_norm": 1.5282873960612633, |
|
"learning_rate": 5.351207518533616e-05, |
|
"loss": 3.5453, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 18.055555555555557, |
|
"grad_norm": 1.553954215098199, |
|
"learning_rate": 5.322296293995085e-05, |
|
"loss": 3.5261, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 18.162393162393162, |
|
"grad_norm": 1.460026176296364, |
|
"learning_rate": 5.293315489686698e-05, |
|
"loss": 3.5165, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 18.26923076923077, |
|
"grad_norm": 1.4489055613879473, |
|
"learning_rate": 5.264266986131315e-05, |
|
"loss": 3.5263, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 18.376068376068375, |
|
"grad_norm": 1.5224175584643513, |
|
"learning_rate": 5.235152668244701e-05, |
|
"loss": 3.5297, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 18.482905982905983, |
|
"grad_norm": 1.4655236052554852, |
|
"learning_rate": 5.205974425213225e-05, |
|
"loss": 3.524, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 18.58974358974359, |
|
"grad_norm": 1.4634201252745132, |
|
"learning_rate": 5.176734150371261e-05, |
|
"loss": 3.5258, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 18.696581196581196, |
|
"grad_norm": 1.4580158251307793, |
|
"learning_rate": 5.147433741078342e-05, |
|
"loss": 3.5203, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 18.803418803418804, |
|
"grad_norm": 1.526325559628089, |
|
"learning_rate": 5.118075098596038e-05, |
|
"loss": 3.5241, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 18.91025641025641, |
|
"grad_norm": 1.4073654836876215, |
|
"learning_rate": 5.0886601279645845e-05, |
|
"loss": 3.5104, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 19.017094017094017, |
|
"grad_norm": 1.6261138766390175, |
|
"learning_rate": 5.059190737879274e-05, |
|
"loss": 3.513, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 19.123931623931625, |
|
"grad_norm": 1.5122999705305489, |
|
"learning_rate": 5.029668840566597e-05, |
|
"loss": 3.5047, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 19.23076923076923, |
|
"grad_norm": 1.4420542513870556, |
|
"learning_rate": 5.000096351660167e-05, |
|
"loss": 3.4968, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 19.337606837606838, |
|
"grad_norm": 1.493772453873553, |
|
"learning_rate": 4.970475190076407e-05, |
|
"loss": 3.498, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 19.444444444444443, |
|
"grad_norm": 1.545900673783324, |
|
"learning_rate": 4.940807277890043e-05, |
|
"loss": 3.4975, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 19.55128205128205, |
|
"grad_norm": 1.474713414039892, |
|
"learning_rate": 4.9110945402093846e-05, |
|
"loss": 3.5058, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 19.65811965811966, |
|
"grad_norm": 1.499848755523026, |
|
"learning_rate": 4.881338905051394e-05, |
|
"loss": 3.4989, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 19.764957264957264, |
|
"grad_norm": 1.5022062130566565, |
|
"learning_rate": 4.851542303216601e-05, |
|
"loss": 3.5071, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 19.871794871794872, |
|
"grad_norm": 1.4286746413547837, |
|
"learning_rate": 4.8217066681637957e-05, |
|
"loss": 3.5038, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 19.978632478632477, |
|
"grad_norm": 1.443317748257993, |
|
"learning_rate": 4.7918339358845805e-05, |
|
"loss": 3.5052, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 20.085470085470085, |
|
"grad_norm": 1.4168322503801087, |
|
"learning_rate": 4.7619260447777446e-05, |
|
"loss": 3.4788, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 20.192307692307693, |
|
"grad_norm": 1.5714516374827268, |
|
"learning_rate": 4.7319849355234804e-05, |
|
"loss": 3.4783, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 20.299145299145298, |
|
"grad_norm": 1.5603822470902127, |
|
"learning_rate": 4.702012550957461e-05, |
|
"loss": 3.4893, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 20.405982905982906, |
|
"grad_norm": 1.4623232710258578, |
|
"learning_rate": 4.672010835944771e-05, |
|
"loss": 3.4892, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 20.51282051282051, |
|
"grad_norm": 1.5754771160859642, |
|
"learning_rate": 4.6419817372537015e-05, |
|
"loss": 3.4803, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 20.61965811965812, |
|
"grad_norm": 1.516101722857511, |
|
"learning_rate": 4.6119272034294325e-05, |
|
"loss": 3.4808, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 20.726495726495727, |
|
"grad_norm": 1.591646788840805, |
|
"learning_rate": 4.5818491846675944e-05, |
|
"loss": 3.486, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 20.833333333333332, |
|
"grad_norm": 1.5392104334340817, |
|
"learning_rate": 4.551749632687723e-05, |
|
"loss": 3.487, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 20.94017094017094, |
|
"grad_norm": 1.510393848466065, |
|
"learning_rate": 4.52163050060661e-05, |
|
"loss": 3.49, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 21.04700854700855, |
|
"grad_norm": 1.5683979110448965, |
|
"learning_rate": 4.491493742811573e-05, |
|
"loss": 3.4729, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 21.153846153846153, |
|
"grad_norm": 1.49609375, |
|
"learning_rate": 4.461341314833641e-05, |
|
"loss": 3.4667, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 21.26068376068376, |
|
"grad_norm": 1.591729323001061, |
|
"learning_rate": 4.431175173220652e-05, |
|
"loss": 3.454, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 21.367521367521366, |
|
"grad_norm": 1.4498777305089914, |
|
"learning_rate": 4.400997275410307e-05, |
|
"loss": 3.4678, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 21.474358974358974, |
|
"grad_norm": 1.5371081341322819, |
|
"learning_rate": 4.3708095796031497e-05, |
|
"loss": 3.4591, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 21.581196581196583, |
|
"grad_norm": 1.5191382363247403, |
|
"learning_rate": 4.340614044635496e-05, |
|
"loss": 3.4687, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 21.688034188034187, |
|
"grad_norm": 1.6543824084800223, |
|
"learning_rate": 4.3104126298523424e-05, |
|
"loss": 3.4513, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 21.794871794871796, |
|
"grad_norm": 1.5503952353080468, |
|
"learning_rate": 4.280207294980212e-05, |
|
"loss": 3.4601, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 21.9017094017094, |
|
"grad_norm": 1.6721050068584753, |
|
"learning_rate": 4.25e-05, |
|
"loss": 3.4676, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 22.00854700854701, |
|
"grad_norm": 1.534759314862195, |
|
"learning_rate": 4.219792705019789e-05, |
|
"loss": 3.4573, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 22.115384615384617, |
|
"grad_norm": 1.6067637026570358, |
|
"learning_rate": 4.189587370147658e-05, |
|
"loss": 3.4376, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 22.22222222222222, |
|
"grad_norm": 1.5281042366723503, |
|
"learning_rate": 4.1593859553645044e-05, |
|
"loss": 3.4359, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 22.32905982905983, |
|
"grad_norm": 1.5541495921379098, |
|
"learning_rate": 4.129190420396853e-05, |
|
"loss": 3.4464, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 22.435897435897434, |
|
"grad_norm": 1.5269639891879354, |
|
"learning_rate": 4.099002724589694e-05, |
|
"loss": 3.4484, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 22.542735042735043, |
|
"grad_norm": 1.5232774699351355, |
|
"learning_rate": 4.0688248267793484e-05, |
|
"loss": 3.4422, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 22.64957264957265, |
|
"grad_norm": 1.5252723794548009, |
|
"learning_rate": 4.0386586851663594e-05, |
|
"loss": 3.4402, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 22.756410256410255, |
|
"grad_norm": 1.5220017147380378, |
|
"learning_rate": 4.008506257188428e-05, |
|
"loss": 3.4424, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 22.863247863247864, |
|
"grad_norm": 1.446123380269863, |
|
"learning_rate": 3.978369499393392e-05, |
|
"loss": 3.4518, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 22.97008547008547, |
|
"grad_norm": 1.4734617099815863, |
|
"learning_rate": 3.948250367312279e-05, |
|
"loss": 3.4402, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 23.076923076923077, |
|
"grad_norm": 1.500255086189984, |
|
"learning_rate": 3.918150815332407e-05, |
|
"loss": 3.4211, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 23.183760683760685, |
|
"grad_norm": 1.5683304153371995, |
|
"learning_rate": 3.888072796570568e-05, |
|
"loss": 3.42, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 23.29059829059829, |
|
"grad_norm": 1.597209690800403, |
|
"learning_rate": 3.8580182627463004e-05, |
|
"loss": 3.4182, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 23.397435897435898, |
|
"grad_norm": 1.548782020835125, |
|
"learning_rate": 3.8279891640552296e-05, |
|
"loss": 3.4128, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 23.504273504273506, |
|
"grad_norm": 1.5510359778298841, |
|
"learning_rate": 3.7979874490425396e-05, |
|
"loss": 3.4189, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 23.61111111111111, |
|
"grad_norm": 1.4877303097447068, |
|
"learning_rate": 3.768015064476521e-05, |
|
"loss": 3.413, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 23.71794871794872, |
|
"grad_norm": 1.4998179961094804, |
|
"learning_rate": 3.7380739552222574e-05, |
|
"loss": 3.4195, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 23.824786324786324, |
|
"grad_norm": 1.6276035725910374, |
|
"learning_rate": 3.7081660641154215e-05, |
|
"loss": 3.4181, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 23.931623931623932, |
|
"grad_norm": 1.607925058174758, |
|
"learning_rate": 3.678293331836205e-05, |
|
"loss": 3.4212, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 24.03846153846154, |
|
"grad_norm": 1.7136000031007326, |
|
"learning_rate": 3.648457696783399e-05, |
|
"loss": 3.4093, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 24.145299145299145, |
|
"grad_norm": 1.6337873843059596, |
|
"learning_rate": 3.618661094948606e-05, |
|
"loss": 3.3832, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 24.252136752136753, |
|
"grad_norm": 1.6296652802427443, |
|
"learning_rate": 3.588905459790617e-05, |
|
"loss": 3.3787, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 24.358974358974358, |
|
"grad_norm": 1.4873240839694675, |
|
"learning_rate": 3.5591927221099575e-05, |
|
"loss": 3.3954, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 24.465811965811966, |
|
"grad_norm": 1.5356828014656962, |
|
"learning_rate": 3.529524809923594e-05, |
|
"loss": 3.3886, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 24.572649572649574, |
|
"grad_norm": 1.6430980390249932, |
|
"learning_rate": 3.499903648339834e-05, |
|
"loss": 3.3932, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 24.67948717948718, |
|
"grad_norm": 1.5788207833678471, |
|
"learning_rate": 3.470331159433404e-05, |
|
"loss": 3.3857, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 24.786324786324787, |
|
"grad_norm": 1.7653838811655183, |
|
"learning_rate": 3.440809262120728e-05, |
|
"loss": 3.4, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 24.89316239316239, |
|
"grad_norm": 1.7491130624850513, |
|
"learning_rate": 3.411339872035418e-05, |
|
"loss": 3.3933, |
|
"step": 11650 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"grad_norm": 1.6409601505004305, |
|
"learning_rate": 3.381924901403964e-05, |
|
"loss": 3.3935, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 25.10683760683761, |
|
"grad_norm": 1.606820532856031, |
|
"learning_rate": 3.35256625892166e-05, |
|
"loss": 3.3522, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 25.213675213675213, |
|
"grad_norm": 1.6564514469500473, |
|
"learning_rate": 3.323265849628739e-05, |
|
"loss": 3.3691, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 25.32051282051282, |
|
"grad_norm": 1.6048468431738794, |
|
"learning_rate": 3.294025574786775e-05, |
|
"loss": 3.3633, |
|
"step": 11850 |
|
}, |
|
{ |
|
"epoch": 25.427350427350426, |
|
"grad_norm": 1.7161740506697982, |
|
"learning_rate": 3.2648473317552994e-05, |
|
"loss": 3.3663, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 25.534188034188034, |
|
"grad_norm": 1.5546586690558504, |
|
"learning_rate": 3.2357330138686864e-05, |
|
"loss": 3.3628, |
|
"step": 11950 |
|
}, |
|
{ |
|
"epoch": 25.641025641025642, |
|
"grad_norm": 1.582958486589817, |
|
"learning_rate": 3.206684510313303e-05, |
|
"loss": 3.3603, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 25.747863247863247, |
|
"grad_norm": 1.542709599231902, |
|
"learning_rate": 3.1777037060049166e-05, |
|
"loss": 3.3655, |
|
"step": 12050 |
|
}, |
|
{ |
|
"epoch": 25.854700854700855, |
|
"grad_norm": 1.6117766154964244, |
|
"learning_rate": 3.1487924814663854e-05, |
|
"loss": 3.3727, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 25.96153846153846, |
|
"grad_norm": 1.7027238845939607, |
|
"learning_rate": 3.119952712705641e-05, |
|
"loss": 3.3723, |
|
"step": 12150 |
|
}, |
|
{ |
|
"epoch": 26.068376068376068, |
|
"grad_norm": 1.7056235088986273, |
|
"learning_rate": 3.091186271093947e-05, |
|
"loss": 3.3468, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 26.175213675213676, |
|
"grad_norm": 1.6578623283297151, |
|
"learning_rate": 3.062495023244479e-05, |
|
"loss": 3.3408, |
|
"step": 12250 |
|
}, |
|
{ |
|
"epoch": 26.28205128205128, |
|
"grad_norm": 1.6672004877959212, |
|
"learning_rate": 3.0338808308911903e-05, |
|
"loss": 3.3351, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 26.38888888888889, |
|
"grad_norm": 1.725424136664808, |
|
"learning_rate": 3.0053455507680195e-05, |
|
"loss": 3.3385, |
|
"step": 12350 |
|
}, |
|
{ |
|
"epoch": 26.495726495726494, |
|
"grad_norm": 1.6013693344114848, |
|
"learning_rate": 2.9768910344883982e-05, |
|
"loss": 3.3414, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 26.602564102564102, |
|
"grad_norm": 1.6507989682981181, |
|
"learning_rate": 2.9485191284251144e-05, |
|
"loss": 3.3418, |
|
"step": 12450 |
|
}, |
|
{ |
|
"epoch": 26.70940170940171, |
|
"grad_norm": 1.7581686379678405, |
|
"learning_rate": 2.9202316735904915e-05, |
|
"loss": 3.347, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 26.816239316239315, |
|
"grad_norm": 1.6879890580930017, |
|
"learning_rate": 2.892030505516938e-05, |
|
"loss": 3.329, |
|
"step": 12550 |
|
}, |
|
{ |
|
"epoch": 26.923076923076923, |
|
"grad_norm": 1.6792107903652251, |
|
"learning_rate": 2.863917454137834e-05, |
|
"loss": 3.3429, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 27.02991452991453, |
|
"grad_norm": 1.7124059637383322, |
|
"learning_rate": 2.8358943436687987e-05, |
|
"loss": 3.3266, |
|
"step": 12650 |
|
}, |
|
{ |
|
"epoch": 27.136752136752136, |
|
"grad_norm": 1.7334326168505239, |
|
"learning_rate": 2.80796299248931e-05, |
|
"loss": 3.3149, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 27.243589743589745, |
|
"grad_norm": 1.7176568456381722, |
|
"learning_rate": 2.7801252130247195e-05, |
|
"loss": 3.3212, |
|
"step": 12750 |
|
}, |
|
{ |
|
"epoch": 27.35042735042735, |
|
"grad_norm": 1.7570421353380783, |
|
"learning_rate": 2.7523828116286425e-05, |
|
"loss": 3.3191, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 27.457264957264957, |
|
"grad_norm": 1.7448672226066309, |
|
"learning_rate": 2.7247375884657498e-05, |
|
"loss": 3.3158, |
|
"step": 12850 |
|
}, |
|
{ |
|
"epoch": 27.564102564102566, |
|
"grad_norm": 1.6265664253590588, |
|
"learning_rate": 2.697191337394956e-05, |
|
"loss": 3.3184, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 27.67094017094017, |
|
"grad_norm": 1.7674312298681476, |
|
"learning_rate": 2.6697458458530164e-05, |
|
"loss": 3.3068, |
|
"step": 12950 |
|
}, |
|
{ |
|
"epoch": 27.77777777777778, |
|
"grad_norm": 1.7281653097738896, |
|
"learning_rate": 2.6424028947385463e-05, |
|
"loss": 3.305, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 27.884615384615383, |
|
"grad_norm": 1.6757655998868577, |
|
"learning_rate": 2.615164258296462e-05, |
|
"loss": 3.3151, |
|
"step": 13050 |
|
}, |
|
{ |
|
"epoch": 27.99145299145299, |
|
"grad_norm": 1.6927031375120576, |
|
"learning_rate": 2.5880317040028456e-05, |
|
"loss": 3.3091, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 28.0982905982906, |
|
"grad_norm": 1.7454895749333972, |
|
"learning_rate": 2.5610069924502617e-05, |
|
"loss": 3.279, |
|
"step": 13150 |
|
}, |
|
{ |
|
"epoch": 28.205128205128204, |
|
"grad_norm": 1.8316623991528065, |
|
"learning_rate": 2.534091877233518e-05, |
|
"loss": 3.2811, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 28.311965811965813, |
|
"grad_norm": 1.725868258132857, |
|
"learning_rate": 2.5072881048358683e-05, |
|
"loss": 3.2789, |
|
"step": 13250 |
|
}, |
|
{ |
|
"epoch": 28.418803418803417, |
|
"grad_norm": 1.6963201244367443, |
|
"learning_rate": 2.4805974145156925e-05, |
|
"loss": 3.2826, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 28.525641025641026, |
|
"grad_norm": 1.742348291478239, |
|
"learning_rate": 2.4540215381936355e-05, |
|
"loss": 3.2826, |
|
"step": 13350 |
|
}, |
|
{ |
|
"epoch": 28.632478632478634, |
|
"grad_norm": 1.8214431329353475, |
|
"learning_rate": 2.4275622003402272e-05, |
|
"loss": 3.2902, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 28.73931623931624, |
|
"grad_norm": 1.7553417558831383, |
|
"learning_rate": 2.4012211178639852e-05, |
|
"loss": 3.2872, |
|
"step": 13450 |
|
}, |
|
{ |
|
"epoch": 28.846153846153847, |
|
"grad_norm": 1.8105617551387587, |
|
"learning_rate": 2.375000000000001e-05, |
|
"loss": 3.2916, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 28.95299145299145, |
|
"grad_norm": 1.7855238921981962, |
|
"learning_rate": 2.348900548199037e-05, |
|
"loss": 3.2871, |
|
"step": 13550 |
|
}, |
|
{ |
|
"epoch": 29.05982905982906, |
|
"grad_norm": 1.7871660734260615, |
|
"learning_rate": 2.322924456017118e-05, |
|
"loss": 3.2676, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 29.166666666666668, |
|
"grad_norm": 1.8967670265399914, |
|
"learning_rate": 2.2970734090056414e-05, |
|
"loss": 3.2382, |
|
"step": 13650 |
|
}, |
|
{ |
|
"epoch": 29.273504273504273, |
|
"grad_norm": 1.7868169826763705, |
|
"learning_rate": 2.271349084602002e-05, |
|
"loss": 3.2517, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 29.38034188034188, |
|
"grad_norm": 1.9317444868871811, |
|
"learning_rate": 2.245753152020746e-05, |
|
"loss": 3.2592, |
|
"step": 13750 |
|
}, |
|
{ |
|
"epoch": 29.487179487179485, |
|
"grad_norm": 1.8694439903902382, |
|
"learning_rate": 2.2202872721452575e-05, |
|
"loss": 3.2402, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 29.594017094017094, |
|
"grad_norm": 1.907871807154451, |
|
"learning_rate": 2.1949530974199887e-05, |
|
"loss": 3.2545, |
|
"step": 13850 |
|
}, |
|
{ |
|
"epoch": 29.700854700854702, |
|
"grad_norm": 1.809374515091958, |
|
"learning_rate": 2.1697522717432292e-05, |
|
"loss": 3.247, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 29.807692307692307, |
|
"grad_norm": 1.805345291545111, |
|
"learning_rate": 2.144686430360447e-05, |
|
"loss": 3.2506, |
|
"step": 13950 |
|
}, |
|
{ |
|
"epoch": 29.914529914529915, |
|
"grad_norm": 1.797933648104017, |
|
"learning_rate": 2.1197571997581665e-05, |
|
"loss": 3.2524, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 30.021367521367523, |
|
"grad_norm": 1.9388853473260568, |
|
"learning_rate": 2.0949661975584367e-05, |
|
"loss": 3.2459, |
|
"step": 14050 |
|
}, |
|
{ |
|
"epoch": 30.128205128205128, |
|
"grad_norm": 1.7582868148049866, |
|
"learning_rate": 2.070315032413864e-05, |
|
"loss": 3.2272, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 30.235042735042736, |
|
"grad_norm": 1.8745030698305942, |
|
"learning_rate": 2.0458053039032265e-05, |
|
"loss": 3.2212, |
|
"step": 14150 |
|
}, |
|
{ |
|
"epoch": 30.34188034188034, |
|
"grad_norm": 1.800409969690226, |
|
"learning_rate": 2.0214386024276854e-05, |
|
"loss": 3.224, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 30.44871794871795, |
|
"grad_norm": 1.8714060990555732, |
|
"learning_rate": 1.9972165091075787e-05, |
|
"loss": 3.2262, |
|
"step": 14250 |
|
}, |
|
{ |
|
"epoch": 30.555555555555557, |
|
"grad_norm": 1.8482145147540658, |
|
"learning_rate": 1.9731405956798333e-05, |
|
"loss": 3.2264, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 30.662393162393162, |
|
"grad_norm": 1.9172055965648316, |
|
"learning_rate": 1.949212424395967e-05, |
|
"loss": 3.2248, |
|
"step": 14350 |
|
}, |
|
{ |
|
"epoch": 30.76923076923077, |
|
"grad_norm": 1.9310880204205334, |
|
"learning_rate": 1.925433547920724e-05, |
|
"loss": 3.2169, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 30.876068376068375, |
|
"grad_norm": 1.9329611620092266, |
|
"learning_rate": 1.901805509231324e-05, |
|
"loss": 3.2267, |
|
"step": 14450 |
|
}, |
|
{ |
|
"epoch": 30.982905982905983, |
|
"grad_norm": 1.8547174800001984, |
|
"learning_rate": 1.878329841517336e-05, |
|
"loss": 3.217, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 31.08974358974359, |
|
"grad_norm": 1.7364694818856983, |
|
"learning_rate": 1.855008068081196e-05, |
|
"loss": 3.2016, |
|
"step": 14550 |
|
}, |
|
{ |
|
"epoch": 31.196581196581196, |
|
"grad_norm": 1.8819759932435103, |
|
"learning_rate": 1.8318417022393614e-05, |
|
"loss": 3.1824, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 31.303418803418804, |
|
"grad_norm": 1.9235117076004822, |
|
"learning_rate": 1.808832247224111e-05, |
|
"loss": 3.1979, |
|
"step": 14650 |
|
}, |
|
{ |
|
"epoch": 31.41025641025641, |
|
"grad_norm": 1.8096199172951792, |
|
"learning_rate": 1.7859811960860106e-05, |
|
"loss": 3.1959, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 31.517094017094017, |
|
"grad_norm": 1.9824080443252146, |
|
"learning_rate": 1.763290031597019e-05, |
|
"loss": 3.1929, |
|
"step": 14750 |
|
}, |
|
{ |
|
"epoch": 31.623931623931625, |
|
"grad_norm": 1.8470203684544455, |
|
"learning_rate": 1.740760226154283e-05, |
|
"loss": 3.1979, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 31.73076923076923, |
|
"grad_norm": 1.8520854742540558, |
|
"learning_rate": 1.71839324168459e-05, |
|
"loss": 3.1934, |
|
"step": 14850 |
|
}, |
|
{ |
|
"epoch": 31.837606837606838, |
|
"grad_norm": 1.8470971065908115, |
|
"learning_rate": 1.6961905295495106e-05, |
|
"loss": 3.1851, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 31.944444444444443, |
|
"grad_norm": 1.8606032474106087, |
|
"learning_rate": 1.6741535304512178e-05, |
|
"loss": 3.1972, |
|
"step": 14950 |
|
}, |
|
{ |
|
"epoch": 32.05128205128205, |
|
"grad_norm": 1.9467679748998603, |
|
"learning_rate": 1.6522836743390025e-05, |
|
"loss": 3.1685, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 32.15811965811966, |
|
"grad_norm": 1.9755201644621432, |
|
"learning_rate": 1.630582380316487e-05, |
|
"loss": 3.1585, |
|
"step": 15050 |
|
}, |
|
{ |
|
"epoch": 32.26495726495727, |
|
"grad_norm": 1.9717752736743752, |
|
"learning_rate": 1.6090510565495406e-05, |
|
"loss": 3.1647, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 32.37179487179487, |
|
"grad_norm": 1.9484597603291864, |
|
"learning_rate": 1.5876911001749066e-05, |
|
"loss": 3.1572, |
|
"step": 15150 |
|
}, |
|
{ |
|
"epoch": 32.47863247863248, |
|
"grad_norm": 2.0190422011977076, |
|
"learning_rate": 1.5665038972095462e-05, |
|
"loss": 3.1677, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 32.585470085470085, |
|
"grad_norm": 2.0018694008788507, |
|
"learning_rate": 1.545490822460696e-05, |
|
"loss": 3.1596, |
|
"step": 15250 |
|
}, |
|
{ |
|
"epoch": 32.69230769230769, |
|
"grad_norm": 1.9317622594847947, |
|
"learning_rate": 1.5246532394366637e-05, |
|
"loss": 3.1512, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 32.7991452991453, |
|
"grad_norm": 1.939556015374055, |
|
"learning_rate": 1.5039925002583505e-05, |
|
"loss": 3.1508, |
|
"step": 15350 |
|
}, |
|
{ |
|
"epoch": 32.9059829059829, |
|
"grad_norm": 1.9314825072339465, |
|
"learning_rate": 1.4835099455715124e-05, |
|
"loss": 3.1559, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 33.01282051282051, |
|
"grad_norm": 2.002199155994756, |
|
"learning_rate": 1.4632069044597711e-05, |
|
"loss": 3.1583, |
|
"step": 15450 |
|
}, |
|
{ |
|
"epoch": 33.11965811965812, |
|
"grad_norm": 1.94120643247509, |
|
"learning_rate": 1.4430846943583707e-05, |
|
"loss": 3.137, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 33.22649572649573, |
|
"grad_norm": 2.0323075109268594, |
|
"learning_rate": 1.4231446209686875e-05, |
|
"loss": 3.144, |
|
"step": 15550 |
|
}, |
|
{ |
|
"epoch": 33.333333333333336, |
|
"grad_norm": 1.8398275516388178, |
|
"learning_rate": 1.4033879781735071e-05, |
|
"loss": 3.1402, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 33.44017094017094, |
|
"grad_norm": 1.9227151042513004, |
|
"learning_rate": 1.3838160479530688e-05, |
|
"loss": 3.1145, |
|
"step": 15650 |
|
}, |
|
{ |
|
"epoch": 33.547008547008545, |
|
"grad_norm": 2.0943205326548644, |
|
"learning_rate": 1.3644301003018776e-05, |
|
"loss": 3.1343, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 33.65384615384615, |
|
"grad_norm": 2.056329687365839, |
|
"learning_rate": 1.3452313931462957e-05, |
|
"loss": 3.1516, |
|
"step": 15750 |
|
}, |
|
{ |
|
"epoch": 33.76068376068376, |
|
"grad_norm": 2.0407832693322616, |
|
"learning_rate": 1.3262211722629166e-05, |
|
"loss": 3.1306, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 33.86752136752137, |
|
"grad_norm": 1.9753732835530098, |
|
"learning_rate": 1.3074006711977312e-05, |
|
"loss": 3.1255, |
|
"step": 15850 |
|
}, |
|
{ |
|
"epoch": 33.97435897435897, |
|
"grad_norm": 2.0533348823929596, |
|
"learning_rate": 1.2887711111860816e-05, |
|
"loss": 3.1291, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 34.08119658119658, |
|
"grad_norm": 1.9851072867584276, |
|
"learning_rate": 1.270333701073422e-05, |
|
"loss": 3.1151, |
|
"step": 15950 |
|
}, |
|
{ |
|
"epoch": 34.18803418803419, |
|
"grad_norm": 1.9583359440995798, |
|
"learning_rate": 1.2520896372368716e-05, |
|
"loss": 3.1057, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 34.294871794871796, |
|
"grad_norm": 2.020343078712297, |
|
"learning_rate": 1.2340401035075869e-05, |
|
"loss": 3.117, |
|
"step": 16050 |
|
}, |
|
{ |
|
"epoch": 34.401709401709404, |
|
"grad_norm": 1.9569342126602145, |
|
"learning_rate": 1.2161862710939476e-05, |
|
"loss": 3.1123, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 34.50854700854701, |
|
"grad_norm": 1.9598035839496117, |
|
"learning_rate": 1.1985292985055514e-05, |
|
"loss": 3.1129, |
|
"step": 16150 |
|
}, |
|
{ |
|
"epoch": 34.61538461538461, |
|
"grad_norm": 1.9758674092962976, |
|
"learning_rate": 1.1810703314780488e-05, |
|
"loss": 3.117, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 34.72222222222222, |
|
"grad_norm": 2.020822964146809, |
|
"learning_rate": 1.1638105028987887e-05, |
|
"loss": 3.1107, |
|
"step": 16250 |
|
}, |
|
{ |
|
"epoch": 34.82905982905983, |
|
"grad_norm": 2.034351973395989, |
|
"learning_rate": 1.1467509327333134e-05, |
|
"loss": 3.1121, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 34.93589743589744, |
|
"grad_norm": 2.0787426023644113, |
|
"learning_rate": 1.1298927279526844e-05, |
|
"loss": 3.0885, |
|
"step": 16350 |
|
}, |
|
{ |
|
"epoch": 35.042735042735046, |
|
"grad_norm": 2.0398209721415927, |
|
"learning_rate": 1.1132369824616499e-05, |
|
"loss": 3.1038, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 35.14957264957265, |
|
"grad_norm": 2.0772037615310985, |
|
"learning_rate": 1.0967847770276674e-05, |
|
"loss": 3.0692, |
|
"step": 16450 |
|
}, |
|
{ |
|
"epoch": 35.256410256410255, |
|
"grad_norm": 2.0532519761394976, |
|
"learning_rate": 1.0805371792107705e-05, |
|
"loss": 3.0918, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 35.363247863247864, |
|
"grad_norm": 2.092273932018464, |
|
"learning_rate": 1.0644952432942967e-05, |
|
"loss": 3.0851, |
|
"step": 16550 |
|
}, |
|
{ |
|
"epoch": 35.47008547008547, |
|
"grad_norm": 2.0272103855610695, |
|
"learning_rate": 1.0486600102164802e-05, |
|
"loss": 3.095, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 35.57692307692308, |
|
"grad_norm": 1.9982121344245152, |
|
"learning_rate": 1.0330325075029005e-05, |
|
"loss": 3.075, |
|
"step": 16650 |
|
}, |
|
{ |
|
"epoch": 35.68376068376068, |
|
"grad_norm": 2.06961519986216, |
|
"learning_rate": 1.0176137491998153e-05, |
|
"loss": 3.0707, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 35.79059829059829, |
|
"grad_norm": 2.0995806456973414, |
|
"learning_rate": 1.002404735808355e-05, |
|
"loss": 3.0728, |
|
"step": 16750 |
|
}, |
|
{ |
|
"epoch": 35.8974358974359, |
|
"grad_norm": 2.0785784692696536, |
|
"learning_rate": 9.874064542196033e-06, |
|
"loss": 3.101, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 36.004273504273506, |
|
"grad_norm": 2.046538463619717, |
|
"learning_rate": 9.72619877650558e-06, |
|
"loss": 3.0851, |
|
"step": 16850 |
|
}, |
|
{ |
|
"epoch": 36.111111111111114, |
|
"grad_norm": 2.023732403212745, |
|
"learning_rate": 9.580459655809827e-06, |
|
"loss": 3.0623, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 36.217948717948715, |
|
"grad_norm": 2.0773001733336076, |
|
"learning_rate": 9.436856636911479e-06, |
|
"loss": 3.0529, |
|
"step": 16950 |
|
}, |
|
{ |
|
"epoch": 36.324786324786324, |
|
"grad_norm": 2.0516402125983695, |
|
"learning_rate": 9.295399038004633e-06, |
|
"loss": 3.0461, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 36.43162393162393, |
|
"grad_norm": 2.1174524570789073, |
|
"learning_rate": 9.156096038070163e-06, |
|
"loss": 3.0454, |
|
"step": 17050 |
|
}, |
|
{ |
|
"epoch": 36.53846153846154, |
|
"grad_norm": 2.0514328856826998, |
|
"learning_rate": 9.018956676280101e-06, |
|
"loss": 3.0703, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 36.64529914529915, |
|
"grad_norm": 2.165466183870704, |
|
"learning_rate": 8.88398985141111e-06, |
|
"loss": 3.0554, |
|
"step": 17150 |
|
}, |
|
{ |
|
"epoch": 36.75213675213675, |
|
"grad_norm": 2.0912934880999225, |
|
"learning_rate": 8.75120432126705e-06, |
|
"loss": 3.0511, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 36.85897435897436, |
|
"grad_norm": 2.0425737921879845, |
|
"learning_rate": 8.620608702110672e-06, |
|
"loss": 3.0763, |
|
"step": 17250 |
|
}, |
|
{ |
|
"epoch": 36.965811965811966, |
|
"grad_norm": 2.1232600100726087, |
|
"learning_rate": 8.492211468104547e-06, |
|
"loss": 3.062, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 37.072649572649574, |
|
"grad_norm": 2.1110434256405495, |
|
"learning_rate": 8.366020950761204e-06, |
|
"loss": 3.0434, |
|
"step": 17350 |
|
}, |
|
{ |
|
"epoch": 37.17948717948718, |
|
"grad_norm": 2.1450862756909035, |
|
"learning_rate": 8.242045338402464e-06, |
|
"loss": 3.0422, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 37.28632478632478, |
|
"grad_norm": 2.0605806610085566, |
|
"learning_rate": 8.12029267562816e-06, |
|
"loss": 3.0413, |
|
"step": 17450 |
|
}, |
|
{ |
|
"epoch": 37.39316239316239, |
|
"grad_norm": 2.0451305614475324, |
|
"learning_rate": 8.000770862794092e-06, |
|
"loss": 3.047, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 37.5, |
|
"grad_norm": 2.2090559323000467, |
|
"learning_rate": 7.883487655499422e-06, |
|
"loss": 3.0383, |
|
"step": 17550 |
|
}, |
|
{ |
|
"epoch": 37.60683760683761, |
|
"grad_norm": 2.11643422369848, |
|
"learning_rate": 7.768450664083389e-06, |
|
"loss": 3.0443, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 37.71367521367522, |
|
"grad_norm": 2.116595083312843, |
|
"learning_rate": 7.655667353131507e-06, |
|
"loss": 3.0281, |
|
"step": 17650 |
|
}, |
|
{ |
|
"epoch": 37.82051282051282, |
|
"grad_norm": 2.0966523963349597, |
|
"learning_rate": 7.545145040991213e-06, |
|
"loss": 3.0375, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 37.927350427350426, |
|
"grad_norm": 2.1393354109439686, |
|
"learning_rate": 7.4368908992969494e-06, |
|
"loss": 3.0451, |
|
"step": 17750 |
|
}, |
|
{ |
|
"epoch": 38.034188034188034, |
|
"grad_norm": 2.171999152497746, |
|
"learning_rate": 7.330911952504839e-06, |
|
"loss": 3.0244, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 38.14102564102564, |
|
"grad_norm": 2.201534265102945, |
|
"learning_rate": 7.227215077436858e-06, |
|
"loss": 3.0086, |
|
"step": 17850 |
|
}, |
|
{ |
|
"epoch": 38.24786324786325, |
|
"grad_norm": 2.1852434781244177, |
|
"learning_rate": 7.125807002834633e-06, |
|
"loss": 3.0096, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 38.35470085470085, |
|
"grad_norm": 2.1435749304995184, |
|
"learning_rate": 7.026694308922806e-06, |
|
"loss": 3.0155, |
|
"step": 17950 |
|
}, |
|
{ |
|
"epoch": 38.46153846153846, |
|
"grad_norm": 2.013436955865304, |
|
"learning_rate": 6.929883426982045e-06, |
|
"loss": 3.0241, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 38.56837606837607, |
|
"grad_norm": 2.3006132676567166, |
|
"learning_rate": 6.835380638931743e-06, |
|
"loss": 3.0139, |
|
"step": 18050 |
|
}, |
|
{ |
|
"epoch": 38.675213675213676, |
|
"grad_norm": 2.053756909257222, |
|
"learning_rate": 6.7431920769223915e-06, |
|
"loss": 3.0185, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 38.782051282051285, |
|
"grad_norm": 2.212010530974445, |
|
"learning_rate": 6.653323722937661e-06, |
|
"loss": 3.0187, |
|
"step": 18150 |
|
}, |
|
{ |
|
"epoch": 38.888888888888886, |
|
"grad_norm": 2.1214954424397146, |
|
"learning_rate": 6.565781408406267e-06, |
|
"loss": 3.0172, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 38.995726495726494, |
|
"grad_norm": 2.240490101433792, |
|
"learning_rate": 6.4805708138235415e-06, |
|
"loss": 3.0359, |
|
"step": 18250 |
|
}, |
|
{ |
|
"epoch": 39.1025641025641, |
|
"grad_norm": 2.054957615027752, |
|
"learning_rate": 6.397697468382851e-06, |
|
"loss": 3.0037, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 39.20940170940171, |
|
"grad_norm": 2.1534118771277666, |
|
"learning_rate": 6.317166749616825e-06, |
|
"loss": 2.9826, |
|
"step": 18350 |
|
}, |
|
{ |
|
"epoch": 39.31623931623932, |
|
"grad_norm": 2.1670160623191896, |
|
"learning_rate": 6.238983883048393e-06, |
|
"loss": 2.9874, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 39.42307692307692, |
|
"grad_norm": 2.0968230739935096, |
|
"learning_rate": 6.163153941851729e-06, |
|
"loss": 2.997, |
|
"step": 18450 |
|
}, |
|
{ |
|
"epoch": 39.52991452991453, |
|
"grad_norm": 2.2848578364181877, |
|
"learning_rate": 6.08968184652305e-06, |
|
"loss": 2.998, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 39.636752136752136, |
|
"grad_norm": 2.1921025494234345, |
|
"learning_rate": 6.018572364561333e-06, |
|
"loss": 2.9946, |
|
"step": 18550 |
|
}, |
|
{ |
|
"epoch": 39.743589743589745, |
|
"grad_norm": 2.1811327227787447, |
|
"learning_rate": 5.949830110158959e-06, |
|
"loss": 3.0138, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 39.85042735042735, |
|
"grad_norm": 2.1892063570097218, |
|
"learning_rate": 5.883459543902297e-06, |
|
"loss": 2.9962, |
|
"step": 18650 |
|
}, |
|
{ |
|
"epoch": 39.957264957264954, |
|
"grad_norm": 2.1993511153341863, |
|
"learning_rate": 5.819464972482287e-06, |
|
"loss": 2.9952, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 40.06410256410256, |
|
"grad_norm": 2.2680369199677757, |
|
"learning_rate": 5.7578505484149624e-06, |
|
"loss": 3.0053, |
|
"step": 18750 |
|
}, |
|
{ |
|
"epoch": 40.17094017094017, |
|
"grad_norm": 2.178325268419898, |
|
"learning_rate": 5.698620269771997e-06, |
|
"loss": 2.985, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 40.27777777777778, |
|
"grad_norm": 2.1967082626425634, |
|
"learning_rate": 5.641777979921297e-06, |
|
"loss": 2.9902, |
|
"step": 18850 |
|
}, |
|
{ |
|
"epoch": 40.38461538461539, |
|
"grad_norm": 2.3133523761980537, |
|
"learning_rate": 5.587327367277585e-06, |
|
"loss": 2.9592, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 40.49145299145299, |
|
"grad_norm": 2.171592419580568, |
|
"learning_rate": 5.535271965063095e-06, |
|
"loss": 2.9763, |
|
"step": 18950 |
|
}, |
|
{ |
|
"epoch": 40.598290598290596, |
|
"grad_norm": 2.205495978905872, |
|
"learning_rate": 5.4856151510782684e-06, |
|
"loss": 2.9977, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 40.705128205128204, |
|
"grad_norm": 2.219457808947546, |
|
"learning_rate": 5.438360147482613e-06, |
|
"loss": 2.9937, |
|
"step": 19050 |
|
}, |
|
{ |
|
"epoch": 40.81196581196581, |
|
"grad_norm": 2.143182048689108, |
|
"learning_rate": 5.393510020585584e-06, |
|
"loss": 2.9858, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 40.91880341880342, |
|
"grad_norm": 2.2602763212966908, |
|
"learning_rate": 5.351067680647651e-06, |
|
"loss": 2.984, |
|
"step": 19150 |
|
}, |
|
{ |
|
"epoch": 41.02564102564103, |
|
"grad_norm": 2.237613918025757, |
|
"learning_rate": 5.311035881691424e-06, |
|
"loss": 2.9765, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 41.13247863247863, |
|
"grad_norm": 2.253413260507676, |
|
"learning_rate": 5.273417221322976e-06, |
|
"loss": 2.9712, |
|
"step": 19250 |
|
}, |
|
{ |
|
"epoch": 41.23931623931624, |
|
"grad_norm": 2.2190398980533974, |
|
"learning_rate": 5.2382141405632635e-06, |
|
"loss": 2.9578, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 41.34615384615385, |
|
"grad_norm": 2.1465746762525546, |
|
"learning_rate": 5.205428923689753e-06, |
|
"loss": 2.9654, |
|
"step": 19350 |
|
}, |
|
{ |
|
"epoch": 41.452991452991455, |
|
"grad_norm": 2.3382052127852138, |
|
"learning_rate": 5.175063698088176e-06, |
|
"loss": 2.9657, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 41.55982905982906, |
|
"grad_norm": 2.307601276460699, |
|
"learning_rate": 5.147120434114517e-06, |
|
"loss": 2.9467, |
|
"step": 19450 |
|
}, |
|
{ |
|
"epoch": 41.666666666666664, |
|
"grad_norm": 2.1265313857448955, |
|
"learning_rate": 5.121600944967128e-06, |
|
"loss": 2.9668, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 41.77350427350427, |
|
"grad_norm": 2.2877212083930907, |
|
"learning_rate": 5.098506886569086e-06, |
|
"loss": 2.9834, |
|
"step": 19550 |
|
}, |
|
{ |
|
"epoch": 41.88034188034188, |
|
"grad_norm": 2.2687584616763368, |
|
"learning_rate": 5.077839757460745e-06, |
|
"loss": 2.9736, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 41.98717948717949, |
|
"grad_norm": 2.218212008732717, |
|
"learning_rate": 5.059600898702501e-06, |
|
"loss": 2.9504, |
|
"step": 19650 |
|
}, |
|
{ |
|
"epoch": 42.0940170940171, |
|
"grad_norm": 2.2536964781674453, |
|
"learning_rate": 5.043791493787751e-06, |
|
"loss": 2.9602, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 42.2008547008547, |
|
"grad_norm": 2.2352386086099743, |
|
"learning_rate": 5.030412568566141e-06, |
|
"loss": 2.9694, |
|
"step": 19750 |
|
}, |
|
{ |
|
"epoch": 42.30769230769231, |
|
"grad_norm": 2.1999717493844324, |
|
"learning_rate": 5.019464991176942e-06, |
|
"loss": 2.9492, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 42.414529914529915, |
|
"grad_norm": 2.2665941993498455, |
|
"learning_rate": 5.010949471992765e-06, |
|
"loss": 2.9823, |
|
"step": 19850 |
|
}, |
|
{ |
|
"epoch": 42.52136752136752, |
|
"grad_norm": 2.2472033604271853, |
|
"learning_rate": 5.00486656357345e-06, |
|
"loss": 2.9619, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 42.62820512820513, |
|
"grad_norm": 2.165219324821127, |
|
"learning_rate": 5.001216660630201e-06, |
|
"loss": 2.9508, |
|
"step": 19950 |
|
}, |
|
{ |
|
"epoch": 42.73504273504273, |
|
"grad_norm": 2.1433868412005084, |
|
"learning_rate": 5e-06, |
|
"loss": 2.9738, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 42.73504273504273, |
|
"step": 20000, |
|
"total_flos": 9814671360000000.0, |
|
"train_loss": 3.479709552001953, |
|
"train_runtime": 156320.8197, |
|
"train_samples_per_second": 16.377, |
|
"train_steps_per_second": 0.128 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 20000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 43, |
|
"save_steps": 4000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9814671360000000.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|