{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 200, |
|
"global_step": 2094, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004775549188156638, |
|
"grad_norm": 1.5947464157895972, |
|
"learning_rate": 4.7619047619047623e-07, |
|
"loss": 1.1507, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.009551098376313277, |
|
"grad_norm": 1.0799275908070096, |
|
"learning_rate": 9.523809523809525e-07, |
|
"loss": 1.1084, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.014326647564469915, |
|
"grad_norm": 0.8982107389870104, |
|
"learning_rate": 1.4285714285714286e-06, |
|
"loss": 1.0874, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.019102196752626553, |
|
"grad_norm": 0.7755204372056955, |
|
"learning_rate": 1.904761904761905e-06, |
|
"loss": 1.1068, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02387774594078319, |
|
"grad_norm": 0.7764375006527714, |
|
"learning_rate": 2.380952380952381e-06, |
|
"loss": 1.1152, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02865329512893983, |
|
"grad_norm": 0.8635581450994269, |
|
"learning_rate": 2.8571428571428573e-06, |
|
"loss": 1.0964, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.033428844317096466, |
|
"grad_norm": 0.7474505965610809, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 1.1005, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.038204393505253106, |
|
"grad_norm": 0.7625379001449059, |
|
"learning_rate": 3.80952380952381e-06, |
|
"loss": 1.0933, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04297994269340974, |
|
"grad_norm": 0.7457628875786683, |
|
"learning_rate": 4.2857142857142855e-06, |
|
"loss": 1.0397, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.04775549188156638, |
|
"grad_norm": 0.7668973425259701, |
|
"learning_rate": 4.761904761904762e-06, |
|
"loss": 1.0723, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05253104106972302, |
|
"grad_norm": 0.7418999154350163, |
|
"learning_rate": 5.2380952380952384e-06, |
|
"loss": 1.0753, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.05730659025787966, |
|
"grad_norm": 0.7536046873082832, |
|
"learning_rate": 5.7142857142857145e-06, |
|
"loss": 1.0604, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.06208213944603629, |
|
"grad_norm": 0.7126742899879043, |
|
"learning_rate": 6.1904761904761914e-06, |
|
"loss": 1.0844, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.06685768863419293, |
|
"grad_norm": 0.7388477971520834, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 1.0738, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.07163323782234957, |
|
"grad_norm": 0.7063263241327802, |
|
"learning_rate": 7.1428571428571436e-06, |
|
"loss": 1.0638, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.07640878701050621, |
|
"grad_norm": 0.7435364320190926, |
|
"learning_rate": 7.61904761904762e-06, |
|
"loss": 1.089, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.08118433619866285, |
|
"grad_norm": 0.7196795649504337, |
|
"learning_rate": 8.095238095238097e-06, |
|
"loss": 1.0215, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.08595988538681948, |
|
"grad_norm": 0.792285178872838, |
|
"learning_rate": 8.571428571428571e-06, |
|
"loss": 1.0489, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.09073543457497613, |
|
"grad_norm": 0.8765129377706619, |
|
"learning_rate": 9.047619047619049e-06, |
|
"loss": 1.047, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.09551098376313276, |
|
"grad_norm": 0.6926043469847205, |
|
"learning_rate": 9.523809523809525e-06, |
|
"loss": 1.0484, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09551098376313276, |
|
"eval_loss": 1.0037423372268677, |
|
"eval_runtime": 534.236, |
|
"eval_samples_per_second": 27.872, |
|
"eval_steps_per_second": 3.485, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.10028653295128939, |
|
"grad_norm": 0.7189100769671237, |
|
"learning_rate": 1e-05, |
|
"loss": 1.0786, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.10506208213944604, |
|
"grad_norm": 0.6530269733207105, |
|
"learning_rate": 9.99930486701988e-06, |
|
"loss": 1.0568, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.10983763132760267, |
|
"grad_norm": 0.7951921984643434, |
|
"learning_rate": 9.99721966136347e-06, |
|
"loss": 1.0264, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.11461318051575932, |
|
"grad_norm": 0.7425556549789157, |
|
"learning_rate": 9.99374496282885e-06, |
|
"loss": 1.0093, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.11938872970391595, |
|
"grad_norm": 0.7965678052101759, |
|
"learning_rate": 9.988881737567046e-06, |
|
"loss": 1.0172, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.12416427889207259, |
|
"grad_norm": 0.6836841577113884, |
|
"learning_rate": 9.982631337813363e-06, |
|
"loss": 1.0369, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.12893982808022922, |
|
"grad_norm": 0.7403849280275662, |
|
"learning_rate": 9.974995501511404e-06, |
|
"loss": 0.998, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.13371537726838587, |
|
"grad_norm": 0.8090595134087124, |
|
"learning_rate": 9.965976351829827e-06, |
|
"loss": 1.0232, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.1384909264565425, |
|
"grad_norm": 0.7595821597209347, |
|
"learning_rate": 9.95557639657199e-06, |
|
"loss": 1.0093, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.14326647564469913, |
|
"grad_norm": 0.6967946094456987, |
|
"learning_rate": 9.943798527478652e-06, |
|
"loss": 1.0048, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.14804202483285578, |
|
"grad_norm": 0.8063573400514253, |
|
"learning_rate": 9.930646019423909e-06, |
|
"loss": 0.9838, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.15281757402101243, |
|
"grad_norm": 0.8442835891928957, |
|
"learning_rate": 9.916122529504605e-06, |
|
"loss": 0.9935, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.15759312320916904, |
|
"grad_norm": 0.781371439165008, |
|
"learning_rate": 9.900232096023478e-06, |
|
"loss": 0.9834, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.1623686723973257, |
|
"grad_norm": 0.8183448490469395, |
|
"learning_rate": 9.882979137366275e-06, |
|
"loss": 0.9638, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.16714422158548234, |
|
"grad_norm": 0.7604900553397272, |
|
"learning_rate": 9.864368450773227e-06, |
|
"loss": 0.983, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.17191977077363896, |
|
"grad_norm": 0.8396013068852565, |
|
"learning_rate": 9.844405211005145e-06, |
|
"loss": 0.9759, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.1766953199617956, |
|
"grad_norm": 0.8365439774930438, |
|
"learning_rate": 9.823094968904572e-06, |
|
"loss": 0.9927, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.18147086914995225, |
|
"grad_norm": 0.7930078729758488, |
|
"learning_rate": 9.800443649852347e-06, |
|
"loss": 0.985, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.18624641833810887, |
|
"grad_norm": 0.6951067649253269, |
|
"learning_rate": 9.776457552120034e-06, |
|
"loss": 0.9578, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.19102196752626552, |
|
"grad_norm": 0.7758966278638689, |
|
"learning_rate": 9.751143345118675e-06, |
|
"loss": 0.9828, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.19102196752626552, |
|
"eval_loss": 0.911033034324646, |
|
"eval_runtime": 534.1402, |
|
"eval_samples_per_second": 27.877, |
|
"eval_steps_per_second": 3.486, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.19579751671442217, |
|
"grad_norm": 0.8178388178466306, |
|
"learning_rate": 9.724508067544328e-06, |
|
"loss": 0.9593, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.20057306590257878, |
|
"grad_norm": 0.7948626800066163, |
|
"learning_rate": 9.696559125420949e-06, |
|
"loss": 0.9342, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.20534861509073543, |
|
"grad_norm": 0.7690821022431013, |
|
"learning_rate": 9.667304290041102e-06, |
|
"loss": 0.9182, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.21012416427889208, |
|
"grad_norm": 0.8011672592566321, |
|
"learning_rate": 9.636751695805154e-06, |
|
"loss": 0.9399, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.2148997134670487, |
|
"grad_norm": 0.7896064611214817, |
|
"learning_rate": 9.604909837959456e-06, |
|
"loss": 0.9546, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.21967526265520534, |
|
"grad_norm": 0.8683971543565645, |
|
"learning_rate": 9.57178757023422e-06, |
|
"loss": 0.9493, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.224450811843362, |
|
"grad_norm": 0.7840423345877752, |
|
"learning_rate": 9.537394102381719e-06, |
|
"loss": 0.951, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.22922636103151864, |
|
"grad_norm": 0.8816713401350009, |
|
"learning_rate": 9.501738997615471e-06, |
|
"loss": 0.902, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.23400191021967526, |
|
"grad_norm": 0.7272663417426294, |
|
"learning_rate": 9.464832169951171e-06, |
|
"loss": 0.9121, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.2387774594078319, |
|
"grad_norm": 0.8166914666826738, |
|
"learning_rate": 9.426683881450058e-06, |
|
"loss": 0.9149, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.24355300859598855, |
|
"grad_norm": 0.806333386634663, |
|
"learning_rate": 9.387304739365524e-06, |
|
"loss": 0.9141, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.24832855778414517, |
|
"grad_norm": 0.8500670140915615, |
|
"learning_rate": 9.346705693193722e-06, |
|
"loss": 0.9046, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.2531041069723018, |
|
"grad_norm": 0.8536991575306397, |
|
"learning_rate": 9.304898031629038e-06, |
|
"loss": 0.907, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.25787965616045844, |
|
"grad_norm": 0.7805889795178423, |
|
"learning_rate": 9.261893379425218e-06, |
|
"loss": 0.9095, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.2626552053486151, |
|
"grad_norm": 0.983470886698574, |
|
"learning_rate": 9.217703694163083e-06, |
|
"loss": 0.8811, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.26743075453677173, |
|
"grad_norm": 0.8247344852185301, |
|
"learning_rate": 9.172341262925675e-06, |
|
"loss": 0.8743, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.2722063037249284, |
|
"grad_norm": 0.7947362543336306, |
|
"learning_rate": 9.125818698881798e-06, |
|
"loss": 0.8659, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.276981852913085, |
|
"grad_norm": 0.7251555329092241, |
|
"learning_rate": 9.078148937778889e-06, |
|
"loss": 0.906, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.2817574021012416, |
|
"grad_norm": 0.789421032185164, |
|
"learning_rate": 9.029345234346183e-06, |
|
"loss": 0.8859, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.28653295128939826, |
|
"grad_norm": 0.8701223156155673, |
|
"learning_rate": 8.979421158609206e-06, |
|
"loss": 0.8785, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.28653295128939826, |
|
"eval_loss": 0.8167237639427185, |
|
"eval_runtime": 534.2082, |
|
"eval_samples_per_second": 27.873, |
|
"eval_steps_per_second": 3.486, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.2913085004775549, |
|
"grad_norm": 0.8629003851548017, |
|
"learning_rate": 8.928390592116576e-06, |
|
"loss": 0.8539, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.29608404966571156, |
|
"grad_norm": 0.8486389448069024, |
|
"learning_rate": 8.876267724080197e-06, |
|
"loss": 0.8527, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.3008595988538682, |
|
"grad_norm": 0.762838712746454, |
|
"learning_rate": 8.823067047429908e-06, |
|
"loss": 0.8683, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.30563514804202485, |
|
"grad_norm": 0.8625392396534273, |
|
"learning_rate": 8.768803354783668e-06, |
|
"loss": 0.8649, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.3104106972301815, |
|
"grad_norm": 0.8333540473972623, |
|
"learning_rate": 8.71349173433443e-06, |
|
"loss": 0.8622, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.3151862464183381, |
|
"grad_norm": 0.7664644528325345, |
|
"learning_rate": 8.65714756565482e-06, |
|
"loss": 0.8359, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.31996179560649474, |
|
"grad_norm": 0.8563484088758611, |
|
"learning_rate": 8.599786515420789e-06, |
|
"loss": 0.8569, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.3247373447946514, |
|
"grad_norm": 0.8053932610453219, |
|
"learning_rate": 8.541424533055455e-06, |
|
"loss": 0.8458, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.32951289398280803, |
|
"grad_norm": 0.8666024535787029, |
|
"learning_rate": 8.48207784629431e-06, |
|
"loss": 0.8447, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.3342884431709647, |
|
"grad_norm": 0.8868169973573014, |
|
"learning_rate": 8.421762956673043e-06, |
|
"loss": 0.8365, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.3390639923591213, |
|
"grad_norm": 0.8954608340327667, |
|
"learning_rate": 8.360496634939243e-06, |
|
"loss": 0.8335, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.3438395415472779, |
|
"grad_norm": 0.7784372366116787, |
|
"learning_rate": 8.298295916389234e-06, |
|
"loss": 0.8458, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.34861509073543456, |
|
"grad_norm": 0.8749905429657314, |
|
"learning_rate": 8.235178096131355e-06, |
|
"loss": 0.8185, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.3533906399235912, |
|
"grad_norm": 0.8021513738690056, |
|
"learning_rate": 8.171160724277005e-06, |
|
"loss": 0.8009, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.35816618911174786, |
|
"grad_norm": 0.8463663258199511, |
|
"learning_rate": 8.106261601060773e-06, |
|
"loss": 0.8277, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.3629417382999045, |
|
"grad_norm": 0.8443067155200391, |
|
"learning_rate": 8.040498771891031e-06, |
|
"loss": 0.8432, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.36771728748806115, |
|
"grad_norm": 0.8612360765810276, |
|
"learning_rate": 7.973890522332348e-06, |
|
"loss": 0.7933, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.37249283667621774, |
|
"grad_norm": 0.7566639910663036, |
|
"learning_rate": 7.90645537302113e-06, |
|
"loss": 0.8122, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.3772683858643744, |
|
"grad_norm": 0.8598522471784897, |
|
"learning_rate": 7.838212074515899e-06, |
|
"loss": 0.7713, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.38204393505253104, |
|
"grad_norm": 0.8264097384771215, |
|
"learning_rate": 7.769179602083642e-06, |
|
"loss": 0.7863, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.38204393505253104, |
|
"eval_loss": 0.7238086462020874, |
|
"eval_runtime": 534.3413, |
|
"eval_samples_per_second": 27.866, |
|
"eval_steps_per_second": 3.485, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.3868194842406877, |
|
"grad_norm": 0.8302298544450823, |
|
"learning_rate": 7.699377150423673e-06, |
|
"loss": 0.7703, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.39159503342884433, |
|
"grad_norm": 0.9074490770748711, |
|
"learning_rate": 7.628824128330485e-06, |
|
"loss": 0.7651, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.396370582617001, |
|
"grad_norm": 0.9084786994105255, |
|
"learning_rate": 7.557540153297086e-06, |
|
"loss": 0.777, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.40114613180515757, |
|
"grad_norm": 0.7941270341446054, |
|
"learning_rate": 7.485545046060272e-06, |
|
"loss": 0.7659, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.4059216809933142, |
|
"grad_norm": 0.8418275807952966, |
|
"learning_rate": 7.412858825089423e-06, |
|
"loss": 0.7422, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.41069723018147086, |
|
"grad_norm": 0.8272295917894095, |
|
"learning_rate": 7.3395017010202965e-06, |
|
"loss": 0.7812, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.4154727793696275, |
|
"grad_norm": 0.8505418063106487, |
|
"learning_rate": 7.265494071035401e-06, |
|
"loss": 0.7461, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.42024832855778416, |
|
"grad_norm": 0.8254281894571944, |
|
"learning_rate": 7.19085651319249e-06, |
|
"loss": 0.7475, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.4250238777459408, |
|
"grad_norm": 0.7405023384966558, |
|
"learning_rate": 7.115609780702767e-06, |
|
"loss": 0.7485, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.4297994269340974, |
|
"grad_norm": 0.8069701422330396, |
|
"learning_rate": 7.039774796160391e-06, |
|
"loss": 0.7502, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.43457497612225404, |
|
"grad_norm": 0.7893053819873398, |
|
"learning_rate": 6.9633726457248864e-06, |
|
"loss": 0.7307, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.4393505253104107, |
|
"grad_norm": 0.8684426658728022, |
|
"learning_rate": 6.886424573258057e-06, |
|
"loss": 0.7407, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.44412607449856734, |
|
"grad_norm": 0.8806215484013901, |
|
"learning_rate": 6.808951974417077e-06, |
|
"loss": 0.7232, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.448901623686724, |
|
"grad_norm": 1.0078825768218607, |
|
"learning_rate": 6.73097639070535e-06, |
|
"loss": 0.7217, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.45367717287488063, |
|
"grad_norm": 0.820169166397048, |
|
"learning_rate": 6.652519503482829e-06, |
|
"loss": 0.7275, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.4584527220630373, |
|
"grad_norm": 0.9615211451349941, |
|
"learning_rate": 6.573603127937443e-06, |
|
"loss": 0.7244, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.46322827125119387, |
|
"grad_norm": 0.9105145919711279, |
|
"learning_rate": 6.494249207019317e-06, |
|
"loss": 0.7184, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.4680038204393505, |
|
"grad_norm": 0.8720579325571185, |
|
"learning_rate": 6.414479805339465e-06, |
|
"loss": 0.6887, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.47277936962750716, |
|
"grad_norm": 0.8869722170599964, |
|
"learning_rate": 6.3343171030346525e-06, |
|
"loss": 0.6858, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.4775549188156638, |
|
"grad_norm": 0.8820354385866871, |
|
"learning_rate": 6.253783389600136e-06, |
|
"loss": 0.7073, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.4775549188156638, |
|
"eval_loss": 0.6407110095024109, |
|
"eval_runtime": 534.1565, |
|
"eval_samples_per_second": 27.876, |
|
"eval_steps_per_second": 3.486, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.48233046800382046, |
|
"grad_norm": 0.9876137273298787, |
|
"learning_rate": 6.172901057692007e-06, |
|
"loss": 0.7207, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.4871060171919771, |
|
"grad_norm": 0.8646444637817178, |
|
"learning_rate": 6.0916925969008275e-06, |
|
"loss": 0.7363, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.4918815663801337, |
|
"grad_norm": 0.8847812706441835, |
|
"learning_rate": 6.010180587498347e-06, |
|
"loss": 0.6729, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.49665711556829034, |
|
"grad_norm": 0.7915082131571504, |
|
"learning_rate": 5.928387694158968e-06, |
|
"loss": 0.6956, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.501432664756447, |
|
"grad_norm": 1.046517690383018, |
|
"learning_rate": 5.8463366596577706e-06, |
|
"loss": 0.6896, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.5062082139446036, |
|
"grad_norm": 0.8847177405317377, |
|
"learning_rate": 5.764050298546808e-06, |
|
"loss": 0.6861, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.5109837631327603, |
|
"grad_norm": 0.8672793826369924, |
|
"learning_rate": 5.68155149081145e-06, |
|
"loss": 0.6762, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.5157593123209169, |
|
"grad_norm": 0.9340453072759527, |
|
"learning_rate": 5.598863175508526e-06, |
|
"loss": 0.6717, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.5205348615090736, |
|
"grad_norm": 0.8259949274241196, |
|
"learning_rate": 5.516008344388053e-06, |
|
"loss": 0.6825, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.5253104106972302, |
|
"grad_norm": 0.9029867272763249, |
|
"learning_rate": 5.433010035500299e-06, |
|
"loss": 0.6771, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.5300859598853869, |
|
"grad_norm": 0.800953127657272, |
|
"learning_rate": 5.3498913267899864e-06, |
|
"loss": 0.674, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.5348615090735435, |
|
"grad_norm": 0.9245770325043161, |
|
"learning_rate": 5.2666753296793895e-06, |
|
"loss": 0.6662, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.5396370582617, |
|
"grad_norm": 0.8502150053204679, |
|
"learning_rate": 5.183385182642136e-06, |
|
"loss": 0.6765, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.5444126074498568, |
|
"grad_norm": 1.5468980444160696, |
|
"learning_rate": 5.100044044769472e-06, |
|
"loss": 0.6682, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.5491881566380133, |
|
"grad_norm": 1.0568604331225828, |
|
"learning_rate": 5.016675089330817e-06, |
|
"loss": 0.6583, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.55396370582617, |
|
"grad_norm": 0.9277335589404493, |
|
"learning_rate": 4.933301497330344e-06, |
|
"loss": 0.6456, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.5587392550143266, |
|
"grad_norm": 0.9015066761493195, |
|
"learning_rate": 4.849946451061444e-06, |
|
"loss": 0.673, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.5635148042024832, |
|
"grad_norm": 0.916248819386327, |
|
"learning_rate": 4.766633127660805e-06, |
|
"loss": 0.6372, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.5682903533906399, |
|
"grad_norm": 0.9944122764504796, |
|
"learning_rate": 4.683384692663937e-06, |
|
"loss": 0.6352, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.5730659025787965, |
|
"grad_norm": 0.8995871939871956, |
|
"learning_rate": 4.600224293563926e-06, |
|
"loss": 0.6143, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.5730659025787965, |
|
"eval_loss": 0.5672308206558228, |
|
"eval_runtime": 534.5729, |
|
"eval_samples_per_second": 27.854, |
|
"eval_steps_per_second": 3.483, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.5778414517669532, |
|
"grad_norm": 0.9029828803944917, |
|
"learning_rate": 4.517175053375191e-06, |
|
"loss": 0.6482, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.5826170009551098, |
|
"grad_norm": 0.9997552015425194, |
|
"learning_rate": 4.434260064204067e-06, |
|
"loss": 0.6244, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.5873925501432665, |
|
"grad_norm": 0.8505066146958208, |
|
"learning_rate": 4.351502380827959e-06, |
|
"loss": 0.6231, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.5921680993314231, |
|
"grad_norm": 0.9601951382006098, |
|
"learning_rate": 4.268925014284898e-06, |
|
"loss": 0.6515, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.5969436485195797, |
|
"grad_norm": 0.858109652878112, |
|
"learning_rate": 4.18655092547524e-06, |
|
"loss": 0.6027, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.6017191977077364, |
|
"grad_norm": 1.0279381548988882, |
|
"learning_rate": 4.104403018777323e-06, |
|
"loss": 0.636, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.606494746895893, |
|
"grad_norm": 0.8684044204496176, |
|
"learning_rate": 4.022504135678822e-06, |
|
"loss": 0.6356, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.6112702960840497, |
|
"grad_norm": 1.2002839065266542, |
|
"learning_rate": 3.94087704842561e-06, |
|
"loss": 0.6303, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.6160458452722063, |
|
"grad_norm": 1.0212819754078601, |
|
"learning_rate": 3.859544453689853e-06, |
|
"loss": 0.6181, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.620821394460363, |
|
"grad_norm": 1.1643909557826269, |
|
"learning_rate": 3.778528966259137e-06, |
|
"loss": 0.6075, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.6255969436485196, |
|
"grad_norm": 0.8318901215082086, |
|
"learning_rate": 3.697853112748345e-06, |
|
"loss": 0.6106, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.6303724928366762, |
|
"grad_norm": 0.9063102495466279, |
|
"learning_rate": 3.6175393253360704e-06, |
|
"loss": 0.599, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.6351480420248329, |
|
"grad_norm": 0.9567097209608001, |
|
"learning_rate": 3.537609935527264e-06, |
|
"loss": 0.5996, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.6399235912129895, |
|
"grad_norm": 0.939453389364599, |
|
"learning_rate": 3.458087167943905e-06, |
|
"loss": 0.5867, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.6446991404011462, |
|
"grad_norm": 0.9944415765925527, |
|
"learning_rate": 3.3789931341453564e-06, |
|
"loss": 0.614, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.6494746895893028, |
|
"grad_norm": 0.8911567397377756, |
|
"learning_rate": 3.3003498264801915e-06, |
|
"loss": 0.5858, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.6542502387774594, |
|
"grad_norm": 0.9190740572643366, |
|
"learning_rate": 3.2221791119711372e-06, |
|
"loss": 0.6073, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.6590257879656161, |
|
"grad_norm": 0.8722067899669511, |
|
"learning_rate": 3.144502726234889e-06, |
|
"loss": 0.598, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.6638013371537727, |
|
"grad_norm": 0.8704883954915125, |
|
"learning_rate": 3.067342267438446e-06, |
|
"loss": 0.5864, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.6685768863419294, |
|
"grad_norm": 0.9586746506286237, |
|
"learning_rate": 2.9907191902936773e-06, |
|
"loss": 0.5726, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.6685768863419294, |
|
"eval_loss": 0.5096372961997986, |
|
"eval_runtime": 534.0593, |
|
"eval_samples_per_second": 27.881, |
|
"eval_steps_per_second": 3.487, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.673352435530086, |
|
"grad_norm": 0.9771151805675299, |
|
"learning_rate": 2.914654800091768e-06, |
|
"loss": 0.5678, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.6781279847182426, |
|
"grad_norm": 0.9844163415808749, |
|
"learning_rate": 2.8391702467792137e-06, |
|
"loss": 0.5875, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.6829035339063992, |
|
"grad_norm": 0.936929121667794, |
|
"learning_rate": 2.764286519077014e-06, |
|
"loss": 0.5745, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.6876790830945558, |
|
"grad_norm": 0.9581940513551886, |
|
"learning_rate": 2.6900244386446903e-06, |
|
"loss": 0.5748, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.6924546322827125, |
|
"grad_norm": 0.9382097505155865, |
|
"learning_rate": 2.616404654290752e-06, |
|
"loss": 0.582, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.6972301814708691, |
|
"grad_norm": 0.9458807920061071, |
|
"learning_rate": 2.5434476362312375e-06, |
|
"loss": 0.5859, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.7020057306590258, |
|
"grad_norm": 0.8536247554325601, |
|
"learning_rate": 2.4711736703979015e-06, |
|
"loss": 0.5778, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.7067812798471824, |
|
"grad_norm": 0.8896142850001317, |
|
"learning_rate": 2.399602852797647e-06, |
|
"loss": 0.5833, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.711556829035339, |
|
"grad_norm": 0.9369088545555486, |
|
"learning_rate": 2.3287550839247625e-06, |
|
"loss": 0.5677, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.7163323782234957, |
|
"grad_norm": 0.9352682466004876, |
|
"learning_rate": 2.2586500632275333e-06, |
|
"loss": 0.5501, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.7211079274116523, |
|
"grad_norm": 0.9291292330708577, |
|
"learning_rate": 2.1893072836307433e-06, |
|
"loss": 0.5432, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.725883476599809, |
|
"grad_norm": 1.1278542414631672, |
|
"learning_rate": 2.1207460261156066e-06, |
|
"loss": 0.6017, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.7306590257879656, |
|
"grad_norm": 0.8496342199267922, |
|
"learning_rate": 2.052985354358622e-06, |
|
"loss": 0.5361, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.7354345749761223, |
|
"grad_norm": 0.8448590719351696, |
|
"learning_rate": 1.986044109430869e-06, |
|
"loss": 0.544, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.7402101241642789, |
|
"grad_norm": 1.0014560087074114, |
|
"learning_rate": 1.91994090455918e-06, |
|
"loss": 0.5544, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.7449856733524355, |
|
"grad_norm": 0.9943362148840331, |
|
"learning_rate": 1.8546941199506752e-06, |
|
"loss": 0.5743, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.7497612225405922, |
|
"grad_norm": 0.9488632116893986, |
|
"learning_rate": 1.790321897682083e-06, |
|
"loss": 0.5516, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.7545367717287488, |
|
"grad_norm": 0.9282545781122443, |
|
"learning_rate": 1.7268421366552851e-06, |
|
"loss": 0.5598, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.7593123209169055, |
|
"grad_norm": 0.893009147729329, |
|
"learning_rate": 1.6642724876204658e-06, |
|
"loss": 0.5457, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.7640878701050621, |
|
"grad_norm": 0.8952401614954113, |
|
"learning_rate": 1.602630348268267e-06, |
|
"loss": 0.5623, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.7640878701050621, |
|
"eval_loss": 0.46827441453933716, |
|
"eval_runtime": 534.0896, |
|
"eval_samples_per_second": 27.879, |
|
"eval_steps_per_second": 3.486, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.7688634192932188, |
|
"grad_norm": 0.9378147502660392, |
|
"learning_rate": 1.541932858392296e-06, |
|
"loss": 0.5522, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.7736389684813754, |
|
"grad_norm": 0.8793552837932004, |
|
"learning_rate": 1.482196895123364e-06, |
|
"loss": 0.5321, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.778414517669532, |
|
"grad_norm": 0.8650313644836122, |
|
"learning_rate": 1.423439068236736e-06, |
|
"loss": 0.5789, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.7831900668576887, |
|
"grad_norm": 0.9837452596609937, |
|
"learning_rate": 1.3656757155337413e-06, |
|
"loss": 0.5628, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.7879656160458453, |
|
"grad_norm": 1.0526446379669654, |
|
"learning_rate": 1.3089228982989771e-06, |
|
"loss": 0.5139, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.792741165234002, |
|
"grad_norm": 0.9071660119485095, |
|
"learning_rate": 1.2531963968344346e-06, |
|
"loss": 0.5229, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.7975167144221585, |
|
"grad_norm": 0.8509970739173898, |
|
"learning_rate": 1.1985117060717278e-06, |
|
"loss": 0.5184, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.8022922636103151, |
|
"grad_norm": 0.8689645530199235, |
|
"learning_rate": 1.1448840312636812e-06, |
|
"loss": 0.5248, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.8070678127984718, |
|
"grad_norm": 0.8888334350846111, |
|
"learning_rate": 1.0923282837564537e-06, |
|
"loss": 0.5451, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.8118433619866284, |
|
"grad_norm": 1.0944895277253541, |
|
"learning_rate": 1.0408590768434018e-06, |
|
"loss": 0.522, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.8166189111747851, |
|
"grad_norm": 1.0504131852344616, |
|
"learning_rate": 9.904907217018e-07, |
|
"loss": 0.5143, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.8213944603629417, |
|
"grad_norm": 0.9813714332571194, |
|
"learning_rate": 9.412372234135753e-07, |
|
"loss": 0.5339, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.8261700095510984, |
|
"grad_norm": 0.8392110216415885, |
|
"learning_rate": 8.931122770711425e-07, |
|
"loss": 0.5326, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.830945558739255, |
|
"grad_norm": 0.883891691537776, |
|
"learning_rate": 8.461292639694519e-07, |
|
"loss": 0.5308, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.8357211079274116, |
|
"grad_norm": 0.9330631339104432, |
|
"learning_rate": 8.003012478852679e-07, |
|
"loss": 0.4943, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.8404966571155683, |
|
"grad_norm": 0.9077272187489582, |
|
"learning_rate": 7.556409714447488e-07, |
|
"loss": 0.5474, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.8452722063037249, |
|
"grad_norm": 0.8412019707689536, |
|
"learning_rate": 7.121608525803142e-07, |
|
"loss": 0.5301, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.8500477554918816, |
|
"grad_norm": 1.0343479774517594, |
|
"learning_rate": 6.698729810778065e-07, |
|
"loss": 0.5302, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.8548233046800382, |
|
"grad_norm": 0.9838478459223126, |
|
"learning_rate": 6.287891152148823e-07, |
|
"loss": 0.5075, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.8595988538681948, |
|
"grad_norm": 1.009194175526722, |
|
"learning_rate": 5.889206784915863e-07, |
|
"loss": 0.5206, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.8595988538681948, |
|
"eval_loss": 0.44646286964416504, |
|
"eval_runtime": 534.2652, |
|
"eval_samples_per_second": 27.87, |
|
"eval_steps_per_second": 3.485, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.8643744030563515, |
|
"grad_norm": 0.8387985123574973, |
|
"learning_rate": 5.502787564540102e-07, |
|
"loss": 0.5305, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.8691499522445081, |
|
"grad_norm": 0.9238833740564283, |
|
"learning_rate": 5.128740936119242e-07, |
|
"loss": 0.5115, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.8739255014326648, |
|
"grad_norm": 0.8643626924619122, |
|
"learning_rate": 4.7671709045122914e-07, |
|
"loss": 0.501, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.8787010506208214, |
|
"grad_norm": 0.8843512593283425, |
|
"learning_rate": 4.4181780054206925e-07, |
|
"loss": 0.5316, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.8834765998089781, |
|
"grad_norm": 1.0280116743208123, |
|
"learning_rate": 4.081859277434025e-07, |
|
"loss": 0.5084, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.8882521489971347, |
|
"grad_norm": 0.9217334180362886, |
|
"learning_rate": 3.758308235048158e-07, |
|
"loss": 0.4988, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.8930276981852913, |
|
"grad_norm": 0.9278902432782374, |
|
"learning_rate": 3.4476148426632215e-07, |
|
"loss": 0.5248, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.897803247373448, |
|
"grad_norm": 0.8498974666627348, |
|
"learning_rate": 3.1498654895687095e-07, |
|
"loss": 0.5263, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.9025787965616046, |
|
"grad_norm": 0.914856710615246, |
|
"learning_rate": 2.8651429659226906e-07, |
|
"loss": 0.5129, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.9073543457497613, |
|
"grad_norm": 0.9485264410476115, |
|
"learning_rate": 2.593526439731697e-07, |
|
"loss": 0.5033, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.9121298949379179, |
|
"grad_norm": 0.922810539225268, |
|
"learning_rate": 2.3350914348378606e-07, |
|
"loss": 0.5157, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.9169054441260746, |
|
"grad_norm": 0.956199326320254, |
|
"learning_rate": 2.0899098099192273e-07, |
|
"loss": 0.5158, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.9216809933142311, |
|
"grad_norm": 0.8741702574957524, |
|
"learning_rate": 1.8580497385092376e-07, |
|
"loss": 0.5145, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.9264565425023877, |
|
"grad_norm": 0.9941012086649309, |
|
"learning_rate": 1.6395756900408454e-07, |
|
"loss": 0.5321, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.9312320916905444, |
|
"grad_norm": 0.8582163500365767, |
|
"learning_rate": 1.4345484119206222e-07, |
|
"loss": 0.5065, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.936007640878701, |
|
"grad_norm": 0.9879634600102223, |
|
"learning_rate": 1.2430249126376913e-07, |
|
"loss": 0.54, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.9407831900668577, |
|
"grad_norm": 0.9616760638465843, |
|
"learning_rate": 1.065058445912398e-07, |
|
"loss": 0.5084, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.9455587392550143, |
|
"grad_norm": 1.0543325130204897, |
|
"learning_rate": 9.006984958888742e-08, |
|
"loss": 0.527, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.9503342884431709, |
|
"grad_norm": 0.8916780720938148, |
|
"learning_rate": 7.499907633758797e-08, |
|
"loss": 0.4929, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.9551098376313276, |
|
"grad_norm": 0.8743144782771384, |
|
"learning_rate": 6.129771531395045e-08, |
|
"loss": 0.5054, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9551098376313276, |
|
"eval_loss": 0.43961772322654724, |
|
"eval_runtime": 534.3343, |
|
"eval_samples_per_second": 27.866, |
|
"eval_steps_per_second": 3.485, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9598853868194842, |
|
"grad_norm": 0.9030091469256598, |
|
"learning_rate": 4.896957622514298e-08, |
|
"loss": 0.4983, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.9646609360076409, |
|
"grad_norm": 0.9300966338414236, |
|
"learning_rate": 3.801808694959053e-08, |
|
"loss": 0.5219, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.9694364851957975, |
|
"grad_norm": 0.9956560344691349, |
|
"learning_rate": 2.8446292583844126e-08, |
|
"loss": 0.5397, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.9742120343839542, |
|
"grad_norm": 0.944281751372382, |
|
"learning_rate": 2.025685459588145e-08, |
|
"loss": 0.525, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.9789875835721108, |
|
"grad_norm": 0.8939170547532904, |
|
"learning_rate": 1.3452050085075441e-08, |
|
"loss": 0.5086, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.9837631327602674, |
|
"grad_norm": 0.8895914526936296, |
|
"learning_rate": 8.033771149041913e-09, |
|
"loss": 0.5122, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.9885386819484241, |
|
"grad_norm": 0.8484510879585117, |
|
"learning_rate": 4.003524357534261e-09, |
|
"loss": 0.5168, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.9933142311365807, |
|
"grad_norm": 0.9676882963530836, |
|
"learning_rate": 1.3624303335380006e-09, |
|
"loss": 0.5155, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.9980897803247374, |
|
"grad_norm": 0.8272201128188289, |
|
"learning_rate": 1.1122344167613374e-10, |
|
"loss": 0.5051, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 2094, |
|
"total_flos": 985274787299328.0, |
|
"train_loss": 0.7358792713970487, |
|
"train_runtime": 28849.9475, |
|
"train_samples_per_second": 4.645, |
|
"train_steps_per_second": 0.073 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2094, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 985274787299328.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |