{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 2875,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.017391304347826087,
      "grad_norm": 3.047056528200788,
      "learning_rate": 6.25e-07,
      "loss": 0.6248,
      "step": 10
    },
    {
      "epoch": 0.034782608695652174,
      "grad_norm": 2.174143954299078,
      "learning_rate": 1.3194444444444446e-06,
      "loss": 0.6183,
      "step": 20
    },
    {
      "epoch": 0.05217391304347826,
      "grad_norm": 1.864151160655997,
      "learning_rate": 2.0138888888888893e-06,
      "loss": 0.6015,
      "step": 30
    },
    {
      "epoch": 0.06956521739130435,
      "grad_norm": 1.667310343818362,
      "learning_rate": 2.7083333333333334e-06,
      "loss": 0.5665,
      "step": 40
    },
    {
      "epoch": 0.08695652173913043,
      "grad_norm": 2.455941646453557,
      "learning_rate": 3.4027777777777783e-06,
      "loss": 0.54,
      "step": 50
    },
    {
      "epoch": 0.10434782608695652,
      "grad_norm": 1.9685053486872826,
      "learning_rate": 4.097222222222222e-06,
      "loss": 0.5399,
      "step": 60
    },
    {
      "epoch": 0.12173913043478261,
      "grad_norm": 2.106997653377029,
      "learning_rate": 4.791666666666668e-06,
      "loss": 0.5317,
      "step": 70
    },
    {
      "epoch": 0.1391304347826087,
      "grad_norm": 1.780173897349018,
      "learning_rate": 5.486111111111112e-06,
      "loss": 0.5183,
      "step": 80
    },
    {
      "epoch": 0.1565217391304348,
      "grad_norm": 1.467101178589284,
      "learning_rate": 6.180555555555557e-06,
      "loss": 0.5355,
      "step": 90
    },
    {
      "epoch": 0.17391304347826086,
      "grad_norm": 1.521288337808483,
      "learning_rate": 6.875e-06,
      "loss": 0.5215,
      "step": 100
    },
    {
      "epoch": 0.19130434782608696,
      "grad_norm": 1.5769259173207397,
      "learning_rate": 7.569444444444445e-06,
      "loss": 0.5239,
      "step": 110
    },
    {
      "epoch": 0.20869565217391303,
      "grad_norm": 1.2892203345431497,
      "learning_rate": 8.263888888888888e-06,
      "loss": 0.5288,
      "step": 120
    },
    {
      "epoch": 0.22608695652173913,
      "grad_norm": 1.6642372146048716,
      "learning_rate": 8.958333333333334e-06,
      "loss": 0.5563,
      "step": 130
    },
    {
      "epoch": 0.24347826086956523,
      "grad_norm": 1.5134595782170073,
      "learning_rate": 9.652777777777779e-06,
      "loss": 0.5047,
      "step": 140
    },
    {
      "epoch": 0.2608695652173913,
      "grad_norm": 1.2945158944777713,
      "learning_rate": 9.999917294352674e-06,
      "loss": 0.5146,
      "step": 150
    },
    {
      "epoch": 0.2782608695652174,
      "grad_norm": 1.7472246575863004,
      "learning_rate": 9.99925566559051e-06,
      "loss": 0.5342,
      "step": 160
    },
    {
      "epoch": 0.2956521739130435,
      "grad_norm": 1.5193618638564104,
      "learning_rate": 9.997932495618156e-06,
      "loss": 0.5083,
      "step": 170
    },
    {
      "epoch": 0.3130434782608696,
      "grad_norm": 1.554664459221582,
      "learning_rate": 9.995947959527968e-06,
      "loss": 0.5063,
      "step": 180
    },
    {
      "epoch": 0.33043478260869563,
      "grad_norm": 2.1482072415475,
      "learning_rate": 9.993302319929523e-06,
      "loss": 0.5434,
      "step": 190
    },
    {
      "epoch": 0.34782608695652173,
      "grad_norm": 1.4192382000638246,
      "learning_rate": 9.98999592691486e-06,
      "loss": 0.5189,
      "step": 200
    },
    {
      "epoch": 0.3652173913043478,
      "grad_norm": 1.208599645402354,
      "learning_rate": 9.986029218012164e-06,
      "loss": 0.5382,
      "step": 210
    },
    {
      "epoch": 0.3826086956521739,
      "grad_norm": 1.1836104891903814,
      "learning_rate": 9.981402718127853e-06,
      "loss": 0.5276,
      "step": 220
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.5978655869742624,
      "learning_rate": 9.976117039477133e-06,
      "loss": 0.5017,
      "step": 230
    },
    {
      "epoch": 0.41739130434782606,
      "grad_norm": 1.3359478565965037,
      "learning_rate": 9.970172881502982e-06,
      "loss": 0.4997,
      "step": 240
    },
    {
      "epoch": 0.43478260869565216,
      "grad_norm": 1.3948206340232912,
      "learning_rate": 9.963571030783582e-06,
      "loss": 0.5582,
      "step": 250
    },
    {
      "epoch": 0.45217391304347826,
      "grad_norm": 1.3011147589302579,
      "learning_rate": 9.956312360928253e-06,
      "loss": 0.5457,
      "step": 260
    },
    {
      "epoch": 0.46956521739130436,
      "grad_norm": 1.080591596133825,
      "learning_rate": 9.948397832461829e-06,
      "loss": 0.5552,
      "step": 270
    },
    {
      "epoch": 0.48695652173913045,
      "grad_norm": 1.2339442843093948,
      "learning_rate": 9.93982849269757e-06,
      "loss": 0.5203,
      "step": 280
    },
    {
      "epoch": 0.5043478260869565,
      "grad_norm": 1.5073250471212296,
      "learning_rate": 9.930605475598566e-06,
      "loss": 0.5409,
      "step": 290
    },
    {
      "epoch": 0.5217391304347826,
      "grad_norm": 1.2705670265704732,
      "learning_rate": 9.92073000162768e-06,
      "loss": 0.5404,
      "step": 300
    },
    {
      "epoch": 0.5391304347826087,
      "grad_norm": 1.4491195774316425,
      "learning_rate": 9.910203377586053e-06,
      "loss": 0.5188,
      "step": 310
    },
    {
      "epoch": 0.5565217391304348,
      "grad_norm": 1.4642066613650184,
      "learning_rate": 9.899026996440173e-06,
      "loss": 0.4946,
      "step": 320
    },
    {
      "epoch": 0.5739130434782609,
      "grad_norm": 1.18880785751321,
      "learning_rate": 9.887202337137549e-06,
      "loss": 0.5298,
      "step": 330
    },
    {
      "epoch": 0.591304347826087,
      "grad_norm": 1.2818806362850075,
      "learning_rate": 9.874730964411001e-06,
      "loss": 0.5299,
      "step": 340
    },
    {
      "epoch": 0.6086956521739131,
      "grad_norm": 1.07998471259005,
      "learning_rate": 9.861614528571607e-06,
      "loss": 0.5315,
      "step": 350
    },
    {
      "epoch": 0.6260869565217392,
      "grad_norm": 1.944925155516315,
      "learning_rate": 9.847854765290321e-06,
      "loss": 0.5447,
      "step": 360
    },
    {
      "epoch": 0.6434782608695652,
      "grad_norm": 1.511844572828546,
      "learning_rate": 9.83345349536829e-06,
      "loss": 0.5291,
      "step": 370
    },
    {
      "epoch": 0.6608695652173913,
      "grad_norm": 1.338025282900669,
      "learning_rate": 9.818412624495911e-06,
      "loss": 0.5107,
      "step": 380
    },
    {
      "epoch": 0.6782608695652174,
      "grad_norm": 1.0620753499324938,
      "learning_rate": 9.802734143000668e-06,
      "loss": 0.5441,
      "step": 390
    },
    {
      "epoch": 0.6956521739130435,
      "grad_norm": 1.3469601654140646,
      "learning_rate": 9.786420125583734e-06,
      "loss": 0.5403,
      "step": 400
    },
    {
      "epoch": 0.7130434782608696,
      "grad_norm": 1.1557449160960933,
      "learning_rate": 9.769472731045451e-06,
      "loss": 0.5249,
      "step": 410
    },
    {
      "epoch": 0.7304347826086957,
      "grad_norm": 1.275996117914776,
      "learning_rate": 9.751894201999647e-06,
      "loss": 0.5399,
      "step": 420
    },
    {
      "epoch": 0.7478260869565218,
      "grad_norm": 1.2172722683153971,
      "learning_rate": 9.733686864576883e-06,
      "loss": 0.5367,
      "step": 430
    },
    {
      "epoch": 0.7652173913043478,
      "grad_norm": 1.3421828009403691,
      "learning_rate": 9.714853128116634e-06,
      "loss": 0.5251,
      "step": 440
    },
    {
      "epoch": 0.782608695652174,
      "grad_norm": 1.0911936537573848,
      "learning_rate": 9.695395484848476e-06,
      "loss": 0.555,
      "step": 450
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.3133546561526954,
      "learning_rate": 9.675316509562282e-06,
      "loss": 0.5379,
      "step": 460
    },
    {
      "epoch": 0.8173913043478261,
      "grad_norm": 1.0450523994543341,
      "learning_rate": 9.654618859267516e-06,
      "loss": 0.5115,
      "step": 470
    },
    {
      "epoch": 0.8347826086956521,
      "grad_norm": 1.0976625003665517,
      "learning_rate": 9.633305272841632e-06,
      "loss": 0.5323,
      "step": 480
    },
    {
      "epoch": 0.8521739130434782,
      "grad_norm": 1.1811352794597962,
      "learning_rate": 9.61137857066764e-06,
      "loss": 0.5379,
      "step": 490
    },
    {
      "epoch": 0.8695652173913043,
      "grad_norm": 1.3489402359600888,
      "learning_rate": 9.5888416542609e-06,
      "loss": 0.5353,
      "step": 500
    },
    {
      "epoch": 0.8869565217391304,
      "grad_norm": 1.2068307201497395,
      "learning_rate": 9.565697505885165e-06,
      "loss": 0.506,
      "step": 510
    },
    {
      "epoch": 0.9043478260869565,
      "grad_norm": 1.2234807296718222,
      "learning_rate": 9.541949188157937e-06,
      "loss": 0.5291,
      "step": 520
    },
    {
      "epoch": 0.9217391304347826,
      "grad_norm": 1.2256177402823996,
      "learning_rate": 9.517599843645216e-06,
      "loss": 0.51,
      "step": 530
    },
    {
      "epoch": 0.9391304347826087,
      "grad_norm": 1.2281368130382897,
      "learning_rate": 9.492652694445629e-06,
      "loss": 0.5136,
      "step": 540
    },
    {
      "epoch": 0.9565217391304348,
      "grad_norm": 1.1730852438772608,
      "learning_rate": 9.467111041764072e-06,
      "loss": 0.5402,
      "step": 550
    },
    {
      "epoch": 0.9739130434782609,
      "grad_norm": 1.0983121942778715,
      "learning_rate": 9.44097826547486e-06,
      "loss": 0.5508,
      "step": 560
    },
    {
      "epoch": 0.991304347826087,
      "grad_norm": 1.1697621737952535,
      "learning_rate": 9.414257823674482e-06,
      "loss": 0.5516,
      "step": 570
    },
    {
      "epoch": 1.008695652173913,
      "grad_norm": 1.2095450172211872,
      "learning_rate": 9.386953252223989e-06,
      "loss": 0.5011,
      "step": 580
    },
    {
      "epoch": 1.0260869565217392,
      "grad_norm": 1.1615174794799035,
      "learning_rate": 9.35906816428111e-06,
      "loss": 0.4851,
      "step": 590
    },
    {
      "epoch": 1.0434782608695652,
      "grad_norm": 1.0792347526096802,
      "learning_rate": 9.330606249822125e-06,
      "loss": 0.4795,
      "step": 600
    },
    {
      "epoch": 1.0608695652173914,
      "grad_norm": 1.097575063279078,
      "learning_rate": 9.30157127515358e-06,
      "loss": 0.4955,
      "step": 610
    },
    {
      "epoch": 1.0782608695652174,
      "grad_norm": 1.1031575053788267,
      "learning_rate": 9.271967082413899e-06,
      "loss": 0.4884,
      "step": 620
    },
    {
      "epoch": 1.0956521739130434,
      "grad_norm": 1.2767392201135948,
      "learning_rate": 9.241797589064959e-06,
      "loss": 0.4655,
      "step": 630
    },
    {
      "epoch": 1.1130434782608696,
      "grad_norm": 1.1033082483572434,
      "learning_rate": 9.211066787373702e-06,
      "loss": 0.4663,
      "step": 640
    },
    {
      "epoch": 1.1304347826086956,
      "grad_norm": 1.576080502703696,
      "learning_rate": 9.179778743883855e-06,
      "loss": 0.4725,
      "step": 650
    },
    {
      "epoch": 1.1478260869565218,
      "grad_norm": 1.175175499900073,
      "learning_rate": 9.147937598877797e-06,
      "loss": 0.4921,
      "step": 660
    },
    {
      "epoch": 1.1652173913043478,
      "grad_norm": 1.1370691886093878,
      "learning_rate": 9.115547565828695e-06,
      "loss": 0.4649,
      "step": 670
    },
    {
      "epoch": 1.182608695652174,
      "grad_norm": 1.2601647572096586,
      "learning_rate": 9.082612930842942e-06,
      "loss": 0.5285,
      "step": 680
    },
    {
      "epoch": 1.2,
      "grad_norm": 1.2187851162701455,
      "learning_rate": 9.049138052092982e-06,
      "loss": 0.4947,
      "step": 690
    },
    {
      "epoch": 1.2173913043478262,
      "grad_norm": 1.1224072986238753,
      "learning_rate": 9.015127359240603e-06,
      "loss": 0.4663,
      "step": 700
    },
    {
      "epoch": 1.2347826086956522,
      "grad_norm": 1.2366635360634342,
      "learning_rate": 8.980585352850775e-06,
      "loss": 0.4828,
      "step": 710
    },
    {
      "epoch": 1.2521739130434781,
      "grad_norm": 1.4981539175915906,
      "learning_rate": 8.94551660379609e-06,
      "loss": 0.4743,
      "step": 720
    },
    {
      "epoch": 1.2695652173913043,
      "grad_norm": 1.0729421363170109,
      "learning_rate": 8.909925752651914e-06,
      "loss": 0.4996,
      "step": 730
    },
    {
      "epoch": 1.2869565217391306,
      "grad_norm": 1.1955218613972736,
      "learning_rate": 8.873817509082305e-06,
      "loss": 0.4863,
      "step": 740
    },
    {
      "epoch": 1.3043478260869565,
      "grad_norm": 1.126282527653828,
      "learning_rate": 8.837196651216802e-06,
      "loss": 0.465,
      "step": 750
    },
    {
      "epoch": 1.3217391304347825,
      "grad_norm": 1.253199532462182,
      "learning_rate": 8.800068025018133e-06,
      "loss": 0.5031,
      "step": 760
    },
    {
      "epoch": 1.3391304347826087,
      "grad_norm": 1.1627263868301658,
      "learning_rate": 8.762436543640965e-06,
      "loss": 0.4783,
      "step": 770
    },
    {
      "epoch": 1.3565217391304347,
      "grad_norm": 1.2722684925328063,
      "learning_rate": 8.724307186781756e-06,
      "loss": 0.4883,
      "step": 780
    },
    {
      "epoch": 1.373913043478261,
      "grad_norm": 1.1113001887408591,
      "learning_rate": 8.685685000019803e-06,
      "loss": 0.4947,
      "step": 790
    },
    {
      "epoch": 1.391304347826087,
      "grad_norm": 1.0089575387474536,
      "learning_rate": 8.646575094149568e-06,
      "loss": 0.4684,
      "step": 800
    },
    {
      "epoch": 1.4086956521739131,
      "grad_norm": 1.139961867951673,
      "learning_rate": 8.606982644504378e-06,
      "loss": 0.4938,
      "step": 810
    },
    {
      "epoch": 1.4260869565217391,
      "grad_norm": 1.099302351381361,
      "learning_rate": 8.566912890271584e-06,
      "loss": 0.4723,
      "step": 820
    },
    {
      "epoch": 1.4434782608695653,
      "grad_norm": 1.097933896784172,
      "learning_rate": 8.526371133799277e-06,
      "loss": 0.5066,
      "step": 830
    },
    {
      "epoch": 1.4608695652173913,
      "grad_norm": 1.2653660292097024,
      "learning_rate": 8.485362739894617e-06,
      "loss": 0.4809,
      "step": 840
    },
    {
      "epoch": 1.4782608695652173,
      "grad_norm": 1.031416554665846,
      "learning_rate": 8.443893135113956e-06,
      "loss": 0.4849,
      "step": 850
    },
    {
      "epoch": 1.4956521739130435,
      "grad_norm": 1.2133975894170272,
      "learning_rate": 8.401967807044713e-06,
      "loss": 0.4731,
      "step": 860
    },
    {
      "epoch": 1.5130434782608697,
      "grad_norm": 1.3527575598259336,
      "learning_rate": 8.359592303579241e-06,
      "loss": 0.4993,
      "step": 870
    },
    {
      "epoch": 1.5304347826086957,
      "grad_norm": 1.5538636433489414,
      "learning_rate": 8.316772232180677e-06,
      "loss": 0.4831,
      "step": 880
    },
    {
      "epoch": 1.5478260869565217,
      "grad_norm": 1.0668506396404902,
      "learning_rate": 8.273513259140911e-06,
      "loss": 0.4549,
      "step": 890
    },
    {
      "epoch": 1.5652173913043477,
      "grad_norm": 1.3545786110485585,
      "learning_rate": 8.22982110883079e-06,
      "loss": 0.4889,
      "step": 900
    },
    {
      "epoch": 1.5826086956521739,
      "grad_norm": 1.343041115784667,
      "learning_rate": 8.185701562942614e-06,
      "loss": 0.507,
      "step": 910
    },
    {
      "epoch": 1.6,
      "grad_norm": 1.2218861872968096,
      "learning_rate": 8.141160459725063e-06,
      "loss": 0.4812,
      "step": 920
    },
    {
      "epoch": 1.617391304347826,
      "grad_norm": 1.2015960018363139,
      "learning_rate": 8.096203693210626e-06,
      "loss": 0.4965,
      "step": 930
    },
    {
      "epoch": 1.634782608695652,
      "grad_norm": 1.096282312846291,
      "learning_rate": 8.050837212435662e-06,
      "loss": 0.4557,
      "step": 940
    },
    {
      "epoch": 1.6521739130434783,
      "grad_norm": 1.0909506176288037,
      "learning_rate": 8.00506702065318e-06,
      "loss": 0.4702,
      "step": 950
    },
    {
      "epoch": 1.6695652173913045,
      "grad_norm": 1.5020729724704236,
      "learning_rate": 7.958899174538423e-06,
      "loss": 0.4695,
      "step": 960
    },
    {
      "epoch": 1.6869565217391305,
      "grad_norm": 1.1184823557537737,
      "learning_rate": 7.912339783387429e-06,
      "loss": 0.4869,
      "step": 970
    },
    {
      "epoch": 1.7043478260869565,
      "grad_norm": 1.5926675735162503,
      "learning_rate": 7.865395008308572e-06,
      "loss": 0.4556,
      "step": 980
    },
    {
      "epoch": 1.7217391304347827,
      "grad_norm": 1.357255668450024,
      "learning_rate": 7.818071061407295e-06,
      "loss": 0.5226,
      "step": 990
    },
    {
      "epoch": 1.7391304347826086,
      "grad_norm": 1.1738545925108272,
      "learning_rate": 7.770374204964062e-06,
      "loss": 0.4651,
      "step": 1000
    },
    {
      "epoch": 1.7565217391304349,
      "grad_norm": 1.1820022034633777,
      "learning_rate": 7.722310750605693e-06,
      "loss": 0.498,
      "step": 1010
    },
    {
      "epoch": 1.7739130434782608,
      "grad_norm": 1.3439499575644294,
      "learning_rate": 7.673887058470155e-06,
      "loss": 0.4832,
      "step": 1020
    },
    {
      "epoch": 1.7913043478260868,
      "grad_norm": 1.5304830271309717,
      "learning_rate": 7.625109536364938e-06,
      "loss": 0.4716,
      "step": 1030
    },
    {
      "epoch": 1.808695652173913,
      "grad_norm": 1.2364034202681973,
      "learning_rate": 7.5759846389191234e-06,
      "loss": 0.4696,
      "step": 1040
    },
    {
      "epoch": 1.8260869565217392,
      "grad_norm": 1.0229154001597585,
      "learning_rate": 7.526518866729256e-06,
      "loss": 0.5235,
      "step": 1050
    },
    {
      "epoch": 1.8434782608695652,
      "grad_norm": 1.7878383661619177,
      "learning_rate": 7.476718765499131e-06,
      "loss": 0.4936,
      "step": 1060
    },
    {
      "epoch": 1.8608695652173912,
      "grad_norm": 1.1220136346350835,
      "learning_rate": 7.426590925173624e-06,
      "loss": 0.4954,
      "step": 1070
    },
    {
      "epoch": 1.8782608695652174,
      "grad_norm": 1.0276108610586765,
      "learning_rate": 7.376141979066644e-06,
      "loss": 0.5226,
      "step": 1080
    },
    {
      "epoch": 1.8956521739130436,
      "grad_norm": 1.0538696180373945,
      "learning_rate": 7.325378602983371e-06,
      "loss": 0.4997,
      "step": 1090
    },
    {
      "epoch": 1.9130434782608696,
      "grad_norm": 1.0941944221450914,
      "learning_rate": 7.274307514336854e-06,
      "loss": 0.5069,
      "step": 1100
    },
    {
      "epoch": 1.9304347826086956,
      "grad_norm": 1.0475459053099843,
      "learning_rate": 7.222935471259113e-06,
      "loss": 0.4516,
      "step": 1110
    },
    {
      "epoch": 1.9478260869565216,
      "grad_norm": 0.9438781712939313,
      "learning_rate": 7.171269271706847e-06,
      "loss": 0.4771,
      "step": 1120
    },
    {
      "epoch": 1.9652173913043478,
      "grad_norm": 1.26652793242089,
      "learning_rate": 7.119315752561879e-06,
      "loss": 0.4628,
      "step": 1130
    },
    {
      "epoch": 1.982608695652174,
      "grad_norm": 1.0230830545480807,
      "learning_rate": 7.0670817887264375e-06,
      "loss": 0.4961,
      "step": 1140
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.3043673360211918,
      "learning_rate": 7.014574292213423e-06,
      "loss": 0.4854,
      "step": 1150
    },
    {
      "epoch": 2.017391304347826,
      "grad_norm": 1.2839938386528644,
      "learning_rate": 6.9618002112317525e-06,
      "loss": 0.4159,
      "step": 1160
    },
    {
      "epoch": 2.034782608695652,
      "grad_norm": 1.485302697617233,
      "learning_rate": 6.908766529266915e-06,
      "loss": 0.4513,
      "step": 1170
    },
    {
      "epoch": 2.0521739130434784,
      "grad_norm": 1.3513626513686101,
      "learning_rate": 6.855480264156864e-06,
      "loss": 0.4147,
      "step": 1180
    },
    {
      "epoch": 2.0695652173913044,
      "grad_norm": 1.1971012611734106,
      "learning_rate": 6.8019484671633586e-06,
      "loss": 0.4279,
      "step": 1190
    },
    {
      "epoch": 2.0869565217391304,
      "grad_norm": 1.1493410270586013,
      "learning_rate": 6.748178222038891e-06,
      "loss": 0.4121,
      "step": 1200
    },
    {
      "epoch": 2.1043478260869564,
      "grad_norm": 1.2061559921790461,
      "learning_rate": 6.694176644089297e-06,
      "loss": 0.4302,
      "step": 1210
    },
    {
      "epoch": 2.121739130434783,
      "grad_norm": 1.2134461208204077,
      "learning_rate": 6.639950879232218e-06,
      "loss": 0.4143,
      "step": 1220
    },
    {
      "epoch": 2.139130434782609,
      "grad_norm": 1.1936990104884397,
      "learning_rate": 6.585508103051478e-06,
      "loss": 0.411,
      "step": 1230
    },
    {
      "epoch": 2.1565217391304348,
      "grad_norm": 1.473818496639146,
      "learning_rate": 6.530855519847568e-06,
      "loss": 0.4247,
      "step": 1240
    },
    {
      "epoch": 2.1739130434782608,
      "grad_norm": 1.2273898491461919,
      "learning_rate": 6.4760003616843106e-06,
      "loss": 0.4,
      "step": 1250
    },
    {
      "epoch": 2.1913043478260867,
      "grad_norm": 1.193058454958422,
      "learning_rate": 6.420949887431855e-06,
      "loss": 0.4016,
      "step": 1260
    },
    {
      "epoch": 2.208695652173913,
      "grad_norm": 1.3961066733797438,
      "learning_rate": 6.3657113818061225e-06,
      "loss": 0.4408,
      "step": 1270
    },
    {
      "epoch": 2.226086956521739,
      "grad_norm": 1.292360770899796,
      "learning_rate": 6.310292154404844e-06,
      "loss": 0.3999,
      "step": 1280
    },
    {
      "epoch": 2.243478260869565,
      "grad_norm": 3.4982840024869417,
      "learning_rate": 6.254699538740292e-06,
      "loss": 0.4329,
      "step": 1290
    },
    {
      "epoch": 2.260869565217391,
      "grad_norm": 2.265019151887774,
      "learning_rate": 6.198940891268844e-06,
      "loss": 0.4282,
      "step": 1300
    },
    {
      "epoch": 2.2782608695652176,
      "grad_norm": 1.1398620538733453,
      "learning_rate": 6.14302359041753e-06,
      "loss": 0.4222,
      "step": 1310
    },
    {
      "epoch": 2.2956521739130435,
      "grad_norm": 1.523968350586471,
      "learning_rate": 6.086955035607655e-06,
      "loss": 0.4317,
      "step": 1320
    },
    {
      "epoch": 2.3130434782608695,
      "grad_norm": 2.2066066593878197,
      "learning_rate": 6.030742646275647e-06,
      "loss": 0.4212,
      "step": 1330
    },
    {
      "epoch": 2.3304347826086955,
      "grad_norm": 1.2410016732543188,
      "learning_rate": 5.9743938608912626e-06,
      "loss": 0.4334,
      "step": 1340
    },
    {
      "epoch": 2.3478260869565215,
      "grad_norm": 1.210595936193342,
      "learning_rate": 5.917916135973263e-06,
      "loss": 0.4153,
      "step": 1350
    },
    {
      "epoch": 2.365217391304348,
      "grad_norm": 1.4742810962432356,
      "learning_rate": 5.861316945102717e-06,
      "loss": 0.4103,
      "step": 1360
    },
    {
      "epoch": 2.382608695652174,
      "grad_norm": 1.1807404650454025,
      "learning_rate": 5.804603777934032e-06,
      "loss": 0.4104,
      "step": 1370
    },
    {
      "epoch": 2.4,
      "grad_norm": 1.32289460697371,
      "learning_rate": 5.74778413920386e-06,
      "loss": 0.4245,
      "step": 1380
    },
    {
      "epoch": 2.417391304347826,
      "grad_norm": 1.4542008533192958,
      "learning_rate": 5.690865547738021e-06,
      "loss": 0.4412,
      "step": 1390
    },
    {
      "epoch": 2.4347826086956523,
      "grad_norm": 1.2739934708683387,
      "learning_rate": 5.6338555354565445e-06,
      "loss": 0.4281,
      "step": 1400
    },
    {
      "epoch": 2.4521739130434783,
      "grad_norm": 1.563662237308135,
      "learning_rate": 5.576761646376987e-06,
      "loss": 0.4359,
      "step": 1410
    },
    {
      "epoch": 2.4695652173913043,
      "grad_norm": 1.225022509026475,
      "learning_rate": 5.519591435616153e-06,
      "loss": 0.4377,
      "step": 1420
    },
    {
      "epoch": 2.4869565217391303,
      "grad_norm": 1.1686568195887326,
      "learning_rate": 5.462352468390333e-06,
      "loss": 0.4229,
      "step": 1430
    },
    {
      "epoch": 2.5043478260869563,
      "grad_norm": 1.2575917217265098,
      "learning_rate": 5.405052319014223e-06,
      "loss": 0.4225,
      "step": 1440
    },
    {
      "epoch": 2.5217391304347827,
      "grad_norm": 1.2880775613395745,
      "learning_rate": 5.347698569898624e-06,
      "loss": 0.4245,
      "step": 1450
    },
    {
      "epoch": 2.5391304347826087,
      "grad_norm": 1.3156168488856514,
      "learning_rate": 5.290298810547083e-06,
      "loss": 0.3993,
      "step": 1460
    },
    {
      "epoch": 2.5565217391304347,
      "grad_norm": 1.1736379125037195,
      "learning_rate": 5.232860636551583e-06,
      "loss": 0.4324,
      "step": 1470
    },
    {
      "epoch": 2.573913043478261,
      "grad_norm": 1.2353830078230719,
      "learning_rate": 5.175391648587443e-06,
      "loss": 0.4185,
      "step": 1480
    },
    {
      "epoch": 2.591304347826087,
      "grad_norm": 1.1458194395748447,
      "learning_rate": 5.117899451407526e-06,
      "loss": 0.4309,
      "step": 1490
    },
    {
      "epoch": 2.608695652173913,
      "grad_norm": 1.2538290106691092,
      "learning_rate": 5.060391652835925e-06,
      "loss": 0.4292,
      "step": 1500
    },
    {
      "epoch": 2.626086956521739,
      "grad_norm": 1.571430145629218,
      "learning_rate": 5.002875862761234e-06,
      "loss": 0.4268,
      "step": 1510
    },
    {
      "epoch": 2.643478260869565,
      "grad_norm": 1.1840870452769912,
      "learning_rate": 4.9453596921295435e-06,
      "loss": 0.4319,
      "step": 1520
    },
    {
      "epoch": 2.660869565217391,
      "grad_norm": 1.0371855886325287,
      "learning_rate": 4.8878507519373055e-06,
      "loss": 0.4378,
      "step": 1530
    },
    {
      "epoch": 2.6782608695652175,
      "grad_norm": 1.0983294473272651,
      "learning_rate": 4.830356652224181e-06,
      "loss": 0.4058,
      "step": 1540
    },
    {
      "epoch": 2.6956521739130435,
      "grad_norm": 1.127081289246426,
      "learning_rate": 4.77288500106602e-06,
      "loss": 0.4165,
      "step": 1550
    },
    {
      "epoch": 2.7130434782608694,
      "grad_norm": 1.236794619724129,
      "learning_rate": 4.715443403568103e-06,
      "loss": 0.4148,
      "step": 1560
    },
    {
      "epoch": 2.730434782608696,
      "grad_norm": 1.5345488197083625,
      "learning_rate": 4.65803946085877e-06,
      "loss": 0.4088,
      "step": 1570
    },
    {
      "epoch": 2.747826086956522,
      "grad_norm": 1.3004765992472387,
      "learning_rate": 4.600680769083585e-06,
      "loss": 0.4359,
      "step": 1580
    },
    {
      "epoch": 2.765217391304348,
      "grad_norm": 1.2501644106321015,
      "learning_rate": 4.543374918400142e-06,
      "loss": 0.4386,
      "step": 1590
    },
    {
      "epoch": 2.782608695652174,
      "grad_norm": 1.3823645221856542,
      "learning_rate": 4.486129491973687e-06,
      "loss": 0.4517,
      "step": 1600
    },
    {
      "epoch": 2.8,
      "grad_norm": 1.2761252508293937,
      "learning_rate": 4.4289520649736475e-06,
      "loss": 0.4283,
      "step": 1610
    },
    {
      "epoch": 2.8173913043478263,
      "grad_norm": 1.1632722443857555,
      "learning_rate": 4.371850203571225e-06,
      "loss": 0.421,
      "step": 1620
    },
    {
      "epoch": 2.8347826086956522,
      "grad_norm": 1.2193532972111443,
      "learning_rate": 4.314831463938184e-06,
      "loss": 0.4188,
      "step": 1630
    },
    {
      "epoch": 2.8521739130434782,
      "grad_norm": 1.4424575510136903,
      "learning_rate": 4.257903391246954e-06,
      "loss": 0.4094,
      "step": 1640
    },
    {
      "epoch": 2.869565217391304,
      "grad_norm": 1.095166597880803,
      "learning_rate": 4.201073518672195e-06,
      "loss": 0.4411,
      "step": 1650
    },
    {
      "epoch": 2.8869565217391306,
      "grad_norm": 1.2662835313537542,
      "learning_rate": 4.144349366393949e-06,
      "loss": 0.4423,
      "step": 1660
    },
    {
      "epoch": 2.9043478260869566,
      "grad_norm": 1.1922956806962224,
      "learning_rate": 4.08773844060251e-06,
      "loss": 0.4254,
      "step": 1670
    },
    {
      "epoch": 2.9217391304347826,
      "grad_norm": 1.257711038107532,
      "learning_rate": 4.031248232505139e-06,
      "loss": 0.4358,
      "step": 1680
    },
    {
      "epoch": 2.9391304347826086,
      "grad_norm": 1.6971639546827986,
      "learning_rate": 3.97488621733478e-06,
      "loss": 0.4084,
      "step": 1690
    },
    {
      "epoch": 2.9565217391304346,
      "grad_norm": 1.349596141789289,
      "learning_rate": 3.918659853360864e-06,
      "loss": 0.4274,
      "step": 1700
    },
    {
      "epoch": 2.973913043478261,
      "grad_norm": 1.3678548460193103,
      "learning_rate": 3.862576580902383e-06,
      "loss": 0.4187,
      "step": 1710
    },
    {
      "epoch": 2.991304347826087,
      "grad_norm": 1.2860411793928146,
      "learning_rate": 3.8066438213433234e-06,
      "loss": 0.4303,
      "step": 1720
    },
    {
      "epoch": 3.008695652173913,
      "grad_norm": 1.5134066645346198,
      "learning_rate": 3.7508689761506055e-06,
      "loss": 0.3752,
      "step": 1730
    },
    {
      "epoch": 3.026086956521739,
      "grad_norm": 1.424990341239896,
      "learning_rate": 3.6952594258946693e-06,
      "loss": 0.3434,
      "step": 1740
    },
    {
      "epoch": 3.0434782608695654,
      "grad_norm": 1.6649951275575265,
      "learning_rate": 3.6398225292728185e-06,
      "loss": 0.3607,
      "step": 1750
    },
    {
      "epoch": 3.0608695652173914,
      "grad_norm": 1.8127043018682598,
      "learning_rate": 3.584565622135453e-06,
      "loss": 0.3623,
      "step": 1760
    },
    {
      "epoch": 3.0782608695652174,
      "grad_norm": 1.3226724796309823,
      "learning_rate": 3.5294960165153363e-06,
      "loss": 0.3579,
      "step": 1770
    },
    {
      "epoch": 3.0956521739130434,
      "grad_norm": 1.4747389958766943,
      "learning_rate": 3.474620999660007e-06,
      "loss": 0.3973,
      "step": 1780
    },
    {
      "epoch": 3.1130434782608694,
      "grad_norm": 1.9045203788068308,
      "learning_rate": 3.4199478330674745e-06,
      "loss": 0.3579,
      "step": 1790
    },
    {
      "epoch": 3.130434782608696,
      "grad_norm": 1.2899295921725031,
      "learning_rate": 3.365483751525317e-06,
      "loss": 0.3634,
      "step": 1800
    },
    {
      "epoch": 3.1478260869565218,
      "grad_norm": 1.4141207800877191,
      "learning_rate": 3.3112359621533193e-06,
      "loss": 0.3427,
      "step": 1810
    },
    {
      "epoch": 3.1652173913043478,
      "grad_norm": 1.4287974659319675,
      "learning_rate": 3.257211643449768e-06,
      "loss": 0.3425,
      "step": 1820
    },
    {
      "epoch": 3.1826086956521737,
      "grad_norm": 1.3872599954983553,
      "learning_rate": 3.203417944341536e-06,
      "loss": 0.3458,
      "step": 1830
    },
    {
      "epoch": 3.2,
      "grad_norm": 2.2293598598474245,
      "learning_rate": 3.149861983238082e-06,
      "loss": 0.3689,
      "step": 1840
    },
    {
      "epoch": 3.217391304347826,
      "grad_norm": 2.0325771541464217,
      "learning_rate": 3.0965508470894812e-06,
      "loss": 0.3521,
      "step": 1850
    },
    {
      "epoch": 3.234782608695652,
      "grad_norm": 1.4460102517208502,
      "learning_rate": 3.0434915904486284e-06,
      "loss": 0.358,
      "step": 1860
    },
    {
      "epoch": 3.252173913043478,
      "grad_norm": 1.3048211800924494,
      "learning_rate": 2.990691234537721e-06,
      "loss": 0.3405,
      "step": 1870
    },
    {
      "epoch": 3.269565217391304,
      "grad_norm": 1.3241574204938475,
      "learning_rate": 2.938156766319156e-06,
      "loss": 0.384,
      "step": 1880
    },
    {
      "epoch": 3.2869565217391306,
      "grad_norm": 1.6150241974779345,
      "learning_rate": 2.885895137570958e-06,
      "loss": 0.3621,
      "step": 1890
    },
    {
      "epoch": 3.3043478260869565,
      "grad_norm": 1.2579063705669289,
      "learning_rate": 2.83391326396687e-06,
      "loss": 0.3884,
      "step": 1900
    },
    {
      "epoch": 3.3217391304347825,
      "grad_norm": 1.3801202825398762,
      "learning_rate": 2.7822180241612077e-06,
      "loss": 0.3798,
      "step": 1910
    },
    {
      "epoch": 3.3391304347826085,
      "grad_norm": 1.396049263670027,
      "learning_rate": 2.7308162588786303e-06,
      "loss": 0.3605,
      "step": 1920
    },
    {
      "epoch": 3.356521739130435,
      "grad_norm": 1.3748023066384665,
      "learning_rate": 2.6797147700089167e-06,
      "loss": 0.3579,
      "step": 1930
    },
    {
      "epoch": 3.373913043478261,
      "grad_norm": 1.5175283161299356,
      "learning_rate": 2.6289203197068834e-06,
      "loss": 0.3784,
      "step": 1940
    },
    {
      "epoch": 3.391304347826087,
      "grad_norm": 1.5851194282172685,
      "learning_rate": 2.5784396294975677e-06,
      "loss": 0.3606,
      "step": 1950
    },
    {
      "epoch": 3.408695652173913,
      "grad_norm": 1.349779292865959,
      "learning_rate": 2.528279379386783e-06,
      "loss": 0.3694,
      "step": 1960
    },
    {
      "epoch": 3.426086956521739,
      "grad_norm": 1.5339986272397241,
      "learning_rate": 2.478446206977159e-06,
      "loss": 0.3771,
      "step": 1970
    },
    {
      "epoch": 3.4434782608695653,
      "grad_norm": 1.430106842741753,
      "learning_rate": 2.4289467065898085e-06,
      "loss": 0.369,
      "step": 1980
    },
    {
      "epoch": 3.4608695652173913,
      "grad_norm": 1.457323252888418,
      "learning_rate": 2.3797874283917127e-06,
      "loss": 0.3403,
      "step": 1990
    },
    {
      "epoch": 3.4782608695652173,
      "grad_norm": 1.4282143674816088,
      "learning_rate": 2.3309748775289497e-06,
      "loss": 0.3845,
      "step": 2000
    },
    {
      "epoch": 3.4956521739130437,
      "grad_norm": 1.3173625440079775,
      "learning_rate": 2.282515513265885e-06,
      "loss": 0.3695,
      "step": 2010
    },
    {
      "epoch": 3.5130434782608697,
      "grad_norm": 1.5391795918668048,
      "learning_rate": 2.2344157481304267e-06,
      "loss": 0.384,
      "step": 2020
    },
    {
      "epoch": 3.5304347826086957,
      "grad_norm": 1.5462876476217569,
      "learning_rate": 2.1866819470654727e-06,
      "loss": 0.362,
      "step": 2030
    },
    {
      "epoch": 3.5478260869565217,
      "grad_norm": 1.5819871351937547,
      "learning_rate": 2.1393204265866467e-06,
      "loss": 0.3788,
      "step": 2040
    },
    {
      "epoch": 3.5652173913043477,
      "grad_norm": 1.6521343485618902,
      "learning_rate": 2.09233745394645e-06,
      "loss": 0.3785,
      "step": 2050
    },
    {
      "epoch": 3.5826086956521737,
      "grad_norm": 1.5040618975765006,
      "learning_rate": 2.0457392463049285e-06,
      "loss": 0.359,
      "step": 2060
    },
    {
      "epoch": 3.6,
      "grad_norm": 1.6074930743024214,
      "learning_rate": 1.9995319699069664e-06,
      "loss": 0.3652,
      "step": 2070
    },
    {
      "epoch": 3.617391304347826,
      "grad_norm": 1.5631144357601194,
      "learning_rate": 1.9537217392663218e-06,
      "loss": 0.3626,
      "step": 2080
    },
    {
      "epoch": 3.634782608695652,
      "grad_norm": 1.5044302087614625,
      "learning_rate": 1.908314616356505e-06,
      "loss": 0.388,
      "step": 2090
    },
    {
      "epoch": 3.6521739130434785,
      "grad_norm": 1.3802858769348085,
      "learning_rate": 1.8633166098086103e-06,
      "loss": 0.3629,
      "step": 2100
    },
    {
      "epoch": 3.6695652173913045,
      "grad_norm": 1.4222636474825245,
      "learning_rate": 1.818733674116207e-06,
      "loss": 0.3651,
      "step": 2110
    },
    {
      "epoch": 3.6869565217391305,
      "grad_norm": 1.5026251469181622,
      "learning_rate": 1.7745717088473895e-06,
      "loss": 0.3772,
      "step": 2120
    },
    {
      "epoch": 3.7043478260869565,
      "grad_norm": 1.4848721046911413,
      "learning_rate": 1.7308365578641089e-06,
      "loss": 0.3522,
      "step": 2130
    },
    {
      "epoch": 3.7217391304347824,
      "grad_norm": 1.6975508268605253,
      "learning_rate": 1.687534008548854e-06,
      "loss": 0.3775,
      "step": 2140
    },
    {
      "epoch": 3.7391304347826084,
      "grad_norm": 1.7810729043290516,
      "learning_rate": 1.6446697910388294e-06,
      "loss": 0.3647,
      "step": 2150
    },
    {
      "epoch": 3.756521739130435,
      "grad_norm": 1.4737138340480456,
      "learning_rate": 1.6022495774676916e-06,
      "loss": 0.3723,
      "step": 2160
    },
    {
      "epoch": 3.773913043478261,
      "grad_norm": 1.5559891939482013,
      "learning_rate": 1.5602789812149727e-06,
      "loss": 0.3503,
      "step": 2170
    },
    {
      "epoch": 3.791304347826087,
      "grad_norm": 1.663018576508128,
      "learning_rate": 1.5187635561632685e-06,
      "loss": 0.3561,
      "step": 2180
    },
    {
      "epoch": 3.8086956521739133,
      "grad_norm": 2.634784063202797,
      "learning_rate": 1.477708795963308e-06,
      "loss": 0.3406,
      "step": 2190
    },
    {
      "epoch": 3.8260869565217392,
      "grad_norm": 1.5139630905983323,
      "learning_rate": 1.4371201333069868e-06,
      "loss": 0.3681,
      "step": 2200
    },
    {
      "epoch": 3.8434782608695652,
      "grad_norm": 2.3176470378092424,
      "learning_rate": 1.3970029392084771e-06,
      "loss": 0.3541,
      "step": 2210
    },
    {
      "epoch": 3.860869565217391,
      "grad_norm": 1.5959870558495743,
      "learning_rate": 1.3573625222934829e-06,
      "loss": 0.3675,
      "step": 2220
    },
    {
      "epoch": 3.878260869565217,
      "grad_norm": 1.6337373914467612,
      "learning_rate": 1.3182041280967656e-06,
      "loss": 0.3514,
      "step": 2230
    },
    {
      "epoch": 3.8956521739130436,
      "grad_norm": 1.4395314046176468,
      "learning_rate": 1.2795329383680138e-06,
      "loss": 0.3566,
      "step": 2240
    },
    {
      "epoch": 3.9130434782608696,
      "grad_norm": 1.3199487970653823,
      "learning_rate": 1.241354070386151e-06,
      "loss": 0.3441,
      "step": 2250
    },
    {
      "epoch": 3.9304347826086956,
      "grad_norm": 1.7513753951221092,
      "learning_rate": 1.2036725762821783e-06,
      "loss": 0.3628,
      "step": 2260
    },
    {
      "epoch": 3.9478260869565216,
      "grad_norm": 1.5442153725537786,
      "learning_rate": 1.1664934423706348e-06,
      "loss": 0.3497,
      "step": 2270
    },
    {
      "epoch": 3.965217391304348,
      "grad_norm": 1.663632062954429,
      "learning_rate": 1.12982158848977e-06,
      "loss": 0.3576,
      "step": 2280
    },
    {
      "epoch": 3.982608695652174,
      "grad_norm": 1.8278674162694661,
      "learning_rate": 1.0936618673505112e-06,
      "loss": 0.3744,
      "step": 2290
    },
    {
      "epoch": 4.0,
      "grad_norm": 1.4687537010366873,
      "learning_rate": 1.0580190638943138e-06,
      "loss": 0.3549,
      "step": 2300
    },
    {
      "epoch": 4.017391304347826,
      "grad_norm": 1.6716521197547283,
      "learning_rate": 1.022897894659981e-06,
      "loss": 0.3487,
      "step": 2310
    },
    {
      "epoch": 4.034782608695652,
      "grad_norm": 1.4645341272970525,
      "learning_rate": 9.883030071595335e-07,
      "loss": 0.3158,
      "step": 2320
    },
    {
      "epoch": 4.052173913043478,
      "grad_norm": 1.6557398280558575,
      "learning_rate": 9.542389792632112e-07,
      "loss": 0.331,
      "step": 2330
    },
    {
      "epoch": 4.069565217391304,
      "grad_norm": 1.5960738974574165,
      "learning_rate": 9.20710318593701e-07,
      "loss": 0.3265,
      "step": 2340
    },
    {
      "epoch": 4.086956521739131,
      "grad_norm": 1.4988502007449698,
      "learning_rate": 8.877214619296421e-07,
      "loss": 0.336,
      "step": 2350
    },
    {
      "epoch": 4.104347826086957,
      "grad_norm": 1.4424789164433198,
      "learning_rate": 8.552767746185215e-07,
      "loss": 0.3347,
      "step": 2360
    },
    {
      "epoch": 4.121739130434783,
      "grad_norm": 2.2719026550199475,
      "learning_rate": 8.233805499990166e-07,
      "loss": 0.3305,
      "step": 2370
    },
    {
      "epoch": 4.139130434782609,
      "grad_norm": 1.4765476496178676,
      "learning_rate": 7.920370088328672e-07,
      "loss": 0.3484,
      "step": 2380
    },
    {
      "epoch": 4.156521739130435,
      "grad_norm": 1.7030261009934748,
      "learning_rate": 7.612502987463477e-07,
      "loss": 0.3163,
      "step": 2390
    },
    {
      "epoch": 4.173913043478261,
      "grad_norm": 1.4010667448317558,
      "learning_rate": 7.310244936814232e-07,
      "loss": 0.3222,
      "step": 2400
    },
    {
      "epoch": 4.191304347826087,
      "grad_norm": 1.381057060755463,
      "learning_rate": 7.013635933566515e-07,
      "loss": 0.3175,
      "step": 2410
    },
    {
      "epoch": 4.208695652173913,
      "grad_norm": 1.6489840526877189,
      "learning_rate": 6.72271522737909e-07,
      "loss": 0.3027,
      "step": 2420
    },
    {
      "epoch": 4.226086956521739,
      "grad_norm": 1.4490054494783193,
      "learning_rate": 6.437521315190087e-07,
      "loss": 0.3529,
      "step": 2430
    },
    {
      "epoch": 4.243478260869566,
      "grad_norm": 1.2850499028342035,
      "learning_rate": 6.158091936122773e-07,
      "loss": 0.3285,
      "step": 2440
    },
    {
      "epoch": 4.260869565217392,
      "grad_norm": 1.2958305173488132,
      "learning_rate": 5.884464066491613e-07,
      "loss": 0.3128,
      "step": 2450
    },
    {
      "epoch": 4.278260869565218,
      "grad_norm": 1.716194835927619,
      "learning_rate": 5.616673914909282e-07,
      "loss": 0.3377,
      "step": 2460
    },
    {
      "epoch": 4.2956521739130435,
      "grad_norm": 1.5456459782523917,
      "learning_rate": 5.354756917495224e-07,
      "loss": 0.359,
      "step": 2470
    },
    {
      "epoch": 4.3130434782608695,
      "grad_norm": 1.7049849318119408,
      "learning_rate": 5.098747733186498e-07,
      "loss": 0.3194,
      "step": 2480
    },
    {
      "epoch": 4.3304347826086955,
      "grad_norm": 1.4376855175332917,
      "learning_rate": 4.84868023915141e-07,
      "loss": 0.3296,
      "step": 2490
    },
    {
      "epoch": 4.3478260869565215,
      "grad_norm": 1.5575044356910057,
      "learning_rate": 4.6045875263066474e-07,
      "loss": 0.3347,
      "step": 2500
    },
    {
      "epoch": 4.3652173913043475,
      "grad_norm": 1.6696231773916848,
      "learning_rate": 4.366501894938363e-07,
      "loss": 0.3509,
      "step": 2510
    },
    {
      "epoch": 4.3826086956521735,
      "grad_norm": 1.4793874890497716,
      "learning_rate": 4.1344548504280213e-07,
      "loss": 0.316,
      "step": 2520
    },
    {
      "epoch": 4.4,
      "grad_norm": 1.7504184745527158,
      "learning_rate": 3.908477099083291e-07,
      "loss": 0.3368,
      "step": 2530
    },
    {
      "epoch": 4.417391304347826,
      "grad_norm": 1.5433746257360492,
      "learning_rate": 3.6885985440747895e-07,
      "loss": 0.3147,
      "step": 2540
    },
    {
      "epoch": 4.434782608695652,
      "grad_norm": 1.6205451434238438,
      "learning_rate": 3.474848281479032e-07,
      "loss": 0.3198,
      "step": 2550
    },
    {
      "epoch": 4.452173913043478,
      "grad_norm": 1.7748707264655563,
      "learning_rate": 3.2672545964282263e-07,
      "loss": 0.3337,
      "step": 2560
    },
    {
      "epoch": 4.469565217391304,
      "grad_norm": 2.034670741566414,
      "learning_rate": 3.0658449593673256e-07,
      "loss": 0.2871,
      "step": 2570
    },
    {
      "epoch": 4.48695652173913,
      "grad_norm": 1.9535751002955573,
      "learning_rate": 2.8706460224189656e-07,
      "loss": 0.3106,
      "step": 2580
    },
    {
      "epoch": 4.504347826086956,
      "grad_norm": 1.4860584696429842,
      "learning_rate": 2.6816836158566117e-07,
      "loss": 0.3304,
      "step": 2590
    },
    {
      "epoch": 4.521739130434782,
      "grad_norm": 1.5646385009356472,
      "learning_rate": 2.498982744686501e-07,
      "loss": 0.328,
      "step": 2600
    },
    {
      "epoch": 4.539130434782608,
      "grad_norm": 1.6223923163321592,
      "learning_rate": 2.3225675853387974e-07,
      "loss": 0.3099,
      "step": 2610
    },
    {
      "epoch": 4.556521739130435,
      "grad_norm": 1.4069397141672941,
      "learning_rate": 2.1524614824683377e-07,
      "loss": 0.3318,
      "step": 2620
    },
    {
      "epoch": 4.573913043478261,
      "grad_norm": 1.5612850701477952,
      "learning_rate": 1.98868694586552e-07,
      "loss": 0.3376,
      "step": 2630
    },
    {
      "epoch": 4.591304347826087,
      "grad_norm": 1.6586621274603865,
      "learning_rate": 1.8312656474776093e-07,
      "loss": 0.3034,
      "step": 2640
    },
    {
      "epoch": 4.608695652173913,
      "grad_norm": 1.6277894314324821,
      "learning_rate": 1.6802184185409355e-07,
      "loss": 0.334,
      "step": 2650
    },
    {
      "epoch": 4.626086956521739,
      "grad_norm": 1.5367253210879437,
      "learning_rate": 1.5355652468243332e-07,
      "loss": 0.3368,
      "step": 2660
    },
    {
      "epoch": 4.643478260869565,
      "grad_norm": 1.42580964043298,
      "learning_rate": 1.3973252739842236e-07,
      "loss": 0.3107,
      "step": 2670
    },
    {
      "epoch": 4.660869565217391,
      "grad_norm": 1.5836881272840495,
      "learning_rate": 1.2655167930316236e-07,
      "loss": 0.3337,
      "step": 2680
    },
    {
      "epoch": 4.678260869565217,
      "grad_norm": 1.3933170878068741,
      "learning_rate": 1.1401572459114441e-07,
      "loss": 0.2988,
      "step": 2690
    },
    {
      "epoch": 4.695652173913043,
      "grad_norm": 1.6629891174849243,
      "learning_rate": 1.0212632211944906e-07,
      "loss": 0.3038,
      "step": 2700
    },
    {
      "epoch": 4.71304347826087,
      "grad_norm": 1.5180536343786482,
      "learning_rate": 9.088504518822817e-08,
      "loss": 0.2919,
      "step": 2710
    },
    {
      "epoch": 4.730434782608696,
      "grad_norm": 1.4774316548557718,
      "learning_rate": 8.029338133251518e-08,
      "loss": 0.3253,
      "step": 2720
    },
    {
      "epoch": 4.747826086956522,
      "grad_norm": 1.4916465022440153,
      "learning_rate": 7.035273212538274e-08,
      "loss": 0.3346,
      "step": 2730
    },
    {
      "epoch": 4.765217391304348,
      "grad_norm": 1.6408617577143456,
      "learning_rate": 6.10644129924759e-08,
      "loss": 0.2921,
      "step": 2740
    },
    {
      "epoch": 4.782608695652174,
      "grad_norm": 1.5935791526940084,
      "learning_rate": 5.242965303794312e-08,
      "loss": 0.3298,
      "step": 2750
    },
    {
      "epoch": 4.8,
      "grad_norm": 1.3615616479228225,
      "learning_rate": 4.444959488179301e-08,
      "loss": 0.3084,
      "step": 2760
    },
    {
      "epoch": 4.817391304347826,
      "grad_norm": 1.3883499339712515,
      "learning_rate": 3.7125294508693066e-08,
      "loss": 0.3117,
      "step": 2770
    },
    {
      "epoch": 4.834782608695652,
      "grad_norm": 1.483718903196712,
      "learning_rate": 3.045772112823253e-08,
      "loss": 0.3259,
      "step": 2780
    },
    {
      "epoch": 4.852173913043478,
      "grad_norm": 1.5939760050066456,
      "learning_rate": 2.4447757046670017e-08,
      "loss": 0.3079,
      "step": 2790
    },
    {
      "epoch": 4.869565217391305,
      "grad_norm": 1.5266765004604943,
      "learning_rate": 1.9096197550179664e-08,
      "loss": 0.3214,
      "step": 2800
    },
    {
      "epoch": 4.886956521739131,
      "grad_norm": 1.7078593350970195,
      "learning_rate": 1.4403750799613092e-08,
      "loss": 0.3419,
      "step": 2810
    },
    {
      "epoch": 4.904347826086957,
      "grad_norm": 1.595487144492393,
      "learning_rate": 1.0371037736787714e-08,
      "loss": 0.3224,
      "step": 2820
    },
    {
      "epoch": 4.921739130434783,
      "grad_norm": 1.470874925227166,
      "learning_rate": 6.998592002321336e-09,
      "loss": 0.3303,
      "step": 2830
    },
    {
      "epoch": 4.939130434782609,
      "grad_norm": 1.6419169053737916,
      "learning_rate": 4.286859865014204e-09,
      "loss": 0.3237,
      "step": 2840
    },
    {
      "epoch": 4.956521739130435,
      "grad_norm": 1.7910736800765954,
      "learning_rate": 2.236200162798463e-09,
      "loss": 0.2943,
      "step": 2850
    },
    {
      "epoch": 4.973913043478261,
      "grad_norm": 1.4483382289129645,
      "learning_rate": 8.468842552505907e-10,
      "loss": 0.3202,
      "step": 2860
    },
    {
      "epoch": 4.9913043478260875,
      "grad_norm": 1.4884875727469478,
      "learning_rate": 1.1909598768400366e-10,
      "loss": 0.3389,
      "step": 2870
    },
    {
      "epoch": 5.0,
      "step": 2875,
      "total_flos": 33447808868352.0,
      "train_loss": 0.42618958398570184,
      "train_runtime": 1921.5134,
      "train_samples_per_second": 11.957,
      "train_steps_per_second": 1.496
    }
  ],
  "logging_steps": 10,
  "max_steps": 2875,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 10000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 33447808868352.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}