{
  "best_metric": 1.6915712356567383,
  "best_model_checkpoint": "./health_analysis_results/checkpoint-8752",
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 10940,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004570383912248629,
      "grad_norm": 86932.7734375,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 2.6599,
      "step": 10
    },
    {
      "epoch": 0.009140767824497258,
      "grad_norm": 41141.640625,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 2.6196,
      "step": 20
    },
    {
      "epoch": 0.013711151736745886,
      "grad_norm": 64399.953125,
      "learning_rate": 3e-06,
      "loss": 2.6801,
      "step": 30
    },
    {
      "epoch": 0.018281535648994516,
      "grad_norm": 47369.9140625,
      "learning_rate": 4.000000000000001e-06,
      "loss": 2.5896,
      "step": 40
    },
    {
      "epoch": 0.022851919561243144,
      "grad_norm": 42862.3359375,
      "learning_rate": 5e-06,
      "loss": 2.6417,
      "step": 50
    },
    {
      "epoch": 0.027422303473491772,
      "grad_norm": 43046.7734375,
      "learning_rate": 6e-06,
      "loss": 2.5624,
      "step": 60
    },
    {
      "epoch": 0.031992687385740404,
      "grad_norm": 57193.46875,
      "learning_rate": 7.000000000000001e-06,
      "loss": 2.7588,
      "step": 70
    },
    {
      "epoch": 0.03656307129798903,
      "grad_norm": 47019.44921875,
      "learning_rate": 8.000000000000001e-06,
      "loss": 2.7033,
      "step": 80
    },
    {
      "epoch": 0.04113345521023766,
      "grad_norm": 96947.703125,
      "learning_rate": 9e-06,
      "loss": 2.9225,
      "step": 90
    },
    {
      "epoch": 0.04570383912248629,
      "grad_norm": 53870.5234375,
      "learning_rate": 1e-05,
      "loss": 2.65,
      "step": 100
    },
    {
      "epoch": 0.050274223034734916,
      "grad_norm": 57196.71875,
      "learning_rate": 1.1000000000000001e-05,
      "loss": 2.726,
      "step": 110
    },
    {
      "epoch": 0.054844606946983544,
      "grad_norm": 102628.6015625,
      "learning_rate": 1.2e-05,
      "loss": 2.5591,
      "step": 120
    },
    {
      "epoch": 0.05941499085923217,
      "grad_norm": 59579.734375,
      "learning_rate": 1.3000000000000001e-05,
      "loss": 2.7329,
      "step": 130
    },
    {
      "epoch": 0.06398537477148081,
      "grad_norm": 63045.7890625,
      "learning_rate": 1.4000000000000001e-05,
      "loss": 2.7527,
      "step": 140
    },
    {
      "epoch": 0.06855575868372943,
      "grad_norm": 55244.3828125,
      "learning_rate": 1.5e-05,
      "loss": 2.8104,
      "step": 150
    },
    {
      "epoch": 0.07312614259597806,
      "grad_norm": 101346.0078125,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 2.6889,
      "step": 160
    },
    {
      "epoch": 0.07769652650822668,
      "grad_norm": 47467.60546875,
      "learning_rate": 1.7000000000000003e-05,
      "loss": 2.646,
      "step": 170
    },
    {
      "epoch": 0.08226691042047532,
      "grad_norm": 92061.4765625,
      "learning_rate": 1.8e-05,
      "loss": 2.6756,
      "step": 180
    },
    {
      "epoch": 0.08683729433272395,
      "grad_norm": 52667.0078125,
      "learning_rate": 1.9e-05,
      "loss": 2.5845,
      "step": 190
    },
    {
      "epoch": 0.09140767824497258,
      "grad_norm": 74090.5859375,
      "learning_rate": 2e-05,
      "loss": 2.6605,
      "step": 200
    },
    {
      "epoch": 0.09597806215722121,
      "grad_norm": 92731.8984375,
      "learning_rate": 2.1e-05,
      "loss": 2.7177,
      "step": 210
    },
    {
      "epoch": 0.10054844606946983,
      "grad_norm": 132096.203125,
      "learning_rate": 2.2000000000000003e-05,
      "loss": 2.6394,
      "step": 220
    },
    {
      "epoch": 0.10511882998171847,
      "grad_norm": 64807.15625,
      "learning_rate": 2.3000000000000003e-05,
      "loss": 2.5417,
      "step": 230
    },
    {
      "epoch": 0.10968921389396709,
      "grad_norm": 124301.5390625,
      "learning_rate": 2.4e-05,
      "loss": 2.5855,
      "step": 240
    },
    {
      "epoch": 0.11425959780621572,
      "grad_norm": 106672.2109375,
      "learning_rate": 2.5e-05,
      "loss": 2.5796,
      "step": 250
    },
    {
      "epoch": 0.11882998171846434,
      "grad_norm": 103185.78125,
      "learning_rate": 2.6000000000000002e-05,
      "loss": 2.558,
      "step": 260
    },
    {
      "epoch": 0.12340036563071298,
      "grad_norm": 539943.625,
      "learning_rate": 2.7000000000000002e-05,
      "loss": 2.533,
      "step": 270
    },
    {
      "epoch": 0.12797074954296161,
      "grad_norm": 136090.453125,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 2.3531,
      "step": 280
    },
    {
      "epoch": 0.13254113345521024,
      "grad_norm": 190959.8125,
      "learning_rate": 2.9e-05,
      "loss": 2.4295,
      "step": 290
    },
    {
      "epoch": 0.13711151736745886,
      "grad_norm": 321799.75,
      "learning_rate": 3e-05,
      "loss": 2.3202,
      "step": 300
    },
    {
      "epoch": 0.1416819012797075,
      "grad_norm": 341939.46875,
      "learning_rate": 3.1e-05,
      "loss": 2.3956,
      "step": 310
    },
    {
      "epoch": 0.14625228519195613,
      "grad_norm": 118726.578125,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 2.2453,
      "step": 320
    },
    {
      "epoch": 0.15082266910420475,
      "grad_norm": 199988.390625,
      "learning_rate": 3.3e-05,
      "loss": 2.2402,
      "step": 330
    },
    {
      "epoch": 0.15539305301645337,
      "grad_norm": 200156.25,
      "learning_rate": 3.4000000000000007e-05,
      "loss": 2.1299,
      "step": 340
    },
    {
      "epoch": 0.15996343692870202,
      "grad_norm": 235010.1875,
      "learning_rate": 3.5e-05,
      "loss": 2.0552,
      "step": 350
    },
    {
      "epoch": 0.16453382084095064,
      "grad_norm": 168717.734375,
      "learning_rate": 3.6e-05,
      "loss": 2.0332,
      "step": 360
    },
    {
      "epoch": 0.16910420475319926,
      "grad_norm": 164222.140625,
      "learning_rate": 3.7e-05,
      "loss": 2.1069,
      "step": 370
    },
    {
      "epoch": 0.1736745886654479,
      "grad_norm": 216883.390625,
      "learning_rate": 3.8e-05,
      "loss": 2.1525,
      "step": 380
    },
    {
      "epoch": 0.17824497257769653,
      "grad_norm": 238809.9375,
      "learning_rate": 3.9000000000000006e-05,
      "loss": 2.0176,
      "step": 390
    },
    {
      "epoch": 0.18281535648994515,
      "grad_norm": 148073.28125,
      "learning_rate": 4e-05,
      "loss": 1.9141,
      "step": 400
    },
    {
      "epoch": 0.18738574040219377,
      "grad_norm": 211357.671875,
      "learning_rate": 4.1e-05,
      "loss": 1.9354,
      "step": 410
    },
    {
      "epoch": 0.19195612431444242,
      "grad_norm": 289800.125,
      "learning_rate": 4.2e-05,
      "loss": 2.0263,
      "step": 420
    },
    {
      "epoch": 0.19652650822669104,
      "grad_norm": 354819.71875,
      "learning_rate": 4.3e-05,
      "loss": 1.9236,
      "step": 430
    },
    {
      "epoch": 0.20109689213893966,
      "grad_norm": 187694.3125,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 1.8974,
      "step": 440
    },
    {
      "epoch": 0.2056672760511883,
      "grad_norm": 235554.203125,
      "learning_rate": 4.5e-05,
      "loss": 1.8884,
      "step": 450
    },
    {
      "epoch": 0.21023765996343693,
      "grad_norm": 194993.0625,
      "learning_rate": 4.600000000000001e-05,
      "loss": 1.8713,
      "step": 460
    },
    {
      "epoch": 0.21480804387568556,
      "grad_norm": 190949.453125,
      "learning_rate": 4.7e-05,
      "loss": 1.9149,
      "step": 470
    },
    {
      "epoch": 0.21937842778793418,
      "grad_norm": 209000.484375,
      "learning_rate": 4.8e-05,
      "loss": 1.9436,
      "step": 480
    },
    {
      "epoch": 0.22394881170018283,
      "grad_norm": 151603.03125,
      "learning_rate": 4.9e-05,
      "loss": 1.868,
      "step": 490
    },
    {
      "epoch": 0.22851919561243145,
      "grad_norm": 297051.9375,
      "learning_rate": 5e-05,
      "loss": 1.8631,
      "step": 500
    },
    {
      "epoch": 0.23308957952468007,
      "grad_norm": 193912.53125,
      "learning_rate": 4.995210727969349e-05,
      "loss": 1.866,
      "step": 510
    },
    {
      "epoch": 0.2376599634369287,
      "grad_norm": 185368.859375,
      "learning_rate": 4.9904214559386976e-05,
      "loss": 1.8017,
      "step": 520
    },
    {
      "epoch": 0.24223034734917734,
      "grad_norm": 482963.65625,
      "learning_rate": 4.985632183908046e-05,
      "loss": 1.958,
      "step": 530
    },
    {
      "epoch": 0.24680073126142596,
      "grad_norm": 216741.5625,
      "learning_rate": 4.980842911877395e-05,
      "loss": 1.8688,
      "step": 540
    },
    {
      "epoch": 0.2513711151736746,
      "grad_norm": 248310.640625,
      "learning_rate": 4.976053639846743e-05,
      "loss": 1.8206,
      "step": 550
    },
    {
      "epoch": 0.25594149908592323,
      "grad_norm": 174778.59375,
      "learning_rate": 4.971264367816092e-05,
      "loss": 1.7981,
      "step": 560
    },
    {
      "epoch": 0.26051188299817185,
      "grad_norm": 168291.53125,
      "learning_rate": 4.966475095785441e-05,
      "loss": 1.8444,
      "step": 570
    },
    {
      "epoch": 0.26508226691042047,
      "grad_norm": 119908.9609375,
      "learning_rate": 4.96168582375479e-05,
      "loss": 1.8102,
      "step": 580
    },
    {
      "epoch": 0.2696526508226691,
      "grad_norm": 130359.1640625,
      "learning_rate": 4.9568965517241384e-05,
      "loss": 1.819,
      "step": 590
    },
    {
      "epoch": 0.2742230347349177,
      "grad_norm": 122891.2734375,
      "learning_rate": 4.952107279693487e-05,
      "loss": 1.8189,
      "step": 600
    },
    {
      "epoch": 0.27879341864716634,
      "grad_norm": 133346.09375,
      "learning_rate": 4.947318007662836e-05,
      "loss": 1.8422,
      "step": 610
    },
    {
      "epoch": 0.283363802559415,
      "grad_norm": 179827.6875,
      "learning_rate": 4.9425287356321845e-05,
      "loss": 1.8674,
      "step": 620
    },
    {
      "epoch": 0.28793418647166363,
      "grad_norm": 158770.609375,
      "learning_rate": 4.9377394636015325e-05,
      "loss": 1.8341,
      "step": 630
    },
    {
      "epoch": 0.29250457038391225,
      "grad_norm": 199898.640625,
      "learning_rate": 4.932950191570881e-05,
      "loss": 1.8177,
      "step": 640
    },
    {
      "epoch": 0.2970749542961609,
      "grad_norm": 136966.828125,
      "learning_rate": 4.92816091954023e-05,
      "loss": 1.6618,
      "step": 650
    },
    {
      "epoch": 0.3016453382084095,
      "grad_norm": 131470.1875,
      "learning_rate": 4.9233716475095786e-05,
      "loss": 1.9034,
      "step": 660
    },
    {
      "epoch": 0.3062157221206581,
      "grad_norm": 137772.28125,
      "learning_rate": 4.918582375478927e-05,
      "loss": 1.7766,
      "step": 670
    },
    {
      "epoch": 0.31078610603290674,
      "grad_norm": 225044.28125,
      "learning_rate": 4.913793103448276e-05,
      "loss": 1.8404,
      "step": 680
    },
    {
      "epoch": 0.3153564899451554,
      "grad_norm": 117430.6015625,
      "learning_rate": 4.9090038314176246e-05,
      "loss": 1.7514,
      "step": 690
    },
    {
      "epoch": 0.31992687385740404,
      "grad_norm": 143649.03125,
      "learning_rate": 4.904214559386973e-05,
      "loss": 1.7891,
      "step": 700
    },
    {
      "epoch": 0.32449725776965266,
      "grad_norm": 132381.140625,
      "learning_rate": 4.899425287356322e-05,
      "loss": 1.728,
      "step": 710
    },
    {
      "epoch": 0.3290676416819013,
      "grad_norm": 180736.796875,
      "learning_rate": 4.894636015325671e-05,
      "loss": 1.7465,
      "step": 720
    },
    {
      "epoch": 0.3336380255941499,
      "grad_norm": 102364.2734375,
      "learning_rate": 4.8898467432950194e-05,
      "loss": 1.7522,
      "step": 730
    },
    {
      "epoch": 0.3382084095063985,
      "grad_norm": 124118.0234375,
      "learning_rate": 4.885057471264368e-05,
      "loss": 1.7294,
      "step": 740
    },
    {
      "epoch": 0.34277879341864714,
      "grad_norm": 102199.5234375,
      "learning_rate": 4.880268199233717e-05,
      "loss": 1.866,
      "step": 750
    },
    {
      "epoch": 0.3473491773308958,
      "grad_norm": 128679.890625,
      "learning_rate": 4.8754789272030654e-05,
      "loss": 1.8846,
      "step": 760
    },
    {
      "epoch": 0.35191956124314444,
      "grad_norm": 100680.3671875,
      "learning_rate": 4.870689655172414e-05,
      "loss": 1.7317,
      "step": 770
    },
    {
      "epoch": 0.35648994515539306,
      "grad_norm": 125967.859375,
      "learning_rate": 4.865900383141763e-05,
      "loss": 1.8308,
      "step": 780
    },
    {
      "epoch": 0.3610603290676417,
      "grad_norm": 124871.5859375,
      "learning_rate": 4.8611111111111115e-05,
      "loss": 1.7843,
      "step": 790
    },
    {
      "epoch": 0.3656307129798903,
      "grad_norm": 79497.8515625,
      "learning_rate": 4.85632183908046e-05,
      "loss": 1.7379,
      "step": 800
    },
    {
      "epoch": 0.3702010968921389,
      "grad_norm": 130445.0546875,
      "learning_rate": 4.851532567049808e-05,
      "loss": 1.8268,
      "step": 810
    },
    {
      "epoch": 0.37477148080438755,
      "grad_norm": 104012.1484375,
      "learning_rate": 4.846743295019157e-05,
      "loss": 1.7416,
      "step": 820
    },
    {
      "epoch": 0.3793418647166362,
      "grad_norm": 146501.59375,
      "learning_rate": 4.8419540229885056e-05,
      "loss": 1.7859,
      "step": 830
    },
    {
      "epoch": 0.38391224862888484,
      "grad_norm": 142532.4375,
      "learning_rate": 4.837164750957854e-05,
      "loss": 1.7631,
      "step": 840
    },
    {
      "epoch": 0.38848263254113347,
      "grad_norm": 130007.6640625,
      "learning_rate": 4.8323754789272036e-05,
      "loss": 1.7579,
      "step": 850
    },
    {
      "epoch": 0.3930530164533821,
      "grad_norm": 130011.5078125,
      "learning_rate": 4.827586206896552e-05,
      "loss": 1.7896,
      "step": 860
    },
    {
      "epoch": 0.3976234003656307,
      "grad_norm": 111523.03125,
      "learning_rate": 4.822796934865901e-05,
      "loss": 1.7826,
      "step": 870
    },
    {
      "epoch": 0.40219378427787933,
      "grad_norm": 155793.453125,
      "learning_rate": 4.81800766283525e-05,
      "loss": 1.8351,
      "step": 880
    },
    {
      "epoch": 0.40676416819012795,
      "grad_norm": 129142.6875,
      "learning_rate": 4.813218390804598e-05,
      "loss": 1.8079,
      "step": 890
    },
    {
      "epoch": 0.4113345521023766,
      "grad_norm": 132226.46875,
      "learning_rate": 4.8084291187739464e-05,
      "loss": 1.7534,
      "step": 900
    },
    {
      "epoch": 0.41590493601462525,
      "grad_norm": 126845.109375,
      "learning_rate": 4.803639846743295e-05,
      "loss": 1.9015,
      "step": 910
    },
    {
      "epoch": 0.42047531992687387,
      "grad_norm": 136638.390625,
      "learning_rate": 4.798850574712644e-05,
      "loss": 1.7273,
      "step": 920
    },
    {
      "epoch": 0.4250457038391225,
      "grad_norm": 142308.25,
      "learning_rate": 4.7940613026819925e-05,
      "loss": 1.7398,
      "step": 930
    },
    {
      "epoch": 0.4296160877513711,
      "grad_norm": 115603.34375,
      "learning_rate": 4.789272030651341e-05,
      "loss": 1.8353,
      "step": 940
    },
    {
      "epoch": 0.43418647166361973,
      "grad_norm": 131319.734375,
      "learning_rate": 4.78448275862069e-05,
      "loss": 1.6825,
      "step": 950
    },
    {
      "epoch": 0.43875685557586835,
      "grad_norm": 112166.5703125,
      "learning_rate": 4.7796934865900385e-05,
      "loss": 1.7169,
      "step": 960
    },
    {
      "epoch": 0.443327239488117,
      "grad_norm": 131488.796875,
      "learning_rate": 4.774904214559387e-05,
      "loss": 1.7215,
      "step": 970
    },
    {
      "epoch": 0.44789762340036565,
      "grad_norm": 149545.140625,
      "learning_rate": 4.770114942528736e-05,
      "loss": 1.6931,
      "step": 980
    },
    {
      "epoch": 0.4524680073126143,
      "grad_norm": 163557.375,
      "learning_rate": 4.7653256704980846e-05,
      "loss": 1.7066,
      "step": 990
    },
    {
      "epoch": 0.4570383912248629,
      "grad_norm": 145251.359375,
      "learning_rate": 4.760536398467433e-05,
      "loss": 1.797,
      "step": 1000
    },
    {
      "epoch": 0.4616087751371115,
      "grad_norm": 100632.7734375,
      "learning_rate": 4.755747126436782e-05,
      "loss": 1.6975,
      "step": 1010
    },
    {
      "epoch": 0.46617915904936014,
      "grad_norm": 153170.90625,
      "learning_rate": 4.7509578544061307e-05,
      "loss": 1.6741,
      "step": 1020
    },
    {
      "epoch": 0.47074954296160876,
      "grad_norm": 78909.109375,
      "learning_rate": 4.7461685823754793e-05,
      "loss": 1.6999,
      "step": 1030
    },
    {
      "epoch": 0.4753199268738574,
      "grad_norm": 110082.7421875,
      "learning_rate": 4.741379310344828e-05,
      "loss": 1.749,
      "step": 1040
    },
    {
      "epoch": 0.47989031078610606,
      "grad_norm": 100923.5390625,
      "learning_rate": 4.736590038314177e-05,
      "loss": 1.7683,
      "step": 1050
    },
    {
      "epoch": 0.4844606946983547,
      "grad_norm": 98683.3203125,
      "learning_rate": 4.7318007662835254e-05,
      "loss": 1.7535,
      "step": 1060
    },
    {
      "epoch": 0.4890310786106033,
      "grad_norm": 116789.328125,
      "learning_rate": 4.7270114942528734e-05,
      "loss": 1.8248,
      "step": 1070
    },
    {
      "epoch": 0.4936014625228519,
      "grad_norm": 114926.984375,
      "learning_rate": 4.722222222222222e-05,
      "loss": 1.7018,
      "step": 1080
    },
    {
      "epoch": 0.49817184643510054,
      "grad_norm": 91417.0859375,
      "learning_rate": 4.717432950191571e-05,
      "loss": 1.8116,
      "step": 1090
    },
    {
      "epoch": 0.5027422303473492,
      "grad_norm": 128865.3125,
      "learning_rate": 4.7126436781609195e-05,
      "loss": 1.6932,
      "step": 1100
    },
    {
      "epoch": 0.5073126142595978,
      "grad_norm": 122375.984375,
      "learning_rate": 4.707854406130268e-05,
      "loss": 1.7383,
      "step": 1110
    },
    {
      "epoch": 0.5118829981718465,
      "grad_norm": 96160.1328125,
      "learning_rate": 4.7030651340996175e-05,
      "loss": 1.6952,
      "step": 1120
    },
    {
      "epoch": 0.5164533820840951,
      "grad_norm": 124809.203125,
      "learning_rate": 4.698275862068966e-05,
      "loss": 1.7973,
      "step": 1130
    },
    {
      "epoch": 0.5210237659963437,
      "grad_norm": 112799.5625,
      "learning_rate": 4.693486590038315e-05,
      "loss": 1.7527,
      "step": 1140
    },
    {
      "epoch": 0.5255941499085923,
      "grad_norm": 82923.640625,
      "learning_rate": 4.688697318007663e-05,
      "loss": 1.7041,
      "step": 1150
    },
    {
      "epoch": 0.5301645338208409,
      "grad_norm": 130412.3671875,
      "learning_rate": 4.6839080459770116e-05,
      "loss": 1.7021,
      "step": 1160
    },
    {
      "epoch": 0.5347349177330896,
      "grad_norm": 104821.015625,
      "learning_rate": 4.67911877394636e-05,
      "loss": 1.7333,
      "step": 1170
    },
    {
      "epoch": 0.5393053016453382,
      "grad_norm": 100490.953125,
      "learning_rate": 4.674329501915709e-05,
      "loss": 1.6688,
      "step": 1180
    },
    {
      "epoch": 0.5438756855575868,
      "grad_norm": 86425.4453125,
      "learning_rate": 4.669540229885058e-05,
      "loss": 1.7381,
      "step": 1190
    },
    {
      "epoch": 0.5484460694698354,
      "grad_norm": 83740.2734375,
      "learning_rate": 4.6647509578544064e-05,
      "loss": 1.8439,
      "step": 1200
    },
    {
      "epoch": 0.553016453382084,
      "grad_norm": 140177.421875,
      "learning_rate": 4.659961685823755e-05,
      "loss": 1.7227,
      "step": 1210
    },
    {
      "epoch": 0.5575868372943327,
      "grad_norm": 144323.71875,
      "learning_rate": 4.655172413793104e-05,
      "loss": 1.7395,
      "step": 1220
    },
    {
      "epoch": 0.5621572212065814,
      "grad_norm": 97354.59375,
      "learning_rate": 4.6503831417624524e-05,
      "loss": 1.7719,
      "step": 1230
    },
    {
      "epoch": 0.56672760511883,
      "grad_norm": 72904.7578125,
      "learning_rate": 4.6455938697318004e-05,
      "loss": 1.6712,
      "step": 1240
    },
    {
      "epoch": 0.5712979890310786,
      "grad_norm": 162248.328125,
      "learning_rate": 4.640804597701149e-05,
      "loss": 1.7491,
      "step": 1250
    },
    {
      "epoch": 0.5758683729433273,
      "grad_norm": 105222.6328125,
      "learning_rate": 4.6360153256704985e-05,
      "loss": 1.7503,
      "step": 1260
    },
    {
      "epoch": 0.5804387568555759,
      "grad_norm": 113333.96875,
      "learning_rate": 4.631226053639847e-05,
      "loss": 1.751,
      "step": 1270
    },
    {
      "epoch": 0.5850091407678245,
      "grad_norm": 126183.7734375,
      "learning_rate": 4.626436781609196e-05,
      "loss": 1.8747,
      "step": 1280
    },
    {
      "epoch": 0.5895795246800731,
      "grad_norm": 118274.03125,
      "learning_rate": 4.6216475095785446e-05,
      "loss": 1.6831,
      "step": 1290
    },
    {
      "epoch": 0.5941499085923218,
      "grad_norm": 108177.03125,
      "learning_rate": 4.616858237547893e-05,
      "loss": 1.6359,
      "step": 1300
    },
    {
      "epoch": 0.5987202925045704,
      "grad_norm": 81988.9140625,
      "learning_rate": 4.612068965517242e-05,
      "loss": 1.7058,
      "step": 1310
    },
    {
      "epoch": 0.603290676416819,
      "grad_norm": 79780.96875,
      "learning_rate": 4.60727969348659e-05,
      "loss": 1.7388,
      "step": 1320
    },
    {
      "epoch": 0.6078610603290676,
      "grad_norm": 166808.515625,
      "learning_rate": 4.6024904214559386e-05,
      "loss": 1.7623,
      "step": 1330
    },
    {
      "epoch": 0.6124314442413162,
      "grad_norm": 111601.921875,
      "learning_rate": 4.597701149425287e-05,
      "loss": 1.761,
      "step": 1340
    },
    {
      "epoch": 0.6170018281535649,
      "grad_norm": 106101.40625,
      "learning_rate": 4.592911877394636e-05,
      "loss": 1.685,
      "step": 1350
    },
    {
      "epoch": 0.6215722120658135,
      "grad_norm": 125174.578125,
      "learning_rate": 4.588122605363985e-05,
      "loss": 1.7784,
      "step": 1360
    },
    {
      "epoch": 0.6261425959780622,
      "grad_norm": 107639.9375,
      "learning_rate": 4.5833333333333334e-05,
      "loss": 1.7636,
      "step": 1370
    },
    {
      "epoch": 0.6307129798903108,
      "grad_norm": 144034.265625,
      "learning_rate": 4.578544061302682e-05,
      "loss": 1.7237,
      "step": 1380
    },
    {
      "epoch": 0.6352833638025595,
      "grad_norm": 135349.234375,
      "learning_rate": 4.573754789272031e-05,
      "loss": 1.6869,
      "step": 1390
    },
    {
      "epoch": 0.6398537477148081,
      "grad_norm": 92048.3359375,
      "learning_rate": 4.5689655172413794e-05,
      "loss": 1.6916,
      "step": 1400
    },
    {
      "epoch": 0.6444241316270567,
      "grad_norm": 105181.109375,
      "learning_rate": 4.564176245210728e-05,
      "loss": 1.7276,
      "step": 1410
    },
    {
      "epoch": 0.6489945155393053,
      "grad_norm": 111967.078125,
      "learning_rate": 4.559386973180077e-05,
      "loss": 1.7594,
      "step": 1420
    },
    {
      "epoch": 0.6535648994515539,
      "grad_norm": 95790.1171875,
      "learning_rate": 4.5545977011494255e-05,
      "loss": 1.6922,
      "step": 1430
    },
    {
      "epoch": 0.6581352833638026,
      "grad_norm": 133646.671875,
      "learning_rate": 4.549808429118774e-05,
      "loss": 1.7427,
      "step": 1440
    },
    {
      "epoch": 0.6627056672760512,
      "grad_norm": 85705.421875,
      "learning_rate": 4.545019157088123e-05,
      "loss": 1.6791,
      "step": 1450
    },
    {
      "epoch": 0.6672760511882998,
      "grad_norm": 131938.828125,
      "learning_rate": 4.5402298850574716e-05,
      "loss": 1.7781,
      "step": 1460
    },
    {
      "epoch": 0.6718464351005484,
      "grad_norm": 138080.515625,
      "learning_rate": 4.53544061302682e-05,
      "loss": 1.7237,
      "step": 1470
    },
    {
      "epoch": 0.676416819012797,
      "grad_norm": 81319.125,
      "learning_rate": 4.530651340996169e-05,
      "loss": 1.7527,
      "step": 1480
    },
    {
      "epoch": 0.6809872029250457,
      "grad_norm": 114187.8203125,
      "learning_rate": 4.5258620689655176e-05,
      "loss": 1.7414,
      "step": 1490
    },
    {
      "epoch": 0.6855575868372943,
      "grad_norm": 88061.1640625,
      "learning_rate": 4.5210727969348656e-05,
      "loss": 1.7703,
      "step": 1500
    },
    {
      "epoch": 0.6901279707495429,
      "grad_norm": 70474.1015625,
      "learning_rate": 4.516283524904214e-05,
      "loss": 1.6569,
      "step": 1510
    },
    {
      "epoch": 0.6946983546617916,
      "grad_norm": 93419.6171875,
      "learning_rate": 4.511494252873563e-05,
      "loss": 1.7559,
      "step": 1520
    },
    {
      "epoch": 0.6992687385740403,
      "grad_norm": 109502.859375,
      "learning_rate": 4.506704980842912e-05,
      "loss": 1.6971,
      "step": 1530
    },
    {
      "epoch": 0.7038391224862889,
      "grad_norm": 172617.59375,
      "learning_rate": 4.501915708812261e-05,
      "loss": 1.8061,
      "step": 1540
    },
    {
      "epoch": 0.7084095063985375,
      "grad_norm": 103427.5078125,
      "learning_rate": 4.49712643678161e-05,
      "loss": 1.7566,
      "step": 1550
    },
    {
      "epoch": 0.7129798903107861,
      "grad_norm": 77810.8125,
      "learning_rate": 4.4923371647509585e-05,
      "loss": 1.6869,
      "step": 1560
    },
    {
      "epoch": 0.7175502742230347,
      "grad_norm": 112475.9453125,
      "learning_rate": 4.487547892720307e-05,
      "loss": 1.6338,
      "step": 1570
    },
    {
      "epoch": 0.7221206581352834,
      "grad_norm": 99816.34375,
      "learning_rate": 4.482758620689655e-05,
      "loss": 1.7478,
      "step": 1580
    },
    {
      "epoch": 0.726691042047532,
      "grad_norm": 73693.90625,
      "learning_rate": 4.477969348659004e-05,
      "loss": 1.7733,
      "step": 1590
    },
    {
      "epoch": 0.7312614259597806,
      "grad_norm": 125517.2734375,
      "learning_rate": 4.4731800766283525e-05,
      "loss": 1.7643,
      "step": 1600
    },
    {
      "epoch": 0.7358318098720292,
      "grad_norm": 125624.765625,
      "learning_rate": 4.468390804597701e-05,
      "loss": 1.789,
      "step": 1610
    },
    {
      "epoch": 0.7404021937842779,
      "grad_norm": 96340.703125,
      "learning_rate": 4.46360153256705e-05,
      "loss": 1.8048,
      "step": 1620
    },
    {
      "epoch": 0.7449725776965265,
      "grad_norm": 77806.5,
      "learning_rate": 4.4588122605363986e-05,
      "loss": 1.6585,
      "step": 1630
    },
    {
      "epoch": 0.7495429616087751,
      "grad_norm": 115383.6171875,
      "learning_rate": 4.454022988505747e-05,
      "loss": 1.7022,
      "step": 1640
    },
    {
      "epoch": 0.7541133455210237,
      "grad_norm": 128746.96875,
      "learning_rate": 4.449233716475096e-05,
      "loss": 1.7007,
      "step": 1650
    },
    {
      "epoch": 0.7586837294332724,
      "grad_norm": 92168.8515625,
      "learning_rate": 4.4444444444444447e-05,
      "loss": 1.78,
      "step": 1660
    },
    {
      "epoch": 0.7632541133455211,
      "grad_norm": 79167.046875,
      "learning_rate": 4.4396551724137933e-05,
      "loss": 1.8273,
      "step": 1670
    },
    {
      "epoch": 0.7678244972577697,
      "grad_norm": 113320.921875,
      "learning_rate": 4.434865900383142e-05,
      "loss": 1.6285,
      "step": 1680
    },
    {
      "epoch": 0.7723948811700183,
      "grad_norm": 80329.4140625,
      "learning_rate": 4.430076628352491e-05,
      "loss": 1.6705,
      "step": 1690
    },
    {
      "epoch": 0.7769652650822669,
      "grad_norm": 112864.1875,
      "learning_rate": 4.4252873563218394e-05,
      "loss": 1.6774,
      "step": 1700
    },
    {
      "epoch": 0.7815356489945156,
      "grad_norm": 83703.5703125,
      "learning_rate": 4.420498084291188e-05,
      "loss": 1.7612,
      "step": 1710
    },
    {
      "epoch": 0.7861060329067642,
      "grad_norm": 89239.75,
      "learning_rate": 4.415708812260537e-05,
      "loss": 1.6861,
      "step": 1720
    },
    {
      "epoch": 0.7906764168190128,
      "grad_norm": 89113.1484375,
      "learning_rate": 4.4109195402298855e-05,
      "loss": 1.751,
      "step": 1730
    },
    {
      "epoch": 0.7952468007312614,
      "grad_norm": 74121.84375,
      "learning_rate": 4.406130268199234e-05,
      "loss": 1.7383,
      "step": 1740
    },
    {
      "epoch": 0.79981718464351,
      "grad_norm": 98291.3125,
      "learning_rate": 4.401340996168583e-05,
      "loss": 1.6507,
      "step": 1750
    },
    {
      "epoch": 0.8043875685557587,
      "grad_norm": 101062.6484375,
      "learning_rate": 4.396551724137931e-05,
      "loss": 1.6809,
      "step": 1760
    },
    {
      "epoch": 0.8089579524680073,
      "grad_norm": 82451.6015625,
      "learning_rate": 4.3917624521072795e-05,
      "loss": 1.6369,
      "step": 1770
    },
    {
      "epoch": 0.8135283363802559,
      "grad_norm": 93937.7109375,
      "learning_rate": 4.386973180076628e-05,
      "loss": 1.8125,
      "step": 1780
    },
    {
      "epoch": 0.8180987202925045,
      "grad_norm": 86336.0546875,
      "learning_rate": 4.382183908045977e-05,
      "loss": 1.7483,
      "step": 1790
    },
    {
      "epoch": 0.8226691042047533,
      "grad_norm": 94180.6953125,
      "learning_rate": 4.3773946360153256e-05,
      "loss": 1.7549,
      "step": 1800
    },
    {
      "epoch": 0.8272394881170019,
      "grad_norm": 89638.5703125,
      "learning_rate": 4.372605363984675e-05,
      "loss": 1.6895,
      "step": 1810
    },
    {
      "epoch": 0.8318098720292505,
      "grad_norm": 129005.0,
      "learning_rate": 4.367816091954024e-05,
      "loss": 1.7419,
      "step": 1820
    },
    {
      "epoch": 0.8363802559414991,
      "grad_norm": 115378.0703125,
      "learning_rate": 4.3630268199233724e-05,
      "loss": 1.6716,
      "step": 1830
    },
    {
      "epoch": 0.8409506398537477,
      "grad_norm": 84872.3046875,
      "learning_rate": 4.3582375478927204e-05,
      "loss": 1.6927,
      "step": 1840
    },
    {
      "epoch": 0.8455210237659964,
      "grad_norm": 82049.890625,
      "learning_rate": 4.353448275862069e-05,
      "loss": 1.7318,
      "step": 1850
    },
    {
      "epoch": 0.850091407678245,
      "grad_norm": 81004.65625,
      "learning_rate": 4.348659003831418e-05,
      "loss": 1.6633,
      "step": 1860
    },
    {
      "epoch": 0.8546617915904936,
      "grad_norm": 186330.078125,
      "learning_rate": 4.3438697318007664e-05,
      "loss": 1.8362,
      "step": 1870
    },
    {
      "epoch": 0.8592321755027422,
      "grad_norm": 95703.015625,
      "learning_rate": 4.339080459770115e-05,
      "loss": 1.8003,
      "step": 1880
    },
    {
      "epoch": 0.8638025594149908,
      "grad_norm": 75972.78125,
      "learning_rate": 4.334291187739464e-05,
      "loss": 1.7282,
      "step": 1890
    },
    {
      "epoch": 0.8683729433272395,
      "grad_norm": 105656.9765625,
      "learning_rate": 4.3295019157088125e-05,
      "loss": 1.7041,
      "step": 1900
    },
    {
      "epoch": 0.8729433272394881,
      "grad_norm": 98981.046875,
      "learning_rate": 4.324712643678161e-05,
      "loss": 1.6535,
      "step": 1910
    },
    {
      "epoch": 0.8775137111517367,
      "grad_norm": 131719.453125,
      "learning_rate": 4.31992337164751e-05,
      "loss": 1.7521,
      "step": 1920
    },
    {
      "epoch": 0.8820840950639853,
      "grad_norm": 62522.44140625,
      "learning_rate": 4.3151340996168586e-05,
      "loss": 1.6794,
      "step": 1930
    },
    {
      "epoch": 0.886654478976234,
      "grad_norm": 104086.2265625,
      "learning_rate": 4.3103448275862066e-05,
      "loss": 1.7029,
      "step": 1940
    },
    {
      "epoch": 0.8912248628884827,
      "grad_norm": 120783.3359375,
      "learning_rate": 4.305555555555556e-05,
      "loss": 1.6313,
      "step": 1950
    },
    {
      "epoch": 0.8957952468007313,
      "grad_norm": 99394.8203125,
      "learning_rate": 4.3007662835249046e-05,
      "loss": 1.7735,
      "step": 1960
    },
    {
      "epoch": 0.9003656307129799,
      "grad_norm": 152363.28125,
      "learning_rate": 4.295977011494253e-05,
      "loss": 1.6732,
      "step": 1970
    },
    {
      "epoch": 0.9049360146252285,
      "grad_norm": 156369.4375,
      "learning_rate": 4.291187739463602e-05,
      "loss": 1.765,
      "step": 1980
    },
    {
      "epoch": 0.9095063985374772,
      "grad_norm": 63156.875,
      "learning_rate": 4.286398467432951e-05,
      "loss": 1.6294,
      "step": 1990
    },
    {
      "epoch": 0.9140767824497258,
      "grad_norm": 106310.6015625,
      "learning_rate": 4.2816091954022994e-05,
      "loss": 1.7492,
      "step": 2000
    },
    {
      "epoch": 0.9186471663619744,
      "grad_norm": 95234.3515625,
      "learning_rate": 4.2768199233716474e-05,
      "loss": 1.6726,
      "step": 2010
    },
    {
      "epoch": 0.923217550274223,
      "grad_norm": 105046.6015625,
      "learning_rate": 4.272030651340996e-05,
      "loss": 1.6848,
      "step": 2020
    },
    {
      "epoch": 0.9277879341864717,
      "grad_norm": 91991.890625,
      "learning_rate": 4.267241379310345e-05,
      "loss": 1.7447,
      "step": 2030
    },
    {
      "epoch": 0.9323583180987203,
      "grad_norm": 95458.09375,
      "learning_rate": 4.2624521072796934e-05,
      "loss": 1.7068,
      "step": 2040
    },
    {
      "epoch": 0.9369287020109689,
      "grad_norm": 108366.8828125,
      "learning_rate": 4.257662835249042e-05,
      "loss": 1.6272,
      "step": 2050
    },
    {
      "epoch": 0.9414990859232175,
      "grad_norm": 79585.796875,
      "learning_rate": 4.252873563218391e-05,
      "loss": 1.7444,
      "step": 2060
    },
    {
      "epoch": 0.9460694698354661,
      "grad_norm": 108535.515625,
      "learning_rate": 4.2480842911877395e-05,
      "loss": 1.6754,
      "step": 2070
    },
    {
      "epoch": 0.9506398537477148,
      "grad_norm": 101708.1640625,
      "learning_rate": 4.243295019157089e-05,
      "loss": 1.8225,
      "step": 2080
    },
    {
      "epoch": 0.9552102376599635,
      "grad_norm": 149960.90625,
      "learning_rate": 4.238505747126437e-05,
      "loss": 1.7228,
      "step": 2090
    },
    {
      "epoch": 0.9597806215722121,
      "grad_norm": 116507.1328125,
      "learning_rate": 4.2337164750957856e-05,
      "loss": 1.7649,
      "step": 2100
    },
    {
      "epoch": 0.9643510054844607,
      "grad_norm": 95376.0859375,
      "learning_rate": 4.228927203065134e-05,
      "loss": 1.7423,
      "step": 2110
    },
    {
      "epoch": 0.9689213893967094,
      "grad_norm": 77924.8828125,
      "learning_rate": 4.224137931034483e-05,
      "loss": 1.7371,
      "step": 2120
    },
    {
      "epoch": 0.973491773308958,
      "grad_norm": 108620.1484375,
      "learning_rate": 4.2193486590038316e-05,
      "loss": 1.6786,
      "step": 2130
    },
    {
      "epoch": 0.9780621572212066,
      "grad_norm": 96284.8984375,
      "learning_rate": 4.21455938697318e-05,
      "loss": 1.641,
      "step": 2140
    },
    {
      "epoch": 0.9826325411334552,
      "grad_norm": 78326.7890625,
      "learning_rate": 4.209770114942529e-05,
      "loss": 1.6984,
      "step": 2150
    },
    {
      "epoch": 0.9872029250457038,
      "grad_norm": 86470.40625,
      "learning_rate": 4.204980842911878e-05,
      "loss": 1.7534,
      "step": 2160
    },
    {
      "epoch": 0.9917733089579525,
      "grad_norm": 104938.75,
      "learning_rate": 4.2001915708812264e-05,
      "loss": 1.7039,
      "step": 2170
    },
    {
      "epoch": 0.9963436928702011,
      "grad_norm": 100711.6875,
      "learning_rate": 4.195402298850575e-05,
      "loss": 1.7534,
      "step": 2180
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.7005630731582642,
      "eval_runtime": 345.6818,
      "eval_samples_per_second": 43.393,
      "eval_steps_per_second": 1.357,
      "step": 2188
    },
    {
      "epoch": 1.0009140767824498,
      "grad_norm": 91781.6796875,
      "learning_rate": 4.190613026819923e-05,
      "loss": 1.6595,
      "step": 2190
    },
    {
      "epoch": 1.0054844606946984,
      "grad_norm": 90080.9375,
      "learning_rate": 4.185823754789272e-05,
      "loss": 1.6538,
      "step": 2200
    },
    {
      "epoch": 1.010054844606947,
      "grad_norm": 88122.7890625,
      "learning_rate": 4.1810344827586205e-05,
      "loss": 1.6596,
      "step": 2210
    },
    {
      "epoch": 1.0146252285191957,
      "grad_norm": 111770.046875,
      "learning_rate": 4.17624521072797e-05,
      "loss": 1.6971,
      "step": 2220
    },
    {
      "epoch": 1.0191956124314443,
      "grad_norm": 104428.828125,
      "learning_rate": 4.1714559386973185e-05,
      "loss": 1.7929,
      "step": 2230
    },
    {
      "epoch": 1.023765996343693,
      "grad_norm": 90344.828125,
      "learning_rate": 4.166666666666667e-05,
      "loss": 1.8338,
      "step": 2240
    },
    {
      "epoch": 1.0283363802559415,
      "grad_norm": 87549.5546875,
      "learning_rate": 4.161877394636016e-05,
      "loss": 1.6911,
      "step": 2250
    },
    {
      "epoch": 1.0329067641681902,
      "grad_norm": 82813.453125,
      "learning_rate": 4.1570881226053646e-05,
      "loss": 1.7573,
      "step": 2260
    },
    {
      "epoch": 1.0374771480804388,
      "grad_norm": 87447.1328125,
      "learning_rate": 4.1522988505747126e-05,
      "loss": 1.7275,
      "step": 2270
    },
    {
      "epoch": 1.0420475319926874,
      "grad_norm": 63916.98046875,
      "learning_rate": 4.147509578544061e-05,
      "loss": 1.7326,
      "step": 2280
    },
    {
      "epoch": 1.046617915904936,
      "grad_norm": 89433.3828125,
      "learning_rate": 4.14272030651341e-05,
      "loss": 1.7586,
      "step": 2290
    },
    {
      "epoch": 1.0511882998171846,
      "grad_norm": 171660.09375,
      "learning_rate": 4.1379310344827587e-05,
      "loss": 1.789,
      "step": 2300
    },
    {
      "epoch": 1.0557586837294333,
      "grad_norm": 69103.7734375,
      "learning_rate": 4.1331417624521073e-05,
      "loss": 1.7837,
      "step": 2310
    },
    {
      "epoch": 1.0603290676416819,
      "grad_norm": 81962.609375,
      "learning_rate": 4.128352490421456e-05,
      "loss": 1.7424,
      "step": 2320
    },
    {
      "epoch": 1.0648994515539305,
      "grad_norm": 83097.5703125,
      "learning_rate": 4.123563218390805e-05,
      "loss": 1.7879,
      "step": 2330
    },
    {
      "epoch": 1.0694698354661791,
      "grad_norm": 112565.484375,
      "learning_rate": 4.1187739463601534e-05,
      "loss": 1.7266,
      "step": 2340
    },
    {
      "epoch": 1.0740402193784278,
      "grad_norm": 120119.7265625,
      "learning_rate": 4.113984674329502e-05,
      "loss": 1.8418,
      "step": 2350
    },
    {
      "epoch": 1.0786106032906764,
      "grad_norm": 105760.8359375,
      "learning_rate": 4.109195402298851e-05,
      "loss": 1.7203,
      "step": 2360
    },
    {
      "epoch": 1.083180987202925,
      "grad_norm": 88435.3125,
      "learning_rate": 4.1044061302681995e-05,
      "loss": 1.6665,
      "step": 2370
    },
    {
      "epoch": 1.0877513711151736,
      "grad_norm": 74858.359375,
      "learning_rate": 4.099616858237548e-05,
      "loss": 1.6907,
      "step": 2380
    },
    {
      "epoch": 1.0923217550274222,
      "grad_norm": 80752.5546875,
      "learning_rate": 4.094827586206897e-05,
      "loss": 1.7489,
      "step": 2390
    },
    {
      "epoch": 1.0968921389396709,
      "grad_norm": 85903.7578125,
      "learning_rate": 4.0900383141762455e-05,
      "loss": 1.6526,
      "step": 2400
    },
    {
      "epoch": 1.1014625228519195,
      "grad_norm": 87469.15625,
      "learning_rate": 4.085249042145594e-05,
      "loss": 1.7612,
      "step": 2410
    },
    {
      "epoch": 1.106032906764168,
      "grad_norm": 115305.1484375,
      "learning_rate": 4.080459770114943e-05,
      "loss": 1.7148,
      "step": 2420
    },
    {
      "epoch": 1.1106032906764167,
      "grad_norm": 76171.40625,
      "learning_rate": 4.0756704980842916e-05,
      "loss": 1.7383,
      "step": 2430
    },
    {
      "epoch": 1.1151736745886653,
      "grad_norm": 110853.4140625,
      "learning_rate": 4.07088122605364e-05,
      "loss": 1.7353,
      "step": 2440
    },
    {
      "epoch": 1.1197440585009142,
      "grad_norm": 83511.421875,
      "learning_rate": 4.066091954022988e-05,
      "loss": 1.7261,
      "step": 2450
    },
    {
      "epoch": 1.1243144424131628,
      "grad_norm": 69653.203125,
      "learning_rate": 4.061302681992337e-05,
      "loss": 1.7403,
      "step": 2460
    },
    {
      "epoch": 1.1288848263254114,
      "grad_norm": 141315.3125,
      "learning_rate": 4.056513409961686e-05,
      "loss": 1.7125,
      "step": 2470
    },
    {
      "epoch": 1.13345521023766,
      "grad_norm": 139185.4375,
      "learning_rate": 4.0517241379310344e-05,
      "loss": 1.66,
      "step": 2480
    },
    {
      "epoch": 1.1380255941499087,
      "grad_norm": 107879.578125,
      "learning_rate": 4.046934865900383e-05,
      "loss": 1.8147,
      "step": 2490
    },
    {
      "epoch": 1.1425959780621573,
      "grad_norm": 93484.1953125,
      "learning_rate": 4.0421455938697324e-05,
      "loss": 1.6882,
      "step": 2500
    },
    {
      "epoch": 1.147166361974406,
      "grad_norm": 96417.0390625,
      "learning_rate": 4.037356321839081e-05,
      "loss": 1.7444,
      "step": 2510
    },
    {
      "epoch": 1.1517367458866545,
      "grad_norm": 105896.125,
      "learning_rate": 4.03256704980843e-05,
      "loss": 1.7218,
      "step": 2520
    },
    {
      "epoch": 1.1563071297989032,
      "grad_norm": 70536.5078125,
      "learning_rate": 4.027777777777778e-05,
      "loss": 1.643,
      "step": 2530
    },
    {
      "epoch": 1.1608775137111518,
      "grad_norm": 188483.515625,
      "learning_rate": 4.0229885057471265e-05,
      "loss": 1.7145,
      "step": 2540
    },
    {
      "epoch": 1.1654478976234004,
      "grad_norm": 91915.40625,
      "learning_rate": 4.018199233716475e-05,
      "loss": 1.6617,
      "step": 2550
    },
    {
      "epoch": 1.170018281535649,
      "grad_norm": 112711.40625,
      "learning_rate": 4.013409961685824e-05,
      "loss": 1.7117,
      "step": 2560
    },
    {
      "epoch": 1.1745886654478976,
      "grad_norm": 81478.6953125,
      "learning_rate": 4.0086206896551726e-05,
      "loss": 1.8141,
      "step": 2570
    },
    {
      "epoch": 1.1791590493601463,
      "grad_norm": 90161.96875,
      "learning_rate": 4.003831417624521e-05,
      "loss": 1.6604,
      "step": 2580
    },
    {
      "epoch": 1.1837294332723949,
      "grad_norm": 67411.1953125,
      "learning_rate": 3.99904214559387e-05,
      "loss": 1.7204,
      "step": 2590
    },
    {
      "epoch": 1.1882998171846435,
      "grad_norm": 104490.5234375,
      "learning_rate": 3.9942528735632186e-05,
      "loss": 1.7942,
      "step": 2600
    },
    {
      "epoch": 1.1928702010968921,
      "grad_norm": 81397.3515625,
      "learning_rate": 3.989463601532567e-05,
      "loss": 1.6425,
      "step": 2610
    },
    {
      "epoch": 1.1974405850091407,
      "grad_norm": 117060.71875,
      "learning_rate": 3.984674329501916e-05,
      "loss": 1.7771,
      "step": 2620
    },
    {
      "epoch": 1.2020109689213894,
      "grad_norm": 80522.625,
      "learning_rate": 3.979885057471265e-05,
      "loss": 1.7167,
      "step": 2630
    },
    {
      "epoch": 1.206581352833638,
      "grad_norm": 92349.40625,
      "learning_rate": 3.9750957854406134e-05,
      "loss": 1.6713,
      "step": 2640
    },
    {
      "epoch": 1.2111517367458866,
      "grad_norm": 100128.2578125,
      "learning_rate": 3.970306513409962e-05,
      "loss": 1.7673,
      "step": 2650
    },
    {
      "epoch": 1.2157221206581352,
      "grad_norm": 89113.0625,
      "learning_rate": 3.965517241379311e-05,
      "loss": 1.6187,
      "step": 2660
    },
    {
      "epoch": 1.2202925045703839,
      "grad_norm": 124528.21875,
      "learning_rate": 3.9607279693486594e-05,
      "loss": 1.7173,
      "step": 2670
    },
    {
      "epoch": 1.2248628884826325,
      "grad_norm": 105671.9765625,
      "learning_rate": 3.955938697318008e-05,
      "loss": 1.7232,
      "step": 2680
    },
    {
      "epoch": 1.229433272394881,
      "grad_norm": 72920.734375,
      "learning_rate": 3.951149425287357e-05,
      "loss": 1.7218,
      "step": 2690
    },
    {
      "epoch": 1.2340036563071297,
      "grad_norm": 89138.640625,
      "learning_rate": 3.9463601532567055e-05,
      "loss": 1.681,
      "step": 2700
    },
    {
      "epoch": 1.2385740402193783,
      "grad_norm": 76915.9765625,
      "learning_rate": 3.9415708812260535e-05,
      "loss": 1.7417,
      "step": 2710
    },
    {
      "epoch": 1.2431444241316272,
      "grad_norm": 76970.3359375,
      "learning_rate": 3.936781609195402e-05,
      "loss": 1.7513,
      "step": 2720
    },
    {
      "epoch": 1.2477148080438756,
      "grad_norm": 102189.65625,
      "learning_rate": 3.931992337164751e-05,
      "loss": 1.6338,
      "step": 2730
    },
    {
      "epoch": 1.2522851919561244,
      "grad_norm": 102939.9375,
      "learning_rate": 3.9272030651340996e-05,
      "loss": 1.7095,
      "step": 2740
    },
    {
      "epoch": 1.2568555758683728,
      "grad_norm": 75103.9375,
      "learning_rate": 3.922413793103448e-05,
      "loss": 1.7029,
      "step": 2750
    },
    {
      "epoch": 1.2614259597806217,
      "grad_norm": 118085.3125,
      "learning_rate": 3.917624521072797e-05,
      "loss": 1.741,
      "step": 2760
    },
    {
      "epoch": 1.26599634369287,
      "grad_norm": 78790.3984375,
      "learning_rate": 3.912835249042146e-05,
      "loss": 1.739,
      "step": 2770
    },
    {
      "epoch": 1.270566727605119,
      "grad_norm": 105775.8125,
      "learning_rate": 3.908045977011495e-05,
      "loss": 1.6434,
      "step": 2780
    },
    {
      "epoch": 1.2751371115173675,
      "grad_norm": 78574.390625,
      "learning_rate": 3.903256704980843e-05,
      "loss": 1.7371,
      "step": 2790
    },
    {
      "epoch": 1.2797074954296161,
      "grad_norm": 141283.359375,
      "learning_rate": 3.898467432950192e-05,
      "loss": 1.7781,
      "step": 2800
    },
    {
      "epoch": 1.2842778793418648,
      "grad_norm": 111235.7265625,
      "learning_rate": 3.8936781609195404e-05,
      "loss": 1.7383,
      "step": 2810
    },
    {
      "epoch": 1.2888482632541134,
      "grad_norm": 76845.9296875,
      "learning_rate": 3.888888888888889e-05,
      "loss": 1.6674,
      "step": 2820
    },
    {
      "epoch": 1.293418647166362,
      "grad_norm": 95679.5,
      "learning_rate": 3.884099616858238e-05,
      "loss": 1.7516,
      "step": 2830
    },
    {
      "epoch": 1.2979890310786106,
      "grad_norm": 69231.671875,
      "learning_rate": 3.8793103448275865e-05,
      "loss": 1.7267,
      "step": 2840
    },
    {
      "epoch": 1.3025594149908593,
      "grad_norm": 90358.46875,
      "learning_rate": 3.874521072796935e-05,
      "loss": 1.7008,
      "step": 2850
    },
    {
      "epoch": 1.3071297989031079,
      "grad_norm": 130772.9921875,
      "learning_rate": 3.869731800766284e-05,
      "loss": 1.7041,
      "step": 2860
    },
    {
      "epoch": 1.3117001828153565,
      "grad_norm": 86227.6015625,
      "learning_rate": 3.8649425287356325e-05,
      "loss": 1.6709,
      "step": 2870
    },
    {
      "epoch": 1.3162705667276051,
      "grad_norm": 108528.421875,
      "learning_rate": 3.8601532567049805e-05,
      "loss": 1.6854,
      "step": 2880
    },
    {
      "epoch": 1.3208409506398537,
      "grad_norm": 96323.234375,
      "learning_rate": 3.855363984674329e-05,
      "loss": 1.7317,
      "step": 2890
    },
    {
      "epoch": 1.3254113345521024,
      "grad_norm": 160365.8125,
      "learning_rate": 3.850574712643678e-05,
      "loss": 1.7585,
      "step": 2900
    },
    {
      "epoch": 1.329981718464351,
      "grad_norm": 80294.25,
      "learning_rate": 3.845785440613027e-05,
      "loss": 1.7612,
      "step": 2910
    },
    {
      "epoch": 1.3345521023765996,
      "grad_norm": 106545.8515625,
      "learning_rate": 3.840996168582376e-05,
      "loss": 1.7292,
      "step": 2920
    },
    {
      "epoch": 1.3391224862888482,
      "grad_norm": 95112.4375,
      "learning_rate": 3.8362068965517246e-05,
      "loss": 1.7349,
      "step": 2930
    },
    {
      "epoch": 1.3436928702010968,
      "grad_norm": 65303.0390625,
      "learning_rate": 3.831417624521073e-05,
      "loss": 1.7047,
      "step": 2940
    },
    {
      "epoch": 1.3482632541133455,
      "grad_norm": 66884.5078125,
      "learning_rate": 3.826628352490422e-05,
      "loss": 1.7516,
      "step": 2950
    },
    {
      "epoch": 1.352833638025594,
      "grad_norm": 87596.6796875,
      "learning_rate": 3.82183908045977e-05,
      "loss": 1.7047,
      "step": 2960
    },
    {
      "epoch": 1.3574040219378427,
      "grad_norm": 87859.3671875,
      "learning_rate": 3.817049808429119e-05,
      "loss": 1.848,
      "step": 2970
    },
    {
      "epoch": 1.3619744058500913,
      "grad_norm": 78479.203125,
      "learning_rate": 3.8122605363984674e-05,
      "loss": 1.7104,
      "step": 2980
    },
    {
      "epoch": 1.3665447897623402,
      "grad_norm": 67868.75,
      "learning_rate": 3.807471264367816e-05,
      "loss": 1.7818,
      "step": 2990
    },
    {
      "epoch": 1.3711151736745886,
      "grad_norm": 108708.4921875,
      "learning_rate": 3.802681992337165e-05,
      "loss": 1.6592,
      "step": 3000
    },
    {
      "epoch": 1.3756855575868374,
      "grad_norm": 49919.5,
      "learning_rate": 3.7978927203065135e-05,
      "loss": 1.8281,
      "step": 3010
    },
    {
      "epoch": 1.3802559414990858,
      "grad_norm": 92234.84375,
      "learning_rate": 3.793103448275862e-05,
      "loss": 1.6741,
      "step": 3020
    },
    {
      "epoch": 1.3848263254113347,
      "grad_norm": 110063.28125,
      "learning_rate": 3.788314176245211e-05,
      "loss": 1.6882,
      "step": 3030
    },
    {
      "epoch": 1.389396709323583,
      "grad_norm": 99851.5,
      "learning_rate": 3.7835249042145595e-05,
      "loss": 1.7069,
      "step": 3040
    },
    {
      "epoch": 1.393967093235832,
      "grad_norm": 91330.5,
      "learning_rate": 3.778735632183908e-05,
      "loss": 1.7738,
      "step": 3050
    },
    {
      "epoch": 1.3985374771480805,
      "grad_norm": 97720.6953125,
      "learning_rate": 3.773946360153257e-05,
      "loss": 1.699,
      "step": 3060
    },
    {
      "epoch": 1.4031078610603291,
      "grad_norm": 60284.0859375,
      "learning_rate": 3.7691570881226056e-05,
      "loss": 1.6353,
      "step": 3070
    },
    {
      "epoch": 1.4076782449725778,
      "grad_norm": 110682.34375,
      "learning_rate": 3.764367816091954e-05,
      "loss": 1.6963,
      "step": 3080
    },
    {
      "epoch": 1.4122486288848264,
      "grad_norm": 144162.453125,
      "learning_rate": 3.759578544061303e-05,
      "loss": 1.7995,
      "step": 3090
    },
    {
      "epoch": 1.416819012797075,
      "grad_norm": 205120.046875,
      "learning_rate": 3.7547892720306517e-05,
      "loss": 1.796,
      "step": 3100
    },
    {
      "epoch": 1.4213893967093236,
      "grad_norm": 144576.359375,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 1.663,
      "step": 3110
    },
    {
      "epoch": 1.4259597806215722,
      "grad_norm": 103083.203125,
      "learning_rate": 3.745210727969349e-05,
      "loss": 1.6979,
      "step": 3120
    },
    {
      "epoch": 1.4305301645338209,
      "grad_norm": 100059.3046875,
      "learning_rate": 3.740421455938698e-05,
      "loss": 1.7823,
      "step": 3130
    },
    {
      "epoch": 1.4351005484460695,
      "grad_norm": 128589.3046875,
      "learning_rate": 3.735632183908046e-05,
      "loss": 1.7182,
      "step": 3140
    },
    {
      "epoch": 1.4396709323583181,
      "grad_norm": 70812.890625,
      "learning_rate": 3.7308429118773944e-05,
      "loss": 1.7785,
      "step": 3150
    },
    {
      "epoch": 1.4442413162705667,
      "grad_norm": 63189.20703125,
      "learning_rate": 3.726053639846743e-05,
      "loss": 1.7257,
      "step": 3160
    },
    {
      "epoch": 1.4488117001828154,
      "grad_norm": 108848.34375,
      "learning_rate": 3.721264367816092e-05,
      "loss": 1.7409,
      "step": 3170
    },
    {
      "epoch": 1.453382084095064,
      "grad_norm": 97378.515625,
      "learning_rate": 3.716475095785441e-05,
      "loss": 1.7078,
      "step": 3180
    },
    {
      "epoch": 1.4579524680073126,
      "grad_norm": 101953.046875,
      "learning_rate": 3.71168582375479e-05,
      "loss": 1.8016,
      "step": 3190
    },
    {
      "epoch": 1.4625228519195612,
      "grad_norm": 118171.328125,
      "learning_rate": 3.7068965517241385e-05,
      "loss": 1.719,
      "step": 3200
    },
    {
      "epoch": 1.4670932358318098,
      "grad_norm": 108193.546875,
      "learning_rate": 3.702107279693487e-05,
      "loss": 1.7423,
      "step": 3210
    },
    {
      "epoch": 1.4716636197440585,
      "grad_norm": 83211.53125,
      "learning_rate": 3.697318007662835e-05,
      "loss": 1.6915,
      "step": 3220
    },
    {
      "epoch": 1.476234003656307,
      "grad_norm": 118025.546875,
      "learning_rate": 3.692528735632184e-05,
      "loss": 1.7542,
      "step": 3230
    },
    {
      "epoch": 1.4808043875685557,
      "grad_norm": 79268.8828125,
      "learning_rate": 3.6877394636015326e-05,
      "loss": 1.7558,
      "step": 3240
    },
    {
      "epoch": 1.4853747714808043,
      "grad_norm": 91756.1015625,
      "learning_rate": 3.682950191570881e-05,
      "loss": 1.6653,
      "step": 3250
    },
    {
      "epoch": 1.489945155393053,
      "grad_norm": 114188.828125,
      "learning_rate": 3.67816091954023e-05,
      "loss": 1.7899,
      "step": 3260
    },
    {
      "epoch": 1.4945155393053016,
      "grad_norm": 122504.4921875,
      "learning_rate": 3.673371647509579e-05,
      "loss": 1.8254,
      "step": 3270
    },
    {
      "epoch": 1.4990859232175504,
      "grad_norm": 68376.1640625,
      "learning_rate": 3.6685823754789274e-05,
      "loss": 1.7274,
      "step": 3280
    },
    {
      "epoch": 1.5036563071297988,
      "grad_norm": 118541.71875,
      "learning_rate": 3.663793103448276e-05,
      "loss": 1.7456,
      "step": 3290
    },
    {
      "epoch": 1.5082266910420477,
      "grad_norm": 62813.5078125,
      "learning_rate": 3.659003831417625e-05,
      "loss": 1.7049,
      "step": 3300
    },
    {
      "epoch": 1.512797074954296,
      "grad_norm": 107603.8515625,
      "learning_rate": 3.6542145593869734e-05,
      "loss": 1.6411,
      "step": 3310
    },
    {
      "epoch": 1.517367458866545,
      "grad_norm": 66405.75,
      "learning_rate": 3.649425287356322e-05,
      "loss": 1.6928,
      "step": 3320
    },
    {
      "epoch": 1.5219378427787933,
      "grad_norm": 77770.8125,
      "learning_rate": 3.644636015325671e-05,
      "loss": 1.6668,
      "step": 3330
    },
    {
      "epoch": 1.5265082266910421,
      "grad_norm": 96171.3359375,
      "learning_rate": 3.6398467432950195e-05,
      "loss": 1.7354,
      "step": 3340
    },
    {
      "epoch": 1.5310786106032905,
      "grad_norm": 114372.53125,
      "learning_rate": 3.635057471264368e-05,
      "loss": 1.6588,
      "step": 3350
    },
    {
      "epoch": 1.5356489945155394,
      "grad_norm": 126977.671875,
      "learning_rate": 3.630268199233717e-05,
      "loss": 1.7437,
      "step": 3360
    },
    {
      "epoch": 1.5402193784277878,
      "grad_norm": 124899.46875,
      "learning_rate": 3.6254789272030656e-05,
      "loss": 1.7381,
      "step": 3370
    },
    {
      "epoch": 1.5447897623400366,
      "grad_norm": 87250.2109375,
      "learning_rate": 3.620689655172414e-05,
      "loss": 1.7489,
      "step": 3380
    },
    {
      "epoch": 1.5493601462522852,
      "grad_norm": 99225.671875,
      "learning_rate": 3.615900383141763e-05,
      "loss": 1.7005,
      "step": 3390
    },
    {
      "epoch": 1.5539305301645339,
      "grad_norm": 110436.3515625,
      "learning_rate": 3.611111111111111e-05,
      "loss": 1.7526,
      "step": 3400
    },
    {
      "epoch": 1.5585009140767825,
      "grad_norm": 73272.2421875,
      "learning_rate": 3.6063218390804596e-05,
      "loss": 1.6412,
      "step": 3410
    },
    {
      "epoch": 1.563071297989031,
      "grad_norm": 83162.109375,
      "learning_rate": 3.601532567049808e-05,
      "loss": 1.7777,
      "step": 3420
    },
    {
      "epoch": 1.5676416819012797,
      "grad_norm": 65596.4609375,
      "learning_rate": 3.596743295019157e-05,
      "loss": 1.7165,
      "step": 3430
    },
    {
      "epoch": 1.5722120658135283,
      "grad_norm": 87716.9375,
      "learning_rate": 3.591954022988506e-05,
      "loss": 1.7015,
      "step": 3440
    },
    {
      "epoch": 1.576782449725777,
      "grad_norm": 108148.75,
      "learning_rate": 3.5871647509578544e-05,
      "loss": 1.6684,
      "step": 3450
    },
    {
      "epoch": 1.5813528336380256,
      "grad_norm": 79430.3203125,
      "learning_rate": 3.582375478927204e-05,
      "loss": 1.6243,
      "step": 3460
    },
    {
      "epoch": 1.5859232175502742,
      "grad_norm": 102633.3046875,
      "learning_rate": 3.5775862068965524e-05,
      "loss": 1.7009,
      "step": 3470
    },
    {
      "epoch": 1.5904936014625228,
      "grad_norm": 116884.0546875,
      "learning_rate": 3.5727969348659004e-05,
      "loss": 1.6794,
      "step": 3480
    },
    {
      "epoch": 1.5950639853747715,
      "grad_norm": 123101.0078125,
      "learning_rate": 3.568007662835249e-05,
      "loss": 1.7216,
      "step": 3490
    },
    {
      "epoch": 1.59963436928702,
      "grad_norm": 81439.2734375,
      "learning_rate": 3.563218390804598e-05,
      "loss": 1.6748,
      "step": 3500
    },
    {
      "epoch": 1.6042047531992687,
      "grad_norm": 101444.484375,
      "learning_rate": 3.5584291187739465e-05,
      "loss": 1.8176,
      "step": 3510
    },
    {
      "epoch": 1.6087751371115173,
      "grad_norm": 76834.7578125,
      "learning_rate": 3.553639846743295e-05,
      "loss": 1.6932,
      "step": 3520
    },
    {
      "epoch": 1.6133455210237662,
      "grad_norm": 102102.7265625,
      "learning_rate": 3.548850574712644e-05,
      "loss": 1.7848,
      "step": 3530
    },
    {
      "epoch": 1.6179159049360146,
      "grad_norm": 65518.3984375,
      "learning_rate": 3.5440613026819926e-05,
      "loss": 1.6966,
      "step": 3540
    },
    {
      "epoch": 1.6224862888482634,
      "grad_norm": 72790.3359375,
      "learning_rate": 3.539272030651341e-05,
      "loss": 1.6584,
      "step": 3550
    },
    {
      "epoch": 1.6270566727605118,
      "grad_norm": 73727.9140625,
      "learning_rate": 3.53448275862069e-05,
      "loss": 1.7437,
      "step": 3560
    },
    {
      "epoch": 1.6316270566727606,
      "grad_norm": 91062.0859375,
      "learning_rate": 3.529693486590038e-05,
      "loss": 1.6678,
      "step": 3570
    },
    {
      "epoch": 1.636197440585009,
      "grad_norm": 172251.484375,
      "learning_rate": 3.5249042145593867e-05,
      "loss": 1.7394,
      "step": 3580
    },
    {
      "epoch": 1.6407678244972579,
      "grad_norm": 85787.4140625,
      "learning_rate": 3.5201149425287353e-05,
      "loss": 1.6567,
      "step": 3590
    },
    {
      "epoch": 1.6453382084095063,
      "grad_norm": 96389.1171875,
      "learning_rate": 3.515325670498085e-05,
      "loss": 1.7427,
      "step": 3600
    },
    {
      "epoch": 1.6499085923217551,
      "grad_norm": 69201.625,
      "learning_rate": 3.5105363984674334e-05,
      "loss": 1.7775,
      "step": 3610
    },
    {
      "epoch": 1.6544789762340035,
      "grad_norm": 100617.40625,
      "learning_rate": 3.505747126436782e-05,
      "loss": 1.7371,
      "step": 3620
    },
    {
      "epoch": 1.6590493601462524,
      "grad_norm": 96952.21875,
      "learning_rate": 3.500957854406131e-05,
      "loss": 1.7269,
      "step": 3630
    },
    {
      "epoch": 1.6636197440585008,
      "grad_norm": 74618.84375,
      "learning_rate": 3.4961685823754795e-05,
      "loss": 1.6758,
      "step": 3640
    },
    {
      "epoch": 1.6681901279707496,
      "grad_norm": 57622.64453125,
      "learning_rate": 3.4913793103448275e-05,
      "loss": 1.6987,
      "step": 3650
    },
    {
      "epoch": 1.672760511882998,
      "grad_norm": 102405.7421875,
      "learning_rate": 3.486590038314176e-05,
      "loss": 1.755,
      "step": 3660
    },
    {
      "epoch": 1.6773308957952469,
      "grad_norm": 99707.1953125,
      "learning_rate": 3.481800766283525e-05,
      "loss": 1.6318,
      "step": 3670
    },
    {
      "epoch": 1.6819012797074955,
      "grad_norm": 84351.671875,
      "learning_rate": 3.4770114942528735e-05,
      "loss": 1.7005,
      "step": 3680
    },
    {
      "epoch": 1.686471663619744,
      "grad_norm": 92737.875,
      "learning_rate": 3.472222222222222e-05,
      "loss": 1.6547,
      "step": 3690
    },
    {
      "epoch": 1.6910420475319927,
      "grad_norm": 117513.71875,
      "learning_rate": 3.467432950191571e-05,
      "loss": 1.6817,
      "step": 3700
    },
    {
      "epoch": 1.6956124314442413,
      "grad_norm": 78983.3515625,
      "learning_rate": 3.4626436781609196e-05,
      "loss": 1.7011,
      "step": 3710
    },
    {
      "epoch": 1.70018281535649,
      "grad_norm": 160552.734375,
      "learning_rate": 3.457854406130268e-05,
      "loss": 1.7277,
      "step": 3720
    },
    {
      "epoch": 1.7047531992687386,
      "grad_norm": 69111.1328125,
      "learning_rate": 3.453065134099617e-05,
      "loss": 1.7326,
      "step": 3730
    },
    {
      "epoch": 1.7093235831809872,
      "grad_norm": 106880.96875,
      "learning_rate": 3.4482758620689657e-05,
      "loss": 1.7453,
      "step": 3740
    },
    {
      "epoch": 1.7138939670932358,
      "grad_norm": 117654.46875,
      "learning_rate": 3.4434865900383143e-05,
      "loss": 1.8011,
      "step": 3750
    },
    {
      "epoch": 1.7184643510054844,
      "grad_norm": 85585.7265625,
      "learning_rate": 3.438697318007663e-05,
      "loss": 1.7664,
      "step": 3760
    },
    {
      "epoch": 1.723034734917733,
      "grad_norm": 77661.5078125,
      "learning_rate": 3.433908045977012e-05,
      "loss": 1.6859,
      "step": 3770
    },
    {
      "epoch": 1.7276051188299817,
      "grad_norm": 106292.359375,
      "learning_rate": 3.4291187739463604e-05,
      "loss": 1.7482,
      "step": 3780
    },
    {
      "epoch": 1.7321755027422303,
      "grad_norm": 98747.6171875,
      "learning_rate": 3.424329501915709e-05,
      "loss": 1.7234,
      "step": 3790
    },
    {
      "epoch": 1.736745886654479,
      "grad_norm": 155160.953125,
      "learning_rate": 3.419540229885058e-05,
      "loss": 1.8456,
      "step": 3800
    },
    {
      "epoch": 1.7413162705667276,
      "grad_norm": 94582.5546875,
      "learning_rate": 3.4147509578544065e-05,
      "loss": 1.8332,
      "step": 3810
    },
    {
      "epoch": 1.7458866544789764,
      "grad_norm": 96502.8359375,
|
"learning_rate": 3.409961685823755e-05, |
|
"loss": 1.8145, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 1.7504570383912248, |
|
"grad_norm": 87164.46875, |
|
"learning_rate": 3.405172413793103e-05, |
|
"loss": 1.6959, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 1.7550274223034736, |
|
"grad_norm": 94143.84375, |
|
"learning_rate": 3.400383141762452e-05, |
|
"loss": 1.7662, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 1.759597806215722, |
|
"grad_norm": 69526.3671875, |
|
"learning_rate": 3.3955938697318005e-05, |
|
"loss": 1.6818, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 1.7641681901279709, |
|
"grad_norm": 91595.3359375, |
|
"learning_rate": 3.390804597701149e-05, |
|
"loss": 1.6884, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 1.7687385740402193, |
|
"grad_norm": 97468.0234375, |
|
"learning_rate": 3.3860153256704986e-05, |
|
"loss": 1.7608, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 1.7733089579524681, |
|
"grad_norm": 138455.578125, |
|
"learning_rate": 3.381226053639847e-05, |
|
"loss": 1.6528, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 1.7778793418647165, |
|
"grad_norm": 107855.234375, |
|
"learning_rate": 3.376436781609196e-05, |
|
"loss": 1.7178, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 1.7824497257769654, |
|
"grad_norm": 105021.2421875, |
|
"learning_rate": 3.371647509578545e-05, |
|
"loss": 1.6807, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.7870201096892138, |
|
"grad_norm": 252806.71875, |
|
"learning_rate": 3.366858237547893e-05, |
|
"loss": 1.6723, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 1.7915904936014626, |
|
"grad_norm": 116308.34375, |
|
"learning_rate": 3.3620689655172414e-05, |
|
"loss": 1.7991, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 1.796160877513711, |
|
"grad_norm": 86454.4375, |
|
"learning_rate": 3.35727969348659e-05, |
|
"loss": 1.7825, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 1.8007312614259599, |
|
"grad_norm": 71364.28125, |
|
"learning_rate": 3.352490421455939e-05, |
|
"loss": 1.769, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 1.8053016453382082, |
|
"grad_norm": 70699.7265625, |
|
"learning_rate": 3.3477011494252874e-05, |
|
"loss": 1.7765, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 1.809872029250457, |
|
"grad_norm": 58119.38671875, |
|
"learning_rate": 3.342911877394636e-05, |
|
"loss": 1.6735, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 1.8144424131627057, |
|
"grad_norm": 89724.2578125, |
|
"learning_rate": 3.338122605363985e-05, |
|
"loss": 1.7261, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 1.8190127970749543, |
|
"grad_norm": 103765.2421875, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 1.7276, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 1.823583180987203, |
|
"grad_norm": 108591.3046875, |
|
"learning_rate": 3.328544061302682e-05, |
|
"loss": 1.6897, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 1.8281535648994516, |
|
"grad_norm": 70272.8046875, |
|
"learning_rate": 3.323754789272031e-05, |
|
"loss": 1.6812, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.8327239488117002, |
|
"grad_norm": 71892.390625, |
|
"learning_rate": 3.3189655172413796e-05, |
|
"loss": 1.7387, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 1.8372943327239488, |
|
"grad_norm": 73947.421875, |
|
"learning_rate": 3.314176245210728e-05, |
|
"loss": 1.6488, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 1.8418647166361974, |
|
"grad_norm": 90722.765625, |
|
"learning_rate": 3.309386973180077e-05, |
|
"loss": 1.7028, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 1.846435100548446, |
|
"grad_norm": 106649.1484375, |
|
"learning_rate": 3.3045977011494256e-05, |
|
"loss": 1.7118, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 1.8510054844606947, |
|
"grad_norm": 79884.34375, |
|
"learning_rate": 3.299808429118774e-05, |
|
"loss": 1.6838, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 1.8555758683729433, |
|
"grad_norm": 133383.34375, |
|
"learning_rate": 3.295019157088123e-05, |
|
"loss": 1.7038, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 1.860146252285192, |
|
"grad_norm": 93182.28125, |
|
"learning_rate": 3.290229885057472e-05, |
|
"loss": 1.6335, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 1.8647166361974405, |
|
"grad_norm": 57547.94921875, |
|
"learning_rate": 3.2854406130268204e-05, |
|
"loss": 1.7422, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 1.8692870201096892, |
|
"grad_norm": 95853.7734375, |
|
"learning_rate": 3.2806513409961684e-05, |
|
"loss": 1.775, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 1.8738574040219378, |
|
"grad_norm": 144561.4375, |
|
"learning_rate": 3.275862068965517e-05, |
|
"loss": 1.8058, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.8784277879341866, |
|
"grad_norm": 108504.7734375, |
|
"learning_rate": 3.271072796934866e-05, |
|
"loss": 1.7476, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 1.882998171846435, |
|
"grad_norm": 94651.2109375, |
|
"learning_rate": 3.2662835249042144e-05, |
|
"loss": 1.7199, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 1.8875685557586839, |
|
"grad_norm": 70529.734375, |
|
"learning_rate": 3.261494252873563e-05, |
|
"loss": 1.7404, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 1.8921389396709323, |
|
"grad_norm": 61164.6796875, |
|
"learning_rate": 3.256704980842912e-05, |
|
"loss": 1.6635, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 1.8967093235831811, |
|
"grad_norm": 114792.84375, |
|
"learning_rate": 3.251915708812261e-05, |
|
"loss": 1.7986, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 1.9012797074954295, |
|
"grad_norm": 72610.09375, |
|
"learning_rate": 3.24712643678161e-05, |
|
"loss": 1.6614, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 1.9058500914076784, |
|
"grad_norm": 85551.8828125, |
|
"learning_rate": 3.242337164750958e-05, |
|
"loss": 1.6255, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 1.9104204753199268, |
|
"grad_norm": 109005.5546875, |
|
"learning_rate": 3.2375478927203066e-05, |
|
"loss": 1.6897, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 1.9149908592321756, |
|
"grad_norm": 107019.8046875, |
|
"learning_rate": 3.232758620689655e-05, |
|
"loss": 1.732, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 1.919561243144424, |
|
"grad_norm": 78904.2109375, |
|
"learning_rate": 3.227969348659004e-05, |
|
"loss": 1.7201, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.9241316270566728, |
|
"grad_norm": 64701.79296875, |
|
"learning_rate": 3.2231800766283526e-05, |
|
"loss": 1.6952, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 1.9287020109689212, |
|
"grad_norm": 64434.64453125, |
|
"learning_rate": 3.218390804597701e-05, |
|
"loss": 1.6946, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 1.93327239488117, |
|
"grad_norm": 115172.6171875, |
|
"learning_rate": 3.21360153256705e-05, |
|
"loss": 1.6573, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 1.9378427787934185, |
|
"grad_norm": 82583.25, |
|
"learning_rate": 3.208812260536399e-05, |
|
"loss": 1.7499, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 1.9424131627056673, |
|
"grad_norm": 93962.40625, |
|
"learning_rate": 3.2040229885057474e-05, |
|
"loss": 1.7036, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 1.946983546617916, |
|
"grad_norm": 81828.8203125, |
|
"learning_rate": 3.1992337164750954e-05, |
|
"loss": 1.6486, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 1.9515539305301646, |
|
"grad_norm": 109928.0546875, |
|
"learning_rate": 3.194444444444444e-05, |
|
"loss": 1.7805, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 1.9561243144424132, |
|
"grad_norm": 100452.4765625, |
|
"learning_rate": 3.1896551724137935e-05, |
|
"loss": 1.6897, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 1.9606946983546618, |
|
"grad_norm": 53400.49609375, |
|
"learning_rate": 3.184865900383142e-05, |
|
"loss": 1.681, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 1.9652650822669104, |
|
"grad_norm": 104313.296875, |
|
"learning_rate": 3.180076628352491e-05, |
|
"loss": 1.737, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.969835466179159, |
|
"grad_norm": 92321.3515625, |
|
"learning_rate": 3.1752873563218395e-05, |
|
"loss": 1.6844, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 1.9744058500914077, |
|
"grad_norm": 87737.4140625, |
|
"learning_rate": 3.170498084291188e-05, |
|
"loss": 1.7712, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 1.9789762340036563, |
|
"grad_norm": 108424.9453125, |
|
"learning_rate": 3.165708812260537e-05, |
|
"loss": 1.7173, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 1.983546617915905, |
|
"grad_norm": 79022.0546875, |
|
"learning_rate": 3.160919540229885e-05, |
|
"loss": 1.668, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 1.9881170018281535, |
|
"grad_norm": 67820.46875, |
|
"learning_rate": 3.1561302681992336e-05, |
|
"loss": 1.7499, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 1.9926873857404022, |
|
"grad_norm": 56711.33984375, |
|
"learning_rate": 3.151340996168582e-05, |
|
"loss": 1.6331, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 1.9972577696526508, |
|
"grad_norm": 137476.015625, |
|
"learning_rate": 3.146551724137931e-05, |
|
"loss": 1.6912, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.693359375, |
|
"eval_runtime": 345.1475, |
|
"eval_samples_per_second": 43.46, |
|
"eval_steps_per_second": 1.359, |
|
"step": 4376 |
|
}, |
|
{ |
|
"epoch": 2.0018281535648996, |
|
"grad_norm": 95937.5703125, |
|
"learning_rate": 3.1417624521072797e-05, |
|
"loss": 1.7171, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 2.006398537477148, |
|
"grad_norm": 69800.3515625, |
|
"learning_rate": 3.1369731800766283e-05, |
|
"loss": 1.698, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 2.010968921389397, |
|
"grad_norm": 74739.84375, |
|
"learning_rate": 3.132183908045977e-05, |
|
"loss": 1.6507, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.0155393053016453, |
|
"grad_norm": 82305.1484375, |
|
"learning_rate": 3.127394636015326e-05, |
|
"loss": 1.7155, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 2.020109689213894, |
|
"grad_norm": 62019.15625, |
|
"learning_rate": 3.1226053639846744e-05, |
|
"loss": 1.7122, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 2.0246800731261425, |
|
"grad_norm": 63863.31640625, |
|
"learning_rate": 3.117816091954023e-05, |
|
"loss": 1.7085, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 2.0292504570383914, |
|
"grad_norm": 51309.7109375, |
|
"learning_rate": 3.113026819923372e-05, |
|
"loss": 1.7576, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 2.0338208409506398, |
|
"grad_norm": 55412.6796875, |
|
"learning_rate": 3.1082375478927205e-05, |
|
"loss": 1.709, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 2.0383912248628886, |
|
"grad_norm": 107907.9609375, |
|
"learning_rate": 3.103448275862069e-05, |
|
"loss": 1.771, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 2.042961608775137, |
|
"grad_norm": 73671.1796875, |
|
"learning_rate": 3.098659003831418e-05, |
|
"loss": 1.7703, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 2.047531992687386, |
|
"grad_norm": 59534.9765625, |
|
"learning_rate": 3.0938697318007665e-05, |
|
"loss": 1.7293, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 2.0521023765996342, |
|
"grad_norm": 133561.09375, |
|
"learning_rate": 3.089080459770115e-05, |
|
"loss": 1.7752, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 2.056672760511883, |
|
"grad_norm": 133462.234375, |
|
"learning_rate": 3.084291187739464e-05, |
|
"loss": 1.7729, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.0612431444241315, |
|
"grad_norm": 70087.765625, |
|
"learning_rate": 3.0795019157088126e-05, |
|
"loss": 1.7831, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 2.0658135283363803, |
|
"grad_norm": 141446.296875, |
|
"learning_rate": 3.0747126436781606e-05, |
|
"loss": 1.7614, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 2.0703839122486287, |
|
"grad_norm": 83022.75, |
|
"learning_rate": 3.069923371647509e-05, |
|
"loss": 1.5773, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 2.0749542961608776, |
|
"grad_norm": 81699.2578125, |
|
"learning_rate": 3.065134099616858e-05, |
|
"loss": 1.7204, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 2.079524680073126, |
|
"grad_norm": 73930.953125, |
|
"learning_rate": 3.060344827586207e-05, |
|
"loss": 1.781, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 2.084095063985375, |
|
"grad_norm": 99254.2890625, |
|
"learning_rate": 3.055555555555556e-05, |
|
"loss": 1.6642, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 2.088665447897623, |
|
"grad_norm": 84721.40625, |
|
"learning_rate": 3.0507662835249047e-05, |
|
"loss": 1.6594, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 2.093235831809872, |
|
"grad_norm": 78780.6015625, |
|
"learning_rate": 3.045977011494253e-05, |
|
"loss": 1.7436, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 2.0978062157221204, |
|
"grad_norm": 74259.4921875, |
|
"learning_rate": 3.0411877394636018e-05, |
|
"loss": 1.7986, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 2.1023765996343693, |
|
"grad_norm": 116283.8671875, |
|
"learning_rate": 3.0363984674329505e-05, |
|
"loss": 1.7248, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 2.106946983546618, |
|
"grad_norm": 96815.84375, |
|
"learning_rate": 3.031609195402299e-05, |
|
"loss": 1.763, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 2.1115173674588665, |
|
"grad_norm": 61044.22265625, |
|
"learning_rate": 3.0268199233716475e-05, |
|
"loss": 1.7655, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 2.1160877513711154, |
|
"grad_norm": 75176.796875, |
|
"learning_rate": 3.0220306513409962e-05, |
|
"loss": 1.7197, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 2.1206581352833638, |
|
"grad_norm": 100533.109375, |
|
"learning_rate": 3.017241379310345e-05, |
|
"loss": 1.6475, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 2.1252285191956126, |
|
"grad_norm": 79551.5234375, |
|
"learning_rate": 3.0124521072796936e-05, |
|
"loss": 1.6468, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 2.129798903107861, |
|
"grad_norm": 74119.8515625, |
|
"learning_rate": 3.0076628352490422e-05, |
|
"loss": 1.7115, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 2.13436928702011, |
|
"grad_norm": 62293.6953125, |
|
"learning_rate": 3.0028735632183906e-05, |
|
"loss": 1.7507, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 2.1389396709323583, |
|
"grad_norm": 66430.1953125, |
|
"learning_rate": 2.9980842911877393e-05, |
|
"loss": 1.6552, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 2.143510054844607, |
|
"grad_norm": 70040.5390625, |
|
"learning_rate": 2.9932950191570886e-05, |
|
"loss": 1.7662, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 2.1480804387568555, |
|
"grad_norm": 94635.6796875, |
|
"learning_rate": 2.988505747126437e-05, |
|
"loss": 1.7736, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 2.1526508226691043, |
|
"grad_norm": 79885.9453125, |
|
"learning_rate": 2.9837164750957857e-05, |
|
"loss": 1.7164, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 2.1572212065813527, |
|
"grad_norm": 81728.3046875, |
|
"learning_rate": 2.9789272030651344e-05, |
|
"loss": 1.7839, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 2.1617915904936016, |
|
"grad_norm": 95488.8984375, |
|
"learning_rate": 2.974137931034483e-05, |
|
"loss": 1.7674, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 2.16636197440585, |
|
"grad_norm": 109054.171875, |
|
"learning_rate": 2.9693486590038317e-05, |
|
"loss": 1.6966, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 2.170932358318099, |
|
"grad_norm": 79046.40625, |
|
"learning_rate": 2.96455938697318e-05, |
|
"loss": 1.6621, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 2.1755027422303472, |
|
"grad_norm": 102101.4765625, |
|
"learning_rate": 2.9597701149425288e-05, |
|
"loss": 1.7391, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 2.180073126142596, |
|
"grad_norm": 83587.875, |
|
"learning_rate": 2.9549808429118775e-05, |
|
"loss": 1.8232, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 2.1846435100548445, |
|
"grad_norm": 104632.4921875, |
|
"learning_rate": 2.950191570881226e-05, |
|
"loss": 1.6575, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 2.1892138939670933, |
|
"grad_norm": 72859.53125, |
|
"learning_rate": 2.945402298850575e-05, |
|
"loss": 1.7205, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 2.1937842778793417, |
|
"grad_norm": 92640.796875, |
|
"learning_rate": 2.9406130268199232e-05, |
|
"loss": 1.725, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.1983546617915906, |
|
"grad_norm": 246298.65625, |
|
"learning_rate": 2.935823754789272e-05, |
|
"loss": 1.7163, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 2.202925045703839, |
|
"grad_norm": 97907.3828125, |
|
"learning_rate": 2.9310344827586206e-05, |
|
"loss": 1.6475, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 2.207495429616088, |
|
"grad_norm": 85840.625, |
|
"learning_rate": 2.9262452107279696e-05, |
|
"loss": 1.7121, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 2.212065813528336, |
|
"grad_norm": 72967.8984375, |
|
"learning_rate": 2.9214559386973183e-05, |
|
"loss": 1.7438, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 2.216636197440585, |
|
"grad_norm": 75571.765625, |
|
"learning_rate": 2.916666666666667e-05, |
|
"loss": 1.6586, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 2.2212065813528334, |
|
"grad_norm": 65606.75, |
|
"learning_rate": 2.9118773946360157e-05, |
|
"loss": 1.7405, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 2.2257769652650823, |
|
"grad_norm": 81884.3828125, |
|
"learning_rate": 2.9070881226053644e-05, |
|
"loss": 1.7318, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 2.2303473491773307, |
|
"grad_norm": 66072.0703125, |
|
"learning_rate": 2.9022988505747127e-05, |
|
"loss": 1.7428, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 2.2349177330895795, |
|
"grad_norm": 154896.03125, |
|
"learning_rate": 2.8975095785440614e-05, |
|
"loss": 1.7548, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 2.2394881170018284, |
|
"grad_norm": 75736.953125, |
|
"learning_rate": 2.89272030651341e-05, |
|
"loss": 1.7215, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 2.2440585009140768, |
|
"grad_norm": 67909.71875, |
|
"learning_rate": 2.8879310344827588e-05, |
|
"loss": 1.6336, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 2.2486288848263256, |
|
"grad_norm": 89797.1953125, |
|
"learning_rate": 2.8831417624521075e-05, |
|
"loss": 1.7273, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 2.253199268738574, |
|
"grad_norm": 85214.125, |
|
"learning_rate": 2.8783524904214558e-05, |
|
"loss": 1.7144, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 2.257769652650823, |
|
"grad_norm": 76317.734375, |
|
"learning_rate": 2.8735632183908045e-05, |
|
"loss": 1.7784, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 2.2623400365630713, |
|
"grad_norm": 49163.828125, |
|
"learning_rate": 2.8687739463601532e-05, |
|
"loss": 1.7026, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 2.26691042047532, |
|
"grad_norm": 107379.171875, |
|
"learning_rate": 2.863984674329502e-05, |
|
"loss": 1.7803, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 2.2714808043875685, |
|
"grad_norm": 69726.2734375, |
|
"learning_rate": 2.859195402298851e-05, |
|
"loss": 1.7006, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 2.2760511882998173, |
|
"grad_norm": 68964.7265625, |
|
"learning_rate": 2.8544061302681996e-05, |
|
"loss": 1.7053, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 2.2806215722120657, |
|
"grad_norm": 67173.03125, |
|
"learning_rate": 2.8496168582375483e-05, |
|
"loss": 1.6936, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 2.2851919561243146, |
|
"grad_norm": 78151.328125, |
|
"learning_rate": 2.844827586206897e-05, |
|
"loss": 1.7376, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.289762340036563, |
|
"grad_norm": 147513.4375, |
|
"learning_rate": 2.8400383141762453e-05, |
|
"loss": 1.7627, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 2.294332723948812, |
|
"grad_norm": 104835.921875, |
|
"learning_rate": 2.835249042145594e-05, |
|
"loss": 1.6425, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 2.2989031078610602, |
|
"grad_norm": 120597.7109375, |
|
"learning_rate": 2.8304597701149427e-05, |
|
"loss": 1.7738, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 2.303473491773309, |
|
"grad_norm": 65482.203125, |
|
"learning_rate": 2.8256704980842914e-05, |
|
"loss": 1.6205, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 2.3080438756855575, |
|
"grad_norm": 71476.0078125, |
|
"learning_rate": 2.82088122605364e-05, |
|
"loss": 1.6699, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 2.3126142595978063, |
|
"grad_norm": 85771.3515625, |
|
"learning_rate": 2.8160919540229884e-05, |
|
"loss": 1.697, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 2.3171846435100547, |
|
"grad_norm": 63680.30859375, |
|
"learning_rate": 2.811302681992337e-05, |
|
"loss": 1.6754, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 2.3217550274223036, |
|
"grad_norm": 152300.34375, |
|
"learning_rate": 2.8065134099616858e-05, |
|
"loss": 1.6722, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 2.326325411334552, |
|
"grad_norm": 88300.703125, |
|
"learning_rate": 2.8017241379310345e-05, |
|
"loss": 1.7725, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 2.330895795246801, |
|
"grad_norm": 106965.4375, |
|
"learning_rate": 2.796934865900383e-05, |
|
"loss": 1.7734, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 2.335466179159049, |
|
"grad_norm": 118704.640625, |
|
"learning_rate": 2.7921455938697322e-05, |
|
"loss": 1.6477, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 2.340036563071298, |
|
"grad_norm": 78607.4453125, |
|
"learning_rate": 2.787356321839081e-05, |
|
"loss": 1.7386, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 2.3446069469835464, |
|
"grad_norm": 83952.1171875, |
|
"learning_rate": 2.7825670498084296e-05, |
|
"loss": 1.64, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 2.3491773308957953, |
|
"grad_norm": 107545.0859375, |
|
"learning_rate": 2.777777777777778e-05, |
|
"loss": 1.6633, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 2.353747714808044, |
|
"grad_norm": 72284.5, |
|
"learning_rate": 2.7729885057471266e-05, |
|
"loss": 1.7249, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 2.3583180987202925, |
|
"grad_norm": 89877.7109375, |
|
"learning_rate": 2.7681992337164753e-05, |
|
"loss": 1.6901, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 2.362888482632541, |
|
"grad_norm": 138945.6875, |
|
"learning_rate": 2.763409961685824e-05, |
|
"loss": 1.6468, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 2.3674588665447898, |
|
"grad_norm": 58679.375, |
|
"learning_rate": 2.7586206896551727e-05, |
|
"loss": 1.6838, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 2.3720292504570386, |
|
"grad_norm": 95501.8671875, |
|
"learning_rate": 2.753831417624521e-05, |
|
"loss": 1.682, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 2.376599634369287, |
|
"grad_norm": 76119.296875, |
|
"learning_rate": 2.7490421455938697e-05, |
|
"loss": 1.8395, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 2.3811700182815354, |
|
"grad_norm": 108761.65625, |
|
"learning_rate": 2.7442528735632184e-05, |
|
"loss": 1.7638, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 2.3857404021937842, |
|
"grad_norm": 99530.703125, |
|
"learning_rate": 2.739463601532567e-05, |
|
"loss": 1.6688, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 2.390310786106033, |
|
"grad_norm": 73215.6171875, |
|
"learning_rate": 2.7346743295019158e-05, |
|
"loss": 1.6266, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 2.3948811700182815, |
|
"grad_norm": 94147.75, |
|
"learning_rate": 2.7298850574712648e-05, |
|
"loss": 1.7344, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 2.3994515539305303, |
|
"grad_norm": 80156.09375, |
|
"learning_rate": 2.7250957854406135e-05, |
|
"loss": 1.7074, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 2.4040219378427787, |
|
"grad_norm": 54975.70703125, |
|
"learning_rate": 2.720306513409962e-05, |
|
"loss": 1.6535, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 2.4085923217550276, |
|
"grad_norm": 64294.23828125, |
|
"learning_rate": 2.7155172413793105e-05, |
|
"loss": 1.6997, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 2.413162705667276, |
|
"grad_norm": 83260.0, |
|
"learning_rate": 2.7107279693486592e-05, |
|
"loss": 1.7617, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 2.417733089579525, |
|
"grad_norm": 79186.6484375, |
|
"learning_rate": 2.705938697318008e-05, |
|
"loss": 1.6661, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 2.422303473491773, |
|
"grad_norm": 98957.21875, |
|
"learning_rate": 2.7011494252873566e-05, |
|
"loss": 1.6771, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 2.426873857404022, |
|
"grad_norm": 71378.125, |
|
"learning_rate": 2.6963601532567053e-05, |
|
"loss": 1.6975, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 2.4314442413162705, |
|
"grad_norm": 81879.71875, |
|
"learning_rate": 2.6915708812260536e-05, |
|
"loss": 1.774, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 2.4360146252285193, |
|
"grad_norm": 57695.0078125, |
|
"learning_rate": 2.6867816091954023e-05, |
|
"loss": 1.6981, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 2.4405850091407677, |
|
"grad_norm": 84480.6328125, |
|
"learning_rate": 2.681992337164751e-05, |
|
"loss": 1.7748, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 2.4451553930530165, |
|
"grad_norm": 71991.0859375, |
|
"learning_rate": 2.6772030651340997e-05, |
|
"loss": 1.6582, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 2.449725776965265, |
|
"grad_norm": 84038.8984375, |
|
"learning_rate": 2.672413793103448e-05, |
|
"loss": 1.7313, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 2.454296160877514, |
|
"grad_norm": 61137.41015625, |
|
"learning_rate": 2.6676245210727967e-05, |
|
"loss": 1.686, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 2.458866544789762, |
|
"grad_norm": 82829.7734375, |
|
"learning_rate": 2.662835249042146e-05, |
|
"loss": 1.6642, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 2.463436928702011, |
|
"grad_norm": 77371.140625, |
|
"learning_rate": 2.6580459770114948e-05, |
|
"loss": 1.7747, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 2.4680073126142594, |
|
"grad_norm": 109782.984375, |
|
"learning_rate": 2.653256704980843e-05, |
|
"loss": 1.7706, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 2.4725776965265083, |
|
"grad_norm": 99484.78125, |
|
"learning_rate": 2.6484674329501918e-05, |
|
"loss": 1.7255, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 2.4771480804387567, |
|
"grad_norm": 97897.8671875, |
|
"learning_rate": 2.6436781609195405e-05, |
|
"loss": 1.8301, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 2.4817184643510055, |
|
"grad_norm": 69454.625, |
|
"learning_rate": 2.6388888888888892e-05, |
|
"loss": 1.833, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 2.4862888482632544, |
|
"grad_norm": 51202.9765625, |
|
"learning_rate": 2.6340996168582375e-05, |
|
"loss": 1.8042, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 2.4908592321755028, |
|
"grad_norm": 62593.1015625, |
|
"learning_rate": 2.6293103448275862e-05, |
|
"loss": 1.7295, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 2.495429616087751, |
|
"grad_norm": 62147.26953125, |
|
"learning_rate": 2.624521072796935e-05, |
|
"loss": 1.7026, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 105682.3515625, |
|
"learning_rate": 2.6197318007662836e-05, |
|
"loss": 1.6866, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 2.504570383912249, |
|
"grad_norm": 133388.953125, |
|
"learning_rate": 2.6149425287356323e-05, |
|
"loss": 1.6257, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 2.5091407678244972, |
|
"grad_norm": 78931.4609375, |
|
"learning_rate": 2.6101532567049806e-05, |
|
"loss": 1.7367, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 2.5137111517367456, |
|
"grad_norm": 58830.1484375, |
|
"learning_rate": 2.6053639846743293e-05, |
|
"loss": 1.6935, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.5182815356489945, |
|
"grad_norm": 154903.03125, |
|
"learning_rate": 2.600574712643678e-05, |
|
"loss": 1.6548, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 2.5228519195612433, |
|
"grad_norm": 71029.5859375, |
|
"learning_rate": 2.595785440613027e-05, |
|
"loss": 1.7459, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 2.5274223034734917, |
|
"grad_norm": 76854.0, |
|
"learning_rate": 2.5909961685823757e-05, |
|
"loss": 1.7737, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 2.53199268738574, |
|
"grad_norm": 78467.3984375, |
|
"learning_rate": 2.5862068965517244e-05, |
|
"loss": 1.728, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 2.536563071297989, |
|
"grad_norm": 81775.109375, |
|
"learning_rate": 2.581417624521073e-05, |
|
"loss": 1.6481, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 2.541133455210238, |
|
"grad_norm": 59482.62890625, |
|
"learning_rate": 2.5766283524904218e-05, |
|
"loss": 1.7565, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 2.545703839122486, |
|
"grad_norm": 114247.8671875, |
|
"learning_rate": 2.57183908045977e-05, |
|
"loss": 1.6725, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 2.550274223034735, |
|
"grad_norm": 69261.0390625, |
|
"learning_rate": 2.5670498084291188e-05, |
|
"loss": 1.7034, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 2.5548446069469835, |
|
"grad_norm": 88227.703125, |
|
"learning_rate": 2.5622605363984675e-05, |
|
"loss": 1.6888, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 2.5594149908592323, |
|
"grad_norm": 123599.8828125, |
|
"learning_rate": 2.5574712643678162e-05, |
|
"loss": 1.6814, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 2.5639853747714807, |
|
"grad_norm": 61095.8984375, |
|
"learning_rate": 2.552681992337165e-05, |
|
"loss": 1.6875, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 2.5685557586837295, |
|
"grad_norm": 71797.8671875, |
|
"learning_rate": 2.5478927203065132e-05, |
|
"loss": 1.6945, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 2.573126142595978, |
|
"grad_norm": 70069.7890625, |
|
"learning_rate": 2.543103448275862e-05, |
|
"loss": 1.6055, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 2.577696526508227, |
|
"grad_norm": 90758.25, |
|
"learning_rate": 2.5383141762452106e-05, |
|
"loss": 1.7105, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 2.582266910420475, |
|
"grad_norm": 73838.0390625, |
|
"learning_rate": 2.5335249042145593e-05, |
|
"loss": 1.7238, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 2.586837294332724, |
|
"grad_norm": 169500.65625, |
|
"learning_rate": 2.5287356321839083e-05, |
|
"loss": 1.6286, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 2.5914076782449724, |
|
"grad_norm": 86502.6484375, |
|
"learning_rate": 2.523946360153257e-05, |
|
"loss": 1.7394, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 2.5959780621572213, |
|
"grad_norm": 77630.203125, |
|
"learning_rate": 2.5191570881226057e-05, |
|
"loss": 1.6857, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 2.60054844606947, |
|
"grad_norm": 78792.4921875, |
|
"learning_rate": 2.5143678160919544e-05, |
|
"loss": 1.7818, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 2.6051188299817185, |
|
"grad_norm": 61021.87890625, |
|
"learning_rate": 2.5095785440613027e-05, |
|
"loss": 1.6825, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 2.609689213893967, |
|
"grad_norm": 93845.828125, |
|
"learning_rate": 2.5047892720306514e-05, |
|
"loss": 1.7818, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 2.6142595978062158, |
|
"grad_norm": 77104.4296875, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.6891, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 2.6188299817184646, |
|
"grad_norm": 67157.0625, |
|
"learning_rate": 2.4952107279693488e-05, |
|
"loss": 1.7418, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 2.623400365630713, |
|
"grad_norm": 117816.0078125, |
|
"learning_rate": 2.4904214559386975e-05, |
|
"loss": 1.6851, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 2.6279707495429614, |
|
"grad_norm": 64429.0546875, |
|
"learning_rate": 2.485632183908046e-05, |
|
"loss": 1.7577, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 2.6325411334552102, |
|
"grad_norm": 99433.84375, |
|
"learning_rate": 2.480842911877395e-05, |
|
"loss": 1.6719, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 2.637111517367459, |
|
"grad_norm": 129014.9609375, |
|
"learning_rate": 2.4760536398467436e-05, |
|
"loss": 1.8557, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 2.6416819012797075, |
|
"grad_norm": 51642.76171875, |
|
"learning_rate": 2.4712643678160922e-05, |
|
"loss": 1.71, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 2.646252285191956, |
|
"grad_norm": 92177.1875, |
|
"learning_rate": 2.4664750957854406e-05, |
|
"loss": 1.6415, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 2.6508226691042047, |
|
"grad_norm": 83833.5546875, |
|
"learning_rate": 2.4616858237547893e-05, |
|
"loss": 1.6863, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 2.6553930530164536, |
|
"grad_norm": 83966.53125, |
|
"learning_rate": 2.456896551724138e-05, |
|
"loss": 1.7626, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 2.659963436928702, |
|
"grad_norm": 94047.453125, |
|
"learning_rate": 2.4521072796934867e-05, |
|
"loss": 1.7408, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 2.664533820840951, |
|
"grad_norm": 107394.71875, |
|
"learning_rate": 2.4473180076628353e-05, |
|
"loss": 1.7078, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 2.669104204753199, |
|
"grad_norm": 96418.6328125, |
|
"learning_rate": 2.442528735632184e-05, |
|
"loss": 1.7023, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 2.673674588665448, |
|
"grad_norm": 115618.6015625, |
|
"learning_rate": 2.4377394636015327e-05, |
|
"loss": 1.7811, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 2.6782449725776964, |
|
"grad_norm": 102043.5078125, |
|
"learning_rate": 2.4329501915708814e-05, |
|
"loss": 1.761, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 2.6828153564899453, |
|
"grad_norm": 85863.6953125, |
|
"learning_rate": 2.42816091954023e-05, |
|
"loss": 1.786, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 2.6873857404021937, |
|
"grad_norm": 105787.890625, |
|
"learning_rate": 2.4233716475095784e-05, |
|
"loss": 1.6736, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 2.6919561243144425, |
|
"grad_norm": 87654.65625, |
|
"learning_rate": 2.418582375478927e-05, |
|
"loss": 1.7352, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 2.696526508226691, |
|
"grad_norm": 65512.890625, |
|
"learning_rate": 2.413793103448276e-05, |
|
"loss": 1.6165, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 2.7010968921389398, |
|
"grad_norm": 96425.09375, |
|
"learning_rate": 2.409003831417625e-05, |
|
"loss": 1.6336, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 2.705667276051188, |
|
"grad_norm": 64857.203125, |
|
"learning_rate": 2.4042145593869732e-05, |
|
"loss": 1.7171, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 2.710237659963437, |
|
"grad_norm": 59102.8828125, |
|
"learning_rate": 2.399425287356322e-05, |
|
"loss": 1.7046, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 2.7148080438756854, |
|
"grad_norm": 89212.4140625, |
|
"learning_rate": 2.3946360153256706e-05, |
|
"loss": 1.6989, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 2.7193784277879343, |
|
"grad_norm": 75463.0859375, |
|
"learning_rate": 2.3898467432950193e-05, |
|
"loss": 1.7416, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 2.7239488117001827, |
|
"grad_norm": 107514.09375, |
|
"learning_rate": 2.385057471264368e-05, |
|
"loss": 1.7762, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 2.7285191956124315, |
|
"grad_norm": 97020.46875, |
|
"learning_rate": 2.3802681992337166e-05, |
|
"loss": 1.6877, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 2.7330895795246803, |
|
"grad_norm": 100322.8515625, |
|
"learning_rate": 2.3754789272030653e-05, |
|
"loss": 1.7148, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 2.7376599634369287, |
|
"grad_norm": 66746.328125, |
|
"learning_rate": 2.370689655172414e-05, |
|
"loss": 1.7645, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 2.742230347349177, |
|
"grad_norm": 50676.11328125, |
|
"learning_rate": 2.3659003831417627e-05, |
|
"loss": 1.7107, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.746800731261426, |
|
"grad_norm": 53923.9921875, |
|
"learning_rate": 2.361111111111111e-05, |
|
"loss": 1.7802, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 2.751371115173675, |
|
"grad_norm": 61331.36328125, |
|
"learning_rate": 2.3563218390804597e-05, |
|
"loss": 1.682, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 2.7559414990859232, |
|
"grad_norm": 92563.5078125, |
|
"learning_rate": 2.3515325670498088e-05, |
|
"loss": 1.8056, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 2.7605118829981716, |
|
"grad_norm": 103113.2578125, |
|
"learning_rate": 2.3467432950191575e-05, |
|
"loss": 1.6654, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 2.7650822669104205, |
|
"grad_norm": 83672.0859375, |
|
"learning_rate": 2.3419540229885058e-05, |
|
"loss": 1.6853, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 2.7696526508226693, |
|
"grad_norm": 66880.90625, |
|
"learning_rate": 2.3371647509578545e-05, |
|
"loss": 1.7165, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 2.7742230347349177, |
|
"grad_norm": 159394.40625, |
|
"learning_rate": 2.3323754789272032e-05, |
|
"loss": 1.6777, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 2.778793418647166, |
|
"grad_norm": 103960.3203125, |
|
"learning_rate": 2.327586206896552e-05, |
|
"loss": 1.7345, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 2.783363802559415, |
|
"grad_norm": 89361.9609375, |
|
"learning_rate": 2.3227969348659002e-05, |
|
"loss": 1.8153, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 2.787934186471664, |
|
"grad_norm": 48453.51953125, |
|
"learning_rate": 2.3180076628352492e-05, |
|
"loss": 1.7499, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 2.792504570383912, |
|
"grad_norm": 60024.06640625, |
|
"learning_rate": 2.313218390804598e-05, |
|
"loss": 1.6522, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 2.797074954296161, |
|
"grad_norm": 76540.078125, |
|
"learning_rate": 2.3084291187739466e-05, |
|
"loss": 1.7614, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 2.8016453382084094, |
|
"grad_norm": 127741.5390625, |
|
"learning_rate": 2.303639846743295e-05, |
|
"loss": 1.7328, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 2.8062157221206583, |
|
"grad_norm": 92570.3828125, |
|
"learning_rate": 2.2988505747126437e-05, |
|
"loss": 1.6602, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 2.8107861060329067, |
|
"grad_norm": 74026.5, |
|
"learning_rate": 2.2940613026819923e-05, |
|
"loss": 1.6413, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 2.8153564899451555, |
|
"grad_norm": 142319.03125, |
|
"learning_rate": 2.289272030651341e-05, |
|
"loss": 1.6652, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 2.819926873857404, |
|
"grad_norm": 66725.7578125, |
|
"learning_rate": 2.2844827586206897e-05, |
|
"loss": 1.7805, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 2.8244972577696528, |
|
"grad_norm": 204509.578125, |
|
"learning_rate": 2.2796934865900384e-05, |
|
"loss": 1.7006, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 2.829067641681901, |
|
"grad_norm": 81922.28125, |
|
"learning_rate": 2.274904214559387e-05, |
|
"loss": 1.7177, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 2.83363802559415, |
|
"grad_norm": 79576.234375, |
|
"learning_rate": 2.2701149425287358e-05, |
|
"loss": 1.7878, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 2.8382084095063984, |
|
"grad_norm": 94908.828125, |
|
"learning_rate": 2.2653256704980845e-05, |
|
"loss": 1.6381, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 2.8427787934186473, |
|
"grad_norm": 54743.4921875, |
|
"learning_rate": 2.2605363984674328e-05, |
|
"loss": 1.6953, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 2.8473491773308957, |
|
"grad_norm": 81304.8203125, |
|
"learning_rate": 2.2557471264367815e-05, |
|
"loss": 1.7417, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 2.8519195612431445, |
|
"grad_norm": 61965.26171875, |
|
"learning_rate": 2.2509578544061305e-05, |
|
"loss": 1.7189, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 2.856489945155393, |
|
"grad_norm": 74507.875, |
|
"learning_rate": 2.2461685823754792e-05, |
|
"loss": 1.6891, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 2.8610603290676417, |
|
"grad_norm": 83252.703125, |
|
"learning_rate": 2.2413793103448276e-05, |
|
"loss": 1.6622, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 2.8656307129798906, |
|
"grad_norm": 74911.6484375, |
|
"learning_rate": 2.2365900383141763e-05, |
|
"loss": 1.684, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 2.870201096892139, |
|
"grad_norm": 67471.203125, |
|
"learning_rate": 2.231800766283525e-05, |
|
"loss": 1.7436, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 2.8747714808043874, |
|
"grad_norm": 54812.26953125, |
|
"learning_rate": 2.2270114942528736e-05, |
|
"loss": 1.7817, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 2.8793418647166362, |
|
"grad_norm": 93858.140625, |
|
"learning_rate": 2.2222222222222223e-05, |
|
"loss": 1.721, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 2.883912248628885, |
|
"grad_norm": 64988.40234375, |
|
"learning_rate": 2.217432950191571e-05, |
|
"loss": 1.7022, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 2.8884826325411335, |
|
"grad_norm": 70428.0546875, |
|
"learning_rate": 2.2126436781609197e-05, |
|
"loss": 1.7059, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 2.893053016453382, |
|
"grad_norm": 82495.4453125, |
|
"learning_rate": 2.2078544061302684e-05, |
|
"loss": 1.8106, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 2.8976234003656307, |
|
"grad_norm": 81380.859375, |
|
"learning_rate": 2.203065134099617e-05, |
|
"loss": 1.6888, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 2.9021937842778796, |
|
"grad_norm": 79387.8125, |
|
"learning_rate": 2.1982758620689654e-05, |
|
"loss": 1.6739, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 2.906764168190128, |
|
"grad_norm": 139392.453125, |
|
"learning_rate": 2.193486590038314e-05, |
|
"loss": 1.6741, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 2.9113345521023763, |
|
"grad_norm": 87657.875, |
|
"learning_rate": 2.1886973180076628e-05, |
|
"loss": 1.7429, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 2.915904936014625, |
|
"grad_norm": 62421.78125, |
|
"learning_rate": 2.183908045977012e-05, |
|
"loss": 1.7148, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 2.920475319926874, |
|
"grad_norm": 126881.4375, |
|
"learning_rate": 2.1791187739463602e-05, |
|
"loss": 1.6232, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 2.9250457038391224, |
|
"grad_norm": 102633.1015625, |
|
"learning_rate": 2.174329501915709e-05, |
|
"loss": 1.7049, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 2.9296160877513713, |
|
"grad_norm": 102789.2265625, |
|
"learning_rate": 2.1695402298850576e-05, |
|
"loss": 1.7001, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 2.9341864716636197, |
|
"grad_norm": 79177.21875, |
|
"learning_rate": 2.1647509578544062e-05, |
|
"loss": 1.7525, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 2.9387568555758685, |
|
"grad_norm": 80986.46875, |
|
"learning_rate": 2.159961685823755e-05, |
|
"loss": 1.6458, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 2.943327239488117, |
|
"grad_norm": 97659.375, |
|
"learning_rate": 2.1551724137931033e-05, |
|
"loss": 1.7678, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 2.9478976234003658, |
|
"grad_norm": 97846.609375, |
|
"learning_rate": 2.1503831417624523e-05, |
|
"loss": 1.7128, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 2.952468007312614, |
|
"grad_norm": 74223.375, |
|
"learning_rate": 2.145593869731801e-05, |
|
"loss": 1.8434, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 2.957038391224863, |
|
"grad_norm": 59553.359375, |
|
"learning_rate": 2.1408045977011497e-05, |
|
"loss": 1.7717, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 2.9616087751371114, |
|
"grad_norm": 57012.078125, |
|
"learning_rate": 2.136015325670498e-05, |
|
"loss": 1.6865, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 2.9661791590493602, |
|
"grad_norm": 57963.9609375, |
|
"learning_rate": 2.1312260536398467e-05, |
|
"loss": 1.6966, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 2.9707495429616086, |
|
"grad_norm": 82150.6015625, |
|
"learning_rate": 2.1264367816091954e-05, |
|
"loss": 1.7219, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 2.9753199268738575, |
|
"grad_norm": 49790.92578125, |
|
"learning_rate": 2.1216475095785444e-05, |
|
"loss": 1.7301, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 2.979890310786106, |
|
"grad_norm": 77082.4140625, |
|
"learning_rate": 2.1168582375478928e-05, |
|
"loss": 1.6742, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 2.9844606946983547, |
|
"grad_norm": 123639.671875, |
|
"learning_rate": 2.1120689655172415e-05, |
|
"loss": 1.6256, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 2.989031078610603, |
|
"grad_norm": 67240.890625, |
|
"learning_rate": 2.10727969348659e-05, |
|
"loss": 1.7623, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 2.993601462522852, |
|
"grad_norm": 71455.7890625, |
|
"learning_rate": 2.102490421455939e-05, |
|
"loss": 1.6758, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 2.998171846435101, |
|
"grad_norm": 155724.125, |
|
"learning_rate": 2.0977011494252875e-05, |
|
"loss": 1.6401, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 1.695529818534851, |
|
"eval_runtime": 345.831, |
|
"eval_samples_per_second": 43.374, |
|
"eval_steps_per_second": 1.356, |
|
"step": 6564 |
|
}, |
|
{ |
|
"epoch": 3.002742230347349, |
|
"grad_norm": 71896.8515625, |
|
"learning_rate": 2.092911877394636e-05, |
|
"loss": 1.7537, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 3.0073126142595976, |
|
"grad_norm": 61401.01171875, |
|
"learning_rate": 2.088122605363985e-05, |
|
"loss": 1.6979, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 3.0118829981718465, |
|
"grad_norm": 108287.78125, |
|
"learning_rate": 2.0833333333333336e-05, |
|
"loss": 1.7115, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 3.016453382084095, |
|
"grad_norm": 108027.125, |
|
"learning_rate": 2.0785440613026823e-05, |
|
"loss": 1.6887, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 3.0210237659963437, |
|
"grad_norm": 89622.265625, |
|
"learning_rate": 2.0737547892720306e-05, |
|
"loss": 1.6484, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 3.025594149908592, |
|
"grad_norm": 116170.4921875, |
|
"learning_rate": 2.0689655172413793e-05, |
|
"loss": 1.723, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 3.030164533820841, |
|
"grad_norm": 76070.9765625, |
|
"learning_rate": 2.064176245210728e-05, |
|
"loss": 1.6649, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 3.03473491773309, |
|
"grad_norm": 86966.0859375, |
|
"learning_rate": 2.0593869731800767e-05, |
|
"loss": 1.6613, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 3.039305301645338, |
|
"grad_norm": 101902.5, |
|
"learning_rate": 2.0545977011494254e-05, |
|
"loss": 1.6718, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 3.043875685557587, |
|
"grad_norm": 45214.640625, |
|
"learning_rate": 2.049808429118774e-05, |
|
"loss": 1.7025, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 3.0484460694698354, |
|
"grad_norm": 61494.140625, |
|
"learning_rate": 2.0450191570881228e-05, |
|
"loss": 1.7289, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 3.0530164533820843, |
|
"grad_norm": 136512.90625, |
|
"learning_rate": 2.0402298850574715e-05, |
|
"loss": 1.7768, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 3.0575868372943327, |
|
"grad_norm": 106828.390625, |
|
"learning_rate": 2.03544061302682e-05, |
|
"loss": 1.7891, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 3.0621572212065815, |
|
"grad_norm": 54863.59765625, |
|
"learning_rate": 2.0306513409961685e-05, |
|
"loss": 1.6614, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 3.06672760511883, |
|
"grad_norm": 95806.8984375, |
|
"learning_rate": 2.0258620689655172e-05, |
|
"loss": 1.6815, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 3.0712979890310788, |
|
"grad_norm": 70664.6875, |
|
"learning_rate": 2.0210727969348662e-05, |
|
"loss": 1.6989, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 3.075868372943327, |
|
"grad_norm": 118271.9921875, |
|
"learning_rate": 2.016283524904215e-05, |
|
"loss": 1.8092, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 3.080438756855576, |
|
"grad_norm": 82462.28125, |
|
"learning_rate": 2.0114942528735632e-05, |
|
"loss": 1.8018, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 3.0850091407678244, |
|
"grad_norm": 69368.5546875, |
|
"learning_rate": 2.006704980842912e-05, |
|
"loss": 1.6884, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 3.0895795246800732, |
|
"grad_norm": 84815.4609375, |
|
"learning_rate": 2.0019157088122606e-05, |
|
"loss": 1.695, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 3.0941499085923216, |
|
"grad_norm": 117484.3125, |
|
"learning_rate": 1.9971264367816093e-05, |
|
"loss": 1.799, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 3.0987202925045705, |
|
"grad_norm": 74344.625, |
|
"learning_rate": 1.992337164750958e-05, |
|
"loss": 1.7173, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 3.103290676416819, |
|
"grad_norm": 61023.3359375, |
|
"learning_rate": 1.9875478927203067e-05, |
|
"loss": 1.7335, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 3.1078610603290677, |
|
"grad_norm": 80261.34375, |
|
"learning_rate": 1.9827586206896554e-05, |
|
"loss": 1.6618, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 3.112431444241316, |
|
"grad_norm": 44226.015625, |
|
"learning_rate": 1.977969348659004e-05, |
|
"loss": 1.6994, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 3.117001828153565, |
|
"grad_norm": 62552.55078125, |
|
"learning_rate": 1.9731800766283527e-05, |
|
"loss": 1.6826, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 3.1215722120658134, |
|
"grad_norm": 126776.40625, |
|
"learning_rate": 1.968390804597701e-05, |
|
"loss": 1.6274, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 3.126142595978062, |
|
"grad_norm": 71035.0234375, |
|
"learning_rate": 1.9636015325670498e-05, |
|
"loss": 1.8007, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 3.1307129798903106, |
|
"grad_norm": 78976.2265625, |
|
"learning_rate": 1.9588122605363985e-05, |
|
"loss": 1.727, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 3.1352833638025595, |
|
"grad_norm": 83373.7265625, |
|
"learning_rate": 1.9540229885057475e-05, |
|
"loss": 1.7554, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 3.139853747714808, |
|
"grad_norm": 165484.453125, |
|
"learning_rate": 1.949233716475096e-05, |
|
"loss": 1.621, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 3.1444241316270567, |
|
"grad_norm": 62350.18359375, |
|
"learning_rate": 1.9444444444444445e-05, |
|
"loss": 1.8118, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 3.148994515539305, |
|
"grad_norm": 64155.27734375, |
|
"learning_rate": 1.9396551724137932e-05, |
|
"loss": 1.7245, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 3.153564899451554, |
|
"grad_norm": 69873.4375, |
|
"learning_rate": 1.934865900383142e-05, |
|
"loss": 1.7768, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 3.1581352833638023, |
|
"grad_norm": 122451.703125, |
|
"learning_rate": 1.9300766283524903e-05, |
|
"loss": 1.7436, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 3.162705667276051, |
|
"grad_norm": 50876.828125, |
|
"learning_rate": 1.925287356321839e-05, |
|
"loss": 1.7615, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 3.1672760511883, |
|
"grad_norm": 68587.875, |
|
"learning_rate": 1.920498084291188e-05, |
|
"loss": 1.6389, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 3.1718464351005484, |
|
"grad_norm": 111575.640625, |
|
"learning_rate": 1.9157088122605367e-05, |
|
"loss": 1.7055, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 3.1764168190127973, |
|
"grad_norm": 68116.921875, |
|
"learning_rate": 1.910919540229885e-05, |
|
"loss": 1.7635, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 3.1809872029250457, |
|
"grad_norm": 75995.734375, |
|
"learning_rate": 1.9061302681992337e-05, |
|
"loss": 1.7208, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 3.1855575868372945, |
|
"grad_norm": 97217.1796875, |
|
"learning_rate": 1.9013409961685824e-05, |
|
"loss": 1.6942, |
|
"step": 6970 |
|
}, |
|
{ |
|
"epoch": 3.190127970749543, |
|
"grad_norm": 125494.984375, |
|
"learning_rate": 1.896551724137931e-05, |
|
"loss": 1.7897, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 3.1946983546617918, |
|
"grad_norm": 102539.3046875, |
|
"learning_rate": 1.8917624521072798e-05, |
|
"loss": 1.7186, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 3.19926873857404, |
|
"grad_norm": 92514.1640625, |
|
"learning_rate": 1.8869731800766285e-05, |
|
"loss": 1.7805, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 3.203839122486289, |
|
"grad_norm": 86951.125, |
|
"learning_rate": 1.882183908045977e-05, |
|
"loss": 1.738, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 3.2084095063985374, |
|
"grad_norm": 99123.890625, |
|
"learning_rate": 1.8773946360153258e-05, |
|
"loss": 1.6605, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 3.2129798903107862, |
|
"grad_norm": 61411.390625, |
|
"learning_rate": 1.8726053639846745e-05, |
|
"loss": 1.6291, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 3.2175502742230346, |
|
"grad_norm": 69628.1953125, |
|
"learning_rate": 1.867816091954023e-05, |
|
"loss": 1.7693, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 3.2221206581352835, |
|
"grad_norm": 74736.0390625, |
|
"learning_rate": 1.8630268199233716e-05, |
|
"loss": 1.6428, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 3.226691042047532, |
|
"grad_norm": 93917.203125, |
|
"learning_rate": 1.8582375478927206e-05, |
|
"loss": 1.7189, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 3.2312614259597807, |
|
"grad_norm": 80656.5546875, |
|
"learning_rate": 1.8534482758620693e-05, |
|
"loss": 1.7485, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 3.235831809872029, |
|
"grad_norm": 72125.5546875, |
|
"learning_rate": 1.8486590038314176e-05, |
|
"loss": 1.644, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 3.240402193784278, |
|
"grad_norm": 86443.671875, |
|
"learning_rate": 1.8438697318007663e-05, |
|
"loss": 1.6747, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 3.2449725776965264, |
|
"grad_norm": 93107.2734375, |
|
"learning_rate": 1.839080459770115e-05, |
|
"loss": 1.705, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 3.249542961608775, |
|
"grad_norm": 68446.5078125, |
|
"learning_rate": 1.8342911877394637e-05, |
|
"loss": 1.7647, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 3.2541133455210236, |
|
"grad_norm": 91024.3125, |
|
"learning_rate": 1.8295019157088124e-05, |
|
"loss": 1.7323, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 3.2586837294332724, |
|
"grad_norm": 63286.3828125, |
|
"learning_rate": 1.824712643678161e-05, |
|
"loss": 1.7359, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 3.263254113345521, |
|
"grad_norm": 94550.0234375, |
|
"learning_rate": 1.8199233716475097e-05, |
|
"loss": 1.7065, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 3.2678244972577697, |
|
"grad_norm": 66774.734375, |
|
"learning_rate": 1.8151340996168584e-05, |
|
"loss": 1.6688, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 3.272394881170018, |
|
"grad_norm": 52998.1484375, |
|
"learning_rate": 1.810344827586207e-05, |
|
"loss": 1.7237, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 3.276965265082267, |
|
"grad_norm": 70673.5078125, |
|
"learning_rate": 1.8055555555555555e-05, |
|
"loss": 1.7609, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 3.2815356489945158, |
|
"grad_norm": 109352.7421875, |
|
"learning_rate": 1.800766283524904e-05, |
|
"loss": 1.6908, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 3.286106032906764, |
|
"grad_norm": 66609.03125, |
|
"learning_rate": 1.795977011494253e-05, |
|
"loss": 1.7241, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 3.2906764168190126, |
|
"grad_norm": 74225.8984375, |
|
"learning_rate": 1.791187739463602e-05, |
|
"loss": 1.7827, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 3.2952468007312614, |
|
"grad_norm": 116946.515625, |
|
"learning_rate": 1.7863984674329502e-05, |
|
"loss": 1.6128, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 3.2998171846435103, |
|
"grad_norm": 76768.5234375, |
|
"learning_rate": 1.781609195402299e-05, |
|
"loss": 1.7406, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 3.3043875685557587, |
|
"grad_norm": 107767.0625, |
|
"learning_rate": 1.7768199233716476e-05, |
|
"loss": 1.6913, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 3.3089579524680075, |
|
"grad_norm": 76932.5703125, |
|
"learning_rate": 1.7720306513409963e-05, |
|
"loss": 1.751, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 3.313528336380256, |
|
"grad_norm": 132700.34375, |
|
"learning_rate": 1.767241379310345e-05, |
|
"loss": 1.7305, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 3.3180987202925047, |
|
"grad_norm": 148178.984375, |
|
"learning_rate": 1.7624521072796933e-05, |
|
"loss": 1.6116, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 3.322669104204753, |
|
"grad_norm": 84747.203125, |
|
"learning_rate": 1.7576628352490424e-05, |
|
"loss": 1.7728, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 3.327239488117002, |
|
"grad_norm": 88323.5078125, |
|
"learning_rate": 1.752873563218391e-05, |
|
"loss": 1.6676, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 3.3318098720292504, |
|
"grad_norm": 55836.046875, |
|
"learning_rate": 1.7480842911877397e-05, |
|
"loss": 1.7439, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 3.3363802559414992, |
|
"grad_norm": 107173.328125, |
|
"learning_rate": 1.743295019157088e-05, |
|
"loss": 1.746, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 3.3409506398537476, |
|
"grad_norm": 63958.0078125, |
|
"learning_rate": 1.7385057471264368e-05, |
|
"loss": 1.6936, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 3.3455210237659965, |
|
"grad_norm": 100361.1484375, |
|
"learning_rate": 1.7337164750957855e-05, |
|
"loss": 1.6349, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 3.350091407678245, |
|
"grad_norm": 64250.95703125, |
|
"learning_rate": 1.728927203065134e-05, |
|
"loss": 1.715, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 3.3546617915904937, |
|
"grad_norm": 66948.6953125, |
|
"learning_rate": 1.7241379310344828e-05, |
|
"loss": 1.6291, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 3.359232175502742, |
|
"grad_norm": 83300.3125, |
|
"learning_rate": 1.7193486590038315e-05, |
|
"loss": 1.6724, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 3.363802559414991, |
|
"grad_norm": 88165.8125, |
|
"learning_rate": 1.7145593869731802e-05, |
|
"loss": 1.7536, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 3.3683729433272394, |
|
"grad_norm": 67886.234375, |
|
"learning_rate": 1.709770114942529e-05, |
|
"loss": 1.6798, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 3.372943327239488, |
|
"grad_norm": 64415.46484375, |
|
"learning_rate": 1.7049808429118776e-05, |
|
"loss": 1.7546, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 3.3775137111517366, |
|
"grad_norm": 75445.15625, |
|
"learning_rate": 1.700191570881226e-05, |
|
"loss": 1.6932, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 3.3820840950639854, |
|
"grad_norm": 122763.609375, |
|
"learning_rate": 1.6954022988505746e-05, |
|
"loss": 1.7015, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 3.386654478976234, |
|
"grad_norm": 113570.5546875, |
|
"learning_rate": 1.6906130268199236e-05, |
|
"loss": 1.7188, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 3.3912248628884827, |
|
"grad_norm": 103909.40625, |
|
"learning_rate": 1.6858237547892723e-05, |
|
"loss": 1.6516, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 3.395795246800731, |
|
"grad_norm": 72607.9375, |
|
"learning_rate": 1.6810344827586207e-05, |
|
"loss": 1.704, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 3.40036563071298, |
|
"grad_norm": 154061.578125, |
|
"learning_rate": 1.6762452107279694e-05, |
|
"loss": 1.769, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 3.4049360146252283, |
|
"grad_norm": 122622.734375, |
|
"learning_rate": 1.671455938697318e-05, |
|
"loss": 1.754, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 3.409506398537477, |
|
"grad_norm": 70141.734375, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 1.7117, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 3.414076782449726, |
|
"grad_norm": 82556.859375, |
|
"learning_rate": 1.6618773946360154e-05, |
|
"loss": 1.7317, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 3.4186471663619744, |
|
"grad_norm": 83208.140625, |
|
"learning_rate": 1.657088122605364e-05, |
|
"loss": 1.7346, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 3.423217550274223, |
|
"grad_norm": 102812.6171875, |
|
"learning_rate": 1.6522988505747128e-05, |
|
"loss": 1.7444, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 3.4277879341864717, |
|
"grad_norm": 90799.8359375, |
|
"learning_rate": 1.6475095785440615e-05, |
|
"loss": 1.7343, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 3.4323583180987205, |
|
"grad_norm": 91094.5390625, |
|
"learning_rate": 1.6427203065134102e-05, |
|
"loss": 1.7624, |
|
"step": 7510 |
|
}, |
|
{ |
|
"epoch": 3.436928702010969, |
|
"grad_norm": 69841.4296875, |
|
"learning_rate": 1.6379310344827585e-05, |
|
"loss": 1.7264, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 3.4414990859232177, |
|
"grad_norm": 122445.3515625, |
|
"learning_rate": 1.6331417624521072e-05, |
|
"loss": 1.8214, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 3.446069469835466, |
|
"grad_norm": 85346.1875, |
|
"learning_rate": 1.628352490421456e-05, |
|
"loss": 1.7169, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 3.450639853747715, |
|
"grad_norm": 46006.20703125, |
|
"learning_rate": 1.623563218390805e-05, |
|
"loss": 1.6682, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 3.4552102376599634, |
|
"grad_norm": 61423.13671875, |
|
"learning_rate": 1.6187739463601533e-05, |
|
"loss": 1.7414, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 3.4597806215722122, |
|
"grad_norm": 73209.5625, |
|
"learning_rate": 1.613984674329502e-05, |
|
"loss": 1.699, |
|
"step": 7570 |
|
}, |
|
{ |
|
"epoch": 3.4643510054844606, |
|
"grad_norm": 81988.2265625, |
|
"learning_rate": 1.6091954022988507e-05, |
|
"loss": 1.6394, |
|
"step": 7580 |
|
}, |
|
{ |
|
"epoch": 3.4689213893967095, |
|
"grad_norm": 128754.4140625, |
|
"learning_rate": 1.6044061302681994e-05, |
|
"loss": 1.7366, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 3.473491773308958, |
|
"grad_norm": 92045.3828125, |
|
"learning_rate": 1.5996168582375477e-05, |
|
"loss": 1.7727, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 3.4780621572212067, |
|
"grad_norm": 144295.390625, |
|
"learning_rate": 1.5948275862068967e-05, |
|
"loss": 1.6927, |
|
"step": 7610 |
|
}, |
|
{ |
|
"epoch": 3.482632541133455, |
|
"grad_norm": 54716.375, |
|
"learning_rate": 1.5900383141762454e-05, |
|
"loss": 1.7297, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 3.487202925045704, |
|
"grad_norm": 80850.1328125, |
|
"learning_rate": 1.585249042145594e-05, |
|
"loss": 1.639, |
|
"step": 7630 |
|
}, |
|
{ |
|
"epoch": 3.4917733089579523, |
|
"grad_norm": 114330.296875, |
|
"learning_rate": 1.5804597701149425e-05, |
|
"loss": 1.7673, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 3.496343692870201, |
|
"grad_norm": 58934.4921875, |
|
"learning_rate": 1.575670498084291e-05, |
|
"loss": 1.6763, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 3.5009140767824496, |
|
"grad_norm": 123695.609375, |
|
"learning_rate": 1.5708812260536398e-05, |
|
"loss": 1.7135, |
|
"step": 7660 |
|
}, |
|
{ |
|
"epoch": 3.5054844606946984, |
|
"grad_norm": 83289.6640625, |
|
"learning_rate": 1.5660919540229885e-05, |
|
"loss": 1.688, |
|
"step": 7670 |
|
}, |
|
{ |
|
"epoch": 3.510054844606947, |
|
"grad_norm": 100226.015625, |
|
"learning_rate": 1.5613026819923372e-05, |
|
"loss": 1.7304, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 3.5146252285191957, |
|
"grad_norm": 88909.984375, |
|
"learning_rate": 1.556513409961686e-05, |
|
"loss": 1.6723, |
|
"step": 7690 |
|
}, |
|
{ |
|
"epoch": 3.519195612431444, |
|
"grad_norm": 66940.3515625, |
|
"learning_rate": 1.5517241379310346e-05, |
|
"loss": 1.6159, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 3.523765996343693, |
|
"grad_norm": 88044.171875, |
|
"learning_rate": 1.5469348659003833e-05, |
|
"loss": 1.7853, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 3.5283363802559418, |
|
"grad_norm": 85045.421875, |
|
"learning_rate": 1.542145593869732e-05, |
|
"loss": 1.7666, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 3.53290676416819, |
|
"grad_norm": 60147.796875, |
|
"learning_rate": 1.5373563218390803e-05, |
|
"loss": 1.6683, |
|
"step": 7730 |
|
}, |
|
{ |
|
"epoch": 3.5374771480804386, |
|
"grad_norm": 82411.9609375, |
|
"learning_rate": 1.532567049808429e-05, |
|
"loss": 1.7323, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 3.5420475319926874, |
|
"grad_norm": 66054.59375, |
|
"learning_rate": 1.527777777777778e-05, |
|
"loss": 1.6714, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 3.5466179159049362, |
|
"grad_norm": 72301.625, |
|
"learning_rate": 1.5229885057471265e-05, |
|
"loss": 1.6496, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 3.5511882998171846, |
|
"grad_norm": 104870.84375, |
|
"learning_rate": 1.5181992337164752e-05, |
|
"loss": 1.729, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 3.555758683729433, |
|
"grad_norm": 155170.828125, |
|
"learning_rate": 1.5134099616858237e-05, |
|
"loss": 1.6892, |
|
"step": 7780 |
|
}, |
|
{ |
|
"epoch": 3.560329067641682, |
|
"grad_norm": 103218.2578125, |
|
"learning_rate": 1.5086206896551724e-05, |
|
"loss": 1.68, |
|
"step": 7790 |
|
}, |
|
{ |
|
"epoch": 3.5648994515539307, |
|
"grad_norm": 77247.625, |
|
"learning_rate": 1.5038314176245211e-05, |
|
"loss": 1.8163, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 3.569469835466179, |
|
"grad_norm": 62777.0859375, |
|
"learning_rate": 1.4990421455938696e-05, |
|
"loss": 1.7311, |
|
"step": 7810 |
|
}, |
|
{ |
|
"epoch": 3.5740402193784275, |
|
"grad_norm": 79059.1328125, |
|
"learning_rate": 1.4942528735632185e-05, |
|
"loss": 1.714, |
|
"step": 7820 |
|
}, |
|
{ |
|
"epoch": 3.5786106032906764, |
|
"grad_norm": 75726.578125, |
|
"learning_rate": 1.4894636015325672e-05, |
|
"loss": 1.724, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 3.583180987202925, |
|
"grad_norm": 62909.81640625, |
|
"learning_rate": 1.4846743295019159e-05, |
|
"loss": 1.6383, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 3.5877513711151736, |
|
"grad_norm": 80657.5234375, |
|
"learning_rate": 1.4798850574712644e-05, |
|
"loss": 1.7269, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 3.5923217550274225, |
|
"grad_norm": 73948.4375, |
|
"learning_rate": 1.475095785440613e-05, |
|
"loss": 1.6603, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 3.596892138939671, |
|
"grad_norm": 88141.875, |
|
"learning_rate": 1.4703065134099616e-05, |
|
"loss": 1.6842, |
|
"step": 7870 |
|
}, |
|
{ |
|
"epoch": 3.6014625228519197, |
|
"grad_norm": 78391.890625, |
|
"learning_rate": 1.4655172413793103e-05, |
|
"loss": 1.6908, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 3.606032906764168, |
|
"grad_norm": 65879.515625, |
|
"learning_rate": 1.4607279693486591e-05, |
|
"loss": 1.7438, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 3.610603290676417, |
|
"grad_norm": 137562.15625, |
|
"learning_rate": 1.4559386973180078e-05, |
|
"loss": 1.704, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 3.6151736745886653, |
|
"grad_norm": 99103.4375, |
|
"learning_rate": 1.4511494252873564e-05, |
|
"loss": 1.7265, |
|
"step": 7910 |
|
}, |
|
{ |
|
"epoch": 3.619744058500914, |
|
"grad_norm": 67414.4609375, |
|
"learning_rate": 1.446360153256705e-05, |
|
"loss": 1.7134, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 3.6243144424131626, |
|
"grad_norm": 109987.7265625, |
|
"learning_rate": 1.4415708812260537e-05, |
|
"loss": 1.6406, |
|
"step": 7930 |
|
}, |
|
{ |
|
"epoch": 3.6288848263254114, |
|
"grad_norm": 143015.703125, |
|
"learning_rate": 1.4367816091954022e-05, |
|
"loss": 1.7694, |
|
"step": 7940 |
|
}, |
|
{ |
|
"epoch": 3.63345521023766, |
|
"grad_norm": 87812.765625, |
|
"learning_rate": 1.431992337164751e-05, |
|
"loss": 1.7357, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 3.6380255941499087, |
|
"grad_norm": 84844.578125, |
|
"learning_rate": 1.4272030651340998e-05, |
|
"loss": 1.7893, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 3.642595978062157, |
|
"grad_norm": 72731.703125, |
|
"learning_rate": 1.4224137931034485e-05, |
|
"loss": 1.6586, |
|
"step": 7970 |
|
}, |
|
{ |
|
"epoch": 3.647166361974406, |
|
"grad_norm": 81956.890625, |
|
"learning_rate": 1.417624521072797e-05, |
|
"loss": 1.7693, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 3.6517367458866543, |
|
"grad_norm": 69217.53125, |
|
"learning_rate": 1.4128352490421457e-05, |
|
"loss": 1.7928, |
|
"step": 7990 |
|
}, |
|
{ |
|
"epoch": 3.656307129798903, |
|
"grad_norm": 54634.5703125, |
|
"learning_rate": 1.4080459770114942e-05, |
|
"loss": 1.7918, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 3.660877513711152, |
|
"grad_norm": 63817.8359375, |
|
"learning_rate": 1.4032567049808429e-05, |
|
"loss": 1.6763, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 3.6654478976234004, |
|
"grad_norm": 118554.0859375, |
|
"learning_rate": 1.3984674329501916e-05, |
|
"loss": 1.7032, |
|
"step": 8020 |
|
}, |
|
{ |
|
"epoch": 3.670018281535649, |
|
"grad_norm": 96849.4453125, |
|
"learning_rate": 1.3936781609195404e-05, |
|
"loss": 1.6672, |
|
"step": 8030 |
|
}, |
|
{ |
|
"epoch": 3.6745886654478976, |
|
"grad_norm": 138688.09375, |
|
"learning_rate": 1.388888888888889e-05, |
|
"loss": 1.6499, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 3.6791590493601465, |
|
"grad_norm": 71032.484375, |
|
"learning_rate": 1.3840996168582376e-05, |
|
"loss": 1.7859, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 3.683729433272395, |
|
"grad_norm": 105990.21875, |
|
"learning_rate": 1.3793103448275863e-05, |
|
"loss": 1.6542, |
|
"step": 8060 |
|
}, |
|
{ |
|
"epoch": 3.6882998171846433, |
|
"grad_norm": 119098.0859375, |
|
"learning_rate": 1.3745210727969348e-05, |
|
"loss": 1.7186, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 3.692870201096892, |
|
"grad_norm": 61243.96484375, |
|
"learning_rate": 1.3697318007662835e-05, |
|
"loss": 1.7328, |
|
"step": 8080 |
|
}, |
|
{ |
|
"epoch": 3.697440585009141, |
|
"grad_norm": 78637.296875, |
|
"learning_rate": 1.3649425287356324e-05, |
|
"loss": 1.6125, |
|
"step": 8090 |
|
}, |
|
{ |
|
"epoch": 3.7020109689213894, |
|
"grad_norm": 88676.828125, |
|
"learning_rate": 1.360153256704981e-05, |
|
"loss": 1.6784, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 3.7065813528336378, |
|
"grad_norm": 80246.65625, |
|
"learning_rate": 1.3553639846743296e-05, |
|
"loss": 1.6786, |
|
"step": 8110 |
|
}, |
|
{ |
|
"epoch": 3.7111517367458866, |
|
"grad_norm": 79097.7734375, |
|
"learning_rate": 1.3505747126436783e-05, |
|
"loss": 1.6875, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 3.7157221206581355, |
|
"grad_norm": 75883.453125, |
|
"learning_rate": 1.3457854406130268e-05, |
|
"loss": 1.6788, |
|
"step": 8130 |
|
}, |
|
{ |
|
"epoch": 3.720292504570384, |
|
"grad_norm": 87841.7734375, |
|
"learning_rate": 1.3409961685823755e-05, |
|
"loss": 1.7938, |
|
"step": 8140 |
|
}, |
|
{ |
|
"epoch": 3.7248628884826327, |
|
"grad_norm": 60471.46875, |
|
"learning_rate": 1.336206896551724e-05, |
|
"loss": 1.7017, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 3.729433272394881, |
|
"grad_norm": 117315.484375, |
|
"learning_rate": 1.331417624521073e-05, |
|
"loss": 1.652, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 3.73400365630713, |
|
"grad_norm": 81507.8984375, |
|
"learning_rate": 1.3266283524904216e-05, |
|
"loss": 1.7639, |
|
"step": 8170 |
|
}, |
|
{ |
|
"epoch": 3.7385740402193783, |
|
"grad_norm": 110054.6328125, |
|
"learning_rate": 1.3218390804597702e-05, |
|
"loss": 1.7429, |
|
"step": 8180 |
|
}, |
|
{ |
|
"epoch": 3.743144424131627, |
|
"grad_norm": 65638.3828125, |
|
"learning_rate": 1.3170498084291188e-05, |
|
"loss": 1.7599, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 3.7477148080438756, |
|
"grad_norm": 116608.078125, |
|
"learning_rate": 1.3122605363984675e-05, |
|
"loss": 1.7329, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 3.7522851919561244, |
|
"grad_norm": 87637.5078125, |
|
"learning_rate": 1.3074712643678161e-05, |
|
"loss": 1.7624, |
|
"step": 8210 |
|
}, |
|
{ |
|
"epoch": 3.756855575868373, |
|
"grad_norm": 94494.8125, |
|
"learning_rate": 1.3026819923371647e-05, |
|
"loss": 1.7554, |
|
"step": 8220 |
|
}, |
|
{ |
|
"epoch": 3.7614259597806217, |
|
"grad_norm": 69636.9296875, |
|
"learning_rate": 1.2978927203065135e-05, |
|
"loss": 1.758, |
|
"step": 8230 |
|
}, |
|
{ |
|
"epoch": 3.76599634369287, |
|
"grad_norm": 73185.421875, |
|
"learning_rate": 1.2931034482758622e-05, |
|
"loss": 1.7235, |
|
"step": 8240 |
|
}, |
|
{ |
|
"epoch": 3.770566727605119, |
|
"grad_norm": 94298.2265625, |
|
"learning_rate": 1.2883141762452109e-05, |
|
"loss": 1.6726, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 3.7751371115173673, |
|
"grad_norm": 99814.8671875, |
|
"learning_rate": 1.2835249042145594e-05, |
|
"loss": 1.6912, |
|
"step": 8260 |
|
}, |
|
{ |
|
"epoch": 3.779707495429616, |
|
"grad_norm": 68422.03125, |
|
"learning_rate": 1.2787356321839081e-05, |
|
"loss": 1.7406, |
|
"step": 8270 |
|
}, |
|
{ |
|
"epoch": 3.7842778793418645, |
|
"grad_norm": 82088.296875, |
|
"learning_rate": 1.2739463601532566e-05, |
|
"loss": 1.7526, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 3.7888482632541134, |
|
"grad_norm": 77173.703125, |
|
"learning_rate": 1.2691570881226053e-05, |
|
"loss": 1.7784, |
|
"step": 8290 |
|
}, |
|
{ |
|
"epoch": 3.7934186471663622, |
|
"grad_norm": 60719.59375, |
|
"learning_rate": 1.2643678160919542e-05, |
|
"loss": 1.7124, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 3.7979890310786106, |
|
"grad_norm": 71459.625, |
|
"learning_rate": 1.2595785440613029e-05, |
|
"loss": 1.6099, |
|
"step": 8310 |
|
}, |
|
{ |
|
"epoch": 3.802559414990859, |
|
"grad_norm": 57145.5546875, |
|
"learning_rate": 1.2547892720306514e-05, |
|
"loss": 1.7385, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 3.807129798903108, |
|
"grad_norm": 72725.9296875, |
|
"learning_rate": 1.25e-05, |
|
"loss": 1.7214, |
|
"step": 8330 |
|
}, |
|
{ |
|
"epoch": 3.8117001828153567, |
|
"grad_norm": 135925.65625, |
|
"learning_rate": 1.2452107279693487e-05, |
|
"loss": 1.719, |
|
"step": 8340 |
|
}, |
|
{ |
|
"epoch": 3.816270566727605, |
|
"grad_norm": 48055.78515625, |
|
"learning_rate": 1.2404214559386974e-05, |
|
"loss": 1.7011, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 3.8208409506398535, |
|
"grad_norm": 120826.3359375, |
|
"learning_rate": 1.2356321839080461e-05, |
|
"loss": 1.6556, |
|
"step": 8360 |
|
}, |
|
{ |
|
"epoch": 3.8254113345521024, |
|
"grad_norm": 98967.4453125, |
|
"learning_rate": 1.2308429118773946e-05, |
|
"loss": 1.6559, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 3.829981718464351, |
|
"grad_norm": 67471.8125, |
|
"learning_rate": 1.2260536398467433e-05, |
|
"loss": 1.6941, |
|
"step": 8380 |
|
}, |
|
{ |
|
"epoch": 3.8345521023765996, |
|
"grad_norm": 73149.8359375, |
|
"learning_rate": 1.221264367816092e-05, |
|
"loss": 1.7197, |
|
"step": 8390 |
|
}, |
|
{ |
|
"epoch": 3.839122486288848, |
|
"grad_norm": 59362.51953125, |
|
"learning_rate": 1.2164750957854407e-05, |
|
"loss": 1.7067, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 3.843692870201097, |
|
"grad_norm": 103423.5859375, |
|
"learning_rate": 1.2116858237547892e-05, |
|
"loss": 1.5936, |
|
"step": 8410 |
|
}, |
|
{ |
|
"epoch": 3.8482632541133457, |
|
"grad_norm": 68154.90625, |
|
"learning_rate": 1.206896551724138e-05, |
|
"loss": 1.7166, |
|
"step": 8420 |
|
}, |
|
{ |
|
"epoch": 3.852833638025594, |
|
"grad_norm": 81413.09375, |
|
"learning_rate": 1.2021072796934866e-05, |
|
"loss": 1.7183, |
|
"step": 8430 |
|
}, |
|
{ |
|
"epoch": 3.857404021937843, |
|
"grad_norm": 67458.328125, |
|
"learning_rate": 1.1973180076628353e-05, |
|
"loss": 1.7063, |
|
"step": 8440 |
|
}, |
|
{ |
|
"epoch": 3.8619744058500913, |
|
"grad_norm": 81477.78125, |
|
"learning_rate": 1.192528735632184e-05, |
|
"loss": 1.6622, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 3.86654478976234, |
|
"grad_norm": 94965.6953125, |
|
"learning_rate": 1.1877394636015327e-05, |
|
"loss": 1.6777, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 3.8711151736745886, |
|
"grad_norm": 64403.4375, |
|
"learning_rate": 1.1829501915708814e-05, |
|
"loss": 1.7471, |
|
"step": 8470 |
|
}, |
|
{ |
|
"epoch": 3.8756855575868374, |
|
"grad_norm": 72309.5859375, |
|
"learning_rate": 1.1781609195402299e-05, |
|
"loss": 1.7836, |
|
"step": 8480 |
|
}, |
|
{ |
|
"epoch": 3.880255941499086, |
|
"grad_norm": 80551.765625, |
|
"learning_rate": 1.1733716475095787e-05, |
|
"loss": 1.6478, |
|
"step": 8490 |
|
}, |
|
{ |
|
"epoch": 3.8848263254113347, |
|
"grad_norm": 86743.6015625, |
|
"learning_rate": 1.1685823754789272e-05, |
|
"loss": 1.6723, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 3.889396709323583, |
|
"grad_norm": 60500.5, |
|
"learning_rate": 1.163793103448276e-05, |
|
"loss": 1.779, |
|
"step": 8510 |
|
}, |
|
{ |
|
"epoch": 3.893967093235832, |
|
"grad_norm": 81024.5703125, |
|
"learning_rate": 1.1590038314176246e-05, |
|
"loss": 1.5701, |
|
"step": 8520 |
|
}, |
|
{ |
|
"epoch": 3.8985374771480803, |
|
"grad_norm": 80383.0234375, |
|
"learning_rate": 1.1542145593869733e-05, |
|
"loss": 1.7806, |
|
"step": 8530 |
|
}, |
|
{ |
|
"epoch": 3.903107861060329, |
|
"grad_norm": 76206.0, |
|
"learning_rate": 1.1494252873563218e-05, |
|
"loss": 1.6846, |
|
"step": 8540 |
|
}, |
|
{ |
|
"epoch": 3.9076782449725775, |
|
"grad_norm": 105761.515625, |
|
"learning_rate": 1.1446360153256705e-05, |
|
"loss": 1.7613, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 3.9122486288848264, |
|
"grad_norm": 76597.375, |
|
"learning_rate": 1.1398467432950192e-05, |
|
"loss": 1.6565, |
|
"step": 8560 |
|
}, |
|
{ |
|
"epoch": 3.916819012797075, |
|
"grad_norm": 73221.8984375, |
|
"learning_rate": 1.1350574712643679e-05, |
|
"loss": 1.7441, |
|
"step": 8570 |
|
}, |
|
{ |
|
"epoch": 3.9213893967093236, |
|
"grad_norm": 79467.125, |
|
"learning_rate": 1.1302681992337164e-05, |
|
"loss": 1.6712, |
|
"step": 8580 |
|
}, |
|
{ |
|
"epoch": 3.9259597806215725, |
|
"grad_norm": 72400.3125, |
|
"learning_rate": 1.1254789272030653e-05, |
|
"loss": 1.6967, |
|
"step": 8590 |
|
}, |
|
{ |
|
"epoch": 3.930530164533821, |
|
"grad_norm": 57804.4296875, |
|
"learning_rate": 1.1206896551724138e-05, |
|
"loss": 1.7333, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 3.9351005484460693, |
|
"grad_norm": 67051.7734375, |
|
"learning_rate": 1.1159003831417625e-05, |
|
"loss": 1.6614, |
|
"step": 8610 |
|
}, |
|
{ |
|
"epoch": 3.939670932358318, |
|
"grad_norm": 104811.15625, |
|
"learning_rate": 1.1111111111111112e-05, |
|
"loss": 1.7326, |
|
"step": 8620 |
|
}, |
|
{ |
|
"epoch": 3.944241316270567, |
|
"grad_norm": 110914.2265625, |
|
"learning_rate": 1.1063218390804599e-05, |
|
"loss": 1.6674, |
|
"step": 8630 |
|
}, |
|
{ |
|
"epoch": 3.9488117001828154, |
|
"grad_norm": 79537.1875, |
|
"learning_rate": 1.1015325670498085e-05, |
|
"loss": 1.6492, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 3.9533820840950638, |
|
"grad_norm": 60097.46484375, |
|
"learning_rate": 1.096743295019157e-05, |
|
"loss": 1.6846, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 3.9579524680073126, |
|
"grad_norm": 118254.2265625, |
|
"learning_rate": 1.091954022988506e-05, |
|
"loss": 1.6961, |
|
"step": 8660 |
|
}, |
|
{ |
|
"epoch": 3.9625228519195614, |
|
"grad_norm": 114773.8046875, |
|
"learning_rate": 1.0871647509578544e-05, |
|
"loss": 1.7114, |
|
"step": 8670 |
|
}, |
|
{ |
|
"epoch": 3.96709323583181, |
|
"grad_norm": 87263.125, |
|
"learning_rate": 1.0823754789272031e-05, |
|
"loss": 1.7308, |
|
"step": 8680 |
|
}, |
|
{ |
|
"epoch": 3.9716636197440582, |
|
"grad_norm": 66768.859375, |
|
"learning_rate": 1.0775862068965516e-05, |
|
"loss": 1.6712, |
|
"step": 8690 |
|
}, |
|
{ |
|
"epoch": 3.976234003656307, |
|
"grad_norm": 119385.375, |
|
"learning_rate": 1.0727969348659005e-05, |
|
"loss": 1.7558, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 3.980804387568556, |
|
"grad_norm": 71487.484375, |
|
"learning_rate": 1.068007662835249e-05, |
|
"loss": 1.7924, |
|
"step": 8710 |
|
}, |
|
{ |
|
"epoch": 3.9853747714808043, |
|
"grad_norm": 81396.0234375, |
|
"learning_rate": 1.0632183908045977e-05, |
|
"loss": 1.6232, |
|
"step": 8720 |
|
}, |
|
{ |
|
"epoch": 3.989945155393053, |
|
"grad_norm": 89533.171875, |
|
"learning_rate": 1.0584291187739464e-05, |
|
"loss": 1.7174, |
|
"step": 8730 |
|
}, |
|
{ |
|
"epoch": 3.9945155393053016, |
|
"grad_norm": 72157.4765625, |
|
"learning_rate": 1.053639846743295e-05, |
|
"loss": 1.7366, |
|
"step": 8740 |
|
}, |
|
{ |
|
"epoch": 3.9990859232175504, |
|
"grad_norm": 71103.40625, |
|
"learning_rate": 1.0488505747126438e-05, |
|
"loss": 1.6741, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 1.6915712356567383, |
|
"eval_runtime": 345.8106, |
|
"eval_samples_per_second": 43.376, |
|
"eval_steps_per_second": 1.356, |
|
"step": 8752 |
|
}, |
|
{ |
|
"epoch": 4.003656307129799, |
|
"grad_norm": 75867.875, |
|
"learning_rate": 1.0440613026819925e-05, |
|
"loss": 1.6729, |
|
"step": 8760 |
|
}, |
|
{ |
|
"epoch": 4.008226691042047, |
|
"grad_norm": 72915.7890625, |
|
"learning_rate": 1.0392720306513411e-05, |
|
"loss": 1.7197, |
|
"step": 8770 |
|
}, |
|
{ |
|
"epoch": 4.012797074954296, |
|
"grad_norm": 92003.453125, |
|
"learning_rate": 1.0344827586206897e-05, |
|
"loss": 1.6728, |
|
"step": 8780 |
|
}, |
|
{ |
|
"epoch": 4.017367458866545, |
|
"grad_norm": 120198.3359375, |
|
"learning_rate": 1.0296934865900384e-05, |
|
"loss": 1.7737, |
|
"step": 8790 |
|
}, |
|
{ |
|
"epoch": 4.021937842778794, |
|
"grad_norm": 102790.4375, |
|
"learning_rate": 1.024904214559387e-05, |
|
"loss": 1.7166, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 4.026508226691042, |
|
"grad_norm": 83954.15625, |
|
"learning_rate": 1.0201149425287357e-05, |
|
"loss": 1.6454, |
|
"step": 8810 |
|
}, |
|
{ |
|
"epoch": 4.0310786106032905, |
|
"grad_norm": 100407.6484375, |
|
"learning_rate": 1.0153256704980842e-05, |
|
"loss": 1.7711, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 4.035648994515539, |
|
"grad_norm": 70433.90625, |
|
"learning_rate": 1.0105363984674331e-05, |
|
"loss": 1.7616, |
|
"step": 8830 |
|
}, |
|
{ |
|
"epoch": 4.040219378427788, |
|
"grad_norm": 73853.703125, |
|
"learning_rate": 1.0057471264367816e-05, |
|
"loss": 1.7407, |
|
"step": 8840 |
|
}, |
|
{ |
|
"epoch": 4.044789762340036, |
|
"grad_norm": 89838.96875, |
|
"learning_rate": 1.0009578544061303e-05, |
|
"loss": 1.6869, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 4.049360146252285, |
|
"grad_norm": 98299.5859375, |
|
"learning_rate": 9.96168582375479e-06, |
|
"loss": 1.6584, |
|
"step": 8860 |
|
}, |
|
{ |
|
"epoch": 4.053930530164534, |
|
"grad_norm": 52650.97265625, |
|
"learning_rate": 9.913793103448277e-06, |
|
"loss": 1.7039, |
|
"step": 8870 |
|
}, |
|
{ |
|
"epoch": 4.058500914076783, |
|
"grad_norm": 98332.7890625, |
|
"learning_rate": 9.865900383141764e-06, |
|
"loss": 1.7345, |
|
"step": 8880 |
|
}, |
|
{ |
|
"epoch": 4.063071297989031, |
|
"grad_norm": 87076.296875, |
|
"learning_rate": 9.818007662835249e-06, |
|
"loss": 1.6923, |
|
"step": 8890 |
|
}, |
|
{ |
|
"epoch": 4.0676416819012795, |
|
"grad_norm": 54348.390625, |
|
"learning_rate": 9.770114942528738e-06, |
|
"loss": 1.7026, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 4.072212065813528, |
|
"grad_norm": 57868.62109375, |
|
"learning_rate": 9.722222222222223e-06, |
|
"loss": 1.7003, |
|
"step": 8910 |
|
}, |
|
{ |
|
"epoch": 4.076782449725777, |
|
"grad_norm": 65227.4375, |
|
"learning_rate": 9.67432950191571e-06, |
|
"loss": 1.5978, |
|
"step": 8920 |
|
}, |
|
{ |
|
"epoch": 4.081352833638025, |
|
"grad_norm": 105873.453125, |
|
"learning_rate": 9.626436781609195e-06, |
|
"loss": 1.7575, |
|
"step": 8930 |
|
}, |
|
{ |
|
"epoch": 4.085923217550274, |
|
"grad_norm": 112255.6640625, |
|
"learning_rate": 9.578544061302683e-06, |
|
"loss": 1.6445, |
|
"step": 8940 |
|
}, |
|
{ |
|
"epoch": 4.090493601462523, |
|
"grad_norm": 65130.7109375, |
|
"learning_rate": 9.530651340996169e-06, |
|
"loss": 1.7077, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 4.095063985374772, |
|
"grad_norm": 116178.6796875, |
|
"learning_rate": 9.482758620689655e-06, |
|
"loss": 1.7243, |
|
"step": 8960 |
|
}, |
|
{ |
|
"epoch": 4.0996343692870205, |
|
"grad_norm": 105348.765625, |
|
"learning_rate": 9.434865900383142e-06, |
|
"loss": 1.6588, |
|
"step": 8970 |
|
}, |
|
{ |
|
"epoch": 4.1042047531992685, |
|
"grad_norm": 64916.08203125, |
|
"learning_rate": 9.386973180076629e-06, |
|
"loss": 1.6274, |
|
"step": 8980 |
|
}, |
|
{ |
|
"epoch": 4.108775137111517, |
|
"grad_norm": 85616.9453125, |
|
"learning_rate": 9.339080459770114e-06, |
|
"loss": 1.7181, |
|
"step": 8990 |
|
}, |
|
{ |
|
"epoch": 4.113345521023766, |
|
"grad_norm": 56802.30859375, |
|
"learning_rate": 9.291187739463603e-06, |
|
"loss": 1.7205, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 4.117915904936015, |
|
"grad_norm": 106071.6015625, |
|
"learning_rate": 9.243295019157088e-06, |
|
"loss": 1.663, |
|
"step": 9010 |
|
}, |
|
{ |
|
"epoch": 4.122486288848263, |
|
"grad_norm": 84213.0859375, |
|
"learning_rate": 9.195402298850575e-06, |
|
"loss": 1.6674, |
|
"step": 9020 |
|
}, |
|
{ |
|
"epoch": 4.127056672760512, |
|
"grad_norm": 83103.328125, |
|
"learning_rate": 9.147509578544062e-06, |
|
"loss": 1.7418, |
|
"step": 9030 |
|
}, |
|
{ |
|
"epoch": 4.131627056672761, |
|
"grad_norm": 45266.80859375, |
|
"learning_rate": 9.099616858237549e-06, |
|
"loss": 1.6215, |
|
"step": 9040 |
|
}, |
|
{ |
|
"epoch": 4.1361974405850095, |
|
"grad_norm": 83939.390625, |
|
"learning_rate": 9.051724137931036e-06, |
|
"loss": 1.7258, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 4.140767824497257, |
|
"grad_norm": 98675.046875, |
|
"learning_rate": 9.00383141762452e-06, |
|
"loss": 1.6511, |
|
"step": 9060 |
|
}, |
|
{ |
|
"epoch": 4.145338208409506, |
|
"grad_norm": 78594.921875, |
|
"learning_rate": 8.95593869731801e-06, |
|
"loss": 1.6855, |
|
"step": 9070 |
|
}, |
|
{ |
|
"epoch": 4.149908592321755, |
|
"grad_norm": 78093.4609375, |
|
"learning_rate": 8.908045977011495e-06, |
|
"loss": 1.7284, |
|
"step": 9080 |
|
}, |
|
{ |
|
"epoch": 4.154478976234004, |
|
"grad_norm": 98573.2890625, |
|
"learning_rate": 8.860153256704981e-06, |
|
"loss": 1.686, |
|
"step": 9090 |
|
}, |
|
{ |
|
"epoch": 4.159049360146252, |
|
"grad_norm": 56181.88671875, |
|
"learning_rate": 8.812260536398467e-06, |
|
"loss": 1.7138, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 4.163619744058501, |
|
"grad_norm": 75070.2421875, |
|
"learning_rate": 8.764367816091955e-06, |
|
"loss": 1.7444, |
|
"step": 9110 |
|
}, |
|
{ |
|
"epoch": 4.16819012797075, |
|
"grad_norm": 62708.078125, |
|
"learning_rate": 8.71647509578544e-06, |
|
"loss": 1.7641, |
|
"step": 9120 |
|
}, |
|
{ |
|
"epoch": 4.1727605118829985, |
|
"grad_norm": 98152.28125, |
|
"learning_rate": 8.668582375478927e-06, |
|
"loss": 1.7544, |
|
"step": 9130 |
|
}, |
|
{ |
|
"epoch": 4.177330895795246, |
|
"grad_norm": 59807.0546875, |
|
"learning_rate": 8.620689655172414e-06, |
|
"loss": 1.727, |
|
"step": 9140 |
|
}, |
|
{ |
|
"epoch": 4.181901279707495, |
|
"grad_norm": 61243.06640625, |
|
"learning_rate": 8.572796934865901e-06, |
|
"loss": 1.6752, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 4.186471663619744, |
|
"grad_norm": 69440.40625, |
|
"learning_rate": 8.524904214559388e-06, |
|
"loss": 1.7704, |
|
"step": 9160 |
|
}, |
|
{ |
|
"epoch": 4.191042047531993, |
|
"grad_norm": 132799.5625, |
|
"learning_rate": 8.477011494252873e-06, |
|
"loss": 1.7243, |
|
"step": 9170 |
|
}, |
|
{ |
|
"epoch": 4.195612431444241, |
|
"grad_norm": 60873.515625, |
|
"learning_rate": 8.429118773946362e-06, |
|
"loss": 1.7021, |
|
"step": 9180 |
|
}, |
|
{ |
|
"epoch": 4.20018281535649, |
|
"grad_norm": 56033.59375, |
|
"learning_rate": 8.381226053639847e-06, |
|
"loss": 1.6721, |
|
"step": 9190 |
|
}, |
|
{ |
|
"epoch": 4.204753199268739, |
|
"grad_norm": 65075.58984375, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 1.7053, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 4.209323583180987, |
|
"grad_norm": 65212.20703125, |
|
"learning_rate": 8.28544061302682e-06, |
|
"loss": 1.6667, |
|
"step": 9210 |
|
}, |
|
{ |
|
"epoch": 4.213893967093236, |
|
"grad_norm": 98034.7109375, |
|
"learning_rate": 8.237547892720307e-06, |
|
"loss": 1.7291, |
|
"step": 9220 |
|
}, |
|
{ |
|
"epoch": 4.218464351005484, |
|
"grad_norm": 112262.515625, |
|
"learning_rate": 8.189655172413793e-06, |
|
"loss": 1.7307, |
|
"step": 9230 |
|
}, |
|
{ |
|
"epoch": 4.223034734917733, |
|
"grad_norm": 42643.26171875, |
|
"learning_rate": 8.14176245210728e-06, |
|
"loss": 1.7146, |
|
"step": 9240 |
|
}, |
|
{ |
|
"epoch": 4.227605118829982, |
|
"grad_norm": 120319.0703125, |
|
"learning_rate": 8.093869731800766e-06, |
|
"loss": 1.5791, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 4.232175502742231, |
|
"grad_norm": 111697.4765625, |
|
"learning_rate": 8.045977011494253e-06, |
|
"loss": 1.7305, |
|
"step": 9260 |
|
}, |
|
{ |
|
"epoch": 4.236745886654479, |
|
"grad_norm": 82615.4453125, |
|
"learning_rate": 7.998084291187739e-06, |
|
"loss": 1.6441, |
|
"step": 9270 |
|
}, |
|
{ |
|
"epoch": 4.2413162705667276, |
|
"grad_norm": 112459.7890625, |
|
"learning_rate": 7.950191570881227e-06, |
|
"loss": 1.6667, |
|
"step": 9280 |
|
}, |
|
{ |
|
"epoch": 4.245886654478976, |
|
"grad_norm": 89633.421875, |
|
"learning_rate": 7.902298850574712e-06, |
|
"loss": 1.7402, |
|
"step": 9290 |
|
}, |
|
{ |
|
"epoch": 4.250457038391225, |
|
"grad_norm": 73259.6953125, |
|
"learning_rate": 7.854406130268199e-06, |
|
"loss": 1.725, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 4.255027422303473, |
|
"grad_norm": 87751.640625, |
|
"learning_rate": 7.806513409961686e-06, |
|
"loss": 1.7017, |
|
"step": 9310 |
|
}, |
|
{ |
|
"epoch": 4.259597806215722, |
|
"grad_norm": 130956.0546875, |
|
"learning_rate": 7.758620689655173e-06, |
|
"loss": 1.6762, |
|
"step": 9320 |
|
}, |
|
{ |
|
"epoch": 4.264168190127971, |
|
"grad_norm": 54888.203125, |
|
"learning_rate": 7.71072796934866e-06, |
|
"loss": 1.772, |
|
"step": 9330 |
|
}, |
|
{ |
|
"epoch": 4.26873857404022, |
|
"grad_norm": 55581.7109375, |
|
"learning_rate": 7.662835249042145e-06, |
|
"loss": 1.7176, |
|
"step": 9340 |
|
}, |
|
{ |
|
"epoch": 4.273308957952468, |
|
"grad_norm": 89327.875, |
|
"learning_rate": 7.614942528735633e-06, |
|
"loss": 1.7243, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 4.2778793418647165, |
|
"grad_norm": 76867.40625, |
|
"learning_rate": 7.567049808429119e-06, |
|
"loss": 1.7136, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 4.282449725776965, |
|
"grad_norm": 131182.859375, |
|
"learning_rate": 7.519157088122606e-06, |
|
"loss": 1.7012, |
|
"step": 9370 |
|
}, |
|
{ |
|
"epoch": 4.287020109689214, |
|
"grad_norm": 80961.5546875, |
|
"learning_rate": 7.4712643678160925e-06, |
|
"loss": 1.7052, |
|
"step": 9380 |
|
}, |
|
{ |
|
"epoch": 4.291590493601462, |
|
"grad_norm": 64866.6796875, |
|
"learning_rate": 7.423371647509579e-06, |
|
"loss": 1.7689, |
|
"step": 9390 |
|
}, |
|
{ |
|
"epoch": 4.296160877513711, |
|
"grad_norm": 65769.59375, |
|
"learning_rate": 7.375478927203065e-06, |
|
"loss": 1.7517, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 4.30073126142596, |
|
"grad_norm": 54188.25, |
|
"learning_rate": 7.3275862068965514e-06, |
|
"loss": 1.7222, |
|
"step": 9410 |
|
}, |
|
{ |
|
"epoch": 4.305301645338209, |
|
"grad_norm": 122926.7734375, |
|
"learning_rate": 7.279693486590039e-06, |
|
"loss": 1.6745, |
|
"step": 9420 |
|
}, |
|
{ |
|
"epoch": 4.309872029250457, |
|
"grad_norm": 69720.5078125, |
|
"learning_rate": 7.231800766283525e-06, |
|
"loss": 1.7206, |
|
"step": 9430 |
|
}, |
|
{ |
|
"epoch": 4.3144424131627055, |
|
"grad_norm": 127599.6953125, |
|
"learning_rate": 7.183908045977011e-06, |
|
"loss": 1.6919, |
|
"step": 9440 |
|
}, |
|
{ |
|
"epoch": 4.319012797074954, |
|
"grad_norm": 75789.5546875, |
|
"learning_rate": 7.136015325670499e-06, |
|
"loss": 1.725, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 4.323583180987203, |
|
"grad_norm": 99748.046875, |
|
"learning_rate": 7.088122605363985e-06, |
|
"loss": 1.7164, |
|
"step": 9460 |
|
}, |
|
{ |
|
"epoch": 4.328153564899452, |
|
"grad_norm": 85926.734375, |
|
"learning_rate": 7.040229885057471e-06, |
|
"loss": 1.7377, |
|
"step": 9470 |
|
}, |
|
{ |
|
"epoch": 4.3327239488117, |
|
"grad_norm": 85478.2109375, |
|
"learning_rate": 6.992337164750958e-06, |
|
"loss": 1.7319, |
|
"step": 9480 |
|
}, |
|
{ |
|
"epoch": 4.337294332723949, |
|
"grad_norm": 72827.2265625, |
|
"learning_rate": 6.944444444444445e-06, |
|
"loss": 1.7371, |
|
"step": 9490 |
|
}, |
|
{ |
|
"epoch": 4.341864716636198, |
|
"grad_norm": 93393.625, |
|
"learning_rate": 6.896551724137932e-06, |
|
"loss": 1.6363, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 4.346435100548446, |
|
"grad_norm": 90090.8359375, |
|
"learning_rate": 6.848659003831418e-06, |
|
"loss": 1.6988, |
|
"step": 9510 |
|
}, |
|
{ |
|
"epoch": 4.3510054844606945, |
|
"grad_norm": 85922.3203125, |
|
"learning_rate": 6.800766283524905e-06, |
|
"loss": 1.6765, |
|
"step": 9520 |
|
}, |
|
{ |
|
"epoch": 4.355575868372943, |
|
"grad_norm": 94569.2109375, |
|
"learning_rate": 6.7528735632183914e-06, |
|
"loss": 1.6928, |
|
"step": 9530 |
|
}, |
|
{ |
|
"epoch": 4.360146252285192, |
|
"grad_norm": 86991.3984375, |
|
"learning_rate": 6.7049808429118775e-06, |
|
"loss": 1.7658, |
|
"step": 9540 |
|
}, |
|
{ |
|
"epoch": 4.364716636197441, |
|
"grad_norm": 129308.6171875, |
|
"learning_rate": 6.657088122605365e-06, |
|
"loss": 1.7118, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 4.369287020109689, |
|
"grad_norm": 58696.4765625, |
|
"learning_rate": 6.609195402298851e-06, |
|
"loss": 1.7136, |
|
"step": 9560 |
|
}, |
|
{ |
|
"epoch": 4.373857404021938, |
|
"grad_norm": 72559.2265625, |
|
"learning_rate": 6.561302681992337e-06, |
|
"loss": 1.7245, |
|
"step": 9570 |
|
}, |
|
{ |
|
"epoch": 4.378427787934187, |
|
"grad_norm": 96479.890625, |
|
"learning_rate": 6.513409961685823e-06, |
|
"loss": 1.7426, |
|
"step": 9580 |
|
}, |
|
{ |
|
"epoch": 4.3829981718464355, |
|
"grad_norm": 52625.5859375, |
|
"learning_rate": 6.465517241379311e-06, |
|
"loss": 1.818, |
|
"step": 9590 |
|
}, |
|
{ |
|
"epoch": 4.387568555758683, |
|
"grad_norm": 74031.9765625, |
|
"learning_rate": 6.417624521072797e-06, |
|
"loss": 1.688, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 4.392138939670932, |
|
"grad_norm": 70750.546875, |
|
"learning_rate": 6.369731800766283e-06, |
|
"loss": 1.6336, |
|
"step": 9610 |
|
}, |
|
{ |
|
"epoch": 4.396709323583181, |
|
"grad_norm": 79010.375, |
|
"learning_rate": 6.321839080459771e-06, |
|
"loss": 1.6344, |
|
"step": 9620 |
|
}, |
|
{ |
|
"epoch": 4.40127970749543, |
|
"grad_norm": 52663.4765625, |
|
"learning_rate": 6.273946360153257e-06, |
|
"loss": 1.7024, |
|
"step": 9630 |
|
}, |
|
{ |
|
"epoch": 4.405850091407678, |
|
"grad_norm": 88580.4375, |
|
"learning_rate": 6.226053639846744e-06, |
|
"loss": 1.7704, |
|
"step": 9640 |
|
}, |
|
{ |
|
"epoch": 4.410420475319927, |
|
"grad_norm": 75858.3828125, |
|
"learning_rate": 6.178160919540231e-06, |
|
"loss": 1.7163, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 4.414990859232176, |
|
"grad_norm": 85468.9296875, |
|
"learning_rate": 6.130268199233717e-06, |
|
"loss": 1.8792, |
|
"step": 9660 |
|
}, |
|
{ |
|
"epoch": 4.4195612431444244, |
|
"grad_norm": 149377.140625, |
|
"learning_rate": 6.0823754789272035e-06, |
|
"loss": 1.7492, |
|
"step": 9670 |
|
}, |
|
{ |
|
"epoch": 4.424131627056672, |
|
"grad_norm": 96749.546875, |
|
"learning_rate": 6.03448275862069e-06, |
|
"loss": 1.641, |
|
"step": 9680 |
|
}, |
|
{ |
|
"epoch": 4.428702010968921, |
|
"grad_norm": 114815.234375, |
|
"learning_rate": 5.9865900383141764e-06, |
|
"loss": 1.7394, |
|
"step": 9690 |
|
}, |
|
{ |
|
"epoch": 4.43327239488117, |
|
"grad_norm": 117656.1953125, |
|
"learning_rate": 5.938697318007663e-06, |
|
"loss": 1.7659, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 4.437842778793419, |
|
"grad_norm": 91634.71875, |
|
"learning_rate": 5.890804597701149e-06, |
|
"loss": 1.806, |
|
"step": 9710 |
|
}, |
|
{ |
|
"epoch": 4.442413162705667, |
|
"grad_norm": 59309.10546875, |
|
"learning_rate": 5.842911877394636e-06, |
|
"loss": 1.7264, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 4.446983546617916, |
|
"grad_norm": 102864.578125, |
|
"learning_rate": 5.795019157088123e-06, |
|
"loss": 1.6442, |
|
"step": 9730 |
|
}, |
|
{ |
|
"epoch": 4.451553930530165, |
|
"grad_norm": 48123.75, |
|
"learning_rate": 5.747126436781609e-06, |
|
"loss": 1.7215, |
|
"step": 9740 |
|
}, |
|
{ |
|
"epoch": 4.456124314442413, |
|
"grad_norm": 59340.78125, |
|
"learning_rate": 5.699233716475096e-06, |
|
"loss": 1.7105, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 4.460694698354661, |
|
"grad_norm": 47793.0078125, |
|
"learning_rate": 5.651340996168582e-06, |
|
"loss": 1.773, |
|
"step": 9760 |
|
}, |
|
{ |
|
"epoch": 4.46526508226691, |
|
"grad_norm": 68314.3828125, |
|
"learning_rate": 5.603448275862069e-06, |
|
"loss": 1.8067, |
|
"step": 9770 |
|
}, |
|
{ |
|
"epoch": 4.469835466179159, |
|
"grad_norm": 127164.171875, |
|
"learning_rate": 5.555555555555556e-06, |
|
"loss": 1.712, |
|
"step": 9780 |
|
}, |
|
{ |
|
"epoch": 4.474405850091408, |
|
"grad_norm": 108175.9296875, |
|
"learning_rate": 5.507662835249043e-06, |
|
"loss": 1.6775, |
|
"step": 9790 |
|
}, |
|
{ |
|
"epoch": 4.478976234003657, |
|
"grad_norm": 83982.234375, |
|
"learning_rate": 5.45977011494253e-06, |
|
"loss": 1.7235, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 4.483546617915905, |
|
"grad_norm": 116926.3515625, |
|
"learning_rate": 5.411877394636016e-06, |
|
"loss": 1.7325, |
|
"step": 9810 |
|
}, |
|
{ |
|
"epoch": 4.4881170018281535, |
|
"grad_norm": 85041.0234375, |
|
"learning_rate": 5.3639846743295025e-06, |
|
"loss": 1.7853, |
|
"step": 9820 |
|
}, |
|
{ |
|
"epoch": 4.492687385740402, |
|
"grad_norm": 67453.5859375, |
|
"learning_rate": 5.3160919540229885e-06, |
|
"loss": 1.7712, |
|
"step": 9830 |
|
}, |
|
{ |
|
"epoch": 4.497257769652651, |
|
"grad_norm": 120161.9140625, |
|
"learning_rate": 5.268199233716475e-06, |
|
"loss": 1.7047, |
|
"step": 9840 |
|
}, |
|
{ |
|
"epoch": 4.501828153564899, |
|
"grad_norm": 91166.8984375, |
|
"learning_rate": 5.220306513409962e-06, |
|
"loss": 1.7825, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 4.506398537477148, |
|
"grad_norm": 80539.265625, |
|
"learning_rate": 5.172413793103448e-06, |
|
"loss": 1.7293, |
|
"step": 9860 |
|
}, |
|
{ |
|
"epoch": 4.510968921389397, |
|
"grad_norm": 89111.5390625, |
|
"learning_rate": 5.124521072796935e-06, |
|
"loss": 1.6972, |
|
"step": 9870 |
|
}, |
|
{ |
|
"epoch": 4.515539305301646, |
|
"grad_norm": 106499.9453125, |
|
"learning_rate": 5.076628352490421e-06, |
|
"loss": 1.6945, |
|
"step": 9880 |
|
}, |
|
{ |
|
"epoch": 4.520109689213894, |
|
"grad_norm": 81342.203125, |
|
"learning_rate": 5.028735632183908e-06, |
|
"loss": 1.7311, |
|
"step": 9890 |
|
}, |
|
{ |
|
"epoch": 4.5246800731261425, |
|
"grad_norm": 63680.8359375, |
|
"learning_rate": 4.980842911877395e-06, |
|
"loss": 1.6724, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 4.529250457038391, |
|
"grad_norm": 80776.640625, |
|
"learning_rate": 4.932950191570882e-06, |
|
"loss": 1.7283, |
|
"step": 9910 |
|
}, |
|
{ |
|
"epoch": 4.53382084095064, |
|
"grad_norm": 107851.6328125, |
|
"learning_rate": 4.885057471264369e-06, |
|
"loss": 1.7028, |
|
"step": 9920 |
|
}, |
|
{ |
|
"epoch": 4.538391224862888, |
|
"grad_norm": 79906.65625, |
|
"learning_rate": 4.837164750957855e-06, |
|
"loss": 1.6608, |
|
"step": 9930 |
|
}, |
|
{ |
|
"epoch": 4.542961608775137, |
|
"grad_norm": 67892.0, |
|
"learning_rate": 4.789272030651342e-06, |
|
"loss": 1.6717, |
|
"step": 9940 |
|
}, |
|
{ |
|
"epoch": 4.547531992687386, |
|
"grad_norm": 79051.953125, |
|
"learning_rate": 4.741379310344828e-06, |
|
"loss": 1.6859, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 4.552102376599635, |
|
"grad_norm": 107722.7109375, |
|
"learning_rate": 4.6934865900383146e-06, |
|
"loss": 1.768, |
|
"step": 9960 |
|
}, |
|
{ |
|
"epoch": 4.556672760511883, |
|
"grad_norm": 73130.265625, |
|
"learning_rate": 4.6455938697318015e-06, |
|
"loss": 1.623, |
|
"step": 9970 |
|
}, |
|
{ |
|
"epoch": 4.5612431444241315, |
|
"grad_norm": 96307.8984375, |
|
"learning_rate": 4.5977011494252875e-06, |
|
"loss": 1.7251, |
|
"step": 9980 |
|
}, |
|
{ |
|
"epoch": 4.56581352833638, |
|
"grad_norm": 92756.6484375, |
|
"learning_rate": 4.549808429118774e-06, |
|
"loss": 1.7256, |
|
"step": 9990 |
|
}, |
|
{ |
|
"epoch": 4.570383912248629, |
|
"grad_norm": 99314.5390625, |
|
"learning_rate": 4.50191570881226e-06, |
|
"loss": 1.7193, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 4.574954296160877, |
|
"grad_norm": 89754.9453125, |
|
"learning_rate": 4.454022988505747e-06, |
|
"loss": 1.7082, |
|
"step": 10010 |
|
}, |
|
{ |
|
"epoch": 4.579524680073126, |
|
"grad_norm": 75742.890625, |
|
"learning_rate": 4.406130268199233e-06, |
|
"loss": 1.7386, |
|
"step": 10020 |
|
}, |
|
{ |
|
"epoch": 4.584095063985375, |
|
"grad_norm": 95144.9921875, |
|
"learning_rate": 4.35823754789272e-06, |
|
"loss": 1.7612, |
|
"step": 10030 |
|
}, |
|
{ |
|
"epoch": 4.588665447897624, |
|
"grad_norm": 74380.0546875, |
|
"learning_rate": 4.310344827586207e-06, |
|
"loss": 1.7231, |
|
"step": 10040 |
|
}, |
|
{ |
|
"epoch": 4.5932358318098725, |
|
"grad_norm": 75351.3515625, |
|
"learning_rate": 4.262452107279694e-06, |
|
"loss": 1.6865, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 4.5978062157221204, |
|
"grad_norm": 126452.8359375, |
|
"learning_rate": 4.214559386973181e-06, |
|
"loss": 1.7256, |
|
"step": 10060 |
|
}, |
|
{ |
|
"epoch": 4.602376599634369, |
|
"grad_norm": 50301.078125, |
|
"learning_rate": 4.166666666666667e-06, |
|
"loss": 1.7692, |
|
"step": 10070 |
|
}, |
|
{ |
|
"epoch": 4.606946983546618, |
|
"grad_norm": 72251.7265625, |
|
"learning_rate": 4.118773946360154e-06, |
|
"loss": 1.7593, |
|
"step": 10080 |
|
}, |
|
{ |
|
"epoch": 4.611517367458866, |
|
"grad_norm": 63932.359375, |
|
"learning_rate": 4.07088122605364e-06, |
|
"loss": 1.768, |
|
"step": 10090 |
|
}, |
|
{ |
|
"epoch": 4.616087751371115, |
|
"grad_norm": 71254.5390625, |
|
"learning_rate": 4.022988505747127e-06, |
|
"loss": 1.736, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 4.620658135283364, |
|
"grad_norm": 105173.09375, |
|
"learning_rate": 3.9750957854406135e-06, |
|
"loss": 1.7123, |
|
"step": 10110 |
|
}, |
|
{ |
|
"epoch": 4.625228519195613, |
|
"grad_norm": 55138.73828125, |
|
"learning_rate": 3.9272030651340996e-06, |
|
"loss": 1.7062, |
|
"step": 10120 |
|
}, |
|
{ |
|
"epoch": 4.6297989031078615, |
|
"grad_norm": 56330.21484375, |
|
"learning_rate": 3.8793103448275865e-06, |
|
"loss": 1.6406, |
|
"step": 10130 |
|
}, |
|
{ |
|
"epoch": 4.634369287020109, |
|
"grad_norm": 134460.59375, |
|
"learning_rate": 3.8314176245210725e-06, |
|
"loss": 1.7209, |
|
"step": 10140 |
|
}, |
|
{ |
|
"epoch": 4.638939670932358, |
|
"grad_norm": 67907.453125, |
|
"learning_rate": 3.7835249042145594e-06, |
|
"loss": 1.7233, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 4.643510054844607, |
|
"grad_norm": 100771.4921875, |
|
"learning_rate": 3.7356321839080462e-06, |
|
"loss": 1.6853, |
|
"step": 10160 |
|
}, |
|
{ |
|
"epoch": 4.648080438756856, |
|
"grad_norm": 54433.70703125, |
|
"learning_rate": 3.6877394636015327e-06, |
|
"loss": 1.5971, |
|
"step": 10170 |
|
}, |
|
{ |
|
"epoch": 4.652650822669104, |
|
"grad_norm": 74801.8515625, |
|
"learning_rate": 3.6398467432950196e-06, |
|
"loss": 1.6912, |
|
"step": 10180 |
|
}, |
|
{ |
|
"epoch": 4.657221206581353, |
|
"grad_norm": 89413.6328125, |
|
"learning_rate": 3.5919540229885056e-06, |
|
"loss": 1.7757, |
|
"step": 10190 |
|
}, |
|
{ |
|
"epoch": 4.661791590493602, |
|
"grad_norm": 55121.12109375, |
|
"learning_rate": 3.5440613026819925e-06, |
|
"loss": 1.7154, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 4.66636197440585, |
|
"grad_norm": 71017.0078125, |
|
"learning_rate": 3.496168582375479e-06, |
|
"loss": 1.6741, |
|
"step": 10210 |
|
}, |
|
{ |
|
"epoch": 4.670932358318098, |
|
"grad_norm": 70393.5859375, |
|
"learning_rate": 3.448275862068966e-06, |
|
"loss": 1.7049, |
|
"step": 10220 |
|
}, |
|
{ |
|
"epoch": 4.675502742230347, |
|
"grad_norm": 68136.203125, |
|
"learning_rate": 3.4003831417624527e-06, |
|
"loss": 1.7331, |
|
"step": 10230 |
|
}, |
|
{ |
|
"epoch": 4.680073126142596, |
|
"grad_norm": 92633.125, |
|
"learning_rate": 3.3524904214559387e-06, |
|
"loss": 1.7386, |
|
"step": 10240 |
|
}, |
|
{ |
|
"epoch": 4.684643510054845, |
|
"grad_norm": 52863.0234375, |
|
"learning_rate": 3.3045977011494256e-06, |
|
"loss": 1.6676, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 4.689213893967093, |
|
"grad_norm": 52217.04296875, |
|
"learning_rate": 3.2567049808429117e-06, |
|
"loss": 1.6496, |
|
"step": 10260 |
|
}, |
|
{ |
|
"epoch": 4.693784277879342, |
|
"grad_norm": 71589.21875, |
|
"learning_rate": 3.2088122605363985e-06, |
|
"loss": 1.7128, |
|
"step": 10270 |
|
}, |
|
{ |
|
"epoch": 4.698354661791591, |
|
"grad_norm": 63616.85546875, |
|
"learning_rate": 3.1609195402298854e-06, |
|
"loss": 1.8258, |
|
"step": 10280 |
|
}, |
|
{ |
|
"epoch": 4.702925045703839, |
|
"grad_norm": 51611.06640625, |
|
"learning_rate": 3.113026819923372e-06, |
|
"loss": 1.6953, |
|
"step": 10290 |
|
}, |
|
{ |
|
"epoch": 4.707495429616088, |
|
"grad_norm": 65978.15625, |
|
"learning_rate": 3.0651340996168583e-06, |
|
"loss": 1.7285, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 4.712065813528336, |
|
"grad_norm": 68540.25, |
|
"learning_rate": 3.017241379310345e-06, |
|
"loss": 1.666, |
|
"step": 10310 |
|
}, |
|
{ |
|
"epoch": 4.716636197440585, |
|
"grad_norm": 121031.6015625, |
|
"learning_rate": 2.9693486590038317e-06, |
|
"loss": 1.7578, |
|
"step": 10320 |
|
}, |
|
{ |
|
"epoch": 4.721206581352834, |
|
"grad_norm": 90117.9296875, |
|
"learning_rate": 2.921455938697318e-06, |
|
"loss": 1.7317, |
|
"step": 10330 |
|
}, |
|
{ |
|
"epoch": 4.725776965265082, |
|
"grad_norm": 113749.5390625, |
|
"learning_rate": 2.8735632183908046e-06, |
|
"loss": 1.7508, |
|
"step": 10340 |
|
}, |
|
{ |
|
"epoch": 4.730347349177331, |
|
"grad_norm": 51193.17578125, |
|
"learning_rate": 2.825670498084291e-06, |
|
"loss": 1.6362, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 4.7349177330895795, |
|
"grad_norm": 90305.71875, |
|
"learning_rate": 2.777777777777778e-06, |
|
"loss": 1.7621, |
|
"step": 10360 |
|
}, |
|
{ |
|
"epoch": 4.739488117001828, |
|
"grad_norm": 74391.09375, |
|
"learning_rate": 2.729885057471265e-06, |
|
"loss": 1.6677, |
|
"step": 10370 |
|
}, |
|
{ |
|
"epoch": 4.744058500914077, |
|
"grad_norm": 82829.6875, |
|
"learning_rate": 2.6819923371647512e-06, |
|
"loss": 1.8085, |
|
"step": 10380 |
|
}, |
|
{ |
|
"epoch": 4.748628884826325, |
|
"grad_norm": 102442.8515625, |
|
"learning_rate": 2.6340996168582377e-06, |
|
"loss": 1.6941, |
|
"step": 10390 |
|
}, |
|
{ |
|
"epoch": 4.753199268738574, |
|
"grad_norm": 84212.0546875, |
|
"learning_rate": 2.586206896551724e-06, |
|
"loss": 1.6369, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 4.757769652650823, |
|
"grad_norm": 87777.5234375, |
|
"learning_rate": 2.5383141762452106e-06, |
|
"loss": 1.6629, |
|
"step": 10410 |
|
}, |
|
{ |
|
"epoch": 4.762340036563071, |
|
"grad_norm": 72210.796875, |
|
"learning_rate": 2.4904214559386975e-06, |
|
"loss": 1.7738, |
|
"step": 10420 |
|
}, |
|
{ |
|
"epoch": 4.76691042047532, |
|
"grad_norm": 72672.140625, |
|
"learning_rate": 2.4425287356321844e-06, |
|
"loss": 1.743, |
|
"step": 10430 |
|
}, |
|
{ |
|
"epoch": 4.7714808043875685, |
|
"grad_norm": 52435.03515625, |
|
"learning_rate": 2.394636015325671e-06, |
|
"loss": 1.6503, |
|
"step": 10440 |
|
}, |
|
{ |
|
"epoch": 4.776051188299817, |
|
"grad_norm": 81516.578125, |
|
"learning_rate": 2.3467432950191573e-06, |
|
"loss": 1.7072, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 4.780621572212066, |
|
"grad_norm": 64971.1328125, |
|
"learning_rate": 2.2988505747126437e-06, |
|
"loss": 1.7349, |
|
"step": 10460 |
|
}, |
|
{ |
|
"epoch": 4.785191956124314, |
|
"grad_norm": 90203.71875, |
|
"learning_rate": 2.25095785440613e-06, |
|
"loss": 1.7305, |
|
"step": 10470 |
|
}, |
|
{ |
|
"epoch": 4.789762340036563, |
|
"grad_norm": 85017.875, |
|
"learning_rate": 2.2030651340996167e-06, |
|
"loss": 1.642, |
|
"step": 10480 |
|
}, |
|
{ |
|
"epoch": 4.794332723948812, |
|
"grad_norm": 88660.4765625, |
|
"learning_rate": 2.1551724137931035e-06, |
|
"loss": 1.7387, |
|
"step": 10490 |
|
}, |
|
{ |
|
"epoch": 4.798903107861061, |
|
"grad_norm": 81671.3984375, |
|
"learning_rate": 2.1072796934865904e-06, |
|
"loss": 1.6963, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 4.803473491773309, |
|
"grad_norm": 82145.9453125, |
|
"learning_rate": 2.059386973180077e-06, |
|
"loss": 1.689, |
|
"step": 10510 |
|
}, |
|
{ |
|
"epoch": 4.8080438756855575, |
|
"grad_norm": 60723.84375, |
|
"learning_rate": 2.0114942528735633e-06, |
|
"loss": 1.7065, |
|
"step": 10520 |
|
}, |
|
{ |
      "epoch": 4.812614259597806,
      "grad_norm": 79342.8515625,
      "learning_rate": 1.9636015325670498e-06,
      "loss": 1.6952,
      "step": 10530
    },
    {
      "epoch": 4.817184643510055,
      "grad_norm": 66902.59375,
      "learning_rate": 1.9157088122605362e-06,
      "loss": 1.6965,
      "step": 10540
    },
    {
      "epoch": 4.821755027422303,
      "grad_norm": 57767.9453125,
      "learning_rate": 1.8678160919540231e-06,
      "loss": 1.7267,
      "step": 10550
    },
    {
      "epoch": 4.826325411334552,
      "grad_norm": 57847.31640625,
      "learning_rate": 1.8199233716475098e-06,
      "loss": 1.5725,
      "step": 10560
    },
    {
      "epoch": 4.830895795246801,
      "grad_norm": 71927.8984375,
      "learning_rate": 1.7720306513409962e-06,
      "loss": 1.6501,
      "step": 10570
    },
    {
      "epoch": 4.83546617915905,
      "grad_norm": 74899.0234375,
      "learning_rate": 1.724137931034483e-06,
      "loss": 1.6678,
      "step": 10580
    },
    {
      "epoch": 4.840036563071298,
      "grad_norm": 92564.6875,
      "learning_rate": 1.6762452107279694e-06,
      "loss": 1.6991,
      "step": 10590
    },
    {
      "epoch": 4.844606946983546,
      "grad_norm": 65702.7109375,
      "learning_rate": 1.6283524904214558e-06,
      "loss": 1.7309,
      "step": 10600
    },
    {
      "epoch": 4.849177330895795,
      "grad_norm": 72665.1796875,
      "learning_rate": 1.5804597701149427e-06,
      "loss": 1.6152,
      "step": 10610
    },
    {
      "epoch": 4.853747714808044,
      "grad_norm": 94501.109375,
      "learning_rate": 1.5325670498084292e-06,
      "loss": 1.6927,
      "step": 10620
    },
    {
      "epoch": 4.858318098720293,
      "grad_norm": 113540.65625,
      "learning_rate": 1.4846743295019158e-06,
      "loss": 1.7562,
      "step": 10630
    },
    {
      "epoch": 4.862888482632541,
      "grad_norm": 60082.1484375,
      "learning_rate": 1.4367816091954023e-06,
      "loss": 1.743,
      "step": 10640
    },
    {
      "epoch": 4.86745886654479,
      "grad_norm": 77984.4921875,
      "learning_rate": 1.388888888888889e-06,
      "loss": 1.597,
      "step": 10650
    },
    {
      "epoch": 4.872029250457039,
      "grad_norm": 77285.3515625,
      "learning_rate": 1.3409961685823756e-06,
      "loss": 1.6401,
      "step": 10660
    },
    {
      "epoch": 4.876599634369287,
      "grad_norm": 62164.921875,
      "learning_rate": 1.293103448275862e-06,
      "loss": 1.7019,
      "step": 10670
    },
    {
      "epoch": 4.881170018281535,
      "grad_norm": 82581.1875,
      "learning_rate": 1.2452107279693487e-06,
      "loss": 1.6698,
      "step": 10680
    },
    {
      "epoch": 4.885740402193784,
      "grad_norm": 191003.421875,
      "learning_rate": 1.1973180076628354e-06,
      "loss": 1.7425,
      "step": 10690
    },
    {
      "epoch": 4.890310786106033,
      "grad_norm": 70496.2421875,
      "learning_rate": 1.1494252873563219e-06,
      "loss": 1.7575,
      "step": 10700
    },
    {
      "epoch": 4.894881170018282,
      "grad_norm": 96979.078125,
      "learning_rate": 1.1015325670498083e-06,
      "loss": 1.7798,
      "step": 10710
    },
    {
      "epoch": 4.89945155393053,
      "grad_norm": 75589.8046875,
      "learning_rate": 1.0536398467432952e-06,
      "loss": 1.6209,
      "step": 10720
    },
    {
      "epoch": 4.904021937842779,
      "grad_norm": 146879.5,
      "learning_rate": 1.0057471264367817e-06,
      "loss": 1.7368,
      "step": 10730
    },
    {
      "epoch": 4.908592321755028,
      "grad_norm": 80107.3125,
      "learning_rate": 9.578544061302681e-07,
      "loss": 1.7019,
      "step": 10740
    },
    {
      "epoch": 4.913162705667276,
      "grad_norm": 66398.5703125,
      "learning_rate": 9.099616858237549e-07,
      "loss": 1.7885,
      "step": 10750
    },
    {
      "epoch": 4.917733089579524,
      "grad_norm": 76734.59375,
      "learning_rate": 8.620689655172415e-07,
      "loss": 1.7817,
      "step": 10760
    },
    {
      "epoch": 4.922303473491773,
      "grad_norm": 79165.3203125,
      "learning_rate": 8.141762452107279e-07,
      "loss": 1.7004,
      "step": 10770
    },
    {
      "epoch": 4.926873857404022,
      "grad_norm": 61589.2109375,
      "learning_rate": 7.662835249042146e-07,
      "loss": 1.7177,
      "step": 10780
    },
    {
      "epoch": 4.931444241316271,
      "grad_norm": 91896.1015625,
      "learning_rate": 7.183908045977011e-07,
      "loss": 1.7116,
      "step": 10790
    },
    {
      "epoch": 4.936014625228519,
      "grad_norm": 76825.4296875,
      "learning_rate": 6.704980842911878e-07,
      "loss": 1.6501,
      "step": 10800
    },
    {
      "epoch": 4.940585009140768,
      "grad_norm": 79197.75,
      "learning_rate": 6.226053639846744e-07,
      "loss": 1.7284,
      "step": 10810
    },
    {
      "epoch": 4.9451553930530165,
      "grad_norm": 87847.4609375,
      "learning_rate": 5.747126436781609e-07,
      "loss": 1.7757,
      "step": 10820
    },
    {
      "epoch": 4.949725776965265,
      "grad_norm": 105324.3125,
      "learning_rate": 5.268199233716476e-07,
      "loss": 1.7017,
      "step": 10830
    },
    {
      "epoch": 4.954296160877513,
      "grad_norm": 78621.59375,
      "learning_rate": 4.789272030651341e-07,
      "loss": 1.6659,
      "step": 10840
    },
    {
      "epoch": 4.958866544789762,
      "grad_norm": 161616.046875,
      "learning_rate": 4.3103448275862073e-07,
      "loss": 1.7586,
      "step": 10850
    },
    {
      "epoch": 4.963436928702011,
      "grad_norm": 87202.765625,
      "learning_rate": 3.831417624521073e-07,
      "loss": 1.6147,
      "step": 10860
    },
    {
      "epoch": 4.96800731261426,
      "grad_norm": 87169.0390625,
      "learning_rate": 3.352490421455939e-07,
      "loss": 1.6565,
      "step": 10870
    },
    {
      "epoch": 4.972577696526509,
      "grad_norm": 93609.015625,
      "learning_rate": 2.8735632183908047e-07,
      "loss": 1.8099,
      "step": 10880
    },
    {
      "epoch": 4.977148080438757,
      "grad_norm": 120831.265625,
      "learning_rate": 2.3946360153256703e-07,
      "loss": 1.7018,
      "step": 10890
    },
    {
      "epoch": 4.9817184643510055,
      "grad_norm": 69430.453125,
      "learning_rate": 1.9157088122605365e-07,
      "loss": 1.6799,
      "step": 10900
    },
    {
      "epoch": 4.986288848263254,
      "grad_norm": 65192.02734375,
      "learning_rate": 1.4367816091954023e-07,
      "loss": 1.7379,
      "step": 10910
    },
    {
      "epoch": 4.990859232175502,
      "grad_norm": 89309.6953125,
      "learning_rate": 9.578544061302682e-08,
      "loss": 1.7807,
      "step": 10920
    },
    {
      "epoch": 4.995429616087751,
      "grad_norm": 65604.3203125,
      "learning_rate": 4.789272030651341e-08,
      "loss": 1.7125,
      "step": 10930
    },
    {
      "epoch": 5.0,
      "grad_norm": 114907.515625,
      "learning_rate": 0.0,
      "loss": 1.7282,
      "step": 10940
    },
    {
      "epoch": 5.0,
      "eval_loss": 1.6917415857315063,
      "eval_runtime": 346.3088,
      "eval_samples_per_second": 43.314,
      "eval_steps_per_second": 1.354,
      "step": 10940
    }
  ],
  "logging_steps": 10,
  "max_steps": 10940,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}