|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 17633, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0014177961776215052, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 2.5e-08, |
|
"loss": 2.9276, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0028355923552430104, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 5e-08, |
|
"loss": 2.9468, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.004253388532864515, |
|
"grad_norm": 7.84375, |
|
"learning_rate": 7.500000000000001e-08, |
|
"loss": 2.9228, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.005671184710486021, |
|
"grad_norm": 7.0625, |
|
"learning_rate": 1e-07, |
|
"loss": 2.8258, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.007088980888107526, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 1.25e-07, |
|
"loss": 2.7263, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.00850677706572903, |
|
"grad_norm": 4.875, |
|
"learning_rate": 1.5000000000000002e-07, |
|
"loss": 2.6245, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.009924573243350537, |
|
"grad_norm": 4.125, |
|
"learning_rate": 1.7500000000000002e-07, |
|
"loss": 2.4902, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.011342369420972042, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 2e-07, |
|
"loss": 2.349, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.012760165598593547, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 2.25e-07, |
|
"loss": 2.3038, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.014177961776215052, |
|
"grad_norm": 3.0, |
|
"learning_rate": 2.5e-07, |
|
"loss": 2.2632, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.015595757953836557, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 2.7499999999999996e-07, |
|
"loss": 2.2339, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.01701355413145806, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 3.0000000000000004e-07, |
|
"loss": 2.2106, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.018431350309079567, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 3.25e-07, |
|
"loss": 2.178, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.019849146486701073, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 3.5000000000000004e-07, |
|
"loss": 2.1623, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.021266942664322577, |
|
"grad_norm": 2.25, |
|
"learning_rate": 3.75e-07, |
|
"loss": 2.1621, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.022684738841944083, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 4e-07, |
|
"loss": 2.1586, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.024102535019565587, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 4.25e-07, |
|
"loss": 2.1498, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.025520331197187093, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 4.5e-07, |
|
"loss": 2.1515, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.026938127374808597, |
|
"grad_norm": 3.125, |
|
"learning_rate": 4.75e-07, |
|
"loss": 2.1216, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.028355923552430103, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 5e-07, |
|
"loss": 2.0938, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.029773719730051607, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 5.25e-07, |
|
"loss": 2.0788, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.031191515907673113, |
|
"grad_norm": 2.625, |
|
"learning_rate": 5.499999999999999e-07, |
|
"loss": 2.0899, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.03260931208529462, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 5.750000000000001e-07, |
|
"loss": 2.0949, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.03402710826291612, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 6.000000000000001e-07, |
|
"loss": 2.0772, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.035444904440537627, |
|
"grad_norm": 3.0, |
|
"learning_rate": 6.25e-07, |
|
"loss": 2.0612, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.03686270061815913, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 6.5e-07, |
|
"loss": 2.0366, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.03828049679578064, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 6.75e-07, |
|
"loss": 2.0379, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.03969829297340215, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 7.000000000000001e-07, |
|
"loss": 2.0458, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.041116089151023646, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 7.25e-07, |
|
"loss": 2.0248, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.04253388532864515, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 7.5e-07, |
|
"loss": 2.0222, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.04395168150626666, |
|
"grad_norm": 2.875, |
|
"learning_rate": 7.750000000000001e-07, |
|
"loss": 2.0129, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.04536947768388817, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 8e-07, |
|
"loss": 2.0452, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.046787273861509666, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 8.25e-07, |
|
"loss": 2.0129, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.04820507003913117, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 8.5e-07, |
|
"loss": 2.0083, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.04962286621675268, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 8.750000000000001e-07, |
|
"loss": 2.0002, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.05104066239437419, |
|
"grad_norm": 3.375, |
|
"learning_rate": 9e-07, |
|
"loss": 1.9946, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.05245845857199569, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 9.25e-07, |
|
"loss": 1.9826, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.05387625474961719, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 9.5e-07, |
|
"loss": 1.9619, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.0552940509272387, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 9.75e-07, |
|
"loss": 1.959, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.05671184710486021, |
|
"grad_norm": 4.5, |
|
"learning_rate": 1e-06, |
|
"loss": 1.9438, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.05812964328248171, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 1.0250000000000001e-06, |
|
"loss": 1.9322, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.05954743946010321, |
|
"grad_norm": 5.125, |
|
"learning_rate": 1.05e-06, |
|
"loss": 1.9012, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.06096523563772472, |
|
"grad_norm": 5.125, |
|
"learning_rate": 1.075e-06, |
|
"loss": 1.8728, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.06238303181534623, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.0999999999999998e-06, |
|
"loss": 1.8705, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.06380082799296773, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.125e-06, |
|
"loss": 1.8529, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.06521862417058924, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.1500000000000002e-06, |
|
"loss": 1.855, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.06663642034821074, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 1.175e-06, |
|
"loss": 1.8516, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.06805421652583224, |
|
"grad_norm": 5.375, |
|
"learning_rate": 1.2000000000000002e-06, |
|
"loss": 1.8211, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.06947201270345375, |
|
"grad_norm": 5.875, |
|
"learning_rate": 1.225e-06, |
|
"loss": 1.8235, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.07088980888107525, |
|
"grad_norm": 5.5, |
|
"learning_rate": 1.25e-06, |
|
"loss": 1.8149, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.07230760505869677, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 1.275e-06, |
|
"loss": 1.8164, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.07372540123631827, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.3e-06, |
|
"loss": 1.7967, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.07514319741393977, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.325e-06, |
|
"loss": 1.819, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.07656099359156128, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 1.35e-06, |
|
"loss": 1.7788, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.07797878976918278, |
|
"grad_norm": 4.5, |
|
"learning_rate": 1.375e-06, |
|
"loss": 1.7854, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.0793965859468043, |
|
"grad_norm": 3.9375, |
|
"learning_rate": 1.4000000000000001e-06, |
|
"loss": 1.7644, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.0808143821244258, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 1.425e-06, |
|
"loss": 1.7811, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.08223217830204729, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 1.45e-06, |
|
"loss": 1.7685, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.0836499744796688, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 1.4749999999999999e-06, |
|
"loss": 1.7554, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.0850677706572903, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.5e-06, |
|
"loss": 1.7636, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.0864855668349118, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 1.525e-06, |
|
"loss": 1.7472, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.08790336301253332, |
|
"grad_norm": 2.75, |
|
"learning_rate": 1.5500000000000002e-06, |
|
"loss": 1.7534, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.08932115919015482, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.5750000000000002e-06, |
|
"loss": 1.7226, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.09073895536777633, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.6e-06, |
|
"loss": 1.7416, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.09215675154539783, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.625e-06, |
|
"loss": 1.7371, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.09357454772301933, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.65e-06, |
|
"loss": 1.7358, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.09499234390064085, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.675e-06, |
|
"loss": 1.7299, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.09641014007826235, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.7e-06, |
|
"loss": 1.7308, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.09782793625588386, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.725e-06, |
|
"loss": 1.7035, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.09924573243350536, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.7500000000000002e-06, |
|
"loss": 1.7224, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.10066352861112686, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.7750000000000002e-06, |
|
"loss": 1.7148, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.10208132478874837, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.8e-06, |
|
"loss": 1.7211, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.10349912096636987, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.8249999999999999e-06, |
|
"loss": 1.7162, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.10491691714399139, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.85e-06, |
|
"loss": 1.7024, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.10633471332161289, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.875e-06, |
|
"loss": 1.707, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.10775250949923439, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.9e-06, |
|
"loss": 1.7134, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.1091703056768559, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.925e-06, |
|
"loss": 1.6868, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.1105881018544774, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.95e-06, |
|
"loss": 1.7017, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.1120058980320989, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 1.975e-06, |
|
"loss": 1.6898, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.11342369420972041, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 2e-06, |
|
"loss": 1.7212, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.11484149038734191, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 2.025e-06, |
|
"loss": 1.6863, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.11625928656496343, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 2.0500000000000003e-06, |
|
"loss": 1.6828, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.11767708274258493, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 2.075e-06, |
|
"loss": 1.6918, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.11909487892020643, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 2.1e-06, |
|
"loss": 1.6978, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.12051267509782794, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 2.125e-06, |
|
"loss": 1.6995, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.12193047127544944, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 2.15e-06, |
|
"loss": 1.681, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.12334826745307095, |
|
"grad_norm": 1.625, |
|
"learning_rate": 2.175e-06, |
|
"loss": 1.6893, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.12476606363069245, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 2.1999999999999997e-06, |
|
"loss": 1.6847, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.12618385980831395, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 2.2250000000000003e-06, |
|
"loss": 1.6997, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.12760165598593545, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 2.25e-06, |
|
"loss": 1.6706, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.12901945216355698, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 2.275e-06, |
|
"loss": 1.68, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.13043724834117848, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 2.3000000000000004e-06, |
|
"loss": 1.6774, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.13185504451879998, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 2.325e-06, |
|
"loss": 1.6954, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.13327284069642148, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 2.35e-06, |
|
"loss": 1.6797, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.13469063687404298, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 2.375e-06, |
|
"loss": 1.6646, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.13610843305166448, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 1.6769, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.137526229229286, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 2.425e-06, |
|
"loss": 1.6546, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.1389440254069075, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 2.45e-06, |
|
"loss": 1.6662, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.140361821584529, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 2.475e-06, |
|
"loss": 1.6757, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.1417796177621505, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 2.5e-06, |
|
"loss": 1.6692, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.143197413939772, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 2.525e-06, |
|
"loss": 1.6591, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.14461521011739353, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 2.55e-06, |
|
"loss": 1.6577, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.14603300629501503, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 2.575e-06, |
|
"loss": 1.6837, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.14745080247263653, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 2.6e-06, |
|
"loss": 1.6607, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.14886859865025803, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 2.6250000000000003e-06, |
|
"loss": 1.6535, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.15028639482787953, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 2.65e-06, |
|
"loss": 1.6696, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.15170419100550106, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 2.6750000000000002e-06, |
|
"loss": 1.6548, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.15312198718312256, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 2.7e-06, |
|
"loss": 1.6772, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.15453978336074406, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 2.725e-06, |
|
"loss": 1.6483, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.15595757953836556, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 2.75e-06, |
|
"loss": 1.6666, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.15737537571598706, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 2.775e-06, |
|
"loss": 1.6435, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.1587931718936086, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 2.8000000000000003e-06, |
|
"loss": 1.6426, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.1602109680712301, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 2.825e-06, |
|
"loss": 1.6559, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 0.1616287642488516, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 2.85e-06, |
|
"loss": 1.6515, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.16304656042647309, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 2.875e-06, |
|
"loss": 1.6438, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 0.16446435660409459, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 2.9e-06, |
|
"loss": 1.6411, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.1658821527817161, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 2.925e-06, |
|
"loss": 1.6579, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 0.1672999489593376, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 2.9499999999999997e-06, |
|
"loss": 1.6521, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.1687177451369591, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 2.9750000000000003e-06, |
|
"loss": 1.6422, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 0.1701355413145806, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 3e-06, |
|
"loss": 1.6389, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.1715533374922021, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 2.9948745985102165e-06, |
|
"loss": 1.6347, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 0.1729711336698236, |
|
"grad_norm": 1.625, |
|
"learning_rate": 2.9897491970204334e-06, |
|
"loss": 1.655, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.17438892984744514, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 2.98462379553065e-06, |
|
"loss": 1.6428, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 0.17580672602506664, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 2.9794983940408667e-06, |
|
"loss": 1.6572, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.17722452220268814, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 2.974372992551083e-06, |
|
"loss": 1.6407, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 0.17864231838030964, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 2.9692475910613e-06, |
|
"loss": 1.6383, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.18006011455793114, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 2.9641221895715165e-06, |
|
"loss": 1.6426, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 0.18147791073555267, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 2.9589967880817333e-06, |
|
"loss": 1.6349, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.18289570691317417, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 2.9538713865919498e-06, |
|
"loss": 1.6491, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 0.18431350309079567, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 2.9487459851021667e-06, |
|
"loss": 1.6329, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.18573129926841717, |
|
"grad_norm": 2.75, |
|
"learning_rate": 2.943620583612383e-06, |
|
"loss": 1.6319, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 0.18714909544603867, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 2.9384951821226e-06, |
|
"loss": 1.6178, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.1885668916236602, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 2.9333697806328164e-06, |
|
"loss": 1.5879, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 0.1899846878012817, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 2.928244379143033e-06, |
|
"loss": 1.6113, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.1914024839789032, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 2.9231189776532493e-06, |
|
"loss": 1.5957, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 0.1928202801565247, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 2.917993576163466e-06, |
|
"loss": 1.5978, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.1942380763341462, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 2.9128681746736826e-06, |
|
"loss": 1.5955, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 0.19565587251176772, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 2.9077427731838995e-06, |
|
"loss": 1.61, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.19707366868938922, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 2.902617371694116e-06, |
|
"loss": 1.5998, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 0.19849146486701072, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 2.897491970204333e-06, |
|
"loss": 1.6176, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.19990926104463222, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 2.8923665687145493e-06, |
|
"loss": 1.6058, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 0.20132705722225372, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 2.887241167224766e-06, |
|
"loss": 1.5966, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.20274485339987525, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 2.8821157657349826e-06, |
|
"loss": 1.5852, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 0.20416264957749675, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 2.876990364245199e-06, |
|
"loss": 1.5644, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.20558044575511825, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 2.871864962755416e-06, |
|
"loss": 1.589, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 0.20699824193273975, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 2.8667395612656323e-06, |
|
"loss": 1.5646, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.20841603811036125, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 2.861614159775849e-06, |
|
"loss": 1.578, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 0.20983383428798277, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 2.8564887582860657e-06, |
|
"loss": 1.5959, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.21125163046560427, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 2.8513633567962825e-06, |
|
"loss": 1.5788, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 0.21266942664322577, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 2.846237955306499e-06, |
|
"loss": 1.5886, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.21408722282084727, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 2.841112553816716e-06, |
|
"loss": 1.583, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 0.21550501899846877, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 2.8359871523269323e-06, |
|
"loss": 1.6008, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.21692281517609027, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 2.830861750837149e-06, |
|
"loss": 1.6073, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 0.2183406113537118, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 2.8257363493473656e-06, |
|
"loss": 1.5921, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.2197584075313333, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 2.8206109478575825e-06, |
|
"loss": 1.5971, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 0.2211762037089548, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 2.815485546367799e-06, |
|
"loss": 1.5706, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.2225939998865763, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 2.810360144878016e-06, |
|
"loss": 1.5788, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 0.2240117960641978, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 2.8052347433882322e-06, |
|
"loss": 1.5963, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.22542959224181933, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 2.800109341898449e-06, |
|
"loss": 1.5899, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 0.22684738841944083, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 2.7949839404086656e-06, |
|
"loss": 1.5878, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.22826518459706233, |
|
"grad_norm": 1.375, |
|
"learning_rate": 2.789858538918882e-06, |
|
"loss": 1.58, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 0.22968298077468383, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 2.7847331374290984e-06, |
|
"loss": 1.5926, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.23110077695230533, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 2.7796077359393153e-06, |
|
"loss": 1.6063, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 0.23251857312992685, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 2.7744823344495318e-06, |
|
"loss": 1.5617, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.23393636930754835, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 2.7693569329597486e-06, |
|
"loss": 1.5941, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 0.23535416548516985, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 2.764231531469965e-06, |
|
"loss": 1.5699, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.23677196166279135, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 2.759106129980182e-06, |
|
"loss": 1.5856, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 0.23818975784041285, |
|
"grad_norm": 1.375, |
|
"learning_rate": 2.7539807284903984e-06, |
|
"loss": 1.5984, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.23960755401803438, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 2.748855327000615e-06, |
|
"loss": 1.5768, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 0.24102535019565588, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 2.7437299255108317e-06, |
|
"loss": 1.5845, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.24244314637327738, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 2.738604524021048e-06, |
|
"loss": 1.5756, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 0.24386094255089888, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 2.733479122531265e-06, |
|
"loss": 1.5682, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.24527873872852038, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 2.7283537210414815e-06, |
|
"loss": 1.5657, |
|
"step": 4325 |
|
}, |
|
{ |
|
"epoch": 0.2466965349061419, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 2.7232283195516983e-06, |
|
"loss": 1.5778, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.2481143310837634, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 2.718102918061915e-06, |
|
"loss": 1.5683, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 0.2495321272613849, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 2.7129775165721317e-06, |
|
"loss": 1.5821, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.2509499234390064, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 2.707852115082348e-06, |
|
"loss": 1.5784, |
|
"step": 4425 |
|
}, |
|
{ |
|
"epoch": 0.2523677196166279, |
|
"grad_norm": 1.75, |
|
"learning_rate": 2.702726713592565e-06, |
|
"loss": 1.5649, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.2537855157942494, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 2.6976013121027814e-06, |
|
"loss": 1.596, |
|
"step": 4475 |
|
}, |
|
{ |
|
"epoch": 0.2552033119718709, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 2.6924759106129983e-06, |
|
"loss": 1.5722, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.2566211081494924, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 2.6873505091232147e-06, |
|
"loss": 1.5901, |
|
"step": 4525 |
|
}, |
|
{ |
|
"epoch": 0.25803890432711396, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 2.6822251076334316e-06, |
|
"loss": 1.5751, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.25945670050473546, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 2.677099706143648e-06, |
|
"loss": 1.5799, |
|
"step": 4575 |
|
}, |
|
{ |
|
"epoch": 0.26087449668235696, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 2.671974304653865e-06, |
|
"loss": 1.5533, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.26229229285997846, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 2.6668489031640814e-06, |
|
"loss": 1.5454, |
|
"step": 4625 |
|
}, |
|
{ |
|
"epoch": 0.26371008903759996, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 2.661723501674298e-06, |
|
"loss": 1.5528, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.26512788521522146, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 2.6565981001845147e-06, |
|
"loss": 1.5732, |
|
"step": 4675 |
|
}, |
|
{ |
|
"epoch": 0.26654568139284296, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 2.651472698694731e-06, |
|
"loss": 1.5799, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.26796347757046446, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 2.646347297204948e-06, |
|
"loss": 1.5679, |
|
"step": 4725 |
|
}, |
|
{ |
|
"epoch": 0.26938127374808596, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 2.6412218957151645e-06, |
|
"loss": 1.5455, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.27079906992570746, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 2.636096494225381e-06, |
|
"loss": 1.5666, |
|
"step": 4775 |
|
}, |
|
{ |
|
"epoch": 0.27221686610332896, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 2.6309710927355973e-06, |
|
"loss": 1.5726, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.2736346622809505, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 2.6258456912458142e-06, |
|
"loss": 1.588, |
|
"step": 4825 |
|
}, |
|
{ |
|
"epoch": 0.275052458458572, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 2.6207202897560307e-06, |
|
"loss": 1.5913, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.2764702546361935, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 2.6155948882662475e-06, |
|
"loss": 1.5448, |
|
"step": 4875 |
|
}, |
|
{ |
|
"epoch": 0.277888050813815, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 2.610469486776464e-06, |
|
"loss": 1.546, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.2793058469914365, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 2.605344085286681e-06, |
|
"loss": 1.5662, |
|
"step": 4925 |
|
}, |
|
{ |
|
"epoch": 0.280723643169058, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 2.6002186837968973e-06, |
|
"loss": 1.5662, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.2821414393466795, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 2.595093282307114e-06, |
|
"loss": 1.5658, |
|
"step": 4975 |
|
}, |
|
{ |
|
"epoch": 0.283559235524301, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 2.5899678808173306e-06, |
|
"loss": 1.5773, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.2849770317019225, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 2.5848424793275475e-06, |
|
"loss": 1.5609, |
|
"step": 5025 |
|
}, |
|
{ |
|
"epoch": 0.286394827879544, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 2.579717077837764e-06, |
|
"loss": 1.5894, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.28781262405716557, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 2.574591676347981e-06, |
|
"loss": 1.5775, |
|
"step": 5075 |
|
}, |
|
{ |
|
"epoch": 0.28923042023478707, |
|
"grad_norm": 1.625, |
|
"learning_rate": 2.5694662748581972e-06, |
|
"loss": 1.5585, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.29064821641240857, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 2.564340873368414e-06, |
|
"loss": 1.5626, |
|
"step": 5125 |
|
}, |
|
{ |
|
"epoch": 0.29206601259003007, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 2.5592154718786306e-06, |
|
"loss": 1.556, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.29348380876765157, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 2.5540900703888474e-06, |
|
"loss": 1.5664, |
|
"step": 5175 |
|
}, |
|
{ |
|
"epoch": 0.29490160494527307, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 2.548964668899064e-06, |
|
"loss": 1.5774, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.29631940112289457, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 2.5438392674092803e-06, |
|
"loss": 1.5716, |
|
"step": 5225 |
|
}, |
|
{ |
|
"epoch": 0.29773719730051607, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 2.538713865919497e-06, |
|
"loss": 1.5724, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.29915499347813757, |
|
"grad_norm": 2.125, |
|
"learning_rate": 2.5335884644297136e-06, |
|
"loss": 1.5428, |
|
"step": 5275 |
|
}, |
|
{ |
|
"epoch": 0.30057278965575907, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 2.5284630629399305e-06, |
|
"loss": 1.5703, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.3019905858333806, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 2.523337661450147e-06, |
|
"loss": 1.547, |
|
"step": 5325 |
|
}, |
|
{ |
|
"epoch": 0.3034083820110021, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 2.518212259960364e-06, |
|
"loss": 1.5526, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.3048261781886236, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 2.5130868584705803e-06, |
|
"loss": 1.5507, |
|
"step": 5375 |
|
}, |
|
{ |
|
"epoch": 0.3062439743662451, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 2.507961456980797e-06, |
|
"loss": 1.5654, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.3076617705438666, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 2.5028360554910136e-06, |
|
"loss": 1.5907, |
|
"step": 5425 |
|
}, |
|
{ |
|
"epoch": 0.3090795667214881, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 2.49771065400123e-06, |
|
"loss": 1.5457, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.3104973628991096, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 2.4925852525114465e-06, |
|
"loss": 1.5435, |
|
"step": 5475 |
|
}, |
|
{ |
|
"epoch": 0.3119151590767311, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 2.4874598510216633e-06, |
|
"loss": 1.5799, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.3133329552543526, |
|
"grad_norm": 1.625, |
|
"learning_rate": 2.48233444953188e-06, |
|
"loss": 1.5848, |
|
"step": 5525 |
|
}, |
|
{ |
|
"epoch": 0.3147507514319741, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 2.4772090480420967e-06, |
|
"loss": 1.5561, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.3161685476095956, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 2.472083646552313e-06, |
|
"loss": 1.5756, |
|
"step": 5575 |
|
}, |
|
{ |
|
"epoch": 0.3175863437872172, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 2.46695824506253e-06, |
|
"loss": 1.5723, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.3190041399648387, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 2.4618328435727464e-06, |
|
"loss": 1.5503, |
|
"step": 5625 |
|
}, |
|
{ |
|
"epoch": 0.3204219361424602, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 2.4567074420829633e-06, |
|
"loss": 1.5537, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.3218397323200817, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 2.4515820405931797e-06, |
|
"loss": 1.5652, |
|
"step": 5675 |
|
}, |
|
{ |
|
"epoch": 0.3232575284977032, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 2.4464566391033966e-06, |
|
"loss": 1.5675, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.3246753246753247, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 2.441331237613613e-06, |
|
"loss": 1.5658, |
|
"step": 5725 |
|
}, |
|
{ |
|
"epoch": 0.32609312085294617, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 2.43620583612383e-06, |
|
"loss": 1.5576, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.32751091703056767, |
|
"grad_norm": 1.5, |
|
"learning_rate": 2.4310804346340464e-06, |
|
"loss": 1.5751, |
|
"step": 5775 |
|
}, |
|
{ |
|
"epoch": 0.32892871320818917, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 2.4259550331442632e-06, |
|
"loss": 1.5758, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.33034650938581067, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 2.4208296316544797e-06, |
|
"loss": 1.5617, |
|
"step": 5825 |
|
}, |
|
{ |
|
"epoch": 0.3317643055634322, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 2.415704230164696e-06, |
|
"loss": 1.5638, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.3331821017410537, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 2.410578828674913e-06, |
|
"loss": 1.5722, |
|
"step": 5875 |
|
}, |
|
{ |
|
"epoch": 0.3345998979186752, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 2.4054534271851295e-06, |
|
"loss": 1.5682, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.3360176940962967, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 2.4003280256953463e-06, |
|
"loss": 1.5611, |
|
"step": 5925 |
|
}, |
|
{ |
|
"epoch": 0.3374354902739182, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 2.3952026242055628e-06, |
|
"loss": 1.5741, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.3388532864515397, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 2.3900772227157796e-06, |
|
"loss": 1.5527, |
|
"step": 5975 |
|
}, |
|
{ |
|
"epoch": 0.3402710826291612, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 2.384951821225996e-06, |
|
"loss": 1.5703, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.3416888788067827, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 2.379826419736213e-06, |
|
"loss": 1.5924, |
|
"step": 6025 |
|
}, |
|
{ |
|
"epoch": 0.3431066749844042, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 2.3747010182464294e-06, |
|
"loss": 1.565, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 0.3445244711620257, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 2.3695756167566463e-06, |
|
"loss": 1.5657, |
|
"step": 6075 |
|
}, |
|
{ |
|
"epoch": 0.3459422673396472, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 2.3644502152668627e-06, |
|
"loss": 1.5382, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.3473600635172688, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 2.3593248137770796e-06, |
|
"loss": 1.5631, |
|
"step": 6125 |
|
}, |
|
{ |
|
"epoch": 0.3487778596948903, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 2.3541994122872956e-06, |
|
"loss": 1.5612, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.3501956558725118, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 2.3490740107975125e-06, |
|
"loss": 1.542, |
|
"step": 6175 |
|
}, |
|
{ |
|
"epoch": 0.3516134520501333, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 2.343948609307729e-06, |
|
"loss": 1.547, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.3530312482277548, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 2.338823207817946e-06, |
|
"loss": 1.5521, |
|
"step": 6225 |
|
}, |
|
{ |
|
"epoch": 0.3544490444053763, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 2.3336978063281622e-06, |
|
"loss": 1.5392, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.3558668405829978, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 2.328572404838379e-06, |
|
"loss": 1.5626, |
|
"step": 6275 |
|
}, |
|
{ |
|
"epoch": 0.3572846367606193, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 2.3234470033485956e-06, |
|
"loss": 1.5753, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.3587024329382408, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 2.3183216018588124e-06, |
|
"loss": 1.5502, |
|
"step": 6325 |
|
}, |
|
{ |
|
"epoch": 0.3601202291158623, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 2.313196200369029e-06, |
|
"loss": 1.5427, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 0.36153802529348383, |
|
"grad_norm": 1.625, |
|
"learning_rate": 2.3080707988792457e-06, |
|
"loss": 1.5595, |
|
"step": 6375 |
|
}, |
|
{ |
|
"epoch": 0.36295582147110533, |
|
"grad_norm": 1.375, |
|
"learning_rate": 2.302945397389462e-06, |
|
"loss": 1.5756, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.36437361764872683, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 2.2978199958996786e-06, |
|
"loss": 1.5495, |
|
"step": 6425 |
|
}, |
|
{ |
|
"epoch": 0.36579141382634833, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 2.2926945944098955e-06, |
|
"loss": 1.5591, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 0.36720921000396983, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 2.287569192920112e-06, |
|
"loss": 1.5515, |
|
"step": 6475 |
|
}, |
|
{ |
|
"epoch": 0.36862700618159133, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 2.282443791430329e-06, |
|
"loss": 1.5557, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.37004480235921283, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 2.2773183899405453e-06, |
|
"loss": 1.5657, |
|
"step": 6525 |
|
}, |
|
{ |
|
"epoch": 0.37146259853683433, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 2.272192988450762e-06, |
|
"loss": 1.5626, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 0.37288039471445583, |
|
"grad_norm": 1.625, |
|
"learning_rate": 2.2670675869609786e-06, |
|
"loss": 1.5578, |
|
"step": 6575 |
|
}, |
|
{ |
|
"epoch": 0.37429819089207733, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 2.2619421854711955e-06, |
|
"loss": 1.5567, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.3757159870696989, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 2.256816783981412e-06, |
|
"loss": 1.5522, |
|
"step": 6625 |
|
}, |
|
{ |
|
"epoch": 0.3771337832473204, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 2.2516913824916288e-06, |
|
"loss": 1.557, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 0.3785515794249419, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 2.2465659810018452e-06, |
|
"loss": 1.5283, |
|
"step": 6675 |
|
}, |
|
{ |
|
"epoch": 0.3799693756025634, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 2.241440579512062e-06, |
|
"loss": 1.5619, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.3813871717801849, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 2.2363151780222785e-06, |
|
"loss": 1.5395, |
|
"step": 6725 |
|
}, |
|
{ |
|
"epoch": 0.3828049679578064, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 2.2311897765324954e-06, |
|
"loss": 1.5604, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 0.3842227641354279, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 2.226064375042712e-06, |
|
"loss": 1.5259, |
|
"step": 6775 |
|
}, |
|
{ |
|
"epoch": 0.3856405603130494, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 2.2209389735529287e-06, |
|
"loss": 1.5592, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.3870583564906709, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 2.215813572063145e-06, |
|
"loss": 1.5364, |
|
"step": 6825 |
|
}, |
|
{ |
|
"epoch": 0.3884761526682924, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 2.2106881705733616e-06, |
|
"loss": 1.5603, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 0.3898939488459139, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 2.205562769083578e-06, |
|
"loss": 1.5701, |
|
"step": 6875 |
|
}, |
|
{ |
|
"epoch": 0.39131174502353544, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 2.200437367593795e-06, |
|
"loss": 1.5886, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.39272954120115694, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 2.1953119661040114e-06, |
|
"loss": 1.5645, |
|
"step": 6925 |
|
}, |
|
{ |
|
"epoch": 0.39414733737877844, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 2.1901865646142282e-06, |
|
"loss": 1.5606, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 0.39556513355639994, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 2.1850611631244447e-06, |
|
"loss": 1.5466, |
|
"step": 6975 |
|
}, |
|
{ |
|
"epoch": 0.39698292973402144, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 2.179935761634661e-06, |
|
"loss": 1.556, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.39840072591164294, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 2.174810360144878e-06, |
|
"loss": 1.5753, |
|
"step": 7025 |
|
}, |
|
{ |
|
"epoch": 0.39981852208926444, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 2.1696849586550945e-06, |
|
"loss": 1.5631, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 0.40123631826688594, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 2.1645595571653113e-06, |
|
"loss": 1.5483, |
|
"step": 7075 |
|
}, |
|
{ |
|
"epoch": 0.40265411444450744, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 2.1594341556755278e-06, |
|
"loss": 1.5562, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.40407191062212894, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 2.1543087541857446e-06, |
|
"loss": 1.5561, |
|
"step": 7125 |
|
}, |
|
{ |
|
"epoch": 0.4054897067997505, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 2.149183352695961e-06, |
|
"loss": 1.5358, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 0.406907502977372, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 2.144057951206178e-06, |
|
"loss": 1.5465, |
|
"step": 7175 |
|
}, |
|
{ |
|
"epoch": 0.4083252991549935, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 2.1389325497163944e-06, |
|
"loss": 1.5762, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.409743095332615, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 2.1338071482266113e-06, |
|
"loss": 1.5593, |
|
"step": 7225 |
|
}, |
|
{ |
|
"epoch": 0.4111608915102365, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 2.1286817467368277e-06, |
|
"loss": 1.5578, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 0.412578687687858, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 2.1235563452470446e-06, |
|
"loss": 1.5889, |
|
"step": 7275 |
|
}, |
|
{ |
|
"epoch": 0.4139964838654795, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 2.118430943757261e-06, |
|
"loss": 1.5562, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.415414280043101, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 2.113305542267478e-06, |
|
"loss": 1.5316, |
|
"step": 7325 |
|
}, |
|
{ |
|
"epoch": 0.4168320762207225, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 2.1081801407776944e-06, |
|
"loss": 1.556, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 0.418249872398344, |
|
"grad_norm": 1.375, |
|
"learning_rate": 2.1030547392879112e-06, |
|
"loss": 1.5628, |
|
"step": 7375 |
|
}, |
|
{ |
|
"epoch": 0.41966766857596555, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 2.0979293377981277e-06, |
|
"loss": 1.555, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.42108546475358705, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 2.092803936308344e-06, |
|
"loss": 1.5561, |
|
"step": 7425 |
|
}, |
|
{ |
|
"epoch": 0.42250326093120855, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 2.087678534818561e-06, |
|
"loss": 1.5601, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 0.42392105710883005, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 2.0825531333287774e-06, |
|
"loss": 1.541, |
|
"step": 7475 |
|
}, |
|
{ |
|
"epoch": 0.42533885328645155, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 2.0774277318389943e-06, |
|
"loss": 1.564, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.42675664946407305, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 2.0723023303492107e-06, |
|
"loss": 1.5771, |
|
"step": 7525 |
|
}, |
|
{ |
|
"epoch": 0.42817444564169455, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 2.067176928859427e-06, |
|
"loss": 1.562, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 0.42959224181931605, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 2.062051527369644e-06, |
|
"loss": 1.5351, |
|
"step": 7575 |
|
}, |
|
{ |
|
"epoch": 0.43101003799693755, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 2.0569261258798605e-06, |
|
"loss": 1.5399, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.43242783417455904, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 2.051800724390077e-06, |
|
"loss": 1.5723, |
|
"step": 7625 |
|
}, |
|
{ |
|
"epoch": 0.43384563035218054, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 2.046675322900294e-06, |
|
"loss": 1.5707, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 0.4352634265298021, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 2.0415499214105103e-06, |
|
"loss": 1.5229, |
|
"step": 7675 |
|
}, |
|
{ |
|
"epoch": 0.4366812227074236, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 2.036424519920727e-06, |
|
"loss": 1.5318, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.4380990188850451, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 2.0312991184309436e-06, |
|
"loss": 1.5321, |
|
"step": 7725 |
|
}, |
|
{ |
|
"epoch": 0.4395168150626666, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 2.0261737169411605e-06, |
|
"loss": 1.5619, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 0.4409346112402881, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 2.021048315451377e-06, |
|
"loss": 1.5507, |
|
"step": 7775 |
|
}, |
|
{ |
|
"epoch": 0.4423524074179096, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 2.0159229139615938e-06, |
|
"loss": 1.5443, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.4437702035955311, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 2.0107975124718102e-06, |
|
"loss": 1.5362, |
|
"step": 7825 |
|
}, |
|
{ |
|
"epoch": 0.4451879997731526, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 2.005672110982027e-06, |
|
"loss": 1.5435, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 0.4466057959507741, |
|
"grad_norm": 1.625, |
|
"learning_rate": 2.0005467094922435e-06, |
|
"loss": 1.5365, |
|
"step": 7875 |
|
}, |
|
{ |
|
"epoch": 0.4480235921283956, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 1.9954213080024604e-06, |
|
"loss": 1.57, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.44944138830601715, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.990295906512677e-06, |
|
"loss": 1.5625, |
|
"step": 7925 |
|
}, |
|
{ |
|
"epoch": 0.45085918448363865, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.9851705050228937e-06, |
|
"loss": 1.5687, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 0.45227698066126015, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 1.98004510353311e-06, |
|
"loss": 1.5611, |
|
"step": 7975 |
|
}, |
|
{ |
|
"epoch": 0.45369477683888165, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 1.974919702043327e-06, |
|
"loss": 1.5497, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.45511257301650315, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.9697943005535435e-06, |
|
"loss": 1.5631, |
|
"step": 8025 |
|
}, |
|
{ |
|
"epoch": 0.45653036919412465, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 1.96466889906376e-06, |
|
"loss": 1.5564, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 0.45794816537174615, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 1.959543497573977e-06, |
|
"loss": 1.5502, |
|
"step": 8075 |
|
}, |
|
{ |
|
"epoch": 0.45936596154936765, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 1.9544180960841933e-06, |
|
"loss": 1.5428, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.46078375772698915, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.94929269459441e-06, |
|
"loss": 1.5859, |
|
"step": 8125 |
|
}, |
|
{ |
|
"epoch": 0.46220155390461065, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 1.9441672931046266e-06, |
|
"loss": 1.5673, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 0.46361935008223215, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.9390418916148434e-06, |
|
"loss": 1.5583, |
|
"step": 8175 |
|
}, |
|
{ |
|
"epoch": 0.4650371462598537, |
|
"grad_norm": 1.625, |
|
"learning_rate": 1.93391649012506e-06, |
|
"loss": 1.5509, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.4664549424374752, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 1.9287910886352768e-06, |
|
"loss": 1.539, |
|
"step": 8225 |
|
}, |
|
{ |
|
"epoch": 0.4678727386150967, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.9236656871454928e-06, |
|
"loss": 1.5393, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 0.4692905347927182, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 1.9185402856557096e-06, |
|
"loss": 1.5529, |
|
"step": 8275 |
|
}, |
|
{ |
|
"epoch": 0.4707083309703397, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 1.913414884165926e-06, |
|
"loss": 1.5485, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.4721261271479612, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.908289482676143e-06, |
|
"loss": 1.5476, |
|
"step": 8325 |
|
}, |
|
{ |
|
"epoch": 0.4735439233255827, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 1.9031640811863596e-06, |
|
"loss": 1.5571, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 0.4749617195032042, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.8980386796965763e-06, |
|
"loss": 1.557, |
|
"step": 8375 |
|
}, |
|
{ |
|
"epoch": 0.4763795156808257, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 1.8929132782067927e-06, |
|
"loss": 1.5768, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.4777973118584472, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.8877878767170094e-06, |
|
"loss": 1.5641, |
|
"step": 8425 |
|
}, |
|
{ |
|
"epoch": 0.47921510803606876, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 1.882662475227226e-06, |
|
"loss": 1.5534, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 0.48063290421369026, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 1.8775370737374427e-06, |
|
"loss": 1.5421, |
|
"step": 8475 |
|
}, |
|
{ |
|
"epoch": 0.48205070039131176, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.8724116722476594e-06, |
|
"loss": 1.5555, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.48346849656893326, |
|
"grad_norm": 1.375, |
|
"learning_rate": 1.867286270757876e-06, |
|
"loss": 1.5428, |
|
"step": 8525 |
|
}, |
|
{ |
|
"epoch": 0.48488629274655476, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 1.8621608692680927e-06, |
|
"loss": 1.5412, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 0.48630408892417626, |
|
"grad_norm": 1.5, |
|
"learning_rate": 1.8570354677783093e-06, |
|
"loss": 1.5354, |
|
"step": 8575 |
|
}, |
|
{ |
|
"epoch": 0.48772188510179776, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 1.851910066288526e-06, |
|
"loss": 1.5298, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.48913968127941926, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 1.8467846647987426e-06, |
|
"loss": 1.5496, |
|
"step": 8625 |
|
}, |
|
{ |
|
"epoch": 0.49055747745704076, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.8416592633089593e-06, |
|
"loss": 1.5764, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 0.49197527363466226, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 1.836533861819176e-06, |
|
"loss": 1.5439, |
|
"step": 8675 |
|
}, |
|
{ |
|
"epoch": 0.4933930698122838, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.8314084603293926e-06, |
|
"loss": 1.5625, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.4948108659899053, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.8262830588396093e-06, |
|
"loss": 1.5426, |
|
"step": 8725 |
|
}, |
|
{ |
|
"epoch": 0.4962286621675268, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 1.821157657349826e-06, |
|
"loss": 1.5572, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 0.4976464583451483, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 1.8160322558600426e-06, |
|
"loss": 1.5544, |
|
"step": 8775 |
|
}, |
|
{ |
|
"epoch": 0.4990642545227698, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 1.8109068543702593e-06, |
|
"loss": 1.5523, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.5004820507003913, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 1.8057814528804757e-06, |
|
"loss": 1.5566, |
|
"step": 8825 |
|
}, |
|
{ |
|
"epoch": 0.5018998468780128, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.8006560513906924e-06, |
|
"loss": 1.5398, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 0.5033176430556343, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 1.795530649900909e-06, |
|
"loss": 1.5565, |
|
"step": 8875 |
|
}, |
|
{ |
|
"epoch": 0.5047354392332558, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 1.7904052484111257e-06, |
|
"loss": 1.5548, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.5061532354108773, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 1.7852798469213421e-06, |
|
"loss": 1.5535, |
|
"step": 8925 |
|
}, |
|
{ |
|
"epoch": 0.5075710315884988, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.7801544454315588e-06, |
|
"loss": 1.5706, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 0.5089888277661203, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 1.7750290439417754e-06, |
|
"loss": 1.5568, |
|
"step": 8975 |
|
}, |
|
{ |
|
"epoch": 0.5104066239437418, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.7699036424519919e-06, |
|
"loss": 1.5563, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.5118244201213633, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 1.7647782409622085e-06, |
|
"loss": 1.5338, |
|
"step": 9025 |
|
}, |
|
{ |
|
"epoch": 0.5132422162989848, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.7596528394724252e-06, |
|
"loss": 1.5521, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 0.5146600124766063, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 1.7545274379826419e-06, |
|
"loss": 1.5492, |
|
"step": 9075 |
|
}, |
|
{ |
|
"epoch": 0.5160778086542279, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.7494020364928585e-06, |
|
"loss": 1.5345, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.5174956048318494, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 1.7442766350030752e-06, |
|
"loss": 1.5407, |
|
"step": 9125 |
|
}, |
|
{ |
|
"epoch": 0.5189134010094709, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 1.7391512335132918e-06, |
|
"loss": 1.5592, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 0.5203311971870924, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.7340258320235085e-06, |
|
"loss": 1.5387, |
|
"step": 9175 |
|
}, |
|
{ |
|
"epoch": 0.5217489933647139, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.7289004305337251e-06, |
|
"loss": 1.5425, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.5231667895423354, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 1.7237750290439418e-06, |
|
"loss": 1.5737, |
|
"step": 9225 |
|
}, |
|
{ |
|
"epoch": 0.5245845857199569, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.7186496275541585e-06, |
|
"loss": 1.5753, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 0.5260023818975784, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 1.7135242260643751e-06, |
|
"loss": 1.5684, |
|
"step": 9275 |
|
}, |
|
{ |
|
"epoch": 0.5274201780751999, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.7083988245745918e-06, |
|
"loss": 1.5471, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.5288379742528214, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 1.7032734230848084e-06, |
|
"loss": 1.5585, |
|
"step": 9325 |
|
}, |
|
{ |
|
"epoch": 0.5302557704304429, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.698148021595025e-06, |
|
"loss": 1.5689, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 0.5316735666080644, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.6930226201052418e-06, |
|
"loss": 1.5559, |
|
"step": 9375 |
|
}, |
|
{ |
|
"epoch": 0.5330913627856859, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.6878972186154584e-06, |
|
"loss": 1.536, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.5345091589633074, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.6827718171256749e-06, |
|
"loss": 1.5343, |
|
"step": 9425 |
|
}, |
|
{ |
|
"epoch": 0.5359269551409289, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.6776464156358915e-06, |
|
"loss": 1.5333, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 0.5373447513185504, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 1.6725210141461082e-06, |
|
"loss": 1.5387, |
|
"step": 9475 |
|
}, |
|
{ |
|
"epoch": 0.5387625474961719, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.6673956126563248e-06, |
|
"loss": 1.5467, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.5401803436737934, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.6622702111665415e-06, |
|
"loss": 1.5439, |
|
"step": 9525 |
|
}, |
|
{ |
|
"epoch": 0.5415981398514149, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.6571448096767582e-06, |
|
"loss": 1.5434, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 0.5430159360290364, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.6520194081869748e-06, |
|
"loss": 1.5438, |
|
"step": 9575 |
|
}, |
|
{ |
|
"epoch": 0.5444337322066579, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.6468940066971915e-06, |
|
"loss": 1.5222, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.5458515283842795, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 1.6417686052074077e-06, |
|
"loss": 1.5479, |
|
"step": 9625 |
|
}, |
|
{ |
|
"epoch": 0.547269324561901, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.6366432037176244e-06, |
|
"loss": 1.5733, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 0.5486871207395225, |
|
"grad_norm": 1.625, |
|
"learning_rate": 1.631517802227841e-06, |
|
"loss": 1.5279, |
|
"step": 9675 |
|
}, |
|
{ |
|
"epoch": 0.550104916917144, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.6263924007380577e-06, |
|
"loss": 1.5554, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.5515227130947655, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.6212669992482743e-06, |
|
"loss": 1.5095, |
|
"step": 9725 |
|
}, |
|
{ |
|
"epoch": 0.552940509272387, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.616141597758491e-06, |
|
"loss": 1.5139, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 0.5543583054500085, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 1.6110161962687077e-06, |
|
"loss": 1.5381, |
|
"step": 9775 |
|
}, |
|
{ |
|
"epoch": 0.55577610162763, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 1.6058907947789243e-06, |
|
"loss": 1.5252, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.5571938978052515, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.600765393289141e-06, |
|
"loss": 1.5623, |
|
"step": 9825 |
|
}, |
|
{ |
|
"epoch": 0.558611693982873, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 1.5956399917993576e-06, |
|
"loss": 1.5585, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 0.5600294901604945, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 1.5905145903095743e-06, |
|
"loss": 1.5705, |
|
"step": 9875 |
|
}, |
|
{ |
|
"epoch": 0.561447286338116, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 1.585389188819791e-06, |
|
"loss": 1.5343, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.5628650825157375, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.5802637873300076e-06, |
|
"loss": 1.5257, |
|
"step": 9925 |
|
}, |
|
{ |
|
"epoch": 0.564282878693359, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 1.5751383858402243e-06, |
|
"loss": 1.5605, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 0.5657006748709805, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 1.570012984350441e-06, |
|
"loss": 1.5301, |
|
"step": 9975 |
|
}, |
|
{ |
|
"epoch": 0.567118471048602, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 1.5648875828606576e-06, |
|
"loss": 1.5498, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.5685362672262235, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 1.559762181370874e-06, |
|
"loss": 1.5258, |
|
"step": 10025 |
|
}, |
|
{ |
|
"epoch": 0.569954063403845, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.5546367798810907e-06, |
|
"loss": 1.5391, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 0.5713718595814665, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 1.5495113783913073e-06, |
|
"loss": 1.5478, |
|
"step": 10075 |
|
}, |
|
{ |
|
"epoch": 0.572789655759088, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 1.544385976901524e-06, |
|
"loss": 1.5388, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.5742074519367096, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.5392605754117407e-06, |
|
"loss": 1.5413, |
|
"step": 10125 |
|
}, |
|
{ |
|
"epoch": 0.5756252481143311, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.5341351739219573e-06, |
|
"loss": 1.5519, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 0.5770430442919526, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.529009772432174e-06, |
|
"loss": 1.5516, |
|
"step": 10175 |
|
}, |
|
{ |
|
"epoch": 0.5784608404695741, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 1.5238843709423906e-06, |
|
"loss": 1.5193, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.5798786366471956, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.5187589694526073e-06, |
|
"loss": 1.5562, |
|
"step": 10225 |
|
}, |
|
{ |
|
"epoch": 0.5812964328248171, |
|
"grad_norm": 1.625, |
|
"learning_rate": 1.513633567962824e-06, |
|
"loss": 1.5564, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 0.5827142290024386, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 1.5085081664730406e-06, |
|
"loss": 1.5505, |
|
"step": 10275 |
|
}, |
|
{ |
|
"epoch": 0.5841320251800601, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 1.5033827649832573e-06, |
|
"loss": 1.5526, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.5855498213576816, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.4982573634934737e-06, |
|
"loss": 1.5544, |
|
"step": 10325 |
|
}, |
|
{ |
|
"epoch": 0.5869676175353031, |
|
"grad_norm": 1.625, |
|
"learning_rate": 1.4931319620036904e-06, |
|
"loss": 1.5197, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 0.5883854137129246, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.488006560513907e-06, |
|
"loss": 1.5515, |
|
"step": 10375 |
|
}, |
|
{ |
|
"epoch": 0.5898032098905461, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.4828811590241237e-06, |
|
"loss": 1.542, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.5912210060681676, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.4777557575343403e-06, |
|
"loss": 1.5378, |
|
"step": 10425 |
|
}, |
|
{ |
|
"epoch": 0.5926388022457891, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 1.472630356044557e-06, |
|
"loss": 1.5499, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 0.5940565984234106, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.4675049545547737e-06, |
|
"loss": 1.5717, |
|
"step": 10475 |
|
}, |
|
{ |
|
"epoch": 0.5954743946010321, |
|
"grad_norm": 1.5, |
|
"learning_rate": 1.46237955306499e-06, |
|
"loss": 1.5418, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.5968921907786536, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 1.4572541515752068e-06, |
|
"loss": 1.5272, |
|
"step": 10525 |
|
}, |
|
{ |
|
"epoch": 0.5983099869562751, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 1.4521287500854234e-06, |
|
"loss": 1.5359, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 0.5997277831338966, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.44700334859564e-06, |
|
"loss": 1.546, |
|
"step": 10575 |
|
}, |
|
{ |
|
"epoch": 0.6011455793115181, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 1.4418779471058567e-06, |
|
"loss": 1.5394, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.6025633754891396, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.4367525456160732e-06, |
|
"loss": 1.5449, |
|
"step": 10625 |
|
}, |
|
{ |
|
"epoch": 0.6039811716667612, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.4316271441262898e-06, |
|
"loss": 1.5449, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 0.6053989678443827, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.4265017426365065e-06, |
|
"loss": 1.5255, |
|
"step": 10675 |
|
}, |
|
{ |
|
"epoch": 0.6068167640220042, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 1.4213763411467232e-06, |
|
"loss": 1.559, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.6082345601996257, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 1.4162509396569398e-06, |
|
"loss": 1.5654, |
|
"step": 10725 |
|
}, |
|
{ |
|
"epoch": 0.6096523563772472, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.4111255381671565e-06, |
|
"loss": 1.5608, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 0.6110701525548687, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.4060001366773731e-06, |
|
"loss": 1.5727, |
|
"step": 10775 |
|
}, |
|
{ |
|
"epoch": 0.6124879487324902, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 1.4008747351875898e-06, |
|
"loss": 1.55, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.6139057449101117, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 1.3957493336978064e-06, |
|
"loss": 1.5276, |
|
"step": 10825 |
|
}, |
|
{ |
|
"epoch": 0.6153235410877332, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 1.3906239322080229e-06, |
|
"loss": 1.5596, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 0.6167413372653547, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.3854985307182395e-06, |
|
"loss": 1.5531, |
|
"step": 10875 |
|
}, |
|
{ |
|
"epoch": 0.6181591334429762, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 1.3803731292284562e-06, |
|
"loss": 1.5493, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.6195769296205977, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.3752477277386729e-06, |
|
"loss": 1.549, |
|
"step": 10925 |
|
}, |
|
{ |
|
"epoch": 0.6209947257982192, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 1.3701223262488895e-06, |
|
"loss": 1.5222, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 0.6224125219758407, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.3649969247591062e-06, |
|
"loss": 1.5372, |
|
"step": 10975 |
|
}, |
|
{ |
|
"epoch": 0.6238303181534622, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 1.3598715232693228e-06, |
|
"loss": 1.5489, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.6252481143310837, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.3547461217795395e-06, |
|
"loss": 1.5413, |
|
"step": 11025 |
|
}, |
|
{ |
|
"epoch": 0.6266659105087052, |
|
"grad_norm": 1.5, |
|
"learning_rate": 1.3496207202897562e-06, |
|
"loss": 1.5258, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 0.6280837066863267, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.3444953187999728e-06, |
|
"loss": 1.5472, |
|
"step": 11075 |
|
}, |
|
{ |
|
"epoch": 0.6295015028639482, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 1.3393699173101895e-06, |
|
"loss": 1.5343, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.6309192990415697, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.334244515820406e-06, |
|
"loss": 1.5464, |
|
"step": 11125 |
|
}, |
|
{ |
|
"epoch": 0.6323370952191912, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.3291191143306226e-06, |
|
"loss": 1.5363, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 0.6337548913968128, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 1.3239937128408392e-06, |
|
"loss": 1.5315, |
|
"step": 11175 |
|
}, |
|
{ |
|
"epoch": 0.6351726875744343, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.3188683113510559e-06, |
|
"loss": 1.5516, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.6365904837520558, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.3137429098612723e-06, |
|
"loss": 1.5475, |
|
"step": 11225 |
|
}, |
|
{ |
|
"epoch": 0.6380082799296773, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.308617508371489e-06, |
|
"loss": 1.529, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 0.6394260761072988, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.3034921068817057e-06, |
|
"loss": 1.5497, |
|
"step": 11275 |
|
}, |
|
{ |
|
"epoch": 0.6408438722849203, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.2983667053919223e-06, |
|
"loss": 1.5614, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.6422616684625418, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.293241303902139e-06, |
|
"loss": 1.5547, |
|
"step": 11325 |
|
}, |
|
{ |
|
"epoch": 0.6436794646401633, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 1.2881159024123556e-06, |
|
"loss": 1.5541, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 0.6450972608177848, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.2829905009225723e-06, |
|
"loss": 1.5661, |
|
"step": 11375 |
|
}, |
|
{ |
|
"epoch": 0.6465150569954063, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.277865099432789e-06, |
|
"loss": 1.5288, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.6479328531730278, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.2727396979430056e-06, |
|
"loss": 1.5425, |
|
"step": 11425 |
|
}, |
|
{ |
|
"epoch": 0.6493506493506493, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.2676142964532223e-06, |
|
"loss": 1.5383, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 0.6507684455282708, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.262488894963439e-06, |
|
"loss": 1.556, |
|
"step": 11475 |
|
}, |
|
{ |
|
"epoch": 0.6521862417058923, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.2573634934736556e-06, |
|
"loss": 1.5487, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.6536040378835138, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.2522380919838722e-06, |
|
"loss": 1.5661, |
|
"step": 11525 |
|
}, |
|
{ |
|
"epoch": 0.6550218340611353, |
|
"grad_norm": 1.625, |
|
"learning_rate": 1.2471126904940887e-06, |
|
"loss": 1.5446, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 0.6564396302387568, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 1.2419872890043053e-06, |
|
"loss": 1.5546, |
|
"step": 11575 |
|
}, |
|
{ |
|
"epoch": 0.6578574264163783, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.236861887514522e-06, |
|
"loss": 1.5458, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.6592752225939998, |
|
"grad_norm": 1.5, |
|
"learning_rate": 1.2317364860247387e-06, |
|
"loss": 1.5684, |
|
"step": 11625 |
|
}, |
|
{ |
|
"epoch": 0.6606930187716213, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.2266110845349553e-06, |
|
"loss": 1.5305, |
|
"step": 11650 |
|
}, |
|
{ |
|
"epoch": 0.6621108149492428, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 1.221485683045172e-06, |
|
"loss": 1.5331, |
|
"step": 11675 |
|
}, |
|
{ |
|
"epoch": 0.6635286111268645, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.2163602815553886e-06, |
|
"loss": 1.5263, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.664946407304486, |
|
"grad_norm": 1.375, |
|
"learning_rate": 1.211234880065605e-06, |
|
"loss": 1.5346, |
|
"step": 11725 |
|
}, |
|
{ |
|
"epoch": 0.6663642034821075, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.2061094785758217e-06, |
|
"loss": 1.525, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 0.667781999659729, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 1.2009840770860384e-06, |
|
"loss": 1.5601, |
|
"step": 11775 |
|
}, |
|
{ |
|
"epoch": 0.6691997958373505, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 1.195858675596255e-06, |
|
"loss": 1.5598, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.670617592014972, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.1907332741064717e-06, |
|
"loss": 1.5205, |
|
"step": 11825 |
|
}, |
|
{ |
|
"epoch": 0.6720353881925935, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.1856078726166884e-06, |
|
"loss": 1.5609, |
|
"step": 11850 |
|
}, |
|
{ |
|
"epoch": 0.673453184370215, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 1.180482471126905e-06, |
|
"loss": 1.5311, |
|
"step": 11875 |
|
}, |
|
{ |
|
"epoch": 0.6748709805478365, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 1.1753570696371215e-06, |
|
"loss": 1.5562, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.676288776725458, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 1.1702316681473381e-06, |
|
"loss": 1.5397, |
|
"step": 11925 |
|
}, |
|
{ |
|
"epoch": 0.6777065729030795, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 1.1651062666575548e-06, |
|
"loss": 1.5235, |
|
"step": 11950 |
|
}, |
|
{ |
|
"epoch": 0.679124369080701, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.1599808651677714e-06, |
|
"loss": 1.5541, |
|
"step": 11975 |
|
}, |
|
{ |
|
"epoch": 0.6805421652583225, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 1.1548554636779881e-06, |
|
"loss": 1.5365, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.681959961435944, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.1497300621882048e-06, |
|
"loss": 1.5254, |
|
"step": 12025 |
|
}, |
|
{ |
|
"epoch": 0.6833777576135655, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 1.1446046606984214e-06, |
|
"loss": 1.5569, |
|
"step": 12050 |
|
}, |
|
{ |
|
"epoch": 0.684795553791187, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.139479259208638e-06, |
|
"loss": 1.5388, |
|
"step": 12075 |
|
}, |
|
{ |
|
"epoch": 0.6862133499688085, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.1343538577188547e-06, |
|
"loss": 1.5449, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.68763114614643, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 1.1292284562290714e-06, |
|
"loss": 1.5471, |
|
"step": 12125 |
|
}, |
|
{ |
|
"epoch": 0.6890489423240514, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 1.124103054739288e-06, |
|
"loss": 1.5367, |
|
"step": 12150 |
|
}, |
|
{ |
|
"epoch": 0.690466738501673, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.1189776532495047e-06, |
|
"loss": 1.5452, |
|
"step": 12175 |
|
}, |
|
{ |
|
"epoch": 0.6918845346792944, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 1.1138522517597214e-06, |
|
"loss": 1.54, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.6933023308569161, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 1.1087268502699378e-06, |
|
"loss": 1.5475, |
|
"step": 12225 |
|
}, |
|
{ |
|
"epoch": 0.6947201270345376, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.1036014487801545e-06, |
|
"loss": 1.552, |
|
"step": 12250 |
|
}, |
|
{ |
|
"epoch": 0.6961379232121591, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.0984760472903711e-06, |
|
"loss": 1.5461, |
|
"step": 12275 |
|
}, |
|
{ |
|
"epoch": 0.6975557193897806, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.0933506458005878e-06, |
|
"loss": 1.5243, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 0.6989735155674021, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.0882252443108042e-06, |
|
"loss": 1.5551, |
|
"step": 12325 |
|
}, |
|
{ |
|
"epoch": 0.7003913117450236, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.083099842821021e-06, |
|
"loss": 1.5374, |
|
"step": 12350 |
|
}, |
|
{ |
|
"epoch": 0.7018091079226451, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 1.0779744413312376e-06, |
|
"loss": 1.55, |
|
"step": 12375 |
|
}, |
|
{ |
|
"epoch": 0.7032269041002666, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.0728490398414542e-06, |
|
"loss": 1.5355, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.7046447002778881, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.0677236383516709e-06, |
|
"loss": 1.5465, |
|
"step": 12425 |
|
}, |
|
{ |
|
"epoch": 0.7060624964555096, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 1.0625982368618875e-06, |
|
"loss": 1.5404, |
|
"step": 12450 |
|
}, |
|
{ |
|
"epoch": 0.7074802926331311, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 1.0574728353721042e-06, |
|
"loss": 1.5264, |
|
"step": 12475 |
|
}, |
|
{ |
|
"epoch": 0.7088980888107526, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.0523474338823208e-06, |
|
"loss": 1.5598, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.7103158849883741, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.0472220323925375e-06, |
|
"loss": 1.5594, |
|
"step": 12525 |
|
}, |
|
{ |
|
"epoch": 0.7117336811659956, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.0420966309027542e-06, |
|
"loss": 1.5319, |
|
"step": 12550 |
|
}, |
|
{ |
|
"epoch": 0.7131514773436171, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.0369712294129708e-06, |
|
"loss": 1.5403, |
|
"step": 12575 |
|
}, |
|
{ |
|
"epoch": 0.7145692735212386, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.0318458279231873e-06, |
|
"loss": 1.5131, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 0.7159870696988601, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 1.026720426433404e-06, |
|
"loss": 1.5628, |
|
"step": 12625 |
|
}, |
|
{ |
|
"epoch": 0.7174048658764816, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.0215950249436206e-06, |
|
"loss": 1.5474, |
|
"step": 12650 |
|
}, |
|
{ |
|
"epoch": 0.7188226620541031, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 1.0164696234538372e-06, |
|
"loss": 1.5401, |
|
"step": 12675 |
|
}, |
|
{ |
|
"epoch": 0.7202404582317246, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.011344221964054e-06, |
|
"loss": 1.5464, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 0.7216582544093462, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.0062188204742706e-06, |
|
"loss": 1.5344, |
|
"step": 12725 |
|
}, |
|
{ |
|
"epoch": 0.7230760505869677, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.0010934189844872e-06, |
|
"loss": 1.5035, |
|
"step": 12750 |
|
}, |
|
{ |
|
"epoch": 0.7244938467645892, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 9.959680174947039e-07, |
|
"loss": 1.5582, |
|
"step": 12775 |
|
}, |
|
{ |
|
"epoch": 0.7259116429422107, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 9.908426160049205e-07, |
|
"loss": 1.5341, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.7273294391198322, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 9.85717214515137e-07, |
|
"loss": 1.5449, |
|
"step": 12825 |
|
}, |
|
{ |
|
"epoch": 0.7287472352974537, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 9.805918130253536e-07, |
|
"loss": 1.5297, |
|
"step": 12850 |
|
}, |
|
{ |
|
"epoch": 0.7301650314750752, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 9.754664115355703e-07, |
|
"loss": 1.5498, |
|
"step": 12875 |
|
}, |
|
{ |
|
"epoch": 0.7315828276526967, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 9.70341010045787e-07, |
|
"loss": 1.5448, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 0.7330006238303182, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 9.652156085560036e-07, |
|
"loss": 1.5242, |
|
"step": 12925 |
|
}, |
|
{ |
|
"epoch": 0.7344184200079397, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 9.6009020706622e-07, |
|
"loss": 1.5191, |
|
"step": 12950 |
|
}, |
|
{ |
|
"epoch": 0.7358362161855612, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 9.549648055764367e-07, |
|
"loss": 1.532, |
|
"step": 12975 |
|
}, |
|
{ |
|
"epoch": 0.7372540123631827, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 9.498394040866535e-07, |
|
"loss": 1.5711, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.7386718085408042, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 9.4471400259687e-07, |
|
"loss": 1.5418, |
|
"step": 13025 |
|
}, |
|
{ |
|
"epoch": 0.7400896047184257, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 9.395886011070867e-07, |
|
"loss": 1.5549, |
|
"step": 13050 |
|
}, |
|
{ |
|
"epoch": 0.7415074008960472, |
|
"grad_norm": 1.625, |
|
"learning_rate": 9.344631996173033e-07, |
|
"loss": 1.5404, |
|
"step": 13075 |
|
}, |
|
{ |
|
"epoch": 0.7429251970736687, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 9.2933779812752e-07, |
|
"loss": 1.5488, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 0.7443429932512902, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 9.242123966377367e-07, |
|
"loss": 1.5429, |
|
"step": 13125 |
|
}, |
|
{ |
|
"epoch": 0.7457607894289117, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 9.190869951479533e-07, |
|
"loss": 1.5235, |
|
"step": 13150 |
|
}, |
|
{ |
|
"epoch": 0.7471785856065332, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 9.1396159365817e-07, |
|
"loss": 1.5297, |
|
"step": 13175 |
|
}, |
|
{ |
|
"epoch": 0.7485963817841547, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 9.088361921683866e-07, |
|
"loss": 1.5214, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.7500141779617762, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 9.037107906786032e-07, |
|
"loss": 1.5256, |
|
"step": 13225 |
|
}, |
|
{ |
|
"epoch": 0.7514319741393978, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 8.985853891888198e-07, |
|
"loss": 1.5405, |
|
"step": 13250 |
|
}, |
|
{ |
|
"epoch": 0.7528497703170193, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 8.934599876990365e-07, |
|
"loss": 1.5277, |
|
"step": 13275 |
|
}, |
|
{ |
|
"epoch": 0.7542675664946408, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 8.883345862092531e-07, |
|
"loss": 1.5297, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 0.7556853626722623, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 8.832091847194696e-07, |
|
"loss": 1.5141, |
|
"step": 13325 |
|
}, |
|
{ |
|
"epoch": 0.7571031588498838, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 8.780837832296863e-07, |
|
"loss": 1.5404, |
|
"step": 13350 |
|
}, |
|
{ |
|
"epoch": 0.7585209550275053, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 8.729583817399029e-07, |
|
"loss": 1.5397, |
|
"step": 13375 |
|
}, |
|
{ |
|
"epoch": 0.7599387512051268, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 8.678329802501196e-07, |
|
"loss": 1.5652, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 0.7613565473827483, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 8.627075787603362e-07, |
|
"loss": 1.5527, |
|
"step": 13425 |
|
}, |
|
{ |
|
"epoch": 0.7627743435603698, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 8.575821772705529e-07, |
|
"loss": 1.5429, |
|
"step": 13450 |
|
}, |
|
{ |
|
"epoch": 0.7641921397379913, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 8.524567757807696e-07, |
|
"loss": 1.5511, |
|
"step": 13475 |
|
}, |
|
{ |
|
"epoch": 0.7656099359156128, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 8.473313742909862e-07, |
|
"loss": 1.5494, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.7670277320932343, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 8.422059728012028e-07, |
|
"loss": 1.5272, |
|
"step": 13525 |
|
}, |
|
{ |
|
"epoch": 0.7684455282708558, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 8.370805713114194e-07, |
|
"loss": 1.5361, |
|
"step": 13550 |
|
}, |
|
{ |
|
"epoch": 0.7698633244484773, |
|
"grad_norm": 1.625, |
|
"learning_rate": 8.319551698216361e-07, |
|
"loss": 1.5051, |
|
"step": 13575 |
|
}, |
|
{ |
|
"epoch": 0.7712811206260988, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 8.268297683318527e-07, |
|
"loss": 1.5396, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 0.7726989168037203, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 8.217043668420694e-07, |
|
"loss": 1.5366, |
|
"step": 13625 |
|
}, |
|
{ |
|
"epoch": 0.7741167129813418, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 8.165789653522858e-07, |
|
"loss": 1.5285, |
|
"step": 13650 |
|
}, |
|
{ |
|
"epoch": 0.7755345091589633, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 8.114535638625025e-07, |
|
"loss": 1.5392, |
|
"step": 13675 |
|
}, |
|
{ |
|
"epoch": 0.7769523053365848, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 8.063281623727192e-07, |
|
"loss": 1.5187, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 0.7783701015142063, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 8.012027608829358e-07, |
|
"loss": 1.532, |
|
"step": 13725 |
|
}, |
|
{ |
|
"epoch": 0.7797878976918278, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 7.960773593931525e-07, |
|
"loss": 1.5551, |
|
"step": 13750 |
|
}, |
|
{ |
|
"epoch": 0.7812056938694494, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 7.909519579033691e-07, |
|
"loss": 1.5561, |
|
"step": 13775 |
|
}, |
|
{ |
|
"epoch": 0.7826234900470709, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 7.858265564135858e-07, |
|
"loss": 1.5498, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 0.7840412862246924, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 7.807011549238023e-07, |
|
"loss": 1.5047, |
|
"step": 13825 |
|
}, |
|
{ |
|
"epoch": 0.7854590824023139, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 7.75575753434019e-07, |
|
"loss": 1.5347, |
|
"step": 13850 |
|
}, |
|
{ |
|
"epoch": 0.7868768785799354, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 7.704503519442357e-07, |
|
"loss": 1.5385, |
|
"step": 13875 |
|
}, |
|
{ |
|
"epoch": 0.7882946747575569, |
|
"grad_norm": 2.875, |
|
"learning_rate": 7.653249504544523e-07, |
|
"loss": 1.5458, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 0.7897124709351784, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 7.60199548964669e-07, |
|
"loss": 1.5497, |
|
"step": 13925 |
|
}, |
|
{ |
|
"epoch": 0.7911302671127999, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 7.550741474748856e-07, |
|
"loss": 1.5308, |
|
"step": 13950 |
|
}, |
|
{ |
|
"epoch": 0.7925480632904214, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 7.499487459851022e-07, |
|
"loss": 1.528, |
|
"step": 13975 |
|
}, |
|
{ |
|
"epoch": 0.7939658594680429, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 7.448233444953189e-07, |
|
"loss": 1.5413, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.7953836556456644, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 7.396979430055355e-07, |
|
"loss": 1.5175, |
|
"step": 14025 |
|
}, |
|
{ |
|
"epoch": 0.7968014518232859, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 7.345725415157522e-07, |
|
"loss": 1.5351, |
|
"step": 14050 |
|
}, |
|
{ |
|
"epoch": 0.7982192480009074, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 7.294471400259687e-07, |
|
"loss": 1.544, |
|
"step": 14075 |
|
}, |
|
{ |
|
"epoch": 0.7996370441785289, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 7.243217385361854e-07, |
|
"loss": 1.5715, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 0.8010548403561504, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 7.191963370464019e-07, |
|
"loss": 1.5418, |
|
"step": 14125 |
|
}, |
|
{ |
|
"epoch": 0.8024726365337719, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 7.140709355566186e-07, |
|
"loss": 1.5358, |
|
"step": 14150 |
|
}, |
|
{ |
|
"epoch": 0.8038904327113934, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 7.089455340668352e-07, |
|
"loss": 1.5244, |
|
"step": 14175 |
|
}, |
|
{ |
|
"epoch": 0.8053082288890149, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 7.038201325770519e-07, |
|
"loss": 1.5106, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 0.8067260250666364, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 6.986947310872686e-07, |
|
"loss": 1.5303, |
|
"step": 14225 |
|
}, |
|
{ |
|
"epoch": 0.8081438212442579, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 6.935693295974851e-07, |
|
"loss": 1.5587, |
|
"step": 14250 |
|
}, |
|
{ |
|
"epoch": 0.8095616174218794, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 6.884439281077018e-07, |
|
"loss": 1.5189, |
|
"step": 14275 |
|
}, |
|
{ |
|
"epoch": 0.810979413599501, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 6.833185266179184e-07, |
|
"loss": 1.5448, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 0.8123972097771225, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 6.781931251281351e-07, |
|
"loss": 1.5461, |
|
"step": 14325 |
|
}, |
|
{ |
|
"epoch": 0.813815005954744, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 6.730677236383517e-07, |
|
"loss": 1.5511, |
|
"step": 14350 |
|
}, |
|
{ |
|
"epoch": 0.8152328021323655, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 6.679423221485683e-07, |
|
"loss": 1.5455, |
|
"step": 14375 |
|
}, |
|
{ |
|
"epoch": 0.816650598309987, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 6.62816920658785e-07, |
|
"loss": 1.5559, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 0.8180683944876085, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 6.576915191690015e-07, |
|
"loss": 1.5264, |
|
"step": 14425 |
|
}, |
|
{ |
|
"epoch": 0.81948619066523, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 6.525661176792182e-07, |
|
"loss": 1.5374, |
|
"step": 14450 |
|
}, |
|
{ |
|
"epoch": 0.8209039868428515, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 6.474407161894348e-07, |
|
"loss": 1.5376, |
|
"step": 14475 |
|
}, |
|
{ |
|
"epoch": 0.822321783020473, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 6.423153146996515e-07, |
|
"loss": 1.5514, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.8237395791980945, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 6.371899132098681e-07, |
|
"loss": 1.5478, |
|
"step": 14525 |
|
}, |
|
{ |
|
"epoch": 0.825157375375716, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 6.320645117200848e-07, |
|
"loss": 1.5213, |
|
"step": 14550 |
|
}, |
|
{ |
|
"epoch": 0.8265751715533375, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 6.269391102303015e-07, |
|
"loss": 1.544, |
|
"step": 14575 |
|
}, |
|
{ |
|
"epoch": 0.827992967730959, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 6.21813708740518e-07, |
|
"loss": 1.5342, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 0.8294107639085805, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 6.166883072507347e-07, |
|
"loss": 1.5428, |
|
"step": 14625 |
|
}, |
|
{ |
|
"epoch": 0.830828560086202, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 6.115629057609513e-07, |
|
"loss": 1.5419, |
|
"step": 14650 |
|
}, |
|
{ |
|
"epoch": 0.8322463562638235, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 6.064375042711679e-07, |
|
"loss": 1.5475, |
|
"step": 14675 |
|
}, |
|
{ |
|
"epoch": 0.833664152441445, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 6.013121027813845e-07, |
|
"loss": 1.5467, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 0.8350819486190665, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 5.961867012916012e-07, |
|
"loss": 1.5364, |
|
"step": 14725 |
|
}, |
|
{ |
|
"epoch": 0.836499744796688, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 5.910612998018179e-07, |
|
"loss": 1.5297, |
|
"step": 14750 |
|
}, |
|
{ |
|
"epoch": 0.8379175409743095, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 5.859358983120344e-07, |
|
"loss": 1.5556, |
|
"step": 14775 |
|
}, |
|
{ |
|
"epoch": 0.8393353371519311, |
|
"grad_norm": 1.75, |
|
"learning_rate": 5.808104968222511e-07, |
|
"loss": 1.5352, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 0.8407531333295526, |
|
"grad_norm": 1.5, |
|
"learning_rate": 5.756850953324677e-07, |
|
"loss": 1.5577, |
|
"step": 14825 |
|
}, |
|
{ |
|
"epoch": 0.8421709295071741, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 5.705596938426844e-07, |
|
"loss": 1.5562, |
|
"step": 14850 |
|
}, |
|
{ |
|
"epoch": 0.8435887256847956, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 5.65434292352901e-07, |
|
"loss": 1.5538, |
|
"step": 14875 |
|
}, |
|
{ |
|
"epoch": 0.8450065218624171, |
|
"grad_norm": 1.75, |
|
"learning_rate": 5.603088908631177e-07, |
|
"loss": 1.5314, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 0.8464243180400386, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 5.551834893733342e-07, |
|
"loss": 1.5466, |
|
"step": 14925 |
|
}, |
|
{ |
|
"epoch": 0.8478421142176601, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 5.500580878835509e-07, |
|
"loss": 1.5408, |
|
"step": 14950 |
|
}, |
|
{ |
|
"epoch": 0.8492599103952816, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 5.449326863937675e-07, |
|
"loss": 1.5476, |
|
"step": 14975 |
|
}, |
|
{ |
|
"epoch": 0.8506777065729031, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 5.398072849039841e-07, |
|
"loss": 1.5487, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.8520955027505246, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 5.346818834142008e-07, |
|
"loss": 1.5147, |
|
"step": 15025 |
|
}, |
|
{ |
|
"epoch": 0.8535132989281461, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 5.295564819244174e-07, |
|
"loss": 1.5342, |
|
"step": 15050 |
|
}, |
|
{ |
|
"epoch": 0.8549310951057676, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 5.244310804346341e-07, |
|
"loss": 1.5487, |
|
"step": 15075 |
|
}, |
|
{ |
|
"epoch": 0.8563488912833891, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 5.193056789448507e-07, |
|
"loss": 1.5365, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 0.8577666874610106, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 5.141802774550673e-07, |
|
"loss": 1.5347, |
|
"step": 15125 |
|
}, |
|
{ |
|
"epoch": 0.8591844836386321, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 5.09054875965284e-07, |
|
"loss": 1.5168, |
|
"step": 15150 |
|
}, |
|
{ |
|
"epoch": 0.8606022798162536, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 5.039294744755006e-07, |
|
"loss": 1.5616, |
|
"step": 15175 |
|
}, |
|
{ |
|
"epoch": 0.8620200759938751, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 4.988040729857173e-07, |
|
"loss": 1.5387, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 0.8634378721714966, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 4.936786714959338e-07, |
|
"loss": 1.533, |
|
"step": 15225 |
|
}, |
|
{ |
|
"epoch": 0.8648556683491181, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 4.885532700061505e-07, |
|
"loss": 1.5412, |
|
"step": 15250 |
|
}, |
|
{ |
|
"epoch": 0.8662734645267396, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 4.834278685163671e-07, |
|
"loss": 1.5347, |
|
"step": 15275 |
|
}, |
|
{ |
|
"epoch": 0.8676912607043611, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 4.783024670265837e-07, |
|
"loss": 1.5393, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 0.8691090568819827, |
|
"grad_norm": 1.5, |
|
"learning_rate": 4.7317706553680035e-07, |
|
"loss": 1.5254, |
|
"step": 15325 |
|
}, |
|
{ |
|
"epoch": 0.8705268530596042, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 4.68051664047017e-07, |
|
"loss": 1.5425, |
|
"step": 15350 |
|
}, |
|
{ |
|
"epoch": 0.8719446492372257, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 4.6292626255723367e-07, |
|
"loss": 1.5319, |
|
"step": 15375 |
|
}, |
|
{ |
|
"epoch": 0.8733624454148472, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 4.5780086106745033e-07, |
|
"loss": 1.531, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 0.8747802415924687, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 4.5267545957766693e-07, |
|
"loss": 1.5329, |
|
"step": 15425 |
|
}, |
|
{ |
|
"epoch": 0.8761980377700902, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 4.475500580878836e-07, |
|
"loss": 1.5389, |
|
"step": 15450 |
|
}, |
|
{ |
|
"epoch": 0.8776158339477117, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 4.4242465659810014e-07, |
|
"loss": 1.5516, |
|
"step": 15475 |
|
}, |
|
{ |
|
"epoch": 0.8790336301253332, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 4.372992551083168e-07, |
|
"loss": 1.5463, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.8804514263029547, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 4.3217385361853346e-07, |
|
"loss": 1.5105, |
|
"step": 15525 |
|
}, |
|
{ |
|
"epoch": 0.8818692224805762, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 4.270484521287501e-07, |
|
"loss": 1.5498, |
|
"step": 15550 |
|
}, |
|
{ |
|
"epoch": 0.8832870186581977, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 4.219230506389667e-07, |
|
"loss": 1.5456, |
|
"step": 15575 |
|
}, |
|
{ |
|
"epoch": 0.8847048148358192, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 4.167976491491834e-07, |
|
"loss": 1.5473, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 0.8861226110134407, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 4.1167224765940004e-07, |
|
"loss": 1.549, |
|
"step": 15625 |
|
}, |
|
{ |
|
"epoch": 0.8875404071910622, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 4.065468461696166e-07, |
|
"loss": 1.5346, |
|
"step": 15650 |
|
}, |
|
{ |
|
"epoch": 0.8889582033686837, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 4.0142144467983325e-07, |
|
"loss": 1.5345, |
|
"step": 15675 |
|
}, |
|
{ |
|
"epoch": 0.8903759995463052, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 3.962960431900499e-07, |
|
"loss": 1.5251, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 0.8917937957239267, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 3.911706417002665e-07, |
|
"loss": 1.5184, |
|
"step": 15725 |
|
}, |
|
{ |
|
"epoch": 0.8932115919015482, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 3.8604524021048317e-07, |
|
"loss": 1.5245, |
|
"step": 15750 |
|
}, |
|
{ |
|
"epoch": 0.8946293880791697, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 3.8091983872069983e-07, |
|
"loss": 1.5386, |
|
"step": 15775 |
|
}, |
|
{ |
|
"epoch": 0.8960471842567912, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 3.757944372309165e-07, |
|
"loss": 1.5282, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 0.8974649804344127, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 3.706690357411331e-07, |
|
"loss": 1.5082, |
|
"step": 15825 |
|
}, |
|
{ |
|
"epoch": 0.8988827766120343, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 3.655436342513497e-07, |
|
"loss": 1.5486, |
|
"step": 15850 |
|
}, |
|
{ |
|
"epoch": 0.9003005727896558, |
|
"grad_norm": 1.5, |
|
"learning_rate": 3.604182327615663e-07, |
|
"loss": 1.5204, |
|
"step": 15875 |
|
}, |
|
{ |
|
"epoch": 0.9017183689672773, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 3.5529283127178296e-07, |
|
"loss": 1.5422, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 0.9031361651448988, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 3.501674297819996e-07, |
|
"loss": 1.5431, |
|
"step": 15925 |
|
}, |
|
{ |
|
"epoch": 0.9045539613225203, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 3.450420282922162e-07, |
|
"loss": 1.537, |
|
"step": 15950 |
|
}, |
|
{ |
|
"epoch": 0.9059717575001418, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 3.399166268024329e-07, |
|
"loss": 1.5505, |
|
"step": 15975 |
|
}, |
|
{ |
|
"epoch": 0.9073895536777633, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 3.347912253126495e-07, |
|
"loss": 1.5466, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.9088073498553848, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 3.296658238228661e-07, |
|
"loss": 1.5434, |
|
"step": 16025 |
|
}, |
|
{ |
|
"epoch": 0.9102251460330063, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 3.2454042233308275e-07, |
|
"loss": 1.5351, |
|
"step": 16050 |
|
}, |
|
{ |
|
"epoch": 0.9116429422106278, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 3.194150208432994e-07, |
|
"loss": 1.5692, |
|
"step": 16075 |
|
}, |
|
{ |
|
"epoch": 0.9130607383882493, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 3.1428961935351607e-07, |
|
"loss": 1.5178, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 0.9144785345658708, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 3.0916421786373267e-07, |
|
"loss": 1.5542, |
|
"step": 16125 |
|
}, |
|
{ |
|
"epoch": 0.9158963307434923, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 3.040388163739493e-07, |
|
"loss": 1.5211, |
|
"step": 16150 |
|
}, |
|
{ |
|
"epoch": 0.9173141269211138, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 2.9891341488416594e-07, |
|
"loss": 1.538, |
|
"step": 16175 |
|
}, |
|
{ |
|
"epoch": 0.9187319230987353, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 2.9378801339438254e-07, |
|
"loss": 1.5463, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 0.9201497192763568, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 2.886626119045992e-07, |
|
"loss": 1.5322, |
|
"step": 16225 |
|
}, |
|
{ |
|
"epoch": 0.9215675154539783, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 2.8353721041481586e-07, |
|
"loss": 1.554, |
|
"step": 16250 |
|
}, |
|
{ |
|
"epoch": 0.9229853116315998, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 2.7841180892503246e-07, |
|
"loss": 1.5234, |
|
"step": 16275 |
|
}, |
|
{ |
|
"epoch": 0.9244031078092213, |
|
"grad_norm": 1.5, |
|
"learning_rate": 2.7328640743524907e-07, |
|
"loss": 1.5341, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 0.9258209039868428, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 2.681610059454657e-07, |
|
"loss": 1.5332, |
|
"step": 16325 |
|
}, |
|
{ |
|
"epoch": 0.9272387001644643, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 2.630356044556824e-07, |
|
"loss": 1.548, |
|
"step": 16350 |
|
}, |
|
{ |
|
"epoch": 0.9286564963420859, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 2.57910202965899e-07, |
|
"loss": 1.5414, |
|
"step": 16375 |
|
}, |
|
{ |
|
"epoch": 0.9300742925197074, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 2.5278480147611565e-07, |
|
"loss": 1.5336, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 0.9314920886973289, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 2.4765939998633225e-07, |
|
"loss": 1.566, |
|
"step": 16425 |
|
}, |
|
{ |
|
"epoch": 0.9329098848749504, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 2.425339984965489e-07, |
|
"loss": 1.5279, |
|
"step": 16450 |
|
}, |
|
{ |
|
"epoch": 0.9343276810525719, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 2.3740859700676552e-07, |
|
"loss": 1.5338, |
|
"step": 16475 |
|
}, |
|
{ |
|
"epoch": 0.9357454772301934, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 2.3228319551698217e-07, |
|
"loss": 1.5625, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.9371632734078149, |
|
"grad_norm": 1.5, |
|
"learning_rate": 2.271577940271988e-07, |
|
"loss": 1.5272, |
|
"step": 16525 |
|
}, |
|
{ |
|
"epoch": 0.9385810695854364, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 2.220323925374154e-07, |
|
"loss": 1.5431, |
|
"step": 16550 |
|
}, |
|
{ |
|
"epoch": 0.9399988657630579, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 2.1690699104763207e-07, |
|
"loss": 1.5529, |
|
"step": 16575 |
|
}, |
|
{ |
|
"epoch": 0.9414166619406794, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 2.117815895578487e-07, |
|
"loss": 1.5358, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 0.9428344581183009, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 2.0665618806806536e-07, |
|
"loss": 1.5334, |
|
"step": 16625 |
|
}, |
|
{ |
|
"epoch": 0.9442522542959224, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 2.0153078657828196e-07, |
|
"loss": 1.5475, |
|
"step": 16650 |
|
}, |
|
{ |
|
"epoch": 0.9456700504735439, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 1.964053850884986e-07, |
|
"loss": 1.5342, |
|
"step": 16675 |
|
}, |
|
{ |
|
"epoch": 0.9470878466511654, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.9127998359871525e-07, |
|
"loss": 1.5595, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 0.9485056428287869, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 1.8615458210893189e-07, |
|
"loss": 1.5124, |
|
"step": 16725 |
|
}, |
|
{ |
|
"epoch": 0.9499234390064084, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.810291806191485e-07, |
|
"loss": 1.5265, |
|
"step": 16750 |
|
}, |
|
{ |
|
"epoch": 0.9513412351840299, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.7590377912936515e-07, |
|
"loss": 1.5528, |
|
"step": 16775 |
|
}, |
|
{ |
|
"epoch": 0.9527590313616514, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 1.7077837763958178e-07, |
|
"loss": 1.5389, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 0.9541768275392729, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 1.656529761497984e-07, |
|
"loss": 1.5352, |
|
"step": 16825 |
|
}, |
|
{ |
|
"epoch": 0.9555946237168944, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.6052757466001504e-07, |
|
"loss": 1.544, |
|
"step": 16850 |
|
}, |
|
{ |
|
"epoch": 0.957012419894516, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.5540217317023168e-07, |
|
"loss": 1.521, |
|
"step": 16875 |
|
}, |
|
{ |
|
"epoch": 0.9584302160721375, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 1.502767716804483e-07, |
|
"loss": 1.5307, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 0.959848012249759, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 1.4515137019066494e-07, |
|
"loss": 1.5331, |
|
"step": 16925 |
|
}, |
|
{ |
|
"epoch": 0.9612658084273805, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.4002596870088157e-07, |
|
"loss": 1.5488, |
|
"step": 16950 |
|
}, |
|
{ |
|
"epoch": 0.962683604605002, |
|
"grad_norm": 1.5, |
|
"learning_rate": 1.349005672110982e-07, |
|
"loss": 1.5285, |
|
"step": 16975 |
|
}, |
|
{ |
|
"epoch": 0.9641014007826235, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.2977516572131486e-07, |
|
"loss": 1.5377, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.965519196960245, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.2464976423153147e-07, |
|
"loss": 1.5591, |
|
"step": 17025 |
|
}, |
|
{ |
|
"epoch": 0.9669369931378665, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.195243627417481e-07, |
|
"loss": 1.5427, |
|
"step": 17050 |
|
}, |
|
{ |
|
"epoch": 0.968354789315488, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 1.1439896125196474e-07, |
|
"loss": 1.5266, |
|
"step": 17075 |
|
}, |
|
{ |
|
"epoch": 0.9697725854931095, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.0927355976218137e-07, |
|
"loss": 1.535, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 0.971190381670731, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.0414815827239802e-07, |
|
"loss": 1.5384, |
|
"step": 17125 |
|
}, |
|
{ |
|
"epoch": 0.9726081778483525, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 9.902275678261464e-08, |
|
"loss": 1.5588, |
|
"step": 17150 |
|
}, |
|
{ |
|
"epoch": 0.974025974025974, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 9.389735529283128e-08, |
|
"loss": 1.5508, |
|
"step": 17175 |
|
}, |
|
{ |
|
"epoch": 0.9754437702035955, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 8.877195380304791e-08, |
|
"loss": 1.5335, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 0.976861566381217, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 8.364655231326454e-08, |
|
"loss": 1.5481, |
|
"step": 17225 |
|
}, |
|
{ |
|
"epoch": 0.9782793625588385, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 7.852115082348118e-08, |
|
"loss": 1.5226, |
|
"step": 17250 |
|
}, |
|
{ |
|
"epoch": 0.97969715873646, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 7.339574933369781e-08, |
|
"loss": 1.5397, |
|
"step": 17275 |
|
}, |
|
{ |
|
"epoch": 0.9811149549140815, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 6.827034784391444e-08, |
|
"loss": 1.5393, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 0.982532751091703, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 6.314494635413107e-08, |
|
"loss": 1.5498, |
|
"step": 17325 |
|
}, |
|
{ |
|
"epoch": 0.9839505472693245, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 5.801954486434771e-08, |
|
"loss": 1.5442, |
|
"step": 17350 |
|
}, |
|
{ |
|
"epoch": 0.985368343446946, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 5.289414337456434e-08, |
|
"loss": 1.5461, |
|
"step": 17375 |
|
}, |
|
{ |
|
"epoch": 0.9867861396245676, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 4.776874188478098e-08, |
|
"loss": 1.5237, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 0.9882039358021891, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 4.264334039499761e-08, |
|
"loss": 1.5218, |
|
"step": 17425 |
|
}, |
|
{ |
|
"epoch": 0.9896217319798106, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 3.751793890521424e-08, |
|
"loss": 1.5385, |
|
"step": 17450 |
|
}, |
|
{ |
|
"epoch": 0.9910395281574321, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 3.239253741543088e-08, |
|
"loss": 1.5342, |
|
"step": 17475 |
|
}, |
|
{ |
|
"epoch": 0.9924573243350536, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 2.726713592564751e-08, |
|
"loss": 1.5421, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.9938751205126751, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 2.2141734435864145e-08, |
|
"loss": 1.538, |
|
"step": 17525 |
|
}, |
|
{ |
|
"epoch": 0.9952929166902966, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.7016332946080776e-08, |
|
"loss": 1.5478, |
|
"step": 17550 |
|
}, |
|
{ |
|
"epoch": 0.9967107128679181, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.1890931456297411e-08, |
|
"loss": 1.5367, |
|
"step": 17575 |
|
}, |
|
{ |
|
"epoch": 0.9981285090455396, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 6.765529966514044e-09, |
|
"loss": 1.5193, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 0.9995463052231611, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.6401284767306775e-09, |
|
"loss": 1.5537, |
|
"step": 17625 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 17633, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 5000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.571044232598127e+18, |
|
"train_batch_size": 10, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|