BMC-smolvlm1-500M / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 17633,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014177961776215052,
"grad_norm": 9.8125,
"learning_rate": 2.5e-08,
"loss": 2.9276,
"step": 25
},
{
"epoch": 0.0028355923552430104,
"grad_norm": 9.8125,
"learning_rate": 5e-08,
"loss": 2.9468,
"step": 50
},
{
"epoch": 0.004253388532864515,
"grad_norm": 7.84375,
"learning_rate": 7.500000000000001e-08,
"loss": 2.9228,
"step": 75
},
{
"epoch": 0.005671184710486021,
"grad_norm": 7.0625,
"learning_rate": 1e-07,
"loss": 2.8258,
"step": 100
},
{
"epoch": 0.007088980888107526,
"grad_norm": 5.5625,
"learning_rate": 1.25e-07,
"loss": 2.7263,
"step": 125
},
{
"epoch": 0.00850677706572903,
"grad_norm": 4.875,
"learning_rate": 1.5000000000000002e-07,
"loss": 2.6245,
"step": 150
},
{
"epoch": 0.009924573243350537,
"grad_norm": 4.125,
"learning_rate": 1.7500000000000002e-07,
"loss": 2.4902,
"step": 175
},
{
"epoch": 0.011342369420972042,
"grad_norm": 3.21875,
"learning_rate": 2e-07,
"loss": 2.349,
"step": 200
},
{
"epoch": 0.012760165598593547,
"grad_norm": 3.15625,
"learning_rate": 2.25e-07,
"loss": 2.3038,
"step": 225
},
{
"epoch": 0.014177961776215052,
"grad_norm": 3.0,
"learning_rate": 2.5e-07,
"loss": 2.2632,
"step": 250
},
{
"epoch": 0.015595757953836557,
"grad_norm": 2.859375,
"learning_rate": 2.7499999999999996e-07,
"loss": 2.2339,
"step": 275
},
{
"epoch": 0.01701355413145806,
"grad_norm": 2.78125,
"learning_rate": 3.0000000000000004e-07,
"loss": 2.2106,
"step": 300
},
{
"epoch": 0.018431350309079567,
"grad_norm": 2.578125,
"learning_rate": 3.25e-07,
"loss": 2.178,
"step": 325
},
{
"epoch": 0.019849146486701073,
"grad_norm": 2.6875,
"learning_rate": 3.5000000000000004e-07,
"loss": 2.1623,
"step": 350
},
{
"epoch": 0.021266942664322577,
"grad_norm": 2.25,
"learning_rate": 3.75e-07,
"loss": 2.1621,
"step": 375
},
{
"epoch": 0.022684738841944083,
"grad_norm": 2.640625,
"learning_rate": 4e-07,
"loss": 2.1586,
"step": 400
},
{
"epoch": 0.024102535019565587,
"grad_norm": 2.828125,
"learning_rate": 4.25e-07,
"loss": 2.1498,
"step": 425
},
{
"epoch": 0.025520331197187093,
"grad_norm": 2.65625,
"learning_rate": 4.5e-07,
"loss": 2.1515,
"step": 450
},
{
"epoch": 0.026938127374808597,
"grad_norm": 3.125,
"learning_rate": 4.75e-07,
"loss": 2.1216,
"step": 475
},
{
"epoch": 0.028355923552430103,
"grad_norm": 2.546875,
"learning_rate": 5e-07,
"loss": 2.0938,
"step": 500
},
{
"epoch": 0.029773719730051607,
"grad_norm": 2.59375,
"learning_rate": 5.25e-07,
"loss": 2.0788,
"step": 525
},
{
"epoch": 0.031191515907673113,
"grad_norm": 2.625,
"learning_rate": 5.499999999999999e-07,
"loss": 2.0899,
"step": 550
},
{
"epoch": 0.03260931208529462,
"grad_norm": 2.703125,
"learning_rate": 5.750000000000001e-07,
"loss": 2.0949,
"step": 575
},
{
"epoch": 0.03402710826291612,
"grad_norm": 2.921875,
"learning_rate": 6.000000000000001e-07,
"loss": 2.0772,
"step": 600
},
{
"epoch": 0.035444904440537627,
"grad_norm": 3.0,
"learning_rate": 6.25e-07,
"loss": 2.0612,
"step": 625
},
{
"epoch": 0.03686270061815913,
"grad_norm": 2.328125,
"learning_rate": 6.5e-07,
"loss": 2.0366,
"step": 650
},
{
"epoch": 0.03828049679578064,
"grad_norm": 2.765625,
"learning_rate": 6.75e-07,
"loss": 2.0379,
"step": 675
},
{
"epoch": 0.03969829297340215,
"grad_norm": 2.71875,
"learning_rate": 7.000000000000001e-07,
"loss": 2.0458,
"step": 700
},
{
"epoch": 0.041116089151023646,
"grad_norm": 2.984375,
"learning_rate": 7.25e-07,
"loss": 2.0248,
"step": 725
},
{
"epoch": 0.04253388532864515,
"grad_norm": 3.109375,
"learning_rate": 7.5e-07,
"loss": 2.0222,
"step": 750
},
{
"epoch": 0.04395168150626666,
"grad_norm": 2.875,
"learning_rate": 7.750000000000001e-07,
"loss": 2.0129,
"step": 775
},
{
"epoch": 0.04536947768388817,
"grad_norm": 3.015625,
"learning_rate": 8e-07,
"loss": 2.0452,
"step": 800
},
{
"epoch": 0.046787273861509666,
"grad_norm": 3.0625,
"learning_rate": 8.25e-07,
"loss": 2.0129,
"step": 825
},
{
"epoch": 0.04820507003913117,
"grad_norm": 3.1875,
"learning_rate": 8.5e-07,
"loss": 2.0083,
"step": 850
},
{
"epoch": 0.04962286621675268,
"grad_norm": 3.265625,
"learning_rate": 8.750000000000001e-07,
"loss": 2.0002,
"step": 875
},
{
"epoch": 0.05104066239437419,
"grad_norm": 3.375,
"learning_rate": 9e-07,
"loss": 1.9946,
"step": 900
},
{
"epoch": 0.05245845857199569,
"grad_norm": 4.1875,
"learning_rate": 9.25e-07,
"loss": 1.9826,
"step": 925
},
{
"epoch": 0.05387625474961719,
"grad_norm": 3.546875,
"learning_rate": 9.5e-07,
"loss": 1.9619,
"step": 950
},
{
"epoch": 0.0552940509272387,
"grad_norm": 4.5625,
"learning_rate": 9.75e-07,
"loss": 1.959,
"step": 975
},
{
"epoch": 0.05671184710486021,
"grad_norm": 4.5,
"learning_rate": 1e-06,
"loss": 1.9438,
"step": 1000
},
{
"epoch": 0.05812964328248171,
"grad_norm": 4.65625,
"learning_rate": 1.0250000000000001e-06,
"loss": 1.9322,
"step": 1025
},
{
"epoch": 0.05954743946010321,
"grad_norm": 5.125,
"learning_rate": 1.05e-06,
"loss": 1.9012,
"step": 1050
},
{
"epoch": 0.06096523563772472,
"grad_norm": 5.125,
"learning_rate": 1.075e-06,
"loss": 1.8728,
"step": 1075
},
{
"epoch": 0.06238303181534623,
"grad_norm": 5.21875,
"learning_rate": 1.0999999999999998e-06,
"loss": 1.8705,
"step": 1100
},
{
"epoch": 0.06380082799296773,
"grad_norm": 4.53125,
"learning_rate": 1.125e-06,
"loss": 1.8529,
"step": 1125
},
{
"epoch": 0.06521862417058924,
"grad_norm": 5.21875,
"learning_rate": 1.1500000000000002e-06,
"loss": 1.855,
"step": 1150
},
{
"epoch": 0.06663642034821074,
"grad_norm": 5.1875,
"learning_rate": 1.175e-06,
"loss": 1.8516,
"step": 1175
},
{
"epoch": 0.06805421652583224,
"grad_norm": 5.375,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.8211,
"step": 1200
},
{
"epoch": 0.06947201270345375,
"grad_norm": 5.875,
"learning_rate": 1.225e-06,
"loss": 1.8235,
"step": 1225
},
{
"epoch": 0.07088980888107525,
"grad_norm": 5.5,
"learning_rate": 1.25e-06,
"loss": 1.8149,
"step": 1250
},
{
"epoch": 0.07230760505869677,
"grad_norm": 4.84375,
"learning_rate": 1.275e-06,
"loss": 1.8164,
"step": 1275
},
{
"epoch": 0.07372540123631827,
"grad_norm": 4.46875,
"learning_rate": 1.3e-06,
"loss": 1.7967,
"step": 1300
},
{
"epoch": 0.07514319741393977,
"grad_norm": 4.5625,
"learning_rate": 1.325e-06,
"loss": 1.819,
"step": 1325
},
{
"epoch": 0.07656099359156128,
"grad_norm": 4.78125,
"learning_rate": 1.35e-06,
"loss": 1.7788,
"step": 1350
},
{
"epoch": 0.07797878976918278,
"grad_norm": 4.5,
"learning_rate": 1.375e-06,
"loss": 1.7854,
"step": 1375
},
{
"epoch": 0.0793965859468043,
"grad_norm": 3.9375,
"learning_rate": 1.4000000000000001e-06,
"loss": 1.7644,
"step": 1400
},
{
"epoch": 0.0808143821244258,
"grad_norm": 3.453125,
"learning_rate": 1.425e-06,
"loss": 1.7811,
"step": 1425
},
{
"epoch": 0.08223217830204729,
"grad_norm": 3.328125,
"learning_rate": 1.45e-06,
"loss": 1.7685,
"step": 1450
},
{
"epoch": 0.0836499744796688,
"grad_norm": 3.40625,
"learning_rate": 1.4749999999999999e-06,
"loss": 1.7554,
"step": 1475
},
{
"epoch": 0.0850677706572903,
"grad_norm": 2.703125,
"learning_rate": 1.5e-06,
"loss": 1.7636,
"step": 1500
},
{
"epoch": 0.0864855668349118,
"grad_norm": 2.859375,
"learning_rate": 1.525e-06,
"loss": 1.7472,
"step": 1525
},
{
"epoch": 0.08790336301253332,
"grad_norm": 2.75,
"learning_rate": 1.5500000000000002e-06,
"loss": 1.7534,
"step": 1550
},
{
"epoch": 0.08932115919015482,
"grad_norm": 2.703125,
"learning_rate": 1.5750000000000002e-06,
"loss": 1.7226,
"step": 1575
},
{
"epoch": 0.09073895536777633,
"grad_norm": 2.078125,
"learning_rate": 1.6e-06,
"loss": 1.7416,
"step": 1600
},
{
"epoch": 0.09215675154539783,
"grad_norm": 1.953125,
"learning_rate": 1.625e-06,
"loss": 1.7371,
"step": 1625
},
{
"epoch": 0.09357454772301933,
"grad_norm": 1.8984375,
"learning_rate": 1.65e-06,
"loss": 1.7358,
"step": 1650
},
{
"epoch": 0.09499234390064085,
"grad_norm": 1.953125,
"learning_rate": 1.675e-06,
"loss": 1.7299,
"step": 1675
},
{
"epoch": 0.09641014007826235,
"grad_norm": 1.953125,
"learning_rate": 1.7e-06,
"loss": 1.7308,
"step": 1700
},
{
"epoch": 0.09782793625588386,
"grad_norm": 1.8046875,
"learning_rate": 1.725e-06,
"loss": 1.7035,
"step": 1725
},
{
"epoch": 0.09924573243350536,
"grad_norm": 1.8828125,
"learning_rate": 1.7500000000000002e-06,
"loss": 1.7224,
"step": 1750
},
{
"epoch": 0.10066352861112686,
"grad_norm": 1.7890625,
"learning_rate": 1.7750000000000002e-06,
"loss": 1.7148,
"step": 1775
},
{
"epoch": 0.10208132478874837,
"grad_norm": 1.875,
"learning_rate": 1.8e-06,
"loss": 1.7211,
"step": 1800
},
{
"epoch": 0.10349912096636987,
"grad_norm": 1.703125,
"learning_rate": 1.8249999999999999e-06,
"loss": 1.7162,
"step": 1825
},
{
"epoch": 0.10491691714399139,
"grad_norm": 1.734375,
"learning_rate": 1.85e-06,
"loss": 1.7024,
"step": 1850
},
{
"epoch": 0.10633471332161289,
"grad_norm": 1.5859375,
"learning_rate": 1.875e-06,
"loss": 1.707,
"step": 1875
},
{
"epoch": 0.10775250949923439,
"grad_norm": 1.5703125,
"learning_rate": 1.9e-06,
"loss": 1.7134,
"step": 1900
},
{
"epoch": 0.1091703056768559,
"grad_norm": 1.734375,
"learning_rate": 1.925e-06,
"loss": 1.6868,
"step": 1925
},
{
"epoch": 0.1105881018544774,
"grad_norm": 1.671875,
"learning_rate": 1.95e-06,
"loss": 1.7017,
"step": 1950
},
{
"epoch": 0.1120058980320989,
"grad_norm": 1.4765625,
"learning_rate": 1.975e-06,
"loss": 1.6898,
"step": 1975
},
{
"epoch": 0.11342369420972041,
"grad_norm": 1.6484375,
"learning_rate": 2e-06,
"loss": 1.7212,
"step": 2000
},
{
"epoch": 0.11484149038734191,
"grad_norm": 1.59375,
"learning_rate": 2.025e-06,
"loss": 1.6863,
"step": 2025
},
{
"epoch": 0.11625928656496343,
"grad_norm": 1.6484375,
"learning_rate": 2.0500000000000003e-06,
"loss": 1.6828,
"step": 2050
},
{
"epoch": 0.11767708274258493,
"grad_norm": 1.5234375,
"learning_rate": 2.075e-06,
"loss": 1.6918,
"step": 2075
},
{
"epoch": 0.11909487892020643,
"grad_norm": 1.53125,
"learning_rate": 2.1e-06,
"loss": 1.6978,
"step": 2100
},
{
"epoch": 0.12051267509782794,
"grad_norm": 1.5859375,
"learning_rate": 2.125e-06,
"loss": 1.6995,
"step": 2125
},
{
"epoch": 0.12193047127544944,
"grad_norm": 1.6875,
"learning_rate": 2.15e-06,
"loss": 1.681,
"step": 2150
},
{
"epoch": 0.12334826745307095,
"grad_norm": 1.625,
"learning_rate": 2.175e-06,
"loss": 1.6893,
"step": 2175
},
{
"epoch": 0.12476606363069245,
"grad_norm": 1.5546875,
"learning_rate": 2.1999999999999997e-06,
"loss": 1.6847,
"step": 2200
},
{
"epoch": 0.12618385980831395,
"grad_norm": 1.7109375,
"learning_rate": 2.2250000000000003e-06,
"loss": 1.6997,
"step": 2225
},
{
"epoch": 0.12760165598593545,
"grad_norm": 1.6484375,
"learning_rate": 2.25e-06,
"loss": 1.6706,
"step": 2250
},
{
"epoch": 0.12901945216355698,
"grad_norm": 1.6484375,
"learning_rate": 2.275e-06,
"loss": 1.68,
"step": 2275
},
{
"epoch": 0.13043724834117848,
"grad_norm": 1.5859375,
"learning_rate": 2.3000000000000004e-06,
"loss": 1.6774,
"step": 2300
},
{
"epoch": 0.13185504451879998,
"grad_norm": 1.46875,
"learning_rate": 2.325e-06,
"loss": 1.6954,
"step": 2325
},
{
"epoch": 0.13327284069642148,
"grad_norm": 1.5625,
"learning_rate": 2.35e-06,
"loss": 1.6797,
"step": 2350
},
{
"epoch": 0.13469063687404298,
"grad_norm": 1.8125,
"learning_rate": 2.375e-06,
"loss": 1.6646,
"step": 2375
},
{
"epoch": 0.13610843305166448,
"grad_norm": 1.5078125,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.6769,
"step": 2400
},
{
"epoch": 0.137526229229286,
"grad_norm": 1.6328125,
"learning_rate": 2.425e-06,
"loss": 1.6546,
"step": 2425
},
{
"epoch": 0.1389440254069075,
"grad_norm": 1.4921875,
"learning_rate": 2.45e-06,
"loss": 1.6662,
"step": 2450
},
{
"epoch": 0.140361821584529,
"grad_norm": 1.390625,
"learning_rate": 2.475e-06,
"loss": 1.6757,
"step": 2475
},
{
"epoch": 0.1417796177621505,
"grad_norm": 1.4921875,
"learning_rate": 2.5e-06,
"loss": 1.6692,
"step": 2500
},
{
"epoch": 0.143197413939772,
"grad_norm": 1.640625,
"learning_rate": 2.525e-06,
"loss": 1.6591,
"step": 2525
},
{
"epoch": 0.14461521011739353,
"grad_norm": 1.6640625,
"learning_rate": 2.55e-06,
"loss": 1.6577,
"step": 2550
},
{
"epoch": 0.14603300629501503,
"grad_norm": 1.6484375,
"learning_rate": 2.575e-06,
"loss": 1.6837,
"step": 2575
},
{
"epoch": 0.14745080247263653,
"grad_norm": 1.4453125,
"learning_rate": 2.6e-06,
"loss": 1.6607,
"step": 2600
},
{
"epoch": 0.14886859865025803,
"grad_norm": 1.5390625,
"learning_rate": 2.6250000000000003e-06,
"loss": 1.6535,
"step": 2625
},
{
"epoch": 0.15028639482787953,
"grad_norm": 1.484375,
"learning_rate": 2.65e-06,
"loss": 1.6696,
"step": 2650
},
{
"epoch": 0.15170419100550106,
"grad_norm": 1.65625,
"learning_rate": 2.6750000000000002e-06,
"loss": 1.6548,
"step": 2675
},
{
"epoch": 0.15312198718312256,
"grad_norm": 1.5078125,
"learning_rate": 2.7e-06,
"loss": 1.6772,
"step": 2700
},
{
"epoch": 0.15453978336074406,
"grad_norm": 1.5078125,
"learning_rate": 2.725e-06,
"loss": 1.6483,
"step": 2725
},
{
"epoch": 0.15595757953836556,
"grad_norm": 1.4453125,
"learning_rate": 2.75e-06,
"loss": 1.6666,
"step": 2750
},
{
"epoch": 0.15737537571598706,
"grad_norm": 1.5546875,
"learning_rate": 2.775e-06,
"loss": 1.6435,
"step": 2775
},
{
"epoch": 0.1587931718936086,
"grad_norm": 1.40625,
"learning_rate": 2.8000000000000003e-06,
"loss": 1.6426,
"step": 2800
},
{
"epoch": 0.1602109680712301,
"grad_norm": 1.6640625,
"learning_rate": 2.825e-06,
"loss": 1.6559,
"step": 2825
},
{
"epoch": 0.1616287642488516,
"grad_norm": 1.4453125,
"learning_rate": 2.85e-06,
"loss": 1.6515,
"step": 2850
},
{
"epoch": 0.16304656042647309,
"grad_norm": 1.703125,
"learning_rate": 2.875e-06,
"loss": 1.6438,
"step": 2875
},
{
"epoch": 0.16446435660409459,
"grad_norm": 1.71875,
"learning_rate": 2.9e-06,
"loss": 1.6411,
"step": 2900
},
{
"epoch": 0.1658821527817161,
"grad_norm": 1.5234375,
"learning_rate": 2.925e-06,
"loss": 1.6579,
"step": 2925
},
{
"epoch": 0.1672999489593376,
"grad_norm": 1.59375,
"learning_rate": 2.9499999999999997e-06,
"loss": 1.6521,
"step": 2950
},
{
"epoch": 0.1687177451369591,
"grad_norm": 1.7109375,
"learning_rate": 2.9750000000000003e-06,
"loss": 1.6422,
"step": 2975
},
{
"epoch": 0.1701355413145806,
"grad_norm": 1.5546875,
"learning_rate": 3e-06,
"loss": 1.6389,
"step": 3000
},
{
"epoch": 0.1715533374922021,
"grad_norm": 1.5625,
"learning_rate": 2.9948745985102165e-06,
"loss": 1.6347,
"step": 3025
},
{
"epoch": 0.1729711336698236,
"grad_norm": 1.625,
"learning_rate": 2.9897491970204334e-06,
"loss": 1.655,
"step": 3050
},
{
"epoch": 0.17438892984744514,
"grad_norm": 1.3828125,
"learning_rate": 2.98462379553065e-06,
"loss": 1.6428,
"step": 3075
},
{
"epoch": 0.17580672602506664,
"grad_norm": 1.5390625,
"learning_rate": 2.9794983940408667e-06,
"loss": 1.6572,
"step": 3100
},
{
"epoch": 0.17722452220268814,
"grad_norm": 1.5859375,
"learning_rate": 2.974372992551083e-06,
"loss": 1.6407,
"step": 3125
},
{
"epoch": 0.17864231838030964,
"grad_norm": 1.4765625,
"learning_rate": 2.9692475910613e-06,
"loss": 1.6383,
"step": 3150
},
{
"epoch": 0.18006011455793114,
"grad_norm": 1.5703125,
"learning_rate": 2.9641221895715165e-06,
"loss": 1.6426,
"step": 3175
},
{
"epoch": 0.18147791073555267,
"grad_norm": 1.6953125,
"learning_rate": 2.9589967880817333e-06,
"loss": 1.6349,
"step": 3200
},
{
"epoch": 0.18289570691317417,
"grad_norm": 1.671875,
"learning_rate": 2.9538713865919498e-06,
"loss": 1.6491,
"step": 3225
},
{
"epoch": 0.18431350309079567,
"grad_norm": 1.59375,
"learning_rate": 2.9487459851021667e-06,
"loss": 1.6329,
"step": 3250
},
{
"epoch": 0.18573129926841717,
"grad_norm": 2.75,
"learning_rate": 2.943620583612383e-06,
"loss": 1.6319,
"step": 3275
},
{
"epoch": 0.18714909544603867,
"grad_norm": 2.71875,
"learning_rate": 2.9384951821226e-06,
"loss": 1.6178,
"step": 3300
},
{
"epoch": 0.1885668916236602,
"grad_norm": 1.4609375,
"learning_rate": 2.9333697806328164e-06,
"loss": 1.5879,
"step": 3325
},
{
"epoch": 0.1899846878012817,
"grad_norm": 1.453125,
"learning_rate": 2.928244379143033e-06,
"loss": 1.6113,
"step": 3350
},
{
"epoch": 0.1914024839789032,
"grad_norm": 1.421875,
"learning_rate": 2.9231189776532493e-06,
"loss": 1.5957,
"step": 3375
},
{
"epoch": 0.1928202801565247,
"grad_norm": 1.6484375,
"learning_rate": 2.917993576163466e-06,
"loss": 1.5978,
"step": 3400
},
{
"epoch": 0.1942380763341462,
"grad_norm": 1.3671875,
"learning_rate": 2.9128681746736826e-06,
"loss": 1.5955,
"step": 3425
},
{
"epoch": 0.19565587251176772,
"grad_norm": 1.3984375,
"learning_rate": 2.9077427731838995e-06,
"loss": 1.61,
"step": 3450
},
{
"epoch": 0.19707366868938922,
"grad_norm": 1.8828125,
"learning_rate": 2.902617371694116e-06,
"loss": 1.5998,
"step": 3475
},
{
"epoch": 0.19849146486701072,
"grad_norm": 1.546875,
"learning_rate": 2.897491970204333e-06,
"loss": 1.6176,
"step": 3500
},
{
"epoch": 0.19990926104463222,
"grad_norm": 1.5625,
"learning_rate": 2.8923665687145493e-06,
"loss": 1.6058,
"step": 3525
},
{
"epoch": 0.20132705722225372,
"grad_norm": 1.546875,
"learning_rate": 2.887241167224766e-06,
"loss": 1.5966,
"step": 3550
},
{
"epoch": 0.20274485339987525,
"grad_norm": 1.4921875,
"learning_rate": 2.8821157657349826e-06,
"loss": 1.5852,
"step": 3575
},
{
"epoch": 0.20416264957749675,
"grad_norm": 1.4765625,
"learning_rate": 2.876990364245199e-06,
"loss": 1.5644,
"step": 3600
},
{
"epoch": 0.20558044575511825,
"grad_norm": 1.59375,
"learning_rate": 2.871864962755416e-06,
"loss": 1.589,
"step": 3625
},
{
"epoch": 0.20699824193273975,
"grad_norm": 1.46875,
"learning_rate": 2.8667395612656323e-06,
"loss": 1.5646,
"step": 3650
},
{
"epoch": 0.20841603811036125,
"grad_norm": 1.5703125,
"learning_rate": 2.861614159775849e-06,
"loss": 1.578,
"step": 3675
},
{
"epoch": 0.20983383428798277,
"grad_norm": 1.5078125,
"learning_rate": 2.8564887582860657e-06,
"loss": 1.5959,
"step": 3700
},
{
"epoch": 0.21125163046560427,
"grad_norm": 1.4921875,
"learning_rate": 2.8513633567962825e-06,
"loss": 1.5788,
"step": 3725
},
{
"epoch": 0.21266942664322577,
"grad_norm": 1.4921875,
"learning_rate": 2.846237955306499e-06,
"loss": 1.5886,
"step": 3750
},
{
"epoch": 0.21408722282084727,
"grad_norm": 1.8203125,
"learning_rate": 2.841112553816716e-06,
"loss": 1.583,
"step": 3775
},
{
"epoch": 0.21550501899846877,
"grad_norm": 1.390625,
"learning_rate": 2.8359871523269323e-06,
"loss": 1.6008,
"step": 3800
},
{
"epoch": 0.21692281517609027,
"grad_norm": 1.8046875,
"learning_rate": 2.830861750837149e-06,
"loss": 1.6073,
"step": 3825
},
{
"epoch": 0.2183406113537118,
"grad_norm": 1.6171875,
"learning_rate": 2.8257363493473656e-06,
"loss": 1.5921,
"step": 3850
},
{
"epoch": 0.2197584075313333,
"grad_norm": 1.4609375,
"learning_rate": 2.8206109478575825e-06,
"loss": 1.5971,
"step": 3875
},
{
"epoch": 0.2211762037089548,
"grad_norm": 1.4375,
"learning_rate": 2.815485546367799e-06,
"loss": 1.5706,
"step": 3900
},
{
"epoch": 0.2225939998865763,
"grad_norm": 1.65625,
"learning_rate": 2.810360144878016e-06,
"loss": 1.5788,
"step": 3925
},
{
"epoch": 0.2240117960641978,
"grad_norm": 1.828125,
"learning_rate": 2.8052347433882322e-06,
"loss": 1.5963,
"step": 3950
},
{
"epoch": 0.22542959224181933,
"grad_norm": 1.3515625,
"learning_rate": 2.800109341898449e-06,
"loss": 1.5899,
"step": 3975
},
{
"epoch": 0.22684738841944083,
"grad_norm": 1.40625,
"learning_rate": 2.7949839404086656e-06,
"loss": 1.5878,
"step": 4000
},
{
"epoch": 0.22826518459706233,
"grad_norm": 1.375,
"learning_rate": 2.789858538918882e-06,
"loss": 1.58,
"step": 4025
},
{
"epoch": 0.22968298077468383,
"grad_norm": 1.453125,
"learning_rate": 2.7847331374290984e-06,
"loss": 1.5926,
"step": 4050
},
{
"epoch": 0.23110077695230533,
"grad_norm": 1.4921875,
"learning_rate": 2.7796077359393153e-06,
"loss": 1.6063,
"step": 4075
},
{
"epoch": 0.23251857312992685,
"grad_norm": 1.4296875,
"learning_rate": 2.7744823344495318e-06,
"loss": 1.5617,
"step": 4100
},
{
"epoch": 0.23393636930754835,
"grad_norm": 1.53125,
"learning_rate": 2.7693569329597486e-06,
"loss": 1.5941,
"step": 4125
},
{
"epoch": 0.23535416548516985,
"grad_norm": 1.4453125,
"learning_rate": 2.764231531469965e-06,
"loss": 1.5699,
"step": 4150
},
{
"epoch": 0.23677196166279135,
"grad_norm": 1.6484375,
"learning_rate": 2.759106129980182e-06,
"loss": 1.5856,
"step": 4175
},
{
"epoch": 0.23818975784041285,
"grad_norm": 1.375,
"learning_rate": 2.7539807284903984e-06,
"loss": 1.5984,
"step": 4200
},
{
"epoch": 0.23960755401803438,
"grad_norm": 1.4765625,
"learning_rate": 2.748855327000615e-06,
"loss": 1.5768,
"step": 4225
},
{
"epoch": 0.24102535019565588,
"grad_norm": 1.46875,
"learning_rate": 2.7437299255108317e-06,
"loss": 1.5845,
"step": 4250
},
{
"epoch": 0.24244314637327738,
"grad_norm": 1.4453125,
"learning_rate": 2.738604524021048e-06,
"loss": 1.5756,
"step": 4275
},
{
"epoch": 0.24386094255089888,
"grad_norm": 1.53125,
"learning_rate": 2.733479122531265e-06,
"loss": 1.5682,
"step": 4300
},
{
"epoch": 0.24527873872852038,
"grad_norm": 1.3984375,
"learning_rate": 2.7283537210414815e-06,
"loss": 1.5657,
"step": 4325
},
{
"epoch": 0.2466965349061419,
"grad_norm": 2.203125,
"learning_rate": 2.7232283195516983e-06,
"loss": 1.5778,
"step": 4350
},
{
"epoch": 0.2481143310837634,
"grad_norm": 1.6953125,
"learning_rate": 2.718102918061915e-06,
"loss": 1.5683,
"step": 4375
},
{
"epoch": 0.2495321272613849,
"grad_norm": 1.4375,
"learning_rate": 2.7129775165721317e-06,
"loss": 1.5821,
"step": 4400
},
{
"epoch": 0.2509499234390064,
"grad_norm": 1.546875,
"learning_rate": 2.707852115082348e-06,
"loss": 1.5784,
"step": 4425
},
{
"epoch": 0.2523677196166279,
"grad_norm": 1.75,
"learning_rate": 2.702726713592565e-06,
"loss": 1.5649,
"step": 4450
},
{
"epoch": 0.2537855157942494,
"grad_norm": 1.4921875,
"learning_rate": 2.6976013121027814e-06,
"loss": 1.596,
"step": 4475
},
{
"epoch": 0.2552033119718709,
"grad_norm": 1.453125,
"learning_rate": 2.6924759106129983e-06,
"loss": 1.5722,
"step": 4500
},
{
"epoch": 0.2566211081494924,
"grad_norm": 1.546875,
"learning_rate": 2.6873505091232147e-06,
"loss": 1.5901,
"step": 4525
},
{
"epoch": 0.25803890432711396,
"grad_norm": 1.4921875,
"learning_rate": 2.6822251076334316e-06,
"loss": 1.5751,
"step": 4550
},
{
"epoch": 0.25945670050473546,
"grad_norm": 1.4296875,
"learning_rate": 2.677099706143648e-06,
"loss": 1.5799,
"step": 4575
},
{
"epoch": 0.26087449668235696,
"grad_norm": 1.6640625,
"learning_rate": 2.671974304653865e-06,
"loss": 1.5533,
"step": 4600
},
{
"epoch": 0.26229229285997846,
"grad_norm": 1.53125,
"learning_rate": 2.6668489031640814e-06,
"loss": 1.5454,
"step": 4625
},
{
"epoch": 0.26371008903759996,
"grad_norm": 1.6875,
"learning_rate": 2.661723501674298e-06,
"loss": 1.5528,
"step": 4650
},
{
"epoch": 0.26512788521522146,
"grad_norm": 1.6171875,
"learning_rate": 2.6565981001845147e-06,
"loss": 1.5732,
"step": 4675
},
{
"epoch": 0.26654568139284296,
"grad_norm": 1.46875,
"learning_rate": 2.651472698694731e-06,
"loss": 1.5799,
"step": 4700
},
{
"epoch": 0.26796347757046446,
"grad_norm": 1.4375,
"learning_rate": 2.646347297204948e-06,
"loss": 1.5679,
"step": 4725
},
{
"epoch": 0.26938127374808596,
"grad_norm": 1.421875,
"learning_rate": 2.6412218957151645e-06,
"loss": 1.5455,
"step": 4750
},
{
"epoch": 0.27079906992570746,
"grad_norm": 1.484375,
"learning_rate": 2.636096494225381e-06,
"loss": 1.5666,
"step": 4775
},
{
"epoch": 0.27221686610332896,
"grad_norm": 1.421875,
"learning_rate": 2.6309710927355973e-06,
"loss": 1.5726,
"step": 4800
},
{
"epoch": 0.2736346622809505,
"grad_norm": 1.796875,
"learning_rate": 2.6258456912458142e-06,
"loss": 1.588,
"step": 4825
},
{
"epoch": 0.275052458458572,
"grad_norm": 1.390625,
"learning_rate": 2.6207202897560307e-06,
"loss": 1.5913,
"step": 4850
},
{
"epoch": 0.2764702546361935,
"grad_norm": 1.4921875,
"learning_rate": 2.6155948882662475e-06,
"loss": 1.5448,
"step": 4875
},
{
"epoch": 0.277888050813815,
"grad_norm": 1.4921875,
"learning_rate": 2.610469486776464e-06,
"loss": 1.546,
"step": 4900
},
{
"epoch": 0.2793058469914365,
"grad_norm": 1.6015625,
"learning_rate": 2.605344085286681e-06,
"loss": 1.5662,
"step": 4925
},
{
"epoch": 0.280723643169058,
"grad_norm": 1.4453125,
"learning_rate": 2.6002186837968973e-06,
"loss": 1.5662,
"step": 4950
},
{
"epoch": 0.2821414393466795,
"grad_norm": 1.7890625,
"learning_rate": 2.595093282307114e-06,
"loss": 1.5658,
"step": 4975
},
{
"epoch": 0.283559235524301,
"grad_norm": 1.5625,
"learning_rate": 2.5899678808173306e-06,
"loss": 1.5773,
"step": 5000
},
{
"epoch": 0.2849770317019225,
"grad_norm": 1.4375,
"learning_rate": 2.5848424793275475e-06,
"loss": 1.5609,
"step": 5025
},
{
"epoch": 0.286394827879544,
"grad_norm": 1.6015625,
"learning_rate": 2.579717077837764e-06,
"loss": 1.5894,
"step": 5050
},
{
"epoch": 0.28781262405716557,
"grad_norm": 1.515625,
"learning_rate": 2.574591676347981e-06,
"loss": 1.5775,
"step": 5075
},
{
"epoch": 0.28923042023478707,
"grad_norm": 1.625,
"learning_rate": 2.5694662748581972e-06,
"loss": 1.5585,
"step": 5100
},
{
"epoch": 0.29064821641240857,
"grad_norm": 1.5625,
"learning_rate": 2.564340873368414e-06,
"loss": 1.5626,
"step": 5125
},
{
"epoch": 0.29206601259003007,
"grad_norm": 1.484375,
"learning_rate": 2.5592154718786306e-06,
"loss": 1.556,
"step": 5150
},
{
"epoch": 0.29348380876765157,
"grad_norm": 1.359375,
"learning_rate": 2.5540900703888474e-06,
"loss": 1.5664,
"step": 5175
},
{
"epoch": 0.29490160494527307,
"grad_norm": 2.078125,
"learning_rate": 2.548964668899064e-06,
"loss": 1.5774,
"step": 5200
},
{
"epoch": 0.29631940112289457,
"grad_norm": 1.5859375,
"learning_rate": 2.5438392674092803e-06,
"loss": 1.5716,
"step": 5225
},
{
"epoch": 0.29773719730051607,
"grad_norm": 1.515625,
"learning_rate": 2.538713865919497e-06,
"loss": 1.5724,
"step": 5250
},
{
"epoch": 0.29915499347813757,
"grad_norm": 2.125,
"learning_rate": 2.5335884644297136e-06,
"loss": 1.5428,
"step": 5275
},
{
"epoch": 0.30057278965575907,
"grad_norm": 1.4765625,
"learning_rate": 2.5284630629399305e-06,
"loss": 1.5703,
"step": 5300
},
{
"epoch": 0.3019905858333806,
"grad_norm": 1.546875,
"learning_rate": 2.523337661450147e-06,
"loss": 1.547,
"step": 5325
},
{
"epoch": 0.3034083820110021,
"grad_norm": 2.109375,
"learning_rate": 2.518212259960364e-06,
"loss": 1.5526,
"step": 5350
},
{
"epoch": 0.3048261781886236,
"grad_norm": 1.6484375,
"learning_rate": 2.5130868584705803e-06,
"loss": 1.5507,
"step": 5375
},
{
"epoch": 0.3062439743662451,
"grad_norm": 1.59375,
"learning_rate": 2.507961456980797e-06,
"loss": 1.5654,
"step": 5400
},
{
"epoch": 0.3076617705438666,
"grad_norm": 1.5546875,
"learning_rate": 2.5028360554910136e-06,
"loss": 1.5907,
"step": 5425
},
{
"epoch": 0.3090795667214881,
"grad_norm": 1.515625,
"learning_rate": 2.49771065400123e-06,
"loss": 1.5457,
"step": 5450
},
{
"epoch": 0.3104973628991096,
"grad_norm": 1.6015625,
"learning_rate": 2.4925852525114465e-06,
"loss": 1.5435,
"step": 5475
},
{
"epoch": 0.3119151590767311,
"grad_norm": 1.4296875,
"learning_rate": 2.4874598510216633e-06,
"loss": 1.5799,
"step": 5500
},
{
"epoch": 0.3133329552543526,
"grad_norm": 1.625,
"learning_rate": 2.48233444953188e-06,
"loss": 1.5848,
"step": 5525
},
{
"epoch": 0.3147507514319741,
"grad_norm": 1.515625,
"learning_rate": 2.4772090480420967e-06,
"loss": 1.5561,
"step": 5550
},
{
"epoch": 0.3161685476095956,
"grad_norm": 1.90625,
"learning_rate": 2.472083646552313e-06,
"loss": 1.5756,
"step": 5575
},
{
"epoch": 0.3175863437872172,
"grad_norm": 1.453125,
"learning_rate": 2.46695824506253e-06,
"loss": 1.5723,
"step": 5600
},
{
"epoch": 0.3190041399648387,
"grad_norm": 1.4453125,
"learning_rate": 2.4618328435727464e-06,
"loss": 1.5503,
"step": 5625
},
{
"epoch": 0.3204219361424602,
"grad_norm": 1.421875,
"learning_rate": 2.4567074420829633e-06,
"loss": 1.5537,
"step": 5650
},
{
"epoch": 0.3218397323200817,
"grad_norm": 1.9453125,
"learning_rate": 2.4515820405931797e-06,
"loss": 1.5652,
"step": 5675
},
{
"epoch": 0.3232575284977032,
"grad_norm": 1.6796875,
"learning_rate": 2.4464566391033966e-06,
"loss": 1.5675,
"step": 5700
},
{
"epoch": 0.3246753246753247,
"grad_norm": 1.6171875,
"learning_rate": 2.441331237613613e-06,
"loss": 1.5658,
"step": 5725
},
{
"epoch": 0.32609312085294617,
"grad_norm": 1.5234375,
"learning_rate": 2.43620583612383e-06,
"loss": 1.5576,
"step": 5750
},
{
"epoch": 0.32751091703056767,
"grad_norm": 1.5,
"learning_rate": 2.4310804346340464e-06,
"loss": 1.5751,
"step": 5775
},
{
"epoch": 0.32892871320818917,
"grad_norm": 1.4921875,
"learning_rate": 2.4259550331442632e-06,
"loss": 1.5758,
"step": 5800
},
{
"epoch": 0.33034650938581067,
"grad_norm": 1.546875,
"learning_rate": 2.4208296316544797e-06,
"loss": 1.5617,
"step": 5825
},
{
"epoch": 0.3317643055634322,
"grad_norm": 1.59375,
"learning_rate": 2.415704230164696e-06,
"loss": 1.5638,
"step": 5850
},
{
"epoch": 0.3331821017410537,
"grad_norm": 1.7265625,
"learning_rate": 2.410578828674913e-06,
"loss": 1.5722,
"step": 5875
},
{
"epoch": 0.3345998979186752,
"grad_norm": 1.4609375,
"learning_rate": 2.4054534271851295e-06,
"loss": 1.5682,
"step": 5900
},
{
"epoch": 0.3360176940962967,
"grad_norm": 1.453125,
"learning_rate": 2.4003280256953463e-06,
"loss": 1.5611,
"step": 5925
},
{
"epoch": 0.3374354902739182,
"grad_norm": 1.9453125,
"learning_rate": 2.3952026242055628e-06,
"loss": 1.5741,
"step": 5950
},
{
"epoch": 0.3388532864515397,
"grad_norm": 1.4453125,
"learning_rate": 2.3900772227157796e-06,
"loss": 1.5527,
"step": 5975
},
{
"epoch": 0.3402710826291612,
"grad_norm": 1.453125,
"learning_rate": 2.384951821225996e-06,
"loss": 1.5703,
"step": 6000
},
{
"epoch": 0.3416888788067827,
"grad_norm": 1.8203125,
"learning_rate": 2.379826419736213e-06,
"loss": 1.5924,
"step": 6025
},
{
"epoch": 0.3431066749844042,
"grad_norm": 1.6171875,
"learning_rate": 2.3747010182464294e-06,
"loss": 1.565,
"step": 6050
},
{
"epoch": 0.3445244711620257,
"grad_norm": 1.640625,
"learning_rate": 2.3695756167566463e-06,
"loss": 1.5657,
"step": 6075
},
{
"epoch": 0.3459422673396472,
"grad_norm": 1.5390625,
"learning_rate": 2.3644502152668627e-06,
"loss": 1.5382,
"step": 6100
},
{
"epoch": 0.3473600635172688,
"grad_norm": 1.2734375,
"learning_rate": 2.3593248137770796e-06,
"loss": 1.5631,
"step": 6125
},
{
"epoch": 0.3487778596948903,
"grad_norm": 1.3359375,
"learning_rate": 2.3541994122872956e-06,
"loss": 1.5612,
"step": 6150
},
{
"epoch": 0.3501956558725118,
"grad_norm": 1.5625,
"learning_rate": 2.3490740107975125e-06,
"loss": 1.542,
"step": 6175
},
{
"epoch": 0.3516134520501333,
"grad_norm": 1.703125,
"learning_rate": 2.343948609307729e-06,
"loss": 1.547,
"step": 6200
},
{
"epoch": 0.3530312482277548,
"grad_norm": 1.8515625,
"learning_rate": 2.338823207817946e-06,
"loss": 1.5521,
"step": 6225
},
{
"epoch": 0.3544490444053763,
"grad_norm": 1.578125,
"learning_rate": 2.3336978063281622e-06,
"loss": 1.5392,
"step": 6250
},
{
"epoch": 0.3558668405829978,
"grad_norm": 1.5390625,
"learning_rate": 2.328572404838379e-06,
"loss": 1.5626,
"step": 6275
},
{
"epoch": 0.3572846367606193,
"grad_norm": 1.40625,
"learning_rate": 2.3234470033485956e-06,
"loss": 1.5753,
"step": 6300
},
{
"epoch": 0.3587024329382408,
"grad_norm": 1.5625,
"learning_rate": 2.3183216018588124e-06,
"loss": 1.5502,
"step": 6325
},
{
"epoch": 0.3601202291158623,
"grad_norm": 1.5390625,
"learning_rate": 2.313196200369029e-06,
"loss": 1.5427,
"step": 6350
},
{
"epoch": 0.36153802529348383,
"grad_norm": 1.625,
"learning_rate": 2.3080707988792457e-06,
"loss": 1.5595,
"step": 6375
},
{
"epoch": 0.36295582147110533,
"grad_norm": 1.375,
"learning_rate": 2.302945397389462e-06,
"loss": 1.5756,
"step": 6400
},
{
"epoch": 0.36437361764872683,
"grad_norm": 1.578125,
"learning_rate": 2.2978199958996786e-06,
"loss": 1.5495,
"step": 6425
},
{
"epoch": 0.36579141382634833,
"grad_norm": 1.9140625,
"learning_rate": 2.2926945944098955e-06,
"loss": 1.5591,
"step": 6450
},
{
"epoch": 0.36720921000396983,
"grad_norm": 1.5078125,
"learning_rate": 2.287569192920112e-06,
"loss": 1.5515,
"step": 6475
},
{
"epoch": 0.36862700618159133,
"grad_norm": 1.4765625,
"learning_rate": 2.282443791430329e-06,
"loss": 1.5557,
"step": 6500
},
{
"epoch": 0.37004480235921283,
"grad_norm": 1.40625,
"learning_rate": 2.2773183899405453e-06,
"loss": 1.5657,
"step": 6525
},
{
"epoch": 0.37146259853683433,
"grad_norm": 1.3984375,
"learning_rate": 2.272192988450762e-06,
"loss": 1.5626,
"step": 6550
},
{
"epoch": 0.37288039471445583,
"grad_norm": 1.625,
"learning_rate": 2.2670675869609786e-06,
"loss": 1.5578,
"step": 6575
},
{
"epoch": 0.37429819089207733,
"grad_norm": 1.5234375,
"learning_rate": 2.2619421854711955e-06,
"loss": 1.5567,
"step": 6600
},
{
"epoch": 0.3757159870696989,
"grad_norm": 1.546875,
"learning_rate": 2.256816783981412e-06,
"loss": 1.5522,
"step": 6625
},
{
"epoch": 0.3771337832473204,
"grad_norm": 1.515625,
"learning_rate": 2.2516913824916288e-06,
"loss": 1.557,
"step": 6650
},
{
"epoch": 0.3785515794249419,
"grad_norm": 1.515625,
"learning_rate": 2.2465659810018452e-06,
"loss": 1.5283,
"step": 6675
},
{
"epoch": 0.3799693756025634,
"grad_norm": 1.640625,
"learning_rate": 2.241440579512062e-06,
"loss": 1.5619,
"step": 6700
},
{
"epoch": 0.3813871717801849,
"grad_norm": 1.421875,
"learning_rate": 2.2363151780222785e-06,
"loss": 1.5395,
"step": 6725
},
{
"epoch": 0.3828049679578064,
"grad_norm": 1.578125,
"learning_rate": 2.2311897765324954e-06,
"loss": 1.5604,
"step": 6750
},
{
"epoch": 0.3842227641354279,
"grad_norm": 1.515625,
"learning_rate": 2.226064375042712e-06,
"loss": 1.5259,
"step": 6775
},
{
"epoch": 0.3856405603130494,
"grad_norm": 1.5390625,
"learning_rate": 2.2209389735529287e-06,
"loss": 1.5592,
"step": 6800
},
{
"epoch": 0.3870583564906709,
"grad_norm": 1.5078125,
"learning_rate": 2.215813572063145e-06,
"loss": 1.5364,
"step": 6825
},
{
"epoch": 0.3884761526682924,
"grad_norm": 1.453125,
"learning_rate": 2.2106881705733616e-06,
"loss": 1.5603,
"step": 6850
},
{
"epoch": 0.3898939488459139,
"grad_norm": 1.4765625,
"learning_rate": 2.205562769083578e-06,
"loss": 1.5701,
"step": 6875
},
{
"epoch": 0.39131174502353544,
"grad_norm": 1.6953125,
"learning_rate": 2.200437367593795e-06,
"loss": 1.5886,
"step": 6900
},
{
"epoch": 0.39272954120115694,
"grad_norm": 1.578125,
"learning_rate": 2.1953119661040114e-06,
"loss": 1.5645,
"step": 6925
},
{
"epoch": 0.39414733737877844,
"grad_norm": 1.59375,
"learning_rate": 2.1901865646142282e-06,
"loss": 1.5606,
"step": 6950
},
{
"epoch": 0.39556513355639994,
"grad_norm": 1.59375,
"learning_rate": 2.1850611631244447e-06,
"loss": 1.5466,
"step": 6975
},
{
"epoch": 0.39698292973402144,
"grad_norm": 1.515625,
"learning_rate": 2.179935761634661e-06,
"loss": 1.556,
"step": 7000
},
{
"epoch": 0.39840072591164294,
"grad_norm": 1.65625,
"learning_rate": 2.174810360144878e-06,
"loss": 1.5753,
"step": 7025
},
{
"epoch": 0.39981852208926444,
"grad_norm": 1.6953125,
"learning_rate": 2.1696849586550945e-06,
"loss": 1.5631,
"step": 7050
},
{
"epoch": 0.40123631826688594,
"grad_norm": 1.515625,
"learning_rate": 2.1645595571653113e-06,
"loss": 1.5483,
"step": 7075
},
{
"epoch": 0.40265411444450744,
"grad_norm": 1.4453125,
"learning_rate": 2.1594341556755278e-06,
"loss": 1.5562,
"step": 7100
},
{
"epoch": 0.40407191062212894,
"grad_norm": 1.84375,
"learning_rate": 2.1543087541857446e-06,
"loss": 1.5561,
"step": 7125
},
{
"epoch": 0.4054897067997505,
"grad_norm": 1.421875,
"learning_rate": 2.149183352695961e-06,
"loss": 1.5358,
"step": 7150
},
{
"epoch": 0.406907502977372,
"grad_norm": 1.546875,
"learning_rate": 2.144057951206178e-06,
"loss": 1.5465,
"step": 7175
},
{
"epoch": 0.4083252991549935,
"grad_norm": 1.4453125,
"learning_rate": 2.1389325497163944e-06,
"loss": 1.5762,
"step": 7200
},
{
"epoch": 0.409743095332615,
"grad_norm": 1.8125,
"learning_rate": 2.1338071482266113e-06,
"loss": 1.5593,
"step": 7225
},
{
"epoch": 0.4111608915102365,
"grad_norm": 1.671875,
"learning_rate": 2.1286817467368277e-06,
"loss": 1.5578,
"step": 7250
},
{
"epoch": 0.412578687687858,
"grad_norm": 1.3828125,
"learning_rate": 2.1235563452470446e-06,
"loss": 1.5889,
"step": 7275
},
{
"epoch": 0.4139964838654795,
"grad_norm": 1.6875,
"learning_rate": 2.118430943757261e-06,
"loss": 1.5562,
"step": 7300
},
{
"epoch": 0.415414280043101,
"grad_norm": 1.6171875,
"learning_rate": 2.113305542267478e-06,
"loss": 1.5316,
"step": 7325
},
{
"epoch": 0.4168320762207225,
"grad_norm": 1.5859375,
"learning_rate": 2.1081801407776944e-06,
"loss": 1.556,
"step": 7350
},
{
"epoch": 0.418249872398344,
"grad_norm": 1.375,
"learning_rate": 2.1030547392879112e-06,
"loss": 1.5628,
"step": 7375
},
{
"epoch": 0.41966766857596555,
"grad_norm": 1.71875,
"learning_rate": 2.0979293377981277e-06,
"loss": 1.555,
"step": 7400
},
{
"epoch": 0.42108546475358705,
"grad_norm": 1.5859375,
"learning_rate": 2.092803936308344e-06,
"loss": 1.5561,
"step": 7425
},
{
"epoch": 0.42250326093120855,
"grad_norm": 1.4453125,
"learning_rate": 2.087678534818561e-06,
"loss": 1.5601,
"step": 7450
},
{
"epoch": 0.42392105710883005,
"grad_norm": 1.8515625,
"learning_rate": 2.0825531333287774e-06,
"loss": 1.541,
"step": 7475
},
{
"epoch": 0.42533885328645155,
"grad_norm": 1.734375,
"learning_rate": 2.0774277318389943e-06,
"loss": 1.564,
"step": 7500
},
{
"epoch": 0.42675664946407305,
"grad_norm": 1.4921875,
"learning_rate": 2.0723023303492107e-06,
"loss": 1.5771,
"step": 7525
},
{
"epoch": 0.42817444564169455,
"grad_norm": 1.828125,
"learning_rate": 2.067176928859427e-06,
"loss": 1.562,
"step": 7550
},
{
"epoch": 0.42959224181931605,
"grad_norm": 1.5625,
"learning_rate": 2.062051527369644e-06,
"loss": 1.5351,
"step": 7575
},
{
"epoch": 0.43101003799693755,
"grad_norm": 1.390625,
"learning_rate": 2.0569261258798605e-06,
"loss": 1.5399,
"step": 7600
},
{
"epoch": 0.43242783417455904,
"grad_norm": 1.6484375,
"learning_rate": 2.051800724390077e-06,
"loss": 1.5723,
"step": 7625
},
{
"epoch": 0.43384563035218054,
"grad_norm": 1.5234375,
"learning_rate": 2.046675322900294e-06,
"loss": 1.5707,
"step": 7650
},
{
"epoch": 0.4352634265298021,
"grad_norm": 1.4140625,
"learning_rate": 2.0415499214105103e-06,
"loss": 1.5229,
"step": 7675
},
{
"epoch": 0.4366812227074236,
"grad_norm": 1.5625,
"learning_rate": 2.036424519920727e-06,
"loss": 1.5318,
"step": 7700
},
{
"epoch": 0.4380990188850451,
"grad_norm": 1.6875,
"learning_rate": 2.0312991184309436e-06,
"loss": 1.5321,
"step": 7725
},
{
"epoch": 0.4395168150626666,
"grad_norm": 1.6171875,
"learning_rate": 2.0261737169411605e-06,
"loss": 1.5619,
"step": 7750
},
{
"epoch": 0.4409346112402881,
"grad_norm": 1.7890625,
"learning_rate": 2.021048315451377e-06,
"loss": 1.5507,
"step": 7775
},
{
"epoch": 0.4423524074179096,
"grad_norm": 1.46875,
"learning_rate": 2.0159229139615938e-06,
"loss": 1.5443,
"step": 7800
},
{
"epoch": 0.4437702035955311,
"grad_norm": 1.5234375,
"learning_rate": 2.0107975124718102e-06,
"loss": 1.5362,
"step": 7825
},
{
"epoch": 0.4451879997731526,
"grad_norm": 1.6484375,
"learning_rate": 2.005672110982027e-06,
"loss": 1.5435,
"step": 7850
},
{
"epoch": 0.4466057959507741,
"grad_norm": 1.625,
"learning_rate": 2.0005467094922435e-06,
"loss": 1.5365,
"step": 7875
},
{
"epoch": 0.4480235921283956,
"grad_norm": 1.4609375,
"learning_rate": 1.9954213080024604e-06,
"loss": 1.57,
"step": 7900
},
{
"epoch": 0.44944138830601715,
"grad_norm": 1.765625,
"learning_rate": 1.990295906512677e-06,
"loss": 1.5625,
"step": 7925
},
{
"epoch": 0.45085918448363865,
"grad_norm": 1.7734375,
"learning_rate": 1.9851705050228937e-06,
"loss": 1.5687,
"step": 7950
},
{
"epoch": 0.45227698066126015,
"grad_norm": 1.515625,
"learning_rate": 1.98004510353311e-06,
"loss": 1.5611,
"step": 7975
},
{
"epoch": 0.45369477683888165,
"grad_norm": 1.4609375,
"learning_rate": 1.974919702043327e-06,
"loss": 1.5497,
"step": 8000
},
{
"epoch": 0.45511257301650315,
"grad_norm": 1.6796875,
"learning_rate": 1.9697943005535435e-06,
"loss": 1.5631,
"step": 8025
},
{
"epoch": 0.45653036919412465,
"grad_norm": 1.5546875,
"learning_rate": 1.96466889906376e-06,
"loss": 1.5564,
"step": 8050
},
{
"epoch": 0.45794816537174615,
"grad_norm": 1.53125,
"learning_rate": 1.959543497573977e-06,
"loss": 1.5502,
"step": 8075
},
{
"epoch": 0.45936596154936765,
"grad_norm": 1.453125,
"learning_rate": 1.9544180960841933e-06,
"loss": 1.5428,
"step": 8100
},
{
"epoch": 0.46078375772698915,
"grad_norm": 1.5625,
"learning_rate": 1.94929269459441e-06,
"loss": 1.5859,
"step": 8125
},
{
"epoch": 0.46220155390461065,
"grad_norm": 1.59375,
"learning_rate": 1.9441672931046266e-06,
"loss": 1.5673,
"step": 8150
},
{
"epoch": 0.46361935008223215,
"grad_norm": 1.6796875,
"learning_rate": 1.9390418916148434e-06,
"loss": 1.5583,
"step": 8175
},
{
"epoch": 0.4650371462598537,
"grad_norm": 1.625,
"learning_rate": 1.93391649012506e-06,
"loss": 1.5509,
"step": 8200
},
{
"epoch": 0.4664549424374752,
"grad_norm": 1.4375,
"learning_rate": 1.9287910886352768e-06,
"loss": 1.539,
"step": 8225
},
{
"epoch": 0.4678727386150967,
"grad_norm": 1.6171875,
"learning_rate": 1.9236656871454928e-06,
"loss": 1.5393,
"step": 8250
},
{
"epoch": 0.4692905347927182,
"grad_norm": 1.5546875,
"learning_rate": 1.9185402856557096e-06,
"loss": 1.5529,
"step": 8275
},
{
"epoch": 0.4707083309703397,
"grad_norm": 1.3984375,
"learning_rate": 1.913414884165926e-06,
"loss": 1.5485,
"step": 8300
},
{
"epoch": 0.4721261271479612,
"grad_norm": 2.046875,
"learning_rate": 1.908289482676143e-06,
"loss": 1.5476,
"step": 8325
},
{
"epoch": 0.4735439233255827,
"grad_norm": 1.53125,
"learning_rate": 1.9031640811863596e-06,
"loss": 1.5571,
"step": 8350
},
{
"epoch": 0.4749617195032042,
"grad_norm": 1.5234375,
"learning_rate": 1.8980386796965763e-06,
"loss": 1.557,
"step": 8375
},
{
"epoch": 0.4763795156808257,
"grad_norm": 1.53125,
"learning_rate": 1.8929132782067927e-06,
"loss": 1.5768,
"step": 8400
},
{
"epoch": 0.4777973118584472,
"grad_norm": 1.6484375,
"learning_rate": 1.8877878767170094e-06,
"loss": 1.5641,
"step": 8425
},
{
"epoch": 0.47921510803606876,
"grad_norm": 1.5390625,
"learning_rate": 1.882662475227226e-06,
"loss": 1.5534,
"step": 8450
},
{
"epoch": 0.48063290421369026,
"grad_norm": 1.4765625,
"learning_rate": 1.8775370737374427e-06,
"loss": 1.5421,
"step": 8475
},
{
"epoch": 0.48205070039131176,
"grad_norm": 1.5078125,
"learning_rate": 1.8724116722476594e-06,
"loss": 1.5555,
"step": 8500
},
{
"epoch": 0.48346849656893326,
"grad_norm": 1.375,
"learning_rate": 1.867286270757876e-06,
"loss": 1.5428,
"step": 8525
},
{
"epoch": 0.48488629274655476,
"grad_norm": 1.578125,
"learning_rate": 1.8621608692680927e-06,
"loss": 1.5412,
"step": 8550
},
{
"epoch": 0.48630408892417626,
"grad_norm": 1.5,
"learning_rate": 1.8570354677783093e-06,
"loss": 1.5354,
"step": 8575
},
{
"epoch": 0.48772188510179776,
"grad_norm": 1.4375,
"learning_rate": 1.851910066288526e-06,
"loss": 1.5298,
"step": 8600
},
{
"epoch": 0.48913968127941926,
"grad_norm": 1.546875,
"learning_rate": 1.8467846647987426e-06,
"loss": 1.5496,
"step": 8625
},
{
"epoch": 0.49055747745704076,
"grad_norm": 1.734375,
"learning_rate": 1.8416592633089593e-06,
"loss": 1.5764,
"step": 8650
},
{
"epoch": 0.49197527363466226,
"grad_norm": 1.5546875,
"learning_rate": 1.836533861819176e-06,
"loss": 1.5439,
"step": 8675
},
{
"epoch": 0.4933930698122838,
"grad_norm": 1.75,
"learning_rate": 1.8314084603293926e-06,
"loss": 1.5625,
"step": 8700
},
{
"epoch": 0.4948108659899053,
"grad_norm": 1.5859375,
"learning_rate": 1.8262830588396093e-06,
"loss": 1.5426,
"step": 8725
},
{
"epoch": 0.4962286621675268,
"grad_norm": 1.453125,
"learning_rate": 1.821157657349826e-06,
"loss": 1.5572,
"step": 8750
},
{
"epoch": 0.4976464583451483,
"grad_norm": 1.4921875,
"learning_rate": 1.8160322558600426e-06,
"loss": 1.5544,
"step": 8775
},
{
"epoch": 0.4990642545227698,
"grad_norm": 1.4296875,
"learning_rate": 1.8109068543702593e-06,
"loss": 1.5523,
"step": 8800
},
{
"epoch": 0.5004820507003913,
"grad_norm": 1.4609375,
"learning_rate": 1.8057814528804757e-06,
"loss": 1.5566,
"step": 8825
},
{
"epoch": 0.5018998468780128,
"grad_norm": 1.640625,
"learning_rate": 1.8006560513906924e-06,
"loss": 1.5398,
"step": 8850
},
{
"epoch": 0.5033176430556343,
"grad_norm": 1.5546875,
"learning_rate": 1.795530649900909e-06,
"loss": 1.5565,
"step": 8875
},
{
"epoch": 0.5047354392332558,
"grad_norm": 1.4921875,
"learning_rate": 1.7904052484111257e-06,
"loss": 1.5548,
"step": 8900
},
{
"epoch": 0.5061532354108773,
"grad_norm": 1.421875,
"learning_rate": 1.7852798469213421e-06,
"loss": 1.5535,
"step": 8925
},
{
"epoch": 0.5075710315884988,
"grad_norm": 1.5625,
"learning_rate": 1.7801544454315588e-06,
"loss": 1.5706,
"step": 8950
},
{
"epoch": 0.5089888277661203,
"grad_norm": 1.484375,
"learning_rate": 1.7750290439417754e-06,
"loss": 1.5568,
"step": 8975
},
{
"epoch": 0.5104066239437418,
"grad_norm": 1.5234375,
"learning_rate": 1.7699036424519919e-06,
"loss": 1.5563,
"step": 9000
},
{
"epoch": 0.5118244201213633,
"grad_norm": 1.4609375,
"learning_rate": 1.7647782409622085e-06,
"loss": 1.5338,
"step": 9025
},
{
"epoch": 0.5132422162989848,
"grad_norm": 1.640625,
"learning_rate": 1.7596528394724252e-06,
"loss": 1.5521,
"step": 9050
},
{
"epoch": 0.5146600124766063,
"grad_norm": 1.53125,
"learning_rate": 1.7545274379826419e-06,
"loss": 1.5492,
"step": 9075
},
{
"epoch": 0.5160778086542279,
"grad_norm": 1.6328125,
"learning_rate": 1.7494020364928585e-06,
"loss": 1.5345,
"step": 9100
},
{
"epoch": 0.5174956048318494,
"grad_norm": 1.515625,
"learning_rate": 1.7442766350030752e-06,
"loss": 1.5407,
"step": 9125
},
{
"epoch": 0.5189134010094709,
"grad_norm": 1.34375,
"learning_rate": 1.7391512335132918e-06,
"loss": 1.5592,
"step": 9150
},
{
"epoch": 0.5203311971870924,
"grad_norm": 1.8125,
"learning_rate": 1.7340258320235085e-06,
"loss": 1.5387,
"step": 9175
},
{
"epoch": 0.5217489933647139,
"grad_norm": 1.6171875,
"learning_rate": 1.7289004305337251e-06,
"loss": 1.5425,
"step": 9200
},
{
"epoch": 0.5231667895423354,
"grad_norm": 1.578125,
"learning_rate": 1.7237750290439418e-06,
"loss": 1.5737,
"step": 9225
},
{
"epoch": 0.5245845857199569,
"grad_norm": 1.8984375,
"learning_rate": 1.7186496275541585e-06,
"loss": 1.5753,
"step": 9250
},
{
"epoch": 0.5260023818975784,
"grad_norm": 1.4765625,
"learning_rate": 1.7135242260643751e-06,
"loss": 1.5684,
"step": 9275
},
{
"epoch": 0.5274201780751999,
"grad_norm": 1.71875,
"learning_rate": 1.7083988245745918e-06,
"loss": 1.5471,
"step": 9300
},
{
"epoch": 0.5288379742528214,
"grad_norm": 1.59375,
"learning_rate": 1.7032734230848084e-06,
"loss": 1.5585,
"step": 9325
},
{
"epoch": 0.5302557704304429,
"grad_norm": 1.5234375,
"learning_rate": 1.698148021595025e-06,
"loss": 1.5689,
"step": 9350
},
{
"epoch": 0.5316735666080644,
"grad_norm": 1.5703125,
"learning_rate": 1.6930226201052418e-06,
"loss": 1.5559,
"step": 9375
},
{
"epoch": 0.5330913627856859,
"grad_norm": 2.015625,
"learning_rate": 1.6878972186154584e-06,
"loss": 1.536,
"step": 9400
},
{
"epoch": 0.5345091589633074,
"grad_norm": 1.65625,
"learning_rate": 1.6827718171256749e-06,
"loss": 1.5343,
"step": 9425
},
{
"epoch": 0.5359269551409289,
"grad_norm": 1.640625,
"learning_rate": 1.6776464156358915e-06,
"loss": 1.5333,
"step": 9450
},
{
"epoch": 0.5373447513185504,
"grad_norm": 1.546875,
"learning_rate": 1.6725210141461082e-06,
"loss": 1.5387,
"step": 9475
},
{
"epoch": 0.5387625474961719,
"grad_norm": 1.703125,
"learning_rate": 1.6673956126563248e-06,
"loss": 1.5467,
"step": 9500
},
{
"epoch": 0.5401803436737934,
"grad_norm": 1.6640625,
"learning_rate": 1.6622702111665415e-06,
"loss": 1.5439,
"step": 9525
},
{
"epoch": 0.5415981398514149,
"grad_norm": 1.5234375,
"learning_rate": 1.6571448096767582e-06,
"loss": 1.5434,
"step": 9550
},
{
"epoch": 0.5430159360290364,
"grad_norm": 1.640625,
"learning_rate": 1.6520194081869748e-06,
"loss": 1.5438,
"step": 9575
},
{
"epoch": 0.5444337322066579,
"grad_norm": 1.6171875,
"learning_rate": 1.6468940066971915e-06,
"loss": 1.5222,
"step": 9600
},
{
"epoch": 0.5458515283842795,
"grad_norm": 1.4296875,
"learning_rate": 1.6417686052074077e-06,
"loss": 1.5479,
"step": 9625
},
{
"epoch": 0.547269324561901,
"grad_norm": 1.5625,
"learning_rate": 1.6366432037176244e-06,
"loss": 1.5733,
"step": 9650
},
{
"epoch": 0.5486871207395225,
"grad_norm": 1.625,
"learning_rate": 1.631517802227841e-06,
"loss": 1.5279,
"step": 9675
},
{
"epoch": 0.550104916917144,
"grad_norm": 1.890625,
"learning_rate": 1.6263924007380577e-06,
"loss": 1.5554,
"step": 9700
},
{
"epoch": 0.5515227130947655,
"grad_norm": 1.5625,
"learning_rate": 1.6212669992482743e-06,
"loss": 1.5095,
"step": 9725
},
{
"epoch": 0.552940509272387,
"grad_norm": 1.5625,
"learning_rate": 1.616141597758491e-06,
"loss": 1.5139,
"step": 9750
},
{
"epoch": 0.5543583054500085,
"grad_norm": 1.453125,
"learning_rate": 1.6110161962687077e-06,
"loss": 1.5381,
"step": 9775
},
{
"epoch": 0.55577610162763,
"grad_norm": 1.484375,
"learning_rate": 1.6058907947789243e-06,
"loss": 1.5252,
"step": 9800
},
{
"epoch": 0.5571938978052515,
"grad_norm": 2.015625,
"learning_rate": 1.600765393289141e-06,
"loss": 1.5623,
"step": 9825
},
{
"epoch": 0.558611693982873,
"grad_norm": 1.4140625,
"learning_rate": 1.5956399917993576e-06,
"loss": 1.5585,
"step": 9850
},
{
"epoch": 0.5600294901604945,
"grad_norm": 1.515625,
"learning_rate": 1.5905145903095743e-06,
"loss": 1.5705,
"step": 9875
},
{
"epoch": 0.561447286338116,
"grad_norm": 1.546875,
"learning_rate": 1.585389188819791e-06,
"loss": 1.5343,
"step": 9900
},
{
"epoch": 0.5628650825157375,
"grad_norm": 1.7265625,
"learning_rate": 1.5802637873300076e-06,
"loss": 1.5257,
"step": 9925
},
{
"epoch": 0.564282878693359,
"grad_norm": 1.59375,
"learning_rate": 1.5751383858402243e-06,
"loss": 1.5605,
"step": 9950
},
{
"epoch": 0.5657006748709805,
"grad_norm": 1.453125,
"learning_rate": 1.570012984350441e-06,
"loss": 1.5301,
"step": 9975
},
{
"epoch": 0.567118471048602,
"grad_norm": 1.5390625,
"learning_rate": 1.5648875828606576e-06,
"loss": 1.5498,
"step": 10000
},
{
"epoch": 0.5685362672262235,
"grad_norm": 1.53125,
"learning_rate": 1.559762181370874e-06,
"loss": 1.5258,
"step": 10025
},
{
"epoch": 0.569954063403845,
"grad_norm": 1.65625,
"learning_rate": 1.5546367798810907e-06,
"loss": 1.5391,
"step": 10050
},
{
"epoch": 0.5713718595814665,
"grad_norm": 1.4609375,
"learning_rate": 1.5495113783913073e-06,
"loss": 1.5478,
"step": 10075
},
{
"epoch": 0.572789655759088,
"grad_norm": 1.515625,
"learning_rate": 1.544385976901524e-06,
"loss": 1.5388,
"step": 10100
},
{
"epoch": 0.5742074519367096,
"grad_norm": 1.5234375,
"learning_rate": 1.5392605754117407e-06,
"loss": 1.5413,
"step": 10125
},
{
"epoch": 0.5756252481143311,
"grad_norm": 1.6328125,
"learning_rate": 1.5341351739219573e-06,
"loss": 1.5519,
"step": 10150
},
{
"epoch": 0.5770430442919526,
"grad_norm": 1.8125,
"learning_rate": 1.529009772432174e-06,
"loss": 1.5516,
"step": 10175
},
{
"epoch": 0.5784608404695741,
"grad_norm": 1.6015625,
"learning_rate": 1.5238843709423906e-06,
"loss": 1.5193,
"step": 10200
},
{
"epoch": 0.5798786366471956,
"grad_norm": 1.609375,
"learning_rate": 1.5187589694526073e-06,
"loss": 1.5562,
"step": 10225
},
{
"epoch": 0.5812964328248171,
"grad_norm": 1.625,
"learning_rate": 1.513633567962824e-06,
"loss": 1.5564,
"step": 10250
},
{
"epoch": 0.5827142290024386,
"grad_norm": 1.46875,
"learning_rate": 1.5085081664730406e-06,
"loss": 1.5505,
"step": 10275
},
{
"epoch": 0.5841320251800601,
"grad_norm": 1.59375,
"learning_rate": 1.5033827649832573e-06,
"loss": 1.5526,
"step": 10300
},
{
"epoch": 0.5855498213576816,
"grad_norm": 1.7734375,
"learning_rate": 1.4982573634934737e-06,
"loss": 1.5544,
"step": 10325
},
{
"epoch": 0.5869676175353031,
"grad_norm": 1.625,
"learning_rate": 1.4931319620036904e-06,
"loss": 1.5197,
"step": 10350
},
{
"epoch": 0.5883854137129246,
"grad_norm": 1.6640625,
"learning_rate": 1.488006560513907e-06,
"loss": 1.5515,
"step": 10375
},
{
"epoch": 0.5898032098905461,
"grad_norm": 1.5625,
"learning_rate": 1.4828811590241237e-06,
"loss": 1.542,
"step": 10400
},
{
"epoch": 0.5912210060681676,
"grad_norm": 1.5078125,
"learning_rate": 1.4777557575343403e-06,
"loss": 1.5378,
"step": 10425
},
{
"epoch": 0.5926388022457891,
"grad_norm": 1.5390625,
"learning_rate": 1.472630356044557e-06,
"loss": 1.5499,
"step": 10450
},
{
"epoch": 0.5940565984234106,
"grad_norm": 1.8515625,
"learning_rate": 1.4675049545547737e-06,
"loss": 1.5717,
"step": 10475
},
{
"epoch": 0.5954743946010321,
"grad_norm": 1.5,
"learning_rate": 1.46237955306499e-06,
"loss": 1.5418,
"step": 10500
},
{
"epoch": 0.5968921907786536,
"grad_norm": 1.390625,
"learning_rate": 1.4572541515752068e-06,
"loss": 1.5272,
"step": 10525
},
{
"epoch": 0.5983099869562751,
"grad_norm": 1.578125,
"learning_rate": 1.4521287500854234e-06,
"loss": 1.5359,
"step": 10550
},
{
"epoch": 0.5997277831338966,
"grad_norm": 1.6171875,
"learning_rate": 1.44700334859564e-06,
"loss": 1.546,
"step": 10575
},
{
"epoch": 0.6011455793115181,
"grad_norm": 1.546875,
"learning_rate": 1.4418779471058567e-06,
"loss": 1.5394,
"step": 10600
},
{
"epoch": 0.6025633754891396,
"grad_norm": 1.5078125,
"learning_rate": 1.4367525456160732e-06,
"loss": 1.5449,
"step": 10625
},
{
"epoch": 0.6039811716667612,
"grad_norm": 1.6640625,
"learning_rate": 1.4316271441262898e-06,
"loss": 1.5449,
"step": 10650
},
{
"epoch": 0.6053989678443827,
"grad_norm": 1.609375,
"learning_rate": 1.4265017426365065e-06,
"loss": 1.5255,
"step": 10675
},
{
"epoch": 0.6068167640220042,
"grad_norm": 1.4453125,
"learning_rate": 1.4213763411467232e-06,
"loss": 1.559,
"step": 10700
},
{
"epoch": 0.6082345601996257,
"grad_norm": 1.5390625,
"learning_rate": 1.4162509396569398e-06,
"loss": 1.5654,
"step": 10725
},
{
"epoch": 0.6096523563772472,
"grad_norm": 1.6796875,
"learning_rate": 1.4111255381671565e-06,
"loss": 1.5608,
"step": 10750
},
{
"epoch": 0.6110701525548687,
"grad_norm": 1.671875,
"learning_rate": 1.4060001366773731e-06,
"loss": 1.5727,
"step": 10775
},
{
"epoch": 0.6124879487324902,
"grad_norm": 1.59375,
"learning_rate": 1.4008747351875898e-06,
"loss": 1.55,
"step": 10800
},
{
"epoch": 0.6139057449101117,
"grad_norm": 1.453125,
"learning_rate": 1.3957493336978064e-06,
"loss": 1.5276,
"step": 10825
},
{
"epoch": 0.6153235410877332,
"grad_norm": 1.4609375,
"learning_rate": 1.3906239322080229e-06,
"loss": 1.5596,
"step": 10850
},
{
"epoch": 0.6167413372653547,
"grad_norm": 1.5234375,
"learning_rate": 1.3854985307182395e-06,
"loss": 1.5531,
"step": 10875
},
{
"epoch": 0.6181591334429762,
"grad_norm": 1.546875,
"learning_rate": 1.3803731292284562e-06,
"loss": 1.5493,
"step": 10900
},
{
"epoch": 0.6195769296205977,
"grad_norm": 1.6875,
"learning_rate": 1.3752477277386729e-06,
"loss": 1.549,
"step": 10925
},
{
"epoch": 0.6209947257982192,
"grad_norm": 1.4609375,
"learning_rate": 1.3701223262488895e-06,
"loss": 1.5222,
"step": 10950
},
{
"epoch": 0.6224125219758407,
"grad_norm": 1.6484375,
"learning_rate": 1.3649969247591062e-06,
"loss": 1.5372,
"step": 10975
},
{
"epoch": 0.6238303181534622,
"grad_norm": 1.5390625,
"learning_rate": 1.3598715232693228e-06,
"loss": 1.5489,
"step": 11000
},
{
"epoch": 0.6252481143310837,
"grad_norm": 1.5234375,
"learning_rate": 1.3547461217795395e-06,
"loss": 1.5413,
"step": 11025
},
{
"epoch": 0.6266659105087052,
"grad_norm": 1.5,
"learning_rate": 1.3496207202897562e-06,
"loss": 1.5258,
"step": 11050
},
{
"epoch": 0.6280837066863267,
"grad_norm": 1.6171875,
"learning_rate": 1.3444953187999728e-06,
"loss": 1.5472,
"step": 11075
},
{
"epoch": 0.6295015028639482,
"grad_norm": 1.5546875,
"learning_rate": 1.3393699173101895e-06,
"loss": 1.5343,
"step": 11100
},
{
"epoch": 0.6309192990415697,
"grad_norm": 1.609375,
"learning_rate": 1.334244515820406e-06,
"loss": 1.5464,
"step": 11125
},
{
"epoch": 0.6323370952191912,
"grad_norm": 2.203125,
"learning_rate": 1.3291191143306226e-06,
"loss": 1.5363,
"step": 11150
},
{
"epoch": 0.6337548913968128,
"grad_norm": 1.546875,
"learning_rate": 1.3239937128408392e-06,
"loss": 1.5315,
"step": 11175
},
{
"epoch": 0.6351726875744343,
"grad_norm": 1.734375,
"learning_rate": 1.3188683113510559e-06,
"loss": 1.5516,
"step": 11200
},
{
"epoch": 0.6365904837520558,
"grad_norm": 2.03125,
"learning_rate": 1.3137429098612723e-06,
"loss": 1.5475,
"step": 11225
},
{
"epoch": 0.6380082799296773,
"grad_norm": 1.5078125,
"learning_rate": 1.308617508371489e-06,
"loss": 1.529,
"step": 11250
},
{
"epoch": 0.6394260761072988,
"grad_norm": 1.703125,
"learning_rate": 1.3034921068817057e-06,
"loss": 1.5497,
"step": 11275
},
{
"epoch": 0.6408438722849203,
"grad_norm": 1.6484375,
"learning_rate": 1.2983667053919223e-06,
"loss": 1.5614,
"step": 11300
},
{
"epoch": 0.6422616684625418,
"grad_norm": 1.7265625,
"learning_rate": 1.293241303902139e-06,
"loss": 1.5547,
"step": 11325
},
{
"epoch": 0.6436794646401633,
"grad_norm": 1.46875,
"learning_rate": 1.2881159024123556e-06,
"loss": 1.5541,
"step": 11350
},
{
"epoch": 0.6450972608177848,
"grad_norm": 1.5703125,
"learning_rate": 1.2829905009225723e-06,
"loss": 1.5661,
"step": 11375
},
{
"epoch": 0.6465150569954063,
"grad_norm": 1.5625,
"learning_rate": 1.277865099432789e-06,
"loss": 1.5288,
"step": 11400
},
{
"epoch": 0.6479328531730278,
"grad_norm": 1.6796875,
"learning_rate": 1.2727396979430056e-06,
"loss": 1.5425,
"step": 11425
},
{
"epoch": 0.6493506493506493,
"grad_norm": 1.5234375,
"learning_rate": 1.2676142964532223e-06,
"loss": 1.5383,
"step": 11450
},
{
"epoch": 0.6507684455282708,
"grad_norm": 1.65625,
"learning_rate": 1.262488894963439e-06,
"loss": 1.556,
"step": 11475
},
{
"epoch": 0.6521862417058923,
"grad_norm": 1.609375,
"learning_rate": 1.2573634934736556e-06,
"loss": 1.5487,
"step": 11500
},
{
"epoch": 0.6536040378835138,
"grad_norm": 1.5859375,
"learning_rate": 1.2522380919838722e-06,
"loss": 1.5661,
"step": 11525
},
{
"epoch": 0.6550218340611353,
"grad_norm": 1.625,
"learning_rate": 1.2471126904940887e-06,
"loss": 1.5446,
"step": 11550
},
{
"epoch": 0.6564396302387568,
"grad_norm": 1.4765625,
"learning_rate": 1.2419872890043053e-06,
"loss": 1.5546,
"step": 11575
},
{
"epoch": 0.6578574264163783,
"grad_norm": 1.5234375,
"learning_rate": 1.236861887514522e-06,
"loss": 1.5458,
"step": 11600
},
{
"epoch": 0.6592752225939998,
"grad_norm": 1.5,
"learning_rate": 1.2317364860247387e-06,
"loss": 1.5684,
"step": 11625
},
{
"epoch": 0.6606930187716213,
"grad_norm": 1.5234375,
"learning_rate": 1.2266110845349553e-06,
"loss": 1.5305,
"step": 11650
},
{
"epoch": 0.6621108149492428,
"grad_norm": 1.484375,
"learning_rate": 1.221485683045172e-06,
"loss": 1.5331,
"step": 11675
},
{
"epoch": 0.6635286111268645,
"grad_norm": 1.640625,
"learning_rate": 1.2163602815553886e-06,
"loss": 1.5263,
"step": 11700
},
{
"epoch": 0.664946407304486,
"grad_norm": 1.375,
"learning_rate": 1.211234880065605e-06,
"loss": 1.5346,
"step": 11725
},
{
"epoch": 0.6663642034821075,
"grad_norm": 1.734375,
"learning_rate": 1.2061094785758217e-06,
"loss": 1.525,
"step": 11750
},
{
"epoch": 0.667781999659729,
"grad_norm": 1.4765625,
"learning_rate": 1.2009840770860384e-06,
"loss": 1.5601,
"step": 11775
},
{
"epoch": 0.6691997958373505,
"grad_norm": 1.4921875,
"learning_rate": 1.195858675596255e-06,
"loss": 1.5598,
"step": 11800
},
{
"epoch": 0.670617592014972,
"grad_norm": 1.5859375,
"learning_rate": 1.1907332741064717e-06,
"loss": 1.5205,
"step": 11825
},
{
"epoch": 0.6720353881925935,
"grad_norm": 1.65625,
"learning_rate": 1.1856078726166884e-06,
"loss": 1.5609,
"step": 11850
},
{
"epoch": 0.673453184370215,
"grad_norm": 1.3984375,
"learning_rate": 1.180482471126905e-06,
"loss": 1.5311,
"step": 11875
},
{
"epoch": 0.6748709805478365,
"grad_norm": 1.6015625,
"learning_rate": 1.1753570696371215e-06,
"loss": 1.5562,
"step": 11900
},
{
"epoch": 0.676288776725458,
"grad_norm": 1.546875,
"learning_rate": 1.1702316681473381e-06,
"loss": 1.5397,
"step": 11925
},
{
"epoch": 0.6777065729030795,
"grad_norm": 1.484375,
"learning_rate": 1.1651062666575548e-06,
"loss": 1.5235,
"step": 11950
},
{
"epoch": 0.679124369080701,
"grad_norm": 1.5859375,
"learning_rate": 1.1599808651677714e-06,
"loss": 1.5541,
"step": 11975
},
{
"epoch": 0.6805421652583225,
"grad_norm": 1.578125,
"learning_rate": 1.1548554636779881e-06,
"loss": 1.5365,
"step": 12000
},
{
"epoch": 0.681959961435944,
"grad_norm": 1.5703125,
"learning_rate": 1.1497300621882048e-06,
"loss": 1.5254,
"step": 12025
},
{
"epoch": 0.6833777576135655,
"grad_norm": 1.546875,
"learning_rate": 1.1446046606984214e-06,
"loss": 1.5569,
"step": 12050
},
{
"epoch": 0.684795553791187,
"grad_norm": 1.859375,
"learning_rate": 1.139479259208638e-06,
"loss": 1.5388,
"step": 12075
},
{
"epoch": 0.6862133499688085,
"grad_norm": 1.6640625,
"learning_rate": 1.1343538577188547e-06,
"loss": 1.5449,
"step": 12100
},
{
"epoch": 0.68763114614643,
"grad_norm": 1.3828125,
"learning_rate": 1.1292284562290714e-06,
"loss": 1.5471,
"step": 12125
},
{
"epoch": 0.6890489423240514,
"grad_norm": 1.546875,
"learning_rate": 1.124103054739288e-06,
"loss": 1.5367,
"step": 12150
},
{
"epoch": 0.690466738501673,
"grad_norm": 1.8515625,
"learning_rate": 1.1189776532495047e-06,
"loss": 1.5452,
"step": 12175
},
{
"epoch": 0.6918845346792944,
"grad_norm": 1.4609375,
"learning_rate": 1.1138522517597214e-06,
"loss": 1.54,
"step": 12200
},
{
"epoch": 0.6933023308569161,
"grad_norm": 1.9453125,
"learning_rate": 1.1087268502699378e-06,
"loss": 1.5475,
"step": 12225
},
{
"epoch": 0.6947201270345376,
"grad_norm": 1.6640625,
"learning_rate": 1.1036014487801545e-06,
"loss": 1.552,
"step": 12250
},
{
"epoch": 0.6961379232121591,
"grad_norm": 1.5234375,
"learning_rate": 1.0984760472903711e-06,
"loss": 1.5461,
"step": 12275
},
{
"epoch": 0.6975557193897806,
"grad_norm": 1.6875,
"learning_rate": 1.0933506458005878e-06,
"loss": 1.5243,
"step": 12300
},
{
"epoch": 0.6989735155674021,
"grad_norm": 1.5703125,
"learning_rate": 1.0882252443108042e-06,
"loss": 1.5551,
"step": 12325
},
{
"epoch": 0.7003913117450236,
"grad_norm": 1.78125,
"learning_rate": 1.083099842821021e-06,
"loss": 1.5374,
"step": 12350
},
{
"epoch": 0.7018091079226451,
"grad_norm": 1.578125,
"learning_rate": 1.0779744413312376e-06,
"loss": 1.55,
"step": 12375
},
{
"epoch": 0.7032269041002666,
"grad_norm": 1.609375,
"learning_rate": 1.0728490398414542e-06,
"loss": 1.5355,
"step": 12400
},
{
"epoch": 0.7046447002778881,
"grad_norm": 1.5078125,
"learning_rate": 1.0677236383516709e-06,
"loss": 1.5465,
"step": 12425
},
{
"epoch": 0.7060624964555096,
"grad_norm": 1.59375,
"learning_rate": 1.0625982368618875e-06,
"loss": 1.5404,
"step": 12450
},
{
"epoch": 0.7074802926331311,
"grad_norm": 1.53125,
"learning_rate": 1.0574728353721042e-06,
"loss": 1.5264,
"step": 12475
},
{
"epoch": 0.7088980888107526,
"grad_norm": 1.671875,
"learning_rate": 1.0523474338823208e-06,
"loss": 1.5598,
"step": 12500
},
{
"epoch": 0.7103158849883741,
"grad_norm": 1.5859375,
"learning_rate": 1.0472220323925375e-06,
"loss": 1.5594,
"step": 12525
},
{
"epoch": 0.7117336811659956,
"grad_norm": 1.5703125,
"learning_rate": 1.0420966309027542e-06,
"loss": 1.5319,
"step": 12550
},
{
"epoch": 0.7131514773436171,
"grad_norm": 1.9765625,
"learning_rate": 1.0369712294129708e-06,
"loss": 1.5403,
"step": 12575
},
{
"epoch": 0.7145692735212386,
"grad_norm": 1.6328125,
"learning_rate": 1.0318458279231873e-06,
"loss": 1.5131,
"step": 12600
},
{
"epoch": 0.7159870696988601,
"grad_norm": 1.453125,
"learning_rate": 1.026720426433404e-06,
"loss": 1.5628,
"step": 12625
},
{
"epoch": 0.7174048658764816,
"grad_norm": 1.609375,
"learning_rate": 1.0215950249436206e-06,
"loss": 1.5474,
"step": 12650
},
{
"epoch": 0.7188226620541031,
"grad_norm": 1.515625,
"learning_rate": 1.0164696234538372e-06,
"loss": 1.5401,
"step": 12675
},
{
"epoch": 0.7202404582317246,
"grad_norm": 1.5234375,
"learning_rate": 1.011344221964054e-06,
"loss": 1.5464,
"step": 12700
},
{
"epoch": 0.7216582544093462,
"grad_norm": 1.6171875,
"learning_rate": 1.0062188204742706e-06,
"loss": 1.5344,
"step": 12725
},
{
"epoch": 0.7230760505869677,
"grad_norm": 1.7734375,
"learning_rate": 1.0010934189844872e-06,
"loss": 1.5035,
"step": 12750
},
{
"epoch": 0.7244938467645892,
"grad_norm": 1.6328125,
"learning_rate": 9.959680174947039e-07,
"loss": 1.5582,
"step": 12775
},
{
"epoch": 0.7259116429422107,
"grad_norm": 1.4765625,
"learning_rate": 9.908426160049205e-07,
"loss": 1.5341,
"step": 12800
},
{
"epoch": 0.7273294391198322,
"grad_norm": 1.3828125,
"learning_rate": 9.85717214515137e-07,
"loss": 1.5449,
"step": 12825
},
{
"epoch": 0.7287472352974537,
"grad_norm": 1.6953125,
"learning_rate": 9.805918130253536e-07,
"loss": 1.5297,
"step": 12850
},
{
"epoch": 0.7301650314750752,
"grad_norm": 1.5390625,
"learning_rate": 9.754664115355703e-07,
"loss": 1.5498,
"step": 12875
},
{
"epoch": 0.7315828276526967,
"grad_norm": 1.515625,
"learning_rate": 9.70341010045787e-07,
"loss": 1.5448,
"step": 12900
},
{
"epoch": 0.7330006238303182,
"grad_norm": 1.8828125,
"learning_rate": 9.652156085560036e-07,
"loss": 1.5242,
"step": 12925
},
{
"epoch": 0.7344184200079397,
"grad_norm": 1.65625,
"learning_rate": 9.6009020706622e-07,
"loss": 1.5191,
"step": 12950
},
{
"epoch": 0.7358362161855612,
"grad_norm": 1.65625,
"learning_rate": 9.549648055764367e-07,
"loss": 1.532,
"step": 12975
},
{
"epoch": 0.7372540123631827,
"grad_norm": 1.609375,
"learning_rate": 9.498394040866535e-07,
"loss": 1.5711,
"step": 13000
},
{
"epoch": 0.7386718085408042,
"grad_norm": 1.59375,
"learning_rate": 9.4471400259687e-07,
"loss": 1.5418,
"step": 13025
},
{
"epoch": 0.7400896047184257,
"grad_norm": 1.828125,
"learning_rate": 9.395886011070867e-07,
"loss": 1.5549,
"step": 13050
},
{
"epoch": 0.7415074008960472,
"grad_norm": 1.625,
"learning_rate": 9.344631996173033e-07,
"loss": 1.5404,
"step": 13075
},
{
"epoch": 0.7429251970736687,
"grad_norm": 1.609375,
"learning_rate": 9.2933779812752e-07,
"loss": 1.5488,
"step": 13100
},
{
"epoch": 0.7443429932512902,
"grad_norm": 1.546875,
"learning_rate": 9.242123966377367e-07,
"loss": 1.5429,
"step": 13125
},
{
"epoch": 0.7457607894289117,
"grad_norm": 1.78125,
"learning_rate": 9.190869951479533e-07,
"loss": 1.5235,
"step": 13150
},
{
"epoch": 0.7471785856065332,
"grad_norm": 1.640625,
"learning_rate": 9.1396159365817e-07,
"loss": 1.5297,
"step": 13175
},
{
"epoch": 0.7485963817841547,
"grad_norm": 1.59375,
"learning_rate": 9.088361921683866e-07,
"loss": 1.5214,
"step": 13200
},
{
"epoch": 0.7500141779617762,
"grad_norm": 1.6796875,
"learning_rate": 9.037107906786032e-07,
"loss": 1.5256,
"step": 13225
},
{
"epoch": 0.7514319741393978,
"grad_norm": 1.53125,
"learning_rate": 8.985853891888198e-07,
"loss": 1.5405,
"step": 13250
},
{
"epoch": 0.7528497703170193,
"grad_norm": 1.515625,
"learning_rate": 8.934599876990365e-07,
"loss": 1.5277,
"step": 13275
},
{
"epoch": 0.7542675664946408,
"grad_norm": 1.40625,
"learning_rate": 8.883345862092531e-07,
"loss": 1.5297,
"step": 13300
},
{
"epoch": 0.7556853626722623,
"grad_norm": 1.59375,
"learning_rate": 8.832091847194696e-07,
"loss": 1.5141,
"step": 13325
},
{
"epoch": 0.7571031588498838,
"grad_norm": 1.609375,
"learning_rate": 8.780837832296863e-07,
"loss": 1.5404,
"step": 13350
},
{
"epoch": 0.7585209550275053,
"grad_norm": 1.546875,
"learning_rate": 8.729583817399029e-07,
"loss": 1.5397,
"step": 13375
},
{
"epoch": 0.7599387512051268,
"grad_norm": 1.40625,
"learning_rate": 8.678329802501196e-07,
"loss": 1.5652,
"step": 13400
},
{
"epoch": 0.7613565473827483,
"grad_norm": 1.53125,
"learning_rate": 8.627075787603362e-07,
"loss": 1.5527,
"step": 13425
},
{
"epoch": 0.7627743435603698,
"grad_norm": 1.671875,
"learning_rate": 8.575821772705529e-07,
"loss": 1.5429,
"step": 13450
},
{
"epoch": 0.7641921397379913,
"grad_norm": 1.671875,
"learning_rate": 8.524567757807696e-07,
"loss": 1.5511,
"step": 13475
},
{
"epoch": 0.7656099359156128,
"grad_norm": 1.6875,
"learning_rate": 8.473313742909862e-07,
"loss": 1.5494,
"step": 13500
},
{
"epoch": 0.7670277320932343,
"grad_norm": 1.8125,
"learning_rate": 8.422059728012028e-07,
"loss": 1.5272,
"step": 13525
},
{
"epoch": 0.7684455282708558,
"grad_norm": 1.65625,
"learning_rate": 8.370805713114194e-07,
"loss": 1.5361,
"step": 13550
},
{
"epoch": 0.7698633244484773,
"grad_norm": 1.625,
"learning_rate": 8.319551698216361e-07,
"loss": 1.5051,
"step": 13575
},
{
"epoch": 0.7712811206260988,
"grad_norm": 1.578125,
"learning_rate": 8.268297683318527e-07,
"loss": 1.5396,
"step": 13600
},
{
"epoch": 0.7726989168037203,
"grad_norm": 1.4375,
"learning_rate": 8.217043668420694e-07,
"loss": 1.5366,
"step": 13625
},
{
"epoch": 0.7741167129813418,
"grad_norm": 1.5234375,
"learning_rate": 8.165789653522858e-07,
"loss": 1.5285,
"step": 13650
},
{
"epoch": 0.7755345091589633,
"grad_norm": 1.5859375,
"learning_rate": 8.114535638625025e-07,
"loss": 1.5392,
"step": 13675
},
{
"epoch": 0.7769523053365848,
"grad_norm": 1.578125,
"learning_rate": 8.063281623727192e-07,
"loss": 1.5187,
"step": 13700
},
{
"epoch": 0.7783701015142063,
"grad_norm": 1.703125,
"learning_rate": 8.012027608829358e-07,
"loss": 1.532,
"step": 13725
},
{
"epoch": 0.7797878976918278,
"grad_norm": 1.4921875,
"learning_rate": 7.960773593931525e-07,
"loss": 1.5551,
"step": 13750
},
{
"epoch": 0.7812056938694494,
"grad_norm": 1.59375,
"learning_rate": 7.909519579033691e-07,
"loss": 1.5561,
"step": 13775
},
{
"epoch": 0.7826234900470709,
"grad_norm": 1.59375,
"learning_rate": 7.858265564135858e-07,
"loss": 1.5498,
"step": 13800
},
{
"epoch": 0.7840412862246924,
"grad_norm": 1.59375,
"learning_rate": 7.807011549238023e-07,
"loss": 1.5047,
"step": 13825
},
{
"epoch": 0.7854590824023139,
"grad_norm": 1.6875,
"learning_rate": 7.75575753434019e-07,
"loss": 1.5347,
"step": 13850
},
{
"epoch": 0.7868768785799354,
"grad_norm": 1.65625,
"learning_rate": 7.704503519442357e-07,
"loss": 1.5385,
"step": 13875
},
{
"epoch": 0.7882946747575569,
"grad_norm": 2.875,
"learning_rate": 7.653249504544523e-07,
"loss": 1.5458,
"step": 13900
},
{
"epoch": 0.7897124709351784,
"grad_norm": 1.9453125,
"learning_rate": 7.60199548964669e-07,
"loss": 1.5497,
"step": 13925
},
{
"epoch": 0.7911302671127999,
"grad_norm": 1.6875,
"learning_rate": 7.550741474748856e-07,
"loss": 1.5308,
"step": 13950
},
{
"epoch": 0.7925480632904214,
"grad_norm": 1.7265625,
"learning_rate": 7.499487459851022e-07,
"loss": 1.528,
"step": 13975
},
{
"epoch": 0.7939658594680429,
"grad_norm": 1.6015625,
"learning_rate": 7.448233444953189e-07,
"loss": 1.5413,
"step": 14000
},
{
"epoch": 0.7953836556456644,
"grad_norm": 1.7109375,
"learning_rate": 7.396979430055355e-07,
"loss": 1.5175,
"step": 14025
},
{
"epoch": 0.7968014518232859,
"grad_norm": 1.59375,
"learning_rate": 7.345725415157522e-07,
"loss": 1.5351,
"step": 14050
},
{
"epoch": 0.7982192480009074,
"grad_norm": 1.6640625,
"learning_rate": 7.294471400259687e-07,
"loss": 1.544,
"step": 14075
},
{
"epoch": 0.7996370441785289,
"grad_norm": 1.671875,
"learning_rate": 7.243217385361854e-07,
"loss": 1.5715,
"step": 14100
},
{
"epoch": 0.8010548403561504,
"grad_norm": 1.765625,
"learning_rate": 7.191963370464019e-07,
"loss": 1.5418,
"step": 14125
},
{
"epoch": 0.8024726365337719,
"grad_norm": 1.71875,
"learning_rate": 7.140709355566186e-07,
"loss": 1.5358,
"step": 14150
},
{
"epoch": 0.8038904327113934,
"grad_norm": 1.6875,
"learning_rate": 7.089455340668352e-07,
"loss": 1.5244,
"step": 14175
},
{
"epoch": 0.8053082288890149,
"grad_norm": 1.6875,
"learning_rate": 7.038201325770519e-07,
"loss": 1.5106,
"step": 14200
},
{
"epoch": 0.8067260250666364,
"grad_norm": 1.4453125,
"learning_rate": 6.986947310872686e-07,
"loss": 1.5303,
"step": 14225
},
{
"epoch": 0.8081438212442579,
"grad_norm": 1.71875,
"learning_rate": 6.935693295974851e-07,
"loss": 1.5587,
"step": 14250
},
{
"epoch": 0.8095616174218794,
"grad_norm": 1.7578125,
"learning_rate": 6.884439281077018e-07,
"loss": 1.5189,
"step": 14275
},
{
"epoch": 0.810979413599501,
"grad_norm": 1.53125,
"learning_rate": 6.833185266179184e-07,
"loss": 1.5448,
"step": 14300
},
{
"epoch": 0.8123972097771225,
"grad_norm": 1.6015625,
"learning_rate": 6.781931251281351e-07,
"loss": 1.5461,
"step": 14325
},
{
"epoch": 0.813815005954744,
"grad_norm": 1.6640625,
"learning_rate": 6.730677236383517e-07,
"loss": 1.5511,
"step": 14350
},
{
"epoch": 0.8152328021323655,
"grad_norm": 1.8046875,
"learning_rate": 6.679423221485683e-07,
"loss": 1.5455,
"step": 14375
},
{
"epoch": 0.816650598309987,
"grad_norm": 1.5390625,
"learning_rate": 6.62816920658785e-07,
"loss": 1.5559,
"step": 14400
},
{
"epoch": 0.8180683944876085,
"grad_norm": 1.859375,
"learning_rate": 6.576915191690015e-07,
"loss": 1.5264,
"step": 14425
},
{
"epoch": 0.81948619066523,
"grad_norm": 1.609375,
"learning_rate": 6.525661176792182e-07,
"loss": 1.5374,
"step": 14450
},
{
"epoch": 0.8209039868428515,
"grad_norm": 1.5546875,
"learning_rate": 6.474407161894348e-07,
"loss": 1.5376,
"step": 14475
},
{
"epoch": 0.822321783020473,
"grad_norm": 1.703125,
"learning_rate": 6.423153146996515e-07,
"loss": 1.5514,
"step": 14500
},
{
"epoch": 0.8237395791980945,
"grad_norm": 1.515625,
"learning_rate": 6.371899132098681e-07,
"loss": 1.5478,
"step": 14525
},
{
"epoch": 0.825157375375716,
"grad_norm": 1.59375,
"learning_rate": 6.320645117200848e-07,
"loss": 1.5213,
"step": 14550
},
{
"epoch": 0.8265751715533375,
"grad_norm": 1.609375,
"learning_rate": 6.269391102303015e-07,
"loss": 1.544,
"step": 14575
},
{
"epoch": 0.827992967730959,
"grad_norm": 1.734375,
"learning_rate": 6.21813708740518e-07,
"loss": 1.5342,
"step": 14600
},
{
"epoch": 0.8294107639085805,
"grad_norm": 1.515625,
"learning_rate": 6.166883072507347e-07,
"loss": 1.5428,
"step": 14625
},
{
"epoch": 0.830828560086202,
"grad_norm": 1.578125,
"learning_rate": 6.115629057609513e-07,
"loss": 1.5419,
"step": 14650
},
{
"epoch": 0.8322463562638235,
"grad_norm": 1.6640625,
"learning_rate": 6.064375042711679e-07,
"loss": 1.5475,
"step": 14675
},
{
"epoch": 0.833664152441445,
"grad_norm": 1.5234375,
"learning_rate": 6.013121027813845e-07,
"loss": 1.5467,
"step": 14700
},
{
"epoch": 0.8350819486190665,
"grad_norm": 1.5078125,
"learning_rate": 5.961867012916012e-07,
"loss": 1.5364,
"step": 14725
},
{
"epoch": 0.836499744796688,
"grad_norm": 1.7578125,
"learning_rate": 5.910612998018179e-07,
"loss": 1.5297,
"step": 14750
},
{
"epoch": 0.8379175409743095,
"grad_norm": 1.5546875,
"learning_rate": 5.859358983120344e-07,
"loss": 1.5556,
"step": 14775
},
{
"epoch": 0.8393353371519311,
"grad_norm": 1.75,
"learning_rate": 5.808104968222511e-07,
"loss": 1.5352,
"step": 14800
},
{
"epoch": 0.8407531333295526,
"grad_norm": 1.5,
"learning_rate": 5.756850953324677e-07,
"loss": 1.5577,
"step": 14825
},
{
"epoch": 0.8421709295071741,
"grad_norm": 1.4296875,
"learning_rate": 5.705596938426844e-07,
"loss": 1.5562,
"step": 14850
},
{
"epoch": 0.8435887256847956,
"grad_norm": 1.6796875,
"learning_rate": 5.65434292352901e-07,
"loss": 1.5538,
"step": 14875
},
{
"epoch": 0.8450065218624171,
"grad_norm": 1.75,
"learning_rate": 5.603088908631177e-07,
"loss": 1.5314,
"step": 14900
},
{
"epoch": 0.8464243180400386,
"grad_norm": 1.546875,
"learning_rate": 5.551834893733342e-07,
"loss": 1.5466,
"step": 14925
},
{
"epoch": 0.8478421142176601,
"grad_norm": 1.5234375,
"learning_rate": 5.500580878835509e-07,
"loss": 1.5408,
"step": 14950
},
{
"epoch": 0.8492599103952816,
"grad_norm": 1.7265625,
"learning_rate": 5.449326863937675e-07,
"loss": 1.5476,
"step": 14975
},
{
"epoch": 0.8506777065729031,
"grad_norm": 1.703125,
"learning_rate": 5.398072849039841e-07,
"loss": 1.5487,
"step": 15000
},
{
"epoch": 0.8520955027505246,
"grad_norm": 1.5078125,
"learning_rate": 5.346818834142008e-07,
"loss": 1.5147,
"step": 15025
},
{
"epoch": 0.8535132989281461,
"grad_norm": 1.4296875,
"learning_rate": 5.295564819244174e-07,
"loss": 1.5342,
"step": 15050
},
{
"epoch": 0.8549310951057676,
"grad_norm": 1.640625,
"learning_rate": 5.244310804346341e-07,
"loss": 1.5487,
"step": 15075
},
{
"epoch": 0.8563488912833891,
"grad_norm": 1.5078125,
"learning_rate": 5.193056789448507e-07,
"loss": 1.5365,
"step": 15100
},
{
"epoch": 0.8577666874610106,
"grad_norm": 1.6484375,
"learning_rate": 5.141802774550673e-07,
"loss": 1.5347,
"step": 15125
},
{
"epoch": 0.8591844836386321,
"grad_norm": 1.8671875,
"learning_rate": 5.09054875965284e-07,
"loss": 1.5168,
"step": 15150
},
{
"epoch": 0.8606022798162536,
"grad_norm": 1.5390625,
"learning_rate": 5.039294744755006e-07,
"loss": 1.5616,
"step": 15175
},
{
"epoch": 0.8620200759938751,
"grad_norm": 1.65625,
"learning_rate": 4.988040729857173e-07,
"loss": 1.5387,
"step": 15200
},
{
"epoch": 0.8634378721714966,
"grad_norm": 1.5703125,
"learning_rate": 4.936786714959338e-07,
"loss": 1.533,
"step": 15225
},
{
"epoch": 0.8648556683491181,
"grad_norm": 1.578125,
"learning_rate": 4.885532700061505e-07,
"loss": 1.5412,
"step": 15250
},
{
"epoch": 0.8662734645267396,
"grad_norm": 1.59375,
"learning_rate": 4.834278685163671e-07,
"loss": 1.5347,
"step": 15275
},
{
"epoch": 0.8676912607043611,
"grad_norm": 1.8125,
"learning_rate": 4.783024670265837e-07,
"loss": 1.5393,
"step": 15300
},
{
"epoch": 0.8691090568819827,
"grad_norm": 1.5,
"learning_rate": 4.7317706553680035e-07,
"loss": 1.5254,
"step": 15325
},
{
"epoch": 0.8705268530596042,
"grad_norm": 1.5078125,
"learning_rate": 4.68051664047017e-07,
"loss": 1.5425,
"step": 15350
},
{
"epoch": 0.8719446492372257,
"grad_norm": 1.390625,
"learning_rate": 4.6292626255723367e-07,
"loss": 1.5319,
"step": 15375
},
{
"epoch": 0.8733624454148472,
"grad_norm": 1.671875,
"learning_rate": 4.5780086106745033e-07,
"loss": 1.531,
"step": 15400
},
{
"epoch": 0.8747802415924687,
"grad_norm": 1.5390625,
"learning_rate": 4.5267545957766693e-07,
"loss": 1.5329,
"step": 15425
},
{
"epoch": 0.8761980377700902,
"grad_norm": 1.671875,
"learning_rate": 4.475500580878836e-07,
"loss": 1.5389,
"step": 15450
},
{
"epoch": 0.8776158339477117,
"grad_norm": 1.640625,
"learning_rate": 4.4242465659810014e-07,
"loss": 1.5516,
"step": 15475
},
{
"epoch": 0.8790336301253332,
"grad_norm": 1.6484375,
"learning_rate": 4.372992551083168e-07,
"loss": 1.5463,
"step": 15500
},
{
"epoch": 0.8804514263029547,
"grad_norm": 1.5390625,
"learning_rate": 4.3217385361853346e-07,
"loss": 1.5105,
"step": 15525
},
{
"epoch": 0.8818692224805762,
"grad_norm": 1.6953125,
"learning_rate": 4.270484521287501e-07,
"loss": 1.5498,
"step": 15550
},
{
"epoch": 0.8832870186581977,
"grad_norm": 1.7265625,
"learning_rate": 4.219230506389667e-07,
"loss": 1.5456,
"step": 15575
},
{
"epoch": 0.8847048148358192,
"grad_norm": 1.7265625,
"learning_rate": 4.167976491491834e-07,
"loss": 1.5473,
"step": 15600
},
{
"epoch": 0.8861226110134407,
"grad_norm": 1.6015625,
"learning_rate": 4.1167224765940004e-07,
"loss": 1.549,
"step": 15625
},
{
"epoch": 0.8875404071910622,
"grad_norm": 1.6484375,
"learning_rate": 4.065468461696166e-07,
"loss": 1.5346,
"step": 15650
},
{
"epoch": 0.8889582033686837,
"grad_norm": 1.515625,
"learning_rate": 4.0142144467983325e-07,
"loss": 1.5345,
"step": 15675
},
{
"epoch": 0.8903759995463052,
"grad_norm": 1.53125,
"learning_rate": 3.962960431900499e-07,
"loss": 1.5251,
"step": 15700
},
{
"epoch": 0.8917937957239267,
"grad_norm": 1.546875,
"learning_rate": 3.911706417002665e-07,
"loss": 1.5184,
"step": 15725
},
{
"epoch": 0.8932115919015482,
"grad_norm": 1.59375,
"learning_rate": 3.8604524021048317e-07,
"loss": 1.5245,
"step": 15750
},
{
"epoch": 0.8946293880791697,
"grad_norm": 1.515625,
"learning_rate": 3.8091983872069983e-07,
"loss": 1.5386,
"step": 15775
},
{
"epoch": 0.8960471842567912,
"grad_norm": 1.6328125,
"learning_rate": 3.757944372309165e-07,
"loss": 1.5282,
"step": 15800
},
{
"epoch": 0.8974649804344127,
"grad_norm": 1.6640625,
"learning_rate": 3.706690357411331e-07,
"loss": 1.5082,
"step": 15825
},
{
"epoch": 0.8988827766120343,
"grad_norm": 1.78125,
"learning_rate": 3.655436342513497e-07,
"loss": 1.5486,
"step": 15850
},
{
"epoch": 0.9003005727896558,
"grad_norm": 1.5,
"learning_rate": 3.604182327615663e-07,
"loss": 1.5204,
"step": 15875
},
{
"epoch": 0.9017183689672773,
"grad_norm": 1.78125,
"learning_rate": 3.5529283127178296e-07,
"loss": 1.5422,
"step": 15900
},
{
"epoch": 0.9031361651448988,
"grad_norm": 1.59375,
"learning_rate": 3.501674297819996e-07,
"loss": 1.5431,
"step": 15925
},
{
"epoch": 0.9045539613225203,
"grad_norm": 1.65625,
"learning_rate": 3.450420282922162e-07,
"loss": 1.537,
"step": 15950
},
{
"epoch": 0.9059717575001418,
"grad_norm": 1.640625,
"learning_rate": 3.399166268024329e-07,
"loss": 1.5505,
"step": 15975
},
{
"epoch": 0.9073895536777633,
"grad_norm": 1.4765625,
"learning_rate": 3.347912253126495e-07,
"loss": 1.5466,
"step": 16000
},
{
"epoch": 0.9088073498553848,
"grad_norm": 1.53125,
"learning_rate": 3.296658238228661e-07,
"loss": 1.5434,
"step": 16025
},
{
"epoch": 0.9102251460330063,
"grad_norm": 1.5625,
"learning_rate": 3.2454042233308275e-07,
"loss": 1.5351,
"step": 16050
},
{
"epoch": 0.9116429422106278,
"grad_norm": 1.5625,
"learning_rate": 3.194150208432994e-07,
"loss": 1.5692,
"step": 16075
},
{
"epoch": 0.9130607383882493,
"grad_norm": 1.9921875,
"learning_rate": 3.1428961935351607e-07,
"loss": 1.5178,
"step": 16100
},
{
"epoch": 0.9144785345658708,
"grad_norm": 1.7265625,
"learning_rate": 3.0916421786373267e-07,
"loss": 1.5542,
"step": 16125
},
{
"epoch": 0.9158963307434923,
"grad_norm": 1.6171875,
"learning_rate": 3.040388163739493e-07,
"loss": 1.5211,
"step": 16150
},
{
"epoch": 0.9173141269211138,
"grad_norm": 1.5390625,
"learning_rate": 2.9891341488416594e-07,
"loss": 1.538,
"step": 16175
},
{
"epoch": 0.9187319230987353,
"grad_norm": 1.5390625,
"learning_rate": 2.9378801339438254e-07,
"loss": 1.5463,
"step": 16200
},
{
"epoch": 0.9201497192763568,
"grad_norm": 1.515625,
"learning_rate": 2.886626119045992e-07,
"loss": 1.5322,
"step": 16225
},
{
"epoch": 0.9215675154539783,
"grad_norm": 1.6640625,
"learning_rate": 2.8353721041481586e-07,
"loss": 1.554,
"step": 16250
},
{
"epoch": 0.9229853116315998,
"grad_norm": 1.53125,
"learning_rate": 2.7841180892503246e-07,
"loss": 1.5234,
"step": 16275
},
{
"epoch": 0.9244031078092213,
"grad_norm": 1.5,
"learning_rate": 2.7328640743524907e-07,
"loss": 1.5341,
"step": 16300
},
{
"epoch": 0.9258209039868428,
"grad_norm": 1.7109375,
"learning_rate": 2.681610059454657e-07,
"loss": 1.5332,
"step": 16325
},
{
"epoch": 0.9272387001644643,
"grad_norm": 1.7265625,
"learning_rate": 2.630356044556824e-07,
"loss": 1.548,
"step": 16350
},
{
"epoch": 0.9286564963420859,
"grad_norm": 1.59375,
"learning_rate": 2.57910202965899e-07,
"loss": 1.5414,
"step": 16375
},
{
"epoch": 0.9300742925197074,
"grad_norm": 1.5859375,
"learning_rate": 2.5278480147611565e-07,
"loss": 1.5336,
"step": 16400
},
{
"epoch": 0.9314920886973289,
"grad_norm": 1.6328125,
"learning_rate": 2.4765939998633225e-07,
"loss": 1.566,
"step": 16425
},
{
"epoch": 0.9329098848749504,
"grad_norm": 1.6015625,
"learning_rate": 2.425339984965489e-07,
"loss": 1.5279,
"step": 16450
},
{
"epoch": 0.9343276810525719,
"grad_norm": 1.59375,
"learning_rate": 2.3740859700676552e-07,
"loss": 1.5338,
"step": 16475
},
{
"epoch": 0.9357454772301934,
"grad_norm": 1.5703125,
"learning_rate": 2.3228319551698217e-07,
"loss": 1.5625,
"step": 16500
},
{
"epoch": 0.9371632734078149,
"grad_norm": 1.5,
"learning_rate": 2.271577940271988e-07,
"loss": 1.5272,
"step": 16525
},
{
"epoch": 0.9385810695854364,
"grad_norm": 1.671875,
"learning_rate": 2.220323925374154e-07,
"loss": 1.5431,
"step": 16550
},
{
"epoch": 0.9399988657630579,
"grad_norm": 1.5078125,
"learning_rate": 2.1690699104763207e-07,
"loss": 1.5529,
"step": 16575
},
{
"epoch": 0.9414166619406794,
"grad_norm": 1.71875,
"learning_rate": 2.117815895578487e-07,
"loss": 1.5358,
"step": 16600
},
{
"epoch": 0.9428344581183009,
"grad_norm": 1.7109375,
"learning_rate": 2.0665618806806536e-07,
"loss": 1.5334,
"step": 16625
},
{
"epoch": 0.9442522542959224,
"grad_norm": 1.59375,
"learning_rate": 2.0153078657828196e-07,
"loss": 1.5475,
"step": 16650
},
{
"epoch": 0.9456700504735439,
"grad_norm": 1.578125,
"learning_rate": 1.964053850884986e-07,
"loss": 1.5342,
"step": 16675
},
{
"epoch": 0.9470878466511654,
"grad_norm": 1.65625,
"learning_rate": 1.9127998359871525e-07,
"loss": 1.5595,
"step": 16700
},
{
"epoch": 0.9485056428287869,
"grad_norm": 1.4765625,
"learning_rate": 1.8615458210893189e-07,
"loss": 1.5124,
"step": 16725
},
{
"epoch": 0.9499234390064084,
"grad_norm": 1.5234375,
"learning_rate": 1.810291806191485e-07,
"loss": 1.5265,
"step": 16750
},
{
"epoch": 0.9513412351840299,
"grad_norm": 1.6875,
"learning_rate": 1.7590377912936515e-07,
"loss": 1.5528,
"step": 16775
},
{
"epoch": 0.9527590313616514,
"grad_norm": 1.453125,
"learning_rate": 1.7077837763958178e-07,
"loss": 1.5389,
"step": 16800
},
{
"epoch": 0.9541768275392729,
"grad_norm": 1.6015625,
"learning_rate": 1.656529761497984e-07,
"loss": 1.5352,
"step": 16825
},
{
"epoch": 0.9555946237168944,
"grad_norm": 1.8359375,
"learning_rate": 1.6052757466001504e-07,
"loss": 1.544,
"step": 16850
},
{
"epoch": 0.957012419894516,
"grad_norm": 1.75,
"learning_rate": 1.5540217317023168e-07,
"loss": 1.521,
"step": 16875
},
{
"epoch": 0.9584302160721375,
"grad_norm": 1.6015625,
"learning_rate": 1.502767716804483e-07,
"loss": 1.5307,
"step": 16900
},
{
"epoch": 0.959848012249759,
"grad_norm": 1.5390625,
"learning_rate": 1.4515137019066494e-07,
"loss": 1.5331,
"step": 16925
},
{
"epoch": 0.9612658084273805,
"grad_norm": 1.640625,
"learning_rate": 1.4002596870088157e-07,
"loss": 1.5488,
"step": 16950
},
{
"epoch": 0.962683604605002,
"grad_norm": 1.5,
"learning_rate": 1.349005672110982e-07,
"loss": 1.5285,
"step": 16975
},
{
"epoch": 0.9641014007826235,
"grad_norm": 1.5078125,
"learning_rate": 1.2977516572131486e-07,
"loss": 1.5377,
"step": 17000
},
{
"epoch": 0.965519196960245,
"grad_norm": 1.6171875,
"learning_rate": 1.2464976423153147e-07,
"loss": 1.5591,
"step": 17025
},
{
"epoch": 0.9669369931378665,
"grad_norm": 1.703125,
"learning_rate": 1.195243627417481e-07,
"loss": 1.5427,
"step": 17050
},
{
"epoch": 0.968354789315488,
"grad_norm": 1.578125,
"learning_rate": 1.1439896125196474e-07,
"loss": 1.5266,
"step": 17075
},
{
"epoch": 0.9697725854931095,
"grad_norm": 1.7421875,
"learning_rate": 1.0927355976218137e-07,
"loss": 1.535,
"step": 17100
},
{
"epoch": 0.971190381670731,
"grad_norm": 1.5703125,
"learning_rate": 1.0414815827239802e-07,
"loss": 1.5384,
"step": 17125
},
{
"epoch": 0.9726081778483525,
"grad_norm": 1.5390625,
"learning_rate": 9.902275678261464e-08,
"loss": 1.5588,
"step": 17150
},
{
"epoch": 0.974025974025974,
"grad_norm": 1.6484375,
"learning_rate": 9.389735529283128e-08,
"loss": 1.5508,
"step": 17175
},
{
"epoch": 0.9754437702035955,
"grad_norm": 1.6953125,
"learning_rate": 8.877195380304791e-08,
"loss": 1.5335,
"step": 17200
},
{
"epoch": 0.976861566381217,
"grad_norm": 1.65625,
"learning_rate": 8.364655231326454e-08,
"loss": 1.5481,
"step": 17225
},
{
"epoch": 0.9782793625588385,
"grad_norm": 1.65625,
"learning_rate": 7.852115082348118e-08,
"loss": 1.5226,
"step": 17250
},
{
"epoch": 0.97969715873646,
"grad_norm": 1.5078125,
"learning_rate": 7.339574933369781e-08,
"loss": 1.5397,
"step": 17275
},
{
"epoch": 0.9811149549140815,
"grad_norm": 1.515625,
"learning_rate": 6.827034784391444e-08,
"loss": 1.5393,
"step": 17300
},
{
"epoch": 0.982532751091703,
"grad_norm": 1.546875,
"learning_rate": 6.314494635413107e-08,
"loss": 1.5498,
"step": 17325
},
{
"epoch": 0.9839505472693245,
"grad_norm": 1.6015625,
"learning_rate": 5.801954486434771e-08,
"loss": 1.5442,
"step": 17350
},
{
"epoch": 0.985368343446946,
"grad_norm": 1.6640625,
"learning_rate": 5.289414337456434e-08,
"loss": 1.5461,
"step": 17375
},
{
"epoch": 0.9867861396245676,
"grad_norm": 1.5703125,
"learning_rate": 4.776874188478098e-08,
"loss": 1.5237,
"step": 17400
},
{
"epoch": 0.9882039358021891,
"grad_norm": 1.90625,
"learning_rate": 4.264334039499761e-08,
"loss": 1.5218,
"step": 17425
},
{
"epoch": 0.9896217319798106,
"grad_norm": 1.5703125,
"learning_rate": 3.751793890521424e-08,
"loss": 1.5385,
"step": 17450
},
{
"epoch": 0.9910395281574321,
"grad_norm": 1.640625,
"learning_rate": 3.239253741543088e-08,
"loss": 1.5342,
"step": 17475
},
{
"epoch": 0.9924573243350536,
"grad_norm": 1.4921875,
"learning_rate": 2.726713592564751e-08,
"loss": 1.5421,
"step": 17500
},
{
"epoch": 0.9938751205126751,
"grad_norm": 1.5859375,
"learning_rate": 2.2141734435864145e-08,
"loss": 1.538,
"step": 17525
},
{
"epoch": 0.9952929166902966,
"grad_norm": 1.5625,
"learning_rate": 1.7016332946080776e-08,
"loss": 1.5478,
"step": 17550
},
{
"epoch": 0.9967107128679181,
"grad_norm": 1.6171875,
"learning_rate": 1.1890931456297411e-08,
"loss": 1.5367,
"step": 17575
},
{
"epoch": 0.9981285090455396,
"grad_norm": 1.7578125,
"learning_rate": 6.765529966514044e-09,
"loss": 1.5193,
"step": 17600
},
{
"epoch": 0.9995463052231611,
"grad_norm": 1.7578125,
"learning_rate": 1.6401284767306775e-09,
"loss": 1.5537,
"step": 17625
}
],
"logging_steps": 25,
"max_steps": 17633,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.571044232598127e+18,
"train_batch_size": 10,
"trial_name": null,
"trial_params": null
}