{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 11.913669064748202,
"eval_steps": 500,
"global_step": 828,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014388489208633094,
"grad_norm": 3.779401339094513,
"learning_rate": 8.000000000000001e-07,
"loss": 3.8262,
"step": 1
},
{
"epoch": 0.02877697841726619,
"grad_norm": 10.455825805969267,
"learning_rate": 1.6000000000000001e-06,
"loss": 6.7511,
"step": 2
},
{
"epoch": 0.04316546762589928,
"grad_norm": 5.6185654464107815,
"learning_rate": 2.4000000000000003e-06,
"loss": 4.6228,
"step": 3
},
{
"epoch": 0.05755395683453238,
"grad_norm": 4.885801768876069,
"learning_rate": 3.2000000000000003e-06,
"loss": 4.2241,
"step": 4
},
{
"epoch": 0.07194244604316546,
"grad_norm": 10.403297492152396,
"learning_rate": 4.000000000000001e-06,
"loss": 6.7416,
"step": 5
},
{
"epoch": 0.08633093525179857,
"grad_norm": 4.5044178093473395,
"learning_rate": 4.800000000000001e-06,
"loss": 4.1637,
"step": 6
},
{
"epoch": 0.10071942446043165,
"grad_norm": 4.734693138114069,
"learning_rate": 5.600000000000001e-06,
"loss": 4.3628,
"step": 7
},
{
"epoch": 0.11510791366906475,
"grad_norm": 4.288727085080122,
"learning_rate": 6.4000000000000006e-06,
"loss": 4.0006,
"step": 8
},
{
"epoch": 0.12949640287769784,
"grad_norm": 6.746488042337733,
"learning_rate": 7.2000000000000005e-06,
"loss": 5.1247,
"step": 9
},
{
"epoch": 0.14388489208633093,
"grad_norm": 4.730837471611876,
"learning_rate": 8.000000000000001e-06,
"loss": 4.167,
"step": 10
},
{
"epoch": 0.15827338129496402,
"grad_norm": 5.617511430019142,
"learning_rate": 8.8e-06,
"loss": 4.8073,
"step": 11
},
{
"epoch": 0.17266187050359713,
"grad_norm": 4.585866335374106,
"learning_rate": 9.600000000000001e-06,
"loss": 4.1609,
"step": 12
},
{
"epoch": 0.18705035971223022,
"grad_norm": 11.787607704302536,
"learning_rate": 1.04e-05,
"loss": 7.1837,
"step": 13
},
{
"epoch": 0.2014388489208633,
"grad_norm": 4.123753478777725,
"learning_rate": 1.1200000000000001e-05,
"loss": 3.9041,
"step": 14
},
{
"epoch": 0.2158273381294964,
"grad_norm": 3.8762103667223227,
"learning_rate": 1.2e-05,
"loss": 3.7087,
"step": 15
},
{
"epoch": 0.2302158273381295,
"grad_norm": 4.21611793164487,
"learning_rate": 1.2800000000000001e-05,
"loss": 3.8878,
"step": 16
},
{
"epoch": 0.2446043165467626,
"grad_norm": 4.614901815617855,
"learning_rate": 1.3600000000000002e-05,
"loss": 4.098,
"step": 17
},
{
"epoch": 0.2589928057553957,
"grad_norm": 3.8977663337164286,
"learning_rate": 1.4400000000000001e-05,
"loss": 3.6943,
"step": 18
},
{
"epoch": 0.2733812949640288,
"grad_norm": 4.8023114000216465,
"learning_rate": 1.5200000000000002e-05,
"loss": 4.0675,
"step": 19
},
{
"epoch": 0.28776978417266186,
"grad_norm": 4.371540379053842,
"learning_rate": 1.6000000000000003e-05,
"loss": 3.8599,
"step": 20
},
{
"epoch": 0.302158273381295,
"grad_norm": 5.726889530637721,
"learning_rate": 1.6800000000000002e-05,
"loss": 4.5061,
"step": 21
},
{
"epoch": 0.31654676258992803,
"grad_norm": 3.9969844690452887,
"learning_rate": 1.76e-05,
"loss": 3.6472,
"step": 22
},
{
"epoch": 0.33093525179856115,
"grad_norm": 11.516154261012499,
"learning_rate": 1.8400000000000003e-05,
"loss": 6.2924,
"step": 23
},
{
"epoch": 0.34532374100719426,
"grad_norm": 5.202990002676231,
"learning_rate": 1.9200000000000003e-05,
"loss": 4.0705,
"step": 24
},
{
"epoch": 0.3597122302158273,
"grad_norm": 4.176782643264017,
"learning_rate": 2e-05,
"loss": 3.5909,
"step": 25
},
{
"epoch": 0.37410071942446044,
"grad_norm": 6.424896250724197,
"learning_rate": 1.9999923468873635e-05,
"loss": 4.2519,
"step": 26
},
{
"epoch": 0.38848920863309355,
"grad_norm": 3.814366825221532,
"learning_rate": 1.999969387666594e-05,
"loss": 3.2879,
"step": 27
},
{
"epoch": 0.4028776978417266,
"grad_norm": 10.112357856323287,
"learning_rate": 1.9999311226891104e-05,
"loss": 5.1954,
"step": 28
},
{
"epoch": 0.4172661870503597,
"grad_norm": 5.7174917457637795,
"learning_rate": 1.999877552540605e-05,
"loss": 3.7522,
"step": 29
},
{
"epoch": 0.4316546762589928,
"grad_norm": 10.763244722760309,
"learning_rate": 1.9998086780410353e-05,
"loss": 4.8527,
"step": 30
},
{
"epoch": 0.4460431654676259,
"grad_norm": 3.982452930852662,
"learning_rate": 1.999724500244609e-05,
"loss": 3.1306,
"step": 31
},
{
"epoch": 0.460431654676259,
"grad_norm": 5.709026877118652,
"learning_rate": 1.999625020439771e-05,
"loss": 3.3423,
"step": 32
},
{
"epoch": 0.4748201438848921,
"grad_norm": 6.659790318204302,
"learning_rate": 1.999510240149181e-05,
"loss": 3.4041,
"step": 33
},
{
"epoch": 0.4892086330935252,
"grad_norm": 4.7853395869132545,
"learning_rate": 1.9993801611296923e-05,
"loss": 2.985,
"step": 34
},
{
"epoch": 0.5035971223021583,
"grad_norm": 4.922928181710041,
"learning_rate": 1.999234785372324e-05,
"loss": 2.8203,
"step": 35
},
{
"epoch": 0.5179856115107914,
"grad_norm": 5.760568951577959,
"learning_rate": 1.9990741151022302e-05,
"loss": 2.6983,
"step": 36
},
{
"epoch": 0.5323741007194245,
"grad_norm": 4.4691409045575075,
"learning_rate": 1.9988981527786656e-05,
"loss": 2.5688,
"step": 37
},
{
"epoch": 0.5467625899280576,
"grad_norm": 7.274174017956967,
"learning_rate": 1.99870690109495e-05,
"loss": 2.6621,
"step": 38
},
{
"epoch": 0.5611510791366906,
"grad_norm": 4.569137013282271,
"learning_rate": 1.9985003629784237e-05,
"loss": 2.5249,
"step": 39
},
{
"epoch": 0.5755395683453237,
"grad_norm": 2.929731882279866,
"learning_rate": 1.9982785415904063e-05,
"loss": 2.4861,
"step": 40
},
{
"epoch": 0.5899280575539568,
"grad_norm": 3.7590625764657815,
"learning_rate": 1.998041440326146e-05,
"loss": 2.3703,
"step": 41
},
{
"epoch": 0.60431654676259,
"grad_norm": 3.113972586690868,
"learning_rate": 1.9977890628147684e-05,
"loss": 2.3579,
"step": 42
},
{
"epoch": 0.6187050359712231,
"grad_norm": 4.310115895361215,
"learning_rate": 1.99752141291922e-05,
"loss": 2.4436,
"step": 43
},
{
"epoch": 0.6330935251798561,
"grad_norm": 3.609141873223579,
"learning_rate": 1.99723849473621e-05,
"loss": 2.3687,
"step": 44
},
{
"epoch": 0.6474820143884892,
"grad_norm": 2.889773990288414,
"learning_rate": 1.996940312596149e-05,
"loss": 2.2521,
"step": 45
},
{
"epoch": 0.6618705035971223,
"grad_norm": 2.8196363081660745,
"learning_rate": 1.9966268710630795e-05,
"loss": 2.1572,
"step": 46
},
{
"epoch": 0.6762589928057554,
"grad_norm": 3.5289128758260255,
"learning_rate": 1.996298174934608e-05,
"loss": 2.2095,
"step": 47
},
{
"epoch": 0.6906474820143885,
"grad_norm": 4.166486783655687,
"learning_rate": 1.9959542292418317e-05,
"loss": 2.0916,
"step": 48
},
{
"epoch": 0.7050359712230215,
"grad_norm": 3.6766424577128047,
"learning_rate": 1.9955950392492604e-05,
"loss": 2.0578,
"step": 49
},
{
"epoch": 0.7194244604316546,
"grad_norm": 2.989826753407446,
"learning_rate": 1.9952206104547378e-05,
"loss": 2.0855,
"step": 50
},
{
"epoch": 0.7338129496402878,
"grad_norm": 6.40215562149978,
"learning_rate": 1.994830948589355e-05,
"loss": 1.8788,
"step": 51
},
{
"epoch": 0.7482014388489209,
"grad_norm": 4.072155747218333,
"learning_rate": 1.9944260596173642e-05,
"loss": 1.9819,
"step": 52
},
{
"epoch": 0.762589928057554,
"grad_norm": 3.724293840264578,
"learning_rate": 1.9940059497360874e-05,
"loss": 1.8445,
"step": 53
},
{
"epoch": 0.7769784172661871,
"grad_norm": 3.406626514519604,
"learning_rate": 1.9935706253758206e-05,
"loss": 1.9222,
"step": 54
},
{
"epoch": 0.7913669064748201,
"grad_norm": 3.3579698036375034,
"learning_rate": 1.9931200931997372e-05,
"loss": 1.716,
"step": 55
},
{
"epoch": 0.8057553956834532,
"grad_norm": 3.430514351410994,
"learning_rate": 1.9926543601037843e-05,
"loss": 1.795,
"step": 56
},
{
"epoch": 0.8201438848920863,
"grad_norm": 3.0933971993062017,
"learning_rate": 1.992173433216577e-05,
"loss": 1.6326,
"step": 57
},
{
"epoch": 0.8345323741007195,
"grad_norm": 2.658306477244865,
"learning_rate": 1.99167731989929e-05,
"loss": 1.8422,
"step": 58
},
{
"epoch": 0.8489208633093526,
"grad_norm": 2.609103199735189,
"learning_rate": 1.9911660277455473e-05,
"loss": 1.7832,
"step": 59
},
{
"epoch": 0.8633093525179856,
"grad_norm": 3.0489907374135896,
"learning_rate": 1.9906395645813e-05,
"loss": 1.5908,
"step": 60
},
{
"epoch": 0.8776978417266187,
"grad_norm": 2.9154327708322803,
"learning_rate": 1.990097938464713e-05,
"loss": 1.5599,
"step": 61
},
{
"epoch": 0.8920863309352518,
"grad_norm": 2.6855178783593328,
"learning_rate": 1.989541157686037e-05,
"loss": 1.6876,
"step": 62
},
{
"epoch": 0.9064748201438849,
"grad_norm": 2.410132159045943,
"learning_rate": 1.9889692307674847e-05,
"loss": 1.5693,
"step": 63
},
{
"epoch": 0.920863309352518,
"grad_norm": 2.6119620830871244,
"learning_rate": 1.9883821664630977e-05,
"loss": 1.4076,
"step": 64
},
{
"epoch": 0.935251798561151,
"grad_norm": 2.5077319176478996,
"learning_rate": 1.987779973758615e-05,
"loss": 1.5181,
"step": 65
},
{
"epoch": 0.9496402877697842,
"grad_norm": 2.8061485591127266,
"learning_rate": 1.987162661871333e-05,
"loss": 1.3751,
"step": 66
},
{
"epoch": 0.9640287769784173,
"grad_norm": 2.685195543198549,
"learning_rate": 1.986530240249968e-05,
"loss": 1.3248,
"step": 67
},
{
"epoch": 0.9784172661870504,
"grad_norm": 2.6515044506712546,
"learning_rate": 1.985882718574506e-05,
"loss": 1.2112,
"step": 68
},
{
"epoch": 0.9928057553956835,
"grad_norm": 2.1678461224085193,
"learning_rate": 1.9852201067560607e-05,
"loss": 1.3792,
"step": 69
},
{
"epoch": 1.0071942446043165,
"grad_norm": 2.231749821096067,
"learning_rate": 1.984542414936718e-05,
"loss": 1.2438,
"step": 70
},
{
"epoch": 1.0215827338129497,
"grad_norm": 2.5781231491148353,
"learning_rate": 1.9838496534893807e-05,
"loss": 1.5007,
"step": 71
},
{
"epoch": 1.0359712230215827,
"grad_norm": 2.5108817806595574,
"learning_rate": 1.9831418330176127e-05,
"loss": 1.4368,
"step": 72
},
{
"epoch": 1.0503597122302157,
"grad_norm": 1.9489467948836736,
"learning_rate": 1.9824189643554724e-05,
"loss": 1.2176,
"step": 73
},
{
"epoch": 1.064748201438849,
"grad_norm": 2.431713013736794,
"learning_rate": 1.9816810585673515e-05,
"loss": 1.2662,
"step": 74
},
{
"epoch": 1.079136690647482,
"grad_norm": 2.615444743987483,
"learning_rate": 1.9809281269478015e-05,
"loss": 0.6466,
"step": 75
},
{
"epoch": 1.0935251798561152,
"grad_norm": 1.9780475992590945,
"learning_rate": 1.9801601810213634e-05,
"loss": 1.1773,
"step": 76
},
{
"epoch": 1.1079136690647482,
"grad_norm": 2.738489524134038,
"learning_rate": 1.979377232542391e-05,
"loss": 0.7522,
"step": 77
},
{
"epoch": 1.1223021582733812,
"grad_norm": 2.92031012217587,
"learning_rate": 1.9785792934948697e-05,
"loss": 1.2811,
"step": 78
},
{
"epoch": 1.1366906474820144,
"grad_norm": 2.1812951842981407,
"learning_rate": 1.9777663760922342e-05,
"loss": 1.2223,
"step": 79
},
{
"epoch": 1.1510791366906474,
"grad_norm": 1.9774480380800536,
"learning_rate": 1.976938492777182e-05,
"loss": 1.2216,
"step": 80
},
{
"epoch": 1.1654676258992807,
"grad_norm": 2.0162999343359904,
"learning_rate": 1.9760956562214808e-05,
"loss": 1.1783,
"step": 81
},
{
"epoch": 1.1798561151079137,
"grad_norm": 1.8114249309162656,
"learning_rate": 1.9752378793257777e-05,
"loss": 0.9817,
"step": 82
},
{
"epoch": 1.1942446043165469,
"grad_norm": 2.7290657585488143,
"learning_rate": 1.9743651752193983e-05,
"loss": 0.9542,
"step": 83
},
{
"epoch": 1.20863309352518,
"grad_norm": 2.017011266770035,
"learning_rate": 1.9734775572601487e-05,
"loss": 1.1217,
"step": 84
},
{
"epoch": 1.223021582733813,
"grad_norm": 1.748500187733016,
"learning_rate": 1.9725750390341093e-05,
"loss": 0.7081,
"step": 85
},
{
"epoch": 1.2374100719424461,
"grad_norm": 5.7387726390350515,
"learning_rate": 1.9716576343554274e-05,
"loss": 0.7381,
"step": 86
},
{
"epoch": 1.2517985611510791,
"grad_norm": 2.160256977147074,
"learning_rate": 1.9707253572661057e-05,
"loss": 1.0861,
"step": 87
},
{
"epoch": 1.2661870503597124,
"grad_norm": 2.4513022300810223,
"learning_rate": 1.969778222035787e-05,
"loss": 1.0924,
"step": 88
},
{
"epoch": 1.2805755395683454,
"grad_norm": 2.2964346352019116,
"learning_rate": 1.9688162431615367e-05,
"loss": 0.7906,
"step": 89
},
{
"epoch": 1.2949640287769784,
"grad_norm": 2.760514099820931,
"learning_rate": 1.9678394353676203e-05,
"loss": 1.0421,
"step": 90
},
{
"epoch": 1.3093525179856116,
"grad_norm": 2.977379886340304,
"learning_rate": 1.9668478136052776e-05,
"loss": 1.0089,
"step": 91
},
{
"epoch": 1.3237410071942446,
"grad_norm": 3.228823722579014,
"learning_rate": 1.9658413930524955e-05,
"loss": 0.882,
"step": 92
},
{
"epoch": 1.3381294964028778,
"grad_norm": 1.8879288085516621,
"learning_rate": 1.9648201891137725e-05,
"loss": 0.8884,
"step": 93
},
{
"epoch": 1.3525179856115108,
"grad_norm": 5.701861760835263,
"learning_rate": 1.963784217419887e-05,
"loss": 0.5543,
"step": 94
},
{
"epoch": 1.3669064748201438,
"grad_norm": 1.9714095423076823,
"learning_rate": 1.9627334938276547e-05,
"loss": 0.9301,
"step": 95
},
{
"epoch": 1.381294964028777,
"grad_norm": 2.440289446102772,
"learning_rate": 1.961668034419688e-05,
"loss": 0.5015,
"step": 96
},
{
"epoch": 1.39568345323741,
"grad_norm": 2.3315013817563237,
"learning_rate": 1.9605878555041484e-05,
"loss": 0.9329,
"step": 97
},
{
"epoch": 1.4100719424460433,
"grad_norm": 2.543223303151188,
"learning_rate": 1.9594929736144978e-05,
"loss": 0.981,
"step": 98
},
{
"epoch": 1.4244604316546763,
"grad_norm": 2.2968302939118486,
"learning_rate": 1.9583834055092446e-05,
"loss": 0.8583,
"step": 99
},
{
"epoch": 1.4388489208633093,
"grad_norm": 2.3678435949631287,
"learning_rate": 1.9572591681716888e-05,
"loss": 0.9773,
"step": 100
},
{
"epoch": 1.4532374100719425,
"grad_norm": 5.699515821984953,
"learning_rate": 1.95612027880966e-05,
"loss": 0.5195,
"step": 101
},
{
"epoch": 1.4676258992805755,
"grad_norm": 2.4763879942295812,
"learning_rate": 1.9549667548552557e-05,
"loss": 0.7692,
"step": 102
},
{
"epoch": 1.4820143884892087,
"grad_norm": 1.726706976599975,
"learning_rate": 1.9537986139645724e-05,
"loss": 0.7894,
"step": 103
},
{
"epoch": 1.4964028776978417,
"grad_norm": 2.880795266012809,
"learning_rate": 1.9526158740174392e-05,
"loss": 0.8268,
"step": 104
},
{
"epoch": 1.5107913669064748,
"grad_norm": 2.2837687142476737,
"learning_rate": 1.951418553117139e-05,
"loss": 0.7285,
"step": 105
},
{
"epoch": 1.5251798561151078,
"grad_norm": 2.401929061303925,
"learning_rate": 1.950206669590136e-05,
"loss": 0.7437,
"step": 106
},
{
"epoch": 1.539568345323741,
"grad_norm": 2.109686331424331,
"learning_rate": 1.9489802419857918e-05,
"loss": 0.7687,
"step": 107
},
{
"epoch": 1.5539568345323742,
"grad_norm": 2.52807406566671,
"learning_rate": 1.947739289076084e-05,
"loss": 0.7827,
"step": 108
},
{
"epoch": 1.5683453237410072,
"grad_norm": 2.399711195058357,
"learning_rate": 1.9464838298553172e-05,
"loss": 0.4237,
"step": 109
},
{
"epoch": 1.5827338129496402,
"grad_norm": 2.06327086788237,
"learning_rate": 1.9452138835398333e-05,
"loss": 0.6328,
"step": 110
},
{
"epoch": 1.5971223021582732,
"grad_norm": 1.8934017549759041,
"learning_rate": 1.9439294695677168e-05,
"loss": 0.7544,
"step": 111
},
{
"epoch": 1.6115107913669064,
"grad_norm": 1.7326175063319036,
"learning_rate": 1.9426306075984968e-05,
"loss": 0.5431,
"step": 112
},
{
"epoch": 1.6258992805755397,
"grad_norm": 2.1015742280893184,
"learning_rate": 1.9413173175128472e-05,
"loss": 0.6663,
"step": 113
},
{
"epoch": 1.6402877697841727,
"grad_norm": 2.231853898087344,
"learning_rate": 1.9399896194122824e-05,
"loss": 0.6107,
"step": 114
},
{
"epoch": 1.6546762589928057,
"grad_norm": 2.090875801066189,
"learning_rate": 1.9386475336188484e-05,
"loss": 0.6786,
"step": 115
},
{
"epoch": 1.6690647482014387,
"grad_norm": 2.3679863454534678,
"learning_rate": 1.9372910806748124e-05,
"loss": 0.5826,
"step": 116
},
{
"epoch": 1.683453237410072,
"grad_norm": 2.0143151155086585,
"learning_rate": 1.935920281342349e-05,
"loss": 0.5107,
"step": 117
},
{
"epoch": 1.6978417266187051,
"grad_norm": 2.0579871874173326,
"learning_rate": 1.934535156603222e-05,
"loss": 0.5457,
"step": 118
},
{
"epoch": 1.7122302158273381,
"grad_norm": 2.0248928133131505,
"learning_rate": 1.933135727658462e-05,
"loss": 0.5204,
"step": 119
},
{
"epoch": 1.7266187050359711,
"grad_norm": 2.143170055281987,
"learning_rate": 1.931722015928044e-05,
"loss": 0.4414,
"step": 120
},
{
"epoch": 1.7410071942446042,
"grad_norm": 3.8092005485468086,
"learning_rate": 1.930294043050558e-05,
"loss": 0.3688,
"step": 121
},
{
"epoch": 1.7553956834532374,
"grad_norm": 1.989587109704384,
"learning_rate": 1.928851830882879e-05,
"loss": 0.4322,
"step": 122
},
{
"epoch": 1.7697841726618706,
"grad_norm": 2.12538729505454,
"learning_rate": 1.9273954014998307e-05,
"loss": 0.3567,
"step": 123
},
{
"epoch": 1.7841726618705036,
"grad_norm": 2.0988722867429135,
"learning_rate": 1.92592477719385e-05,
"loss": 0.4249,
"step": 124
},
{
"epoch": 1.7985611510791366,
"grad_norm": 2.0670095637448815,
"learning_rate": 1.9244399804746436e-05,
"loss": 0.4687,
"step": 125
},
{
"epoch": 1.8129496402877698,
"grad_norm": 2.9462308640831436,
"learning_rate": 1.9229410340688442e-05,
"loss": 0.4576,
"step": 126
},
{
"epoch": 1.8273381294964028,
"grad_norm": 2.2511867293077144,
"learning_rate": 1.9214279609196632e-05,
"loss": 0.361,
"step": 127
},
{
"epoch": 1.841726618705036,
"grad_norm": 2.9578408313586224,
"learning_rate": 1.9199007841865395e-05,
"loss": 0.3939,
"step": 128
},
{
"epoch": 1.856115107913669,
"grad_norm": 1.67569117249568,
"learning_rate": 1.9183595272447843e-05,
"loss": 0.3387,
"step": 129
},
{
"epoch": 1.870503597122302,
"grad_norm": 1.922970389381037,
"learning_rate": 1.9168042136852228e-05,
"loss": 0.3162,
"step": 130
},
{
"epoch": 1.8848920863309353,
"grad_norm": 8.10597561850986,
"learning_rate": 1.9152348673138355e-05,
"loss": 0.2718,
"step": 131
},
{
"epoch": 1.8992805755395683,
"grad_norm": 4.044552909471015,
"learning_rate": 1.913651512151391e-05,
"loss": 0.2843,
"step": 132
},
{
"epoch": 1.9136690647482015,
"grad_norm": 1.8361101289997892,
"learning_rate": 1.9120541724330802e-05,
"loss": 0.2922,
"step": 133
},
{
"epoch": 1.9280575539568345,
"grad_norm": 1.8567473300234578,
"learning_rate": 1.910442872608145e-05,
"loss": 0.2136,
"step": 134
},
{
"epoch": 1.9424460431654675,
"grad_norm": 2.263518478480423,
"learning_rate": 1.908817637339503e-05,
"loss": 0.2331,
"step": 135
},
{
"epoch": 1.9568345323741008,
"grad_norm": 2.655129174999631,
"learning_rate": 1.9071784915033717e-05,
"loss": 0.2805,
"step": 136
},
{
"epoch": 1.9712230215827338,
"grad_norm": 2.349586516019947,
"learning_rate": 1.9055254601888867e-05,
"loss": 0.3259,
"step": 137
},
{
"epoch": 1.985611510791367,
"grad_norm": 2.03687355334055,
"learning_rate": 1.9038585686977168e-05,
"loss": 0.2869,
"step": 138
},
{
"epoch": 2.0,
"grad_norm": 2.9583687224064192,
"learning_rate": 1.9021778425436797e-05,
"loss": 0.3408,
"step": 139
},
{
"epoch": 2.014388489208633,
"grad_norm": 1.91826726074781,
"learning_rate": 1.9004833074523478e-05,
"loss": 0.2307,
"step": 140
},
{
"epoch": 2.028776978417266,
"grad_norm": 1.1616239437089266,
"learning_rate": 1.8987749893606575e-05,
"loss": 0.158,
"step": 141
},
{
"epoch": 2.0431654676258995,
"grad_norm": 2.661585557841078,
"learning_rate": 1.8970529144165103e-05,
"loss": 0.182,
"step": 142
},
{
"epoch": 2.0575539568345325,
"grad_norm": 1.4648504478084143,
"learning_rate": 1.8953171089783725e-05,
"loss": 0.1868,
"step": 143
},
{
"epoch": 2.0719424460431655,
"grad_norm": 1.6563021788099919,
"learning_rate": 1.8935675996148738e-05,
"loss": 0.2079,
"step": 144
},
{
"epoch": 2.0863309352517985,
"grad_norm": 1.765245657702871,
"learning_rate": 1.8918044131043987e-05,
"loss": 0.2056,
"step": 145
},
{
"epoch": 2.1007194244604315,
"grad_norm": 1.5438608872021158,
"learning_rate": 1.890027576434677e-05,
"loss": 0.2635,
"step": 146
},
{
"epoch": 2.115107913669065,
"grad_norm": 2.2956532783932535,
"learning_rate": 1.8882371168023708e-05,
"loss": 0.2029,
"step": 147
},
{
"epoch": 2.129496402877698,
"grad_norm": 1.4619363151073599,
"learning_rate": 1.8864330616126586e-05,
"loss": 0.155,
"step": 148
},
{
"epoch": 2.143884892086331,
"grad_norm": 2.1621204649874124,
"learning_rate": 1.8846154384788162e-05,
"loss": 0.1719,
"step": 149
},
{
"epoch": 2.158273381294964,
"grad_norm": 2.7166855043485967,
"learning_rate": 1.8827842752217917e-05,
"loss": 0.1819,
"step": 150
},
{
"epoch": 2.172661870503597,
"grad_norm": 1.8834021077536263,
"learning_rate": 1.8809395998697835e-05,
"loss": 0.1828,
"step": 151
},
{
"epoch": 2.1870503597122304,
"grad_norm": 1.5733332816754892,
"learning_rate": 1.8790814406578073e-05,
"loss": 0.2194,
"step": 152
},
{
"epoch": 2.2014388489208634,
"grad_norm": 1.9669982488920634,
"learning_rate": 1.877209826027267e-05,
"loss": 0.164,
"step": 153
},
{
"epoch": 2.2158273381294964,
"grad_norm": 10.292309004798875,
"learning_rate": 1.8753247846255175e-05,
"loss": 0.2773,
"step": 154
},
{
"epoch": 2.2302158273381294,
"grad_norm": 2.2748735550997563,
"learning_rate": 1.8734263453054274e-05,
"loss": 0.1718,
"step": 155
},
{
"epoch": 2.2446043165467624,
"grad_norm": 2.8655891892789866,
"learning_rate": 1.871514537124936e-05,
"loss": 0.1579,
"step": 156
},
{
"epoch": 2.258992805755396,
"grad_norm": 2.217166529615519,
"learning_rate": 1.869589389346611e-05,
"loss": 0.1787,
"step": 157
},
{
"epoch": 2.273381294964029,
"grad_norm": 1.8903126047188956,
"learning_rate": 1.8676509314371977e-05,
"loss": 0.1848,
"step": 158
},
{
"epoch": 2.287769784172662,
"grad_norm": 2.22331229134063,
"learning_rate": 1.8656991930671687e-05,
"loss": 0.1547,
"step": 159
},
{
"epoch": 2.302158273381295,
"grad_norm": 2.4540086773291323,
"learning_rate": 1.863734204110272e-05,
"loss": 0.1621,
"step": 160
},
{
"epoch": 2.316546762589928,
"grad_norm": 2.718097280283145,
"learning_rate": 1.861755994643071e-05,
"loss": 0.1644,
"step": 161
},
{
"epoch": 2.3309352517985613,
"grad_norm": 2.4404623229012996,
"learning_rate": 1.859764594944485e-05,
"loss": 0.1555,
"step": 162
},
{
"epoch": 2.3453237410071943,
"grad_norm": 2.3253380241917685,
"learning_rate": 1.8577600354953273e-05,
"loss": 0.1524,
"step": 163
},
{
"epoch": 2.3597122302158273,
"grad_norm": 2.4646303579458353,
"learning_rate": 1.8557423469778356e-05,
"loss": 0.1473,
"step": 164
},
{
"epoch": 2.3741007194244603,
"grad_norm": 1.5376405445606527,
"learning_rate": 1.8537115602752054e-05,
"loss": 0.1495,
"step": 165
},
{
"epoch": 2.3884892086330938,
"grad_norm": 4.745077306666788,
"learning_rate": 1.851667706471115e-05,
"loss": 0.1821,
"step": 166
},
{
"epoch": 2.402877697841727,
"grad_norm": 2.552588156434035,
"learning_rate": 1.8496108168492518e-05,
"loss": 0.1319,
"step": 167
},
{
"epoch": 2.41726618705036,
"grad_norm": 1.8367052946729043,
"learning_rate": 1.8475409228928314e-05,
"loss": 0.1349,
"step": 168
},
{
"epoch": 2.431654676258993,
"grad_norm": 2.5445371080031314,
"learning_rate": 1.8454580562841165e-05,
"loss": 0.13,
"step": 169
},
{
"epoch": 2.446043165467626,
"grad_norm": 2.344234909741198,
"learning_rate": 1.8433622489039333e-05,
"loss": 0.1506,
"step": 170
},
{
"epoch": 2.460431654676259,
"grad_norm": 2.376247631671717,
"learning_rate": 1.8412535328311813e-05,
"loss": 0.1344,
"step": 171
},
{
"epoch": 2.4748201438848922,
"grad_norm": 2.9461424922262784,
"learning_rate": 1.839131940342344e-05,
"loss": 0.1483,
"step": 172
},
{
"epoch": 2.4892086330935252,
"grad_norm": 1.9976524594362375,
"learning_rate": 1.8369975039109937e-05,
"loss": 0.1803,
"step": 173
},
{
"epoch": 2.5035971223021583,
"grad_norm": 1.0443749894322358,
"learning_rate": 1.8348502562072955e-05,
"loss": 0.1171,
"step": 174
},
{
"epoch": 2.5179856115107913,
"grad_norm": 1.4361047756192113,
"learning_rate": 1.8326902300975063e-05,
"loss": 0.149,
"step": 175
},
{
"epoch": 2.5323741007194247,
"grad_norm": 1.6346883755053931,
"learning_rate": 1.8305174586434724e-05,
"loss": 0.1444,
"step": 176
},
{
"epoch": 2.5467625899280577,
"grad_norm": 1.6834887574364132,
"learning_rate": 1.828331975102123e-05,
"loss": 0.1144,
"step": 177
},
{
"epoch": 2.5611510791366907,
"grad_norm": 1.354227075277937,
"learning_rate": 1.8261338129249623e-05,
"loss": 0.1178,
"step": 178
},
{
"epoch": 2.5755395683453237,
"grad_norm": 2.2741550896246587,
"learning_rate": 1.8239230057575542e-05,
"loss": 0.1534,
"step": 179
},
{
"epoch": 2.5899280575539567,
"grad_norm": 1.1972242744740567,
"learning_rate": 1.8216995874390128e-05,
"loss": 0.0885,
"step": 180
},
{
"epoch": 2.6043165467625897,
"grad_norm": 1.973587237520198,
"learning_rate": 1.819463592001479e-05,
"loss": 0.135,
"step": 181
},
{
"epoch": 2.618705035971223,
"grad_norm": 3.1272879249109753,
"learning_rate": 1.817215053669603e-05,
"loss": 0.1586,
"step": 182
},
{
"epoch": 2.633093525179856,
"grad_norm": 2.191338511461004,
"learning_rate": 1.814954006860018e-05,
"loss": 0.1416,
"step": 183
},
{
"epoch": 2.647482014388489,
"grad_norm": 1.572355984168845,
"learning_rate": 1.8126804861808175e-05,
"loss": 0.1185,
"step": 184
},
{
"epoch": 2.661870503597122,
"grad_norm": 0.975274936777776,
"learning_rate": 1.81039452643102e-05,
"loss": 0.1,
"step": 185
},
{
"epoch": 2.6762589928057556,
"grad_norm": 1.6008482801863315,
"learning_rate": 1.808096162600041e-05,
"loss": 0.1051,
"step": 186
},
{
"epoch": 2.6906474820143886,
"grad_norm": 1.7704784936509204,
"learning_rate": 1.8057854298671545e-05,
"loss": 0.13,
"step": 187
},
{
"epoch": 2.7050359712230216,
"grad_norm": 1.8920163138110342,
"learning_rate": 1.803462363600957e-05,
"loss": 0.1458,
"step": 188
},
{
"epoch": 2.7194244604316546,
"grad_norm": 5.176691025598706,
"learning_rate": 1.8011269993588234e-05,
"loss": 0.1791,
"step": 189
},
{
"epoch": 2.7338129496402876,
"grad_norm": 1.6375993170680554,
"learning_rate": 1.798779372886365e-05,
"loss": 0.1177,
"step": 190
},
{
"epoch": 2.7482014388489207,
"grad_norm": 4.122622959327029,
"learning_rate": 1.796419520116882e-05,
"loss": 0.26,
"step": 191
},
{
"epoch": 2.762589928057554,
"grad_norm": 1.902781478532848,
"learning_rate": 1.7940474771708118e-05,
"loss": 0.1298,
"step": 192
},
{
"epoch": 2.776978417266187,
"grad_norm": 1.7134666967364445,
"learning_rate": 1.791663280355178e-05,
"loss": 0.1075,
"step": 193
},
{
"epoch": 2.79136690647482,
"grad_norm": 2.1429774203344274,
"learning_rate": 1.789266966163035e-05,
"loss": 0.1131,
"step": 194
},
{
"epoch": 2.805755395683453,
"grad_norm": 2.1096187128500317,
"learning_rate": 1.786858571272907e-05,
"loss": 0.1407,
"step": 195
},
{
"epoch": 2.8201438848920866,
"grad_norm": 1.7461053345159192,
"learning_rate": 1.7844381325482293e-05,
"loss": 0.0962,
"step": 196
},
{
"epoch": 2.8345323741007196,
"grad_norm": 2.272810785009126,
"learning_rate": 1.7820056870367813e-05,
"loss": 0.1982,
"step": 197
},
{
"epoch": 2.8489208633093526,
"grad_norm": 1.887371974290098,
"learning_rate": 1.7795612719701228e-05,
"loss": 0.1436,
"step": 198
},
{
"epoch": 2.8633093525179856,
"grad_norm": 1.8778160079306951,
"learning_rate": 1.7771049247630215e-05,
"loss": 0.1218,
"step": 199
},
{
"epoch": 2.8776978417266186,
"grad_norm": 1.1423107655269147,
"learning_rate": 1.7746366830128803e-05,
"loss": 0.0901,
"step": 200
},
{
"epoch": 2.8920863309352516,
"grad_norm": 1.5837299145510542,
"learning_rate": 1.7721565844991643e-05,
"loss": 0.0799,
"step": 201
},
{
"epoch": 2.906474820143885,
"grad_norm": 1.3786215691627017,
"learning_rate": 1.76966466718282e-05,
"loss": 0.1063,
"step": 202
},
{
"epoch": 2.920863309352518,
"grad_norm": 2.048529831050718,
"learning_rate": 1.7671609692056946e-05,
"loss": 0.1188,
"step": 203
},
{
"epoch": 2.935251798561151,
"grad_norm": 4.107034047711157,
"learning_rate": 1.7646455288899535e-05,
"loss": 0.1608,
"step": 204
},
{
"epoch": 2.949640287769784,
"grad_norm": 1.419074613907907,
"learning_rate": 1.7621183847374935e-05,
"loss": 0.0947,
"step": 205
},
{
"epoch": 2.9640287769784175,
"grad_norm": 1.7241505506887336,
"learning_rate": 1.7595795754293514e-05,
"loss": 0.0933,
"step": 206
},
{
"epoch": 2.9784172661870505,
"grad_norm": 5.437440138394324,
"learning_rate": 1.7570291398251153e-05,
"loss": 0.1616,
"step": 207
},
{
"epoch": 2.9928057553956835,
"grad_norm": 1.20883185060423,
"learning_rate": 1.7544671169623263e-05,
"loss": 0.0926,
"step": 208
},
{
"epoch": 3.0071942446043165,
"grad_norm": 1.4389531365223567,
"learning_rate": 1.751893546055884e-05,
"loss": 0.079,
"step": 209
},
{
"epoch": 3.0215827338129495,
"grad_norm": 2.9037490869241642,
"learning_rate": 1.749308466497444e-05,
"loss": 0.1041,
"step": 210
},
{
"epoch": 3.0359712230215825,
"grad_norm": 2.3752741387378062,
"learning_rate": 1.746711917854817e-05,
"loss": 0.1602,
"step": 211
},
{
"epoch": 3.050359712230216,
"grad_norm": 3.434671646120461,
"learning_rate": 1.744103939871361e-05,
"loss": 0.1553,
"step": 212
},
{
"epoch": 3.064748201438849,
"grad_norm": 1.7548341283748703,
"learning_rate": 1.7414845724653743e-05,
"loss": 0.1046,
"step": 213
},
{
"epoch": 3.079136690647482,
"grad_norm": 2.4665177125279514,
"learning_rate": 1.738853855729485e-05,
"loss": 0.1063,
"step": 214
},
{
"epoch": 3.093525179856115,
"grad_norm": 2.2287980432317136,
"learning_rate": 1.7362118299300363e-05,
"loss": 0.1017,
"step": 215
},
{
"epoch": 3.1079136690647484,
"grad_norm": 2.035780926825967,
"learning_rate": 1.733558535506469e-05,
"loss": 0.1022,
"step": 216
},
{
"epoch": 3.1223021582733814,
"grad_norm": 2.3935570961264707,
"learning_rate": 1.730894013070707e-05,
"loss": 0.1185,
"step": 217
},
{
"epoch": 3.1366906474820144,
"grad_norm": 3.4732921260424536,
"learning_rate": 1.7282183034065296e-05,
"loss": 0.1375,
"step": 218
},
{
"epoch": 3.1510791366906474,
"grad_norm": 1.9600377867714436,
"learning_rate": 1.7255314474689524e-05,
"loss": 0.0858,
"step": 219
},
{
"epoch": 3.1654676258992804,
"grad_norm": 1.2129581509037186,
"learning_rate": 1.7228334863835972e-05,
"loss": 0.0786,
"step": 220
},
{
"epoch": 3.1798561151079134,
"grad_norm": 1.3441082560042694,
"learning_rate": 1.7201244614460645e-05,
"loss": 0.1193,
"step": 221
},
{
"epoch": 3.194244604316547,
"grad_norm": 1.2454468384467656,
"learning_rate": 1.7174044141213e-05,
"loss": 0.0742,
"step": 222
},
{
"epoch": 3.20863309352518,
"grad_norm": 2.522852911662667,
"learning_rate": 1.7146733860429614e-05,
"loss": 0.118,
"step": 223
},
{
"epoch": 3.223021582733813,
"grad_norm": 1.9278040098990679,
"learning_rate": 1.7119314190127786e-05,
"loss": 0.0977,
"step": 224
},
{
"epoch": 3.237410071942446,
"grad_norm": 2.1026449906365707,
"learning_rate": 1.7091785549999177e-05,
"loss": 0.1052,
"step": 225
},
{
"epoch": 3.2517985611510793,
"grad_norm": 2.6529166852325257,
"learning_rate": 1.7064148361403347e-05,
"loss": 0.2227,
"step": 226
},
{
"epoch": 3.2661870503597124,
"grad_norm": 4.552433635116517,
"learning_rate": 1.7036403047361336e-05,
"loss": 0.1501,
"step": 227
},
{
"epoch": 3.2805755395683454,
"grad_norm": 2.310770991325816,
"learning_rate": 1.7008550032549167e-05,
"loss": 0.1216,
"step": 228
},
{
"epoch": 3.2949640287769784,
"grad_norm": 3.36364275278406,
"learning_rate": 1.6980589743291362e-05,
"loss": 0.1235,
"step": 229
},
{
"epoch": 3.3093525179856114,
"grad_norm": 1.829401154310567,
"learning_rate": 1.695252260755441e-05,
"loss": 0.1233,
"step": 230
},
{
"epoch": 3.3237410071942444,
"grad_norm": 2.223251742922475,
"learning_rate": 1.6924349054940204e-05,
"loss": 0.1139,
"step": 231
},
{
"epoch": 3.338129496402878,
"grad_norm": 2.1098226205823734,
"learning_rate": 1.6896069516679494e-05,
"loss": 0.0954,
"step": 232
},
{
"epoch": 3.352517985611511,
"grad_norm": 2.420417244636502,
"learning_rate": 1.6867684425625265e-05,
"loss": 0.1024,
"step": 233
},
{
"epoch": 3.366906474820144,
"grad_norm": 3.516667669455294,
"learning_rate": 1.683919421624611e-05,
"loss": 0.1811,
"step": 234
},
{
"epoch": 3.381294964028777,
"grad_norm": 1.1050242423015832,
"learning_rate": 1.681059932461959e-05,
"loss": 0.0677,
"step": 235
},
{
"epoch": 3.3956834532374103,
"grad_norm": 2.176543994109208,
"learning_rate": 1.6781900188425565e-05,
"loss": 0.093,
"step": 236
},
{
"epoch": 3.4100719424460433,
"grad_norm": 2.4696463349977873,
"learning_rate": 1.6753097246939475e-05,
"loss": 0.0865,
"step": 237
},
{
"epoch": 3.4244604316546763,
"grad_norm": 1.2628603534222926,
"learning_rate": 1.672419094102563e-05,
"loss": 0.0867,
"step": 238
},
{
"epoch": 3.4388489208633093,
"grad_norm": 6.155069816331444,
"learning_rate": 1.6695181713130462e-05,
"loss": 0.1917,
"step": 239
},
{
"epoch": 3.4532374100719423,
"grad_norm": 2.147319268591747,
"learning_rate": 1.6666070007275746e-05,
"loss": 0.1466,
"step": 240
},
{
"epoch": 3.4676258992805753,
"grad_norm": 2.163903497725947,
"learning_rate": 1.6636856269051813e-05,
"loss": 0.1364,
"step": 241
},
{
"epoch": 3.4820143884892087,
"grad_norm": 1.6774031370584257,
"learning_rate": 1.6607540945610722e-05,
"loss": 0.0906,
"step": 242
},
{
"epoch": 3.4964028776978417,
"grad_norm": 9.014308156169962,
"learning_rate": 1.6578124485659414e-05,
"loss": 0.1861,
"step": 243
},
{
"epoch": 3.5107913669064748,
"grad_norm": 2.3885377725749715,
"learning_rate": 1.6548607339452853e-05,
"loss": 0.1281,
"step": 244
},
{
"epoch": 3.5251798561151078,
"grad_norm": 1.6108063549734837,
"learning_rate": 1.6518989958787126e-05,
"loss": 0.0981,
"step": 245
},
{
"epoch": 3.539568345323741,
"grad_norm": 3.5691166357587054,
"learning_rate": 1.6489272796992536e-05,
"loss": 0.1074,
"step": 246
},
{
"epoch": 3.553956834532374,
"grad_norm": 3.4979429013811467,
"learning_rate": 1.6459456308926662e-05,
"loss": 0.1338,
"step": 247
},
{
"epoch": 3.568345323741007,
"grad_norm": 4.957592452667262,
"learning_rate": 1.642954095096737e-05,
"loss": 0.2005,
"step": 248
},
{
"epoch": 3.58273381294964,
"grad_norm": 2.5648919694650107,
"learning_rate": 1.639952718100589e-05,
"loss": 0.1081,
"step": 249
},
{
"epoch": 3.597122302158273,
"grad_norm": 3.7598235610947643,
"learning_rate": 1.636941545843973e-05,
"loss": 0.1533,
"step": 250
},
{
"epoch": 3.6115107913669062,
"grad_norm": 2.9195056813718496,
"learning_rate": 1.6339206244165705e-05,
"loss": 0.1188,
"step": 251
},
{
"epoch": 3.6258992805755397,
"grad_norm": 2.79416599036527,
"learning_rate": 1.630890000057285e-05,
"loss": 0.1106,
"step": 252
},
{
"epoch": 3.6402877697841727,
"grad_norm": 2.2623014827430485,
"learning_rate": 1.6278497191535364e-05,
"loss": 0.0913,
"step": 253
},
{
"epoch": 3.6546762589928057,
"grad_norm": 3.0134989782439736,
"learning_rate": 1.6247998282405486e-05,
"loss": 0.1368,
"step": 254
},
{
"epoch": 3.6690647482014387,
"grad_norm": 1.0640618586580433,
"learning_rate": 1.621740374000639e-05,
"loss": 0.0749,
"step": 255
},
{
"epoch": 3.683453237410072,
"grad_norm": 2.0407254201531857,
"learning_rate": 1.6186714032625036e-05,
"loss": 0.1347,
"step": 256
},
{
"epoch": 3.697841726618705,
"grad_norm": 1.3495622495153805,
"learning_rate": 1.6155929630004995e-05,
"loss": 0.0938,
"step": 257
},
{
"epoch": 3.712230215827338,
"grad_norm": 1.626082630128798,
"learning_rate": 1.6125051003339277e-05,
"loss": 0.0735,
"step": 258
},
{
"epoch": 3.726618705035971,
"grad_norm": 1.045029527525099,
"learning_rate": 1.6094078625263085e-05,
"loss": 0.0665,
"step": 259
},
{
"epoch": 3.741007194244604,
"grad_norm": 1.0769877251894375,
"learning_rate": 1.6063012969846624e-05,
"loss": 0.0594,
"step": 260
},
{
"epoch": 3.755395683453237,
"grad_norm": 1.573095185027216,
"learning_rate": 1.603185451258781e-05,
"loss": 0.0989,
"step": 261
},
{
"epoch": 3.7697841726618706,
"grad_norm": 1.5939550154351394,
"learning_rate": 1.6000603730405013e-05,
"loss": 0.0918,
"step": 262
},
{
"epoch": 3.7841726618705036,
"grad_norm": 3.084625753380633,
"learning_rate": 1.5969261101629744e-05,
"loss": 0.1507,
"step": 263
},
{
"epoch": 3.7985611510791366,
"grad_norm": 2.1876597444656367,
"learning_rate": 1.593782710599934e-05,
"loss": 0.1153,
"step": 264
},
{
"epoch": 3.81294964028777,
"grad_norm": 1.4453678209486815,
"learning_rate": 1.5906302224649613e-05,
"loss": 0.0881,
"step": 265
},
{
"epoch": 3.827338129496403,
"grad_norm": 1.415817693353708,
"learning_rate": 1.5874686940107507e-05,
"loss": 0.0921,
"step": 266
},
{
"epoch": 3.841726618705036,
"grad_norm": 1.3019018391132993,
"learning_rate": 1.5842981736283686e-05,
"loss": 0.0942,
"step": 267
},
{
"epoch": 3.856115107913669,
"grad_norm": 2.124481879469022,
"learning_rate": 1.581118709846514e-05,
"loss": 0.0892,
"step": 268
},
{
"epoch": 3.870503597122302,
"grad_norm": 0.7530466355105916,
"learning_rate": 1.5779303513307765e-05,
"loss": 0.0611,
"step": 269
},
{
"epoch": 3.884892086330935,
"grad_norm": 1.9390276125404986,
"learning_rate": 1.574733146882889e-05,
"loss": 0.0711,
"step": 270
},
{
"epoch": 3.899280575539568,
"grad_norm": 1.7780292244134437,
"learning_rate": 1.571527145439983e-05,
"loss": 0.0912,
"step": 271
},
{
"epoch": 3.9136690647482015,
"grad_norm": 1.5211678706292164,
"learning_rate": 1.5683123960738395e-05,
"loss": 0.0828,
"step": 272
},
{
"epoch": 3.9280575539568345,
"grad_norm": 3.235343849443108,
"learning_rate": 1.5650889479901356e-05,
"loss": 0.1355,
"step": 273
},
{
"epoch": 3.9424460431654675,
"grad_norm": 2.418347718572089,
"learning_rate": 1.5618568505276948e-05,
"loss": 0.0934,
"step": 274
},
{
"epoch": 3.956834532374101,
"grad_norm": 1.856004868367365,
"learning_rate": 1.558616153157728e-05,
"loss": 0.1214,
"step": 275
},
{
"epoch": 3.971223021582734,
"grad_norm": 1.9055690379940484,
"learning_rate": 1.5553669054830806e-05,
"loss": 0.0759,
"step": 276
},
{
"epoch": 3.985611510791367,
"grad_norm": 0.7835271833606042,
"learning_rate": 1.552109157237468e-05,
"loss": 0.0636,
"step": 277
},
{
"epoch": 4.0,
"grad_norm": 1.9789660771379471,
"learning_rate": 1.5488429582847194e-05,
"loss": 0.0935,
"step": 278
},
{
"epoch": 4.014388489208633,
"grad_norm": 1.920696415437042,
"learning_rate": 1.5455683586180117e-05,
"loss": 0.0732,
"step": 279
},
{
"epoch": 4.028776978417266,
"grad_norm": 1.7756913937042766,
"learning_rate": 1.542285408359105e-05,
"loss": 0.1339,
"step": 280
},
{
"epoch": 4.043165467625899,
"grad_norm": 1.3081335048820986,
"learning_rate": 1.5389941577575753e-05,
"loss": 0.0805,
"step": 281
},
{
"epoch": 4.057553956834532,
"grad_norm": 1.545019490212882,
"learning_rate": 1.5356946571900465e-05,
"loss": 0.0764,
"step": 282
},
{
"epoch": 4.071942446043165,
"grad_norm": 1.6175892304589476,
"learning_rate": 1.5323869571594166e-05,
"loss": 0.0838,
"step": 283
},
{
"epoch": 4.086330935251799,
"grad_norm": 8.86376731047678,
"learning_rate": 1.5290711082940883e-05,
"loss": 0.2142,
"step": 284
},
{
"epoch": 4.100719424460432,
"grad_norm": 2.0771074188946748,
"learning_rate": 1.5257471613471908e-05,
"loss": 0.1161,
"step": 285
},
{
"epoch": 4.115107913669065,
"grad_norm": 1.841466825633457,
"learning_rate": 1.5224151671958045e-05,
"loss": 0.111,
"step": 286
},
{
"epoch": 4.129496402877698,
"grad_norm": 2.1971490305870254,
"learning_rate": 1.5190751768401835e-05,
"loss": 0.1001,
"step": 287
},
{
"epoch": 4.143884892086331,
"grad_norm": 1.887590572191191,
"learning_rate": 1.515727241402972e-05,
"loss": 0.0817,
"step": 288
},
{
"epoch": 4.158273381294964,
"grad_norm": 1.5504037915944906,
"learning_rate": 1.512371412128424e-05,
"loss": 0.0721,
"step": 289
},
{
"epoch": 4.172661870503597,
"grad_norm": 3.2084429695170398,
"learning_rate": 1.509007740381618e-05,
"loss": 0.1495,
"step": 290
},
{
"epoch": 4.18705035971223,
"grad_norm": 1.9773379887209472,
"learning_rate": 1.505636277647672e-05,
"loss": 0.1021,
"step": 291
},
{
"epoch": 4.201438848920863,
"grad_norm": 1.7919752378903473,
"learning_rate": 1.5022570755309542e-05,
"loss": 0.069,
"step": 292
},
{
"epoch": 4.215827338129497,
"grad_norm": 1.8353019507539692,
"learning_rate": 1.4988701857542932e-05,
"loss": 0.0908,
"step": 293
},
{
"epoch": 4.23021582733813,
"grad_norm": 1.6805427100586665,
"learning_rate": 1.495475660158187e-05,
"loss": 0.0785,
"step": 294
},
{
"epoch": 4.244604316546763,
"grad_norm": 1.380400795948778,
"learning_rate": 1.492073550700009e-05,
"loss": 0.0817,
"step": 295
},
{
"epoch": 4.258992805755396,
"grad_norm": 0.9200650137201957,
"learning_rate": 1.4886639094532129e-05,
"loss": 0.0646,
"step": 296
},
{
"epoch": 4.273381294964029,
"grad_norm": 2.0328467587787884,
"learning_rate": 1.4852467886065357e-05,
"loss": 0.0816,
"step": 297
},
{
"epoch": 4.287769784172662,
"grad_norm": 2.471692983826739,
"learning_rate": 1.4818222404631993e-05,
"loss": 0.1168,
"step": 298
},
{
"epoch": 4.302158273381295,
"grad_norm": 1.9480957771711376,
"learning_rate": 1.4783903174401086e-05,
"loss": 0.1056,
"step": 299
},
{
"epoch": 4.316546762589928,
"grad_norm": 1.3707094764913785,
"learning_rate": 1.4749510720670506e-05,
"loss": 0.081,
"step": 300
},
{
"epoch": 4.330935251798561,
"grad_norm": 1.8160322544483793,
"learning_rate": 1.4715045569858895e-05,
"loss": 0.0784,
"step": 301
},
{
"epoch": 4.345323741007194,
"grad_norm": 1.6519200877530784,
"learning_rate": 1.4680508249497622e-05,
"loss": 0.0758,
"step": 302
},
{
"epoch": 4.359712230215827,
"grad_norm": 1.1953425576844743,
"learning_rate": 1.4645899288222686e-05,
"loss": 0.076,
"step": 303
},
{
"epoch": 4.374100719424461,
"grad_norm": 2.1532443296286217,
"learning_rate": 1.461121921576665e-05,
"loss": 0.0956,
"step": 304
},
{
"epoch": 4.388489208633094,
"grad_norm": 7.291642986227351,
"learning_rate": 1.457646856295051e-05,
"loss": 0.1638,
"step": 305
},
{
"epoch": 4.402877697841727,
"grad_norm": 1.9760971103123395,
"learning_rate": 1.4541647861675592e-05,
"loss": 0.0898,
"step": 306
},
{
"epoch": 4.41726618705036,
"grad_norm": 1.549355801958334,
"learning_rate": 1.4506757644915393e-05,
"loss": 0.0804,
"step": 307
},
{
"epoch": 4.431654676258993,
"grad_norm": 2.1548681765232005,
"learning_rate": 1.4471798446707426e-05,
"loss": 0.0917,
"step": 308
},
{
"epoch": 4.446043165467626,
"grad_norm": 2.416819319060172,
"learning_rate": 1.443677080214506e-05,
"loss": 0.1,
"step": 309
},
{
"epoch": 4.460431654676259,
"grad_norm": 2.357572317260481,
"learning_rate": 1.4401675247369307e-05,
"loss": 0.0842,
"step": 310
},
{
"epoch": 4.474820143884892,
"grad_norm": 2.3896391383138305,
"learning_rate": 1.4366512319560642e-05,
"loss": 0.0825,
"step": 311
},
{
"epoch": 4.489208633093525,
"grad_norm": 2.43013200710383,
"learning_rate": 1.4331282556930753e-05,
"loss": 0.0694,
"step": 312
},
{
"epoch": 4.503597122302159,
"grad_norm": 2.044331917417573,
"learning_rate": 1.4295986498714326e-05,
"loss": 0.0782,
"step": 313
},
{
"epoch": 4.517985611510792,
"grad_norm": 2.5158625592140424,
"learning_rate": 1.4260624685160778e-05,
"loss": 0.0861,
"step": 314
},
{
"epoch": 4.532374100719425,
"grad_norm": 2.390662985836405,
"learning_rate": 1.4225197657525996e-05,
"loss": 0.0998,
"step": 315
},
{
"epoch": 4.546762589928058,
"grad_norm": 4.358152932825534,
"learning_rate": 1.4189705958064041e-05,
"loss": 0.2349,
"step": 316
},
{
"epoch": 4.561151079136691,
"grad_norm": 1.4662171648939122,
"learning_rate": 1.4154150130018867e-05,
"loss": 0.0828,
"step": 317
},
{
"epoch": 4.575539568345324,
"grad_norm": 2.618941379165625,
"learning_rate": 1.4118530717615982e-05,
"loss": 0.1057,
"step": 318
},
{
"epoch": 4.589928057553957,
"grad_norm": 5.796662333549953,
"learning_rate": 1.4082848266054136e-05,
"loss": 0.1314,
"step": 319
},
{
"epoch": 4.60431654676259,
"grad_norm": 1.3755791710433443,
"learning_rate": 1.4047103321496977e-05,
"loss": 0.0568,
"step": 320
},
{
"epoch": 4.618705035971223,
"grad_norm": 1.4690673745162433,
"learning_rate": 1.4011296431064675e-05,
"loss": 0.0857,
"step": 321
},
{
"epoch": 4.633093525179856,
"grad_norm": 1.510028184893069,
"learning_rate": 1.3975428142825562e-05,
"loss": 0.0661,
"step": 322
},
{
"epoch": 4.647482014388489,
"grad_norm": 1.8656945859612246,
"learning_rate": 1.3939499005787735e-05,
"loss": 0.0885,
"step": 323
},
{
"epoch": 4.661870503597123,
"grad_norm": 4.360334150991463,
"learning_rate": 1.3903509569890663e-05,
"loss": 0.1249,
"step": 324
},
{
"epoch": 4.676258992805756,
"grad_norm": 1.2705938890119497,
"learning_rate": 1.3867460385996756e-05,
"loss": 0.0483,
"step": 325
},
{
"epoch": 4.690647482014389,
"grad_norm": 1.6429511839584963,
"learning_rate": 1.3831352005882947e-05,
"loss": 0.0678,
"step": 326
},
{
"epoch": 4.705035971223022,
"grad_norm": 0.8064327586315418,
"learning_rate": 1.3795184982232234e-05,
"loss": 0.0481,
"step": 327
},
{
"epoch": 4.719424460431655,
"grad_norm": 1.6276254758163173,
"learning_rate": 1.3758959868625233e-05,
"loss": 0.0642,
"step": 328
},
{
"epoch": 4.733812949640288,
"grad_norm": 1.6478404195254295,
"learning_rate": 1.3722677219531684e-05,
"loss": 0.0537,
"step": 329
},
{
"epoch": 4.748201438848921,
"grad_norm": 2.0775687439431443,
"learning_rate": 1.3686337590301997e-05,
"loss": 0.0826,
"step": 330
},
{
"epoch": 4.762589928057554,
"grad_norm": 1.6321172079536772,
"learning_rate": 1.364994153715872e-05,
"loss": 0.0931,
"step": 331
},
{
"epoch": 4.7769784172661875,
"grad_norm": 1.5139845300039876,
"learning_rate": 1.361348961718804e-05,
"loss": 0.0755,
"step": 332
},
{
"epoch": 4.7913669064748206,
"grad_norm": 1.6885011783715966,
"learning_rate": 1.3576982388331258e-05,
"loss": 0.0708,
"step": 333
},
{
"epoch": 4.805755395683454,
"grad_norm": 3.58412758620127,
"learning_rate": 1.3540420409376237e-05,
"loss": 0.1443,
"step": 334
},
{
"epoch": 4.820143884892087,
"grad_norm": 3.970560863668551,
"learning_rate": 1.3503804239948874e-05,
"loss": 0.1164,
"step": 335
},
{
"epoch": 4.83453237410072,
"grad_norm": 3.0950452614631216,
"learning_rate": 1.3467134440504497e-05,
"loss": 0.1638,
"step": 336
},
{
"epoch": 4.848920863309353,
"grad_norm": 1.2344229500725417,
"learning_rate": 1.3430411572319323e-05,
"loss": 0.0414,
"step": 337
},
{
"epoch": 4.863309352517986,
"grad_norm": 4.693981116757926,
"learning_rate": 1.3393636197481842e-05,
"loss": 0.1099,
"step": 338
},
{
"epoch": 4.877697841726619,
"grad_norm": 1.280394160785586,
"learning_rate": 1.335680887888423e-05,
"loss": 0.069,
"step": 339
},
{
"epoch": 4.892086330935252,
"grad_norm": 2.6531048299227287,
"learning_rate": 1.3319930180213713e-05,
"loss": 0.0945,
"step": 340
},
{
"epoch": 4.906474820143885,
"grad_norm": 3.102339751706606,
"learning_rate": 1.3283000665943972e-05,
"loss": 0.1103,
"step": 341
},
{
"epoch": 4.920863309352518,
"grad_norm": 1.4150229722556091,
"learning_rate": 1.3246020901326465e-05,
"loss": 0.0787,
"step": 342
},
{
"epoch": 4.935251798561151,
"grad_norm": 2.3202398526115933,
"learning_rate": 1.3208991452381798e-05,
"loss": 0.0956,
"step": 343
},
{
"epoch": 4.9496402877697845,
"grad_norm": 6.464019103643947,
"learning_rate": 1.3171912885891063e-05,
"loss": 0.1059,
"step": 344
},
{
"epoch": 4.9640287769784175,
"grad_norm": 2.2987115160645617,
"learning_rate": 1.3134785769387147e-05,
"loss": 0.0905,
"step": 345
},
{
"epoch": 4.9784172661870505,
"grad_norm": 3.42537659995528,
"learning_rate": 1.3097610671146063e-05,
"loss": 0.0891,
"step": 346
},
{
"epoch": 4.9928057553956835,
"grad_norm": 1.607289746350073,
"learning_rate": 1.3060388160178237e-05,
"loss": 0.0756,
"step": 347
},
{
"epoch": 5.0071942446043165,
"grad_norm": 2.0287001101488733,
"learning_rate": 1.302311880621981e-05,
"loss": 0.1092,
"step": 348
},
{
"epoch": 5.0215827338129495,
"grad_norm": 2.296081619843284,
"learning_rate": 1.2985803179723903e-05,
"loss": 0.0814,
"step": 349
},
{
"epoch": 5.0359712230215825,
"grad_norm": 1.3541106943850767,
"learning_rate": 1.294844185185191e-05,
"loss": 0.0495,
"step": 350
},
{
"epoch": 5.0503597122302155,
"grad_norm": 3.473072731374066,
"learning_rate": 1.2911035394464724e-05,
"loss": 0.1115,
"step": 351
},
{
"epoch": 5.0647482014388485,
"grad_norm": 1.878085254754344,
"learning_rate": 1.2873584380114012e-05,
"loss": 0.0758,
"step": 352
},
{
"epoch": 5.079136690647482,
"grad_norm": 1.5828011496456365,
"learning_rate": 1.283608938203344e-05,
"loss": 0.0653,
"step": 353
},
{
"epoch": 5.093525179856115,
"grad_norm": 2.310554737444448,
"learning_rate": 1.2798550974129888e-05,
"loss": 0.0795,
"step": 354
},
{
"epoch": 5.107913669064748,
"grad_norm": 1.392927738275557,
"learning_rate": 1.2760969730974692e-05,
"loss": 0.0555,
"step": 355
},
{
"epoch": 5.122302158273381,
"grad_norm": 1.9277356604394835,
"learning_rate": 1.2723346227794817e-05,
"loss": 0.0709,
"step": 356
},
{
"epoch": 5.136690647482014,
"grad_norm": 1.5371398787685273,
"learning_rate": 1.2685681040464081e-05,
"loss": 0.0596,
"step": 357
},
{
"epoch": 5.151079136690647,
"grad_norm": 2.8096729108941134,
"learning_rate": 1.264797474549433e-05,
"loss": 0.1064,
"step": 358
},
{
"epoch": 5.16546762589928,
"grad_norm": 0.9781522138651574,
"learning_rate": 1.2610227920026608e-05,
"loss": 0.051,
"step": 359
},
{
"epoch": 5.179856115107913,
"grad_norm": 1.1509313519794822,
"learning_rate": 1.2572441141822322e-05,
"loss": 0.0651,
"step": 360
},
{
"epoch": 5.194244604316546,
"grad_norm": 4.388250469742715,
"learning_rate": 1.2534614989254423e-05,
"loss": 0.0967,
"step": 361
},
{
"epoch": 5.2086330935251794,
"grad_norm": 0.9566869047445085,
"learning_rate": 1.2496750041298515e-05,
"loss": 0.0609,
"step": 362
},
{
"epoch": 5.223021582733813,
"grad_norm": 2.2089936486598076,
"learning_rate": 1.2458846877524025e-05,
"loss": 0.0657,
"step": 363
},
{
"epoch": 5.237410071942446,
"grad_norm": 2.1577271836034058,
"learning_rate": 1.2420906078085316e-05,
"loss": 0.0859,
"step": 364
},
{
"epoch": 5.251798561151079,
"grad_norm": 2.107752486687129,
"learning_rate": 1.2382928223712807e-05,
"loss": 0.0493,
"step": 365
},
{
"epoch": 5.266187050359712,
"grad_norm": 1.5362209636910469,
"learning_rate": 1.2344913895704099e-05,
"loss": 0.0551,
"step": 366
},
{
"epoch": 5.280575539568345,
"grad_norm": 1.8284113473780614,
"learning_rate": 1.2306863675915058e-05,
"loss": 0.0639,
"step": 367
},
{
"epoch": 5.294964028776978,
"grad_norm": 1.5434857110112714,
"learning_rate": 1.2268778146750914e-05,
"loss": 0.0665,
"step": 368
},
{
"epoch": 5.309352517985611,
"grad_norm": 1.5270591397764983,
"learning_rate": 1.2230657891157365e-05,
"loss": 0.0614,
"step": 369
},
{
"epoch": 5.323741007194244,
"grad_norm": 1.1644989385855498,
"learning_rate": 1.2192503492611625e-05,
"loss": 0.0516,
"step": 370
},
{
"epoch": 5.338129496402877,
"grad_norm": 2.4389365127644664,
"learning_rate": 1.2154315535113513e-05,
"loss": 0.0763,
"step": 371
},
{
"epoch": 5.35251798561151,
"grad_norm": 2.570405580037144,
"learning_rate": 1.2116094603176513e-05,
"loss": 0.0645,
"step": 372
},
{
"epoch": 5.366906474820144,
"grad_norm": 2.525534092427501,
"learning_rate": 1.2077841281818816e-05,
"loss": 0.0754,
"step": 373
},
{
"epoch": 5.381294964028777,
"grad_norm": 2.3428164694918654,
"learning_rate": 1.203955615655438e-05,
"loss": 0.0861,
"step": 374
},
{
"epoch": 5.39568345323741,
"grad_norm": 2.235811734094292,
"learning_rate": 1.2001239813383951e-05,
"loss": 0.0549,
"step": 375
},
{
"epoch": 5.410071942446043,
"grad_norm": 4.506038617907832,
"learning_rate": 1.1962892838786116e-05,
"loss": 0.0857,
"step": 376
},
{
"epoch": 5.424460431654676,
"grad_norm": 2.267447064370234,
"learning_rate": 1.19245158197083e-05,
"loss": 0.0717,
"step": 377
},
{
"epoch": 5.438848920863309,
"grad_norm": 1.3482474852811166,
"learning_rate": 1.1886109343557808e-05,
"loss": 0.0772,
"step": 378
},
{
"epoch": 5.453237410071942,
"grad_norm": 2.1410593958658954,
"learning_rate": 1.1847673998192815e-05,
"loss": 0.0536,
"step": 379
},
{
"epoch": 5.467625899280575,
"grad_norm": 1.5967556597455597,
"learning_rate": 1.180921037191337e-05,
"loss": 0.0459,
"step": 380
},
{
"epoch": 5.482014388489208,
"grad_norm": 1.2435751103662702,
"learning_rate": 1.1770719053452408e-05,
"loss": 0.0443,
"step": 381
},
{
"epoch": 5.496402877697841,
"grad_norm": 3.2548280663914784,
"learning_rate": 1.1732200631966717e-05,
"loss": 0.0843,
"step": 382
},
{
"epoch": 5.510791366906475,
"grad_norm": 1.991827364041768,
"learning_rate": 1.1693655697027935e-05,
"loss": 0.0561,
"step": 383
},
{
"epoch": 5.525179856115108,
"grad_norm": 1.7434178537703542,
"learning_rate": 1.165508483861352e-05,
"loss": 0.0631,
"step": 384
},
{
"epoch": 5.539568345323741,
"grad_norm": 3.6171543812005362,
"learning_rate": 1.1616488647097718e-05,
"loss": 0.0704,
"step": 385
},
{
"epoch": 5.553956834532374,
"grad_norm": 3.4479680855430113,
"learning_rate": 1.1577867713242532e-05,
"loss": 0.0751,
"step": 386
},
{
"epoch": 5.568345323741007,
"grad_norm": 1.7391495389296567,
"learning_rate": 1.1539222628188675e-05,
"loss": 0.0524,
"step": 387
},
{
"epoch": 5.58273381294964,
"grad_norm": 2.204791122892472,
"learning_rate": 1.1500553983446527e-05,
"loss": 0.0696,
"step": 388
},
{
"epoch": 5.597122302158273,
"grad_norm": 4.934866987474034,
"learning_rate": 1.1461862370887076e-05,
"loss": 0.0841,
"step": 389
},
{
"epoch": 5.611510791366906,
"grad_norm": 1.5682934454178645,
"learning_rate": 1.1423148382732854e-05,
"loss": 0.0604,
"step": 390
},
{
"epoch": 5.625899280575539,
"grad_norm": 1.5376010738808106,
"learning_rate": 1.1384412611548887e-05,
"loss": 0.0763,
"step": 391
},
{
"epoch": 5.640287769784173,
"grad_norm": 2.2954456281899858,
"learning_rate": 1.134565565023362e-05,
"loss": 0.0631,
"step": 392
},
{
"epoch": 5.654676258992806,
"grad_norm": 2.8902333297932334,
"learning_rate": 1.1306878092009828e-05,
"loss": 0.1072,
"step": 393
},
{
"epoch": 5.669064748201439,
"grad_norm": 3.1298174346773635,
"learning_rate": 1.1268080530415557e-05,
"loss": 0.0906,
"step": 394
},
{
"epoch": 5.683453237410072,
"grad_norm": 1.9540710814071045,
"learning_rate": 1.122926355929502e-05,
"loss": 0.0815,
"step": 395
},
{
"epoch": 5.697841726618705,
"grad_norm": 3.0936428631268424,
"learning_rate": 1.119042777278953e-05,
"loss": 0.0933,
"step": 396
},
{
"epoch": 5.712230215827338,
"grad_norm": 0.779804603376737,
"learning_rate": 1.1151573765328374e-05,
"loss": 0.0377,
"step": 397
},
{
"epoch": 5.726618705035971,
"grad_norm": 1.4682872653216061,
"learning_rate": 1.1112702131619747e-05,
"loss": 0.0553,
"step": 398
},
{
"epoch": 5.741007194244604,
"grad_norm": 2.2888255822610586,
"learning_rate": 1.1073813466641633e-05,
"loss": 0.0592,
"step": 399
},
{
"epoch": 5.755395683453237,
"grad_norm": 2.0210708158515356,
"learning_rate": 1.1034908365632695e-05,
"loss": 0.0591,
"step": 400
},
{
"epoch": 5.76978417266187,
"grad_norm": 2.8477043817929175,
"learning_rate": 1.0995987424083178e-05,
"loss": 0.0665,
"step": 401
},
{
"epoch": 5.784172661870503,
"grad_norm": 4.075702050234569,
"learning_rate": 1.0957051237725775e-05,
"loss": 0.0891,
"step": 402
},
{
"epoch": 5.798561151079137,
"grad_norm": 3.4838765154884053,
"learning_rate": 1.0918100402526533e-05,
"loss": 0.0752,
"step": 403
},
{
"epoch": 5.81294964028777,
"grad_norm": 2.190868453265739,
"learning_rate": 1.0879135514675706e-05,
"loss": 0.0678,
"step": 404
},
{
"epoch": 5.827338129496403,
"grad_norm": 6.885762602295394,
"learning_rate": 1.0840157170578645e-05,
"loss": 0.1085,
"step": 405
},
{
"epoch": 5.841726618705036,
"grad_norm": 2.1256795879435946,
"learning_rate": 1.0801165966846662e-05,
"loss": 0.0587,
"step": 406
},
{
"epoch": 5.856115107913669,
"grad_norm": 3.0358003868740777,
"learning_rate": 1.0762162500287916e-05,
"loss": 0.1023,
"step": 407
},
{
"epoch": 5.870503597122302,
"grad_norm": 2.72412213052102,
"learning_rate": 1.0723147367898243e-05,
"loss": 0.0755,
"step": 408
},
{
"epoch": 5.884892086330935,
"grad_norm": 5.991640096070039,
"learning_rate": 1.068412116685205e-05,
"loss": 0.0906,
"step": 409
},
{
"epoch": 5.899280575539568,
"grad_norm": 4.47844473482202,
"learning_rate": 1.0645084494493166e-05,
"loss": 0.1367,
"step": 410
},
{
"epoch": 5.913669064748201,
"grad_norm": 5.278497486551652,
"learning_rate": 1.0606037948325686e-05,
"loss": 0.0934,
"step": 411
},
{
"epoch": 5.928057553956835,
"grad_norm": 1.662465142150059,
"learning_rate": 1.0566982126004848e-05,
"loss": 0.0425,
"step": 412
},
{
"epoch": 5.942446043165468,
"grad_norm": 2.1454760909414476,
"learning_rate": 1.052791762532786e-05,
"loss": 0.0632,
"step": 413
},
{
"epoch": 5.956834532374101,
"grad_norm": 1.8023401117347966,
"learning_rate": 1.0488845044224774e-05,
"loss": 0.0562,
"step": 414
},
{
"epoch": 5.971223021582734,
"grad_norm": 1.998906167438941,
"learning_rate": 1.0449764980749317e-05,
"loss": 0.0464,
"step": 415
},
{
"epoch": 5.985611510791367,
"grad_norm": 1.9791549061655505,
"learning_rate": 1.0410678033069745e-05,
"loss": 0.0509,
"step": 416
},
{
"epoch": 6.0,
"grad_norm": 1.783059633172495,
"learning_rate": 1.0371584799459684e-05,
"loss": 0.0693,
"step": 417
},
{
"epoch": 6.014388489208633,
"grad_norm": 3.1713602527987077,
"learning_rate": 1.0332485878288977e-05,
"loss": 0.0896,
"step": 418
},
{
"epoch": 6.028776978417266,
"grad_norm": 13.02216773829757,
"learning_rate": 1.029338186801451e-05,
"loss": 0.1842,
"step": 419
},
{
"epoch": 6.043165467625899,
"grad_norm": 3.175785916601786,
"learning_rate": 1.0254273367171085e-05,
"loss": 0.0673,
"step": 420
},
{
"epoch": 6.057553956834532,
"grad_norm": 3.0816879666228885,
"learning_rate": 1.0215160974362224e-05,
"loss": 0.0648,
"step": 421
},
{
"epoch": 6.071942446043165,
"grad_norm": 3.138491600475823,
"learning_rate": 1.0176045288251014e-05,
"loss": 0.0537,
"step": 422
},
{
"epoch": 6.086330935251799,
"grad_norm": 1.9965147840919386,
"learning_rate": 1.0136926907550968e-05,
"loss": 0.0493,
"step": 423
},
{
"epoch": 6.100719424460432,
"grad_norm": 4.040616547013923,
"learning_rate": 1.0097806431016825e-05,
"loss": 0.0718,
"step": 424
},
{
"epoch": 6.115107913669065,
"grad_norm": 3.684053981687095,
"learning_rate": 1.0058684457435419e-05,
"loss": 0.0885,
"step": 425
},
{
"epoch": 6.129496402877698,
"grad_norm": 3.4407063513016207,
"learning_rate": 1.0019561585616485e-05,
"loss": 0.0878,
"step": 426
},
{
"epoch": 6.143884892086331,
"grad_norm": 3.631968287932021,
"learning_rate": 9.980438414383518e-06,
"loss": 0.0716,
"step": 427
},
{
"epoch": 6.158273381294964,
"grad_norm": 3.0607381634313535,
"learning_rate": 9.941315542564583e-06,
"loss": 0.058,
"step": 428
},
{
"epoch": 6.172661870503597,
"grad_norm": 3.9600176462472607,
"learning_rate": 9.902193568983177e-06,
"loss": 0.1314,
"step": 429
},
{
"epoch": 6.18705035971223,
"grad_norm": 1.7718441919276489,
"learning_rate": 9.863073092449033e-06,
"loss": 0.0619,
"step": 430
},
{
"epoch": 6.201438848920863,
"grad_norm": 2.14606486678126,
"learning_rate": 9.823954711748987e-06,
"loss": 0.0537,
"step": 431
},
{
"epoch": 6.215827338129497,
"grad_norm": 4.994325517315764,
"learning_rate": 9.78483902563778e-06,
"loss": 0.0989,
"step": 432
},
{
"epoch": 6.23021582733813,
"grad_norm": 1.5344735636293707,
"learning_rate": 9.745726632828913e-06,
"loss": 0.0536,
"step": 433
},
{
"epoch": 6.244604316546763,
"grad_norm": 2.0531411979678658,
"learning_rate": 9.706618131985489e-06,
"loss": 0.0522,
"step": 434
},
{
"epoch": 6.258992805755396,
"grad_norm": 1.8694652467324728,
"learning_rate": 9.667514121711025e-06,
"loss": 0.0652,
"step": 435
},
{
"epoch": 6.273381294964029,
"grad_norm": 1.7939240986480327,
"learning_rate": 9.628415200540317e-06,
"loss": 0.0585,
"step": 436
},
{
"epoch": 6.287769784172662,
"grad_norm": 1.1701921170565006,
"learning_rate": 9.589321966930255e-06,
"loss": 0.0446,
"step": 437
},
{
"epoch": 6.302158273381295,
"grad_norm": 1.3183342321738782,
"learning_rate": 9.550235019250688e-06,
"loss": 0.0365,
"step": 438
},
{
"epoch": 6.316546762589928,
"grad_norm": 3.380696309795155,
"learning_rate": 9.51115495577523e-06,
"loss": 0.0855,
"step": 439
},
{
"epoch": 6.330935251798561,
"grad_norm": 3.2163242534678536,
"learning_rate": 9.472082374672145e-06,
"loss": 0.1112,
"step": 440
},
{
"epoch": 6.345323741007194,
"grad_norm": 2.8805271526903398,
"learning_rate": 9.433017873995159e-06,
"loss": 0.0567,
"step": 441
},
{
"epoch": 6.359712230215827,
"grad_norm": 2.9270420886671626,
"learning_rate": 9.393962051674319e-06,
"loss": 0.073,
"step": 442
},
{
"epoch": 6.374100719424461,
"grad_norm": 0.716770072244056,
"learning_rate": 9.354915505506839e-06,
"loss": 0.0273,
"step": 443
},
{
"epoch": 6.388489208633094,
"grad_norm": 1.9375688885176805,
"learning_rate": 9.315878833147953e-06,
"loss": 0.0458,
"step": 444
},
{
"epoch": 6.402877697841727,
"grad_norm": 1.5114813171921724,
"learning_rate": 9.27685263210176e-06,
"loss": 0.0408,
"step": 445
},
{
"epoch": 6.41726618705036,
"grad_norm": 2.176739947497608,
"learning_rate": 9.237837499712088e-06,
"loss": 0.0406,
"step": 446
},
{
"epoch": 6.431654676258993,
"grad_norm": 1.0546177826776708,
"learning_rate": 9.19883403315334e-06,
"loss": 0.032,
"step": 447
},
{
"epoch": 6.446043165467626,
"grad_norm": 0.5710171789573429,
"learning_rate": 9.159842829421358e-06,
"loss": 0.0283,
"step": 448
},
{
"epoch": 6.460431654676259,
"grad_norm": 2.4971494935474463,
"learning_rate": 9.1208644853243e-06,
"loss": 0.053,
"step": 449
},
{
"epoch": 6.474820143884892,
"grad_norm": 2.1060917891375213,
"learning_rate": 9.081899597473469e-06,
"loss": 0.0685,
"step": 450
},
{
"epoch": 6.489208633093525,
"grad_norm": 4.872774207148436,
"learning_rate": 9.042948762274227e-06,
"loss": 0.0878,
"step": 451
},
{
"epoch": 6.503597122302159,
"grad_norm": 3.57310670704578,
"learning_rate": 9.004012575916825e-06,
"loss": 0.0898,
"step": 452
},
{
"epoch": 6.517985611510792,
"grad_norm": 1.5890926297246664,
"learning_rate": 8.965091634367306e-06,
"loss": 0.0443,
"step": 453
},
{
"epoch": 6.532374100719425,
"grad_norm": 1.6565607787351473,
"learning_rate": 8.92618653335837e-06,
"loss": 0.0414,
"step": 454
},
{
"epoch": 6.546762589928058,
"grad_norm": 1.6598383644043355,
"learning_rate": 8.887297868380255e-06,
"loss": 0.0404,
"step": 455
},
{
"epoch": 6.561151079136691,
"grad_norm": 1.3690066201582984,
"learning_rate": 8.84842623467163e-06,
"loss": 0.0596,
"step": 456
},
{
"epoch": 6.575539568345324,
"grad_norm": 1.795250419015445,
"learning_rate": 8.809572227210472e-06,
"loss": 0.038,
"step": 457
},
{
"epoch": 6.589928057553957,
"grad_norm": 3.139575746021586,
"learning_rate": 8.770736440704979e-06,
"loss": 0.0709,
"step": 458
},
{
"epoch": 6.60431654676259,
"grad_norm": 3.5570255461130476,
"learning_rate": 8.731919469584443e-06,
"loss": 0.0707,
"step": 459
},
{
"epoch": 6.618705035971223,
"grad_norm": 2.2552703837998322,
"learning_rate": 8.693121907990177e-06,
"loss": 0.0653,
"step": 460
},
{
"epoch": 6.633093525179856,
"grad_norm": 1.9006385019285805,
"learning_rate": 8.654344349766384e-06,
"loss": 0.0629,
"step": 461
},
{
"epoch": 6.647482014388489,
"grad_norm": 2.4225211791810053,
"learning_rate": 8.615587388451116e-06,
"loss": 0.0546,
"step": 462
},
{
"epoch": 6.661870503597123,
"grad_norm": 2.687344986181635,
"learning_rate": 8.576851617267151e-06,
"loss": 0.0499,
"step": 463
},
{
"epoch": 6.676258992805756,
"grad_norm": 1.6697444443320166,
"learning_rate": 8.53813762911293e-06,
"loss": 0.0424,
"step": 464
},
{
"epoch": 6.690647482014389,
"grad_norm": 4.050618894489727,
"learning_rate": 8.499446016553475e-06,
"loss": 0.1016,
"step": 465
},
{
"epoch": 6.705035971223022,
"grad_norm": 1.3790673230578159,
"learning_rate": 8.460777371811327e-06,
"loss": 0.0328,
"step": 466
},
{
"epoch": 6.719424460431655,
"grad_norm": 1.1507918309844436,
"learning_rate": 8.42213228675747e-06,
"loss": 0.0223,
"step": 467
},
{
"epoch": 6.733812949640288,
"grad_norm": 2.4647114023706256,
"learning_rate": 8.383511352902285e-06,
"loss": 0.0684,
"step": 468
},
{
"epoch": 6.748201438848921,
"grad_norm": 1.9432128594740437,
"learning_rate": 8.344915161386485e-06,
"loss": 0.0544,
"step": 469
},
{
"epoch": 6.762589928057554,
"grad_norm": 1.9522718283037046,
"learning_rate": 8.306344302972066e-06,
"loss": 0.0545,
"step": 470
},
{
"epoch": 6.7769784172661875,
"grad_norm": 3.50307964418194,
"learning_rate": 8.267799368033288e-06,
"loss": 0.0727,
"step": 471
},
{
"epoch": 6.7913669064748206,
"grad_norm": 4.659530620027555,
"learning_rate": 8.229280946547595e-06,
"loss": 0.1447,
"step": 472
},
{
"epoch": 6.805755395683454,
"grad_norm": 2.459012650016016,
"learning_rate": 8.190789628086632e-06,
"loss": 0.0544,
"step": 473
},
{
"epoch": 6.820143884892087,
"grad_norm": 2.6126248982483204,
"learning_rate": 8.15232600180719e-06,
"loss": 0.0799,
"step": 474
},
{
"epoch": 6.83453237410072,
"grad_norm": 1.6545392348906836,
"learning_rate": 8.113890656442194e-06,
"loss": 0.0422,
"step": 475
},
{
"epoch": 6.848920863309353,
"grad_norm": 1.8182442546260595,
"learning_rate": 8.075484180291702e-06,
"loss": 0.0539,
"step": 476
},
{
"epoch": 6.863309352517986,
"grad_norm": 1.6010075595221613,
"learning_rate": 8.037107161213886e-06,
"loss": 0.0425,
"step": 477
},
{
"epoch": 6.877697841726619,
"grad_norm": 1.6332321065292132,
"learning_rate": 7.99876018661605e-06,
"loss": 0.059,
"step": 478
},
{
"epoch": 6.892086330935252,
"grad_norm": 2.4841015711601613,
"learning_rate": 7.960443843445622e-06,
"loss": 0.0493,
"step": 479
},
{
"epoch": 6.906474820143885,
"grad_norm": 2.325274291210812,
"learning_rate": 7.922158718181184e-06,
"loss": 0.0535,
"step": 480
},
{
"epoch": 6.920863309352518,
"grad_norm": 5.9237325194153225,
"learning_rate": 7.883905396823487e-06,
"loss": 0.0702,
"step": 481
},
{
"epoch": 6.935251798561151,
"grad_norm": 1.3505054193697872,
"learning_rate": 7.845684464886487e-06,
"loss": 0.0463,
"step": 482
},
{
"epoch": 6.9496402877697845,
"grad_norm": 4.741713866445365,
"learning_rate": 7.80749650738838e-06,
"loss": 0.0741,
"step": 483
},
{
"epoch": 6.9640287769784175,
"grad_norm": 3.680501521362714,
"learning_rate": 7.769342108842641e-06,
"loss": 0.0597,
"step": 484
},
{
"epoch": 6.9784172661870505,
"grad_norm": 1.5698663322195734,
"learning_rate": 7.731221853249089e-06,
"loss": 0.0481,
"step": 485
},
{
"epoch": 6.9928057553956835,
"grad_norm": 2.7205015167111535,
"learning_rate": 7.693136324084949e-06,
"loss": 0.0779,
"step": 486
},
{
"epoch": 7.0071942446043165,
"grad_norm": 2.492166076915753,
"learning_rate": 7.655086104295904e-06,
"loss": 0.0444,
"step": 487
},
{
"epoch": 7.0215827338129495,
"grad_norm": 2.8913487401770097,
"learning_rate": 7.617071776287196e-06,
"loss": 0.0474,
"step": 488
},
{
"epoch": 7.0359712230215825,
"grad_norm": 3.022407364083313,
"learning_rate": 7.5790939219146874e-06,
"loss": 0.0663,
"step": 489
},
{
"epoch": 7.0503597122302155,
"grad_norm": 4.042879715219215,
"learning_rate": 7.541153122475978e-06,
"loss": 0.0654,
"step": 490
},
{
"epoch": 7.0647482014388485,
"grad_norm": 3.5955340416725727,
"learning_rate": 7.503249958701489e-06,
"loss": 0.076,
"step": 491
},
{
"epoch": 7.079136690647482,
"grad_norm": 1.1422784311056753,
"learning_rate": 7.46538501074558e-06,
"loss": 0.0312,
"step": 492
},
{
"epoch": 7.093525179856115,
"grad_norm": 2.8616905836854674,
"learning_rate": 7.427558858177679e-06,
"loss": 0.0707,
"step": 493
},
{
"epoch": 7.107913669064748,
"grad_norm": 3.736472268458676,
"learning_rate": 7.389772079973397e-06,
"loss": 0.07,
"step": 494
},
{
"epoch": 7.122302158273381,
"grad_norm": 1.952123520605071,
"learning_rate": 7.352025254505672e-06,
"loss": 0.0644,
"step": 495
},
{
"epoch": 7.136690647482014,
"grad_norm": 2.5321330171246577,
"learning_rate": 7.31431895953592e-06,
"loss": 0.0597,
"step": 496
},
{
"epoch": 7.151079136690647,
"grad_norm": 1.3288743837048944,
"learning_rate": 7.276653772205187e-06,
"loss": 0.043,
"step": 497
},
{
"epoch": 7.16546762589928,
"grad_norm": 1.1513739812834443,
"learning_rate": 7.239030269025311e-06,
"loss": 0.035,
"step": 498
},
{
"epoch": 7.179856115107913,
"grad_norm": 1.5262098875270578,
"learning_rate": 7.201449025870113e-06,
"loss": 0.0377,
"step": 499
},
{
"epoch": 7.194244604316546,
"grad_norm": 3.435543237475937,
"learning_rate": 7.163910617966563e-06,
"loss": 0.0609,
"step": 500
},
{
"epoch": 7.2086330935251794,
"grad_norm": 0.9604425592050114,
"learning_rate": 7.126415619885987e-06,
"loss": 0.0307,
"step": 501
},
{
"epoch": 7.223021582733813,
"grad_norm": 5.364737035927668,
"learning_rate": 7.088964605535278e-06,
"loss": 0.0779,
"step": 502
},
{
"epoch": 7.237410071942446,
"grad_norm": 2.0995461577780086,
"learning_rate": 7.0515581481480925e-06,
"loss": 0.0614,
"step": 503
},
{
"epoch": 7.251798561151079,
"grad_norm": 1.9139618540642809,
"learning_rate": 7.014196820276098e-06,
"loss": 0.0339,
"step": 504
},
{
"epoch": 7.266187050359712,
"grad_norm": 1.282509471776567,
"learning_rate": 6.976881193780196e-06,
"loss": 0.0415,
"step": 505
},
{
"epoch": 7.280575539568345,
"grad_norm": 1.8876563385392593,
"learning_rate": 6.9396118398217675e-06,
"loss": 0.0555,
"step": 506
},
{
"epoch": 7.294964028776978,
"grad_norm": 1.6273768839175773,
"learning_rate": 6.90238932885394e-06,
"loss": 0.0315,
"step": 507
},
{
"epoch": 7.309352517985611,
"grad_norm": 2.502695211360501,
"learning_rate": 6.865214230612858e-06,
"loss": 0.0517,
"step": 508
},
{
"epoch": 7.323741007194244,
"grad_norm": 4.1055254501583995,
"learning_rate": 6.8280871141089415e-06,
"loss": 0.0733,
"step": 509
},
{
"epoch": 7.338129496402877,
"grad_norm": 3.3526140983673285,
"learning_rate": 6.791008547618207e-06,
"loss": 0.0537,
"step": 510
},
{
"epoch": 7.35251798561151,
"grad_norm": 2.27702848929816,
"learning_rate": 6.753979098673539e-06,
"loss": 0.0394,
"step": 511
},
{
"epoch": 7.366906474820144,
"grad_norm": 2.5895671987953053,
"learning_rate": 6.716999334056031e-06,
"loss": 0.072,
"step": 512
},
{
"epoch": 7.381294964028777,
"grad_norm": 2.560346455515735,
"learning_rate": 6.680069819786288e-06,
"loss": 0.0551,
"step": 513
},
{
"epoch": 7.39568345323741,
"grad_norm": 2.4213413575290885,
"learning_rate": 6.643191121115773e-06,
"loss": 0.0604,
"step": 514
},
{
"epoch": 7.410071942446043,
"grad_norm": 2.5037600137764415,
"learning_rate": 6.6063638025181594e-06,
"loss": 0.0505,
"step": 515
},
{
"epoch": 7.424460431654676,
"grad_norm": 5.462794355900936,
"learning_rate": 6.5695884276806784e-06,
"loss": 0.0601,
"step": 516
},
{
"epoch": 7.438848920863309,
"grad_norm": 5.901550532764724,
"learning_rate": 6.532865559495505e-06,
"loss": 0.0732,
"step": 517
},
{
"epoch": 7.453237410071942,
"grad_norm": 2.1680456127592382,
"learning_rate": 6.496195760051128e-06,
"loss": 0.037,
"step": 518
},
{
"epoch": 7.467625899280575,
"grad_norm": 1.0418430134694838,
"learning_rate": 6.459579590623763e-06,
"loss": 0.0296,
"step": 519
},
{
"epoch": 7.482014388489208,
"grad_norm": 4.611606098360257,
"learning_rate": 6.423017611668745e-06,
"loss": 0.0904,
"step": 520
},
{
"epoch": 7.496402877697841,
"grad_norm": 1.7334595469785676,
"learning_rate": 6.386510382811963e-06,
"loss": 0.065,
"step": 521
},
{
"epoch": 7.510791366906475,
"grad_norm": 2.5488880130044302,
"learning_rate": 6.350058462841283e-06,
"loss": 0.0711,
"step": 522
},
{
"epoch": 7.525179856115108,
"grad_norm": 4.383649882258492,
"learning_rate": 6.313662409698004e-06,
"loss": 0.0672,
"step": 523
},
{
"epoch": 7.539568345323741,
"grad_norm": 2.161691670015276,
"learning_rate": 6.277322780468317e-06,
"loss": 0.0542,
"step": 524
},
{
"epoch": 7.553956834532374,
"grad_norm": 1.75533651285235,
"learning_rate": 6.241040131374769e-06,
"loss": 0.0385,
"step": 525
},
{
"epoch": 7.568345323741007,
"grad_norm": 1.6713788310284323,
"learning_rate": 6.204815017767767e-06,
"loss": 0.0651,
"step": 526
},
{
"epoch": 7.58273381294964,
"grad_norm": 2.7767180589589957,
"learning_rate": 6.168647994117057e-06,
"loss": 0.0727,
"step": 527
},
{
"epoch": 7.597122302158273,
"grad_norm": 1.7703681537405498,
"learning_rate": 6.132539614003249e-06,
"loss": 0.0399,
"step": 528
},
{
"epoch": 7.611510791366906,
"grad_norm": 2.182319585582808,
"learning_rate": 6.096490430109343e-06,
"loss": 0.0537,
"step": 529
},
{
"epoch": 7.625899280575539,
"grad_norm": 3.13728684435282,
"learning_rate": 6.0605009942122705e-06,
"loss": 0.0486,
"step": 530
},
{
"epoch": 7.640287769784173,
"grad_norm": 1.9092924208411692,
"learning_rate": 6.024571857174443e-06,
"loss": 0.0426,
"step": 531
},
{
"epoch": 7.654676258992806,
"grad_norm": 1.2985276847388703,
"learning_rate": 5.988703568935329e-06,
"loss": 0.0229,
"step": 532
},
{
"epoch": 7.669064748201439,
"grad_norm": 0.8412966639532138,
"learning_rate": 5.952896678503025e-06,
"loss": 0.0185,
"step": 533
},
{
"epoch": 7.683453237410072,
"grad_norm": 2.7092486788073074,
"learning_rate": 5.917151733945865e-06,
"loss": 0.0428,
"step": 534
},
{
"epoch": 7.697841726618705,
"grad_norm": 3.4234238702022526,
"learning_rate": 5.88146928238402e-06,
"loss": 0.0405,
"step": 535
},
{
"epoch": 7.712230215827338,
"grad_norm": 4.2390460482616685,
"learning_rate": 5.845849869981137e-06,
"loss": 0.0623,
"step": 536
},
{
"epoch": 7.726618705035971,
"grad_norm": 2.9908661711146167,
"learning_rate": 5.8102940419359595e-06,
"loss": 0.0584,
"step": 537
},
{
"epoch": 7.741007194244604,
"grad_norm": 1.6724838566811557,
"learning_rate": 5.7748023424740085e-06,
"loss": 0.0255,
"step": 538
},
{
"epoch": 7.755395683453237,
"grad_norm": 5.552785695728314,
"learning_rate": 5.739375314839226e-06,
"loss": 0.047,
"step": 539
},
{
"epoch": 7.76978417266187,
"grad_norm": 4.78316445415734,
"learning_rate": 5.704013501285679e-06,
"loss": 0.059,
"step": 540
},
{
"epoch": 7.784172661870503,
"grad_norm": 2.2867834949348116,
"learning_rate": 5.6687174430692495e-06,
"loss": 0.049,
"step": 541
},
{
"epoch": 7.798561151079137,
"grad_norm": 1.4114860474896007,
"learning_rate": 5.633487680439362e-06,
"loss": 0.0282,
"step": 542
},
{
"epoch": 7.81294964028777,
"grad_norm": 3.3647369349864102,
"learning_rate": 5.598324752630695e-06,
"loss": 0.0749,
"step": 543
},
{
"epoch": 7.827338129496403,
"grad_norm": 2.684742315386639,
"learning_rate": 5.5632291978549445e-06,
"loss": 0.0455,
"step": 544
},
{
"epoch": 7.841726618705036,
"grad_norm": 2.0554793850593005,
"learning_rate": 5.528201553292578e-06,
"loss": 0.0439,
"step": 545
},
{
"epoch": 7.856115107913669,
"grad_norm": 4.087197798794891,
"learning_rate": 5.493242355084609e-06,
"loss": 0.0688,
"step": 546
},
{
"epoch": 7.870503597122302,
"grad_norm": 1.2791224839211275,
"learning_rate": 5.458352138324408e-06,
"loss": 0.0298,
"step": 547
},
{
"epoch": 7.884892086330935,
"grad_norm": 5.979296747331615,
"learning_rate": 5.423531437049491e-06,
"loss": 0.0662,
"step": 548
},
{
"epoch": 7.899280575539568,
"grad_norm": 4.045189397887292,
"learning_rate": 5.388780784233354e-06,
"loss": 0.0554,
"step": 549
},
{
"epoch": 7.913669064748201,
"grad_norm": 4.898228214701075,
"learning_rate": 5.354100711777317e-06,
"loss": 0.0594,
"step": 550
},
{
"epoch": 7.928057553956835,
"grad_norm": 2.02299170942208,
"learning_rate": 5.319491750502383e-06,
"loss": 0.0617,
"step": 551
},
{
"epoch": 7.942446043165468,
"grad_norm": 3.392724832422359,
"learning_rate": 5.284954430141109e-06,
"loss": 0.0574,
"step": 552
},
{
"epoch": 7.956834532374101,
"grad_norm": 2.0165506744847437,
"learning_rate": 5.250489279329501e-06,
"loss": 0.0261,
"step": 553
},
{
"epoch": 7.971223021582734,
"grad_norm": 1.399835068970492,
"learning_rate": 5.216096825598917e-06,
"loss": 0.0324,
"step": 554
},
{
"epoch": 7.985611510791367,
"grad_norm": 2.9458649739740155,
"learning_rate": 5.18177759536801e-06,
"loss": 0.0497,
"step": 555
},
{
"epoch": 8.0,
"grad_norm": 0.7940680363085926,
"learning_rate": 5.147532113934646e-06,
"loss": 0.0181,
"step": 556
},
{
"epoch": 8.014388489208633,
"grad_norm": 1.9007802957877749,
"learning_rate": 5.113360905467875e-06,
"loss": 0.037,
"step": 557
},
{
"epoch": 8.028776978417266,
"grad_norm": 3.2781065841019066,
"learning_rate": 5.079264492999916e-06,
"loss": 0.036,
"step": 558
},
{
"epoch": 8.043165467625899,
"grad_norm": 3.819122592314504,
"learning_rate": 5.0452433984181315e-06,
"loss": 0.0523,
"step": 559
},
{
"epoch": 8.057553956834532,
"grad_norm": 4.2144956363601525,
"learning_rate": 5.011298142457069e-06,
"loss": 0.0636,
"step": 560
},
{
"epoch": 8.071942446043165,
"grad_norm": 13.047476159517892,
"learning_rate": 4.97742924469046e-06,
"loss": 0.1163,
"step": 561
},
{
"epoch": 8.086330935251798,
"grad_norm": 6.830416937477266,
"learning_rate": 4.943637223523282e-06,
"loss": 0.1028,
"step": 562
},
{
"epoch": 8.100719424460431,
"grad_norm": 2.3255270246907203,
"learning_rate": 4.909922596183822e-06,
"loss": 0.0521,
"step": 563
},
{
"epoch": 8.115107913669064,
"grad_norm": 1.1435086086266115,
"learning_rate": 4.876285878715764e-06,
"loss": 0.0157,
"step": 564
},
{
"epoch": 8.129496402877697,
"grad_norm": 3.7825947186999436,
"learning_rate": 4.842727585970284e-06,
"loss": 0.0393,
"step": 565
},
{
"epoch": 8.14388489208633,
"grad_norm": 3.3170824479840797,
"learning_rate": 4.8092482315981685e-06,
"loss": 0.0507,
"step": 566
},
{
"epoch": 8.158273381294965,
"grad_norm": 4.7246048323427265,
"learning_rate": 4.775848328041956e-06,
"loss": 0.0752,
"step": 567
},
{
"epoch": 8.172661870503598,
"grad_norm": 2.8604995363327896,
"learning_rate": 4.742528386528094e-06,
"loss": 0.0447,
"step": 568
},
{
"epoch": 8.18705035971223,
"grad_norm": 4.122397781085086,
"learning_rate": 4.709288917059118e-06,
"loss": 0.0571,
"step": 569
},
{
"epoch": 8.201438848920864,
"grad_norm": 3.8083495479732816,
"learning_rate": 4.676130428405834e-06,
"loss": 0.0494,
"step": 570
},
{
"epoch": 8.215827338129497,
"grad_norm": 3.48317844967832,
"learning_rate": 4.643053428099538e-06,
"loss": 0.0841,
"step": 571
},
{
"epoch": 8.23021582733813,
"grad_norm": 1.551504859564384,
"learning_rate": 4.610058422424249e-06,
"loss": 0.0375,
"step": 572
},
{
"epoch": 8.244604316546763,
"grad_norm": 1.9477471961254322,
"learning_rate": 4.577145916408955e-06,
"loss": 0.0257,
"step": 573
},
{
"epoch": 8.258992805755396,
"grad_norm": 3.2335530341856886,
"learning_rate": 4.544316413819888e-06,
"loss": 0.075,
"step": 574
},
{
"epoch": 8.273381294964029,
"grad_norm": 1.0232024159790356,
"learning_rate": 4.5115704171528105e-06,
"loss": 0.026,
"step": 575
},
{
"epoch": 8.287769784172662,
"grad_norm": 2.8739397189315956,
"learning_rate": 4.478908427625323e-06,
"loss": 0.0409,
"step": 576
},
{
"epoch": 8.302158273381295,
"grad_norm": 7.820713750474336,
"learning_rate": 4.446330945169197e-06,
"loss": 0.0697,
"step": 577
},
{
"epoch": 8.316546762589928,
"grad_norm": 1.4559489461697241,
"learning_rate": 4.41383846842272e-06,
"loss": 0.0338,
"step": 578
},
{
"epoch": 8.33093525179856,
"grad_norm": 1.6200703776809549,
"learning_rate": 4.381431494723056e-06,
"loss": 0.0453,
"step": 579
},
{
"epoch": 8.345323741007194,
"grad_norm": 2.462591590876853,
"learning_rate": 4.349110520098644e-06,
"loss": 0.0452,
"step": 580
},
{
"epoch": 8.359712230215827,
"grad_norm": 2.4958974429500675,
"learning_rate": 4.31687603926161e-06,
"loss": 0.0419,
"step": 581
},
{
"epoch": 8.37410071942446,
"grad_norm": 3.3620333990871414,
"learning_rate": 4.284728545600174e-06,
"loss": 0.0918,
"step": 582
},
{
"epoch": 8.388489208633093,
"grad_norm": 2.076920143609935,
"learning_rate": 4.252668531171117e-06,
"loss": 0.0333,
"step": 583
},
{
"epoch": 8.402877697841726,
"grad_norm": 11.337846192838462,
"learning_rate": 4.220696486692241e-06,
"loss": 0.0809,
"step": 584
},
{
"epoch": 8.417266187050359,
"grad_norm": 1.4691851543426133,
"learning_rate": 4.18881290153486e-06,
"loss": 0.0297,
"step": 585
},
{
"epoch": 8.431654676258994,
"grad_norm": 2.322343633696552,
"learning_rate": 4.1570182637163155e-06,
"loss": 0.041,
"step": 586
},
{
"epoch": 8.446043165467627,
"grad_norm": 1.857573901571014,
"learning_rate": 4.125313059892494e-06,
"loss": 0.026,
"step": 587
},
{
"epoch": 8.46043165467626,
"grad_norm": 2.420406989345141,
"learning_rate": 4.093697775350388e-06,
"loss": 0.0425,
"step": 588
},
{
"epoch": 8.474820143884893,
"grad_norm": 2.528933273598754,
"learning_rate": 4.062172894000664e-06,
"loss": 0.0194,
"step": 589
},
{
"epoch": 8.489208633093526,
"grad_norm": 4.851880404918257,
"learning_rate": 4.0307388983702555e-06,
"loss": 0.0456,
"step": 590
},
{
"epoch": 8.503597122302159,
"grad_norm": 1.0566440356278923,
"learning_rate": 3.9993962695949865e-06,
"loss": 0.037,
"step": 591
},
{
"epoch": 8.517985611510792,
"grad_norm": 0.9342625319695776,
"learning_rate": 3.9681454874121905e-06,
"loss": 0.0246,
"step": 592
},
{
"epoch": 8.532374100719425,
"grad_norm": 1.404658963816671,
"learning_rate": 3.9369870301533785e-06,
"loss": 0.0323,
"step": 593
},
{
"epoch": 8.546762589928058,
"grad_norm": 2.8421121591274474,
"learning_rate": 3.905921374736919e-06,
"loss": 0.042,
"step": 594
},
{
"epoch": 8.56115107913669,
"grad_norm": 1.665249679691359,
"learning_rate": 3.87494899666073e-06,
"loss": 0.0471,
"step": 595
},
{
"epoch": 8.575539568345324,
"grad_norm": 3.150615281085969,
"learning_rate": 3.844070369995008e-06,
"loss": 0.0592,
"step": 596
},
{
"epoch": 8.589928057553957,
"grad_norm": 2.1657965045021093,
"learning_rate": 3.8132859673749688e-06,
"loss": 0.0335,
"step": 597
},
{
"epoch": 8.60431654676259,
"grad_norm": 0.9753064067607254,
"learning_rate": 3.7825962599936117e-06,
"loss": 0.0173,
"step": 598
},
{
"epoch": 8.618705035971223,
"grad_norm": 1.6281860386953595,
"learning_rate": 3.7520017175945168e-06,
"loss": 0.0327,
"step": 599
},
{
"epoch": 8.633093525179856,
"grad_norm": 1.3250219320378607,
"learning_rate": 3.7215028084646385e-06,
"loss": 0.0389,
"step": 600
},
{
"epoch": 8.647482014388489,
"grad_norm": 2.663488081521059,
"learning_rate": 3.691099999427152e-06,
"loss": 0.0451,
"step": 601
},
{
"epoch": 8.661870503597122,
"grad_norm": 1.0684718799506963,
"learning_rate": 3.6607937558342975e-06,
"loss": 0.0227,
"step": 602
},
{
"epoch": 8.676258992805755,
"grad_norm": 1.8713141278138534,
"learning_rate": 3.6305845415602726e-06,
"loss": 0.0324,
"step": 603
},
{
"epoch": 8.690647482014388,
"grad_norm": 1.959513645334475,
"learning_rate": 3.6004728189941142e-06,
"loss": 0.0483,
"step": 604
},
{
"epoch": 8.70503597122302,
"grad_norm": 8.273881877761735,
"learning_rate": 3.5704590490326298e-06,
"loss": 0.0701,
"step": 605
},
{
"epoch": 8.719424460431654,
"grad_norm": 1.932144875971402,
"learning_rate": 3.5405436910733437e-06,
"loss": 0.0412,
"step": 606
},
{
"epoch": 8.733812949640289,
"grad_norm": 7.650174976201004,
"learning_rate": 3.5107272030074626e-06,
"loss": 0.0525,
"step": 607
},
{
"epoch": 8.748201438848922,
"grad_norm": 2.482410934643446,
"learning_rate": 3.4810100412128743e-06,
"loss": 0.0447,
"step": 608
},
{
"epoch": 8.762589928057555,
"grad_norm": 1.7887809194693534,
"learning_rate": 3.4513926605471504e-06,
"loss": 0.0334,
"step": 609
},
{
"epoch": 8.776978417266188,
"grad_norm": 2.262589027477362,
"learning_rate": 3.421875514340589e-06,
"loss": 0.0438,
"step": 610
},
{
"epoch": 8.79136690647482,
"grad_norm": 5.096455221790141,
"learning_rate": 3.392459054389281e-06,
"loss": 0.0589,
"step": 611
},
{
"epoch": 8.805755395683454,
"grad_norm": 1.7161312058815825,
"learning_rate": 3.3631437309481853e-06,
"loss": 0.0261,
"step": 612
},
{
"epoch": 8.820143884892087,
"grad_norm": 2.941763984376986,
"learning_rate": 3.333929992724253e-06,
"loss": 0.0576,
"step": 613
},
{
"epoch": 8.83453237410072,
"grad_norm": 3.006151368731597,
"learning_rate": 3.30481828686954e-06,
"loss": 0.0443,
"step": 614
},
{
"epoch": 8.848920863309353,
"grad_norm": 1.6800648366968625,
"learning_rate": 3.275809058974373e-06,
"loss": 0.0307,
"step": 615
},
{
"epoch": 8.863309352517986,
"grad_norm": 0.8243738116908417,
"learning_rate": 3.2469027530605255e-06,
"loss": 0.0184,
"step": 616
},
{
"epoch": 8.877697841726619,
"grad_norm": 2.9566481267485365,
"learning_rate": 3.2180998115744387e-06,
"loss": 0.0373,
"step": 617
},
{
"epoch": 8.892086330935252,
"grad_norm": 1.9637756364219565,
"learning_rate": 3.1894006753804143e-06,
"loss": 0.0414,
"step": 618
},
{
"epoch": 8.906474820143885,
"grad_norm": 2.3021615520028984,
"learning_rate": 3.1608057837538976e-06,
"loss": 0.0423,
"step": 619
},
{
"epoch": 8.920863309352518,
"grad_norm": 2.145448976885504,
"learning_rate": 3.1323155743747393e-06,
"loss": 0.0404,
"step": 620
},
{
"epoch": 8.93525179856115,
"grad_norm": 2.6838866265175287,
"learning_rate": 3.1039304833205073e-06,
"loss": 0.042,
"step": 621
},
{
"epoch": 8.949640287769784,
"grad_norm": 0.9803533745198019,
"learning_rate": 3.075650945059799e-06,
"loss": 0.028,
"step": 622
},
{
"epoch": 8.964028776978417,
"grad_norm": 6.015138805293879,
"learning_rate": 3.047477392445596e-06,
"loss": 0.0469,
"step": 623
},
{
"epoch": 8.97841726618705,
"grad_norm": 2.47190153965276,
"learning_rate": 3.019410256708637e-06,
"loss": 0.0699,
"step": 624
},
{
"epoch": 8.992805755395683,
"grad_norm": 1.6888494123739861,
"learning_rate": 2.9914499674508337e-06,
"loss": 0.0352,
"step": 625
},
{
"epoch": 9.007194244604317,
"grad_norm": 0.8291369253401952,
"learning_rate": 2.9635969526386665e-06,
"loss": 0.0173,
"step": 626
},
{
"epoch": 9.02158273381295,
"grad_norm": 3.1456360877961043,
"learning_rate": 2.935851638596655e-06,
"loss": 0.0445,
"step": 627
},
{
"epoch": 9.035971223021583,
"grad_norm": 2.3237289901109754,
"learning_rate": 2.908214450000828e-06,
"loss": 0.0392,
"step": 628
},
{
"epoch": 9.050359712230216,
"grad_norm": 2.1054179088541547,
"learning_rate": 2.8806858098722155e-06,
"loss": 0.0585,
"step": 629
},
{
"epoch": 9.06474820143885,
"grad_norm": 1.2969196699460492,
"learning_rate": 2.853266139570391e-06,
"loss": 0.0208,
"step": 630
},
{
"epoch": 9.079136690647482,
"grad_norm": 0.9970140027989338,
"learning_rate": 2.825955858787002e-06,
"loss": 0.0183,
"step": 631
},
{
"epoch": 9.093525179856115,
"grad_norm": 1.046038196325873,
"learning_rate": 2.798755385539358e-06,
"loss": 0.0196,
"step": 632
},
{
"epoch": 9.107913669064748,
"grad_norm": 1.9785975611671889,
"learning_rate": 2.7716651361640277e-06,
"loss": 0.046,
"step": 633
},
{
"epoch": 9.122302158273381,
"grad_norm": 2.076469916316301,
"learning_rate": 2.7446855253104775e-06,
"loss": 0.035,
"step": 634
},
{
"epoch": 9.136690647482014,
"grad_norm": 0.996431472737031,
"learning_rate": 2.717816965934705e-06,
"loss": 0.0377,
"step": 635
},
{
"epoch": 9.151079136690647,
"grad_norm": 4.566127623498886,
"learning_rate": 2.6910598692929323e-06,
"loss": 0.0767,
"step": 636
},
{
"epoch": 9.16546762589928,
"grad_norm": 12.39526382129473,
"learning_rate": 2.6644146449353103e-06,
"loss": 0.1713,
"step": 637
},
{
"epoch": 9.179856115107913,
"grad_norm": 2.4799629481553724,
"learning_rate": 2.6378817006996393e-06,
"loss": 0.0314,
"step": 638
},
{
"epoch": 9.194244604316546,
"grad_norm": 8.085625915606265,
"learning_rate": 2.611461442705152e-06,
"loss": 0.051,
"step": 639
},
{
"epoch": 9.20863309352518,
"grad_norm": 3.9263706221680685,
"learning_rate": 2.5851542753462612e-06,
"loss": 0.0521,
"step": 640
},
{
"epoch": 9.223021582733812,
"grad_norm": 2.226548600355445,
"learning_rate": 2.5589606012863968e-06,
"loss": 0.0332,
"step": 641
},
{
"epoch": 9.237410071942445,
"grad_norm": 1.255383022545736,
"learning_rate": 2.532880821451833e-06,
"loss": 0.0248,
"step": 642
},
{
"epoch": 9.251798561151078,
"grad_norm": 5.241468552575159,
"learning_rate": 2.5069153350255617e-06,
"loss": 0.0544,
"step": 643
},
{
"epoch": 9.266187050359711,
"grad_norm": 1.3999294797764177,
"learning_rate": 2.4810645394411636e-06,
"loss": 0.0284,
"step": 644
},
{
"epoch": 9.280575539568344,
"grad_norm": 1.7303921026902358,
"learning_rate": 2.455328830376741e-06,
"loss": 0.0212,
"step": 645
},
{
"epoch": 9.29496402877698,
"grad_norm": 3.2404444332120868,
"learning_rate": 2.429708601748849e-06,
"loss": 0.0698,
"step": 646
},
{
"epoch": 9.309352517985612,
"grad_norm": 1.553700541641244,
"learning_rate": 2.4042042457064863e-06,
"loss": 0.0389,
"step": 647
},
{
"epoch": 9.323741007194245,
"grad_norm": 2.570999782342014,
"learning_rate": 2.3788161526250677e-06,
"loss": 0.034,
"step": 648
},
{
"epoch": 9.338129496402878,
"grad_norm": 1.4441708855755897,
"learning_rate": 2.3535447111004662e-06,
"loss": 0.0318,
"step": 649
},
{
"epoch": 9.352517985611511,
"grad_norm": 2.9286217319244456,
"learning_rate": 2.3283903079430582e-06,
"loss": 0.0426,
"step": 650
},
{
"epoch": 9.366906474820144,
"grad_norm": 2.3158759616027758,
"learning_rate": 2.3033533281718036e-06,
"loss": 0.04,
"step": 651
},
{
"epoch": 9.381294964028777,
"grad_norm": 5.000706423916787,
"learning_rate": 2.2784341550083577e-06,
"loss": 0.0778,
"step": 652
},
{
"epoch": 9.39568345323741,
"grad_norm": 3.1672796078263667,
"learning_rate": 2.253633169871198e-06,
"loss": 0.0477,
"step": 653
},
{
"epoch": 9.410071942446043,
"grad_norm": 2.936168504993618,
"learning_rate": 2.2289507523697894e-06,
"loss": 0.0355,
"step": 654
},
{
"epoch": 9.424460431654676,
"grad_norm": 4.144038176169699,
"learning_rate": 2.204387280298772e-06,
"loss": 0.0636,
"step": 655
},
{
"epoch": 9.43884892086331,
"grad_norm": 3.317961055103226,
"learning_rate": 2.1799431296321883e-06,
"loss": 0.0254,
"step": 656
},
{
"epoch": 9.453237410071942,
"grad_norm": 2.0945507810139166,
"learning_rate": 2.155618674517711e-06,
"loss": 0.0396,
"step": 657
},
{
"epoch": 9.467625899280575,
"grad_norm": 2.1490741031658755,
"learning_rate": 2.131414287270931e-06,
"loss": 0.0351,
"step": 658
},
{
"epoch": 9.482014388489208,
"grad_norm": 1.170394818981662,
"learning_rate": 2.107330338369652e-06,
"loss": 0.0223,
"step": 659
},
{
"epoch": 9.496402877697841,
"grad_norm": 1.9060637808073648,
"learning_rate": 2.083367196448219e-06,
"loss": 0.0314,
"step": 660
},
{
"epoch": 9.510791366906474,
"grad_norm": 2.682470653382018,
"learning_rate": 2.0595252282918875e-06,
"loss": 0.0311,
"step": 661
},
{
"epoch": 9.525179856115107,
"grad_norm": 2.824810882470908,
"learning_rate": 2.0358047988311857e-06,
"loss": 0.0408,
"step": 662
},
{
"epoch": 9.53956834532374,
"grad_norm": 1.8783543488553889,
"learning_rate": 2.012206271136353e-06,
"loss": 0.0316,
"step": 663
},
{
"epoch": 9.553956834532373,
"grad_norm": 1.277908232236526,
"learning_rate": 1.988730006411769e-06,
"loss": 0.0227,
"step": 664
},
{
"epoch": 9.568345323741006,
"grad_norm": 2.3170498768091763,
"learning_rate": 1.9653763639904333e-06,
"loss": 0.0324,
"step": 665
},
{
"epoch": 9.582733812949641,
"grad_norm": 1.4282734057780846,
"learning_rate": 1.942145701328456e-06,
"loss": 0.0282,
"step": 666
},
{
"epoch": 9.597122302158274,
"grad_norm": 2.7421058903271676,
"learning_rate": 1.9190383739995933e-06,
"loss": 0.0433,
"step": 667
},
{
"epoch": 9.611510791366907,
"grad_norm": 0.5131708654352847,
"learning_rate": 1.8960547356897997e-06,
"loss": 0.0161,
"step": 668
},
{
"epoch": 9.62589928057554,
"grad_norm": 2.0530874650800106,
"learning_rate": 1.8731951381918257e-06,
"loss": 0.0427,
"step": 669
},
{
"epoch": 9.640287769784173,
"grad_norm": 1.8051899893158643,
"learning_rate": 1.8504599313998196e-06,
"loss": 0.0218,
"step": 670
},
{
"epoch": 9.654676258992806,
"grad_norm": 0.7906143024517177,
"learning_rate": 1.8278494633039756e-06,
"loss": 0.0195,
"step": 671
},
{
"epoch": 9.66906474820144,
"grad_norm": 3.283281196176163,
"learning_rate": 1.8053640799852134e-06,
"loss": 0.0279,
"step": 672
},
{
"epoch": 9.683453237410072,
"grad_norm": 1.4837554695301376,
"learning_rate": 1.783004125609873e-06,
"loss": 0.0303,
"step": 673
},
{
"epoch": 9.697841726618705,
"grad_norm": 3.9513747658965364,
"learning_rate": 1.7607699424244583e-06,
"loss": 0.0453,
"step": 674
},
{
"epoch": 9.712230215827338,
"grad_norm": 2.932528943145533,
"learning_rate": 1.7386618707503822e-06,
"loss": 0.0551,
"step": 675
},
{
"epoch": 9.726618705035971,
"grad_norm": 1.8691452428430109,
"learning_rate": 1.7166802489787704e-06,
"loss": 0.0288,
"step": 676
},
{
"epoch": 9.741007194244604,
"grad_norm": 2.267950496543606,
"learning_rate": 1.6948254135652764e-06,
"loss": 0.0471,
"step": 677
},
{
"epoch": 9.755395683453237,
"grad_norm": 3.362001877990058,
"learning_rate": 1.673097699024938e-06,
"loss": 0.0443,
"step": 678
},
{
"epoch": 9.76978417266187,
"grad_norm": 0.6960568252828435,
"learning_rate": 1.6514974379270465e-06,
"loss": 0.0143,
"step": 679
},
{
"epoch": 9.784172661870503,
"grad_norm": 1.5401636007060717,
"learning_rate": 1.6300249608900654e-06,
"loss": 0.0318,
"step": 680
},
{
"epoch": 9.798561151079136,
"grad_norm": 2.2596630157728765,
"learning_rate": 1.608680596576563e-06,
"loss": 0.0332,
"step": 681
},
{
"epoch": 9.81294964028777,
"grad_norm": 1.1661531548368926,
"learning_rate": 1.587464671688187e-06,
"loss": 0.0187,
"step": 682
},
{
"epoch": 9.827338129496402,
"grad_norm": 1.4871849727237823,
"learning_rate": 1.5663775109606682e-06,
"loss": 0.0283,
"step": 683
},
{
"epoch": 9.841726618705035,
"grad_norm": 0.9194790607066714,
"learning_rate": 1.5454194371588383e-06,
"loss": 0.0166,
"step": 684
},
{
"epoch": 9.85611510791367,
"grad_norm": 2.308580159255162,
"learning_rate": 1.5245907710716912e-06,
"loss": 0.0349,
"step": 685
},
{
"epoch": 9.870503597122303,
"grad_norm": 2.314639880080365,
"learning_rate": 1.5038918315074825e-06,
"loss": 0.0281,
"step": 686
},
{
"epoch": 9.884892086330936,
"grad_norm": 1.619927122217182,
"learning_rate": 1.48332293528885e-06,
"loss": 0.0575,
"step": 687
},
{
"epoch": 9.899280575539569,
"grad_norm": 1.5455675444988652,
"learning_rate": 1.462884397247949e-06,
"loss": 0.0281,
"step": 688
},
{
"epoch": 9.913669064748202,
"grad_norm": 1.496371665596201,
"learning_rate": 1.4425765302216467e-06,
"loss": 0.0262,
"step": 689
},
{
"epoch": 9.928057553956835,
"grad_norm": 0.6246401749746331,
"learning_rate": 1.4223996450467291e-06,
"loss": 0.0155,
"step": 690
},
{
"epoch": 9.942446043165468,
"grad_norm": 10.244899114128767,
"learning_rate": 1.4023540505551514e-06,
"loss": 0.0584,
"step": 691
},
{
"epoch": 9.956834532374101,
"grad_norm": 3.578140786106804,
"learning_rate": 1.382440053569295e-06,
"loss": 0.0403,
"step": 692
},
{
"epoch": 9.971223021582734,
"grad_norm": 1.6841621634178627,
"learning_rate": 1.3626579588972843e-06,
"loss": 0.0347,
"step": 693
},
{
"epoch": 9.985611510791367,
"grad_norm": 1.6886542091112717,
"learning_rate": 1.3430080693283176e-06,
"loss": 0.0252,
"step": 694
},
{
"epoch": 10.0,
"grad_norm": 2.1884226287968,
"learning_rate": 1.3234906856280272e-06,
"loss": 0.0289,
"step": 695
},
{
"epoch": 10.014388489208633,
"grad_norm": 1.8465342865749588,
"learning_rate": 1.30410610653389e-06,
"loss": 0.0317,
"step": 696
},
{
"epoch": 10.028776978417266,
"grad_norm": 1.6653664187493198,
"learning_rate": 1.2848546287506392e-06,
"loss": 0.0327,
"step": 697
},
{
"epoch": 10.043165467625899,
"grad_norm": 2.716731042561852,
"learning_rate": 1.2657365469457295e-06,
"loss": 0.0356,
"step": 698
},
{
"epoch": 10.057553956834532,
"grad_norm": 0.5749921009255655,
"learning_rate": 1.2467521537448258e-06,
"loss": 0.0129,
"step": 699
},
{
"epoch": 10.071942446043165,
"grad_norm": 1.6695913891538687,
"learning_rate": 1.227901739727332e-06,
"loss": 0.0386,
"step": 700
},
{
"epoch": 10.086330935251798,
"grad_norm": 1.480135248408385,
"learning_rate": 1.2091855934219289e-06,
"loss": 0.0325,
"step": 701
},
{
"epoch": 10.100719424460431,
"grad_norm": 1.3476082037635144,
"learning_rate": 1.1906040013021668e-06,
"loss": 0.0143,
"step": 702
},
{
"epoch": 10.115107913669064,
"grad_norm": 1.0352529590879136,
"learning_rate": 1.172157247782083e-06,
"loss": 0.0198,
"step": 703
},
{
"epoch": 10.129496402877697,
"grad_norm": 1.1278501275644894,
"learning_rate": 1.1538456152118394e-06,
"loss": 0.0207,
"step": 704
},
{
"epoch": 10.14388489208633,
"grad_norm": 3.9502198656971963,
"learning_rate": 1.1356693838734134e-06,
"loss": 0.0514,
"step": 705
},
{
"epoch": 10.158273381294965,
"grad_norm": 1.530725864660551,
"learning_rate": 1.1176288319762963e-06,
"loss": 0.0221,
"step": 706
},
{
"epoch": 10.172661870503598,
"grad_norm": 2.2735261159620723,
"learning_rate": 1.0997242356532335e-06,
"loss": 0.0466,
"step": 707
},
{
"epoch": 10.18705035971223,
"grad_norm": 1.0618045257756556,
"learning_rate": 1.0819558689560162e-06,
"loss": 0.0227,
"step": 708
},
{
"epoch": 10.201438848920864,
"grad_norm": 1.6182455175606794,
"learning_rate": 1.0643240038512648e-06,
"loss": 0.0231,
"step": 709
},
{
"epoch": 10.215827338129497,
"grad_norm": 8.00225455212014,
"learning_rate": 1.0468289102162788e-06,
"loss": 0.0681,
"step": 710
},
{
"epoch": 10.23021582733813,
"grad_norm": 2.591497264989186,
"learning_rate": 1.0294708558349031e-06,
"loss": 0.0251,
"step": 711
},
{
"epoch": 10.244604316546763,
"grad_norm": 1.7149356695320543,
"learning_rate": 1.0122501063934266e-06,
"loss": 0.0304,
"step": 712
},
{
"epoch": 10.258992805755396,
"grad_norm": 2.487093456558291,
"learning_rate": 9.951669254765227e-07,
"loss": 0.0316,
"step": 713
},
{
"epoch": 10.273381294964029,
"grad_norm": 1.8372925491948908,
"learning_rate": 9.782215745632063e-07,
"loss": 0.0244,
"step": 714
},
{
"epoch": 10.287769784172662,
"grad_norm": 1.6522089041208652,
"learning_rate": 9.614143130228336e-07,
"loss": 0.0213,
"step": 715
},
{
"epoch": 10.302158273381295,
"grad_norm": 1.388640294189459,
"learning_rate": 9.447453981111377e-07,
"loss": 0.022,
"step": 716
},
{
"epoch": 10.316546762589928,
"grad_norm": 1.8567918043310965,
"learning_rate": 9.282150849662841e-07,
"loss": 0.0277,
"step": 717
},
{
"epoch": 10.33093525179856,
"grad_norm": 1.7484564597114693,
"learning_rate": 9.118236266049707e-07,
"loss": 0.04,
"step": 718
},
{
"epoch": 10.345323741007194,
"grad_norm": 1.3472624690293522,
"learning_rate": 8.955712739185529e-07,
"loss": 0.0284,
"step": 719
},
{
"epoch": 10.359712230215827,
"grad_norm": 2.569862357797179,
"learning_rate": 8.794582756691994e-07,
"loss": 0.0487,
"step": 720
},
{
"epoch": 10.37410071942446,
"grad_norm": 1.1341008302050821,
"learning_rate": 8.634848784860916e-07,
"loss": 0.0259,
"step": 721
},
{
"epoch": 10.388489208633093,
"grad_norm": 3.9556296920276326,
"learning_rate": 8.476513268616471e-07,
"loss": 0.0303,
"step": 722
},
{
"epoch": 10.402877697841726,
"grad_norm": 1.0067776542085125,
"learning_rate": 8.319578631477731e-07,
"loss": 0.0215,
"step": 723
},
{
"epoch": 10.417266187050359,
"grad_norm": 1.5246834403644955,
"learning_rate": 8.164047275521614e-07,
"loss": 0.0279,
"step": 724
},
{
"epoch": 10.431654676258994,
"grad_norm": 1.5007545531516848,
"learning_rate": 8.00992158134607e-07,
"loss": 0.0371,
"step": 725
},
{
"epoch": 10.446043165467627,
"grad_norm": 0.962027429251609,
"learning_rate": 7.857203908033684e-07,
"loss": 0.026,
"step": 726
},
{
"epoch": 10.46043165467626,
"grad_norm": 1.2598114860067426,
"learning_rate": 7.705896593115614e-07,
"loss": 0.0275,
"step": 727
},
{
"epoch": 10.474820143884893,
"grad_norm": 1.5601411747424254,
"learning_rate": 7.556001952535697e-07,
"loss": 0.0336,
"step": 728
},
{
"epoch": 10.489208633093526,
"grad_norm": 2.7709712616630626,
"learning_rate": 7.40752228061502e-07,
"loss": 0.0355,
"step": 729
},
{
"epoch": 10.503597122302159,
"grad_norm": 4.058223338059946,
"learning_rate": 7.260459850016932e-07,
"loss": 0.0587,
"step": 730
},
{
"epoch": 10.517985611510792,
"grad_norm": 0.8541524082485146,
"learning_rate": 7.114816911712131e-07,
"loss": 0.0137,
"step": 731
},
{
"epoch": 10.532374100719425,
"grad_norm": 2.2965443467696542,
"learning_rate": 6.970595694944215e-07,
"loss": 0.0441,
"step": 732
},
{
"epoch": 10.546762589928058,
"grad_norm": 4.196670787721921,
"learning_rate": 6.827798407195629e-07,
"loss": 0.0284,
"step": 733
},
{
"epoch": 10.56115107913669,
"grad_norm": 1.1707179381668629,
"learning_rate": 6.686427234153814e-07,
"loss": 0.0277,
"step": 734
},
{
"epoch": 10.575539568345324,
"grad_norm": 1.7237854874400753,
"learning_rate": 6.546484339677817e-07,
"loss": 0.0229,
"step": 735
},
{
"epoch": 10.589928057553957,
"grad_norm": 1.9963363458117016,
"learning_rate": 6.407971865765095e-07,
"loss": 0.0403,
"step": 736
},
{
"epoch": 10.60431654676259,
"grad_norm": 1.8475932957020715,
"learning_rate": 6.270891932518775e-07,
"loss": 0.0339,
"step": 737
},
{
"epoch": 10.618705035971223,
"grad_norm": 2.207865836762862,
"learning_rate": 6.1352466381152e-07,
"loss": 0.0206,
"step": 738
},
{
"epoch": 10.633093525179856,
"grad_norm": 1.5028628741315406,
"learning_rate": 6.00103805877178e-07,
"loss": 0.0279,
"step": 739
},
{
"epoch": 10.647482014388489,
"grad_norm": 1.6578850661820663,
"learning_rate": 5.868268248715292e-07,
"loss": 0.0458,
"step": 740
},
{
"epoch": 10.661870503597122,
"grad_norm": 2.8376855115850255,
"learning_rate": 5.736939240150363e-07,
"loss": 0.0357,
"step": 741
},
{
"epoch": 10.676258992805755,
"grad_norm": 2.5385576697329313,
"learning_rate": 5.607053043228361e-07,
"loss": 0.0331,
"step": 742
},
{
"epoch": 10.690647482014388,
"grad_norm": 1.2033464016461923,
"learning_rate": 5.478611646016674e-07,
"loss": 0.0207,
"step": 743
},
{
"epoch": 10.70503597122302,
"grad_norm": 0.981803438753484,
"learning_rate": 5.35161701446828e-07,
"loss": 0.0188,
"step": 744
},
{
"epoch": 10.719424460431654,
"grad_norm": 8.071336752025925,
"learning_rate": 5.226071092391616e-07,
"loss": 0.0645,
"step": 745
},
{
"epoch": 10.733812949640289,
"grad_norm": 2.41778659174354,
"learning_rate": 5.101975801420844e-07,
"loss": 0.04,
"step": 746
},
{
"epoch": 10.748201438848922,
"grad_norm": 0.641208094752535,
"learning_rate": 4.979333040986434e-07,
"loss": 0.0131,
"step": 747
},
{
"epoch": 10.762589928057555,
"grad_norm": 6.926642444043881,
"learning_rate": 4.858144688286103e-07,
"loss": 0.0465,
"step": 748
},
{
"epoch": 10.776978417266188,
"grad_norm": 2.1100789184063675,
"learning_rate": 4.7384125982561035e-07,
"loss": 0.0303,
"step": 749
},
{
"epoch": 10.79136690647482,
"grad_norm": 4.2180297630453305,
"learning_rate": 4.6201386035427785e-07,
"loss": 0.0313,
"step": 750
},
{
"epoch": 10.805755395683454,
"grad_norm": 2.0665863271791167,
"learning_rate": 4.503324514474483e-07,
"loss": 0.0274,
"step": 751
},
{
"epoch": 10.820143884892087,
"grad_norm": 1.6728982206562557,
"learning_rate": 4.387972119034023e-07,
"loss": 0.0307,
"step": 752
},
{
"epoch": 10.83453237410072,
"grad_norm": 3.4746815846649746,
"learning_rate": 4.274083182831157e-07,
"loss": 0.0355,
"step": 753
},
{
"epoch": 10.848920863309353,
"grad_norm": 1.8969392294440341,
"learning_rate": 4.161659449075572e-07,
"loss": 0.0343,
"step": 754
},
{
"epoch": 10.863309352517986,
"grad_norm": 2.7876842253949112,
"learning_rate": 4.0507026385502747e-07,
"loss": 0.0467,
"step": 755
},
{
"epoch": 10.877697841726619,
"grad_norm": 4.67027301314563,
"learning_rate": 3.9412144495851845e-07,
"loss": 0.0436,
"step": 756
},
{
"epoch": 10.892086330935252,
"grad_norm": 1.399538680812464,
"learning_rate": 3.833196558031216e-07,
"loss": 0.0217,
"step": 757
},
{
"epoch": 10.906474820143885,
"grad_norm": 1.7409190227121822,
"learning_rate": 3.7266506172345507e-07,
"loss": 0.0372,
"step": 758
},
{
"epoch": 10.920863309352518,
"grad_norm": 2.1365410486540974,
"learning_rate": 3.621578258011338e-07,
"loss": 0.0245,
"step": 759
},
{
"epoch": 10.93525179856115,
"grad_norm": 3.4216815696618257,
"learning_rate": 3.517981088622768e-07,
"loss": 0.0447,
"step": 760
},
{
"epoch": 10.949640287769784,
"grad_norm": 2.5234742535562695,
"learning_rate": 3.4158606947504944e-07,
"loss": 0.0397,
"step": 761
},
{
"epoch": 10.964028776978417,
"grad_norm": 4.126068958805298,
"learning_rate": 3.3152186394722506e-07,
"loss": 0.0538,
"step": 762
},
{
"epoch": 10.97841726618705,
"grad_norm": 2.2465535716930614,
"learning_rate": 3.2160564632380043e-07,
"loss": 0.0693,
"step": 763
},
{
"epoch": 10.992805755395683,
"grad_norm": 3.814910469078625,
"learning_rate": 3.118375683846353e-07,
"loss": 0.0713,
"step": 764
},
{
"epoch": 11.007194244604317,
"grad_norm": 0.870573413202881,
"learning_rate": 3.022177796421322e-07,
"loss": 0.0141,
"step": 765
},
{
"epoch": 11.02158273381295,
"grad_norm": 2.5693617992072033,
"learning_rate": 2.9274642733894577e-07,
"loss": 0.0341,
"step": 766
},
{
"epoch": 11.035971223021583,
"grad_norm": 1.6022862996501166,
"learning_rate": 2.834236564457271e-07,
"loss": 0.0354,
"step": 767
},
{
"epoch": 11.050359712230216,
"grad_norm": 1.0198868538110064,
"learning_rate": 2.742496096589076e-07,
"loss": 0.0166,
"step": 768
},
{
"epoch": 11.06474820143885,
"grad_norm": 1.9386025908425748,
"learning_rate": 2.652244273985127e-07,
"loss": 0.0406,
"step": 769
},
{
"epoch": 11.079136690647482,
"grad_norm": 1.506641792856537,
"learning_rate": 2.5634824780601753e-07,
"loss": 0.0215,
"step": 770
},
{
"epoch": 11.093525179856115,
"grad_norm": 2.8974005417112054,
"learning_rate": 2.4762120674222456e-07,
"loss": 0.0337,
"step": 771
},
{
"epoch": 11.107913669064748,
"grad_norm": 3.2363471525554632,
"learning_rate": 2.390434377851925e-07,
"loss": 0.0659,
"step": 772
},
{
"epoch": 11.122302158273381,
"grad_norm": 1.7250018435406151,
"learning_rate": 2.3061507222818303e-07,
"loss": 0.0264,
"step": 773
},
{
"epoch": 11.136690647482014,
"grad_norm": 1.3716552652282374,
"learning_rate": 2.2233623907765956e-07,
"loss": 0.0301,
"step": 774
},
{
"epoch": 11.151079136690647,
"grad_norm": 4.081730598474069,
"learning_rate": 2.1420706505130728e-07,
"loss": 0.0296,
"step": 775
},
{
"epoch": 11.16546762589928,
"grad_norm": 2.1837875002466993,
"learning_rate": 2.0622767457609384e-07,
"loss": 0.0487,
"step": 776
},
{
"epoch": 11.179856115107913,
"grad_norm": 2.5403013467016016,
"learning_rate": 1.983981897863685e-07,
"loss": 0.0298,
"step": 777
},
{
"epoch": 11.194244604316546,
"grad_norm": 0.9102126378246074,
"learning_rate": 1.9071873052198818e-07,
"loss": 0.0198,
"step": 778
},
{
"epoch": 11.20863309352518,
"grad_norm": 1.3128849233164,
"learning_rate": 1.8318941432648785e-07,
"loss": 0.0193,
"step": 779
},
{
"epoch": 11.223021582733812,
"grad_norm": 1.4735468637747058,
"learning_rate": 1.7581035644527623e-07,
"loss": 0.0243,
"step": 780
},
{
"epoch": 11.237410071942445,
"grad_norm": 1.8021548024444698,
"learning_rate": 1.6858166982387624e-07,
"loss": 0.0205,
"step": 781
},
{
"epoch": 11.251798561151078,
"grad_norm": 2.25011481794677,
"learning_rate": 1.6150346510619197e-07,
"loss": 0.0401,
"step": 782
},
{
"epoch": 11.266187050359711,
"grad_norm": 3.4908449211884482,
"learning_rate": 1.5457585063282322e-07,
"loss": 0.0739,
"step": 783
},
{
"epoch": 11.280575539568344,
"grad_norm": 0.8061010306090391,
"learning_rate": 1.4779893243939358e-07,
"loss": 0.0143,
"step": 784
},
{
"epoch": 11.29496402877698,
"grad_norm": 1.6504383979055732,
"learning_rate": 1.4117281425494178e-07,
"loss": 0.0248,
"step": 785
},
{
"epoch": 11.309352517985612,
"grad_norm": 1.402157952033158,
"learning_rate": 1.3469759750032508e-07,
"loss": 0.023,
"step": 786
},
{
"epoch": 11.323741007194245,
"grad_norm": 2.645079301698734,
"learning_rate": 1.2837338128666942e-07,
"loss": 0.0647,
"step": 787
},
{
"epoch": 11.338129496402878,
"grad_norm": 0.7567449101065951,
"learning_rate": 1.2220026241385296e-07,
"loss": 0.0156,
"step": 788
},
{
"epoch": 11.352517985611511,
"grad_norm": 0.6465755222813994,
"learning_rate": 1.1617833536902489e-07,
"loss": 0.0179,
"step": 789
},
{
"epoch": 11.366906474820144,
"grad_norm": 1.8698065932207548,
"learning_rate": 1.1030769232515559e-07,
"loss": 0.023,
"step": 790
},
{
"epoch": 11.381294964028777,
"grad_norm": 1.7988268213081555,
"learning_rate": 1.0458842313963102e-07,
"loss": 0.0411,
"step": 791
},
{
"epoch": 11.39568345323741,
"grad_norm": 3.6957888652655444,
"learning_rate": 9.902061535287278e-08,
"loss": 0.0338,
"step": 792
},
{
"epoch": 11.410071942446043,
"grad_norm": 2.7446261934693443,
"learning_rate": 9.360435418700131e-08,
"loss": 0.0324,
"step": 793
},
{
"epoch": 11.424460431654676,
"grad_norm": 0.7757882042918355,
"learning_rate": 8.83397225445315e-08,
"loss": 0.0281,
"step": 794
},
{
"epoch": 11.43884892086331,
"grad_norm": 2.857395014664893,
"learning_rate": 8.322680100710023e-08,
"loss": 0.0336,
"step": 795
},
{
"epoch": 11.453237410071942,
"grad_norm": 0.8599695116705092,
"learning_rate": 7.826566783423639e-08,
"loss": 0.0153,
"step": 796
},
{
"epoch": 11.467625899280575,
"grad_norm": 1.2009242568158411,
"learning_rate": 7.345639896216173e-08,
"loss": 0.0218,
"step": 797
},
{
"epoch": 11.482014388489208,
"grad_norm": 1.8513393342942366,
"learning_rate": 6.879906800262848e-08,
"loss": 0.0248,
"step": 798
},
{
"epoch": 11.496402877697841,
"grad_norm": 1.2374041080258498,
"learning_rate": 6.429374624179474e-08,
"loss": 0.0187,
"step": 799
},
{
"epoch": 11.510791366906474,
"grad_norm": 1.493116059019553,
"learning_rate": 5.994050263912976e-08,
"loss": 0.0228,
"step": 800
},
{
"epoch": 11.525179856115107,
"grad_norm": 1.9511941731868687,
"learning_rate": 5.573940382636145e-08,
"loss": 0.05,
"step": 801
},
{
"epoch": 11.53956834532374,
"grad_norm": 1.6434577538884052,
"learning_rate": 5.169051410645276e-08,
"loss": 0.0183,
"step": 802
},
{
"epoch": 11.553956834532373,
"grad_norm": 3.362352075361728,
"learning_rate": 4.7793895452623584e-08,
"loss": 0.037,
"step": 803
},
{
"epoch": 11.568345323741006,
"grad_norm": 5.575378703603965,
"learning_rate": 4.4049607507397066e-08,
"loss": 0.0794,
"step": 804
},
{
"epoch": 11.582733812949641,
"grad_norm": 3.109823706873304,
"learning_rate": 4.045770758168699e-08,
"loss": 0.0317,
"step": 805
},
{
"epoch": 11.597122302158274,
"grad_norm": 2.2826191924472394,
"learning_rate": 3.701825065392184e-08,
"loss": 0.0293,
"step": 806
},
{
"epoch": 11.611510791366907,
"grad_norm": 2.018015360147881,
"learning_rate": 3.3731289369206556e-08,
"loss": 0.0398,
"step": 807
},
{
"epoch": 11.62589928057554,
"grad_norm": 8.047431870090547,
"learning_rate": 3.059687403850986e-08,
"loss": 0.1134,
"step": 808
},
{
"epoch": 11.640287769784173,
"grad_norm": 2.298672016403596,
"learning_rate": 2.761505263789821e-08,
"loss": 0.0243,
"step": 809
},
{
"epoch": 11.654676258992806,
"grad_norm": 2.5580107049006315,
"learning_rate": 2.4785870807803036e-08,
"loss": 0.0363,
"step": 810
},
{
"epoch": 11.66906474820144,
"grad_norm": 2.1820443354655548,
"learning_rate": 2.2109371852317985e-08,
"loss": 0.0321,
"step": 811
},
{
"epoch": 11.683453237410072,
"grad_norm": 1.8784576576635295,
"learning_rate": 1.9585596738539436e-08,
"loss": 0.0294,
"step": 812
},
{
"epoch": 11.697841726618705,
"grad_norm": 1.2733181144368895,
"learning_rate": 1.7214584095937015e-08,
"loss": 0.0322,
"step": 813
},
{
"epoch": 11.712230215827338,
"grad_norm": 1.4234884359583329,
"learning_rate": 1.4996370215765165e-08,
"loss": 0.0234,
"step": 814
},
{
"epoch": 11.726618705035971,
"grad_norm": 1.3429094276944762,
"learning_rate": 1.2930989050504717e-08,
"loss": 0.0166,
"step": 815
},
{
"epoch": 11.741007194244604,
"grad_norm": 0.9942863729919723,
"learning_rate": 1.101847221334551e-08,
"loss": 0.0178,
"step": 816
},
{
"epoch": 11.755395683453237,
"grad_norm": 4.8436461710096514,
"learning_rate": 9.25884897770013e-09,
"loss": 0.0356,
"step": 817
},
{
"epoch": 11.76978417266187,
"grad_norm": 3.3015502134325945,
"learning_rate": 7.652146276759808e-09,
"loss": 0.0656,
"step": 818
},
{
"epoch": 11.784172661870503,
"grad_norm": 1.6919061388445014,
"learning_rate": 6.1983887030769855e-09,
"loss": 0.0243,
"step": 819
},
{
"epoch": 11.798561151079136,
"grad_norm": 2.579249722184542,
"learning_rate": 4.897598508192269e-09,
"loss": 0.0411,
"step": 820
},
{
"epoch": 11.81294964028777,
"grad_norm": 1.8241696897466821,
"learning_rate": 3.749795602294715e-09,
"loss": 0.0294,
"step": 821
},
{
"epoch": 11.827338129496402,
"grad_norm": 2.10962756409833,
"learning_rate": 2.7549975539120644e-09,
"loss": 0.0307,
"step": 822
},
{
"epoch": 11.841726618705035,
"grad_norm": 2.127397019212184,
"learning_rate": 1.9132195896498505e-09,
"loss": 0.0287,
"step": 823
},
{
"epoch": 11.85611510791367,
"grad_norm": 1.7438775653570628,
"learning_rate": 1.2244745939493651e-09,
"loss": 0.023,
"step": 824
},
{
"epoch": 11.870503597122303,
"grad_norm": 1.7148926403499662,
"learning_rate": 6.887731088978111e-10,
"loss": 0.0321,
"step": 825
},
{
"epoch": 11.884892086330936,
"grad_norm": 3.3795126925432015,
"learning_rate": 3.0612333406176976e-10,
"loss": 0.0481,
"step": 826
},
{
"epoch": 11.899280575539569,
"grad_norm": 2.9388316113176565,
"learning_rate": 7.65311263661861e-11,
"loss": 0.0546,
"step": 827
},
{
"epoch": 11.913669064748202,
"grad_norm": 6.73693905281281,
"learning_rate": 0.0,
"loss": 0.0567,
"step": 828
}
],
"logging_steps": 1.0,
"max_steps": 828,
"num_input_tokens_seen": 0,
"num_train_epochs": 12,
"save_steps": 200.0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 418903697129472.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}