{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 764,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002617801047120419,
"grad_norm": 12.144990537050639,
"learning_rate": 5e-08,
"loss": 1.2113,
"step": 1
},
{
"epoch": 0.005235602094240838,
"grad_norm": 13.176179220886064,
"learning_rate": 1e-07,
"loss": 1.3287,
"step": 2
},
{
"epoch": 0.007853403141361256,
"grad_norm": 12.924711168956318,
"learning_rate": 1.5e-07,
"loss": 1.2798,
"step": 3
},
{
"epoch": 0.010471204188481676,
"grad_norm": 12.21691289343528,
"learning_rate": 2e-07,
"loss": 1.2284,
"step": 4
},
{
"epoch": 0.013089005235602094,
"grad_norm": 11.997438945474764,
"learning_rate": 2.5e-07,
"loss": 1.2477,
"step": 5
},
{
"epoch": 0.015706806282722512,
"grad_norm": 12.217585739384367,
"learning_rate": 3e-07,
"loss": 1.2437,
"step": 6
},
{
"epoch": 0.01832460732984293,
"grad_norm": 11.875859411166187,
"learning_rate": 3.5e-07,
"loss": 1.2108,
"step": 7
},
{
"epoch": 0.020942408376963352,
"grad_norm": 11.64973795949067,
"learning_rate": 4e-07,
"loss": 1.2131,
"step": 8
},
{
"epoch": 0.02356020942408377,
"grad_norm": 12.45663018271384,
"learning_rate": 4.5e-07,
"loss": 1.2516,
"step": 9
},
{
"epoch": 0.02617801047120419,
"grad_norm": 11.239158096874556,
"learning_rate": 5e-07,
"loss": 1.1734,
"step": 10
},
{
"epoch": 0.028795811518324606,
"grad_norm": 11.710604360468333,
"learning_rate": 5.5e-07,
"loss": 1.2206,
"step": 11
},
{
"epoch": 0.031413612565445025,
"grad_norm": 11.118095241592131,
"learning_rate": 6e-07,
"loss": 1.2074,
"step": 12
},
{
"epoch": 0.034031413612565446,
"grad_norm": 11.770346865985067,
"learning_rate": 6.5e-07,
"loss": 1.2615,
"step": 13
},
{
"epoch": 0.03664921465968586,
"grad_norm": 11.44615754399185,
"learning_rate": 7e-07,
"loss": 1.2183,
"step": 14
},
{
"epoch": 0.03926701570680628,
"grad_norm": 9.987481097984933,
"learning_rate": 7.5e-07,
"loss": 1.1378,
"step": 15
},
{
"epoch": 0.041884816753926704,
"grad_norm": 10.087076189949745,
"learning_rate": 8e-07,
"loss": 1.2098,
"step": 16
},
{
"epoch": 0.04450261780104712,
"grad_norm": 10.02461667279938,
"learning_rate": 8.499999999999999e-07,
"loss": 1.1835,
"step": 17
},
{
"epoch": 0.04712041884816754,
"grad_norm": 8.79462248217657,
"learning_rate": 9e-07,
"loss": 1.1356,
"step": 18
},
{
"epoch": 0.049738219895287955,
"grad_norm": 8.628781591254402,
"learning_rate": 9.499999999999999e-07,
"loss": 1.1735,
"step": 19
},
{
"epoch": 0.05235602094240838,
"grad_norm": 7.738647331698055,
"learning_rate": 1e-06,
"loss": 1.1868,
"step": 20
},
{
"epoch": 0.0549738219895288,
"grad_norm": 6.233136791291174,
"learning_rate": 1.05e-06,
"loss": 1.151,
"step": 21
},
{
"epoch": 0.05759162303664921,
"grad_norm": 5.834711461409201,
"learning_rate": 1.1e-06,
"loss": 1.129,
"step": 22
},
{
"epoch": 0.060209424083769635,
"grad_norm": 5.0077704654863595,
"learning_rate": 1.1499999999999998e-06,
"loss": 1.1388,
"step": 23
},
{
"epoch": 0.06282722513089005,
"grad_norm": 3.854978376544968,
"learning_rate": 1.2e-06,
"loss": 1.052,
"step": 24
},
{
"epoch": 0.06544502617801047,
"grad_norm": 3.534169938073168,
"learning_rate": 1.2499999999999999e-06,
"loss": 1.0727,
"step": 25
},
{
"epoch": 0.06806282722513089,
"grad_norm": 3.2514927992093274,
"learning_rate": 1.3e-06,
"loss": 1.1347,
"step": 26
},
{
"epoch": 0.07068062827225131,
"grad_norm": 3.102034937556906,
"learning_rate": 1.35e-06,
"loss": 1.1357,
"step": 27
},
{
"epoch": 0.07329842931937172,
"grad_norm": 2.6056648897843577,
"learning_rate": 1.4e-06,
"loss": 1.0368,
"step": 28
},
{
"epoch": 0.07591623036649214,
"grad_norm": 5.81258653165808,
"learning_rate": 1.4499999999999999e-06,
"loss": 1.1151,
"step": 29
},
{
"epoch": 0.07853403141361257,
"grad_norm": 3.4877693307584723,
"learning_rate": 1.5e-06,
"loss": 1.1164,
"step": 30
},
{
"epoch": 0.08115183246073299,
"grad_norm": 4.02075317392796,
"learning_rate": 1.55e-06,
"loss": 1.1174,
"step": 31
},
{
"epoch": 0.08376963350785341,
"grad_norm": 3.7625924042569543,
"learning_rate": 1.6e-06,
"loss": 1.0247,
"step": 32
},
{
"epoch": 0.08638743455497382,
"grad_norm": 3.698847952235366,
"learning_rate": 1.6499999999999999e-06,
"loss": 1.0578,
"step": 33
},
{
"epoch": 0.08900523560209424,
"grad_norm": 3.4078618253433444,
"learning_rate": 1.6999999999999998e-06,
"loss": 1.0513,
"step": 34
},
{
"epoch": 0.09162303664921466,
"grad_norm": 3.2909646643289325,
"learning_rate": 1.75e-06,
"loss": 1.1018,
"step": 35
},
{
"epoch": 0.09424083769633508,
"grad_norm": 2.872238177892186,
"learning_rate": 1.8e-06,
"loss": 1.0762,
"step": 36
},
{
"epoch": 0.0968586387434555,
"grad_norm": 2.77121115764399,
"learning_rate": 1.85e-06,
"loss": 1.0735,
"step": 37
},
{
"epoch": 0.09947643979057591,
"grad_norm": 2.549417261476137,
"learning_rate": 1.8999999999999998e-06,
"loss": 1.0632,
"step": 38
},
{
"epoch": 0.10209424083769633,
"grad_norm": 2.190358595015251,
"learning_rate": 1.95e-06,
"loss": 1.0227,
"step": 39
},
{
"epoch": 0.10471204188481675,
"grad_norm": 2.5027525699752227,
"learning_rate": 2e-06,
"loss": 1.0837,
"step": 40
},
{
"epoch": 0.10732984293193717,
"grad_norm": 2.535128732687191,
"learning_rate": 1.9999905856154088e-06,
"loss": 1.0799,
"step": 41
},
{
"epoch": 0.1099476439790576,
"grad_norm": 1.7534174402541685,
"learning_rate": 1.999962342638896e-06,
"loss": 1.0124,
"step": 42
},
{
"epoch": 0.112565445026178,
"grad_norm": 2.0631738564308377,
"learning_rate": 1.9999152716022427e-06,
"loss": 1.1008,
"step": 43
},
{
"epoch": 0.11518324607329843,
"grad_norm": 2.005091740871665,
"learning_rate": 1.9998493733917385e-06,
"loss": 1.0374,
"step": 44
},
{
"epoch": 0.11780104712041885,
"grad_norm": 1.7413337245159195,
"learning_rate": 1.999764649248165e-06,
"loss": 1.0398,
"step": 45
},
{
"epoch": 0.12041884816753927,
"grad_norm": 1.745666445155081,
"learning_rate": 1.999661100766774e-06,
"loss": 1.0645,
"step": 46
},
{
"epoch": 0.12303664921465969,
"grad_norm": 1.8987977735185813,
"learning_rate": 1.999538729897256e-06,
"loss": 1.0614,
"step": 47
},
{
"epoch": 0.1256544502617801,
"grad_norm": 1.711040452338805,
"learning_rate": 1.9993975389437036e-06,
"loss": 1.0332,
"step": 48
},
{
"epoch": 0.12827225130890052,
"grad_norm": 1.6087094017934318,
"learning_rate": 1.999237530564569e-06,
"loss": 1.0008,
"step": 49
},
{
"epoch": 0.13089005235602094,
"grad_norm": 2.0042351547368686,
"learning_rate": 1.9990587077726125e-06,
"loss": 1.0768,
"step": 50
},
{
"epoch": 0.13350785340314136,
"grad_norm": 1.5264346842814094,
"learning_rate": 1.998861073934848e-06,
"loss": 1.0213,
"step": 51
},
{
"epoch": 0.13612565445026178,
"grad_norm": 1.7583198075470237,
"learning_rate": 1.998644632772477e-06,
"loss": 1.0103,
"step": 52
},
{
"epoch": 0.1387434554973822,
"grad_norm": 1.6841921629086405,
"learning_rate": 1.99840938836082e-06,
"loss": 1.0116,
"step": 53
},
{
"epoch": 0.14136125654450263,
"grad_norm": 9.859700163848672,
"learning_rate": 1.9981553451292393e-06,
"loss": 1.0429,
"step": 54
},
{
"epoch": 0.14397905759162305,
"grad_norm": 1.6093588714569955,
"learning_rate": 1.9978825078610574e-06,
"loss": 0.9722,
"step": 55
},
{
"epoch": 0.14659685863874344,
"grad_norm": 1.6921175654924636,
"learning_rate": 1.9975908816934638e-06,
"loss": 1.045,
"step": 56
},
{
"epoch": 0.14921465968586387,
"grad_norm": 1.4401701807904859,
"learning_rate": 1.9972804721174198e-06,
"loss": 1.0094,
"step": 57
},
{
"epoch": 0.1518324607329843,
"grad_norm": 1.7314403887490193,
"learning_rate": 1.996951284977556e-06,
"loss": 0.9334,
"step": 58
},
{
"epoch": 0.1544502617801047,
"grad_norm": 1.4035559785329323,
"learning_rate": 1.9966033264720613e-06,
"loss": 0.9635,
"step": 59
},
{
"epoch": 0.15706806282722513,
"grad_norm": 1.4996157907491292,
"learning_rate": 1.9962366031525663e-06,
"loss": 1.0347,
"step": 60
},
{
"epoch": 0.15968586387434555,
"grad_norm": 3.8626755595391287,
"learning_rate": 1.9958511219240188e-06,
"loss": 0.9453,
"step": 61
},
{
"epoch": 0.16230366492146597,
"grad_norm": 1.5545221856692377,
"learning_rate": 1.9954468900445565e-06,
"loss": 1.0431,
"step": 62
},
{
"epoch": 0.1649214659685864,
"grad_norm": 1.50894331448368,
"learning_rate": 1.995023915125368e-06,
"loss": 1.056,
"step": 63
},
{
"epoch": 0.16753926701570682,
"grad_norm": 1.6123663294471207,
"learning_rate": 1.9945822051305507e-06,
"loss": 0.9744,
"step": 64
},
{
"epoch": 0.17015706806282724,
"grad_norm": 2.002886641603634,
"learning_rate": 1.9941217683769596e-06,
"loss": 1.0396,
"step": 65
},
{
"epoch": 0.17277486910994763,
"grad_norm": 1.4061639592638548,
"learning_rate": 1.9936426135340527e-06,
"loss": 1.0162,
"step": 66
},
{
"epoch": 0.17539267015706805,
"grad_norm": 1.4744552229885421,
"learning_rate": 1.9931447496237255e-06,
"loss": 1.0339,
"step": 67
},
{
"epoch": 0.17801047120418848,
"grad_norm": 1.3373753426511383,
"learning_rate": 1.9926281860201426e-06,
"loss": 1.0123,
"step": 68
},
{
"epoch": 0.1806282722513089,
"grad_norm": 1.8346771940527866,
"learning_rate": 1.992092932449561e-06,
"loss": 1.0218,
"step": 69
},
{
"epoch": 0.18324607329842932,
"grad_norm": 1.5683702537742212,
"learning_rate": 1.9915389989901473e-06,
"loss": 0.9868,
"step": 70
},
{
"epoch": 0.18586387434554974,
"grad_norm": 1.6088369103380997,
"learning_rate": 1.9909663960717854e-06,
"loss": 1.0082,
"step": 71
},
{
"epoch": 0.18848167539267016,
"grad_norm": 1.6226263180260028,
"learning_rate": 1.9903751344758845e-06,
"loss": 1.0272,
"step": 72
},
{
"epoch": 0.19109947643979058,
"grad_norm": 1.519358495108808,
"learning_rate": 1.9897652253351726e-06,
"loss": 1.0006,
"step": 73
},
{
"epoch": 0.193717277486911,
"grad_norm": 1.7246977984441099,
"learning_rate": 1.9891366801334875e-06,
"loss": 1.0071,
"step": 74
},
{
"epoch": 0.19633507853403143,
"grad_norm": 1.885654171226984,
"learning_rate": 1.9884895107055627e-06,
"loss": 0.9659,
"step": 75
},
{
"epoch": 0.19895287958115182,
"grad_norm": 1.4255056172161928,
"learning_rate": 1.987823729236801e-06,
"loss": 0.9791,
"step": 76
},
{
"epoch": 0.20157068062827224,
"grad_norm": 1.4127676458511806,
"learning_rate": 1.9871393482630486e-06,
"loss": 0.9982,
"step": 77
},
{
"epoch": 0.20418848167539266,
"grad_norm": 1.6038922300951168,
"learning_rate": 1.9864363806703567e-06,
"loss": 1.0035,
"step": 78
},
{
"epoch": 0.20680628272251309,
"grad_norm": 1.6873794773106967,
"learning_rate": 1.9857148396947403e-06,
"loss": 1.0059,
"step": 79
},
{
"epoch": 0.2094240837696335,
"grad_norm": 1.4769795174016325,
"learning_rate": 1.984974738921927e-06,
"loss": 0.999,
"step": 80
},
{
"epoch": 0.21204188481675393,
"grad_norm": 1.595922250055431,
"learning_rate": 1.9842160922871043e-06,
"loss": 0.9853,
"step": 81
},
{
"epoch": 0.21465968586387435,
"grad_norm": 2.267409709264766,
"learning_rate": 1.9834389140746535e-06,
"loss": 1.0104,
"step": 82
},
{
"epoch": 0.21727748691099477,
"grad_norm": 1.4934157955551683,
"learning_rate": 1.982643218917885e-06,
"loss": 0.9612,
"step": 83
},
{
"epoch": 0.2198952879581152,
"grad_norm": 1.7746131869025623,
"learning_rate": 1.9818290217987584e-06,
"loss": 0.9525,
"step": 84
},
{
"epoch": 0.22251308900523561,
"grad_norm": 1.7233411954456554,
"learning_rate": 1.980996338047604e-06,
"loss": 1.035,
"step": 85
},
{
"epoch": 0.225130890052356,
"grad_norm": 1.4012107807745622,
"learning_rate": 1.980145183342831e-06,
"loss": 0.9657,
"step": 86
},
{
"epoch": 0.22774869109947643,
"grad_norm": 1.5160148683515746,
"learning_rate": 1.9792755737106357e-06,
"loss": 0.9691,
"step": 87
},
{
"epoch": 0.23036649214659685,
"grad_norm": 1.3553031139205232,
"learning_rate": 1.978387525524697e-06,
"loss": 0.9732,
"step": 88
},
{
"epoch": 0.23298429319371727,
"grad_norm": 2.3559259408150144,
"learning_rate": 1.9774810555058694e-06,
"loss": 0.968,
"step": 89
},
{
"epoch": 0.2356020942408377,
"grad_norm": 2.0405648874947144,
"learning_rate": 1.976556180721867e-06,
"loss": 1.0217,
"step": 90
},
{
"epoch": 0.23821989528795812,
"grad_norm": 1.289980797855502,
"learning_rate": 1.975612918586944e-06,
"loss": 0.973,
"step": 91
},
{
"epoch": 0.24083769633507854,
"grad_norm": 1.4893463969411602,
"learning_rate": 1.9746512868615655e-06,
"loss": 1.0109,
"step": 92
},
{
"epoch": 0.24345549738219896,
"grad_norm": 2.074068588481718,
"learning_rate": 1.973671303652073e-06,
"loss": 0.9831,
"step": 93
},
{
"epoch": 0.24607329842931938,
"grad_norm": 1.4717466896018212,
"learning_rate": 1.972672987410345e-06,
"loss": 0.9815,
"step": 94
},
{
"epoch": 0.2486910994764398,
"grad_norm": 1.3362580099120316,
"learning_rate": 1.971656356933446e-06,
"loss": 0.9627,
"step": 95
},
{
"epoch": 0.2513089005235602,
"grad_norm": 1.6316029366291782,
"learning_rate": 1.970621431363278e-06,
"loss": 0.9657,
"step": 96
},
{
"epoch": 0.25392670157068065,
"grad_norm": 1.7671135565160097,
"learning_rate": 1.9695682301862154e-06,
"loss": 0.9219,
"step": 97
},
{
"epoch": 0.25654450261780104,
"grad_norm": 1.5966004233267017,
"learning_rate": 1.9684967732327396e-06,
"loss": 1.0045,
"step": 98
},
{
"epoch": 0.2591623036649215,
"grad_norm": 1.4806542129023226,
"learning_rate": 1.9674070806770667e-06,
"loss": 0.9732,
"step": 99
},
{
"epoch": 0.2617801047120419,
"grad_norm": 1.5115625649689057,
"learning_rate": 1.9662991730367663e-06,
"loss": 0.9692,
"step": 100
},
{
"epoch": 0.2643979057591623,
"grad_norm": 1.6408654241651082,
"learning_rate": 1.965173071172375e-06,
"loss": 1.0782,
"step": 101
},
{
"epoch": 0.2670157068062827,
"grad_norm": 1.58266015019036,
"learning_rate": 1.9640287962870057e-06,
"loss": 0.9532,
"step": 102
},
{
"epoch": 0.2696335078534031,
"grad_norm": 1.39253004882154,
"learning_rate": 1.962866369925946e-06,
"loss": 0.9742,
"step": 103
},
{
"epoch": 0.27225130890052357,
"grad_norm": 1.4246820152864472,
"learning_rate": 1.9616858139762532e-06,
"loss": 1.0196,
"step": 104
},
{
"epoch": 0.27486910994764396,
"grad_norm": 1.5013590811914759,
"learning_rate": 1.960487150666343e-06,
"loss": 1.0238,
"step": 105
},
{
"epoch": 0.2774869109947644,
"grad_norm": 1.440692442657796,
"learning_rate": 1.95927040256557e-06,
"loss": 1.0166,
"step": 106
},
{
"epoch": 0.2801047120418848,
"grad_norm": 1.5426181884013244,
"learning_rate": 1.958035592583803e-06,
"loss": 0.9635,
"step": 107
},
{
"epoch": 0.28272251308900526,
"grad_norm": 1.3715243788491147,
"learning_rate": 1.956782743970995e-06,
"loss": 0.972,
"step": 108
},
{
"epoch": 0.28534031413612565,
"grad_norm": 1.6076440082600667,
"learning_rate": 1.955511880316743e-06,
"loss": 0.9634,
"step": 109
},
{
"epoch": 0.2879581151832461,
"grad_norm": 1.322996372089846,
"learning_rate": 1.9542230255498453e-06,
"loss": 0.946,
"step": 110
},
{
"epoch": 0.2905759162303665,
"grad_norm": 1.6508002306297551,
"learning_rate": 1.9529162039378505e-06,
"loss": 1.0146,
"step": 111
},
{
"epoch": 0.2931937172774869,
"grad_norm": 1.9140720133949438,
"learning_rate": 1.951591440086602e-06,
"loss": 0.9794,
"step": 112
},
{
"epoch": 0.29581151832460734,
"grad_norm": 1.301517922409106,
"learning_rate": 1.9502487589397717e-06,
"loss": 0.9955,
"step": 113
},
{
"epoch": 0.29842931937172773,
"grad_norm": 1.6752384604989274,
"learning_rate": 1.948888185778393e-06,
"loss": 0.9614,
"step": 114
},
{
"epoch": 0.3010471204188482,
"grad_norm": 1.6669201309688906,
"learning_rate": 1.947509746220385e-06,
"loss": 0.9596,
"step": 115
},
{
"epoch": 0.3036649214659686,
"grad_norm": 1.6186884886800548,
"learning_rate": 1.9461134662200666e-06,
"loss": 0.966,
"step": 116
},
{
"epoch": 0.306282722513089,
"grad_norm": 1.4525901118285445,
"learning_rate": 1.9446993720676725e-06,
"loss": 0.9637,
"step": 117
},
{
"epoch": 0.3089005235602094,
"grad_norm": 1.3926685659345446,
"learning_rate": 1.9432674903888547e-06,
"loss": 0.9562,
"step": 118
},
{
"epoch": 0.31151832460732987,
"grad_norm": 1.5687485197442492,
"learning_rate": 1.941817848144183e-06,
"loss": 1.003,
"step": 119
},
{
"epoch": 0.31413612565445026,
"grad_norm": 1.3452828778510757,
"learning_rate": 1.9403504726286365e-06,
"loss": 0.976,
"step": 120
},
{
"epoch": 0.31675392670157065,
"grad_norm": 1.3816617452534992,
"learning_rate": 1.93886539147109e-06,
"loss": 0.9599,
"step": 121
},
{
"epoch": 0.3193717277486911,
"grad_norm": 1.481897037058517,
"learning_rate": 1.9373626326337944e-06,
"loss": 0.9731,
"step": 122
},
{
"epoch": 0.3219895287958115,
"grad_norm": 1.413329266686157,
"learning_rate": 1.9358422244118486e-06,
"loss": 0.9783,
"step": 123
},
{
"epoch": 0.32460732984293195,
"grad_norm": 2.9010127845249385,
"learning_rate": 1.9343041954326677e-06,
"loss": 0.9777,
"step": 124
},
{
"epoch": 0.32722513089005234,
"grad_norm": 1.2853594407436248,
"learning_rate": 1.932748574655445e-06,
"loss": 0.9784,
"step": 125
},
{
"epoch": 0.3298429319371728,
"grad_norm": 1.6563678696752941,
"learning_rate": 1.931175391370605e-06,
"loss": 0.9591,
"step": 126
},
{
"epoch": 0.3324607329842932,
"grad_norm": 1.4153593487098186,
"learning_rate": 1.929584675199252e-06,
"loss": 0.9433,
"step": 127
},
{
"epoch": 0.33507853403141363,
"grad_norm": 1.4853036785401517,
"learning_rate": 1.927976456092614e-06,
"loss": 0.9195,
"step": 128
},
{
"epoch": 0.337696335078534,
"grad_norm": 1.6653698839653936,
"learning_rate": 1.9263507643314775e-06,
"loss": 0.9711,
"step": 129
},
{
"epoch": 0.3403141361256545,
"grad_norm": 1.5513246503570877,
"learning_rate": 1.9247076305256173e-06,
"loss": 1.0266,
"step": 130
},
{
"epoch": 0.34293193717277487,
"grad_norm": 1.9279366482258156,
"learning_rate": 1.923047085613221e-06,
"loss": 1.0001,
"step": 131
},
{
"epoch": 0.34554973821989526,
"grad_norm": 1.4339951925997667,
"learning_rate": 1.9213691608603046e-06,
"loss": 1.003,
"step": 132
},
{
"epoch": 0.3481675392670157,
"grad_norm": 1.3521774084798501,
"learning_rate": 1.9196738878601262e-06,
"loss": 0.9748,
"step": 133
},
{
"epoch": 0.3507853403141361,
"grad_norm": 1.4539030309601608,
"learning_rate": 1.9179612985325907e-06,
"loss": 0.9544,
"step": 134
},
{
"epoch": 0.35340314136125656,
"grad_norm": 1.9287872837043116,
"learning_rate": 1.9162314251236464e-06,
"loss": 0.9649,
"step": 135
},
{
"epoch": 0.35602094240837695,
"grad_norm": 1.426685422908151,
"learning_rate": 1.9144843002046803e-06,
"loss": 1.0246,
"step": 136
},
{
"epoch": 0.3586387434554974,
"grad_norm": 1.6129363461024815,
"learning_rate": 1.912719956671905e-06,
"loss": 0.9603,
"step": 137
},
{
"epoch": 0.3612565445026178,
"grad_norm": 1.4893083520917,
"learning_rate": 1.9109384277457366e-06,
"loss": 0.9644,
"step": 138
},
{
"epoch": 0.36387434554973824,
"grad_norm": 1.4422789845290571,
"learning_rate": 1.9091397469701734e-06,
"loss": 1.0022,
"step": 139
},
{
"epoch": 0.36649214659685864,
"grad_norm": 1.379089386140381,
"learning_rate": 1.9073239482121597e-06,
"loss": 0.977,
"step": 140
},
{
"epoch": 0.36910994764397903,
"grad_norm": 1.2902251521449346,
"learning_rate": 1.905491065660951e-06,
"loss": 0.9479,
"step": 141
},
{
"epoch": 0.3717277486910995,
"grad_norm": 1.4849260045966897,
"learning_rate": 1.9036411338274702e-06,
"loss": 0.9535,
"step": 142
},
{
"epoch": 0.3743455497382199,
"grad_norm": 1.4742367033559105,
"learning_rate": 1.9017741875436569e-06,
"loss": 0.9692,
"step": 143
},
{
"epoch": 0.3769633507853403,
"grad_norm": 1.7554436015022166,
"learning_rate": 1.8998902619618114e-06,
"loss": 0.9279,
"step": 144
},
{
"epoch": 0.3795811518324607,
"grad_norm": 1.4010842342135992,
"learning_rate": 1.8979893925539336e-06,
"loss": 0.9423,
"step": 145
},
{
"epoch": 0.38219895287958117,
"grad_norm": 1.497443248481895,
"learning_rate": 1.8960716151110553e-06,
"loss": 0.9848,
"step": 146
},
{
"epoch": 0.38481675392670156,
"grad_norm": 1.5982300331675101,
"learning_rate": 1.894136965742565e-06,
"loss": 0.9691,
"step": 147
},
{
"epoch": 0.387434554973822,
"grad_norm": 1.371150523732322,
"learning_rate": 1.8921854808755292e-06,
"loss": 0.9548,
"step": 148
},
{
"epoch": 0.3900523560209424,
"grad_norm": 1.7867184813381989,
"learning_rate": 1.8902171972540058e-06,
"loss": 0.985,
"step": 149
},
{
"epoch": 0.39267015706806285,
"grad_norm": 1.4222923909172587,
"learning_rate": 1.8882321519383533e-06,
"loss": 0.9473,
"step": 150
},
{
"epoch": 0.39528795811518325,
"grad_norm": 1.4430633539722946,
"learning_rate": 1.886230382304531e-06,
"loss": 0.945,
"step": 151
},
{
"epoch": 0.39790575916230364,
"grad_norm": 1.4120566052672336,
"learning_rate": 1.884211926043398e-06,
"loss": 0.9377,
"step": 152
},
{
"epoch": 0.4005235602094241,
"grad_norm": 1.5491012831054978,
"learning_rate": 1.882176821160001e-06,
"loss": 0.9694,
"step": 153
},
{
"epoch": 0.4031413612565445,
"grad_norm": 1.702124781114296,
"learning_rate": 1.8801251059728602e-06,
"loss": 0.9713,
"step": 154
},
{
"epoch": 0.40575916230366493,
"grad_norm": 1.2951128973054504,
"learning_rate": 1.878056819113247e-06,
"loss": 0.9355,
"step": 155
},
{
"epoch": 0.4083769633507853,
"grad_norm": 1.3183333685080236,
"learning_rate": 1.875971999524458e-06,
"loss": 0.9591,
"step": 156
},
{
"epoch": 0.4109947643979058,
"grad_norm": 1.4206883250837021,
"learning_rate": 1.8738706864610791e-06,
"loss": 0.9724,
"step": 157
},
{
"epoch": 0.41361256544502617,
"grad_norm": 1.3491536224151637,
"learning_rate": 1.8717529194882497e-06,
"loss": 0.9596,
"step": 158
},
{
"epoch": 0.4162303664921466,
"grad_norm": 1.6067238922960525,
"learning_rate": 1.8696187384809153e-06,
"loss": 0.9874,
"step": 159
},
{
"epoch": 0.418848167539267,
"grad_norm": 2.170081739206507,
"learning_rate": 1.8674681836230768e-06,
"loss": 0.9393,
"step": 160
},
{
"epoch": 0.4214659685863874,
"grad_norm": 1.6523206816354508,
"learning_rate": 1.8653012954070356e-06,
"loss": 0.9912,
"step": 161
},
{
"epoch": 0.42408376963350786,
"grad_norm": 1.7458545642560128,
"learning_rate": 1.8631181146326303e-06,
"loss": 0.9163,
"step": 162
},
{
"epoch": 0.42670157068062825,
"grad_norm": 1.551930694859062,
"learning_rate": 1.860918682406467e-06,
"loss": 0.9959,
"step": 163
},
{
"epoch": 0.4293193717277487,
"grad_norm": 1.3800307543491146,
"learning_rate": 1.8587030401411478e-06,
"loss": 0.944,
"step": 164
},
{
"epoch": 0.4319371727748691,
"grad_norm": 1.5733297290745525,
"learning_rate": 1.8564712295544892e-06,
"loss": 0.9952,
"step": 165
},
{
"epoch": 0.43455497382198954,
"grad_norm": 1.69702400357591,
"learning_rate": 1.8542232926687382e-06,
"loss": 0.9765,
"step": 166
},
{
"epoch": 0.43717277486910994,
"grad_norm": 1.4210131276532727,
"learning_rate": 1.851959271809779e-06,
"loss": 0.9644,
"step": 167
},
{
"epoch": 0.4397905759162304,
"grad_norm": 1.3705469469324443,
"learning_rate": 1.8496792096063379e-06,
"loss": 0.9784,
"step": 168
},
{
"epoch": 0.4424083769633508,
"grad_norm": 1.367481685410119,
"learning_rate": 1.8473831489891798e-06,
"loss": 0.9487,
"step": 169
},
{
"epoch": 0.44502617801047123,
"grad_norm": 1.4039844158067947,
"learning_rate": 1.8450711331903005e-06,
"loss": 0.9287,
"step": 170
},
{
"epoch": 0.4476439790575916,
"grad_norm": 1.9060472632401462,
"learning_rate": 1.8427432057421113e-06,
"loss": 0.9887,
"step": 171
},
{
"epoch": 0.450261780104712,
"grad_norm": 1.4101639997739956,
"learning_rate": 1.8403994104766212e-06,
"loss": 0.9732,
"step": 172
},
{
"epoch": 0.45287958115183247,
"grad_norm": 1.2872899994123914,
"learning_rate": 1.83803979152461e-06,
"loss": 0.934,
"step": 173
},
{
"epoch": 0.45549738219895286,
"grad_norm": 1.5676491316951058,
"learning_rate": 1.8356643933147985e-06,
"loss": 0.9706,
"step": 174
},
{
"epoch": 0.4581151832460733,
"grad_norm": 1.4853491925545348,
"learning_rate": 1.8332732605730109e-06,
"loss": 0.9548,
"step": 175
},
{
"epoch": 0.4607329842931937,
"grad_norm": 1.4216585779544653,
"learning_rate": 1.8308664383213342e-06,
"loss": 0.9953,
"step": 176
},
{
"epoch": 0.46335078534031415,
"grad_norm": 1.283197939445289,
"learning_rate": 1.8284439718772687e-06,
"loss": 0.9058,
"step": 177
},
{
"epoch": 0.46596858638743455,
"grad_norm": 1.4393324681794193,
"learning_rate": 1.8260059068528762e-06,
"loss": 0.9455,
"step": 178
},
{
"epoch": 0.468586387434555,
"grad_norm": 1.4129111621276607,
"learning_rate": 1.82355228915392e-06,
"loss": 0.9674,
"step": 179
},
{
"epoch": 0.4712041884816754,
"grad_norm": 1.5281499729812038,
"learning_rate": 1.8210831649790015e-06,
"loss": 0.9451,
"step": 180
},
{
"epoch": 0.4738219895287958,
"grad_norm": 1.5870948861953107,
"learning_rate": 1.8185985808186901e-06,
"loss": 0.976,
"step": 181
},
{
"epoch": 0.47643979057591623,
"grad_norm": 1.4461944444081016,
"learning_rate": 1.8160985834546474e-06,
"loss": 0.9872,
"step": 182
},
{
"epoch": 0.4790575916230366,
"grad_norm": 1.572493945220242,
"learning_rate": 1.813583219958746e-06,
"loss": 0.9677,
"step": 183
},
{
"epoch": 0.4816753926701571,
"grad_norm": 1.488313044626855,
"learning_rate": 1.811052537692186e-06,
"loss": 0.9853,
"step": 184
},
{
"epoch": 0.48429319371727747,
"grad_norm": 1.3580335854042254,
"learning_rate": 1.8085065843045986e-06,
"loss": 0.9668,
"step": 185
},
{
"epoch": 0.4869109947643979,
"grad_norm": 1.3632089604738407,
"learning_rate": 1.8059454077331526e-06,
"loss": 0.9483,
"step": 186
},
{
"epoch": 0.4895287958115183,
"grad_norm": 1.3238305682165803,
"learning_rate": 1.8033690562016507e-06,
"loss": 0.958,
"step": 187
},
{
"epoch": 0.49214659685863876,
"grad_norm": 1.3836133731386677,
"learning_rate": 1.8007775782196212e-06,
"loss": 0.901,
"step": 188
},
{
"epoch": 0.49476439790575916,
"grad_norm": 1.5098701097333365,
"learning_rate": 1.798171022581405e-06,
"loss": 0.9208,
"step": 189
},
{
"epoch": 0.4973821989528796,
"grad_norm": 1.3315669287435203,
"learning_rate": 1.7955494383652364e-06,
"loss": 0.9957,
"step": 190
},
{
"epoch": 0.5,
"grad_norm": 1.4430105607942247,
"learning_rate": 1.7929128749323193e-06,
"loss": 0.9629,
"step": 191
},
{
"epoch": 0.5026178010471204,
"grad_norm": 1.3748095197619064,
"learning_rate": 1.7902613819258983e-06,
"loss": 0.9728,
"step": 192
},
{
"epoch": 0.5052356020942408,
"grad_norm": 1.4636464184043185,
"learning_rate": 1.7875950092703232e-06,
"loss": 0.8843,
"step": 193
},
{
"epoch": 0.5078534031413613,
"grad_norm": 1.3206431630831412,
"learning_rate": 1.784913807170109e-06,
"loss": 0.964,
"step": 194
},
{
"epoch": 0.5104712041884817,
"grad_norm": 1.3494139964974532,
"learning_rate": 1.7822178261089917e-06,
"loss": 0.955,
"step": 195
},
{
"epoch": 0.5130890052356021,
"grad_norm": 1.3451367536689751,
"learning_rate": 1.7795071168489759e-06,
"loss": 0.9491,
"step": 196
},
{
"epoch": 0.5157068062827225,
"grad_norm": 1.5037258899078974,
"learning_rate": 1.776781730429381e-06,
"loss": 0.9859,
"step": 197
},
{
"epoch": 0.518324607329843,
"grad_norm": 1.341930800019914,
"learning_rate": 1.7740417181658787e-06,
"loss": 0.9903,
"step": 198
},
{
"epoch": 0.5209424083769634,
"grad_norm": 1.7348664326730883,
"learning_rate": 1.771287131649527e-06,
"loss": 0.97,
"step": 199
},
{
"epoch": 0.5235602094240838,
"grad_norm": 1.6744990474369295,
"learning_rate": 1.7685180227458e-06,
"loss": 0.9286,
"step": 200
},
{
"epoch": 0.5261780104712042,
"grad_norm": 1.4602031412588528,
"learning_rate": 1.7657344435936106e-06,
"loss": 0.9064,
"step": 201
},
{
"epoch": 0.5287958115183246,
"grad_norm": 1.315375736937206,
"learning_rate": 1.762936446604327e-06,
"loss": 0.9298,
"step": 202
},
{
"epoch": 0.5314136125654451,
"grad_norm": 1.594576053216995,
"learning_rate": 1.76012408446079e-06,
"loss": 0.9615,
"step": 203
},
{
"epoch": 0.5340314136125655,
"grad_norm": 1.3869099290197058,
"learning_rate": 1.7572974101163163e-06,
"loss": 0.9677,
"step": 204
},
{
"epoch": 0.5366492146596858,
"grad_norm": 3.9344634005104564,
"learning_rate": 1.7544564767937046e-06,
"loss": 1.0005,
"step": 205
},
{
"epoch": 0.5392670157068062,
"grad_norm": 1.5834820308294422,
"learning_rate": 1.7516013379842336e-06,
"loss": 1.0007,
"step": 206
},
{
"epoch": 0.5418848167539267,
"grad_norm": 1.3504523420206829,
"learning_rate": 1.7487320474466523e-06,
"loss": 0.904,
"step": 207
},
{
"epoch": 0.5445026178010471,
"grad_norm": 1.4968877215079561,
"learning_rate": 1.74584865920617e-06,
"loss": 0.9503,
"step": 208
},
{
"epoch": 0.5471204188481675,
"grad_norm": 1.4700348286966616,
"learning_rate": 1.742951227553438e-06,
"loss": 0.9261,
"step": 209
},
{
"epoch": 0.5497382198952879,
"grad_norm": 1.2537017291057395,
"learning_rate": 1.7400398070435292e-06,
"loss": 0.8954,
"step": 210
},
{
"epoch": 0.5523560209424084,
"grad_norm": 1.4930050854123005,
"learning_rate": 1.7371144524949073e-06,
"loss": 0.9453,
"step": 211
},
{
"epoch": 0.5549738219895288,
"grad_norm": 1.2707287989257905,
"learning_rate": 1.734175218988398e-06,
"loss": 0.8907,
"step": 212
},
{
"epoch": 0.5575916230366492,
"grad_norm": 1.4475456324495655,
"learning_rate": 1.7312221618661514e-06,
"loss": 0.9423,
"step": 213
},
{
"epoch": 0.5602094240837696,
"grad_norm": 1.4619960180732723,
"learning_rate": 1.7282553367305975e-06,
"loss": 0.9778,
"step": 214
},
{
"epoch": 0.56282722513089,
"grad_norm": 1.4264214578023766,
"learning_rate": 1.7252747994434022e-06,
"loss": 0.9902,
"step": 215
},
{
"epoch": 0.5654450261780105,
"grad_norm": 1.2942895079135885,
"learning_rate": 1.7222806061244147e-06,
"loss": 0.9354,
"step": 216
},
{
"epoch": 0.5680628272251309,
"grad_norm": 1.266395336298489,
"learning_rate": 1.7192728131506092e-06,
"loss": 0.9379,
"step": 217
},
{
"epoch": 0.5706806282722513,
"grad_norm": 1.308287162023205,
"learning_rate": 1.7162514771550253e-06,
"loss": 0.9487,
"step": 218
},
{
"epoch": 0.5732984293193717,
"grad_norm": 1.271075019304684,
"learning_rate": 1.7132166550257017e-06,
"loss": 0.9369,
"step": 219
},
{
"epoch": 0.5759162303664922,
"grad_norm": 1.3338935943608123,
"learning_rate": 1.7101684039046037e-06,
"loss": 0.9609,
"step": 220
},
{
"epoch": 0.5785340314136126,
"grad_norm": 1.4007771729449567,
"learning_rate": 1.7071067811865474e-06,
"loss": 0.9484,
"step": 221
},
{
"epoch": 0.581151832460733,
"grad_norm": 1.4937516535328665,
"learning_rate": 1.7040318445181207e-06,
"loss": 0.9823,
"step": 222
},
{
"epoch": 0.5837696335078534,
"grad_norm": 1.398159578570716,
"learning_rate": 1.700943651796597e-06,
"loss": 0.946,
"step": 223
},
{
"epoch": 0.5863874345549738,
"grad_norm": 1.4551713677795557,
"learning_rate": 1.697842261168843e-06,
"loss": 0.9853,
"step": 224
},
{
"epoch": 0.5890052356020943,
"grad_norm": 1.3054117416156679,
"learning_rate": 1.6947277310302282e-06,
"loss": 0.942,
"step": 225
},
{
"epoch": 0.5916230366492147,
"grad_norm": 1.3573897273220095,
"learning_rate": 1.6916001200235207e-06,
"loss": 0.9133,
"step": 226
},
{
"epoch": 0.5942408376963351,
"grad_norm": 1.6785599921639411,
"learning_rate": 1.6884594870377869e-06,
"loss": 1.0038,
"step": 227
},
{
"epoch": 0.5968586387434555,
"grad_norm": 1.4459897239045543,
"learning_rate": 1.68530589120728e-06,
"loss": 0.9801,
"step": 228
},
{
"epoch": 0.599476439790576,
"grad_norm": 1.375845613183054,
"learning_rate": 1.682139391910328e-06,
"loss": 0.9485,
"step": 229
},
{
"epoch": 0.6020942408376964,
"grad_norm": 1.397560038167906,
"learning_rate": 1.6789600487682153e-06,
"loss": 0.9049,
"step": 230
},
{
"epoch": 0.6047120418848168,
"grad_norm": 1.5103238712813247,
"learning_rate": 1.6757679216440605e-06,
"loss": 0.9194,
"step": 231
},
{
"epoch": 0.6073298429319371,
"grad_norm": 1.4527282751393369,
"learning_rate": 1.672563070641688e-06,
"loss": 0.9514,
"step": 232
},
{
"epoch": 0.6099476439790575,
"grad_norm": 1.5949625620890675,
"learning_rate": 1.6693455561044975e-06,
"loss": 0.9429,
"step": 233
},
{
"epoch": 0.612565445026178,
"grad_norm": 1.6225622102603658,
"learning_rate": 1.666115438614328e-06,
"loss": 0.9081,
"step": 234
},
{
"epoch": 0.6151832460732984,
"grad_norm": 2.222270565423305,
"learning_rate": 1.662872778990316e-06,
"loss": 1.0294,
"step": 235
},
{
"epoch": 0.6178010471204188,
"grad_norm": 1.4866785885036309,
"learning_rate": 1.6596176382877504e-06,
"loss": 0.9904,
"step": 236
},
{
"epoch": 0.6204188481675392,
"grad_norm": 1.339889347899888,
"learning_rate": 1.6563500777969252e-06,
"loss": 0.935,
"step": 237
},
{
"epoch": 0.6230366492146597,
"grad_norm": 1.456584804791214,
"learning_rate": 1.6530701590419823e-06,
"loss": 0.933,
"step": 238
},
{
"epoch": 0.6256544502617801,
"grad_norm": 1.6334561208762561,
"learning_rate": 1.6497779437797546e-06,
"loss": 0.9932,
"step": 239
},
{
"epoch": 0.6282722513089005,
"grad_norm": 1.4843297692175943,
"learning_rate": 1.6464734939986035e-06,
"loss": 0.9969,
"step": 240
},
{
"epoch": 0.6308900523560209,
"grad_norm": 1.4307606437352334,
"learning_rate": 1.6431568719172513e-06,
"loss": 0.9282,
"step": 241
},
{
"epoch": 0.6335078534031413,
"grad_norm": 1.4227744118845083,
"learning_rate": 1.6398281399836097e-06,
"loss": 0.9435,
"step": 242
},
{
"epoch": 0.6361256544502618,
"grad_norm": 1.42172179895148,
"learning_rate": 1.6364873608736035e-06,
"loss": 0.9205,
"step": 243
},
{
"epoch": 0.6387434554973822,
"grad_norm": 1.3401809099812794,
"learning_rate": 1.6331345974899922e-06,
"loss": 0.9474,
"step": 244
},
{
"epoch": 0.6413612565445026,
"grad_norm": 1.4648547641535852,
"learning_rate": 1.629769912961183e-06,
"loss": 0.9629,
"step": 245
},
{
"epoch": 0.643979057591623,
"grad_norm": 1.4527950553735653,
"learning_rate": 1.626393370640045e-06,
"loss": 0.873,
"step": 246
},
{
"epoch": 0.6465968586387435,
"grad_norm": 1.455801277807891,
"learning_rate": 1.6230050341027133e-06,
"loss": 0.9389,
"step": 247
},
{
"epoch": 0.6492146596858639,
"grad_norm": 1.3597683147157529,
"learning_rate": 1.6196049671473952e-06,
"loss": 0.9622,
"step": 248
},
{
"epoch": 0.6518324607329843,
"grad_norm": 1.3452857712103874,
"learning_rate": 1.616193233793166e-06,
"loss": 0.9423,
"step": 249
},
{
"epoch": 0.6544502617801047,
"grad_norm": 5.437851500838725,
"learning_rate": 1.612769898278766e-06,
"loss": 0.9624,
"step": 250
},
{
"epoch": 0.6570680628272252,
"grad_norm": 1.5102856532376654,
"learning_rate": 1.6093350250613892e-06,
"loss": 0.979,
"step": 251
},
{
"epoch": 0.6596858638743456,
"grad_norm": 1.4743192601344492,
"learning_rate": 1.605888678815471e-06,
"loss": 0.9569,
"step": 252
},
{
"epoch": 0.662303664921466,
"grad_norm": 1.5393143829011873,
"learning_rate": 1.602430924431469e-06,
"loss": 0.9629,
"step": 253
},
{
"epoch": 0.6649214659685864,
"grad_norm": 1.5737174699578425,
"learning_rate": 1.5989618270146422e-06,
"loss": 0.9639,
"step": 254
},
{
"epoch": 0.6675392670157068,
"grad_norm": 4.258134694492717,
"learning_rate": 1.5954814518838253e-06,
"loss": 0.9198,
"step": 255
},
{
"epoch": 0.6701570680628273,
"grad_norm": 1.4218596129552161,
"learning_rate": 1.5919898645701987e-06,
"loss": 0.886,
"step": 256
},
{
"epoch": 0.6727748691099477,
"grad_norm": 1.5211636215659439,
"learning_rate": 1.5884871308160536e-06,
"loss": 0.9175,
"step": 257
},
{
"epoch": 0.675392670157068,
"grad_norm": 1.4773591575654617,
"learning_rate": 1.5849733165735555e-06,
"loss": 0.9014,
"step": 258
},
{
"epoch": 0.6780104712041884,
"grad_norm": 1.5157264963354438,
"learning_rate": 1.5814484880035016e-06,
"loss": 0.9516,
"step": 259
},
{
"epoch": 0.680628272251309,
"grad_norm": 1.3470517687326489,
"learning_rate": 1.5779127114740755e-06,
"loss": 0.912,
"step": 260
},
{
"epoch": 0.6832460732984293,
"grad_norm": 1.349831010666242,
"learning_rate": 1.5743660535595975e-06,
"loss": 0.8723,
"step": 261
},
{
"epoch": 0.6858638743455497,
"grad_norm": 1.4458453237757587,
"learning_rate": 1.5708085810392705e-06,
"loss": 0.9299,
"step": 262
},
{
"epoch": 0.6884816753926701,
"grad_norm": 1.4350060007388417,
"learning_rate": 1.567240360895924e-06,
"loss": 0.9602,
"step": 263
},
{
"epoch": 0.6910994764397905,
"grad_norm": 1.4063518232729058,
"learning_rate": 1.563661460314751e-06,
"loss": 0.9271,
"step": 264
},
{
"epoch": 0.693717277486911,
"grad_norm": 1.2949234623299979,
"learning_rate": 1.5600719466820447e-06,
"loss": 0.9348,
"step": 265
},
{
"epoch": 0.6963350785340314,
"grad_norm": 1.399942819545271,
"learning_rate": 1.5564718875839287e-06,
"loss": 0.9577,
"step": 266
},
{
"epoch": 0.6989528795811518,
"grad_norm": 1.3541499365962402,
"learning_rate": 1.5528613508050847e-06,
"loss": 0.9818,
"step": 267
},
{
"epoch": 0.7015706806282722,
"grad_norm": 1.472944664577557,
"learning_rate": 1.5492404043274767e-06,
"loss": 1.0009,
"step": 268
},
{
"epoch": 0.7041884816753927,
"grad_norm": 4.017991300664643,
"learning_rate": 1.5456091163290697e-06,
"loss": 0.9481,
"step": 269
},
{
"epoch": 0.7068062827225131,
"grad_norm": 1.4031375152179757,
"learning_rate": 1.5419675551825472e-06,
"loss": 0.9454,
"step": 270
},
{
"epoch": 0.7094240837696335,
"grad_norm": 1.3949814525905722,
"learning_rate": 1.5383157894540242e-06,
"loss": 0.9701,
"step": 271
},
{
"epoch": 0.7120418848167539,
"grad_norm": 1.4769482292493297,
"learning_rate": 1.5346538879017538e-06,
"loss": 0.9386,
"step": 272
},
{
"epoch": 0.7146596858638743,
"grad_norm": 1.2860864329400274,
"learning_rate": 1.5309819194748359e-06,
"loss": 0.9,
"step": 273
},
{
"epoch": 0.7172774869109948,
"grad_norm": 1.3727353556535293,
"learning_rate": 1.5272999533119162e-06,
"loss": 0.9805,
"step": 274
},
{
"epoch": 0.7198952879581152,
"grad_norm": 2.722418651884381,
"learning_rate": 1.5236080587398853e-06,
"loss": 0.8907,
"step": 275
},
{
"epoch": 0.7225130890052356,
"grad_norm": 1.4156318742824492,
"learning_rate": 1.5199063052725745e-06,
"loss": 0.9734,
"step": 276
},
{
"epoch": 0.725130890052356,
"grad_norm": 1.519150038749317,
"learning_rate": 1.516194762609445e-06,
"loss": 0.9548,
"step": 277
},
{
"epoch": 0.7277486910994765,
"grad_norm": 2.3876346042029013,
"learning_rate": 1.512473500634277e-06,
"loss": 0.9355,
"step": 278
},
{
"epoch": 0.7303664921465969,
"grad_norm": 1.4156665926570595,
"learning_rate": 1.5087425894138534e-06,
"loss": 0.9418,
"step": 279
},
{
"epoch": 0.7329842931937173,
"grad_norm": 1.545693736367149,
"learning_rate": 1.5050020991966403e-06,
"loss": 0.943,
"step": 280
},
{
"epoch": 0.7356020942408377,
"grad_norm": 1.3719386457832154,
"learning_rate": 1.501252100411465e-06,
"loss": 0.9504,
"step": 281
},
{
"epoch": 0.7382198952879581,
"grad_norm": 1.4434108163997796,
"learning_rate": 1.497492663666189e-06,
"loss": 0.8861,
"step": 282
},
{
"epoch": 0.7408376963350786,
"grad_norm": 1.4077022286642678,
"learning_rate": 1.4937238597463784e-06,
"loss": 0.9503,
"step": 283
},
{
"epoch": 0.743455497382199,
"grad_norm": 1.6432508014410978,
"learning_rate": 1.4899457596139727e-06,
"loss": 0.9809,
"step": 284
},
{
"epoch": 0.7460732984293194,
"grad_norm": 1.4078348319712304,
"learning_rate": 1.4861584344059474e-06,
"loss": 0.9221,
"step": 285
},
{
"epoch": 0.7486910994764397,
"grad_norm": 1.496498216030133,
"learning_rate": 1.4823619554329744e-06,
"loss": 0.9593,
"step": 286
},
{
"epoch": 0.7513089005235603,
"grad_norm": 1.1775236514477745,
"learning_rate": 1.4785563941780805e-06,
"loss": 0.9004,
"step": 287
},
{
"epoch": 0.7539267015706806,
"grad_norm": 1.445348047393682,
"learning_rate": 1.4747418222952993e-06,
"loss": 0.9188,
"step": 288
},
{
"epoch": 0.756544502617801,
"grad_norm": 1.4942704837793932,
"learning_rate": 1.4709183116083253e-06,
"loss": 0.9618,
"step": 289
},
{
"epoch": 0.7591623036649214,
"grad_norm": 1.3529276296646142,
"learning_rate": 1.4670859341091577e-06,
"loss": 0.9704,
"step": 290
},
{
"epoch": 0.7617801047120419,
"grad_norm": 1.5516858536495775,
"learning_rate": 1.4632447619567488e-06,
"loss": 0.9155,
"step": 291
},
{
"epoch": 0.7643979057591623,
"grad_norm": 1.443364008768138,
"learning_rate": 1.4593948674756415e-06,
"loss": 0.9358,
"step": 292
},
{
"epoch": 0.7670157068062827,
"grad_norm": 1.3608416942283856,
"learning_rate": 1.4555363231546109e-06,
"loss": 0.9952,
"step": 293
},
{
"epoch": 0.7696335078534031,
"grad_norm": 1.3239348941023465,
"learning_rate": 1.4516692016452979e-06,
"loss": 0.9165,
"step": 294
},
{
"epoch": 0.7722513089005235,
"grad_norm": 1.6158463432267232,
"learning_rate": 1.4477935757608397e-06,
"loss": 0.9066,
"step": 295
},
{
"epoch": 0.774869109947644,
"grad_norm": 1.5884308678780332,
"learning_rate": 1.4439095184745022e-06,
"loss": 0.9458,
"step": 296
},
{
"epoch": 0.7774869109947644,
"grad_norm": 2.012960318795794,
"learning_rate": 1.4400171029183035e-06,
"loss": 0.9006,
"step": 297
},
{
"epoch": 0.7801047120418848,
"grad_norm": 1.360499900869024,
"learning_rate": 1.4361164023816374e-06,
"loss": 0.9351,
"step": 298
},
{
"epoch": 0.7827225130890052,
"grad_norm": 1.3724813802477163,
"learning_rate": 1.4322074903098945e-06,
"loss": 0.917,
"step": 299
},
{
"epoch": 0.7853403141361257,
"grad_norm": 1.503052362303298,
"learning_rate": 1.428290440303077e-06,
"loss": 0.9927,
"step": 300
},
{
"epoch": 0.7879581151832461,
"grad_norm": 1.7377456947229262,
"learning_rate": 1.4243653261144167e-06,
"loss": 0.9541,
"step": 301
},
{
"epoch": 0.7905759162303665,
"grad_norm": 1.2764425600693903,
"learning_rate": 1.4204322216489813e-06,
"loss": 0.9262,
"step": 302
},
{
"epoch": 0.7931937172774869,
"grad_norm": 1.567992829586323,
"learning_rate": 1.4164912009622878e-06,
"loss": 0.9829,
"step": 303
},
{
"epoch": 0.7958115183246073,
"grad_norm": 1.5156917718141123,
"learning_rate": 1.4125423382589048e-06,
"loss": 0.952,
"step": 304
},
{
"epoch": 0.7984293193717278,
"grad_norm": 1.4764271181959159,
"learning_rate": 1.4085857078910567e-06,
"loss": 0.9458,
"step": 305
},
{
"epoch": 0.8010471204188482,
"grad_norm": 1.2522703326500677,
"learning_rate": 1.4046213843572234e-06,
"loss": 0.9462,
"step": 306
},
{
"epoch": 0.8036649214659686,
"grad_norm": 1.4674944162208565,
"learning_rate": 1.400649442300738e-06,
"loss": 0.9537,
"step": 307
},
{
"epoch": 0.806282722513089,
"grad_norm": 1.3763179090039912,
"learning_rate": 1.3966699565083803e-06,
"loss": 0.9365,
"step": 308
},
{
"epoch": 0.8089005235602095,
"grad_norm": 1.9176648914200796,
"learning_rate": 1.3926830019089694e-06,
"loss": 1.0161,
"step": 309
},
{
"epoch": 0.8115183246073299,
"grad_norm": 1.4863434582201211,
"learning_rate": 1.3886886535719539e-06,
"loss": 0.9457,
"step": 310
},
{
"epoch": 0.8141361256544503,
"grad_norm": 1.3323857609548473,
"learning_rate": 1.3846869867059965e-06,
"loss": 0.9434,
"step": 311
},
{
"epoch": 0.8167539267015707,
"grad_norm": 1.490105065147535,
"learning_rate": 1.3806780766575587e-06,
"loss": 0.9392,
"step": 312
},
{
"epoch": 0.819371727748691,
"grad_norm": 1.333245804459532,
"learning_rate": 1.3766619989094827e-06,
"loss": 0.908,
"step": 313
},
{
"epoch": 0.8219895287958116,
"grad_norm": 1.5154308753484564,
"learning_rate": 1.3726388290795696e-06,
"loss": 0.8954,
"step": 314
},
{
"epoch": 0.824607329842932,
"grad_norm": 1.3909829985266102,
"learning_rate": 1.3686086429191552e-06,
"loss": 0.9485,
"step": 315
},
{
"epoch": 0.8272251308900523,
"grad_norm": 1.5779188390331473,
"learning_rate": 1.3645715163116845e-06,
"loss": 0.9557,
"step": 316
},
{
"epoch": 0.8298429319371727,
"grad_norm": 1.449669031785137,
"learning_rate": 1.3605275252712826e-06,
"loss": 0.8792,
"step": 317
},
{
"epoch": 0.8324607329842932,
"grad_norm": 1.4974346680981285,
"learning_rate": 1.3564767459413235e-06,
"loss": 0.9502,
"step": 318
},
{
"epoch": 0.8350785340314136,
"grad_norm": 1.379610694396643,
"learning_rate": 1.3524192545929963e-06,
"loss": 0.9344,
"step": 319
},
{
"epoch": 0.837696335078534,
"grad_norm": 1.3871474353129742,
"learning_rate": 1.3483551276238688e-06,
"loss": 0.9295,
"step": 320
},
{
"epoch": 0.8403141361256544,
"grad_norm": 1.506463325541792,
"learning_rate": 1.3442844415564496e-06,
"loss": 0.9316,
"step": 321
},
{
"epoch": 0.8429319371727748,
"grad_norm": 1.7186336719867092,
"learning_rate": 1.3402072730367474e-06,
"loss": 0.9275,
"step": 322
},
{
"epoch": 0.8455497382198953,
"grad_norm": 1.3614543479827845,
"learning_rate": 1.336123698832827e-06,
"loss": 0.9394,
"step": 323
},
{
"epoch": 0.8481675392670157,
"grad_norm": 1.5928598074183693,
"learning_rate": 1.3320337958333637e-06,
"loss": 0.9284,
"step": 324
},
{
"epoch": 0.8507853403141361,
"grad_norm": 1.450573452624891,
"learning_rate": 1.3279376410461987e-06,
"loss": 0.9453,
"step": 325
},
{
"epoch": 0.8534031413612565,
"grad_norm": 1.3696430137457172,
"learning_rate": 1.3238353115968838e-06,
"loss": 0.9345,
"step": 326
},
{
"epoch": 0.856020942408377,
"grad_norm": 1.4279904502242198,
"learning_rate": 1.3197268847272338e-06,
"loss": 0.9405,
"step": 327
},
{
"epoch": 0.8586387434554974,
"grad_norm": 1.3113735477129913,
"learning_rate": 1.3156124377938698e-06,
"loss": 0.8496,
"step": 328
},
{
"epoch": 0.8612565445026178,
"grad_norm": 1.5978377076734773,
"learning_rate": 1.3114920482667633e-06,
"loss": 0.9504,
"step": 329
},
{
"epoch": 0.8638743455497382,
"grad_norm": 1.8348826755579801,
"learning_rate": 1.307365793727778e-06,
"loss": 1.0206,
"step": 330
},
{
"epoch": 0.8664921465968587,
"grad_norm": 1.6036886081318196,
"learning_rate": 1.3032337518692079e-06,
"loss": 0.9325,
"step": 331
},
{
"epoch": 0.8691099476439791,
"grad_norm": 1.4319398650151158,
"learning_rate": 1.2990960004923153e-06,
"loss": 0.9511,
"step": 332
},
{
"epoch": 0.8717277486910995,
"grad_norm": 1.587326115767848,
"learning_rate": 1.2949526175058663e-06,
"loss": 0.9352,
"step": 333
},
{
"epoch": 0.8743455497382199,
"grad_norm": 1.4070281530555484,
"learning_rate": 1.2908036809246622e-06,
"loss": 0.9169,
"step": 334
},
{
"epoch": 0.8769633507853403,
"grad_norm": 1.3435113876325042,
"learning_rate": 1.286649268868073e-06,
"loss": 0.9191,
"step": 335
},
{
"epoch": 0.8795811518324608,
"grad_norm": 1.3798965471877482,
"learning_rate": 1.2824894595585636e-06,
"loss": 0.8751,
"step": 336
},
{
"epoch": 0.8821989528795812,
"grad_norm": 1.4127914600029392,
"learning_rate": 1.278324331320224e-06,
"loss": 0.9221,
"step": 337
},
{
"epoch": 0.8848167539267016,
"grad_norm": 1.419094074148045,
"learning_rate": 1.2741539625772916e-06,
"loss": 0.994,
"step": 338
},
{
"epoch": 0.887434554973822,
"grad_norm": 1.6168248145801407,
"learning_rate": 1.269978431852678e-06,
"loss": 0.9068,
"step": 339
},
{
"epoch": 0.8900523560209425,
"grad_norm": 1.4320272522924853,
"learning_rate": 1.265797817766486e-06,
"loss": 0.9107,
"step": 340
},
{
"epoch": 0.8926701570680629,
"grad_norm": 1.5043311007283438,
"learning_rate": 1.2616121990345344e-06,
"loss": 0.9379,
"step": 341
},
{
"epoch": 0.8952879581151832,
"grad_norm": 1.5310090194376413,
"learning_rate": 1.2574216544668719e-06,
"loss": 0.976,
"step": 342
},
{
"epoch": 0.8979057591623036,
"grad_norm": 1.362042648677866,
"learning_rate": 1.2532262629662947e-06,
"loss": 0.9131,
"step": 343
},
{
"epoch": 0.900523560209424,
"grad_norm": 1.5988287333686646,
"learning_rate": 1.2490261035268612e-06,
"loss": 0.8755,
"step": 344
},
{
"epoch": 0.9031413612565445,
"grad_norm": 1.4637242725250341,
"learning_rate": 1.244821255232404e-06,
"loss": 0.9109,
"step": 345
},
{
"epoch": 0.9057591623036649,
"grad_norm": 1.4212281055853575,
"learning_rate": 1.2406117972550411e-06,
"loss": 0.9539,
"step": 346
},
{
"epoch": 0.9083769633507853,
"grad_norm": 1.3319624620662243,
"learning_rate": 1.2363978088536851e-06,
"loss": 0.8959,
"step": 347
},
{
"epoch": 0.9109947643979057,
"grad_norm": 1.4662422372152333,
"learning_rate": 1.2321793693725506e-06,
"loss": 0.9405,
"step": 348
},
{
"epoch": 0.9136125654450262,
"grad_norm": 1.4304240335118916,
"learning_rate": 1.2279565582396615e-06,
"loss": 0.9541,
"step": 349
},
{
"epoch": 0.9162303664921466,
"grad_norm": 1.3671914328595074,
"learning_rate": 1.2237294549653539e-06,
"loss": 0.9717,
"step": 350
},
{
"epoch": 0.918848167539267,
"grad_norm": 1.3382227529528294,
"learning_rate": 1.219498139140779e-06,
"loss": 0.9378,
"step": 351
},
{
"epoch": 0.9214659685863874,
"grad_norm": 1.331756121322301,
"learning_rate": 1.2152626904364064e-06,
"loss": 0.9559,
"step": 352
},
{
"epoch": 0.9240837696335078,
"grad_norm": 1.4348519441766092,
"learning_rate": 1.2110231886005222e-06,
"loss": 0.9148,
"step": 353
},
{
"epoch": 0.9267015706806283,
"grad_norm": 1.2598591796573784,
"learning_rate": 1.2067797134577273e-06,
"loss": 0.9749,
"step": 354
},
{
"epoch": 0.9293193717277487,
"grad_norm": 1.6362760645353196,
"learning_rate": 1.202532344907436e-06,
"loss": 0.9261,
"step": 355
},
{
"epoch": 0.9319371727748691,
"grad_norm": 1.3685299905093398,
"learning_rate": 1.198281162922371e-06,
"loss": 0.9157,
"step": 356
},
{
"epoch": 0.9345549738219895,
"grad_norm": 1.5014341284660457,
"learning_rate": 1.1940262475470555e-06,
"loss": 0.9468,
"step": 357
},
{
"epoch": 0.93717277486911,
"grad_norm": 1.4866894767271521,
"learning_rate": 1.18976767889631e-06,
"loss": 0.9737,
"step": 358
},
{
"epoch": 0.9397905759162304,
"grad_norm": 1.3686575013762912,
"learning_rate": 1.1855055371537399e-06,
"loss": 0.9671,
"step": 359
},
{
"epoch": 0.9424083769633508,
"grad_norm": 1.3408401081503738,
"learning_rate": 1.1812399025702289e-06,
"loss": 0.9446,
"step": 360
},
{
"epoch": 0.9450261780104712,
"grad_norm": 1.4426258662911882,
"learning_rate": 1.1769708554624255e-06,
"loss": 0.9424,
"step": 361
},
{
"epoch": 0.9476439790575916,
"grad_norm": 1.3570630863827366,
"learning_rate": 1.1726984762112326e-06,
"loss": 0.9363,
"step": 362
},
{
"epoch": 0.9502617801047121,
"grad_norm": 1.4972719883412338,
"learning_rate": 1.168422845260293e-06,
"loss": 0.9629,
"step": 363
},
{
"epoch": 0.9528795811518325,
"grad_norm": 2.0926927624177853,
"learning_rate": 1.1641440431144748e-06,
"loss": 0.9362,
"step": 364
},
{
"epoch": 0.9554973821989529,
"grad_norm": 1.9559448320553872,
"learning_rate": 1.1598621503383564e-06,
"loss": 0.9355,
"step": 365
},
{
"epoch": 0.9581151832460733,
"grad_norm": 1.4196013691936538,
"learning_rate": 1.1555772475547083e-06,
"loss": 0.9807,
"step": 366
},
{
"epoch": 0.9607329842931938,
"grad_norm": 1.7129377232532392,
"learning_rate": 1.1512894154429757e-06,
"loss": 0.9321,
"step": 367
},
{
"epoch": 0.9633507853403142,
"grad_norm": 1.3874760503367283,
"learning_rate": 1.14699873473776e-06,
"loss": 0.9171,
"step": 368
},
{
"epoch": 0.9659685863874345,
"grad_norm": 1.3667521655356518,
"learning_rate": 1.1427052862272981e-06,
"loss": 0.9634,
"step": 369
},
{
"epoch": 0.9685863874345549,
"grad_norm": 1.4603827013405721,
"learning_rate": 1.1384091507519403e-06,
"loss": 0.8996,
"step": 370
},
{
"epoch": 0.9712041884816754,
"grad_norm": 1.3023965306720733,
"learning_rate": 1.1341104092026302e-06,
"loss": 0.9057,
"step": 371
},
{
"epoch": 0.9738219895287958,
"grad_norm": 1.587437099971742,
"learning_rate": 1.1298091425193806e-06,
"loss": 0.9122,
"step": 372
},
{
"epoch": 0.9764397905759162,
"grad_norm": 1.5072141830161945,
"learning_rate": 1.1255054316897482e-06,
"loss": 0.917,
"step": 373
},
{
"epoch": 0.9790575916230366,
"grad_norm": 1.5666361589706173,
"learning_rate": 1.121199357747312e-06,
"loss": 0.9004,
"step": 374
},
{
"epoch": 0.981675392670157,
"grad_norm": 1.547610708086,
"learning_rate": 1.1168910017701434e-06,
"loss": 0.8929,
"step": 375
},
{
"epoch": 0.9842931937172775,
"grad_norm": 1.4698749043156947,
"learning_rate": 1.112580444879283e-06,
"loss": 1.0095,
"step": 376
},
{
"epoch": 0.9869109947643979,
"grad_norm": 1.532940293838814,
"learning_rate": 1.1082677682372112e-06,
"loss": 0.944,
"step": 377
},
{
"epoch": 0.9895287958115183,
"grad_norm": 1.4452991257917254,
"learning_rate": 1.1039530530463217e-06,
"loss": 0.9699,
"step": 378
},
{
"epoch": 0.9921465968586387,
"grad_norm": 1.3913866901966334,
"learning_rate": 1.0996363805473902e-06,
"loss": 0.9476,
"step": 379
},
{
"epoch": 0.9947643979057592,
"grad_norm": 1.4385256079298478,
"learning_rate": 1.0953178320180473e-06,
"loss": 0.8981,
"step": 380
},
{
"epoch": 0.9973821989528796,
"grad_norm": 1.4122272138909508,
"learning_rate": 1.0909974887712468e-06,
"loss": 0.937,
"step": 381
},
{
"epoch": 1.0,
"grad_norm": 1.5381889662517363,
"learning_rate": 1.0866754321537337e-06,
"loss": 0.8369,
"step": 382
},
{
"epoch": 1.0026178010471205,
"grad_norm": 1.352548452256198,
"learning_rate": 1.0823517435445149e-06,
"loss": 0.8598,
"step": 383
},
{
"epoch": 1.0052356020942408,
"grad_norm": 1.6987449243575325,
"learning_rate": 1.078026504353325e-06,
"loss": 0.9466,
"step": 384
},
{
"epoch": 1.0078534031413613,
"grad_norm": 1.2636646845580983,
"learning_rate": 1.0736997960190945e-06,
"loss": 0.8466,
"step": 385
},
{
"epoch": 1.0104712041884816,
"grad_norm": 1.3487399166845027,
"learning_rate": 1.0693717000084158e-06,
"loss": 0.9227,
"step": 386
},
{
"epoch": 1.013089005235602,
"grad_norm": 1.4238712320318556,
"learning_rate": 1.06504229781401e-06,
"loss": 0.9006,
"step": 387
},
{
"epoch": 1.0157068062827226,
"grad_norm": 1.4571642770903115,
"learning_rate": 1.0607116709531918e-06,
"loss": 0.9162,
"step": 388
},
{
"epoch": 1.0183246073298429,
"grad_norm": 1.346066094766837,
"learning_rate": 1.0563799009663343e-06,
"loss": 0.9108,
"step": 389
},
{
"epoch": 1.0209424083769634,
"grad_norm": 1.410928572921669,
"learning_rate": 1.0520470694153352e-06,
"loss": 0.9914,
"step": 390
},
{
"epoch": 1.0235602094240839,
"grad_norm": 1.5207294046186268,
"learning_rate": 1.047713257882079e-06,
"loss": 0.9295,
"step": 391
},
{
"epoch": 1.0261780104712042,
"grad_norm": 1.3840105489229526,
"learning_rate": 1.0433785479669038e-06,
"loss": 0.8874,
"step": 392
},
{
"epoch": 1.0287958115183247,
"grad_norm": 1.3438440478368636,
"learning_rate": 1.039043021287061e-06,
"loss": 0.9186,
"step": 393
},
{
"epoch": 1.031413612565445,
"grad_norm": 1.5703077556397094,
"learning_rate": 1.034706759475182e-06,
"loss": 0.9052,
"step": 394
},
{
"epoch": 1.0340314136125655,
"grad_norm": 1.3504157220975264,
"learning_rate": 1.03036984417774e-06,
"loss": 0.9045,
"step": 395
},
{
"epoch": 1.036649214659686,
"grad_norm": 1.4105634277460741,
"learning_rate": 1.026032357053512e-06,
"loss": 0.9045,
"step": 396
},
{
"epoch": 1.0392670157068062,
"grad_norm": 1.3475091071385106,
"learning_rate": 1.0216943797720417e-06,
"loss": 0.8633,
"step": 397
},
{
"epoch": 1.0418848167539267,
"grad_norm": 1.4138471590235702,
"learning_rate": 1.017355994012102e-06,
"loss": 0.8908,
"step": 398
},
{
"epoch": 1.044502617801047,
"grad_norm": 1.4770009484245705,
"learning_rate": 1.0130172814601574e-06,
"loss": 0.931,
"step": 399
},
{
"epoch": 1.0471204188481675,
"grad_norm": 1.4838585726093223,
"learning_rate": 1.0086783238088244e-06,
"loss": 0.8935,
"step": 400
},
{
"epoch": 1.049738219895288,
"grad_norm": 1.7353117348056972,
"learning_rate": 1.0043392027553359e-06,
"loss": 0.9103,
"step": 401
},
{
"epoch": 1.0523560209424083,
"grad_norm": 1.606805445159876,
"learning_rate": 1e-06,
"loss": 0.9098,
"step": 402
},
{
"epoch": 1.0549738219895288,
"grad_norm": 1.4003150648318952,
"learning_rate": 9.956607972446642e-07,
"loss": 0.911,
"step": 403
},
{
"epoch": 1.057591623036649,
"grad_norm": 1.3167792983140534,
"learning_rate": 9.913216761911753e-07,
"loss": 0.9009,
"step": 404
},
{
"epoch": 1.0602094240837696,
"grad_norm": 1.2725669879710217,
"learning_rate": 9.869827185398427e-07,
"loss": 0.8839,
"step": 405
},
{
"epoch": 1.0628272251308901,
"grad_norm": 1.2890395865651842,
"learning_rate": 9.826440059878981e-07,
"loss": 0.9019,
"step": 406
},
{
"epoch": 1.0654450261780104,
"grad_norm": 1.3894062424259876,
"learning_rate": 9.783056202279587e-07,
"loss": 0.9324,
"step": 407
},
{
"epoch": 1.068062827225131,
"grad_norm": 1.3917884191601717,
"learning_rate": 9.73967642946488e-07,
"loss": 0.8865,
"step": 408
},
{
"epoch": 1.0706806282722514,
"grad_norm": 1.3702754228543925,
"learning_rate": 9.6963015582226e-07,
"loss": 0.8896,
"step": 409
},
{
"epoch": 1.0732984293193717,
"grad_norm": 1.4183394433577425,
"learning_rate": 9.65293240524818e-07,
"loss": 0.9622,
"step": 410
},
{
"epoch": 1.0759162303664922,
"grad_norm": 1.8223040196130649,
"learning_rate": 9.609569787129392e-07,
"loss": 0.9445,
"step": 411
},
{
"epoch": 1.0785340314136125,
"grad_norm": 1.561543253672229,
"learning_rate": 9.566214520330965e-07,
"loss": 0.9201,
"step": 412
},
{
"epoch": 1.081151832460733,
"grad_norm": 1.5251337755140832,
"learning_rate": 9.52286742117921e-07,
"loss": 0.8734,
"step": 413
},
{
"epoch": 1.0837696335078535,
"grad_norm": 1.2585711830780457,
"learning_rate": 9.479529305846652e-07,
"loss": 0.8811,
"step": 414
},
{
"epoch": 1.0863874345549738,
"grad_norm": 1.347193385434298,
"learning_rate": 9.436200990336656e-07,
"loss": 0.9101,
"step": 415
},
{
"epoch": 1.0890052356020943,
"grad_norm": 1.380510360812572,
"learning_rate": 9.392883290468082e-07,
"loss": 0.9352,
"step": 416
},
{
"epoch": 1.0916230366492146,
"grad_norm": 1.4226456539762178,
"learning_rate": 9.349577021859899e-07,
"loss": 0.9216,
"step": 417
},
{
"epoch": 1.094240837696335,
"grad_norm": 1.4185426724478578,
"learning_rate": 9.306282999915839e-07,
"loss": 0.8718,
"step": 418
},
{
"epoch": 1.0968586387434556,
"grad_norm": 1.6442742168613387,
"learning_rate": 9.263002039809055e-07,
"loss": 0.9369,
"step": 419
},
{
"epoch": 1.0994764397905759,
"grad_norm": 1.4966541668940625,
"learning_rate": 9.219734956466752e-07,
"loss": 0.9093,
"step": 420
},
{
"epoch": 1.1020942408376964,
"grad_norm": 1.5331073728044513,
"learning_rate": 9.176482564554853e-07,
"loss": 0.8945,
"step": 421
},
{
"epoch": 1.1047120418848166,
"grad_norm": 2.010031110583405,
"learning_rate": 9.133245678462662e-07,
"loss": 0.8757,
"step": 422
},
{
"epoch": 1.1073298429319371,
"grad_norm": 1.4805034302628122,
"learning_rate": 9.090025112287532e-07,
"loss": 0.9101,
"step": 423
},
{
"epoch": 1.1099476439790577,
"grad_norm": 1.3324528881382394,
"learning_rate": 9.046821679819526e-07,
"loss": 0.8468,
"step": 424
},
{
"epoch": 1.112565445026178,
"grad_norm": 1.5950663314140405,
"learning_rate": 9.003636194526098e-07,
"loss": 0.859,
"step": 425
},
{
"epoch": 1.1151832460732984,
"grad_norm": 1.4696265552281182,
"learning_rate": 8.960469469536784e-07,
"loss": 0.9125,
"step": 426
},
{
"epoch": 1.117801047120419,
"grad_norm": 1.7012055856407813,
"learning_rate": 8.917322317627886e-07,
"loss": 0.9044,
"step": 427
},
{
"epoch": 1.1204188481675392,
"grad_norm": 1.3440632345526482,
"learning_rate": 8.874195551207173e-07,
"loss": 0.9052,
"step": 428
},
{
"epoch": 1.1230366492146597,
"grad_norm": 1.494387132622485,
"learning_rate": 8.831089982298568e-07,
"loss": 0.8855,
"step": 429
},
{
"epoch": 1.12565445026178,
"grad_norm": 1.2116093561626082,
"learning_rate": 8.78800642252688e-07,
"loss": 0.9089,
"step": 430
},
{
"epoch": 1.1282722513089005,
"grad_norm": 1.3952551501152495,
"learning_rate": 8.744945683102516e-07,
"loss": 0.903,
"step": 431
},
{
"epoch": 1.130890052356021,
"grad_norm": 1.4380203340874709,
"learning_rate": 8.701908574806198e-07,
"loss": 0.8961,
"step": 432
},
{
"epoch": 1.1335078534031413,
"grad_norm": 1.321179107685139,
"learning_rate": 8.658895907973696e-07,
"loss": 0.8675,
"step": 433
},
{
"epoch": 1.1361256544502618,
"grad_norm": 1.5378152096859476,
"learning_rate": 8.615908492480598e-07,
"loss": 0.9023,
"step": 434
},
{
"epoch": 1.1387434554973823,
"grad_norm": 1.412669028369897,
"learning_rate": 8.572947137727022e-07,
"loss": 0.8696,
"step": 435
},
{
"epoch": 1.1413612565445026,
"grad_norm": 1.531047948413987,
"learning_rate": 8.530012652622397e-07,
"loss": 0.9266,
"step": 436
},
{
"epoch": 1.143979057591623,
"grad_norm": 1.3302826186622878,
"learning_rate": 8.487105845570242e-07,
"loss": 0.8793,
"step": 437
},
{
"epoch": 1.1465968586387434,
"grad_norm": 1.32167025755748,
"learning_rate": 8.444227524452919e-07,
"loss": 0.8921,
"step": 438
},
{
"epoch": 1.149214659685864,
"grad_norm": 1.437600669859301,
"learning_rate": 8.401378496616436e-07,
"loss": 0.9262,
"step": 439
},
{
"epoch": 1.1518324607329844,
"grad_norm": 1.7595701476639378,
"learning_rate": 8.358559568855248e-07,
"loss": 0.95,
"step": 440
},
{
"epoch": 1.1544502617801047,
"grad_norm": 1.8234006679918948,
"learning_rate": 8.315771547397069e-07,
"loss": 0.9589,
"step": 441
},
{
"epoch": 1.1570680628272252,
"grad_norm": 1.452249454487249,
"learning_rate": 8.273015237887673e-07,
"loss": 0.9084,
"step": 442
},
{
"epoch": 1.1596858638743455,
"grad_norm": 1.7580904230300225,
"learning_rate": 8.230291445375743e-07,
"loss": 0.8941,
"step": 443
},
{
"epoch": 1.162303664921466,
"grad_norm": 1.3278204456920104,
"learning_rate": 8.187600974297713e-07,
"loss": 0.8985,
"step": 444
},
{
"epoch": 1.1649214659685865,
"grad_norm": 1.4027118574490405,
"learning_rate": 8.144944628462602e-07,
"loss": 0.8731,
"step": 445
},
{
"epoch": 1.1675392670157068,
"grad_norm": 1.415174215071559,
"learning_rate": 8.102323211036903e-07,
"loss": 0.8845,
"step": 446
},
{
"epoch": 1.1701570680628273,
"grad_norm": 1.3913552918511438,
"learning_rate": 8.059737524529443e-07,
"loss": 0.8932,
"step": 447
},
{
"epoch": 1.1727748691099475,
"grad_norm": 1.3393476374259683,
"learning_rate": 8.017188370776291e-07,
"loss": 0.9429,
"step": 448
},
{
"epoch": 1.175392670157068,
"grad_norm": 1.339931563196864,
"learning_rate": 7.974676550925638e-07,
"loss": 0.8584,
"step": 449
},
{
"epoch": 1.1780104712041886,
"grad_norm": 1.4030008780056942,
"learning_rate": 7.932202865422726e-07,
"loss": 0.8831,
"step": 450
},
{
"epoch": 1.1806282722513088,
"grad_norm": 1.8118925202824216,
"learning_rate": 7.889768113994779e-07,
"loss": 0.8887,
"step": 451
},
{
"epoch": 1.1832460732984293,
"grad_norm": 1.5387839907662275,
"learning_rate": 7.847373095635936e-07,
"loss": 0.8957,
"step": 452
},
{
"epoch": 1.1858638743455496,
"grad_norm": 1.3918514287546606,
"learning_rate": 7.805018608592211e-07,
"loss": 0.9043,
"step": 453
},
{
"epoch": 1.1884816753926701,
"grad_norm": 1.2858265895726548,
"learning_rate": 7.76270545034646e-07,
"loss": 0.8629,
"step": 454
},
{
"epoch": 1.1910994764397906,
"grad_norm": 1.3873983010304787,
"learning_rate": 7.720434417603383e-07,
"loss": 0.8948,
"step": 455
},
{
"epoch": 1.193717277486911,
"grad_norm": 1.317347612940767,
"learning_rate": 7.678206306274494e-07,
"loss": 0.8789,
"step": 456
},
{
"epoch": 1.1963350785340314,
"grad_norm": 1.5036388466833512,
"learning_rate": 7.636021911463151e-07,
"loss": 0.9402,
"step": 457
},
{
"epoch": 1.1989528795811517,
"grad_norm": 1.427135257044766,
"learning_rate": 7.59388202744959e-07,
"loss": 0.9449,
"step": 458
},
{
"epoch": 1.2015706806282722,
"grad_norm": 1.93494024531244,
"learning_rate": 7.551787447675961e-07,
"loss": 0.8978,
"step": 459
},
{
"epoch": 1.2041884816753927,
"grad_norm": 1.4160041714291973,
"learning_rate": 7.509738964731388e-07,
"loss": 0.8502,
"step": 460
},
{
"epoch": 1.206806282722513,
"grad_norm": 1.4158880080077554,
"learning_rate": 7.467737370337053e-07,
"loss": 0.8544,
"step": 461
},
{
"epoch": 1.2094240837696335,
"grad_norm": 1.4319367593292147,
"learning_rate": 7.42578345533128e-07,
"loss": 0.8924,
"step": 462
},
{
"epoch": 1.212041884816754,
"grad_norm": 1.5603806054375955,
"learning_rate": 7.383878009654656e-07,
"loss": 0.9332,
"step": 463
},
{
"epoch": 1.2146596858638743,
"grad_norm": 1.6030080299637368,
"learning_rate": 7.342021822335142e-07,
"loss": 0.9562,
"step": 464
},
{
"epoch": 1.2172774869109948,
"grad_norm": 1.4321929382537035,
"learning_rate": 7.300215681473223e-07,
"loss": 0.8923,
"step": 465
},
{
"epoch": 1.2198952879581153,
"grad_norm": 1.5156349677916563,
"learning_rate": 7.258460374227084e-07,
"loss": 0.9585,
"step": 466
},
{
"epoch": 1.2225130890052356,
"grad_norm": 1.382771006951781,
"learning_rate": 7.216756686797763e-07,
"loss": 0.8921,
"step": 467
},
{
"epoch": 1.225130890052356,
"grad_norm": 1.3862045180941078,
"learning_rate": 7.175105404414361e-07,
"loss": 0.9613,
"step": 468
},
{
"epoch": 1.2277486910994764,
"grad_norm": 1.403237935502315,
"learning_rate": 7.133507311319269e-07,
"loss": 0.8979,
"step": 469
},
{
"epoch": 1.2303664921465969,
"grad_norm": 1.470944258568419,
"learning_rate": 7.091963190753377e-07,
"loss": 0.938,
"step": 470
},
{
"epoch": 1.2329842931937174,
"grad_norm": 1.499848647249035,
"learning_rate": 7.050473824941339e-07,
"loss": 0.9093,
"step": 471
},
{
"epoch": 1.2356020942408377,
"grad_norm": 1.405120300665954,
"learning_rate": 7.009039995076844e-07,
"loss": 0.928,
"step": 472
},
{
"epoch": 1.2382198952879582,
"grad_norm": 1.4681450182994786,
"learning_rate": 6.967662481307922e-07,
"loss": 0.8985,
"step": 473
},
{
"epoch": 1.2408376963350785,
"grad_norm": 1.3777720330440961,
"learning_rate": 6.926342062722222e-07,
"loss": 0.8719,
"step": 474
},
{
"epoch": 1.243455497382199,
"grad_norm": 1.4958767523410936,
"learning_rate": 6.885079517332366e-07,
"loss": 0.8984,
"step": 475
},
{
"epoch": 1.2460732984293195,
"grad_norm": 1.5727144596330556,
"learning_rate": 6.843875622061304e-07,
"loss": 0.8878,
"step": 476
},
{
"epoch": 1.2486910994764397,
"grad_norm": 1.6315335944052536,
"learning_rate": 6.802731152727663e-07,
"loss": 0.91,
"step": 477
},
{
"epoch": 1.2513089005235603,
"grad_norm": 1.5654763348760663,
"learning_rate": 6.761646884031163e-07,
"loss": 0.8597,
"step": 478
},
{
"epoch": 1.2539267015706805,
"grad_norm": 1.3376352257465756,
"learning_rate": 6.720623589538013e-07,
"loss": 0.9081,
"step": 479
},
{
"epoch": 1.256544502617801,
"grad_norm": 1.5086059528146298,
"learning_rate": 6.679662041666361e-07,
"loss": 0.8981,
"step": 480
},
{
"epoch": 1.2591623036649215,
"grad_norm": 1.2782585477588344,
"learning_rate": 6.638763011671735e-07,
"loss": 0.8778,
"step": 481
},
{
"epoch": 1.2617801047120418,
"grad_norm": 1.7286688483189723,
"learning_rate": 6.597927269632526e-07,
"loss": 0.8708,
"step": 482
},
{
"epoch": 1.2643979057591623,
"grad_norm": 1.2561419055543754,
"learning_rate": 6.557155584435503e-07,
"loss": 0.8966,
"step": 483
},
{
"epoch": 1.2670157068062826,
"grad_norm": 1.497272759598682,
"learning_rate": 6.516448723761314e-07,
"loss": 0.8719,
"step": 484
},
{
"epoch": 1.2696335078534031,
"grad_norm": 1.6250829967641724,
"learning_rate": 6.475807454070039e-07,
"loss": 0.8856,
"step": 485
},
{
"epoch": 1.2722513089005236,
"grad_norm": 1.6479813154722118,
"learning_rate": 6.435232540586762e-07,
"loss": 0.9266,
"step": 486
},
{
"epoch": 1.274869109947644,
"grad_norm": 1.3286340505653726,
"learning_rate": 6.394724747287172e-07,
"loss": 0.8334,
"step": 487
},
{
"epoch": 1.2774869109947644,
"grad_norm": 1.4542515014039075,
"learning_rate": 6.354284836883156e-07,
"loss": 0.8887,
"step": 488
},
{
"epoch": 1.2801047120418847,
"grad_norm": 1.3724418981619309,
"learning_rate": 6.313913570808446e-07,
"loss": 0.8706,
"step": 489
},
{
"epoch": 1.2827225130890052,
"grad_norm": 1.3658073904261523,
"learning_rate": 6.273611709204303e-07,
"loss": 0.9141,
"step": 490
},
{
"epoch": 1.2853403141361257,
"grad_norm": 1.2739502124007493,
"learning_rate": 6.233380010905174e-07,
"loss": 0.9124,
"step": 491
},
{
"epoch": 1.2879581151832462,
"grad_norm": 1.8768508251733684,
"learning_rate": 6.193219233424414e-07,
"loss": 0.9036,
"step": 492
},
{
"epoch": 1.2905759162303665,
"grad_norm": 1.3168948507652463,
"learning_rate": 6.153130132940036e-07,
"loss": 0.9322,
"step": 493
},
{
"epoch": 1.2931937172774868,
"grad_norm": 1.4566708836290705,
"learning_rate": 6.11311346428046e-07,
"loss": 0.9675,
"step": 494
},
{
"epoch": 1.2958115183246073,
"grad_norm": 1.3456105635036395,
"learning_rate": 6.073169980910307e-07,
"loss": 0.8839,
"step": 495
},
{
"epoch": 1.2984293193717278,
"grad_norm": 1.3260427877201129,
"learning_rate": 6.033300434916202e-07,
"loss": 0.8501,
"step": 496
},
{
"epoch": 1.3010471204188483,
"grad_norm": 1.6991685082617407,
"learning_rate": 5.993505576992622e-07,
"loss": 0.8694,
"step": 497
},
{
"epoch": 1.3036649214659686,
"grad_norm": 1.2197619039548226,
"learning_rate": 5.953786156427764e-07,
"loss": 0.9285,
"step": 498
},
{
"epoch": 1.306282722513089,
"grad_norm": 1.5649739326206697,
"learning_rate": 5.914142921089433e-07,
"loss": 0.9077,
"step": 499
},
{
"epoch": 1.3089005235602094,
"grad_norm": 1.5043102113788342,
"learning_rate": 5.874576617410949e-07,
"loss": 0.9359,
"step": 500
},
{
"epoch": 1.3115183246073299,
"grad_norm": 2.2191105066016523,
"learning_rate": 5.835087990377123e-07,
"loss": 0.8882,
"step": 501
},
{
"epoch": 1.3141361256544504,
"grad_norm": 1.3870827210325436,
"learning_rate": 5.795677783510186e-07,
"loss": 0.8605,
"step": 502
},
{
"epoch": 1.3167539267015707,
"grad_norm": 1.3303488313205487,
"learning_rate": 5.756346738855835e-07,
"loss": 0.862,
"step": 503
},
{
"epoch": 1.3193717277486912,
"grad_norm": 1.4039158189310836,
"learning_rate": 5.717095596969226e-07,
"loss": 0.8973,
"step": 504
},
{
"epoch": 1.3219895287958114,
"grad_norm": 1.2314389814739966,
"learning_rate": 5.677925096901055e-07,
"loss": 0.8651,
"step": 505
},
{
"epoch": 1.324607329842932,
"grad_norm": 1.3345927348395523,
"learning_rate": 5.638835976183627e-07,
"loss": 0.8745,
"step": 506
},
{
"epoch": 1.3272251308900525,
"grad_norm": 1.4154278961549511,
"learning_rate": 5.599828970816963e-07,
"loss": 0.8673,
"step": 507
},
{
"epoch": 1.3298429319371727,
"grad_norm": 1.3638849226919136,
"learning_rate": 5.560904815254979e-07,
"loss": 0.9074,
"step": 508
},
{
"epoch": 1.3324607329842932,
"grad_norm": 1.3669358510510996,
"learning_rate": 5.522064242391603e-07,
"loss": 0.8715,
"step": 509
},
{
"epoch": 1.3350785340314135,
"grad_norm": 1.5856610536711122,
"learning_rate": 5.483307983547025e-07,
"loss": 0.9246,
"step": 510
},
{
"epoch": 1.337696335078534,
"grad_norm": 1.365878150253015,
"learning_rate": 5.444636768453887e-07,
"loss": 0.876,
"step": 511
},
{
"epoch": 1.3403141361256545,
"grad_norm": 1.6334459477041363,
"learning_rate": 5.406051325243585e-07,
"loss": 0.9312,
"step": 512
},
{
"epoch": 1.3429319371727748,
"grad_norm": 1.5863516351938438,
"learning_rate": 5.367552380432515e-07,
"loss": 0.9283,
"step": 513
},
{
"epoch": 1.3455497382198953,
"grad_norm": 1.3595941807771459,
"learning_rate": 5.329140658908422e-07,
"loss": 0.9232,
"step": 514
},
{
"epoch": 1.3481675392670156,
"grad_norm": 1.5206184191201402,
"learning_rate": 5.290816883916748e-07,
"loss": 0.8676,
"step": 515
},
{
"epoch": 1.350785340314136,
"grad_norm": 1.3031469098418837,
"learning_rate": 5.252581777047008e-07,
"loss": 0.8812,
"step": 516
},
{
"epoch": 1.3534031413612566,
"grad_norm": 1.3798809076308727,
"learning_rate": 5.214436058219198e-07,
"loss": 0.9039,
"step": 517
},
{
"epoch": 1.356020942408377,
"grad_norm": 1.3510273757712852,
"learning_rate": 5.176380445670254e-07,
"loss": 0.8814,
"step": 518
},
{
"epoch": 1.3586387434554974,
"grad_norm": 1.542901220604215,
"learning_rate": 5.138415655940525e-07,
"loss": 0.9526,
"step": 519
},
{
"epoch": 1.3612565445026177,
"grad_norm": 1.2836209031828834,
"learning_rate": 5.100542403860271e-07,
"loss": 0.856,
"step": 520
},
{
"epoch": 1.3638743455497382,
"grad_norm": 1.4938375796062573,
"learning_rate": 5.062761402536215e-07,
"loss": 0.9408,
"step": 521
},
{
"epoch": 1.3664921465968587,
"grad_norm": 2.3056799393831082,
"learning_rate": 5.02507336333811e-07,
"loss": 0.902,
"step": 522
},
{
"epoch": 1.369109947643979,
"grad_norm": 1.368596540328692,
"learning_rate": 4.98747899588535e-07,
"loss": 0.874,
"step": 523
},
{
"epoch": 1.3717277486910995,
"grad_norm": 1.509703116789799,
"learning_rate": 4.949979008033595e-07,
"loss": 0.8776,
"step": 524
},
{
"epoch": 1.3743455497382198,
"grad_norm": 1.493268000765195,
"learning_rate": 4.912574105861465e-07,
"loss": 0.9217,
"step": 525
},
{
"epoch": 1.3769633507853403,
"grad_norm": 1.714251809547912,
"learning_rate": 4.87526499365723e-07,
"loss": 0.8575,
"step": 526
},
{
"epoch": 1.3795811518324608,
"grad_norm": 1.4496034561474174,
"learning_rate": 4.838052373905553e-07,
"loss": 0.8833,
"step": 527
},
{
"epoch": 1.3821989528795813,
"grad_norm": 1.56426776623298,
"learning_rate": 4.800936947274254e-07,
"loss": 0.8553,
"step": 528
},
{
"epoch": 1.3848167539267016,
"grad_norm": 1.591662406148868,
"learning_rate": 4.7639194126011486e-07,
"loss": 0.8626,
"step": 529
},
{
"epoch": 1.387434554973822,
"grad_norm": 1.2998408316507073,
"learning_rate": 4.7270004668808393e-07,
"loss": 0.8924,
"step": 530
},
{
"epoch": 1.3900523560209423,
"grad_norm": 1.7152024963422792,
"learning_rate": 4.690180805251643e-07,
"loss": 0.8902,
"step": 531
},
{
"epoch": 1.3926701570680629,
"grad_norm": 1.3075264023398263,
"learning_rate": 4.653461120982459e-07,
"loss": 0.8603,
"step": 532
},
{
"epoch": 1.3952879581151834,
"grad_norm": 1.276878966251307,
"learning_rate": 4.6168421054597606e-07,
"loss": 0.8739,
"step": 533
},
{
"epoch": 1.3979057591623036,
"grad_norm": 1.4884315886808126,
"learning_rate": 4.5803244481745276e-07,
"loss": 0.8923,
"step": 534
},
{
"epoch": 1.4005235602094241,
"grad_norm": 1.6380352911517773,
"learning_rate": 4.5439088367093036e-07,
"loss": 0.9608,
"step": 535
},
{
"epoch": 1.4031413612565444,
"grad_norm": 1.4430469631924363,
"learning_rate": 4.507595956725233e-07,
"loss": 0.8983,
"step": 536
},
{
"epoch": 1.405759162303665,
"grad_norm": 1.4694298853784378,
"learning_rate": 4.471386491949151e-07,
"loss": 0.8383,
"step": 537
},
{
"epoch": 1.4083769633507854,
"grad_norm": 1.9449190678271149,
"learning_rate": 4.4352811241607146e-07,
"loss": 0.8741,
"step": 538
},
{
"epoch": 1.4109947643979057,
"grad_norm": 1.5509421449752532,
"learning_rate": 4.39928053317955e-07,
"loss": 0.8887,
"step": 539
},
{
"epoch": 1.4136125654450262,
"grad_norm": 1.3749583182027436,
"learning_rate": 4.36338539685249e-07,
"loss": 0.9093,
"step": 540
},
{
"epoch": 1.4162303664921465,
"grad_norm": 1.3975843157116803,
"learning_rate": 4.32759639104076e-07,
"loss": 0.9235,
"step": 541
},
{
"epoch": 1.418848167539267,
"grad_norm": 1.4039921493904044,
"learning_rate": 4.2919141896072965e-07,
"loss": 0.9163,
"step": 542
},
{
"epoch": 1.4214659685863875,
"grad_norm": 1.3949577352275373,
"learning_rate": 4.256339464404024e-07,
"loss": 0.8548,
"step": 543
},
{
"epoch": 1.4240837696335078,
"grad_norm": 1.352669832446612,
"learning_rate": 4.2208728852592466e-07,
"loss": 0.9593,
"step": 544
},
{
"epoch": 1.4267015706806283,
"grad_norm": 1.535192382477205,
"learning_rate": 4.185515119964985e-07,
"loss": 0.9072,
"step": 545
},
{
"epoch": 1.4293193717277486,
"grad_norm": 1.4024835914952385,
"learning_rate": 4.150266834264445e-07,
"loss": 0.8771,
"step": 546
},
{
"epoch": 1.431937172774869,
"grad_norm": 1.3426811269822514,
"learning_rate": 4.115128691839463e-07,
"loss": 0.8857,
"step": 547
},
{
"epoch": 1.4345549738219896,
"grad_norm": 2.2188316350749986,
"learning_rate": 4.0801013542980154e-07,
"loss": 0.8902,
"step": 548
},
{
"epoch": 1.4371727748691099,
"grad_norm": 1.6290927785062779,
"learning_rate": 4.045185481161747e-07,
"loss": 0.968,
"step": 549
},
{
"epoch": 1.4397905759162304,
"grad_norm": 1.4583741240333974,
"learning_rate": 4.010381729853579e-07,
"loss": 0.8961,
"step": 550
},
{
"epoch": 1.4424083769633507,
"grad_norm": 1.3107716580378566,
"learning_rate": 3.975690755685311e-07,
"loss": 0.8983,
"step": 551
},
{
"epoch": 1.4450261780104712,
"grad_norm": 1.3562349394678586,
"learning_rate": 3.9411132118452893e-07,
"loss": 0.9214,
"step": 552
},
{
"epoch": 1.4476439790575917,
"grad_norm": 1.3224730554942807,
"learning_rate": 3.906649749386105e-07,
"loss": 0.9057,
"step": 553
},
{
"epoch": 1.450261780104712,
"grad_norm": 1.353535129786952,
"learning_rate": 3.8723010172123373e-07,
"loss": 0.946,
"step": 554
},
{
"epoch": 1.4528795811518325,
"grad_norm": 1.4259143441660183,
"learning_rate": 3.838067662068341e-07,
"loss": 0.8604,
"step": 555
},
{
"epoch": 1.4554973821989527,
"grad_norm": 1.42186194700426,
"learning_rate": 3.80395032852605e-07,
"loss": 0.8439,
"step": 556
},
{
"epoch": 1.4581151832460733,
"grad_norm": 1.3809858189745732,
"learning_rate": 3.769949658972866e-07,
"loss": 0.928,
"step": 557
},
{
"epoch": 1.4607329842931938,
"grad_norm": 1.6516624180839072,
"learning_rate": 3.7360662935995504e-07,
"loss": 0.9032,
"step": 558
},
{
"epoch": 1.4633507853403143,
"grad_norm": 1.34386031295635,
"learning_rate": 3.70230087038817e-07,
"loss": 0.9219,
"step": 559
},
{
"epoch": 1.4659685863874345,
"grad_norm": 1.6338778956502633,
"learning_rate": 3.6686540251000754e-07,
"loss": 0.916,
"step": 560
},
{
"epoch": 1.468586387434555,
"grad_norm": 1.346375748845269,
"learning_rate": 3.635126391263964e-07,
"loss": 0.8901,
"step": 561
},
{
"epoch": 1.4712041884816753,
"grad_norm": 1.8950133658290673,
"learning_rate": 3.6017186001639035e-07,
"loss": 0.8983,
"step": 562
},
{
"epoch": 1.4738219895287958,
"grad_norm": 1.502360041436484,
"learning_rate": 3.5684312808274895e-07,
"loss": 0.8465,
"step": 563
},
{
"epoch": 1.4764397905759163,
"grad_norm": 1.3328999154470254,
"learning_rate": 3.5352650600139643e-07,
"loss": 0.8678,
"step": 564
},
{
"epoch": 1.4790575916230366,
"grad_norm": 1.5402031214432916,
"learning_rate": 3.502220562202457e-07,
"loss": 0.9039,
"step": 565
},
{
"epoch": 1.4816753926701571,
"grad_norm": 1.3596182174458997,
"learning_rate": 3.469298409580179e-07,
"loss": 0.8975,
"step": 566
},
{
"epoch": 1.4842931937172774,
"grad_norm": 2.2946988942603097,
"learning_rate": 3.4364992220307474e-07,
"loss": 0.8954,
"step": 567
},
{
"epoch": 1.486910994764398,
"grad_norm": 1.4327603069667216,
"learning_rate": 3.4038236171224943e-07,
"loss": 0.9415,
"step": 568
},
{
"epoch": 1.4895287958115184,
"grad_norm": 1.3641815612490016,
"learning_rate": 3.3712722100968416e-07,
"loss": 0.9026,
"step": 569
},
{
"epoch": 1.4921465968586387,
"grad_norm": 1.2637466433514526,
"learning_rate": 3.338845613856722e-07,
"loss": 0.8561,
"step": 570
},
{
"epoch": 1.4947643979057592,
"grad_norm": 1.250859176376699,
"learning_rate": 3.306544438955021e-07,
"loss": 0.8633,
"step": 571
},
{
"epoch": 1.4973821989528795,
"grad_norm": 1.4958091635550417,
"learning_rate": 3.2743692935831204e-07,
"loss": 0.9117,
"step": 572
},
{
"epoch": 1.5,
"grad_norm": 1.4189458972675342,
"learning_rate": 3.2423207835593945e-07,
"loss": 0.9277,
"step": 573
},
{
"epoch": 1.5026178010471205,
"grad_norm": 1.6386051647596955,
"learning_rate": 3.2103995123178485e-07,
"loss": 0.9326,
"step": 574
},
{
"epoch": 1.5052356020942408,
"grad_norm": 1.3086995202347653,
"learning_rate": 3.17860608089672e-07,
"loss": 0.9019,
"step": 575
},
{
"epoch": 1.5078534031413613,
"grad_norm": 1.4054865422218317,
"learning_rate": 3.146941087927203e-07,
"loss": 0.9337,
"step": 576
},
{
"epoch": 1.5104712041884816,
"grad_norm": 1.3123033746962398,
"learning_rate": 3.115405129622133e-07,
"loss": 0.923,
"step": 577
},
{
"epoch": 1.513089005235602,
"grad_norm": 1.4643032025460945,
"learning_rate": 3.083998799764793e-07,
"loss": 0.8798,
"step": 578
},
{
"epoch": 1.5157068062827226,
"grad_norm": 1.422050292940817,
"learning_rate": 3.052722689697719e-07,
"loss": 0.8686,
"step": 579
},
{
"epoch": 1.518324607329843,
"grad_norm": 1.6086227282469414,
"learning_rate": 3.02157738831157e-07,
"loss": 0.9343,
"step": 580
},
{
"epoch": 1.5209424083769634,
"grad_norm": 1.490803668534442,
"learning_rate": 2.990563482034032e-07,
"loss": 0.9108,
"step": 581
},
{
"epoch": 1.5235602094240837,
"grad_norm": 1.5896774793419899,
"learning_rate": 2.9596815548187906e-07,
"loss": 0.9147,
"step": 582
},
{
"epoch": 1.5261780104712042,
"grad_norm": 1.3359116805228912,
"learning_rate": 2.9289321881345254e-07,
"loss": 0.8956,
"step": 583
},
{
"epoch": 1.5287958115183247,
"grad_norm": 1.4082406709301296,
"learning_rate": 2.898315960953963e-07,
"loss": 0.9,
"step": 584
},
{
"epoch": 1.5314136125654452,
"grad_norm": 1.5960566913445038,
"learning_rate": 2.86783344974298e-07,
"loss": 0.8866,
"step": 585
},
{
"epoch": 1.5340314136125655,
"grad_norm": 1.3367632546914359,
"learning_rate": 2.837485228449744e-07,
"loss": 0.9182,
"step": 586
},
{
"epoch": 1.5366492146596857,
"grad_norm": 1.4328906430200836,
"learning_rate": 2.80727186849391e-07,
"loss": 0.9065,
"step": 587
},
{
"epoch": 1.5392670157068062,
"grad_norm": 1.4305707171373445,
"learning_rate": 2.777193938755855e-07,
"loss": 0.8474,
"step": 588
},
{
"epoch": 1.5418848167539267,
"grad_norm": 1.390558186229553,
"learning_rate": 2.7472520055659766e-07,
"loss": 0.8292,
"step": 589
},
{
"epoch": 1.5445026178010473,
"grad_norm": 1.5168812321972025,
"learning_rate": 2.717446632694025e-07,
"loss": 0.9483,
"step": 590
},
{
"epoch": 1.5471204188481675,
"grad_norm": 1.5339281627360795,
"learning_rate": 2.6877783813384893e-07,
"loss": 0.8949,
"step": 591
},
{
"epoch": 1.5497382198952878,
"grad_norm": 1.4176186502561052,
"learning_rate": 2.6582478101160166e-07,
"loss": 0.9198,
"step": 592
},
{
"epoch": 1.5523560209424083,
"grad_norm": 1.3908601156289901,
"learning_rate": 2.6288554750509283e-07,
"loss": 0.8816,
"step": 593
},
{
"epoch": 1.5549738219895288,
"grad_norm": 1.303626234335547,
"learning_rate": 2.599601929564709e-07,
"loss": 0.8803,
"step": 594
},
{
"epoch": 1.5575916230366493,
"grad_norm": 1.3719162542207297,
"learning_rate": 2.57048772446562e-07,
"loss": 0.8948,
"step": 595
},
{
"epoch": 1.5602094240837696,
"grad_norm": 1.3156007083318564,
"learning_rate": 2.5415134079383004e-07,
"loss": 0.8825,
"step": 596
},
{
"epoch": 1.56282722513089,
"grad_norm": 1.393225250452261,
"learning_rate": 2.5126795255334787e-07,
"loss": 0.9464,
"step": 597
},
{
"epoch": 1.5654450261780104,
"grad_norm": 1.4173100082790748,
"learning_rate": 2.4839866201576645e-07,
"loss": 0.8965,
"step": 598
},
{
"epoch": 1.568062827225131,
"grad_norm": 1.3223995366617138,
"learning_rate": 2.4554352320629523e-07,
"loss": 0.9205,
"step": 599
},
{
"epoch": 1.5706806282722514,
"grad_norm": 1.4027524768427433,
"learning_rate": 2.4270258988368374e-07,
"loss": 0.9074,
"step": 600
},
{
"epoch": 1.5732984293193717,
"grad_norm": 1.4584542546530008,
"learning_rate": 2.3987591553920996e-07,
"loss": 0.8893,
"step": 601
},
{
"epoch": 1.5759162303664922,
"grad_norm": 1.3388421557211998,
"learning_rate": 2.3706355339567286e-07,
"loss": 0.8849,
"step": 602
},
{
"epoch": 1.5785340314136125,
"grad_norm": 1.341899738253868,
"learning_rate": 2.3426555640638922e-07,
"loss": 0.9048,
"step": 603
},
{
"epoch": 1.581151832460733,
"grad_norm": 1.4453501391703267,
"learning_rate": 2.3148197725419983e-07,
"loss": 0.9189,
"step": 604
},
{
"epoch": 1.5837696335078535,
"grad_norm": 1.511859976381982,
"learning_rate": 2.2871286835047287e-07,
"loss": 0.9055,
"step": 605
},
{
"epoch": 1.5863874345549738,
"grad_norm": 1.3403547937150142,
"learning_rate": 2.2595828183412168e-07,
"loss": 0.8339,
"step": 606
},
{
"epoch": 1.5890052356020943,
"grad_norm": 1.3796848926133887,
"learning_rate": 2.2321826957061884e-07,
"loss": 0.917,
"step": 607
},
{
"epoch": 1.5916230366492146,
"grad_norm": 1.3988877715990504,
"learning_rate": 2.204928831510241e-07,
"loss": 0.9039,
"step": 608
},
{
"epoch": 1.594240837696335,
"grad_norm": 1.278731483728787,
"learning_rate": 2.1778217389100828e-07,
"loss": 0.9258,
"step": 609
},
{
"epoch": 1.5968586387434556,
"grad_norm": 1.3160813737990813,
"learning_rate": 2.1508619282989083e-07,
"loss": 0.8876,
"step": 610
},
{
"epoch": 1.599476439790576,
"grad_norm": 1.2756509102772609,
"learning_rate": 2.1240499072967676e-07,
"loss": 0.9271,
"step": 611
},
{
"epoch": 1.6020942408376964,
"grad_norm": 1.4288572172533927,
"learning_rate": 2.0973861807410187e-07,
"loss": 0.8502,
"step": 612
},
{
"epoch": 1.6047120418848166,
"grad_norm": 1.4064091065807276,
"learning_rate": 2.0708712506768077e-07,
"loss": 0.9031,
"step": 613
},
{
"epoch": 1.6073298429319371,
"grad_norm": 1.4628361057234258,
"learning_rate": 2.0445056163476372e-07,
"loss": 0.893,
"step": 614
},
{
"epoch": 1.6099476439790577,
"grad_norm": 1.2800166095474408,
"learning_rate": 2.0182897741859494e-07,
"loss": 0.9062,
"step": 615
},
{
"epoch": 1.6125654450261782,
"grad_norm": 1.4185981585601595,
"learning_rate": 1.9922242178037863e-07,
"loss": 0.8921,
"step": 616
},
{
"epoch": 1.6151832460732984,
"grad_norm": 1.4565744107743526,
"learning_rate": 1.966309437983491e-07,
"loss": 0.8639,
"step": 617
},
{
"epoch": 1.6178010471204187,
"grad_norm": 1.495668458429946,
"learning_rate": 1.9405459226684717e-07,
"loss": 0.8979,
"step": 618
},
{
"epoch": 1.6204188481675392,
"grad_norm": 1.3012489378658836,
"learning_rate": 1.9149341569540156e-07,
"loss": 0.8967,
"step": 619
},
{
"epoch": 1.6230366492146597,
"grad_norm": 1.281621052307457,
"learning_rate": 1.88947462307814e-07,
"loss": 0.8495,
"step": 620
},
{
"epoch": 1.6256544502617802,
"grad_norm": 1.4215901575416943,
"learning_rate": 1.8641678004125362e-07,
"loss": 0.8946,
"step": 621
},
{
"epoch": 1.6282722513089005,
"grad_norm": 1.3593548915385338,
"learning_rate": 1.8390141654535263e-07,
"loss": 0.8708,
"step": 622
},
{
"epoch": 1.6308900523560208,
"grad_norm": 1.521225370539659,
"learning_rate": 1.8140141918131003e-07,
"loss": 0.9211,
"step": 623
},
{
"epoch": 1.6335078534031413,
"grad_norm": 1.308383204107825,
"learning_rate": 1.7891683502099831e-07,
"loss": 0.872,
"step": 624
},
{
"epoch": 1.6361256544502618,
"grad_norm": 1.3011165358618517,
"learning_rate": 1.7644771084608011e-07,
"loss": 0.9185,
"step": 625
},
{
"epoch": 1.6387434554973823,
"grad_norm": 1.5506599670903491,
"learning_rate": 1.739940931471239e-07,
"loss": 0.8768,
"step": 626
},
{
"epoch": 1.6413612565445026,
"grad_norm": 1.3984936451622314,
"learning_rate": 1.715560281227315e-07,
"loss": 0.8728,
"step": 627
},
{
"epoch": 1.6439790575916229,
"grad_norm": 1.453272317924072,
"learning_rate": 1.6913356167866578e-07,
"loss": 0.8847,
"step": 628
},
{
"epoch": 1.6465968586387434,
"grad_norm": 1.5302879869825294,
"learning_rate": 1.6672673942698922e-07,
"loss": 0.8946,
"step": 629
},
{
"epoch": 1.649214659685864,
"grad_norm": 1.5604301829465825,
"learning_rate": 1.6433560668520174e-07,
"loss": 0.9157,
"step": 630
},
{
"epoch": 1.6518324607329844,
"grad_norm": 1.5376108645580229,
"learning_rate": 1.6196020847539006e-07,
"loss": 0.9386,
"step": 631
},
{
"epoch": 1.6544502617801047,
"grad_norm": 1.2960957446844783,
"learning_rate": 1.5960058952337884e-07,
"loss": 0.8951,
"step": 632
},
{
"epoch": 1.6570680628272252,
"grad_norm": 1.4271085017911613,
"learning_rate": 1.572567942578885e-07,
"loss": 0.8765,
"step": 633
},
{
"epoch": 1.6596858638743455,
"grad_norm": 1.3941354342600962,
"learning_rate": 1.5492886680969964e-07,
"loss": 0.9211,
"step": 634
},
{
"epoch": 1.662303664921466,
"grad_norm": 1.4547756229332254,
"learning_rate": 1.526168510108199e-07,
"loss": 0.9032,
"step": 635
},
{
"epoch": 1.6649214659685865,
"grad_norm": 2.046655662815991,
"learning_rate": 1.5032079039366208e-07,
"loss": 0.8988,
"step": 636
},
{
"epoch": 1.6675392670157068,
"grad_norm": 1.3480768897271267,
"learning_rate": 1.4804072819022106e-07,
"loss": 0.9378,
"step": 637
},
{
"epoch": 1.6701570680628273,
"grad_norm": 1.3812591568322627,
"learning_rate": 1.45776707331262e-07,
"loss": 0.9235,
"step": 638
},
{
"epoch": 1.6727748691099475,
"grad_norm": 1.433731226822694,
"learning_rate": 1.4352877044551048e-07,
"loss": 0.9036,
"step": 639
},
{
"epoch": 1.675392670157068,
"grad_norm": 1.475252375209803,
"learning_rate": 1.4129695985885227e-07,
"loss": 0.907,
"step": 640
},
{
"epoch": 1.6780104712041886,
"grad_norm": 1.6504645816597694,
"learning_rate": 1.3908131759353304e-07,
"loss": 0.8855,
"step": 641
},
{
"epoch": 1.680628272251309,
"grad_norm": 1.4370298290777148,
"learning_rate": 1.3688188536736968e-07,
"loss": 0.9286,
"step": 642
},
{
"epoch": 1.6832460732984293,
"grad_norm": 2.601924820023129,
"learning_rate": 1.3469870459296406e-07,
"loss": 0.8947,
"step": 643
},
{
"epoch": 1.6858638743455496,
"grad_norm": 1.2670411167466755,
"learning_rate": 1.3253181637692324e-07,
"loss": 0.8945,
"step": 644
},
{
"epoch": 1.6884816753926701,
"grad_norm": 1.3738351586559805,
"learning_rate": 1.303812615190849e-07,
"loss": 0.9443,
"step": 645
},
{
"epoch": 1.6910994764397906,
"grad_norm": 1.3428167518112526,
"learning_rate": 1.2824708051175014e-07,
"loss": 0.859,
"step": 646
},
{
"epoch": 1.6937172774869111,
"grad_norm": 1.5087146741136386,
"learning_rate": 1.2612931353892074e-07,
"loss": 0.8993,
"step": 647
},
{
"epoch": 1.6963350785340314,
"grad_norm": 1.3091751432038465,
"learning_rate": 1.2402800047554206e-07,
"loss": 0.8872,
"step": 648
},
{
"epoch": 1.6989528795811517,
"grad_norm": 1.4051594837211576,
"learning_rate": 1.2194318088675282e-07,
"loss": 0.9054,
"step": 649
},
{
"epoch": 1.7015706806282722,
"grad_norm": 1.2366099250393099,
"learning_rate": 1.198748940271398e-07,
"loss": 0.9225,
"step": 650
},
{
"epoch": 1.7041884816753927,
"grad_norm": 1.2514264913592443,
"learning_rate": 1.1782317883999915e-07,
"loss": 0.9377,
"step": 651
},
{
"epoch": 1.7068062827225132,
"grad_norm": 1.570847699413206,
"learning_rate": 1.1578807395660206e-07,
"loss": 0.8891,
"step": 652
},
{
"epoch": 1.7094240837696335,
"grad_norm": 1.3355965811118744,
"learning_rate": 1.1376961769546889e-07,
"loss": 0.9141,
"step": 653
},
{
"epoch": 1.7120418848167538,
"grad_norm": 1.508690437109559,
"learning_rate": 1.1176784806164674e-07,
"loss": 0.8628,
"step": 654
},
{
"epoch": 1.7146596858638743,
"grad_norm": 1.6773878404961995,
"learning_rate": 1.0978280274599417e-07,
"loss": 0.8179,
"step": 655
},
{
"epoch": 1.7172774869109948,
"grad_norm": 1.4628566180299178,
"learning_rate": 1.078145191244706e-07,
"loss": 0.8923,
"step": 656
},
{
"epoch": 1.7198952879581153,
"grad_norm": 1.5566732390176854,
"learning_rate": 1.0586303425743493e-07,
"loss": 0.8911,
"step": 657
},
{
"epoch": 1.7225130890052356,
"grad_norm": 1.4155543135683646,
"learning_rate": 1.0392838488894462e-07,
"loss": 0.8629,
"step": 658
},
{
"epoch": 1.7251308900523559,
"grad_norm": 1.2831149126092454,
"learning_rate": 1.0201060744606637e-07,
"loss": 0.8875,
"step": 659
},
{
"epoch": 1.7277486910994764,
"grad_norm": 1.4332173905157346,
"learning_rate": 1.0010973803818856e-07,
"loss": 0.943,
"step": 660
},
{
"epoch": 1.7303664921465969,
"grad_norm": 1.2134779529389657,
"learning_rate": 9.822581245634321e-08,
"loss": 0.8183,
"step": 661
},
{
"epoch": 1.7329842931937174,
"grad_norm": 1.4149474814170255,
"learning_rate": 9.635886617252975e-08,
"loss": 0.901,
"step": 662
},
{
"epoch": 1.7356020942408377,
"grad_norm": 1.5242078190981234,
"learning_rate": 9.450893433904895e-08,
"loss": 0.8263,
"step": 663
},
{
"epoch": 1.738219895287958,
"grad_norm": 1.7776241358004243,
"learning_rate": 9.267605178784033e-08,
"loss": 0.8908,
"step": 664
},
{
"epoch": 1.7408376963350785,
"grad_norm": 1.4641461126272701,
"learning_rate": 9.086025302982648e-08,
"loss": 0.8887,
"step": 665
},
{
"epoch": 1.743455497382199,
"grad_norm": 2.5856902698595556,
"learning_rate": 8.906157225426313e-08,
"loss": 0.9558,
"step": 666
},
{
"epoch": 1.7460732984293195,
"grad_norm": 1.448131814585754,
"learning_rate": 8.728004332809514e-08,
"loss": 0.848,
"step": 667
},
{
"epoch": 1.7486910994764397,
"grad_norm": 1.3537545655810488,
"learning_rate": 8.55156997953197e-08,
"loss": 0.871,
"step": 668
},
{
"epoch": 1.7513089005235603,
"grad_norm": 1.3990392212643947,
"learning_rate": 8.37685748763538e-08,
"loss": 0.9056,
"step": 669
},
{
"epoch": 1.7539267015706805,
"grad_norm": 1.5454399399661,
"learning_rate": 8.203870146740932e-08,
"loss": 0.954,
"step": 670
},
{
"epoch": 1.756544502617801,
"grad_norm": 1.3257827710500718,
"learning_rate": 8.03261121398735e-08,
"loss": 0.9104,
"step": 671
},
{
"epoch": 1.7591623036649215,
"grad_norm": 1.3550433390583934,
"learning_rate": 7.86308391396956e-08,
"loss": 0.8676,
"step": 672
},
{
"epoch": 1.761780104712042,
"grad_norm": 1.4690820882565427,
"learning_rate": 7.695291438677931e-08,
"loss": 0.8799,
"step": 673
},
{
"epoch": 1.7643979057591623,
"grad_norm": 1.6381490580033888,
"learning_rate": 7.529236947438256e-08,
"loss": 0.9297,
"step": 674
},
{
"epoch": 1.7670157068062826,
"grad_norm": 1.4254089921050725,
"learning_rate": 7.364923566852244e-08,
"loss": 0.9021,
"step": 675
},
{
"epoch": 1.7696335078534031,
"grad_norm": 1.6928537975880145,
"learning_rate": 7.202354390738608e-08,
"loss": 0.8564,
"step": 676
},
{
"epoch": 1.7722513089005236,
"grad_norm": 1.3023570567264096,
"learning_rate": 7.041532480074819e-08,
"loss": 0.9184,
"step": 677
},
{
"epoch": 1.7748691099476441,
"grad_norm": 1.3429734060010876,
"learning_rate": 6.88246086293952e-08,
"loss": 0.9471,
"step": 678
},
{
"epoch": 1.7774869109947644,
"grad_norm": 1.382194472551508,
"learning_rate": 6.725142534455486e-08,
"loss": 0.8766,
"step": 679
},
{
"epoch": 1.7801047120418847,
"grad_norm": 1.3774349266930637,
"learning_rate": 6.569580456733204e-08,
"loss": 0.8905,
"step": 680
},
{
"epoch": 1.7827225130890052,
"grad_norm": 1.5906615374253104,
"learning_rate": 6.415777558815138e-08,
"loss": 0.8966,
"step": 681
},
{
"epoch": 1.7853403141361257,
"grad_norm": 1.4753708964257082,
"learning_rate": 6.263736736620551e-08,
"loss": 0.9317,
"step": 682
},
{
"epoch": 1.7879581151832462,
"grad_norm": 1.4312204179081733,
"learning_rate": 6.113460852890973e-08,
"loss": 0.8454,
"step": 683
},
{
"epoch": 1.7905759162303665,
"grad_norm": 1.5371219041917656,
"learning_rate": 5.964952737136353e-08,
"loss": 0.9033,
"step": 684
},
{
"epoch": 1.7931937172774868,
"grad_norm": 1.314841769284732,
"learning_rate": 5.8182151855816986e-08,
"loss": 0.8834,
"step": 685
},
{
"epoch": 1.7958115183246073,
"grad_norm": 1.353813544946452,
"learning_rate": 5.6732509611145284e-08,
"loss": 0.9084,
"step": 686
},
{
"epoch": 1.7984293193717278,
"grad_norm": 1.4640193110979116,
"learning_rate": 5.5300627932327706e-08,
"loss": 0.929,
"step": 687
},
{
"epoch": 1.8010471204188483,
"grad_norm": 1.3860981489888715,
"learning_rate": 5.388653377993324e-08,
"loss": 0.9187,
"step": 688
},
{
"epoch": 1.8036649214659686,
"grad_norm": 1.4257965000825006,
"learning_rate": 5.2490253779615133e-08,
"loss": 0.8793,
"step": 689
},
{
"epoch": 1.8062827225130889,
"grad_norm": 1.3839459669807797,
"learning_rate": 5.111181422160671e-08,
"loss": 0.9342,
"step": 690
},
{
"epoch": 1.8089005235602094,
"grad_norm": 1.927472714256995,
"learning_rate": 4.975124106022843e-08,
"loss": 0.912,
"step": 691
},
{
"epoch": 1.8115183246073299,
"grad_norm": 1.362684938892342,
"learning_rate": 4.840855991339798e-08,
"loss": 0.8619,
"step": 692
},
{
"epoch": 1.8141361256544504,
"grad_norm": 1.4760030845035397,
"learning_rate": 4.7083796062149297e-08,
"loss": 0.8613,
"step": 693
},
{
"epoch": 1.8167539267015707,
"grad_norm": 1.3554051401391647,
"learning_rate": 4.577697445015471e-08,
"loss": 0.8376,
"step": 694
},
{
"epoch": 1.819371727748691,
"grad_norm": 1.6346763895111678,
"learning_rate": 4.448811968325683e-08,
"loss": 0.8559,
"step": 695
},
{
"epoch": 1.8219895287958114,
"grad_norm": 1.4481074116917443,
"learning_rate": 4.321725602900472e-08,
"loss": 0.9446,
"step": 696
},
{
"epoch": 1.824607329842932,
"grad_norm": 1.2980006766112568,
"learning_rate": 4.196440741619678e-08,
"loss": 0.8896,
"step": 697
},
{
"epoch": 1.8272251308900525,
"grad_norm": 1.4191572591347388,
"learning_rate": 4.0729597434430164e-08,
"loss": 0.8363,
"step": 698
},
{
"epoch": 1.8298429319371727,
"grad_norm": 1.411676924931839,
"learning_rate": 3.9512849333657064e-08,
"loss": 0.8892,
"step": 699
},
{
"epoch": 1.8324607329842932,
"grad_norm": 1.3885038260855735,
"learning_rate": 3.8314186023746696e-08,
"loss": 0.8561,
"step": 700
},
{
"epoch": 1.8350785340314135,
"grad_norm": 1.4592148849927922,
"learning_rate": 3.713363007405379e-08,
"loss": 0.8753,
"step": 701
},
{
"epoch": 1.837696335078534,
"grad_norm": 1.5404990433301489,
"learning_rate": 3.5971203712993894e-08,
"loss": 0.9085,
"step": 702
},
{
"epoch": 1.8403141361256545,
"grad_norm": 1.3311115738208295,
"learning_rate": 3.482692882762461e-08,
"loss": 0.8894,
"step": 703
},
{
"epoch": 1.8429319371727748,
"grad_norm": 1.37981516106682,
"learning_rate": 3.3700826963233734e-08,
"loss": 0.8637,
"step": 704
},
{
"epoch": 1.8455497382198953,
"grad_norm": 1.5214558709057895,
"learning_rate": 3.2592919322933174e-08,
"loss": 0.9005,
"step": 705
},
{
"epoch": 1.8481675392670156,
"grad_norm": 1.3717062091821015,
"learning_rate": 3.150322676726025e-08,
"loss": 0.8954,
"step": 706
},
{
"epoch": 1.850785340314136,
"grad_norm": 1.306979872125254,
"learning_rate": 3.0431769813784595e-08,
"loss": 0.9342,
"step": 707
},
{
"epoch": 1.8534031413612566,
"grad_norm": 1.3332532481135164,
"learning_rate": 2.9378568636721836e-08,
"loss": 0.9161,
"step": 708
},
{
"epoch": 1.8560209424083771,
"grad_norm": 1.5679258379351098,
"learning_rate": 2.834364306655379e-08,
"loss": 0.9414,
"step": 709
},
{
"epoch": 1.8586387434554974,
"grad_norm": 1.3241677265890193,
"learning_rate": 2.7327012589655307e-08,
"loss": 0.9092,
"step": 710
},
{
"epoch": 1.8612565445026177,
"grad_norm": 1.848580838309608,
"learning_rate": 2.6328696347926783e-08,
"loss": 0.9327,
"step": 711
},
{
"epoch": 1.8638743455497382,
"grad_norm": 1.6231182635223822,
"learning_rate": 2.5348713138434564e-08,
"loss": 0.9256,
"step": 712
},
{
"epoch": 1.8664921465968587,
"grad_norm": 1.370044017499312,
"learning_rate": 2.43870814130559e-08,
"loss": 0.9057,
"step": 713
},
{
"epoch": 1.8691099476439792,
"grad_norm": 1.3354739269648654,
"learning_rate": 2.3443819278132992e-08,
"loss": 0.9143,
"step": 714
},
{
"epoch": 1.8717277486910995,
"grad_norm": 1.378182616321603,
"learning_rate": 2.251894449413061e-08,
"loss": 0.9092,
"step": 715
},
{
"epoch": 1.8743455497382198,
"grad_norm": 1.3052742423435106,
"learning_rate": 2.161247447530268e-08,
"loss": 0.9136,
"step": 716
},
{
"epoch": 1.8769633507853403,
"grad_norm": 1.6339003824243274,
"learning_rate": 2.0724426289363995e-08,
"loss": 0.8698,
"step": 717
},
{
"epoch": 1.8795811518324608,
"grad_norm": 1.377085237822559,
"learning_rate": 1.9854816657168817e-08,
"loss": 0.9006,
"step": 718
},
{
"epoch": 1.8821989528795813,
"grad_norm": 1.5109348592590883,
"learning_rate": 1.9003661952396223e-08,
"loss": 0.8986,
"step": 719
},
{
"epoch": 1.8848167539267016,
"grad_norm": 1.3810204321286608,
"learning_rate": 1.817097820124147e-08,
"loss": 0.863,
"step": 720
},
{
"epoch": 1.8874345549738218,
"grad_norm": 1.314882972235073,
"learning_rate": 1.7356781082115024e-08,
"loss": 0.8868,
"step": 721
},
{
"epoch": 1.8900523560209423,
"grad_norm": 1.4964476218671747,
"learning_rate": 1.656108592534633e-08,
"loss": 0.8755,
"step": 722
},
{
"epoch": 1.8926701570680629,
"grad_norm": 1.3775827034693857,
"learning_rate": 1.578390771289606e-08,
"loss": 0.8786,
"step": 723
},
{
"epoch": 1.8952879581151834,
"grad_norm": 1.497907001459483,
"learning_rate": 1.5025261078073003e-08,
"loss": 0.956,
"step": 724
},
{
"epoch": 1.8979057591623036,
"grad_norm": 1.422027343640092,
"learning_rate": 1.4285160305259836e-08,
"loss": 0.9062,
"step": 725
},
{
"epoch": 1.900523560209424,
"grad_norm": 1.3537382840275476,
"learning_rate": 1.3563619329643117e-08,
"loss": 0.894,
"step": 726
},
{
"epoch": 1.9031413612565444,
"grad_norm": 1.4016382516425014,
"learning_rate": 1.2860651736951278e-08,
"loss": 0.8895,
"step": 727
},
{
"epoch": 1.905759162303665,
"grad_norm": 1.290142647836188,
"learning_rate": 1.2176270763198825e-08,
"loss": 0.8809,
"step": 728
},
{
"epoch": 1.9083769633507854,
"grad_norm": 1.4168614683015706,
"learning_rate": 1.1510489294437431e-08,
"loss": 0.9017,
"step": 729
},
{
"epoch": 1.9109947643979057,
"grad_norm": 1.3924491890195099,
"learning_rate": 1.0863319866512344e-08,
"loss": 0.8747,
"step": 730
},
{
"epoch": 1.9136125654450262,
"grad_norm": 1.5107117645316126,
"learning_rate": 1.0234774664827473e-08,
"loss": 0.9059,
"step": 731
},
{
"epoch": 1.9162303664921465,
"grad_norm": 1.344447744542007,
"learning_rate": 9.624865524115344e-09,
"loss": 0.8854,
"step": 732
},
{
"epoch": 1.918848167539267,
"grad_norm": 1.442508498350657,
"learning_rate": 9.033603928214396e-09,
"loss": 0.8964,
"step": 733
},
{
"epoch": 1.9214659685863875,
"grad_norm": 1.3923227096852724,
"learning_rate": 8.461001009852809e-09,
"loss": 0.8501,
"step": 734
},
{
"epoch": 1.9240837696335078,
"grad_norm": 1.322210909486878,
"learning_rate": 7.907067550438684e-09,
"loss": 0.8854,
"step": 735
},
{
"epoch": 1.9267015706806283,
"grad_norm": 1.3293207958123026,
"learning_rate": 7.371813979857311e-09,
"loss": 0.9489,
"step": 736
},
{
"epoch": 1.9293193717277486,
"grad_norm": 1.6807388495323254,
"learning_rate": 6.855250376274546e-09,
"loss": 0.9322,
"step": 737
},
{
"epoch": 1.931937172774869,
"grad_norm": 1.4321457195007106,
"learning_rate": 6.357386465947301e-09,
"loss": 0.941,
"step": 738
},
{
"epoch": 1.9345549738219896,
"grad_norm": 1.366512872397213,
"learning_rate": 5.878231623040242e-09,
"loss": 0.9164,
"step": 739
},
{
"epoch": 1.93717277486911,
"grad_norm": 1.5040506501535371,
"learning_rate": 5.417794869449377e-09,
"loss": 0.9216,
"step": 740
},
{
"epoch": 1.9397905759162304,
"grad_norm": 1.5043370580153907,
"learning_rate": 4.9760848746319695e-09,
"loss": 0.903,
"step": 741
},
{
"epoch": 1.9424083769633507,
"grad_norm": 1.591493042963084,
"learning_rate": 4.553109955443557e-09,
"loss": 0.9202,
"step": 742
},
{
"epoch": 1.9450261780104712,
"grad_norm": 1.286578387396067,
"learning_rate": 4.148878075981299e-09,
"loss": 0.8912,
"step": 743
},
{
"epoch": 1.9476439790575917,
"grad_norm": 1.6240641009201287,
"learning_rate": 3.763396847433875e-09,
"loss": 0.8771,
"step": 744
},
{
"epoch": 1.9502617801047122,
"grad_norm": 1.4006914501273882,
"learning_rate": 3.3966735279384875e-09,
"loss": 0.8407,
"step": 745
},
{
"epoch": 1.9528795811518325,
"grad_norm": 1.2527953374854444,
"learning_rate": 3.0487150224437487e-09,
"loss": 0.8606,
"step": 746
},
{
"epoch": 1.9554973821989527,
"grad_norm": 1.3573387159729935,
"learning_rate": 2.7195278825801195e-09,
"loss": 0.8481,
"step": 747
},
{
"epoch": 1.9581151832460733,
"grad_norm": 1.8268988010137661,
"learning_rate": 2.4091183065362285e-09,
"loss": 0.9248,
"step": 748
},
{
"epoch": 1.9607329842931938,
"grad_norm": 1.286277864510778,
"learning_rate": 2.1174921389424114e-09,
"loss": 0.8809,
"step": 749
},
{
"epoch": 1.9633507853403143,
"grad_norm": 1.3340327743313127,
"learning_rate": 1.8446548707604648e-09,
"loss": 0.9177,
"step": 750
},
{
"epoch": 1.9659685863874345,
"grad_norm": 1.3617686042968828,
"learning_rate": 1.5906116391801726e-09,
"loss": 0.9111,
"step": 751
},
{
"epoch": 1.9685863874345548,
"grad_norm": 1.5794964764984032,
"learning_rate": 1.355367227523052e-09,
"loss": 0.9111,
"step": 752
},
{
"epoch": 1.9712041884816753,
"grad_norm": 1.4937021056451114,
"learning_rate": 1.1389260651518684e-09,
"loss": 0.8331,
"step": 753
},
{
"epoch": 1.9738219895287958,
"grad_norm": 1.4048693955875151,
"learning_rate": 9.412922273871471e-10,
"loss": 0.909,
"step": 754
},
{
"epoch": 1.9764397905759163,
"grad_norm": 1.5558437033842454,
"learning_rate": 7.624694354309014e-10,
"loss": 0.8696,
"step": 755
},
{
"epoch": 1.9790575916230366,
"grad_norm": 1.3565360114481129,
"learning_rate": 6.02461056296244e-10,
"loss": 0.9147,
"step": 756
},
{
"epoch": 1.981675392670157,
"grad_norm": 1.4504340570028544,
"learning_rate": 4.6127010274399356e-10,
"loss": 0.9321,
"step": 757
},
{
"epoch": 1.9842931937172774,
"grad_norm": 1.6341119346475543,
"learning_rate": 3.3889923322594217e-10,
"loss": 0.9144,
"step": 758
},
{
"epoch": 1.986910994764398,
"grad_norm": 1.3600992136272299,
"learning_rate": 2.353507518350062e-10,
"loss": 0.8706,
"step": 759
},
{
"epoch": 1.9895287958115184,
"grad_norm": 1.8493583140551575,
"learning_rate": 1.506266082615948e-10,
"loss": 0.8717,
"step": 760
},
{
"epoch": 1.9921465968586387,
"grad_norm": 1.3957149457130282,
"learning_rate": 8.472839775719442e-11,
"loss": 0.9138,
"step": 761
},
{
"epoch": 1.9947643979057592,
"grad_norm": 1.6670788746427903,
"learning_rate": 3.765736110383777e-11,
"loss": 0.9377,
"step": 762
},
{
"epoch": 1.9973821989528795,
"grad_norm": 1.3773872289172804,
"learning_rate": 9.414384591233116e-12,
"loss": 0.9113,
"step": 763
},
{
"epoch": 2.0,
"grad_norm": 1.4031461299714583,
"learning_rate": 0.0,
"loss": 0.9218,
"step": 764
}
],
"logging_steps": 1,
"max_steps": 764,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 191,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 996711585546240.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}