{ "best_metric": null, "best_model_checkpoint": null, "epoch": 11.913669064748202, "eval_steps": 500, "global_step": 828, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014388489208633094, "grad_norm": 3.779401339094513, "learning_rate": 8.000000000000001e-07, "loss": 3.8262, "step": 1 }, { "epoch": 0.02877697841726619, "grad_norm": 10.455825805969267, "learning_rate": 1.6000000000000001e-06, "loss": 6.7511, "step": 2 }, { "epoch": 0.04316546762589928, "grad_norm": 5.6185654464107815, "learning_rate": 2.4000000000000003e-06, "loss": 4.6228, "step": 3 }, { "epoch": 0.05755395683453238, "grad_norm": 4.885801768876069, "learning_rate": 3.2000000000000003e-06, "loss": 4.2241, "step": 4 }, { "epoch": 0.07194244604316546, "grad_norm": 10.403297492152396, "learning_rate": 4.000000000000001e-06, "loss": 6.7416, "step": 5 }, { "epoch": 0.08633093525179857, "grad_norm": 4.5044178093473395, "learning_rate": 4.800000000000001e-06, "loss": 4.1637, "step": 6 }, { "epoch": 0.10071942446043165, "grad_norm": 4.734693138114069, "learning_rate": 5.600000000000001e-06, "loss": 4.3628, "step": 7 }, { "epoch": 0.11510791366906475, "grad_norm": 4.288727085080122, "learning_rate": 6.4000000000000006e-06, "loss": 4.0006, "step": 8 }, { "epoch": 0.12949640287769784, "grad_norm": 6.746488042337733, "learning_rate": 7.2000000000000005e-06, "loss": 5.1247, "step": 9 }, { "epoch": 0.14388489208633093, "grad_norm": 4.730837471611876, "learning_rate": 8.000000000000001e-06, "loss": 4.167, "step": 10 }, { "epoch": 0.15827338129496402, "grad_norm": 5.617511430019142, "learning_rate": 8.8e-06, "loss": 4.8073, "step": 11 }, { "epoch": 0.17266187050359713, "grad_norm": 4.585866335374106, "learning_rate": 9.600000000000001e-06, "loss": 4.1609, "step": 12 }, { "epoch": 0.18705035971223022, "grad_norm": 11.787607704302536, "learning_rate": 1.04e-05, "loss": 7.1837, "step": 13 }, { "epoch": 0.2014388489208633, "grad_norm": 4.123753478777725, "learning_rate": 1.1200000000000001e-05, "loss": 3.9041, "step": 14 }, { "epoch": 0.2158273381294964, "grad_norm": 3.8762103667223227, "learning_rate": 1.2e-05, "loss": 3.7087, "step": 15 }, { "epoch": 0.2302158273381295, "grad_norm": 4.21611793164487, "learning_rate": 1.2800000000000001e-05, "loss": 3.8878, "step": 16 }, { "epoch": 0.2446043165467626, "grad_norm": 4.614901815617855, "learning_rate": 1.3600000000000002e-05, "loss": 4.098, "step": 17 }, { "epoch": 0.2589928057553957, "grad_norm": 3.8977663337164286, "learning_rate": 1.4400000000000001e-05, "loss": 3.6943, "step": 18 }, { "epoch": 0.2733812949640288, "grad_norm": 4.8023114000216465, "learning_rate": 1.5200000000000002e-05, "loss": 4.0675, "step": 19 }, { "epoch": 0.28776978417266186, "grad_norm": 4.371540379053842, "learning_rate": 1.6000000000000003e-05, "loss": 3.8599, "step": 20 }, { "epoch": 0.302158273381295, "grad_norm": 5.726889530637721, "learning_rate": 1.6800000000000002e-05, "loss": 4.5061, "step": 21 }, { "epoch": 0.31654676258992803, "grad_norm": 3.9969844690452887, "learning_rate": 1.76e-05, "loss": 3.6472, "step": 22 }, { "epoch": 0.33093525179856115, "grad_norm": 11.516154261012499, "learning_rate": 1.8400000000000003e-05, "loss": 6.2924, "step": 23 }, { "epoch": 0.34532374100719426, "grad_norm": 5.202990002676231, "learning_rate": 1.9200000000000003e-05, "loss": 4.0705, "step": 24 }, { "epoch": 0.3597122302158273, "grad_norm": 4.176782643264017, "learning_rate": 2e-05, "loss": 3.5909, "step": 25 }, { "epoch": 0.37410071942446044, "grad_norm": 6.424896250724197, "learning_rate": 1.9999923468873635e-05, "loss": 4.2519, "step": 26 }, { "epoch": 0.38848920863309355, "grad_norm": 3.814366825221532, "learning_rate": 1.999969387666594e-05, "loss": 3.2879, "step": 27 }, { "epoch": 0.4028776978417266, "grad_norm": 10.112357856323287, "learning_rate": 1.9999311226891104e-05, "loss": 5.1954, "step": 28 }, { "epoch": 0.4172661870503597, "grad_norm": 5.7174917457637795, "learning_rate": 1.999877552540605e-05, "loss": 3.7522, "step": 29 }, { "epoch": 0.4316546762589928, "grad_norm": 10.763244722760309, "learning_rate": 1.9998086780410353e-05, "loss": 4.8527, "step": 30 }, { "epoch": 0.4460431654676259, "grad_norm": 3.982452930852662, "learning_rate": 1.999724500244609e-05, "loss": 3.1306, "step": 31 }, { "epoch": 0.460431654676259, "grad_norm": 5.709026877118652, "learning_rate": 1.999625020439771e-05, "loss": 3.3423, "step": 32 }, { "epoch": 0.4748201438848921, "grad_norm": 6.659790318204302, "learning_rate": 1.999510240149181e-05, "loss": 3.4041, "step": 33 }, { "epoch": 0.4892086330935252, "grad_norm": 4.7853395869132545, "learning_rate": 1.9993801611296923e-05, "loss": 2.985, "step": 34 }, { "epoch": 0.5035971223021583, "grad_norm": 4.922928181710041, "learning_rate": 1.999234785372324e-05, "loss": 2.8203, "step": 35 }, { "epoch": 0.5179856115107914, "grad_norm": 5.760568951577959, "learning_rate": 1.9990741151022302e-05, "loss": 2.6983, "step": 36 }, { "epoch": 0.5323741007194245, "grad_norm": 4.4691409045575075, "learning_rate": 1.9988981527786656e-05, "loss": 2.5688, "step": 37 }, { "epoch": 0.5467625899280576, "grad_norm": 7.274174017956967, "learning_rate": 1.99870690109495e-05, "loss": 2.6621, "step": 38 }, { "epoch": 0.5611510791366906, "grad_norm": 4.569137013282271, "learning_rate": 1.9985003629784237e-05, "loss": 2.5249, "step": 39 }, { "epoch": 0.5755395683453237, "grad_norm": 2.929731882279866, "learning_rate": 1.9982785415904063e-05, "loss": 2.4861, "step": 40 }, { "epoch": 0.5899280575539568, "grad_norm": 3.7590625764657815, "learning_rate": 1.998041440326146e-05, "loss": 2.3703, "step": 41 }, { "epoch": 0.60431654676259, "grad_norm": 3.113972586690868, "learning_rate": 1.9977890628147684e-05, "loss": 2.3579, "step": 42 }, { "epoch": 0.6187050359712231, "grad_norm": 4.310115895361215, "learning_rate": 1.99752141291922e-05, "loss": 2.4436, "step": 43 }, { "epoch": 0.6330935251798561, "grad_norm": 3.609141873223579, "learning_rate": 1.99723849473621e-05, "loss": 2.3687, "step": 44 }, { "epoch": 0.6474820143884892, "grad_norm": 2.889773990288414, "learning_rate": 1.996940312596149e-05, "loss": 2.2521, "step": 45 }, { "epoch": 0.6618705035971223, "grad_norm": 2.8196363081660745, "learning_rate": 1.9966268710630795e-05, "loss": 2.1572, "step": 46 }, { "epoch": 0.6762589928057554, "grad_norm": 3.5289128758260255, "learning_rate": 1.996298174934608e-05, "loss": 2.2095, "step": 47 }, { "epoch": 0.6906474820143885, "grad_norm": 4.166486783655687, "learning_rate": 1.9959542292418317e-05, "loss": 2.0916, "step": 48 }, { "epoch": 0.7050359712230215, "grad_norm": 3.6766424577128047, "learning_rate": 1.9955950392492604e-05, "loss": 2.0578, "step": 49 }, { "epoch": 0.7194244604316546, "grad_norm": 2.989826753407446, "learning_rate": 1.9952206104547378e-05, "loss": 2.0855, "step": 50 }, { "epoch": 0.7338129496402878, "grad_norm": 6.40215562149978, "learning_rate": 1.994830948589355e-05, "loss": 1.8788, "step": 51 }, { "epoch": 0.7482014388489209, "grad_norm": 4.072155747218333, "learning_rate": 1.9944260596173642e-05, "loss": 1.9819, "step": 52 }, { "epoch": 0.762589928057554, "grad_norm": 3.724293840264578, "learning_rate": 1.9940059497360874e-05, "loss": 1.8445, "step": 53 }, { "epoch": 0.7769784172661871, "grad_norm": 3.406626514519604, "learning_rate": 1.9935706253758206e-05, "loss": 1.9222, "step": 54 }, { "epoch": 0.7913669064748201, "grad_norm": 3.3579698036375034, "learning_rate": 1.9931200931997372e-05, "loss": 1.716, "step": 55 }, { "epoch": 0.8057553956834532, "grad_norm": 3.430514351410994, "learning_rate": 1.9926543601037843e-05, "loss": 1.795, "step": 56 }, { "epoch": 0.8201438848920863, "grad_norm": 3.0933971993062017, "learning_rate": 1.992173433216577e-05, "loss": 1.6326, "step": 57 }, { "epoch": 0.8345323741007195, "grad_norm": 2.658306477244865, "learning_rate": 1.99167731989929e-05, "loss": 1.8422, "step": 58 }, { "epoch": 0.8489208633093526, "grad_norm": 2.609103199735189, "learning_rate": 1.9911660277455473e-05, "loss": 1.7832, "step": 59 }, { "epoch": 0.8633093525179856, "grad_norm": 3.0489907374135896, "learning_rate": 1.9906395645813e-05, "loss": 1.5908, "step": 60 }, { "epoch": 0.8776978417266187, "grad_norm": 2.9154327708322803, "learning_rate": 1.990097938464713e-05, "loss": 1.5599, "step": 61 }, { "epoch": 0.8920863309352518, "grad_norm": 2.6855178783593328, "learning_rate": 1.989541157686037e-05, "loss": 1.6876, "step": 62 }, { "epoch": 0.9064748201438849, "grad_norm": 2.410132159045943, "learning_rate": 1.9889692307674847e-05, "loss": 1.5693, "step": 63 }, { "epoch": 0.920863309352518, "grad_norm": 2.6119620830871244, "learning_rate": 1.9883821664630977e-05, "loss": 1.4076, "step": 64 }, { "epoch": 0.935251798561151, "grad_norm": 2.5077319176478996, "learning_rate": 1.987779973758615e-05, "loss": 1.5181, "step": 65 }, { "epoch": 0.9496402877697842, "grad_norm": 2.8061485591127266, "learning_rate": 1.987162661871333e-05, "loss": 1.3751, "step": 66 }, { "epoch": 0.9640287769784173, "grad_norm": 2.685195543198549, "learning_rate": 1.986530240249968e-05, "loss": 1.3248, "step": 67 }, { "epoch": 0.9784172661870504, "grad_norm": 2.6515044506712546, "learning_rate": 1.985882718574506e-05, "loss": 1.2112, "step": 68 }, { "epoch": 0.9928057553956835, "grad_norm": 2.1678461224085193, "learning_rate": 1.9852201067560607e-05, "loss": 1.3792, "step": 69 }, { "epoch": 1.0071942446043165, "grad_norm": 2.231749821096067, "learning_rate": 1.984542414936718e-05, "loss": 1.2438, "step": 70 }, { "epoch": 1.0215827338129497, "grad_norm": 2.5781231491148353, "learning_rate": 1.9838496534893807e-05, "loss": 1.5007, "step": 71 }, { "epoch": 1.0359712230215827, "grad_norm": 2.5108817806595574, "learning_rate": 1.9831418330176127e-05, "loss": 1.4368, "step": 72 }, { "epoch": 1.0503597122302157, "grad_norm": 1.9489467948836736, "learning_rate": 1.9824189643554724e-05, "loss": 1.2176, "step": 73 }, { "epoch": 1.064748201438849, "grad_norm": 2.431713013736794, "learning_rate": 1.9816810585673515e-05, "loss": 1.2662, "step": 74 }, { "epoch": 1.079136690647482, "grad_norm": 2.615444743987483, "learning_rate": 1.9809281269478015e-05, "loss": 0.6466, "step": 75 }, { "epoch": 1.0935251798561152, "grad_norm": 1.9780475992590945, "learning_rate": 1.9801601810213634e-05, "loss": 1.1773, "step": 76 }, { "epoch": 1.1079136690647482, "grad_norm": 2.738489524134038, "learning_rate": 1.979377232542391e-05, "loss": 0.7522, "step": 77 }, { "epoch": 1.1223021582733812, "grad_norm": 2.92031012217587, "learning_rate": 1.9785792934948697e-05, "loss": 1.2811, "step": 78 }, { "epoch": 1.1366906474820144, "grad_norm": 2.1812951842981407, "learning_rate": 1.9777663760922342e-05, "loss": 1.2223, "step": 79 }, { "epoch": 1.1510791366906474, "grad_norm": 1.9774480380800536, "learning_rate": 1.976938492777182e-05, "loss": 1.2216, "step": 80 }, { "epoch": 1.1654676258992807, "grad_norm": 2.0162999343359904, "learning_rate": 1.9760956562214808e-05, "loss": 1.1783, "step": 81 }, { "epoch": 1.1798561151079137, "grad_norm": 1.8114249309162656, "learning_rate": 1.9752378793257777e-05, "loss": 0.9817, "step": 82 }, { "epoch": 1.1942446043165469, "grad_norm": 2.7290657585488143, "learning_rate": 1.9743651752193983e-05, "loss": 0.9542, "step": 83 }, { "epoch": 1.20863309352518, "grad_norm": 2.017011266770035, "learning_rate": 1.9734775572601487e-05, "loss": 1.1217, "step": 84 }, { "epoch": 1.223021582733813, "grad_norm": 1.748500187733016, "learning_rate": 1.9725750390341093e-05, "loss": 0.7081, "step": 85 }, { "epoch": 1.2374100719424461, "grad_norm": 5.7387726390350515, "learning_rate": 1.9716576343554274e-05, "loss": 0.7381, "step": 86 }, { "epoch": 1.2517985611510791, "grad_norm": 2.160256977147074, "learning_rate": 1.9707253572661057e-05, "loss": 1.0861, "step": 87 }, { "epoch": 1.2661870503597124, "grad_norm": 2.4513022300810223, "learning_rate": 1.969778222035787e-05, "loss": 1.0924, "step": 88 }, { "epoch": 1.2805755395683454, "grad_norm": 2.2964346352019116, "learning_rate": 1.9688162431615367e-05, "loss": 0.7906, "step": 89 }, { "epoch": 1.2949640287769784, "grad_norm": 2.760514099820931, "learning_rate": 1.9678394353676203e-05, "loss": 1.0421, "step": 90 }, { "epoch": 1.3093525179856116, "grad_norm": 2.977379886340304, "learning_rate": 1.9668478136052776e-05, "loss": 1.0089, "step": 91 }, { "epoch": 1.3237410071942446, "grad_norm": 3.228823722579014, "learning_rate": 1.9658413930524955e-05, "loss": 0.882, "step": 92 }, { "epoch": 1.3381294964028778, "grad_norm": 1.8879288085516621, "learning_rate": 1.9648201891137725e-05, "loss": 0.8884, "step": 93 }, { "epoch": 1.3525179856115108, "grad_norm": 5.701861760835263, "learning_rate": 1.963784217419887e-05, "loss": 0.5543, "step": 94 }, { "epoch": 1.3669064748201438, "grad_norm": 1.9714095423076823, "learning_rate": 1.9627334938276547e-05, "loss": 0.9301, "step": 95 }, { "epoch": 1.381294964028777, "grad_norm": 2.440289446102772, "learning_rate": 1.961668034419688e-05, "loss": 0.5015, "step": 96 }, { "epoch": 1.39568345323741, "grad_norm": 2.3315013817563237, "learning_rate": 1.9605878555041484e-05, "loss": 0.9329, "step": 97 }, { "epoch": 1.4100719424460433, "grad_norm": 2.543223303151188, "learning_rate": 1.9594929736144978e-05, "loss": 0.981, "step": 98 }, { "epoch": 1.4244604316546763, "grad_norm": 2.2968302939118486, "learning_rate": 1.9583834055092446e-05, "loss": 0.8583, "step": 99 }, { "epoch": 1.4388489208633093, "grad_norm": 2.3678435949631287, "learning_rate": 1.9572591681716888e-05, "loss": 0.9773, "step": 100 }, { "epoch": 1.4532374100719425, "grad_norm": 5.699515821984953, "learning_rate": 1.95612027880966e-05, "loss": 0.5195, "step": 101 }, { "epoch": 1.4676258992805755, "grad_norm": 2.4763879942295812, "learning_rate": 1.9549667548552557e-05, "loss": 0.7692, "step": 102 }, { "epoch": 1.4820143884892087, "grad_norm": 1.726706976599975, "learning_rate": 1.9537986139645724e-05, "loss": 0.7894, "step": 103 }, { "epoch": 1.4964028776978417, "grad_norm": 2.880795266012809, "learning_rate": 1.9526158740174392e-05, "loss": 0.8268, "step": 104 }, { "epoch": 1.5107913669064748, "grad_norm": 2.2837687142476737, "learning_rate": 1.951418553117139e-05, "loss": 0.7285, "step": 105 }, { "epoch": 1.5251798561151078, "grad_norm": 2.401929061303925, "learning_rate": 1.950206669590136e-05, "loss": 0.7437, "step": 106 }, { "epoch": 1.539568345323741, "grad_norm": 2.109686331424331, "learning_rate": 1.9489802419857918e-05, "loss": 0.7687, "step": 107 }, { "epoch": 1.5539568345323742, "grad_norm": 2.52807406566671, "learning_rate": 1.947739289076084e-05, "loss": 0.7827, "step": 108 }, { "epoch": 1.5683453237410072, "grad_norm": 2.399711195058357, "learning_rate": 1.9464838298553172e-05, "loss": 0.4237, "step": 109 }, { "epoch": 1.5827338129496402, "grad_norm": 2.06327086788237, "learning_rate": 1.9452138835398333e-05, "loss": 0.6328, "step": 110 }, { "epoch": 1.5971223021582732, "grad_norm": 1.8934017549759041, "learning_rate": 1.9439294695677168e-05, "loss": 0.7544, "step": 111 }, { "epoch": 1.6115107913669064, "grad_norm": 1.7326175063319036, "learning_rate": 1.9426306075984968e-05, "loss": 0.5431, "step": 112 }, { "epoch": 1.6258992805755397, "grad_norm": 2.1015742280893184, "learning_rate": 1.9413173175128472e-05, "loss": 0.6663, "step": 113 }, { "epoch": 1.6402877697841727, "grad_norm": 2.231853898087344, "learning_rate": 1.9399896194122824e-05, "loss": 0.6107, "step": 114 }, { "epoch": 1.6546762589928057, "grad_norm": 2.090875801066189, "learning_rate": 1.9386475336188484e-05, "loss": 0.6786, "step": 115 }, { "epoch": 1.6690647482014387, "grad_norm": 2.3679863454534678, "learning_rate": 1.9372910806748124e-05, "loss": 0.5826, "step": 116 }, { "epoch": 1.683453237410072, "grad_norm": 2.0143151155086585, "learning_rate": 1.935920281342349e-05, "loss": 0.5107, "step": 117 }, { "epoch": 1.6978417266187051, "grad_norm": 2.0579871874173326, "learning_rate": 1.934535156603222e-05, "loss": 0.5457, "step": 118 }, { "epoch": 1.7122302158273381, "grad_norm": 2.0248928133131505, "learning_rate": 1.933135727658462e-05, "loss": 0.5204, "step": 119 }, { "epoch": 1.7266187050359711, "grad_norm": 2.143170055281987, "learning_rate": 1.931722015928044e-05, "loss": 0.4414, "step": 120 }, { "epoch": 1.7410071942446042, "grad_norm": 3.8092005485468086, "learning_rate": 1.930294043050558e-05, "loss": 0.3688, "step": 121 }, { "epoch": 1.7553956834532374, "grad_norm": 1.989587109704384, "learning_rate": 1.928851830882879e-05, "loss": 0.4322, "step": 122 }, { "epoch": 1.7697841726618706, "grad_norm": 2.12538729505454, "learning_rate": 1.9273954014998307e-05, "loss": 0.3567, "step": 123 }, { "epoch": 1.7841726618705036, "grad_norm": 2.0988722867429135, "learning_rate": 1.92592477719385e-05, "loss": 0.4249, "step": 124 }, { "epoch": 1.7985611510791366, "grad_norm": 2.0670095637448815, "learning_rate": 1.9244399804746436e-05, "loss": 0.4687, "step": 125 }, { "epoch": 1.8129496402877698, "grad_norm": 2.9462308640831436, "learning_rate": 1.9229410340688442e-05, "loss": 0.4576, "step": 126 }, { "epoch": 1.8273381294964028, "grad_norm": 2.2511867293077144, "learning_rate": 1.9214279609196632e-05, "loss": 0.361, "step": 127 }, { "epoch": 1.841726618705036, "grad_norm": 2.9578408313586224, "learning_rate": 1.9199007841865395e-05, "loss": 0.3939, "step": 128 }, { "epoch": 1.856115107913669, "grad_norm": 1.67569117249568, "learning_rate": 1.9183595272447843e-05, "loss": 0.3387, "step": 129 }, { "epoch": 1.870503597122302, "grad_norm": 1.922970389381037, "learning_rate": 1.9168042136852228e-05, "loss": 0.3162, "step": 130 }, { "epoch": 1.8848920863309353, "grad_norm": 8.10597561850986, "learning_rate": 1.9152348673138355e-05, "loss": 0.2718, "step": 131 }, { "epoch": 1.8992805755395683, "grad_norm": 4.044552909471015, "learning_rate": 1.913651512151391e-05, "loss": 0.2843, "step": 132 }, { "epoch": 1.9136690647482015, "grad_norm": 1.8361101289997892, "learning_rate": 1.9120541724330802e-05, "loss": 0.2922, "step": 133 }, { "epoch": 1.9280575539568345, "grad_norm": 1.8567473300234578, "learning_rate": 1.910442872608145e-05, "loss": 0.2136, "step": 134 }, { "epoch": 1.9424460431654675, "grad_norm": 2.263518478480423, "learning_rate": 1.908817637339503e-05, "loss": 0.2331, "step": 135 }, { "epoch": 1.9568345323741008, "grad_norm": 2.655129174999631, "learning_rate": 1.9071784915033717e-05, "loss": 0.2805, "step": 136 }, { "epoch": 1.9712230215827338, "grad_norm": 2.349586516019947, "learning_rate": 1.9055254601888867e-05, "loss": 0.3259, "step": 137 }, { "epoch": 1.985611510791367, "grad_norm": 2.03687355334055, "learning_rate": 1.9038585686977168e-05, "loss": 0.2869, "step": 138 }, { "epoch": 2.0, "grad_norm": 2.9583687224064192, "learning_rate": 1.9021778425436797e-05, "loss": 0.3408, "step": 139 }, { "epoch": 2.014388489208633, "grad_norm": 1.91826726074781, "learning_rate": 1.9004833074523478e-05, "loss": 0.2307, "step": 140 }, { "epoch": 2.028776978417266, "grad_norm": 1.1616239437089266, "learning_rate": 1.8987749893606575e-05, "loss": 0.158, "step": 141 }, { "epoch": 2.0431654676258995, "grad_norm": 2.661585557841078, "learning_rate": 1.8970529144165103e-05, "loss": 0.182, "step": 142 }, { "epoch": 2.0575539568345325, "grad_norm": 1.4648504478084143, "learning_rate": 1.8953171089783725e-05, "loss": 0.1868, "step": 143 }, { "epoch": 2.0719424460431655, "grad_norm": 1.6563021788099919, "learning_rate": 1.8935675996148738e-05, "loss": 0.2079, "step": 144 }, { "epoch": 2.0863309352517985, "grad_norm": 1.765245657702871, "learning_rate": 1.8918044131043987e-05, "loss": 0.2056, "step": 145 }, { "epoch": 2.1007194244604315, "grad_norm": 1.5438608872021158, "learning_rate": 1.890027576434677e-05, "loss": 0.2635, "step": 146 }, { "epoch": 2.115107913669065, "grad_norm": 2.2956532783932535, "learning_rate": 1.8882371168023708e-05, "loss": 0.2029, "step": 147 }, { "epoch": 2.129496402877698, "grad_norm": 1.4619363151073599, "learning_rate": 1.8864330616126586e-05, "loss": 0.155, "step": 148 }, { "epoch": 2.143884892086331, "grad_norm": 2.1621204649874124, "learning_rate": 1.8846154384788162e-05, "loss": 0.1719, "step": 149 }, { "epoch": 2.158273381294964, "grad_norm": 2.7166855043485967, "learning_rate": 1.8827842752217917e-05, "loss": 0.1819, "step": 150 }, { "epoch": 2.172661870503597, "grad_norm": 1.8834021077536263, "learning_rate": 1.8809395998697835e-05, "loss": 0.1828, "step": 151 }, { "epoch": 2.1870503597122304, "grad_norm": 1.5733332816754892, "learning_rate": 1.8790814406578073e-05, "loss": 0.2194, "step": 152 }, { "epoch": 2.2014388489208634, "grad_norm": 1.9669982488920634, "learning_rate": 1.877209826027267e-05, "loss": 0.164, "step": 153 }, { "epoch": 2.2158273381294964, "grad_norm": 10.292309004798875, "learning_rate": 1.8753247846255175e-05, "loss": 0.2773, "step": 154 }, { "epoch": 2.2302158273381294, "grad_norm": 2.2748735550997563, "learning_rate": 1.8734263453054274e-05, "loss": 0.1718, "step": 155 }, { "epoch": 2.2446043165467624, "grad_norm": 2.8655891892789866, "learning_rate": 1.871514537124936e-05, "loss": 0.1579, "step": 156 }, { "epoch": 2.258992805755396, "grad_norm": 2.217166529615519, "learning_rate": 1.869589389346611e-05, "loss": 0.1787, "step": 157 }, { "epoch": 2.273381294964029, "grad_norm": 1.8903126047188956, "learning_rate": 1.8676509314371977e-05, "loss": 0.1848, "step": 158 }, { "epoch": 2.287769784172662, "grad_norm": 2.22331229134063, "learning_rate": 1.8656991930671687e-05, "loss": 0.1547, "step": 159 }, { "epoch": 2.302158273381295, "grad_norm": 2.4540086773291323, "learning_rate": 1.863734204110272e-05, "loss": 0.1621, "step": 160 }, { "epoch": 2.316546762589928, "grad_norm": 2.718097280283145, "learning_rate": 1.861755994643071e-05, "loss": 0.1644, "step": 161 }, { "epoch": 2.3309352517985613, "grad_norm": 2.4404623229012996, "learning_rate": 1.859764594944485e-05, "loss": 0.1555, "step": 162 }, { "epoch": 2.3453237410071943, "grad_norm": 2.3253380241917685, "learning_rate": 1.8577600354953273e-05, "loss": 0.1524, "step": 163 }, { "epoch": 2.3597122302158273, "grad_norm": 2.4646303579458353, "learning_rate": 1.8557423469778356e-05, "loss": 0.1473, "step": 164 }, { "epoch": 2.3741007194244603, "grad_norm": 1.5376405445606527, "learning_rate": 1.8537115602752054e-05, "loss": 0.1495, "step": 165 }, { "epoch": 2.3884892086330938, "grad_norm": 4.745077306666788, "learning_rate": 1.851667706471115e-05, "loss": 0.1821, "step": 166 }, { "epoch": 2.402877697841727, "grad_norm": 2.552588156434035, "learning_rate": 1.8496108168492518e-05, "loss": 0.1319, "step": 167 }, { "epoch": 2.41726618705036, "grad_norm": 1.8367052946729043, "learning_rate": 1.8475409228928314e-05, "loss": 0.1349, "step": 168 }, { "epoch": 2.431654676258993, "grad_norm": 2.5445371080031314, "learning_rate": 1.8454580562841165e-05, "loss": 0.13, "step": 169 }, { "epoch": 2.446043165467626, "grad_norm": 2.344234909741198, "learning_rate": 1.8433622489039333e-05, "loss": 0.1506, "step": 170 }, { "epoch": 2.460431654676259, "grad_norm": 2.376247631671717, "learning_rate": 1.8412535328311813e-05, "loss": 0.1344, "step": 171 }, { "epoch": 2.4748201438848922, "grad_norm": 2.9461424922262784, "learning_rate": 1.839131940342344e-05, "loss": 0.1483, "step": 172 }, { "epoch": 2.4892086330935252, "grad_norm": 1.9976524594362375, "learning_rate": 1.8369975039109937e-05, "loss": 0.1803, "step": 173 }, { "epoch": 2.5035971223021583, "grad_norm": 1.0443749894322358, "learning_rate": 1.8348502562072955e-05, "loss": 0.1171, "step": 174 }, { "epoch": 2.5179856115107913, "grad_norm": 1.4361047756192113, "learning_rate": 1.8326902300975063e-05, "loss": 0.149, "step": 175 }, { "epoch": 2.5323741007194247, "grad_norm": 1.6346883755053931, "learning_rate": 1.8305174586434724e-05, "loss": 0.1444, "step": 176 }, { "epoch": 2.5467625899280577, "grad_norm": 1.6834887574364132, "learning_rate": 1.828331975102123e-05, "loss": 0.1144, "step": 177 }, { "epoch": 2.5611510791366907, "grad_norm": 1.354227075277937, "learning_rate": 1.8261338129249623e-05, "loss": 0.1178, "step": 178 }, { "epoch": 2.5755395683453237, "grad_norm": 2.2741550896246587, "learning_rate": 1.8239230057575542e-05, "loss": 0.1534, "step": 179 }, { "epoch": 2.5899280575539567, "grad_norm": 1.1972242744740567, "learning_rate": 1.8216995874390128e-05, "loss": 0.0885, "step": 180 }, { "epoch": 2.6043165467625897, "grad_norm": 1.973587237520198, "learning_rate": 1.819463592001479e-05, "loss": 0.135, "step": 181 }, { "epoch": 2.618705035971223, "grad_norm": 3.1272879249109753, "learning_rate": 1.817215053669603e-05, "loss": 0.1586, "step": 182 }, { "epoch": 2.633093525179856, "grad_norm": 2.191338511461004, "learning_rate": 1.814954006860018e-05, "loss": 0.1416, "step": 183 }, { "epoch": 2.647482014388489, "grad_norm": 1.572355984168845, "learning_rate": 1.8126804861808175e-05, "loss": 0.1185, "step": 184 }, { "epoch": 2.661870503597122, "grad_norm": 0.975274936777776, "learning_rate": 1.81039452643102e-05, "loss": 0.1, "step": 185 }, { "epoch": 2.6762589928057556, "grad_norm": 1.6008482801863315, "learning_rate": 1.808096162600041e-05, "loss": 0.1051, "step": 186 }, { "epoch": 2.6906474820143886, "grad_norm": 1.7704784936509204, "learning_rate": 1.8057854298671545e-05, "loss": 0.13, "step": 187 }, { "epoch": 2.7050359712230216, "grad_norm": 1.8920163138110342, "learning_rate": 1.803462363600957e-05, "loss": 0.1458, "step": 188 }, { "epoch": 2.7194244604316546, "grad_norm": 5.176691025598706, "learning_rate": 1.8011269993588234e-05, "loss": 0.1791, "step": 189 }, { "epoch": 2.7338129496402876, "grad_norm": 1.6375993170680554, "learning_rate": 1.798779372886365e-05, "loss": 0.1177, "step": 190 }, { "epoch": 2.7482014388489207, "grad_norm": 4.122622959327029, "learning_rate": 1.796419520116882e-05, "loss": 0.26, "step": 191 }, { "epoch": 2.762589928057554, "grad_norm": 1.902781478532848, "learning_rate": 1.7940474771708118e-05, "loss": 0.1298, "step": 192 }, { "epoch": 2.776978417266187, "grad_norm": 1.7134666967364445, "learning_rate": 1.791663280355178e-05, "loss": 0.1075, "step": 193 }, { "epoch": 2.79136690647482, "grad_norm": 2.1429774203344274, "learning_rate": 1.789266966163035e-05, "loss": 0.1131, "step": 194 }, { "epoch": 2.805755395683453, "grad_norm": 2.1096187128500317, "learning_rate": 1.786858571272907e-05, "loss": 0.1407, "step": 195 }, { "epoch": 2.8201438848920866, "grad_norm": 1.7461053345159192, "learning_rate": 1.7844381325482293e-05, "loss": 0.0962, "step": 196 }, { "epoch": 2.8345323741007196, "grad_norm": 2.272810785009126, "learning_rate": 1.7820056870367813e-05, "loss": 0.1982, "step": 197 }, { "epoch": 2.8489208633093526, "grad_norm": 1.887371974290098, "learning_rate": 1.7795612719701228e-05, "loss": 0.1436, "step": 198 }, { "epoch": 2.8633093525179856, "grad_norm": 1.8778160079306951, "learning_rate": 1.7771049247630215e-05, "loss": 0.1218, "step": 199 }, { "epoch": 2.8776978417266186, "grad_norm": 1.1423107655269147, "learning_rate": 1.7746366830128803e-05, "loss": 0.0901, "step": 200 }, { "epoch": 2.8920863309352516, "grad_norm": 1.5837299145510542, "learning_rate": 1.7721565844991643e-05, "loss": 0.0799, "step": 201 }, { "epoch": 2.906474820143885, "grad_norm": 1.3786215691627017, "learning_rate": 1.76966466718282e-05, "loss": 0.1063, "step": 202 }, { "epoch": 2.920863309352518, "grad_norm": 2.048529831050718, "learning_rate": 1.7671609692056946e-05, "loss": 0.1188, "step": 203 }, { "epoch": 2.935251798561151, "grad_norm": 4.107034047711157, "learning_rate": 1.7646455288899535e-05, "loss": 0.1608, "step": 204 }, { "epoch": 2.949640287769784, "grad_norm": 1.419074613907907, "learning_rate": 1.7621183847374935e-05, "loss": 0.0947, "step": 205 }, { "epoch": 2.9640287769784175, "grad_norm": 1.7241505506887336, "learning_rate": 1.7595795754293514e-05, "loss": 0.0933, "step": 206 }, { "epoch": 2.9784172661870505, "grad_norm": 5.437440138394324, "learning_rate": 1.7570291398251153e-05, "loss": 0.1616, "step": 207 }, { "epoch": 2.9928057553956835, "grad_norm": 1.20883185060423, "learning_rate": 1.7544671169623263e-05, "loss": 0.0926, "step": 208 }, { "epoch": 3.0071942446043165, "grad_norm": 1.4389531365223567, "learning_rate": 1.751893546055884e-05, "loss": 0.079, "step": 209 }, { "epoch": 3.0215827338129495, "grad_norm": 2.9037490869241642, "learning_rate": 1.749308466497444e-05, "loss": 0.1041, "step": 210 }, { "epoch": 3.0359712230215825, "grad_norm": 2.3752741387378062, "learning_rate": 1.746711917854817e-05, "loss": 0.1602, "step": 211 }, { "epoch": 3.050359712230216, "grad_norm": 3.434671646120461, "learning_rate": 1.744103939871361e-05, "loss": 0.1553, "step": 212 }, { "epoch": 3.064748201438849, "grad_norm": 1.7548341283748703, "learning_rate": 1.7414845724653743e-05, "loss": 0.1046, "step": 213 }, { "epoch": 3.079136690647482, "grad_norm": 2.4665177125279514, "learning_rate": 1.738853855729485e-05, "loss": 0.1063, "step": 214 }, { "epoch": 3.093525179856115, "grad_norm": 2.2287980432317136, "learning_rate": 1.7362118299300363e-05, "loss": 0.1017, "step": 215 }, { "epoch": 3.1079136690647484, "grad_norm": 2.035780926825967, "learning_rate": 1.733558535506469e-05, "loss": 0.1022, "step": 216 }, { "epoch": 3.1223021582733814, "grad_norm": 2.3935570961264707, "learning_rate": 1.730894013070707e-05, "loss": 0.1185, "step": 217 }, { "epoch": 3.1366906474820144, "grad_norm": 3.4732921260424536, "learning_rate": 1.7282183034065296e-05, "loss": 0.1375, "step": 218 }, { "epoch": 3.1510791366906474, "grad_norm": 1.9600377867714436, "learning_rate": 1.7255314474689524e-05, "loss": 0.0858, "step": 219 }, { "epoch": 3.1654676258992804, "grad_norm": 1.2129581509037186, "learning_rate": 1.7228334863835972e-05, "loss": 0.0786, "step": 220 }, { "epoch": 3.1798561151079134, "grad_norm": 1.3441082560042694, "learning_rate": 1.7201244614460645e-05, "loss": 0.1193, "step": 221 }, { "epoch": 3.194244604316547, "grad_norm": 1.2454468384467656, "learning_rate": 1.7174044141213e-05, "loss": 0.0742, "step": 222 }, { "epoch": 3.20863309352518, "grad_norm": 2.522852911662667, "learning_rate": 1.7146733860429614e-05, "loss": 0.118, "step": 223 }, { "epoch": 3.223021582733813, "grad_norm": 1.9278040098990679, "learning_rate": 1.7119314190127786e-05, "loss": 0.0977, "step": 224 }, { "epoch": 3.237410071942446, "grad_norm": 2.1026449906365707, "learning_rate": 1.7091785549999177e-05, "loss": 0.1052, "step": 225 }, { "epoch": 3.2517985611510793, "grad_norm": 2.6529166852325257, "learning_rate": 1.7064148361403347e-05, "loss": 0.2227, "step": 226 }, { "epoch": 3.2661870503597124, "grad_norm": 4.552433635116517, "learning_rate": 1.7036403047361336e-05, "loss": 0.1501, "step": 227 }, { "epoch": 3.2805755395683454, "grad_norm": 2.310770991325816, "learning_rate": 1.7008550032549167e-05, "loss": 0.1216, "step": 228 }, { "epoch": 3.2949640287769784, "grad_norm": 3.36364275278406, "learning_rate": 1.6980589743291362e-05, "loss": 0.1235, "step": 229 }, { "epoch": 3.3093525179856114, "grad_norm": 1.829401154310567, "learning_rate": 1.695252260755441e-05, "loss": 0.1233, "step": 230 }, { "epoch": 3.3237410071942444, "grad_norm": 2.223251742922475, "learning_rate": 1.6924349054940204e-05, "loss": 0.1139, "step": 231 }, { "epoch": 3.338129496402878, "grad_norm": 2.1098226205823734, "learning_rate": 1.6896069516679494e-05, "loss": 0.0954, "step": 232 }, { "epoch": 3.352517985611511, "grad_norm": 2.420417244636502, "learning_rate": 1.6867684425625265e-05, "loss": 0.1024, "step": 233 }, { "epoch": 3.366906474820144, "grad_norm": 3.516667669455294, "learning_rate": 1.683919421624611e-05, "loss": 0.1811, "step": 234 }, { "epoch": 3.381294964028777, "grad_norm": 1.1050242423015832, "learning_rate": 1.681059932461959e-05, "loss": 0.0677, "step": 235 }, { "epoch": 3.3956834532374103, "grad_norm": 2.176543994109208, "learning_rate": 1.6781900188425565e-05, "loss": 0.093, "step": 236 }, { "epoch": 3.4100719424460433, "grad_norm": 2.4696463349977873, "learning_rate": 1.6753097246939475e-05, "loss": 0.0865, "step": 237 }, { "epoch": 3.4244604316546763, "grad_norm": 1.2628603534222926, "learning_rate": 1.672419094102563e-05, "loss": 0.0867, "step": 238 }, { "epoch": 3.4388489208633093, "grad_norm": 6.155069816331444, "learning_rate": 1.6695181713130462e-05, "loss": 0.1917, "step": 239 }, { "epoch": 3.4532374100719423, "grad_norm": 2.147319268591747, "learning_rate": 1.6666070007275746e-05, "loss": 0.1466, "step": 240 }, { "epoch": 3.4676258992805753, "grad_norm": 2.163903497725947, "learning_rate": 1.6636856269051813e-05, "loss": 0.1364, "step": 241 }, { "epoch": 3.4820143884892087, "grad_norm": 1.6774031370584257, "learning_rate": 1.6607540945610722e-05, "loss": 0.0906, "step": 242 }, { "epoch": 3.4964028776978417, "grad_norm": 9.014308156169962, "learning_rate": 1.6578124485659414e-05, "loss": 0.1861, "step": 243 }, { "epoch": 3.5107913669064748, "grad_norm": 2.3885377725749715, "learning_rate": 1.6548607339452853e-05, "loss": 0.1281, "step": 244 }, { "epoch": 3.5251798561151078, "grad_norm": 1.6108063549734837, "learning_rate": 1.6518989958787126e-05, "loss": 0.0981, "step": 245 }, { "epoch": 3.539568345323741, "grad_norm": 3.5691166357587054, "learning_rate": 1.6489272796992536e-05, "loss": 0.1074, "step": 246 }, { "epoch": 3.553956834532374, "grad_norm": 3.4979429013811467, "learning_rate": 1.6459456308926662e-05, "loss": 0.1338, "step": 247 }, { "epoch": 3.568345323741007, "grad_norm": 4.957592452667262, "learning_rate": 1.642954095096737e-05, "loss": 0.2005, "step": 248 }, { "epoch": 3.58273381294964, "grad_norm": 2.5648919694650107, "learning_rate": 1.639952718100589e-05, "loss": 0.1081, "step": 249 }, { "epoch": 3.597122302158273, "grad_norm": 3.7598235610947643, "learning_rate": 1.636941545843973e-05, "loss": 0.1533, "step": 250 }, { "epoch": 3.6115107913669062, "grad_norm": 2.9195056813718496, "learning_rate": 1.6339206244165705e-05, "loss": 0.1188, "step": 251 }, { "epoch": 3.6258992805755397, "grad_norm": 2.79416599036527, "learning_rate": 1.630890000057285e-05, "loss": 0.1106, "step": 252 }, { "epoch": 3.6402877697841727, "grad_norm": 2.2623014827430485, "learning_rate": 1.6278497191535364e-05, "loss": 0.0913, "step": 253 }, { "epoch": 3.6546762589928057, "grad_norm": 3.0134989782439736, "learning_rate": 1.6247998282405486e-05, "loss": 0.1368, "step": 254 }, { "epoch": 3.6690647482014387, "grad_norm": 1.0640618586580433, "learning_rate": 1.621740374000639e-05, "loss": 0.0749, "step": 255 }, { "epoch": 3.683453237410072, "grad_norm": 2.0407254201531857, "learning_rate": 1.6186714032625036e-05, "loss": 0.1347, "step": 256 }, { "epoch": 3.697841726618705, "grad_norm": 1.3495622495153805, "learning_rate": 1.6155929630004995e-05, "loss": 0.0938, "step": 257 }, { "epoch": 3.712230215827338, "grad_norm": 1.626082630128798, "learning_rate": 1.6125051003339277e-05, "loss": 0.0735, "step": 258 }, { "epoch": 3.726618705035971, "grad_norm": 1.045029527525099, "learning_rate": 1.6094078625263085e-05, "loss": 0.0665, "step": 259 }, { "epoch": 3.741007194244604, "grad_norm": 1.0769877251894375, "learning_rate": 1.6063012969846624e-05, "loss": 0.0594, "step": 260 }, { "epoch": 3.755395683453237, "grad_norm": 1.573095185027216, "learning_rate": 1.603185451258781e-05, "loss": 0.0989, "step": 261 }, { "epoch": 3.7697841726618706, "grad_norm": 1.5939550154351394, "learning_rate": 1.6000603730405013e-05, "loss": 0.0918, "step": 262 }, { "epoch": 3.7841726618705036, "grad_norm": 3.084625753380633, "learning_rate": 1.5969261101629744e-05, "loss": 0.1507, "step": 263 }, { "epoch": 3.7985611510791366, "grad_norm": 2.1876597444656367, "learning_rate": 1.593782710599934e-05, "loss": 0.1153, "step": 264 }, { "epoch": 3.81294964028777, "grad_norm": 1.4453678209486815, "learning_rate": 1.5906302224649613e-05, "loss": 0.0881, "step": 265 }, { "epoch": 3.827338129496403, "grad_norm": 1.415817693353708, "learning_rate": 1.5874686940107507e-05, "loss": 0.0921, "step": 266 }, { "epoch": 3.841726618705036, "grad_norm": 1.3019018391132993, "learning_rate": 1.5842981736283686e-05, "loss": 0.0942, "step": 267 }, { "epoch": 3.856115107913669, "grad_norm": 2.124481879469022, "learning_rate": 1.581118709846514e-05, "loss": 0.0892, "step": 268 }, { "epoch": 3.870503597122302, "grad_norm": 0.7530466355105916, "learning_rate": 1.5779303513307765e-05, "loss": 0.0611, "step": 269 }, { "epoch": 3.884892086330935, "grad_norm": 1.9390276125404986, "learning_rate": 1.574733146882889e-05, "loss": 0.0711, "step": 270 }, { "epoch": 3.899280575539568, "grad_norm": 1.7780292244134437, "learning_rate": 1.571527145439983e-05, "loss": 0.0912, "step": 271 }, { "epoch": 3.9136690647482015, "grad_norm": 1.5211678706292164, "learning_rate": 1.5683123960738395e-05, "loss": 0.0828, "step": 272 }, { "epoch": 3.9280575539568345, "grad_norm": 3.235343849443108, "learning_rate": 1.5650889479901356e-05, "loss": 0.1355, "step": 273 }, { "epoch": 3.9424460431654675, "grad_norm": 2.418347718572089, "learning_rate": 1.5618568505276948e-05, "loss": 0.0934, "step": 274 }, { "epoch": 3.956834532374101, "grad_norm": 1.856004868367365, "learning_rate": 1.558616153157728e-05, "loss": 0.1214, "step": 275 }, { "epoch": 3.971223021582734, "grad_norm": 1.9055690379940484, "learning_rate": 1.5553669054830806e-05, "loss": 0.0759, "step": 276 }, { "epoch": 3.985611510791367, "grad_norm": 0.7835271833606042, "learning_rate": 1.552109157237468e-05, "loss": 0.0636, "step": 277 }, { "epoch": 4.0, "grad_norm": 1.9789660771379471, "learning_rate": 1.5488429582847194e-05, "loss": 0.0935, "step": 278 }, { "epoch": 4.014388489208633, "grad_norm": 1.920696415437042, "learning_rate": 1.5455683586180117e-05, "loss": 0.0732, "step": 279 }, { "epoch": 4.028776978417266, "grad_norm": 1.7756913937042766, "learning_rate": 1.542285408359105e-05, "loss": 0.1339, "step": 280 }, { "epoch": 4.043165467625899, "grad_norm": 1.3081335048820986, "learning_rate": 1.5389941577575753e-05, "loss": 0.0805, "step": 281 }, { "epoch": 4.057553956834532, "grad_norm": 1.545019490212882, "learning_rate": 1.5356946571900465e-05, "loss": 0.0764, "step": 282 }, { "epoch": 4.071942446043165, "grad_norm": 1.6175892304589476, "learning_rate": 1.5323869571594166e-05, "loss": 0.0838, "step": 283 }, { "epoch": 4.086330935251799, "grad_norm": 8.86376731047678, "learning_rate": 1.5290711082940883e-05, "loss": 0.2142, "step": 284 }, { "epoch": 4.100719424460432, "grad_norm": 2.0771074188946748, "learning_rate": 1.5257471613471908e-05, "loss": 0.1161, "step": 285 }, { "epoch": 4.115107913669065, "grad_norm": 1.841466825633457, "learning_rate": 1.5224151671958045e-05, "loss": 0.111, "step": 286 }, { "epoch": 4.129496402877698, "grad_norm": 2.1971490305870254, "learning_rate": 1.5190751768401835e-05, "loss": 0.1001, "step": 287 }, { "epoch": 4.143884892086331, "grad_norm": 1.887590572191191, "learning_rate": 1.515727241402972e-05, "loss": 0.0817, "step": 288 }, { "epoch": 4.158273381294964, "grad_norm": 1.5504037915944906, "learning_rate": 1.512371412128424e-05, "loss": 0.0721, "step": 289 }, { "epoch": 4.172661870503597, "grad_norm": 3.2084429695170398, "learning_rate": 1.509007740381618e-05, "loss": 0.1495, "step": 290 }, { "epoch": 4.18705035971223, "grad_norm": 1.9773379887209472, "learning_rate": 1.505636277647672e-05, "loss": 0.1021, "step": 291 }, { "epoch": 4.201438848920863, "grad_norm": 1.7919752378903473, "learning_rate": 1.5022570755309542e-05, "loss": 0.069, "step": 292 }, { "epoch": 4.215827338129497, "grad_norm": 1.8353019507539692, "learning_rate": 1.4988701857542932e-05, "loss": 0.0908, "step": 293 }, { "epoch": 4.23021582733813, "grad_norm": 1.6805427100586665, "learning_rate": 1.495475660158187e-05, "loss": 0.0785, "step": 294 }, { "epoch": 4.244604316546763, "grad_norm": 1.380400795948778, "learning_rate": 1.492073550700009e-05, "loss": 0.0817, "step": 295 }, { "epoch": 4.258992805755396, "grad_norm": 0.9200650137201957, "learning_rate": 1.4886639094532129e-05, "loss": 0.0646, "step": 296 }, { "epoch": 4.273381294964029, "grad_norm": 2.0328467587787884, "learning_rate": 1.4852467886065357e-05, "loss": 0.0816, "step": 297 }, { "epoch": 4.287769784172662, "grad_norm": 2.471692983826739, "learning_rate": 1.4818222404631993e-05, "loss": 0.1168, "step": 298 }, { "epoch": 4.302158273381295, "grad_norm": 1.9480957771711376, "learning_rate": 1.4783903174401086e-05, "loss": 0.1056, "step": 299 }, { "epoch": 4.316546762589928, "grad_norm": 1.3707094764913785, "learning_rate": 1.4749510720670506e-05, "loss": 0.081, "step": 300 }, { "epoch": 4.330935251798561, "grad_norm": 1.8160322544483793, "learning_rate": 1.4715045569858895e-05, "loss": 0.0784, "step": 301 }, { "epoch": 4.345323741007194, "grad_norm": 1.6519200877530784, "learning_rate": 1.4680508249497622e-05, "loss": 0.0758, "step": 302 }, { "epoch": 4.359712230215827, "grad_norm": 1.1953425576844743, "learning_rate": 1.4645899288222686e-05, "loss": 0.076, "step": 303 }, { "epoch": 4.374100719424461, "grad_norm": 2.1532443296286217, "learning_rate": 1.461121921576665e-05, "loss": 0.0956, "step": 304 }, { "epoch": 4.388489208633094, "grad_norm": 7.291642986227351, "learning_rate": 1.457646856295051e-05, "loss": 0.1638, "step": 305 }, { "epoch": 4.402877697841727, "grad_norm": 1.9760971103123395, "learning_rate": 1.4541647861675592e-05, "loss": 0.0898, "step": 306 }, { "epoch": 4.41726618705036, "grad_norm": 1.549355801958334, "learning_rate": 1.4506757644915393e-05, "loss": 0.0804, "step": 307 }, { "epoch": 4.431654676258993, "grad_norm": 2.1548681765232005, "learning_rate": 1.4471798446707426e-05, "loss": 0.0917, "step": 308 }, { "epoch": 4.446043165467626, "grad_norm": 2.416819319060172, "learning_rate": 1.443677080214506e-05, "loss": 0.1, "step": 309 }, { "epoch": 4.460431654676259, "grad_norm": 2.357572317260481, "learning_rate": 1.4401675247369307e-05, "loss": 0.0842, "step": 310 }, { "epoch": 4.474820143884892, "grad_norm": 2.3896391383138305, "learning_rate": 1.4366512319560642e-05, "loss": 0.0825, "step": 311 }, { "epoch": 4.489208633093525, "grad_norm": 2.43013200710383, "learning_rate": 1.4331282556930753e-05, "loss": 0.0694, "step": 312 }, { "epoch": 4.503597122302159, "grad_norm": 2.044331917417573, "learning_rate": 1.4295986498714326e-05, "loss": 0.0782, "step": 313 }, { "epoch": 4.517985611510792, "grad_norm": 2.5158625592140424, "learning_rate": 1.4260624685160778e-05, "loss": 0.0861, "step": 314 }, { "epoch": 4.532374100719425, "grad_norm": 2.390662985836405, "learning_rate": 1.4225197657525996e-05, "loss": 0.0998, "step": 315 }, { "epoch": 4.546762589928058, "grad_norm": 4.358152932825534, "learning_rate": 1.4189705958064041e-05, "loss": 0.2349, "step": 316 }, { "epoch": 4.561151079136691, "grad_norm": 1.4662171648939122, "learning_rate": 1.4154150130018867e-05, "loss": 0.0828, "step": 317 }, { "epoch": 4.575539568345324, "grad_norm": 2.618941379165625, "learning_rate": 1.4118530717615982e-05, "loss": 0.1057, "step": 318 }, { "epoch": 4.589928057553957, "grad_norm": 5.796662333549953, "learning_rate": 1.4082848266054136e-05, "loss": 0.1314, "step": 319 }, { "epoch": 4.60431654676259, "grad_norm": 1.3755791710433443, "learning_rate": 1.4047103321496977e-05, "loss": 0.0568, "step": 320 }, { "epoch": 4.618705035971223, "grad_norm": 1.4690673745162433, "learning_rate": 1.4011296431064675e-05, "loss": 0.0857, "step": 321 }, { "epoch": 4.633093525179856, "grad_norm": 1.510028184893069, "learning_rate": 1.3975428142825562e-05, "loss": 0.0661, "step": 322 }, { "epoch": 4.647482014388489, "grad_norm": 1.8656945859612246, "learning_rate": 1.3939499005787735e-05, "loss": 0.0885, "step": 323 }, { "epoch": 4.661870503597123, "grad_norm": 4.360334150991463, "learning_rate": 1.3903509569890663e-05, "loss": 0.1249, "step": 324 }, { "epoch": 4.676258992805756, "grad_norm": 1.2705938890119497, "learning_rate": 1.3867460385996756e-05, "loss": 0.0483, "step": 325 }, { "epoch": 4.690647482014389, "grad_norm": 1.6429511839584963, "learning_rate": 1.3831352005882947e-05, "loss": 0.0678, "step": 326 }, { "epoch": 4.705035971223022, "grad_norm": 0.8064327586315418, "learning_rate": 1.3795184982232234e-05, "loss": 0.0481, "step": 327 }, { "epoch": 4.719424460431655, "grad_norm": 1.6276254758163173, "learning_rate": 1.3758959868625233e-05, "loss": 0.0642, "step": 328 }, { "epoch": 4.733812949640288, "grad_norm": 1.6478404195254295, "learning_rate": 1.3722677219531684e-05, "loss": 0.0537, "step": 329 }, { "epoch": 4.748201438848921, "grad_norm": 2.0775687439431443, "learning_rate": 1.3686337590301997e-05, "loss": 0.0826, "step": 330 }, { "epoch": 4.762589928057554, "grad_norm": 1.6321172079536772, "learning_rate": 1.364994153715872e-05, "loss": 0.0931, "step": 331 }, { "epoch": 4.7769784172661875, "grad_norm": 1.5139845300039876, "learning_rate": 1.361348961718804e-05, "loss": 0.0755, "step": 332 }, { "epoch": 4.7913669064748206, "grad_norm": 1.6885011783715966, "learning_rate": 1.3576982388331258e-05, "loss": 0.0708, "step": 333 }, { "epoch": 4.805755395683454, "grad_norm": 3.58412758620127, "learning_rate": 1.3540420409376237e-05, "loss": 0.1443, "step": 334 }, { "epoch": 4.820143884892087, "grad_norm": 3.970560863668551, "learning_rate": 1.3503804239948874e-05, "loss": 0.1164, "step": 335 }, { "epoch": 4.83453237410072, "grad_norm": 3.0950452614631216, "learning_rate": 1.3467134440504497e-05, "loss": 0.1638, "step": 336 }, { "epoch": 4.848920863309353, "grad_norm": 1.2344229500725417, "learning_rate": 1.3430411572319323e-05, "loss": 0.0414, "step": 337 }, { "epoch": 4.863309352517986, "grad_norm": 4.693981116757926, "learning_rate": 1.3393636197481842e-05, "loss": 0.1099, "step": 338 }, { "epoch": 4.877697841726619, "grad_norm": 1.280394160785586, "learning_rate": 1.335680887888423e-05, "loss": 0.069, "step": 339 }, { "epoch": 4.892086330935252, "grad_norm": 2.6531048299227287, "learning_rate": 1.3319930180213713e-05, "loss": 0.0945, "step": 340 }, { "epoch": 4.906474820143885, "grad_norm": 3.102339751706606, "learning_rate": 1.3283000665943972e-05, "loss": 0.1103, "step": 341 }, { "epoch": 4.920863309352518, "grad_norm": 1.4150229722556091, "learning_rate": 1.3246020901326465e-05, "loss": 0.0787, "step": 342 }, { "epoch": 4.935251798561151, "grad_norm": 2.3202398526115933, "learning_rate": 1.3208991452381798e-05, "loss": 0.0956, "step": 343 }, { "epoch": 4.9496402877697845, "grad_norm": 6.464019103643947, "learning_rate": 1.3171912885891063e-05, "loss": 0.1059, "step": 344 }, { "epoch": 4.9640287769784175, "grad_norm": 2.2987115160645617, "learning_rate": 1.3134785769387147e-05, "loss": 0.0905, "step": 345 }, { "epoch": 4.9784172661870505, "grad_norm": 3.42537659995528, "learning_rate": 1.3097610671146063e-05, "loss": 0.0891, "step": 346 }, { "epoch": 4.9928057553956835, "grad_norm": 1.607289746350073, "learning_rate": 1.3060388160178237e-05, "loss": 0.0756, "step": 347 }, { "epoch": 5.0071942446043165, "grad_norm": 2.0287001101488733, "learning_rate": 1.302311880621981e-05, "loss": 0.1092, "step": 348 }, { "epoch": 5.0215827338129495, "grad_norm": 2.296081619843284, "learning_rate": 1.2985803179723903e-05, "loss": 0.0814, "step": 349 }, { "epoch": 5.0359712230215825, "grad_norm": 1.3541106943850767, "learning_rate": 1.294844185185191e-05, "loss": 0.0495, "step": 350 }, { "epoch": 5.0503597122302155, "grad_norm": 3.473072731374066, "learning_rate": 1.2911035394464724e-05, "loss": 0.1115, "step": 351 }, { "epoch": 5.0647482014388485, "grad_norm": 1.878085254754344, "learning_rate": 1.2873584380114012e-05, "loss": 0.0758, "step": 352 }, { "epoch": 5.079136690647482, "grad_norm": 1.5828011496456365, "learning_rate": 1.283608938203344e-05, "loss": 0.0653, "step": 353 }, { "epoch": 5.093525179856115, "grad_norm": 2.310554737444448, "learning_rate": 1.2798550974129888e-05, "loss": 0.0795, "step": 354 }, { "epoch": 5.107913669064748, "grad_norm": 1.392927738275557, "learning_rate": 1.2760969730974692e-05, "loss": 0.0555, "step": 355 }, { "epoch": 5.122302158273381, "grad_norm": 1.9277356604394835, "learning_rate": 1.2723346227794817e-05, "loss": 0.0709, "step": 356 }, { "epoch": 5.136690647482014, "grad_norm": 1.5371398787685273, "learning_rate": 1.2685681040464081e-05, "loss": 0.0596, "step": 357 }, { "epoch": 5.151079136690647, "grad_norm": 2.8096729108941134, "learning_rate": 1.264797474549433e-05, "loss": 0.1064, "step": 358 }, { "epoch": 5.16546762589928, "grad_norm": 0.9781522138651574, "learning_rate": 1.2610227920026608e-05, "loss": 0.051, "step": 359 }, { "epoch": 5.179856115107913, "grad_norm": 1.1509313519794822, "learning_rate": 1.2572441141822322e-05, "loss": 0.0651, "step": 360 }, { "epoch": 5.194244604316546, "grad_norm": 4.388250469742715, "learning_rate": 1.2534614989254423e-05, "loss": 0.0967, "step": 361 }, { "epoch": 5.2086330935251794, "grad_norm": 0.9566869047445085, "learning_rate": 1.2496750041298515e-05, "loss": 0.0609, "step": 362 }, { "epoch": 5.223021582733813, "grad_norm": 2.2089936486598076, "learning_rate": 1.2458846877524025e-05, "loss": 0.0657, "step": 363 }, { "epoch": 5.237410071942446, "grad_norm": 2.1577271836034058, "learning_rate": 1.2420906078085316e-05, "loss": 0.0859, "step": 364 }, { "epoch": 5.251798561151079, "grad_norm": 2.107752486687129, "learning_rate": 1.2382928223712807e-05, "loss": 0.0493, "step": 365 }, { "epoch": 5.266187050359712, "grad_norm": 1.5362209636910469, "learning_rate": 1.2344913895704099e-05, "loss": 0.0551, "step": 366 }, { "epoch": 5.280575539568345, "grad_norm": 1.8284113473780614, "learning_rate": 1.2306863675915058e-05, "loss": 0.0639, "step": 367 }, { "epoch": 5.294964028776978, "grad_norm": 1.5434857110112714, "learning_rate": 1.2268778146750914e-05, "loss": 0.0665, "step": 368 }, { "epoch": 5.309352517985611, "grad_norm": 1.5270591397764983, "learning_rate": 1.2230657891157365e-05, "loss": 0.0614, "step": 369 }, { "epoch": 5.323741007194244, "grad_norm": 1.1644989385855498, "learning_rate": 1.2192503492611625e-05, "loss": 0.0516, "step": 370 }, { "epoch": 5.338129496402877, "grad_norm": 2.4389365127644664, "learning_rate": 1.2154315535113513e-05, "loss": 0.0763, "step": 371 }, { "epoch": 5.35251798561151, "grad_norm": 2.570405580037144, "learning_rate": 1.2116094603176513e-05, "loss": 0.0645, "step": 372 }, { "epoch": 5.366906474820144, "grad_norm": 2.525534092427501, "learning_rate": 1.2077841281818816e-05, "loss": 0.0754, "step": 373 }, { "epoch": 5.381294964028777, "grad_norm": 2.3428164694918654, "learning_rate": 1.203955615655438e-05, "loss": 0.0861, "step": 374 }, { "epoch": 5.39568345323741, "grad_norm": 2.235811734094292, "learning_rate": 1.2001239813383951e-05, "loss": 0.0549, "step": 375 }, { "epoch": 5.410071942446043, "grad_norm": 4.506038617907832, "learning_rate": 1.1962892838786116e-05, "loss": 0.0857, "step": 376 }, { "epoch": 5.424460431654676, "grad_norm": 2.267447064370234, "learning_rate": 1.19245158197083e-05, "loss": 0.0717, "step": 377 }, { "epoch": 5.438848920863309, "grad_norm": 1.3482474852811166, "learning_rate": 1.1886109343557808e-05, "loss": 0.0772, "step": 378 }, { "epoch": 5.453237410071942, "grad_norm": 2.1410593958658954, "learning_rate": 1.1847673998192815e-05, "loss": 0.0536, "step": 379 }, { "epoch": 5.467625899280575, "grad_norm": 1.5967556597455597, "learning_rate": 1.180921037191337e-05, "loss": 0.0459, "step": 380 }, { "epoch": 5.482014388489208, "grad_norm": 1.2435751103662702, "learning_rate": 1.1770719053452408e-05, "loss": 0.0443, "step": 381 }, { "epoch": 5.496402877697841, "grad_norm": 3.2548280663914784, "learning_rate": 1.1732200631966717e-05, "loss": 0.0843, "step": 382 }, { "epoch": 5.510791366906475, "grad_norm": 1.991827364041768, "learning_rate": 1.1693655697027935e-05, "loss": 0.0561, "step": 383 }, { "epoch": 5.525179856115108, "grad_norm": 1.7434178537703542, "learning_rate": 1.165508483861352e-05, "loss": 0.0631, "step": 384 }, { "epoch": 5.539568345323741, "grad_norm": 3.6171543812005362, "learning_rate": 1.1616488647097718e-05, "loss": 0.0704, "step": 385 }, { "epoch": 5.553956834532374, "grad_norm": 3.4479680855430113, "learning_rate": 1.1577867713242532e-05, "loss": 0.0751, "step": 386 }, { "epoch": 5.568345323741007, "grad_norm": 1.7391495389296567, "learning_rate": 1.1539222628188675e-05, "loss": 0.0524, "step": 387 }, { "epoch": 5.58273381294964, "grad_norm": 2.204791122892472, "learning_rate": 1.1500553983446527e-05, "loss": 0.0696, "step": 388 }, { "epoch": 5.597122302158273, "grad_norm": 4.934866987474034, "learning_rate": 1.1461862370887076e-05, "loss": 0.0841, "step": 389 }, { "epoch": 5.611510791366906, "grad_norm": 1.5682934454178645, "learning_rate": 1.1423148382732854e-05, "loss": 0.0604, "step": 390 }, { "epoch": 5.625899280575539, "grad_norm": 1.5376010738808106, "learning_rate": 1.1384412611548887e-05, "loss": 0.0763, "step": 391 }, { "epoch": 5.640287769784173, "grad_norm": 2.2954456281899858, "learning_rate": 1.134565565023362e-05, "loss": 0.0631, "step": 392 }, { "epoch": 5.654676258992806, "grad_norm": 2.8902333297932334, "learning_rate": 1.1306878092009828e-05, "loss": 0.1072, "step": 393 }, { "epoch": 5.669064748201439, "grad_norm": 3.1298174346773635, "learning_rate": 1.1268080530415557e-05, "loss": 0.0906, "step": 394 }, { "epoch": 5.683453237410072, "grad_norm": 1.9540710814071045, "learning_rate": 1.122926355929502e-05, "loss": 0.0815, "step": 395 }, { "epoch": 5.697841726618705, "grad_norm": 3.0936428631268424, "learning_rate": 1.119042777278953e-05, "loss": 0.0933, "step": 396 }, { "epoch": 5.712230215827338, "grad_norm": 0.779804603376737, "learning_rate": 1.1151573765328374e-05, "loss": 0.0377, "step": 397 }, { "epoch": 5.726618705035971, "grad_norm": 1.4682872653216061, "learning_rate": 1.1112702131619747e-05, "loss": 0.0553, "step": 398 }, { "epoch": 5.741007194244604, "grad_norm": 2.2888255822610586, "learning_rate": 1.1073813466641633e-05, "loss": 0.0592, "step": 399 }, { "epoch": 5.755395683453237, "grad_norm": 2.0210708158515356, "learning_rate": 1.1034908365632695e-05, "loss": 0.0591, "step": 400 }, { "epoch": 5.76978417266187, "grad_norm": 2.8477043817929175, "learning_rate": 1.0995987424083178e-05, "loss": 0.0665, "step": 401 }, { "epoch": 5.784172661870503, "grad_norm": 4.075702050234569, "learning_rate": 1.0957051237725775e-05, "loss": 0.0891, "step": 402 }, { "epoch": 5.798561151079137, "grad_norm": 3.4838765154884053, "learning_rate": 1.0918100402526533e-05, "loss": 0.0752, "step": 403 }, { "epoch": 5.81294964028777, "grad_norm": 2.190868453265739, "learning_rate": 1.0879135514675706e-05, "loss": 0.0678, "step": 404 }, { "epoch": 5.827338129496403, "grad_norm": 6.885762602295394, "learning_rate": 1.0840157170578645e-05, "loss": 0.1085, "step": 405 }, { "epoch": 5.841726618705036, "grad_norm": 2.1256795879435946, "learning_rate": 1.0801165966846662e-05, "loss": 0.0587, "step": 406 }, { "epoch": 5.856115107913669, "grad_norm": 3.0358003868740777, "learning_rate": 1.0762162500287916e-05, "loss": 0.1023, "step": 407 }, { "epoch": 5.870503597122302, "grad_norm": 2.72412213052102, "learning_rate": 1.0723147367898243e-05, "loss": 0.0755, "step": 408 }, { "epoch": 5.884892086330935, "grad_norm": 5.991640096070039, "learning_rate": 1.068412116685205e-05, "loss": 0.0906, "step": 409 }, { "epoch": 5.899280575539568, "grad_norm": 4.47844473482202, "learning_rate": 1.0645084494493166e-05, "loss": 0.1367, "step": 410 }, { "epoch": 5.913669064748201, "grad_norm": 5.278497486551652, "learning_rate": 1.0606037948325686e-05, "loss": 0.0934, "step": 411 }, { "epoch": 5.928057553956835, "grad_norm": 1.662465142150059, "learning_rate": 1.0566982126004848e-05, "loss": 0.0425, "step": 412 }, { "epoch": 5.942446043165468, "grad_norm": 2.1454760909414476, "learning_rate": 1.052791762532786e-05, "loss": 0.0632, "step": 413 }, { "epoch": 5.956834532374101, "grad_norm": 1.8023401117347966, "learning_rate": 1.0488845044224774e-05, "loss": 0.0562, "step": 414 }, { "epoch": 5.971223021582734, "grad_norm": 1.998906167438941, "learning_rate": 1.0449764980749317e-05, "loss": 0.0464, "step": 415 }, { "epoch": 5.985611510791367, "grad_norm": 1.9791549061655505, "learning_rate": 1.0410678033069745e-05, "loss": 0.0509, "step": 416 }, { "epoch": 6.0, "grad_norm": 1.783059633172495, "learning_rate": 1.0371584799459684e-05, "loss": 0.0693, "step": 417 }, { "epoch": 6.014388489208633, "grad_norm": 3.1713602527987077, "learning_rate": 1.0332485878288977e-05, "loss": 0.0896, "step": 418 }, { "epoch": 6.028776978417266, "grad_norm": 13.02216773829757, "learning_rate": 1.029338186801451e-05, "loss": 0.1842, "step": 419 }, { "epoch": 6.043165467625899, "grad_norm": 3.175785916601786, "learning_rate": 1.0254273367171085e-05, "loss": 0.0673, "step": 420 }, { "epoch": 6.057553956834532, "grad_norm": 3.0816879666228885, "learning_rate": 1.0215160974362224e-05, "loss": 0.0648, "step": 421 }, { "epoch": 6.071942446043165, "grad_norm": 3.138491600475823, "learning_rate": 1.0176045288251014e-05, "loss": 0.0537, "step": 422 }, { "epoch": 6.086330935251799, "grad_norm": 1.9965147840919386, "learning_rate": 1.0136926907550968e-05, "loss": 0.0493, "step": 423 }, { "epoch": 6.100719424460432, "grad_norm": 4.040616547013923, "learning_rate": 1.0097806431016825e-05, "loss": 0.0718, "step": 424 }, { "epoch": 6.115107913669065, "grad_norm": 3.684053981687095, "learning_rate": 1.0058684457435419e-05, "loss": 0.0885, "step": 425 }, { "epoch": 6.129496402877698, "grad_norm": 3.4407063513016207, "learning_rate": 1.0019561585616485e-05, "loss": 0.0878, "step": 426 }, { "epoch": 6.143884892086331, "grad_norm": 3.631968287932021, "learning_rate": 9.980438414383518e-06, "loss": 0.0716, "step": 427 }, { "epoch": 6.158273381294964, "grad_norm": 3.0607381634313535, "learning_rate": 9.941315542564583e-06, "loss": 0.058, "step": 428 }, { "epoch": 6.172661870503597, "grad_norm": 3.9600176462472607, "learning_rate": 9.902193568983177e-06, "loss": 0.1314, "step": 429 }, { "epoch": 6.18705035971223, "grad_norm": 1.7718441919276489, "learning_rate": 9.863073092449033e-06, "loss": 0.0619, "step": 430 }, { "epoch": 6.201438848920863, "grad_norm": 2.14606486678126, "learning_rate": 9.823954711748987e-06, "loss": 0.0537, "step": 431 }, { "epoch": 6.215827338129497, "grad_norm": 4.994325517315764, "learning_rate": 9.78483902563778e-06, "loss": 0.0989, "step": 432 }, { "epoch": 6.23021582733813, "grad_norm": 1.5344735636293707, "learning_rate": 9.745726632828913e-06, "loss": 0.0536, "step": 433 }, { "epoch": 6.244604316546763, "grad_norm": 2.0531411979678658, "learning_rate": 9.706618131985489e-06, "loss": 0.0522, "step": 434 }, { "epoch": 6.258992805755396, "grad_norm": 1.8694652467324728, "learning_rate": 9.667514121711025e-06, "loss": 0.0652, "step": 435 }, { "epoch": 6.273381294964029, "grad_norm": 1.7939240986480327, "learning_rate": 9.628415200540317e-06, "loss": 0.0585, "step": 436 }, { "epoch": 6.287769784172662, "grad_norm": 1.1701921170565006, "learning_rate": 9.589321966930255e-06, "loss": 0.0446, "step": 437 }, { "epoch": 6.302158273381295, "grad_norm": 1.3183342321738782, "learning_rate": 9.550235019250688e-06, "loss": 0.0365, "step": 438 }, { "epoch": 6.316546762589928, "grad_norm": 3.380696309795155, "learning_rate": 9.51115495577523e-06, "loss": 0.0855, "step": 439 }, { "epoch": 6.330935251798561, "grad_norm": 3.2163242534678536, "learning_rate": 9.472082374672145e-06, "loss": 0.1112, "step": 440 }, { "epoch": 6.345323741007194, "grad_norm": 2.8805271526903398, "learning_rate": 9.433017873995159e-06, "loss": 0.0567, "step": 441 }, { "epoch": 6.359712230215827, "grad_norm": 2.9270420886671626, "learning_rate": 9.393962051674319e-06, "loss": 0.073, "step": 442 }, { "epoch": 6.374100719424461, "grad_norm": 0.716770072244056, "learning_rate": 9.354915505506839e-06, "loss": 0.0273, "step": 443 }, { "epoch": 6.388489208633094, "grad_norm": 1.9375688885176805, "learning_rate": 9.315878833147953e-06, "loss": 0.0458, "step": 444 }, { "epoch": 6.402877697841727, "grad_norm": 1.5114813171921724, "learning_rate": 9.27685263210176e-06, "loss": 0.0408, "step": 445 }, { "epoch": 6.41726618705036, "grad_norm": 2.176739947497608, "learning_rate": 9.237837499712088e-06, "loss": 0.0406, "step": 446 }, { "epoch": 6.431654676258993, "grad_norm": 1.0546177826776708, "learning_rate": 9.19883403315334e-06, "loss": 0.032, "step": 447 }, { "epoch": 6.446043165467626, "grad_norm": 0.5710171789573429, "learning_rate": 9.159842829421358e-06, "loss": 0.0283, "step": 448 }, { "epoch": 6.460431654676259, "grad_norm": 2.4971494935474463, "learning_rate": 9.1208644853243e-06, "loss": 0.053, "step": 449 }, { "epoch": 6.474820143884892, "grad_norm": 2.1060917891375213, "learning_rate": 9.081899597473469e-06, "loss": 0.0685, "step": 450 }, { "epoch": 6.489208633093525, "grad_norm": 4.872774207148436, "learning_rate": 9.042948762274227e-06, "loss": 0.0878, "step": 451 }, { "epoch": 6.503597122302159, "grad_norm": 3.57310670704578, "learning_rate": 9.004012575916825e-06, "loss": 0.0898, "step": 452 }, { "epoch": 6.517985611510792, "grad_norm": 1.5890926297246664, "learning_rate": 8.965091634367306e-06, "loss": 0.0443, "step": 453 }, { "epoch": 6.532374100719425, "grad_norm": 1.6565607787351473, "learning_rate": 8.92618653335837e-06, "loss": 0.0414, "step": 454 }, { "epoch": 6.546762589928058, "grad_norm": 1.6598383644043355, "learning_rate": 8.887297868380255e-06, "loss": 0.0404, "step": 455 }, { "epoch": 6.561151079136691, "grad_norm": 1.3690066201582984, "learning_rate": 8.84842623467163e-06, "loss": 0.0596, "step": 456 }, { "epoch": 6.575539568345324, "grad_norm": 1.795250419015445, "learning_rate": 8.809572227210472e-06, "loss": 0.038, "step": 457 }, { "epoch": 6.589928057553957, "grad_norm": 3.139575746021586, "learning_rate": 8.770736440704979e-06, "loss": 0.0709, "step": 458 }, { "epoch": 6.60431654676259, "grad_norm": 3.5570255461130476, "learning_rate": 8.731919469584443e-06, "loss": 0.0707, "step": 459 }, { "epoch": 6.618705035971223, "grad_norm": 2.2552703837998322, "learning_rate": 8.693121907990177e-06, "loss": 0.0653, "step": 460 }, { "epoch": 6.633093525179856, "grad_norm": 1.9006385019285805, "learning_rate": 8.654344349766384e-06, "loss": 0.0629, "step": 461 }, { "epoch": 6.647482014388489, "grad_norm": 2.4225211791810053, "learning_rate": 8.615587388451116e-06, "loss": 0.0546, "step": 462 }, { "epoch": 6.661870503597123, "grad_norm": 2.687344986181635, "learning_rate": 8.576851617267151e-06, "loss": 0.0499, "step": 463 }, { "epoch": 6.676258992805756, "grad_norm": 1.6697444443320166, "learning_rate": 8.53813762911293e-06, "loss": 0.0424, "step": 464 }, { "epoch": 6.690647482014389, "grad_norm": 4.050618894489727, "learning_rate": 8.499446016553475e-06, "loss": 0.1016, "step": 465 }, { "epoch": 6.705035971223022, "grad_norm": 1.3790673230578159, "learning_rate": 8.460777371811327e-06, "loss": 0.0328, "step": 466 }, { "epoch": 6.719424460431655, "grad_norm": 1.1507918309844436, "learning_rate": 8.42213228675747e-06, "loss": 0.0223, "step": 467 }, { "epoch": 6.733812949640288, "grad_norm": 2.4647114023706256, "learning_rate": 8.383511352902285e-06, "loss": 0.0684, "step": 468 }, { "epoch": 6.748201438848921, "grad_norm": 1.9432128594740437, "learning_rate": 8.344915161386485e-06, "loss": 0.0544, "step": 469 }, { "epoch": 6.762589928057554, "grad_norm": 1.9522718283037046, "learning_rate": 8.306344302972066e-06, "loss": 0.0545, "step": 470 }, { "epoch": 6.7769784172661875, "grad_norm": 3.50307964418194, "learning_rate": 8.267799368033288e-06, "loss": 0.0727, "step": 471 }, { "epoch": 6.7913669064748206, "grad_norm": 4.659530620027555, "learning_rate": 8.229280946547595e-06, "loss": 0.1447, "step": 472 }, { "epoch": 6.805755395683454, "grad_norm": 2.459012650016016, "learning_rate": 8.190789628086632e-06, "loss": 0.0544, "step": 473 }, { "epoch": 6.820143884892087, "grad_norm": 2.6126248982483204, "learning_rate": 8.15232600180719e-06, "loss": 0.0799, "step": 474 }, { "epoch": 6.83453237410072, "grad_norm": 1.6545392348906836, "learning_rate": 8.113890656442194e-06, "loss": 0.0422, "step": 475 }, { "epoch": 6.848920863309353, "grad_norm": 1.8182442546260595, "learning_rate": 8.075484180291702e-06, "loss": 0.0539, "step": 476 }, { "epoch": 6.863309352517986, "grad_norm": 1.6010075595221613, "learning_rate": 8.037107161213886e-06, "loss": 0.0425, "step": 477 }, { "epoch": 6.877697841726619, "grad_norm": 1.6332321065292132, "learning_rate": 7.99876018661605e-06, "loss": 0.059, "step": 478 }, { "epoch": 6.892086330935252, "grad_norm": 2.4841015711601613, "learning_rate": 7.960443843445622e-06, "loss": 0.0493, "step": 479 }, { "epoch": 6.906474820143885, "grad_norm": 2.325274291210812, "learning_rate": 7.922158718181184e-06, "loss": 0.0535, "step": 480 }, { "epoch": 6.920863309352518, "grad_norm": 5.9237325194153225, "learning_rate": 7.883905396823487e-06, "loss": 0.0702, "step": 481 }, { "epoch": 6.935251798561151, "grad_norm": 1.3505054193697872, "learning_rate": 7.845684464886487e-06, "loss": 0.0463, "step": 482 }, { "epoch": 6.9496402877697845, "grad_norm": 4.741713866445365, "learning_rate": 7.80749650738838e-06, "loss": 0.0741, "step": 483 }, { "epoch": 6.9640287769784175, "grad_norm": 3.680501521362714, "learning_rate": 7.769342108842641e-06, "loss": 0.0597, "step": 484 }, { "epoch": 6.9784172661870505, "grad_norm": 1.5698663322195734, "learning_rate": 7.731221853249089e-06, "loss": 0.0481, "step": 485 }, { "epoch": 6.9928057553956835, "grad_norm": 2.7205015167111535, "learning_rate": 7.693136324084949e-06, "loss": 0.0779, "step": 486 }, { "epoch": 7.0071942446043165, "grad_norm": 2.492166076915753, "learning_rate": 7.655086104295904e-06, "loss": 0.0444, "step": 487 }, { "epoch": 7.0215827338129495, "grad_norm": 2.8913487401770097, "learning_rate": 7.617071776287196e-06, "loss": 0.0474, "step": 488 }, { "epoch": 7.0359712230215825, "grad_norm": 3.022407364083313, "learning_rate": 7.5790939219146874e-06, "loss": 0.0663, "step": 489 }, { "epoch": 7.0503597122302155, "grad_norm": 4.042879715219215, "learning_rate": 7.541153122475978e-06, "loss": 0.0654, "step": 490 }, { "epoch": 7.0647482014388485, "grad_norm": 3.5955340416725727, "learning_rate": 7.503249958701489e-06, "loss": 0.076, "step": 491 }, { "epoch": 7.079136690647482, "grad_norm": 1.1422784311056753, "learning_rate": 7.46538501074558e-06, "loss": 0.0312, "step": 492 }, { "epoch": 7.093525179856115, "grad_norm": 2.8616905836854674, "learning_rate": 7.427558858177679e-06, "loss": 0.0707, "step": 493 }, { "epoch": 7.107913669064748, "grad_norm": 3.736472268458676, "learning_rate": 7.389772079973397e-06, "loss": 0.07, "step": 494 }, { "epoch": 7.122302158273381, "grad_norm": 1.952123520605071, "learning_rate": 7.352025254505672e-06, "loss": 0.0644, "step": 495 }, { "epoch": 7.136690647482014, "grad_norm": 2.5321330171246577, "learning_rate": 7.31431895953592e-06, "loss": 0.0597, "step": 496 }, { "epoch": 7.151079136690647, "grad_norm": 1.3288743837048944, "learning_rate": 7.276653772205187e-06, "loss": 0.043, "step": 497 }, { "epoch": 7.16546762589928, "grad_norm": 1.1513739812834443, "learning_rate": 7.239030269025311e-06, "loss": 0.035, "step": 498 }, { "epoch": 7.179856115107913, "grad_norm": 1.5262098875270578, "learning_rate": 7.201449025870113e-06, "loss": 0.0377, "step": 499 }, { "epoch": 7.194244604316546, "grad_norm": 3.435543237475937, "learning_rate": 7.163910617966563e-06, "loss": 0.0609, "step": 500 }, { "epoch": 7.2086330935251794, "grad_norm": 0.9604425592050114, "learning_rate": 7.126415619885987e-06, "loss": 0.0307, "step": 501 }, { "epoch": 7.223021582733813, "grad_norm": 5.364737035927668, "learning_rate": 7.088964605535278e-06, "loss": 0.0779, "step": 502 }, { "epoch": 7.237410071942446, "grad_norm": 2.0995461577780086, "learning_rate": 7.0515581481480925e-06, "loss": 0.0614, "step": 503 }, { "epoch": 7.251798561151079, "grad_norm": 1.9139618540642809, "learning_rate": 7.014196820276098e-06, "loss": 0.0339, "step": 504 }, { "epoch": 7.266187050359712, "grad_norm": 1.282509471776567, "learning_rate": 6.976881193780196e-06, "loss": 0.0415, "step": 505 }, { "epoch": 7.280575539568345, "grad_norm": 1.8876563385392593, "learning_rate": 6.9396118398217675e-06, "loss": 0.0555, "step": 506 }, { "epoch": 7.294964028776978, "grad_norm": 1.6273768839175773, "learning_rate": 6.90238932885394e-06, "loss": 0.0315, "step": 507 }, { "epoch": 7.309352517985611, "grad_norm": 2.502695211360501, "learning_rate": 6.865214230612858e-06, "loss": 0.0517, "step": 508 }, { "epoch": 7.323741007194244, "grad_norm": 4.1055254501583995, "learning_rate": 6.8280871141089415e-06, "loss": 0.0733, "step": 509 }, { "epoch": 7.338129496402877, "grad_norm": 3.3526140983673285, "learning_rate": 6.791008547618207e-06, "loss": 0.0537, "step": 510 }, { "epoch": 7.35251798561151, "grad_norm": 2.27702848929816, "learning_rate": 6.753979098673539e-06, "loss": 0.0394, "step": 511 }, { "epoch": 7.366906474820144, "grad_norm": 2.5895671987953053, "learning_rate": 6.716999334056031e-06, "loss": 0.072, "step": 512 }, { "epoch": 7.381294964028777, "grad_norm": 2.560346455515735, "learning_rate": 6.680069819786288e-06, "loss": 0.0551, "step": 513 }, { "epoch": 7.39568345323741, "grad_norm": 2.4213413575290885, "learning_rate": 6.643191121115773e-06, "loss": 0.0604, "step": 514 }, { "epoch": 7.410071942446043, "grad_norm": 2.5037600137764415, "learning_rate": 6.6063638025181594e-06, "loss": 0.0505, "step": 515 }, { "epoch": 7.424460431654676, "grad_norm": 5.462794355900936, "learning_rate": 6.5695884276806784e-06, "loss": 0.0601, "step": 516 }, { "epoch": 7.438848920863309, "grad_norm": 5.901550532764724, "learning_rate": 6.532865559495505e-06, "loss": 0.0732, "step": 517 }, { "epoch": 7.453237410071942, "grad_norm": 2.1680456127592382, "learning_rate": 6.496195760051128e-06, "loss": 0.037, "step": 518 }, { "epoch": 7.467625899280575, "grad_norm": 1.0418430134694838, "learning_rate": 6.459579590623763e-06, "loss": 0.0296, "step": 519 }, { "epoch": 7.482014388489208, "grad_norm": 4.611606098360257, "learning_rate": 6.423017611668745e-06, "loss": 0.0904, "step": 520 }, { "epoch": 7.496402877697841, "grad_norm": 1.7334595469785676, "learning_rate": 6.386510382811963e-06, "loss": 0.065, "step": 521 }, { "epoch": 7.510791366906475, "grad_norm": 2.5488880130044302, "learning_rate": 6.350058462841283e-06, "loss": 0.0711, "step": 522 }, { "epoch": 7.525179856115108, "grad_norm": 4.383649882258492, "learning_rate": 6.313662409698004e-06, "loss": 0.0672, "step": 523 }, { "epoch": 7.539568345323741, "grad_norm": 2.161691670015276, "learning_rate": 6.277322780468317e-06, "loss": 0.0542, "step": 524 }, { "epoch": 7.553956834532374, "grad_norm": 1.75533651285235, "learning_rate": 6.241040131374769e-06, "loss": 0.0385, "step": 525 }, { "epoch": 7.568345323741007, "grad_norm": 1.6713788310284323, "learning_rate": 6.204815017767767e-06, "loss": 0.0651, "step": 526 }, { "epoch": 7.58273381294964, "grad_norm": 2.7767180589589957, "learning_rate": 6.168647994117057e-06, "loss": 0.0727, "step": 527 }, { "epoch": 7.597122302158273, "grad_norm": 1.7703681537405498, "learning_rate": 6.132539614003249e-06, "loss": 0.0399, "step": 528 }, { "epoch": 7.611510791366906, "grad_norm": 2.182319585582808, "learning_rate": 6.096490430109343e-06, "loss": 0.0537, "step": 529 }, { "epoch": 7.625899280575539, "grad_norm": 3.13728684435282, "learning_rate": 6.0605009942122705e-06, "loss": 0.0486, "step": 530 }, { "epoch": 7.640287769784173, "grad_norm": 1.9092924208411692, "learning_rate": 6.024571857174443e-06, "loss": 0.0426, "step": 531 }, { "epoch": 7.654676258992806, "grad_norm": 1.2985276847388703, "learning_rate": 5.988703568935329e-06, "loss": 0.0229, "step": 532 }, { "epoch": 7.669064748201439, "grad_norm": 0.8412966639532138, "learning_rate": 5.952896678503025e-06, "loss": 0.0185, "step": 533 }, { "epoch": 7.683453237410072, "grad_norm": 2.7092486788073074, "learning_rate": 5.917151733945865e-06, "loss": 0.0428, "step": 534 }, { "epoch": 7.697841726618705, "grad_norm": 3.4234238702022526, "learning_rate": 5.88146928238402e-06, "loss": 0.0405, "step": 535 }, { "epoch": 7.712230215827338, "grad_norm": 4.2390460482616685, "learning_rate": 5.845849869981137e-06, "loss": 0.0623, "step": 536 }, { "epoch": 7.726618705035971, "grad_norm": 2.9908661711146167, "learning_rate": 5.8102940419359595e-06, "loss": 0.0584, "step": 537 }, { "epoch": 7.741007194244604, "grad_norm": 1.6724838566811557, "learning_rate": 5.7748023424740085e-06, "loss": 0.0255, "step": 538 }, { "epoch": 7.755395683453237, "grad_norm": 5.552785695728314, "learning_rate": 5.739375314839226e-06, "loss": 0.047, "step": 539 }, { "epoch": 7.76978417266187, "grad_norm": 4.78316445415734, "learning_rate": 5.704013501285679e-06, "loss": 0.059, "step": 540 }, { "epoch": 7.784172661870503, "grad_norm": 2.2867834949348116, "learning_rate": 5.6687174430692495e-06, "loss": 0.049, "step": 541 }, { "epoch": 7.798561151079137, "grad_norm": 1.4114860474896007, "learning_rate": 5.633487680439362e-06, "loss": 0.0282, "step": 542 }, { "epoch": 7.81294964028777, "grad_norm": 3.3647369349864102, "learning_rate": 5.598324752630695e-06, "loss": 0.0749, "step": 543 }, { "epoch": 7.827338129496403, "grad_norm": 2.684742315386639, "learning_rate": 5.5632291978549445e-06, "loss": 0.0455, "step": 544 }, { "epoch": 7.841726618705036, "grad_norm": 2.0554793850593005, "learning_rate": 5.528201553292578e-06, "loss": 0.0439, "step": 545 }, { "epoch": 7.856115107913669, "grad_norm": 4.087197798794891, "learning_rate": 5.493242355084609e-06, "loss": 0.0688, "step": 546 }, { "epoch": 7.870503597122302, "grad_norm": 1.2791224839211275, "learning_rate": 5.458352138324408e-06, "loss": 0.0298, "step": 547 }, { "epoch": 7.884892086330935, "grad_norm": 5.979296747331615, "learning_rate": 5.423531437049491e-06, "loss": 0.0662, "step": 548 }, { "epoch": 7.899280575539568, "grad_norm": 4.045189397887292, "learning_rate": 5.388780784233354e-06, "loss": 0.0554, "step": 549 }, { "epoch": 7.913669064748201, "grad_norm": 4.898228214701075, "learning_rate": 5.354100711777317e-06, "loss": 0.0594, "step": 550 }, { "epoch": 7.928057553956835, "grad_norm": 2.02299170942208, "learning_rate": 5.319491750502383e-06, "loss": 0.0617, "step": 551 }, { "epoch": 7.942446043165468, "grad_norm": 3.392724832422359, "learning_rate": 5.284954430141109e-06, "loss": 0.0574, "step": 552 }, { "epoch": 7.956834532374101, "grad_norm": 2.0165506744847437, "learning_rate": 5.250489279329501e-06, "loss": 0.0261, "step": 553 }, { "epoch": 7.971223021582734, "grad_norm": 1.399835068970492, "learning_rate": 5.216096825598917e-06, "loss": 0.0324, "step": 554 }, { "epoch": 7.985611510791367, "grad_norm": 2.9458649739740155, "learning_rate": 5.18177759536801e-06, "loss": 0.0497, "step": 555 }, { "epoch": 8.0, "grad_norm": 0.7940680363085926, "learning_rate": 5.147532113934646e-06, "loss": 0.0181, "step": 556 }, { "epoch": 8.014388489208633, "grad_norm": 1.9007802957877749, "learning_rate": 5.113360905467875e-06, "loss": 0.037, "step": 557 }, { "epoch": 8.028776978417266, "grad_norm": 3.2781065841019066, "learning_rate": 5.079264492999916e-06, "loss": 0.036, "step": 558 }, { "epoch": 8.043165467625899, "grad_norm": 3.819122592314504, "learning_rate": 5.0452433984181315e-06, "loss": 0.0523, "step": 559 }, { "epoch": 8.057553956834532, "grad_norm": 4.2144956363601525, "learning_rate": 5.011298142457069e-06, "loss": 0.0636, "step": 560 }, { "epoch": 8.071942446043165, "grad_norm": 13.047476159517892, "learning_rate": 4.97742924469046e-06, "loss": 0.1163, "step": 561 }, { "epoch": 8.086330935251798, "grad_norm": 6.830416937477266, "learning_rate": 4.943637223523282e-06, "loss": 0.1028, "step": 562 }, { "epoch": 8.100719424460431, "grad_norm": 2.3255270246907203, "learning_rate": 4.909922596183822e-06, "loss": 0.0521, "step": 563 }, { "epoch": 8.115107913669064, "grad_norm": 1.1435086086266115, "learning_rate": 4.876285878715764e-06, "loss": 0.0157, "step": 564 }, { "epoch": 8.129496402877697, "grad_norm": 3.7825947186999436, "learning_rate": 4.842727585970284e-06, "loss": 0.0393, "step": 565 }, { "epoch": 8.14388489208633, "grad_norm": 3.3170824479840797, "learning_rate": 4.8092482315981685e-06, "loss": 0.0507, "step": 566 }, { "epoch": 8.158273381294965, "grad_norm": 4.7246048323427265, "learning_rate": 4.775848328041956e-06, "loss": 0.0752, "step": 567 }, { "epoch": 8.172661870503598, "grad_norm": 2.8604995363327896, "learning_rate": 4.742528386528094e-06, "loss": 0.0447, "step": 568 }, { "epoch": 8.18705035971223, "grad_norm": 4.122397781085086, "learning_rate": 4.709288917059118e-06, "loss": 0.0571, "step": 569 }, { "epoch": 8.201438848920864, "grad_norm": 3.8083495479732816, "learning_rate": 4.676130428405834e-06, "loss": 0.0494, "step": 570 }, { "epoch": 8.215827338129497, "grad_norm": 3.48317844967832, "learning_rate": 4.643053428099538e-06, "loss": 0.0841, "step": 571 }, { "epoch": 8.23021582733813, "grad_norm": 1.551504859564384, "learning_rate": 4.610058422424249e-06, "loss": 0.0375, "step": 572 }, { "epoch": 8.244604316546763, "grad_norm": 1.9477471961254322, "learning_rate": 4.577145916408955e-06, "loss": 0.0257, "step": 573 }, { "epoch": 8.258992805755396, "grad_norm": 3.2335530341856886, "learning_rate": 4.544316413819888e-06, "loss": 0.075, "step": 574 }, { "epoch": 8.273381294964029, "grad_norm": 1.0232024159790356, "learning_rate": 4.5115704171528105e-06, "loss": 0.026, "step": 575 }, { "epoch": 8.287769784172662, "grad_norm": 2.8739397189315956, "learning_rate": 4.478908427625323e-06, "loss": 0.0409, "step": 576 }, { "epoch": 8.302158273381295, "grad_norm": 7.820713750474336, "learning_rate": 4.446330945169197e-06, "loss": 0.0697, "step": 577 }, { "epoch": 8.316546762589928, "grad_norm": 1.4559489461697241, "learning_rate": 4.41383846842272e-06, "loss": 0.0338, "step": 578 }, { "epoch": 8.33093525179856, "grad_norm": 1.6200703776809549, "learning_rate": 4.381431494723056e-06, "loss": 0.0453, "step": 579 }, { "epoch": 8.345323741007194, "grad_norm": 2.462591590876853, "learning_rate": 4.349110520098644e-06, "loss": 0.0452, "step": 580 }, { "epoch": 8.359712230215827, "grad_norm": 2.4958974429500675, "learning_rate": 4.31687603926161e-06, "loss": 0.0419, "step": 581 }, { "epoch": 8.37410071942446, "grad_norm": 3.3620333990871414, "learning_rate": 4.284728545600174e-06, "loss": 0.0918, "step": 582 }, { "epoch": 8.388489208633093, "grad_norm": 2.076920143609935, "learning_rate": 4.252668531171117e-06, "loss": 0.0333, "step": 583 }, { "epoch": 8.402877697841726, "grad_norm": 11.337846192838462, "learning_rate": 4.220696486692241e-06, "loss": 0.0809, "step": 584 }, { "epoch": 8.417266187050359, "grad_norm": 1.4691851543426133, "learning_rate": 4.18881290153486e-06, "loss": 0.0297, "step": 585 }, { "epoch": 8.431654676258994, "grad_norm": 2.322343633696552, "learning_rate": 4.1570182637163155e-06, "loss": 0.041, "step": 586 }, { "epoch": 8.446043165467627, "grad_norm": 1.857573901571014, "learning_rate": 4.125313059892494e-06, "loss": 0.026, "step": 587 }, { "epoch": 8.46043165467626, "grad_norm": 2.420406989345141, "learning_rate": 4.093697775350388e-06, "loss": 0.0425, "step": 588 }, { "epoch": 8.474820143884893, "grad_norm": 2.528933273598754, "learning_rate": 4.062172894000664e-06, "loss": 0.0194, "step": 589 }, { "epoch": 8.489208633093526, "grad_norm": 4.851880404918257, "learning_rate": 4.0307388983702555e-06, "loss": 0.0456, "step": 590 }, { "epoch": 8.503597122302159, "grad_norm": 1.0566440356278923, "learning_rate": 3.9993962695949865e-06, "loss": 0.037, "step": 591 }, { "epoch": 8.517985611510792, "grad_norm": 0.9342625319695776, "learning_rate": 3.9681454874121905e-06, "loss": 0.0246, "step": 592 }, { "epoch": 8.532374100719425, "grad_norm": 1.404658963816671, "learning_rate": 3.9369870301533785e-06, "loss": 0.0323, "step": 593 }, { "epoch": 8.546762589928058, "grad_norm": 2.8421121591274474, "learning_rate": 3.905921374736919e-06, "loss": 0.042, "step": 594 }, { "epoch": 8.56115107913669, "grad_norm": 1.665249679691359, "learning_rate": 3.87494899666073e-06, "loss": 0.0471, "step": 595 }, { "epoch": 8.575539568345324, "grad_norm": 3.150615281085969, "learning_rate": 3.844070369995008e-06, "loss": 0.0592, "step": 596 }, { "epoch": 8.589928057553957, "grad_norm": 2.1657965045021093, "learning_rate": 3.8132859673749688e-06, "loss": 0.0335, "step": 597 }, { "epoch": 8.60431654676259, "grad_norm": 0.9753064067607254, "learning_rate": 3.7825962599936117e-06, "loss": 0.0173, "step": 598 }, { "epoch": 8.618705035971223, "grad_norm": 1.6281860386953595, "learning_rate": 3.7520017175945168e-06, "loss": 0.0327, "step": 599 }, { "epoch": 8.633093525179856, "grad_norm": 1.3250219320378607, "learning_rate": 3.7215028084646385e-06, "loss": 0.0389, "step": 600 }, { "epoch": 8.647482014388489, "grad_norm": 2.663488081521059, "learning_rate": 3.691099999427152e-06, "loss": 0.0451, "step": 601 }, { "epoch": 8.661870503597122, "grad_norm": 1.0684718799506963, "learning_rate": 3.6607937558342975e-06, "loss": 0.0227, "step": 602 }, { "epoch": 8.676258992805755, "grad_norm": 1.8713141278138534, "learning_rate": 3.6305845415602726e-06, "loss": 0.0324, "step": 603 }, { "epoch": 8.690647482014388, "grad_norm": 1.959513645334475, "learning_rate": 3.6004728189941142e-06, "loss": 0.0483, "step": 604 }, { "epoch": 8.70503597122302, "grad_norm": 8.273881877761735, "learning_rate": 3.5704590490326298e-06, "loss": 0.0701, "step": 605 }, { "epoch": 8.719424460431654, "grad_norm": 1.932144875971402, "learning_rate": 3.5405436910733437e-06, "loss": 0.0412, "step": 606 }, { "epoch": 8.733812949640289, "grad_norm": 7.650174976201004, "learning_rate": 3.5107272030074626e-06, "loss": 0.0525, "step": 607 }, { "epoch": 8.748201438848922, "grad_norm": 2.482410934643446, "learning_rate": 3.4810100412128743e-06, "loss": 0.0447, "step": 608 }, { "epoch": 8.762589928057555, "grad_norm": 1.7887809194693534, "learning_rate": 3.4513926605471504e-06, "loss": 0.0334, "step": 609 }, { "epoch": 8.776978417266188, "grad_norm": 2.262589027477362, "learning_rate": 3.421875514340589e-06, "loss": 0.0438, "step": 610 }, { "epoch": 8.79136690647482, "grad_norm": 5.096455221790141, "learning_rate": 3.392459054389281e-06, "loss": 0.0589, "step": 611 }, { "epoch": 8.805755395683454, "grad_norm": 1.7161312058815825, "learning_rate": 3.3631437309481853e-06, "loss": 0.0261, "step": 612 }, { "epoch": 8.820143884892087, "grad_norm": 2.941763984376986, "learning_rate": 3.333929992724253e-06, "loss": 0.0576, "step": 613 }, { "epoch": 8.83453237410072, "grad_norm": 3.006151368731597, "learning_rate": 3.30481828686954e-06, "loss": 0.0443, "step": 614 }, { "epoch": 8.848920863309353, "grad_norm": 1.6800648366968625, "learning_rate": 3.275809058974373e-06, "loss": 0.0307, "step": 615 }, { "epoch": 8.863309352517986, "grad_norm": 0.8243738116908417, "learning_rate": 3.2469027530605255e-06, "loss": 0.0184, "step": 616 }, { "epoch": 8.877697841726619, "grad_norm": 2.9566481267485365, "learning_rate": 3.2180998115744387e-06, "loss": 0.0373, "step": 617 }, { "epoch": 8.892086330935252, "grad_norm": 1.9637756364219565, "learning_rate": 3.1894006753804143e-06, "loss": 0.0414, "step": 618 }, { "epoch": 8.906474820143885, "grad_norm": 2.3021615520028984, "learning_rate": 3.1608057837538976e-06, "loss": 0.0423, "step": 619 }, { "epoch": 8.920863309352518, "grad_norm": 2.145448976885504, "learning_rate": 3.1323155743747393e-06, "loss": 0.0404, "step": 620 }, { "epoch": 8.93525179856115, "grad_norm": 2.6838866265175287, "learning_rate": 3.1039304833205073e-06, "loss": 0.042, "step": 621 }, { "epoch": 8.949640287769784, "grad_norm": 0.9803533745198019, "learning_rate": 3.075650945059799e-06, "loss": 0.028, "step": 622 }, { "epoch": 8.964028776978417, "grad_norm": 6.015138805293879, "learning_rate": 3.047477392445596e-06, "loss": 0.0469, "step": 623 }, { "epoch": 8.97841726618705, "grad_norm": 2.47190153965276, "learning_rate": 3.019410256708637e-06, "loss": 0.0699, "step": 624 }, { "epoch": 8.992805755395683, "grad_norm": 1.6888494123739861, "learning_rate": 2.9914499674508337e-06, "loss": 0.0352, "step": 625 }, { "epoch": 9.007194244604317, "grad_norm": 0.8291369253401952, "learning_rate": 2.9635969526386665e-06, "loss": 0.0173, "step": 626 }, { "epoch": 9.02158273381295, "grad_norm": 3.1456360877961043, "learning_rate": 2.935851638596655e-06, "loss": 0.0445, "step": 627 }, { "epoch": 9.035971223021583, "grad_norm": 2.3237289901109754, "learning_rate": 2.908214450000828e-06, "loss": 0.0392, "step": 628 }, { "epoch": 9.050359712230216, "grad_norm": 2.1054179088541547, "learning_rate": 2.8806858098722155e-06, "loss": 0.0585, "step": 629 }, { "epoch": 9.06474820143885, "grad_norm": 1.2969196699460492, "learning_rate": 2.853266139570391e-06, "loss": 0.0208, "step": 630 }, { "epoch": 9.079136690647482, "grad_norm": 0.9970140027989338, "learning_rate": 2.825955858787002e-06, "loss": 0.0183, "step": 631 }, { "epoch": 9.093525179856115, "grad_norm": 1.046038196325873, "learning_rate": 2.798755385539358e-06, "loss": 0.0196, "step": 632 }, { "epoch": 9.107913669064748, "grad_norm": 1.9785975611671889, "learning_rate": 2.7716651361640277e-06, "loss": 0.046, "step": 633 }, { "epoch": 9.122302158273381, "grad_norm": 2.076469916316301, "learning_rate": 2.7446855253104775e-06, "loss": 0.035, "step": 634 }, { "epoch": 9.136690647482014, "grad_norm": 0.996431472737031, "learning_rate": 2.717816965934705e-06, "loss": 0.0377, "step": 635 }, { "epoch": 9.151079136690647, "grad_norm": 4.566127623498886, "learning_rate": 2.6910598692929323e-06, "loss": 0.0767, "step": 636 }, { "epoch": 9.16546762589928, "grad_norm": 12.39526382129473, "learning_rate": 2.6644146449353103e-06, "loss": 0.1713, "step": 637 }, { "epoch": 9.179856115107913, "grad_norm": 2.4799629481553724, "learning_rate": 2.6378817006996393e-06, "loss": 0.0314, "step": 638 }, { "epoch": 9.194244604316546, "grad_norm": 8.085625915606265, "learning_rate": 2.611461442705152e-06, "loss": 0.051, "step": 639 }, { "epoch": 9.20863309352518, "grad_norm": 3.9263706221680685, "learning_rate": 2.5851542753462612e-06, "loss": 0.0521, "step": 640 }, { "epoch": 9.223021582733812, "grad_norm": 2.226548600355445, "learning_rate": 2.5589606012863968e-06, "loss": 0.0332, "step": 641 }, { "epoch": 9.237410071942445, "grad_norm": 1.255383022545736, "learning_rate": 2.532880821451833e-06, "loss": 0.0248, "step": 642 }, { "epoch": 9.251798561151078, "grad_norm": 5.241468552575159, "learning_rate": 2.5069153350255617e-06, "loss": 0.0544, "step": 643 }, { "epoch": 9.266187050359711, "grad_norm": 1.3999294797764177, "learning_rate": 2.4810645394411636e-06, "loss": 0.0284, "step": 644 }, { "epoch": 9.280575539568344, "grad_norm": 1.7303921026902358, "learning_rate": 2.455328830376741e-06, "loss": 0.0212, "step": 645 }, { "epoch": 9.29496402877698, "grad_norm": 3.2404444332120868, "learning_rate": 2.429708601748849e-06, "loss": 0.0698, "step": 646 }, { "epoch": 9.309352517985612, "grad_norm": 1.553700541641244, "learning_rate": 2.4042042457064863e-06, "loss": 0.0389, "step": 647 }, { "epoch": 9.323741007194245, "grad_norm": 2.570999782342014, "learning_rate": 2.3788161526250677e-06, "loss": 0.034, "step": 648 }, { "epoch": 9.338129496402878, "grad_norm": 1.4441708855755897, "learning_rate": 2.3535447111004662e-06, "loss": 0.0318, "step": 649 }, { "epoch": 9.352517985611511, "grad_norm": 2.9286217319244456, "learning_rate": 2.3283903079430582e-06, "loss": 0.0426, "step": 650 }, { "epoch": 9.366906474820144, "grad_norm": 2.3158759616027758, "learning_rate": 2.3033533281718036e-06, "loss": 0.04, "step": 651 }, { "epoch": 9.381294964028777, "grad_norm": 5.000706423916787, "learning_rate": 2.2784341550083577e-06, "loss": 0.0778, "step": 652 }, { "epoch": 9.39568345323741, "grad_norm": 3.1672796078263667, "learning_rate": 2.253633169871198e-06, "loss": 0.0477, "step": 653 }, { "epoch": 9.410071942446043, "grad_norm": 2.936168504993618, "learning_rate": 2.2289507523697894e-06, "loss": 0.0355, "step": 654 }, { "epoch": 9.424460431654676, "grad_norm": 4.144038176169699, "learning_rate": 2.204387280298772e-06, "loss": 0.0636, "step": 655 }, { "epoch": 9.43884892086331, "grad_norm": 3.317961055103226, "learning_rate": 2.1799431296321883e-06, "loss": 0.0254, "step": 656 }, { "epoch": 9.453237410071942, "grad_norm": 2.0945507810139166, "learning_rate": 2.155618674517711e-06, "loss": 0.0396, "step": 657 }, { "epoch": 9.467625899280575, "grad_norm": 2.1490741031658755, "learning_rate": 2.131414287270931e-06, "loss": 0.0351, "step": 658 }, { "epoch": 9.482014388489208, "grad_norm": 1.170394818981662, "learning_rate": 2.107330338369652e-06, "loss": 0.0223, "step": 659 }, { "epoch": 9.496402877697841, "grad_norm": 1.9060637808073648, "learning_rate": 2.083367196448219e-06, "loss": 0.0314, "step": 660 }, { "epoch": 9.510791366906474, "grad_norm": 2.682470653382018, "learning_rate": 2.0595252282918875e-06, "loss": 0.0311, "step": 661 }, { "epoch": 9.525179856115107, "grad_norm": 2.824810882470908, "learning_rate": 2.0358047988311857e-06, "loss": 0.0408, "step": 662 }, { "epoch": 9.53956834532374, "grad_norm": 1.8783543488553889, "learning_rate": 2.012206271136353e-06, "loss": 0.0316, "step": 663 }, { "epoch": 9.553956834532373, "grad_norm": 1.277908232236526, "learning_rate": 1.988730006411769e-06, "loss": 0.0227, "step": 664 }, { "epoch": 9.568345323741006, "grad_norm": 2.3170498768091763, "learning_rate": 1.9653763639904333e-06, "loss": 0.0324, "step": 665 }, { "epoch": 9.582733812949641, "grad_norm": 1.4282734057780846, "learning_rate": 1.942145701328456e-06, "loss": 0.0282, "step": 666 }, { "epoch": 9.597122302158274, "grad_norm": 2.7421058903271676, "learning_rate": 1.9190383739995933e-06, "loss": 0.0433, "step": 667 }, { "epoch": 9.611510791366907, "grad_norm": 0.5131708654352847, "learning_rate": 1.8960547356897997e-06, "loss": 0.0161, "step": 668 }, { "epoch": 9.62589928057554, "grad_norm": 2.0530874650800106, "learning_rate": 1.8731951381918257e-06, "loss": 0.0427, "step": 669 }, { "epoch": 9.640287769784173, "grad_norm": 1.8051899893158643, "learning_rate": 1.8504599313998196e-06, "loss": 0.0218, "step": 670 }, { "epoch": 9.654676258992806, "grad_norm": 0.7906143024517177, "learning_rate": 1.8278494633039756e-06, "loss": 0.0195, "step": 671 }, { "epoch": 9.66906474820144, "grad_norm": 3.283281196176163, "learning_rate": 1.8053640799852134e-06, "loss": 0.0279, "step": 672 }, { "epoch": 9.683453237410072, "grad_norm": 1.4837554695301376, "learning_rate": 1.783004125609873e-06, "loss": 0.0303, "step": 673 }, { "epoch": 9.697841726618705, "grad_norm": 3.9513747658965364, "learning_rate": 1.7607699424244583e-06, "loss": 0.0453, "step": 674 }, { "epoch": 9.712230215827338, "grad_norm": 2.932528943145533, "learning_rate": 1.7386618707503822e-06, "loss": 0.0551, "step": 675 }, { "epoch": 9.726618705035971, "grad_norm": 1.8691452428430109, "learning_rate": 1.7166802489787704e-06, "loss": 0.0288, "step": 676 }, { "epoch": 9.741007194244604, "grad_norm": 2.267950496543606, "learning_rate": 1.6948254135652764e-06, "loss": 0.0471, "step": 677 }, { "epoch": 9.755395683453237, "grad_norm": 3.362001877990058, "learning_rate": 1.673097699024938e-06, "loss": 0.0443, "step": 678 }, { "epoch": 9.76978417266187, "grad_norm": 0.6960568252828435, "learning_rate": 1.6514974379270465e-06, "loss": 0.0143, "step": 679 }, { "epoch": 9.784172661870503, "grad_norm": 1.5401636007060717, "learning_rate": 1.6300249608900654e-06, "loss": 0.0318, "step": 680 }, { "epoch": 9.798561151079136, "grad_norm": 2.2596630157728765, "learning_rate": 1.608680596576563e-06, "loss": 0.0332, "step": 681 }, { "epoch": 9.81294964028777, "grad_norm": 1.1661531548368926, "learning_rate": 1.587464671688187e-06, "loss": 0.0187, "step": 682 }, { "epoch": 9.827338129496402, "grad_norm": 1.4871849727237823, "learning_rate": 1.5663775109606682e-06, "loss": 0.0283, "step": 683 }, { "epoch": 9.841726618705035, "grad_norm": 0.9194790607066714, "learning_rate": 1.5454194371588383e-06, "loss": 0.0166, "step": 684 }, { "epoch": 9.85611510791367, "grad_norm": 2.308580159255162, "learning_rate": 1.5245907710716912e-06, "loss": 0.0349, "step": 685 }, { "epoch": 9.870503597122303, "grad_norm": 2.314639880080365, "learning_rate": 1.5038918315074825e-06, "loss": 0.0281, "step": 686 }, { "epoch": 9.884892086330936, "grad_norm": 1.619927122217182, "learning_rate": 1.48332293528885e-06, "loss": 0.0575, "step": 687 }, { "epoch": 9.899280575539569, "grad_norm": 1.5455675444988652, "learning_rate": 1.462884397247949e-06, "loss": 0.0281, "step": 688 }, { "epoch": 9.913669064748202, "grad_norm": 1.496371665596201, "learning_rate": 1.4425765302216467e-06, "loss": 0.0262, "step": 689 }, { "epoch": 9.928057553956835, "grad_norm": 0.6246401749746331, "learning_rate": 1.4223996450467291e-06, "loss": 0.0155, "step": 690 }, { "epoch": 9.942446043165468, "grad_norm": 10.244899114128767, "learning_rate": 1.4023540505551514e-06, "loss": 0.0584, "step": 691 }, { "epoch": 9.956834532374101, "grad_norm": 3.578140786106804, "learning_rate": 1.382440053569295e-06, "loss": 0.0403, "step": 692 }, { "epoch": 9.971223021582734, "grad_norm": 1.6841621634178627, "learning_rate": 1.3626579588972843e-06, "loss": 0.0347, "step": 693 }, { "epoch": 9.985611510791367, "grad_norm": 1.6886542091112717, "learning_rate": 1.3430080693283176e-06, "loss": 0.0252, "step": 694 }, { "epoch": 10.0, "grad_norm": 2.1884226287968, "learning_rate": 1.3234906856280272e-06, "loss": 0.0289, "step": 695 }, { "epoch": 10.014388489208633, "grad_norm": 1.8465342865749588, "learning_rate": 1.30410610653389e-06, "loss": 0.0317, "step": 696 }, { "epoch": 10.028776978417266, "grad_norm": 1.6653664187493198, "learning_rate": 1.2848546287506392e-06, "loss": 0.0327, "step": 697 }, { "epoch": 10.043165467625899, "grad_norm": 2.716731042561852, "learning_rate": 1.2657365469457295e-06, "loss": 0.0356, "step": 698 }, { "epoch": 10.057553956834532, "grad_norm": 0.5749921009255655, "learning_rate": 1.2467521537448258e-06, "loss": 0.0129, "step": 699 }, { "epoch": 10.071942446043165, "grad_norm": 1.6695913891538687, "learning_rate": 1.227901739727332e-06, "loss": 0.0386, "step": 700 }, { "epoch": 10.086330935251798, "grad_norm": 1.480135248408385, "learning_rate": 1.2091855934219289e-06, "loss": 0.0325, "step": 701 }, { "epoch": 10.100719424460431, "grad_norm": 1.3476082037635144, "learning_rate": 1.1906040013021668e-06, "loss": 0.0143, "step": 702 }, { "epoch": 10.115107913669064, "grad_norm": 1.0352529590879136, "learning_rate": 1.172157247782083e-06, "loss": 0.0198, "step": 703 }, { "epoch": 10.129496402877697, "grad_norm": 1.1278501275644894, "learning_rate": 1.1538456152118394e-06, "loss": 0.0207, "step": 704 }, { "epoch": 10.14388489208633, "grad_norm": 3.9502198656971963, "learning_rate": 1.1356693838734134e-06, "loss": 0.0514, "step": 705 }, { "epoch": 10.158273381294965, "grad_norm": 1.530725864660551, "learning_rate": 1.1176288319762963e-06, "loss": 0.0221, "step": 706 }, { "epoch": 10.172661870503598, "grad_norm": 2.2735261159620723, "learning_rate": 1.0997242356532335e-06, "loss": 0.0466, "step": 707 }, { "epoch": 10.18705035971223, "grad_norm": 1.0618045257756556, "learning_rate": 1.0819558689560162e-06, "loss": 0.0227, "step": 708 }, { "epoch": 10.201438848920864, "grad_norm": 1.6182455175606794, "learning_rate": 1.0643240038512648e-06, "loss": 0.0231, "step": 709 }, { "epoch": 10.215827338129497, "grad_norm": 8.00225455212014, "learning_rate": 1.0468289102162788e-06, "loss": 0.0681, "step": 710 }, { "epoch": 10.23021582733813, "grad_norm": 2.591497264989186, "learning_rate": 1.0294708558349031e-06, "loss": 0.0251, "step": 711 }, { "epoch": 10.244604316546763, "grad_norm": 1.7149356695320543, "learning_rate": 1.0122501063934266e-06, "loss": 0.0304, "step": 712 }, { "epoch": 10.258992805755396, "grad_norm": 2.487093456558291, "learning_rate": 9.951669254765227e-07, "loss": 0.0316, "step": 713 }, { "epoch": 10.273381294964029, "grad_norm": 1.8372925491948908, "learning_rate": 9.782215745632063e-07, "loss": 0.0244, "step": 714 }, { "epoch": 10.287769784172662, "grad_norm": 1.6522089041208652, "learning_rate": 9.614143130228336e-07, "loss": 0.0213, "step": 715 }, { "epoch": 10.302158273381295, "grad_norm": 1.388640294189459, "learning_rate": 9.447453981111377e-07, "loss": 0.022, "step": 716 }, { "epoch": 10.316546762589928, "grad_norm": 1.8567918043310965, "learning_rate": 9.282150849662841e-07, "loss": 0.0277, "step": 717 }, { "epoch": 10.33093525179856, "grad_norm": 1.7484564597114693, "learning_rate": 9.118236266049707e-07, "loss": 0.04, "step": 718 }, { "epoch": 10.345323741007194, "grad_norm": 1.3472624690293522, "learning_rate": 8.955712739185529e-07, "loss": 0.0284, "step": 719 }, { "epoch": 10.359712230215827, "grad_norm": 2.569862357797179, "learning_rate": 8.794582756691994e-07, "loss": 0.0487, "step": 720 }, { "epoch": 10.37410071942446, "grad_norm": 1.1341008302050821, "learning_rate": 8.634848784860916e-07, "loss": 0.0259, "step": 721 }, { "epoch": 10.388489208633093, "grad_norm": 3.9556296920276326, "learning_rate": 8.476513268616471e-07, "loss": 0.0303, "step": 722 }, { "epoch": 10.402877697841726, "grad_norm": 1.0067776542085125, "learning_rate": 8.319578631477731e-07, "loss": 0.0215, "step": 723 }, { "epoch": 10.417266187050359, "grad_norm": 1.5246834403644955, "learning_rate": 8.164047275521614e-07, "loss": 0.0279, "step": 724 }, { "epoch": 10.431654676258994, "grad_norm": 1.5007545531516848, "learning_rate": 8.00992158134607e-07, "loss": 0.0371, "step": 725 }, { "epoch": 10.446043165467627, "grad_norm": 0.962027429251609, "learning_rate": 7.857203908033684e-07, "loss": 0.026, "step": 726 }, { "epoch": 10.46043165467626, "grad_norm": 1.2598114860067426, "learning_rate": 7.705896593115614e-07, "loss": 0.0275, "step": 727 }, { "epoch": 10.474820143884893, "grad_norm": 1.5601411747424254, "learning_rate": 7.556001952535697e-07, "loss": 0.0336, "step": 728 }, { "epoch": 10.489208633093526, "grad_norm": 2.7709712616630626, "learning_rate": 7.40752228061502e-07, "loss": 0.0355, "step": 729 }, { "epoch": 10.503597122302159, "grad_norm": 4.058223338059946, "learning_rate": 7.260459850016932e-07, "loss": 0.0587, "step": 730 }, { "epoch": 10.517985611510792, "grad_norm": 0.8541524082485146, "learning_rate": 7.114816911712131e-07, "loss": 0.0137, "step": 731 }, { "epoch": 10.532374100719425, "grad_norm": 2.2965443467696542, "learning_rate": 6.970595694944215e-07, "loss": 0.0441, "step": 732 }, { "epoch": 10.546762589928058, "grad_norm": 4.196670787721921, "learning_rate": 6.827798407195629e-07, "loss": 0.0284, "step": 733 }, { "epoch": 10.56115107913669, "grad_norm": 1.1707179381668629, "learning_rate": 6.686427234153814e-07, "loss": 0.0277, "step": 734 }, { "epoch": 10.575539568345324, "grad_norm": 1.7237854874400753, "learning_rate": 6.546484339677817e-07, "loss": 0.0229, "step": 735 }, { "epoch": 10.589928057553957, "grad_norm": 1.9963363458117016, "learning_rate": 6.407971865765095e-07, "loss": 0.0403, "step": 736 }, { "epoch": 10.60431654676259, "grad_norm": 1.8475932957020715, "learning_rate": 6.270891932518775e-07, "loss": 0.0339, "step": 737 }, { "epoch": 10.618705035971223, "grad_norm": 2.207865836762862, "learning_rate": 6.1352466381152e-07, "loss": 0.0206, "step": 738 }, { "epoch": 10.633093525179856, "grad_norm": 1.5028628741315406, "learning_rate": 6.00103805877178e-07, "loss": 0.0279, "step": 739 }, { "epoch": 10.647482014388489, "grad_norm": 1.6578850661820663, "learning_rate": 5.868268248715292e-07, "loss": 0.0458, "step": 740 }, { "epoch": 10.661870503597122, "grad_norm": 2.8376855115850255, "learning_rate": 5.736939240150363e-07, "loss": 0.0357, "step": 741 }, { "epoch": 10.676258992805755, "grad_norm": 2.5385576697329313, "learning_rate": 5.607053043228361e-07, "loss": 0.0331, "step": 742 }, { "epoch": 10.690647482014388, "grad_norm": 1.2033464016461923, "learning_rate": 5.478611646016674e-07, "loss": 0.0207, "step": 743 }, { "epoch": 10.70503597122302, "grad_norm": 0.981803438753484, "learning_rate": 5.35161701446828e-07, "loss": 0.0188, "step": 744 }, { "epoch": 10.719424460431654, "grad_norm": 8.071336752025925, "learning_rate": 5.226071092391616e-07, "loss": 0.0645, "step": 745 }, { "epoch": 10.733812949640289, "grad_norm": 2.41778659174354, "learning_rate": 5.101975801420844e-07, "loss": 0.04, "step": 746 }, { "epoch": 10.748201438848922, "grad_norm": 0.641208094752535, "learning_rate": 4.979333040986434e-07, "loss": 0.0131, "step": 747 }, { "epoch": 10.762589928057555, "grad_norm": 6.926642444043881, "learning_rate": 4.858144688286103e-07, "loss": 0.0465, "step": 748 }, { "epoch": 10.776978417266188, "grad_norm": 2.1100789184063675, "learning_rate": 4.7384125982561035e-07, "loss": 0.0303, "step": 749 }, { "epoch": 10.79136690647482, "grad_norm": 4.2180297630453305, "learning_rate": 4.6201386035427785e-07, "loss": 0.0313, "step": 750 }, { "epoch": 10.805755395683454, "grad_norm": 2.0665863271791167, "learning_rate": 4.503324514474483e-07, "loss": 0.0274, "step": 751 }, { "epoch": 10.820143884892087, "grad_norm": 1.6728982206562557, "learning_rate": 4.387972119034023e-07, "loss": 0.0307, "step": 752 }, { "epoch": 10.83453237410072, "grad_norm": 3.4746815846649746, "learning_rate": 4.274083182831157e-07, "loss": 0.0355, "step": 753 }, { "epoch": 10.848920863309353, "grad_norm": 1.8969392294440341, "learning_rate": 4.161659449075572e-07, "loss": 0.0343, "step": 754 }, { "epoch": 10.863309352517986, "grad_norm": 2.7876842253949112, "learning_rate": 4.0507026385502747e-07, "loss": 0.0467, "step": 755 }, { "epoch": 10.877697841726619, "grad_norm": 4.67027301314563, "learning_rate": 3.9412144495851845e-07, "loss": 0.0436, "step": 756 }, { "epoch": 10.892086330935252, "grad_norm": 1.399538680812464, "learning_rate": 3.833196558031216e-07, "loss": 0.0217, "step": 757 }, { "epoch": 10.906474820143885, "grad_norm": 1.7409190227121822, "learning_rate": 3.7266506172345507e-07, "loss": 0.0372, "step": 758 }, { "epoch": 10.920863309352518, "grad_norm": 2.1365410486540974, "learning_rate": 3.621578258011338e-07, "loss": 0.0245, "step": 759 }, { "epoch": 10.93525179856115, "grad_norm": 3.4216815696618257, "learning_rate": 3.517981088622768e-07, "loss": 0.0447, "step": 760 }, { "epoch": 10.949640287769784, "grad_norm": 2.5234742535562695, "learning_rate": 3.4158606947504944e-07, "loss": 0.0397, "step": 761 }, { "epoch": 10.964028776978417, "grad_norm": 4.126068958805298, "learning_rate": 3.3152186394722506e-07, "loss": 0.0538, "step": 762 }, { "epoch": 10.97841726618705, "grad_norm": 2.2465535716930614, "learning_rate": 3.2160564632380043e-07, "loss": 0.0693, "step": 763 }, { "epoch": 10.992805755395683, "grad_norm": 3.814910469078625, "learning_rate": 3.118375683846353e-07, "loss": 0.0713, "step": 764 }, { "epoch": 11.007194244604317, "grad_norm": 0.870573413202881, "learning_rate": 3.022177796421322e-07, "loss": 0.0141, "step": 765 }, { "epoch": 11.02158273381295, "grad_norm": 2.5693617992072033, "learning_rate": 2.9274642733894577e-07, "loss": 0.0341, "step": 766 }, { "epoch": 11.035971223021583, "grad_norm": 1.6022862996501166, "learning_rate": 2.834236564457271e-07, "loss": 0.0354, "step": 767 }, { "epoch": 11.050359712230216, "grad_norm": 1.0198868538110064, "learning_rate": 2.742496096589076e-07, "loss": 0.0166, "step": 768 }, { "epoch": 11.06474820143885, "grad_norm": 1.9386025908425748, "learning_rate": 2.652244273985127e-07, "loss": 0.0406, "step": 769 }, { "epoch": 11.079136690647482, "grad_norm": 1.506641792856537, "learning_rate": 2.5634824780601753e-07, "loss": 0.0215, "step": 770 }, { "epoch": 11.093525179856115, "grad_norm": 2.8974005417112054, "learning_rate": 2.4762120674222456e-07, "loss": 0.0337, "step": 771 }, { "epoch": 11.107913669064748, "grad_norm": 3.2363471525554632, "learning_rate": 2.390434377851925e-07, "loss": 0.0659, "step": 772 }, { "epoch": 11.122302158273381, "grad_norm": 1.7250018435406151, "learning_rate": 2.3061507222818303e-07, "loss": 0.0264, "step": 773 }, { "epoch": 11.136690647482014, "grad_norm": 1.3716552652282374, "learning_rate": 2.2233623907765956e-07, "loss": 0.0301, "step": 774 }, { "epoch": 11.151079136690647, "grad_norm": 4.081730598474069, "learning_rate": 2.1420706505130728e-07, "loss": 0.0296, "step": 775 }, { "epoch": 11.16546762589928, "grad_norm": 2.1837875002466993, "learning_rate": 2.0622767457609384e-07, "loss": 0.0487, "step": 776 }, { "epoch": 11.179856115107913, "grad_norm": 2.5403013467016016, "learning_rate": 1.983981897863685e-07, "loss": 0.0298, "step": 777 }, { "epoch": 11.194244604316546, "grad_norm": 0.9102126378246074, "learning_rate": 1.9071873052198818e-07, "loss": 0.0198, "step": 778 }, { "epoch": 11.20863309352518, "grad_norm": 1.3128849233164, "learning_rate": 1.8318941432648785e-07, "loss": 0.0193, "step": 779 }, { "epoch": 11.223021582733812, "grad_norm": 1.4735468637747058, "learning_rate": 1.7581035644527623e-07, "loss": 0.0243, "step": 780 }, { "epoch": 11.237410071942445, "grad_norm": 1.8021548024444698, "learning_rate": 1.6858166982387624e-07, "loss": 0.0205, "step": 781 }, { "epoch": 11.251798561151078, "grad_norm": 2.25011481794677, "learning_rate": 1.6150346510619197e-07, "loss": 0.0401, "step": 782 }, { "epoch": 11.266187050359711, "grad_norm": 3.4908449211884482, "learning_rate": 1.5457585063282322e-07, "loss": 0.0739, "step": 783 }, { "epoch": 11.280575539568344, "grad_norm": 0.8061010306090391, "learning_rate": 1.4779893243939358e-07, "loss": 0.0143, "step": 784 }, { "epoch": 11.29496402877698, "grad_norm": 1.6504383979055732, "learning_rate": 1.4117281425494178e-07, "loss": 0.0248, "step": 785 }, { "epoch": 11.309352517985612, "grad_norm": 1.402157952033158, "learning_rate": 1.3469759750032508e-07, "loss": 0.023, "step": 786 }, { "epoch": 11.323741007194245, "grad_norm": 2.645079301698734, "learning_rate": 1.2837338128666942e-07, "loss": 0.0647, "step": 787 }, { "epoch": 11.338129496402878, "grad_norm": 0.7567449101065951, "learning_rate": 1.2220026241385296e-07, "loss": 0.0156, "step": 788 }, { "epoch": 11.352517985611511, "grad_norm": 0.6465755222813994, "learning_rate": 1.1617833536902489e-07, "loss": 0.0179, "step": 789 }, { "epoch": 11.366906474820144, "grad_norm": 1.8698065932207548, "learning_rate": 1.1030769232515559e-07, "loss": 0.023, "step": 790 }, { "epoch": 11.381294964028777, "grad_norm": 1.7988268213081555, "learning_rate": 1.0458842313963102e-07, "loss": 0.0411, "step": 791 }, { "epoch": 11.39568345323741, "grad_norm": 3.6957888652655444, "learning_rate": 9.902061535287278e-08, "loss": 0.0338, "step": 792 }, { "epoch": 11.410071942446043, "grad_norm": 2.7446261934693443, "learning_rate": 9.360435418700131e-08, "loss": 0.0324, "step": 793 }, { "epoch": 11.424460431654676, "grad_norm": 0.7757882042918355, "learning_rate": 8.83397225445315e-08, "loss": 0.0281, "step": 794 }, { "epoch": 11.43884892086331, "grad_norm": 2.857395014664893, "learning_rate": 8.322680100710023e-08, "loss": 0.0336, "step": 795 }, { "epoch": 11.453237410071942, "grad_norm": 0.8599695116705092, "learning_rate": 7.826566783423639e-08, "loss": 0.0153, "step": 796 }, { "epoch": 11.467625899280575, "grad_norm": 1.2009242568158411, "learning_rate": 7.345639896216173e-08, "loss": 0.0218, "step": 797 }, { "epoch": 11.482014388489208, "grad_norm": 1.8513393342942366, "learning_rate": 6.879906800262848e-08, "loss": 0.0248, "step": 798 }, { "epoch": 11.496402877697841, "grad_norm": 1.2374041080258498, "learning_rate": 6.429374624179474e-08, "loss": 0.0187, "step": 799 }, { "epoch": 11.510791366906474, "grad_norm": 1.493116059019553, "learning_rate": 5.994050263912976e-08, "loss": 0.0228, "step": 800 }, { "epoch": 11.525179856115107, "grad_norm": 1.9511941731868687, "learning_rate": 5.573940382636145e-08, "loss": 0.05, "step": 801 }, { "epoch": 11.53956834532374, "grad_norm": 1.6434577538884052, "learning_rate": 5.169051410645276e-08, "loss": 0.0183, "step": 802 }, { "epoch": 11.553956834532373, "grad_norm": 3.362352075361728, "learning_rate": 4.7793895452623584e-08, "loss": 0.037, "step": 803 }, { "epoch": 11.568345323741006, "grad_norm": 5.575378703603965, "learning_rate": 4.4049607507397066e-08, "loss": 0.0794, "step": 804 }, { "epoch": 11.582733812949641, "grad_norm": 3.109823706873304, "learning_rate": 4.045770758168699e-08, "loss": 0.0317, "step": 805 }, { "epoch": 11.597122302158274, "grad_norm": 2.2826191924472394, "learning_rate": 3.701825065392184e-08, "loss": 0.0293, "step": 806 }, { "epoch": 11.611510791366907, "grad_norm": 2.018015360147881, "learning_rate": 3.3731289369206556e-08, "loss": 0.0398, "step": 807 }, { "epoch": 11.62589928057554, "grad_norm": 8.047431870090547, "learning_rate": 3.059687403850986e-08, "loss": 0.1134, "step": 808 }, { "epoch": 11.640287769784173, "grad_norm": 2.298672016403596, "learning_rate": 2.761505263789821e-08, "loss": 0.0243, "step": 809 }, { "epoch": 11.654676258992806, "grad_norm": 2.5580107049006315, "learning_rate": 2.4785870807803036e-08, "loss": 0.0363, "step": 810 }, { "epoch": 11.66906474820144, "grad_norm": 2.1820443354655548, "learning_rate": 2.2109371852317985e-08, "loss": 0.0321, "step": 811 }, { "epoch": 11.683453237410072, "grad_norm": 1.8784576576635295, "learning_rate": 1.9585596738539436e-08, "loss": 0.0294, "step": 812 }, { "epoch": 11.697841726618705, "grad_norm": 1.2733181144368895, "learning_rate": 1.7214584095937015e-08, "loss": 0.0322, "step": 813 }, { "epoch": 11.712230215827338, "grad_norm": 1.4234884359583329, "learning_rate": 1.4996370215765165e-08, "loss": 0.0234, "step": 814 }, { "epoch": 11.726618705035971, "grad_norm": 1.3429094276944762, "learning_rate": 1.2930989050504717e-08, "loss": 0.0166, "step": 815 }, { "epoch": 11.741007194244604, "grad_norm": 0.9942863729919723, "learning_rate": 1.101847221334551e-08, "loss": 0.0178, "step": 816 }, { "epoch": 11.755395683453237, "grad_norm": 4.8436461710096514, "learning_rate": 9.25884897770013e-09, "loss": 0.0356, "step": 817 }, { "epoch": 11.76978417266187, "grad_norm": 3.3015502134325945, "learning_rate": 7.652146276759808e-09, "loss": 0.0656, "step": 818 }, { "epoch": 11.784172661870503, "grad_norm": 1.6919061388445014, "learning_rate": 6.1983887030769855e-09, "loss": 0.0243, "step": 819 }, { "epoch": 11.798561151079136, "grad_norm": 2.579249722184542, "learning_rate": 4.897598508192269e-09, "loss": 0.0411, "step": 820 }, { "epoch": 11.81294964028777, "grad_norm": 1.8241696897466821, "learning_rate": 3.749795602294715e-09, "loss": 0.0294, "step": 821 }, { "epoch": 11.827338129496402, "grad_norm": 2.10962756409833, "learning_rate": 2.7549975539120644e-09, "loss": 0.0307, "step": 822 }, { "epoch": 11.841726618705035, "grad_norm": 2.127397019212184, "learning_rate": 1.9132195896498505e-09, "loss": 0.0287, "step": 823 }, { "epoch": 11.85611510791367, "grad_norm": 1.7438775653570628, "learning_rate": 1.2244745939493651e-09, "loss": 0.023, "step": 824 }, { "epoch": 11.870503597122303, "grad_norm": 1.7148926403499662, "learning_rate": 6.887731088978111e-10, "loss": 0.0321, "step": 825 }, { "epoch": 11.884892086330936, "grad_norm": 3.3795126925432015, "learning_rate": 3.0612333406176976e-10, "loss": 0.0481, "step": 826 }, { "epoch": 11.899280575539569, "grad_norm": 2.9388316113176565, "learning_rate": 7.65311263661861e-11, "loss": 0.0546, "step": 827 }, { "epoch": 11.913669064748202, "grad_norm": 6.73693905281281, "learning_rate": 0.0, "loss": 0.0567, "step": 828 } ], "logging_steps": 1.0, "max_steps": 828, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 200.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 418903697129472.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }