{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5003474635163307, "eval_steps": 500, "global_step": 360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001389854065323141, "grad_norm": 0.7409176826477051, "learning_rate": 3.999721593572758e-05, "loss": 3.5192, "step": 1 }, { "epoch": 0.002779708130646282, "grad_norm": 0.5615135431289673, "learning_rate": 3.9991646190173924e-05, "loss": 3.4286, "step": 2 }, { "epoch": 0.004169562195969423, "grad_norm": 0.5854966044425964, "learning_rate": 3.998328611487523e-05, "loss": 3.4286, "step": 3 }, { "epoch": 0.005559416261292564, "grad_norm": 0.5862582921981812, "learning_rate": 3.997213570983149e-05, "loss": 3.4585, "step": 4 }, { "epoch": 0.006949270326615705, "grad_norm": 0.5955198407173157, "learning_rate": 3.9958194975042716e-05, "loss": 3.1467, "step": 5 }, { "epoch": 0.008339124391938846, "grad_norm": 0.6464918851852417, "learning_rate": 3.9941460272530094e-05, "loss": 3.1823, "step": 6 }, { "epoch": 0.009728978457261988, "grad_norm": 0.6563470959663391, "learning_rate": 3.9921931602293625e-05, "loss": 3.0063, "step": 7 }, { "epoch": 0.011118832522585128, "grad_norm": 0.7250497341156006, "learning_rate": 3.989960896433331e-05, "loss": 2.9268, "step": 8 }, { "epoch": 0.01250868658790827, "grad_norm": 0.7942679524421692, "learning_rate": 3.987449599662796e-05, "loss": 2.9488, "step": 9 }, { "epoch": 0.01389854065323141, "grad_norm": 0.7256530523300171, "learning_rate": 3.984659269917756e-05, "loss": 2.8387, "step": 10 }, { "epoch": 0.015288394718554551, "grad_norm": 0.96202152967453, "learning_rate": 3.981589907198213e-05, "loss": 2.7071, "step": 11 }, { "epoch": 0.01667824878387769, "grad_norm": 0.8117583394050598, "learning_rate": 3.978242239099927e-05, "loss": 2.4934, "step": 12 }, { "epoch": 0.018068102849200834, "grad_norm": 0.9094793796539307, "learning_rate": 3.974616265622899e-05, "loss": 2.6515, "step": 13 }, { "epoch": 0.019457956914523976, "grad_norm": 0.8784312009811401, "learning_rate": 3.9707127143628895e-05, "loss": 2.2521, "step": 14 }, { "epoch": 0.020847810979847115, "grad_norm": 0.8378123641014099, "learning_rate": 3.966531585319899e-05, "loss": 2.6406, "step": 15 }, { "epoch": 0.022237665045170257, "grad_norm": 0.8524578809738159, "learning_rate": 3.962073606089689e-05, "loss": 2.5555, "step": 16 }, { "epoch": 0.0236275191104934, "grad_norm": 1.0161993503570557, "learning_rate": 3.9573395042680204e-05, "loss": 2.6443, "step": 17 }, { "epoch": 0.02501737317581654, "grad_norm": 0.8594086766242981, "learning_rate": 3.9523296436527744e-05, "loss": 2.4282, "step": 18 }, { "epoch": 0.02640722724113968, "grad_norm": 0.8018243312835693, "learning_rate": 3.947044751839712e-05, "loss": 2.2656, "step": 19 }, { "epoch": 0.02779708130646282, "grad_norm": 0.833044171333313, "learning_rate": 3.9414855564245954e-05, "loss": 2.2516, "step": 20 }, { "epoch": 0.029186935371785964, "grad_norm": 0.8001306056976318, "learning_rate": 3.935652785003185e-05, "loss": 2.334, "step": 21 }, { "epoch": 0.030576789437109102, "grad_norm": 1.0083926916122437, "learning_rate": 3.929547528969124e-05, "loss": 2.2434, "step": 22 }, { "epoch": 0.031966643502432245, "grad_norm": 0.8387123942375183, "learning_rate": 3.923170515918173e-05, "loss": 2.1532, "step": 23 }, { "epoch": 0.03335649756775538, "grad_norm": 0.8073887825012207, "learning_rate": 3.916522837243974e-05, "loss": 2.3309, "step": 24 }, { "epoch": 0.03474635163307853, "grad_norm": 0.709004819393158, "learning_rate": 3.90960558434017e-05, "loss": 2.18, "step": 25 }, { "epoch": 0.03613620569840167, "grad_norm": 0.674148440361023, "learning_rate": 3.9024198486004025e-05, "loss": 2.1713, "step": 26 }, { "epoch": 0.037526059763724806, "grad_norm": 0.7956393361091614, "learning_rate": 3.894966721418314e-05, "loss": 2.3782, "step": 27 }, { "epoch": 0.03891591382904795, "grad_norm": 0.6804589629173279, "learning_rate": 3.887247294187546e-05, "loss": 2.1933, "step": 28 }, { "epoch": 0.04030576789437109, "grad_norm": 1.6819874048233032, "learning_rate": 3.8792630220996216e-05, "loss": 2.2018, "step": 29 }, { "epoch": 0.04169562195969423, "grad_norm": 0.685869574546814, "learning_rate": 3.871014996548183e-05, "loss": 2.2206, "step": 30 }, { "epoch": 0.043085476025017375, "grad_norm": 0.6232013702392578, "learning_rate": 3.862505036522634e-05, "loss": 2.1251, "step": 31 }, { "epoch": 0.04447533009034051, "grad_norm": 0.609855592250824, "learning_rate": 3.853733869618736e-05, "loss": 2.2228, "step": 32 }, { "epoch": 0.04586518415566365, "grad_norm": 0.6258031725883484, "learning_rate": 3.844703678623773e-05, "loss": 2.2185, "step": 33 }, { "epoch": 0.0472550382209868, "grad_norm": 0.5621820688247681, "learning_rate": 3.835415554931387e-05, "loss": 2.168, "step": 34 }, { "epoch": 0.048644892286309936, "grad_norm": 0.5401716828346252, "learning_rate": 3.825871317530982e-05, "loss": 2.175, "step": 35 }, { "epoch": 0.05003474635163308, "grad_norm": 0.5031033754348755, "learning_rate": 3.816072421614081e-05, "loss": 2.0321, "step": 36 }, { "epoch": 0.05142460041695622, "grad_norm": 0.49892041087150574, "learning_rate": 3.806021049967967e-05, "loss": 2.2909, "step": 37 }, { "epoch": 0.05281445448227936, "grad_norm": 0.47056782245635986, "learning_rate": 3.795718293986283e-05, "loss": 2.1851, "step": 38 }, { "epoch": 0.054204308547602505, "grad_norm": 0.4496292173862457, "learning_rate": 3.785166336456314e-05, "loss": 2.0675, "step": 39 }, { "epoch": 0.05559416261292564, "grad_norm": 0.473352313041687, "learning_rate": 3.774366996367462e-05, "loss": 2.2851, "step": 40 }, { "epoch": 0.05698401667824878, "grad_norm": 0.4474506378173828, "learning_rate": 3.7633220927091315e-05, "loss": 2.1041, "step": 41 }, { "epoch": 0.05837387074357193, "grad_norm": 0.4823157489299774, "learning_rate": 3.752033808268607e-05, "loss": 2.0805, "step": 42 }, { "epoch": 0.059763724808895066, "grad_norm": 0.45550891757011414, "learning_rate": 3.740503962035291e-05, "loss": 2.2575, "step": 43 }, { "epoch": 0.061153578874218205, "grad_norm": 0.3821059763431549, "learning_rate": 3.7287347367964685e-05, "loss": 2.1388, "step": 44 }, { "epoch": 0.06254343293954134, "grad_norm": 0.36882323026657104, "learning_rate": 3.716727951541543e-05, "loss": 2.1219, "step": 45 }, { "epoch": 0.06393328700486449, "grad_norm": 0.38791537284851074, "learning_rate": 3.7044861528556794e-05, "loss": 2.1315, "step": 46 }, { "epoch": 0.06532314107018763, "grad_norm": 0.3582218885421753, "learning_rate": 3.692011523526162e-05, "loss": 2.2317, "step": 47 }, { "epoch": 0.06671299513551077, "grad_norm": 0.329677551984787, "learning_rate": 3.679306246340275e-05, "loss": 2.0869, "step": 48 }, { "epoch": 0.06810284920083391, "grad_norm": 0.32158908247947693, "learning_rate": 3.6663725040853024e-05, "loss": 2.3178, "step": 49 }, { "epoch": 0.06949270326615706, "grad_norm": 0.384804904460907, "learning_rate": 3.653212479548529e-05, "loss": 2.1155, "step": 50 }, { "epoch": 0.07088255733148019, "grad_norm": 0.3261670768260956, "learning_rate": 3.639828719315119e-05, "loss": 2.2381, "step": 51 }, { "epoch": 0.07227241139680333, "grad_norm": 0.3406740128993988, "learning_rate": 3.626223769970238e-05, "loss": 2.0653, "step": 52 }, { "epoch": 0.07366226546212648, "grad_norm": 0.32772770524024963, "learning_rate": 3.612400178099051e-05, "loss": 2.0903, "step": 53 }, { "epoch": 0.07505211952744961, "grad_norm": 0.3069205582141876, "learning_rate": 3.598360126488842e-05, "loss": 1.9182, "step": 54 }, { "epoch": 0.07644197359277276, "grad_norm": 0.2695797085762024, "learning_rate": 3.5841068893205374e-05, "loss": 1.97, "step": 55 }, { "epoch": 0.0778318276580959, "grad_norm": 0.37067824602127075, "learning_rate": 3.569642285583541e-05, "loss": 2.2509, "step": 56 }, { "epoch": 0.07922168172341904, "grad_norm": 0.3385728895664215, "learning_rate": 3.5549695894587785e-05, "loss": 2.2857, "step": 57 }, { "epoch": 0.08061153578874218, "grad_norm": 0.28427156805992126, "learning_rate": 3.540090983733535e-05, "loss": 2.0356, "step": 58 }, { "epoch": 0.08200138985406533, "grad_norm": 0.3377193510532379, "learning_rate": 3.5250093787908554e-05, "loss": 2.1744, "step": 59 }, { "epoch": 0.08339124391938846, "grad_norm": 0.27540090680122375, "learning_rate": 3.509727321215905e-05, "loss": 2.1665, "step": 60 }, { "epoch": 0.0847810979847116, "grad_norm": 0.3422718644142151, "learning_rate": 3.494248085189611e-05, "loss": 1.9342, "step": 61 }, { "epoch": 0.08617095205003475, "grad_norm": 0.2574728727340698, "learning_rate": 3.478574217297137e-05, "loss": 2.2009, "step": 62 }, { "epoch": 0.08756080611535788, "grad_norm": 0.2330848127603531, "learning_rate": 3.462708627921529e-05, "loss": 2.1879, "step": 63 }, { "epoch": 0.08895066018068103, "grad_norm": 0.2683883309364319, "learning_rate": 3.4466542274458334e-05, "loss": 2.1559, "step": 64 }, { "epoch": 0.09034051424600417, "grad_norm": 0.21715426445007324, "learning_rate": 3.430413926253095e-05, "loss": 2.0487, "step": 65 }, { "epoch": 0.0917303683113273, "grad_norm": 0.23342643678188324, "learning_rate": 3.413990998524241e-05, "loss": 2.1284, "step": 66 }, { "epoch": 0.09312022237665045, "grad_norm": 0.2351645976305008, "learning_rate": 3.397387990844436e-05, "loss": 2.113, "step": 67 }, { "epoch": 0.0945100764419736, "grad_norm": 0.352430135011673, "learning_rate": 3.380608541192487e-05, "loss": 2.2563, "step": 68 }, { "epoch": 0.09589993050729674, "grad_norm": 0.2351987361907959, "learning_rate": 3.363655196153559e-05, "loss": 2.1185, "step": 69 }, { "epoch": 0.09728978457261987, "grad_norm": 0.25391116738319397, "learning_rate": 3.346531229908578e-05, "loss": 1.9646, "step": 70 }, { "epoch": 0.09867963863794302, "grad_norm": 0.24878591299057007, "learning_rate": 3.3292395528405905e-05, "loss": 1.8972, "step": 71 }, { "epoch": 0.10006949270326616, "grad_norm": 0.2513987421989441, "learning_rate": 3.311783439130522e-05, "loss": 2.3366, "step": 72 }, { "epoch": 0.1014593467685893, "grad_norm": 0.22613292932510376, "learning_rate": 3.2941661629593e-05, "loss": 2.0588, "step": 73 }, { "epoch": 0.10284920083391244, "grad_norm": 0.21839161217212677, "learning_rate": 3.27639099850785e-05, "loss": 2.0658, "step": 74 }, { "epoch": 0.10423905489923559, "grad_norm": 0.21652136743068695, "learning_rate": 3.2584608561592177e-05, "loss": 1.9668, "step": 75 }, { "epoch": 0.10562890896455872, "grad_norm": 0.33612847328186035, "learning_rate": 3.2403793738922104e-05, "loss": 1.9812, "step": 76 }, { "epoch": 0.10701876302988186, "grad_norm": 0.2500985264778137, "learning_rate": 3.2221498258877546e-05, "loss": 1.9848, "step": 77 }, { "epoch": 0.10840861709520501, "grad_norm": 0.2622527778148651, "learning_rate": 3.203775122528896e-05, "loss": 2.2417, "step": 78 }, { "epoch": 0.10979847116052814, "grad_norm": 0.22762766480445862, "learning_rate": 3.185258901794441e-05, "loss": 2.2114, "step": 79 }, { "epoch": 0.11118832522585129, "grad_norm": 0.18959005177021027, "learning_rate": 3.166604437865317e-05, "loss": 1.9689, "step": 80 }, { "epoch": 0.11257817929117443, "grad_norm": 0.2245381474494934, "learning_rate": 3.1478150049224496e-05, "loss": 2.0861, "step": 81 }, { "epoch": 0.11396803335649756, "grad_norm": 0.2584340572357178, "learning_rate": 3.1288938771467656e-05, "loss": 2.0639, "step": 82 }, { "epoch": 0.11535788742182071, "grad_norm": 0.22430986166000366, "learning_rate": 3.109844692517072e-05, "loss": 1.9695, "step": 83 }, { "epoch": 0.11674774148714386, "grad_norm": 0.27213403582572937, "learning_rate": 3.090670725214295e-05, "loss": 2.0551, "step": 84 }, { "epoch": 0.11813759555246699, "grad_norm": 0.23510581254959106, "learning_rate": 3.0713752494193614e-05, "loss": 2.1565, "step": 85 }, { "epoch": 0.11952744961779013, "grad_norm": 0.4063816964626312, "learning_rate": 3.051961903111078e-05, "loss": 2.2242, "step": 86 }, { "epoch": 0.12091730368311328, "grad_norm": 0.3135417401790619, "learning_rate": 3.032433960470371e-05, "loss": 2.2841, "step": 87 }, { "epoch": 0.12230715774843641, "grad_norm": 0.2632151246070862, "learning_rate": 3.0127950594760478e-05, "loss": 2.1075, "step": 88 }, { "epoch": 0.12369701181375956, "grad_norm": 0.2494344264268875, "learning_rate": 2.9930484743090346e-05, "loss": 2.1096, "step": 89 }, { "epoch": 0.1250868658790827, "grad_norm": 0.2007075548171997, "learning_rate": 2.9731978429481387e-05, "loss": 2.1779, "step": 90 }, { "epoch": 0.12647671994440585, "grad_norm": 0.21900464594364166, "learning_rate": 2.9532464395742863e-05, "loss": 2.0586, "step": 91 }, { "epoch": 0.12786657400972898, "grad_norm": 0.2916090190410614, "learning_rate": 2.9331979021662846e-05, "loss": 1.9652, "step": 92 }, { "epoch": 0.1292564280750521, "grad_norm": 0.25345149636268616, "learning_rate": 2.9130556868040003e-05, "loss": 2.0456, "step": 93 }, { "epoch": 0.13064628214037527, "grad_norm": 0.33592185378074646, "learning_rate": 2.8928230676683597e-05, "loss": 2.0481, "step": 94 }, { "epoch": 0.1320361362056984, "grad_norm": 0.2137216031551361, "learning_rate": 2.87250368273817e-05, "loss": 1.9936, "step": 95 }, { "epoch": 0.13342599027102153, "grad_norm": 0.1986539214849472, "learning_rate": 2.8521011699922383e-05, "loss": 2.101, "step": 96 }, { "epoch": 0.1348158443363447, "grad_norm": 0.19974084198474884, "learning_rate": 2.8316189855104312e-05, "loss": 2.1086, "step": 97 }, { "epoch": 0.13620569840166782, "grad_norm": 0.22967414557933807, "learning_rate": 2.8110605853726156e-05, "loss": 1.9475, "step": 98 }, { "epoch": 0.13759555246699096, "grad_norm": 0.21497172117233276, "learning_rate": 2.790429425658658e-05, "loss": 2.1221, "step": 99 }, { "epoch": 0.13898540653231412, "grad_norm": 0.2060205489397049, "learning_rate": 2.7697289624484256e-05, "loss": 2.157, "step": 100 }, { "epoch": 0.14037526059763725, "grad_norm": 0.2061186283826828, "learning_rate": 2.7489626518217847e-05, "loss": 1.9418, "step": 101 }, { "epoch": 0.14176511466296038, "grad_norm": 0.2318679243326187, "learning_rate": 2.7281341317575425e-05, "loss": 1.9337, "step": 102 }, { "epoch": 0.14315496872828354, "grad_norm": 0.19199368357658386, "learning_rate": 2.7072468583355658e-05, "loss": 2.1347, "step": 103 }, { "epoch": 0.14454482279360667, "grad_norm": 0.5738226175308228, "learning_rate": 2.686304105736781e-05, "loss": 1.9709, "step": 104 }, { "epoch": 0.1459346768589298, "grad_norm": 0.2086687982082367, "learning_rate": 2.665309511939995e-05, "loss": 2.1757, "step": 105 }, { "epoch": 0.14732453092425296, "grad_norm": 0.18678060173988342, "learning_rate": 2.6442665330250748e-05, "loss": 1.9203, "step": 106 }, { "epoch": 0.1487143849895761, "grad_norm": 0.24349789321422577, "learning_rate": 2.623178625071887e-05, "loss": 2.0041, "step": 107 }, { "epoch": 0.15010423905489922, "grad_norm": 0.44293686747550964, "learning_rate": 2.602049062261358e-05, "loss": 2.0651, "step": 108 }, { "epoch": 0.15149409312022238, "grad_norm": 0.23788246512413025, "learning_rate": 2.5808814825722948e-05, "loss": 2.0853, "step": 109 }, { "epoch": 0.15288394718554552, "grad_norm": 0.24998068809509277, "learning_rate": 2.5596793420845643e-05, "loss": 2.1346, "step": 110 }, { "epoch": 0.15427380125086865, "grad_norm": 0.23647256195545197, "learning_rate": 2.5384459149790928e-05, "loss": 1.9715, "step": 111 }, { "epoch": 0.1556636553161918, "grad_norm": 0.31655153632164, "learning_rate": 2.517184657335747e-05, "loss": 2.0115, "step": 112 }, { "epoch": 0.15705350938151494, "grad_norm": 0.19617554545402527, "learning_rate": 2.4958988433354534e-05, "loss": 2.0552, "step": 113 }, { "epoch": 0.15844336344683807, "grad_norm": 0.22154586017131805, "learning_rate": 2.4745919290580787e-05, "loss": 2.0653, "step": 114 }, { "epoch": 0.15983321751216123, "grad_norm": 0.2188163846731186, "learning_rate": 2.4532671886845492e-05, "loss": 2.2253, "step": 115 }, { "epoch": 0.16122307157748436, "grad_norm": 0.19053936004638672, "learning_rate": 2.4319278963957913e-05, "loss": 2.0672, "step": 116 }, { "epoch": 0.1626129256428075, "grad_norm": 0.2588854432106018, "learning_rate": 2.410577508271672e-05, "loss": 2.0342, "step": 117 }, { "epoch": 0.16400277970813065, "grad_norm": 0.2963363230228424, "learning_rate": 2.389219298493117e-05, "loss": 2.0387, "step": 118 }, { "epoch": 0.16539263377345378, "grad_norm": 0.23298707604408264, "learning_rate": 2.3678565412410535e-05, "loss": 2.0423, "step": 119 }, { "epoch": 0.16678248783877692, "grad_norm": 0.20690463483333588, "learning_rate": 2.3464925106964074e-05, "loss": 1.9764, "step": 120 }, { "epoch": 0.16817234190410008, "grad_norm": 0.23895452916622162, "learning_rate": 2.325130299141165e-05, "loss": 1.9615, "step": 121 }, { "epoch": 0.1695621959694232, "grad_norm": 0.2269681841135025, "learning_rate": 2.3037731807562523e-05, "loss": 1.8685, "step": 122 }, { "epoch": 0.17095205003474634, "grad_norm": 0.26502808928489685, "learning_rate": 2.2824242478236556e-05, "loss": 1.9815, "step": 123 }, { "epoch": 0.1723419041000695, "grad_norm": 0.24106465280056, "learning_rate": 2.2610867745243013e-05, "loss": 2.0057, "step": 124 }, { "epoch": 0.17373175816539263, "grad_norm": 0.1920744776725769, "learning_rate": 2.2397638531401753e-05, "loss": 1.9905, "step": 125 }, { "epoch": 0.17512161223071576, "grad_norm": 0.299200177192688, "learning_rate": 2.2184585759532638e-05, "loss": 1.8186, "step": 126 }, { "epoch": 0.17651146629603892, "grad_norm": 0.21507440507411957, "learning_rate": 2.1971740352455527e-05, "loss": 1.965, "step": 127 }, { "epoch": 0.17790132036136205, "grad_norm": 0.2105863094329834, "learning_rate": 2.1759131414000876e-05, "loss": 2.115, "step": 128 }, { "epoch": 0.17929117442668518, "grad_norm": 0.21202439069747925, "learning_rate": 2.154679168597795e-05, "loss": 1.9482, "step": 129 }, { "epoch": 0.18068102849200834, "grad_norm": 0.21153898537158966, "learning_rate": 2.1334748453227803e-05, "loss": 1.9907, "step": 130 }, { "epoch": 0.18207088255733148, "grad_norm": 0.22135163843631744, "learning_rate": 2.1123032638570294e-05, "loss": 2.1229, "step": 131 }, { "epoch": 0.1834607366226546, "grad_norm": 0.2102488875389099, "learning_rate": 2.091167334583588e-05, "loss": 2.1547, "step": 132 }, { "epoch": 0.18485059068797777, "grad_norm": 0.500489354133606, "learning_rate": 2.070070149784442e-05, "loss": 2.0868, "step": 133 }, { "epoch": 0.1862404447533009, "grad_norm": 0.3173089325428009, "learning_rate": 2.049014437943697e-05, "loss": 1.8882, "step": 134 }, { "epoch": 0.18763029881862406, "grad_norm": 0.2581390142440796, "learning_rate": 2.028002927545458e-05, "loss": 2.0773, "step": 135 }, { "epoch": 0.1890201528839472, "grad_norm": 0.20216765999794006, "learning_rate": 2.007038528972771e-05, "loss": 2.0587, "step": 136 }, { "epoch": 0.19041000694927032, "grad_norm": 0.20798902213573456, "learning_rate": 1.986123970709741e-05, "loss": 1.9818, "step": 137 }, { "epoch": 0.19179986101459348, "grad_norm": 0.20478519797325134, "learning_rate": 1.9652619812404737e-05, "loss": 2.1231, "step": 138 }, { "epoch": 0.1931897150799166, "grad_norm": 0.2358742356300354, "learning_rate": 1.944455289049074e-05, "loss": 1.9059, "step": 139 }, { "epoch": 0.19457956914523974, "grad_norm": 0.19055970013141632, "learning_rate": 1.9237066226196475e-05, "loss": 1.9074, "step": 140 }, { "epoch": 0.1959694232105629, "grad_norm": 0.21943627297878265, "learning_rate": 1.9030183466384187e-05, "loss": 1.887, "step": 141 }, { "epoch": 0.19735927727588604, "grad_norm": 0.20230762660503387, "learning_rate": 1.8823933714884333e-05, "loss": 2.0427, "step": 142 }, { "epoch": 0.19874913134120917, "grad_norm": 0.2051026076078415, "learning_rate": 1.861834061855916e-05, "loss": 1.8915, "step": 143 }, { "epoch": 0.20013898540653233, "grad_norm": 1.0352438688278198, "learning_rate": 1.8413429643260315e-05, "loss": 1.9431, "step": 144 }, { "epoch": 0.20152883947185546, "grad_norm": 0.2811901271343231, "learning_rate": 1.8209228073828854e-05, "loss": 1.897, "step": 145 }, { "epoch": 0.2029186935371786, "grad_norm": 0.46414074301719666, "learning_rate": 1.8005757738137618e-05, "loss": 2.0354, "step": 146 }, { "epoch": 0.20430854760250175, "grad_norm": 0.22770829498767853, "learning_rate": 1.7803044102038257e-05, "loss": 2.0427, "step": 147 }, { "epoch": 0.20569840166782488, "grad_norm": 0.2260872721672058, "learning_rate": 1.7601108993403614e-05, "loss": 1.9558, "step": 148 }, { "epoch": 0.207088255733148, "grad_norm": 0.1908380687236786, "learning_rate": 1.7399976059095934e-05, "loss": 2.0494, "step": 149 }, { "epoch": 0.20847810979847117, "grad_norm": 0.19630920886993408, "learning_rate": 1.7199668945977464e-05, "loss": 1.893, "step": 150 }, { "epoch": 0.2098679638637943, "grad_norm": 0.18318656086921692, "learning_rate": 1.7000209481921047e-05, "loss": 1.8479, "step": 151 }, { "epoch": 0.21125781792911744, "grad_norm": 0.20488278567790985, "learning_rate": 1.6801621313788928e-05, "loss": 1.9398, "step": 152 }, { "epoch": 0.2126476719944406, "grad_norm": 0.18846479058265686, "learning_rate": 1.6603924450464547e-05, "loss": 1.8462, "step": 153 }, { "epoch": 0.21403752605976373, "grad_norm": 0.1990012526512146, "learning_rate": 1.6407140719820745e-05, "loss": 1.8491, "step": 154 }, { "epoch": 0.21542738012508686, "grad_norm": 0.20991306006908417, "learning_rate": 1.6211291949730366e-05, "loss": 2.0807, "step": 155 }, { "epoch": 0.21681723419041002, "grad_norm": 0.1805444061756134, "learning_rate": 1.6016396330087446e-05, "loss": 1.8295, "step": 156 }, { "epoch": 0.21820708825573315, "grad_norm": 0.22825907170772552, "learning_rate": 1.5822475688764825e-05, "loss": 2.0153, "step": 157 }, { "epoch": 0.21959694232105628, "grad_norm": 0.17874298989772797, "learning_rate": 1.562954821565654e-05, "loss": 1.8857, "step": 158 }, { "epoch": 0.22098679638637944, "grad_norm": 0.20816968381404877, "learning_rate": 1.5437633919646032e-05, "loss": 2.0169, "step": 159 }, { "epoch": 0.22237665045170257, "grad_norm": 0.1875351518392563, "learning_rate": 1.5246751900122035e-05, "loss": 2.0116, "step": 160 }, { "epoch": 0.2237665045170257, "grad_norm": 0.1880957931280136, "learning_rate": 1.5056919437483884e-05, "loss": 1.9458, "step": 161 }, { "epoch": 0.22515635858234886, "grad_norm": 0.2217608243227005, "learning_rate": 1.4868154721625615e-05, "loss": 2.1639, "step": 162 }, { "epoch": 0.226546212647672, "grad_norm": 0.21479056775569916, "learning_rate": 1.468047503294656e-05, "loss": 2.0, "step": 163 }, { "epoch": 0.22793606671299513, "grad_norm": 0.2338622510433197, "learning_rate": 1.4493896742351353e-05, "loss": 2.0731, "step": 164 }, { "epoch": 0.2293259207783183, "grad_norm": 0.23026101291179657, "learning_rate": 1.4308437130239327e-05, "loss": 2.1193, "step": 165 }, { "epoch": 0.23071577484364142, "grad_norm": 0.21515104174613953, "learning_rate": 1.4124113477009814e-05, "loss": 1.966, "step": 166 }, { "epoch": 0.23210562890896455, "grad_norm": 0.22686313092708588, "learning_rate": 1.3940940334578045e-05, "loss": 2.0554, "step": 167 }, { "epoch": 0.2334954829742877, "grad_norm": 0.22290675342082977, "learning_rate": 1.3758932254859246e-05, "loss": 2.1244, "step": 168 }, { "epoch": 0.23488533703961084, "grad_norm": 0.19737748801708221, "learning_rate": 1.357810469926335e-05, "loss": 2.0386, "step": 169 }, { "epoch": 0.23627519110493397, "grad_norm": 0.2284354418516159, "learning_rate": 1.3398472219705582e-05, "loss": 1.9145, "step": 170 }, { "epoch": 0.23766504517025713, "grad_norm": 0.18580572307109833, "learning_rate": 1.322004845860647e-05, "loss": 1.8991, "step": 171 }, { "epoch": 0.23905489923558026, "grad_norm": 0.18737782537937164, "learning_rate": 1.3042847058386542e-05, "loss": 1.8709, "step": 172 }, { "epoch": 0.2404447533009034, "grad_norm": 0.2080836445093155, "learning_rate": 1.2866880751971621e-05, "loss": 1.7018, "step": 173 }, { "epoch": 0.24183460736622656, "grad_norm": 0.20678725838661194, "learning_rate": 1.2692163181782234e-05, "loss": 1.7609, "step": 174 }, { "epoch": 0.2432244614315497, "grad_norm": 0.21453110873699188, "learning_rate": 1.2518705261754803e-05, "loss": 2.1415, "step": 175 }, { "epoch": 0.24461431549687282, "grad_norm": 0.23385198414325714, "learning_rate": 1.234651881532045e-05, "loss": 1.8789, "step": 176 }, { "epoch": 0.24600416956219598, "grad_norm": 0.18875467777252197, "learning_rate": 1.2175616575405002e-05, "loss": 1.7596, "step": 177 }, { "epoch": 0.2473940236275191, "grad_norm": 0.22677035629749298, "learning_rate": 1.2006007636955474e-05, "loss": 2.0212, "step": 178 }, { "epoch": 0.24878387769284224, "grad_norm": 0.16881951689720154, "learning_rate": 1.183770382340299e-05, "loss": 1.7595, "step": 179 }, { "epoch": 0.2501737317581654, "grad_norm": 0.21647167205810547, "learning_rate": 1.1670714229694568e-05, "loss": 2.1274, "step": 180 }, { "epoch": 0.25156358582348853, "grad_norm": 0.19476264715194702, "learning_rate": 1.1505048860271927e-05, "loss": 2.0031, "step": 181 }, { "epoch": 0.2529534398888117, "grad_norm": 0.18317262828350067, "learning_rate": 1.1340716810082085e-05, "loss": 1.7918, "step": 182 }, { "epoch": 0.2543432939541348, "grad_norm": 0.2024323046207428, "learning_rate": 1.117772717407206e-05, "loss": 2.0286, "step": 183 }, { "epoch": 0.25573314801945796, "grad_norm": 0.19228191673755646, "learning_rate": 1.1016087228199467e-05, "loss": 1.7245, "step": 184 }, { "epoch": 0.2571230020847811, "grad_norm": 0.3028217852115631, "learning_rate": 1.0855806067411322e-05, "loss": 2.0536, "step": 185 }, { "epoch": 0.2585128561501042, "grad_norm": 0.2116047590970993, "learning_rate": 1.0696890058170538e-05, "loss": 2.0023, "step": 186 }, { "epoch": 0.2599027102154274, "grad_norm": 0.19894298911094666, "learning_rate": 1.0539347385929432e-05, "loss": 1.9964, "step": 187 }, { "epoch": 0.26129256428075054, "grad_norm": 0.20690584182739258, "learning_rate": 1.0383184417150915e-05, "loss": 1.7767, "step": 188 }, { "epoch": 0.26268241834607364, "grad_norm": 0.20272022485733032, "learning_rate": 1.02284075182979e-05, "loss": 1.8808, "step": 189 }, { "epoch": 0.2640722724113968, "grad_norm": 0.20660698413848877, "learning_rate": 1.0075022146338597e-05, "loss": 1.9042, "step": 190 }, { "epoch": 0.26546212647671996, "grad_norm": 0.19578257203102112, "learning_rate": 9.923033758241218e-06, "loss": 1.925, "step": 191 }, { "epoch": 0.26685198054204307, "grad_norm": 0.22218400239944458, "learning_rate": 9.77244690147927e-06, "loss": 1.982, "step": 192 }, { "epoch": 0.2682418346073662, "grad_norm": 0.22300571203231812, "learning_rate": 9.623267033020966e-06, "loss": 1.9447, "step": 193 }, { "epoch": 0.2696316886726894, "grad_norm": 0.1989213228225708, "learning_rate": 9.475497790845111e-06, "loss": 2.0578, "step": 194 }, { "epoch": 0.2710215427380125, "grad_norm": 0.23240365087985992, "learning_rate": 9.329143722425215e-06, "loss": 1.9087, "step": 195 }, { "epoch": 0.27241139680333565, "grad_norm": 0.2053285837173462, "learning_rate": 9.184207556245383e-06, "loss": 1.9962, "step": 196 }, { "epoch": 0.2738012508686588, "grad_norm": 0.21051867306232452, "learning_rate": 9.040692930284422e-06, "loss": 1.9418, "step": 197 }, { "epoch": 0.2751911049339819, "grad_norm": 0.30462223291397095, "learning_rate": 8.898602573026437e-06, "loss": 1.818, "step": 198 }, { "epoch": 0.27658095899930507, "grad_norm": 0.20464156568050385, "learning_rate": 8.757938303460833e-06, "loss": 1.8764, "step": 199 }, { "epoch": 0.27797081306462823, "grad_norm": 0.23925362527370453, "learning_rate": 8.618702850071713e-06, "loss": 1.812, "step": 200 }, { "epoch": 0.27936066712995133, "grad_norm": 0.19577813148498535, "learning_rate": 8.480897122353781e-06, "loss": 1.8954, "step": 201 }, { "epoch": 0.2807505211952745, "grad_norm": 0.20012779533863068, "learning_rate": 8.344522029801738e-06, "loss": 2.0259, "step": 202 }, { "epoch": 0.28214037526059765, "grad_norm": 0.23986053466796875, "learning_rate": 8.209579391404986e-06, "loss": 1.9023, "step": 203 }, { "epoch": 0.28353022932592076, "grad_norm": 0.19048075377941132, "learning_rate": 8.076069207163528e-06, "loss": 1.7609, "step": 204 }, { "epoch": 0.2849200833912439, "grad_norm": 0.20962846279144287, "learning_rate": 7.943991477077361e-06, "loss": 1.8119, "step": 205 }, { "epoch": 0.2863099374565671, "grad_norm": 0.2099342793226242, "learning_rate": 7.813346201146487e-06, "loss": 2.034, "step": 206 }, { "epoch": 0.2876997915218902, "grad_norm": 0.22258687019348145, "learning_rate": 7.684133379370905e-06, "loss": 1.9987, "step": 207 }, { "epoch": 0.28908964558721334, "grad_norm": 0.2048138529062271, "learning_rate": 7.556351192761213e-06, "loss": 2.0468, "step": 208 }, { "epoch": 0.2904794996525365, "grad_norm": 0.23182573914527893, "learning_rate": 7.429999186570058e-06, "loss": 1.9998, "step": 209 }, { "epoch": 0.2918693537178596, "grad_norm": 0.21405896544456482, "learning_rate": 7.305075087060686e-06, "loss": 1.8496, "step": 210 }, { "epoch": 0.29325920778318276, "grad_norm": 0.21509391069412231, "learning_rate": 7.181577529991046e-06, "loss": 2.102, "step": 211 }, { "epoch": 0.2946490618485059, "grad_norm": 0.2172914743423462, "learning_rate": 7.059504696371732e-06, "loss": 1.9417, "step": 212 }, { "epoch": 0.296038915913829, "grad_norm": 0.19544851779937744, "learning_rate": 6.938853857718641e-06, "loss": 1.8943, "step": 213 }, { "epoch": 0.2974287699791522, "grad_norm": 0.25898879766464233, "learning_rate": 6.819621830800315e-06, "loss": 1.9846, "step": 214 }, { "epoch": 0.29881862404447534, "grad_norm": 0.19946029782295227, "learning_rate": 6.701806341880001e-06, "loss": 1.8731, "step": 215 }, { "epoch": 0.30020847810979845, "grad_norm": 0.2002125233411789, "learning_rate": 6.58540329823154e-06, "loss": 1.6768, "step": 216 }, { "epoch": 0.3015983321751216, "grad_norm": 0.1993391513824463, "learning_rate": 6.470409061876126e-06, "loss": 2.0811, "step": 217 }, { "epoch": 0.30298818624044477, "grad_norm": 0.18970079720020294, "learning_rate": 6.35681999483495e-06, "loss": 1.7993, "step": 218 }, { "epoch": 0.30437804030576787, "grad_norm": 0.24260976910591125, "learning_rate": 6.244632004381856e-06, "loss": 1.9406, "step": 219 }, { "epoch": 0.30576789437109103, "grad_norm": 0.18580634891986847, "learning_rate": 6.133840088295983e-06, "loss": 1.8223, "step": 220 }, { "epoch": 0.3071577484364142, "grad_norm": 0.1760164499282837, "learning_rate": 6.024440153851174e-06, "loss": 1.7043, "step": 221 }, { "epoch": 0.3085476025017373, "grad_norm": 0.2013455331325531, "learning_rate": 5.9164262893318664e-06, "loss": 1.9141, "step": 222 }, { "epoch": 0.30993745656706045, "grad_norm": 0.24060115218162537, "learning_rate": 5.809793947264552e-06, "loss": 1.9541, "step": 223 }, { "epoch": 0.3113273106323836, "grad_norm": 0.2021775245666504, "learning_rate": 5.704537215933669e-06, "loss": 1.6457, "step": 224 }, { "epoch": 0.3127171646977067, "grad_norm": 0.2032179832458496, "learning_rate": 5.6006506383710075e-06, "loss": 1.8436, "step": 225 }, { "epoch": 0.3141070187630299, "grad_norm": 0.19261512160301208, "learning_rate": 5.498127848113654e-06, "loss": 2.0785, "step": 226 }, { "epoch": 0.31549687282835304, "grad_norm": 0.19023051857948303, "learning_rate": 5.396962478698697e-06, "loss": 2.0608, "step": 227 }, { "epoch": 0.31688672689367614, "grad_norm": 0.18541821837425232, "learning_rate": 5.297148618410574e-06, "loss": 1.8676, "step": 228 }, { "epoch": 0.3182765809589993, "grad_norm": 0.20234748721122742, "learning_rate": 5.198678991291672e-06, "loss": 1.9006, "step": 229 }, { "epoch": 0.31966643502432246, "grad_norm": 0.22085219621658325, "learning_rate": 5.101547230879078e-06, "loss": 1.9287, "step": 230 }, { "epoch": 0.32105628908964556, "grad_norm": 0.2259192317724228, "learning_rate": 5.005745151720475e-06, "loss": 2.1083, "step": 231 }, { "epoch": 0.3224461431549687, "grad_norm": 0.1867905557155609, "learning_rate": 4.911266387352953e-06, "loss": 1.7521, "step": 232 }, { "epoch": 0.3238359972202919, "grad_norm": 0.35772791504859924, "learning_rate": 4.818103207071545e-06, "loss": 1.7412, "step": 233 }, { "epoch": 0.325225851285615, "grad_norm": 0.20189422369003296, "learning_rate": 4.726247880171286e-06, "loss": 1.9008, "step": 234 }, { "epoch": 0.32661570535093815, "grad_norm": 0.20113921165466309, "learning_rate": 4.63569176645251e-06, "loss": 1.8282, "step": 235 }, { "epoch": 0.3280055594162613, "grad_norm": 0.2050730139017105, "learning_rate": 4.546427589957602e-06, "loss": 2.0404, "step": 236 }, { "epoch": 0.3293954134815844, "grad_norm": 0.19817739725112915, "learning_rate": 4.458446710486896e-06, "loss": 1.8645, "step": 237 }, { "epoch": 0.33078526754690757, "grad_norm": 0.20303653180599213, "learning_rate": 4.371740487840725e-06, "loss": 1.8631, "step": 238 }, { "epoch": 0.33217512161223073, "grad_norm": 0.20286080241203308, "learning_rate": 4.286300281819422e-06, "loss": 1.8284, "step": 239 }, { "epoch": 0.33356497567755383, "grad_norm": 0.2330830693244934, "learning_rate": 4.2021179069706704e-06, "loss": 1.9205, "step": 240 }, { "epoch": 0.334954829742877, "grad_norm": 0.2032470554113388, "learning_rate": 4.1191838136001024e-06, "loss": 2.1029, "step": 241 }, { "epoch": 0.33634468380820015, "grad_norm": 0.21038895845413208, "learning_rate": 4.0374889067607e-06, "loss": 1.789, "step": 242 }, { "epoch": 0.33773453787352326, "grad_norm": 0.21678854525089264, "learning_rate": 3.9570240915054455e-06, "loss": 1.9519, "step": 243 }, { "epoch": 0.3391243919388464, "grad_norm": 0.19664430618286133, "learning_rate": 3.87777981813997e-06, "loss": 2.0141, "step": 244 }, { "epoch": 0.3405142460041696, "grad_norm": 0.2014831304550171, "learning_rate": 3.7997465369699057e-06, "loss": 1.8322, "step": 245 }, { "epoch": 0.3419041000694927, "grad_norm": 0.25958192348480225, "learning_rate": 3.722914698300883e-06, "loss": 1.6058, "step": 246 }, { "epoch": 0.34329395413481584, "grad_norm": 0.25615525245666504, "learning_rate": 3.647274297691183e-06, "loss": 1.898, "step": 247 }, { "epoch": 0.344683808200139, "grad_norm": 0.2273324579000473, "learning_rate": 3.5728155580727616e-06, "loss": 1.8708, "step": 248 }, { "epoch": 0.3460736622654621, "grad_norm": 0.2000439465045929, "learning_rate": 3.499528475003899e-06, "loss": 1.8123, "step": 249 }, { "epoch": 0.34746351633078526, "grad_norm": 0.2200114130973816, "learning_rate": 3.427403044042876e-06, "loss": 1.7382, "step": 250 }, { "epoch": 0.3488533703961084, "grad_norm": 0.2342006415128708, "learning_rate": 3.356428806000622e-06, "loss": 2.0013, "step": 251 }, { "epoch": 0.3502432244614315, "grad_norm": 0.22995337843894958, "learning_rate": 3.286595529061742e-06, "loss": 2.1444, "step": 252 }, { "epoch": 0.3516330785267547, "grad_norm": 0.1964460164308548, "learning_rate": 3.2178929814108415e-06, "loss": 1.8258, "step": 253 }, { "epoch": 0.35302293259207784, "grad_norm": 0.19003188610076904, "learning_rate": 3.1503102491114987e-06, "loss": 1.883, "step": 254 }, { "epoch": 0.35441278665740095, "grad_norm": 0.18669278919696808, "learning_rate": 3.083837100348319e-06, "loss": 1.8294, "step": 255 }, { "epoch": 0.3558026407227241, "grad_norm": 0.22715264558792114, "learning_rate": 3.018462621184881e-06, "loss": 1.9252, "step": 256 }, { "epoch": 0.35719249478804727, "grad_norm": 0.19796153903007507, "learning_rate": 2.9541761250584386e-06, "loss": 1.8063, "step": 257 }, { "epoch": 0.35858234885337037, "grad_norm": 0.20917272567749023, "learning_rate": 2.8909669254062464e-06, "loss": 2.118, "step": 258 }, { "epoch": 0.35997220291869353, "grad_norm": 0.2453163117170334, "learning_rate": 2.8288238809182076e-06, "loss": 1.9066, "step": 259 }, { "epoch": 0.3613620569840167, "grad_norm": 0.22330912947654724, "learning_rate": 2.767736305031576e-06, "loss": 1.7404, "step": 260 }, { "epoch": 0.3627519110493398, "grad_norm": 0.21004384756088257, "learning_rate": 2.707693283809931e-06, "loss": 1.9983, "step": 261 }, { "epoch": 0.36414176511466295, "grad_norm": 0.3166964054107666, "learning_rate": 2.6486834485695e-06, "loss": 1.8443, "step": 262 }, { "epoch": 0.3655316191799861, "grad_norm": 0.20551498234272003, "learning_rate": 2.5906958853738615e-06, "loss": 1.8614, "step": 263 }, { "epoch": 0.3669214732453092, "grad_norm": 0.1921970546245575, "learning_rate": 2.5337194529129192e-06, "loss": 1.6547, "step": 264 }, { "epoch": 0.3683113273106324, "grad_norm": 0.18080759048461914, "learning_rate": 2.4777432372502517e-06, "loss": 1.9168, "step": 265 }, { "epoch": 0.36970118137595553, "grad_norm": 0.18375758826732635, "learning_rate": 2.4227558697020868e-06, "loss": 1.6323, "step": 266 }, { "epoch": 0.37109103544127864, "grad_norm": 0.18418848514556885, "learning_rate": 2.3687459815846523e-06, "loss": 1.5647, "step": 267 }, { "epoch": 0.3724808895066018, "grad_norm": 0.47638699412345886, "learning_rate": 2.3157024315878516e-06, "loss": 1.7478, "step": 268 }, { "epoch": 0.37387074357192496, "grad_norm": 0.19321344792842865, "learning_rate": 2.263614078401588e-06, "loss": 1.5629, "step": 269 }, { "epoch": 0.3752605976372481, "grad_norm": 0.4666159451007843, "learning_rate": 2.212469553342089e-06, "loss": 1.9192, "step": 270 }, { "epoch": 0.3766504517025712, "grad_norm": 0.22022327780723572, "learning_rate": 2.1622577150992583e-06, "loss": 1.6754, "step": 271 }, { "epoch": 0.3780403057678944, "grad_norm": 0.18193675577640533, "learning_rate": 2.1129671949893236e-06, "loss": 1.7244, "step": 272 }, { "epoch": 0.37943015983321754, "grad_norm": 0.19526351988315582, "learning_rate": 2.0645866243285127e-06, "loss": 1.847, "step": 273 }, { "epoch": 0.38082001389854064, "grad_norm": 0.22929708659648895, "learning_rate": 2.017104861806729e-06, "loss": 1.8095, "step": 274 }, { "epoch": 0.3822098679638638, "grad_norm": 0.3174187242984772, "learning_rate": 1.9705107661138754e-06, "loss": 1.7546, "step": 275 }, { "epoch": 0.38359972202918696, "grad_norm": 0.21222339570522308, "learning_rate": 1.9247927411925048e-06, "loss": 1.9758, "step": 276 }, { "epoch": 0.38498957609451007, "grad_norm": 0.4566583037376404, "learning_rate": 1.8799396457325201e-06, "loss": 1.9456, "step": 277 }, { "epoch": 0.3863794301598332, "grad_norm": 0.20986810326576233, "learning_rate": 1.8359404521106626e-06, "loss": 1.8314, "step": 278 }, { "epoch": 0.3877692842251564, "grad_norm": 0.19502505660057068, "learning_rate": 1.7927836779563222e-06, "loss": 1.8558, "step": 279 }, { "epoch": 0.3891591382904795, "grad_norm": 0.2588600516319275, "learning_rate": 1.75045829564624e-06, "loss": 1.903, "step": 280 }, { "epoch": 0.39054899235580265, "grad_norm": 0.21653814613819122, "learning_rate": 1.7089530501834815e-06, "loss": 1.743, "step": 281 }, { "epoch": 0.3919388464211258, "grad_norm": 0.20987483859062195, "learning_rate": 1.66825680025795e-06, "loss": 1.8823, "step": 282 }, { "epoch": 0.3933287004864489, "grad_norm": 0.2525135576725006, "learning_rate": 1.6283585182463867e-06, "loss": 1.7555, "step": 283 }, { "epoch": 0.39471855455177207, "grad_norm": 0.19194070994853973, "learning_rate": 1.5892470628386945e-06, "loss": 1.8335, "step": 284 }, { "epoch": 0.39610840861709523, "grad_norm": 0.19618239998817444, "learning_rate": 1.5509114064116147e-06, "loss": 1.7887, "step": 285 }, { "epoch": 0.39749826268241834, "grad_norm": 0.46689727902412415, "learning_rate": 1.513340635028726e-06, "loss": 1.9366, "step": 286 }, { "epoch": 0.3988881167477415, "grad_norm": 0.1902017742395401, "learning_rate": 1.4765237210667692e-06, "loss": 1.7642, "step": 287 }, { "epoch": 0.40027797081306465, "grad_norm": 0.21295490860939026, "learning_rate": 1.440449864276161e-06, "loss": 1.7914, "step": 288 }, { "epoch": 0.40166782487838776, "grad_norm": 0.20643477141857147, "learning_rate": 1.40510815072048e-06, "loss": 2.033, "step": 289 }, { "epoch": 0.4030576789437109, "grad_norm": 0.488561749458313, "learning_rate": 1.3704878938369802e-06, "loss": 2.1574, "step": 290 }, { "epoch": 0.4044475330090341, "grad_norm": 0.22815576195716858, "learning_rate": 1.3365781796892406e-06, "loss": 1.6741, "step": 291 }, { "epoch": 0.4058373870743572, "grad_norm": 0.2025427669286728, "learning_rate": 1.303368435401353e-06, "loss": 1.898, "step": 292 }, { "epoch": 0.40722724113968034, "grad_norm": 0.21220636367797852, "learning_rate": 1.2708480880974093e-06, "loss": 1.945, "step": 293 }, { "epoch": 0.4086170952050035, "grad_norm": 0.19409982860088348, "learning_rate": 1.2390064512146637e-06, "loss": 1.8051, "step": 294 }, { "epoch": 0.4100069492703266, "grad_norm": 0.2595125734806061, "learning_rate": 1.2078331792508834e-06, "loss": 1.9311, "step": 295 }, { "epoch": 0.41139680333564976, "grad_norm": 0.216276153922081, "learning_rate": 1.1773176993301604e-06, "loss": 2.0231, "step": 296 }, { "epoch": 0.4127866574009729, "grad_norm": 0.21604764461517334, "learning_rate": 1.1474497796370997e-06, "loss": 2.1027, "step": 297 }, { "epoch": 0.414176511466296, "grad_norm": 0.2033761888742447, "learning_rate": 1.1182191883563064e-06, "loss": 1.8758, "step": 298 }, { "epoch": 0.4155663655316192, "grad_norm": 0.21036016941070557, "learning_rate": 1.0896155799855478e-06, "loss": 1.9532, "step": 299 }, { "epoch": 0.41695621959694235, "grad_norm": 0.2652454078197479, "learning_rate": 1.0616288363962667e-06, "loss": 1.8779, "step": 300 }, { "epoch": 0.41834607366226545, "grad_norm": 2.5246853828430176, "learning_rate": 1.0342490668335813e-06, "loss": 1.8124, "step": 301 }, { "epoch": 0.4197359277275886, "grad_norm": 0.21512825787067413, "learning_rate": 1.0074660394820967e-06, "loss": 1.7586, "step": 302 }, { "epoch": 0.42112578179291177, "grad_norm": 0.1967412233352661, "learning_rate": 9.812700909606065e-07, "loss": 1.7852, "step": 303 }, { "epoch": 0.4225156358582349, "grad_norm": 0.3073035478591919, "learning_rate": 9.556512168273912e-07, "loss": 1.8174, "step": 304 }, { "epoch": 0.42390548992355803, "grad_norm": 0.259111225605011, "learning_rate": 9.305997537012445e-07, "loss": 1.8097, "step": 305 }, { "epoch": 0.4252953439888812, "grad_norm": 0.21477729082107544, "learning_rate": 9.0610603820096e-07, "loss": 1.8902, "step": 306 }, { "epoch": 0.4266851980542043, "grad_norm": 0.20122215151786804, "learning_rate": 8.821605774755881e-07, "loss": 1.8512, "step": 307 }, { "epoch": 0.42807505211952745, "grad_norm": 0.19739589095115662, "learning_rate": 8.5875382183076e-07, "loss": 1.9256, "step": 308 }, { "epoch": 0.4294649061848506, "grad_norm": 0.22638773918151855, "learning_rate": 8.358763921023638e-07, "loss": 1.9929, "step": 309 }, { "epoch": 0.4308547602501737, "grad_norm": 0.18856070935726166, "learning_rate": 8.135190796565439e-07, "loss": 1.7015, "step": 310 }, { "epoch": 0.4322446143154969, "grad_norm": 0.24857446551322937, "learning_rate": 7.916725621726073e-07, "loss": 1.7849, "step": 311 }, { "epoch": 0.43363446838082004, "grad_norm": 0.2543996274471283, "learning_rate": 7.70327801546955e-07, "loss": 2.0024, "step": 312 }, { "epoch": 0.43502432244614314, "grad_norm": 0.24206531047821045, "learning_rate": 7.494757028325694e-07, "loss": 1.9814, "step": 313 }, { "epoch": 0.4364141765114663, "grad_norm": 0.20051902532577515, "learning_rate": 7.291073984561081e-07, "loss": 1.892, "step": 314 }, { "epoch": 0.43780403057678946, "grad_norm": 0.23333312571048737, "learning_rate": 7.092140208442288e-07, "loss": 1.7736, "step": 315 }, { "epoch": 0.43919388464211256, "grad_norm": 0.20156167447566986, "learning_rate": 6.897868729538459e-07, "loss": 1.8999, "step": 316 }, { "epoch": 0.4405837387074357, "grad_norm": 0.261990487575531, "learning_rate": 6.708173145852925e-07, "loss": 2.0568, "step": 317 }, { "epoch": 0.4419735927727589, "grad_norm": 0.2258765995502472, "learning_rate": 6.522967623823206e-07, "loss": 2.037, "step": 318 }, { "epoch": 0.443363446838082, "grad_norm": 0.2119254171848297, "learning_rate": 6.342168035189388e-07, "loss": 1.731, "step": 319 }, { "epoch": 0.44475330090340515, "grad_norm": 0.2066742181777954, "learning_rate": 6.165690251691558e-07, "loss": 1.767, "step": 320 }, { "epoch": 0.4461431549687283, "grad_norm": 0.21753494441509247, "learning_rate": 5.993452418806555e-07, "loss": 1.9796, "step": 321 }, { "epoch": 0.4475330090340514, "grad_norm": 0.21915656328201294, "learning_rate": 5.825372682011221e-07, "loss": 2.0308, "step": 322 }, { "epoch": 0.44892286309937457, "grad_norm": 0.28195130825042725, "learning_rate": 5.661370323650772e-07, "loss": 1.9736, "step": 323 }, { "epoch": 0.45031271716469773, "grad_norm": 0.5690734386444092, "learning_rate": 5.501366331372992e-07, "loss": 1.8671, "step": 324 }, { "epoch": 0.45170257123002083, "grad_norm": 0.21578340232372284, "learning_rate": 5.345282261259854e-07, "loss": 2.0083, "step": 325 }, { "epoch": 0.453092425295344, "grad_norm": 0.1926323026418686, "learning_rate": 5.193040237827518e-07, "loss": 1.8884, "step": 326 }, { "epoch": 0.45448227936066715, "grad_norm": 0.21623694896697998, "learning_rate": 5.04456409089471e-07, "loss": 2.0226, "step": 327 }, { "epoch": 0.45587213342599026, "grad_norm": 0.23700657486915588, "learning_rate": 4.899777650280157e-07, "loss": 1.8343, "step": 328 }, { "epoch": 0.4572619874913134, "grad_norm": 0.21847975254058838, "learning_rate": 4.7586070195393404e-07, "loss": 1.586, "step": 329 }, { "epoch": 0.4586518415566366, "grad_norm": 0.1861618459224701, "learning_rate": 4.620978586444835e-07, "loss": 1.6626, "step": 330 }, { "epoch": 0.4600416956219597, "grad_norm": 0.1939268410205841, "learning_rate": 4.486819875637593e-07, "loss": 1.9735, "step": 331 }, { "epoch": 0.46143154968728284, "grad_norm": 0.20332273840904236, "learning_rate": 4.3560592644098506e-07, "loss": 2.1484, "step": 332 }, { "epoch": 0.462821403752606, "grad_norm": 0.21045802533626556, "learning_rate": 4.228626551139314e-07, "loss": 1.8603, "step": 333 }, { "epoch": 0.4642112578179291, "grad_norm": 0.20922106504440308, "learning_rate": 4.1044518184207845e-07, "loss": 1.9512, "step": 334 }, { "epoch": 0.46560111188325226, "grad_norm": 0.2196597456932068, "learning_rate": 3.9834671383687237e-07, "loss": 1.9034, "step": 335 }, { "epoch": 0.4669909659485754, "grad_norm": 0.2610037326812744, "learning_rate": 3.865604867314687e-07, "loss": 1.938, "step": 336 }, { "epoch": 0.4683808200138985, "grad_norm": 0.19590795040130615, "learning_rate": 3.750798214241513e-07, "loss": 1.9635, "step": 337 }, { "epoch": 0.4697706740792217, "grad_norm": 0.21842974424362183, "learning_rate": 3.638982093434606e-07, "loss": 1.9537, "step": 338 }, { "epoch": 0.47116052814454484, "grad_norm": 0.32149797677993774, "learning_rate": 3.5300919876135595e-07, "loss": 1.6664, "step": 339 }, { "epoch": 0.47255038220986795, "grad_norm": 0.1941124051809311, "learning_rate": 3.424064516366343e-07, "loss": 1.5644, "step": 340 }, { "epoch": 0.4739402362751911, "grad_norm": 0.19114050269126892, "learning_rate": 3.3208368677151157e-07, "loss": 1.8061, "step": 341 }, { "epoch": 0.47533009034051427, "grad_norm": 0.18381401896476746, "learning_rate": 3.220347934984602e-07, "loss": 1.6162, "step": 342 }, { "epoch": 0.47671994440583737, "grad_norm": 0.25458601117134094, "learning_rate": 3.122536895716621e-07, "loss": 1.6492, "step": 343 }, { "epoch": 0.47810979847116053, "grad_norm": 0.22049333155155182, "learning_rate": 3.027344348538463e-07, "loss": 1.6919, "step": 344 }, { "epoch": 0.4794996525364837, "grad_norm": 0.18299983441829681, "learning_rate": 2.934711744728702e-07, "loss": 1.8232, "step": 345 }, { "epoch": 0.4808895066018068, "grad_norm": 0.1967984288930893, "learning_rate": 2.844581104000099e-07, "loss": 1.7902, "step": 346 }, { "epoch": 0.48227936066712995, "grad_norm": 0.21064946055412292, "learning_rate": 2.7568961513679824e-07, "loss": 1.6887, "step": 347 }, { "epoch": 0.4836692147324531, "grad_norm": 0.1825314313173294, "learning_rate": 2.6716011802818684e-07, "loss": 1.7346, "step": 348 }, { "epoch": 0.4850590687977762, "grad_norm": 0.1885017305612564, "learning_rate": 2.588641052625462e-07, "loss": 1.7041, "step": 349 }, { "epoch": 0.4864489228630994, "grad_norm": 0.2001916468143463, "learning_rate": 2.50796205136794e-07, "loss": 1.8274, "step": 350 }, { "epoch": 0.48783877692842254, "grad_norm": 0.20404978096485138, "learning_rate": 2.429511312129762e-07, "loss": 1.61, "step": 351 }, { "epoch": 0.48922863099374564, "grad_norm": 0.21010123193264008, "learning_rate": 2.3532369652912166e-07, "loss": 1.8404, "step": 352 }, { "epoch": 0.4906184850590688, "grad_norm": 0.18711315095424652, "learning_rate": 2.27908785177533e-07, "loss": 1.6448, "step": 353 }, { "epoch": 0.49200833912439196, "grad_norm": 0.19629548490047455, "learning_rate": 2.207013807264957e-07, "loss": 1.6833, "step": 354 }, { "epoch": 0.49339819318971506, "grad_norm": 0.19732961058616638, "learning_rate": 2.136965520094236e-07, "loss": 1.7481, "step": 355 }, { "epoch": 0.4947880472550382, "grad_norm": 0.18008306622505188, "learning_rate": 2.0688946733571356e-07, "loss": 1.5377, "step": 356 }, { "epoch": 0.4961779013203614, "grad_norm": 0.1791473925113678, "learning_rate": 2.0027538027989067e-07, "loss": 1.6549, "step": 357 }, { "epoch": 0.4975677553856845, "grad_norm": 0.23135995864868164, "learning_rate": 1.9384962968160835e-07, "loss": 1.7028, "step": 358 }, { "epoch": 0.49895760945100764, "grad_norm": 0.17524275183677673, "learning_rate": 1.876076254347936e-07, "loss": 1.7135, "step": 359 }, { "epoch": 0.5003474635163307, "grad_norm": 0.19246183335781097, "learning_rate": 1.8154490533106582e-07, "loss": 1.761, "step": 360 } ], "logging_steps": 1, "max_steps": 1438, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 180, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.0482841948420833e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }