{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2501737317581654, "eval_steps": 500, "global_step": 180, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001389854065323141, "grad_norm": 0.7409176826477051, "learning_rate": 3.999721593572758e-05, "loss": 3.5192, "step": 1 }, { "epoch": 0.002779708130646282, "grad_norm": 0.5615135431289673, "learning_rate": 3.9991646190173924e-05, "loss": 3.4286, "step": 2 }, { "epoch": 0.004169562195969423, "grad_norm": 0.5854966044425964, "learning_rate": 3.998328611487523e-05, "loss": 3.4286, "step": 3 }, { "epoch": 0.005559416261292564, "grad_norm": 0.5862582921981812, "learning_rate": 3.997213570983149e-05, "loss": 3.4585, "step": 4 }, { "epoch": 0.006949270326615705, "grad_norm": 0.5955198407173157, "learning_rate": 3.9958194975042716e-05, "loss": 3.1467, "step": 5 }, { "epoch": 0.008339124391938846, "grad_norm": 0.6464918851852417, "learning_rate": 3.9941460272530094e-05, "loss": 3.1823, "step": 6 }, { "epoch": 0.009728978457261988, "grad_norm": 0.6563470959663391, "learning_rate": 3.9921931602293625e-05, "loss": 3.0063, "step": 7 }, { "epoch": 0.011118832522585128, "grad_norm": 0.7250497341156006, "learning_rate": 3.989960896433331e-05, "loss": 2.9268, "step": 8 }, { "epoch": 0.01250868658790827, "grad_norm": 0.7942679524421692, "learning_rate": 3.987449599662796e-05, "loss": 2.9488, "step": 9 }, { "epoch": 0.01389854065323141, "grad_norm": 0.7256530523300171, "learning_rate": 3.984659269917756e-05, "loss": 2.8387, "step": 10 }, { "epoch": 0.015288394718554551, "grad_norm": 0.96202152967453, "learning_rate": 3.981589907198213e-05, "loss": 2.7071, "step": 11 }, { "epoch": 0.01667824878387769, "grad_norm": 0.8117583394050598, "learning_rate": 3.978242239099927e-05, "loss": 2.4934, "step": 12 }, { "epoch": 0.018068102849200834, "grad_norm": 0.9094793796539307, "learning_rate": 3.974616265622899e-05, "loss": 2.6515, "step": 13 }, { "epoch": 0.019457956914523976, "grad_norm": 0.8784312009811401, "learning_rate": 3.9707127143628895e-05, "loss": 2.2521, "step": 14 }, { "epoch": 0.020847810979847115, "grad_norm": 0.8378123641014099, "learning_rate": 3.966531585319899e-05, "loss": 2.6406, "step": 15 }, { "epoch": 0.022237665045170257, "grad_norm": 0.8524578809738159, "learning_rate": 3.962073606089689e-05, "loss": 2.5555, "step": 16 }, { "epoch": 0.0236275191104934, "grad_norm": 1.0161993503570557, "learning_rate": 3.9573395042680204e-05, "loss": 2.6443, "step": 17 }, { "epoch": 0.02501737317581654, "grad_norm": 0.8594086766242981, "learning_rate": 3.9523296436527744e-05, "loss": 2.4282, "step": 18 }, { "epoch": 0.02640722724113968, "grad_norm": 0.8018243312835693, "learning_rate": 3.947044751839712e-05, "loss": 2.2656, "step": 19 }, { "epoch": 0.02779708130646282, "grad_norm": 0.833044171333313, "learning_rate": 3.9414855564245954e-05, "loss": 2.2516, "step": 20 }, { "epoch": 0.029186935371785964, "grad_norm": 0.8001306056976318, "learning_rate": 3.935652785003185e-05, "loss": 2.334, "step": 21 }, { "epoch": 0.030576789437109102, "grad_norm": 1.0083926916122437, "learning_rate": 3.929547528969124e-05, "loss": 2.2434, "step": 22 }, { "epoch": 0.031966643502432245, "grad_norm": 0.8387123942375183, "learning_rate": 3.923170515918173e-05, "loss": 2.1532, "step": 23 }, { "epoch": 0.03335649756775538, "grad_norm": 0.8073887825012207, "learning_rate": 3.916522837243974e-05, "loss": 2.3309, "step": 24 }, { "epoch": 0.03474635163307853, "grad_norm": 0.709004819393158, "learning_rate": 3.90960558434017e-05, "loss": 2.18, "step": 25 }, { "epoch": 0.03613620569840167, "grad_norm": 0.674148440361023, "learning_rate": 3.9024198486004025e-05, "loss": 2.1713, "step": 26 }, { "epoch": 0.037526059763724806, "grad_norm": 0.7956393361091614, "learning_rate": 3.894966721418314e-05, "loss": 2.3782, "step": 27 }, { "epoch": 0.03891591382904795, "grad_norm": 0.6804589629173279, "learning_rate": 3.887247294187546e-05, "loss": 2.1933, "step": 28 }, { "epoch": 0.04030576789437109, "grad_norm": 1.6819874048233032, "learning_rate": 3.8792630220996216e-05, "loss": 2.2018, "step": 29 }, { "epoch": 0.04169562195969423, "grad_norm": 0.685869574546814, "learning_rate": 3.871014996548183e-05, "loss": 2.2206, "step": 30 }, { "epoch": 0.043085476025017375, "grad_norm": 0.6232013702392578, "learning_rate": 3.862505036522634e-05, "loss": 2.1251, "step": 31 }, { "epoch": 0.04447533009034051, "grad_norm": 0.609855592250824, "learning_rate": 3.853733869618736e-05, "loss": 2.2228, "step": 32 }, { "epoch": 0.04586518415566365, "grad_norm": 0.6258031725883484, "learning_rate": 3.844703678623773e-05, "loss": 2.2185, "step": 33 }, { "epoch": 0.0472550382209868, "grad_norm": 0.5621820688247681, "learning_rate": 3.835415554931387e-05, "loss": 2.168, "step": 34 }, { "epoch": 0.048644892286309936, "grad_norm": 0.5401716828346252, "learning_rate": 3.825871317530982e-05, "loss": 2.175, "step": 35 }, { "epoch": 0.05003474635163308, "grad_norm": 0.5031033754348755, "learning_rate": 3.816072421614081e-05, "loss": 2.0321, "step": 36 }, { "epoch": 0.05142460041695622, "grad_norm": 0.49892041087150574, "learning_rate": 3.806021049967967e-05, "loss": 2.2909, "step": 37 }, { "epoch": 0.05281445448227936, "grad_norm": 0.47056782245635986, "learning_rate": 3.795718293986283e-05, "loss": 2.1851, "step": 38 }, { "epoch": 0.054204308547602505, "grad_norm": 0.4496292173862457, "learning_rate": 3.785166336456314e-05, "loss": 2.0675, "step": 39 }, { "epoch": 0.05559416261292564, "grad_norm": 0.473352313041687, "learning_rate": 3.774366996367462e-05, "loss": 2.2851, "step": 40 }, { "epoch": 0.05698401667824878, "grad_norm": 0.4474506378173828, "learning_rate": 3.7633220927091315e-05, "loss": 2.1041, "step": 41 }, { "epoch": 0.05837387074357193, "grad_norm": 0.4823157489299774, "learning_rate": 3.752033808268607e-05, "loss": 2.0805, "step": 42 }, { "epoch": 0.059763724808895066, "grad_norm": 0.45550891757011414, "learning_rate": 3.740503962035291e-05, "loss": 2.2575, "step": 43 }, { "epoch": 0.061153578874218205, "grad_norm": 0.3821059763431549, "learning_rate": 3.7287347367964685e-05, "loss": 2.1388, "step": 44 }, { "epoch": 0.06254343293954134, "grad_norm": 0.36882323026657104, "learning_rate": 3.716727951541543e-05, "loss": 2.1219, "step": 45 }, { "epoch": 0.06393328700486449, "grad_norm": 0.38791537284851074, "learning_rate": 3.7044861528556794e-05, "loss": 2.1315, "step": 46 }, { "epoch": 0.06532314107018763, "grad_norm": 0.3582218885421753, "learning_rate": 3.692011523526162e-05, "loss": 2.2317, "step": 47 }, { "epoch": 0.06671299513551077, "grad_norm": 0.329677551984787, "learning_rate": 3.679306246340275e-05, "loss": 2.0869, "step": 48 }, { "epoch": 0.06810284920083391, "grad_norm": 0.32158908247947693, "learning_rate": 3.6663725040853024e-05, "loss": 2.3178, "step": 49 }, { "epoch": 0.06949270326615706, "grad_norm": 0.384804904460907, "learning_rate": 3.653212479548529e-05, "loss": 2.1155, "step": 50 }, { "epoch": 0.07088255733148019, "grad_norm": 0.3261670768260956, "learning_rate": 3.639828719315119e-05, "loss": 2.2381, "step": 51 }, { "epoch": 0.07227241139680333, "grad_norm": 0.3406740128993988, "learning_rate": 3.626223769970238e-05, "loss": 2.0653, "step": 52 }, { "epoch": 0.07366226546212648, "grad_norm": 0.32772770524024963, "learning_rate": 3.612400178099051e-05, "loss": 2.0903, "step": 53 }, { "epoch": 0.07505211952744961, "grad_norm": 0.3069205582141876, "learning_rate": 3.598360126488842e-05, "loss": 1.9182, "step": 54 }, { "epoch": 0.07644197359277276, "grad_norm": 0.2695797085762024, "learning_rate": 3.5841068893205374e-05, "loss": 1.97, "step": 55 }, { "epoch": 0.0778318276580959, "grad_norm": 0.37067824602127075, "learning_rate": 3.569642285583541e-05, "loss": 2.2509, "step": 56 }, { "epoch": 0.07922168172341904, "grad_norm": 0.3385728895664215, "learning_rate": 3.5549695894587785e-05, "loss": 2.2857, "step": 57 }, { "epoch": 0.08061153578874218, "grad_norm": 0.28427156805992126, "learning_rate": 3.540090983733535e-05, "loss": 2.0356, "step": 58 }, { "epoch": 0.08200138985406533, "grad_norm": 0.3377193510532379, "learning_rate": 3.5250093787908554e-05, "loss": 2.1744, "step": 59 }, { "epoch": 0.08339124391938846, "grad_norm": 0.27540090680122375, "learning_rate": 3.509727321215905e-05, "loss": 2.1665, "step": 60 }, { "epoch": 0.0847810979847116, "grad_norm": 0.3422718644142151, "learning_rate": 3.494248085189611e-05, "loss": 1.9342, "step": 61 }, { "epoch": 0.08617095205003475, "grad_norm": 0.2574728727340698, "learning_rate": 3.478574217297137e-05, "loss": 2.2009, "step": 62 }, { "epoch": 0.08756080611535788, "grad_norm": 0.2330848127603531, "learning_rate": 3.462708627921529e-05, "loss": 2.1879, "step": 63 }, { "epoch": 0.08895066018068103, "grad_norm": 0.2683883309364319, "learning_rate": 3.4466542274458334e-05, "loss": 2.1559, "step": 64 }, { "epoch": 0.09034051424600417, "grad_norm": 0.21715426445007324, "learning_rate": 3.430413926253095e-05, "loss": 2.0487, "step": 65 }, { "epoch": 0.0917303683113273, "grad_norm": 0.23342643678188324, "learning_rate": 3.413990998524241e-05, "loss": 2.1284, "step": 66 }, { "epoch": 0.09312022237665045, "grad_norm": 0.2351645976305008, "learning_rate": 3.397387990844436e-05, "loss": 2.113, "step": 67 }, { "epoch": 0.0945100764419736, "grad_norm": 0.352430135011673, "learning_rate": 3.380608541192487e-05, "loss": 2.2563, "step": 68 }, { "epoch": 0.09589993050729674, "grad_norm": 0.2351987361907959, "learning_rate": 3.363655196153559e-05, "loss": 2.1185, "step": 69 }, { "epoch": 0.09728978457261987, "grad_norm": 0.25391116738319397, "learning_rate": 3.346531229908578e-05, "loss": 1.9646, "step": 70 }, { "epoch": 0.09867963863794302, "grad_norm": 0.24878591299057007, "learning_rate": 3.3292395528405905e-05, "loss": 1.8972, "step": 71 }, { "epoch": 0.10006949270326616, "grad_norm": 0.2513987421989441, "learning_rate": 3.311783439130522e-05, "loss": 2.3366, "step": 72 }, { "epoch": 0.1014593467685893, "grad_norm": 0.22613292932510376, "learning_rate": 3.2941661629593e-05, "loss": 2.0588, "step": 73 }, { "epoch": 0.10284920083391244, "grad_norm": 0.21839161217212677, "learning_rate": 3.27639099850785e-05, "loss": 2.0658, "step": 74 }, { "epoch": 0.10423905489923559, "grad_norm": 0.21652136743068695, "learning_rate": 3.2584608561592177e-05, "loss": 1.9668, "step": 75 }, { "epoch": 0.10562890896455872, "grad_norm": 0.33612847328186035, "learning_rate": 3.2403793738922104e-05, "loss": 1.9812, "step": 76 }, { "epoch": 0.10701876302988186, "grad_norm": 0.2500985264778137, "learning_rate": 3.2221498258877546e-05, "loss": 1.9848, "step": 77 }, { "epoch": 0.10840861709520501, "grad_norm": 0.2622527778148651, "learning_rate": 3.203775122528896e-05, "loss": 2.2417, "step": 78 }, { "epoch": 0.10979847116052814, "grad_norm": 0.22762766480445862, "learning_rate": 3.185258901794441e-05, "loss": 2.2114, "step": 79 }, { "epoch": 0.11118832522585129, "grad_norm": 0.18959005177021027, "learning_rate": 3.166604437865317e-05, "loss": 1.9689, "step": 80 }, { "epoch": 0.11257817929117443, "grad_norm": 0.2245381474494934, "learning_rate": 3.1478150049224496e-05, "loss": 2.0861, "step": 81 }, { "epoch": 0.11396803335649756, "grad_norm": 0.2584340572357178, "learning_rate": 3.1288938771467656e-05, "loss": 2.0639, "step": 82 }, { "epoch": 0.11535788742182071, "grad_norm": 0.22430986166000366, "learning_rate": 3.109844692517072e-05, "loss": 1.9695, "step": 83 }, { "epoch": 0.11674774148714386, "grad_norm": 0.27213403582572937, "learning_rate": 3.090670725214295e-05, "loss": 2.0551, "step": 84 }, { "epoch": 0.11813759555246699, "grad_norm": 0.23510581254959106, "learning_rate": 3.0713752494193614e-05, "loss": 2.1565, "step": 85 }, { "epoch": 0.11952744961779013, "grad_norm": 0.4063816964626312, "learning_rate": 3.051961903111078e-05, "loss": 2.2242, "step": 86 }, { "epoch": 0.12091730368311328, "grad_norm": 0.3135417401790619, "learning_rate": 3.032433960470371e-05, "loss": 2.2841, "step": 87 }, { "epoch": 0.12230715774843641, "grad_norm": 0.2632151246070862, "learning_rate": 3.0127950594760478e-05, "loss": 2.1075, "step": 88 }, { "epoch": 0.12369701181375956, "grad_norm": 0.2494344264268875, "learning_rate": 2.9930484743090346e-05, "loss": 2.1096, "step": 89 }, { "epoch": 0.1250868658790827, "grad_norm": 0.2007075548171997, "learning_rate": 2.9731978429481387e-05, "loss": 2.1779, "step": 90 }, { "epoch": 0.12647671994440585, "grad_norm": 0.21900464594364166, "learning_rate": 2.9532464395742863e-05, "loss": 2.0586, "step": 91 }, { "epoch": 0.12786657400972898, "grad_norm": 0.2916090190410614, "learning_rate": 2.9331979021662846e-05, "loss": 1.9652, "step": 92 }, { "epoch": 0.1292564280750521, "grad_norm": 0.25345149636268616, "learning_rate": 2.9130556868040003e-05, "loss": 2.0456, "step": 93 }, { "epoch": 0.13064628214037527, "grad_norm": 0.33592185378074646, "learning_rate": 2.8928230676683597e-05, "loss": 2.0481, "step": 94 }, { "epoch": 0.1320361362056984, "grad_norm": 0.2137216031551361, "learning_rate": 2.87250368273817e-05, "loss": 1.9936, "step": 95 }, { "epoch": 0.13342599027102153, "grad_norm": 0.1986539214849472, "learning_rate": 2.8521011699922383e-05, "loss": 2.101, "step": 96 }, { "epoch": 0.1348158443363447, "grad_norm": 0.19974084198474884, "learning_rate": 2.8316189855104312e-05, "loss": 2.1086, "step": 97 }, { "epoch": 0.13620569840166782, "grad_norm": 0.22967414557933807, "learning_rate": 2.8110605853726156e-05, "loss": 1.9475, "step": 98 }, { "epoch": 0.13759555246699096, "grad_norm": 0.21497172117233276, "learning_rate": 2.790429425658658e-05, "loss": 2.1221, "step": 99 }, { "epoch": 0.13898540653231412, "grad_norm": 0.2060205489397049, "learning_rate": 2.7697289624484256e-05, "loss": 2.157, "step": 100 }, { "epoch": 0.14037526059763725, "grad_norm": 0.2061186283826828, "learning_rate": 2.7489626518217847e-05, "loss": 1.9418, "step": 101 }, { "epoch": 0.14176511466296038, "grad_norm": 0.2318679243326187, "learning_rate": 2.7281341317575425e-05, "loss": 1.9337, "step": 102 }, { "epoch": 0.14315496872828354, "grad_norm": 0.19199368357658386, "learning_rate": 2.7072468583355658e-05, "loss": 2.1347, "step": 103 }, { "epoch": 0.14454482279360667, "grad_norm": 0.5738226175308228, "learning_rate": 2.686304105736781e-05, "loss": 1.9709, "step": 104 }, { "epoch": 0.1459346768589298, "grad_norm": 0.2086687982082367, "learning_rate": 2.665309511939995e-05, "loss": 2.1757, "step": 105 }, { "epoch": 0.14732453092425296, "grad_norm": 0.18678060173988342, "learning_rate": 2.6442665330250748e-05, "loss": 1.9203, "step": 106 }, { "epoch": 0.1487143849895761, "grad_norm": 0.24349789321422577, "learning_rate": 2.623178625071887e-05, "loss": 2.0041, "step": 107 }, { "epoch": 0.15010423905489922, "grad_norm": 0.44293686747550964, "learning_rate": 2.602049062261358e-05, "loss": 2.0651, "step": 108 }, { "epoch": 0.15149409312022238, "grad_norm": 0.23788246512413025, "learning_rate": 2.5808814825722948e-05, "loss": 2.0853, "step": 109 }, { "epoch": 0.15288394718554552, "grad_norm": 0.24998068809509277, "learning_rate": 2.5596793420845643e-05, "loss": 2.1346, "step": 110 }, { "epoch": 0.15427380125086865, "grad_norm": 0.23647256195545197, "learning_rate": 2.5384459149790928e-05, "loss": 1.9715, "step": 111 }, { "epoch": 0.1556636553161918, "grad_norm": 0.31655153632164, "learning_rate": 2.517184657335747e-05, "loss": 2.0115, "step": 112 }, { "epoch": 0.15705350938151494, "grad_norm": 0.19617554545402527, "learning_rate": 2.4958988433354534e-05, "loss": 2.0552, "step": 113 }, { "epoch": 0.15844336344683807, "grad_norm": 0.22154586017131805, "learning_rate": 2.4745919290580787e-05, "loss": 2.0653, "step": 114 }, { "epoch": 0.15983321751216123, "grad_norm": 0.2188163846731186, "learning_rate": 2.4532671886845492e-05, "loss": 2.2253, "step": 115 }, { "epoch": 0.16122307157748436, "grad_norm": 0.19053936004638672, "learning_rate": 2.4319278963957913e-05, "loss": 2.0672, "step": 116 }, { "epoch": 0.1626129256428075, "grad_norm": 0.2588854432106018, "learning_rate": 2.410577508271672e-05, "loss": 2.0342, "step": 117 }, { "epoch": 0.16400277970813065, "grad_norm": 0.2963363230228424, "learning_rate": 2.389219298493117e-05, "loss": 2.0387, "step": 118 }, { "epoch": 0.16539263377345378, "grad_norm": 0.23298707604408264, "learning_rate": 2.3678565412410535e-05, "loss": 2.0423, "step": 119 }, { "epoch": 0.16678248783877692, "grad_norm": 0.20690463483333588, "learning_rate": 2.3464925106964074e-05, "loss": 1.9764, "step": 120 }, { "epoch": 0.16817234190410008, "grad_norm": 0.23895452916622162, "learning_rate": 2.325130299141165e-05, "loss": 1.9615, "step": 121 }, { "epoch": 0.1695621959694232, "grad_norm": 0.2269681841135025, "learning_rate": 2.3037731807562523e-05, "loss": 1.8685, "step": 122 }, { "epoch": 0.17095205003474634, "grad_norm": 0.26502808928489685, "learning_rate": 2.2824242478236556e-05, "loss": 1.9815, "step": 123 }, { "epoch": 0.1723419041000695, "grad_norm": 0.24106465280056, "learning_rate": 2.2610867745243013e-05, "loss": 2.0057, "step": 124 }, { "epoch": 0.17373175816539263, "grad_norm": 0.1920744776725769, "learning_rate": 2.2397638531401753e-05, "loss": 1.9905, "step": 125 }, { "epoch": 0.17512161223071576, "grad_norm": 0.299200177192688, "learning_rate": 2.2184585759532638e-05, "loss": 1.8186, "step": 126 }, { "epoch": 0.17651146629603892, "grad_norm": 0.21507440507411957, "learning_rate": 2.1971740352455527e-05, "loss": 1.965, "step": 127 }, { "epoch": 0.17790132036136205, "grad_norm": 0.2105863094329834, "learning_rate": 2.1759131414000876e-05, "loss": 2.115, "step": 128 }, { "epoch": 0.17929117442668518, "grad_norm": 0.21202439069747925, "learning_rate": 2.154679168597795e-05, "loss": 1.9482, "step": 129 }, { "epoch": 0.18068102849200834, "grad_norm": 0.21153898537158966, "learning_rate": 2.1334748453227803e-05, "loss": 1.9907, "step": 130 }, { "epoch": 0.18207088255733148, "grad_norm": 0.22135163843631744, "learning_rate": 2.1123032638570294e-05, "loss": 2.1229, "step": 131 }, { "epoch": 0.1834607366226546, "grad_norm": 0.2102488875389099, "learning_rate": 2.091167334583588e-05, "loss": 2.1547, "step": 132 }, { "epoch": 0.18485059068797777, "grad_norm": 0.500489354133606, "learning_rate": 2.070070149784442e-05, "loss": 2.0868, "step": 133 }, { "epoch": 0.1862404447533009, "grad_norm": 0.3173089325428009, "learning_rate": 2.049014437943697e-05, "loss": 1.8882, "step": 134 }, { "epoch": 0.18763029881862406, "grad_norm": 0.2581390142440796, "learning_rate": 2.028002927545458e-05, "loss": 2.0773, "step": 135 }, { "epoch": 0.1890201528839472, "grad_norm": 0.20216765999794006, "learning_rate": 2.007038528972771e-05, "loss": 2.0587, "step": 136 }, { "epoch": 0.19041000694927032, "grad_norm": 0.20798902213573456, "learning_rate": 1.986123970709741e-05, "loss": 1.9818, "step": 137 }, { "epoch": 0.19179986101459348, "grad_norm": 0.20478519797325134, "learning_rate": 1.9652619812404737e-05, "loss": 2.1231, "step": 138 }, { "epoch": 0.1931897150799166, "grad_norm": 0.2358742356300354, "learning_rate": 1.944455289049074e-05, "loss": 1.9059, "step": 139 }, { "epoch": 0.19457956914523974, "grad_norm": 0.19055970013141632, "learning_rate": 1.9237066226196475e-05, "loss": 1.9074, "step": 140 }, { "epoch": 0.1959694232105629, "grad_norm": 0.21943627297878265, "learning_rate": 1.9030183466384187e-05, "loss": 1.887, "step": 141 }, { "epoch": 0.19735927727588604, "grad_norm": 0.20230762660503387, "learning_rate": 1.8823933714884333e-05, "loss": 2.0427, "step": 142 }, { "epoch": 0.19874913134120917, "grad_norm": 0.2051026076078415, "learning_rate": 1.861834061855916e-05, "loss": 1.8915, "step": 143 }, { "epoch": 0.20013898540653233, "grad_norm": 1.0352438688278198, "learning_rate": 1.8413429643260315e-05, "loss": 1.9431, "step": 144 }, { "epoch": 0.20152883947185546, "grad_norm": 0.2811901271343231, "learning_rate": 1.8209228073828854e-05, "loss": 1.897, "step": 145 }, { "epoch": 0.2029186935371786, "grad_norm": 0.46414074301719666, "learning_rate": 1.8005757738137618e-05, "loss": 2.0354, "step": 146 }, { "epoch": 0.20430854760250175, "grad_norm": 0.22770829498767853, "learning_rate": 1.7803044102038257e-05, "loss": 2.0427, "step": 147 }, { "epoch": 0.20569840166782488, "grad_norm": 0.2260872721672058, "learning_rate": 1.7601108993403614e-05, "loss": 1.9558, "step": 148 }, { "epoch": 0.207088255733148, "grad_norm": 0.1908380687236786, "learning_rate": 1.7399976059095934e-05, "loss": 2.0494, "step": 149 }, { "epoch": 0.20847810979847117, "grad_norm": 0.19630920886993408, "learning_rate": 1.7199668945977464e-05, "loss": 1.893, "step": 150 }, { "epoch": 0.2098679638637943, "grad_norm": 0.18318656086921692, "learning_rate": 1.7000209481921047e-05, "loss": 1.8479, "step": 151 }, { "epoch": 0.21125781792911744, "grad_norm": 0.20488278567790985, "learning_rate": 1.6801621313788928e-05, "loss": 1.9398, "step": 152 }, { "epoch": 0.2126476719944406, "grad_norm": 0.18846479058265686, "learning_rate": 1.6603924450464547e-05, "loss": 1.8462, "step": 153 }, { "epoch": 0.21403752605976373, "grad_norm": 0.1990012526512146, "learning_rate": 1.6407140719820745e-05, "loss": 1.8491, "step": 154 }, { "epoch": 0.21542738012508686, "grad_norm": 0.20991306006908417, "learning_rate": 1.6211291949730366e-05, "loss": 2.0807, "step": 155 }, { "epoch": 0.21681723419041002, "grad_norm": 0.1805444061756134, "learning_rate": 1.6016396330087446e-05, "loss": 1.8295, "step": 156 }, { "epoch": 0.21820708825573315, "grad_norm": 0.22825907170772552, "learning_rate": 1.5822475688764825e-05, "loss": 2.0153, "step": 157 }, { "epoch": 0.21959694232105628, "grad_norm": 0.17874298989772797, "learning_rate": 1.562954821565654e-05, "loss": 1.8857, "step": 158 }, { "epoch": 0.22098679638637944, "grad_norm": 0.20816968381404877, "learning_rate": 1.5437633919646032e-05, "loss": 2.0169, "step": 159 }, { "epoch": 0.22237665045170257, "grad_norm": 0.1875351518392563, "learning_rate": 1.5246751900122035e-05, "loss": 2.0116, "step": 160 }, { "epoch": 0.2237665045170257, "grad_norm": 0.1880957931280136, "learning_rate": 1.5056919437483884e-05, "loss": 1.9458, "step": 161 }, { "epoch": 0.22515635858234886, "grad_norm": 0.2217608243227005, "learning_rate": 1.4868154721625615e-05, "loss": 2.1639, "step": 162 }, { "epoch": 0.226546212647672, "grad_norm": 0.21479056775569916, "learning_rate": 1.468047503294656e-05, "loss": 2.0, "step": 163 }, { "epoch": 0.22793606671299513, "grad_norm": 0.2338622510433197, "learning_rate": 1.4493896742351353e-05, "loss": 2.0731, "step": 164 }, { "epoch": 0.2293259207783183, "grad_norm": 0.23026101291179657, "learning_rate": 1.4308437130239327e-05, "loss": 2.1193, "step": 165 }, { "epoch": 0.23071577484364142, "grad_norm": 0.21515104174613953, "learning_rate": 1.4124113477009814e-05, "loss": 1.966, "step": 166 }, { "epoch": 0.23210562890896455, "grad_norm": 0.22686313092708588, "learning_rate": 1.3940940334578045e-05, "loss": 2.0554, "step": 167 }, { "epoch": 0.2334954829742877, "grad_norm": 0.22290675342082977, "learning_rate": 1.3758932254859246e-05, "loss": 2.1244, "step": 168 }, { "epoch": 0.23488533703961084, "grad_norm": 0.19737748801708221, "learning_rate": 1.357810469926335e-05, "loss": 2.0386, "step": 169 }, { "epoch": 0.23627519110493397, "grad_norm": 0.2284354418516159, "learning_rate": 1.3398472219705582e-05, "loss": 1.9145, "step": 170 }, { "epoch": 0.23766504517025713, "grad_norm": 0.18580572307109833, "learning_rate": 1.322004845860647e-05, "loss": 1.8991, "step": 171 }, { "epoch": 0.23905489923558026, "grad_norm": 0.18737782537937164, "learning_rate": 1.3042847058386542e-05, "loss": 1.8709, "step": 172 }, { "epoch": 0.2404447533009034, "grad_norm": 0.2080836445093155, "learning_rate": 1.2866880751971621e-05, "loss": 1.7018, "step": 173 }, { "epoch": 0.24183460736622656, "grad_norm": 0.20678725838661194, "learning_rate": 1.2692163181782234e-05, "loss": 1.7609, "step": 174 }, { "epoch": 0.2432244614315497, "grad_norm": 0.21453110873699188, "learning_rate": 1.2518705261754803e-05, "loss": 2.1415, "step": 175 }, { "epoch": 0.24461431549687282, "grad_norm": 0.23385198414325714, "learning_rate": 1.234651881532045e-05, "loss": 1.8789, "step": 176 }, { "epoch": 0.24600416956219598, "grad_norm": 0.18875467777252197, "learning_rate": 1.2175616575405002e-05, "loss": 1.7596, "step": 177 }, { "epoch": 0.2473940236275191, "grad_norm": 0.22677035629749298, "learning_rate": 1.2006007636955474e-05, "loss": 2.0212, "step": 178 }, { "epoch": 0.24878387769284224, "grad_norm": 0.16881951689720154, "learning_rate": 1.183770382340299e-05, "loss": 1.7595, "step": 179 }, { "epoch": 0.2501737317581654, "grad_norm": 0.21647167205810547, "learning_rate": 1.1670714229694568e-05, "loss": 2.1274, "step": 180 } ], "logging_steps": 1, "max_steps": 1438, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 180, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.5241420974210417e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }