|
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9998919736415686,
  "eval_steps": 500,
  "global_step": 3471,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
|
    {
      "epoch": 0.0014403514457527638,
      "grad_norm": 2.62252908668901,
      "learning_rate": 2.2988505747126437e-07,
      "loss": 3.4281,
      "step": 5
    },
    {
      "epoch": 0.0028807028915055276,
      "grad_norm": 2.6537402724592827,
      "learning_rate": 5.172413793103449e-07,
      "loss": 3.4375,
      "step": 10
    },
    {
      "epoch": 0.004321054337258291,
      "grad_norm": 2.6461606608506973,
      "learning_rate": 8.045977011494253e-07,
      "loss": 3.4271,
      "step": 15
    },
    {
      "epoch": 0.005761405783011055,
      "grad_norm": 2.6482412561136113,
      "learning_rate": 1.0919540229885058e-06,
      "loss": 3.4316,
      "step": 20
    },
    {
      "epoch": 0.007201757228763818,
      "grad_norm": 2.7090530181422676,
      "learning_rate": 1.3793103448275862e-06,
      "loss": 3.4271,
      "step": 25
    },
    {
      "epoch": 0.008642108674516582,
      "grad_norm": 2.6957713098098757,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 3.4222,
      "step": 30
    },
    {
      "epoch": 0.010082460120269346,
      "grad_norm": 2.823975463539696,
      "learning_rate": 1.9540229885057475e-06,
      "loss": 3.4122,
      "step": 35
    },
    {
      "epoch": 0.01152281156602211,
      "grad_norm": 2.906829198673145,
      "learning_rate": 2.241379310344828e-06,
      "loss": 3.3972,
      "step": 40
    },
    {
      "epoch": 0.012963163011774873,
      "grad_norm": 2.959760309292492,
      "learning_rate": 2.5287356321839083e-06,
      "loss": 3.3748,
      "step": 45
    },
    {
      "epoch": 0.014403514457527637,
      "grad_norm": 3.0140610424144882,
      "learning_rate": 2.8160919540229887e-06,
      "loss": 3.3312,
      "step": 50
    },
    {
      "epoch": 0.0158438659032804,
      "grad_norm": 3.0958728084508227,
      "learning_rate": 3.103448275862069e-06,
      "loss": 3.2621,
      "step": 55
    },
    {
      "epoch": 0.017284217349033165,
      "grad_norm": 3.0789510621647116,
      "learning_rate": 3.3908045977011496e-06,
      "loss": 3.1805,
      "step": 60
    },
    {
      "epoch": 0.01872456879478593,
      "grad_norm": 3.060095215151247,
      "learning_rate": 3.67816091954023e-06,
      "loss": 3.0475,
      "step": 65
    },
    {
      "epoch": 0.020164920240538693,
      "grad_norm": 2.862619501338678,
      "learning_rate": 3.96551724137931e-06,
      "loss": 2.8765,
      "step": 70
    },
    {
      "epoch": 0.021605271686291457,
      "grad_norm": 2.4529701308424627,
      "learning_rate": 4.252873563218391e-06,
      "loss": 2.6438,
      "step": 75
    },
    {
      "epoch": 0.02304562313204422,
      "grad_norm": 1.9259597661186747,
      "learning_rate": 4.540229885057471e-06,
      "loss": 2.4074,
      "step": 80
    },
    {
      "epoch": 0.02448597457779698,
      "grad_norm": 1.1944899501060937,
      "learning_rate": 4.8275862068965525e-06,
      "loss": 2.1598,
      "step": 85
    },
    {
      "epoch": 0.025926326023549745,
      "grad_norm": 0.7043736273275593,
      "learning_rate": 5.114942528735632e-06,
      "loss": 1.986,
      "step": 90
    },
    {
      "epoch": 0.02736667746930251,
      "grad_norm": 0.4401877202562375,
      "learning_rate": 5.402298850574713e-06,
      "loss": 1.8752,
      "step": 95
    },
    {
      "epoch": 0.028807028915055273,
      "grad_norm": 0.29913771411892215,
      "learning_rate": 5.689655172413794e-06,
      "loss": 1.8032,
      "step": 100
    },
    {
      "epoch": 0.030247380360808037,
      "grad_norm": 0.2501344036852339,
      "learning_rate": 5.977011494252874e-06,
      "loss": 1.762,
      "step": 105
    },
    {
      "epoch": 0.0316877318065608,
      "grad_norm": 0.22028235935534177,
      "learning_rate": 6.264367816091954e-06,
      "loss": 1.7284,
      "step": 110
    },
    {
      "epoch": 0.03312808325231356,
      "grad_norm": 0.20809861418922077,
      "learning_rate": 6.551724137931035e-06,
      "loss": 1.7045,
      "step": 115
    },
    {
      "epoch": 0.03456843469806633,
      "grad_norm": 0.20520136163663816,
      "learning_rate": 6.839080459770115e-06,
      "loss": 1.6829,
      "step": 120
    },
    {
      "epoch": 0.03600878614381909,
      "grad_norm": 0.20174383175553343,
      "learning_rate": 7.126436781609196e-06,
      "loss": 1.6605,
      "step": 125
    },
    {
      "epoch": 0.03744913758957186,
      "grad_norm": 0.20319553328609763,
      "learning_rate": 7.413793103448277e-06,
      "loss": 1.6379,
      "step": 130
    },
    {
      "epoch": 0.03888948903532462,
      "grad_norm": 0.2038342591080777,
      "learning_rate": 7.701149425287356e-06,
      "loss": 1.6107,
      "step": 135
    },
    {
      "epoch": 0.040329840481077385,
      "grad_norm": 0.20440794132298992,
      "learning_rate": 7.988505747126438e-06,
      "loss": 1.5846,
      "step": 140
    },
    {
      "epoch": 0.041770191926830146,
      "grad_norm": 0.20590893215016112,
      "learning_rate": 8.275862068965518e-06,
      "loss": 1.5547,
      "step": 145
    },
    {
      "epoch": 0.04321054337258291,
      "grad_norm": 0.20185248245660797,
      "learning_rate": 8.563218390804599e-06,
      "loss": 1.526,
      "step": 150
    },
    {
      "epoch": 0.044650894818335674,
      "grad_norm": 0.1991870123255739,
      "learning_rate": 8.85057471264368e-06,
      "loss": 1.4937,
      "step": 155
    },
    {
      "epoch": 0.04609124626408844,
      "grad_norm": 0.19465384170558617,
      "learning_rate": 9.13793103448276e-06,
      "loss": 1.4598,
      "step": 160
    },
    {
      "epoch": 0.0475315977098412,
      "grad_norm": 0.18797574210376147,
      "learning_rate": 9.42528735632184e-06,
      "loss": 1.4343,
      "step": 165
    },
    {
      "epoch": 0.04897194915559396,
      "grad_norm": 0.18212469780560764,
      "learning_rate": 9.71264367816092e-06,
      "loss": 1.3986,
      "step": 170
    },
    {
      "epoch": 0.05041230060134673,
      "grad_norm": 0.1851275242281937,
      "learning_rate": 1e-05,
      "loss": 1.3663,
      "step": 175
    },
    {
      "epoch": 0.05185265204709949,
      "grad_norm": 0.1894926644233431,
      "learning_rate": 1.0287356321839081e-05,
      "loss": 1.3365,
      "step": 180
    },
    {
      "epoch": 0.05329300349285226,
      "grad_norm": 0.19239941503471172,
      "learning_rate": 1.0574712643678162e-05,
      "loss": 1.3029,
      "step": 185
    },
    {
      "epoch": 0.05473335493860502,
      "grad_norm": 0.1983109078749674,
      "learning_rate": 1.0862068965517242e-05,
      "loss": 1.2663,
      "step": 190
    },
    {
      "epoch": 0.056173706384357786,
      "grad_norm": 0.20536206577251548,
      "learning_rate": 1.1149425287356324e-05,
      "loss": 1.2238,
      "step": 195
    },
    {
      "epoch": 0.057614057830110546,
      "grad_norm": 0.20960804903533373,
      "learning_rate": 1.1436781609195405e-05,
      "loss": 1.1841,
      "step": 200
    },
    {
      "epoch": 0.059054409275863314,
      "grad_norm": 0.2188932735565814,
      "learning_rate": 1.1724137931034483e-05,
      "loss": 1.1332,
      "step": 205
    },
    {
      "epoch": 0.060494760721616074,
      "grad_norm": 0.22876533797063386,
      "learning_rate": 1.2011494252873564e-05,
      "loss": 1.0851,
      "step": 210
    },
    {
      "epoch": 0.061935112167368835,
      "grad_norm": 0.23174345011010952,
      "learning_rate": 1.2298850574712644e-05,
      "loss": 1.0293,
      "step": 215
    },
    {
      "epoch": 0.0633754636131216,
      "grad_norm": 0.23330633292890332,
      "learning_rate": 1.2586206896551725e-05,
      "loss": 0.9716,
      "step": 220
    },
    {
      "epoch": 0.06481581505887436,
      "grad_norm": 0.24464497890825554,
      "learning_rate": 1.2873563218390805e-05,
      "loss": 0.9035,
      "step": 225
    },
    {
      "epoch": 0.06625616650462712,
      "grad_norm": 0.25134894132627916,
      "learning_rate": 1.3160919540229885e-05,
      "loss": 0.8296,
      "step": 230
    },
    {
      "epoch": 0.0676965179503799,
      "grad_norm": 0.2586116096970685,
      "learning_rate": 1.3448275862068967e-05,
      "loss": 0.749,
      "step": 235
    },
    {
      "epoch": 0.06913686939613266,
      "grad_norm": 0.25755530564734574,
      "learning_rate": 1.3735632183908048e-05,
      "loss": 0.657,
      "step": 240
    },
    {
      "epoch": 0.07057722084188542,
      "grad_norm": 0.24571389947226407,
      "learning_rate": 1.4022988505747128e-05,
      "loss": 0.5611,
      "step": 245
    },
    {
      "epoch": 0.07201757228763818,
      "grad_norm": 0.2161386991422811,
      "learning_rate": 1.4310344827586209e-05,
      "loss": 0.4684,
      "step": 250
    },
|
    {
      "epoch": 0.07345792373339095,
      "grad_norm": 0.1845577123208276,
      "learning_rate": 1.459770114942529e-05,
      "loss": 0.3929,
      "step": 255
    },
    {
      "epoch": 0.07489827517914371,
      "grad_norm": 0.15938611776040426,
      "learning_rate": 1.4885057471264368e-05,
      "loss": 0.3274,
      "step": 260
    },
    {
      "epoch": 0.07633862662489647,
      "grad_norm": 0.13184840828375916,
      "learning_rate": 1.5172413793103448e-05,
      "loss": 0.273,
      "step": 265
    },
    {
      "epoch": 0.07777897807064924,
      "grad_norm": 0.10547360872838579,
      "learning_rate": 1.545977011494253e-05,
      "loss": 0.2341,
      "step": 270
    },
    {
      "epoch": 0.079219329516402,
      "grad_norm": 0.08378821715636167,
      "learning_rate": 1.574712643678161e-05,
      "loss": 0.2152,
      "step": 275
    },
    {
      "epoch": 0.08065968096215477,
      "grad_norm": 0.06639227255563633,
      "learning_rate": 1.603448275862069e-05,
      "loss": 0.2028,
      "step": 280
    },
    {
      "epoch": 0.08210003240790753,
      "grad_norm": 0.05326270333294462,
      "learning_rate": 1.632183908045977e-05,
      "loss": 0.1915,
      "step": 285
    },
    {
      "epoch": 0.08354038385366029,
      "grad_norm": 0.045250863339317625,
      "learning_rate": 1.6609195402298854e-05,
      "loss": 0.1831,
      "step": 290
    },
    {
      "epoch": 0.08498073529941305,
      "grad_norm": 0.03997876406980682,
      "learning_rate": 1.6896551724137932e-05,
      "loss": 0.1783,
      "step": 295
    },
    {
      "epoch": 0.08642108674516583,
      "grad_norm": 0.03683411390731312,
      "learning_rate": 1.7183908045977015e-05,
      "loss": 0.1803,
      "step": 300
    },
    {
      "epoch": 0.08786143819091859,
      "grad_norm": 0.033494444332579816,
      "learning_rate": 1.7471264367816093e-05,
      "loss": 0.1792,
      "step": 305
    },
    {
      "epoch": 0.08930178963667135,
      "grad_norm": 0.027545194459475855,
      "learning_rate": 1.7758620689655175e-05,
      "loss": 0.1776,
      "step": 310
    },
    {
      "epoch": 0.09074214108242411,
      "grad_norm": 0.025534566575995447,
      "learning_rate": 1.8045977011494254e-05,
      "loss": 0.1627,
      "step": 315
    },
    {
      "epoch": 0.09218249252817688,
      "grad_norm": 0.02432314883206978,
      "learning_rate": 1.8333333333333333e-05,
      "loss": 0.1694,
      "step": 320
    },
    {
      "epoch": 0.09362284397392964,
      "grad_norm": 0.02296007196296768,
      "learning_rate": 1.8620689655172415e-05,
      "loss": 0.1632,
      "step": 325
    },
    {
      "epoch": 0.0950631954196824,
      "grad_norm": 0.02231231517863281,
      "learning_rate": 1.8908045977011497e-05,
      "loss": 0.1603,
      "step": 330
    },
    {
      "epoch": 0.09650354686543516,
      "grad_norm": 0.020592453220906963,
      "learning_rate": 1.9195402298850576e-05,
      "loss": 0.1578,
      "step": 335
    },
    {
      "epoch": 0.09794389831118792,
      "grad_norm": 0.020380116268697165,
      "learning_rate": 1.9482758620689658e-05,
      "loss": 0.1568,
      "step": 340
    },
    {
      "epoch": 0.0993842497569407,
      "grad_norm": 0.02128754387207953,
      "learning_rate": 1.9770114942528737e-05,
      "loss": 0.1577,
      "step": 345
    },
    {
      "epoch": 0.10082460120269346,
      "grad_norm": 0.020077957237521784,
      "learning_rate": 1.9999994940288617e-05,
      "loss": 0.1645,
      "step": 350
    },
    {
      "epoch": 0.10226495264844622,
      "grad_norm": 0.02081981445796709,
      "learning_rate": 1.999981785092774e-05,
      "loss": 0.1458,
      "step": 355
    },
    {
      "epoch": 0.10370530409419898,
      "grad_norm": 0.019642666023756323,
      "learning_rate": 1.9999387781117715e-05,
      "loss": 0.1515,
      "step": 360
    },
    {
      "epoch": 0.10514565553995175,
      "grad_norm": 0.01993767310786855,
      "learning_rate": 1.9998704741738657e-05,
      "loss": 0.1558,
      "step": 365
    },
    {
      "epoch": 0.10658600698570452,
      "grad_norm": 0.018085760313418967,
      "learning_rate": 1.9997768750070442e-05,
      "loss": 0.1553,
      "step": 370
    },
    {
      "epoch": 0.10802635843145728,
      "grad_norm": 0.0203452490973548,
      "learning_rate": 1.9996579829792263e-05,
      "loss": 0.15,
      "step": 375
    },
    {
      "epoch": 0.10946670987721004,
      "grad_norm": 0.018590604714013987,
      "learning_rate": 1.9995138010982028e-05,
      "loss": 0.1492,
      "step": 380
    },
    {
      "epoch": 0.1109070613229628,
      "grad_norm": 0.017630128077178075,
      "learning_rate": 1.9993443330115592e-05,
      "loss": 0.1474,
      "step": 385
    },
    {
      "epoch": 0.11234741276871557,
      "grad_norm": 0.019441688441053217,
      "learning_rate": 1.9991495830065857e-05,
      "loss": 0.1509,
      "step": 390
    },
    {
      "epoch": 0.11378776421446833,
      "grad_norm": 0.023197073067636517,
      "learning_rate": 1.9989295560101656e-05,
      "loss": 0.1512,
      "step": 395
    },
    {
      "epoch": 0.11522811566022109,
      "grad_norm": 0.022598358372408592,
      "learning_rate": 1.998684257588654e-05,
      "loss": 0.1457,
      "step": 400
    },
    {
      "epoch": 0.11666846710597385,
      "grad_norm": 0.018391884570447603,
      "learning_rate": 1.9984136939477333e-05,
      "loss": 0.1487,
      "step": 405
    },
    {
      "epoch": 0.11810881855172663,
      "grad_norm": 0.01730298666489709,
      "learning_rate": 1.9981178719322606e-05,
      "loss": 0.1408,
      "step": 410
    },
    {
      "epoch": 0.11954916999747939,
      "grad_norm": 0.018077200725101054,
      "learning_rate": 1.99779679902609e-05,
      "loss": 0.1543,
      "step": 415
    },
    {
      "epoch": 0.12098952144323215,
      "grad_norm": 0.018875155851426084,
      "learning_rate": 1.9974504833518863e-05,
      "loss": 0.1526,
      "step": 420
    },
    {
      "epoch": 0.12242987288898491,
      "grad_norm": 0.01550384193559367,
      "learning_rate": 1.9970789336709185e-05,
      "loss": 0.1503,
      "step": 425
    },
    {
      "epoch": 0.12387022433473767,
      "grad_norm": 0.01563256542028533,
      "learning_rate": 1.9966821593828393e-05,
      "loss": 0.1475,
      "step": 430
    },
    {
      "epoch": 0.12531057578049043,
      "grad_norm": 0.015801051344428534,
      "learning_rate": 1.9962601705254442e-05,
      "loss": 0.1384,
      "step": 435
    },
    {
      "epoch": 0.1267509272262432,
      "grad_norm": 0.018919536477648418,
      "learning_rate": 1.995812977774421e-05,
      "loss": 0.146,
      "step": 440
    },
    {
      "epoch": 0.12819127867199598,
      "grad_norm": 0.01564902677446783,
      "learning_rate": 1.995340592443078e-05,
      "loss": 0.1446,
      "step": 445
    },
    {
      "epoch": 0.12963163011774873,
      "grad_norm": 0.01649586526356844,
      "learning_rate": 1.9948430264820588e-05,
      "loss": 0.1443,
      "step": 450
    },
    {
      "epoch": 0.1310719815635015,
      "grad_norm": 0.016233025873589237,
      "learning_rate": 1.994320292479038e-05,
      "loss": 0.1398,
      "step": 455
    },
    {
      "epoch": 0.13251233300925425,
      "grad_norm": 0.018370727196108234,
      "learning_rate": 1.993772403658405e-05,
      "loss": 0.1413,
      "step": 460
    },
    {
      "epoch": 0.13395268445500702,
      "grad_norm": 0.015413145114948405,
      "learning_rate": 1.9931993738809288e-05,
      "loss": 0.1377,
      "step": 465
    },
    {
      "epoch": 0.1353930359007598,
      "grad_norm": 0.01579533249322328,
      "learning_rate": 1.9926012176434054e-05,
      "loss": 0.1343,
      "step": 470
    },
    {
      "epoch": 0.13683338734651254,
      "grad_norm": 0.016623384137655177,
      "learning_rate": 1.991977950078295e-05,
      "loss": 0.1362,
      "step": 475
    },
    {
      "epoch": 0.13827373879226532,
      "grad_norm": 0.016561476846420342,
      "learning_rate": 1.9913295869533345e-05,
      "loss": 0.1383,
      "step": 480
    },
    {
      "epoch": 0.13971409023801806,
      "grad_norm": 0.016900232091944686,
      "learning_rate": 1.990656144671143e-05,
      "loss": 0.1448,
      "step": 485
    },
    {
      "epoch": 0.14115444168377084,
      "grad_norm": 0.018123435569779828,
      "learning_rate": 1.9899576402688038e-05,
      "loss": 0.1364,
      "step": 490
    },
    {
      "epoch": 0.1425947931295236,
      "grad_norm": 0.01869327376515684,
      "learning_rate": 1.9892340914174344e-05,
      "loss": 0.1457,
      "step": 495
    },
    {
      "epoch": 0.14403514457527636,
      "grad_norm": 0.017383446282339288,
      "learning_rate": 1.988485516421739e-05,
      "loss": 0.1369,
      "step": 500
    },
    {
      "epoch": 0.14403514457527636,
      "eval_loss": 0.13828575611114502,
      "eval_runtime": 863.0505,
      "eval_samples_per_second": 2.09,
      "eval_steps_per_second": 0.523,
      "step": 500
    },
|
    {
      "epoch": 0.14547549602102913,
      "grad_norm": 0.01727758799136872,
      "learning_rate": 1.9877119342195478e-05,
      "loss": 0.141,
      "step": 505
    },
    {
      "epoch": 0.1469158474667819,
      "grad_norm": 0.020724335957621343,
      "learning_rate": 1.986913364381333e-05,
      "loss": 0.1417,
      "step": 510
    },
    {
      "epoch": 0.14835619891253465,
      "grad_norm": 0.015171059899028828,
      "learning_rate": 1.9860898271097194e-05,
      "loss": 0.1361,
      "step": 515
    },
    {
      "epoch": 0.14979655035828743,
      "grad_norm": 0.019695421273977713,
      "learning_rate": 1.9852413432389685e-05,
      "loss": 0.1314,
      "step": 520
    },
    {
      "epoch": 0.15123690180404017,
      "grad_norm": 0.016105641997242213,
      "learning_rate": 1.984367934234455e-05,
      "loss": 0.1355,
      "step": 525
    },
    {
      "epoch": 0.15267725324979295,
      "grad_norm": 0.017169458650654956,
      "learning_rate": 1.9834696221921213e-05,
      "loss": 0.135,
      "step": 530
    },
    {
      "epoch": 0.15411760469554572,
      "grad_norm": 0.016758621454569307,
      "learning_rate": 1.98254642983792e-05,
      "loss": 0.1319,
      "step": 535
    },
    {
      "epoch": 0.15555795614129847,
      "grad_norm": 0.018163113510772863,
      "learning_rate": 1.9815983805272378e-05,
      "loss": 0.1303,
      "step": 540
    },
    {
      "epoch": 0.15699830758705124,
      "grad_norm": 0.017990247516200045,
      "learning_rate": 1.980625498244306e-05,
      "loss": 0.139,
      "step": 545
    },
    {
      "epoch": 0.158438659032804,
      "grad_norm": 0.015967634026842183,
      "learning_rate": 1.9796278076015924e-05,
      "loss": 0.1319,
      "step": 550
    },
    {
      "epoch": 0.15987901047855677,
      "grad_norm": 0.014829856672342604,
      "learning_rate": 1.9786053338391792e-05,
      "loss": 0.1352,
      "step": 555
    },
    {
      "epoch": 0.16131936192430954,
      "grad_norm": 0.0172321563882635,
      "learning_rate": 1.9775581028241253e-05,
      "loss": 0.1253,
      "step": 560
    },
    {
      "epoch": 0.1627597133700623,
      "grad_norm": 0.018596015242561326,
      "learning_rate": 1.97648614104981e-05,
      "loss": 0.1281,
      "step": 565
    },
    {
      "epoch": 0.16420006481581506,
      "grad_norm": 0.018018526896851798,
      "learning_rate": 1.9753894756352643e-05,
      "loss": 0.1345,
      "step": 570
    },
    {
      "epoch": 0.16564041626156784,
      "grad_norm": 0.016819098654316897,
      "learning_rate": 1.9742681343244853e-05,
      "loss": 0.1248,
      "step": 575
    },
    {
      "epoch": 0.16708076770732058,
      "grad_norm": 0.015776677724923773,
      "learning_rate": 1.9731221454857322e-05,
      "loss": 0.1313,
      "step": 580
    },
    {
      "epoch": 0.16852111915307336,
      "grad_norm": 0.017615190905280357,
      "learning_rate": 1.9719515381108093e-05,
      "loss": 0.1368,
      "step": 585
    },
    {
      "epoch": 0.1699614705988261,
      "grad_norm": 0.01586176965856958,
      "learning_rate": 1.970756341814335e-05,
      "loss": 0.1308,
      "step": 590
    },
    {
      "epoch": 0.17140182204457888,
      "grad_norm": 0.01582181491527461,
      "learning_rate": 1.9695365868329895e-05,
      "loss": 0.1334,
      "step": 595
    },
    {
      "epoch": 0.17284217349033165,
      "grad_norm": 0.01832860755228363,
      "learning_rate": 1.9682923040247513e-05,
      "loss": 0.1303,
      "step": 600
    },
    {
      "epoch": 0.1742825249360844,
      "grad_norm": 0.01608043519957003,
      "learning_rate": 1.9670235248681154e-05,
      "loss": 0.128,
      "step": 605
    },
    {
      "epoch": 0.17572287638183717,
      "grad_norm": 0.015462595330781244,
      "learning_rate": 1.965730281461299e-05,
      "loss": 0.1318,
      "step": 610
    },
    {
      "epoch": 0.17716322782758992,
      "grad_norm": 0.01829748858884658,
      "learning_rate": 1.964412606521428e-05,
      "loss": 0.128,
      "step": 615
    },
    {
      "epoch": 0.1786035792733427,
      "grad_norm": 0.01741437351404647,
      "learning_rate": 1.9630705333837096e-05,
      "loss": 0.129,
      "step": 620
    },
    {
      "epoch": 0.18004393071909547,
      "grad_norm": 0.01714971316628412,
      "learning_rate": 1.9617040960005883e-05,
      "loss": 0.1274,
      "step": 625
    },
    {
      "epoch": 0.18148428216484822,
      "grad_norm": 0.017341345727170193,
      "learning_rate": 1.9603133289408883e-05,
      "loss": 0.1285,
      "step": 630
    },
    {
      "epoch": 0.182924633610601,
      "grad_norm": 0.018545013302290372,
      "learning_rate": 1.9588982673889373e-05,
      "loss": 0.129,
      "step": 635
    },
    {
      "epoch": 0.18436498505635376,
      "grad_norm": 0.01917072532666851,
      "learning_rate": 1.9574589471436794e-05,
      "loss": 0.1246,
      "step": 640
    },
    {
      "epoch": 0.1858053365021065,
      "grad_norm": 0.017610415450957716,
      "learning_rate": 1.955995404617765e-05,
      "loss": 0.1296,
      "step": 645
    },
    {
      "epoch": 0.18724568794785929,
      "grad_norm": 0.017348619206735666,
      "learning_rate": 1.9545076768366336e-05,
      "loss": 0.1201,
      "step": 650
    },
    {
      "epoch": 0.18868603939361203,
      "grad_norm": 0.018402248035990453,
      "learning_rate": 1.9529958014375748e-05,
      "loss": 0.1288,
      "step": 655
    },
    {
      "epoch": 0.1901263908393648,
      "grad_norm": 0.017347671383931933,
      "learning_rate": 1.9514598166687772e-05,
      "loss": 0.1322,
      "step": 660
    },
    {
      "epoch": 0.19156674228511758,
      "grad_norm": 0.01862965351110765,
      "learning_rate": 1.9498997613883597e-05,
      "loss": 0.1368,
      "step": 665
    },
    {
      "epoch": 0.19300709373087033,
      "grad_norm": 0.016601710581881732,
      "learning_rate": 1.9483156750633906e-05,
      "loss": 0.1326,
      "step": 670
    },
    {
      "epoch": 0.1944474451766231,
      "grad_norm": 0.015529988935684115,
      "learning_rate": 1.946707597768886e-05,
      "loss": 0.1224,
      "step": 675
    },
    {
      "epoch": 0.19588779662237585,
      "grad_norm": 0.02017712606978146,
      "learning_rate": 1.9450755701867994e-05,
      "loss": 0.1291,
      "step": 680
    },
    {
      "epoch": 0.19732814806812862,
      "grad_norm": 0.0167232245419185,
      "learning_rate": 1.9434196336049897e-05,
      "loss": 0.1201,
      "step": 685
    },
    {
      "epoch": 0.1987684995138814,
      "grad_norm": 0.019048290037720794,
      "learning_rate": 1.941739829916177e-05,
      "loss": 0.1241,
      "step": 690
    },
    {
      "epoch": 0.20020885095963414,
      "grad_norm": 0.017899758339100985,
      "learning_rate": 1.940036201616886e-05,
      "loss": 0.1255,
      "step": 695
    },
    {
      "epoch": 0.20164920240538692,
      "grad_norm": 0.018753604988563825,
      "learning_rate": 1.9383087918063662e-05,
      "loss": 0.1279,
      "step": 700
    },
    {
      "epoch": 0.20308955385113966,
      "grad_norm": 0.016720002206463015,
      "learning_rate": 1.9365576441855046e-05,
      "loss": 0.1265,
      "step": 705
    },
    {
      "epoch": 0.20452990529689244,
      "grad_norm": 0.01598156321133028,
      "learning_rate": 1.9347828030557196e-05,
      "loss": 0.1247,
      "step": 710
    },
    {
      "epoch": 0.2059702567426452,
      "grad_norm": 0.015176452766976297,
      "learning_rate": 1.932984313317839e-05,
      "loss": 0.1269,
      "step": 715
    },
    {
      "epoch": 0.20741060818839796,
      "grad_norm": 0.01743363415449858,
      "learning_rate": 1.931162220470967e-05,
      "loss": 0.1252,
      "step": 720
    },
    {
      "epoch": 0.20885095963415073,
      "grad_norm": 0.020263642348868235,
      "learning_rate": 1.9293165706113287e-05,
      "loss": 0.1245,
      "step": 725
    },
    {
      "epoch": 0.2102913110799035,
      "grad_norm": 0.016973378857152475,
      "learning_rate": 1.9274474104311083e-05,
      "loss": 0.1229,
      "step": 730
    },
    {
      "epoch": 0.21173166252565626,
      "grad_norm": 0.01631083673087625,
      "learning_rate": 1.925554787217265e-05,
      "loss": 0.124,
      "step": 735
    },
    {
      "epoch": 0.21317201397140903,
      "grad_norm": 0.019965918558652533,
      "learning_rate": 1.9236387488503378e-05,
      "loss": 0.1223,
      "step": 740
    },
    {
      "epoch": 0.21461236541716178,
      "grad_norm": 0.021848215070733194,
      "learning_rate": 1.921699343803235e-05,
      "loss": 0.1138,
      "step": 745
    },
    {
      "epoch": 0.21605271686291455,
      "grad_norm": 0.018291368502582516,
      "learning_rate": 1.9197366211400058e-05,
      "loss": 0.1241,
      "step": 750
    },
|
    {
      "epoch": 0.21749306830866733,
      "grad_norm": 0.019059330643677385,
      "learning_rate": 1.9177506305146008e-05,
      "loss": 0.1268,
      "step": 755
    },
    {
      "epoch": 0.21893341975442007,
      "grad_norm": 0.017080958515049458,
      "learning_rate": 1.9157414221696155e-05,
      "loss": 0.1206,
      "step": 760
    },
    {
      "epoch": 0.22037377120017285,
      "grad_norm": 0.017957542038359017,
      "learning_rate": 1.9137090469350185e-05,
      "loss": 0.1244,
      "step": 765
    },
    {
      "epoch": 0.2218141226459256,
      "grad_norm": 0.01802711782041451,
      "learning_rate": 1.9116535562268658e-05,
      "loss": 0.1206,
      "step": 770
    },
    {
      "epoch": 0.22325447409167837,
      "grad_norm": 0.019287045682764212,
      "learning_rate": 1.9095750020460017e-05,
      "loss": 0.1144,
      "step": 775
    },
    {
      "epoch": 0.22469482553743114,
      "grad_norm": 0.01879207545472608,
      "learning_rate": 1.9074734369767407e-05,
      "loss": 0.1192,
      "step": 780
    },
    {
      "epoch": 0.2261351769831839,
      "grad_norm": 0.019461601988728348,
      "learning_rate": 1.9053489141855386e-05,
      "loss": 0.125,
      "step": 785
    },
    {
      "epoch": 0.22757552842893666,
      "grad_norm": 0.019720624818386844,
      "learning_rate": 1.9032014874196476e-05,
      "loss": 0.1269,
      "step": 790
    },
    {
      "epoch": 0.22901587987468944,
      "grad_norm": 0.018318990062697504,
      "learning_rate": 1.901031211005756e-05,
      "loss": 0.1203,
      "step": 795
    },
    {
      "epoch": 0.23045623132044218,
      "grad_norm": 0.019164348842944955,
      "learning_rate": 1.898838139848614e-05,
      "loss": 0.1213,
      "step": 800
    },
    {
      "epoch": 0.23189658276619496,
      "grad_norm": 0.02213208810579334,
      "learning_rate": 1.8966223294296445e-05,
      "loss": 0.1213,
      "step": 805
    },
    {
      "epoch": 0.2333369342119477,
      "grad_norm": 0.018200798899224192,
      "learning_rate": 1.8943838358055403e-05,
      "loss": 0.1275,
      "step": 810
    },
    {
      "epoch": 0.23477728565770048,
      "grad_norm": 0.018399660059020002,
      "learning_rate": 1.892122715606846e-05,
      "loss": 0.1226,
      "step": 815
    },
    {
      "epoch": 0.23621763710345325,
      "grad_norm": 0.017674503935350343,
      "learning_rate": 1.8898390260365227e-05,
      "loss": 0.1175,
      "step": 820
    },
    {
      "epoch": 0.237657988549206,
      "grad_norm": 0.019168528020990075,
      "learning_rate": 1.8875328248685047e-05,
      "loss": 0.1228,
      "step": 825
    },
    {
      "epoch": 0.23909833999495878,
      "grad_norm": 0.01707510254798075,
      "learning_rate": 1.885204170446235e-05,
      "loss": 0.1229,
      "step": 830
    },
    {
      "epoch": 0.24053869144071152,
      "grad_norm": 0.017143155499374434,
      "learning_rate": 1.8828531216811912e-05,
      "loss": 0.1163,
      "step": 835
    },
    {
      "epoch": 0.2419790428864643,
      "grad_norm": 0.01972260397119316,
      "learning_rate": 1.8804797380513944e-05,
      "loss": 0.1151,
      "step": 840
    },
    {
      "epoch": 0.24341939433221707,
      "grad_norm": 0.017300491914188608,
      "learning_rate": 1.878084079599903e-05,
      "loss": 0.1197,
      "step": 845
    },
    {
      "epoch": 0.24485974577796982,
      "grad_norm": 0.017415521885546983,
      "learning_rate": 1.8756662069332966e-05,
      "loss": 0.12,
      "step": 850
    },
    {
      "epoch": 0.2463000972237226,
      "grad_norm": 0.020650688836768367,
      "learning_rate": 1.8732261812201408e-05,
      "loss": 0.1159,
      "step": 855
    },
    {
      "epoch": 0.24774044866947534,
      "grad_norm": 0.022394976870943134,
      "learning_rate": 1.8707640641894395e-05,
      "loss": 0.1182,
      "step": 860
    },
    {
      "epoch": 0.2491808001152281,
      "grad_norm": 0.018481000136883082,
      "learning_rate": 1.8682799181290747e-05,
      "loss": 0.117,
      "step": 865
    },
    {
      "epoch": 0.25062115156098086,
      "grad_norm": 0.020812551732477143,
      "learning_rate": 1.86577380588423e-05,
      "loss": 0.1174,
      "step": 870
    },
    {
      "epoch": 0.25206150300673363,
      "grad_norm": 0.019146509121457756,
      "learning_rate": 1.8632457908558006e-05,
      "loss": 0.1128,
      "step": 875
    },
    {
      "epoch": 0.2535018544524864,
      "grad_norm": 0.02088872645363683,
      "learning_rate": 1.8606959369987885e-05,
      "loss": 0.1258,
      "step": 880
    },
    {
      "epoch": 0.2549422058982392,
      "grad_norm": 0.020975712319762363,
      "learning_rate": 1.8581243088206865e-05,
      "loss": 0.1252,
      "step": 885
    },
    {
      "epoch": 0.25638255734399196,
      "grad_norm": 0.022104093266461965,
      "learning_rate": 1.8555309713798445e-05,
      "loss": 0.1175,
      "step": 890
    },
    {
      "epoch": 0.2578229087897447,
      "grad_norm": 0.01935436681312094,
      "learning_rate": 1.8529159902838253e-05,
      "loss": 0.1155,
      "step": 895
    },
    {
      "epoch": 0.25926326023549745,
      "grad_norm": 0.02103943800048682,
      "learning_rate": 1.8502794316877423e-05,
      "loss": 0.1163,
      "step": 900
    },
    {
      "epoch": 0.2607036116812502,
      "grad_norm": 0.02064951760573597,
      "learning_rate": 1.8476213622925885e-05,
      "loss": 0.1151,
      "step": 905
    },
    {
      "epoch": 0.262143963127003,
      "grad_norm": 0.02391509020619219,
      "learning_rate": 1.844941849343548e-05,
      "loss": 0.1154,
      "step": 910
    },
    {
      "epoch": 0.2635843145727558,
      "grad_norm": 0.019299067245624735,
      "learning_rate": 1.842240960628294e-05,
      "loss": 0.118,
      "step": 915
    },
    {
      "epoch": 0.2650246660185085,
      "grad_norm": 0.018999660132613904,
      "learning_rate": 1.8395187644752756e-05,
      "loss": 0.1121,
      "step": 920
    },
    {
      "epoch": 0.26646501746426127,
      "grad_norm": 0.019465104345890567,
      "learning_rate": 1.8367753297519874e-05,
      "loss": 0.1147,
      "step": 925
    },
    {
      "epoch": 0.26790536891001404,
      "grad_norm": 0.017798540374291294,
      "learning_rate": 1.8340107258632288e-05,
      "loss": 0.1192,
      "step": 930
    },
    {
      "epoch": 0.2693457203557668,
      "grad_norm": 0.022579684079520896,
      "learning_rate": 1.831225022749347e-05,
      "loss": 0.1127,
      "step": 935
    },
    {
      "epoch": 0.2707860718015196,
      "grad_norm": 0.019409928418770574,
      "learning_rate": 1.828418290884468e-05,
      "loss": 0.1141,
      "step": 940
    },
    {
      "epoch": 0.2722264232472723,
      "grad_norm": 0.019982776222918414,
      "learning_rate": 1.8255906012747137e-05,
      "loss": 0.1107,
      "step": 945
    },
    {
      "epoch": 0.2736667746930251,
      "grad_norm": 0.019327451944760602,
      "learning_rate": 1.8227420254564066e-05,
      "loss": 0.1088,
      "step": 950
    },
    {
      "epoch": 0.27510712613877786,
      "grad_norm": 0.02195579382510436,
      "learning_rate": 1.819872635494258e-05,
      "loss": 0.1149,
      "step": 955
    },
    {
      "epoch": 0.27654747758453063,
      "grad_norm": 0.019331091074997867,
      "learning_rate": 1.816982503979546e-05,
      "loss": 0.1187,
      "step": 960
    },
    {
      "epoch": 0.2779878290302834,
      "grad_norm": 0.0198267215148203,
      "learning_rate": 1.8140717040282797e-05,
      "loss": 0.1133,
      "step": 965
    },
    {
      "epoch": 0.2794281804760361,
      "grad_norm": 0.021536265466576106,
      "learning_rate": 1.811140309279348e-05,
      "loss": 0.1169,
      "step": 970
    },
    {
      "epoch": 0.2808685319217889,
      "grad_norm": 0.02522795945903232,
      "learning_rate": 1.808188393892658e-05,
      "loss": 0.1162,
      "step": 975
    },
    {
      "epoch": 0.2823088833675417,
      "grad_norm": 0.021202903232191053,
      "learning_rate": 1.805216032547258e-05,
      "loss": 0.1177,
      "step": 980
    },
    {
      "epoch": 0.28374923481329445,
      "grad_norm": 0.02708284619788456,
      "learning_rate": 1.8022233004394487e-05,
      "loss": 0.1189,
      "step": 985
    },
    {
      "epoch": 0.2851895862590472,
      "grad_norm": 0.02891719428802073,
      "learning_rate": 1.7992102732808798e-05,
      "loss": 0.116,
      "step": 990
    },
    {
      "epoch": 0.2866299377048,
      "grad_norm": 0.020017133174120256,
      "learning_rate": 1.796177027296637e-05,
      "loss": 0.1192,
      "step": 995
    },
    {
      "epoch": 0.2880702891505527,
      "grad_norm": 0.022814840523695007,
      "learning_rate": 1.79312363922331e-05,
      "loss": 0.1196,
      "step": 1000
    },
    {
      "epoch": 0.2880702891505527,
      "eval_loss": 0.1148042306303978,
      "eval_runtime": 862.8924,
      "eval_samples_per_second": 2.091,
      "eval_steps_per_second": 0.523,
      "step": 1000
    },
|
    {
      "epoch": 0.2895106405963055,
      "grad_norm": 0.020165318251700003,
      "learning_rate": 1.7900501863070552e-05,
      "loss": 0.1092,
      "step": 1005
    },
    {
      "epoch": 0.29095099204205827,
      "grad_norm": 0.020276491773740017,
      "learning_rate": 1.7869567463016394e-05,
      "loss": 0.108,
      "step": 1010
    },
    {
      "epoch": 0.29239134348781104,
      "grad_norm": 0.025430907092320074,
      "learning_rate": 1.7838433974664714e-05,
      "loss": 0.1198,
      "step": 1015
    },
    {
      "epoch": 0.2938316949335638,
      "grad_norm": 0.02600654612802859,
      "learning_rate": 1.7807102185646247e-05,
      "loss": 0.1164,
      "step": 1020
    },
    {
      "epoch": 0.29527204637931653,
      "grad_norm": 0.0193830589897909,
      "learning_rate": 1.7775572888608438e-05,
      "loss": 0.1151,
      "step": 1025
    },
    {
      "epoch": 0.2967123978250693,
      "grad_norm": 0.023599658745454725,
      "learning_rate": 1.774384688119539e-05,
      "loss": 0.1183,
      "step": 1030
    },
    {
      "epoch": 0.2981527492708221,
      "grad_norm": 0.021674931585504164,
      "learning_rate": 1.7711924966027678e-05,
      "loss": 0.1141,
      "step": 1035
    },
    {
      "epoch": 0.29959310071657486,
      "grad_norm": 0.023299214871684407,
      "learning_rate": 1.767980795068206e-05,
      "loss": 0.1194,
      "step": 1040
    },
    {
      "epoch": 0.30103345216232763,
      "grad_norm": 0.02147089523180919,
      "learning_rate": 1.7647496647671033e-05,
      "loss": 0.1123,
      "step": 1045
    },
    {
      "epoch": 0.30247380360808035,
      "grad_norm": 0.021457716083701384,
      "learning_rate": 1.761499187442228e-05,
      "loss": 0.1058,
      "step": 1050
    },
    {
      "epoch": 0.3039141550538331,
      "grad_norm": 0.01969637917151117,
      "learning_rate": 1.7582294453257996e-05,
      "loss": 0.1207,
      "step": 1055
    },
    {
      "epoch": 0.3053545064995859,
      "grad_norm": 0.023839310938145133,
      "learning_rate": 1.7549405211374072e-05,
      "loss": 0.1146,
      "step": 1060
    },
    {
      "epoch": 0.3067948579453387,
      "grad_norm": 0.01832092911473367,
      "learning_rate": 1.7516324980819185e-05,
      "loss": 0.113,
      "step": 1065
    },
    {
      "epoch": 0.30823520939109145,
      "grad_norm": 0.02124767953755396,
      "learning_rate": 1.7483054598473734e-05,
      "loss": 0.1213,
      "step": 1070
    },
    {
      "epoch": 0.30967556083684417,
      "grad_norm": 0.0247043392947563,
      "learning_rate": 1.7449594906028684e-05,
      "loss": 0.1121,
      "step": 1075
    },
    {
      "epoch": 0.31111591228259694,
      "grad_norm": 0.022139995366462776,
      "learning_rate": 1.7415946749964252e-05,
      "loss": 0.1132,
      "step": 1080
    },
    {
      "epoch": 0.3125562637283497,
      "grad_norm": 0.01979540959946202,
      "learning_rate": 1.7382110981528506e-05,
      "loss": 0.1085,
      "step": 1085
    },
    {
      "epoch": 0.3139966151741025,
      "grad_norm": 0.029924025152490485,
      "learning_rate": 1.734808845671583e-05,
      "loss": 0.1054,
      "step": 1090
    },
    {
      "epoch": 0.31543696661985526,
      "grad_norm": 0.023093772502185278,
      "learning_rate": 1.7313880036245257e-05,
      "loss": 0.112,
      "step": 1095
    },
    {
      "epoch": 0.316877318065608,
      "grad_norm": 0.023473379108296808,
      "learning_rate": 1.7279486585538712e-05,
      "loss": 0.1135,
      "step": 1100
    },
    {
      "epoch": 0.31831766951136076,
      "grad_norm": 0.021733393242624945,
      "learning_rate": 1.7244908974699112e-05,
      "loss": 0.1134,
      "step": 1105
    },
    {
      "epoch": 0.31975802095711353,
      "grad_norm": 0.021271929042974584,
      "learning_rate": 1.721014807848833e-05,
      "loss": 0.1164,
      "step": 1110
    },
    {
      "epoch": 0.3211983724028663,
      "grad_norm": 0.020487120222693576,
      "learning_rate": 1.7175204776305102e-05,
      "loss": 0.1077,
      "step": 1115
    },
    {
      "epoch": 0.3226387238486191,
      "grad_norm": 0.021171582329437753,
      "learning_rate": 1.7140079952162765e-05,
      "loss": 0.1102,
      "step": 1120
    },
    {
      "epoch": 0.32407907529437185,
      "grad_norm": 0.02144446773648067,
      "learning_rate": 1.7104774494666877e-05,
      "loss": 0.1092,
      "step": 1125
    },
    {
      "epoch": 0.3255194267401246,
      "grad_norm": 0.019040039144316633,
      "learning_rate": 1.7069289296992756e-05,
      "loss": 0.112,
      "step": 1130
    },
    {
      "epoch": 0.32695977818587735,
      "grad_norm": 0.021286149625859865,
      "learning_rate": 1.703362525686288e-05,
      "loss": 0.1106,
      "step": 1135
    },
    {
      "epoch": 0.3284001296316301,
      "grad_norm": 0.02183865564006019,
      "learning_rate": 1.6997783276524177e-05,
      "loss": 0.1134,
      "step": 1140
    },
    {
      "epoch": 0.3298404810773829,
      "grad_norm": 0.021387717982638638,
      "learning_rate": 1.6961764262725187e-05,
      "loss": 0.1063,
      "step": 1145
    },
    {
      "epoch": 0.33128083252313567,
      "grad_norm": 0.024865173996299898,
      "learning_rate": 1.6925569126693135e-05,
      "loss": 0.1051,
      "step": 1150
    },
    {
      "epoch": 0.3327211839688884,
      "grad_norm": 0.023180712867703127,
      "learning_rate": 1.6889198784110883e-05,
      "loss": 0.1058,
      "step": 1155
    },
    {
      "epoch": 0.33416153541464116,
      "grad_norm": 0.02218441265430226,
      "learning_rate": 1.6852654155093745e-05,
      "loss": 0.1117,
      "step": 1160
    },
    {
      "epoch": 0.33560188686039394,
      "grad_norm": 0.02165110006319973,
      "learning_rate": 1.681593616416623e-05,
      "loss": 0.112,
      "step": 1165
    },
    {
      "epoch": 0.3370422383061467,
      "grad_norm": 0.02200923978026833,
      "learning_rate": 1.6779045740238643e-05,
      "loss": 0.114,
      "step": 1170
    },
    {
      "epoch": 0.3384825897518995,
      "grad_norm": 0.02197865224880007,
      "learning_rate": 1.6741983816583583e-05,
      "loss": 0.111,
      "step": 1175
    },
    {
      "epoch": 0.3399229411976522,
      "grad_norm": 0.019554896311713962,
      "learning_rate": 1.6704751330812342e-05,
      "loss": 0.1019,
      "step": 1180
    },
    {
      "epoch": 0.341363292643405,
      "grad_norm": 0.024134291900428952,
      "learning_rate": 1.666734922485117e-05,
      "loss": 0.1092,
      "step": 1185
    },
    {
      "epoch": 0.34280364408915776,
      "grad_norm": 0.022673698274783088,
      "learning_rate": 1.662977844491746e-05,
      "loss": 0.113,
      "step": 1190
    },
    {
      "epoch": 0.34424399553491053,
      "grad_norm": 0.02603003971238878,
      "learning_rate": 1.6592039941495803e-05,
      "loss": 0.1036,
      "step": 1195
    },
    {
      "epoch": 0.3456843469806633,
      "grad_norm": 0.024554078349019966,
      "learning_rate": 1.6554134669313943e-05,
      "loss": 0.1121,
      "step": 1200
    },
    {
      "epoch": 0.347124698426416,
      "grad_norm": 0.019717425570643314,
      "learning_rate": 1.6516063587318627e-05,
      "loss": 0.1068,
      "step": 1205
    },
    {
      "epoch": 0.3485650498721688,
      "grad_norm": 0.027429511589759405,
      "learning_rate": 1.647782765865134e-05,
      "loss": 0.106,
      "step": 1210
    },
    {
      "epoch": 0.35000540131792157,
      "grad_norm": 0.026628925564535798,
      "learning_rate": 1.6439427850623944e-05,
      "loss": 0.1123,
      "step": 1215
    },
    {
      "epoch": 0.35144575276367435,
      "grad_norm": 0.026461597229239776,
      "learning_rate": 1.64008651346942e-05,
      "loss": 0.1099,
      "step": 1220
    },
    {
      "epoch": 0.3528861042094271,
      "grad_norm": 0.02584266985928258,
      "learning_rate": 1.63621404864412e-05,
      "loss": 0.1107,
      "step": 1225
    },
    {
      "epoch": 0.35432645565517984,
      "grad_norm": 0.019646883384690197,
      "learning_rate": 1.6323254885540672e-05,
      "loss": 0.1121,
      "step": 1230
    },
    {
      "epoch": 0.3557668071009326,
      "grad_norm": 0.024026012804653142,
      "learning_rate": 1.6284209315740225e-05,
      "loss": 0.1047,
      "step": 1235
    },
    {
      "epoch": 0.3572071585466854,
      "grad_norm": 0.024476248259603612,
      "learning_rate": 1.6245004764834423e-05,
      "loss": 0.109,
      "step": 1240
    },
    {
      "epoch": 0.35864750999243816,
      "grad_norm": 0.022810087697062553,
      "learning_rate": 1.620564222463982e-05,
      "loss": 0.1102,
      "step": 1245
    },
    {
      "epoch": 0.36008786143819094,
      "grad_norm": 0.024569475912673062,
      "learning_rate": 1.6166122690969872e-05,
      "loss": 0.1057,
      "step": 1250
    },
|
    {
      "epoch": 0.36152821288394366,
      "grad_norm": 0.026160188797535698,
      "learning_rate": 1.612644716360972e-05,
      "loss": 0.0995,
      "step": 1255
    },
    {
      "epoch": 0.36296856432969643,
      "grad_norm": 0.025602704231673833,
      "learning_rate": 1.6086616646290926e-05,
      "loss": 0.116,
      "step": 1260
    },
    {
      "epoch": 0.3644089157754492,
      "grad_norm": 0.02307850757189033,
      "learning_rate": 1.6046632146666056e-05,
      "loss": 0.1088,
      "step": 1265
    },
    {
      "epoch": 0.365849267221202,
      "grad_norm": 0.024139060873310914,
      "learning_rate": 1.60064946762832e-05,
      "loss": 0.1044,
      "step": 1270
    },
    {
      "epoch": 0.36728961866695475,
      "grad_norm": 0.023315249645455345,
      "learning_rate": 1.5966205250560393e-05,
      "loss": 0.1119,
      "step": 1275
    },
    {
      "epoch": 0.36872997011270753,
      "grad_norm": 0.021641266926506458,
      "learning_rate": 1.592576488875989e-05,
      "loss": 0.1094,
      "step": 1280
    },
    {
      "epoch": 0.37017032155846025,
      "grad_norm": 0.022047478963055464,
      "learning_rate": 1.5885174613962427e-05,
      "loss": 0.1083,
      "step": 1285
    },
    {
      "epoch": 0.371610673004213,
      "grad_norm": 0.02400330121786872,
      "learning_rate": 1.5844435453041294e-05,
      "loss": 0.1139,
      "step": 1290
    },
    {
      "epoch": 0.3730510244499658,
      "grad_norm": 0.022595131555882185,
      "learning_rate": 1.5803548436636394e-05,
      "loss": 0.108,
      "step": 1295
    },
    {
      "epoch": 0.37449137589571857,
      "grad_norm": 0.02791419433017582,
      "learning_rate": 1.576251459912814e-05,
      "loss": 0.1134,
      "step": 1300
    },
    {
      "epoch": 0.37593172734147134,
      "grad_norm": 0.026002680251400723,
      "learning_rate": 1.5721334978611307e-05,
      "loss": 0.1107,
      "step": 1305
    },
    {
      "epoch": 0.37737207878722406,
      "grad_norm": 0.022484735806660563,
      "learning_rate": 1.5680010616868762e-05,
      "loss": 0.1076,
      "step": 1310
    },
    {
      "epoch": 0.37881243023297684,
      "grad_norm": 0.021483070318961833,
      "learning_rate": 1.5638542559345106e-05,
      "loss": 0.1034,
      "step": 1315
    },
    {
      "epoch": 0.3802527816787296,
      "grad_norm": 0.023871643967031505,
      "learning_rate": 1.559693185512023e-05,
      "loss": 0.1106,
      "step": 1320
    },
    {
      "epoch": 0.3816931331244824,
      "grad_norm": 0.021906873584555547,
      "learning_rate": 1.555517955688277e-05,
      "loss": 0.1077,
      "step": 1325
    },
    {
      "epoch": 0.38313348457023516,
      "grad_norm": 0.023737665896899265,
      "learning_rate": 1.5513286720903488e-05,
      "loss": 0.1153,
      "step": 1330
    },
    {
      "epoch": 0.3845738360159879,
      "grad_norm": 0.022433920258610178,
      "learning_rate": 1.5471254407008526e-05,
      "loss": 0.1037,
      "step": 1335
    },
    {
      "epoch": 0.38601418746174065,
      "grad_norm": 0.02261886005036298,
      "learning_rate": 1.542908367855263e-05,
      "loss": 0.1088,
      "step": 1340
    },
    {
      "epoch": 0.38745453890749343,
      "grad_norm": 0.021421057577652544,
      "learning_rate": 1.53867756023922e-05,
      "loss": 0.1047,
      "step": 1345
    },
    {
      "epoch": 0.3888948903532462,
      "grad_norm": 0.021709509569311863,
      "learning_rate": 1.534433124885836e-05,
      "loss": 0.1033,
      "step": 1350
    },
    {
      "epoch": 0.390335241798999,
      "grad_norm": 0.024820443979506786,
      "learning_rate": 1.530175169172982e-05,
      "loss": 0.1097,
      "step": 1355
    },
    {
      "epoch": 0.3917755932447517,
      "grad_norm": 0.025227643673449818,
      "learning_rate": 1.525903800820575e-05,
      "loss": 0.1065,
      "step": 1360
    },
    {
      "epoch": 0.39321594469050447,
      "grad_norm": 0.022487840207955263,
      "learning_rate": 1.5216191278878522e-05,
      "loss": 0.1064,
      "step": 1365
    },
    {
      "epoch": 0.39465629613625725,
      "grad_norm": 0.025854963249344314,
      "learning_rate": 1.517321258770636e-05,
      "loss": 0.1043,
      "step": 1370
    },
    {
      "epoch": 0.39609664758201,
      "grad_norm": 0.02260216955018553,
      "learning_rate": 1.5130103021985929e-05,
      "loss": 0.1126,
      "step": 1375
    },
    {
      "epoch": 0.3975369990277628,
      "grad_norm": 0.02491639306364899,
      "learning_rate": 1.5086863672324826e-05,
      "loss": 0.1039,
      "step": 1380
    },
    {
      "epoch": 0.3989773504735155,
      "grad_norm": 0.022899995574611955,
      "learning_rate": 1.5043495632613982e-05,
      "loss": 0.1066,
      "step": 1385
    },
    {
      "epoch": 0.4004177019192683,
      "grad_norm": 0.024707905868095445,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 0.1106,
      "step": 1390
    },
    {
      "epoch": 0.40185805336502106,
      "grad_norm": 0.0241361491731915,
      "learning_rate": 1.4956377874857395e-05,
      "loss": 0.1001,
      "step": 1395
    },
    {
      "epoch": 0.40329840481077384,
      "grad_norm": 0.025677892301256368,
      "learning_rate": 1.4912630360760743e-05,
      "loss": 0.1092,
      "step": 1400
    },
    {
      "epoch": 0.4047387562565266,
      "grad_norm": 0.024135758692475445,
      "learning_rate": 1.4868758564456785e-05,
      "loss": 0.1116,
      "step": 1405
    },
    {
      "epoch": 0.40617910770227933,
      "grad_norm": 0.024527918000747655,
      "learning_rate": 1.4824763595836404e-05,
      "loss": 0.1065,
      "step": 1410
    },
    {
      "epoch": 0.4076194591480321,
      "grad_norm": 0.023548606886532042,
      "learning_rate": 1.4780646567906571e-05,
      "loss": 0.1009,
      "step": 1415
    },
    {
      "epoch": 0.4090598105937849,
      "grad_norm": 0.026633678739013535,
      "learning_rate": 1.473640859676217e-05,
      "loss": 0.1059,
      "step": 1420
    },
    {
      "epoch": 0.41050016203953765,
      "grad_norm": 0.02593995796499118,
      "learning_rate": 1.4692050801557769e-05,
      "loss": 0.107,
      "step": 1425
    },
    {
      "epoch": 0.4119405134852904,
      "grad_norm": 0.02413535724648029,
      "learning_rate": 1.4647574304479295e-05,
      "loss": 0.1115,
      "step": 1430
    },
    {
      "epoch": 0.4133808649310432,
      "grad_norm": 0.02297878729642286,
      "learning_rate": 1.4602980230715674e-05,
      "loss": 0.1083,
      "step": 1435
    },
    {
      "epoch": 0.4148212163767959,
      "grad_norm": 0.02697991820571532,
      "learning_rate": 1.4558269708430333e-05,
      "loss": 0.1051,
      "step": 1440
    },
    {
      "epoch": 0.4162615678225487,
      "grad_norm": 0.025004735182102,
      "learning_rate": 1.4513443868732674e-05,
      "loss": 0.1103,
      "step": 1445
    },
    {
      "epoch": 0.41770191926830147,
      "grad_norm": 0.024578324660308907,
      "learning_rate": 1.4468503845649462e-05,
      "loss": 0.108,
      "step": 1450
    },
    {
      "epoch": 0.41914227071405424,
      "grad_norm": 0.024290400744172484,
      "learning_rate": 1.4423450776096122e-05,
      "loss": 0.102,
      "step": 1455
    },
    {
      "epoch": 0.420582622159807,
      "grad_norm": 0.025652851866900635,
      "learning_rate": 1.4378285799848004e-05,
      "loss": 0.1139,
      "step": 1460
    },
    {
      "epoch": 0.42202297360555974,
      "grad_norm": 0.022542605080051747,
      "learning_rate": 1.4333010059511505e-05,
      "loss": 0.1048,
      "step": 1465
    },
    {
      "epoch": 0.4234633250513125,
      "grad_norm": 0.022564970591101585,
      "learning_rate": 1.4287624700495211e-05,
      "loss": 0.1053,
      "step": 1470
    },
    {
      "epoch": 0.4249036764970653,
      "grad_norm": 0.024332336625839772,
      "learning_rate": 1.4242130870980882e-05,
      "loss": 0.1035,
      "step": 1475
    },
    {
      "epoch": 0.42634402794281806,
      "grad_norm": 0.02565641763751157,
      "learning_rate": 1.4196529721894427e-05,
      "loss": 0.1054,
      "step": 1480
    },
    {
      "epoch": 0.42778437938857083,
      "grad_norm": 0.022329221067199423,
      "learning_rate": 1.4150822406876774e-05,
      "loss": 0.1064,
      "step": 1485
    },
    {
      "epoch": 0.42922473083432355,
      "grad_norm": 0.02525079350860865,
      "learning_rate": 1.4105010082254697e-05,
      "loss": 0.1089,
      "step": 1490
    },
    {
      "epoch": 0.43066508228007633,
      "grad_norm": 0.025470140748940266,
      "learning_rate": 1.4059093907011552e-05,
      "loss": 0.103,
      "step": 1495
    },
    {
      "epoch": 0.4321054337258291,
      "grad_norm": 0.03889636426311705,
      "learning_rate": 1.401307504275796e-05,
      "loss": 0.106,
      "step": 1500
    },
    {
      "epoch": 0.4321054337258291,
      "eval_loss": 0.10669519007205963,
      "eval_runtime": 863.9882,
      "eval_samples_per_second": 2.088,
      "eval_steps_per_second": 0.522,
      "step": 1500
    },
|
    {
      "epoch": 0.4335457851715819,
      "grad_norm": 0.03107255388021069,
      "learning_rate": 1.3966954653702423e-05,
      "loss": 0.1086,
      "step": 1505
    },
    {
      "epoch": 0.43498613661733465,
      "grad_norm": 0.029782191799490893,
      "learning_rate": 1.3920733906621861e-05,
      "loss": 0.1056,
      "step": 1510
    },
    {
      "epoch": 0.43642648806308737,
      "grad_norm": 0.03596510921750815,
      "learning_rate": 1.3874413970832123e-05,
      "loss": 0.1057,
      "step": 1515
    },
    {
      "epoch": 0.43786683950884014,
      "grad_norm": 0.02690673392192122,
      "learning_rate": 1.3827996018158356e-05,
      "loss": 0.1009,
      "step": 1520
    },
    {
      "epoch": 0.4393071909545929,
      "grad_norm": 0.02415574342794587,
      "learning_rate": 1.378148122290541e-05,
      "loss": 0.1092,
      "step": 1525
    },
    {
      "epoch": 0.4407475424003457,
      "grad_norm": 0.025267363786179952,
      "learning_rate": 1.3734870761828095e-05,
      "loss": 0.1046,
      "step": 1530
    },
    {
      "epoch": 0.44218789384609847,
      "grad_norm": 0.02251899862464666,
      "learning_rate": 1.368816581410143e-05,
      "loss": 0.1026,
      "step": 1535
    },
    {
      "epoch": 0.4436282452918512,
      "grad_norm": 0.024093194788389363,
      "learning_rate": 1.3641367561290795e-05,
      "loss": 0.1069,
      "step": 1540
    },
    {
      "epoch": 0.44506859673760396,
      "grad_norm": 0.02865301350630449,
      "learning_rate": 1.3594477187322065e-05,
      "loss": 0.101,
      "step": 1545
    },
    {
      "epoch": 0.44650894818335674,
      "grad_norm": 0.025991749936729126,
      "learning_rate": 1.3547495878451635e-05,
      "loss": 0.104,
      "step": 1550
    },
    {
      "epoch": 0.4479492996291095,
      "grad_norm": 0.02427741302197225,
      "learning_rate": 1.3500424823236413e-05,
      "loss": 0.1097,
      "step": 1555
    },
    {
      "epoch": 0.4493896510748623,
      "grad_norm": 0.025286951386541058,
      "learning_rate": 1.3453265212503756e-05,
      "loss": 0.1025,
      "step": 1560
    },
    {
      "epoch": 0.450830002520615,
      "grad_norm": 0.02293738313311065,
      "learning_rate": 1.340601823932135e-05,
      "loss": 0.1026,
      "step": 1565
    },
    {
      "epoch": 0.4522703539663678,
      "grad_norm": 0.029909633188798766,
      "learning_rate": 1.335868509896702e-05,
      "loss": 0.1093,
      "step": 1570
    },
    {
      "epoch": 0.45371070541212055,
      "grad_norm": 0.02489864832302057,
      "learning_rate": 1.3311266988898477e-05,
      "loss": 0.1075,
      "step": 1575
    },
    {
      "epoch": 0.4551510568578733,
      "grad_norm": 0.02626703651775361,
      "learning_rate": 1.3263765108723061e-05,
      "loss": 0.1052,
      "step": 1580
    },
    {
      "epoch": 0.4565914083036261,
      "grad_norm": 0.028232314920455838,
      "learning_rate": 1.3216180660167355e-05,
      "loss": 0.103,
      "step": 1585
    },
    {
      "epoch": 0.4580317597493789,
      "grad_norm": 0.02893621210041688,
      "learning_rate": 1.3168514847046802e-05,
      "loss": 0.102,
      "step": 1590
    },
    {
      "epoch": 0.4594721111951316,
      "grad_norm": 0.027792315343772488,
      "learning_rate": 1.3120768875235252e-05,
      "loss": 0.1006,
      "step": 1595
    },
    {
      "epoch": 0.46091246264088437,
      "grad_norm": 0.022941453299660446,
      "learning_rate": 1.3072943952634446e-05,
      "loss": 0.101,
      "step": 1600
    },
    {
      "epoch": 0.46235281408663714,
      "grad_norm": 0.02754317441994994,
      "learning_rate": 1.3025041289143459e-05,
      "loss": 0.1014,
      "step": 1605
    },
    {
      "epoch": 0.4637931655323899,
      "grad_norm": 0.023409812236038566,
      "learning_rate": 1.2977062096628096e-05,
      "loss": 0.1018,
      "step": 1610
    },
    {
      "epoch": 0.4652335169781427,
      "grad_norm": 0.02736910879882447,
      "learning_rate": 1.2929007588890241e-05,
      "loss": 0.1029,
      "step": 1615
    },
    {
      "epoch": 0.4666738684238954,
      "grad_norm": 0.024599830487748515,
      "learning_rate": 1.2880878981637129e-05,
      "loss": 0.1076,
      "step": 1620
    },
    {
      "epoch": 0.4681142198696482,
      "grad_norm": 0.030038012520127225,
      "learning_rate": 1.2832677492450602e-05,
      "loss": 0.1058,
      "step": 1625
    },
    {
      "epoch": 0.46955457131540096,
      "grad_norm": 0.027368913953633472,
      "learning_rate": 1.2784404340756315e-05,
      "loss": 0.1082,
      "step": 1630
    },
    {
      "epoch": 0.47099492276115373,
      "grad_norm": 0.02334017117279336,
      "learning_rate": 1.2736060747792877e-05,
      "loss": 0.106,
      "step": 1635
    },
    {
      "epoch": 0.4724352742069065,
      "grad_norm": 0.02870373822056335,
      "learning_rate": 1.268764793658094e-05,
      "loss": 0.1099,
      "step": 1640
    },
    {
      "epoch": 0.4738756256526592,
      "grad_norm": 0.029235939335636004,
      "learning_rate": 1.2639167131892294e-05,
      "loss": 0.0973,
      "step": 1645
    },
    {
      "epoch": 0.475315977098412,
      "grad_norm": 0.026146387485585586,
      "learning_rate": 1.2590619560218851e-05,
      "loss": 0.1087,
      "step": 1650
    },
    {
      "epoch": 0.4767563285441648,
      "grad_norm": 0.02842049987719084,
      "learning_rate": 1.2542006449741631e-05,
      "loss": 0.1061,
      "step": 1655
    },
    {
      "epoch": 0.47819667998991755,
      "grad_norm": 0.024518713018732985,
      "learning_rate": 1.249332903029969e-05,
      "loss": 0.1051,
      "step": 1660
    },
    {
      "epoch": 0.4796370314356703,
      "grad_norm": 0.02484058398034788,
      "learning_rate": 1.2444588533358996e-05,
      "loss": 0.1022,
      "step": 1665
    },
    {
      "epoch": 0.48107738288142304,
      "grad_norm": 0.03724000521562738,
      "learning_rate": 1.23957861919813e-05,
      "loss": 0.1056,
      "step": 1670
    },
    {
      "epoch": 0.4825177343271758,
      "grad_norm": 0.03293802155567722,
      "learning_rate": 1.2346923240792907e-05,
      "loss": 0.1092,
      "step": 1675
    },
    {
      "epoch": 0.4839580857729286,
      "grad_norm": 0.026650789981673747,
      "learning_rate": 1.229800091595347e-05,
      "loss": 0.1008,
      "step": 1680
    },
    {
      "epoch": 0.48539843721868137,
      "grad_norm": 0.02758805567603314,
      "learning_rate": 1.2249020455124703e-05,
      "loss": 0.1024,
      "step": 1685
    },
    {
      "epoch": 0.48683878866443414,
      "grad_norm": 0.0289561047779867,
      "learning_rate": 1.2199983097439079e-05,
      "loss": 0.0974,
      "step": 1690
    },
    {
      "epoch": 0.48827914011018686,
      "grad_norm": 0.02472150617570116,
      "learning_rate": 1.2150890083468465e-05,
      "loss": 0.1017,
      "step": 1695
    },
    {
      "epoch": 0.48971949155593963,
      "grad_norm": 0.026933463536791147,
      "learning_rate": 1.2101742655192761e-05,
      "loss": 0.107,
      "step": 1700
    },
    {
      "epoch": 0.4911598430016924,
      "grad_norm": 0.02704841529205261,
      "learning_rate": 1.2052542055968461e-05,
      "loss": 0.1049,
      "step": 1705
    },
    {
      "epoch": 0.4926001944474452,
      "grad_norm": 0.029435211264724145,
      "learning_rate": 1.2003289530497206e-05,
      "loss": 0.1097,
      "step": 1710
    },
    {
      "epoch": 0.49404054589319796,
      "grad_norm": 0.023704099909174187,
      "learning_rate": 1.1953986324794295e-05,
      "loss": 0.1027,
      "step": 1715
    },
    {
      "epoch": 0.4954808973389507,
      "grad_norm": 0.024368103569431525,
      "learning_rate": 1.1904633686157158e-05,
      "loss": 0.1078,
      "step": 1720
    },
    {
      "epoch": 0.49692124878470345,
      "grad_norm": 0.024017154477919515,
      "learning_rate": 1.1855232863133809e-05,
      "loss": 0.1009,
      "step": 1725
    },
    {
      "epoch": 0.4983616002304562,
      "grad_norm": 0.03151983841636764,
      "learning_rate": 1.1805785105491247e-05,
      "loss": 0.1021,
      "step": 1730
    },
    {
      "epoch": 0.499801951676209,
      "grad_norm": 0.02991694116471432,
      "learning_rate": 1.1756291664183858e-05,
      "loss": 0.1076,
      "step": 1735
    },
    {
      "epoch": 0.5012423031219617,
      "grad_norm": 0.03657521191046876,
      "learning_rate": 1.1706753791321748e-05,
      "loss": 0.1042,
      "step": 1740
    },
    {
      "epoch": 0.5026826545677145,
      "grad_norm": 0.0279482378393294,
      "learning_rate": 1.1657172740139074e-05,
      "loss": 0.1017,
      "step": 1745
    },
    {
      "epoch": 0.5041230060134673,
      "grad_norm": 0.030069339039746814,
      "learning_rate": 1.1607549764962342e-05,
      "loss": 0.1042,
      "step": 1750
    },
    {
      "epoch": 0.5055633574592201,
      "grad_norm": 0.026487495544772206,
      "learning_rate": 1.1557886121178683e-05,
      "loss": 0.1079,
      "step": 1755
    },
    {
      "epoch": 0.5070037089049728,
      "grad_norm": 0.024984858540099013,
      "learning_rate": 1.1508183065204066e-05,
      "loss": 0.1037,
      "step": 1760
    },
    {
      "epoch": 0.5084440603507255,
      "grad_norm": 0.026914213670237396,
      "learning_rate": 1.1458441854451539e-05,
      "loss": 0.1001,
      "step": 1765
    },
    {
      "epoch": 0.5098844117964784,
      "grad_norm": 0.029179893358272806,
      "learning_rate": 1.1408663747299409e-05,
      "loss": 0.1002,
      "step": 1770
    },
    {
      "epoch": 0.5113247632422311,
      "grad_norm": 0.039586221088331666,
      "learning_rate": 1.13588500030594e-05,
      "loss": 0.105,
      "step": 1775
    },
    {
      "epoch": 0.5127651146879839,
      "grad_norm": 0.03249368292701708,
      "learning_rate": 1.130900188194481e-05,
      "loss": 0.1031,
      "step": 1780
    },
    {
      "epoch": 0.5142054661337366,
      "grad_norm": 0.024055776588308493,
      "learning_rate": 1.1259120645038612e-05,
      "loss": 0.1013,
      "step": 1785
    },
    {
      "epoch": 0.5156458175794894,
      "grad_norm": 0.025863370886123525,
      "learning_rate": 1.1209207554261573e-05,
      "loss": 0.1,
      "step": 1790
    },
    {
      "epoch": 0.5170861690252422,
      "grad_norm": 0.025434390514411754,
      "learning_rate": 1.1159263872340293e-05,
      "loss": 0.1007,
      "step": 1795
    },
    {
      "epoch": 0.5185265204709949,
|
"grad_norm": 0.02745842348721705, |
|
"learning_rate": 1.1109290862775307e-05, |
|
"loss": 0.096, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5199668719167477, |
|
"grad_norm": 0.027615229169377083, |
|
"learning_rate": 1.1059289789809071e-05, |
|
"loss": 0.0998, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.5214072233625004, |
|
"grad_norm": 0.027453704008570706, |
|
"learning_rate": 1.1009261918394028e-05, |
|
"loss": 0.0972, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.5228475748082532, |
|
"grad_norm": 0.024700516688583, |
|
"learning_rate": 1.0959208514160561e-05, |
|
"loss": 0.1055, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.524287926254006, |
|
"grad_norm": 0.032125782290431566, |
|
"learning_rate": 1.0909130843385009e-05, |
|
"loss": 0.0979, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.5257282776997587, |
|
"grad_norm": 0.027487991421855863, |
|
"learning_rate": 1.085903017295761e-05, |
|
"loss": 0.0992, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.5271686291455115, |
|
"grad_norm": 0.0284479042668781, |
|
"learning_rate": 1.0808907770350463e-05, |
|
"loss": 0.098, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.5286089805912643, |
|
"grad_norm": 0.02721705482015998, |
|
"learning_rate": 1.0758764903585457e-05, |
|
"loss": 0.1025, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.530049332037017, |
|
"grad_norm": 0.028561724576000452, |
|
"learning_rate": 1.070860284120219e-05, |
|
"loss": 0.1002, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.5314896834827698, |
|
"grad_norm": 0.030215649153775434, |
|
"learning_rate": 1.0658422852225889e-05, |
|
"loss": 0.113, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.5329300349285225, |
|
"grad_norm": 0.023003505373751384, |
|
"learning_rate": 1.0608226206135292e-05, |
|
"loss": 0.1004, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.5343703863742754, |
|
"grad_norm": 0.032778870546567584, |
|
"learning_rate": 1.0558014172830537e-05, |
|
"loss": 0.1026, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.5358107378200281, |
|
"grad_norm": 0.024228471157562194, |
|
"learning_rate": 1.0507788022601033e-05, |
|
"loss": 0.0959, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.5372510892657808, |
|
"grad_norm": 0.027347316934608, |
|
"learning_rate": 1.0457549026093338e-05, |
|
"loss": 0.0994, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.5386914407115336, |
|
"grad_norm": 0.02492119460153467, |
|
"learning_rate": 1.0407298454278983e-05, |
|
"loss": 0.096, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.5401317921572864, |
|
"grad_norm": 0.02730713294802567, |
|
"learning_rate": 1.0357037578422349e-05, |
|
"loss": 0.1063, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.5415721436030392, |
|
"grad_norm": 0.02753505593775209, |
|
"learning_rate": 1.0306767670048497e-05, |
|
"loss": 0.1004, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.5430124950487919, |
|
"grad_norm": 0.027125750017208197, |
|
"learning_rate": 1.0256490000910986e-05, |
|
"loss": 0.1033, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.5444528464945446, |
|
"grad_norm": 0.02526923620994891, |
|
"learning_rate": 1.0206205842959718e-05, |
|
"loss": 0.1044, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.5458931979402974, |
|
"grad_norm": 0.03456611614472051, |
|
"learning_rate": 1.0155916468308749e-05, |
|
"loss": 0.1018, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.5473335493860502, |
|
"grad_norm": 0.02687752666688773, |
|
"learning_rate": 1.0105623149204118e-05, |
|
"loss": 0.1071, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.548773900831803, |
|
"grad_norm": 0.0348842935948145, |
|
"learning_rate": 1.0055327157991639e-05, |
|
"loss": 0.1018, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.5502142522775557, |
|
"grad_norm": 0.03047489251408279, |
|
"learning_rate": 1.0005029767084739e-05, |
|
"loss": 0.1029, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.5516546037233084, |
|
"grad_norm": 0.03295680008205151, |
|
"learning_rate": 9.954732248932243e-06, |
|
"loss": 0.1042, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.5530949551690613, |
|
"grad_norm": 0.026523017503203378, |
|
"learning_rate": 9.904435875986213e-06, |
|
"loss": 0.0989, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.554535306614814, |
|
"grad_norm": 0.03199960655757483, |
|
"learning_rate": 9.85414192066972e-06, |
|
"loss": 0.1034, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.5559756580605668, |
|
"grad_norm": 0.03321351438487483, |
|
"learning_rate": 9.803851655344682e-06, |
|
"loss": 0.0954, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.5574160095063195, |
|
"grad_norm": 0.026379907025225486, |
|
"learning_rate": 9.75356635227966e-06, |
|
"loss": 0.1007, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.5588563609520723, |
|
"grad_norm": 0.03215809081400103, |
|
"learning_rate": 9.70328728361769e-06, |
|
"loss": 0.1041, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.5602967123978251, |
|
"grad_norm": 0.028935560901012115, |
|
"learning_rate": 9.653015721344073e-06, |
|
"loss": 0.104, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.5617370638435778, |
|
"grad_norm": 0.026479371672269946, |
|
"learning_rate": 9.602752937254215e-06, |
|
"loss": 0.0987, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.5631774152893306, |
|
"grad_norm": 0.02735022790208615, |
|
"learning_rate": 9.552500202921449e-06, |
|
"loss": 0.1052, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.5646177667350833, |
|
"grad_norm": 0.029074239132167123, |
|
"learning_rate": 9.502258789664865e-06, |
|
"loss": 0.1069, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.5660581181808362, |
|
"grad_norm": 0.03497134937258824, |
|
"learning_rate": 9.45202996851714e-06, |
|
"loss": 0.1068, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.5674984696265889, |
|
"grad_norm": 0.03121000622963146, |
|
"learning_rate": 9.401815010192388e-06, |
|
"loss": 0.1058, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.5689388210723416, |
|
"grad_norm": 0.029762366581440217, |
|
"learning_rate": 9.351615185054029e-06, |
|
"loss": 0.1027, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.5703791725180944, |
|
"grad_norm": 0.031933869149627876, |
|
"learning_rate": 9.301431763082623e-06, |
|
"loss": 0.1001, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.5718195239638472, |
|
"grad_norm": 0.024595673857129345, |
|
"learning_rate": 9.251266013843757e-06, |
|
"loss": 0.0993, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.5732598754096, |
|
"grad_norm": 0.02931011040644607, |
|
"learning_rate": 9.201119206455927e-06, |
|
"loss": 0.107, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.5747002268553527, |
|
"grad_norm": 0.02841549880839795, |
|
"learning_rate": 9.150992609558425e-06, |
|
"loss": 0.1013, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.5761405783011054, |
|
"grad_norm": 0.029699203984976344, |
|
"learning_rate": 9.10088749127926e-06, |
|
"loss": 0.1003, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5761405783011054, |
|
"eval_loss": 0.10270440578460693, |
|
"eval_runtime": 862.9139, |
|
"eval_samples_per_second": 2.091, |
|
"eval_steps_per_second": 0.523, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5775809297468583, |
|
"grad_norm": 0.02555642653773717, |
|
"learning_rate": 9.050805119203035e-06, |
|
"loss": 0.0964, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.579021281192611, |
|
"grad_norm": 0.03206482511392409, |
|
"learning_rate": 9.000746760338938e-06, |
|
"loss": 0.1061, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.5804616326383638, |
|
"grad_norm": 0.029335358877190958, |
|
"learning_rate": 8.950713681088647e-06, |
|
"loss": 0.0989, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.5819019840841165, |
|
"grad_norm": 0.029061894519663043, |
|
"learning_rate": 8.900707147214301e-06, |
|
"loss": 0.1041, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.5833423355298692, |
|
"grad_norm": 0.027789281243274542, |
|
"learning_rate": 8.850728423806487e-06, |
|
"loss": 0.1002, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.5847826869756221, |
|
"grad_norm": 0.028006016395305748, |
|
"learning_rate": 8.800778775252221e-06, |
|
"loss": 0.0971, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.5862230384213748, |
|
"grad_norm": 0.029135758984445807, |
|
"learning_rate": 8.75085946520298e-06, |
|
"loss": 0.1089, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.5876633898671276, |
|
"grad_norm": 0.028264974680774237, |
|
"learning_rate": 8.700971756542707e-06, |
|
"loss": 0.0999, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.5891037413128803, |
|
"grad_norm": 0.02787134319938327, |
|
"learning_rate": 8.65111691135589e-06, |
|
"loss": 0.0996, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.5905440927586331, |
|
"grad_norm": 0.02930265638543135, |
|
"learning_rate": 8.601296190895611e-06, |
|
"loss": 0.0998, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.5919844442043859, |
|
"grad_norm": 0.028136073425125765, |
|
"learning_rate": 8.551510855551658e-06, |
|
"loss": 0.1022, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.5934247956501386, |
|
"grad_norm": 0.030075075847505933, |
|
"learning_rate": 8.501762164818615e-06, |
|
"loss": 0.1011, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.5948651470958914, |
|
"grad_norm": 0.02509232015436124, |
|
"learning_rate": 8.452051377264025e-06, |
|
"loss": 0.1089, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.5963054985416442, |
|
"grad_norm": 0.029019522531841563, |
|
"learning_rate": 8.402379750496535e-06, |
|
"loss": 0.1043, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.5977458499873969, |
|
"grad_norm": 0.02946442999816807, |
|
"learning_rate": 8.35274854113407e-06, |
|
"loss": 0.1033, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.5991862014331497, |
|
"grad_norm": 0.028470170352907883, |
|
"learning_rate": 8.303159004772065e-06, |
|
"loss": 0.1015, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.6006265528789024, |
|
"grad_norm": 0.027074007980222393, |
|
"learning_rate": 8.253612395951697e-06, |
|
"loss": 0.1077, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.6020669043246553, |
|
"grad_norm": 0.02594322523858361, |
|
"learning_rate": 8.204109968128128e-06, |
|
"loss": 0.1052, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.603507255770408, |
|
"grad_norm": 0.027981527626155637, |
|
"learning_rate": 8.15465297363881e-06, |
|
"loss": 0.1076, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.6049476072161607, |
|
"grad_norm": 0.027128594062002694, |
|
"learning_rate": 8.105242663671807e-06, |
|
"loss": 0.1019, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.6063879586619135, |
|
"grad_norm": 0.02399321575005835, |
|
"learning_rate": 8.055880288234127e-06, |
|
"loss": 0.1014, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.6078283101076662, |
|
"grad_norm": 0.030384147510662384, |
|
"learning_rate": 8.006567096120103e-06, |
|
"loss": 0.102, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.6092686615534191, |
|
"grad_norm": 0.0275729769794035, |
|
"learning_rate": 7.957304334879809e-06, |
|
"loss": 0.1066, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.6107090129991718, |
|
"grad_norm": 0.02827555110704183, |
|
"learning_rate": 7.908093250787496e-06, |
|
"loss": 0.1026, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.6121493644449245, |
|
"grad_norm": 0.0280447976215247, |
|
"learning_rate": 7.858935088810054e-06, |
|
"loss": 0.1041, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.6135897158906773, |
|
"grad_norm": 0.026871658750667743, |
|
"learning_rate": 7.809831092575528e-06, |
|
"loss": 0.1033, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.6150300673364301, |
|
"grad_norm": 0.029232871601405933, |
|
"learning_rate": 7.760782504341644e-06, |
|
"loss": 0.1047, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.6164704187821829, |
|
"grad_norm": 0.023805191017251096, |
|
"learning_rate": 7.7117905649644e-06, |
|
"loss": 0.0983, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.6179107702279356, |
|
"grad_norm": 0.03103222984517563, |
|
"learning_rate": 7.662856513866643e-06, |
|
"loss": 0.0968, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.6193511216736883, |
|
"grad_norm": 0.028105627160877313, |
|
"learning_rate": 7.613981589006754e-06, |
|
"loss": 0.099, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.6207914731194412, |
|
"grad_norm": 0.030597646841460515, |
|
"learning_rate": 7.565167026847294e-06, |
|
"loss": 0.0979, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.6222318245651939, |
|
"grad_norm": 0.02970594057212837, |
|
"learning_rate": 7.5164140623237454e-06, |
|
"loss": 0.0975, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.6236721760109467, |
|
"grad_norm": 0.027159849019462532, |
|
"learning_rate": 7.467723928813262e-06, |
|
"loss": 0.1036, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.6251125274566994, |
|
"grad_norm": 0.03257508113397351, |
|
"learning_rate": 7.419097858103464e-06, |
|
"loss": 0.0997, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.6265528789024521, |
|
"grad_norm": 0.029509877961538443, |
|
"learning_rate": 7.370537080361282e-06, |
|
"loss": 0.0977, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.627993230348205, |
|
"grad_norm": 0.02914606675872259, |
|
"learning_rate": 7.3220428241018225e-06, |
|
"loss": 0.0961, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.6294335817939577, |
|
"grad_norm": 0.028245193180560037, |
|
"learning_rate": 7.273616316157312e-06, |
|
"loss": 0.1009, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.6308739332397105, |
|
"grad_norm": 0.031258732018464554, |
|
"learning_rate": 7.225258781646036e-06, |
|
"loss": 0.0959, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.6323142846854632, |
|
"grad_norm": 0.029187877030262547, |
|
"learning_rate": 7.176971443941359e-06, |
|
"loss": 0.0984, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.633754636131216, |
|
"grad_norm": 0.027481848533676127, |
|
"learning_rate": 7.128755524640771e-06, |
|
"loss": 0.1023, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.6351949875769688, |
|
"grad_norm": 0.028467126169710644, |
|
"learning_rate": 7.080612243534981e-06, |
|
"loss": 0.104, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.6366353390227215, |
|
"grad_norm": 0.02584742929400372, |
|
"learning_rate": 7.032542818577066e-06, |
|
"loss": 0.0968, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.6380756904684743, |
|
"grad_norm": 0.024188950488983806, |
|
"learning_rate": 6.984548465851641e-06, |
|
"loss": 0.0965, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.6395160419142271, |
|
"grad_norm": 0.028851010852631174, |
|
"learning_rate": 6.93663039954412e-06, |
|
"loss": 0.1007, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.6409563933599798, |
|
"grad_norm": 0.025328516360271506, |
|
"learning_rate": 6.888789831909972e-06, |
|
"loss": 0.101, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.6423967448057326, |
|
"grad_norm": 0.03102519024419765, |
|
"learning_rate": 6.841027973244077e-06, |
|
"loss": 0.098, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.6438370962514853, |
|
"grad_norm": 0.037884633433375385, |
|
"learning_rate": 6.793346031850085e-06, |
|
"loss": 0.0987, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.6452774476972382, |
|
"grad_norm": 0.03111478820563393, |
|
"learning_rate": 6.745745214009866e-06, |
|
"loss": 0.0984, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.6467177991429909, |
|
"grad_norm": 0.02761748015741369, |
|
"learning_rate": 6.698226723952985e-06, |
|
"loss": 0.0996, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.6481581505887437, |
|
"grad_norm": 0.02546847357291813, |
|
"learning_rate": 6.65079176382623e-06, |
|
"loss": 0.1027, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.6495985020344964, |
|
"grad_norm": 0.028225946201781264, |
|
"learning_rate": 6.603441533663214e-06, |
|
"loss": 0.1052, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.6510388534802491, |
|
"grad_norm": 0.028818402757150797, |
|
"learning_rate": 6.556177231354003e-06, |
|
"loss": 0.103, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.652479204926002, |
|
"grad_norm": 0.027109318220465493, |
|
"learning_rate": 6.509000052614824e-06, |
|
"loss": 0.1043, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.6539195563717547, |
|
"grad_norm": 0.02806157217123479, |
|
"learning_rate": 6.4619111909577995e-06, |
|
"loss": 0.0962, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.6553599078175075, |
|
"grad_norm": 0.029350220106511105, |
|
"learning_rate": 6.414911837660768e-06, |
|
"loss": 0.0997, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.6568002592632602, |
|
"grad_norm": 0.02863157441645804, |
|
"learning_rate": 6.36800318173714e-06, |
|
"loss": 0.1004, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.658240610709013, |
|
"grad_norm": 0.030549045202254512, |
|
"learning_rate": 6.321186409905812e-06, |
|
"loss": 0.0995, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.6596809621547658, |
|
"grad_norm": 0.03217236024450049, |
|
"learning_rate": 6.274462706561153e-06, |
|
"loss": 0.1008, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.6611213136005185, |
|
"grad_norm": 0.02822548256388095, |
|
"learning_rate": 6.227833253743045e-06, |
|
"loss": 0.0962, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.6625616650462713, |
|
"grad_norm": 0.02614875322287693, |
|
"learning_rate": 6.181299231106963e-06, |
|
"loss": 0.0987, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.6640020164920241, |
|
"grad_norm": 0.02541722852556565, |
|
"learning_rate": 6.134861815894146e-06, |
|
"loss": 0.0967, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.6654423679377768, |
|
"grad_norm": 0.02850719972847212, |
|
"learning_rate": 6.08852218290181e-06, |
|
"loss": 0.0947, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.6668827193835296, |
|
"grad_norm": 0.02908443110544134, |
|
"learning_rate": 6.0422815044534265e-06, |
|
"loss": 0.1037, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.6683230708292823, |
|
"grad_norm": 0.029315617072749215, |
|
"learning_rate": 5.9961409503690605e-06, |
|
"loss": 0.0963, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.6697634222750352, |
|
"grad_norm": 0.030033042623501118, |
|
"learning_rate": 5.950101687935783e-06, |
|
"loss": 0.1027, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.6712037737207879, |
|
"grad_norm": 0.02915046992535852, |
|
"learning_rate": 5.904164881878143e-06, |
|
"loss": 0.1055, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.6726441251665406, |
|
"grad_norm": 0.025236471457550445, |
|
"learning_rate": 5.858331694328686e-06, |
|
"loss": 0.0982, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.6740844766122934, |
|
"grad_norm": 0.03401543932639446, |
|
"learning_rate": 5.812603284798575e-06, |
|
"loss": 0.0949, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.6755248280580461, |
|
"grad_norm": 0.028026329462367786, |
|
"learning_rate": 5.766980810148238e-06, |
|
"loss": 0.1, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.676965179503799, |
|
"grad_norm": 0.02874201116713264, |
|
"learning_rate": 5.721465424558111e-06, |
|
"loss": 0.0958, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.6784055309495517, |
|
"grad_norm": 0.028963974093940446, |
|
"learning_rate": 5.676058279499438e-06, |
|
"loss": 0.0989, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.6798458823953044, |
|
"grad_norm": 0.029496429086982617, |
|
"learning_rate": 5.630760523705137e-06, |
|
"loss": 0.0969, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.6812862338410572, |
|
"grad_norm": 0.02472939959975296, |
|
"learning_rate": 5.585573303140741e-06, |
|
"loss": 0.1005, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 0.68272658528681, |
|
"grad_norm": 0.02943173144839556, |
|
"learning_rate": 5.540497760975412e-06, |
|
"loss": 0.0991, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.6841669367325628, |
|
"grad_norm": 0.027039554476503823, |
|
"learning_rate": 5.495535037553003e-06, |
|
"loss": 0.0967, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.6856072881783155, |
|
"grad_norm": 0.03130069538494267, |
|
"learning_rate": 5.450686270363244e-06, |
|
"loss": 0.104, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.6870476396240682, |
|
"grad_norm": 0.02792170269632637, |
|
"learning_rate": 5.405952594012921e-06, |
|
"loss": 0.0971, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.6884879910698211, |
|
"grad_norm": 0.026536938565135147, |
|
"learning_rate": 5.361335140197199e-06, |
|
"loss": 0.0998, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.6899283425155738, |
|
"grad_norm": 0.030495085484657028, |
|
"learning_rate": 5.316835037670987e-06, |
|
"loss": 0.0989, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 0.6913686939613266, |
|
"grad_norm": 0.025636540838806603, |
|
"learning_rate": 5.272453412220389e-06, |
|
"loss": 0.1005, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.6928090454070793, |
|
"grad_norm": 0.029276688852897748, |
|
"learning_rate": 5.228191386634212e-06, |
|
"loss": 0.096, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 0.694249396852832, |
|
"grad_norm": 0.028892951411287924, |
|
"learning_rate": 5.184050080675558e-06, |
|
"loss": 0.0982, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.6956897482985849, |
|
"grad_norm": 0.029668168507339023, |
|
"learning_rate": 5.140030611053512e-06, |
|
"loss": 0.0983, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.6971300997443376, |
|
"grad_norm": 0.029331223786389957, |
|
"learning_rate": 5.096134091394879e-06, |
|
"loss": 0.1039, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.6985704511900904, |
|
"grad_norm": 0.028007436697092276, |
|
"learning_rate": 5.052361632216009e-06, |
|
"loss": 0.0952, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.7000108026358431, |
|
"grad_norm": 0.028924268558975137, |
|
"learning_rate": 5.008714340894716e-06, |
|
"loss": 0.1025, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.7014511540815959, |
|
"grad_norm": 0.028546541140812298, |
|
"learning_rate": 4.965193321642245e-06, |
|
"loss": 0.1051, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 0.7028915055273487, |
|
"grad_norm": 0.03228047586525536, |
|
"learning_rate": 4.9217996754753505e-06, |
|
"loss": 0.1042, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.7043318569731014, |
|
"grad_norm": 0.026889053511112666, |
|
"learning_rate": 4.878534500188443e-06, |
|
"loss": 0.0979, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 0.7057722084188542, |
|
"grad_norm": 0.024493931252593228, |
|
"learning_rate": 4.835398890325806e-06, |
|
"loss": 0.1006, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.707212559864607, |
|
"grad_norm": 0.031072455568760707, |
|
"learning_rate": 4.792393937153914e-06, |
|
"loss": 0.1, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 0.7086529113103597, |
|
"grad_norm": 0.03526863560562569, |
|
"learning_rate": 4.749520728633812e-06, |
|
"loss": 0.1004, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.7100932627561125, |
|
"grad_norm": 0.027375718409826295, |
|
"learning_rate": 4.706780349393621e-06, |
|
"loss": 0.099, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 0.7115336142018652, |
|
"grad_norm": 0.029579970429738103, |
|
"learning_rate": 4.664173880701065e-06, |
|
"loss": 0.1008, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.7129739656476181, |
|
"grad_norm": 0.031758604921792485, |
|
"learning_rate": 4.62170240043614e-06, |
|
"loss": 0.0983, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.7144143170933708, |
|
"grad_norm": 0.02656899768489825, |
|
"learning_rate": 4.579366983063829e-06, |
|
"loss": 0.0997, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.7158546685391235, |
|
"grad_norm": 0.02566962597338034, |
|
"learning_rate": 4.537168699606928e-06, |
|
"loss": 0.1015, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 0.7172950199848763, |
|
"grad_norm": 0.028715798458422183, |
|
"learning_rate": 4.4951086176189666e-06, |
|
"loss": 0.107, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.718735371430629, |
|
"grad_norm": 0.02887905888346922, |
|
"learning_rate": 4.453187801157155e-06, |
|
"loss": 0.0984, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 0.7201757228763819, |
|
"grad_norm": 0.028332689752910086, |
|
"learning_rate": 4.411407310755513e-06, |
|
"loss": 0.1007, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.7201757228763819, |
|
"eval_loss": 0.10072407871484756, |
|
"eval_runtime": 862.0549, |
|
"eval_samples_per_second": 2.093, |
|
"eval_steps_per_second": 0.523, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.7216160743221346, |
|
"grad_norm": 0.02833390362499488, |
|
"learning_rate": 4.369768203398014e-06, |
|
"loss": 0.0957, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 0.7230564257678873, |
|
"grad_norm": 0.030945876414709347, |
|
"learning_rate": 4.328271532491859e-06, |
|
"loss": 0.102, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.7244967772136401, |
|
"grad_norm": 0.029283752981204014, |
|
"learning_rate": 4.2869183478408125e-06, |
|
"loss": 0.0985, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 0.7259371286593929, |
|
"grad_norm": 0.027428758917667755, |
|
"learning_rate": 4.2457096956186525e-06, |
|
"loss": 0.1018, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.7273774801051457, |
|
"grad_norm": 0.03315591743445466, |
|
"learning_rate": 4.2046466183427035e-06, |
|
"loss": 0.1075, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.7288178315508984, |
|
"grad_norm": 0.030025762956048904, |
|
"learning_rate": 4.163730154847462e-06, |
|
"loss": 0.0929, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.7302581829966511, |
|
"grad_norm": 0.030103449099341675, |
|
"learning_rate": 4.122961340258312e-06, |
|
"loss": 0.0964, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 0.731698534442404, |
|
"grad_norm": 0.02917754573138957, |
|
"learning_rate": 4.082341205965344e-06, |
|
"loss": 0.0993, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.7331388858881567, |
|
"grad_norm": 0.02789369472409532, |
|
"learning_rate": 4.0418707795972575e-06, |
|
"loss": 0.0995, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 0.7345792373339095, |
|
"grad_norm": 0.029979889727084676, |
|
"learning_rate": 4.001551084995363e-06, |
|
"loss": 0.1031, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.7360195887796622, |
|
"grad_norm": 0.027974195684437115, |
|
"learning_rate": 3.961383142187691e-06, |
|
"loss": 0.1021, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 0.7374599402254151, |
|
"grad_norm": 0.030070715679730322, |
|
"learning_rate": 3.9213679673631745e-06, |
|
"loss": 0.1029, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.7389002916711678, |
|
"grad_norm": 0.027066646976217957, |
|
"learning_rate": 3.881506572845933e-06, |
|
"loss": 0.1056, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 0.7403406431169205, |
|
"grad_norm": 0.0284274448607563, |
|
"learning_rate": 3.841799967069686e-06, |
|
"loss": 0.1075, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.7417809945626733, |
|
"grad_norm": 0.028074887598320608, |
|
"learning_rate": 3.8022491545522346e-06, |
|
"loss": 0.103, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.743221346008426, |
|
"grad_norm": 0.03333775657686949, |
|
"learning_rate": 3.7628551358700303e-06, |
|
"loss": 0.0997, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.7446616974541789, |
|
"grad_norm": 0.027376231042512, |
|
"learning_rate": 3.723618907632882e-06, |
|
"loss": 0.1024, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 0.7461020488999316, |
|
"grad_norm": 0.031319210954457215, |
|
"learning_rate": 3.6845414624587326e-06, |
|
"loss": 0.1032, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.7475424003456843, |
|
"grad_norm": 0.028908867069655585, |
|
"learning_rate": 3.6456237889485547e-06, |
|
"loss": 0.1023, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 0.7489827517914371, |
|
"grad_norm": 0.027194027978343076, |
|
"learning_rate": 3.606866871661333e-06, |
|
"loss": 0.0976, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.7504231032371899, |
|
"grad_norm": 0.028443079367809734, |
|
"learning_rate": 3.5682716910891613e-06, |
|
"loss": 0.0986, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 0.7518634546829427, |
|
"grad_norm": 0.030731013519078837, |
|
"learning_rate": 3.5298392236324365e-06, |
|
"loss": 0.1009, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.7533038061286954, |
|
"grad_norm": 0.0285250306739096, |
|
"learning_rate": 3.491570441575154e-06, |
|
"loss": 0.0989, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 0.7547441575744481, |
|
"grad_norm": 0.02742316326439345, |
|
"learning_rate": 3.453466313060322e-06, |
|
"loss": 0.0997, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.756184509020201, |
|
"grad_norm": 0.03242870927580512, |
|
"learning_rate": 3.4155278020654502e-06, |
|
"loss": 0.101, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.7576248604659537, |
|
"grad_norm": 0.02846968206424331, |
|
"learning_rate": 3.3777558683781795e-06, |
|
"loss": 0.1002, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.7590652119117065, |
|
"grad_norm": 0.025257296374579147, |
|
"learning_rate": 3.3401514675719815e-06, |
|
"loss": 0.0969, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 0.7605055633574592, |
|
"grad_norm": 0.02845603161574245, |
|
"learning_rate": 3.302715550982014e-06, |
|
"loss": 0.0998, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.7619459148032119, |
|
"grad_norm": 0.029606705311034962, |
|
"learning_rate": 3.2654490656810256e-06, |
|
"loss": 0.0965, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 0.7633862662489648, |
|
"grad_norm": 0.029342507148503363, |
|
"learning_rate": 3.228352954455406e-06, |
|
"loss": 0.0966, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.7648266176947175, |
|
"grad_norm": 0.02856949602254836, |
|
"learning_rate": 3.1914281557813386e-06, |
|
"loss": 0.0966, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 0.7662669691404703, |
|
"grad_norm": 0.03497921521636249, |
|
"learning_rate": 3.1546756038010507e-06, |
|
"loss": 0.0981, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.767707320586223, |
|
"grad_norm": 0.030940811674726062, |
|
"learning_rate": 3.1180962282991976e-06, |
|
"loss": 0.1008, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 0.7691476720319758, |
|
"grad_norm": 0.02732290490010739, |
|
"learning_rate": 3.081690954679313e-06, |
|
"loss": 0.0968, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.7705880234777286, |
|
"grad_norm": 0.029043196623878744, |
|
"learning_rate": 3.0454607039404206e-06, |
|
"loss": 0.0957, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.7720283749234813, |
|
"grad_norm": 0.029602576168767845, |
|
"learning_rate": 3.0094063926537233e-06, |
|
"loss": 0.0993, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.7734687263692341, |
|
"grad_norm": 0.02834425216635453, |
|
"learning_rate": 2.973528932939429e-06, |
|
"loss": 0.0954, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 0.7749090778149869, |
|
"grad_norm": 0.02696740012171304, |
|
"learning_rate": 2.937829232443654e-06, |
|
"loss": 0.0974, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.7763494292607396, |
|
"grad_norm": 0.029300105549739816, |
|
"learning_rate": 2.9023081943154753e-06, |
|
"loss": 0.1012, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 0.7777897807064924, |
|
"grad_norm": 0.025590035684474745, |
|
"learning_rate": 2.86696671718408e-06, |
|
"loss": 0.0954, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.7792301321522451, |
|
"grad_norm": 0.028340578845450066, |
|
"learning_rate": 2.8318056951360294e-06, |
|
"loss": 0.0978, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 0.780670483597998, |
|
"grad_norm": 0.027993276158026614, |
|
"learning_rate": 2.7968260176926407e-06, |
|
"loss": 0.1014, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.7821108350437507, |
|
"grad_norm": 0.03153867114181106, |
|
"learning_rate": 2.762028569787485e-06, |
|
"loss": 0.1051, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 0.7835511864895034, |
|
"grad_norm": 0.03024494353093925, |
|
"learning_rate": 2.7274142317439956e-06, |
|
"loss": 0.0979, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.7849915379352562, |
|
"grad_norm": 0.02889386857808468, |
|
"learning_rate": 2.6929838792532035e-06, |
|
"loss": 0.102, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.7864318893810089, |
|
"grad_norm": 0.028444248618397415, |
|
"learning_rate": 2.6587383833515746e-06, |
|
"loss": 0.097, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.7878722408267618, |
|
"grad_norm": 0.03018963515700107, |
|
"learning_rate": 2.6246786103989887e-06, |
|
"loss": 0.0969, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 0.7893125922725145, |
|
"grad_norm": 0.028448487698969887, |
|
"learning_rate": 2.590805422056807e-06, |
|
"loss": 0.0976, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.7907529437182672, |
|
"grad_norm": 0.025980334968211127, |
|
"learning_rate": 2.5571196752660733e-06, |
|
"loss": 0.098, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 0.79219329516402, |
|
"grad_norm": 0.0279120349990706, |
|
"learning_rate": 2.5236222222258455e-06, |
|
"loss": 0.1006, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.7936336466097728, |
|
"grad_norm": 0.03043213886693375, |
|
"learning_rate": 2.4903139103716365e-06, |
|
"loss": 0.0961, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 0.7950739980555256, |
|
"grad_norm": 0.027713327075462212, |
|
"learning_rate": 2.4571955823539617e-06, |
|
"loss": 0.0927, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.7965143495012783, |
|
"grad_norm": 0.03200709702140973, |
|
"learning_rate": 2.424268076017032e-06, |
|
"loss": 0.099, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 0.797954700947031, |
|
"grad_norm": 0.02862031576494855, |
|
"learning_rate": 2.3915322243775564e-06, |
|
"loss": 0.095, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.7993950523927839, |
|
"grad_norm": 0.030880108006987813, |
|
"learning_rate": 2.3589888556036623e-06, |
|
"loss": 0.1012, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.8008354038385366, |
|
"grad_norm": 0.028581095269047594, |
|
"learning_rate": 2.3266387929939525e-06, |
|
"loss": 0.1004, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.8022757552842894, |
|
"grad_norm": 0.029454825149171658, |
|
"learning_rate": 2.294482854956672e-06, |
|
"loss": 0.0984, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 0.8037161067300421, |
|
"grad_norm": 0.03006431233771053, |
|
"learning_rate": 2.2625218549890014e-06, |
|
"loss": 0.0995, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.8051564581757948, |
|
"grad_norm": 0.027799037780853072, |
|
"learning_rate": 2.230756601656481e-06, |
|
"loss": 0.1018, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 0.8065968096215477, |
|
"grad_norm": 0.03649536849392369, |
|
"learning_rate": 2.1991878985725566e-06, |
|
"loss": 0.0936, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.8080371610673004, |
|
"grad_norm": 0.028269913936191833, |
|
"learning_rate": 2.167816544378244e-06, |
|
"loss": 0.1009, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 0.8094775125130532, |
|
"grad_norm": 0.028604988396104106, |
|
"learning_rate": 2.1366433327219284e-06, |
|
"loss": 0.1025, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.8109178639588059, |
|
"grad_norm": 0.02876137453572099, |
|
"learning_rate": 2.105669052239274e-06, |
|
"loss": 0.1029, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 0.8123582154045587, |
|
"grad_norm": 0.028447843531367326, |
|
"learning_rate": 2.0748944865333033e-06, |
|
"loss": 0.0989, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.8137985668503115, |
|
"grad_norm": 0.02868959528432683, |
|
"learning_rate": 2.0443204141545393e-06, |
|
"loss": 0.0991, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 0.8152389182960642, |
|
"grad_norm": 0.030631865795776963, |
|
"learning_rate": 2.013947608581327e-06, |
|
"loss": 0.0988, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.816679269741817, |
|
"grad_norm": 0.028274634163983327, |
|
"learning_rate": 1.983776838200262e-06, |
|
"loss": 0.0967, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 0.8181196211875698, |
|
"grad_norm": 0.03127569269957067, |
|
"learning_rate": 1.9538088662867495e-06, |
|
"loss": 0.0989, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.8195599726333225, |
|
"grad_norm": 0.028435906385402127, |
|
"learning_rate": 1.924044450985706e-06, |
|
"loss": 0.1035, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 0.8210003240790753, |
|
"grad_norm": 0.030905808512586234, |
|
"learning_rate": 1.8944843452923546e-06, |
|
"loss": 0.1031, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.822440675524828, |
|
"grad_norm": 0.02821159641559338, |
|
"learning_rate": 1.8651292970332003e-06, |
|
"loss": 0.0975, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 0.8238810269705809, |
|
"grad_norm": 0.026301972641988965, |
|
"learning_rate": 1.835980048847098e-06, |
|
"loss": 0.1016, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.8253213784163336, |
|
"grad_norm": 0.02850407318953127, |
|
"learning_rate": 1.8070373381664752e-06, |
|
"loss": 0.0957, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 0.8267617298620864, |
|
"grad_norm": 0.03287640405240904, |
|
"learning_rate": 1.77830189719866e-06, |
|
"loss": 0.0974, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.8282020813078391, |
|
"grad_norm": 0.02979639575966765, |
|
"learning_rate": 1.7497744529073712e-06, |
|
"loss": 0.0962, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 0.8296424327535918, |
|
"grad_norm": 0.028883353854663875, |
|
"learning_rate": 1.721455726994321e-06, |
|
"loss": 0.1033, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.8310827841993447, |
|
"grad_norm": 0.027623128069878516, |
|
"learning_rate": 1.6933464358809593e-06, |
|
"loss": 0.0915, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 0.8325231356450974, |
|
"grad_norm": 0.031518406493856295, |
|
"learning_rate": 1.6654472906903486e-06, |
|
"loss": 0.0953, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.8339634870908502, |
|
"grad_norm": 0.025571212721863242, |
|
"learning_rate": 1.637758997229173e-06, |
|
"loss": 0.0939, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 0.8354038385366029, |
|
"grad_norm": 0.026694901604252565, |
|
"learning_rate": 1.6102822559698828e-06, |
|
"loss": 0.0947, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.8368441899823557, |
|
"grad_norm": 0.0277750741435232, |
|
"learning_rate": 1.5830177620329712e-06, |
|
"loss": 0.0989, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 0.8382845414281085, |
|
"grad_norm": 0.02924974903749895, |
|
"learning_rate": 1.5559662051694002e-06, |
|
"loss": 0.0986, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.8397248928738612, |
|
"grad_norm": 0.032423419131996915, |
|
"learning_rate": 1.5291282697431353e-06, |
|
"loss": 0.0989, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 0.841165244319614, |
|
"grad_norm": 0.030642374976224046, |
|
"learning_rate": 1.502504634713835e-06, |
|
"loss": 0.0963, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.8426055957653668, |
|
"grad_norm": 0.02981716833954207, |
|
"learning_rate": 1.4760959736196834e-06, |
|
"loss": 0.0961, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 0.8440459472111195, |
|
"grad_norm": 0.028126597558143722, |
|
"learning_rate": 1.4499029545603472e-06, |
|
"loss": 0.0996, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.8454862986568723, |
|
"grad_norm": 0.030061407833917978, |
|
"learning_rate": 1.423926240180068e-06, |
|
"loss": 0.1027, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 0.846926650102625, |
|
"grad_norm": 0.030976625673774977, |
|
"learning_rate": 1.3981664876509028e-06, |
|
"loss": 0.0995, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.8483670015483779, |
|
"grad_norm": 0.030419140568639338, |
|
"learning_rate": 1.3726243486560975e-06, |
|
"loss": 0.1049, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 0.8498073529941306, |
|
"grad_norm": 0.02669531962544557, |
|
"learning_rate": 1.3473004693736037e-06, |
|
"loss": 0.1026, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.8512477044398833, |
|
"grad_norm": 0.028011138842770462, |
|
"learning_rate": 1.3221954904597256e-06, |
|
"loss": 0.0957, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 0.8526880558856361, |
|
"grad_norm": 0.029219102403077252, |
|
"learning_rate": 1.2973100470329159e-06, |
|
"loss": 0.0989, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.8541284073313888, |
|
"grad_norm": 0.028252079128452696, |
|
"learning_rate": 1.272644768657707e-06, |
|
"loss": 0.096, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 0.8555687587771417, |
|
"grad_norm": 0.028035615158873324, |
|
"learning_rate": 1.248200279328784e-06, |
|
"loss": 0.0985, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.8570091102228944, |
|
"grad_norm": 0.03053162765030892, |
|
"learning_rate": 1.223977197455204e-06, |
|
"loss": 0.1006, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 0.8584494616686471, |
|
"grad_norm": 0.028648955173488484, |
|
"learning_rate": 1.1999761358447403e-06, |
|
"loss": 0.0994, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.8598898131143999, |
|
"grad_norm": 0.02748195928805478, |
|
"learning_rate": 1.1761977016883897e-06, |
|
"loss": 0.0958, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 0.8613301645601527, |
|
"grad_norm": 0.028621187530613948, |
|
"learning_rate": 1.152642496544998e-06, |
|
"loss": 0.0958, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.8627705160059055, |
|
"grad_norm": 0.027853067803610992, |
|
"learning_rate": 1.1293111163260639e-06, |
|
"loss": 0.0997, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 0.8642108674516582, |
|
"grad_norm": 0.027187976560834638, |
|
"learning_rate": 1.1062041512806409e-06, |
|
"loss": 0.1028, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.8642108674516582, |
|
"eval_loss": 0.09996497631072998, |
|
"eval_runtime": 862.256, |
|
"eval_samples_per_second": 2.092, |
|
"eval_steps_per_second": 0.523, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.8656512188974109, |
|
"grad_norm": 0.026511113426348355, |
|
"learning_rate": 1.0833221859804188e-06, |
|
"loss": 0.0976, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 0.8670915703431638, |
|
"grad_norm": 0.02987993551142259, |
|
"learning_rate": 1.0606657993049253e-06, |
|
"loss": 0.0966, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.8685319217889165, |
|
"grad_norm": 0.029689732788246524, |
|
"learning_rate": 1.0382355644268871e-06, |
|
"loss": 0.1041, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 0.8699722732346693, |
|
"grad_norm": 0.0267869386208169, |
|
"learning_rate": 1.0160320487977349e-06, |
|
"loss": 0.0966, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.871412624680422, |
|
"grad_norm": 0.032293109106566735, |
|
"learning_rate": 9.940558141332323e-07, |
|
"loss": 0.1048, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 0.8728529761261747, |
|
"grad_norm": 0.026383256040976497, |
|
"learning_rate": 9.723074163992774e-07, |
|
"loss": 0.0988, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.8742933275719276, |
|
"grad_norm": 0.028451018898224757, |
|
"learning_rate": 9.507874057978339e-07, |
|
"loss": 0.0974, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 0.8757336790176803, |
|
"grad_norm": 0.03180099043766917, |
|
"learning_rate": 9.294963267530177e-07, |
|
"loss": 0.097, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.8771740304634331, |
|
"grad_norm": 0.030029444501747108, |
|
"learning_rate": 9.084347178973107e-07, |
|
"loss": 0.0963, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 0.8786143819091858, |
|
"grad_norm": 0.030459369725609208, |
|
"learning_rate": 8.876031120579454e-07, |
|
"loss": 0.0985, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.8800547333549386, |
|
"grad_norm": 0.030593158083603012, |
|
"learning_rate": 8.670020362434229e-07, |
|
"loss": 0.0975, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 0.8814950848006914, |
|
"grad_norm": 0.02863412252326483, |
|
"learning_rate": 8.466320116301752e-07, |
|
"loss": 0.0959, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.8829354362464441, |
|
"grad_norm": 0.028283718935748024, |
|
"learning_rate": 8.264935535493879e-07, |
|
"loss": 0.0956, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 0.8843757876921969, |
|
"grad_norm": 0.032405657391703896, |
|
"learning_rate": 8.065871714739581e-07, |
|
"loss": 0.1016, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.8858161391379497, |
|
"grad_norm": 0.02841639634176925, |
|
"learning_rate": 7.869133690056063e-07, |
|
"loss": 0.0982, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 0.8872564905837024, |
|
"grad_norm": 0.026380393134296947, |
|
"learning_rate": 7.67472643862136e-07, |
|
"loss": 0.0949, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.8886968420294552, |
|
"grad_norm": 0.03051349318569555, |
|
"learning_rate": 7.482654878648465e-07, |
|
"loss": 0.1076, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 0.8901371934752079, |
|
"grad_norm": 0.026798564091589188, |
|
"learning_rate": 7.292923869260837e-07, |
|
"loss": 0.1009, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.8915775449209608, |
|
"grad_norm": 0.026243273967039876, |
|
"learning_rate": 7.105538210369467e-07, |
|
"loss": 0.0937, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 0.8930178963667135, |
|
"grad_norm": 0.027316057706649032, |
|
"learning_rate": 6.920502642551519e-07, |
|
"loss": 0.0981, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.8944582478124662, |
|
"grad_norm": 0.030461472244114565, |
|
"learning_rate": 6.737821846930403e-07, |
|
"loss": 0.1013, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 0.895898599258219, |
|
"grad_norm": 0.029378550013335232, |
|
"learning_rate": 6.557500445057252e-07, |
|
"loss": 0.1032, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.8973389507039717, |
|
"grad_norm": 0.03205510198240369, |
|
"learning_rate": 6.379542998794086e-07, |
|
"loss": 0.0942, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 0.8987793021497246, |
|
"grad_norm": 0.02919082875531507, |
|
"learning_rate": 6.203954010198387e-07, |
|
"loss": 0.1016, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.9002196535954773, |
|
"grad_norm": 0.028309778648473748, |
|
"learning_rate": 6.030737921409169e-07, |
|
"loss": 0.0972, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 0.90166000504123, |
|
"grad_norm": 0.02666645822561416, |
|
"learning_rate": 5.859899114534662e-07, |
|
"loss": 0.0958, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.9031003564869828, |
|
"grad_norm": 0.02914798086446344, |
|
"learning_rate": 5.691441911541385e-07, |
|
"loss": 0.0989, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 0.9045407079327356, |
|
"grad_norm": 0.029321419229845342, |
|
"learning_rate": 5.525370574144873e-07, |
|
"loss": 0.0943, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.9059810593784884, |
|
"grad_norm": 0.029224751094801393, |
|
"learning_rate": 5.361689303701767e-07, |
|
"loss": 0.0927, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 0.9074214108242411, |
|
"grad_norm": 0.028920548399641463, |
|
"learning_rate": 5.200402241103674e-07, |
|
"loss": 0.0973, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.9088617622699939, |
|
"grad_norm": 0.026508682335799016, |
|
"learning_rate": 5.041513466672254e-07, |
|
"loss": 0.0911, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 0.9103021137157467, |
|
"grad_norm": 0.0286913163547185, |
|
"learning_rate": 4.885027000056075e-07, |
|
"loss": 0.1003, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.9117424651614994, |
|
"grad_norm": 0.028408650516893636, |
|
"learning_rate": 4.730946800128888e-07, |
|
"loss": 0.0991, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 0.9131828166072522, |
|
"grad_norm": 0.027915924830151973, |
|
"learning_rate": 4.5792767648895396e-07, |
|
"loss": 0.1008, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.9146231680530049, |
|
"grad_norm": 0.03016832518348953, |
|
"learning_rate": 4.4300207313632713e-07, |
|
"loss": 0.0979, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 0.9160635194987578, |
|
"grad_norm": 0.025345673276194875, |
|
"learning_rate": 4.2831824755046994e-07, |
|
"loss": 0.0963, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.9175038709445105, |
|
"grad_norm": 0.026699911404801765, |
|
"learning_rate": 4.138765712102299e-07, |
|
"loss": 0.0949, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 0.9189442223902632, |
|
"grad_norm": 0.028093887453801158, |
|
"learning_rate": 3.9967740946843523e-07, |
|
"loss": 0.0962, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.920384573836016, |
|
"grad_norm": 0.030027557409672574, |
|
"learning_rate": 3.8572112154266593e-07, |
|
"loss": 0.1003, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 0.9218249252817687, |
|
"grad_norm": 0.030720426408657343, |
|
"learning_rate": 3.7200806050614714e-07, |
|
"loss": 0.0966, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.9232652767275216, |
|
"grad_norm": 0.028926025323559437, |
|
"learning_rate": 3.585385732788327e-07, |
|
"loss": 0.0957, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 0.9247056281732743, |
|
"grad_norm": 0.03702215777030734, |
|
"learning_rate": 3.453130006186234e-07, |
|
"loss": 0.0982, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.926145979619027, |
|
"grad_norm": 0.029009798104548613, |
|
"learning_rate": 3.3233167711274496e-07, |
|
"loss": 0.1018, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 0.9275863310647798, |
|
"grad_norm": 0.026198880889564888, |
|
"learning_rate": 3.1959493116928473e-07, |
|
"loss": 0.0987, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.9290266825105326, |
|
"grad_norm": 0.028949046641791966, |
|
"learning_rate": 3.0710308500888184e-07, |
|
"loss": 0.1009, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 0.9304670339562854, |
|
"grad_norm": 0.03211145771064907, |
|
"learning_rate": 2.948564546565791e-07, |
|
"loss": 0.0964, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.9319073854020381, |
|
"grad_norm": 0.031242518818499414, |
|
"learning_rate": 2.828553499338227e-07, |
|
"loss": 0.0946, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 0.9333477368477908, |
|
"grad_norm": 0.031264506938530154, |
|
"learning_rate": 2.71100074450632e-07, |
|
"loss": 0.0958, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.9347880882935437, |
|
"grad_norm": 0.029366673066917617, |
|
"learning_rate": 2.595909255979079e-07, |
|
"loss": 0.0981, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 0.9362284397392964, |
|
"grad_norm": 0.0265988622897014, |
|
"learning_rate": 2.4832819453992073e-07, |
|
"loss": 0.099, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.9376687911850492, |
|
"grad_norm": 0.027150460711859906, |
|
"learning_rate": 2.3731216620693554e-07, |
|
"loss": 0.1026, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 0.9391091426308019, |
|
"grad_norm": 0.03127106760239122, |
|
"learning_rate": 2.2654311928800965e-07, |
|
"loss": 0.093, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.9405494940765546, |
|
"grad_norm": 0.030758685060327442, |
|
"learning_rate": 2.1602132622393745e-07, |
|
"loss": 0.1015, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 0.9419898455223075, |
|
"grad_norm": 0.028184366940994177, |
|
"learning_rate": 2.0574705320036025e-07, |
|
"loss": 0.1074, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.9434301969680602, |
|
"grad_norm": 0.02849030323321046, |
|
"learning_rate": 1.9572056014103281e-07, |
|
"loss": 0.1001, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 0.944870548413813, |
|
"grad_norm": 0.03266549303159609, |
|
"learning_rate": 1.8594210070124852e-07, |
|
"loss": 0.1045, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.9463108998595657, |
|
"grad_norm": 0.026988212769410665, |
|
"learning_rate": 1.7641192226141913e-07, |
|
"loss": 0.0937, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 0.9477512513053185, |
|
"grad_norm": 0.02529946877891728, |
|
"learning_rate": 1.671302659208185e-07, |
|
"loss": 0.098, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.9491916027510713, |
|
"grad_norm": 0.03314606321848586, |
|
"learning_rate": 1.58097366491482e-07, |
|
"loss": 0.1009, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 0.950631954196824, |
|
"grad_norm": 0.027540135720360067, |
|
"learning_rate": 1.4931345249226792e-07, |
|
"loss": 0.0967, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.9520723056425768, |
|
"grad_norm": 0.028602952429147437, |
|
"learning_rate": 1.407787461430743e-07, |
|
"loss": 0.1042, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 0.9535126570883296, |
|
"grad_norm": 0.028879029680259325, |
|
"learning_rate": 1.324934633592201e-07, |
|
"loss": 0.0945, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.9549530085340823, |
|
"grad_norm": 0.02860475239355599, |
|
"learning_rate": 1.2445781374597842e-07, |
|
"loss": 0.1017, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 0.9563933599798351, |
|
"grad_norm": 0.026737800323851892, |
|
"learning_rate": 1.1667200059327644e-07, |
|
"loss": 0.0971, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.9578337114255878, |
|
"grad_norm": 0.029364181788732343, |
|
"learning_rate": 1.0913622087055264e-07, |
|
"loss": 0.1005, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 0.9592740628713406, |
|
"grad_norm": 0.03131909258187323, |
|
"learning_rate": 1.0185066522177545e-07, |
|
"loss": 0.092, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.9607144143170934, |
|
"grad_norm": 0.029983775954717285, |
|
"learning_rate": 9.481551796061472e-08, |
|
"loss": 0.0989, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 0.9621547657628461, |
|
"grad_norm": 0.030901618242872846, |
|
"learning_rate": 8.803095706578335e-08, |
|
"loss": 0.0987, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.9635951172085989, |
|
"grad_norm": 0.027894545371751872, |
|
"learning_rate": 8.149715417653414e-08, |
|
"loss": 0.0969, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 0.9650354686543516, |
|
"grad_norm": 0.027564458728654692, |
|
"learning_rate": 7.521427458831776e-08, |
|
"loss": 0.099, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.9664758201001045, |
|
"grad_norm": 0.025676377662014388, |
|
"learning_rate": 6.918247724859939e-08, |
|
"loss": 0.0941, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 0.9679161715458572, |
|
"grad_norm": 0.030832764931546873, |
|
"learning_rate": 6.340191475283753e-08, |
|
"loss": 0.0988, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.9693565229916099, |
|
"grad_norm": 0.03090619704744881, |
|
"learning_rate": 5.787273334062593e-08, |
|
"loss": 0.1, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 0.9707968744373627, |
|
"grad_norm": 0.028989234177967914, |
|
"learning_rate": 5.259507289199328e-08, |
|
"loss": 0.1042, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.9722372258831155, |
|
"grad_norm": 0.031096318018671423, |
|
"learning_rate": 4.756906692386043e-08, |
|
"loss": 0.1015, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 0.9736775773288683, |
|
"grad_norm": 0.031160065189087587, |
|
"learning_rate": 4.2794842586670884e-08, |
|
"loss": 0.0987, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.975117928774621, |
|
"grad_norm": 0.027856429944083137, |
|
"learning_rate": 3.827252066116338e-08, |
|
"loss": 0.0934, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 0.9765582802203737, |
|
"grad_norm": 0.0292530610363585, |
|
"learning_rate": 3.400221555532768e-08, |
|
"loss": 0.0931, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.9779986316661265, |
|
"grad_norm": 0.030499523184600807, |
|
"learning_rate": 2.998403530150018e-08, |
|
"loss": 0.1011, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 0.9794389831118793, |
|
"grad_norm": 0.030425018653641535, |
|
"learning_rate": 2.6218081553638363e-08, |
|
"loss": 0.1015, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.9808793345576321, |
|
"grad_norm": 0.028935900226943704, |
|
"learning_rate": 2.2704449584745046e-08, |
|
"loss": 0.1015, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 0.9823196860033848, |
|
"grad_norm": 0.031000647192898592, |
|
"learning_rate": 1.9443228284455882e-08, |
|
"loss": 0.0967, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.9837600374491375, |
|
"grad_norm": 0.02615522374107861, |
|
"learning_rate": 1.6434500156800037e-08, |
|
"loss": 0.098, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 0.9852003888948904, |
|
"grad_norm": 0.0296512129505122, |
|
"learning_rate": 1.3678341318100751e-08, |
|
"loss": 0.0941, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.9866407403406431, |
|
"grad_norm": 0.027814116523255324, |
|
"learning_rate": 1.1174821495059106e-08, |
|
"loss": 0.0959, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 0.9880810917863959, |
|
"grad_norm": 0.03492776644651804, |
|
"learning_rate": 8.924004022986543e-09, |
|
"loss": 0.0953, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.9895214432321486, |
|
"grad_norm": 0.02820257683900699, |
|
"learning_rate": 6.9259458442005875e-09, |
|
"loss": 0.1026, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 0.9909617946779014, |
|
"grad_norm": 0.030304015663341097, |
|
"learning_rate": 5.180697506587118e-09, |
|
"loss": 0.0955, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.9924021461236542, |
|
"grad_norm": 0.026993412863199263, |
|
"learning_rate": 3.688303162322493e-09, |
|
"loss": 0.0969, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 0.9938424975694069, |
|
"grad_norm": 0.02671367368351453, |
|
"learning_rate": 2.44880056675334e-09, |
|
"loss": 0.0921, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.9952828490151597, |
|
"grad_norm": 0.027626479574708512, |
|
"learning_rate": 1.4622210774428714e-09, |
|
"loss": 0.0986, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 0.9967232004609125, |
|
"grad_norm": 0.02933812040981167, |
|
"learning_rate": 7.285896533770765e-10, |
|
"loss": 0.0992, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.9981635519066653, |
|
"grad_norm": 0.027258112232128342, |
|
"learning_rate": 2.479248543363344e-10, |
|
"loss": 0.0967, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 0.999603903352418, |
|
"grad_norm": 0.029806576902039788, |
|
"learning_rate": 2.0238840421349382e-11, |
|
"loss": 0.0979, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.9998919736415686, |
|
"step": 3471, |
|
"total_flos": 2.660820529446912e+16, |
|
"train_loss": 0.2432632086317191, |
|
"train_runtime": 212607.6799, |
|
"train_samples_per_second": 1.045, |
|
"train_steps_per_second": 0.016 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 3471, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.660820529446912e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|