|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9994447529150472, |
|
"eval_steps": 100, |
|
"global_step": 450, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01110494169905608, |
|
"grad_norm": 5.397180080413818, |
|
"learning_rate": 2.222222222222222e-06, |
|
"loss": 1.3609, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.02220988339811216, |
|
"grad_norm": 3.014500141143799, |
|
"learning_rate": 4.444444444444444e-06, |
|
"loss": 1.366, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03331482509716824, |
|
"grad_norm": 2.8811240196228027, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 1.2626, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.04441976679622432, |
|
"grad_norm": 2.0619661808013916, |
|
"learning_rate": 8.888888888888888e-06, |
|
"loss": 1.1859, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0555247084952804, |
|
"grad_norm": 1.5281075239181519, |
|
"learning_rate": 1.1111111111111113e-05, |
|
"loss": 1.1469, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.06662965019433648, |
|
"grad_norm": 1.2808576822280884, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 1.1063, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07773459189339256, |
|
"grad_norm": 1.2353777885437012, |
|
"learning_rate": 1.555555555555556e-05, |
|
"loss": 1.0649, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.08883953359244864, |
|
"grad_norm": 1.280089020729065, |
|
"learning_rate": 1.7777777777777777e-05, |
|
"loss": 1.0574, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.09994447529150471, |
|
"grad_norm": 1.1332577466964722, |
|
"learning_rate": 2e-05, |
|
"loss": 1.059, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.1110494169905608, |
|
"grad_norm": 1.115410566329956, |
|
"learning_rate": 1.9992479525042305e-05, |
|
"loss": 1.0259, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.12215435868961688, |
|
"grad_norm": 1.1548365354537964, |
|
"learning_rate": 1.996992941167792e-05, |
|
"loss": 1.0407, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.13325930038867295, |
|
"grad_norm": 1.1644798517227173, |
|
"learning_rate": 1.9932383577419432e-05, |
|
"loss": 1.0219, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.14436424208772905, |
|
"grad_norm": 1.0820422172546387, |
|
"learning_rate": 1.9879898494768093e-05, |
|
"loss": 1.0128, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.15546918378678512, |
|
"grad_norm": 1.086032748222351, |
|
"learning_rate": 1.9812553106273848e-05, |
|
"loss": 1.0004, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.1665741254858412, |
|
"grad_norm": 1.0374068021774292, |
|
"learning_rate": 1.973044870579824e-05, |
|
"loss": 1.0018, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.1776790671848973, |
|
"grad_norm": 1.0140597820281982, |
|
"learning_rate": 1.9633708786158803e-05, |
|
"loss": 0.9955, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.18878400888395336, |
|
"grad_norm": 1.124200701713562, |
|
"learning_rate": 1.9522478853384154e-05, |
|
"loss": 0.9913, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.19988895058300943, |
|
"grad_norm": 1.048175573348999, |
|
"learning_rate": 1.9396926207859085e-05, |
|
"loss": 0.9913, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.21099389228206553, |
|
"grad_norm": 1.1582220792770386, |
|
"learning_rate": 1.9257239692688907e-05, |
|
"loss": 1.0055, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.2220988339811216, |
|
"grad_norm": 0.9994953870773315, |
|
"learning_rate": 1.9103629409661468e-05, |
|
"loss": 0.9986, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2220988339811216, |
|
"eval_loss": 1.0169899463653564, |
|
"eval_runtime": 32.3732, |
|
"eval_samples_per_second": 7.908, |
|
"eval_steps_per_second": 0.988, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.23320377568017767, |
|
"grad_norm": 1.1112126111984253, |
|
"learning_rate": 1.8936326403234125e-05, |
|
"loss": 0.9954, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.24430871737923376, |
|
"grad_norm": 1.0678929090499878, |
|
"learning_rate": 1.8755582313020912e-05, |
|
"loss": 0.9847, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.25541365907828983, |
|
"grad_norm": 1.041691780090332, |
|
"learning_rate": 1.8561668995302668e-05, |
|
"loss": 0.9853, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.2665186007773459, |
|
"grad_norm": 1.0803673267364502, |
|
"learning_rate": 1.8354878114129368e-05, |
|
"loss": 0.9597, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.277623542476402, |
|
"grad_norm": 1.0018560886383057, |
|
"learning_rate": 1.8135520702629677e-05, |
|
"loss": 0.9833, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.2887284841754581, |
|
"grad_norm": 1.1001731157302856, |
|
"learning_rate": 1.7903926695187595e-05, |
|
"loss": 0.9699, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.29983342587451417, |
|
"grad_norm": 1.0463827848434448, |
|
"learning_rate": 1.766044443118978e-05, |
|
"loss": 0.9712, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.31093836757357024, |
|
"grad_norm": 1.3691056966781616, |
|
"learning_rate": 1.740544013109005e-05, |
|
"loss": 0.987, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3220433092726263, |
|
"grad_norm": 1.0720419883728027, |
|
"learning_rate": 1.7139297345578992e-05, |
|
"loss": 0.9626, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.3331482509716824, |
|
"grad_norm": 1.0211371183395386, |
|
"learning_rate": 1.686241637868734e-05, |
|
"loss": 0.9522, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3442531926707385, |
|
"grad_norm": 1.0119701623916626, |
|
"learning_rate": 1.657521368569064e-05, |
|
"loss": 0.977, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.3553581343697946, |
|
"grad_norm": 1.0520477294921875, |
|
"learning_rate": 1.627812124672099e-05, |
|
"loss": 0.9769, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.36646307606885065, |
|
"grad_norm": 1.1625330448150635, |
|
"learning_rate": 1.5971585917027864e-05, |
|
"loss": 0.9802, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.3775680177679067, |
|
"grad_norm": 1.0677601099014282, |
|
"learning_rate": 1.5656068754865388e-05, |
|
"loss": 0.96, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.3886729594669628, |
|
"grad_norm": 1.1049383878707886, |
|
"learning_rate": 1.5332044328016916e-05, |
|
"loss": 0.9643, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.39977790116601886, |
|
"grad_norm": 1.062893271446228, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 0.9386, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.410882842865075, |
|
"grad_norm": 1.0128767490386963, |
|
"learning_rate": 1.4660435197025391e-05, |
|
"loss": 0.9559, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.42198778456413105, |
|
"grad_norm": 1.0378108024597168, |
|
"learning_rate": 1.4313860656812537e-05, |
|
"loss": 0.9347, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4330927262631871, |
|
"grad_norm": 1.0094784498214722, |
|
"learning_rate": 1.396079766039157e-05, |
|
"loss": 0.9623, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.4441976679622432, |
|
"grad_norm": 0.9790409803390503, |
|
"learning_rate": 1.3601777248047105e-05, |
|
"loss": 0.9678, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4441976679622432, |
|
"eval_loss": 0.9827648997306824, |
|
"eval_runtime": 32.188, |
|
"eval_samples_per_second": 7.953, |
|
"eval_steps_per_second": 0.994, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.45530260966129926, |
|
"grad_norm": 0.9436436891555786, |
|
"learning_rate": 1.3237339420583213e-05, |
|
"loss": 0.96, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.46640755136035533, |
|
"grad_norm": 0.9316679239273071, |
|
"learning_rate": 1.2868032327110904e-05, |
|
"loss": 0.9326, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.47751249305941146, |
|
"grad_norm": 0.9514685273170471, |
|
"learning_rate": 1.2494411440579814e-05, |
|
"loss": 0.9661, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.48861743475846753, |
|
"grad_norm": 1.0059814453125, |
|
"learning_rate": 1.211703872229411e-05, |
|
"loss": 0.915, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.4997223764575236, |
|
"grad_norm": 0.9615886211395264, |
|
"learning_rate": 1.1736481776669307e-05, |
|
"loss": 0.9478, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.5108273181565797, |
|
"grad_norm": 0.9393269419670105, |
|
"learning_rate": 1.1353312997501313e-05, |
|
"loss": 0.93, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5219322598556357, |
|
"grad_norm": 0.9677947759628296, |
|
"learning_rate": 1.0968108707031792e-05, |
|
"loss": 0.9507, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.5330372015546918, |
|
"grad_norm": 0.9616990685462952, |
|
"learning_rate": 1.0581448289104759e-05, |
|
"loss": 0.9205, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.5441421432537479, |
|
"grad_norm": 0.9439184665679932, |
|
"learning_rate": 1.0193913317718245e-05, |
|
"loss": 0.9673, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.555247084952804, |
|
"grad_norm": 1.0039938688278198, |
|
"learning_rate": 9.806086682281759e-06, |
|
"loss": 0.9373, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5663520266518601, |
|
"grad_norm": 0.966042697429657, |
|
"learning_rate": 9.418551710895243e-06, |
|
"loss": 0.9518, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.5774569683509162, |
|
"grad_norm": 0.9549779891967773, |
|
"learning_rate": 9.03189129296821e-06, |
|
"loss": 0.9537, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.5885619100499723, |
|
"grad_norm": 0.9227808713912964, |
|
"learning_rate": 8.646687002498692e-06, |
|
"loss": 0.9477, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.5996668517490283, |
|
"grad_norm": 0.9466913938522339, |
|
"learning_rate": 8.263518223330698e-06, |
|
"loss": 0.9306, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.6107717934480844, |
|
"grad_norm": 0.8789124488830566, |
|
"learning_rate": 7.882961277705897e-06, |
|
"loss": 0.9392, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.6218767351471405, |
|
"grad_norm": 0.9538043737411499, |
|
"learning_rate": 7.505588559420188e-06, |
|
"loss": 0.9541, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.6329816768461966, |
|
"grad_norm": 0.9842989444732666, |
|
"learning_rate": 7.131967672889101e-06, |
|
"loss": 0.9555, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.6440866185452526, |
|
"grad_norm": 0.951591432094574, |
|
"learning_rate": 6.762660579416791e-06, |
|
"loss": 0.947, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.6551915602443087, |
|
"grad_norm": 0.9323354363441467, |
|
"learning_rate": 6.3982227519528986e-06, |
|
"loss": 0.9434, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.6662965019433648, |
|
"grad_norm": 0.8553444147109985, |
|
"learning_rate": 6.039202339608432e-06, |
|
"loss": 0.9332, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6662965019433648, |
|
"eval_loss": 0.964606523513794, |
|
"eval_runtime": 32.3264, |
|
"eval_samples_per_second": 7.919, |
|
"eval_steps_per_second": 0.99, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6774014436424208, |
|
"grad_norm": 0.9099004864692688, |
|
"learning_rate": 5.686139343187468e-06, |
|
"loss": 0.9368, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.688506385341477, |
|
"grad_norm": 0.8473957777023315, |
|
"learning_rate": 5.339564802974615e-06, |
|
"loss": 0.9326, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.6996113270405331, |
|
"grad_norm": 0.8807346224784851, |
|
"learning_rate": 5.000000000000003e-06, |
|
"loss": 0.9316, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.7107162687395892, |
|
"grad_norm": 0.9112072587013245, |
|
"learning_rate": 4.66795567198309e-06, |
|
"loss": 0.9502, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.7218212104386452, |
|
"grad_norm": 0.892538845539093, |
|
"learning_rate": 4.343931245134616e-06, |
|
"loss": 0.9412, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.7329261521377013, |
|
"grad_norm": 0.8999789357185364, |
|
"learning_rate": 4.028414082972141e-06, |
|
"loss": 0.9345, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.7440310938367574, |
|
"grad_norm": 0.878555178642273, |
|
"learning_rate": 3.7218787532790167e-06, |
|
"loss": 0.9271, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.7551360355358134, |
|
"grad_norm": 0.8958238363265991, |
|
"learning_rate": 3.424786314309365e-06, |
|
"loss": 0.9289, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.7662409772348695, |
|
"grad_norm": 0.8239888548851013, |
|
"learning_rate": 3.1375836213126653e-06, |
|
"loss": 0.9422, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.7773459189339256, |
|
"grad_norm": 0.9257540702819824, |
|
"learning_rate": 2.8607026544210115e-06, |
|
"loss": 0.9281, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.7884508606329816, |
|
"grad_norm": 0.883544921875, |
|
"learning_rate": 2.594559868909956e-06, |
|
"loss": 0.9136, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.7995558023320377, |
|
"grad_norm": 6.725770950317383, |
|
"learning_rate": 2.339555568810221e-06, |
|
"loss": 0.938, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.8106607440310938, |
|
"grad_norm": 0.8539109230041504, |
|
"learning_rate": 2.0960733048124082e-06, |
|
"loss": 0.9311, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.82176568573015, |
|
"grad_norm": 0.9241852760314941, |
|
"learning_rate": 1.8644792973703252e-06, |
|
"loss": 0.9349, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.832870627429206, |
|
"grad_norm": 0.8888701796531677, |
|
"learning_rate": 1.6451218858706374e-06, |
|
"loss": 0.9426, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.8439755691282621, |
|
"grad_norm": 0.8378512263298035, |
|
"learning_rate": 1.4383310046973365e-06, |
|
"loss": 0.9213, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.8550805108273182, |
|
"grad_norm": 0.8606922626495361, |
|
"learning_rate": 1.2444176869790925e-06, |
|
"loss": 0.927, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.8661854525263742, |
|
"grad_norm": 0.7981701493263245, |
|
"learning_rate": 1.0636735967658785e-06, |
|
"loss": 0.9145, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.8772903942254303, |
|
"grad_norm": 0.8214133977890015, |
|
"learning_rate": 8.963705903385344e-07, |
|
"loss": 0.9398, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.8883953359244864, |
|
"grad_norm": 0.8718788623809814, |
|
"learning_rate": 7.427603073110967e-07, |
|
"loss": 0.9027, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8883953359244864, |
|
"eval_loss": 0.9576538801193237, |
|
"eval_runtime": 31.9874, |
|
"eval_samples_per_second": 8.003, |
|
"eval_steps_per_second": 1.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8995002776235425, |
|
"grad_norm": 0.8060287237167358, |
|
"learning_rate": 6.030737921409169e-07, |
|
"loss": 0.9305, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.9106052193225985, |
|
"grad_norm": 0.8300138711929321, |
|
"learning_rate": 4.775211466158469e-07, |
|
"loss": 0.9381, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.9217101610216546, |
|
"grad_norm": 0.8601639270782471, |
|
"learning_rate": 3.662912138411967e-07, |
|
"loss": 0.9231, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.9328151027207107, |
|
"grad_norm": 0.8134458065032959, |
|
"learning_rate": 2.6955129420176193e-07, |
|
"loss": 0.9115, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.9439200444197668, |
|
"grad_norm": 0.7816701531410217, |
|
"learning_rate": 1.874468937261531e-07, |
|
"loss": 0.9183, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.9550249861188229, |
|
"grad_norm": 0.8716827034950256, |
|
"learning_rate": 1.201015052319099e-07, |
|
"loss": 0.9503, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.966129927817879, |
|
"grad_norm": 0.8040913939476013, |
|
"learning_rate": 6.761642258056977e-08, |
|
"loss": 0.9306, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.9772348695169351, |
|
"grad_norm": 0.7763182520866394, |
|
"learning_rate": 3.0070588322079765e-08, |
|
"loss": 0.9244, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.9883398112159911, |
|
"grad_norm": 0.825076162815094, |
|
"learning_rate": 7.520474957699586e-09, |
|
"loss": 0.9321, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.9994447529150472, |
|
"grad_norm": 0.8443522453308105, |
|
"learning_rate": 0.0, |
|
"loss": 0.9389, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.9994447529150472, |
|
"step": 450, |
|
"total_flos": 1.8998720188121088e+17, |
|
"train_loss": 0.9762679852379693, |
|
"train_runtime": 18617.8565, |
|
"train_samples_per_second": 2.321, |
|
"train_steps_per_second": 0.024 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 450, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.8998720188121088e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|