|
{ |
|
"best_global_step": 1396, |
|
"best_metric": 0.6223743557929993, |
|
"best_model_checkpoint": "checkpoints/star_plus-llama-3.1-8b-gsm8k/gsm8k/finetune-llama-3.1-8b-gsm8k-step-3/checkpoint-1396", |
|
"epoch": 0.9016793454858619, |
|
"eval_steps": 349, |
|
"global_step": 3141, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0014353380221042056, |
|
"grad_norm": 13.870022843272388, |
|
"learning_rate": 1.1461318051575931e-09, |
|
"loss": 1.1547, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.002870676044208411, |
|
"grad_norm": 15.087326577278679, |
|
"learning_rate": 2.5787965616045846e-09, |
|
"loss": 1.1398, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.004306014066312616, |
|
"grad_norm": 14.18146075651338, |
|
"learning_rate": 4.011461318051576e-09, |
|
"loss": 1.1402, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.005741352088416822, |
|
"grad_norm": 13.690203022341509, |
|
"learning_rate": 5.444126074498567e-09, |
|
"loss": 1.1197, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0071766901105210276, |
|
"grad_norm": 14.994216901060916, |
|
"learning_rate": 6.876790830945558e-09, |
|
"loss": 1.1369, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.008612028132625233, |
|
"grad_norm": 14.400896427137175, |
|
"learning_rate": 8.30945558739255e-09, |
|
"loss": 1.1355, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.01004736615472944, |
|
"grad_norm": 14.26913515195767, |
|
"learning_rate": 9.742120343839541e-09, |
|
"loss": 1.1442, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.011482704176833645, |
|
"grad_norm": 13.797459248853228, |
|
"learning_rate": 1.1174785100286532e-08, |
|
"loss": 1.1198, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.01291804219893785, |
|
"grad_norm": 14.471409919274842, |
|
"learning_rate": 1.2607449856733523e-08, |
|
"loss": 1.1203, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.014353380221042055, |
|
"grad_norm": 14.748844131710204, |
|
"learning_rate": 1.4040114613180515e-08, |
|
"loss": 1.1226, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.01578871824314626, |
|
"grad_norm": 15.729621083463075, |
|
"learning_rate": 1.5472779369627508e-08, |
|
"loss": 1.1524, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.017224056265250465, |
|
"grad_norm": 15.386565556660619, |
|
"learning_rate": 1.69054441260745e-08, |
|
"loss": 1.1555, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.01865939428735467, |
|
"grad_norm": 13.85550381101084, |
|
"learning_rate": 1.833810888252149e-08, |
|
"loss": 1.1401, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.02009473230945888, |
|
"grad_norm": 13.717607018021473, |
|
"learning_rate": 1.977077363896848e-08, |
|
"loss": 1.1384, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.021530070331563084, |
|
"grad_norm": 13.565604110956505, |
|
"learning_rate": 2.1203438395415473e-08, |
|
"loss": 1.1018, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.02296540835366729, |
|
"grad_norm": 13.867377061312162, |
|
"learning_rate": 2.2636103151862464e-08, |
|
"loss": 1.1124, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.024400746375771495, |
|
"grad_norm": 13.724459591900809, |
|
"learning_rate": 2.4068767908309455e-08, |
|
"loss": 1.1398, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.0258360843978757, |
|
"grad_norm": 15.266898340074492, |
|
"learning_rate": 2.5501432664756446e-08, |
|
"loss": 1.147, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.027271422419979905, |
|
"grad_norm": 13.80077008791852, |
|
"learning_rate": 2.6934097421203438e-08, |
|
"loss": 1.0931, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.02870676044208411, |
|
"grad_norm": 14.264049058840255, |
|
"learning_rate": 2.8366762177650426e-08, |
|
"loss": 1.1096, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.030142098464188315, |
|
"grad_norm": 13.948028510127017, |
|
"learning_rate": 2.979942693409742e-08, |
|
"loss": 1.1363, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.03157743648629252, |
|
"grad_norm": 13.79267645960095, |
|
"learning_rate": 3.123209169054441e-08, |
|
"loss": 1.1159, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.03301277450839673, |
|
"grad_norm": 14.588900467039972, |
|
"learning_rate": 3.2664756446991406e-08, |
|
"loss": 1.1057, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.03444811253050093, |
|
"grad_norm": 12.835011681719193, |
|
"learning_rate": 3.409742120343839e-08, |
|
"loss": 1.0768, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.03588345055260514, |
|
"grad_norm": 12.865687203941038, |
|
"learning_rate": 3.553008595988539e-08, |
|
"loss": 1.0634, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.03731878857470934, |
|
"grad_norm": 13.384102130291096, |
|
"learning_rate": 3.696275071633237e-08, |
|
"loss": 1.0633, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.03875412659681355, |
|
"grad_norm": 13.041165747177553, |
|
"learning_rate": 3.839541547277937e-08, |
|
"loss": 1.0453, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.04018946461891776, |
|
"grad_norm": 13.819322616196978, |
|
"learning_rate": 3.9828080229226356e-08, |
|
"loss": 1.0406, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.04162480264102196, |
|
"grad_norm": 15.132519917569722, |
|
"learning_rate": 4.1260744985673354e-08, |
|
"loss": 1.0248, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.04306014066312617, |
|
"grad_norm": 16.304556779067884, |
|
"learning_rate": 4.269340974212034e-08, |
|
"loss": 1.0249, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.04449547868523037, |
|
"grad_norm": 16.01778148458211, |
|
"learning_rate": 4.4126074498567336e-08, |
|
"loss": 0.9789, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.04593081670733458, |
|
"grad_norm": 17.49228077832121, |
|
"learning_rate": 4.555873925501432e-08, |
|
"loss": 0.9715, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.04736615472943878, |
|
"grad_norm": 14.72967937422827, |
|
"learning_rate": 4.699140401146132e-08, |
|
"loss": 0.907, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.04880149275154299, |
|
"grad_norm": 13.06311838609418, |
|
"learning_rate": 4.8424068767908303e-08, |
|
"loss": 0.8993, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.05023683077364719, |
|
"grad_norm": 10.962383047958323, |
|
"learning_rate": 4.98567335243553e-08, |
|
"loss": 0.8467, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.0516721687957514, |
|
"grad_norm": 10.794251111033377, |
|
"learning_rate": 5.1289398280802286e-08, |
|
"loss": 0.8314, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.05310750681785561, |
|
"grad_norm": 12.739679331085805, |
|
"learning_rate": 5.272206303724928e-08, |
|
"loss": 0.8188, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.05454284483995981, |
|
"grad_norm": 16.34256020226391, |
|
"learning_rate": 5.4154727793696275e-08, |
|
"loss": 0.7914, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.05597818286206402, |
|
"grad_norm": 10.706894844684937, |
|
"learning_rate": 5.5587392550143266e-08, |
|
"loss": 0.7459, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.05741352088416822, |
|
"grad_norm": 13.476028036070423, |
|
"learning_rate": 5.702005730659025e-08, |
|
"loss": 0.7227, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.05884885890627243, |
|
"grad_norm": 6.419388332826234, |
|
"learning_rate": 5.845272206303725e-08, |
|
"loss": 0.647, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.06028419692837663, |
|
"grad_norm": 5.573797397881519, |
|
"learning_rate": 5.988538681948424e-08, |
|
"loss": 0.6502, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.06171953495048084, |
|
"grad_norm": 3.3893623971550917, |
|
"learning_rate": 6.131805157593123e-08, |
|
"loss": 0.6281, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.06315487297258504, |
|
"grad_norm": 2.8756784288209825, |
|
"learning_rate": 6.275071633237822e-08, |
|
"loss": 0.6464, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.06459021099468924, |
|
"grad_norm": 2.6960562601658853, |
|
"learning_rate": 6.418338108882521e-08, |
|
"loss": 0.6342, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.06602554901679346, |
|
"grad_norm": 2.672645623610309, |
|
"learning_rate": 6.56160458452722e-08, |
|
"loss": 0.6306, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.06746088703889766, |
|
"grad_norm": 2.7155506472775435, |
|
"learning_rate": 6.70487106017192e-08, |
|
"loss": 0.6313, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.06889622506100186, |
|
"grad_norm": 2.4926643277894285, |
|
"learning_rate": 6.848137535816619e-08, |
|
"loss": 0.6371, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.07033156308310608, |
|
"grad_norm": 2.5535696596499373, |
|
"learning_rate": 6.991404011461318e-08, |
|
"loss": 0.6337, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.07176690110521028, |
|
"grad_norm": 2.305504274318628, |
|
"learning_rate": 7.134670487106017e-08, |
|
"loss": 0.6252, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.07320223912731448, |
|
"grad_norm": 2.259445659814349, |
|
"learning_rate": 7.277936962750716e-08, |
|
"loss": 0.6223, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.07463757714941868, |
|
"grad_norm": 2.4454411408983514, |
|
"learning_rate": 7.421203438395415e-08, |
|
"loss": 0.646, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.0760729151715229, |
|
"grad_norm": 2.442391967235382, |
|
"learning_rate": 7.564469914040114e-08, |
|
"loss": 0.6105, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.0775082531936271, |
|
"grad_norm": 2.173170940546, |
|
"learning_rate": 7.707736389684814e-08, |
|
"loss": 0.6173, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.0789435912157313, |
|
"grad_norm": 2.2310208210940825, |
|
"learning_rate": 7.851002865329513e-08, |
|
"loss": 0.6244, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.08037892923783552, |
|
"grad_norm": 2.1247720503861687, |
|
"learning_rate": 7.994269340974212e-08, |
|
"loss": 0.611, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.08181426725993972, |
|
"grad_norm": 2.3653542261559144, |
|
"learning_rate": 8.137535816618911e-08, |
|
"loss": 0.6191, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.08324960528204392, |
|
"grad_norm": 2.33933439467133, |
|
"learning_rate": 8.28080229226361e-08, |
|
"loss": 0.6104, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.08468494330414812, |
|
"grad_norm": 2.3499668034162267, |
|
"learning_rate": 8.424068767908309e-08, |
|
"loss": 0.6117, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.08612028132625234, |
|
"grad_norm": 2.2312767888077643, |
|
"learning_rate": 8.567335243553008e-08, |
|
"loss": 0.6137, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.08755561934835654, |
|
"grad_norm": 2.258014565016843, |
|
"learning_rate": 8.710601719197707e-08, |
|
"loss": 0.6165, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.08899095737046074, |
|
"grad_norm": 2.361801401311493, |
|
"learning_rate": 8.853868194842407e-08, |
|
"loss": 0.6201, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.09042629539256494, |
|
"grad_norm": 2.0790761899486148, |
|
"learning_rate": 8.997134670487106e-08, |
|
"loss": 0.5957, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.09186163341466916, |
|
"grad_norm": 2.223817923234156, |
|
"learning_rate": 9.140401146131805e-08, |
|
"loss": 0.5882, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.09329697143677336, |
|
"grad_norm": 2.2127486325154764, |
|
"learning_rate": 9.283667621776504e-08, |
|
"loss": 0.6082, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.09473230945887756, |
|
"grad_norm": 2.0643057499306114, |
|
"learning_rate": 9.426934097421203e-08, |
|
"loss": 0.5952, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.09616764748098178, |
|
"grad_norm": 2.2854134236761046, |
|
"learning_rate": 9.570200573065902e-08, |
|
"loss": 0.5981, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.09760298550308598, |
|
"grad_norm": 2.1937418634526926, |
|
"learning_rate": 9.713467048710601e-08, |
|
"loss": 0.6301, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.09903832352519018, |
|
"grad_norm": 2.3914797327148474, |
|
"learning_rate": 9.8567335243553e-08, |
|
"loss": 0.6095, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.10018659394287355, |
|
"eval_loss": 0.629633367061615, |
|
"eval_runtime": 284.7819, |
|
"eval_samples_per_second": 134.226, |
|
"eval_steps_per_second": 2.1, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.10047366154729438, |
|
"grad_norm": 2.2241062219621788, |
|
"learning_rate": 1e-07, |
|
"loss": 0.5995, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.1019089995693986, |
|
"grad_norm": 2.3025600042484196, |
|
"learning_rate": 9.984051036682614e-08, |
|
"loss": 0.6244, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.1033443375915028, |
|
"grad_norm": 2.1673308778669664, |
|
"learning_rate": 9.96810207336523e-08, |
|
"loss": 0.6057, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.104779675613607, |
|
"grad_norm": 2.148853032332558, |
|
"learning_rate": 9.952153110047846e-08, |
|
"loss": 0.5998, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.10621501363571122, |
|
"grad_norm": 2.0349810685950698, |
|
"learning_rate": 9.936204146730463e-08, |
|
"loss": 0.6053, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.10765035165781542, |
|
"grad_norm": 2.174540606431127, |
|
"learning_rate": 9.920255183413077e-08, |
|
"loss": 0.5961, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.10908568967991962, |
|
"grad_norm": 2.177994561683314, |
|
"learning_rate": 9.904306220095693e-08, |
|
"loss": 0.5995, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.11052102770202382, |
|
"grad_norm": 2.1071224070808854, |
|
"learning_rate": 9.88835725677831e-08, |
|
"loss": 0.6316, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.11195636572412804, |
|
"grad_norm": 2.051925662287215, |
|
"learning_rate": 9.872408293460924e-08, |
|
"loss": 0.6034, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.11339170374623224, |
|
"grad_norm": 2.0760748827078883, |
|
"learning_rate": 9.85645933014354e-08, |
|
"loss": 0.5925, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.11482704176833644, |
|
"grad_norm": 2.0510637954672166, |
|
"learning_rate": 9.840510366826155e-08, |
|
"loss": 0.5906, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.11626237979044064, |
|
"grad_norm": 2.256307217104805, |
|
"learning_rate": 9.824561403508771e-08, |
|
"loss": 0.6197, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.11769771781254486, |
|
"grad_norm": 2.1508326768819, |
|
"learning_rate": 9.808612440191387e-08, |
|
"loss": 0.6059, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.11913305583464906, |
|
"grad_norm": 2.158467198244223, |
|
"learning_rate": 9.792663476874002e-08, |
|
"loss": 0.6287, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.12056839385675326, |
|
"grad_norm": 2.0519012340549607, |
|
"learning_rate": 9.776714513556618e-08, |
|
"loss": 0.5919, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.12200373187885748, |
|
"grad_norm": 2.2122386720795206, |
|
"learning_rate": 9.760765550239234e-08, |
|
"loss": 0.6047, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.12343906990096168, |
|
"grad_norm": 2.4377341243140855, |
|
"learning_rate": 9.74481658692185e-08, |
|
"loss": 0.6086, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.12487440792306588, |
|
"grad_norm": 2.2032974131908736, |
|
"learning_rate": 9.728867623604465e-08, |
|
"loss": 0.5956, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.12630974594517008, |
|
"grad_norm": 2.1439507923710908, |
|
"learning_rate": 9.71291866028708e-08, |
|
"loss": 0.6188, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.1277450839672743, |
|
"grad_norm": 2.124605788252112, |
|
"learning_rate": 9.696969696969697e-08, |
|
"loss": 0.595, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.12918042198937849, |
|
"grad_norm": 2.0145813941505626, |
|
"learning_rate": 9.681020733652312e-08, |
|
"loss": 0.604, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.1306157600114827, |
|
"grad_norm": 1.9750273830741611, |
|
"learning_rate": 9.665071770334928e-08, |
|
"loss": 0.602, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.13205109803358692, |
|
"grad_norm": 2.132152341731948, |
|
"learning_rate": 9.649122807017543e-08, |
|
"loss": 0.5919, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.1334864360556911, |
|
"grad_norm": 2.2780064420605104, |
|
"learning_rate": 9.633173843700159e-08, |
|
"loss": 0.6126, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.13492177407779532, |
|
"grad_norm": 2.2739660981275884, |
|
"learning_rate": 9.617224880382775e-08, |
|
"loss": 0.6029, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.13635711209989954, |
|
"grad_norm": 2.1742546599852637, |
|
"learning_rate": 9.60127591706539e-08, |
|
"loss": 0.5769, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.13779245012200372, |
|
"grad_norm": 2.42381996317855, |
|
"learning_rate": 9.585326953748006e-08, |
|
"loss": 0.5994, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.13922778814410794, |
|
"grad_norm": 2.227117458354492, |
|
"learning_rate": 9.569377990430622e-08, |
|
"loss": 0.594, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.14066312616621215, |
|
"grad_norm": 2.4643334214114456, |
|
"learning_rate": 9.553429027113237e-08, |
|
"loss": 0.6027, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.14209846418831634, |
|
"grad_norm": 2.1983131254869415, |
|
"learning_rate": 9.537480063795853e-08, |
|
"loss": 0.6055, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.14353380221042056, |
|
"grad_norm": 2.143709628361645, |
|
"learning_rate": 9.521531100478468e-08, |
|
"loss": 0.6028, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.14496914023252475, |
|
"grad_norm": 2.111089939054113, |
|
"learning_rate": 9.505582137161085e-08, |
|
"loss": 0.6142, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.14640447825462896, |
|
"grad_norm": 2.185072710633692, |
|
"learning_rate": 9.4896331738437e-08, |
|
"loss": 0.6036, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.14783981627673318, |
|
"grad_norm": 2.1749714984974027, |
|
"learning_rate": 9.473684210526315e-08, |
|
"loss": 0.6017, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.14927515429883736, |
|
"grad_norm": 2.0595469823971086, |
|
"learning_rate": 9.457735247208931e-08, |
|
"loss": 0.593, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.15071049232094158, |
|
"grad_norm": 2.166316240639028, |
|
"learning_rate": 9.441786283891547e-08, |
|
"loss": 0.602, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.1521458303430458, |
|
"grad_norm": 2.1081294919011713, |
|
"learning_rate": 9.425837320574163e-08, |
|
"loss": 0.5704, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.15358116836514998, |
|
"grad_norm": 2.086486832035454, |
|
"learning_rate": 9.409888357256778e-08, |
|
"loss": 0.5829, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.1550165063872542, |
|
"grad_norm": 1.9452488677658146, |
|
"learning_rate": 9.393939393939394e-08, |
|
"loss": 0.5819, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.15645184440935841, |
|
"grad_norm": 2.2482994216353247, |
|
"learning_rate": 9.37799043062201e-08, |
|
"loss": 0.595, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.1578871824314626, |
|
"grad_norm": 2.1541134403144366, |
|
"learning_rate": 9.362041467304625e-08, |
|
"loss": 0.5947, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.15932252045356682, |
|
"grad_norm": 2.106629528341244, |
|
"learning_rate": 9.34609250398724e-08, |
|
"loss": 0.5914, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.16075785847567103, |
|
"grad_norm": 2.1549978294974013, |
|
"learning_rate": 9.330143540669855e-08, |
|
"loss": 0.5972, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.16219319649777522, |
|
"grad_norm": 2.087927226201651, |
|
"learning_rate": 9.314194577352472e-08, |
|
"loss": 0.601, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.16362853451987944, |
|
"grad_norm": 2.2969758412493184, |
|
"learning_rate": 9.298245614035088e-08, |
|
"loss": 0.6121, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.16506387254198363, |
|
"grad_norm": 1.9822061570846972, |
|
"learning_rate": 9.282296650717702e-08, |
|
"loss": 0.5901, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.16649921056408784, |
|
"grad_norm": 2.096712972417583, |
|
"learning_rate": 9.266347687400318e-08, |
|
"loss": 0.5907, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.16793454858619206, |
|
"grad_norm": 2.2566429251485394, |
|
"learning_rate": 9.250398724082935e-08, |
|
"loss": 0.6063, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.16936988660829624, |
|
"grad_norm": 2.1805651747312447, |
|
"learning_rate": 9.23444976076555e-08, |
|
"loss": 0.5957, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.17080522463040046, |
|
"grad_norm": 2.1247600803710296, |
|
"learning_rate": 9.218500797448165e-08, |
|
"loss": 0.5885, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.17224056265250468, |
|
"grad_norm": 2.0815283359286876, |
|
"learning_rate": 9.20255183413078e-08, |
|
"loss": 0.599, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.17367590067460886, |
|
"grad_norm": 2.038372892981339, |
|
"learning_rate": 9.186602870813396e-08, |
|
"loss": 0.609, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.17511123869671308, |
|
"grad_norm": 2.263660725821227, |
|
"learning_rate": 9.170653907496012e-08, |
|
"loss": 0.5962, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.1765465767188173, |
|
"grad_norm": 1.9909834330631233, |
|
"learning_rate": 9.154704944178628e-08, |
|
"loss": 0.6044, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.17798191474092148, |
|
"grad_norm": 2.047085721846824, |
|
"learning_rate": 9.138755980861243e-08, |
|
"loss": 0.5861, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.1794172527630257, |
|
"grad_norm": 2.142108425915519, |
|
"learning_rate": 9.122807017543859e-08, |
|
"loss": 0.6001, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.18085259078512989, |
|
"grad_norm": 2.2269167910002428, |
|
"learning_rate": 9.106858054226475e-08, |
|
"loss": 0.5692, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.1822879288072341, |
|
"grad_norm": 2.163281563946126, |
|
"learning_rate": 9.09090909090909e-08, |
|
"loss": 0.5746, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.18372326682933832, |
|
"grad_norm": 2.2283513846745393, |
|
"learning_rate": 9.074960127591706e-08, |
|
"loss": 0.6013, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.1851586048514425, |
|
"grad_norm": 2.4916531419640378, |
|
"learning_rate": 9.059011164274322e-08, |
|
"loss": 0.584, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.18659394287354672, |
|
"grad_norm": 2.2163307550487583, |
|
"learning_rate": 9.043062200956937e-08, |
|
"loss": 0.5997, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.18802928089565094, |
|
"grad_norm": 2.1338109129705343, |
|
"learning_rate": 9.027113237639553e-08, |
|
"loss": 0.6148, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.18946461891775512, |
|
"grad_norm": 2.1154029002889962, |
|
"learning_rate": 9.011164274322168e-08, |
|
"loss": 0.5934, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.19089995693985934, |
|
"grad_norm": 2.1834360253997604, |
|
"learning_rate": 8.995215311004784e-08, |
|
"loss": 0.6034, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.19233529496196355, |
|
"grad_norm": 2.224312171540706, |
|
"learning_rate": 8.9792663476874e-08, |
|
"loss": 0.6001, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.19377063298406774, |
|
"grad_norm": 2.149180259343769, |
|
"learning_rate": 8.963317384370016e-08, |
|
"loss": 0.5711, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.19520597100617196, |
|
"grad_norm": 2.2487709986154703, |
|
"learning_rate": 8.947368421052631e-08, |
|
"loss": 0.6106, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.19664130902827615, |
|
"grad_norm": 2.1736142585918863, |
|
"learning_rate": 8.931419457735247e-08, |
|
"loss": 0.5965, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.19807664705038036, |
|
"grad_norm": 2.2565465434395997, |
|
"learning_rate": 8.915470494417863e-08, |
|
"loss": 0.5878, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.19951198507248458, |
|
"grad_norm": 2.117117032717574, |
|
"learning_rate": 8.899521531100478e-08, |
|
"loss": 0.5795, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.2003731878857471, |
|
"eval_loss": 0.6242366433143616, |
|
"eval_runtime": 276.6162, |
|
"eval_samples_per_second": 138.188, |
|
"eval_steps_per_second": 2.162, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.20094732309458876, |
|
"grad_norm": 2.282588992438042, |
|
"learning_rate": 8.883572567783094e-08, |
|
"loss": 0.6082, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.20238266111669298, |
|
"grad_norm": 2.3530914921224504, |
|
"learning_rate": 8.867623604465709e-08, |
|
"loss": 0.5971, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.2038179991387972, |
|
"grad_norm": 2.2866310443011173, |
|
"learning_rate": 8.851674641148325e-08, |
|
"loss": 0.6002, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.20525333716090138, |
|
"grad_norm": 2.301531540868706, |
|
"learning_rate": 8.835725677830941e-08, |
|
"loss": 0.5925, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.2066886751830056, |
|
"grad_norm": 2.1919572172369985, |
|
"learning_rate": 8.819776714513556e-08, |
|
"loss": 0.6174, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.20812401320510981, |
|
"grad_norm": 2.2967058044542794, |
|
"learning_rate": 8.803827751196172e-08, |
|
"loss": 0.5926, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.209559351227214, |
|
"grad_norm": 2.1307037968639717, |
|
"learning_rate": 8.787878787878788e-08, |
|
"loss": 0.6104, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.21099468924931822, |
|
"grad_norm": 2.209964266498109, |
|
"learning_rate": 8.771929824561403e-08, |
|
"loss": 0.5943, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.21243002727142243, |
|
"grad_norm": 2.419009311462197, |
|
"learning_rate": 8.755980861244019e-08, |
|
"loss": 0.5874, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.21386536529352662, |
|
"grad_norm": 2.389943282672986, |
|
"learning_rate": 8.740031897926634e-08, |
|
"loss": 0.586, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.21530070331563084, |
|
"grad_norm": 2.084899396749322, |
|
"learning_rate": 8.724082934609251e-08, |
|
"loss": 0.5955, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.21673604133773502, |
|
"grad_norm": 2.2165347770637687, |
|
"learning_rate": 8.708133971291866e-08, |
|
"loss": 0.5928, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.21817137935983924, |
|
"grad_norm": 2.1541567696115766, |
|
"learning_rate": 8.69218500797448e-08, |
|
"loss": 0.5802, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.21960671738194346, |
|
"grad_norm": 2.078810823982944, |
|
"learning_rate": 8.676236044657097e-08, |
|
"loss": 0.5851, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.22104205540404764, |
|
"grad_norm": 2.205438995615468, |
|
"learning_rate": 8.660287081339713e-08, |
|
"loss": 0.5822, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.22247739342615186, |
|
"grad_norm": 2.3681917067545335, |
|
"learning_rate": 8.644338118022329e-08, |
|
"loss": 0.5797, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.22391273144825607, |
|
"grad_norm": 2.1787226733938527, |
|
"learning_rate": 8.628389154704943e-08, |
|
"loss": 0.5952, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.22534806947036026, |
|
"grad_norm": 2.1341166867892705, |
|
"learning_rate": 8.61244019138756e-08, |
|
"loss": 0.5793, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.22678340749246448, |
|
"grad_norm": 2.226823901565418, |
|
"learning_rate": 8.596491228070176e-08, |
|
"loss": 0.5998, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.2282187455145687, |
|
"grad_norm": 2.154565895126637, |
|
"learning_rate": 8.58054226475279e-08, |
|
"loss": 0.6018, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.22965408353667288, |
|
"grad_norm": 2.1902644679628427, |
|
"learning_rate": 8.564593301435407e-08, |
|
"loss": 0.5846, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.2310894215587771, |
|
"grad_norm": 2.132807778812804, |
|
"learning_rate": 8.548644338118021e-08, |
|
"loss": 0.5973, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.23252475958088129, |
|
"grad_norm": 2.1541803653195237, |
|
"learning_rate": 8.532695374800639e-08, |
|
"loss": 0.6016, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.2339600976029855, |
|
"grad_norm": 2.6732037615470223, |
|
"learning_rate": 8.516746411483253e-08, |
|
"loss": 0.5996, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.23539543562508972, |
|
"grad_norm": 2.13269819035701, |
|
"learning_rate": 8.500797448165868e-08, |
|
"loss": 0.5944, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.2368307736471939, |
|
"grad_norm": 2.0948220019531134, |
|
"learning_rate": 8.484848484848484e-08, |
|
"loss": 0.5801, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.23826611166929812, |
|
"grad_norm": 2.141745313126493, |
|
"learning_rate": 8.4688995215311e-08, |
|
"loss": 0.5912, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.23970144969140234, |
|
"grad_norm": 2.185598157293274, |
|
"learning_rate": 8.452950558213716e-08, |
|
"loss": 0.561, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.24113678771350652, |
|
"grad_norm": 2.113152259023194, |
|
"learning_rate": 8.437001594896331e-08, |
|
"loss": 0.5767, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.24257212573561074, |
|
"grad_norm": 2.1653119179818403, |
|
"learning_rate": 8.421052631578946e-08, |
|
"loss": 0.5994, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.24400746375771495, |
|
"grad_norm": 2.232117286703433, |
|
"learning_rate": 8.405103668261563e-08, |
|
"loss": 0.5965, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.24544280177981914, |
|
"grad_norm": 2.1224679961554624, |
|
"learning_rate": 8.389154704944178e-08, |
|
"loss": 0.6024, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.24687813980192336, |
|
"grad_norm": 2.23125032456125, |
|
"learning_rate": 8.373205741626794e-08, |
|
"loss": 0.6039, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.24831347782402755, |
|
"grad_norm": 2.087520038725682, |
|
"learning_rate": 8.357256778309409e-08, |
|
"loss": 0.5846, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.24974881584613176, |
|
"grad_norm": 2.3368921148872337, |
|
"learning_rate": 8.341307814992025e-08, |
|
"loss": 0.5902, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.25118415386823595, |
|
"grad_norm": 2.1530686965187185, |
|
"learning_rate": 8.325358851674641e-08, |
|
"loss": 0.5749, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.25261949189034016, |
|
"grad_norm": 2.1127613396706186, |
|
"learning_rate": 8.309409888357256e-08, |
|
"loss": 0.583, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.2540548299124444, |
|
"grad_norm": 2.3444742975266393, |
|
"learning_rate": 8.293460925039872e-08, |
|
"loss": 0.5896, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.2554901679345486, |
|
"grad_norm": 2.2978259898614044, |
|
"learning_rate": 8.277511961722488e-08, |
|
"loss": 0.5949, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.2569255059566528, |
|
"grad_norm": 2.433573563011829, |
|
"learning_rate": 8.261562998405104e-08, |
|
"loss": 0.5727, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.25836084397875697, |
|
"grad_norm": 2.3201307959383177, |
|
"learning_rate": 8.245614035087719e-08, |
|
"loss": 0.6097, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2597961820008612, |
|
"grad_norm": 2.2013387745102238, |
|
"learning_rate": 8.229665071770334e-08, |
|
"loss": 0.5788, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.2612315200229654, |
|
"grad_norm": 2.183018300558425, |
|
"learning_rate": 8.21371610845295e-08, |
|
"loss": 0.5812, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.2626668580450696, |
|
"grad_norm": 2.3072922091990025, |
|
"learning_rate": 8.197767145135566e-08, |
|
"loss": 0.5943, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.26410219606717383, |
|
"grad_norm": 2.3037915303690353, |
|
"learning_rate": 8.181818181818182e-08, |
|
"loss": 0.5748, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.26553753408927805, |
|
"grad_norm": 2.2529199926874672, |
|
"learning_rate": 8.165869218500797e-08, |
|
"loss": 0.5817, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.2669728721113822, |
|
"grad_norm": 2.0936454029080296, |
|
"learning_rate": 8.149920255183413e-08, |
|
"loss": 0.5886, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.2684082101334864, |
|
"grad_norm": 2.204603428252123, |
|
"learning_rate": 8.133971291866029e-08, |
|
"loss": 0.58, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.26984354815559064, |
|
"grad_norm": 2.3046758578427844, |
|
"learning_rate": 8.118022328548644e-08, |
|
"loss": 0.5639, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.27127888617769486, |
|
"grad_norm": 2.039030706090379, |
|
"learning_rate": 8.10207336523126e-08, |
|
"loss": 0.5747, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.27271422419979907, |
|
"grad_norm": 2.4822001236050184, |
|
"learning_rate": 8.086124401913875e-08, |
|
"loss": 0.5814, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.27414956222190323, |
|
"grad_norm": 2.072097265053284, |
|
"learning_rate": 8.070175438596491e-08, |
|
"loss": 0.6058, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.27558490024400745, |
|
"grad_norm": 2.213534670295135, |
|
"learning_rate": 8.054226475279107e-08, |
|
"loss": 0.5733, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.27702023826611166, |
|
"grad_norm": 2.0538875642379124, |
|
"learning_rate": 8.038277511961722e-08, |
|
"loss": 0.58, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.2784555762882159, |
|
"grad_norm": 2.656256926455447, |
|
"learning_rate": 8.022328548644338e-08, |
|
"loss": 0.5827, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.2798909143103201, |
|
"grad_norm": 2.2918877338477106, |
|
"learning_rate": 8.006379585326954e-08, |
|
"loss": 0.5906, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.2813262523324243, |
|
"grad_norm": 2.1437496217506307, |
|
"learning_rate": 7.990430622009568e-08, |
|
"loss": 0.5994, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.28276159035452847, |
|
"grad_norm": 2.108637907788069, |
|
"learning_rate": 7.974481658692185e-08, |
|
"loss": 0.5824, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.2841969283766327, |
|
"grad_norm": 2.2635800105704416, |
|
"learning_rate": 7.9585326953748e-08, |
|
"loss": 0.5899, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.2856322663987369, |
|
"grad_norm": 2.5457571482007864, |
|
"learning_rate": 7.942583732057417e-08, |
|
"loss": 0.5737, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.2870676044208411, |
|
"grad_norm": 2.194123282495016, |
|
"learning_rate": 7.926634768740032e-08, |
|
"loss": 0.5867, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.28850294244294533, |
|
"grad_norm": 2.1371261720660537, |
|
"learning_rate": 7.910685805422646e-08, |
|
"loss": 0.6, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.2899382804650495, |
|
"grad_norm": 2.2866199715661497, |
|
"learning_rate": 7.894736842105262e-08, |
|
"loss": 0.5897, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.2913736184871537, |
|
"grad_norm": 2.3979447418720765, |
|
"learning_rate": 7.878787878787878e-08, |
|
"loss": 0.5673, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.2928089565092579, |
|
"grad_norm": 2.2911555208949426, |
|
"learning_rate": 7.862838915470495e-08, |
|
"loss": 0.5967, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.29424429453136214, |
|
"grad_norm": 2.16579462949441, |
|
"learning_rate": 7.846889952153109e-08, |
|
"loss": 0.5821, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.29567963255346635, |
|
"grad_norm": 2.044335696136996, |
|
"learning_rate": 7.830940988835725e-08, |
|
"loss": 0.5696, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.29711497057557057, |
|
"grad_norm": 2.431018134661352, |
|
"learning_rate": 7.814992025518342e-08, |
|
"loss": 0.5837, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.29855030859767473, |
|
"grad_norm": 2.210821229899501, |
|
"learning_rate": 7.799043062200956e-08, |
|
"loss": 0.6103, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.29998564661977895, |
|
"grad_norm": 2.104446504756259, |
|
"learning_rate": 7.783094098883572e-08, |
|
"loss": 0.5858, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.30055978182862064, |
|
"eval_loss": 0.6225999593734741, |
|
"eval_runtime": 276.7691, |
|
"eval_samples_per_second": 138.112, |
|
"eval_steps_per_second": 2.161, |
|
"step": 1047 |
|
}, |
|
{ |
|
"epoch": 0.30142098464188316, |
|
"grad_norm": 2.3677638058778063, |
|
"learning_rate": 7.767145135566187e-08, |
|
"loss": 0.5937, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.3028563226639874, |
|
"grad_norm": 2.206731325058568, |
|
"learning_rate": 7.751196172248805e-08, |
|
"loss": 0.5859, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.3042916606860916, |
|
"grad_norm": 2.1480500318169504, |
|
"learning_rate": 7.735247208931419e-08, |
|
"loss": 0.5868, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.3057269987081958, |
|
"grad_norm": 2.5582545964146983, |
|
"learning_rate": 7.719298245614034e-08, |
|
"loss": 0.5742, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.30716233673029997, |
|
"grad_norm": 2.193868662465662, |
|
"learning_rate": 7.70334928229665e-08, |
|
"loss": 0.5953, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.3085976747524042, |
|
"grad_norm": 2.150662198093994, |
|
"learning_rate": 7.687400318979266e-08, |
|
"loss": 0.5893, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.3100330127745084, |
|
"grad_norm": 2.248241358693427, |
|
"learning_rate": 7.671451355661882e-08, |
|
"loss": 0.5768, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.3114683507966126, |
|
"grad_norm": 2.043693547149444, |
|
"learning_rate": 7.655502392344497e-08, |
|
"loss": 0.5645, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.31290368881871683, |
|
"grad_norm": 2.123014406455518, |
|
"learning_rate": 7.639553429027112e-08, |
|
"loss": 0.5886, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.314339026840821, |
|
"grad_norm": 2.1690683360268803, |
|
"learning_rate": 7.623604465709729e-08, |
|
"loss": 0.5855, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.3157743648629252, |
|
"grad_norm": 2.2483407272993348, |
|
"learning_rate": 7.607655502392344e-08, |
|
"loss": 0.5734, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.3172097028850294, |
|
"grad_norm": 2.2532867903751037, |
|
"learning_rate": 7.59170653907496e-08, |
|
"loss": 0.5905, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.31864504090713364, |
|
"grad_norm": 2.1607546363917143, |
|
"learning_rate": 7.575757575757575e-08, |
|
"loss": 0.5715, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.32008037892923785, |
|
"grad_norm": 2.4129775854563564, |
|
"learning_rate": 7.559808612440191e-08, |
|
"loss": 0.5882, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.32151571695134207, |
|
"grad_norm": 2.2020460031905547, |
|
"learning_rate": 7.543859649122807e-08, |
|
"loss": 0.5755, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.3229510549734462, |
|
"grad_norm": 2.2513751622058233, |
|
"learning_rate": 7.527910685805422e-08, |
|
"loss": 0.573, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.32438639299555044, |
|
"grad_norm": 2.05291200128756, |
|
"learning_rate": 7.511961722488038e-08, |
|
"loss": 0.5725, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.32582173101765466, |
|
"grad_norm": 2.167144338148049, |
|
"learning_rate": 7.496012759170654e-08, |
|
"loss": 0.5952, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.3272570690397589, |
|
"grad_norm": 2.342558025297928, |
|
"learning_rate": 7.48006379585327e-08, |
|
"loss": 0.6007, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.3286924070618631, |
|
"grad_norm": 2.2082285106467037, |
|
"learning_rate": 7.464114832535885e-08, |
|
"loss": 0.5988, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.33012774508396725, |
|
"grad_norm": 2.4965805150161424, |
|
"learning_rate": 7.4481658692185e-08, |
|
"loss": 0.5976, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.33156308310607147, |
|
"grad_norm": 2.199510861589018, |
|
"learning_rate": 7.432216905901117e-08, |
|
"loss": 0.5933, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.3329984211281757, |
|
"grad_norm": 2.213256566440661, |
|
"learning_rate": 7.416267942583732e-08, |
|
"loss": 0.5925, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.3344337591502799, |
|
"grad_norm": 2.267147072610886, |
|
"learning_rate": 7.400318979266348e-08, |
|
"loss": 0.5952, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.3358690971723841, |
|
"grad_norm": 2.384880893465299, |
|
"learning_rate": 7.384370015948963e-08, |
|
"loss": 0.5854, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.33730443519448833, |
|
"grad_norm": 2.3975013590320957, |
|
"learning_rate": 7.368421052631579e-08, |
|
"loss": 0.5896, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.3387397732165925, |
|
"grad_norm": 2.196343857149047, |
|
"learning_rate": 7.352472089314195e-08, |
|
"loss": 0.5949, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.3401751112386967, |
|
"grad_norm": 2.2389824981496416, |
|
"learning_rate": 7.33652312599681e-08, |
|
"loss": 0.5815, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.3416104492608009, |
|
"grad_norm": 2.3137486247453163, |
|
"learning_rate": 7.320574162679426e-08, |
|
"loss": 0.5875, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.34304578728290513, |
|
"grad_norm": 2.4693537666125085, |
|
"learning_rate": 7.304625199362042e-08, |
|
"loss": 0.5959, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.34448112530500935, |
|
"grad_norm": 2.1602741385717725, |
|
"learning_rate": 7.288676236044657e-08, |
|
"loss": 0.5945, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3459164633271135, |
|
"grad_norm": 2.210588507776581, |
|
"learning_rate": 7.272727272727273e-08, |
|
"loss": 0.5743, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.3473518013492177, |
|
"grad_norm": 2.097615546869791, |
|
"learning_rate": 7.256778309409887e-08, |
|
"loss": 0.5937, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.34878713937132194, |
|
"grad_norm": 2.317699028012793, |
|
"learning_rate": 7.240829346092503e-08, |
|
"loss": 0.5849, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.35022247739342616, |
|
"grad_norm": 2.3091147413758724, |
|
"learning_rate": 7.22488038277512e-08, |
|
"loss": 0.5856, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.3516578154155304, |
|
"grad_norm": 2.2476415769641167, |
|
"learning_rate": 7.208931419457734e-08, |
|
"loss": 0.5574, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.3530931534376346, |
|
"grad_norm": 2.4762397998859704, |
|
"learning_rate": 7.19298245614035e-08, |
|
"loss": 0.5851, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.35452849145973875, |
|
"grad_norm": 2.437984095784635, |
|
"learning_rate": 7.177033492822967e-08, |
|
"loss": 0.5707, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.35596382948184296, |
|
"grad_norm": 2.451803793047694, |
|
"learning_rate": 7.161084529505583e-08, |
|
"loss": 0.609, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.3573991675039472, |
|
"grad_norm": 2.1857999101978476, |
|
"learning_rate": 7.145135566188197e-08, |
|
"loss": 0.5943, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.3588345055260514, |
|
"grad_norm": 2.4116162170012, |
|
"learning_rate": 7.129186602870812e-08, |
|
"loss": 0.5869, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.3602698435481556, |
|
"grad_norm": 2.545797080063671, |
|
"learning_rate": 7.113237639553428e-08, |
|
"loss": 0.5966, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.36170518157025977, |
|
"grad_norm": 2.117352288405697, |
|
"learning_rate": 7.097288676236044e-08, |
|
"loss": 0.5887, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.363140519592364, |
|
"grad_norm": 2.275523882502902, |
|
"learning_rate": 7.08133971291866e-08, |
|
"loss": 0.5975, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.3645758576144682, |
|
"grad_norm": 2.3571318848422798, |
|
"learning_rate": 7.065390749601275e-08, |
|
"loss": 0.5645, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.3660111956365724, |
|
"grad_norm": 2.312480194276115, |
|
"learning_rate": 7.049441786283891e-08, |
|
"loss": 0.5999, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.36744653365867663, |
|
"grad_norm": 2.367840377627457, |
|
"learning_rate": 7.033492822966507e-08, |
|
"loss": 0.5824, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.36888187168078085, |
|
"grad_norm": 2.160147447958279, |
|
"learning_rate": 7.017543859649122e-08, |
|
"loss": 0.5837, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.370317209702885, |
|
"grad_norm": 2.0709776247715954, |
|
"learning_rate": 7.001594896331738e-08, |
|
"loss": 0.5699, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.3717525477249892, |
|
"grad_norm": 2.3103426448215814, |
|
"learning_rate": 6.985645933014353e-08, |
|
"loss": 0.5608, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.37318788574709344, |
|
"grad_norm": 2.4092008720321116, |
|
"learning_rate": 6.96969696969697e-08, |
|
"loss": 0.5814, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.37462322376919766, |
|
"grad_norm": 2.310371459366773, |
|
"learning_rate": 6.953748006379585e-08, |
|
"loss": 0.5894, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.37605856179130187, |
|
"grad_norm": 2.253935903638457, |
|
"learning_rate": 6.9377990430622e-08, |
|
"loss": 0.5728, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.37749389981340603, |
|
"grad_norm": 2.1948403595861765, |
|
"learning_rate": 6.921850079744816e-08, |
|
"loss": 0.5907, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.37892923783551025, |
|
"grad_norm": 2.5444511215863352, |
|
"learning_rate": 6.905901116427432e-08, |
|
"loss": 0.5916, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.38036457585761446, |
|
"grad_norm": 2.4157863054442363, |
|
"learning_rate": 6.889952153110048e-08, |
|
"loss": 0.5966, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.3817999138797187, |
|
"grad_norm": 2.358893882397905, |
|
"learning_rate": 6.874003189792663e-08, |
|
"loss": 0.584, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.3832352519018229, |
|
"grad_norm": 2.4540399070214503, |
|
"learning_rate": 6.858054226475278e-08, |
|
"loss": 0.5682, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.3846705899239271, |
|
"grad_norm": 2.4924888134144942, |
|
"learning_rate": 6.842105263157895e-08, |
|
"loss": 0.585, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.38610592794603127, |
|
"grad_norm": 2.4341634135514805, |
|
"learning_rate": 6.82615629984051e-08, |
|
"loss": 0.5794, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.3875412659681355, |
|
"grad_norm": 2.2137130985517888, |
|
"learning_rate": 6.810207336523126e-08, |
|
"loss": 0.5843, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.3889766039902397, |
|
"grad_norm": 2.259943764213412, |
|
"learning_rate": 6.794258373205741e-08, |
|
"loss": 0.5751, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.3904119420123439, |
|
"grad_norm": 2.313121297434376, |
|
"learning_rate": 6.778309409888357e-08, |
|
"loss": 0.5648, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.39184728003444813, |
|
"grad_norm": 2.1195979632978053, |
|
"learning_rate": 6.762360446570973e-08, |
|
"loss": 0.5757, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.3932826180565523, |
|
"grad_norm": 2.34003269760013, |
|
"learning_rate": 6.746411483253588e-08, |
|
"loss": 0.5839, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.3947179560786565, |
|
"grad_norm": 2.2650000981436014, |
|
"learning_rate": 6.730462519936204e-08, |
|
"loss": 0.574, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.3961532941007607, |
|
"grad_norm": 2.265678124826934, |
|
"learning_rate": 6.71451355661882e-08, |
|
"loss": 0.5925, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.39758863212286494, |
|
"grad_norm": 2.4721732776855694, |
|
"learning_rate": 6.698564593301436e-08, |
|
"loss": 0.5749, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.39902397014496915, |
|
"grad_norm": 2.0341699451050226, |
|
"learning_rate": 6.682615629984051e-08, |
|
"loss": 0.568, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.40045930816707337, |
|
"grad_norm": 2.258266710480253, |
|
"learning_rate": 6.666666666666665e-08, |
|
"loss": 0.579, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.4007463757714942, |
|
"eval_loss": 0.6223743557929993, |
|
"eval_runtime": 276.9327, |
|
"eval_samples_per_second": 138.03, |
|
"eval_steps_per_second": 2.159, |
|
"step": 1396 |
|
}, |
|
{ |
|
"epoch": 0.40189464618917753, |
|
"grad_norm": 2.3590634880789527, |
|
"learning_rate": 6.650717703349283e-08, |
|
"loss": 0.5979, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.40332998421128174, |
|
"grad_norm": 2.2482983012145112, |
|
"learning_rate": 6.634768740031898e-08, |
|
"loss": 0.5725, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.40476532223338596, |
|
"grad_norm": 2.099031799334457, |
|
"learning_rate": 6.618819776714514e-08, |
|
"loss": 0.5702, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.4062006602554902, |
|
"grad_norm": 2.2837745097672766, |
|
"learning_rate": 6.602870813397129e-08, |
|
"loss": 0.5853, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.4076359982775944, |
|
"grad_norm": 2.1686626635780204, |
|
"learning_rate": 6.586921850079745e-08, |
|
"loss": 0.5925, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.40907133629969855, |
|
"grad_norm": 2.3045919749457426, |
|
"learning_rate": 6.570972886762361e-08, |
|
"loss": 0.5678, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.41050667432180277, |
|
"grad_norm": 2.506461569864655, |
|
"learning_rate": 6.555023923444975e-08, |
|
"loss": 0.5734, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.411942012343907, |
|
"grad_norm": 2.216335116055202, |
|
"learning_rate": 6.539074960127592e-08, |
|
"loss": 0.5588, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.4133773503660112, |
|
"grad_norm": 2.2460456566846614, |
|
"learning_rate": 6.523125996810208e-08, |
|
"loss": 0.5805, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.4148126883881154, |
|
"grad_norm": 2.246523877288839, |
|
"learning_rate": 6.507177033492822e-08, |
|
"loss": 0.5595, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.41624802641021963, |
|
"grad_norm": 2.3369923163629736, |
|
"learning_rate": 6.491228070175438e-08, |
|
"loss": 0.583, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.4176833644323238, |
|
"grad_norm": 2.4647497141369445, |
|
"learning_rate": 6.475279106858053e-08, |
|
"loss": 0.5709, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.419118702454428, |
|
"grad_norm": 2.169633790885161, |
|
"learning_rate": 6.45933014354067e-08, |
|
"loss": 0.5698, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.4205540404765322, |
|
"grad_norm": 2.3384413570606712, |
|
"learning_rate": 6.443381180223285e-08, |
|
"loss": 0.5896, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.42198937849863644, |
|
"grad_norm": 2.265985280435675, |
|
"learning_rate": 6.4274322169059e-08, |
|
"loss": 0.5824, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.42342471652074065, |
|
"grad_norm": 2.3198946197532524, |
|
"learning_rate": 6.411483253588516e-08, |
|
"loss": 0.5846, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.42486005454284487, |
|
"grad_norm": 2.1622460803134267, |
|
"learning_rate": 6.395534290271132e-08, |
|
"loss": 0.5901, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.426295392564949, |
|
"grad_norm": 2.1003168882776335, |
|
"learning_rate": 6.379585326953748e-08, |
|
"loss": 0.5648, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.42773073058705324, |
|
"grad_norm": 2.1502708485133595, |
|
"learning_rate": 6.363636363636363e-08, |
|
"loss": 0.5789, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.42916606860915746, |
|
"grad_norm": 2.2104881202400306, |
|
"learning_rate": 6.347687400318978e-08, |
|
"loss": 0.5653, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.4306014066312617, |
|
"grad_norm": 2.3011371767087176, |
|
"learning_rate": 6.331738437001594e-08, |
|
"loss": 0.5766, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.4320367446533659, |
|
"grad_norm": 3.260166860791866, |
|
"learning_rate": 6.31578947368421e-08, |
|
"loss": 0.5731, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.43347208267547005, |
|
"grad_norm": 2.394536391337356, |
|
"learning_rate": 6.299840510366826e-08, |
|
"loss": 0.5752, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.43490742069757427, |
|
"grad_norm": 2.283308722940367, |
|
"learning_rate": 6.283891547049441e-08, |
|
"loss": 0.5743, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.4363427587196785, |
|
"grad_norm": 2.3203679079324635, |
|
"learning_rate": 6.267942583732057e-08, |
|
"loss": 0.5753, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.4377780967417827, |
|
"grad_norm": 2.3222606808309667, |
|
"learning_rate": 6.251993620414673e-08, |
|
"loss": 0.5926, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.4392134347638869, |
|
"grad_norm": 2.207029727095187, |
|
"learning_rate": 6.236044657097288e-08, |
|
"loss": 0.5636, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.4406487727859911, |
|
"grad_norm": 2.1568606265391055, |
|
"learning_rate": 6.220095693779904e-08, |
|
"loss": 0.569, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.4420841108080953, |
|
"grad_norm": 2.317372995242904, |
|
"learning_rate": 6.204146730462519e-08, |
|
"loss": 0.5783, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.4435194488301995, |
|
"grad_norm": 2.162686944734311, |
|
"learning_rate": 6.188197767145136e-08, |
|
"loss": 0.5799, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.4449547868523037, |
|
"grad_norm": 2.4388032101358528, |
|
"learning_rate": 6.172248803827751e-08, |
|
"loss": 0.5839, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.44639012487440793, |
|
"grad_norm": 2.2305867419581293, |
|
"learning_rate": 6.156299840510366e-08, |
|
"loss": 0.5735, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.44782546289651215, |
|
"grad_norm": 2.134988514368576, |
|
"learning_rate": 6.140350877192982e-08, |
|
"loss": 0.5632, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.4492608009186163, |
|
"grad_norm": 2.3800787456946115, |
|
"learning_rate": 6.124401913875598e-08, |
|
"loss": 0.5796, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.4506961389407205, |
|
"grad_norm": 2.1810860707910735, |
|
"learning_rate": 6.108452950558214e-08, |
|
"loss": 0.5788, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.45213147696282474, |
|
"grad_norm": 2.308640019656118, |
|
"learning_rate": 6.092503987240829e-08, |
|
"loss": 0.5839, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.45356681498492896, |
|
"grad_norm": 2.297594960230257, |
|
"learning_rate": 6.076555023923444e-08, |
|
"loss": 0.605, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.45500215300703317, |
|
"grad_norm": 2.1364449207680396, |
|
"learning_rate": 6.060606060606061e-08, |
|
"loss": 0.5867, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.4564374910291374, |
|
"grad_norm": 2.1756934857023666, |
|
"learning_rate": 6.044657097288676e-08, |
|
"loss": 0.5722, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.45787282905124155, |
|
"grad_norm": 2.18990044643342, |
|
"learning_rate": 6.028708133971292e-08, |
|
"loss": 0.5754, |
|
"step": 1595 |
|
}, |
|
{ |
      "epoch": 0.45930816707334576,
      "grad_norm": 2.4364139727048415,
      "learning_rate": 6.012759170653907e-08,
      "loss": 0.5856,
      "step": 1600
    },
    {
      "epoch": 0.46074350509545,
      "grad_norm": 2.2360356658373446,
      "learning_rate": 5.996810207336523e-08,
      "loss": 0.5612,
      "step": 1605
    },
    {
      "epoch": 0.4621788431175542,
      "grad_norm": 2.190809489755964,
      "learning_rate": 5.980861244019139e-08,
      "loss": 0.5717,
      "step": 1610
    },
    {
      "epoch": 0.4636141811396584,
      "grad_norm": 2.359827734351754,
      "learning_rate": 5.964912280701754e-08,
      "loss": 0.5733,
      "step": 1615
    },
    {
      "epoch": 0.46504951916176257,
      "grad_norm": 2.21035489055241,
      "learning_rate": 5.94896331738437e-08,
      "loss": 0.5758,
      "step": 1620
    },
    {
      "epoch": 0.4664848571838668,
      "grad_norm": 2.4239500111786776,
      "learning_rate": 5.933014354066985e-08,
      "loss": 0.5709,
      "step": 1625
    },
    {
      "epoch": 0.467920195205971,
      "grad_norm": 2.3531687327749444,
      "learning_rate": 5.917065390749602e-08,
      "loss": 0.5769,
      "step": 1630
    },
    {
      "epoch": 0.4693555332280752,
      "grad_norm": 2.2487850185411267,
      "learning_rate": 5.9011164274322166e-08,
      "loss": 0.5682,
      "step": 1635
    },
    {
      "epoch": 0.47079087125017943,
      "grad_norm": 2.309323517203607,
      "learning_rate": 5.885167464114832e-08,
      "loss": 0.5848,
      "step": 1640
    },
    {
      "epoch": 0.47222620927228365,
      "grad_norm": 2.214237754922774,
      "learning_rate": 5.869218500797448e-08,
      "loss": 0.572,
      "step": 1645
    },
    {
      "epoch": 0.4736615472943878,
      "grad_norm": 2.3622619155735305,
      "learning_rate": 5.8532695374800635e-08,
      "loss": 0.5569,
      "step": 1650
    },
    {
      "epoch": 0.475096885316492,
      "grad_norm": 2.5867769507291904,
      "learning_rate": 5.8373205741626796e-08,
      "loss": 0.5837,
      "step": 1655
    },
    {
      "epoch": 0.47653222333859624,
      "grad_norm": 2.529755318765141,
      "learning_rate": 5.821371610845295e-08,
      "loss": 0.5811,
      "step": 1660
    },
    {
      "epoch": 0.47796756136070045,
      "grad_norm": 2.266910537139947,
      "learning_rate": 5.80542264752791e-08,
      "loss": 0.578,
      "step": 1665
    },
    {
      "epoch": 0.47940289938280467,
      "grad_norm": 2.279806038320283,
      "learning_rate": 5.7894736842105265e-08,
      "loss": 0.5526,
      "step": 1670
    },
    {
      "epoch": 0.48083823740490883,
      "grad_norm": 2.12918604571782,
      "learning_rate": 5.773524720893141e-08,
      "loss": 0.5794,
      "step": 1675
    },
    {
      "epoch": 0.48227357542701305,
      "grad_norm": 2.2284026717270806,
      "learning_rate": 5.757575757575758e-08,
      "loss": 0.5704,
      "step": 1680
    },
    {
      "epoch": 0.48370891344911726,
      "grad_norm": 2.4032593629075145,
      "learning_rate": 5.741626794258373e-08,
      "loss": 0.5459,
      "step": 1685
    },
    {
      "epoch": 0.4851442514712215,
      "grad_norm": 2.203189127996456,
      "learning_rate": 5.725677830940988e-08,
      "loss": 0.5997,
      "step": 1690
    },
    {
      "epoch": 0.4865795894933257,
      "grad_norm": 2.4266862396460627,
      "learning_rate": 5.7097288676236043e-08,
      "loss": 0.5728,
      "step": 1695
    },
    {
      "epoch": 0.4880149275154299,
      "grad_norm": 2.1211185101213785,
      "learning_rate": 5.69377990430622e-08,
      "loss": 0.5727,
      "step": 1700
    },
    {
      "epoch": 0.48945026553753407,
      "grad_norm": 2.3364430793877244,
      "learning_rate": 5.677830940988836e-08,
      "loss": 0.58,
      "step": 1705
    },
    {
      "epoch": 0.4908856035596383,
      "grad_norm": 2.206551955612983,
      "learning_rate": 5.661881977671451e-08,
      "loss": 0.5696,
      "step": 1710
    },
    {
      "epoch": 0.4923209415817425,
      "grad_norm": 2.2652638647430687,
      "learning_rate": 5.645933014354066e-08,
      "loss": 0.5679,
      "step": 1715
    },
    {
      "epoch": 0.4937562796038467,
      "grad_norm": 2.596656624165877,
      "learning_rate": 5.629984051036683e-08,
      "loss": 0.5767,
      "step": 1720
    },
    {
      "epoch": 0.49519161762595093,
      "grad_norm": 2.3164896628154286,
      "learning_rate": 5.6140350877192976e-08,
      "loss": 0.5628,
      "step": 1725
    },
    {
      "epoch": 0.4966269556480551,
      "grad_norm": 2.602375169914542,
      "learning_rate": 5.5980861244019137e-08,
      "loss": 0.5769,
      "step": 1730
    },
    {
      "epoch": 0.4980622936701593,
      "grad_norm": 2.5046523107168928,
      "learning_rate": 5.582137161084529e-08,
      "loss": 0.5654,
      "step": 1735
    },
    {
      "epoch": 0.4994976316922635,
      "grad_norm": 2.2378268554432013,
      "learning_rate": 5.566188197767145e-08,
      "loss": 0.5619,
      "step": 1740
    },
    {
      "epoch": 0.5009329697143677,
      "grad_norm": 2.3540069510141444,
      "learning_rate": 5.5502392344497606e-08,
      "loss": 0.565,
      "step": 1745
    },
    {
      "epoch": 0.5009329697143677,
      "eval_loss": 0.6235304474830627,
      "eval_runtime": 276.9897,
      "eval_samples_per_second": 138.002,
      "eval_steps_per_second": 2.159,
      "step": 1745
    },
    {
      "epoch": 0.5023683077364719,
      "grad_norm": 2.322877723800299,
      "learning_rate": 5.534290271132376e-08,
      "loss": 0.57,
      "step": 1750
    },
    {
      "epoch": 0.5038036457585762,
      "grad_norm": 2.2404868803436684,
      "learning_rate": 5.518341307814992e-08,
      "loss": 0.5619,
      "step": 1755
    },
    {
      "epoch": 0.5052389837806803,
      "grad_norm": 2.427309485938781,
      "learning_rate": 5.5023923444976075e-08,
      "loss": 0.5568,
      "step": 1760
    },
    {
      "epoch": 0.5066743218027846,
      "grad_norm": 2.3119777544390323,
      "learning_rate": 5.4864433811802236e-08,
      "loss": 0.5769,
      "step": 1765
    },
    {
      "epoch": 0.5081096598248888,
      "grad_norm": 2.876236781539242,
      "learning_rate": 5.4704944178628384e-08,
      "loss": 0.5479,
      "step": 1770
    },
    {
      "epoch": 0.5095449978469929,
      "grad_norm": 2.2849069424986106,
      "learning_rate": 5.454545454545454e-08,
      "loss": 0.5648,
      "step": 1775
    },
    {
      "epoch": 0.5109803358690972,
      "grad_norm": 2.477418575095315,
      "learning_rate": 5.43859649122807e-08,
      "loss": 0.573,
      "step": 1780
    },
    {
      "epoch": 0.5124156738912014,
      "grad_norm": 2.293676037844509,
      "learning_rate": 5.4226475279106853e-08,
      "loss": 0.5717,
      "step": 1785
    },
    {
      "epoch": 0.5138510119133056,
      "grad_norm": 2.1498159417130145,
      "learning_rate": 5.4066985645933014e-08,
      "loss": 0.5574,
      "step": 1790
    },
    {
      "epoch": 0.5152863499354098,
      "grad_norm": 2.3707187570502937,
      "learning_rate": 5.390749601275917e-08,
      "loss": 0.5793,
      "step": 1795
    },
    {
      "epoch": 0.5167216879575139,
      "grad_norm": 2.4610592838296506,
      "learning_rate": 5.374800637958532e-08,
      "loss": 0.5603,
      "step": 1800
    },
    {
      "epoch": 0.5181570259796182,
      "grad_norm": 2.235095726599105,
      "learning_rate": 5.3588516746411484e-08,
      "loss": 0.5732,
      "step": 1805
    },
    {
      "epoch": 0.5195923640017224,
      "grad_norm": 2.2403487945921796,
      "learning_rate": 5.342902711323764e-08,
      "loss": 0.5726,
      "step": 1810
    },
    {
      "epoch": 0.5210277020238266,
      "grad_norm": 2.46672680845345,
      "learning_rate": 5.32695374800638e-08,
      "loss": 0.5784,
      "step": 1815
    },
    {
      "epoch": 0.5224630400459308,
      "grad_norm": 2.1910024552090284,
      "learning_rate": 5.3110047846889946e-08,
      "loss": 0.5778,
      "step": 1820
    },
    {
      "epoch": 0.5238983780680351,
      "grad_norm": 2.2204944277498426,
      "learning_rate": 5.29505582137161e-08,
      "loss": 0.572,
      "step": 1825
    },
    {
      "epoch": 0.5253337160901392,
      "grad_norm": 2.211764134448438,
      "learning_rate": 5.279106858054226e-08,
      "loss": 0.5563,
      "step": 1830
    },
    {
      "epoch": 0.5267690541122434,
      "grad_norm": 2.160392217124782,
      "learning_rate": 5.2631578947368416e-08,
      "loss": 0.5715,
      "step": 1835
    },
    {
      "epoch": 0.5282043921343477,
      "grad_norm": 2.3173974716804837,
      "learning_rate": 5.247208931419458e-08,
      "loss": 0.5799,
      "step": 1840
    },
    {
      "epoch": 0.5296397301564518,
      "grad_norm": 2.4119603113614625,
      "learning_rate": 5.231259968102073e-08,
      "loss": 0.5715,
      "step": 1845
    },
    {
      "epoch": 0.5310750681785561,
      "grad_norm": 2.3942922290744852,
      "learning_rate": 5.2153110047846885e-08,
      "loss": 0.5798,
      "step": 1850
    },
    {
      "epoch": 0.5325104062006603,
      "grad_norm": 2.4827513279602487,
      "learning_rate": 5.1993620414673046e-08,
      "loss": 0.5759,
      "step": 1855
    },
    {
      "epoch": 0.5339457442227644,
      "grad_norm": 2.303837079874495,
      "learning_rate": 5.1834130781499194e-08,
      "loss": 0.5793,
      "step": 1860
    },
    {
      "epoch": 0.5353810822448687,
      "grad_norm": 2.3729441241328324,
      "learning_rate": 5.167464114832536e-08,
      "loss": 0.5647,
      "step": 1865
    },
    {
      "epoch": 0.5368164202669728,
      "grad_norm": 2.584955282461203,
      "learning_rate": 5.151515151515151e-08,
      "loss": 0.5635,
      "step": 1870
    },
    {
      "epoch": 0.5382517582890771,
      "grad_norm": 2.3732511749039014,
      "learning_rate": 5.1355661881977677e-08,
      "loss": 0.5677,
      "step": 1875
    },
    {
      "epoch": 0.5396870963111813,
      "grad_norm": 2.3031000575513967,
      "learning_rate": 5.1196172248803824e-08,
      "loss": 0.5765,
      "step": 1880
    },
    {
      "epoch": 0.5411224343332854,
      "grad_norm": 2.13233132113133,
      "learning_rate": 5.103668261562998e-08,
      "loss": 0.5713,
      "step": 1885
    },
    {
      "epoch": 0.5425577723553897,
      "grad_norm": 2.3633442992361844,
      "learning_rate": 5.087719298245614e-08,
      "loss": 0.5548,
      "step": 1890
    },
    {
      "epoch": 0.5439931103774939,
      "grad_norm": 2.369702190598172,
      "learning_rate": 5.0717703349282294e-08,
      "loss": 0.5665,
      "step": 1895
    },
    {
      "epoch": 0.5454284483995981,
      "grad_norm": 2.3321818602794377,
      "learning_rate": 5.0558213716108454e-08,
      "loss": 0.5704,
      "step": 1900
    },
    {
      "epoch": 0.5468637864217023,
      "grad_norm": 2.380628420225508,
      "learning_rate": 5.039872408293461e-08,
      "loss": 0.5634,
      "step": 1905
    },
    {
      "epoch": 0.5482991244438065,
      "grad_norm": 2.416952583768008,
      "learning_rate": 5.0239234449760756e-08,
      "loss": 0.5816,
      "step": 1910
    },
    {
      "epoch": 0.5497344624659107,
      "grad_norm": 2.143565389342716,
      "learning_rate": 5.0079744816586924e-08,
      "loss": 0.5685,
      "step": 1915
    },
    {
      "epoch": 0.5511698004880149,
      "grad_norm": 2.318881848504763,
      "learning_rate": 4.992025518341307e-08,
      "loss": 0.5538,
      "step": 1920
    },
    {
      "epoch": 0.5526051385101192,
      "grad_norm": 2.4470463053460043,
      "learning_rate": 4.976076555023923e-08,
      "loss": 0.5726,
      "step": 1925
    },
    {
      "epoch": 0.5540404765322233,
      "grad_norm": 2.3102885996703946,
      "learning_rate": 4.960127591706539e-08,
      "loss": 0.553,
      "step": 1930
    },
    {
      "epoch": 0.5554758145543276,
      "grad_norm": 2.485118825226686,
      "learning_rate": 4.944178628389155e-08,
      "loss": 0.5614,
      "step": 1935
    },
    {
      "epoch": 0.5569111525764318,
      "grad_norm": 2.4666660309950554,
      "learning_rate": 4.92822966507177e-08,
      "loss": 0.5767,
      "step": 1940
    },
    {
      "epoch": 0.5583464905985359,
      "grad_norm": 2.6440619922621194,
      "learning_rate": 4.9122807017543856e-08,
      "loss": 0.577,
      "step": 1945
    },
    {
      "epoch": 0.5597818286206402,
      "grad_norm": 2.234037650395611,
      "learning_rate": 4.896331738437001e-08,
      "loss": 0.5603,
      "step": 1950
    },
    {
      "epoch": 0.5612171666427443,
      "grad_norm": 2.2239222727962518,
      "learning_rate": 4.880382775119617e-08,
      "loss": 0.5684,
      "step": 1955
    },
    {
      "epoch": 0.5626525046648486,
      "grad_norm": 2.2149189102032643,
      "learning_rate": 4.8644338118022326e-08,
      "loss": 0.564,
      "step": 1960
    },
    {
      "epoch": 0.5640878426869528,
      "grad_norm": 2.302377689722843,
      "learning_rate": 4.8484848484848486e-08,
      "loss": 0.5723,
      "step": 1965
    },
    {
      "epoch": 0.5655231807090569,
      "grad_norm": 2.3609046338014767,
      "learning_rate": 4.832535885167464e-08,
      "loss": 0.5576,
      "step": 1970
    },
    {
      "epoch": 0.5669585187311612,
      "grad_norm": 2.3940149734210183,
      "learning_rate": 4.8165869218500795e-08,
      "loss": 0.5701,
      "step": 1975
    },
    {
      "epoch": 0.5683938567532654,
      "grad_norm": 2.499184750287661,
      "learning_rate": 4.800637958532695e-08,
      "loss": 0.5917,
      "step": 1980
    },
    {
      "epoch": 0.5698291947753696,
      "grad_norm": 2.3892770083720953,
      "learning_rate": 4.784688995215311e-08,
      "loss": 0.5739,
      "step": 1985
    },
    {
      "epoch": 0.5712645327974738,
      "grad_norm": 2.342709389162696,
      "learning_rate": 4.7687400318979264e-08,
      "loss": 0.5687,
      "step": 1990
    },
    {
      "epoch": 0.572699870819578,
      "grad_norm": 2.3950007221533762,
      "learning_rate": 4.7527910685805425e-08,
      "loss": 0.5823,
      "step": 1995
    },
    {
      "epoch": 0.5741352088416822,
      "grad_norm": 2.456899330442452,
      "learning_rate": 4.736842105263157e-08,
      "loss": 0.5554,
      "step": 2000
    },
    {
      "epoch": 0.5755705468637864,
      "grad_norm": 2.0195213406503503,
      "learning_rate": 4.7208931419457734e-08,
      "loss": 0.5498,
      "step": 2005
    },
    {
      "epoch": 0.5770058848858907,
      "grad_norm": 2.4854469417568144,
      "learning_rate": 4.704944178628389e-08,
      "loss": 0.5797,
      "step": 2010
    },
    {
      "epoch": 0.5784412229079948,
      "grad_norm": 2.397518747088301,
      "learning_rate": 4.688995215311005e-08,
      "loss": 0.5799,
      "step": 2015
    },
    {
      "epoch": 0.579876560930099,
      "grad_norm": 2.30185261859014,
      "learning_rate": 4.67304625199362e-08,
      "loss": 0.5716,
      "step": 2020
    },
    {
      "epoch": 0.5813118989522033,
      "grad_norm": 2.454194414740183,
      "learning_rate": 4.657097288676236e-08,
      "loss": 0.5714,
      "step": 2025
    },
    {
      "epoch": 0.5827472369743074,
      "grad_norm": 2.2295797322099564,
      "learning_rate": 4.641148325358851e-08,
      "loss": 0.56,
      "step": 2030
    },
    {
      "epoch": 0.5841825749964117,
      "grad_norm": 2.2435929203230836,
      "learning_rate": 4.625199362041467e-08,
      "loss": 0.5612,
      "step": 2035
    },
    {
      "epoch": 0.5856179130185158,
      "grad_norm": 2.354108340281971,
      "learning_rate": 4.609250398724083e-08,
      "loss": 0.5559,
      "step": 2040
    },
    {
      "epoch": 0.5870532510406201,
      "grad_norm": 2.383408186012462,
      "learning_rate": 4.593301435406698e-08,
      "loss": 0.5552,
      "step": 2045
    },
    {
      "epoch": 0.5884885890627243,
      "grad_norm": 2.274522558925121,
      "learning_rate": 4.577352472089314e-08,
      "loss": 0.5712,
      "step": 2050
    },
    {
      "epoch": 0.5899239270848284,
      "grad_norm": 2.193942049887206,
      "learning_rate": 4.5614035087719296e-08,
      "loss": 0.5591,
      "step": 2055
    },
    {
      "epoch": 0.5913592651069327,
      "grad_norm": 2.5336345097087176,
      "learning_rate": 4.545454545454545e-08,
      "loss": 0.5589,
      "step": 2060
    },
    {
      "epoch": 0.5927946031290369,
      "grad_norm": 2.2998619526688633,
      "learning_rate": 4.529505582137161e-08,
      "loss": 0.5672,
      "step": 2065
    },
    {
      "epoch": 0.5942299411511411,
      "grad_norm": 2.2622163134776603,
      "learning_rate": 4.5135566188197766e-08,
      "loss": 0.563,
      "step": 2070
    },
    {
      "epoch": 0.5956652791732453,
      "grad_norm": 2.3239949997017204,
      "learning_rate": 4.497607655502392e-08,
      "loss": 0.5764,
      "step": 2075
    },
    {
      "epoch": 0.5971006171953495,
      "grad_norm": 2.2025498742254337,
      "learning_rate": 4.481658692185008e-08,
      "loss": 0.5676,
      "step": 2080
    },
    {
      "epoch": 0.5985359552174537,
      "grad_norm": 2.5241546548407943,
      "learning_rate": 4.4657097288676235e-08,
      "loss": 0.5602,
      "step": 2085
    },
    {
      "epoch": 0.5999712932395579,
      "grad_norm": 2.3507845605252493,
      "learning_rate": 4.449760765550239e-08,
      "loss": 0.565,
      "step": 2090
    },
    {
      "epoch": 0.6011195636572413,
      "eval_loss": 0.624911904335022,
      "eval_runtime": 276.9879,
      "eval_samples_per_second": 138.002,
      "eval_steps_per_second": 2.159,
      "step": 2094
    },
    {
      "epoch": 0.6014066312616622,
      "grad_norm": 2.249365554658679,
      "learning_rate": 4.4338118022328544e-08,
      "loss": 0.5816,
      "step": 2095
    },
    {
      "epoch": 0.6028419692837663,
      "grad_norm": 2.679508996031371,
      "learning_rate": 4.4178628389154705e-08,
      "loss": 0.584,
      "step": 2100
    },
    {
      "epoch": 0.6042773073058705,
      "grad_norm": 2.2991072095507423,
      "learning_rate": 4.401913875598086e-08,
      "loss": 0.5634,
      "step": 2105
    },
    {
      "epoch": 0.6057126453279748,
      "grad_norm": 2.8967113990790376,
      "learning_rate": 4.385964912280701e-08,
      "loss": 0.5705,
      "step": 2110
    },
    {
      "epoch": 0.6071479833500789,
      "grad_norm": 2.418044587004619,
      "learning_rate": 4.370015948963317e-08,
      "loss": 0.5786,
      "step": 2115
    },
    {
      "epoch": 0.6085833213721832,
      "grad_norm": 2.220282737585246,
      "learning_rate": 4.354066985645933e-08,
      "loss": 0.5708,
      "step": 2120
    },
    {
      "epoch": 0.6100186593942873,
      "grad_norm": 2.242153390322156,
      "learning_rate": 4.338118022328548e-08,
      "loss": 0.5587,
      "step": 2125
    },
    {
      "epoch": 0.6114539974163916,
      "grad_norm": 2.224785684141244,
      "learning_rate": 4.3221690590111644e-08,
      "loss": 0.5734,
      "step": 2130
    },
    {
      "epoch": 0.6128893354384958,
      "grad_norm": 2.3268091122198484,
      "learning_rate": 4.30622009569378e-08,
      "loss": 0.5529,
      "step": 2135
    },
    {
      "epoch": 0.6143246734605999,
      "grad_norm": 2.363021850053623,
      "learning_rate": 4.290271132376395e-08,
      "loss": 0.569,
      "step": 2140
    },
    {
      "epoch": 0.6157600114827042,
      "grad_norm": 2.3431314056786574,
      "learning_rate": 4.2743221690590106e-08,
      "loss": 0.5602,
      "step": 2145
    },
    {
      "epoch": 0.6171953495048084,
      "grad_norm": 2.4159747940780245,
      "learning_rate": 4.258373205741627e-08,
      "loss": 0.5618,
      "step": 2150
    },
    {
      "epoch": 0.6186306875269126,
      "grad_norm": 2.4207306331146694,
      "learning_rate": 4.242424242424242e-08,
      "loss": 0.5543,
      "step": 2155
    },
    {
      "epoch": 0.6200660255490168,
      "grad_norm": 2.1923873922362054,
      "learning_rate": 4.226475279106858e-08,
      "loss": 0.5555,
      "step": 2160
    },
    {
      "epoch": 0.621501363571121,
      "grad_norm": 2.338285581454532,
      "learning_rate": 4.210526315789473e-08,
      "loss": 0.583,
      "step": 2165
    },
    {
      "epoch": 0.6229367015932252,
      "grad_norm": 2.3753940324053464,
      "learning_rate": 4.194577352472089e-08,
      "loss": 0.5838,
      "step": 2170
    },
    {
      "epoch": 0.6243720396153294,
      "grad_norm": 2.2591075922980197,
      "learning_rate": 4.1786283891547045e-08,
      "loss": 0.5584,
      "step": 2175
    },
    {
      "epoch": 0.6258073776374337,
      "grad_norm": 2.3674897727831556,
      "learning_rate": 4.1626794258373206e-08,
      "loss": 0.5726,
      "step": 2180
    },
    {
      "epoch": 0.6272427156595378,
      "grad_norm": 2.440181667868599,
      "learning_rate": 4.146730462519936e-08,
      "loss": 0.5575,
      "step": 2185
    },
    {
      "epoch": 0.628678053681642,
      "grad_norm": 2.3559449056154227,
      "learning_rate": 4.130781499202552e-08,
      "loss": 0.5791,
      "step": 2190
    },
    {
      "epoch": 0.6301133917037463,
      "grad_norm": 2.731472032876172,
      "learning_rate": 4.114832535885167e-08,
      "loss": 0.5657,
      "step": 2195
    },
    {
      "epoch": 0.6315487297258504,
      "grad_norm": 2.2497218848315987,
      "learning_rate": 4.098883572567783e-08,
      "loss": 0.5629,
      "step": 2200
    },
    {
      "epoch": 0.6329840677479547,
      "grad_norm": 2.443671279805305,
      "learning_rate": 4.0829346092503984e-08,
      "loss": 0.5529,
      "step": 2205
    },
    {
      "epoch": 0.6344194057700588,
      "grad_norm": 2.7423599812106514,
      "learning_rate": 4.0669856459330145e-08,
      "loss": 0.574,
      "step": 2210
    },
    {
      "epoch": 0.635854743792163,
      "grad_norm": 2.4366565097143327,
      "learning_rate": 4.05103668261563e-08,
      "loss": 0.5677,
      "step": 2215
    },
    {
      "epoch": 0.6372900818142673,
      "grad_norm": 2.4786368215629597,
      "learning_rate": 4.0350877192982454e-08,
      "loss": 0.5666,
      "step": 2220
    },
    {
      "epoch": 0.6387254198363714,
      "grad_norm": 2.1264349737147192,
      "learning_rate": 4.019138755980861e-08,
      "loss": 0.5657,
      "step": 2225
    },
    {
      "epoch": 0.6401607578584757,
      "grad_norm": 2.4930450914127635,
      "learning_rate": 4.003189792663477e-08,
      "loss": 0.5711,
      "step": 2230
    },
    {
      "epoch": 0.6415960958805799,
      "grad_norm": 2.5188486429575825,
      "learning_rate": 3.987240829346092e-08,
      "loss": 0.5606,
      "step": 2235
    },
    {
      "epoch": 0.6430314339026841,
      "grad_norm": 2.4615749950506607,
      "learning_rate": 3.9712918660287084e-08,
      "loss": 0.5608,
      "step": 2240
    },
    {
      "epoch": 0.6444667719247883,
      "grad_norm": 2.3168585600756577,
      "learning_rate": 3.955342902711323e-08,
      "loss": 0.5779,
      "step": 2245
    },
    {
      "epoch": 0.6459021099468925,
      "grad_norm": 2.431077288205627,
      "learning_rate": 3.939393939393939e-08,
      "loss": 0.5637,
      "step": 2250
    },
    {
      "epoch": 0.6473374479689967,
      "grad_norm": 2.4743477577848716,
      "learning_rate": 3.9234449760765547e-08,
      "loss": 0.5802,
      "step": 2255
    },
    {
      "epoch": 0.6487727859911009,
      "grad_norm": 2.3956871074752732,
      "learning_rate": 3.907496012759171e-08,
      "loss": 0.5462,
      "step": 2260
    },
    {
      "epoch": 0.6502081240132052,
      "grad_norm": 2.232965267462123,
      "learning_rate": 3.891547049441786e-08,
      "loss": 0.5633,
      "step": 2265
    },
    {
      "epoch": 0.6516434620353093,
      "grad_norm": 2.313054571588013,
      "learning_rate": 3.875598086124402e-08,
      "loss": 0.5703,
      "step": 2270
    },
    {
      "epoch": 0.6530788000574135,
      "grad_norm": 2.5589108956349134,
      "learning_rate": 3.859649122807017e-08,
      "loss": 0.5655,
      "step": 2275
    },
    {
      "epoch": 0.6545141380795177,
      "grad_norm": 2.4031051489391952,
      "learning_rate": 3.843700159489633e-08,
      "loss": 0.5754,
      "step": 2280
    },
    {
      "epoch": 0.6559494761016219,
      "grad_norm": 2.2922272855048393,
      "learning_rate": 3.8277511961722485e-08,
      "loss": 0.5676,
      "step": 2285
    },
    {
      "epoch": 0.6573848141237262,
      "grad_norm": 2.3624459351716216,
      "learning_rate": 3.8118022328548646e-08,
      "loss": 0.5524,
      "step": 2290
    },
    {
      "epoch": 0.6588201521458303,
      "grad_norm": 2.370212518166197,
      "learning_rate": 3.79585326953748e-08,
      "loss": 0.5564,
      "step": 2295
    },
    {
      "epoch": 0.6602554901679345,
      "grad_norm": 2.2505462367021125,
      "learning_rate": 3.7799043062200955e-08,
      "loss": 0.5743,
      "step": 2300
    },
    {
      "epoch": 0.6616908281900388,
      "grad_norm": 2.3819573849600926,
      "learning_rate": 3.763955342902711e-08,
      "loss": 0.5641,
      "step": 2305
    },
    {
      "epoch": 0.6631261662121429,
      "grad_norm": 2.262014053722333,
      "learning_rate": 3.748006379585327e-08,
      "loss": 0.5618,
      "step": 2310
    },
    {
      "epoch": 0.6645615042342472,
      "grad_norm": 2.3932483107950198,
      "learning_rate": 3.7320574162679424e-08,
      "loss": 0.5753,
      "step": 2315
    },
    {
      "epoch": 0.6659968422563514,
      "grad_norm": 2.551394665480823,
      "learning_rate": 3.7161084529505585e-08,
      "loss": 0.564,
      "step": 2320
    },
    {
      "epoch": 0.6674321802784555,
      "grad_norm": 2.601979736097557,
      "learning_rate": 3.700159489633174e-08,
      "loss": 0.5683,
      "step": 2325
    },
    {
      "epoch": 0.6688675183005598,
      "grad_norm": 2.256444776064737,
      "learning_rate": 3.6842105263157894e-08,
      "loss": 0.5458,
      "step": 2330
    },
    {
      "epoch": 0.670302856322664,
      "grad_norm": 2.674757703170623,
      "learning_rate": 3.668261562998405e-08,
      "loss": 0.5594,
      "step": 2335
    },
    {
      "epoch": 0.6717381943447682,
      "grad_norm": 2.2518812694654686,
      "learning_rate": 3.652312599681021e-08,
      "loss": 0.5588,
      "step": 2340
    },
    {
      "epoch": 0.6731735323668724,
      "grad_norm": 2.4794287433388416,
      "learning_rate": 3.636363636363636e-08,
      "loss": 0.5646,
      "step": 2345
    },
    {
      "epoch": 0.6746088703889767,
      "grad_norm": 2.4696337536144326,
      "learning_rate": 3.620414673046252e-08,
      "loss": 0.5735,
      "step": 2350
    },
    {
      "epoch": 0.6760442084110808,
      "grad_norm": 2.4381605737624557,
      "learning_rate": 3.604465709728867e-08,
      "loss": 0.5498,
      "step": 2355
    },
    {
      "epoch": 0.677479546433185,
      "grad_norm": 2.493543754896731,
      "learning_rate": 3.588516746411483e-08,
      "loss": 0.5641,
      "step": 2360
    },
    {
      "epoch": 0.6789148844552892,
      "grad_norm": 2.4237111562016116,
      "learning_rate": 3.572567783094099e-08,
      "loss": 0.5636,
      "step": 2365
    },
    {
      "epoch": 0.6803502224773934,
      "grad_norm": 2.5259877154306465,
      "learning_rate": 3.556618819776714e-08,
      "loss": 0.563,
      "step": 2370
    },
    {
      "epoch": 0.6817855604994977,
      "grad_norm": 2.4361062944752625,
      "learning_rate": 3.54066985645933e-08,
      "loss": 0.543,
      "step": 2375
    },
    {
      "epoch": 0.6832208985216018,
      "grad_norm": 2.518366498082825,
      "learning_rate": 3.5247208931419456e-08,
      "loss": 0.5739,
      "step": 2380
    },
    {
      "epoch": 0.684656236543706,
      "grad_norm": 2.4025021753219877,
      "learning_rate": 3.508771929824561e-08,
      "loss": 0.5607,
      "step": 2385
    },
    {
      "epoch": 0.6860915745658103,
      "grad_norm": 2.4146630449992537,
      "learning_rate": 3.4928229665071765e-08,
      "loss": 0.5454,
      "step": 2390
    },
    {
      "epoch": 0.6875269125879144,
      "grad_norm": 2.3763826608890595,
      "learning_rate": 3.4768740031897926e-08,
      "loss": 0.5525,
      "step": 2395
    },
    {
      "epoch": 0.6889622506100187,
      "grad_norm": 2.346702361136207,
      "learning_rate": 3.460925039872408e-08,
      "loss": 0.5747,
      "step": 2400
    },
    {
      "epoch": 0.6903975886321229,
      "grad_norm": 2.3609513152625667,
      "learning_rate": 3.444976076555024e-08,
      "loss": 0.571,
      "step": 2405
    },
    {
      "epoch": 0.691832926654227,
      "grad_norm": 2.5451074586099827,
      "learning_rate": 3.429027113237639e-08,
      "loss": 0.5591,
      "step": 2410
    },
    {
      "epoch": 0.6932682646763313,
      "grad_norm": 2.4807919469343687,
      "learning_rate": 3.413078149920255e-08,
      "loss": 0.5635,
      "step": 2415
    },
    {
      "epoch": 0.6947036026984355,
      "grad_norm": 2.3699253800445015,
      "learning_rate": 3.3971291866028704e-08,
      "loss": 0.5478,
      "step": 2420
    },
    {
      "epoch": 0.6961389407205397,
      "grad_norm": 2.536150295890729,
      "learning_rate": 3.3811802232854865e-08,
      "loss": 0.5704,
      "step": 2425
    },
    {
      "epoch": 0.6975742787426439,
      "grad_norm": 2.3887434323456658,
      "learning_rate": 3.365231259968102e-08,
      "loss": 0.5636,
      "step": 2430
    },
    {
      "epoch": 0.699009616764748,
      "grad_norm": 2.42308842897933,
      "learning_rate": 3.349282296650718e-08,
      "loss": 0.5536,
      "step": 2435
    },
    {
      "epoch": 0.7004449547868523,
      "grad_norm": 2.478461202673975,
      "learning_rate": 3.333333333333333e-08,
      "loss": 0.5376,
      "step": 2440
    },
    {
      "epoch": 0.7013061576001148,
      "eval_loss": 0.624883770942688,
      "eval_runtime": 276.8184,
      "eval_samples_per_second": 138.087,
      "eval_steps_per_second": 2.16,
      "step": 2443
    },
    {
      "epoch": 0.7018802928089565,
      "grad_norm": 2.6213513088762364,
      "learning_rate": 3.317384370015949e-08,
      "loss": 0.5952,
      "step": 2445
    },
    {
      "epoch": 0.7033156308310607,
      "grad_norm": 2.5659887569487565,
      "learning_rate": 3.301435406698564e-08,
      "loss": 0.5523,
      "step": 2450
    },
    {
      "epoch": 0.7047509688531649,
      "grad_norm": 2.361371207424685,
      "learning_rate": 3.2854864433811803e-08,
      "loss": 0.5559,
      "step": 2455
    },
    {
      "epoch": 0.7061863068752692,
      "grad_norm": 2.4798471285290904,
      "learning_rate": 3.269537480063796e-08,
      "loss": 0.551,
      "step": 2460
    },
    {
      "epoch": 0.7076216448973733,
      "grad_norm": 2.2363305539445273,
      "learning_rate": 3.253588516746411e-08,
      "loss": 0.5579,
      "step": 2465
    },
    {
      "epoch": 0.7090569829194775,
      "grad_norm": 2.261300566267236,
      "learning_rate": 3.2376395534290266e-08,
      "loss": 0.5626,
      "step": 2470
    },
    {
      "epoch": 0.7104923209415818,
      "grad_norm": 2.440848756899625,
      "learning_rate": 3.221690590111643e-08,
      "loss": 0.5722,
      "step": 2475
    },
    {
      "epoch": 0.7119276589636859,
      "grad_norm": 2.23072306956738,
      "learning_rate": 3.205741626794258e-08,
      "loss": 0.5613,
      "step": 2480
    },
    {
      "epoch": 0.7133629969857902,
      "grad_norm": 2.5653088309028913,
      "learning_rate": 3.189792663476874e-08,
      "loss": 0.5678,
      "step": 2485
    },
    {
      "epoch": 0.7147983350078944,
      "grad_norm": 2.222236453599309,
      "learning_rate": 3.173843700159489e-08,
      "loss": 0.5683,
      "step": 2490
    },
    {
      "epoch": 0.7162336730299985,
      "grad_norm": 2.301868976256063,
      "learning_rate": 3.157894736842105e-08,
      "loss": 0.5578,
      "step": 2495
    },
    {
      "epoch": 0.7176690110521028,
      "grad_norm": 2.2272526230809877,
      "learning_rate": 3.1419457735247205e-08,
      "loss": 0.5702,
      "step": 2500
    },
    {
      "epoch": 0.719104349074207,
      "grad_norm": 2.456995665005312,
      "learning_rate": 3.1259968102073366e-08,
      "loss": 0.5604,
      "step": 2505
    },
    {
      "epoch": 0.7205396870963112,
      "grad_norm": 2.3490860128288467,
      "learning_rate": 3.110047846889952e-08,
      "loss": 0.5608,
      "step": 2510
    },
    {
      "epoch": 0.7219750251184154,
      "grad_norm": 2.352273017737412,
      "learning_rate": 3.094098883572568e-08,
      "loss": 0.5524,
      "step": 2515
    },
    {
      "epoch": 0.7234103631405195,
      "grad_norm": 2.415857321587192,
      "learning_rate": 3.078149920255183e-08,
      "loss": 0.5665,
      "step": 2520
    },
    {
      "epoch": 0.7248457011626238,
      "grad_norm": 2.446235510745109,
      "learning_rate": 3.062200956937799e-08,
      "loss": 0.5649,
      "step": 2525
    },
    {
      "epoch": 0.726281039184728,
      "grad_norm": 2.313314202520151,
      "learning_rate": 3.0462519936204144e-08,
      "loss": 0.5544,
      "step": 2530
    },
    {
      "epoch": 0.7277163772068322,
      "grad_norm": 2.2292005323902266,
      "learning_rate": 3.0303030303030305e-08,
      "loss": 0.5523,
      "step": 2535
    },
    {
      "epoch": 0.7291517152289364,
      "grad_norm": 2.374986392317948,
      "learning_rate": 3.014354066985646e-08,
      "loss": 0.5754,
      "step": 2540
    },
    {
      "epoch": 0.7305870532510407,
      "grad_norm": 2.3305738566656156,
      "learning_rate": 2.9984051036682613e-08,
      "loss": 0.5518,
      "step": 2545
    },
    {
      "epoch": 0.7320223912731448,
      "grad_norm": 2.2387038610888617,
      "learning_rate": 2.982456140350877e-08,
      "loss": 0.5663,
      "step": 2550
    },
    {
      "epoch": 0.733457729295249,
      "grad_norm": 2.585952380969797,
      "learning_rate": 2.9665071770334925e-08,
      "loss": 0.5648,
      "step": 2555
    },
    {
      "epoch": 0.7348930673173533,
      "grad_norm": 2.337232248685897,
      "learning_rate": 2.9505582137161083e-08,
      "loss": 0.5639,
      "step": 2560
    },
    {
      "epoch": 0.7363284053394574,
      "grad_norm": 2.466139299880866,
      "learning_rate": 2.934609250398724e-08,
      "loss": 0.5552,
      "step": 2565
    },
    {
      "epoch": 0.7377637433615617,
      "grad_norm": 3.101041282908991,
      "learning_rate": 2.9186602870813398e-08,
      "loss": 0.5704,
      "step": 2570
    },
    {
      "epoch": 0.7391990813836659,
      "grad_norm": 2.487285631269175,
      "learning_rate": 2.902711323763955e-08,
      "loss": 0.5477,
      "step": 2575
    },
    {
      "epoch": 0.74063441940577,
      "grad_norm": 2.5281080819393127,
      "learning_rate": 2.8867623604465707e-08,
      "loss": 0.5635,
      "step": 2580
    },
    {
      "epoch": 0.7420697574278743,
      "grad_norm": 2.3789929302011887,
      "learning_rate": 2.8708133971291864e-08,
      "loss": 0.5448,
      "step": 2585
    },
    {
      "epoch": 0.7435050954499784,
      "grad_norm": 2.393381150003913,
      "learning_rate": 2.8548644338118022e-08,
      "loss": 0.5644,
      "step": 2590
    },
    {
      "epoch": 0.7449404334720827,
      "grad_norm": 2.550473175661344,
      "learning_rate": 2.838915470494418e-08,
      "loss": 0.5605,
      "step": 2595
    },
    {
      "epoch": 0.7463757714941869,
      "grad_norm": 2.5353596792783164,
      "learning_rate": 2.822966507177033e-08,
      "loss": 0.563,
      "step": 2600
    },
    {
      "epoch": 0.747811109516291,
      "grad_norm": 2.3846607043055914,
      "learning_rate": 2.8070175438596488e-08,
      "loss": 0.5693,
      "step": 2605
    },
    {
      "epoch": 0.7492464475383953,
      "grad_norm": 2.6849030076571294,
      "learning_rate": 2.7910685805422645e-08,
      "loss": 0.5638,
      "step": 2610
    },
    {
      "epoch": 0.7506817855604995,
      "grad_norm": 2.4740286555102453,
      "learning_rate": 2.7751196172248803e-08,
      "loss": 0.5597,
      "step": 2615
    },
    {
      "epoch": 0.7521171235826037,
      "grad_norm": 2.447084260292741,
      "learning_rate": 2.759170653907496e-08,
      "loss": 0.5578,
      "step": 2620
    },
    {
      "epoch": 0.7535524616047079,
      "grad_norm": 2.5839932986940117,
      "learning_rate": 2.7432216905901118e-08,
      "loss": 0.5766,
      "step": 2625
    },
    {
      "epoch": 0.7549877996268121,
      "grad_norm": 2.3445256188925816,
      "learning_rate": 2.727272727272727e-08,
      "loss": 0.5672,
      "step": 2630
    },
    {
      "epoch": 0.7564231376489163,
      "grad_norm": 2.4468131835893896,
      "learning_rate": 2.7113237639553427e-08,
      "loss": 0.5665,
      "step": 2635
    },
    {
      "epoch": 0.7578584756710205,
      "grad_norm": 2.5456487718415484,
      "learning_rate": 2.6953748006379584e-08,
      "loss": 0.5652,
      "step": 2640
    },
    {
      "epoch": 0.7592938136931248,
      "grad_norm": 2.4023284608519186,
      "learning_rate": 2.6794258373205742e-08,
      "loss": 0.5596,
      "step": 2645
    },
    {
      "epoch": 0.7607291517152289,
      "grad_norm": 2.5316031016203135,
      "learning_rate": 2.66347687400319e-08,
      "loss": 0.5603,
      "step": 2650
    },
    {
      "epoch": 0.7621644897373332,
      "grad_norm": 2.433666966566567,
      "learning_rate": 2.647527910685805e-08,
      "loss": 0.5671,
      "step": 2655
    },
    {
      "epoch": 0.7635998277594374,
      "grad_norm": 2.3587826822508755,
      "learning_rate": 2.6315789473684208e-08,
      "loss": 0.5504,
      "step": 2660
    },
    {
      "epoch": 0.7650351657815415,
      "grad_norm": 2.398341019086129,
      "learning_rate": 2.6156299840510366e-08,
      "loss": 0.5641,
      "step": 2665
    },
    {
      "epoch": 0.7664705038036458,
      "grad_norm": 2.2174413530817696,
      "learning_rate": 2.5996810207336523e-08,
      "loss": 0.5546,
      "step": 2670
    },
    {
      "epoch": 0.76790584182575,
      "grad_norm": 2.293798551862801,
      "learning_rate": 2.583732057416268e-08,
      "loss": 0.5651,
      "step": 2675
    },
    {
      "epoch": 0.7693411798478542,
      "grad_norm": 2.6205007545310766,
      "learning_rate": 2.5677830940988838e-08,
      "loss": 0.5511,
      "step": 2680
    },
    {
      "epoch": 0.7707765178699584,
      "grad_norm": 2.5273343089178577,
      "learning_rate": 2.551834130781499e-08,
      "loss": 0.5646,
      "step": 2685
    },
    {
      "epoch": 0.7722118558920625,
      "grad_norm": 2.4202912870329896,
      "learning_rate": 2.5358851674641147e-08,
      "loss": 0.5586,
      "step": 2690
    },
    {
      "epoch": 0.7736471939141668,
      "grad_norm": 2.3235211285214694,
      "learning_rate": 2.5199362041467304e-08,
      "loss": 0.5715,
      "step": 2695
    },
    {
      "epoch": 0.775082531936271,
      "grad_norm": 2.4065106135536602,
      "learning_rate": 2.5039872408293462e-08,
      "loss": 0.5594,
      "step": 2700
    },
    {
      "epoch": 0.7765178699583752,
      "grad_norm": 2.431036411294595,
      "learning_rate": 2.4880382775119616e-08,
      "loss": 0.5437,
      "step": 2705
    },
    {
      "epoch": 0.7779532079804794,
      "grad_norm": 2.449941200842066,
      "learning_rate": 2.4720893141945774e-08,
      "loss": 0.5458,
      "step": 2710
    },
    {
      "epoch": 0.7793885460025836,
      "grad_norm": 2.5115374799858907,
      "learning_rate": 2.4561403508771928e-08,
      "loss": 0.5569,
      "step": 2715
    },
    {
      "epoch": 0.7808238840246878,
      "grad_norm": 2.3359120202737023,
      "learning_rate": 2.4401913875598086e-08,
      "loss": 0.5623,
      "step": 2720
    },
    {
      "epoch": 0.782259222046792,
      "grad_norm": 2.448687571770946,
      "learning_rate": 2.4242424242424243e-08,
      "loss": 0.5704,
      "step": 2725
    },
    {
      "epoch": 0.7836945600688963,
      "grad_norm": 2.386097453587479,
      "learning_rate": 2.4082934609250398e-08,
      "loss": 0.5595,
      "step": 2730
    },
    {
      "epoch": 0.7851298980910004,
      "grad_norm": 2.29157924213661,
      "learning_rate": 2.3923444976076555e-08,
      "loss": 0.5493,
      "step": 2735
    },
    {
      "epoch": 0.7865652361131046,
      "grad_norm": 2.368351844371977,
      "learning_rate": 2.3763955342902713e-08,
      "loss": 0.5621,
      "step": 2740
    },
    {
      "epoch": 0.7880005741352089,
      "grad_norm": 2.428279265040824,
      "learning_rate": 2.3604465709728867e-08,
      "loss": 0.5501,
      "step": 2745
    },
    {
      "epoch": 0.789435912157313,
      "grad_norm": 2.5354885880092355,
      "learning_rate": 2.3444976076555025e-08,
      "loss": 0.56,
      "step": 2750
    },
    {
      "epoch": 0.7908712501794173,
      "grad_norm": 2.394481001467146,
      "learning_rate": 2.328548644338118e-08,
      "loss": 0.553,
      "step": 2755
    },
    {
      "epoch": 0.7923065882015214,
      "grad_norm": 2.733847809680854,
      "learning_rate": 2.3125996810207336e-08,
      "loss": 0.5522,
      "step": 2760
    },
    {
      "epoch": 0.7937419262236257,
      "grad_norm": 2.390397541327289,
      "learning_rate": 2.296650717703349e-08,
      "loss": 0.5475,
      "step": 2765
    },
    {
      "epoch": 0.7951772642457299,
      "grad_norm": 2.3229132156440624,
      "learning_rate": 2.2807017543859648e-08,
      "loss": 0.5471,
      "step": 2770
    },
    {
      "epoch": 0.796612602267834,
      "grad_norm": 2.249806012410395,
      "learning_rate": 2.2647527910685806e-08,
      "loss": 0.5526,
      "step": 2775
    },
    {
      "epoch": 0.7980479402899383,
      "grad_norm": 2.2913607195571513,
      "learning_rate": 2.248803827751196e-08,
      "loss": 0.5699,
      "step": 2780
    },
    {
      "epoch": 0.7994832783120425,
      "grad_norm": 2.3155768396344465,
      "learning_rate": 2.2328548644338118e-08,
      "loss": 0.5605,
      "step": 2785
    },
    {
      "epoch": 0.8009186163341467,
      "grad_norm": 2.2266682437603094,
      "learning_rate": 2.2169059011164272e-08,
      "loss": 0.5633,
      "step": 2790
    },
    {
      "epoch": 0.8014927515429884,
      "eval_loss": 0.6254069209098816,
      "eval_runtime": 276.7807,
      "eval_samples_per_second": 138.106,
      "eval_steps_per_second": 2.161,
      "step": 2792
    },
    {
      "epoch": 0.8023539543562509,
      "grad_norm": 2.3313765523766996,
      "learning_rate": 2.200956937799043e-08,
      "loss": 0.564,
      "step": 2795
    },
    {
      "epoch": 0.8037892923783551,
      "grad_norm": 2.2750443956794637,
      "learning_rate": 2.1850079744816584e-08,
      "loss": 0.5425,
      "step": 2800
    },
    {
      "epoch": 0.8052246304004593,
      "grad_norm": 2.3759701760182956,
      "learning_rate": 2.169059011164274e-08,
      "loss": 0.5614,
      "step": 2805
    },
    {
      "epoch": 0.8066599684225635,
      "grad_norm": 2.3995646656798226,
      "learning_rate": 2.15311004784689e-08,
      "loss": 0.5679,
      "step": 2810
    },
    {
      "epoch": 0.8080953064446678,
      "grad_norm": 2.413178742154949,
      "learning_rate": 2.1371610845295053e-08,
      "loss": 0.5364,
      "step": 2815
    },
    {
      "epoch": 0.8095306444667719,
      "grad_norm": 2.451085540687265,
      "learning_rate": 2.121212121212121e-08,
      "loss": 0.558,
      "step": 2820
    },
    {
      "epoch": 0.8109659824888761,
      "grad_norm": 2.5366424198902884,
      "learning_rate": 2.1052631578947365e-08,
      "loss": 0.5563,
      "step": 2825
    },
    {
      "epoch": 0.8124013205109804,
      "grad_norm": 2.4480623057702564,
      "learning_rate": 2.0893141945773523e-08,
      "loss": 0.5533,
      "step": 2830
    },
    {
      "epoch": 0.8138366585330845,
      "grad_norm": 2.5085180829799225,
      "learning_rate": 2.073365231259968e-08,
      "loss": 0.5569,
      "step": 2835
    },
    {
      "epoch": 0.8152719965551888,
      "grad_norm": 2.518306435416413,
      "learning_rate": 2.0574162679425834e-08,
      "loss": 0.559,
      "step": 2840
    },
    {
      "epoch": 0.8167073345772929,
      "grad_norm": 2.3775863845429805,
      "learning_rate": 2.0414673046251992e-08,
      "loss": 0.553,
      "step": 2845
    },
    {
      "epoch": 0.8181426725993971,
      "grad_norm": 2.2707446927149193,
      "learning_rate": 2.025518341307815e-08,
      "loss": 0.5459,
      "step": 2850
    },
    {
      "epoch": 0.8195780106215014,
      "grad_norm": 2.545080295796524,
      "learning_rate": 2.0095693779904304e-08,
      "loss": 0.5685,
      "step": 2855
    },
    {
      "epoch": 0.8210133486436055,
      "grad_norm": 2.53691680864861,
      "learning_rate": 1.993620414673046e-08,
      "loss": 0.555,
      "step": 2860
    },
    {
      "epoch": 0.8224486866657098,
      "grad_norm": 2.500766862141125,
      "learning_rate": 1.9776714513556616e-08,
      "loss": 0.5485,
      "step": 2865
    },
    {
      "epoch": 0.823884024687814,
      "grad_norm": 2.5800550412102474,
      "learning_rate": 1.9617224880382773e-08,
      "loss": 0.5539,
      "step": 2870
    },
    {
      "epoch": 0.8253193627099182,
      "grad_norm": 2.459836362956779,
      "learning_rate": 1.945773524720893e-08,
      "loss": 0.5572,
      "step": 2875
    },
    {
      "epoch": 0.8267547007320224,
      "grad_norm": 2.4537466248582604,
      "learning_rate": 1.9298245614035085e-08,
      "loss": 0.5353,
      "step": 2880
    },
    {
      "epoch": 0.8281900387541266,
      "grad_norm": 2.459607137477631,
      "learning_rate": 1.9138755980861243e-08,
      "loss": 0.5341,
      "step": 2885
    },
    {
      "epoch": 0.8296253767762308,
      "grad_norm": 2.6669636316278016,
      "learning_rate": 1.89792663476874e-08,
      "loss": 0.5671,
      "step": 2890
    },
    {
      "epoch": 0.831060714798335,
      "grad_norm": 2.5873206815337064,
      "learning_rate": 1.8819776714513555e-08,
      "loss": 0.5603,
      "step": 2895
    },
    {
      "epoch": 0.8324960528204393,
      "grad_norm": 2.4057810595787275,
      "learning_rate": 1.8660287081339712e-08,
      "loss": 0.5296,
      "step": 2900
    },
    {
      "epoch": 0.8339313908425434,
      "grad_norm": 2.525542516694272,
      "learning_rate": 1.850079744816587e-08,
      "loss": 0.563,
      "step": 2905
    },
    {
      "epoch": 0.8353667288646476,
      "grad_norm": 2.304325422653923,
      "learning_rate": 1.8341307814992024e-08,
      "loss": 0.5491,
      "step": 2910
    },
    {
      "epoch": 0.8368020668867518,
      "grad_norm": 2.5083207145172124,
      "learning_rate": 1.818181818181818e-08,
      "loss": 0.5621,
      "step": 2915
    },
    {
      "epoch": 0.838237404908856,
      "grad_norm": 2.362375542748003,
      "learning_rate": 1.8022328548644336e-08,
      "loss": 0.5552,
      "step": 2920
    },
    {
      "epoch": 0.8396727429309603,
      "grad_norm": 2.5039301590611602,
      "learning_rate": 1.7862838915470493e-08,
      "loss": 0.5574,
      "step": 2925
    },
    {
      "epoch": 0.8411080809530644,
      "grad_norm": 2.381035985901319,
      "learning_rate": 1.770334928229665e-08,
      "loss": 0.5576,
      "step": 2930
    },
    {
      "epoch": 0.8425434189751686,
      "grad_norm": 2.4011187394557454,
      "learning_rate": 1.7543859649122805e-08,
      "loss": 0.5562,
      "step": 2935
    },
    {
      "epoch": 0.8439787569972729,
      "grad_norm": 2.423352651833944,
      "learning_rate": 1.7384370015948963e-08,
      "loss": 0.5467,
      "step": 2940
    },
    {
      "epoch": 0.845414095019377,
      "grad_norm": 2.377148884585651,
      "learning_rate": 1.722488038277512e-08,
      "loss": 0.5559,
      "step": 2945
    },
    {
      "epoch": 0.8468494330414813,
      "grad_norm": 2.625302837686785,
      "learning_rate": 1.7065390749601275e-08,
      "loss": 0.5651,
      "step": 2950
    },
    {
      "epoch": 0.8482847710635855,
      "grad_norm": 2.387352810607353,
      "learning_rate": 1.6905901116427432e-08,
      "loss": 0.5655,
      "step": 2955
    },
    {
      "epoch": 0.8497201090856897,
      "grad_norm": 2.608468244575993,
      "learning_rate": 1.674641148325359e-08,
      "loss": 0.5572,
      "step": 2960
    },
    {
      "epoch": 0.8511554471077939,
      "grad_norm": 2.413659561825893,
      "learning_rate": 1.6586921850079744e-08,
      "loss": 0.554,
      "step": 2965
    },
    {
      "epoch": 0.852590785129898,
      "grad_norm": 2.3750806487783143,
      "learning_rate": 1.6427432216905902e-08,
      "loss": 0.5581,
      "step": 2970
    },
    {
      "epoch": 0.8540261231520023,
      "grad_norm": 2.279688370144133,
      "learning_rate": 1.6267942583732056e-08,
      "loss": 0.5547,
      "step": 2975
    },
    {
      "epoch": 0.8554614611741065,
      "grad_norm": 2.624719049851082,
      "learning_rate": 1.6108452950558214e-08,
      "loss": 0.5533,
      "step": 2980
    },
    {
      "epoch": 0.8568967991962108,
      "grad_norm": 2.377843268250057,
      "learning_rate": 1.594896331738437e-08,
      "loss": 0.5438,
      "step": 2985
    },
    {
      "epoch": 0.8583321372183149,
      "grad_norm": 2.2847075132530525,
      "learning_rate": 1.5789473684210525e-08,
      "loss": 0.5335,
      "step": 2990
    },
    {
      "epoch": 0.8597674752404191,
      "grad_norm": 2.37084129738354,
      "learning_rate": 1.5629984051036683e-08,
      "loss": 0.5663,
      "step": 2995
    },
    {
      "epoch": 0.8612028132625233,
      "grad_norm": 2.4355739071215647,
      "learning_rate": 1.547049441786284e-08,
      "loss": 0.5494,
      "step": 3000
    },
    {
      "epoch": 0.8626381512846275,
      "grad_norm": 2.470577499040234,
      "learning_rate": 1.5311004784688995e-08,
      "loss": 0.5655,
      "step": 3005
    },
    {
      "epoch": 0.8640734893067318,
      "grad_norm": 2.3472749158221506,
      "learning_rate": 1.5151515151515152e-08,
      "loss": 0.5441,
      "step": 3010
    },
    {
      "epoch": 0.8655088273288359,
      "grad_norm": 2.435819416568884,
      "learning_rate": 1.4992025518341307e-08,
      "loss": 0.5422,
      "step": 3015
    },
    {
      "epoch": 0.8669441653509401,
      "grad_norm": 2.4087778144150036,
      "learning_rate": 1.4832535885167463e-08,
      "loss": 0.5541,
      "step": 3020
    },
    {
      "epoch": 0.8683795033730444,
      "grad_norm": 2.5159308315827085,
      "learning_rate": 1.467304625199362e-08,
      "loss": 0.5572,
      "step": 3025
    },
    {
      "epoch": 0.8698148413951485,
      "grad_norm": 2.177063646000523,
      "learning_rate": 1.4513556618819774e-08,
      "loss": 0.5563,
      "step": 3030
    },
    {
      "epoch": 0.8712501794172528,
      "grad_norm": 2.5037716344512058,
      "learning_rate": 1.4354066985645932e-08,
      "loss": 0.5628,
      "step": 3035
    },
    {
      "epoch": 0.872685517439357,
      "grad_norm": 2.402176353413018,
      "learning_rate": 1.419457735247209e-08,
      "loss": 0.556,
      "step": 3040
    },
    {
      "epoch": 0.8741208554614611,
      "grad_norm": 2.550729584909021,
      "learning_rate": 1.4035087719298244e-08,
      "loss": 0.5315,
      "step": 3045
    },
    {
      "epoch": 0.8755561934835654,
      "grad_norm": 3.0055945543123133,
      "learning_rate": 1.3875598086124401e-08,
      "loss": 0.5639,
      "step": 3050
    },
    {
      "epoch": 0.8769915315056696,
      "grad_norm": 2.3539320173290474,
      "learning_rate": 1.3716108452950559e-08,
      "loss": 0.5564,
      "step": 3055
    },
    {
      "epoch": 0.8784268695277738,
      "grad_norm": 2.3311951542487046,
      "learning_rate": 1.3556618819776713e-08,
      "loss": 0.5606,
      "step": 3060
    },
    {
      "epoch": 0.879862207549878,
      "grad_norm": 2.242631561275903,
      "learning_rate": 1.3397129186602871e-08,
      "loss": 0.5525,
      "step": 3065
    },
    {
      "epoch": 0.8812975455719823,
      "grad_norm": 2.339532716408427,
      "learning_rate": 1.3237639553429025e-08,
      "loss": 0.5419,
      "step": 3070
    },
    {
      "epoch": 0.8827328835940864,
      "grad_norm": 2.547870617282417,
      "learning_rate": 1.3078149920255183e-08,
      "loss": 0.5644,
      "step": 3075
    },
    {
      "epoch": 0.8841682216161906,
      "grad_norm": 2.3303852663788907,
      "learning_rate": 1.291866028708134e-08,
      "loss": 0.5586,
      "step": 3080
    },
    {
      "epoch": 0.8856035596382948,
      "grad_norm": 2.4204934502872115,
      "learning_rate": 1.2759170653907495e-08,
      "loss": 0.553,
      "step": 3085
    },
    {
      "epoch": 0.887038897660399,
      "grad_norm": 2.696258891027339,
      "learning_rate": 1.2599681020733652e-08,
      "loss": 0.5762,
      "step": 3090
    },
    {
      "epoch": 0.8884742356825033,
      "grad_norm": 2.6104754887154256,
      "learning_rate": 1.2440191387559808e-08,
      "loss": 0.5502,
      "step": 3095
    },
    {
      "epoch": 0.8899095737046074,
      "grad_norm": 2.2629330882804197,
      "learning_rate": 1.2280701754385964e-08,
      "loss": 0.5545,
      "step": 3100
    },
    {
      "epoch": 0.8913449117267116,
      "grad_norm": 2.3232641254149633,
      "learning_rate": 1.2121212121212122e-08,
      "loss": 0.532,
      "step": 3105
    },
    {
      "epoch": 0.8927802497488159,
      "grad_norm": 2.342474888159451,
      "learning_rate": 1.1961722488038278e-08,
      "loss": 0.5704,
      "step": 3110
    },
    {
      "epoch": 0.89421558777092,
      "grad_norm": 2.424809382809371,
      "learning_rate": 1.1802232854864433e-08,
      "loss": 0.5537,
      "step": 3115
    },
    {
      "epoch": 0.8956509257930243,
      "grad_norm": 2.50275110580096,
      "learning_rate": 1.164274322169059e-08,
      "loss": 0.547,
      "step": 3120
    },
    {
      "epoch": 0.8970862638151285,
      "grad_norm": 2.3447767193393165,
      "learning_rate": 1.1483253588516745e-08,
      "loss": 0.5565,
      "step": 3125
    },
    {
      "epoch": 0.8985216018372326,
      "grad_norm": 2.3166718517750384,
      "learning_rate": 1.1323763955342903e-08,
      "loss": 0.5425,
      "step": 3130
    },
    {
      "epoch": 0.8999569398593369,
      "grad_norm": 2.5132920153594536,
      "learning_rate": 1.1164274322169059e-08,
      "loss": 0.558,
      "step": 3135
    },
    {
      "epoch": 0.901392277881441,
      "grad_norm": 2.5619104750161146,
      "learning_rate": 1.1004784688995215e-08,
      "loss": 0.5563,
      "step": 3140
    },
    {
      "epoch": 0.9016793454858619,
      "eval_loss": 0.6268156170845032,
      "eval_runtime": 276.7226,
      "eval_samples_per_second": 138.135,
      "eval_steps_per_second": 2.161,
      "step": 3141
    },
    {
      "epoch": 0.9016793454858619,
      "step": 3141,
      "total_flos": 7109747952582656.0,
      "train_loss": 0.6063995264169328,
      "train_runtime": 10128.64,
      "train_samples_per_second": 22.011,
      "train_steps_per_second": 0.344
    }
  ],
  "logging_steps": 5,
  "max_steps": 3484,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 349,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 5,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 5
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7109747952582656.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}