|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9997860047078965, |
|
"eval_steps": 117, |
|
"global_step": 1168, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0008559811684142949, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 2.5e-06, |
|
"loss": 1.4683, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0008559811684142949, |
|
"eval_loss": 1.6825672388076782, |
|
"eval_runtime": 22.7989, |
|
"eval_samples_per_second": 17.106, |
|
"eval_steps_per_second": 17.106, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0017119623368285898, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 5e-06, |
|
"loss": 1.6305, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0025679435052428845, |
|
"grad_norm": 3.0, |
|
"learning_rate": 7.5e-06, |
|
"loss": 1.6191, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0034239246736571796, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 1e-05, |
|
"loss": 1.6011, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.004279905842071475, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.25e-05, |
|
"loss": 1.6021, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.005135887010485769, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.5e-05, |
|
"loss": 1.4842, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.005991868178900064, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 1.75e-05, |
|
"loss": 1.718, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.006847849347314359, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 2e-05, |
|
"loss": 1.621, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.007703830515728654, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 2.25e-05, |
|
"loss": 1.648, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.00855981168414295, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.5684, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.009415792852557245, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 2.7500000000000004e-05, |
|
"loss": 1.6588, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.010271774020971538, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5649, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.011127755189385833, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 3.2500000000000004e-05, |
|
"loss": 1.5527, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.011983736357800128, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 3.5e-05, |
|
"loss": 1.5464, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.012839717526214423, |
|
"grad_norm": 2.125, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 1.7606, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.013695698694628718, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 4e-05, |
|
"loss": 1.5089, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.014551679863043013, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 4.25e-05, |
|
"loss": 1.5609, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.015407661031457309, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 4.5e-05, |
|
"loss": 1.688, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.016263642199871604, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 4.75e-05, |
|
"loss": 1.4745, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.0171196233682859, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 5e-05, |
|
"loss": 1.6253, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.017975604536700194, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 4.999990638925292e-05, |
|
"loss": 1.5921, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.01883158570511449, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 4.999962555771271e-05, |
|
"loss": 1.443, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.01968756687352878, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 4.999915750748249e-05, |
|
"loss": 1.609, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.020543548041943076, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 4.999850224206741e-05, |
|
"loss": 1.6203, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.02139952921035737, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 4.999765976637467e-05, |
|
"loss": 1.4801, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.022255510378771666, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 4.999663008671344e-05, |
|
"loss": 1.6311, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.02311149154718596, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 4.9995413210794864e-05, |
|
"loss": 1.586, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.023967472715600256, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 4.999400914773193e-05, |
|
"loss": 1.5281, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.02482345388401455, |
|
"grad_norm": 2.0, |
|
"learning_rate": 4.99924179080395e-05, |
|
"loss": 1.589, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.025679435052428846, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 4.999063950363413e-05, |
|
"loss": 1.6053, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02653541622084314, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 4.998867394783404e-05, |
|
"loss": 1.4413, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.027391397389257437, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 4.9986521255359004e-05, |
|
"loss": 1.5999, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.028247378557671732, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 4.998418144233023e-05, |
|
"loss": 1.6345, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.029103359726086027, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 4.998165452627025e-05, |
|
"loss": 1.665, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.029959340894500322, |
|
"grad_norm": 2.0, |
|
"learning_rate": 4.997894052610279e-05, |
|
"loss": 1.5723, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.030815322062914617, |
|
"grad_norm": 2.0, |
|
"learning_rate": 4.997603946215262e-05, |
|
"loss": 1.4505, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.03167130323132891, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 4.997295135614539e-05, |
|
"loss": 1.5724, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.03252728439974321, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 4.9969676231207494e-05, |
|
"loss": 1.6605, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0333832655681575, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 4.996621411186589e-05, |
|
"loss": 1.5345, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0342392467365718, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 4.99625650240479e-05, |
|
"loss": 1.6665, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.03509522790498609, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 4.995872899508103e-05, |
|
"loss": 1.5707, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.03595120907340039, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 4.995470605369277e-05, |
|
"loss": 1.652, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.03680719024181468, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 4.995049623001036e-05, |
|
"loss": 1.3974, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.03766317141022898, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 4.9946099555560565e-05, |
|
"loss": 1.613, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.03851915257864327, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 4.994151606326949e-05, |
|
"loss": 1.5067, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.03937513374705756, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 4.993674578746225e-05, |
|
"loss": 1.5115, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.040231114915471856, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 4.993178876386278e-05, |
|
"loss": 1.6309, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.04108709608388615, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 4.992664502959351e-05, |
|
"loss": 1.6605, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.04194307725230045, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 4.9921314623175174e-05, |
|
"loss": 1.5052, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.04279905842071474, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 4.991579758452644e-05, |
|
"loss": 1.5388, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04365503958912904, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 4.99100939549636e-05, |
|
"loss": 1.6761, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.04451102075754333, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 4.990420377720038e-05, |
|
"loss": 1.6295, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.04536700192595763, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 4.9898127095347466e-05, |
|
"loss": 1.5579, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.04622298309437192, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 4.989186395491229e-05, |
|
"loss": 1.4967, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.04707896426278622, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 4.9885414402798624e-05, |
|
"loss": 1.4205, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.04793494543120051, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 4.987877848730627e-05, |
|
"loss": 1.6522, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.04879092659961481, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 4.987195625813066e-05, |
|
"loss": 1.5241, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.0496469077680291, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 4.9864947766362505e-05, |
|
"loss": 1.548, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.0505028889364434, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 4.985775306448743e-05, |
|
"loss": 1.5058, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.05135887010485769, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 4.985037220638555e-05, |
|
"loss": 1.6028, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.05221485127327199, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 4.984280524733107e-05, |
|
"loss": 1.5308, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.05307083244168628, |
|
"grad_norm": 2.0, |
|
"learning_rate": 4.9835052243991874e-05, |
|
"loss": 1.5042, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.05392681361010058, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 4.982711325442914e-05, |
|
"loss": 1.5008, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.05478279477851487, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 4.981898833809683e-05, |
|
"loss": 1.6986, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.05563877594692917, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 4.9810677555841314e-05, |
|
"loss": 1.651, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.056494757115343464, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 4.980218096990087e-05, |
|
"loss": 1.5315, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.05735073828375776, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 4.9793498643905236e-05, |
|
"loss": 1.5917, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.058206719452172054, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 4.978463064287513e-05, |
|
"loss": 1.5897, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.05906270062058635, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 4.977557703322178e-05, |
|
"loss": 1.5924, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.059918681789000644, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 4.97663378827464e-05, |
|
"loss": 1.5634, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.06077466295741494, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 4.9756913260639675e-05, |
|
"loss": 1.5397, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.061630644125829234, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 4.974730323748129e-05, |
|
"loss": 1.6735, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.06248662529424353, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 4.9737507885239366e-05, |
|
"loss": 1.4538, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.06334260646265782, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 4.9727527277269915e-05, |
|
"loss": 1.4092, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.06419858763107211, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 4.97173614883163e-05, |
|
"loss": 1.5579, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.06505456879948641, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 4.970701059450872e-05, |
|
"loss": 1.6395, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.0659105499679007, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 4.9696474673363536e-05, |
|
"loss": 1.4457, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.066766531136315, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 4.96857538037828e-05, |
|
"loss": 1.5416, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.0676225123047293, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 4.9674848066053586e-05, |
|
"loss": 1.4792, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.0684784934731436, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 4.966375754184746e-05, |
|
"loss": 1.467, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.06933447464155788, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 4.965248231421977e-05, |
|
"loss": 1.6674, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.07019045580997219, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 4.964102246760914e-05, |
|
"loss": 1.473, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.07104643697838647, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 4.962937808783675e-05, |
|
"loss": 1.61, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.07190241814680078, |
|
"grad_norm": 1.875, |
|
"learning_rate": 4.9617549262105724e-05, |
|
"loss": 1.5847, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.07275839931521506, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 4.9605536079000476e-05, |
|
"loss": 1.7443, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.07361438048362937, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 4.9593338628486055e-05, |
|
"loss": 1.5063, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.07447036165204365, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 4.9580957001907445e-05, |
|
"loss": 1.6636, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.07532634282045796, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 4.9568391291988927e-05, |
|
"loss": 1.6315, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.07618232398887224, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 4.9555641592833334e-05, |
|
"loss": 1.5544, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.07703830515728655, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 4.954270799992138e-05, |
|
"loss": 1.4513, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.07789428632570083, |
|
"grad_norm": 2.25, |
|
"learning_rate": 4.9529590610110914e-05, |
|
"loss": 1.5529, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.07875026749411512, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 4.9516289521636244e-05, |
|
"loss": 1.3935, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.07960624866252942, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 4.9502804834107354e-05, |
|
"loss": 1.5309, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.08046222983094371, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 4.948913664850917e-05, |
|
"loss": 1.5814, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.08131821099935801, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 4.947528506720082e-05, |
|
"loss": 1.5933, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.0821741921677723, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 4.946125019391486e-05, |
|
"loss": 1.4894, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.0830301733361866, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 4.944703213375648e-05, |
|
"loss": 1.5702, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.0838861545046009, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 4.943263099320275e-05, |
|
"loss": 1.6595, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.0847421356730152, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 4.941804688010178e-05, |
|
"loss": 1.5197, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.08559811684142948, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 4.940327990367195e-05, |
|
"loss": 1.6567, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.08645409800984379, |
|
"grad_norm": 2.125, |
|
"learning_rate": 4.938833017450108e-05, |
|
"loss": 1.6511, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.08731007917825807, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 4.937319780454559e-05, |
|
"loss": 1.6058, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.08816606034667238, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 4.9357882907129685e-05, |
|
"loss": 1.5673, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.08902204151508666, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 4.934238559694448e-05, |
|
"loss": 1.5504, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.08987802268350097, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 4.932670599004715e-05, |
|
"loss": 1.5693, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.09073400385191525, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 4.9310844203860084e-05, |
|
"loss": 1.5945, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.09158998502032956, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 4.929480035716997e-05, |
|
"loss": 1.6466, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.09244596618874384, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 4.927857457012692e-05, |
|
"loss": 1.6873, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.09330194735715815, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 4.9262166964243596e-05, |
|
"loss": 1.7084, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.09415792852557243, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 4.924557766239423e-05, |
|
"loss": 1.4966, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.09501390969398674, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 4.92288067888138e-05, |
|
"loss": 1.5114, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.09586989086240102, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 4.921185446909702e-05, |
|
"loss": 1.5532, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.09672587203081533, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 4.919472083019743e-05, |
|
"loss": 1.6787, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.09758185319922962, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 4.917740600042645e-05, |
|
"loss": 1.4609, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.09843783436764392, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 4.915991010945241e-05, |
|
"loss": 1.4925, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.0992938155360582, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 4.914223328829959e-05, |
|
"loss": 1.5845, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.10014979670447251, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 4.912437566934723e-05, |
|
"loss": 1.7777, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.10014979670447251, |
|
"eval_loss": 1.627388596534729, |
|
"eval_runtime": 21.3696, |
|
"eval_samples_per_second": 18.25, |
|
"eval_steps_per_second": 18.25, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.1010057778728868, |
|
"grad_norm": 1.75, |
|
"learning_rate": 4.9106337386328524e-05, |
|
"loss": 1.6118, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.1018617590413011, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 4.908811857432965e-05, |
|
"loss": 1.5514, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.10271774020971539, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 4.9069719369788734e-05, |
|
"loss": 1.5689, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.10357372137812969, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 4.905113991049484e-05, |
|
"loss": 1.564, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.10442970254654398, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 4.903238033558692e-05, |
|
"loss": 1.6917, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.10528568371495826, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 4.901344078555282e-05, |
|
"loss": 1.4474, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.10614166488337257, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 4.899432140222816e-05, |
|
"loss": 1.6063, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.10699764605178685, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 4.8975022328795325e-05, |
|
"loss": 1.5834, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.10785362722020116, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 4.895554370978238e-05, |
|
"loss": 1.6613, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.10870960838861544, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 4.893588569106195e-05, |
|
"loss": 1.6858, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.10956558955702975, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 4.89160484198502e-05, |
|
"loss": 1.6384, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.11042157072544403, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 4.8896032044705655e-05, |
|
"loss": 1.5923, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.11127755189385834, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 4.887583671552816e-05, |
|
"loss": 1.5658, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.11213353306227263, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 4.885546258355769e-05, |
|
"loss": 1.4684, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.11298951423068693, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 4.8834909801373264e-05, |
|
"loss": 1.5512, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.11384549539910122, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 4.881417852289179e-05, |
|
"loss": 1.5687, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.11470147656751552, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 4.8793268903366905e-05, |
|
"loss": 1.6813, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.1155574577359298, |
|
"grad_norm": 1.75, |
|
"learning_rate": 4.877218109938781e-05, |
|
"loss": 1.4457, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.11641343890434411, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 4.875091526887813e-05, |
|
"loss": 1.6283, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.1172694200727584, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 4.872947157109467e-05, |
|
"loss": 1.5411, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.1181254012411727, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 4.8707850166626266e-05, |
|
"loss": 1.5107, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.11898138240958699, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 4.8686051217392606e-05, |
|
"loss": 1.404, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.11983736357800129, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 4.866407488664296e-05, |
|
"loss": 1.5754, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.12069334474641558, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 4.864192133895498e-05, |
|
"loss": 1.5735, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.12154932591482988, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 4.861959074023348e-05, |
|
"loss": 1.5884, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.12240530708324417, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 4.8597083257709194e-05, |
|
"loss": 1.5551, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.12326128825165847, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 4.857439905993748e-05, |
|
"loss": 1.4693, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.12411726942007276, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 4.855153831679713e-05, |
|
"loss": 1.6085, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.12497325058848706, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 4.852850119948904e-05, |
|
"loss": 1.4771, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.12582923175690136, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 4.850528788053495e-05, |
|
"loss": 1.4144, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.12668521292531565, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 4.848189853377615e-05, |
|
"loss": 1.3908, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.12754119409372994, |
|
"grad_norm": 2.0, |
|
"learning_rate": 4.8458333334372185e-05, |
|
"loss": 1.6438, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.12839717526214423, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 4.843459245879951e-05, |
|
"loss": 1.5459, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.12925315643055854, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 4.841067608485024e-05, |
|
"loss": 1.4941, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.13010913759897283, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 4.8386584391630716e-05, |
|
"loss": 1.3663, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.13096511876738712, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 4.8362317559560274e-05, |
|
"loss": 1.4986, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.1318210999358014, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 4.833787577036981e-05, |
|
"loss": 1.4611, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.1326770811042157, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 4.831325920710045e-05, |
|
"loss": 1.6472, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.13353306227263, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 4.8288468054102186e-05, |
|
"loss": 1.5, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.1343890434410443, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 4.8263502497032484e-05, |
|
"loss": 1.4545, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.1352450246094586, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 4.823836272285491e-05, |
|
"loss": 1.5297, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.13610100577787287, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 4.82130489198377e-05, |
|
"loss": 1.5259, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.1369569869462872, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 4.8187561277552374e-05, |
|
"loss": 1.554, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.13781296811470148, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 4.816189998687231e-05, |
|
"loss": 1.6408, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.13866894928311577, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 4.813606523997132e-05, |
|
"loss": 1.5234, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.13952493045153005, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 4.811005723032219e-05, |
|
"loss": 1.4525, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.14038091161994437, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 4.808387615269528e-05, |
|
"loss": 1.5951, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.14123689278835866, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 4.805752220315699e-05, |
|
"loss": 1.3059, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.14209287395677295, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 4.8030995579068356e-05, |
|
"loss": 1.5359, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.14294885512518724, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 4.800429647908354e-05, |
|
"loss": 1.5332, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.14380483629360155, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 4.797742510314838e-05, |
|
"loss": 1.5602, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.14466081746201584, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 4.7950381652498816e-05, |
|
"loss": 1.5634, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.14551679863043013, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 4.7923166329659466e-05, |
|
"loss": 1.5805, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.14637277979884442, |
|
"grad_norm": 1.875, |
|
"learning_rate": 4.7895779338442076e-05, |
|
"loss": 1.5187, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.14722876096725873, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 4.786822088394397e-05, |
|
"loss": 1.664, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.14808474213567302, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 4.784049117254656e-05, |
|
"loss": 1.6186, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.1489407233040873, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 4.781259041191375e-05, |
|
"loss": 1.4065, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.1497967044725016, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 4.778451881099044e-05, |
|
"loss": 1.652, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.1506526856409159, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 4.775627658000091e-05, |
|
"loss": 1.4527, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.1515086668093302, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 4.772786393044726e-05, |
|
"loss": 1.4748, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.1523646479777445, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 4.7699281075107835e-05, |
|
"loss": 1.6003, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.15322062914615878, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 4.767052822803565e-05, |
|
"loss": 1.6305, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.1540766103145731, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 4.764160560455673e-05, |
|
"loss": 1.3937, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.15493259148298738, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 4.7612513421268544e-05, |
|
"loss": 1.4548, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.15578857265140167, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 4.7583251896038386e-05, |
|
"loss": 1.4323, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.15664455381981596, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 4.7553821248001695e-05, |
|
"loss": 1.4816, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.15750053498823025, |
|
"grad_norm": 1.75, |
|
"learning_rate": 4.752422169756048e-05, |
|
"loss": 1.4022, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.15835651615664456, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 4.749445346638163e-05, |
|
"loss": 1.5193, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.15921249732505885, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 4.7464516777395234e-05, |
|
"loss": 1.589, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.16006847849347314, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 4.743441185479297e-05, |
|
"loss": 1.4739, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.16092445966188743, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 4.740413892402639e-05, |
|
"loss": 1.4312, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.16178044083030174, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 4.7373698211805215e-05, |
|
"loss": 1.677, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.16263642199871603, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 4.7343089946095674e-05, |
|
"loss": 1.6992, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.16349240316713032, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 4.7312314356118776e-05, |
|
"loss": 1.5619, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.1643483843355446, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 4.7281371672348595e-05, |
|
"loss": 1.6068, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.16520436550395892, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 4.725026212651056e-05, |
|
"loss": 1.6795, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.1660603466723732, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 4.7218985951579685e-05, |
|
"loss": 1.6281, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.1669163278407875, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 4.7187543381778864e-05, |
|
"loss": 1.4485, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.1677723090092018, |
|
"grad_norm": 1.875, |
|
"learning_rate": 4.715593465257709e-05, |
|
"loss": 1.5356, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.1686282901776161, |
|
"grad_norm": 2.0, |
|
"learning_rate": 4.712416000068771e-05, |
|
"loss": 1.6105, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.1694842713460304, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 4.7092219664066636e-05, |
|
"loss": 1.7753, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.17034025251444468, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 4.706011388191057e-05, |
|
"loss": 1.6989, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.17119623368285897, |
|
"grad_norm": 1.625, |
|
"learning_rate": 4.7027842894655205e-05, |
|
"loss": 1.5058, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.17205221485127328, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 4.699540694397343e-05, |
|
"loss": 1.6399, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.17290819601968757, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 4.6962806272773564e-05, |
|
"loss": 1.491, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.17376417718810186, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 4.693004112519743e-05, |
|
"loss": 1.5155, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.17462015835651615, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 4.689711174661864e-05, |
|
"loss": 1.4796, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.17547613952493046, |
|
"grad_norm": 2.125, |
|
"learning_rate": 4.686401838364068e-05, |
|
"loss": 1.5699, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.17633212069334475, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 4.683076128409512e-05, |
|
"loss": 1.5628, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.17718810186175904, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 4.6797340697039705e-05, |
|
"loss": 1.5281, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.17804408303017333, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 4.6763756872756525e-05, |
|
"loss": 1.5223, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.17890006419858764, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 4.6730010062750134e-05, |
|
"loss": 1.5561, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.17975604536700193, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 4.669610051974566e-05, |
|
"loss": 1.3003, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.18061202653541622, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 4.6662028497686905e-05, |
|
"loss": 1.5831, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.1814680077038305, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 4.662779425173448e-05, |
|
"loss": 1.4068, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.18232398887224482, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 4.659339803826384e-05, |
|
"loss": 1.2956, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.1831799700406591, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 4.655884011486341e-05, |
|
"loss": 1.4742, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.1840359512090734, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 4.652412074033263e-05, |
|
"loss": 1.4319, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.1848919323774877, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 4.648924017468003e-05, |
|
"loss": 1.4521, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.18574791354590198, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 4.645419867912128e-05, |
|
"loss": 1.5488, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.1866038947143163, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 4.6418996516077205e-05, |
|
"loss": 1.6545, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.18745987588273058, |
|
"grad_norm": 1.75, |
|
"learning_rate": 4.6383633949171884e-05, |
|
"loss": 1.5419, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.18831585705114487, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 4.634811124323062e-05, |
|
"loss": 1.4832, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.18917183821955916, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 4.6312428664277976e-05, |
|
"loss": 1.6318, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.19002781938797347, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 4.627658647953579e-05, |
|
"loss": 1.4994, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.19088380055638776, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 4.624058495742114e-05, |
|
"loss": 1.5991, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.19173978172480205, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 4.620442436754438e-05, |
|
"loss": 1.4461, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.19259576289321634, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 4.6168104980707107e-05, |
|
"loss": 1.5396, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.19345174406163065, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 4.613162706890011e-05, |
|
"loss": 1.4974, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.19430772523004494, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 4.609499090530136e-05, |
|
"loss": 1.6796, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.19516370639845923, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 4.605819676427393e-05, |
|
"loss": 1.4685, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.19601968756687352, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 4.602124492136401e-05, |
|
"loss": 1.5252, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.19687566873528783, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 4.598413565329875e-05, |
|
"loss": 1.5882, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.19773164990370212, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 4.594686923798426e-05, |
|
"loss": 1.5452, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.1985876310721164, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 4.5909445954503506e-05, |
|
"loss": 1.5358, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.1994436122405307, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 4.5871866083114204e-05, |
|
"loss": 1.6252, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.20029959340894501, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 4.5834129905246725e-05, |
|
"loss": 1.4701, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.20029959340894501, |
|
"eval_loss": 1.6031174659729004, |
|
"eval_runtime": 21.3555, |
|
"eval_samples_per_second": 18.262, |
|
"eval_steps_per_second": 18.262, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.2011555745773593, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 4.5796237703502044e-05, |
|
"loss": 1.6016, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.2020115557457736, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 4.5758189761649514e-05, |
|
"loss": 1.5205, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.20286753691418788, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 4.5719986364624866e-05, |
|
"loss": 1.4364, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.2037235180826022, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 4.5681627798527965e-05, |
|
"loss": 1.254, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.20457949925101648, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 4.564311435062074e-05, |
|
"loss": 1.5015, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.20543548041943077, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 4.5604446309324986e-05, |
|
"loss": 1.3402, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.20629146158784506, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 4.5565623964220266e-05, |
|
"loss": 1.436, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.20714744275625938, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 4.5526647606041666e-05, |
|
"loss": 1.6074, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.20800342392467366, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 4.548751752667767e-05, |
|
"loss": 1.5374, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.20885940509308795, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 4.5448234019167945e-05, |
|
"loss": 1.4411, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.20971538626150224, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 4.5408797377701176e-05, |
|
"loss": 1.4943, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.21057136742991653, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 4.5369207897612854e-05, |
|
"loss": 1.567, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.21142734859833084, |
|
"grad_norm": 1.75, |
|
"learning_rate": 4.532946587538302e-05, |
|
"loss": 1.587, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.21228332976674513, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 4.5289571608634116e-05, |
|
"loss": 1.4585, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.21313931093515942, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 4.524952539612872e-05, |
|
"loss": 1.5406, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.2139952921035737, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 4.5209327537767295e-05, |
|
"loss": 1.4958, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.21485127327198802, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 4.5168978334585956e-05, |
|
"loss": 1.6202, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.2157072544404023, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 4.512847808875424e-05, |
|
"loss": 1.5408, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.2165632356088166, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 4.5087827103572796e-05, |
|
"loss": 1.6394, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.2174192167772309, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 4.504702568347117e-05, |
|
"loss": 1.3343, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.2182751979456452, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 4.500607413400546e-05, |
|
"loss": 1.5471, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.2191311791140595, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 4.4964972761856084e-05, |
|
"loss": 1.4912, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.21998716028247378, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 4.492372187482545e-05, |
|
"loss": 1.2951, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.22084314145088807, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 4.488232178183567e-05, |
|
"loss": 1.4651, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.22169912261930239, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 4.484077279292622e-05, |
|
"loss": 1.3435, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.22255510378771667, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 4.479907521925168e-05, |
|
"loss": 1.5813, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.22341108495613096, |
|
"grad_norm": 1.875, |
|
"learning_rate": 4.4757229373079306e-05, |
|
"loss": 1.3951, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.22426706612454525, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 4.471523556778679e-05, |
|
"loss": 1.4809, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.22512304729295957, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 4.467309411785984e-05, |
|
"loss": 1.4175, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.22597902846137385, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 4.4630805338889866e-05, |
|
"loss": 1.587, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.22683500962978814, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 4.458836954757161e-05, |
|
"loss": 1.3758, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.22769099079820243, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 4.454578706170075e-05, |
|
"loss": 1.4746, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.22854697196661675, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 4.450305820017156e-05, |
|
"loss": 1.5459, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.22940295313503103, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 4.446018328297449e-05, |
|
"loss": 1.361, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.23025893430344532, |
|
"grad_norm": 1.875, |
|
"learning_rate": 4.441716263119379e-05, |
|
"loss": 1.6767, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.2311149154718596, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 4.437399656700507e-05, |
|
"loss": 1.4742, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.23197089664027393, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 4.433068541367295e-05, |
|
"loss": 1.5136, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.23282687780868822, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 4.428722949554857e-05, |
|
"loss": 1.4492, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.2336828589771025, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 4.424362913806722e-05, |
|
"loss": 1.4585, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.2345388401455168, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 4.419988466774586e-05, |
|
"loss": 1.3074, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.23539482131393108, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 4.415599641218068e-05, |
|
"loss": 1.2787, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.2362508024823454, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 4.4111964700044686e-05, |
|
"loss": 1.489, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.23710678365075968, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 4.4067789861085185e-05, |
|
"loss": 1.4373, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.23796276481917397, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 4.402347222612137e-05, |
|
"loss": 1.4773, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.23881874598758826, |
|
"grad_norm": 2.0, |
|
"learning_rate": 4.397901212704176e-05, |
|
"loss": 1.4799, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.23967472715600258, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 4.393440989680184e-05, |
|
"loss": 1.4964, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.24053070832441686, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 4.3889665869421436e-05, |
|
"loss": 1.3405, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.24138668949283115, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 4.3844780379982296e-05, |
|
"loss": 1.4144, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.24224267066124544, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 4.3799753764625564e-05, |
|
"loss": 1.4202, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.24309865182965976, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 4.375458636054924e-05, |
|
"loss": 1.6295, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.24395463299807404, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 4.370927850600569e-05, |
|
"loss": 1.5213, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.24481061416648833, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 4.366383054029906e-05, |
|
"loss": 1.4423, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.24566659533490262, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 4.3618242803782825e-05, |
|
"loss": 1.6341, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.24652257650331694, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 4.357251563785713e-05, |
|
"loss": 1.5936, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.24737855767173123, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 4.352664938496631e-05, |
|
"loss": 1.5026, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.2482345388401455, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 4.348064438859629e-05, |
|
"loss": 1.6062, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.2490905200085598, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 4.3434500993272066e-05, |
|
"loss": 1.5012, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.24994650117697412, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 4.338821954455503e-05, |
|
"loss": 1.5942, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.2508024823453884, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 4.334180038904046e-05, |
|
"loss": 1.5837, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.2516584635138027, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 4.3295243874354926e-05, |
|
"loss": 1.6746, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.252514444682217, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 4.3248550349153616e-05, |
|
"loss": 1.467, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.2533704258506313, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 4.3201720163117795e-05, |
|
"loss": 1.497, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.2542264070190456, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 4.315475366695217e-05, |
|
"loss": 1.2926, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.2550823881874599, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 4.3107651212382236e-05, |
|
"loss": 1.6157, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.25593836935587416, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 4.306041315215167e-05, |
|
"loss": 1.538, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.25679435052428845, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 4.301303984001967e-05, |
|
"loss": 1.4402, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.25765033169270274, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 4.296553163075836e-05, |
|
"loss": 1.6127, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.2585063128611171, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 4.291788888015002e-05, |
|
"loss": 1.5769, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.25936229402953137, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 4.287011194498456e-05, |
|
"loss": 1.2251, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.26021827519794566, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 4.282220118305672e-05, |
|
"loss": 1.4914, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.26107425636635995, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 4.277415695316349e-05, |
|
"loss": 1.5531, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.26193023753477424, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 4.272597961510137e-05, |
|
"loss": 1.3468, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.2627862187031885, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 4.267766952966369e-05, |
|
"loss": 1.5566, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.2636421998716028, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 4.2629227058637904e-05, |
|
"loss": 1.4052, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.2644981810400171, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 4.258065256480288e-05, |
|
"loss": 1.4669, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.2653541622084314, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 4.253194641192621e-05, |
|
"loss": 1.3902, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.26621014337684573, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 4.24831089647614e-05, |
|
"loss": 1.4913, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.26706612454526, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 4.243414058904528e-05, |
|
"loss": 1.4332, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.2679221057136743, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 4.238504165149515e-05, |
|
"loss": 1.3904, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.2687780868820886, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 4.233581251980604e-05, |
|
"loss": 1.5778, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.2696340680505029, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 4.2286453562648046e-05, |
|
"loss": 1.6316, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.2704900492189172, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 4.223696514966346e-05, |
|
"loss": 1.5792, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.27134603038733146, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 4.2187347651464055e-05, |
|
"loss": 1.4227, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.27220201155574575, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 4.213760143962834e-05, |
|
"loss": 1.3087, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.2730579927241601, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 4.20877268866987e-05, |
|
"loss": 1.5096, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.2739139738925744, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 4.203772436617868e-05, |
|
"loss": 1.3995, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.27476995506098867, |
|
"grad_norm": 1.875, |
|
"learning_rate": 4.198759425253014e-05, |
|
"loss": 1.6112, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.27562593622940296, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 4.1937336921170476e-05, |
|
"loss": 1.6356, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.27648191739781725, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 4.188695274846979e-05, |
|
"loss": 1.3759, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.27733789856623153, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 4.183644211174809e-05, |
|
"loss": 1.4551, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.2781938797346458, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 4.1785805389272445e-05, |
|
"loss": 1.4036, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.2790498609030601, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 4.173504296025417e-05, |
|
"loss": 1.3411, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.27990584207147445, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 4.1684155204845974e-05, |
|
"loss": 1.5365, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.28076182323988874, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 4.163314250413913e-05, |
|
"loss": 1.6192, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.28161780440830303, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 4.15820052401606e-05, |
|
"loss": 1.6317, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.2824737855767173, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 4.153074379587018e-05, |
|
"loss": 1.5873, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.2833297667451316, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 4.147935855515763e-05, |
|
"loss": 1.4148, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.2841857479135459, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 4.142784990283982e-05, |
|
"loss": 1.5794, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.2850417290819602, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 4.1376218224657825e-05, |
|
"loss": 1.4822, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.28589771025037447, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 4.132446390727404e-05, |
|
"loss": 1.4558, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.2867536914187888, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 4.127258733826929e-05, |
|
"loss": 1.61, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.2876096725872031, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 4.122058890613991e-05, |
|
"loss": 1.3766, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.2884656537556174, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 4.1168469000294895e-05, |
|
"loss": 1.4012, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.2893216349240317, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 4.11162280110529e-05, |
|
"loss": 1.5115, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.29017761609244597, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 4.106386632963936e-05, |
|
"loss": 1.5486, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.29103359726086026, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 4.101138434818357e-05, |
|
"loss": 1.4817, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.29188957842927454, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 4.095878245971573e-05, |
|
"loss": 1.5482, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.29274555959768883, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 4.0906061058163995e-05, |
|
"loss": 1.48, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.2936015407661031, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 4.085322053835157e-05, |
|
"loss": 1.4816, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.29445752193451746, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 4.080026129599368e-05, |
|
"loss": 1.4987, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.29531350310293175, |
|
"grad_norm": 1.625, |
|
"learning_rate": 4.0747183727694674e-05, |
|
"loss": 1.6119, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.29616948427134604, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 4.0693988230945004e-05, |
|
"loss": 1.5121, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.29702546543976033, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 4.064067520411831e-05, |
|
"loss": 1.5578, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.2978814466081746, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 4.058724504646834e-05, |
|
"loss": 1.4593, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.2987374277765889, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 4.0533698158126085e-05, |
|
"loss": 1.3536, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.2995934089450032, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 4.048003494009666e-05, |
|
"loss": 1.4781, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3004493901134175, |
|
"grad_norm": 1.625, |
|
"learning_rate": 4.042625579425639e-05, |
|
"loss": 1.6591, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.3004493901134175, |
|
"eval_loss": 1.5815147161483765, |
|
"eval_runtime": 21.3462, |
|
"eval_samples_per_second": 18.27, |
|
"eval_steps_per_second": 18.27, |
|
"step": 351 |
|
}, |
|
{
"epoch": 0.3013053712818318,
"grad_norm": 1.5390625,
"learning_rate": 4.0372361123349756e-05,
"loss": 1.3439,
"step": 352
},
{
"epoch": 0.3021613524502461,
"grad_norm": 1.7890625,
"learning_rate": 4.031835133098639e-05,
"loss": 1.5028,
"step": 353
},
{
"epoch": 0.3030173336186604,
"grad_norm": 1.8515625,
"learning_rate": 4.026422682163804e-05,
"loss": 1.5099,
"step": 354
},
{
"epoch": 0.3038733147870747,
"grad_norm": 1.671875,
"learning_rate": 4.020998800063559e-05,
"loss": 1.4798,
"step": 355
},
{
"epoch": 0.304729295955489,
"grad_norm": 1.8125,
"learning_rate": 4.015563527416595e-05,
"loss": 1.4064,
"step": 356
},
{
"epoch": 0.30558527712390327,
"grad_norm": 1.9375,
"learning_rate": 4.010116904926907e-05,
"loss": 1.5338,
"step": 357
},
{
"epoch": 0.30644125829231755,
"grad_norm": 1.8046875,
"learning_rate": 4.0046589733834875e-05,
"loss": 1.5153,
"step": 358
},
{
"epoch": 0.30729723946073184,
"grad_norm": 1.8828125,
"learning_rate": 3.9991897736600184e-05,
"loss": 1.4596,
"step": 359
},
{
"epoch": 0.3081532206291462,
"grad_norm": 1.625,
"learning_rate": 3.9937093467145726e-05,
"loss": 1.5873,
"step": 360
},
{
"epoch": 0.3090092017975605,
"grad_norm": 1.65625,
"learning_rate": 3.988217733589296e-05,
"loss": 1.5941,
"step": 361
},
{
"epoch": 0.30986518296597476,
"grad_norm": 1.8203125,
"learning_rate": 3.982714975410111e-05,
"loss": 1.4578,
"step": 362
},
{
"epoch": 0.31072116413438905,
"grad_norm": 1.5234375,
"learning_rate": 3.977201113386402e-05,
"loss": 1.4387,
"step": 363
},
{
"epoch": 0.31157714530280334,
"grad_norm": 1.546875,
"learning_rate": 3.971676188810707e-05,
"loss": 1.496,
"step": 364
},
{
"epoch": 0.3124331264712176,
"grad_norm": 1.609375,
"learning_rate": 3.966140243058413e-05,
"loss": 1.3948,
"step": 365
},
{
"epoch": 0.3132891076396319,
"grad_norm": 1.7578125,
"learning_rate": 3.96059331758744e-05,
"loss": 1.5101,
"step": 366
},
{
"epoch": 0.3141450888080462,
"grad_norm": 1.8125,
"learning_rate": 3.955035453937935e-05,
"loss": 1.5071,
"step": 367
},
{
"epoch": 0.3150010699764605,
"grad_norm": 1.6171875,
"learning_rate": 3.949466693731962e-05,
"loss": 1.4645,
"step": 368
},
{
"epoch": 0.31585705114487483,
"grad_norm": 1.703125,
"learning_rate": 3.9438870786731815e-05,
"loss": 1.522,
"step": 369
},
{
"epoch": 0.3167130323132891,
"grad_norm": 1.5859375,
"learning_rate": 3.938296650546552e-05,
"loss": 1.4065,
"step": 370
},
{
"epoch": 0.3175690134817034,
"grad_norm": 1.6875,
"learning_rate": 3.9326954512180026e-05,
"loss": 1.4124,
"step": 371
},
{
"epoch": 0.3184249946501177,
"grad_norm": 1.703125,
"learning_rate": 3.927083522634132e-05,
"loss": 1.4137,
"step": 372
},
{
"epoch": 0.319280975818532,
"grad_norm": 1.828125,
"learning_rate": 3.9214609068218834e-05,
"loss": 1.482,
"step": 373
},
{
"epoch": 0.3201369569869463,
"grad_norm": 2.03125,
"learning_rate": 3.915827645888241e-05,
"loss": 1.3655,
"step": 374
},
{
"epoch": 0.32099293815536056,
"grad_norm": 1.6875,
"learning_rate": 3.910183782019905e-05,
"loss": 1.3776,
"step": 375
},
{
"epoch": 0.32184891932377485,
"grad_norm": 1.7421875,
"learning_rate": 3.9045293574829814e-05,
"loss": 1.5067,
"step": 376
},
{
"epoch": 0.3227049004921892,
"grad_norm": 1.8125,
"learning_rate": 3.8988644146226606e-05,
"loss": 1.4391,
"step": 377
},
{
"epoch": 0.3235608816606035,
"grad_norm": 1.8671875,
"learning_rate": 3.8931889958629066e-05,
"loss": 1.4054,
"step": 378
},
{
"epoch": 0.32441686282901777,
"grad_norm": 1.734375,
"learning_rate": 3.887503143706134e-05,
"loss": 1.721,
"step": 379
},
{
"epoch": 0.32527284399743206,
"grad_norm": 1.7578125,
"learning_rate": 3.881806900732893e-05,
"loss": 1.5304,
"step": 380
},
{
"epoch": 0.32612882516584635,
"grad_norm": 1.671875,
"learning_rate": 3.8761003096015466e-05,
"loss": 1.4313,
"step": 381
},
{
"epoch": 0.32698480633426064,
"grad_norm": 1.828125,
"learning_rate": 3.870383413047959e-05,
"loss": 1.4311,
"step": 382
},
{
"epoch": 0.3278407875026749,
"grad_norm": 1.828125,
"learning_rate": 3.864656253885163e-05,
"loss": 1.5491,
"step": 383
},
{
"epoch": 0.3286967686710892,
"grad_norm": 1.765625,
"learning_rate": 3.858918875003053e-05,
"loss": 1.5921,
"step": 384
},
{
"epoch": 0.32955274983950356,
"grad_norm": 1.8203125,
"learning_rate": 3.853171319368054e-05,
"loss": 1.3189,
"step": 385
},
{
"epoch": 0.33040873100791784,
"grad_norm": 1.578125,
"learning_rate": 3.847413630022804e-05,
"loss": 1.5709,
"step": 386
},
{
"epoch": 0.33126471217633213,
"grad_norm": 1.7578125,
"learning_rate": 3.841645850085831e-05,
"loss": 1.5226,
"step": 387
},
{
"epoch": 0.3321206933447464,
"grad_norm": 1.6171875,
"learning_rate": 3.835868022751231e-05,
"loss": 1.6103,
"step": 388
},
{
"epoch": 0.3329766745131607,
"grad_norm": 1.71875,
"learning_rate": 3.830080191288342e-05,
"loss": 1.4644,
"step": 389
},
{
"epoch": 0.333832655681575,
"grad_norm": 1.6015625,
"learning_rate": 3.8242823990414214e-05,
"loss": 1.5841,
"step": 390
},
{
"epoch": 0.3346886368499893,
"grad_norm": 1.7421875,
"learning_rate": 3.818474689429323e-05,
"loss": 1.4086,
"step": 391
},
{
"epoch": 0.3355446180184036,
"grad_norm": 1.8046875,
"learning_rate": 3.812657105945171e-05,
"loss": 1.4696,
"step": 392
},
{
"epoch": 0.3364005991868179,
"grad_norm": 1.5703125,
"learning_rate": 3.806829692156031e-05,
"loss": 1.3922,
"step": 393
},
{
"epoch": 0.3372565803552322,
"grad_norm": 1.6796875,
"learning_rate": 3.8009924917025864e-05,
"loss": 1.4289,
"step": 394
},
{
"epoch": 0.3381125615236465,
"grad_norm": 1.8046875,
"learning_rate": 3.795145548298815e-05,
"loss": 1.435,
"step": 395
},
{
"epoch": 0.3389685426920608,
"grad_norm": 1.625,
"learning_rate": 3.789288905731655e-05,
"loss": 1.4943,
"step": 396
},
{
"epoch": 0.33982452386047507,
"grad_norm": 1.75,
"learning_rate": 3.783422607860681e-05,
"loss": 1.5017,
"step": 397
},
{
"epoch": 0.34068050502888936,
"grad_norm": 4.375,
"learning_rate": 3.777546698617776e-05,
"loss": 1.5723,
"step": 398
},
{
"epoch": 0.34153648619730365,
"grad_norm": 1.609375,
"learning_rate": 3.7716612220068006e-05,
"loss": 1.4734,
"step": 399
},
{
"epoch": 0.34239246736571793,
"grad_norm": 1.6796875,
"learning_rate": 3.765766222103262e-05,
"loss": 1.5986,
"step": 400
},
|
{
"epoch": 0.3432484485341322,
"grad_norm": 1.7421875,
"learning_rate": 3.7598617430539884e-05,
"loss": 1.4154,
"step": 401
},
{
"epoch": 0.34410442970254657,
"grad_norm": 1.8046875,
"learning_rate": 3.753947829076797e-05,
"loss": 1.6668,
"step": 402
},
{
"epoch": 0.34496041087096085,
"grad_norm": 1.5703125,
"learning_rate": 3.7480245244601584e-05,
"loss": 1.4141,
"step": 403
},
{
"epoch": 0.34581639203937514,
"grad_norm": 1.484375,
"learning_rate": 3.742091873562871e-05,
"loss": 1.3079,
"step": 404
},
{
"epoch": 0.34667237320778943,
"grad_norm": 1.4609375,
"learning_rate": 3.7361499208137254e-05,
"loss": 1.5055,
"step": 405
},
{
"epoch": 0.3475283543762037,
"grad_norm": 1.6875,
"learning_rate": 3.730198710711173e-05,
"loss": 1.457,
"step": 406
},
{
"epoch": 0.348384335544618,
"grad_norm": 1.75,
"learning_rate": 3.724238287822991e-05,
"loss": 1.4187,
"step": 407
},
{
"epoch": 0.3492403167130323,
"grad_norm": 1.640625,
"learning_rate": 3.71826869678595e-05,
"loss": 1.4398,
"step": 408
},
{
"epoch": 0.3500962978814466,
"grad_norm": 1.8125,
"learning_rate": 3.7122899823054814e-05,
"loss": 1.4736,
"step": 409
},
{
"epoch": 0.3509522790498609,
"grad_norm": 2.03125,
"learning_rate": 3.706302189155338e-05,
"loss": 1.4837,
"step": 410
},
{
"epoch": 0.3518082602182752,
"grad_norm": 1.6171875,
"learning_rate": 3.7003053621772656e-05,
"loss": 1.4027,
"step": 411
},
{
"epoch": 0.3526642413866895,
"grad_norm": 1.796875,
"learning_rate": 3.694299546280657e-05,
"loss": 1.6534,
"step": 412
},
{
"epoch": 0.3535202225551038,
"grad_norm": 1.703125,
"learning_rate": 3.688284786442229e-05,
"loss": 1.5668,
"step": 413
},
{
"epoch": 0.3543762037235181,
"grad_norm": 1.65625,
"learning_rate": 3.682261127705671e-05,
"loss": 1.3467,
"step": 414
},
{
"epoch": 0.35523218489193237,
"grad_norm": 1.828125,
"learning_rate": 3.676228615181321e-05,
"loss": 1.4635,
"step": 415
},
{
"epoch": 0.35608816606034666,
"grad_norm": 1.5625,
"learning_rate": 3.6701872940458186e-05,
"loss": 1.3886,
"step": 416
},
{
"epoch": 0.35694414722876094,
"grad_norm": 1.609375,
"learning_rate": 3.66413720954177e-05,
"loss": 1.5161,
"step": 417
},
{
"epoch": 0.3578001283971753,
"grad_norm": 1.625,
"learning_rate": 3.6580784069774105e-05,
"loss": 1.5301,
"step": 418
},
{
"epoch": 0.3586561095655896,
"grad_norm": 1.7421875,
"learning_rate": 3.652010931726262e-05,
"loss": 1.3991,
"step": 419
},
{
"epoch": 0.35951209073400386,
"grad_norm": 1.65625,
"learning_rate": 3.645934829226797e-05,
"loss": 1.4226,
"step": 420
},
{
"epoch": 0.36036807190241815,
"grad_norm": 1.6796875,
"learning_rate": 3.6398501449820936e-05,
"loss": 1.5157,
"step": 421
},
{
"epoch": 0.36122405307083244,
"grad_norm": 1.6328125,
"learning_rate": 3.6337569245595005e-05,
"loss": 1.5619,
"step": 422
},
{
"epoch": 0.36208003423924673,
"grad_norm": 1.6015625,
"learning_rate": 3.62765521359029e-05,
"loss": 1.4751,
"step": 423
},
{
"epoch": 0.362936015407661,
"grad_norm": 3.21875,
"learning_rate": 3.6215450577693196e-05,
"loss": 1.4708,
"step": 424
},
{
"epoch": 0.3637919965760753,
"grad_norm": 1.671875,
"learning_rate": 3.615426502854689e-05,
"loss": 1.4924,
"step": 425
},
{
"epoch": 0.36464797774448965,
"grad_norm": 1.6015625,
"learning_rate": 3.6092995946673994e-05,
"loss": 1.5001,
"step": 426
},
{
"epoch": 0.36550395891290394,
"grad_norm": 1.484375,
"learning_rate": 3.603164379091006e-05,
"loss": 1.3498,
"step": 427
},
{
"epoch": 0.3663599400813182,
"grad_norm": 1.5703125,
"learning_rate": 3.597020902071278e-05,
"loss": 1.378,
"step": 428
},
{
"epoch": 0.3672159212497325,
"grad_norm": 2.015625,
"learning_rate": 3.590869209615854e-05,
"loss": 1.7722,
"step": 429
},
{
"epoch": 0.3680719024181468,
"grad_norm": 1.7421875,
"learning_rate": 3.5847093477938956e-05,
"loss": 1.5215,
"step": 430
},
{
"epoch": 0.3689278835865611,
"grad_norm": 1.6875,
"learning_rate": 3.578541362735744e-05,
"loss": 1.5693,
"step": 431
},
{
"epoch": 0.3697838647549754,
"grad_norm": 1.5703125,
"learning_rate": 3.572365300632574e-05,
"loss": 1.5959,
"step": 432
},
{
"epoch": 0.37063984592338967,
"grad_norm": 1.7734375,
"learning_rate": 3.56618120773605e-05,
"loss": 1.6924,
"step": 433
},
{
"epoch": 0.37149582709180395,
"grad_norm": 1.703125,
"learning_rate": 3.5599891303579746e-05,
"loss": 1.6631,
"step": 434
},
{
"epoch": 0.3723518082602183,
"grad_norm": 1.828125,
"learning_rate": 3.553789114869947e-05,
"loss": 1.4271,
"step": 435
},
{
"epoch": 0.3732077894286326,
"grad_norm": 1.5546875,
"learning_rate": 3.547581207703017e-05,
"loss": 1.4559,
"step": 436
},
{
"epoch": 0.3740637705970469,
"grad_norm": 1.6875,
"learning_rate": 3.541365455347327e-05,
"loss": 1.3832,
"step": 437
},
{
"epoch": 0.37491975176546116,
"grad_norm": 1.8203125,
"learning_rate": 3.535141904351776e-05,
"loss": 1.5994,
"step": 438
},
{
"epoch": 0.37577573293387545,
"grad_norm": 1.5546875,
"learning_rate": 3.528910601323666e-05,
"loss": 1.4947,
"step": 439
},
{
"epoch": 0.37663171410228974,
"grad_norm": 1.8671875,
"learning_rate": 3.5226715929283506e-05,
"loss": 1.3042,
"step": 440
},
{
"epoch": 0.377487695270704,
"grad_norm": 1.671875,
"learning_rate": 3.516424925888887e-05,
"loss": 1.4926,
"step": 441
},
{
"epoch": 0.3783436764391183,
"grad_norm": 1.5859375,
"learning_rate": 3.510170646985691e-05,
"loss": 1.4419,
"step": 442
},
{
"epoch": 0.37919965760753266,
"grad_norm": 1.5625,
"learning_rate": 3.50390880305618e-05,
"loss": 1.4541,
"step": 443
},
{
"epoch": 0.38005563877594695,
"grad_norm": 1.703125,
"learning_rate": 3.497639440994424e-05,
"loss": 1.5821,
"step": 444
},
{
"epoch": 0.38091161994436123,
"grad_norm": 1.625,
"learning_rate": 3.491362607750796e-05,
"loss": 1.4526,
"step": 445
},
{
"epoch": 0.3817676011127755,
"grad_norm": 1.546875,
"learning_rate": 3.485078350331622e-05,
"loss": 1.5525,
"step": 446
},
{
"epoch": 0.3826235822811898,
"grad_norm": 1.5703125,
"learning_rate": 3.478786715798823e-05,
"loss": 1.3649,
"step": 447
},
{
"epoch": 0.3834795634496041,
"grad_norm": 1.8125,
"learning_rate": 3.4724877512695674e-05,
"loss": 1.6517,
"step": 448
},
{
"epoch": 0.3843355446180184,
"grad_norm": 1.6953125,
"learning_rate": 3.466181503915918e-05,
"loss": 1.441,
"step": 449
},
{
"epoch": 0.3851915257864327,
"grad_norm": 1.84375,
"learning_rate": 3.459868020964478e-05,
"loss": 1.6027,
"step": 450
},
{
"epoch": 0.386047506954847,
"grad_norm": 1.453125,
"learning_rate": 3.453547349696033e-05,
"loss": 1.3575,
"step": 451
},
{
"epoch": 0.3869034881232613,
"grad_norm": 1.53125,
"learning_rate": 3.447219537445207e-05,
"loss": 1.4457,
"step": 452
},
{
"epoch": 0.3877594692916756,
"grad_norm": 1.6171875,
"learning_rate": 3.4408846316000956e-05,
"loss": 1.4387,
"step": 453
},
{
"epoch": 0.3886154504600899,
"grad_norm": 1.84375,
"learning_rate": 3.434542679601922e-05,
"loss": 1.5498,
"step": 454
},
{
"epoch": 0.38947143162850417,
"grad_norm": 1.78125,
"learning_rate": 3.428193728944675e-05,
"loss": 1.3684,
"step": 455
},
{
"epoch": 0.39032741279691846,
"grad_norm": 1.546875,
"learning_rate": 3.421837827174757e-05,
"loss": 1.5111,
"step": 456
},
{
"epoch": 0.39118339396533275,
"grad_norm": 1.703125,
"learning_rate": 3.415475021890622e-05,
"loss": 1.5642,
"step": 457
},
{
"epoch": 0.39203937513374704,
"grad_norm": 1.640625,
"learning_rate": 3.4091053607424295e-05,
"loss": 1.4413,
"step": 458
},
{
"epoch": 0.3928953563021613,
"grad_norm": 1.5546875,
"learning_rate": 3.402728891431677e-05,
"loss": 1.3544,
"step": 459
},
{
"epoch": 0.39375133747057567,
"grad_norm": 1.6953125,
"learning_rate": 3.396345661710849e-05,
"loss": 1.4379,
"step": 460
},
{
"epoch": 0.39460731863898996,
"grad_norm": 1.84375,
"learning_rate": 3.389955719383058e-05,
"loss": 1.7564,
"step": 461
},
{
"epoch": 0.39546329980740424,
"grad_norm": 1.53125,
"learning_rate": 3.3835591123016865e-05,
"loss": 1.5366,
"step": 462
},
{
"epoch": 0.39631928097581853,
"grad_norm": 1.6015625,
"learning_rate": 3.3771558883700284e-05,
"loss": 1.7521,
"step": 463
},
{
"epoch": 0.3971752621442328,
"grad_norm": 1.4375,
"learning_rate": 3.370746095540928e-05,
"loss": 1.4594,
"step": 464
},
{
"epoch": 0.3980312433126471,
"grad_norm": 1.5859375,
"learning_rate": 3.364329781816426e-05,
"loss": 1.4018,
"step": 465
},
{
"epoch": 0.3988872244810614,
"grad_norm": 1.7421875,
"learning_rate": 3.357906995247396e-05,
"loss": 1.5263,
"step": 466
},
{
"epoch": 0.3997432056494757,
"grad_norm": 1.7265625,
"learning_rate": 3.3514777839331856e-05,
"loss": 1.5457,
"step": 467
},
{
"epoch": 0.40059918681789003,
"grad_norm": 1.59375,
"learning_rate": 3.3450421960212566e-05,
"loss": 1.664,
"step": 468
},
{
"epoch": 0.40059918681789003,
"eval_loss": 1.5587416887283325,
"eval_runtime": 21.3401,
"eval_samples_per_second": 18.275,
"eval_steps_per_second": 18.275,
"step": 468
},
|
{
"epoch": 0.4014551679863043,
"grad_norm": 1.7578125,
"learning_rate": 3.338600279706826e-05,
"loss": 1.5381,
"step": 469
},
{
"epoch": 0.4023111491547186,
"grad_norm": 2.421875,
"learning_rate": 3.3321520832325e-05,
"loss": 1.4321,
"step": 470
},
{
"epoch": 0.4031671303231329,
"grad_norm": 1.6953125,
"learning_rate": 3.3256976548879184e-05,
"loss": 1.4431,
"step": 471
},
{
"epoch": 0.4040231114915472,
"grad_norm": 1.75,
"learning_rate": 3.319237043009389e-05,
"loss": 1.3993,
"step": 472
},
{
"epoch": 0.40487909265996147,
"grad_norm": 1.5234375,
"learning_rate": 3.3127702959795296e-05,
"loss": 1.3284,
"step": 473
},
{
"epoch": 0.40573507382837576,
"grad_norm": 1.6875,
"learning_rate": 3.306297462226901e-05,
"loss": 1.3601,
"step": 474
},
{
"epoch": 0.40659105499679005,
"grad_norm": 1.5546875,
"learning_rate": 3.299818590225647e-05,
"loss": 1.4164,
"step": 475
},
{
"epoch": 0.4074470361652044,
"grad_norm": 1.7109375,
"learning_rate": 3.2933337284951336e-05,
"loss": 1.4316,
"step": 476
},
{
"epoch": 0.4083030173336187,
"grad_norm": 1.5546875,
"learning_rate": 3.286842925599579e-05,
"loss": 1.5327,
"step": 477
},
{
"epoch": 0.40915899850203297,
"grad_norm": 1.6640625,
"learning_rate": 3.2803462301476964e-05,
"loss": 1.3832,
"step": 478
},
{
"epoch": 0.41001497967044725,
"grad_norm": 1.46875,
"learning_rate": 3.273843690792326e-05,
"loss": 1.2295,
"step": 479
},
{
"epoch": 0.41087096083886154,
"grad_norm": 1.8046875,
"learning_rate": 3.267335356230075e-05,
"loss": 1.4291,
"step": 480
},
{
"epoch": 0.41172694200727583,
"grad_norm": 1.640625,
"learning_rate": 3.260821275200947e-05,
"loss": 1.7269,
"step": 481
},
{
"epoch": 0.4125829231756901,
"grad_norm": 1.5234375,
"learning_rate": 3.2543014964879816e-05,
"loss": 1.3251,
"step": 482
},
{
"epoch": 0.4134389043441044,
"grad_norm": 1.828125,
"learning_rate": 3.247776068916887e-05,
"loss": 1.6163,
"step": 483
},
{
"epoch": 0.41429488551251875,
"grad_norm": 1.5546875,
"learning_rate": 3.241245041355675e-05,
"loss": 1.3584,
"step": 484
},
{
"epoch": 0.41515086668093304,
"grad_norm": 1.703125,
"learning_rate": 3.234708462714297e-05,
"loss": 1.4595,
"step": 485
},
{
"epoch": 0.4160068478493473,
"grad_norm": 2.078125,
"learning_rate": 3.228166381944272e-05,
"loss": 1.7641,
"step": 486
},
{
"epoch": 0.4168628290177616,
"grad_norm": 1.640625,
"learning_rate": 3.2216188480383256e-05,
"loss": 1.4908,
"step": 487
},
{
"epoch": 0.4177188101861759,
"grad_norm": 1.9140625,
"learning_rate": 3.215065910030021e-05,
"loss": 1.6466,
"step": 488
},
{
"epoch": 0.4185747913545902,
"grad_norm": 1.71875,
"learning_rate": 3.208507616993393e-05,
"loss": 1.4535,
"step": 489
},
{
"epoch": 0.4194307725230045,
"grad_norm": 1.5390625,
"learning_rate": 3.201944018042577e-05,
"loss": 1.4366,
"step": 490
},
{
"epoch": 0.42028675369141877,
"grad_norm": 1.6796875,
"learning_rate": 3.1953751623314475e-05,
"loss": 1.3296,
"step": 491
},
{
"epoch": 0.42114273485983306,
"grad_norm": 1.546875,
"learning_rate": 3.1888010990532415e-05,
"loss": 1.4605,
"step": 492
},
{
"epoch": 0.4219987160282474,
"grad_norm": 1.6484375,
"learning_rate": 3.182221877440198e-05,
"loss": 1.3257,
"step": 493
},
{
"epoch": 0.4228546971966617,
"grad_norm": 1.5703125,
"learning_rate": 3.175637546763183e-05,
"loss": 1.4084,
"step": 494
},
{
"epoch": 0.423710678365076,
"grad_norm": 1.5,
"learning_rate": 3.169048156331329e-05,
"loss": 1.5077,
"step": 495
},
{
"epoch": 0.42456665953349026,
"grad_norm": 1.96875,
"learning_rate": 3.162453755491655e-05,
"loss": 1.2778,
"step": 496
},
{
"epoch": 0.42542264070190455,
"grad_norm": 2.0,
"learning_rate": 3.1558543936287035e-05,
"loss": 1.3954,
"step": 497
},
{
"epoch": 0.42627862187031884,
"grad_norm": 1.671875,
"learning_rate": 3.149250120164171e-05,
"loss": 1.4434,
"step": 498
},
{
"epoch": 0.42713460303873313,
"grad_norm": 1.515625,
"learning_rate": 3.142640984556536e-05,
"loss": 1.5035,
"step": 499
},
{
"epoch": 0.4279905842071474,
"grad_norm": 1.7265625,
"learning_rate": 3.136027036300687e-05,
"loss": 1.5234,
"step": 500
},
|
{
"epoch": 0.42884656537556176,
"grad_norm": 1.5859375,
"learning_rate": 3.1294083249275545e-05,
"loss": 1.3764,
"step": 501
},
{
"epoch": 0.42970254654397605,
"grad_norm": 1.578125,
"learning_rate": 3.122784900003742e-05,
"loss": 1.4066,
"step": 502
},
{
"epoch": 0.43055852771239034,
"grad_norm": 1.828125,
"learning_rate": 3.116156811131148e-05,
"loss": 1.6143,
"step": 503
},
{
"epoch": 0.4314145088808046,
"grad_norm": 1.5703125,
"learning_rate": 3.109524107946602e-05,
"loss": 1.3665,
"step": 504
},
{
"epoch": 0.4322704900492189,
"grad_norm": 1.65625,
"learning_rate": 3.102886840121486e-05,
"loss": 1.3919,
"step": 505
},
{
"epoch": 0.4331264712176332,
"grad_norm": 1.8125,
"learning_rate": 3.0962450573613704e-05,
"loss": 1.6993,
"step": 506
},
{
"epoch": 0.4339824523860475,
"grad_norm": 1.65625,
"learning_rate": 3.089598809405633e-05,
"loss": 1.3292,
"step": 507
},
{
"epoch": 0.4348384335544618,
"grad_norm": 1.515625,
"learning_rate": 3.0829481460270936e-05,
"loss": 1.3597,
"step": 508
},
{
"epoch": 0.4356944147228761,
"grad_norm": 1.453125,
"learning_rate": 3.0762931170316385e-05,
"loss": 1.3326,
"step": 509
},
{
"epoch": 0.4365503958912904,
"grad_norm": 1.59375,
"learning_rate": 3.0696337722578444e-05,
"loss": 1.4273,
"step": 510
},
{
"epoch": 0.4374063770597047,
"grad_norm": 1.4375,
"learning_rate": 3.062970161576612e-05,
"loss": 1.4425,
"step": 511
},
{
"epoch": 0.438262358228119,
"grad_norm": 1.6015625,
"learning_rate": 3.056302334890786e-05,
"loss": 1.5967,
"step": 512
},
{
"epoch": 0.4391183393965333,
"grad_norm": 1.6484375,
"learning_rate": 3.0496303421347872e-05,
"loss": 1.5083,
"step": 513
},
{
"epoch": 0.43997432056494756,
"grad_norm": 1.5,
"learning_rate": 3.0429542332742323e-05,
"loss": 1.3709,
"step": 514
},
{
"epoch": 0.44083030173336185,
"grad_norm": 1.6328125,
"learning_rate": 3.036274058305565e-05,
"loss": 1.4481,
"step": 515
},
{
"epoch": 0.44168628290177614,
"grad_norm": 1.5625,
"learning_rate": 3.029589867255678e-05,
"loss": 1.4541,
"step": 516
},
{
"epoch": 0.4425422640701905,
"grad_norm": 1.625,
"learning_rate": 3.022901710181542e-05,
"loss": 1.6127,
"step": 517
},
{
"epoch": 0.44339824523860477,
"grad_norm": 1.640625,
"learning_rate": 3.0162096371698267e-05,
"loss": 1.2699,
"step": 518
},
{
"epoch": 0.44425422640701906,
"grad_norm": 1.5859375,
"learning_rate": 3.0095136983365286e-05,
"loss": 1.4119,
"step": 519
},
{
"epoch": 0.44511020757543335,
"grad_norm": 1.5078125,
"learning_rate": 3.0028139438265944e-05,
"loss": 1.4058,
"step": 520
},
{
"epoch": 0.44596618874384764,
"grad_norm": 1.703125,
"learning_rate": 2.9961104238135457e-05,
"loss": 1.6121,
"step": 521
},
{
"epoch": 0.4468221699122619,
"grad_norm": 1.6484375,
"learning_rate": 2.989403188499105e-05,
"loss": 1.5662,
"step": 522
},
{
"epoch": 0.4476781510806762,
"grad_norm": 1.453125,
"learning_rate": 2.9826922881128162e-05,
"loss": 1.5012,
"step": 523
},
{
"epoch": 0.4485341322490905,
"grad_norm": 1.515625,
"learning_rate": 2.975977772911671e-05,
"loss": 1.4917,
"step": 524
},
{
"epoch": 0.4493901134175048,
"grad_norm": 1.7109375,
"learning_rate": 2.969259693179733e-05,
"loss": 1.3906,
"step": 525
},
{
"epoch": 0.45024609458591913,
"grad_norm": 1.6953125,
"learning_rate": 2.9625380992277584e-05,
"loss": 1.583,
"step": 526
},
{
"epoch": 0.4511020757543334,
"grad_norm": 1.6875,
"learning_rate": 2.955813041392822e-05,
"loss": 1.4414,
"step": 527
},
{
"epoch": 0.4519580569227477,
"grad_norm": 1.453125,
"learning_rate": 2.949084570037939e-05,
"loss": 1.2735,
"step": 528
},
{
"epoch": 0.452814038091162,
"grad_norm": 1.46875,
"learning_rate": 2.9423527355516876e-05,
"loss": 1.3283,
"step": 529
},
{
"epoch": 0.4536700192595763,
"grad_norm": 1.671875,
"learning_rate": 2.9356175883478322e-05,
"loss": 1.5274,
"step": 530
},
{
"epoch": 0.4545260004279906,
"grad_norm": 1.5234375,
"learning_rate": 2.9288791788649462e-05,
"loss": 1.4455,
"step": 531
},
{
"epoch": 0.45538198159640486,
"grad_norm": 1.6796875,
"learning_rate": 2.922137557566032e-05,
"loss": 1.4383,
"step": 532
},
{
"epoch": 0.45623796276481915,
"grad_norm": 1.5859375,
"learning_rate": 2.9153927749381483e-05,
"loss": 1.4231,
"step": 533
},
{
"epoch": 0.4570939439332335,
"grad_norm": 1.4921875,
"learning_rate": 2.9086448814920242e-05,
"loss": 1.4336,
"step": 534
},
{
"epoch": 0.4579499251016478,
"grad_norm": 1.5078125,
"learning_rate": 2.9018939277616886e-05,
"loss": 1.3865,
"step": 535
},
{
"epoch": 0.45880590627006207,
"grad_norm": 1.71875,
"learning_rate": 2.8951399643040867e-05,
"loss": 1.3812,
"step": 536
},
{
"epoch": 0.45966188743847636,
"grad_norm": 1.4609375,
"learning_rate": 2.888383041698704e-05,
"loss": 1.4111,
"step": 537
},
{
"epoch": 0.46051786860689065,
"grad_norm": 1.5234375,
"learning_rate": 2.8816232105471863e-05,
"loss": 1.2808,
"step": 538
},
{
"epoch": 0.46137384977530493,
"grad_norm": 1.4765625,
"learning_rate": 2.874860521472962e-05,
"loss": 1.4054,
"step": 539
},
{
"epoch": 0.4622298309437192,
"grad_norm": 1.6015625,
"learning_rate": 2.8680950251208595e-05,
"loss": 1.4313,
"step": 540
},
{
"epoch": 0.4630858121121335,
"grad_norm": 1.671875,
"learning_rate": 2.8613267721567333e-05,
"loss": 1.3595,
"step": 541
},
{
"epoch": 0.46394179328054785,
"grad_norm": 1.6015625,
"learning_rate": 2.8545558132670803e-05,
"loss": 1.4876,
"step": 542
},
{
"epoch": 0.46479777444896214,
"grad_norm": 1.7734375,
"learning_rate": 2.847782199158663e-05,
"loss": 1.4332,
"step": 543
},
{
"epoch": 0.46565375561737643,
"grad_norm": 1.8046875,
"learning_rate": 2.8410059805581258e-05,
"loss": 1.4712,
"step": 544
},
{
"epoch": 0.4665097367857907,
"grad_norm": 1.9375,
"learning_rate": 2.834227208211621e-05,
"loss": 1.4455,
"step": 545
},
{
"epoch": 0.467365717954205,
"grad_norm": 1.6015625,
"learning_rate": 2.8274459328844248e-05,
"loss": 1.4987,
"step": 546
},
{
"epoch": 0.4682216991226193,
"grad_norm": 1.78125,
"learning_rate": 2.8206622053605553e-05,
"loss": 1.4329,
"step": 547
},
{
"epoch": 0.4690776802910336,
"grad_norm": 1.6015625,
"learning_rate": 2.813876076442397e-05,
"loss": 1.3499,
"step": 548
},
{
"epoch": 0.46993366145944787,
"grad_norm": 1.671875,
"learning_rate": 2.8070875969503192e-05,
"loss": 1.4936,
"step": 549
},
{
"epoch": 0.47078964262786216,
"grad_norm": 1.75,
"learning_rate": 2.8002968177222917e-05,
"loss": 1.4108,
"step": 550
},
{
"epoch": 0.4716456237962765,
"grad_norm": 1.4296875,
"learning_rate": 2.793503789613507e-05,
"loss": 1.4677,
"step": 551
},
{
"epoch": 0.4725016049646908,
"grad_norm": 1.5625,
"learning_rate": 2.7867085634960016e-05,
"loss": 1.6387,
"step": 552
},
{
"epoch": 0.4733575861331051,
"grad_norm": 1.546875,
"learning_rate": 2.7799111902582696e-05,
"loss": 1.4241,
"step": 553
},
{
"epoch": 0.47421356730151937,
"grad_norm": 1.6875,
"learning_rate": 2.7731117208048872e-05,
"loss": 1.5287,
"step": 554
},
{
"epoch": 0.47506954846993366,
"grad_norm": 1.6484375,
"learning_rate": 2.7663102060561275e-05,
"loss": 1.4029,
"step": 555
},
{
"epoch": 0.47592552963834794,
"grad_norm": 1.5234375,
"learning_rate": 2.75950669694758e-05,
"loss": 1.3678,
"step": 556
},
{
"epoch": 0.47678151080676223,
"grad_norm": 1.5,
"learning_rate": 2.7527012444297707e-05,
"loss": 1.3775,
"step": 557
},
{
"epoch": 0.4776374919751765,
"grad_norm": 1.5625,
"learning_rate": 2.7458938994677786e-05,
"loss": 1.6167,
"step": 558
},
{
"epoch": 0.47849347314359086,
"grad_norm": 1.625,
"learning_rate": 2.739084713040856e-05,
"loss": 1.4628,
"step": 559
},
{
"epoch": 0.47934945431200515,
"grad_norm": 1.59375,
"learning_rate": 2.7322737361420454e-05,
"loss": 1.5349,
"step": 560
},
{
"epoch": 0.48020543548041944,
"grad_norm": 1.6328125,
"learning_rate": 2.725461019777797e-05,
"loss": 1.4614,
"step": 561
},
{
"epoch": 0.48106141664883373,
"grad_norm": 1.609375,
"learning_rate": 2.7186466149675887e-05,
"loss": 1.6509,
"step": 562
},
{
"epoch": 0.481917397817248,
"grad_norm": 1.8046875,
"learning_rate": 2.7118305727435434e-05,
"loss": 1.4552,
"step": 563
},
{
"epoch": 0.4827733789856623,
"grad_norm": 1.765625,
"learning_rate": 2.7050129441500436e-05,
"loss": 1.6248,
"step": 564
},
{
"epoch": 0.4836293601540766,
"grad_norm": 1.515625,
"learning_rate": 2.698193780243355e-05,
"loss": 1.4198,
"step": 565
},
{
"epoch": 0.4844853413224909,
"grad_norm": 1.9140625,
"learning_rate": 2.69137313209124e-05,
"loss": 1.4712,
"step": 566
},
{
"epoch": 0.4853413224909052,
"grad_norm": 1.5703125,
"learning_rate": 2.6845510507725745e-05,
"loss": 1.3251,
"step": 567
},
{
"epoch": 0.4861973036593195,
"grad_norm": 1.5390625,
"learning_rate": 2.67772758737697e-05,
"loss": 1.3109,
"step": 568
},
{
"epoch": 0.4870532848277338,
"grad_norm": 1.4765625,
"learning_rate": 2.670902793004389e-05,
"loss": 1.4285,
"step": 569
},
{
"epoch": 0.4879092659961481,
"grad_norm": 1.5390625,
"learning_rate": 2.664076718764756e-05,
"loss": 1.4363,
"step": 570
},
{
"epoch": 0.4887652471645624,
"grad_norm": 1.578125,
"learning_rate": 2.657249415777585e-05,
"loss": 1.2128,
"step": 571
},
{
"epoch": 0.48962122833297667,
"grad_norm": 1.546875,
"learning_rate": 2.6504209351715914e-05,
"loss": 1.472,
"step": 572
},
{
"epoch": 0.49047720950139095,
"grad_norm": 2.53125,
"learning_rate": 2.643591328084309e-05,
"loss": 1.3816,
"step": 573
},
{
"epoch": 0.49133319066980524,
"grad_norm": 1.8984375,
"learning_rate": 2.6367606456617055e-05,
"loss": 1.5654,
"step": 574
},
{
"epoch": 0.4921891718382196,
"grad_norm": 1.8046875,
"learning_rate": 2.6299289390578053e-05,
"loss": 1.5554,
"step": 575
},
{
"epoch": 0.4930451530066339,
"grad_norm": 1.6875,
"learning_rate": 2.623096259434302e-05,
"loss": 1.5279,
"step": 576
},
{
"epoch": 0.49390113417504816,
"grad_norm": 1.84375,
"learning_rate": 2.616262657960173e-05,
"loss": 1.4617,
"step": 577
},
{
"epoch": 0.49475711534346245,
"grad_norm": 1.640625,
"learning_rate": 2.6094281858113022e-05,
"loss": 1.4409,
"step": 578
},
{
"epoch": 0.49561309651187674,
"grad_norm": 1.5,
"learning_rate": 2.6025928941700945e-05,
"loss": 1.38,
"step": 579
},
{
"epoch": 0.496469077680291,
"grad_norm": 1.6328125,
"learning_rate": 2.595756834225089e-05,
"loss": 1.4494,
"step": 580
},
{
"epoch": 0.4973250588487053,
"grad_norm": 1.6484375,
"learning_rate": 2.5889200571705795e-05,
"loss": 1.4874,
"step": 581
},
{
"epoch": 0.4981810400171196,
"grad_norm": 1.6484375,
"learning_rate": 2.5820826142062323e-05,
"loss": 1.5417,
"step": 582
},
{
"epoch": 0.4990370211855339,
"grad_norm": 1.6328125,
"learning_rate": 2.575244556536697e-05,
"loss": 1.4868,
"step": 583
},
{
"epoch": 0.49989300235394823,
"grad_norm": 1.6796875,
"learning_rate": 2.5684059353712307e-05,
"loss": 1.3093,
"step": 584
},
{
"epoch": 0.5007489835223625,
"grad_norm": 1.921875,
"learning_rate": 2.5615668019233064e-05,
"loss": 1.5308,
"step": 585
},
{
"epoch": 0.5007489835223625,
"eval_loss": 1.5403519868850708,
"eval_runtime": 21.3271,
"eval_samples_per_second": 18.287,
"eval_steps_per_second": 18.287,
"step": 585
},
|
{
"epoch": 0.5016049646907768,
"grad_norm": 1.4453125,
"learning_rate": 2.5547272074102374e-05,
"loss": 1.339,
"step": 586
},
{
"epoch": 0.5024609458591911,
"grad_norm": 1.5625,
"learning_rate": 2.5478872030527855e-05,
"loss": 1.413,
"step": 587
},
{
"epoch": 0.5033169270276054,
"grad_norm": 1.6640625,
"learning_rate": 2.5410468400747854e-05,
"loss": 1.399,
"step": 588
},
{
"epoch": 0.5041729081960197,
"grad_norm": 1.546875,
"learning_rate": 2.534206169702757e-05,
"loss": 1.5245,
"step": 589
},
{
"epoch": 0.505028889364434,
"grad_norm": 1.7578125,
"learning_rate": 2.5273652431655204e-05,
"loss": 1.418,
"step": 590
},
{
"epoch": 0.5058848705328483,
"grad_norm": 1.5078125,
"learning_rate": 2.520524111693814e-05,
"loss": 1.4231,
"step": 591
},
{
"epoch": 0.5067408517012626,
"grad_norm": 1.5390625,
"learning_rate": 2.513682826519914e-05,
"loss": 1.3967,
"step": 592
},
{
"epoch": 0.5075968328696768,
"grad_norm": 1.453125,
"learning_rate": 2.5068414388772453e-05,
"loss": 1.3799,
"step": 593
},
{
"epoch": 0.5084528140380912,
"grad_norm": 1.84375,
"learning_rate": 2.5e-05,
"loss": 1.4701,
"step": 594
},
{
"epoch": 0.5093087952065054,
"grad_norm": 1.5078125,
"learning_rate": 2.4931585611227543e-05,
"loss": 1.3946,
"step": 595
},
{
"epoch": 0.5101647763749197,
"grad_norm": 1.765625,
"learning_rate": 2.4863171734800865e-05,
"loss": 1.5882,
"step": 596
},
{
"epoch": 0.5110207575433341,
"grad_norm": 1.5625,
"learning_rate": 2.479475888306186e-05,
"loss": 1.4909,
"step": 597
},
{
"epoch": 0.5118767387117483,
"grad_norm": 1.453125,
"learning_rate": 2.472634756834481e-05,
"loss": 1.2668,
"step": 598
},
{
"epoch": 0.5127327198801627,
"grad_norm": 1.5,
"learning_rate": 2.4657938302972437e-05,
"loss": 1.2743,
"step": 599
},
{
"epoch": 0.5135887010485769,
"grad_norm": 1.453125,
"learning_rate": 2.458953159925215e-05,
"loss": 1.3327,
"step": 600
},
{
"epoch": 0.5144446822169912,
"grad_norm": 1.625,
"learning_rate": 2.4521127969472148e-05,
"loss": 1.5656,
"step": 601
},
{
"epoch": 0.5153006633854055,
"grad_norm": 1.546875,
"learning_rate": 2.4452727925897635e-05,
"loss": 1.2883,
"step": 602
},
{
"epoch": 0.5161566445538198,
"grad_norm": 1.546875,
"learning_rate": 2.438433198076694e-05,
"loss": 1.4471,
"step": 603
},
{
"epoch": 0.5170126257222342,
"grad_norm": 1.5703125,
"learning_rate": 2.4315940646287695e-05,
"loss": 1.3376,
"step": 604
},
{
"epoch": 0.5178686068906484,
"grad_norm": 1.6796875,
"learning_rate": 2.424755443463303e-05,
"loss": 1.4541,
"step": 605
},
{
"epoch": 0.5187245880590627,
"grad_norm": 1.4453125,
"learning_rate": 2.4179173857937683e-05,
"loss": 1.2946,
"step": 606
},
{
"epoch": 0.519580569227477,
"grad_norm": 1.6484375,
"learning_rate": 2.411079942829421e-05,
"loss": 1.4473,
"step": 607
},
{
"epoch": 0.5204365503958913,
"grad_norm": 1.7578125,
"learning_rate": 2.4042431657749117e-05,
"loss": 1.5921,
"step": 608
},
{
"epoch": 0.5212925315643056,
"grad_norm": 1.59375,
"learning_rate": 2.3974071058299064e-05,
"loss": 1.3892,
"step": 609
},
{
"epoch": 0.5221485127327199,
"grad_norm": 1.546875,
"learning_rate": 2.390571814188698e-05,
"loss": 1.4598,
"step": 610
},
{
"epoch": 0.5230044939011341,
"grad_norm": 1.625,
"learning_rate": 2.383737342039827e-05,
"loss": 1.4553,
"step": 611
},
{
"epoch": 0.5238604750695485,
"grad_norm": 1.8046875,
"learning_rate": 2.3769037405656987e-05,
"loss": 1.5219,
"step": 612
},
{
"epoch": 0.5247164562379628,
"grad_norm": 1.4765625,
"learning_rate": 2.3700710609421946e-05,
"loss": 1.255,
"step": 613
},
{
"epoch": 0.525572437406377,
"grad_norm": 1.6875,
"learning_rate": 2.3632393543382954e-05,
"loss": 1.4204,
"step": 614
},
{
"epoch": 0.5264284185747914,
"grad_norm": 1.546875,
"learning_rate": 2.356408671915692e-05,
"loss": 1.4509,
"step": 615
},
{
"epoch": 0.5272843997432056,
"grad_norm": 1.4921875,
"learning_rate": 2.3495790648284092e-05,
"loss": 1.3018,
"step": 616
},
{
"epoch": 0.52814038091162,
"grad_norm": 1.546875,
"learning_rate": 2.3427505842224154e-05,
"loss": 1.5016,
"step": 617
},
{
"epoch": 0.5289963620800342,
"grad_norm": 1.5234375,
"learning_rate": 2.3359232812352443e-05,
"loss": 1.3029,
"step": 618
},
{
"epoch": 0.5298523432484485,
"grad_norm": 1.7421875,
"learning_rate": 2.3290972069956117e-05,
"loss": 1.4533,
"step": 619
},
{
"epoch": 0.5307083244168628,
"grad_norm": 1.59375,
"learning_rate": 2.3222724126230294e-05,
"loss": 1.36,
"step": 620
},
{
"epoch": 0.5315643055852771,
"grad_norm": 1.875,
"learning_rate": 2.315448949227426e-05,
"loss": 1.5453,
"step": 621
},
{
"epoch": 0.5324202867536915,
"grad_norm": 1.6328125,
"learning_rate": 2.3086268679087607e-05,
"loss": 1.3677,
"step": 622
},
{
"epoch": 0.5332762679221057,
"grad_norm": 1.7578125,
"learning_rate": 2.3018062197566462e-05,
"loss": 1.5106,
"step": 623
},
{
"epoch": 0.53413224909052,
"grad_norm": 1.765625,
"learning_rate": 2.294987055849957e-05,
"loss": 1.6279,
"step": 624
},
{
"epoch": 0.5349882302589343,
"grad_norm": 1.5234375,
"learning_rate": 2.288169427256458e-05,
"loss": 1.4241,
"step": 625
},
{
"epoch": 0.5358442114273486,
"grad_norm": 1.484375,
"learning_rate": 2.281353385032412e-05,
"loss": 1.4114,
"step": 626
},
{
"epoch": 0.5367001925957628,
"grad_norm": 1.5703125,
"learning_rate": 2.2745389802222032e-05,
"loss": 1.4671,
"step": 627
},
{
"epoch": 0.5375561737641772,
"grad_norm": 1.6875,
"learning_rate": 2.2677262638579555e-05,
"loss": 1.5669,
"step": 628
},
{
"epoch": 0.5384121549325915,
"grad_norm": 1.5703125,
"learning_rate": 2.2609152869591446e-05,
"loss": 1.4634,
"step": 629
},
{
"epoch": 0.5392681361010058,
"grad_norm": 1.671875,
"learning_rate": 2.2541061005322227e-05,
"loss": 1.4757,
"step": 630
},
{
"epoch": 0.5401241172694201,
"grad_norm": 1.6484375,
"learning_rate": 2.2472987555702302e-05,
"loss": 1.504,
"step": 631
},
{
"epoch": 0.5409800984378343,
"grad_norm": 1.6875,
"learning_rate": 2.240493303052421e-05,
"loss": 1.5711,
"step": 632
},
{
"epoch": 0.5418360796062487,
"grad_norm": 1.390625,
"learning_rate": 2.2336897939438734e-05,
"loss": 1.3183,
"step": 633
},
{
"epoch": 0.5426920607746629,
"grad_norm": 1.5078125,
"learning_rate": 2.2268882791951127e-05,
"loss": 1.4867,
"step": 634
},
{
"epoch": 0.5435480419430773,
"grad_norm": 1.390625,
"learning_rate": 2.2200888097417307e-05,
"loss": 1.2882,
"step": 635
},
{
"epoch": 0.5444040231114915,
"grad_norm": 1.609375,
"learning_rate": 2.2132914365039993e-05,
"loss": 1.4977,
"step": 636
},
{
"epoch": 0.5452600042799058,
"grad_norm": 1.4296875,
"learning_rate": 2.2064962103864937e-05,
"loss": 1.4808,
"step": 637
},
{
"epoch": 0.5461159854483202,
"grad_norm": 1.6484375,
"learning_rate": 2.1997031822777093e-05,
"loss": 1.3365,
"step": 638
},
{
"epoch": 0.5469719666167344,
"grad_norm": 1.375,
"learning_rate": 2.1929124030496817e-05,
"loss": 1.3079,
"step": 639
},
{
"epoch": 0.5478279477851488,
"grad_norm": 1.5546875,
"learning_rate": 2.186123923557603e-05,
"loss": 1.4077,
"step": 640
},
{
"epoch": 0.548683928953563,
"grad_norm": 1.578125,
"learning_rate": 2.1793377946394446e-05,
"loss": 1.5337,
"step": 641
},
{
"epoch": 0.5495399101219773,
"grad_norm": 1.5859375,
"learning_rate": 2.1725540671155758e-05,
"loss": 1.3779,
"step": 642
},
{
"epoch": 0.5503958912903916,
"grad_norm": 1.6484375,
"learning_rate": 2.165772791788379e-05,
"loss": 1.2943,
"step": 643
},
{
"epoch": 0.5512518724588059,
"grad_norm": 1.515625,
"learning_rate": 2.1589940194418748e-05,
"loss": 1.4558,
"step": 644
},
{
"epoch": 0.5521078536272201,
"grad_norm": 1.4921875,
"learning_rate": 2.1522178008413377e-05,
"loss": 1.3845,
"step": 645
},
{
"epoch": 0.5529638347956345,
"grad_norm": 1.6171875,
"learning_rate": 2.1454441867329203e-05,
"loss": 1.4121,
"step": 646
},
{
"epoch": 0.5538198159640488,
"grad_norm": 2.625,
"learning_rate": 2.1386732278432676e-05,
"loss": 1.3775,
"step": 647
},
{
"epoch": 0.5546757971324631,
"grad_norm": 1.734375,
"learning_rate": 2.1319049748791418e-05,
"loss": 1.3581,
"step": 648
},
{
"epoch": 0.5555317783008774,
"grad_norm": 1.8125,
"learning_rate": 2.1251394785270386e-05,
"loss": 1.5385,
"step": 649
},
{
"epoch": 0.5563877594692916,
"grad_norm": 1.578125,
"learning_rate": 2.1183767894528136e-05,
"loss": 1.4733,
"step": 650
},
|
{
"epoch": 0.557243740637706,
"grad_norm": 1.4921875,
"learning_rate": 2.1116169583012965e-05,
"loss": 1.3986,
"step": 651
},
{
"epoch": 0.5580997218061202,
"grad_norm": 1.4140625,
"learning_rate": 2.1048600356959132e-05,
"loss": 1.3114,
"step": 652
},
{
"epoch": 0.5589557029745346,
"grad_norm": 1.75,
"learning_rate": 2.0981060722383127e-05,
"loss": 1.33,
"step": 653
},
{
"epoch": 0.5598116841429489,
"grad_norm": 1.546875,
"learning_rate": 2.0913551185079764e-05,
"loss": 1.4388,
"step": 654
},
{
"epoch": 0.5606676653113631,
"grad_norm": 1.8046875,
"learning_rate": 2.084607225061853e-05,
"loss": 1.6617,
"step": 655
},
{
"epoch": 0.5615236464797775,
"grad_norm": 1.53125,
"learning_rate": 2.077862442433968e-05,
"loss": 1.3882,
"step": 656
},
{
"epoch": 0.5623796276481917,
"grad_norm": 1.5546875,
"learning_rate": 2.071120821135054e-05,
"loss": 1.2734,
"step": 657
},
{
"epoch": 0.5632356088166061,
"grad_norm": 1.4765625,
"learning_rate": 2.064382411652168e-05,
"loss": 1.4545,
"step": 658
},
{
"epoch": 0.5640915899850203,
"grad_norm": 1.6328125,
"learning_rate": 2.057647264448313e-05,
"loss": 1.4795,
"step": 659
},
{
"epoch": 0.5649475711534346,
"grad_norm": 1.65625,
"learning_rate": 2.050915429962062e-05,
"loss": 1.5849,
"step": 660
},
{
"epoch": 0.5658035523218489,
"grad_norm": 1.6796875,
"learning_rate": 2.0441869586071783e-05,
"loss": 1.5012,
"step": 661
},
{
"epoch": 0.5666595334902632,
"grad_norm": 1.6875,
"learning_rate": 2.037461900772242e-05,
"loss": 1.5541,
"step": 662
},
{
"epoch": 0.5675155146586776,
"grad_norm": 1.65625,
"learning_rate": 2.0307403068202676e-05,
"loss": 1.4741,
"step": 663
},
{
"epoch": 0.5683714958270918,
"grad_norm": 1.546875,
"learning_rate": 2.0240222270883288e-05,
"loss": 1.5354,
"step": 664
},
{
"epoch": 0.5692274769955061,
"grad_norm": 1.546875,
"learning_rate": 2.0173077118871844e-05,
"loss": 1.4909,
"step": 665
},
{
"epoch": 0.5700834581639204,
"grad_norm": 1.5390625,
"learning_rate": 2.0105968115008954e-05,
"loss": 1.5927,
"step": 666
},
{
"epoch": 0.5709394393323347,
"grad_norm": 1.6171875,
"learning_rate": 2.003889576186455e-05,
"loss": 1.4568,
"step": 667
},
{
"epoch": 0.5717954205007489,
"grad_norm": 1.8046875,
"learning_rate": 1.997186056173406e-05,
"loss": 1.4905,
"step": 668
},
{
"epoch": 0.5726514016691633,
"grad_norm": 1.5390625,
"learning_rate": 1.9904863016634723e-05,
"loss": 1.5317,
"step": 669
},
{
"epoch": 0.5735073828375776,
"grad_norm": 1.65625,
"learning_rate": 1.983790362830174e-05,
"loss": 1.4985,
"step": 670
},
{
"epoch": 0.5743633640059919,
"grad_norm": 1.421875,
"learning_rate": 1.977098289818459e-05,
"loss": 1.4502,
"step": 671
},
{
"epoch": 0.5752193451744062,
"grad_norm": 1.5625,
"learning_rate": 1.970410132744322e-05,
"loss": 1.3937,
"step": 672
},
{
"epoch": 0.5760753263428204,
"grad_norm": 1.484375,
"learning_rate": 1.9637259416944352e-05,
"loss": 1.3821,
"step": 673
},
{
"epoch": 0.5769313075112348,
"grad_norm": 1.5234375,
"learning_rate": 1.9570457667257686e-05,
"loss": 1.4048,
"step": 674
},
{
"epoch": 0.577787288679649,
"grad_norm": 1.5703125,
"learning_rate": 1.950369657865213e-05,
"loss": 1.3841,
"step": 675
},
{
"epoch": 0.5786432698480634,
"grad_norm": 1.53125,
"learning_rate": 1.9436976651092144e-05,
"loss": 1.4192,
"step": 676
},
{
"epoch": 0.5794992510164776,
"grad_norm": 1.5,
"learning_rate": 1.937029838423389e-05,
"loss": 1.2927,
"step": 677
},
{
"epoch": 0.5803552321848919,
"grad_norm": 1.515625,
"learning_rate": 1.9303662277421568e-05,
"loss": 1.5403,
"step": 678
},
{
"epoch": 0.5812112133533063,
"grad_norm": 1.390625,
"learning_rate": 1.923706882968362e-05,
"loss": 1.3693,
"step": 679
},
{
"epoch": 0.5820671945217205,
"grad_norm": 1.4453125,
"learning_rate": 1.917051853972906e-05,
"loss": 1.3371,
"step": 680
},
{
"epoch": 0.5829231756901349,
"grad_norm": 1.5703125,
"learning_rate": 1.910401190594367e-05,
"loss": 1.4528,
"step": 681
},
{
"epoch": 0.5837791568585491,
"grad_norm": 1.5703125,
"learning_rate": 1.9037549426386302e-05,
"loss": 1.4057,
"step": 682
},
{
"epoch": 0.5846351380269634,
"grad_norm": 1.78125,
"learning_rate": 1.8971131598785148e-05,
"loss": 1.4727,
"step": 683
},
{
"epoch": 0.5854911191953777,
"grad_norm": 1.5625,
"learning_rate": 1.8904758920533988e-05,
"loss": 1.4969,
"step": 684
},
{
"epoch": 0.586347100363792,
"grad_norm": 1.4140625,
"learning_rate": 1.8838431888688527e-05,
"loss": 1.3984,
"step": 685
},
{
"epoch": 0.5872030815322062,
"grad_norm": 1.703125,
"learning_rate": 1.8772150999962587e-05,
"loss": 1.4929,
"step": 686
},
{
"epoch": 0.5880590627006206,
"grad_norm": 1.46875,
"learning_rate": 1.870591675072446e-05,
"loss": 1.3202,
"step": 687
},
{
"epoch": 0.5889150438690349,
"grad_norm": 1.5,
"learning_rate": 1.863972963699314e-05,
"loss": 1.5529,
"step": 688
},
{
"epoch": 0.5897710250374492,
"grad_norm": 1.6640625,
"learning_rate": 1.857359015443465e-05,
"loss": 1.4185,
"step": 689
},
{
"epoch": 0.5906270062058635,
"grad_norm": 1.5234375,
"learning_rate": 1.8507498798358297e-05,
"loss": 1.4122,
"step": 690
},
{
"epoch": 0.5914829873742777,
"grad_norm": 1.8203125,
"learning_rate": 1.844145606371297e-05,
"loss": 1.3178,
"step": 691
},
{
"epoch": 0.5923389685426921,
"grad_norm": 1.640625,
"learning_rate": 1.8375462445083464e-05,
"loss": 1.4875,
"step": 692
},
{
"epoch": 0.5931949497111063,
"grad_norm": 1.5390625,
"learning_rate": 1.830951843668672e-05,
"loss": 1.443,
"step": 693
},
{
"epoch": 0.5940509308795207,
"grad_norm": 1.625,
"learning_rate": 1.8243624532368174e-05,
"loss": 1.4547,
"step": 694
},
{
"epoch": 0.594906912047935,
"grad_norm": 1.453125,
"learning_rate": 1.8177781225598032e-05,
"loss": 1.3457,
"step": 695
},
{
"epoch": 0.5957628932163492,
"grad_norm": 1.6171875,
"learning_rate": 1.811198900946759e-05,
"loss": 1.4981,
"step": 696
},
{
"epoch": 0.5966188743847636,
"grad_norm": 2.359375,
"learning_rate": 1.804624837668553e-05,
"loss": 1.5599,
"step": 697
},
{
"epoch": 0.5974748555531778,
"grad_norm": 1.4921875,
"learning_rate": 1.7980559819574223e-05,
"loss": 1.3979,
"step": 698
},
{
"epoch": 0.5983308367215922,
"grad_norm": 1.5625,
"learning_rate": 1.7914923830066074e-05,
"loss": 1.4061,
"step": 699
},
{
"epoch": 0.5991868178900064,
"grad_norm": 1.6015625,
"learning_rate": 1.784934089969979e-05,
"loss": 1.5827,
"step": 700
},
{
"epoch": 0.6000427990584207,
"grad_norm": 1.7734375,
"learning_rate": 1.7783811519616757e-05,
"loss": 1.4095,
"step": 701
},
{
"epoch": 0.600898780226835,
"grad_norm": 1.5,
"learning_rate": 1.7718336180557288e-05,
"loss": 1.3583,
"step": 702
},
{
"epoch": 0.600898780226835,
"eval_loss": 1.5267729759216309,
"eval_runtime": 21.3333,
"eval_samples_per_second": 18.281,
"eval_steps_per_second": 18.281,
"step": 702
},
|
{
"epoch": 0.6017547613952493,
"grad_norm": 1.4296875,
"learning_rate": 1.7652915372857035e-05,
"loss": 1.4024,
"step": 703
},
{
"epoch": 0.6026107425636636,
"grad_norm": 1.4375,
"learning_rate": 1.7587549586443252e-05,
"loss": 1.349,
"step": 704
},
{
"epoch": 0.6034667237320779,
"grad_norm": 1.6484375,
"learning_rate": 1.7522239310831134e-05,
"loss": 1.5471,
"step": 705
},
{
"epoch": 0.6043227049004922,
"grad_norm": 1.828125,
"learning_rate": 1.7456985035120193e-05,
"loss": 1.4457,
"step": 706
},
{
"epoch": 0.6051786860689065,
"grad_norm": 1.671875,
"learning_rate": 1.7391787247990538e-05,
"loss": 1.2629,
"step": 707
},
{
"epoch": 0.6060346672373208,
"grad_norm": 1.5546875,
"learning_rate": 1.732664643769926e-05,
"loss": 1.4819,
"step": 708
},
{
"epoch": 0.606890648405735,
"grad_norm": 1.9296875,
"learning_rate": 1.726156309207674e-05,
"loss": 1.4687,
"step": 709
},
{
"epoch": 0.6077466295741494,
"grad_norm": 1.5078125,
"learning_rate": 1.7196537698523052e-05,
"loss": 1.4168,
"step": 710
},
{
"epoch": 0.6086026107425636,
"grad_norm": 1.7734375,
"learning_rate": 1.7131570744004215e-05,
"loss": 1.4856,
"step": 711
},
{
"epoch": 0.609458591910978,
"grad_norm": 1.5078125,
"learning_rate": 1.7066662715048666e-05,
"loss": 1.4287,
"step": 712
},
{
"epoch": 0.6103145730793923,
"grad_norm": 1.578125,
"learning_rate": 1.7001814097743528e-05,
"loss": 1.5557,
"step": 713
},
{
"epoch": 0.6111705542478065,
"grad_norm": 1.6015625,
"learning_rate": 1.693702537773099e-05,
"loss": 1.4353,
"step": 714
},
{
"epoch": 0.6120265354162209,
"grad_norm": 1.6796875,
"learning_rate": 1.687229704020471e-05,
"loss": 1.5126,
"step": 715
},
{
"epoch": 0.6128825165846351,
"grad_norm": 1.6171875,
"learning_rate": 1.6807629569906112e-05,
"loss": 1.479,
"step": 716
},
{
"epoch": 0.6137384977530495,
"grad_norm": 1.78125,
"learning_rate": 1.6743023451120832e-05,
"loss": 1.4706,
"step": 717
},
{
"epoch": 0.6145944789214637,
"grad_norm": 1.609375,
"learning_rate": 1.6678479167675006e-05,
"loss": 1.6114,
"step": 718
},
{
"epoch": 0.615450460089878,
"grad_norm": 1.625,
"learning_rate": 1.6613997202931746e-05,
"loss": 1.4916,
"step": 719
},
{
"epoch": 0.6163064412582924,
"grad_norm": 1.5703125,
"learning_rate": 1.6549578039787436e-05,
"loss": 1.3918,
"step": 720
},
{
"epoch": 0.6171624224267066,
"grad_norm": 1.4296875,
"learning_rate": 1.6485222160668146e-05,
"loss": 1.3791,
"step": 721
},
{
"epoch": 0.618018403595121,
"grad_norm": 1.5390625,
"learning_rate": 1.642093004752605e-05,
"loss": 1.5026,
"step": 722
},
{
"epoch": 0.6188743847635352,
"grad_norm": 1.640625,
"learning_rate": 1.635670218183575e-05,
"loss": 1.4059,
"step": 723
},
{
"epoch": 0.6197303659319495,
"grad_norm": 1.4765625,
"learning_rate": 1.629253904459073e-05,
"loss": 1.4202,
"step": 724
},
{
"epoch": 0.6205863471003638,
"grad_norm": 1.5,
"learning_rate": 1.622844111629972e-05,
"loss": 1.4348,
"step": 725
},
{
"epoch": 0.6214423282687781,
"grad_norm": 1.4921875,
"learning_rate": 1.616440887698313e-05,
"loss": 1.4223,
"step": 726
},
{
"epoch": 0.6222983094371923,
"grad_norm": 1.6015625,
"learning_rate": 1.6100442806169422e-05,
"loss": 1.4637,
"step": 727
},
{
"epoch": 0.6231542906056067,
"grad_norm": 1.5390625,
"learning_rate": 1.6036543382891512e-05,
"loss": 1.3871,
"step": 728
},
{
"epoch": 0.624010271774021,
"grad_norm": 1.5,
"learning_rate": 1.597271108568324e-05,
"loss": 1.5021,
"step": 729
},
{
"epoch": 0.6248662529424353,
"grad_norm": 1.6953125,
"learning_rate": 1.5908946392575714e-05,
"loss": 1.628,
"step": 730
},
{
"epoch": 0.6257222341108496,
"grad_norm": 1.546875,
"learning_rate": 1.5845249781093786e-05,
"loss": 1.4596,
"step": 731
},
{
"epoch": 0.6265782152792638,
"grad_norm": 1.46875,
"learning_rate": 1.578162172825244e-05,
"loss": 1.1683,
"step": 732
},
{
"epoch": 0.6274341964476782,
"grad_norm": 1.7265625,
"learning_rate": 1.5718062710553253e-05,
"loss": 1.3545,
"step": 733
},
{
"epoch": 0.6282901776160924,
"grad_norm": 1.5546875,
"learning_rate": 1.5654573203980784e-05,
"loss": 1.3087,
"step": 734
},
{
"epoch": 0.6291461587845067,
"grad_norm": 1.515625,
"learning_rate": 1.5591153683999043e-05,
"loss": 1.3387,
"step": 735
},
{
"epoch": 0.630002139952921,
"grad_norm": 1.4296875,
"learning_rate": 1.5527804625547938e-05,
"loss": 1.3403,
"step": 736
},
{
"epoch": 0.6308581211213353,
"grad_norm": 1.421875,
"learning_rate": 1.5464526503039666e-05,
"loss": 1.4556,
"step": 737
},
{
"epoch": 0.6317141022897497,
"grad_norm": 2.109375,
"learning_rate": 1.540131979035523e-05,
"loss": 1.3776,
"step": 738
},
{
"epoch": 0.6325700834581639,
"grad_norm": 1.5703125,
"learning_rate": 1.5338184960840824e-05,
"loss": 1.3059,
"step": 739
},
{
"epoch": 0.6334260646265782,
"grad_norm": 1.703125,
"learning_rate": 1.5275122487304335e-05,
"loss": 1.5742,
"step": 740
},
{
"epoch": 0.6342820457949925,
"grad_norm": 1.6015625,
"learning_rate": 1.5212132842011779e-05,
"loss": 1.4275,
"step": 741
},
{
"epoch": 0.6351380269634068,
"grad_norm": 1.5234375,
"learning_rate": 1.5149216496683787e-05,
"loss": 1.489,
"step": 742
},
{
"epoch": 0.635994008131821,
"grad_norm": 1.4609375,
"learning_rate": 1.5086373922492048e-05,
"loss": 1.4186,
"step": 743
},
{
"epoch": 0.6368499893002354,
"grad_norm": 1.7265625,
"learning_rate": 1.5023605590055767e-05,
"loss": 1.4414,
"step": 744
},
{
"epoch": 0.6377059704686497,
"grad_norm": 1.65625,
"learning_rate": 1.4960911969438213e-05,
"loss": 1.3893,
"step": 745
},
{
"epoch": 0.638561951637064,
"grad_norm": 1.6640625,
"learning_rate": 1.4898293530143095e-05,
"loss": 1.4831,
"step": 746
},
{
"epoch": 0.6394179328054783,
"grad_norm": 1.484375,
"learning_rate": 1.4835750741111138e-05,
"loss": 1.4675,
"step": 747
},
{
"epoch": 0.6402739139738926,
"grad_norm": 1.65625,
"learning_rate": 1.4773284070716503e-05,
"loss": 1.4084,
"step": 748
},
{
"epoch": 0.6411298951423069,
"grad_norm": 1.484375,
"learning_rate": 1.4710893986763347e-05,
"loss": 1.4119,
"step": 749
},
{
"epoch": 0.6419858763107211,
"grad_norm": 1.484375,
"learning_rate": 1.464858095648224e-05,
"loss": 1.464,
"step": 750
},
{
"epoch": 0.6428418574791355,
"grad_norm": 1.6796875,
"learning_rate": 1.4586345446526733e-05,
"loss": 1.3932,
"step": 751
},
{
"epoch": 0.6436978386475497,
"grad_norm": 1.859375,
"learning_rate": 1.4524187922969839e-05,
"loss": 1.4852,
"step": 752
},
{
"epoch": 0.644553819815964,
"grad_norm": 1.6875,
"learning_rate": 1.4462108851300523e-05,
"loss": 1.3278,
"step": 753
},
{
"epoch": 0.6454098009843784,
"grad_norm": 1.359375,
"learning_rate": 1.4400108696420264e-05,
"loss": 1.3441,
"step": 754
},
{
"epoch": 0.6462657821527926,
"grad_norm": 1.5,
"learning_rate": 1.4338187922639507e-05,
"loss": 1.3425,
"step": 755
},
|
{ |
|
"epoch": 0.647121763321207, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.4276346993674266e-05, |
|
"loss": 1.381, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.6479777444896212, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.4214586372642563e-05, |
|
"loss": 1.4587, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 0.6488337256580355, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 1.4152906522061048e-05, |
|
"loss": 1.3396, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.6496897068264498, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.4091307903841466e-05, |
|
"loss": 1.3532, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 0.6505456879948641, |
|
"grad_norm": 1.5, |
|
"learning_rate": 1.4029790979287216e-05, |
|
"loss": 1.3586, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.6514016691632785, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 1.3968356209089944e-05, |
|
"loss": 1.4067, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 0.6522576503316927, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.3907004053326006e-05, |
|
"loss": 1.4696, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.653113631500107, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 1.3845734971453114e-05, |
|
"loss": 1.4284, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 0.6539696126685213, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 1.3784549422306808e-05, |
|
"loss": 1.4767, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.6548255938369356, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.3723447864097105e-05, |
|
"loss": 1.579, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.6556815750053498, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 1.3662430754405004e-05, |
|
"loss": 1.3239, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.6565375561737642, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 1.360149855017906e-05, |
|
"loss": 1.1992, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 0.6573935373421784, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 1.3540651707732035e-05, |
|
"loss": 1.354, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.6582495185105928, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 1.3479890682737379e-05, |
|
"loss": 1.4431, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 0.6591054996790071, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.3419215930225899e-05, |
|
"loss": 1.3813, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.6599614808474213, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 1.3358627904582307e-05, |
|
"loss": 1.3639, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 0.6608174620158357, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.3298127059541828e-05, |
|
"loss": 1.3568, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.6616734431842499, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.3237713848186797e-05, |
|
"loss": 1.4048, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 0.6625294243526643, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 1.317738872294329e-05, |
|
"loss": 1.4503, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.6633854055210785, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.311715213557772e-05, |
|
"loss": 1.3446, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.6642413866894928, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.3057004537193423e-05, |
|
"loss": 1.2524, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.6650973678579071, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.2996946378227352e-05, |
|
"loss": 1.5227, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 0.6659533490263214, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 1.2936978108446624e-05, |
|
"loss": 1.4289, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.6668093301947358, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.28771001769452e-05, |
|
"loss": 1.6197, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 0.66766531136315, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.2817313032140505e-05, |
|
"loss": 1.4775, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.6685212925315643, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 1.2757617121770093e-05, |
|
"loss": 1.4731, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 0.6693772736999786, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.2698012892888272e-05, |
|
"loss": 1.4356, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.6702332548683929, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 1.263850079186274e-05, |
|
"loss": 1.3329, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 0.6710892360368071, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 1.257908126437129e-05, |
|
"loss": 1.4069, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.6719452172052215, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.2519754755398422e-05, |
|
"loss": 1.501, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.6728011983736358, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.2460521709232043e-05, |
|
"loss": 1.5482, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.6736571795420501, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.2401382569460119e-05, |
|
"loss": 1.3473, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 0.6745131607104644, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 1.2342337778967384e-05, |
|
"loss": 1.4373, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.6753691418788786, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 1.2283387779932005e-05, |
|
"loss": 1.4588, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 0.676225123047293, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.2224533013822238e-05, |
|
"loss": 1.2549, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.6770811042157072, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 1.216577392139319e-05, |
|
"loss": 1.4916, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 0.6779370853841216, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 1.2107110942683459e-05, |
|
"loss": 1.4571, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.6787930665525358, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.2048544517011862e-05, |
|
"loss": 1.4943, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 0.6796490477209501, |
|
"grad_norm": 1.5, |
|
"learning_rate": 1.1990075082974139e-05, |
|
"loss": 1.3433, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.6805050288893645, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.1931703078439704e-05, |
|
"loss": 1.5043, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.6813610100577787, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.1873428940548292e-05, |
|
"loss": 1.5344, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.6822169912261931, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 1.181525310570677e-05, |
|
"loss": 1.4948, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 0.6830729723946073, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 1.1757176009585793e-05, |
|
"loss": 1.4303, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.6839289535630216, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.1699198087116589e-05, |
|
"loss": 1.4565, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 0.6847849347314359, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.1641319772487699e-05, |
|
"loss": 1.5477, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.6856409158998502, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 1.158354149914169e-05, |
|
"loss": 1.4628, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 0.6864968970682644, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 1.1525863699771966e-05, |
|
"loss": 1.5269, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 0.6873528782366788, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.1468286806319462e-05, |
|
"loss": 1.355, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 0.6882088594050931, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.1410811249969475e-05, |
|
"loss": 1.531, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.6890648405735074, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.1353437461148377e-05, |
|
"loss": 1.596, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.6899208217419217, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 1.129616586952042e-05, |
|
"loss": 1.2953, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 0.6907768029103359, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 1.1238996903984537e-05, |
|
"loss": 1.2693, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 0.6916327840787503, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.1181930992671078e-05, |
|
"loss": 1.176, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.6924887652471645, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.112496856293867e-05, |
|
"loss": 1.3185, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 0.6933447464155789, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 1.1068110041370938e-05, |
|
"loss": 1.4027, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.6942007275839932, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 1.10113558537734e-05, |
|
"loss": 1.3788, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 0.6950567087524074, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 1.0954706425170197e-05, |
|
"loss": 1.4144, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.6959126899208218, |
|
"grad_norm": 1.625, |
|
"learning_rate": 1.0898162179800947e-05, |
|
"loss": 1.5627, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 0.696768671089236, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.0841723541117594e-05, |
|
"loss": 1.5203, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 0.6976246522576504, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.0785390931781165e-05, |
|
"loss": 1.5606, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.6984806334260646, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.0729164773658693e-05, |
|
"loss": 1.399, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.6993366145944789, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 1.0673045487819975e-05, |
|
"loss": 1.3372, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 0.7001925957628932, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.0617033494534486e-05, |
|
"loss": 1.3698, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 0.7010485769313075, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 1.0561129213268187e-05, |
|
"loss": 1.4297, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 0.7010485769313075, |
|
"eval_loss": 1.5197569131851196, |
|
"eval_runtime": 21.3185, |
|
"eval_samples_per_second": 18.294, |
|
"eval_steps_per_second": 18.294, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 0.7019045580997219, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 1.0505333062680383e-05, |
|
"loss": 1.4227, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.7027605392681361, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.0449645460620649e-05, |
|
"loss": 1.3861, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 0.7036165204365504, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 1.0394066824125603e-05, |
|
"loss": 1.4062, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.7044725016049647, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 1.0338597569415877e-05, |
|
"loss": 1.3354, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 0.705328482773379, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.028323811189293e-05, |
|
"loss": 1.4555, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.7061844639417932, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 1.0227988866135996e-05, |
|
"loss": 1.2839, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.7070404451102076, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.0172850245898893e-05, |
|
"loss": 1.5304, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.7078964262786218, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.0117822664107038e-05, |
|
"loss": 1.6997, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 0.7087524074470362, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.0062906532854283e-05, |
|
"loss": 1.3367, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.7096083886154505, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.000810226339981e-05, |
|
"loss": 1.3577, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 0.7104643697838647, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 9.95341026616513e-06, |
|
"loss": 1.5752, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.7113203509522791, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 9.898830950730933e-06, |
|
"loss": 1.5784, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 0.7121763321206933, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 9.844364725834057e-06, |
|
"loss": 1.527, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.7130323132891077, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 9.790011999364413e-06, |
|
"loss": 1.5338, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 0.7138882944575219, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 9.735773178361964e-06, |
|
"loss": 1.3994, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.7147442756259362, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 9.681648669013619e-06, |
|
"loss": 1.4432, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.7156002567943506, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 9.627638876650243e-06, |
|
"loss": 1.3741, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.7164562379627648, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 9.573744205743612e-06, |
|
"loss": 1.3791, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 0.7173122191311792, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 9.519965059903349e-06, |
|
"loss": 1.4102, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 0.7181682002995934, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 9.46630184187393e-06, |
|
"loss": 1.3081, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 0.7190241814680077, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 9.412754953531663e-06, |
|
"loss": 1.3067, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.719880162636422, |
|
"grad_norm": 1.5, |
|
"learning_rate": 9.359324795881708e-06, |
|
"loss": 1.3967, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 0.7207361438048363, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 9.306011769054998e-06, |
|
"loss": 1.3527, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 0.7215921249732505, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 9.252816272305329e-06, |
|
"loss": 1.4973, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 0.7224481061416649, |
|
"grad_norm": 1.5, |
|
"learning_rate": 9.199738704006321e-06, |
|
"loss": 1.3451, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 0.7233040873100792, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 9.146779461648436e-06, |
|
"loss": 1.3985, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.7241600684784935, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 9.09393894183601e-06, |
|
"loss": 1.5013, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 0.7250160496469078, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 9.041217540284277e-06, |
|
"loss": 1.4524, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 0.725872030815322, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 8.98861565181644e-06, |
|
"loss": 1.4127, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.7267280119837364, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 8.936133670360644e-06, |
|
"loss": 1.5011, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 0.7275839931521506, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 8.883771988947099e-06, |
|
"loss": 1.4038, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.728439974320565, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 8.831530999705104e-06, |
|
"loss": 1.4896, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 0.7292959554889793, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 8.77941109386009e-06, |
|
"loss": 1.3577, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.7301519366573935, |
|
"grad_norm": 1.5, |
|
"learning_rate": 8.727412661730724e-06, |
|
"loss": 1.3243, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 0.7310079178258079, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 8.675536092725966e-06, |
|
"loss": 1.482, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.7318638989942221, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 8.623781775342183e-06, |
|
"loss": 1.5252, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.7327198801626364, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 8.572150097160179e-06, |
|
"loss": 1.5078, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.7335758613310507, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 8.520641444842373e-06, |
|
"loss": 1.4596, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 0.734431842499465, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 8.469256204129828e-06, |
|
"loss": 1.4019, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 0.7352878236678793, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 8.417994759839401e-06, |
|
"loss": 1.3862, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 0.7361438048362936, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 8.36685749586087e-06, |
|
"loss": 1.5438, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.736999786004708, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 8.315844795154024e-06, |
|
"loss": 1.1669, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 0.7378557671731222, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 8.264957039745836e-06, |
|
"loss": 1.2759, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 0.7387117483415365, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 8.214194610727557e-06, |
|
"loss": 1.3324, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 0.7395677295099508, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 8.163557888251917e-06, |
|
"loss": 1.4036, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.7404237106783651, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 8.113047251530215e-06, |
|
"loss": 1.4018, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.7412796918467793, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 8.062663078829525e-06, |
|
"loss": 1.3247, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 0.7421356730151937, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 8.012405747469862e-06, |
|
"loss": 1.4302, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 0.7429916541836079, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 7.96227563382132e-06, |
|
"loss": 1.5155, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.7438476353520223, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 7.912273113301306e-06, |
|
"loss": 1.4633, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 0.7447036165204366, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 7.862398560371664e-06, |
|
"loss": 1.3607, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.7455595976888508, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 7.812652348535948e-06, |
|
"loss": 1.4725, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 0.7464155788572652, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 7.763034850336553e-06, |
|
"loss": 1.4298, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.7472715600256794, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 7.713546437351965e-06, |
|
"loss": 1.3457, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 0.7481275411940937, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 7.66418748019396e-06, |
|
"loss": 1.265, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 0.748983522362508, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 7.614958348504853e-06, |
|
"loss": 1.5109, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.7498395035309223, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 7.565859410954718e-06, |
|
"loss": 1.457, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.7506954846993367, |
|
"grad_norm": 1.625, |
|
"learning_rate": 7.516891035238596e-06, |
|
"loss": 1.6443, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 0.7515514658677509, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 7.468053588073803e-06, |
|
"loss": 1.4027, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 0.7524074470361652, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 7.4193474351971245e-06, |
|
"loss": 1.4607, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 0.7532634282045795, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 7.3707729413621055e-06, |
|
"loss": 1.4838, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.7541194093729938, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 7.3223304703363135e-06, |
|
"loss": 1.309, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 0.754975390541408, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 7.274020384898628e-06, |
|
"loss": 1.3888, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.7558313717098224, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 7.225843046836514e-06, |
|
"loss": 1.649, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 0.7566873528782366, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 7.177798816943287e-06, |
|
"loss": 1.4189, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 0.757543334046651, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 7.129888055015455e-06, |
|
"loss": 1.4323, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.7583993152150653, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 7.0821111198499795e-06, |
|
"loss": 1.4199, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 0.7592552963834795, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 7.034468369241651e-06, |
|
"loss": 1.6481, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 0.7601112775518939, |
|
"grad_norm": 1.625, |
|
"learning_rate": 6.986960159980327e-06, |
|
"loss": 1.4306, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.7609672587203081, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 6.939586847848334e-06, |
|
"loss": 1.4569, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 0.7618232398887225, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 6.892348787617769e-06, |
|
"loss": 1.4033, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.7626792210571367, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 6.845246333047836e-06, |
|
"loss": 1.5268, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 0.763535202225551, |
|
"grad_norm": 1.5, |
|
"learning_rate": 6.79827983688221e-06, |
|
"loss": 1.4712, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 0.7643911833939653, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 6.751449650846389e-06, |
|
"loss": 1.3403, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 0.7652471645623796, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 6.704756125645081e-06, |
|
"loss": 1.3823, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 0.766103145730794, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 6.658199610959537e-06, |
|
"loss": 1.2359, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.7669591268992082, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 6.611780455444979e-06, |
|
"loss": 1.3427, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.7678151080676225, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 6.565499006727938e-06, |
|
"loss": 1.4077, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 0.7686710892360368, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 6.51935561140371e-06, |
|
"loss": 1.5555, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 0.7695270704044511, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 6.4733506150337016e-06, |
|
"loss": 1.3699, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 0.7703830515728654, |
|
"grad_norm": 1.625, |
|
"learning_rate": 6.427484362142877e-06, |
|
"loss": 1.3224, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.7712390327412797, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 6.381757196217181e-06, |
|
"loss": 1.5472, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 0.772095013909694, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 6.336169459700933e-06, |
|
"loss": 1.5253, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 0.7729509950781083, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 6.290721493994317e-06, |
|
"loss": 1.3984, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 0.7738069762465226, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 6.245413639450757e-06, |
|
"loss": 1.3538, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 0.7746629574149368, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 6.200246235374438e-06, |
|
"loss": 1.5044, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.7755189385833512, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 6.155219620017708e-06, |
|
"loss": 1.4854, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 0.7763749197517654, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 6.1103341305785655e-06, |
|
"loss": 1.6012, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 0.7772309009201798, |
|
"grad_norm": 1.625, |
|
"learning_rate": 6.065590103198165e-06, |
|
"loss": 1.4091, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 0.778086882088594, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 6.020987872958236e-06, |
|
"loss": 1.4079, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 0.7789428632570083, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 5.97652777387864e-06, |
|
"loss": 1.4951, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.7797988444254227, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 5.932210138914821e-06, |
|
"loss": 1.5049, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 0.7806548255938369, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 5.888035299955325e-06, |
|
"loss": 1.3488, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 0.7815108067622513, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 5.844003587819327e-06, |
|
"loss": 1.5192, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 0.7823667879306655, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 5.800115332254144e-06, |
|
"loss": 1.549, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 0.7832227690990798, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 5.75637086193278e-06, |
|
"loss": 1.4354, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.7840787502674941, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 5.712770504451426e-06, |
|
"loss": 1.4676, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 0.7849347314359084, |
|
"grad_norm": 1.625, |
|
"learning_rate": 5.669314586327054e-06, |
|
"loss": 1.5199, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 0.7857907126043226, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 5.626003432994933e-06, |
|
"loss": 1.4853, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 0.786646693772737, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 5.582837368806224e-06, |
|
"loss": 1.3789, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 0.7875026749411513, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 5.539816717025515e-06, |
|
"loss": 1.5069, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.7883586561095656, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 5.496941799828443e-06, |
|
"loss": 1.364, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 0.7892146372779799, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 5.454212938299255e-06, |
|
"loss": 1.4134, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 0.7900706184463941, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 5.411630452428395e-06, |
|
"loss": 1.4641, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 0.7909265996148085, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 5.369194661110138e-06, |
|
"loss": 1.2542, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.7917825807832227, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 5.326905882140168e-06, |
|
"loss": 1.5729, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.7926385619516371, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 5.284764432213221e-06, |
|
"loss": 1.5403, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 0.7934945431200514, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 5.242770626920695e-06, |
|
"loss": 1.418, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 0.7943505242884656, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 5.200924780748323e-06, |
|
"loss": 1.4128, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 0.79520650545688, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 5.15922720707378e-06, |
|
"loss": 1.4845, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 0.7960624866252942, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 5.117678218164338e-06, |
|
"loss": 1.5405, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.7969184677937086, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 5.076278125174555e-06, |
|
"loss": 1.361, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 0.7977744489621228, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 5.0350272381439244e-06, |
|
"loss": 1.5649, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 0.7986304301305371, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 4.993925865994548e-06, |
|
"loss": 1.388, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 0.7994864112989514, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 4.952974316528833e-06, |
|
"loss": 1.5386, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 0.8003423924673657, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 4.912172896427205e-06, |
|
"loss": 1.2794, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.8011983736357801, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 4.8715219112457635e-06, |
|
"loss": 1.7561, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.8011983736357801, |
|
"eval_loss": 1.516785979270935, |
|
"eval_runtime": 21.3507, |
|
"eval_samples_per_second": 18.266, |
|
"eval_steps_per_second": 18.266, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.8020543548041943, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 4.8310216654140425e-06, |
|
"loss": 1.5413, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 0.8029103359726086, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 4.790672462232715e-06, |
|
"loss": 1.3485, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 0.8037663171410229, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 4.750474603871283e-06, |
|
"loss": 1.3616, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 0.8046222983094372, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 4.710428391365887e-06, |
|
"loss": 1.5232, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.8054782794778514, |
|
"grad_norm": 1.625, |
|
"learning_rate": 4.670534124616982e-06, |
|
"loss": 1.5764, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 0.8063342606462658, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 4.630792102387155e-06, |
|
"loss": 1.4513, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 0.8071902418146801, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 4.591202622298824e-06, |
|
"loss": 1.4137, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 0.8080462229830944, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 4.551765980832059e-06, |
|
"loss": 1.3718, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 0.8089022041515087, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 4.512482473322341e-06, |
|
"loss": 1.4205, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.8097581853199229, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 4.473352393958338e-06, |
|
"loss": 1.5571, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 0.8106141664883373, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 4.4343760357797386e-06, |
|
"loss": 1.2288, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 0.8114701476567515, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 4.3955536906750135e-06, |
|
"loss": 1.3646, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 0.8123261288251659, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 4.356885649379269e-06, |
|
"loss": 1.4272, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 0.8131821099935801, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 4.318372201472037e-06, |
|
"loss": 1.4271, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.8140380911619944, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 4.280013635375138e-06, |
|
"loss": 1.5182, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 0.8148940723304088, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 4.2418102383504885e-06, |
|
"loss": 1.3662, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 0.815750053498823, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 4.203762296497965e-06, |
|
"loss": 1.5375, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 0.8166060346672374, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 4.1658700947532795e-06, |
|
"loss": 1.4522, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 0.8174620158356516, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 4.128133916885804e-06, |
|
"loss": 1.4576, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.8183179970040659, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 4.0905540454965006e-06, |
|
"loss": 1.3513, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 0.8191739781724802, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 4.053130762015736e-06, |
|
"loss": 1.4043, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 0.8200299593408945, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 4.015864346701251e-06, |
|
"loss": 1.3098, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 0.8208859405093087, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 3.978755078635995e-06, |
|
"loss": 1.5399, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 0.8217419216777231, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 3.941803235726069e-06, |
|
"loss": 1.6757, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.8225979028461374, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 3.90500909469865e-06, |
|
"loss": 1.3494, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 0.8234538840145517, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 3.8683729310998926e-06, |
|
"loss": 1.3379, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 0.824309865182966, |
|
"grad_norm": 1.625, |
|
"learning_rate": 3.831895019292897e-06, |
|
"loss": 1.5126, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 0.8251658463513802, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 3.7955756324556197e-06, |
|
"loss": 1.507, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 0.8260218275197946, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 3.7594150425788675e-06, |
|
"loss": 1.3546, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.8268778086882088, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 3.7234135204642195e-06, |
|
"loss": 1.466, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 0.8277337898566232, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 3.687571335722023e-06, |
|
"loss": 1.388, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 0.8285897710250375, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 3.651888756769381e-06, |
|
"loss": 1.4069, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 0.8294457521934517, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 3.6163660508281154e-06, |
|
"loss": 1.451, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 0.8303017333618661, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 3.5810034839228015e-06, |
|
"loss": 1.4336, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.8311577145302803, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 3.5458013208787333e-06, |
|
"loss": 1.4418, |
|
"step": 971 |
|
}, |
|
{ |
|
"epoch": 0.8320136956986947, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 3.5107598253199758e-06, |
|
"loss": 1.4126, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 0.8328696768671089, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 3.4758792596673725e-06, |
|
"loss": 1.3229, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 0.8337256580355232, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 3.4411598851365966e-06, |
|
"loss": 1.2822, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 0.8345816392039375, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 3.406601961736164e-06, |
|
"loss": 1.607, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.8354376203723518, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 3.372205748265522e-06, |
|
"loss": 1.5054, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 0.8362936015407662, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 3.337971502313095e-06, |
|
"loss": 1.4882, |
|
"step": 977 |
|
}, |
|
{ |
|
"epoch": 0.8371495827091804, |
|
"grad_norm": 1.625, |
|
"learning_rate": 3.3038994802543467e-06, |
|
"loss": 1.5285, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 0.8380055638775947, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 3.2699899372498733e-06, |
|
"loss": 1.4404, |
|
"step": 979 |
|
}, |
|
{ |
|
"epoch": 0.838861545046009, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 3.236243127243477e-06, |
|
"loss": 1.5433, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.8397175262144233, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 3.202659302960301e-06, |
|
"loss": 1.207, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 0.8405735073828375, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 3.169238715904882e-06, |
|
"loss": 1.4336, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 0.8414294885512519, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 3.135981616359315e-06, |
|
"loss": 1.4036, |
|
"step": 983 |
|
}, |
|
{ |
|
"epoch": 0.8422854697196661, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 3.1028882533813643e-06, |
|
"loss": 1.4233, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 0.8431414508880805, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 3.0699588748025755e-06, |
|
"loss": 1.3475, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.8439974320564948, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 3.037193727226445e-06, |
|
"loss": 1.3735, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 0.844853413224909, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 3.0045930560265666e-06, |
|
"loss": 1.49, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 0.8457093943933234, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 2.9721571053448053e-06, |
|
"loss": 1.5413, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 0.8465653755617376, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 2.9398861180894355e-06, |
|
"loss": 1.4234, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 0.847421356730152, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 2.9077803359333607e-06, |
|
"loss": 1.542, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.8482773378985662, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 2.8758399993122854e-06, |
|
"loss": 1.4682, |
|
"step": 991 |
|
}, |
|
{ |
|
"epoch": 0.8491333190669805, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 2.8440653474229085e-06, |
|
"loss": 1.3124, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 0.8499893002353949, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 2.812456618221143e-06, |
|
"loss": 1.5196, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 0.8508452814038091, |
|
"grad_norm": 1.5, |
|
"learning_rate": 2.7810140484203188e-06, |
|
"loss": 1.4316, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 0.8517012625722234, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 2.7497378734894497e-06, |
|
"loss": 1.408, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.8525572437406377, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 2.718628327651407e-06, |
|
"loss": 1.4881, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 0.853413224909052, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 2.6876856438812296e-06, |
|
"loss": 1.4838, |
|
"step": 997 |
|
}, |
|
{ |
|
"epoch": 0.8542692060774663, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 2.6569100539043325e-06, |
|
"loss": 1.4737, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 0.8551251872458806, |
|
"grad_norm": 1.75, |
|
"learning_rate": 2.626301788194785e-06, |
|
"loss": 1.319, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 0.8559811684142948, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 2.595861075973613e-06, |
|
"loss": 1.2919, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.8568371495827092, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 2.5655881452070264e-06, |
|
"loss": 1.5445, |
|
"step": 1001 |
|
}, |
|
{ |
|
"epoch": 0.8576931307511235, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 2.5354832226047705e-06, |
|
"loss": 1.2502, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 0.8585491119195378, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 2.5055465336183774e-06, |
|
"loss": 1.3512, |
|
"step": 1003 |
|
}, |
|
{ |
|
"epoch": 0.8594050930879521, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 2.475778302439524e-06, |
|
"loss": 1.4999, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 0.8602610742563663, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 2.4461787519983127e-06, |
|
"loss": 1.2433, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.8611170554247807, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 2.416748103961625e-06, |
|
"loss": 1.4821, |
|
"step": 1006 |
|
}, |
|
{ |
|
"epoch": 0.8619730365931949, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 2.3874865787314598e-06, |
|
"loss": 1.4389, |
|
"step": 1007 |
|
}, |
|
{ |
|
"epoch": 0.8628290177616093, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 2.3583943954432725e-06, |
|
"loss": 1.6777, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 0.8636849989300235, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 2.3294717719643534e-06, |
|
"loss": 1.5674, |
|
"step": 1009 |
|
}, |
|
{ |
|
"epoch": 0.8645409800984378, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 2.300718924892159e-06, |
|
"loss": 1.3248, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.8653969612668522, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 2.2721360695527437e-06, |
|
"loss": 1.3882, |
|
"step": 1011 |
|
}, |
|
{ |
|
"epoch": 0.8662529424352664, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 2.243723419999097e-06, |
|
"loss": 1.4108, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 0.8671089236036807, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 2.2154811890095605e-06, |
|
"loss": 1.2796, |
|
"step": 1013 |
|
}, |
|
{ |
|
"epoch": 0.867964904772095, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 2.1874095880862505e-06, |
|
"loss": 1.4911, |
|
"step": 1014 |
|
}, |
|
{ |
|
"epoch": 0.8688208859405093, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 2.1595088274534436e-06, |
|
"loss": 1.4234, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.8696768671089236, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 2.1317791160560318e-06, |
|
"loss": 1.4759, |
|
"step": 1016 |
|
}, |
|
{ |
|
"epoch": 0.8705328482773379, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 2.1042206615579237e-06, |
|
"loss": 1.5507, |
|
"step": 1017 |
|
}, |
|
{ |
|
"epoch": 0.8713888294457522, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 2.076833670340533e-06, |
|
"loss": 1.3777, |
|
"step": 1018 |
|
}, |
|
{ |
|
"epoch": 0.8722448106141665, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 2.0496183475011894e-06, |
|
"loss": 1.4103, |
|
"step": 1019 |
|
}, |
|
{ |
|
"epoch": 0.8731007917825808, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 2.0225748968516284e-06, |
|
"loss": 1.2965, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.873956772950995, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 1.995703520916456e-06, |
|
"loss": 1.4232, |
|
"step": 1021 |
|
}, |
|
{ |
|
"epoch": 0.8748127541194094, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 1.9690044209316444e-06, |
|
"loss": 1.4387, |
|
"step": 1022 |
|
}, |
|
{ |
|
"epoch": 0.8756687352878236, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 1.9424777968430146e-06, |
|
"loss": 1.4927, |
|
"step": 1023 |
|
}, |
|
{ |
|
"epoch": 0.876524716456238, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 1.916123847304721e-06, |
|
"loss": 1.6027, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 0.8773806976246522, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.8899427696778105e-06, |
|
"loss": 1.5225, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.8782366787930665, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.8639347600286877e-06, |
|
"loss": 1.4555, |
|
"step": 1026 |
|
}, |
|
{ |
|
"epoch": 0.8790926599614809, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.8381000131277e-06, |
|
"loss": 1.4311, |
|
"step": 1027 |
|
}, |
|
{ |
|
"epoch": 0.8799486411298951, |
|
"grad_norm": 1.625, |
|
"learning_rate": 1.8124387224476347e-06, |
|
"loss": 1.5006, |
|
"step": 1028 |
|
}, |
|
{ |
|
"epoch": 0.8808046222983095, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.7869510801623053e-06, |
|
"loss": 1.4779, |
|
"step": 1029 |
|
}, |
|
{ |
|
"epoch": 0.8816606034667237, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 1.761637277145095e-06, |
|
"loss": 1.2851, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.882516584635138, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.7364975029675184e-06, |
|
"loss": 1.4212, |
|
"step": 1031 |
|
}, |
|
{ |
|
"epoch": 0.8833725658035523, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 1.7115319458978236e-06, |
|
"loss": 1.5496, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 0.8842285469719666, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 1.6867407928995577e-06, |
|
"loss": 1.4217, |
|
"step": 1033 |
|
}, |
|
{ |
|
"epoch": 0.885084528140381, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 1.6621242296301964e-06, |
|
"loss": 1.5435, |
|
"step": 1034 |
|
}, |
|
{ |
|
"epoch": 0.8859405093087952, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 1.6376824404397251e-06, |
|
"loss": 1.3545, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.8867964904772095, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.613415608369284e-06, |
|
"loss": 1.4856, |
|
"step": 1036 |
|
}, |
|
{ |
|
"epoch": 0.8876524716456238, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 1.5893239151497652e-06, |
|
"loss": 1.3376, |
|
"step": 1037 |
|
}, |
|
{ |
|
"epoch": 0.8885084528140381, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 1.5654075412004893e-06, |
|
"loss": 1.4068, |
|
"step": 1038 |
|
}, |
|
{ |
|
"epoch": 0.8893644339824524, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.5416666656278222e-06, |
|
"loss": 1.3882, |
|
"step": 1039 |
|
}, |
|
{ |
|
"epoch": 0.8902204151508667, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 1.5181014662238508e-06, |
|
"loss": 1.2639, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.8910763963192809, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.4947121194650527e-06, |
|
"loss": 1.388, |
|
"step": 1041 |
|
}, |
|
{ |
|
"epoch": 0.8919323774876953, |
|
"grad_norm": 1.5, |
|
"learning_rate": 1.471498800510962e-06, |
|
"loss": 1.5616, |
|
"step": 1042 |
|
}, |
|
{ |
|
"epoch": 0.8927883586561096, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.448461683202873e-06, |
|
"loss": 1.5843, |
|
"step": 1043 |
|
}, |
|
{ |
|
"epoch": 0.8936443398245238, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.4256009400625214e-06, |
|
"loss": 1.4752, |
|
"step": 1044 |
|
}, |
|
{ |
|
"epoch": 0.8945003209929382, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 1.4029167422908107e-06, |
|
"loss": 1.4538, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.8953563021613524, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 1.3804092597665186e-06, |
|
"loss": 1.4397, |
|
"step": 1046 |
|
}, |
|
{ |
|
"epoch": 0.8962122833297668, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.3580786610450202e-06, |
|
"loss": 1.3437, |
|
"step": 1047 |
|
}, |
|
{ |
|
"epoch": 0.897068264498181, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 1.335925113357042e-06, |
|
"loss": 1.3665, |
|
"step": 1048 |
|
}, |
|
{ |
|
"epoch": 0.8979242456665953, |
|
"grad_norm": 3.375, |
|
"learning_rate": 1.3139487826073937e-06, |
|
"loss": 1.2993, |
|
"step": 1049 |
|
}, |
|
{ |
|
"epoch": 0.8987802268350096, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.2921498333737375e-06, |
|
"loss": 1.4769, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.8996362080034239, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 1.2705284289053403e-06, |
|
"loss": 1.5111, |
|
"step": 1051 |
|
}, |
|
{ |
|
"epoch": 0.9004921891718383, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.2490847311218773e-06, |
|
"loss": 1.619, |
|
"step": 1052 |
|
}, |
|
{ |
|
"epoch": 0.9013481703402525, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.2278189006121904e-06, |
|
"loss": 1.6656, |
|
"step": 1053 |
|
}, |
|
{ |
|
"epoch": 0.9013481703402525, |
|
"eval_loss": 1.516126275062561, |
|
"eval_runtime": 21.3286, |
|
"eval_samples_per_second": 18.285, |
|
"eval_steps_per_second": 18.285, |
|
"step": 1053 |
|
}, |
|
{ |
|
"epoch": 0.9022041515086668, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 1.2067310966330959e-06, |
|
"loss": 1.0402, |
|
"step": 1054 |
|
}, |
|
{ |
|
"epoch": 0.9030601326770811, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 1.185821477108212e-06, |
|
"loss": 1.231, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.9039161138454954, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.1650901986267365e-06, |
|
"loss": 1.2964, |
|
"step": 1056 |
|
}, |
|
{ |
|
"epoch": 0.9047720950139096, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.144537416442315e-06, |
|
"loss": 1.5158, |
|
"step": 1057 |
|
}, |
|
{ |
|
"epoch": 0.905628076182324, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 1.1241632844718465e-06, |
|
"loss": 1.5144, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 0.9064840573507383, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 1.1039679552943493e-06, |
|
"loss": 1.4618, |
|
"step": 1059 |
|
}, |
|
{ |
|
"epoch": 0.9073400385191526, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 1.0839515801498084e-06, |
|
"loss": 1.3926, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.9081960196875669, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.0641143089380523e-06, |
|
"loss": 1.454, |
|
"step": 1061 |
|
}, |
|
{ |
|
"epoch": 0.9090520008559811, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 1.0444562902176296e-06, |
|
"loss": 1.349, |
|
"step": 1062 |
|
}, |
|
{ |
|
"epoch": 0.9099079820243955, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.0249776712046744e-06, |
|
"loss": 1.33, |
|
"step": 1063 |
|
}, |
|
{ |
|
"epoch": 0.9107639631928097, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 1.0056785977718447e-06, |
|
"loss": 1.3807, |
|
"step": 1064 |
|
}, |
|
{ |
|
"epoch": 0.9116199443612241, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 9.865592144471886e-07, |
|
"loss": 1.539, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.9124759255296383, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 9.67619664413086e-07, |
|
"loss": 1.4141, |
|
"step": 1066 |
|
}, |
|
{ |
|
"epoch": 0.9133319066980526, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 9.488600895051714e-07, |
|
"loss": 1.4709, |
|
"step": 1067 |
|
}, |
|
{ |
|
"epoch": 0.914187887866467, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 9.302806302112693e-07, |
|
"loss": 1.383, |
|
"step": 1068 |
|
}, |
|
{ |
|
"epoch": 0.9150438690348812, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 9.118814256703523e-07, |
|
"loss": 1.3668, |
|
"step": 1069 |
|
}, |
|
{ |
|
"epoch": 0.9158998502032956, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 8.936626136714754e-07, |
|
"loss": 1.4376, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.9167558313717098, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 8.756243306527689e-07, |
|
"loss": 1.3291, |
|
"step": 1071 |
|
}, |
|
{ |
|
"epoch": 0.9176118125401241, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 8.577667117004085e-07, |
|
"loss": 1.447, |
|
"step": 1072 |
|
}, |
|
{ |
|
"epoch": 0.9184677937085384, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 8.400898905475934e-07, |
|
"loss": 1.4864, |
|
"step": 1073 |
|
}, |
|
{ |
|
"epoch": 0.9193237748769527, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 8.225939995735593e-07, |
|
"loss": 1.3948, |
|
"step": 1074 |
|
}, |
|
{ |
|
"epoch": 0.920179756045367, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 8.05279169802578e-07, |
|
"loss": 1.5526, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.9210357372137813, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 7.881455309029894e-07, |
|
"loss": 1.2545, |
|
"step": 1076 |
|
}, |
|
{ |
|
"epoch": 0.9218917183821956, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 7.711932111862025e-07, |
|
"loss": 1.4127, |
|
"step": 1077 |
|
}, |
|
{ |
|
"epoch": 0.9227476995506099, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 7.544223376057702e-07, |
|
"loss": 1.3862, |
|
"step": 1078 |
|
}, |
|
{ |
|
"epoch": 0.9236036807190242, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 7.378330357564134e-07, |
|
"loss": 1.3634, |
|
"step": 1079 |
|
}, |
|
{ |
|
"epoch": 0.9244596618874384, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 7.214254298730793e-07, |
|
"loss": 1.4651, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.9253156430558528, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 7.051996428300317e-07, |
|
"loss": 1.5383, |
|
"step": 1081 |
|
}, |
|
{ |
|
"epoch": 0.926171624224267, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 6.891557961399175e-07, |
|
"loss": 1.3888, |
|
"step": 1082 |
|
}, |
|
{ |
|
"epoch": 0.9270276053926814, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 6.73294009952849e-07, |
|
"loss": 1.3839, |
|
"step": 1083 |
|
}, |
|
{ |
|
"epoch": 0.9278835865610957, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 6.576144030555259e-07, |
|
"loss": 1.3927, |
|
"step": 1084 |
|
}, |
|
{ |
|
"epoch": 0.9287395677295099, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 6.421170928703174e-07, |
|
"loss": 1.4722, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.9295955488979243, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 6.268021954544096e-07, |
|
"loss": 1.2901, |
|
"step": 1086 |
|
}, |
|
{ |
|
"epoch": 0.9304515300663385, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 6.116698254989256e-07, |
|
"loss": 1.39, |
|
"step": 1087 |
|
}, |
|
{ |
|
"epoch": 0.9313075112347529, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 5.967200963280545e-07, |
|
"loss": 1.2328, |
|
"step": 1088 |
|
}, |
|
{ |
|
"epoch": 0.9321634924031671, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 5.819531198982264e-07, |
|
"loss": 1.3817, |
|
"step": 1089 |
|
}, |
|
{ |
|
"epoch": 0.9330194735715814, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 5.673690067972553e-07, |
|
"loss": 1.521, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.9338754547399957, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 5.529678662435228e-07, |
|
"loss": 1.5298, |
|
"step": 1091 |
|
}, |
|
{ |
|
"epoch": 0.93473143590841, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 5.387498060851454e-07, |
|
"loss": 1.4792, |
|
"step": 1092 |
|
}, |
|
{ |
|
"epoch": 0.9355874170768244, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 5.247149327991835e-07, |
|
"loss": 1.3686, |
|
"step": 1093 |
|
}, |
|
{ |
|
"epoch": 0.9364433982452386, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 5.108633514908367e-07, |
|
"loss": 1.4196, |
|
"step": 1094 |
|
}, |
|
{ |
|
"epoch": 0.9372993794136529, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 4.971951658926527e-07, |
|
"loss": 1.3794, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.9381553605820672, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 4.83710478363758e-07, |
|
"loss": 1.5475, |
|
"step": 1096 |
|
}, |
|
{ |
|
"epoch": 0.9390113417504815, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 4.704093898890871e-07, |
|
"loss": 1.4144, |
|
"step": 1097 |
|
}, |
|
{ |
|
"epoch": 0.9398673229188957, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 4.5729200007862683e-07, |
|
"loss": 1.3816, |
|
"step": 1098 |
|
}, |
|
{ |
|
"epoch": 0.9407233040873101, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 4.4435840716667007e-07, |
|
"loss": 1.4647, |
|
"step": 1099 |
|
}, |
|
{ |
|
"epoch": 0.9415792852557243, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 4.316087080110748e-07, |
|
"loss": 1.2495, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.9424352664241387, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 4.1904299809255867e-07, |
|
"loss": 1.4592, |
|
"step": 1101 |
|
}, |
|
{ |
|
"epoch": 0.943291247592553, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 4.0666137151395277e-07, |
|
"loss": 1.4884, |
|
"step": 1102 |
|
}, |
|
{ |
|
"epoch": 0.9441472287609672, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 3.944639209995299e-07, |
|
"loss": 1.5319, |
|
"step": 1103 |
|
}, |
|
{ |
|
"epoch": 0.9450032099293816, |
|
"grad_norm": 1.5, |
|
"learning_rate": 3.824507378942799e-07, |
|
"loss": 1.2856, |
|
"step": 1104 |
|
}, |
|
{ |
|
"epoch": 0.9458591910977958, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 3.70621912163252e-07, |
|
"loss": 1.3458, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.9467151722662102, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 3.589775323908612e-07, |
|
"loss": 1.7292, |
|
"step": 1106 |
|
}, |
|
{ |
|
"epoch": 0.9475711534346244, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 3.475176857802298e-07, |
|
"loss": 1.3163, |
|
"step": 1107 |
|
}, |
|
{ |
|
"epoch": 0.9484271346030387, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 3.3624245815254975e-07, |
|
"loss": 1.5198, |
|
"step": 1108 |
|
}, |
|
{ |
|
"epoch": 0.9492831157714531, |
|
"grad_norm": 1.625, |
|
"learning_rate": 3.2515193394641595e-07, |
|
"loss": 1.5222, |
|
"step": 1109 |
|
}, |
|
{ |
|
"epoch": 0.9501390969398673, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 3.142461962172105e-07, |
|
"loss": 1.3569, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.9509950781082817, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 3.035253266364696e-07, |
|
"loss": 1.4204, |
|
"step": 1111 |
|
}, |
|
{ |
|
"epoch": 0.9518510592766959, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 2.9298940549128964e-07, |
|
"loss": 1.3198, |
|
"step": 1112 |
|
}, |
|
{ |
|
"epoch": 0.9527070404451102, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 2.8263851168369714e-07, |
|
"loss": 1.4136, |
|
"step": 1113 |
|
}, |
|
{ |
|
"epoch": 0.9535630216135245, |
|
"grad_norm": 1.5, |
|
"learning_rate": 2.724727227300911e-07, |
|
"loss": 1.4979, |
|
"step": 1114 |
|
}, |
|
{ |
|
"epoch": 0.9544190027819388, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 2.624921147606374e-07, |
|
"loss": 1.3033, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.955274983950353, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 2.526967625187088e-07, |
|
"loss": 1.429, |
|
"step": 1116 |
|
}, |
|
{ |
|
"epoch": 0.9561309651187674, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 2.4308673936032646e-07, |
|
"loss": 1.4569, |
|
"step": 1117 |
|
}, |
|
{ |
|
"epoch": 0.9569869462871817, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 2.3366211725360798e-07, |
|
"loss": 1.3534, |
|
"step": 1118 |
|
}, |
|
{ |
|
"epoch": 0.957842927455596, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 2.244229667782205e-07, |
|
"loss": 1.5615, |
|
"step": 1119 |
|
}, |
|
{ |
|
"epoch": 0.9586989086240103, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 2.1536935712486994e-07, |
|
"loss": 1.4168, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.9595548897924245, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 2.0650135609477094e-07, |
|
"loss": 1.3854, |
|
"step": 1121 |
|
}, |
|
{ |
|
"epoch": 0.9604108709608389, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 1.9781903009913338e-07, |
|
"loss": 1.3355, |
|
"step": 1122 |
|
}, |
|
{ |
|
"epoch": 0.9612668521292531, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 1.893224441586877e-07, |
|
"loss": 1.5445, |
|
"step": 1123 |
|
}, |
|
{ |
|
"epoch": 0.9621228332976675, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 1.8101166190316875e-07, |
|
"loss": 1.4701, |
|
"step": 1124 |
|
}, |
|
{ |
|
"epoch": 0.9629788144660818, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.7288674557086048e-07, |
|
"loss": 1.4356, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.963834795634496, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.6494775600812417e-07, |
|
"loss": 1.4501, |
|
"step": 1126 |
|
}, |
|
{ |
|
"epoch": 0.9646907768029104, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.571947526689349e-07, |
|
"loss": 1.4544, |
|
"step": 1127 |
|
}, |
|
{ |
|
"epoch": 0.9655467579713246, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.4962779361445412e-07, |
|
"loss": 1.5713, |
|
"step": 1128 |
|
}, |
|
{ |
|
"epoch": 0.966402739139739, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.4224693551256885e-07, |
|
"loss": 1.7056, |
|
"step": 1129 |
|
}, |
|
{ |
|
"epoch": 0.9672587203081532, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 1.3505223363749487e-07, |
|
"loss": 1.2895, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.9681147014765675, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 1.2804374186934643e-07, |
|
"loss": 1.3881, |
|
"step": 1131 |
|
}, |
|
{ |
|
"epoch": 0.9689706826449818, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 1.2122151269373383e-07, |
|
"loss": 1.2761, |
|
"step": 1132 |
|
}, |
|
{ |
|
"epoch": 0.9698266638133961, |
|
"grad_norm": 1.5, |
|
"learning_rate": 1.1458559720137762e-07, |
|
"loss": 1.3987, |
|
"step": 1133 |
|
}, |
|
{ |
|
"epoch": 0.9706826449818104, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 1.0813604508771169e-07, |
|
"loss": 1.3975, |
|
"step": 1134 |
|
}, |
|
{ |
|
"epoch": 0.9715386261502247, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.018729046525363e-07, |
|
"loss": 1.3861, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.972394607318639, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 9.579622279962397e-08, |
|
"loss": 1.3842, |
|
"step": 1136 |
|
}, |
|
{ |
|
"epoch": 0.9732505884870533, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 8.990604503639477e-08, |
|
"loss": 1.4654, |
|
"step": 1137 |
|
}, |
|
{ |
|
"epoch": 0.9741065696554676, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 8.420241547356933e-08, |
|
"loss": 1.4066, |
|
"step": 1138 |
|
}, |
|
{ |
|
"epoch": 0.9749625508238818, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 7.868537682482469e-08, |
|
"loss": 1.3077, |
|
"step": 1139 |
|
}, |
|
{ |
|
"epoch": 0.9758185319922962, |
|
"grad_norm": 1.5, |
|
"learning_rate": 7.335497040648898e-08, |
|
"loss": 1.4708, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.9766745131607104, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 6.821123613723057e-08, |
|
"loss": 1.6011, |
|
"step": 1141 |
|
}, |
|
{ |
|
"epoch": 0.9775304943291248, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 6.325421253775277e-08, |
|
"loss": 1.2807, |
|
"step": 1142 |
|
}, |
|
{ |
|
"epoch": 0.9783864754975391, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 5.848393673051067e-08, |
|
"loss": 1.3443, |
|
"step": 1143 |
|
}, |
|
{ |
|
"epoch": 0.9792424566659533, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 5.390044443943365e-08, |
|
"loss": 1.5044, |
|
"step": 1144 |
|
}, |
|
{ |
|
"epoch": 0.9800984378343677, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 4.9503769989647786e-08, |
|
"loss": 1.3441, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.9809544190027819, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 4.529394630723438e-08, |
|
"loss": 1.3954, |
|
"step": 1146 |
|
}, |
|
{ |
|
"epoch": 0.9818104001711963, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 4.1271004918971847e-08, |
|
"loss": 1.3292, |
|
"step": 1147 |
|
}, |
|
{ |
|
"epoch": 0.9826663813396105, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 3.7434975952102546e-08, |
|
"loss": 1.2322, |
|
"step": 1148 |
|
}, |
|
{ |
|
"epoch": 0.9835223625080248, |
|
"grad_norm": 1.5, |
|
"learning_rate": 3.378588813411354e-08, |
|
"loss": 1.3188, |
|
"step": 1149 |
|
}, |
|
{ |
|
"epoch": 0.9843783436764392, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 3.032376879250898e-08, |
|
"loss": 1.2855, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.9852343248448534, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 2.7048643854615806e-08, |
|
"loss": 1.6131, |
|
"step": 1151 |
|
}, |
|
{ |
|
"epoch": 0.9860903060132677, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 2.3960537847383946e-08, |
|
"loss": 1.2274, |
|
"step": 1152 |
|
}, |
|
{ |
|
"epoch": 0.986946287181682, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 2.1059473897208637e-08, |
|
"loss": 1.4714, |
|
"step": 1153 |
|
}, |
|
{ |
|
"epoch": 0.9878022683500963, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 1.834547372975004e-08, |
|
"loss": 1.4097, |
|
"step": 1154 |
|
}, |
|
{ |
|
"epoch": 0.9886582495185106, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.581855766977225e-08, |
|
"loss": 1.292, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.9895142306869249, |
|
"grad_norm": 1.5, |
|
"learning_rate": 1.3478744640998963e-08, |
|
"loss": 1.1981, |
|
"step": 1156 |
|
}, |
|
{ |
|
"epoch": 0.9903702118553391, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.1326052165960831e-08, |
|
"loss": 1.5196, |
|
"step": 1157 |
|
}, |
|
{ |
|
"epoch": 0.9912261930237535, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 9.360496365870553e-09, |
|
"loss": 1.4829, |
|
"step": 1158 |
|
}, |
|
{ |
|
"epoch": 0.9920821741921678, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 7.582091960497973e-09, |
|
"loss": 1.4483, |
|
"step": 1159 |
|
}, |
|
{ |
|
"epoch": 0.992938155360582, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 5.990852268064618e-09, |
|
"loss": 1.5446, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.9937941365289964, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 4.586789205140995e-09, |
|
"loss": 1.39, |
|
"step": 1161 |
|
}, |
|
{ |
|
"epoch": 0.9946501176974106, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 3.3699132865605553e-09, |
|
"loss": 1.2557, |
|
"step": 1162 |
|
}, |
|
{ |
|
"epoch": 0.995506098865825, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 2.3402336253364187e-09, |
|
"loss": 1.4511, |
|
"step": 1163 |
|
}, |
|
{ |
|
"epoch": 0.9963620800342392, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.4977579325919923e-09, |
|
"loss": 1.3592, |
|
"step": 1164 |
|
}, |
|
{ |
|
"epoch": 0.9972180612026535, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 8.424925175137821e-10, |
|
"loss": 1.4615, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.9980740423710678, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 3.744422872875575e-10, |
|
"loss": 1.5993, |
|
"step": 1166 |
|
}, |
|
{ |
|
"epoch": 0.9989300235394821, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 9.361074708169604e-11, |
|
"loss": 1.3935, |
|
"step": 1167 |
|
}, |
|
{ |
|
"epoch": 0.9997860047078965, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 0.0, |
|
"loss": 1.271, |
|
"step": 1168 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1168, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.472919195037204e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|