|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9944506104328523, |
|
"eval_steps": 100, |
|
"global_step": 674, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.014798372179060304, |
|
"grad_norm": 2.593348807063828, |
|
"learning_rate": 1.4705882352941177e-06, |
|
"loss": 1.094, |
|
"mean_token_accuracy": 0.7108849765114198, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.029596744358120607, |
|
"grad_norm": 1.9750641421169883, |
|
"learning_rate": 2.9411764705882355e-06, |
|
"loss": 1.0985, |
|
"mean_token_accuracy": 0.7104195185735463, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04439511653718091, |
|
"grad_norm": 1.256871834005428, |
|
"learning_rate": 4.411764705882353e-06, |
|
"loss": 1.0326, |
|
"mean_token_accuracy": 0.7226216543652402, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.059193488716241215, |
|
"grad_norm": 1.2314608509786273, |
|
"learning_rate": 5.882352941176471e-06, |
|
"loss": 1.0116, |
|
"mean_token_accuracy": 0.7213333714754395, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.07399186089530152, |
|
"grad_norm": 0.9985709690791873, |
|
"learning_rate": 7.352941176470589e-06, |
|
"loss": 0.9423, |
|
"mean_token_accuracy": 0.7356589233943926, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.08879023307436182, |
|
"grad_norm": 0.677669379025463, |
|
"learning_rate": 8.823529411764707e-06, |
|
"loss": 0.9249, |
|
"mean_token_accuracy": 0.7369974576486416, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.10358860525342212, |
|
"grad_norm": 0.5930723910211383, |
|
"learning_rate": 1.0294117647058823e-05, |
|
"loss": 0.8781, |
|
"mean_token_accuracy": 0.7478943769535908, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.11838697743248243, |
|
"grad_norm": 0.6120099810718413, |
|
"learning_rate": 1.1764705882352942e-05, |
|
"loss": 0.8509, |
|
"mean_token_accuracy": 0.7538154218630463, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.13318534961154274, |
|
"grad_norm": 0.5175092564052673, |
|
"learning_rate": 1.323529411764706e-05, |
|
"loss": 0.8477, |
|
"mean_token_accuracy": 0.7534044571299353, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.14798372179060304, |
|
"grad_norm": 0.5643708633707751, |
|
"learning_rate": 1.4705882352941179e-05, |
|
"loss": 0.8265, |
|
"mean_token_accuracy": 0.7580760243590007, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.16278209396966334, |
|
"grad_norm": 0.5199539772551212, |
|
"learning_rate": 1.6176470588235296e-05, |
|
"loss": 0.8319, |
|
"mean_token_accuracy": 0.7564302122452184, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.17758046614872364, |
|
"grad_norm": 0.49258521590997956, |
|
"learning_rate": 1.7647058823529414e-05, |
|
"loss": 0.8176, |
|
"mean_token_accuracy": 0.7597445209517484, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.19237883832778394, |
|
"grad_norm": 0.5307403214560474, |
|
"learning_rate": 1.911764705882353e-05, |
|
"loss": 0.8054, |
|
"mean_token_accuracy": 0.7615138291997056, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.20717721050684423, |
|
"grad_norm": 0.4960255459749203, |
|
"learning_rate": 1.9999462497359468e-05, |
|
"loss": 0.8069, |
|
"mean_token_accuracy": 0.761267918792979, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.22197558268590456, |
|
"grad_norm": 0.5003355217225459, |
|
"learning_rate": 1.9993416256221894e-05, |
|
"loss": 0.7868, |
|
"mean_token_accuracy": 0.7661573062751283, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.23677395486496486, |
|
"grad_norm": 0.5583669207747801, |
|
"learning_rate": 1.9980655971335944e-05, |
|
"loss": 0.795, |
|
"mean_token_accuracy": 0.7641677417554135, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.25157232704402516, |
|
"grad_norm": 0.5240356080381341, |
|
"learning_rate": 1.996119021565693e-05, |
|
"loss": 0.798, |
|
"mean_token_accuracy": 0.7623190879812837, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.2663706992230855, |
|
"grad_norm": 0.5086305355433351, |
|
"learning_rate": 1.993503206718859e-05, |
|
"loss": 0.7725, |
|
"mean_token_accuracy": 0.7697917930408653, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.28116907140214575, |
|
"grad_norm": 0.5210948328729179, |
|
"learning_rate": 1.9902199100196697e-05, |
|
"loss": 0.7797, |
|
"mean_token_accuracy": 0.7675447411814386, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.2959674435812061, |
|
"grad_norm": 0.5293928629771224, |
|
"learning_rate": 1.986271337340182e-05, |
|
"loss": 0.7832, |
|
"mean_token_accuracy": 0.7657440282462281, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2959674435812061, |
|
"eval_loss": 0.8053451776504517, |
|
"eval_mean_token_accuracy": 0.7508788819017185, |
|
"eval_runtime": 2.9743, |
|
"eval_samples_per_second": 43.371, |
|
"eval_steps_per_second": 3.026, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.31076581576026635, |
|
"grad_norm": 0.5213337115292652, |
|
"learning_rate": 1.9816601415159266e-05, |
|
"loss": 0.7803, |
|
"mean_token_accuracy": 0.7667738476997512, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.3255641879393267, |
|
"grad_norm": 0.5422814463410677, |
|
"learning_rate": 1.976389420563607e-05, |
|
"loss": 0.7754, |
|
"mean_token_accuracy": 0.767532399595269, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.340362560118387, |
|
"grad_norm": 0.5101794394574756, |
|
"learning_rate": 1.970462715599711e-05, |
|
"loss": 0.786, |
|
"mean_token_accuracy": 0.7652675887356217, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.3551609322974473, |
|
"grad_norm": 0.5098589304227549, |
|
"learning_rate": 1.9638840084614182e-05, |
|
"loss": 0.7907, |
|
"mean_token_accuracy": 0.763076072814916, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3699593044765076, |
|
"grad_norm": 0.4867400992746686, |
|
"learning_rate": 1.95665771903142e-05, |
|
"loss": 0.7766, |
|
"mean_token_accuracy": 0.7664838272290541, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.38475767665556787, |
|
"grad_norm": 0.5010790639260299, |
|
"learning_rate": 1.9487887022684336e-05, |
|
"loss": 0.7742, |
|
"mean_token_accuracy": 0.7671358251075762, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3995560488346282, |
|
"grad_norm": 0.5154337703533832, |
|
"learning_rate": 1.9402822449454154e-05, |
|
"loss": 0.7823, |
|
"mean_token_accuracy": 0.7651854378067892, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.41435442101368847, |
|
"grad_norm": 0.5784623873225747, |
|
"learning_rate": 1.9311440620976597e-05, |
|
"loss": 0.7514, |
|
"mean_token_accuracy": 0.7740071044189999, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.4291527931927488, |
|
"grad_norm": 0.4782136651266562, |
|
"learning_rate": 1.9213802931831697e-05, |
|
"loss": 0.7657, |
|
"mean_token_accuracy": 0.7693332456214177, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.4439511653718091, |
|
"grad_norm": 0.47005711458881455, |
|
"learning_rate": 1.9109974979578852e-05, |
|
"loss": 0.7649, |
|
"mean_token_accuracy": 0.7701379314881234, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.4587495375508694, |
|
"grad_norm": 0.5608878376753778, |
|
"learning_rate": 1.90000265206853e-05, |
|
"loss": 0.7513, |
|
"mean_token_accuracy": 0.7730658336329694, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.4735479097299297, |
|
"grad_norm": 0.50522739081176, |
|
"learning_rate": 1.8884031423660492e-05, |
|
"loss": 0.7815, |
|
"mean_token_accuracy": 0.7646060372851711, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.48834628190899, |
|
"grad_norm": 0.5820021754434196, |
|
"learning_rate": 1.8762067619427745e-05, |
|
"loss": 0.7613, |
|
"mean_token_accuracy": 0.7710034817869009, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.5031446540880503, |
|
"grad_norm": 0.4676454101627837, |
|
"learning_rate": 1.8634217048966638e-05, |
|
"loss": 0.7724, |
|
"mean_token_accuracy": 0.7671357955663405, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5179430262671106, |
|
"grad_norm": 0.5383542205708923, |
|
"learning_rate": 1.8500565608261215e-05, |
|
"loss": 0.7672, |
|
"mean_token_accuracy": 0.768479148549233, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.532741398446171, |
|
"grad_norm": 0.5303302181160002, |
|
"learning_rate": 1.836120309059107e-05, |
|
"loss": 0.753, |
|
"mean_token_accuracy": 0.7730720476425463, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5475397706252312, |
|
"grad_norm": 0.5448865957224543, |
|
"learning_rate": 1.821622312620401e-05, |
|
"loss": 0.7592, |
|
"mean_token_accuracy": 0.7711734616701431, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.5623381428042915, |
|
"grad_norm": 0.5325429730759403, |
|
"learning_rate": 1.8065723119410885e-05, |
|
"loss": 0.7494, |
|
"mean_token_accuracy": 0.7731226250657407, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5771365149833518, |
|
"grad_norm": 0.5255216453770453, |
|
"learning_rate": 1.7909804183144837e-05, |
|
"loss": 0.7383, |
|
"mean_token_accuracy": 0.7756631657183498, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.5919348871624122, |
|
"grad_norm": 0.5120462736044016, |
|
"learning_rate": 1.77485710710289e-05, |
|
"loss": 0.7521, |
|
"mean_token_accuracy": 0.7721419640874242, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5919348871624122, |
|
"eval_loss": 0.7753354907035828, |
|
"eval_mean_token_accuracy": 0.7572332747175835, |
|
"eval_runtime": 2.5766, |
|
"eval_samples_per_second": 50.066, |
|
"eval_steps_per_second": 3.493, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6067332593414725, |
|
"grad_norm": 0.4938631944306276, |
|
"learning_rate": 1.7582132106997615e-05, |
|
"loss": 0.742, |
|
"mean_token_accuracy": 0.7750476845373353, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.6215316315205327, |
|
"grad_norm": 0.5671545592273357, |
|
"learning_rate": 1.741059911251997e-05, |
|
"loss": 0.7494, |
|
"mean_token_accuracy": 0.7730425374431613, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.636330003699593, |
|
"grad_norm": 0.5698015954748781, |
|
"learning_rate": 1.72340873314725e-05, |
|
"loss": 0.7755, |
|
"mean_token_accuracy": 0.7657081123174699, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.6511283758786534, |
|
"grad_norm": 0.4871241944840193, |
|
"learning_rate": 1.7052715352713076e-05, |
|
"loss": 0.7503, |
|
"mean_token_accuracy": 0.7729788491307596, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6659267480577137, |
|
"grad_norm": 0.4845619074170943, |
|
"learning_rate": 1.686660503040737e-05, |
|
"loss": 0.7594, |
|
"mean_token_accuracy": 0.7697606181515417, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.680725120236774, |
|
"grad_norm": 0.449929707423831, |
|
"learning_rate": 1.667588140216154e-05, |
|
"loss": 0.7378, |
|
"mean_token_accuracy": 0.7765756802087522, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6955234924158342, |
|
"grad_norm": 0.4638428834610375, |
|
"learning_rate": 1.648067260501611e-05, |
|
"loss": 0.7403, |
|
"mean_token_accuracy": 0.7761926000092616, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.7103218645948945, |
|
"grad_norm": 0.4835551262994511, |
|
"learning_rate": 1.628110978935756e-05, |
|
"loss": 0.7312, |
|
"mean_token_accuracy": 0.7778456216806866, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.7251202367739549, |
|
"grad_norm": 0.47864040254789736, |
|
"learning_rate": 1.6077327030805318e-05, |
|
"loss": 0.7432, |
|
"mean_token_accuracy": 0.7748753587572953, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.7399186089530152, |
|
"grad_norm": 0.5215157420531165, |
|
"learning_rate": 1.586946124013354e-05, |
|
"loss": 0.7431, |
|
"mean_token_accuracy": 0.7745837726921236, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.7547169811320755, |
|
"grad_norm": 0.5416399731442417, |
|
"learning_rate": 1.565765207128805e-05, |
|
"loss": 0.7472, |
|
"mean_token_accuracy": 0.7734427892740634, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.7695153533111357, |
|
"grad_norm": 0.5299768257874566, |
|
"learning_rate": 1.5442041827560274e-05, |
|
"loss": 0.742, |
|
"mean_token_accuracy": 0.7745810392528254, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.7843137254901961, |
|
"grad_norm": 0.5033823978878146, |
|
"learning_rate": 1.5222775365981272e-05, |
|
"loss": 0.7586, |
|
"mean_token_accuracy": 0.7694231223052592, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.7991120976692564, |
|
"grad_norm": 0.5009396005762752, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 0.7416, |
|
"mean_token_accuracy": 0.774962565654001, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.8139104698483167, |
|
"grad_norm": 0.5720141971810079, |
|
"learning_rate": 1.477386540051127e-05, |
|
"loss": 0.74, |
|
"mean_token_accuracy": 0.7753138828627616, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.8287088420273769, |
|
"grad_norm": 0.46793189962319914, |
|
"learning_rate": 1.4544523495299843e-05, |
|
"loss": 0.7412, |
|
"mean_token_accuracy": 0.7753686497664456, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.8435072142064373, |
|
"grad_norm": 0.49343522620873687, |
|
"learning_rate": 1.4312128366968244e-05, |
|
"loss": 0.7411, |
|
"mean_token_accuracy": 0.7749813549179148, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.8583055863854976, |
|
"grad_norm": 0.4704246548142292, |
|
"learning_rate": 1.4076836149416889e-05, |
|
"loss": 0.7377, |
|
"mean_token_accuracy": 0.7758572122067074, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.8731039585645579, |
|
"grad_norm": 0.5336045372375675, |
|
"learning_rate": 1.3838804922946027e-05, |
|
"loss": 0.734, |
|
"mean_token_accuracy": 0.7765910014311306, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.8879023307436182, |
|
"grad_norm": 0.44316568207451656, |
|
"learning_rate": 1.3598194608050011e-05, |
|
"loss": 0.7255, |
|
"mean_token_accuracy": 0.7792648379657762, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8879023307436182, |
|
"eval_loss": 0.7596829533576965, |
|
"eval_mean_token_accuracy": 0.761409966675403, |
|
"eval_runtime": 2.5745, |
|
"eval_samples_per_second": 50.107, |
|
"eval_steps_per_second": 3.496, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.9027007029226785, |
|
"grad_norm": 0.48113060886258535, |
|
"learning_rate": 1.335516685797525e-05, |
|
"loss": 0.7335, |
|
"mean_token_accuracy": 0.7769502910551431, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.9174990751017388, |
|
"grad_norm": 0.48237154295068696, |
|
"learning_rate": 1.3109884950114007e-05, |
|
"loss": 0.7057, |
|
"mean_token_accuracy": 0.7848206362928833, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.9322974472807991, |
|
"grad_norm": 0.4668338419115322, |
|
"learning_rate": 1.2862513676307009e-05, |
|
"loss": 0.733, |
|
"mean_token_accuracy": 0.7774705687024319, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.9470958194598594, |
|
"grad_norm": 0.5011764068821749, |
|
"learning_rate": 1.2613219232128608e-05, |
|
"loss": 0.7293, |
|
"mean_token_accuracy": 0.7780742689022585, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.9618941916389198, |
|
"grad_norm": 0.4885559923585182, |
|
"learning_rate": 1.2362169105228828e-05, |
|
"loss": 0.7152, |
|
"mean_token_accuracy": 0.7818612325296337, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.97669256381798, |
|
"grad_norm": 0.4785348974189487, |
|
"learning_rate": 1.2109531962807333e-05, |
|
"loss": 0.7289, |
|
"mean_token_accuracy": 0.7781427874114122, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.9914909359970403, |
|
"grad_norm": 0.48751875931309574, |
|
"learning_rate": 1.1855477538294934e-05, |
|
"loss": 0.7276, |
|
"mean_token_accuracy": 0.7779443587593641, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.005919348871624, |
|
"grad_norm": 0.48718844213578066, |
|
"learning_rate": 1.1600176517318742e-05, |
|
"loss": 0.7304, |
|
"mean_token_accuracy": 0.7758459646187758, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.0207177210506844, |
|
"grad_norm": 0.47201016864606404, |
|
"learning_rate": 1.1343800423027583e-05, |
|
"loss": 0.6812, |
|
"mean_token_accuracy": 0.7899202867391438, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.0355160932297447, |
|
"grad_norm": 0.4747324999715173, |
|
"learning_rate": 1.1086521500854746e-05, |
|
"loss": 0.7012, |
|
"mean_token_accuracy": 0.7836693959349688, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.050314465408805, |
|
"grad_norm": 0.4910325009000735, |
|
"learning_rate": 1.0828512602795462e-05, |
|
"loss": 0.6876, |
|
"mean_token_accuracy": 0.7875374486248747, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.0651128375878653, |
|
"grad_norm": 0.4571618422535637, |
|
"learning_rate": 1.0569947071276847e-05, |
|
"loss": 0.6884, |
|
"mean_token_accuracy": 0.78774961123011, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.0799112097669257, |
|
"grad_norm": 0.4403850687340159, |
|
"learning_rate": 1.031099862269837e-05, |
|
"loss": 0.6783, |
|
"mean_token_accuracy": 0.7904201368396653, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.094709581945986, |
|
"grad_norm": 0.4209799061995027, |
|
"learning_rate": 1.0051841230721065e-05, |
|
"loss": 0.6831, |
|
"mean_token_accuracy": 0.7893388888775005, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.1095079541250463, |
|
"grad_norm": 0.44400251187223466, |
|
"learning_rate": 9.7926490093839e-06, |
|
"loss": 0.694, |
|
"mean_token_accuracy": 0.7860355175167035, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.1243063263041067, |
|
"grad_norm": 0.44389093926366824, |
|
"learning_rate": 9.533596096125826e-06, |
|
"loss": 0.6805, |
|
"mean_token_accuracy": 0.7897672438720169, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.1391046984831668, |
|
"grad_norm": 0.4402746900329913, |
|
"learning_rate": 9.274856534792138e-06, |
|
"loss": 0.6876, |
|
"mean_token_accuracy": 0.787742068812946, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.153903070662227, |
|
"grad_norm": 0.4103517180651528, |
|
"learning_rate": 9.016604158703654e-06, |
|
"loss": 0.6936, |
|
"mean_token_accuracy": 0.7862575514246564, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.1687014428412874, |
|
"grad_norm": 0.4372548428563679, |
|
"learning_rate": 8.759012473867407e-06, |
|
"loss": 0.687, |
|
"mean_token_accuracy": 0.7881330535798076, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.1834998150203477, |
|
"grad_norm": 0.4517878712131358, |
|
"learning_rate": 8.502254542407186e-06, |
|
"loss": 0.6602, |
|
"mean_token_accuracy": 0.7958344628887729, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.1834998150203477, |
|
"eval_loss": 0.7526344656944275, |
|
"eval_mean_token_accuracy": 0.7637808838540963, |
|
"eval_runtime": 2.5819, |
|
"eval_samples_per_second": 49.964, |
|
"eval_steps_per_second": 3.486, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.198298187199408, |
|
"grad_norm": 0.425798077366515, |
|
"learning_rate": 8.246502866292324e-06, |
|
"loss": 0.6762, |
|
"mean_token_accuracy": 0.7913646662783693, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.2130965593784684, |
|
"grad_norm": 0.4638789221888502, |
|
"learning_rate": 7.991929271442817e-06, |
|
"loss": 0.6988, |
|
"mean_token_accuracy": 0.7838723917785275, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.2278949315575287, |
|
"grad_norm": 0.4280501917061392, |
|
"learning_rate": 7.738704792288654e-06, |
|
"loss": 0.6774, |
|
"mean_token_accuracy": 0.7903381169266661, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.242693303736589, |
|
"grad_norm": 0.42498111768242547, |
|
"learning_rate": 7.48699955686089e-06, |
|
"loss": 0.6813, |
|
"mean_token_accuracy": 0.7894631759093487, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.2574916759156491, |
|
"grad_norm": 0.41478152725647516, |
|
"learning_rate": 7.236982672491699e-06, |
|
"loss": 0.6728, |
|
"mean_token_accuracy": 0.7919249787920513, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.2722900480947095, |
|
"grad_norm": 0.4541548007471146, |
|
"learning_rate": 6.988822112200157e-06, |
|
"loss": 0.6796, |
|
"mean_token_accuracy": 0.7900502463044352, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.2870884202737698, |
|
"grad_norm": 0.4347630729515234, |
|
"learning_rate": 6.742684601840142e-06, |
|
"loss": 0.6717, |
|
"mean_token_accuracy": 0.7929136972724844, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.3018867924528301, |
|
"grad_norm": 0.45090441896346267, |
|
"learning_rate": 6.498735508086094e-06, |
|
"loss": 0.6968, |
|
"mean_token_accuracy": 0.7851391617996991, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.3166851646318904, |
|
"grad_norm": 0.4667797906922687, |
|
"learning_rate": 6.2571387273319905e-06, |
|
"loss": 0.6981, |
|
"mean_token_accuracy": 0.7850517425351553, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.3314835368109508, |
|
"grad_norm": 0.4648667234354044, |
|
"learning_rate": 6.018056575578075e-06, |
|
"loss": 0.6883, |
|
"mean_token_accuracy": 0.7874625531372195, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.346281908990011, |
|
"grad_norm": 0.41673039178072924, |
|
"learning_rate": 5.781649679379379e-06, |
|
"loss": 0.6991, |
|
"mean_token_accuracy": 0.7841906497424455, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.3610802811690714, |
|
"grad_norm": 0.421710853156402, |
|
"learning_rate": 5.548076867929331e-06, |
|
"loss": 0.686, |
|
"mean_token_accuracy": 0.7881790344194883, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.3758786533481318, |
|
"grad_norm": 0.4095421501939646, |
|
"learning_rate": 5.31749506635086e-06, |
|
"loss": 0.6978, |
|
"mean_token_accuracy": 0.7845082248101509, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.390677025527192, |
|
"grad_norm": 0.38836233257615616, |
|
"learning_rate": 5.090059190266779e-06, |
|
"loss": 0.6759, |
|
"mean_token_accuracy": 0.790896360104509, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.4054753977062524, |
|
"grad_norm": 0.44432765945429187, |
|
"learning_rate": 4.865922041720239e-06, |
|
"loss": 0.7049, |
|
"mean_token_accuracy": 0.7829590808442444, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.4202737698853127, |
|
"grad_norm": 0.42410175022228774, |
|
"learning_rate": 4.645234206515171e-06, |
|
"loss": 0.6912, |
|
"mean_token_accuracy": 0.7861480885303875, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.435072142064373, |
|
"grad_norm": 0.40621594870021627, |
|
"learning_rate": 4.4281439530457174e-06, |
|
"loss": 0.6863, |
|
"mean_token_accuracy": 0.7874242608443394, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.4498705142434332, |
|
"grad_norm": 0.4193922892692362, |
|
"learning_rate": 4.214797132682597e-06, |
|
"loss": 0.6788, |
|
"mean_token_accuracy": 0.7899688239885381, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.4646688864224935, |
|
"grad_norm": 0.43222747626210295, |
|
"learning_rate": 4.00533708178334e-06, |
|
"loss": 0.682, |
|
"mean_token_accuracy": 0.7892313286173167, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.4794672586015538, |
|
"grad_norm": 0.4120701167860772, |
|
"learning_rate": 3.799904525392251e-06, |
|
"loss": 0.6796, |
|
"mean_token_accuracy": 0.7901880590604035, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.4794672586015538, |
|
"eval_loss": 0.7470049262046814, |
|
"eval_mean_token_accuracy": 0.7654142432713686, |
|
"eval_runtime": 2.5774, |
|
"eval_samples_per_second": 50.05, |
|
"eval_steps_per_second": 3.492, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.4942656307806141, |
|
"grad_norm": 0.4088311845013094, |
|
"learning_rate": 3.5986374826947067e-06, |
|
"loss": 0.6759, |
|
"mean_token_accuracy": 0.7912675486581924, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.5090640029596745, |
|
"grad_norm": 0.43601026958405814, |
|
"learning_rate": 3.401671174289469e-06, |
|
"loss": 0.6868, |
|
"mean_token_accuracy": 0.7875342211095951, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.5238623751387348, |
|
"grad_norm": 0.42059576380951874, |
|
"learning_rate": 3.209137931341143e-06, |
|
"loss": 0.6841, |
|
"mean_token_accuracy": 0.789064854621197, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.538660747317795, |
|
"grad_norm": 0.42071162363599823, |
|
"learning_rate": 3.021167106673928e-06, |
|
"loss": 0.6811, |
|
"mean_token_accuracy": 0.7895668923221757, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.5534591194968552, |
|
"grad_norm": 0.4006495337567465, |
|
"learning_rate": 2.837884987866363e-06, |
|
"loss": 0.6711, |
|
"mean_token_accuracy": 0.7922967599695604, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.5682574916759155, |
|
"grad_norm": 0.39801538104964046, |
|
"learning_rate": 2.6594147124053983e-06, |
|
"loss": 0.6778, |
|
"mean_token_accuracy": 0.7905583011930852, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.5830558638549759, |
|
"grad_norm": 0.40423745101731073, |
|
"learning_rate": 2.485876184956928e-06, |
|
"loss": 0.6918, |
|
"mean_token_accuracy": 0.7868834178242057, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.5978542360340362, |
|
"grad_norm": 0.42810509152800985, |
|
"learning_rate": 2.317385996808195e-06, |
|
"loss": 0.6828, |
|
"mean_token_accuracy": 0.7893337452821828, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.6126526082130965, |
|
"grad_norm": 0.41397615085905737, |
|
"learning_rate": 2.1540573475363402e-06, |
|
"loss": 0.6857, |
|
"mean_token_accuracy": 0.7880505231976661, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.6274509803921569, |
|
"grad_norm": 0.4345953263358439, |
|
"learning_rate": 1.9959999689556407e-06, |
|
"loss": 0.688, |
|
"mean_token_accuracy": 0.7875360022810415, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.6422493525712172, |
|
"grad_norm": 0.3933827018911945, |
|
"learning_rate": 1.8433200513945338e-06, |
|
"loss": 0.6718, |
|
"mean_token_accuracy": 0.7921645756607958, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.6570477247502775, |
|
"grad_norm": 0.39767074529668084, |
|
"learning_rate": 1.6961201723520248e-06, |
|
"loss": 0.6804, |
|
"mean_token_accuracy": 0.7892949470146251, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.6718460969293378, |
|
"grad_norm": 0.4246684953971968, |
|
"learning_rate": 1.5544992275813053e-06, |
|
"loss": 0.6785, |
|
"mean_token_accuracy": 0.7903202169117076, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.6866444691083982, |
|
"grad_norm": 0.40591338048572306, |
|
"learning_rate": 1.4185523646469822e-06, |
|
"loss": 0.6755, |
|
"mean_token_accuracy": 0.7911190977884677, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.7014428412874585, |
|
"grad_norm": 0.38989739838620574, |
|
"learning_rate": 1.2883709190004956e-06, |
|
"loss": 0.6872, |
|
"mean_token_accuracy": 0.7874141706041697, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.7162412134665188, |
|
"grad_norm": 0.4048975612390519, |
|
"learning_rate": 1.1640423526166987e-06, |
|
"loss": 0.689, |
|
"mean_token_accuracy": 0.7870928623453113, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.7310395856455791, |
|
"grad_norm": 0.38649469437003026, |
|
"learning_rate": 1.0456501952328191e-06, |
|
"loss": 0.6779, |
|
"mean_token_accuracy": 0.7905220607290137, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.7458379578246392, |
|
"grad_norm": 0.43427031919198744, |
|
"learning_rate": 9.332739882292752e-07, |
|
"loss": 0.6858, |
|
"mean_token_accuracy": 0.7882371685872589, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.7606363300036996, |
|
"grad_norm": 0.4073950752567948, |
|
"learning_rate": 8.269892311900696e-07, |
|
"loss": 0.681, |
|
"mean_token_accuracy": 0.7890982111968786, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.77543470218276, |
|
"grad_norm": 0.3882086832272342, |
|
"learning_rate": 7.268673311786378e-07, |
|
"loss": 0.6832, |
|
"mean_token_accuracy": 0.7889850474950262, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.77543470218276, |
|
"eval_loss": 0.7440360188484192, |
|
"eval_mean_token_accuracy": 0.7662619985199512, |
|
"eval_runtime": 2.589, |
|
"eval_samples_per_second": 49.827, |
|
"eval_steps_per_second": 3.476, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.7902330743618202, |
|
"grad_norm": 0.37678451317919576, |
|
"learning_rate": 6.329755547632499e-07, |
|
"loss": 0.6785, |
|
"mean_token_accuracy": 0.7897574813595997, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.8050314465408805, |
|
"grad_norm": 0.401763674994648, |
|
"learning_rate": 5.453769828241872e-07, |
|
"loss": 0.6745, |
|
"mean_token_accuracy": 0.791093994328963, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.8198298187199407, |
|
"grad_norm": 0.42751930092874074, |
|
"learning_rate": 4.6413046817306404e-07, |
|
"loss": 0.6815, |
|
"mean_token_accuracy": 0.7891455211870513, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 1.834628190899001, |
|
"grad_norm": 0.37516345502958226, |
|
"learning_rate": 3.8929059601275463e-07, |
|
"loss": 0.6851, |
|
"mean_token_accuracy": 0.7874412703053781, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.8494265630780613, |
|
"grad_norm": 0.4083596318817903, |
|
"learning_rate": 3.209076472645112e-07, |
|
"loss": 0.6848, |
|
"mean_token_accuracy": 0.7885656704705398, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.8642249352571216, |
|
"grad_norm": 0.4152899888471385, |
|
"learning_rate": 2.5902756478688674e-07, |
|
"loss": 0.6931, |
|
"mean_token_accuracy": 0.7853106727569341, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.879023307436182, |
|
"grad_norm": 0.3791618126160387, |
|
"learning_rate": 2.036919225091827e-07, |
|
"loss": 0.6925, |
|
"mean_token_accuracy": 0.7861030467863773, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.8938216796152423, |
|
"grad_norm": 0.3733283279669527, |
|
"learning_rate": 1.5493789750014032e-07, |
|
"loss": 0.6756, |
|
"mean_token_accuracy": 0.7909808020722119, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.9086200517943026, |
|
"grad_norm": 0.42543070144440165, |
|
"learning_rate": 1.1279824499064396e-07, |
|
"loss": 0.689, |
|
"mean_token_accuracy": 0.786992236965976, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 1.923418423973363, |
|
"grad_norm": 0.39849949807122753, |
|
"learning_rate": 7.730127636723539e-08, |
|
"loss": 0.6942, |
|
"mean_token_accuracy": 0.7851729688326422, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.9382167961524233, |
|
"grad_norm": 0.40279497710177514, |
|
"learning_rate": 4.8470840151195745e-08, |
|
"loss": 0.6902, |
|
"mean_token_accuracy": 0.7862401874833821, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 1.9530151683314836, |
|
"grad_norm": 0.40049648728709475, |
|
"learning_rate": 2.6326305976001054e-08, |
|
"loss": 0.7015, |
|
"mean_token_accuracy": 0.7839154693712095, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.967813540510544, |
|
"grad_norm": 0.4058408115055615, |
|
"learning_rate": 1.0882551573891953e-08, |
|
"loss": 0.6753, |
|
"mean_token_accuracy": 0.7906313809153375, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 1.9826119126896042, |
|
"grad_norm": 0.4037258285940985, |
|
"learning_rate": 2.149952780321485e-09, |
|
"loss": 0.6912, |
|
"mean_token_accuracy": 0.7864113234230127, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.9944506104328523, |
|
"mean_token_accuracy": 0.7874216550233467, |
|
"step": 674, |
|
"total_flos": 153548771819520.0, |
|
"train_loss": 0.7361217298564289, |
|
"train_runtime": 2538.6324, |
|
"train_samples_per_second": 17.035, |
|
"train_steps_per_second": 0.265 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 674, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 153548771819520.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|