diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,11678 @@ +{ + "best_global_step": 5200, + "best_metric": 0.02761992, + "best_model_checkpoint": "/workspace/output/v0-20250502-234115/checkpoint-5200", + "epoch": 0.9998653198653199, + "eval_steps": 100, + "global_step": 5568, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00017957351290684624, + "grad_norm": 25.546934127807617, + "learning_rate": 3.5842293906810036e-08, + "loss": 0.6741741895675659, + "memory(GiB)": 17.38, + "step": 1, + "token_acc": 0.7282608695652174, + "train_speed(iter/s)": 0.107486 + }, + { + "epoch": 0.0008978675645342312, + "grad_norm": 13.356419563293457, + "learning_rate": 1.7921146953405018e-07, + "loss": 0.7542595863342285, + "memory(GiB)": 17.38, + "step": 5, + "token_acc": 0.6989247311827957, + "train_speed(iter/s)": 0.191 + }, + { + "epoch": 0.0017957351290684624, + "grad_norm": 63.4417839050293, + "learning_rate": 3.5842293906810036e-07, + "loss": 0.7736139297485352, + "memory(GiB)": 17.38, + "step": 10, + "token_acc": 0.6877637130801688, + "train_speed(iter/s)": 0.212572 + }, + { + "epoch": 0.0026936026936026937, + "grad_norm": 15.046552658081055, + "learning_rate": 5.376344086021506e-07, + "loss": 0.7174010753631592, + "memory(GiB)": 17.38, + "step": 15, + "token_acc": 0.692, + "train_speed(iter/s)": 0.221324 + }, + { + "epoch": 0.003591470258136925, + "grad_norm": 14.752659797668457, + "learning_rate": 7.168458781362007e-07, + "loss": 0.6696049690246582, + "memory(GiB)": 17.38, + "step": 20, + "token_acc": 0.7295081967213115, + "train_speed(iter/s)": 0.225355 + }, + { + "epoch": 0.004489337822671156, + "grad_norm": 12.195171356201172, + "learning_rate": 8.96057347670251e-07, + "loss": 0.5692520141601562, + "memory(GiB)": 17.38, + "step": 25, + "token_acc": 0.7842323651452282, + "train_speed(iter/s)": 0.227964 + }, + { + "epoch": 0.0053872053872053875, + "grad_norm": 12.308070182800293, + "learning_rate": 1.0752688172043011e-06, + "loss": 0.51717209815979, + "memory(GiB)": 17.38, + "step": 30, + "token_acc": 0.7918454935622318, + "train_speed(iter/s)": 0.229744 + }, + { + "epoch": 0.006285072951739619, + "grad_norm": 11.015219688415527, + "learning_rate": 1.2544802867383513e-06, + "loss": 0.4495699882507324, + "memory(GiB)": 17.38, + "step": 35, + "token_acc": 0.8162393162393162, + "train_speed(iter/s)": 0.231032 + }, + { + "epoch": 0.00718294051627385, + "grad_norm": 96.34432983398438, + "learning_rate": 1.4336917562724014e-06, + "loss": 0.4137262344360352, + "memory(GiB)": 17.38, + "step": 40, + "token_acc": 0.8138075313807531, + "train_speed(iter/s)": 0.231968 + }, + { + "epoch": 0.00808080808080808, + "grad_norm": 11.654269218444824, + "learning_rate": 1.6129032258064516e-06, + "loss": 0.3732659578323364, + "memory(GiB)": 17.38, + "step": 45, + "token_acc": 0.8340336134453782, + "train_speed(iter/s)": 0.232731 + }, + { + "epoch": 0.008978675645342313, + "grad_norm": 86.17777252197266, + "learning_rate": 1.792114695340502e-06, + "loss": 0.33175342082977294, + "memory(GiB)": 17.38, + "step": 50, + "token_acc": 0.8140495867768595, + "train_speed(iter/s)": 0.233317 + }, + { + "epoch": 0.009876543209876543, + "grad_norm": 19.6676025390625, + "learning_rate": 1.9713261648745523e-06, + "loss": 0.30897040367126466, + "memory(GiB)": 17.38, + "step": 55, + "token_acc": 0.8361344537815126, + "train_speed(iter/s)": 0.233847 + }, + { + "epoch": 0.010774410774410775, + "grad_norm": 32.43022537231445, + "learning_rate": 2.1505376344086023e-06, + "loss": 0.273148250579834, + "memory(GiB)": 17.38, + "step": 60, + "token_acc": 0.8686440677966102, + "train_speed(iter/s)": 0.234128 + }, + { + "epoch": 0.011672278338945005, + "grad_norm": 11.042097091674805, + "learning_rate": 2.3297491039426526e-06, + "loss": 0.2617670774459839, + "memory(GiB)": 17.38, + "step": 65, + "token_acc": 0.8823529411764706, + "train_speed(iter/s)": 0.23439 + }, + { + "epoch": 0.012570145903479237, + "grad_norm": 27.9367733001709, + "learning_rate": 2.5089605734767026e-06, + "loss": 0.22742559909820556, + "memory(GiB)": 17.38, + "step": 70, + "token_acc": 0.8886554621848739, + "train_speed(iter/s)": 0.234752 + }, + { + "epoch": 0.013468013468013467, + "grad_norm": 16.88421058654785, + "learning_rate": 2.688172043010753e-06, + "loss": 0.19885880947113038, + "memory(GiB)": 17.38, + "step": 75, + "token_acc": 0.899581589958159, + "train_speed(iter/s)": 0.234148 + }, + { + "epoch": 0.0143658810325477, + "grad_norm": 5.280670166015625, + "learning_rate": 2.867383512544803e-06, + "loss": 0.1845339059829712, + "memory(GiB)": 17.38, + "step": 80, + "token_acc": 0.9159663865546218, + "train_speed(iter/s)": 0.234495 + }, + { + "epoch": 0.01526374859708193, + "grad_norm": 13.90598201751709, + "learning_rate": 3.0465949820788532e-06, + "loss": 0.17320866584777833, + "memory(GiB)": 17.38, + "step": 85, + "token_acc": 0.9111570247933884, + "train_speed(iter/s)": 0.234671 + }, + { + "epoch": 0.01616161616161616, + "grad_norm": 7.1387786865234375, + "learning_rate": 3.225806451612903e-06, + "loss": 0.16013689041137696, + "memory(GiB)": 17.38, + "step": 90, + "token_acc": 0.9102040816326531, + "train_speed(iter/s)": 0.234812 + }, + { + "epoch": 0.017059483726150394, + "grad_norm": 4.906293869018555, + "learning_rate": 3.4050179211469536e-06, + "loss": 0.14434845447540284, + "memory(GiB)": 17.38, + "step": 95, + "token_acc": 0.9288702928870293, + "train_speed(iter/s)": 0.235001 + }, + { + "epoch": 0.017957351290684626, + "grad_norm": 6.868030548095703, + "learning_rate": 3.584229390681004e-06, + "loss": 0.1334373712539673, + "memory(GiB)": 17.38, + "step": 100, + "token_acc": 0.924892703862661, + "train_speed(iter/s)": 0.235209 + }, + { + "epoch": 0.017957351290684626, + "eval_loss": 0.13820524513721466, + "eval_runtime": 63.5198, + "eval_samples_per_second": 14.169, + "eval_steps_per_second": 7.084, + "eval_token_acc": 0.9261900351917022, + "step": 100 + }, + { + "epoch": 0.018855218855218854, + "grad_norm": 12.286417007446289, + "learning_rate": 3.763440860215054e-06, + "loss": 0.1528111219406128, + "memory(GiB)": 17.38, + "step": 105, + "token_acc": 0.9286691755046186, + "train_speed(iter/s)": 0.202565 + }, + { + "epoch": 0.019753086419753086, + "grad_norm": 50.207679748535156, + "learning_rate": 3.942652329749105e-06, + "loss": 0.1361134648323059, + "memory(GiB)": 17.38, + "step": 110, + "token_acc": 0.9316239316239316, + "train_speed(iter/s)": 0.203942 + }, + { + "epoch": 0.020650953984287318, + "grad_norm": 7.399094581604004, + "learning_rate": 4.121863799283155e-06, + "loss": 0.1452315092086792, + "memory(GiB)": 17.38, + "step": 115, + "token_acc": 0.9243697478991597, + "train_speed(iter/s)": 0.205233 + }, + { + "epoch": 0.02154882154882155, + "grad_norm": 11.770547866821289, + "learning_rate": 4.3010752688172045e-06, + "loss": 0.13639533519744873, + "memory(GiB)": 17.38, + "step": 120, + "token_acc": 0.9173387096774194, + "train_speed(iter/s)": 0.206393 + }, + { + "epoch": 0.02244668911335578, + "grad_norm": 5.79747200012207, + "learning_rate": 4.480286738351255e-06, + "loss": 0.13760348558425903, + "memory(GiB)": 17.38, + "step": 125, + "token_acc": 0.9300847457627118, + "train_speed(iter/s)": 0.20746 + }, + { + "epoch": 0.02334455667789001, + "grad_norm": 8.29529857635498, + "learning_rate": 4.659498207885305e-06, + "loss": 0.1341536521911621, + "memory(GiB)": 17.38, + "step": 130, + "token_acc": 0.9197530864197531, + "train_speed(iter/s)": 0.20851 + }, + { + "epoch": 0.024242424242424242, + "grad_norm": 5.264034271240234, + "learning_rate": 4.838709677419355e-06, + "loss": 0.1391567349433899, + "memory(GiB)": 17.38, + "step": 135, + "token_acc": 0.9326086956521739, + "train_speed(iter/s)": 0.20948 + }, + { + "epoch": 0.025140291806958474, + "grad_norm": 4.4858222007751465, + "learning_rate": 5.017921146953405e-06, + "loss": 0.13087493181228638, + "memory(GiB)": 17.38, + "step": 140, + "token_acc": 0.9348739495798319, + "train_speed(iter/s)": 0.210408 + }, + { + "epoch": 0.026038159371492706, + "grad_norm": 2.595498561859131, + "learning_rate": 5.197132616487456e-06, + "loss": 0.13321056365966796, + "memory(GiB)": 17.38, + "step": 145, + "token_acc": 0.9285714285714286, + "train_speed(iter/s)": 0.211278 + }, + { + "epoch": 0.026936026936026935, + "grad_norm": 2.693901300430298, + "learning_rate": 5.376344086021506e-06, + "loss": 0.13038065433502197, + "memory(GiB)": 17.38, + "step": 150, + "token_acc": 0.9382978723404255, + "train_speed(iter/s)": 0.212138 + }, + { + "epoch": 0.027833894500561167, + "grad_norm": 4.095688819885254, + "learning_rate": 5.555555555555557e-06, + "loss": 0.11746394634246826, + "memory(GiB)": 17.38, + "step": 155, + "token_acc": 0.9398340248962656, + "train_speed(iter/s)": 0.212946 + }, + { + "epoch": 0.0287317620650954, + "grad_norm": 3.832766056060791, + "learning_rate": 5.734767025089606e-06, + "loss": 0.11778759956359863, + "memory(GiB)": 17.38, + "step": 160, + "token_acc": 0.9400826446280992, + "train_speed(iter/s)": 0.213738 + }, + { + "epoch": 0.02962962962962963, + "grad_norm": 3.49568772315979, + "learning_rate": 5.9139784946236566e-06, + "loss": 0.10912375450134278, + "memory(GiB)": 17.38, + "step": 165, + "token_acc": 0.943089430894309, + "train_speed(iter/s)": 0.214419 + }, + { + "epoch": 0.03052749719416386, + "grad_norm": 4.383439540863037, + "learning_rate": 6.0931899641577065e-06, + "loss": 0.12683531045913696, + "memory(GiB)": 17.38, + "step": 170, + "token_acc": 0.9377593360995851, + "train_speed(iter/s)": 0.215035 + }, + { + "epoch": 0.031425364758698095, + "grad_norm": 2.33516526222229, + "learning_rate": 6.272401433691757e-06, + "loss": 0.10385030508041382, + "memory(GiB)": 17.38, + "step": 175, + "token_acc": 0.9345991561181435, + "train_speed(iter/s)": 0.21567 + }, + { + "epoch": 0.03232323232323232, + "grad_norm": 2.4893486499786377, + "learning_rate": 6.451612903225806e-06, + "loss": 0.09152342081069946, + "memory(GiB)": 17.38, + "step": 180, + "token_acc": 0.9487704918032787, + "train_speed(iter/s)": 0.216271 + }, + { + "epoch": 0.03322109988776655, + "grad_norm": 2.6175694465637207, + "learning_rate": 6.630824372759857e-06, + "loss": 0.08451794385910034, + "memory(GiB)": 17.38, + "step": 185, + "token_acc": 0.9543568464730291, + "train_speed(iter/s)": 0.216815 + }, + { + "epoch": 0.03411896745230079, + "grad_norm": 20.739519119262695, + "learning_rate": 6.810035842293907e-06, + "loss": 0.11208593845367432, + "memory(GiB)": 17.38, + "step": 190, + "token_acc": 0.9567901234567902, + "train_speed(iter/s)": 0.217327 + }, + { + "epoch": 0.035016835016835016, + "grad_norm": 7.13236141204834, + "learning_rate": 6.989247311827958e-06, + "loss": 0.08776360750198364, + "memory(GiB)": 17.38, + "step": 195, + "token_acc": 0.9578059071729957, + "train_speed(iter/s)": 0.217855 + }, + { + "epoch": 0.03591470258136925, + "grad_norm": 9.431038856506348, + "learning_rate": 7.168458781362008e-06, + "loss": 0.0882810115814209, + "memory(GiB)": 17.38, + "step": 200, + "token_acc": 0.9487704918032787, + "train_speed(iter/s)": 0.218335 + }, + { + "epoch": 0.03591470258136925, + "eval_loss": 0.08947260677814484, + "eval_runtime": 62.3934, + "eval_samples_per_second": 14.425, + "eval_steps_per_second": 7.212, + "eval_token_acc": 0.9561955917762549, + "step": 200 + }, + { + "epoch": 0.03681257014590348, + "grad_norm": 4.276820659637451, + "learning_rate": 7.347670250896059e-06, + "loss": 0.08567115068435668, + "memory(GiB)": 17.38, + "step": 205, + "token_acc": 0.9549334243769204, + "train_speed(iter/s)": 0.203391 + }, + { + "epoch": 0.03771043771043771, + "grad_norm": 5.032042980194092, + "learning_rate": 7.526881720430108e-06, + "loss": 0.09127090573310852, + "memory(GiB)": 17.38, + "step": 210, + "token_acc": 0.9497907949790795, + "train_speed(iter/s)": 0.204121 + }, + { + "epoch": 0.038608305274971944, + "grad_norm": 5.121944904327393, + "learning_rate": 7.706093189964159e-06, + "loss": 0.0885853886604309, + "memory(GiB)": 17.38, + "step": 215, + "token_acc": 0.9567901234567902, + "train_speed(iter/s)": 0.204816 + }, + { + "epoch": 0.03950617283950617, + "grad_norm": 1.8794276714324951, + "learning_rate": 7.88530465949821e-06, + "loss": 0.08779671192169189, + "memory(GiB)": 17.38, + "step": 220, + "token_acc": 0.9578059071729957, + "train_speed(iter/s)": 0.205524 + }, + { + "epoch": 0.04040404040404041, + "grad_norm": 7.9404096603393555, + "learning_rate": 8.064516129032258e-06, + "loss": 0.08796645998954773, + "memory(GiB)": 17.38, + "step": 225, + "token_acc": 0.9456066945606695, + "train_speed(iter/s)": 0.206162 + }, + { + "epoch": 0.041301907968574636, + "grad_norm": 2.2835612297058105, + "learning_rate": 8.24372759856631e-06, + "loss": 0.07854433655738831, + "memory(GiB)": 17.38, + "step": 230, + "token_acc": 0.9541666666666667, + "train_speed(iter/s)": 0.20677 + }, + { + "epoch": 0.042199775533108864, + "grad_norm": 8.72272777557373, + "learning_rate": 8.422939068100358e-06, + "loss": 0.07751513719558716, + "memory(GiB)": 17.38, + "step": 235, + "token_acc": 0.9605809128630706, + "train_speed(iter/s)": 0.207354 + }, + { + "epoch": 0.0430976430976431, + "grad_norm": 18.351613998413086, + "learning_rate": 8.602150537634409e-06, + "loss": 0.09581940174102783, + "memory(GiB)": 17.38, + "step": 240, + "token_acc": 0.9621848739495799, + "train_speed(iter/s)": 0.20793 + }, + { + "epoch": 0.04399551066217733, + "grad_norm": 7.6089982986450195, + "learning_rate": 8.78136200716846e-06, + "loss": 0.09454305171966552, + "memory(GiB)": 17.38, + "step": 245, + "token_acc": 0.9430379746835443, + "train_speed(iter/s)": 0.208462 + }, + { + "epoch": 0.04489337822671156, + "grad_norm": 6.246233940124512, + "learning_rate": 8.96057347670251e-06, + "loss": 0.08203399181365967, + "memory(GiB)": 17.38, + "step": 250, + "token_acc": 0.9576271186440678, + "train_speed(iter/s)": 0.20899 + }, + { + "epoch": 0.04579124579124579, + "grad_norm": 2.6615347862243652, + "learning_rate": 9.13978494623656e-06, + "loss": 0.06987979412078857, + "memory(GiB)": 17.38, + "step": 255, + "token_acc": 0.9715447154471545, + "train_speed(iter/s)": 0.209505 + }, + { + "epoch": 0.04668911335578002, + "grad_norm": 2.540647506713867, + "learning_rate": 9.31899641577061e-06, + "loss": 0.07735643386840821, + "memory(GiB)": 17.38, + "step": 260, + "token_acc": 0.9537815126050421, + "train_speed(iter/s)": 0.209977 + }, + { + "epoch": 0.047586980920314256, + "grad_norm": 7.764015197753906, + "learning_rate": 9.49820788530466e-06, + "loss": 0.07664202451705933, + "memory(GiB)": 17.38, + "step": 265, + "token_acc": 0.9521739130434783, + "train_speed(iter/s)": 0.210475 + }, + { + "epoch": 0.048484848484848485, + "grad_norm": 2.772840976715088, + "learning_rate": 9.67741935483871e-06, + "loss": 0.07115872502326966, + "memory(GiB)": 17.38, + "step": 270, + "token_acc": 0.9581589958158996, + "train_speed(iter/s)": 0.21096 + }, + { + "epoch": 0.04938271604938271, + "grad_norm": 14.721755981445312, + "learning_rate": 9.856630824372761e-06, + "loss": 0.0990296721458435, + "memory(GiB)": 17.38, + "step": 275, + "token_acc": 0.9596774193548387, + "train_speed(iter/s)": 0.211428 + }, + { + "epoch": 0.05028058361391695, + "grad_norm": 3.6788995265960693, + "learning_rate": 9.999999117951213e-06, + "loss": 0.07492555975914002, + "memory(GiB)": 17.38, + "step": 280, + "token_acc": 0.9529914529914529, + "train_speed(iter/s)": 0.211877 + }, + { + "epoch": 0.05117845117845118, + "grad_norm": 2.780642032623291, + "learning_rate": 9.99996824627633e-06, + "loss": 0.07523268461227417, + "memory(GiB)": 17.38, + "step": 285, + "token_acc": 0.9595141700404858, + "train_speed(iter/s)": 0.212313 + }, + { + "epoch": 0.05207631874298541, + "grad_norm": 2.495203971862793, + "learning_rate": 9.999893272473282e-06, + "loss": 0.08823745250701905, + "memory(GiB)": 17.38, + "step": 290, + "token_acc": 0.9481327800829875, + "train_speed(iter/s)": 0.212731 + }, + { + "epoch": 0.05297418630751964, + "grad_norm": 1.4152640104293823, + "learning_rate": 9.999774197203372e-06, + "loss": 0.0693905234336853, + "memory(GiB)": 17.38, + "step": 295, + "token_acc": 0.9605809128630706, + "train_speed(iter/s)": 0.21314 + }, + { + "epoch": 0.05387205387205387, + "grad_norm": 3.5159544944763184, + "learning_rate": 9.999611021516902e-06, + "loss": 0.06859614849090576, + "memory(GiB)": 17.38, + "step": 300, + "token_acc": 0.96, + "train_speed(iter/s)": 0.213527 + }, + { + "epoch": 0.05387205387205387, + "eval_loss": 0.06587199866771698, + "eval_runtime": 62.9847, + "eval_samples_per_second": 14.289, + "eval_steps_per_second": 7.145, + "eval_token_acc": 0.9583256158547879, + "step": 300 + }, + { + "epoch": 0.054769921436588105, + "grad_norm": 2.6069297790527344, + "learning_rate": 9.99940374685316e-06, + "loss": 0.06800574660301209, + "memory(GiB)": 17.38, + "step": 305, + "token_acc": 0.9598427887901572, + "train_speed(iter/s)": 0.203648 + }, + { + "epoch": 0.055667789001122334, + "grad_norm": 2.0542044639587402, + "learning_rate": 9.999152375040408e-06, + "loss": 0.0701672077178955, + "memory(GiB)": 17.38, + "step": 310, + "token_acc": 0.9573170731707317, + "train_speed(iter/s)": 0.204137 + }, + { + "epoch": 0.05656565656565657, + "grad_norm": 1.3478411436080933, + "learning_rate": 9.998856908295869e-06, + "loss": 0.06529048681259156, + "memory(GiB)": 17.38, + "step": 315, + "token_acc": 0.9524793388429752, + "train_speed(iter/s)": 0.20462 + }, + { + "epoch": 0.0574635241301908, + "grad_norm": 1.0773569345474243, + "learning_rate": 9.998517349225698e-06, + "loss": 0.07465919852256775, + "memory(GiB)": 17.38, + "step": 320, + "token_acc": 0.9491525423728814, + "train_speed(iter/s)": 0.205093 + }, + { + "epoch": 0.058361391694725026, + "grad_norm": 6.35412073135376, + "learning_rate": 9.998133700824973e-06, + "loss": 0.07644960880279542, + "memory(GiB)": 17.38, + "step": 325, + "token_acc": 0.951417004048583, + "train_speed(iter/s)": 0.205544 + }, + { + "epoch": 0.05925925925925926, + "grad_norm": 2.644592761993408, + "learning_rate": 9.997705966477654e-06, + "loss": 0.06313223242759705, + "memory(GiB)": 17.38, + "step": 330, + "token_acc": 0.9594017094017094, + "train_speed(iter/s)": 0.205988 + }, + { + "epoch": 0.06015712682379349, + "grad_norm": 1.799385666847229, + "learning_rate": 9.997234149956568e-06, + "loss": 0.06203016042709351, + "memory(GiB)": 17.38, + "step": 335, + "token_acc": 0.9651639344262295, + "train_speed(iter/s)": 0.206407 + }, + { + "epoch": 0.06105499438832772, + "grad_norm": 1.2612967491149902, + "learning_rate": 9.99671825542336e-06, + "loss": 0.06435502767562866, + "memory(GiB)": 17.38, + "step": 340, + "token_acc": 0.9693877551020408, + "train_speed(iter/s)": 0.206823 + }, + { + "epoch": 0.061952861952861954, + "grad_norm": 1.3353652954101562, + "learning_rate": 9.996158287428472e-06, + "loss": 0.05762578845024109, + "memory(GiB)": 17.38, + "step": 345, + "token_acc": 0.966804979253112, + "train_speed(iter/s)": 0.2072 + }, + { + "epoch": 0.06285072951739619, + "grad_norm": 0.585201621055603, + "learning_rate": 9.99555425091109e-06, + "loss": 0.06478224992752075, + "memory(GiB)": 17.38, + "step": 350, + "token_acc": 0.9506437768240343, + "train_speed(iter/s)": 0.207601 + }, + { + "epoch": 0.06374859708193041, + "grad_norm": 4.670670032501221, + "learning_rate": 9.994906151199106e-06, + "loss": 0.06622880697250366, + "memory(GiB)": 17.38, + "step": 355, + "token_acc": 0.9636752136752137, + "train_speed(iter/s)": 0.207977 + }, + { + "epoch": 0.06464646464646465, + "grad_norm": 0.9509394764900208, + "learning_rate": 9.994213994009073e-06, + "loss": 0.05616945028305054, + "memory(GiB)": 17.38, + "step": 360, + "token_acc": 0.970954356846473, + "train_speed(iter/s)": 0.208345 + }, + { + "epoch": 0.06554433221099888, + "grad_norm": 1.6002154350280762, + "learning_rate": 9.993477785446151e-06, + "loss": 0.08915088176727295, + "memory(GiB)": 17.38, + "step": 365, + "token_acc": 0.9700854700854701, + "train_speed(iter/s)": 0.208713 + }, + { + "epoch": 0.0664421997755331, + "grad_norm": 1.428268551826477, + "learning_rate": 9.992697532004052e-06, + "loss": 0.07786840200424194, + "memory(GiB)": 17.38, + "step": 370, + "token_acc": 0.9446808510638298, + "train_speed(iter/s)": 0.209072 + }, + { + "epoch": 0.06734006734006734, + "grad_norm": 9.231217384338379, + "learning_rate": 9.991873240564988e-06, + "loss": 0.07174537181854249, + "memory(GiB)": 17.38, + "step": 375, + "token_acc": 0.9547325102880658, + "train_speed(iter/s)": 0.209421 + }, + { + "epoch": 0.06823793490460157, + "grad_norm": 1.6220571994781494, + "learning_rate": 9.991004918399609e-06, + "loss": 0.08527856469154357, + "memory(GiB)": 17.38, + "step": 380, + "token_acc": 0.9465020576131687, + "train_speed(iter/s)": 0.209772 + }, + { + "epoch": 0.0691358024691358, + "grad_norm": 0.5473368167877197, + "learning_rate": 9.990092573166932e-06, + "loss": 0.06169809699058533, + "memory(GiB)": 17.38, + "step": 385, + "token_acc": 0.9588477366255144, + "train_speed(iter/s)": 0.210108 + }, + { + "epoch": 0.07003367003367003, + "grad_norm": 0.6322096586227417, + "learning_rate": 9.989136212914282e-06, + "loss": 0.0714680790901184, + "memory(GiB)": 17.38, + "step": 390, + "token_acc": 0.9543568464730291, + "train_speed(iter/s)": 0.21044 + }, + { + "epoch": 0.07093153759820427, + "grad_norm": 0.8629601001739502, + "learning_rate": 9.988135846077215e-06, + "loss": 0.0808696448802948, + "memory(GiB)": 17.38, + "step": 395, + "token_acc": 0.9645833333333333, + "train_speed(iter/s)": 0.210762 + }, + { + "epoch": 0.0718294051627385, + "grad_norm": 1.1042979955673218, + "learning_rate": 9.987091481479453e-06, + "loss": 0.05970144271850586, + "memory(GiB)": 17.38, + "step": 400, + "token_acc": 0.9585062240663901, + "train_speed(iter/s)": 0.211069 + }, + { + "epoch": 0.0718294051627385, + "eval_loss": 0.07266301661729813, + "eval_runtime": 63.052, + "eval_samples_per_second": 14.274, + "eval_steps_per_second": 7.137, + "eval_token_acc": 0.9553621040933507, + "step": 400 + }, + { + "epoch": 0.07272727272727272, + "grad_norm": 0.7640923857688904, + "learning_rate": 9.986003128332793e-06, + "loss": 0.06673493981361389, + "memory(GiB)": 17.38, + "step": 405, + "token_acc": 0.9562990781836804, + "train_speed(iter/s)": 0.203767 + }, + { + "epoch": 0.07362514029180696, + "grad_norm": 1.1157026290893555, + "learning_rate": 9.984870796237031e-06, + "loss": 0.06565943360328674, + "memory(GiB)": 17.38, + "step": 410, + "token_acc": 0.94375, + "train_speed(iter/s)": 0.204128 + }, + { + "epoch": 0.0745230078563412, + "grad_norm": 1.5420334339141846, + "learning_rate": 9.983694495179885e-06, + "loss": 0.06286911964416504, + "memory(GiB)": 17.38, + "step": 415, + "token_acc": 0.9576271186440678, + "train_speed(iter/s)": 0.20448 + }, + { + "epoch": 0.07542087542087542, + "grad_norm": 1.2485747337341309, + "learning_rate": 9.982474235536896e-06, + "loss": 0.05981000065803528, + "memory(GiB)": 17.38, + "step": 420, + "token_acc": 0.9549180327868853, + "train_speed(iter/s)": 0.204835 + }, + { + "epoch": 0.07631874298540965, + "grad_norm": 0.9268509745597839, + "learning_rate": 9.981210028071342e-06, + "loss": 0.06386615633964539, + "memory(GiB)": 17.38, + "step": 425, + "token_acc": 0.9545454545454546, + "train_speed(iter/s)": 0.205157 + }, + { + "epoch": 0.07721661054994389, + "grad_norm": 0.8398964405059814, + "learning_rate": 9.97990188393414e-06, + "loss": 0.06617263555526734, + "memory(GiB)": 17.38, + "step": 430, + "token_acc": 0.9564315352697096, + "train_speed(iter/s)": 0.205496 + }, + { + "epoch": 0.07811447811447811, + "grad_norm": 4.99791145324707, + "learning_rate": 9.978549814663751e-06, + "loss": 0.06504554152488709, + "memory(GiB)": 17.38, + "step": 435, + "token_acc": 0.959915611814346, + "train_speed(iter/s)": 0.205824 + }, + { + "epoch": 0.07901234567901234, + "grad_norm": 3.2876806259155273, + "learning_rate": 9.977153832186082e-06, + "loss": 0.07636173963546752, + "memory(GiB)": 17.38, + "step": 440, + "token_acc": 0.9602510460251046, + "train_speed(iter/s)": 0.206146 + }, + { + "epoch": 0.07991021324354658, + "grad_norm": 4.486605167388916, + "learning_rate": 9.975713948814365e-06, + "loss": 0.06614078283309936, + "memory(GiB)": 17.38, + "step": 445, + "token_acc": 0.9527896995708155, + "train_speed(iter/s)": 0.206469 + }, + { + "epoch": 0.08080808080808081, + "grad_norm": 2.2590088844299316, + "learning_rate": 9.97423017724907e-06, + "loss": 0.0652412235736847, + "memory(GiB)": 17.38, + "step": 450, + "token_acc": 0.9493927125506073, + "train_speed(iter/s)": 0.206787 + }, + { + "epoch": 0.08170594837261504, + "grad_norm": 0.4276881217956543, + "learning_rate": 9.972702530577771e-06, + "loss": 0.06498476266860961, + "memory(GiB)": 17.38, + "step": 455, + "token_acc": 0.9604166666666667, + "train_speed(iter/s)": 0.207105 + }, + { + "epoch": 0.08260381593714927, + "grad_norm": 0.5902699828147888, + "learning_rate": 9.97113102227505e-06, + "loss": 0.07003803253173828, + "memory(GiB)": 17.38, + "step": 460, + "token_acc": 0.9564315352697096, + "train_speed(iter/s)": 0.207415 + }, + { + "epoch": 0.08350168350168351, + "grad_norm": 0.6259745359420776, + "learning_rate": 9.969515666202371e-06, + "loss": 0.06311853528022766, + "memory(GiB)": 17.38, + "step": 465, + "token_acc": 0.9558823529411765, + "train_speed(iter/s)": 0.207706 + }, + { + "epoch": 0.08439955106621773, + "grad_norm": 0.6515107154846191, + "learning_rate": 9.96785647660795e-06, + "loss": 0.08104856014251709, + "memory(GiB)": 17.38, + "step": 470, + "token_acc": 0.9585062240663901, + "train_speed(iter/s)": 0.208002 + }, + { + "epoch": 0.08529741863075196, + "grad_norm": 5.168325901031494, + "learning_rate": 9.966153468126635e-06, + "loss": 0.06628147959709167, + "memory(GiB)": 17.38, + "step": 475, + "token_acc": 0.9539748953974896, + "train_speed(iter/s)": 0.208298 + }, + { + "epoch": 0.0861952861952862, + "grad_norm": 1.1524102687835693, + "learning_rate": 9.964406655779786e-06, + "loss": 0.06002994179725647, + "memory(GiB)": 17.38, + "step": 480, + "token_acc": 0.9573170731707317, + "train_speed(iter/s)": 0.208575 + }, + { + "epoch": 0.08709315375982042, + "grad_norm": 0.7319135665893555, + "learning_rate": 9.962616054975126e-06, + "loss": 0.06634606122970581, + "memory(GiB)": 17.38, + "step": 485, + "token_acc": 0.9663865546218487, + "train_speed(iter/s)": 0.208845 + }, + { + "epoch": 0.08799102132435466, + "grad_norm": 1.7812848091125488, + "learning_rate": 9.960781681506617e-06, + "loss": 0.060043954849243165, + "memory(GiB)": 17.38, + "step": 490, + "token_acc": 0.956, + "train_speed(iter/s)": 0.209115 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 0.6321188807487488, + "learning_rate": 9.958903551554318e-06, + "loss": 0.06447861194610596, + "memory(GiB)": 17.38, + "step": 495, + "token_acc": 0.9625, + "train_speed(iter/s)": 0.209378 + }, + { + "epoch": 0.08978675645342311, + "grad_norm": 0.9982246160507202, + "learning_rate": 9.95698168168424e-06, + "loss": 0.05519977807998657, + "memory(GiB)": 17.38, + "step": 500, + "token_acc": 0.9734693877551021, + "train_speed(iter/s)": 0.209644 + }, + { + "epoch": 0.08978675645342311, + "eval_loss": 0.06810503453016281, + "eval_runtime": 62.6327, + "eval_samples_per_second": 14.369, + "eval_steps_per_second": 7.185, + "eval_token_acc": 0.9563808112613447, + "step": 500 + }, + { + "epoch": 0.09068462401795735, + "grad_norm": 0.8768059015274048, + "learning_rate": 9.955016088848197e-06, + "loss": 0.06417921781539918, + "memory(GiB)": 17.38, + "step": 505, + "token_acc": 0.9563804310639754, + "train_speed(iter/s)": 0.203904 + }, + { + "epoch": 0.09158249158249158, + "grad_norm": 1.075790286064148, + "learning_rate": 9.953006790383671e-06, + "loss": 0.07829501032829285, + "memory(GiB)": 17.38, + "step": 510, + "token_acc": 0.9508547008547008, + "train_speed(iter/s)": 0.204177 + }, + { + "epoch": 0.09248035914702582, + "grad_norm": 0.9587723016738892, + "learning_rate": 9.950953804013636e-06, + "loss": 0.06316940784454346, + "memory(GiB)": 17.38, + "step": 515, + "token_acc": 0.9615384615384616, + "train_speed(iter/s)": 0.204467 + }, + { + "epoch": 0.09337822671156004, + "grad_norm": 0.5903010964393616, + "learning_rate": 9.948857147846424e-06, + "loss": 0.05942028760910034, + "memory(GiB)": 17.38, + "step": 520, + "token_acc": 0.9497907949790795, + "train_speed(iter/s)": 0.204751 + }, + { + "epoch": 0.09427609427609428, + "grad_norm": 0.7420110702514648, + "learning_rate": 9.946716840375552e-06, + "loss": 0.06319589018821717, + "memory(GiB)": 17.38, + "step": 525, + "token_acc": 0.9576612903225806, + "train_speed(iter/s)": 0.205034 + }, + { + "epoch": 0.09517396184062851, + "grad_norm": 0.46533867716789246, + "learning_rate": 9.944532900479563e-06, + "loss": 0.0792462706565857, + "memory(GiB)": 17.38, + "step": 530, + "token_acc": 0.9595744680851064, + "train_speed(iter/s)": 0.205318 + }, + { + "epoch": 0.09607182940516273, + "grad_norm": 0.4853519797325134, + "learning_rate": 9.942305347421856e-06, + "loss": 0.06993516087532044, + "memory(GiB)": 17.38, + "step": 535, + "token_acc": 0.9621513944223108, + "train_speed(iter/s)": 0.205583 + }, + { + "epoch": 0.09696969696969697, + "grad_norm": 2.3817763328552246, + "learning_rate": 9.940034200850525e-06, + "loss": 0.066546630859375, + "memory(GiB)": 17.38, + "step": 540, + "token_acc": 0.9555084745762712, + "train_speed(iter/s)": 0.205847 + }, + { + "epoch": 0.0978675645342312, + "grad_norm": 0.4085097312927246, + "learning_rate": 9.937719480798174e-06, + "loss": 0.05827748775482178, + "memory(GiB)": 17.38, + "step": 545, + "token_acc": 0.9665271966527197, + "train_speed(iter/s)": 0.206114 + }, + { + "epoch": 0.09876543209876543, + "grad_norm": 0.8443474769592285, + "learning_rate": 9.935361207681753e-06, + "loss": 0.0626309871673584, + "memory(GiB)": 17.38, + "step": 550, + "token_acc": 0.9636752136752137, + "train_speed(iter/s)": 0.206363 + }, + { + "epoch": 0.09966329966329966, + "grad_norm": 1.0572654008865356, + "learning_rate": 9.932959402302363e-06, + "loss": 0.07306020259857178, + "memory(GiB)": 17.38, + "step": 555, + "token_acc": 0.9541484716157205, + "train_speed(iter/s)": 0.206617 + }, + { + "epoch": 0.1005611672278339, + "grad_norm": 0.46259805560112, + "learning_rate": 9.930514085845084e-06, + "loss": 0.06064082384109497, + "memory(GiB)": 17.38, + "step": 560, + "token_acc": 0.959915611814346, + "train_speed(iter/s)": 0.206882 + }, + { + "epoch": 0.10145903479236812, + "grad_norm": 1.1355466842651367, + "learning_rate": 9.928025279878789e-06, + "loss": 0.06170288324356079, + "memory(GiB)": 17.38, + "step": 565, + "token_acc": 0.9537815126050421, + "train_speed(iter/s)": 0.207126 + }, + { + "epoch": 0.10235690235690235, + "grad_norm": 1.021114706993103, + "learning_rate": 9.92549300635594e-06, + "loss": 0.0675793468952179, + "memory(GiB)": 17.38, + "step": 570, + "token_acc": 0.951063829787234, + "train_speed(iter/s)": 0.207375 + }, + { + "epoch": 0.10325476992143659, + "grad_norm": 1.0056439638137817, + "learning_rate": 9.922917287612412e-06, + "loss": 0.06039591431617737, + "memory(GiB)": 17.38, + "step": 575, + "token_acc": 0.9581589958158996, + "train_speed(iter/s)": 0.207614 + }, + { + "epoch": 0.10415263748597083, + "grad_norm": 2.023305892944336, + "learning_rate": 9.920298146367287e-06, + "loss": 0.061490225791931155, + "memory(GiB)": 17.38, + "step": 580, + "token_acc": 0.9541666666666667, + "train_speed(iter/s)": 0.207856 + }, + { + "epoch": 0.10505050505050505, + "grad_norm": 0.39992204308509827, + "learning_rate": 9.917635605722648e-06, + "loss": 0.058789968490600586, + "memory(GiB)": 17.38, + "step": 585, + "token_acc": 0.95625, + "train_speed(iter/s)": 0.208093 + }, + { + "epoch": 0.10594837261503928, + "grad_norm": 0.3619423508644104, + "learning_rate": 9.914929689163389e-06, + "loss": 0.06168057918548584, + "memory(GiB)": 17.38, + "step": 590, + "token_acc": 0.9523809523809523, + "train_speed(iter/s)": 0.208323 + }, + { + "epoch": 0.10684624017957352, + "grad_norm": 0.3399408757686615, + "learning_rate": 9.912180420556996e-06, + "loss": 0.06328504085540772, + "memory(GiB)": 17.38, + "step": 595, + "token_acc": 0.9545454545454546, + "train_speed(iter/s)": 0.208554 + }, + { + "epoch": 0.10774410774410774, + "grad_norm": 0.2703775465488434, + "learning_rate": 9.909387824153342e-06, + "loss": 0.058006536960601804, + "memory(GiB)": 17.38, + "step": 600, + "token_acc": 0.9705882352941176, + "train_speed(iter/s)": 0.208782 + }, + { + "epoch": 0.10774410774410774, + "eval_loss": 0.06153036281466484, + "eval_runtime": 63.0374, + "eval_samples_per_second": 14.277, + "eval_steps_per_second": 7.139, + "eval_token_acc": 0.9577699573995184, + "step": 600 + }, + { + "epoch": 0.10864197530864197, + "grad_norm": 0.3182883560657501, + "learning_rate": 9.906551924584474e-06, + "loss": 0.0692869246006012, + "memory(GiB)": 17.38, + "step": 605, + "token_acc": 0.9564697849095254, + "train_speed(iter/s)": 0.203965 + }, + { + "epoch": 0.10953984287317621, + "grad_norm": 0.32532423734664917, + "learning_rate": 9.903672746864388e-06, + "loss": 0.0589275062084198, + "memory(GiB)": 17.38, + "step": 610, + "token_acc": 0.9430379746835443, + "train_speed(iter/s)": 0.204215 + }, + { + "epoch": 0.11043771043771043, + "grad_norm": 0.49120432138442993, + "learning_rate": 9.900750316388824e-06, + "loss": 0.06472208499908447, + "memory(GiB)": 17.38, + "step": 615, + "token_acc": 0.946058091286307, + "train_speed(iter/s)": 0.204466 + }, + { + "epoch": 0.11133557800224467, + "grad_norm": 0.3534823954105377, + "learning_rate": 9.897784658935024e-06, + "loss": 0.06211544275283813, + "memory(GiB)": 17.38, + "step": 620, + "token_acc": 0.9623430962343096, + "train_speed(iter/s)": 0.204715 + }, + { + "epoch": 0.1122334455667789, + "grad_norm": 0.5427143573760986, + "learning_rate": 9.894775800661512e-06, + "loss": 0.05923385620117187, + "memory(GiB)": 17.38, + "step": 625, + "token_acc": 0.9588477366255144, + "train_speed(iter/s)": 0.204958 + }, + { + "epoch": 0.11313131313131314, + "grad_norm": 0.34281685948371887, + "learning_rate": 9.891723768107873e-06, + "loss": 0.05856308341026306, + "memory(GiB)": 17.38, + "step": 630, + "token_acc": 0.9602510460251046, + "train_speed(iter/s)": 0.205192 + }, + { + "epoch": 0.11402918069584736, + "grad_norm": 0.6786876916885376, + "learning_rate": 9.888628588194499e-06, + "loss": 0.06303842663764954, + "memory(GiB)": 17.38, + "step": 635, + "token_acc": 0.9470338983050848, + "train_speed(iter/s)": 0.20542 + }, + { + "epoch": 0.1149270482603816, + "grad_norm": 0.46106839179992676, + "learning_rate": 9.88549028822237e-06, + "loss": 0.05881195068359375, + "memory(GiB)": 17.38, + "step": 640, + "token_acc": 0.9600840336134454, + "train_speed(iter/s)": 0.205649 + }, + { + "epoch": 0.11582491582491583, + "grad_norm": 0.5666362047195435, + "learning_rate": 9.882308895872803e-06, + "loss": 0.060858654975891116, + "memory(GiB)": 17.38, + "step": 645, + "token_acc": 0.9604166666666667, + "train_speed(iter/s)": 0.205878 + }, + { + "epoch": 0.11672278338945005, + "grad_norm": 0.5955503582954407, + "learning_rate": 9.879084439207211e-06, + "loss": 0.0706628680229187, + "memory(GiB)": 17.38, + "step": 650, + "token_acc": 0.9583333333333334, + "train_speed(iter/s)": 0.2061 + }, + { + "epoch": 0.11762065095398429, + "grad_norm": 0.44884777069091797, + "learning_rate": 9.875816946666856e-06, + "loss": 0.06129485368728638, + "memory(GiB)": 17.38, + "step": 655, + "token_acc": 0.9602510460251046, + "train_speed(iter/s)": 0.206311 + }, + { + "epoch": 0.11851851851851852, + "grad_norm": 0.6378412246704102, + "learning_rate": 9.872506447072594e-06, + "loss": 0.0718530535697937, + "memory(GiB)": 17.38, + "step": 660, + "token_acc": 0.951417004048583, + "train_speed(iter/s)": 0.20652 + }, + { + "epoch": 0.11941638608305274, + "grad_norm": 0.38415154814720154, + "learning_rate": 9.869152969624626e-06, + "loss": 0.05956906080245972, + "memory(GiB)": 17.38, + "step": 665, + "token_acc": 0.9573170731707317, + "train_speed(iter/s)": 0.206733 + }, + { + "epoch": 0.12031425364758698, + "grad_norm": 3.7167587280273438, + "learning_rate": 9.865756543902238e-06, + "loss": 0.06507371664047241, + "memory(GiB)": 17.38, + "step": 670, + "token_acc": 0.9645833333333333, + "train_speed(iter/s)": 0.206945 + }, + { + "epoch": 0.12121212121212122, + "grad_norm": 5.0985188484191895, + "learning_rate": 9.862317199863545e-06, + "loss": 0.07170389890670777, + "memory(GiB)": 17.38, + "step": 675, + "token_acc": 0.9493670886075949, + "train_speed(iter/s)": 0.207156 + }, + { + "epoch": 0.12210998877665544, + "grad_norm": 5.764897346496582, + "learning_rate": 9.858834967845212e-06, + "loss": 0.07747161388397217, + "memory(GiB)": 17.38, + "step": 680, + "token_acc": 0.950207468879668, + "train_speed(iter/s)": 0.207362 + }, + { + "epoch": 0.12300785634118967, + "grad_norm": 0.6894210577011108, + "learning_rate": 9.855309878562205e-06, + "loss": 0.06004178524017334, + "memory(GiB)": 17.38, + "step": 685, + "token_acc": 0.9471544715447154, + "train_speed(iter/s)": 0.207564 + }, + { + "epoch": 0.12390572390572391, + "grad_norm": 1.336855173110962, + "learning_rate": 9.851741963107511e-06, + "loss": 0.06509203910827636, + "memory(GiB)": 17.38, + "step": 690, + "token_acc": 0.9638297872340426, + "train_speed(iter/s)": 0.207775 + }, + { + "epoch": 0.12480359147025814, + "grad_norm": 0.7053393125534058, + "learning_rate": 9.848131252951861e-06, + "loss": 0.057686471939086915, + "memory(GiB)": 17.38, + "step": 695, + "token_acc": 0.975609756097561, + "train_speed(iter/s)": 0.207969 + }, + { + "epoch": 0.12570145903479238, + "grad_norm": 1.207558035850525, + "learning_rate": 9.844477779943458e-06, + "loss": 0.05900728702545166, + "memory(GiB)": 17.38, + "step": 700, + "token_acc": 0.9489795918367347, + "train_speed(iter/s)": 0.208168 + }, + { + "epoch": 0.12570145903479238, + "eval_loss": 0.06005348265171051, + "eval_runtime": 63.0721, + "eval_samples_per_second": 14.269, + "eval_steps_per_second": 7.135, + "eval_token_acc": 0.9584182255973328, + "step": 700 + }, + { + "epoch": 0.1265993265993266, + "grad_norm": 1.1200499534606934, + "learning_rate": 9.840781576307694e-06, + "loss": 0.05839783549308777, + "memory(GiB)": 17.38, + "step": 705, + "token_acc": 0.9596994535519126, + "train_speed(iter/s)": 0.204055 + }, + { + "epoch": 0.12749719416386082, + "grad_norm": 1.4395155906677246, + "learning_rate": 9.837042674646864e-06, + "loss": 0.06512539982795715, + "memory(GiB)": 17.38, + "step": 710, + "token_acc": 0.9629629629629629, + "train_speed(iter/s)": 0.204268 + }, + { + "epoch": 0.12839506172839507, + "grad_norm": 0.6217043995857239, + "learning_rate": 9.833261107939885e-06, + "loss": 0.06195682883262634, + "memory(GiB)": 17.38, + "step": 715, + "token_acc": 0.9416666666666667, + "train_speed(iter/s)": 0.20447 + }, + { + "epoch": 0.1292929292929293, + "grad_norm": 0.791068434715271, + "learning_rate": 9.829436909541991e-06, + "loss": 0.059240984916687014, + "memory(GiB)": 17.38, + "step": 720, + "token_acc": 0.9650205761316872, + "train_speed(iter/s)": 0.204675 + }, + { + "epoch": 0.13019079685746351, + "grad_norm": 0.9022353291511536, + "learning_rate": 9.82557011318446e-06, + "loss": 0.06719270944595337, + "memory(GiB)": 17.38, + "step": 725, + "token_acc": 0.9641350210970464, + "train_speed(iter/s)": 0.204877 + }, + { + "epoch": 0.13108866442199776, + "grad_norm": 1.0495399236679077, + "learning_rate": 9.821660752974294e-06, + "loss": 0.061438947916030884, + "memory(GiB)": 17.38, + "step": 730, + "token_acc": 0.9530612244897959, + "train_speed(iter/s)": 0.205069 + }, + { + "epoch": 0.13198653198653199, + "grad_norm": 0.9535583853721619, + "learning_rate": 9.817708863393934e-06, + "loss": 0.06009225845336914, + "memory(GiB)": 17.38, + "step": 735, + "token_acc": 0.9602510460251046, + "train_speed(iter/s)": 0.205274 + }, + { + "epoch": 0.1328843995510662, + "grad_norm": 5.588057518005371, + "learning_rate": 9.813714479300951e-06, + "loss": 0.06616498827934265, + "memory(GiB)": 17.38, + "step": 740, + "token_acc": 0.9553191489361702, + "train_speed(iter/s)": 0.205473 + }, + { + "epoch": 0.13378226711560046, + "grad_norm": 0.4241677522659302, + "learning_rate": 9.809677635927735e-06, + "loss": 0.0572514533996582, + "memory(GiB)": 17.38, + "step": 745, + "token_acc": 0.9615384615384616, + "train_speed(iter/s)": 0.205669 + }, + { + "epoch": 0.13468013468013468, + "grad_norm": 1.185747504234314, + "learning_rate": 9.80559836888119e-06, + "loss": 0.06120376586914063, + "memory(GiB)": 17.38, + "step": 750, + "token_acc": 0.9610655737704918, + "train_speed(iter/s)": 0.205857 + }, + { + "epoch": 0.1355780022446689, + "grad_norm": 0.8714107275009155, + "learning_rate": 9.801476714142414e-06, + "loss": 0.06542506217956542, + "memory(GiB)": 17.38, + "step": 755, + "token_acc": 0.9669421487603306, + "train_speed(iter/s)": 0.206049 + }, + { + "epoch": 0.13647586980920315, + "grad_norm": 1.6032990217208862, + "learning_rate": 9.79731270806639e-06, + "loss": 0.06454580426216125, + "memory(GiB)": 17.38, + "step": 760, + "token_acc": 0.9625, + "train_speed(iter/s)": 0.206237 + }, + { + "epoch": 0.13737373737373737, + "grad_norm": 5.791224002838135, + "learning_rate": 9.793106387381654e-06, + "loss": 0.05988554954528809, + "memory(GiB)": 17.38, + "step": 765, + "token_acc": 0.9672131147540983, + "train_speed(iter/s)": 0.206424 + }, + { + "epoch": 0.1382716049382716, + "grad_norm": 0.5975757837295532, + "learning_rate": 9.788857789189985e-06, + "loss": 0.060889732837677, + "memory(GiB)": 17.38, + "step": 770, + "token_acc": 0.9572649572649573, + "train_speed(iter/s)": 0.206606 + }, + { + "epoch": 0.13916947250280584, + "grad_norm": 0.6202221512794495, + "learning_rate": 9.78456695096606e-06, + "loss": 0.056883621215820315, + "memory(GiB)": 17.38, + "step": 775, + "token_acc": 0.959349593495935, + "train_speed(iter/s)": 0.206791 + }, + { + "epoch": 0.14006734006734006, + "grad_norm": 1.3406264781951904, + "learning_rate": 9.780233910557142e-06, + "loss": 0.06375112533569335, + "memory(GiB)": 17.38, + "step": 780, + "token_acc": 0.966804979253112, + "train_speed(iter/s)": 0.206974 + }, + { + "epoch": 0.1409652076318743, + "grad_norm": 0.5974138975143433, + "learning_rate": 9.775858706182734e-06, + "loss": 0.0632392406463623, + "memory(GiB)": 17.38, + "step": 785, + "token_acc": 0.9470338983050848, + "train_speed(iter/s)": 0.207159 + }, + { + "epoch": 0.14186307519640853, + "grad_norm": 0.8759323954582214, + "learning_rate": 9.771441376434247e-06, + "loss": 0.065260511636734, + "memory(GiB)": 17.38, + "step": 790, + "token_acc": 0.9432773109243697, + "train_speed(iter/s)": 0.207335 + }, + { + "epoch": 0.14276094276094276, + "grad_norm": 0.44687405228614807, + "learning_rate": 9.766981960274653e-06, + "loss": 0.05966047644615173, + "memory(GiB)": 17.38, + "step": 795, + "token_acc": 0.9564315352697096, + "train_speed(iter/s)": 0.207512 + }, + { + "epoch": 0.143658810325477, + "grad_norm": 0.5833959579467773, + "learning_rate": 9.762480497038155e-06, + "loss": 0.06292394995689392, + "memory(GiB)": 17.38, + "step": 800, + "token_acc": 0.9605809128630706, + "train_speed(iter/s)": 0.207688 + }, + { + "epoch": 0.143658810325477, + "eval_loss": 0.06439540535211563, + "eval_runtime": 63.6052, + "eval_samples_per_second": 14.15, + "eval_steps_per_second": 7.075, + "eval_token_acc": 0.9573995184293388, + "step": 800 + }, + { + "epoch": 0.14455667789001123, + "grad_norm": 1.3725666999816895, + "learning_rate": 9.757937026429825e-06, + "loss": 0.06043773889541626, + "memory(GiB)": 17.38, + "step": 805, + "token_acc": 0.9573484069886947, + "train_speed(iter/s)": 0.204069 + }, + { + "epoch": 0.14545454545454545, + "grad_norm": 4.164862632751465, + "learning_rate": 9.753351588525261e-06, + "loss": 0.06062279343605041, + "memory(GiB)": 17.38, + "step": 810, + "token_acc": 0.9688796680497925, + "train_speed(iter/s)": 0.204255 + }, + { + "epoch": 0.1463524130190797, + "grad_norm": 0.7628666162490845, + "learning_rate": 9.748724223770234e-06, + "loss": 0.0616323709487915, + "memory(GiB)": 17.38, + "step": 815, + "token_acc": 0.95625, + "train_speed(iter/s)": 0.204445 + }, + { + "epoch": 0.14725028058361392, + "grad_norm": 1.8662331104278564, + "learning_rate": 9.744054972980332e-06, + "loss": 0.06144759654998779, + "memory(GiB)": 17.38, + "step": 820, + "token_acc": 0.943089430894309, + "train_speed(iter/s)": 0.204622 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 0.4211827516555786, + "learning_rate": 9.739343877340595e-06, + "loss": 0.05864361524581909, + "memory(GiB)": 17.38, + "step": 825, + "token_acc": 0.952191235059761, + "train_speed(iter/s)": 0.204796 + }, + { + "epoch": 0.1490460157126824, + "grad_norm": 0.5156888961791992, + "learning_rate": 9.734590978405156e-06, + "loss": 0.06171987652778625, + "memory(GiB)": 17.38, + "step": 830, + "token_acc": 0.9560669456066946, + "train_speed(iter/s)": 0.204973 + }, + { + "epoch": 0.1499438832772166, + "grad_norm": 0.8593261241912842, + "learning_rate": 9.729796318096871e-06, + "loss": 0.060599392652511595, + "memory(GiB)": 17.38, + "step": 835, + "token_acc": 0.9641350210970464, + "train_speed(iter/s)": 0.205145 + }, + { + "epoch": 0.15084175084175083, + "grad_norm": 1.611090898513794, + "learning_rate": 9.724959938706956e-06, + "loss": 0.06412776112556458, + "memory(GiB)": 17.38, + "step": 840, + "token_acc": 0.9644351464435147, + "train_speed(iter/s)": 0.20532 + }, + { + "epoch": 0.15173961840628508, + "grad_norm": 0.6378880739212036, + "learning_rate": 9.720081882894604e-06, + "loss": 0.06018850803375244, + "memory(GiB)": 17.38, + "step": 845, + "token_acc": 0.9613821138211383, + "train_speed(iter/s)": 0.20549 + }, + { + "epoch": 0.1526374859708193, + "grad_norm": 0.43050727248191833, + "learning_rate": 9.715162193686619e-06, + "loss": 0.06343936920166016, + "memory(GiB)": 17.38, + "step": 850, + "token_acc": 0.9545454545454546, + "train_speed(iter/s)": 0.205665 + }, + { + "epoch": 0.15353535353535352, + "grad_norm": 0.5668479204177856, + "learning_rate": 9.710200914477026e-06, + "loss": 0.06395221948623657, + "memory(GiB)": 17.38, + "step": 855, + "token_acc": 0.9560669456066946, + "train_speed(iter/s)": 0.205835 + }, + { + "epoch": 0.15443322109988777, + "grad_norm": 0.577335000038147, + "learning_rate": 9.705198089026701e-06, + "loss": 0.06996033787727356, + "memory(GiB)": 17.38, + "step": 860, + "token_acc": 0.959915611814346, + "train_speed(iter/s)": 0.206003 + }, + { + "epoch": 0.155331088664422, + "grad_norm": 0.55158531665802, + "learning_rate": 9.700153761462974e-06, + "loss": 0.05910910367965698, + "memory(GiB)": 17.38, + "step": 865, + "token_acc": 0.9586776859504132, + "train_speed(iter/s)": 0.206168 + }, + { + "epoch": 0.15622895622895622, + "grad_norm": 0.5211525559425354, + "learning_rate": 9.69506797627924e-06, + "loss": 0.06616908311843872, + "memory(GiB)": 17.38, + "step": 870, + "token_acc": 0.9642857142857143, + "train_speed(iter/s)": 0.206338 + }, + { + "epoch": 0.15712682379349047, + "grad_norm": 0.49179428815841675, + "learning_rate": 9.689940778334579e-06, + "loss": 0.06158690452575684, + "memory(GiB)": 17.38, + "step": 875, + "token_acc": 0.9543568464730291, + "train_speed(iter/s)": 0.2065 + }, + { + "epoch": 0.1580246913580247, + "grad_norm": 0.5869054794311523, + "learning_rate": 9.684772212853341e-06, + "loss": 0.061235034465789796, + "memory(GiB)": 17.38, + "step": 880, + "token_acc": 0.9541666666666667, + "train_speed(iter/s)": 0.20666 + }, + { + "epoch": 0.1589225589225589, + "grad_norm": 0.31923583149909973, + "learning_rate": 9.679562325424767e-06, + "loss": 0.06122039556503296, + "memory(GiB)": 17.38, + "step": 885, + "token_acc": 0.9510204081632653, + "train_speed(iter/s)": 0.206825 + }, + { + "epoch": 0.15982042648709316, + "grad_norm": 0.3428490459918976, + "learning_rate": 9.674311162002573e-06, + "loss": 0.059962010383605956, + "memory(GiB)": 17.38, + "step": 890, + "token_acc": 0.9516806722689075, + "train_speed(iter/s)": 0.207002 + }, + { + "epoch": 0.16071829405162738, + "grad_norm": 0.4209570288658142, + "learning_rate": 9.669018768904545e-06, + "loss": 0.06541280150413513, + "memory(GiB)": 17.38, + "step": 895, + "token_acc": 0.9578059071729957, + "train_speed(iter/s)": 0.207165 + }, + { + "epoch": 0.16161616161616163, + "grad_norm": 0.31668972969055176, + "learning_rate": 9.663685192812147e-06, + "loss": 0.062369120121002194, + "memory(GiB)": 17.38, + "step": 900, + "token_acc": 0.9568965517241379, + "train_speed(iter/s)": 0.207332 + }, + { + "epoch": 0.16161616161616163, + "eval_loss": 0.06034030020236969, + "eval_runtime": 62.8066, + "eval_samples_per_second": 14.33, + "eval_steps_per_second": 7.165, + "eval_token_acc": 0.9611965178736803, + "step": 900 + }, + { + "epoch": 0.16251402918069585, + "grad_norm": 0.4104420244693756, + "learning_rate": 9.658310480770083e-06, + "loss": 0.059364324808120726, + "memory(GiB)": 17.38, + "step": 905, + "token_acc": 0.9634562841530054, + "train_speed(iter/s)": 0.204165 + }, + { + "epoch": 0.16341189674523007, + "grad_norm": 0.43599995970726013, + "learning_rate": 9.652894680185902e-06, + "loss": 0.060947751998901366, + "memory(GiB)": 17.38, + "step": 910, + "token_acc": 0.9578059071729957, + "train_speed(iter/s)": 0.20434 + }, + { + "epoch": 0.16430976430976432, + "grad_norm": 0.6397150158882141, + "learning_rate": 9.647437838829579e-06, + "loss": 0.07662461996078491, + "memory(GiB)": 17.38, + "step": 915, + "token_acc": 0.9600840336134454, + "train_speed(iter/s)": 0.204513 + }, + { + "epoch": 0.16520763187429854, + "grad_norm": 0.3698771595954895, + "learning_rate": 9.641940004833078e-06, + "loss": 0.06173065900802612, + "memory(GiB)": 17.38, + "step": 920, + "token_acc": 0.9556962025316456, + "train_speed(iter/s)": 0.204683 + }, + { + "epoch": 0.16610549943883277, + "grad_norm": 0.3027670383453369, + "learning_rate": 9.636401226689945e-06, + "loss": 0.05950305461883545, + "memory(GiB)": 17.38, + "step": 925, + "token_acc": 0.9625, + "train_speed(iter/s)": 0.204856 + }, + { + "epoch": 0.16700336700336701, + "grad_norm": 0.2766636908054352, + "learning_rate": 9.63082155325487e-06, + "loss": 0.057304519414901736, + "memory(GiB)": 17.38, + "step": 930, + "token_acc": 0.9721030042918455, + "train_speed(iter/s)": 0.205025 + }, + { + "epoch": 0.16790123456790124, + "grad_norm": 0.35388001799583435, + "learning_rate": 9.625201033743262e-06, + "loss": 0.07434110045433044, + "memory(GiB)": 17.38, + "step": 935, + "token_acc": 0.9539748953974896, + "train_speed(iter/s)": 0.205192 + }, + { + "epoch": 0.16879910213243546, + "grad_norm": 0.36312925815582275, + "learning_rate": 9.619539717730806e-06, + "loss": 0.05914363861083984, + "memory(GiB)": 17.38, + "step": 940, + "token_acc": 0.9691358024691358, + "train_speed(iter/s)": 0.205362 + }, + { + "epoch": 0.1696969696969697, + "grad_norm": 0.6041818857192993, + "learning_rate": 9.613837655153041e-06, + "loss": 0.06174136400222778, + "memory(GiB)": 17.38, + "step": 945, + "token_acc": 0.9604166666666667, + "train_speed(iter/s)": 0.205528 + }, + { + "epoch": 0.17059483726150393, + "grad_norm": 0.7084026336669922, + "learning_rate": 9.608094896304902e-06, + "loss": 0.07253472805023194, + "memory(GiB)": 17.38, + "step": 950, + "token_acc": 0.9556962025316456, + "train_speed(iter/s)": 0.205693 + }, + { + "epoch": 0.17149270482603815, + "grad_norm": 0.25087109208106995, + "learning_rate": 9.60231149184029e-06, + "loss": 0.05871572494506836, + "memory(GiB)": 17.38, + "step": 955, + "token_acc": 0.9645833333333333, + "train_speed(iter/s)": 0.205857 + }, + { + "epoch": 0.1723905723905724, + "grad_norm": 0.46449679136276245, + "learning_rate": 9.596487492771619e-06, + "loss": 0.061652785539627074, + "memory(GiB)": 17.38, + "step": 960, + "token_acc": 0.9729166666666667, + "train_speed(iter/s)": 0.206017 + }, + { + "epoch": 0.17328843995510662, + "grad_norm": 0.2851842939853668, + "learning_rate": 9.590622950469361e-06, + "loss": 0.060762083530426024, + "memory(GiB)": 17.38, + "step": 965, + "token_acc": 0.9567901234567902, + "train_speed(iter/s)": 0.206178 + }, + { + "epoch": 0.17418630751964084, + "grad_norm": 0.43767955899238586, + "learning_rate": 9.584717916661608e-06, + "loss": 0.05893281102180481, + "memory(GiB)": 17.38, + "step": 970, + "token_acc": 0.9585062240663901, + "train_speed(iter/s)": 0.206341 + }, + { + "epoch": 0.1750841750841751, + "grad_norm": 0.2510776221752167, + "learning_rate": 9.5787724434336e-06, + "loss": 0.06013035774230957, + "memory(GiB)": 17.38, + "step": 975, + "token_acc": 0.9672131147540983, + "train_speed(iter/s)": 0.206502 + }, + { + "epoch": 0.1759820426487093, + "grad_norm": 0.6884217262268066, + "learning_rate": 9.572786583227278e-06, + "loss": 0.07722243666648865, + "memory(GiB)": 17.38, + "step": 980, + "token_acc": 0.9586776859504132, + "train_speed(iter/s)": 0.206657 + }, + { + "epoch": 0.17687991021324354, + "grad_norm": 0.3164050877094269, + "learning_rate": 9.566760388840808e-06, + "loss": 0.05833897590637207, + "memory(GiB)": 17.38, + "step": 985, + "token_acc": 0.9626556016597511, + "train_speed(iter/s)": 0.206811 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 0.22375927865505219, + "learning_rate": 9.56069391342813e-06, + "loss": 0.06287506818771363, + "memory(GiB)": 17.38, + "step": 990, + "token_acc": 0.9609053497942387, + "train_speed(iter/s)": 0.206963 + }, + { + "epoch": 0.178675645342312, + "grad_norm": 0.8741426467895508, + "learning_rate": 9.554587210498476e-06, + "loss": 0.07396726608276367, + "memory(GiB)": 17.38, + "step": 995, + "token_acc": 0.9554655870445344, + "train_speed(iter/s)": 0.207118 + }, + { + "epoch": 0.17957351290684623, + "grad_norm": 0.29830697178840637, + "learning_rate": 9.54844033391591e-06, + "loss": 0.06164867281913757, + "memory(GiB)": 17.38, + "step": 1000, + "token_acc": 0.9659574468085106, + "train_speed(iter/s)": 0.207274 + }, + { + "epoch": 0.17957351290684623, + "eval_loss": 0.06104493513703346, + "eval_runtime": 61.1371, + "eval_samples_per_second": 14.721, + "eval_steps_per_second": 7.361, + "eval_token_acc": 0.9565660307464345, + "step": 1000 + }, + { + "epoch": 0.18047138047138048, + "grad_norm": 0.23519226908683777, + "learning_rate": 9.542253337898844e-06, + "loss": 0.05950922966003418, + "memory(GiB)": 17.38, + "step": 1005, + "token_acc": 0.9549180327868853, + "train_speed(iter/s)": 0.204488 + }, + { + "epoch": 0.1813692480359147, + "grad_norm": 0.2536954879760742, + "learning_rate": 9.536026277019562e-06, + "loss": 0.06263896822929382, + "memory(GiB)": 17.38, + "step": 1010, + "token_acc": 0.9487704918032787, + "train_speed(iter/s)": 0.204652 + }, + { + "epoch": 0.18226711560044892, + "grad_norm": 1.2299308776855469, + "learning_rate": 9.529759206203738e-06, + "loss": 0.06158524751663208, + "memory(GiB)": 17.38, + "step": 1015, + "token_acc": 0.9567901234567902, + "train_speed(iter/s)": 0.204808 + }, + { + "epoch": 0.18316498316498317, + "grad_norm": 0.2919534146785736, + "learning_rate": 9.523452180729958e-06, + "loss": 0.05624125003814697, + "memory(GiB)": 17.38, + "step": 1020, + "token_acc": 0.9648760330578512, + "train_speed(iter/s)": 0.204965 + }, + { + "epoch": 0.1840628507295174, + "grad_norm": 3.7201414108276367, + "learning_rate": 9.517105256229224e-06, + "loss": 0.06964196562767029, + "memory(GiB)": 17.38, + "step": 1025, + "token_acc": 0.9549356223175965, + "train_speed(iter/s)": 0.205117 + }, + { + "epoch": 0.18496071829405164, + "grad_norm": 0.22875644266605377, + "learning_rate": 9.510718488684467e-06, + "loss": 0.06508823633193969, + "memory(GiB)": 17.38, + "step": 1030, + "token_acc": 0.9612068965517241, + "train_speed(iter/s)": 0.20527 + }, + { + "epoch": 0.18585858585858586, + "grad_norm": 0.9431266784667969, + "learning_rate": 9.504291934430054e-06, + "loss": 0.059895122051239015, + "memory(GiB)": 17.38, + "step": 1035, + "token_acc": 0.9506172839506173, + "train_speed(iter/s)": 0.205426 + }, + { + "epoch": 0.18675645342312008, + "grad_norm": 0.33424726128578186, + "learning_rate": 9.497825650151286e-06, + "loss": 0.06327733397483826, + "memory(GiB)": 17.38, + "step": 1040, + "token_acc": 0.9600840336134454, + "train_speed(iter/s)": 0.205577 + }, + { + "epoch": 0.18765432098765433, + "grad_norm": 0.24912895262241364, + "learning_rate": 9.491319692883907e-06, + "loss": 0.061274147033691405, + "memory(GiB)": 17.38, + "step": 1045, + "token_acc": 0.9565217391304348, + "train_speed(iter/s)": 0.205723 + }, + { + "epoch": 0.18855218855218855, + "grad_norm": 0.2653322219848633, + "learning_rate": 9.484774120013593e-06, + "loss": 0.061475610733032225, + "memory(GiB)": 17.38, + "step": 1050, + "token_acc": 0.9549356223175965, + "train_speed(iter/s)": 0.205871 + }, + { + "epoch": 0.18945005611672278, + "grad_norm": 0.34487950801849365, + "learning_rate": 9.47818898927545e-06, + "loss": 0.05869110226631165, + "memory(GiB)": 17.38, + "step": 1055, + "token_acc": 0.9642857142857143, + "train_speed(iter/s)": 0.206008 + }, + { + "epoch": 0.19034792368125703, + "grad_norm": 0.34836098551750183, + "learning_rate": 9.471564358753502e-06, + "loss": 0.059927338361740114, + "memory(GiB)": 17.38, + "step": 1060, + "token_acc": 0.9725738396624473, + "train_speed(iter/s)": 0.206147 + }, + { + "epoch": 0.19124579124579125, + "grad_norm": 0.2456379532814026, + "learning_rate": 9.464900286880181e-06, + "loss": 0.061039865016937256, + "memory(GiB)": 17.38, + "step": 1065, + "token_acc": 0.9645833333333333, + "train_speed(iter/s)": 0.20629 + }, + { + "epoch": 0.19214365881032547, + "grad_norm": 0.7069658637046814, + "learning_rate": 9.458196832435811e-06, + "loss": 0.058842587471008304, + "memory(GiB)": 17.38, + "step": 1070, + "token_acc": 0.9551020408163265, + "train_speed(iter/s)": 0.206432 + }, + { + "epoch": 0.19304152637485972, + "grad_norm": 2.046337842941284, + "learning_rate": 9.45145405454809e-06, + "loss": 0.060351550579071045, + "memory(GiB)": 17.38, + "step": 1075, + "token_acc": 0.9583333333333334, + "train_speed(iter/s)": 0.206578 + }, + { + "epoch": 0.19393939393939394, + "grad_norm": 0.5140432119369507, + "learning_rate": 9.444672012691564e-06, + "loss": 0.06441909074783325, + "memory(GiB)": 17.38, + "step": 1080, + "token_acc": 0.9583333333333334, + "train_speed(iter/s)": 0.206719 + }, + { + "epoch": 0.19483726150392816, + "grad_norm": 0.1938686966896057, + "learning_rate": 9.437850766687112e-06, + "loss": 0.05462031364440918, + "memory(GiB)": 17.38, + "step": 1085, + "token_acc": 0.966804979253112, + "train_speed(iter/s)": 0.206867 + }, + { + "epoch": 0.1957351290684624, + "grad_norm": 0.25077563524246216, + "learning_rate": 9.430990376701406e-06, + "loss": 0.06830579042434692, + "memory(GiB)": 17.38, + "step": 1090, + "token_acc": 0.9591836734693877, + "train_speed(iter/s)": 0.207005 + }, + { + "epoch": 0.19663299663299663, + "grad_norm": 0.25749671459198, + "learning_rate": 9.424090903246392e-06, + "loss": 0.06092467308044434, + "memory(GiB)": 17.38, + "step": 1095, + "token_acc": 0.9642857142857143, + "train_speed(iter/s)": 0.207142 + }, + { + "epoch": 0.19753086419753085, + "grad_norm": 0.30902498960494995, + "learning_rate": 9.417152407178747e-06, + "loss": 0.05994751453399658, + "memory(GiB)": 17.38, + "step": 1100, + "token_acc": 0.9506172839506173, + "train_speed(iter/s)": 0.207279 + }, + { + "epoch": 0.19753086419753085, + "eval_loss": 0.061332765966653824, + "eval_runtime": 61.2586, + "eval_samples_per_second": 14.692, + "eval_steps_per_second": 7.346, + "eval_token_acc": 0.958233006112243, + "step": 1100 + }, + { + "epoch": 0.1984287317620651, + "grad_norm": 0.45211002230644226, + "learning_rate": 9.410174949699352e-06, + "loss": 0.058654028177261355, + "memory(GiB)": 17.38, + "step": 1105, + "token_acc": 0.9584473324213406, + "train_speed(iter/s)": 0.204734 + }, + { + "epoch": 0.19932659932659932, + "grad_norm": 0.42476797103881836, + "learning_rate": 9.403158592352739e-06, + "loss": 0.06276293992996215, + "memory(GiB)": 17.38, + "step": 1110, + "token_acc": 0.9571428571428572, + "train_speed(iter/s)": 0.204877 + }, + { + "epoch": 0.20022446689113355, + "grad_norm": 0.20244072377681732, + "learning_rate": 9.396103397026561e-06, + "loss": 0.05925884246826172, + "memory(GiB)": 17.38, + "step": 1115, + "token_acc": 0.9566115702479339, + "train_speed(iter/s)": 0.205021 + }, + { + "epoch": 0.2011223344556678, + "grad_norm": 0.8865803480148315, + "learning_rate": 9.389009425951038e-06, + "loss": 0.058167487382888794, + "memory(GiB)": 17.38, + "step": 1120, + "token_acc": 0.9616935483870968, + "train_speed(iter/s)": 0.205163 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.3987865149974823, + "learning_rate": 9.381876741698414e-06, + "loss": 0.060905849933624266, + "memory(GiB)": 17.38, + "step": 1125, + "token_acc": 0.9633620689655172, + "train_speed(iter/s)": 0.205288 + }, + { + "epoch": 0.20291806958473624, + "grad_norm": 0.34117624163627625, + "learning_rate": 9.374705407182396e-06, + "loss": 0.057905805110931394, + "memory(GiB)": 17.38, + "step": 1130, + "token_acc": 0.9615384615384616, + "train_speed(iter/s)": 0.205427 + }, + { + "epoch": 0.2038159371492705, + "grad_norm": 1.16488778591156, + "learning_rate": 9.367495485657612e-06, + "loss": 0.06035115718841553, + "memory(GiB)": 17.38, + "step": 1135, + "token_acc": 0.9558823529411765, + "train_speed(iter/s)": 0.205565 + }, + { + "epoch": 0.2047138047138047, + "grad_norm": 0.2598137855529785, + "learning_rate": 9.36024704071904e-06, + "loss": 0.06316685676574707, + "memory(GiB)": 17.38, + "step": 1140, + "token_acc": 0.9555084745762712, + "train_speed(iter/s)": 0.2057 + }, + { + "epoch": 0.20561167227833896, + "grad_norm": 0.3389371335506439, + "learning_rate": 9.352960136301454e-06, + "loss": 0.05683580636978149, + "memory(GiB)": 17.38, + "step": 1145, + "token_acc": 0.961864406779661, + "train_speed(iter/s)": 0.205837 + }, + { + "epoch": 0.20650953984287318, + "grad_norm": 12.365500450134277, + "learning_rate": 9.345634836678866e-06, + "loss": 0.0674280047416687, + "memory(GiB)": 17.38, + "step": 1150, + "token_acc": 0.9551020408163265, + "train_speed(iter/s)": 0.205969 + }, + { + "epoch": 0.2074074074074074, + "grad_norm": 0.4306594431400299, + "learning_rate": 9.338271206463943e-06, + "loss": 0.05808438658714295, + "memory(GiB)": 17.38, + "step": 1155, + "token_acc": 0.9573170731707317, + "train_speed(iter/s)": 0.2061 + }, + { + "epoch": 0.20830527497194165, + "grad_norm": 0.4316254258155823, + "learning_rate": 9.33086931060745e-06, + "loss": 0.060810518264770505, + "memory(GiB)": 17.38, + "step": 1160, + "token_acc": 0.9552845528455285, + "train_speed(iter/s)": 0.206235 + }, + { + "epoch": 0.20920314253647587, + "grad_norm": 0.3063175678253174, + "learning_rate": 9.323429214397677e-06, + "loss": 0.05965996980667114, + "memory(GiB)": 17.38, + "step": 1165, + "token_acc": 0.9532520325203252, + "train_speed(iter/s)": 0.206366 + }, + { + "epoch": 0.2101010101010101, + "grad_norm": 0.33290836215019226, + "learning_rate": 9.315950983459853e-06, + "loss": 0.06026350259780884, + "memory(GiB)": 17.38, + "step": 1170, + "token_acc": 0.9601593625498008, + "train_speed(iter/s)": 0.206494 + }, + { + "epoch": 0.21099887766554434, + "grad_norm": 0.5707617998123169, + "learning_rate": 9.308434683755576e-06, + "loss": 0.05856707096099854, + "memory(GiB)": 17.38, + "step": 1175, + "token_acc": 0.9539748953974896, + "train_speed(iter/s)": 0.206622 + }, + { + "epoch": 0.21189674523007856, + "grad_norm": 0.44109290838241577, + "learning_rate": 9.300880381582234e-06, + "loss": 0.06187763214111328, + "memory(GiB)": 17.38, + "step": 1180, + "token_acc": 0.9631147540983607, + "train_speed(iter/s)": 0.206747 + }, + { + "epoch": 0.2127946127946128, + "grad_norm": 0.40886279940605164, + "learning_rate": 9.293288143572405e-06, + "loss": 0.06144980192184448, + "memory(GiB)": 17.38, + "step": 1185, + "token_acc": 0.9574468085106383, + "train_speed(iter/s)": 0.20687 + }, + { + "epoch": 0.21369248035914704, + "grad_norm": 1.1169712543487549, + "learning_rate": 9.285658036693291e-06, + "loss": 0.06811643838882446, + "memory(GiB)": 17.38, + "step": 1190, + "token_acc": 0.9493670886075949, + "train_speed(iter/s)": 0.206995 + }, + { + "epoch": 0.21459034792368126, + "grad_norm": 1.602323293685913, + "learning_rate": 9.277990128246103e-06, + "loss": 0.05875219106674194, + "memory(GiB)": 17.38, + "step": 1195, + "token_acc": 0.9650205761316872, + "train_speed(iter/s)": 0.207119 + }, + { + "epoch": 0.21548821548821548, + "grad_norm": 0.3107532262802124, + "learning_rate": 9.270284485865493e-06, + "loss": 0.05820940732955933, + "memory(GiB)": 17.38, + "step": 1200, + "token_acc": 0.9705882352941176, + "train_speed(iter/s)": 0.207244 + }, + { + "epoch": 0.21548821548821548, + "eval_loss": 0.061315201222896576, + "eval_runtime": 62.3154, + "eval_samples_per_second": 14.443, + "eval_steps_per_second": 7.221, + "eval_token_acc": 0.9594369327653269, + "step": 1200 + }, + { + "epoch": 0.21638608305274973, + "grad_norm": 0.692822277545929, + "learning_rate": 9.262541177518936e-06, + "loss": 0.05642242431640625, + "memory(GiB)": 17.38, + "step": 1205, + "token_acc": 0.9600818833162743, + "train_speed(iter/s)": 0.204877 + }, + { + "epoch": 0.21728395061728395, + "grad_norm": 1.1112533807754517, + "learning_rate": 9.25476027150614e-06, + "loss": 0.0682375431060791, + "memory(GiB)": 17.38, + "step": 1210, + "token_acc": 0.9524793388429752, + "train_speed(iter/s)": 0.205007 + }, + { + "epoch": 0.21818181818181817, + "grad_norm": 0.40480947494506836, + "learning_rate": 9.246941836458442e-06, + "loss": 0.06561314463615417, + "memory(GiB)": 17.38, + "step": 1215, + "token_acc": 0.9581589958158996, + "train_speed(iter/s)": 0.205136 + }, + { + "epoch": 0.21907968574635242, + "grad_norm": 0.17906655371189117, + "learning_rate": 9.239085941338211e-06, + "loss": 0.059771674871444705, + "memory(GiB)": 17.38, + "step": 1220, + "token_acc": 0.9572649572649573, + "train_speed(iter/s)": 0.205266 + }, + { + "epoch": 0.21997755331088664, + "grad_norm": 0.2035798728466034, + "learning_rate": 9.231192655438222e-06, + "loss": 0.06261578798294068, + "memory(GiB)": 17.38, + "step": 1225, + "token_acc": 0.9556962025316456, + "train_speed(iter/s)": 0.205397 + }, + { + "epoch": 0.22087542087542086, + "grad_norm": 0.2759051024913788, + "learning_rate": 9.223262048381056e-06, + "loss": 0.056782233715057376, + "memory(GiB)": 17.38, + "step": 1230, + "token_acc": 0.9705882352941176, + "train_speed(iter/s)": 0.205518 + }, + { + "epoch": 0.2217732884399551, + "grad_norm": 0.418022096157074, + "learning_rate": 9.215294190118491e-06, + "loss": 0.0618687093257904, + "memory(GiB)": 17.38, + "step": 1235, + "token_acc": 0.9516806722689075, + "train_speed(iter/s)": 0.205645 + }, + { + "epoch": 0.22267115600448933, + "grad_norm": 0.5110579133033752, + "learning_rate": 9.207289150930874e-06, + "loss": 0.0632178008556366, + "memory(GiB)": 17.38, + "step": 1240, + "token_acc": 0.9536290322580645, + "train_speed(iter/s)": 0.205773 + }, + { + "epoch": 0.22356902356902356, + "grad_norm": 0.7280980944633484, + "learning_rate": 9.199247001426504e-06, + "loss": 0.058500027656555174, + "memory(GiB)": 17.38, + "step": 1245, + "token_acc": 0.9595141700404858, + "train_speed(iter/s)": 0.205897 + }, + { + "epoch": 0.2244668911335578, + "grad_norm": 0.6378828883171082, + "learning_rate": 9.191167812541016e-06, + "loss": 0.06280182003974914, + "memory(GiB)": 17.38, + "step": 1250, + "token_acc": 0.9605809128630706, + "train_speed(iter/s)": 0.206021 + }, + { + "epoch": 0.22536475869809203, + "grad_norm": 0.7098469734191895, + "learning_rate": 9.183051655536744e-06, + "loss": 0.056632041931152344, + "memory(GiB)": 17.38, + "step": 1255, + "token_acc": 0.9639830508474576, + "train_speed(iter/s)": 0.206144 + }, + { + "epoch": 0.22626262626262628, + "grad_norm": 3.889753580093384, + "learning_rate": 9.174898602002105e-06, + "loss": 0.06656032800674438, + "memory(GiB)": 17.38, + "step": 1260, + "token_acc": 0.9504132231404959, + "train_speed(iter/s)": 0.206266 + }, + { + "epoch": 0.2271604938271605, + "grad_norm": 1.8271089792251587, + "learning_rate": 9.166708723850957e-06, + "loss": 0.06816760301589966, + "memory(GiB)": 17.38, + "step": 1265, + "token_acc": 0.9461206896551724, + "train_speed(iter/s)": 0.206383 + }, + { + "epoch": 0.22805836139169472, + "grad_norm": 3.842327356338501, + "learning_rate": 9.15848209332197e-06, + "loss": 0.07103725671768188, + "memory(GiB)": 17.38, + "step": 1270, + "token_acc": 0.9623430962343096, + "train_speed(iter/s)": 0.2065 + }, + { + "epoch": 0.22895622895622897, + "grad_norm": 0.42257869243621826, + "learning_rate": 9.150218782977987e-06, + "loss": 0.06250607967376709, + "memory(GiB)": 17.38, + "step": 1275, + "token_acc": 0.9547325102880658, + "train_speed(iter/s)": 0.206621 + }, + { + "epoch": 0.2298540965207632, + "grad_norm": 0.29388418793678284, + "learning_rate": 9.141918865705384e-06, + "loss": 0.05651596188545227, + "memory(GiB)": 17.38, + "step": 1280, + "token_acc": 0.9607438016528925, + "train_speed(iter/s)": 0.206735 + }, + { + "epoch": 0.2307519640852974, + "grad_norm": 0.8813516497612, + "learning_rate": 9.133582414713434e-06, + "loss": 0.061053109169006345, + "memory(GiB)": 17.38, + "step": 1285, + "token_acc": 0.962, + "train_speed(iter/s)": 0.206855 + }, + { + "epoch": 0.23164983164983166, + "grad_norm": 0.3710384666919708, + "learning_rate": 9.125209503533645e-06, + "loss": 0.06351611614227295, + "memory(GiB)": 17.38, + "step": 1290, + "token_acc": 0.9533898305084746, + "train_speed(iter/s)": 0.206971 + }, + { + "epoch": 0.23254769921436588, + "grad_norm": 0.6008475422859192, + "learning_rate": 9.116800206019127e-06, + "loss": 0.05928425788879395, + "memory(GiB)": 17.38, + "step": 1295, + "token_acc": 0.9489795918367347, + "train_speed(iter/s)": 0.207088 + }, + { + "epoch": 0.2334455667789001, + "grad_norm": 0.7750935554504395, + "learning_rate": 9.108354596343938e-06, + "loss": 0.05974637866020203, + "memory(GiB)": 17.38, + "step": 1300, + "token_acc": 0.9595141700404858, + "train_speed(iter/s)": 0.207201 + }, + { + "epoch": 0.2334455667789001, + "eval_loss": 0.062158890068531036, + "eval_runtime": 61.5409, + "eval_samples_per_second": 14.624, + "eval_steps_per_second": 7.312, + "eval_token_acc": 0.9603630301907761, + "step": 1300 + }, + { + "epoch": 0.23434343434343435, + "grad_norm": 0.42738351225852966, + "learning_rate": 9.099872749002418e-06, + "loss": 0.06223499774932861, + "memory(GiB)": 17.38, + "step": 1305, + "token_acc": 0.9606837606837607, + "train_speed(iter/s)": 0.205038 + }, + { + "epoch": 0.23524130190796858, + "grad_norm": 2.978898286819458, + "learning_rate": 9.09135473880855e-06, + "loss": 0.05798090100288391, + "memory(GiB)": 17.38, + "step": 1310, + "token_acc": 0.9665271966527197, + "train_speed(iter/s)": 0.205161 + }, + { + "epoch": 0.2361391694725028, + "grad_norm": 0.3358847498893738, + "learning_rate": 9.082800640895286e-06, + "loss": 0.05697367787361145, + "memory(GiB)": 17.38, + "step": 1315, + "token_acc": 0.9651639344262295, + "train_speed(iter/s)": 0.205278 + }, + { + "epoch": 0.23703703703703705, + "grad_norm": 0.32103848457336426, + "learning_rate": 9.07421053071389e-06, + "loss": 0.06541591882705688, + "memory(GiB)": 17.38, + "step": 1320, + "token_acc": 0.9526748971193416, + "train_speed(iter/s)": 0.205399 + }, + { + "epoch": 0.23793490460157127, + "grad_norm": 0.22060243785381317, + "learning_rate": 9.065584484033267e-06, + "loss": 0.062460148334503175, + "memory(GiB)": 17.38, + "step": 1325, + "token_acc": 0.9537815126050421, + "train_speed(iter/s)": 0.205513 + }, + { + "epoch": 0.2388327721661055, + "grad_norm": 0.43442702293395996, + "learning_rate": 9.056922576939307e-06, + "loss": 0.058587956428527835, + "memory(GiB)": 17.38, + "step": 1330, + "token_acc": 0.95, + "train_speed(iter/s)": 0.205626 + }, + { + "epoch": 0.23973063973063974, + "grad_norm": 0.336465060710907, + "learning_rate": 9.048224885834203e-06, + "loss": 0.062299489974975586, + "memory(GiB)": 17.38, + "step": 1335, + "token_acc": 0.9493670886075949, + "train_speed(iter/s)": 0.205739 + }, + { + "epoch": 0.24062850729517396, + "grad_norm": 0.4352080523967743, + "learning_rate": 9.039491487435778e-06, + "loss": 0.05664920210838318, + "memory(GiB)": 17.38, + "step": 1340, + "token_acc": 0.9556962025316456, + "train_speed(iter/s)": 0.205851 + }, + { + "epoch": 0.24152637485970818, + "grad_norm": 0.7088451981544495, + "learning_rate": 9.030722458776815e-06, + "loss": 0.05996109247207641, + "memory(GiB)": 17.38, + "step": 1345, + "token_acc": 0.9526748971193416, + "train_speed(iter/s)": 0.205962 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 3.2741127014160156, + "learning_rate": 9.021917877204367e-06, + "loss": 0.057466554641723636, + "memory(GiB)": 17.38, + "step": 1350, + "token_acc": 0.9573170731707317, + "train_speed(iter/s)": 0.206061 + }, + { + "epoch": 0.24332210998877665, + "grad_norm": 0.5009698271751404, + "learning_rate": 9.013077820379086e-06, + "loss": 0.05917593240737915, + "memory(GiB)": 17.38, + "step": 1355, + "token_acc": 0.9616935483870968, + "train_speed(iter/s)": 0.206175 + }, + { + "epoch": 0.24421997755331087, + "grad_norm": 0.4095957279205322, + "learning_rate": 9.00420236627453e-06, + "loss": 0.0621448278427124, + "memory(GiB)": 17.38, + "step": 1360, + "token_acc": 0.9659574468085106, + "train_speed(iter/s)": 0.206286 + }, + { + "epoch": 0.24511784511784512, + "grad_norm": 0.5142233967781067, + "learning_rate": 8.995291593176482e-06, + "loss": 0.05982544422149658, + "memory(GiB)": 17.38, + "step": 1365, + "token_acc": 0.9539748953974896, + "train_speed(iter/s)": 0.206397 + }, + { + "epoch": 0.24601571268237934, + "grad_norm": 0.23607836663722992, + "learning_rate": 8.98634557968225e-06, + "loss": 0.05764271020889282, + "memory(GiB)": 17.38, + "step": 1370, + "token_acc": 0.9545454545454546, + "train_speed(iter/s)": 0.206505 + }, + { + "epoch": 0.24691358024691357, + "grad_norm": 0.35170498490333557, + "learning_rate": 8.97736440469998e-06, + "loss": 0.06106231808662414, + "memory(GiB)": 17.38, + "step": 1375, + "token_acc": 0.9535864978902954, + "train_speed(iter/s)": 0.20661 + }, + { + "epoch": 0.24781144781144782, + "grad_norm": 0.40215885639190674, + "learning_rate": 8.968348147447965e-06, + "loss": 0.05974069833755493, + "memory(GiB)": 17.38, + "step": 1380, + "token_acc": 0.9623430962343096, + "train_speed(iter/s)": 0.206719 + }, + { + "epoch": 0.24870931537598204, + "grad_norm": 0.4953094720840454, + "learning_rate": 8.959296887453935e-06, + "loss": 0.05892341136932373, + "memory(GiB)": 17.38, + "step": 1385, + "token_acc": 0.9586776859504132, + "train_speed(iter/s)": 0.206827 + }, + { + "epoch": 0.2496071829405163, + "grad_norm": 0.5564194321632385, + "learning_rate": 8.950210704554364e-06, + "loss": 0.06049641966819763, + "memory(GiB)": 17.38, + "step": 1390, + "token_acc": 0.9541666666666667, + "train_speed(iter/s)": 0.206934 + }, + { + "epoch": 0.2505050505050505, + "grad_norm": 0.4139876663684845, + "learning_rate": 8.941089678893758e-06, + "loss": 0.05883210897445679, + "memory(GiB)": 17.38, + "step": 1395, + "token_acc": 0.9574468085106383, + "train_speed(iter/s)": 0.207039 + }, + { + "epoch": 0.25140291806958476, + "grad_norm": 1.0044163465499878, + "learning_rate": 8.93193389092396e-06, + "loss": 0.05726579427719116, + "memory(GiB)": 17.38, + "step": 1400, + "token_acc": 0.9729166666666667, + "train_speed(iter/s)": 0.207147 + }, + { + "epoch": 0.25140291806958476, + "eval_loss": 0.06149906665086746, + "eval_runtime": 61.7156, + "eval_samples_per_second": 14.583, + "eval_steps_per_second": 7.292, + "eval_token_acc": 0.9578625671420633, + "step": 1400 + }, + { + "epoch": 0.25230078563411895, + "grad_norm": 1.0752087831497192, + "learning_rate": 8.922743421403427e-06, + "loss": 0.06310935020446777, + "memory(GiB)": 17.38, + "step": 1405, + "token_acc": 0.9565663474692202, + "train_speed(iter/s)": 0.20511 + }, + { + "epoch": 0.2531986531986532, + "grad_norm": 0.4143998324871063, + "learning_rate": 8.91351835139653e-06, + "loss": 0.055285751819610596, + "memory(GiB)": 17.38, + "step": 1410, + "token_acc": 0.9650205761316872, + "train_speed(iter/s)": 0.205215 + }, + { + "epoch": 0.25409652076318745, + "grad_norm": 0.5342971086502075, + "learning_rate": 8.904258762272829e-06, + "loss": 0.060773026943206784, + "memory(GiB)": 17.38, + "step": 1415, + "token_acc": 0.9625, + "train_speed(iter/s)": 0.205323 + }, + { + "epoch": 0.25499438832772164, + "grad_norm": 0.9748996496200562, + "learning_rate": 8.894964735706357e-06, + "loss": 0.06706810593605042, + "memory(GiB)": 17.38, + "step": 1420, + "token_acc": 0.9519650655021834, + "train_speed(iter/s)": 0.205421 + }, + { + "epoch": 0.2558922558922559, + "grad_norm": 0.5542710423469543, + "learning_rate": 8.885636353674908e-06, + "loss": 0.05943622589111328, + "memory(GiB)": 17.38, + "step": 1425, + "token_acc": 0.950207468879668, + "train_speed(iter/s)": 0.205526 + }, + { + "epoch": 0.25679012345679014, + "grad_norm": 0.40452125668525696, + "learning_rate": 8.8762736984593e-06, + "loss": 0.06278891563415527, + "memory(GiB)": 17.38, + "step": 1430, + "token_acc": 0.9597457627118644, + "train_speed(iter/s)": 0.205632 + }, + { + "epoch": 0.25768799102132434, + "grad_norm": 0.3508487939834595, + "learning_rate": 8.86687685264267e-06, + "loss": 0.0584681510925293, + "memory(GiB)": 17.38, + "step": 1435, + "token_acc": 0.967391304347826, + "train_speed(iter/s)": 0.205741 + }, + { + "epoch": 0.2585858585858586, + "grad_norm": 0.38747382164001465, + "learning_rate": 8.857445899109716e-06, + "loss": 0.06145972013473511, + "memory(GiB)": 17.38, + "step": 1440, + "token_acc": 0.9578059071729957, + "train_speed(iter/s)": 0.205849 + }, + { + "epoch": 0.25948372615039283, + "grad_norm": 1.4630292654037476, + "learning_rate": 8.847980921045993e-06, + "loss": 0.06131004095077515, + "memory(GiB)": 17.38, + "step": 1445, + "token_acc": 0.9684873949579832, + "train_speed(iter/s)": 0.205958 + }, + { + "epoch": 0.26038159371492703, + "grad_norm": 0.24476726353168488, + "learning_rate": 8.838482001937167e-06, + "loss": 0.06051992177963257, + "memory(GiB)": 17.38, + "step": 1450, + "token_acc": 0.9617021276595744, + "train_speed(iter/s)": 0.206064 + }, + { + "epoch": 0.2612794612794613, + "grad_norm": 0.19611415266990662, + "learning_rate": 8.82894922556828e-06, + "loss": 0.061788105964660646, + "memory(GiB)": 17.38, + "step": 1455, + "token_acc": 0.9549180327868853, + "train_speed(iter/s)": 0.206168 + }, + { + "epoch": 0.2621773288439955, + "grad_norm": 0.2117372453212738, + "learning_rate": 8.819382676023012e-06, + "loss": 0.07761403918266296, + "memory(GiB)": 17.38, + "step": 1460, + "token_acc": 0.9670781893004116, + "train_speed(iter/s)": 0.20627 + }, + { + "epoch": 0.2630751964085297, + "grad_norm": 0.2789381444454193, + "learning_rate": 8.809782437682934e-06, + "loss": 0.06046514511108399, + "memory(GiB)": 17.38, + "step": 1465, + "token_acc": 0.9533898305084746, + "train_speed(iter/s)": 0.206373 + }, + { + "epoch": 0.26397306397306397, + "grad_norm": 0.28909796476364136, + "learning_rate": 8.800148595226774e-06, + "loss": 0.06368215680122376, + "memory(GiB)": 17.38, + "step": 1470, + "token_acc": 0.95625, + "train_speed(iter/s)": 0.206478 + }, + { + "epoch": 0.2648709315375982, + "grad_norm": 0.19754165410995483, + "learning_rate": 8.790481233629666e-06, + "loss": 0.07494029998779297, + "memory(GiB)": 17.38, + "step": 1475, + "token_acc": 0.9644351464435147, + "train_speed(iter/s)": 0.206577 + }, + { + "epoch": 0.2657687991021324, + "grad_norm": 0.2973647713661194, + "learning_rate": 8.78078043816239e-06, + "loss": 0.06123720407485962, + "memory(GiB)": 17.38, + "step": 1480, + "token_acc": 0.9508547008547008, + "train_speed(iter/s)": 0.206679 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 2.2255685329437256, + "learning_rate": 8.771046294390636e-06, + "loss": 0.05986815690994263, + "memory(GiB)": 17.38, + "step": 1485, + "token_acc": 0.9656652360515021, + "train_speed(iter/s)": 0.206778 + }, + { + "epoch": 0.2675645342312009, + "grad_norm": 0.35256877541542053, + "learning_rate": 8.761278888174244e-06, + "loss": 0.06030440926551819, + "memory(GiB)": 17.38, + "step": 1490, + "token_acc": 0.9572649572649573, + "train_speed(iter/s)": 0.206877 + }, + { + "epoch": 0.2684624017957351, + "grad_norm": 0.20107224583625793, + "learning_rate": 8.751478305666437e-06, + "loss": 0.05950585603713989, + "memory(GiB)": 17.38, + "step": 1495, + "token_acc": 0.9612068965517241, + "train_speed(iter/s)": 0.206974 + }, + { + "epoch": 0.26936026936026936, + "grad_norm": 3.832690715789795, + "learning_rate": 8.741644633313075e-06, + "loss": 0.06611365079879761, + "memory(GiB)": 17.38, + "step": 1500, + "token_acc": 0.9710743801652892, + "train_speed(iter/s)": 0.207071 + }, + { + "epoch": 0.26936026936026936, + "eval_loss": 0.06270831823348999, + "eval_runtime": 61.1515, + "eval_samples_per_second": 14.718, + "eval_steps_per_second": 7.359, + "eval_token_acc": 0.9591591035376922, + "step": 1500 + }, + { + "epoch": 0.2702581369248036, + "grad_norm": 0.20050875842571259, + "learning_rate": 8.731777957851886e-06, + "loss": 0.05926918983459473, + "memory(GiB)": 17.38, + "step": 1505, + "token_acc": 0.9596167008898016, + "train_speed(iter/s)": 0.20518 + }, + { + "epoch": 0.2711560044893378, + "grad_norm": 1.3584425449371338, + "learning_rate": 8.721878366311699e-06, + "loss": 0.05722533464431763, + "memory(GiB)": 17.38, + "step": 1510, + "token_acc": 0.9586776859504132, + "train_speed(iter/s)": 0.205284 + }, + { + "epoch": 0.27205387205387205, + "grad_norm": 1.3556839227676392, + "learning_rate": 8.711945946011676e-06, + "loss": 0.057417023181915286, + "memory(GiB)": 17.38, + "step": 1515, + "token_acc": 0.9518828451882845, + "train_speed(iter/s)": 0.20539 + }, + { + "epoch": 0.2729517396184063, + "grad_norm": 0.4494972229003906, + "learning_rate": 8.701980784560552e-06, + "loss": 0.05820035934448242, + "memory(GiB)": 17.38, + "step": 1520, + "token_acc": 0.9661016949152542, + "train_speed(iter/s)": 0.205493 + }, + { + "epoch": 0.2738496071829405, + "grad_norm": 0.4754307270050049, + "learning_rate": 8.69198296985585e-06, + "loss": 0.06051011681556702, + "memory(GiB)": 17.38, + "step": 1525, + "token_acc": 0.9442148760330579, + "train_speed(iter/s)": 0.205596 + }, + { + "epoch": 0.27474747474747474, + "grad_norm": 0.4620916247367859, + "learning_rate": 8.681952590083109e-06, + "loss": 0.05676969289779663, + "memory(GiB)": 17.38, + "step": 1530, + "token_acc": 0.9626556016597511, + "train_speed(iter/s)": 0.205698 + }, + { + "epoch": 0.275645342312009, + "grad_norm": 0.7325692772865295, + "learning_rate": 8.671889733715113e-06, + "loss": 0.062285888195037845, + "memory(GiB)": 17.38, + "step": 1535, + "token_acc": 0.9564315352697096, + "train_speed(iter/s)": 0.205793 + }, + { + "epoch": 0.2765432098765432, + "grad_norm": 0.3295556604862213, + "learning_rate": 8.6617944895111e-06, + "loss": 0.06592202186584473, + "memory(GiB)": 17.38, + "step": 1540, + "token_acc": 0.9609053497942387, + "train_speed(iter/s)": 0.205894 + }, + { + "epoch": 0.27744107744107743, + "grad_norm": 0.22279192507266998, + "learning_rate": 8.651666946515987e-06, + "loss": 0.06685788631439209, + "memory(GiB)": 17.38, + "step": 1545, + "token_acc": 0.9487179487179487, + "train_speed(iter/s)": 0.205991 + }, + { + "epoch": 0.2783389450056117, + "grad_norm": 0.4168628752231598, + "learning_rate": 8.64150719405958e-06, + "loss": 0.061379635334014894, + "memory(GiB)": 17.38, + "step": 1550, + "token_acc": 0.9586776859504132, + "train_speed(iter/s)": 0.206094 + }, + { + "epoch": 0.2792368125701459, + "grad_norm": 1.9790608882904053, + "learning_rate": 8.631315321755791e-06, + "loss": 0.05886937379837036, + "memory(GiB)": 17.38, + "step": 1555, + "token_acc": 0.9558232931726908, + "train_speed(iter/s)": 0.206193 + }, + { + "epoch": 0.2801346801346801, + "grad_norm": 0.2869833707809448, + "learning_rate": 8.62109141950184e-06, + "loss": 0.06456583738327026, + "memory(GiB)": 17.38, + "step": 1560, + "token_acc": 0.9472573839662447, + "train_speed(iter/s)": 0.20629 + }, + { + "epoch": 0.2810325476992144, + "grad_norm": 0.23892711102962494, + "learning_rate": 8.610835577477473e-06, + "loss": 0.06134705543518067, + "memory(GiB)": 17.38, + "step": 1565, + "token_acc": 0.96900826446281, + "train_speed(iter/s)": 0.206388 + }, + { + "epoch": 0.2819304152637486, + "grad_norm": 0.24749203026294708, + "learning_rate": 8.600547886144152e-06, + "loss": 0.05652496814727783, + "memory(GiB)": 17.38, + "step": 1570, + "token_acc": 0.9663865546218487, + "train_speed(iter/s)": 0.206484 + }, + { + "epoch": 0.2828282828282828, + "grad_norm": 0.4194713830947876, + "learning_rate": 8.590228436244273e-06, + "loss": 0.057959342002868654, + "memory(GiB)": 17.38, + "step": 1575, + "token_acc": 0.9625, + "train_speed(iter/s)": 0.206582 + }, + { + "epoch": 0.28372615039281707, + "grad_norm": 0.3685196042060852, + "learning_rate": 8.579877318800351e-06, + "loss": 0.0615118145942688, + "memory(GiB)": 17.38, + "step": 1580, + "token_acc": 0.95625, + "train_speed(iter/s)": 0.206677 + }, + { + "epoch": 0.2846240179573513, + "grad_norm": 0.2775591015815735, + "learning_rate": 8.569494625114232e-06, + "loss": 0.06615750789642334, + "memory(GiB)": 17.38, + "step": 1585, + "token_acc": 0.9532520325203252, + "train_speed(iter/s)": 0.206775 + }, + { + "epoch": 0.2855218855218855, + "grad_norm": 0.19464662671089172, + "learning_rate": 8.559080446766272e-06, + "loss": 0.0631313979625702, + "memory(GiB)": 17.38, + "step": 1590, + "token_acc": 0.9596774193548387, + "train_speed(iter/s)": 0.206872 + }, + { + "epoch": 0.28641975308641976, + "grad_norm": 0.2997286319732666, + "learning_rate": 8.54863487561454e-06, + "loss": 0.05590323805809021, + "memory(GiB)": 17.38, + "step": 1595, + "token_acc": 0.9662447257383966, + "train_speed(iter/s)": 0.206965 + }, + { + "epoch": 0.287317620650954, + "grad_norm": 0.1819065809249878, + "learning_rate": 8.538158003794005e-06, + "loss": 0.05923629999160766, + "memory(GiB)": 17.38, + "step": 1600, + "token_acc": 0.9637096774193549, + "train_speed(iter/s)": 0.207062 + }, + { + "epoch": 0.287317620650954, + "eval_loss": 0.06009458750486374, + "eval_runtime": 61.6117, + "eval_samples_per_second": 14.608, + "eval_steps_per_second": 7.304, + "eval_token_acc": 0.9612891276162252, + "step": 1600 + }, + { + "epoch": 0.2882154882154882, + "grad_norm": 0.18381528556346893, + "learning_rate": 8.527649923715726e-06, + "loss": 0.06964247822761535, + "memory(GiB)": 17.38, + "step": 1605, + "token_acc": 0.9635613612925404, + "train_speed(iter/s)": 0.205298 + }, + { + "epoch": 0.28911335578002245, + "grad_norm": 0.38830941915512085, + "learning_rate": 8.517110728066025e-06, + "loss": 0.06106977462768555, + "memory(GiB)": 17.38, + "step": 1610, + "token_acc": 0.9529914529914529, + "train_speed(iter/s)": 0.205394 + }, + { + "epoch": 0.2900112233445567, + "grad_norm": 0.19983626902103424, + "learning_rate": 8.506540509805689e-06, + "loss": 0.057277143001556396, + "memory(GiB)": 17.38, + "step": 1615, + "token_acc": 0.9665271966527197, + "train_speed(iter/s)": 0.205489 + }, + { + "epoch": 0.2909090909090909, + "grad_norm": 0.2314106971025467, + "learning_rate": 8.49593936216913e-06, + "loss": 0.0593514084815979, + "memory(GiB)": 17.38, + "step": 1620, + "token_acc": 0.9567901234567902, + "train_speed(iter/s)": 0.205584 + }, + { + "epoch": 0.29180695847362514, + "grad_norm": 0.686742901802063, + "learning_rate": 8.485307378663576e-06, + "loss": 0.05730807185173035, + "memory(GiB)": 17.38, + "step": 1625, + "token_acc": 0.9623430962343096, + "train_speed(iter/s)": 0.205669 + }, + { + "epoch": 0.2927048260381594, + "grad_norm": 0.3983691930770874, + "learning_rate": 8.474644653068247e-06, + "loss": 0.06406124234199524, + "memory(GiB)": 17.38, + "step": 1630, + "token_acc": 0.9571428571428572, + "train_speed(iter/s)": 0.205766 + }, + { + "epoch": 0.2936026936026936, + "grad_norm": 0.9178083539009094, + "learning_rate": 8.463951279433516e-06, + "loss": 0.055891549587249754, + "memory(GiB)": 17.38, + "step": 1635, + "token_acc": 0.9662447257383966, + "train_speed(iter/s)": 0.205861 + }, + { + "epoch": 0.29450056116722784, + "grad_norm": 0.53924161195755, + "learning_rate": 8.453227352080086e-06, + "loss": 0.05637087225914002, + "memory(GiB)": 17.38, + "step": 1640, + "token_acc": 0.9617021276595744, + "train_speed(iter/s)": 0.205954 + }, + { + "epoch": 0.2953984287317621, + "grad_norm": 0.7779171466827393, + "learning_rate": 8.442472965598167e-06, + "loss": 0.06116440296173096, + "memory(GiB)": 17.38, + "step": 1645, + "token_acc": 0.9648760330578512, + "train_speed(iter/s)": 0.206046 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 1.0439373254776, + "learning_rate": 8.431688214846624e-06, + "loss": 0.056550908088684085, + "memory(GiB)": 17.38, + "step": 1650, + "token_acc": 0.9626556016597511, + "train_speed(iter/s)": 0.206136 + }, + { + "epoch": 0.29719416386083053, + "grad_norm": 1.112012505531311, + "learning_rate": 8.420873194952153e-06, + "loss": 0.05412648916244507, + "memory(GiB)": 17.38, + "step": 1655, + "token_acc": 0.959349593495935, + "train_speed(iter/s)": 0.206228 + }, + { + "epoch": 0.2980920314253648, + "grad_norm": 0.6796290278434753, + "learning_rate": 8.41002800130844e-06, + "loss": 0.055440759658813475, + "memory(GiB)": 17.38, + "step": 1660, + "token_acc": 0.9733606557377049, + "train_speed(iter/s)": 0.206318 + }, + { + "epoch": 0.298989898989899, + "grad_norm": 0.7253136038780212, + "learning_rate": 8.399152729575315e-06, + "loss": 0.055655372142791745, + "memory(GiB)": 17.38, + "step": 1665, + "token_acc": 0.9647302904564315, + "train_speed(iter/s)": 0.206407 + }, + { + "epoch": 0.2998877665544332, + "grad_norm": 0.5415838360786438, + "learning_rate": 8.388247475677913e-06, + "loss": 0.05884740948677063, + "memory(GiB)": 17.38, + "step": 1670, + "token_acc": 0.9629629629629629, + "train_speed(iter/s)": 0.206499 + }, + { + "epoch": 0.30078563411896747, + "grad_norm": 0.9751707911491394, + "learning_rate": 8.377312335805829e-06, + "loss": 0.054012417793273926, + "memory(GiB)": 17.38, + "step": 1675, + "token_acc": 0.9651639344262295, + "train_speed(iter/s)": 0.20659 + }, + { + "epoch": 0.30168350168350166, + "grad_norm": 0.3519562780857086, + "learning_rate": 8.366347406412258e-06, + "loss": 0.057966434955596925, + "memory(GiB)": 17.38, + "step": 1680, + "token_acc": 0.9684873949579832, + "train_speed(iter/s)": 0.206678 + }, + { + "epoch": 0.3025813692480359, + "grad_norm": 0.5439744591712952, + "learning_rate": 8.355352784213164e-06, + "loss": 0.051614034175872806, + "memory(GiB)": 17.38, + "step": 1685, + "token_acc": 0.9651639344262295, + "train_speed(iter/s)": 0.206766 + }, + { + "epoch": 0.30347923681257016, + "grad_norm": 1.1293874979019165, + "learning_rate": 8.344328566186408e-06, + "loss": 0.06429520845413209, + "memory(GiB)": 17.38, + "step": 1690, + "token_acc": 0.9650205761316872, + "train_speed(iter/s)": 0.206856 + }, + { + "epoch": 0.30437710437710436, + "grad_norm": 0.4583297371864319, + "learning_rate": 8.333274849570901e-06, + "loss": 0.05393800735473633, + "memory(GiB)": 17.38, + "step": 1695, + "token_acc": 0.9686192468619247, + "train_speed(iter/s)": 0.206944 + }, + { + "epoch": 0.3052749719416386, + "grad_norm": 1.2167142629623413, + "learning_rate": 8.32219173186575e-06, + "loss": 0.05225977897644043, + "memory(GiB)": 17.38, + "step": 1700, + "token_acc": 0.96875, + "train_speed(iter/s)": 0.207031 + }, + { + "epoch": 0.3052749719416386, + "eval_loss": 0.05385885015130043, + "eval_runtime": 62.4396, + "eval_samples_per_second": 14.414, + "eval_steps_per_second": 7.207, + "eval_token_acc": 0.9687905167623634, + "step": 1700 + }, + { + "epoch": 0.30617283950617286, + "grad_norm": 0.5205121636390686, + "learning_rate": 8.311079310829392e-06, + "loss": 0.06428790092468262, + "memory(GiB)": 17.38, + "step": 1705, + "token_acc": 0.968215994531784, + "train_speed(iter/s)": 0.205347 + }, + { + "epoch": 0.30707070707070705, + "grad_norm": 0.45046693086624146, + "learning_rate": 8.29993768447873e-06, + "loss": 0.0510045051574707, + "memory(GiB)": 17.38, + "step": 1710, + "token_acc": 0.9725738396624473, + "train_speed(iter/s)": 0.205434 + }, + { + "epoch": 0.3079685746352413, + "grad_norm": 0.5792934894561768, + "learning_rate": 8.288766951088278e-06, + "loss": 0.053136146068573, + "memory(GiB)": 17.38, + "step": 1715, + "token_acc": 0.9714285714285714, + "train_speed(iter/s)": 0.205524 + }, + { + "epoch": 0.30886644219977555, + "grad_norm": 0.6511686444282532, + "learning_rate": 8.27756720918928e-06, + "loss": 0.043464139103889465, + "memory(GiB)": 17.38, + "step": 1720, + "token_acc": 0.9732510288065843, + "train_speed(iter/s)": 0.205611 + }, + { + "epoch": 0.30976430976430974, + "grad_norm": 1.1749088764190674, + "learning_rate": 8.26633855756886e-06, + "loss": 0.049206393957138064, + "memory(GiB)": 17.38, + "step": 1725, + "token_acc": 0.9710743801652892, + "train_speed(iter/s)": 0.2057 + }, + { + "epoch": 0.310662177328844, + "grad_norm": 2.389378547668457, + "learning_rate": 8.255081095269129e-06, + "loss": 0.05044243335723877, + "memory(GiB)": 17.38, + "step": 1730, + "token_acc": 0.9728033472803347, + "train_speed(iter/s)": 0.20579 + }, + { + "epoch": 0.31156004489337824, + "grad_norm": 0.7854822874069214, + "learning_rate": 8.243794921586328e-06, + "loss": 0.04849475920200348, + "memory(GiB)": 17.38, + "step": 1735, + "token_acc": 0.9578313253012049, + "train_speed(iter/s)": 0.205877 + }, + { + "epoch": 0.31245791245791243, + "grad_norm": 0.5730651617050171, + "learning_rate": 8.232480136069947e-06, + "loss": 0.053887224197387694, + "memory(GiB)": 17.38, + "step": 1740, + "token_acc": 0.9708333333333333, + "train_speed(iter/s)": 0.205966 + }, + { + "epoch": 0.3133557800224467, + "grad_norm": 0.5408889651298523, + "learning_rate": 8.221136838521842e-06, + "loss": 0.05286896824836731, + "memory(GiB)": 17.38, + "step": 1745, + "token_acc": 0.9688796680497925, + "train_speed(iter/s)": 0.206055 + }, + { + "epoch": 0.31425364758698093, + "grad_norm": 0.5818673968315125, + "learning_rate": 8.209765128995359e-06, + "loss": 0.05269973874092102, + "memory(GiB)": 17.38, + "step": 1750, + "token_acc": 0.9707112970711297, + "train_speed(iter/s)": 0.206144 + }, + { + "epoch": 0.3151515151515151, + "grad_norm": 0.7109072804450989, + "learning_rate": 8.198365107794457e-06, + "loss": 0.05988657474517822, + "memory(GiB)": 17.38, + "step": 1755, + "token_acc": 0.9755102040816327, + "train_speed(iter/s)": 0.206228 + }, + { + "epoch": 0.3160493827160494, + "grad_norm": 0.40680649876594543, + "learning_rate": 8.18693687547281e-06, + "loss": 0.050401312112808225, + "memory(GiB)": 17.38, + "step": 1760, + "token_acc": 0.9650655021834061, + "train_speed(iter/s)": 0.206314 + }, + { + "epoch": 0.3169472502805836, + "grad_norm": 1.201696515083313, + "learning_rate": 8.175480532832938e-06, + "loss": 0.0515005350112915, + "memory(GiB)": 17.38, + "step": 1765, + "token_acc": 0.96900826446281, + "train_speed(iter/s)": 0.2064 + }, + { + "epoch": 0.3178451178451178, + "grad_norm": 0.8858659267425537, + "learning_rate": 8.163996180925293e-06, + "loss": 0.053725415468215944, + "memory(GiB)": 17.38, + "step": 1770, + "token_acc": 0.98125, + "train_speed(iter/s)": 0.206483 + }, + { + "epoch": 0.31874298540965207, + "grad_norm": 0.41544297337532043, + "learning_rate": 8.152483921047393e-06, + "loss": 0.05614883303642273, + "memory(GiB)": 17.38, + "step": 1775, + "token_acc": 0.9707112970711297, + "train_speed(iter/s)": 0.206567 + }, + { + "epoch": 0.3196408529741863, + "grad_norm": 0.4510469436645508, + "learning_rate": 8.140943854742919e-06, + "loss": 0.04743733704090118, + "memory(GiB)": 17.38, + "step": 1780, + "token_acc": 0.9695121951219512, + "train_speed(iter/s)": 0.206651 + }, + { + "epoch": 0.3205387205387205, + "grad_norm": 0.5475513935089111, + "learning_rate": 8.129376083800808e-06, + "loss": 0.0602342963218689, + "memory(GiB)": 17.38, + "step": 1785, + "token_acc": 0.9644351464435147, + "train_speed(iter/s)": 0.206739 + }, + { + "epoch": 0.32143658810325476, + "grad_norm": 0.3983801305294037, + "learning_rate": 8.117780710254376e-06, + "loss": 0.049009278416633606, + "memory(GiB)": 17.38, + "step": 1790, + "token_acc": 0.9702127659574468, + "train_speed(iter/s)": 0.206828 + }, + { + "epoch": 0.322334455667789, + "grad_norm": 0.4395924210548401, + "learning_rate": 8.1061578363804e-06, + "loss": 0.05034338235855103, + "memory(GiB)": 17.38, + "step": 1795, + "token_acc": 0.9629629629629629, + "train_speed(iter/s)": 0.206913 + }, + { + "epoch": 0.32323232323232326, + "grad_norm": 0.3134554624557495, + "learning_rate": 8.094507564698227e-06, + "loss": 0.049434316158294675, + "memory(GiB)": 17.38, + "step": 1800, + "token_acc": 0.9811715481171548, + "train_speed(iter/s)": 0.206995 + }, + { + "epoch": 0.32323232323232326, + "eval_loss": 0.050244588404893875, + "eval_runtime": 61.5398, + "eval_samples_per_second": 14.625, + "eval_steps_per_second": 7.312, + "eval_token_acc": 0.9705501018707168, + "step": 1800 + }, + { + "epoch": 0.32413019079685745, + "grad_norm": 1.3453532457351685, + "learning_rate": 8.082829997968864e-06, + "loss": 0.04731326699256897, + "memory(GiB)": 17.38, + "step": 1805, + "token_acc": 0.9711407103825137, + "train_speed(iter/s)": 0.205433 + }, + { + "epoch": 0.3250280583613917, + "grad_norm": 1.6478904485702515, + "learning_rate": 8.071125239194072e-06, + "loss": 0.059991860389709474, + "memory(GiB)": 17.38, + "step": 1810, + "token_acc": 0.9625, + "train_speed(iter/s)": 0.205521 + }, + { + "epoch": 0.32592592592592595, + "grad_norm": 0.34592726826667786, + "learning_rate": 8.059393391615467e-06, + "loss": 0.05439823269844055, + "memory(GiB)": 17.38, + "step": 1815, + "token_acc": 0.9683544303797469, + "train_speed(iter/s)": 0.205606 + }, + { + "epoch": 0.32682379349046015, + "grad_norm": 0.4402664601802826, + "learning_rate": 8.047634558713591e-06, + "loss": 0.04894966781139374, + "memory(GiB)": 17.38, + "step": 1820, + "token_acc": 0.9663865546218487, + "train_speed(iter/s)": 0.205692 + }, + { + "epoch": 0.3277216610549944, + "grad_norm": 0.5864477157592773, + "learning_rate": 8.035848844207013e-06, + "loss": 0.050298839807510376, + "memory(GiB)": 17.38, + "step": 1825, + "token_acc": 0.9731404958677686, + "train_speed(iter/s)": 0.205777 + }, + { + "epoch": 0.32861952861952864, + "grad_norm": 1.1882489919662476, + "learning_rate": 8.024036352051413e-06, + "loss": 0.05107756853103638, + "memory(GiB)": 17.38, + "step": 1830, + "token_acc": 0.9726890756302521, + "train_speed(iter/s)": 0.205862 + }, + { + "epoch": 0.32951739618406284, + "grad_norm": 0.4595033824443817, + "learning_rate": 8.012197186438661e-06, + "loss": 0.05513496398925781, + "memory(GiB)": 17.38, + "step": 1835, + "token_acc": 0.9564315352697096, + "train_speed(iter/s)": 0.205948 + }, + { + "epoch": 0.3304152637485971, + "grad_norm": 0.3178597390651703, + "learning_rate": 8.0003314517959e-06, + "loss": 0.05318605899810791, + "memory(GiB)": 17.38, + "step": 1840, + "token_acc": 0.9702127659574468, + "train_speed(iter/s)": 0.206032 + }, + { + "epoch": 0.33131313131313134, + "grad_norm": 0.34179985523223877, + "learning_rate": 7.988439252784626e-06, + "loss": 0.05075373649597168, + "memory(GiB)": 17.38, + "step": 1845, + "token_acc": 0.9755102040816327, + "train_speed(iter/s)": 0.206113 + }, + { + "epoch": 0.33221099887766553, + "grad_norm": 0.6778779625892639, + "learning_rate": 7.976520694299758e-06, + "loss": 0.04252697825431824, + "memory(GiB)": 17.38, + "step": 1850, + "token_acc": 0.9726890756302521, + "train_speed(iter/s)": 0.206194 + }, + { + "epoch": 0.3331088664421998, + "grad_norm": 1.666723608970642, + "learning_rate": 7.964575881468728e-06, + "loss": 0.04545501470565796, + "memory(GiB)": 17.38, + "step": 1855, + "token_acc": 0.9726890756302521, + "train_speed(iter/s)": 0.206277 + }, + { + "epoch": 0.33400673400673403, + "grad_norm": 1.1488292217254639, + "learning_rate": 7.952604919650535e-06, + "loss": 0.04444233775138855, + "memory(GiB)": 17.38, + "step": 1860, + "token_acc": 0.9748953974895398, + "train_speed(iter/s)": 0.206361 + }, + { + "epoch": 0.3349046015712682, + "grad_norm": 0.7611648440361023, + "learning_rate": 7.940607914434829e-06, + "loss": 0.04190738201141357, + "memory(GiB)": 17.38, + "step": 1865, + "token_acc": 0.9811715481171548, + "train_speed(iter/s)": 0.206443 + }, + { + "epoch": 0.3358024691358025, + "grad_norm": 3.0720436573028564, + "learning_rate": 7.928584971640974e-06, + "loss": 0.05007625818252563, + "memory(GiB)": 17.38, + "step": 1870, + "token_acc": 0.9663865546218487, + "train_speed(iter/s)": 0.206525 + }, + { + "epoch": 0.3367003367003367, + "grad_norm": 1.0441442728042603, + "learning_rate": 7.916536197317118e-06, + "loss": 0.05039591789245605, + "memory(GiB)": 17.38, + "step": 1875, + "token_acc": 0.9708333333333333, + "train_speed(iter/s)": 0.206605 + }, + { + "epoch": 0.3375982042648709, + "grad_norm": 0.4030882716178894, + "learning_rate": 7.90446169773925e-06, + "loss": 0.05007714033126831, + "memory(GiB)": 17.38, + "step": 1880, + "token_acc": 0.977366255144033, + "train_speed(iter/s)": 0.206677 + }, + { + "epoch": 0.33849607182940517, + "grad_norm": 0.37194904685020447, + "learning_rate": 7.892361579410278e-06, + "loss": 0.0584410548210144, + "memory(GiB)": 17.38, + "step": 1885, + "token_acc": 0.9635193133047211, + "train_speed(iter/s)": 0.206755 + }, + { + "epoch": 0.3393939393939394, + "grad_norm": 0.32008102536201477, + "learning_rate": 7.880235949059066e-06, + "loss": 0.0479887843132019, + "memory(GiB)": 17.38, + "step": 1890, + "token_acc": 0.9678111587982833, + "train_speed(iter/s)": 0.206833 + }, + { + "epoch": 0.3402918069584736, + "grad_norm": 0.7468282580375671, + "learning_rate": 7.86808491363952e-06, + "loss": 0.0499185711145401, + "memory(GiB)": 17.38, + "step": 1895, + "token_acc": 0.975103734439834, + "train_speed(iter/s)": 0.206911 + }, + { + "epoch": 0.34118967452300786, + "grad_norm": 0.9350959062576294, + "learning_rate": 7.855908580329622e-06, + "loss": 0.04516475796699524, + "memory(GiB)": 17.38, + "step": 1900, + "token_acc": 0.9743589743589743, + "train_speed(iter/s)": 0.20699 + }, + { + "epoch": 0.34118967452300786, + "eval_loss": 0.05578334257006645, + "eval_runtime": 61.8444, + "eval_samples_per_second": 14.553, + "eval_steps_per_second": 7.276, + "eval_token_acc": 0.9721244674939804, + "step": 1900 + }, + { + "epoch": 0.3420875420875421, + "grad_norm": 0.7588117122650146, + "learning_rate": 7.843707056530498e-06, + "loss": 0.04957005977630615, + "memory(GiB)": 17.38, + "step": 1905, + "token_acc": 0.973107228502912, + "train_speed(iter/s)": 0.2055 + }, + { + "epoch": 0.3429854096520763, + "grad_norm": 0.6615130305290222, + "learning_rate": 7.831480449865463e-06, + "loss": 0.06455750465393066, + "memory(GiB)": 17.38, + "step": 1910, + "token_acc": 0.9741379310344828, + "train_speed(iter/s)": 0.205583 + }, + { + "epoch": 0.34388327721661055, + "grad_norm": 0.3485505282878876, + "learning_rate": 7.81922886817908e-06, + "loss": 0.0470316469669342, + "memory(GiB)": 17.38, + "step": 1915, + "token_acc": 0.9744680851063829, + "train_speed(iter/s)": 0.205663 + }, + { + "epoch": 0.3447811447811448, + "grad_norm": 0.35855746269226074, + "learning_rate": 7.806952419536197e-06, + "loss": 0.048644530773162845, + "memory(GiB)": 17.38, + "step": 1920, + "token_acc": 0.9730290456431535, + "train_speed(iter/s)": 0.205743 + }, + { + "epoch": 0.345679012345679, + "grad_norm": 0.5461351275444031, + "learning_rate": 7.794651212221004e-06, + "loss": 0.048619586229324344, + "memory(GiB)": 17.38, + "step": 1925, + "token_acc": 0.969758064516129, + "train_speed(iter/s)": 0.205823 + }, + { + "epoch": 0.34657687991021324, + "grad_norm": 1.272194743156433, + "learning_rate": 7.782325354736079e-06, + "loss": 0.04851185381412506, + "memory(GiB)": 17.38, + "step": 1930, + "token_acc": 0.9718875502008032, + "train_speed(iter/s)": 0.205902 + }, + { + "epoch": 0.3474747474747475, + "grad_norm": 0.5359762907028198, + "learning_rate": 7.769974955801416e-06, + "loss": 0.06297271251678467, + "memory(GiB)": 17.38, + "step": 1935, + "token_acc": 0.9708333333333333, + "train_speed(iter/s)": 0.205983 + }, + { + "epoch": 0.3483726150392817, + "grad_norm": 0.5424381494522095, + "learning_rate": 7.757600124353486e-06, + "loss": 0.04396359622478485, + "memory(GiB)": 17.38, + "step": 1940, + "token_acc": 0.9771784232365145, + "train_speed(iter/s)": 0.206058 + }, + { + "epoch": 0.34927048260381593, + "grad_norm": 1.1437106132507324, + "learning_rate": 7.745200969544262e-06, + "loss": 0.04972865283489227, + "memory(GiB)": 17.38, + "step": 1945, + "token_acc": 0.9683544303797469, + "train_speed(iter/s)": 0.206137 + }, + { + "epoch": 0.3501683501683502, + "grad_norm": 0.3352559804916382, + "learning_rate": 7.73277760074026e-06, + "loss": 0.06029191017150879, + "memory(GiB)": 17.38, + "step": 1950, + "token_acc": 0.9684873949579832, + "train_speed(iter/s)": 0.206214 + }, + { + "epoch": 0.3510662177328844, + "grad_norm": 0.22174805402755737, + "learning_rate": 7.720330127521578e-06, + "loss": 0.047991964221000674, + "memory(GiB)": 17.38, + "step": 1955, + "token_acc": 0.9732510288065843, + "train_speed(iter/s)": 0.206289 + }, + { + "epoch": 0.3519640852974186, + "grad_norm": 0.3176683187484741, + "learning_rate": 7.707858659680924e-06, + "loss": 0.0564542293548584, + "memory(GiB)": 17.38, + "step": 1960, + "token_acc": 0.9615384615384616, + "train_speed(iter/s)": 0.206368 + }, + { + "epoch": 0.3528619528619529, + "grad_norm": 0.3824206590652466, + "learning_rate": 7.695363307222651e-06, + "loss": 0.048759883642196654, + "memory(GiB)": 17.38, + "step": 1965, + "token_acc": 0.9746835443037974, + "train_speed(iter/s)": 0.206445 + }, + { + "epoch": 0.35375982042648707, + "grad_norm": 0.43617939949035645, + "learning_rate": 7.682844180361787e-06, + "loss": 0.05128822922706604, + "memory(GiB)": 17.38, + "step": 1970, + "token_acc": 0.96900826446281, + "train_speed(iter/s)": 0.206522 + }, + { + "epoch": 0.3546576879910213, + "grad_norm": 11.234249114990234, + "learning_rate": 7.670301389523062e-06, + "loss": 0.051172614097595215, + "memory(GiB)": 17.38, + "step": 1975, + "token_acc": 0.979253112033195, + "train_speed(iter/s)": 0.206597 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.683441698551178, + "learning_rate": 7.65773504533993e-06, + "loss": 0.04348492622375488, + "memory(GiB)": 17.38, + "step": 1980, + "token_acc": 0.9708333333333333, + "train_speed(iter/s)": 0.206675 + }, + { + "epoch": 0.35645342312008976, + "grad_norm": 0.6764236688613892, + "learning_rate": 7.645145258653601e-06, + "loss": 0.04853289127349854, + "memory(GiB)": 17.38, + "step": 1985, + "token_acc": 0.9747899159663865, + "train_speed(iter/s)": 0.206749 + }, + { + "epoch": 0.357351290684624, + "grad_norm": 0.725853681564331, + "learning_rate": 7.632532140512057e-06, + "loss": 0.043640559911727904, + "memory(GiB)": 17.38, + "step": 1990, + "token_acc": 0.9739130434782609, + "train_speed(iter/s)": 0.206825 + }, + { + "epoch": 0.35824915824915826, + "grad_norm": 0.6935517191886902, + "learning_rate": 7.619895802169074e-06, + "loss": 0.04902539849281311, + "memory(GiB)": 17.38, + "step": 1995, + "token_acc": 0.9730290456431535, + "train_speed(iter/s)": 0.206889 + }, + { + "epoch": 0.35914702581369246, + "grad_norm": 0.7494962811470032, + "learning_rate": 7.607236355083243e-06, + "loss": 0.054493141174316403, + "memory(GiB)": 17.38, + "step": 2000, + "token_acc": 0.970954356846473, + "train_speed(iter/s)": 0.206963 + }, + { + "epoch": 0.35914702581369246, + "eval_loss": 0.048365890979766846, + "eval_runtime": 61.3927, + "eval_samples_per_second": 14.66, + "eval_steps_per_second": 7.33, + "eval_token_acc": 0.9724022967216152, + "step": 2000 + }, + { + "epoch": 0.3600448933782267, + "grad_norm": 0.6867471933364868, + "learning_rate": 7.594553910916986e-06, + "loss": 0.051302409172058104, + "memory(GiB)": 17.38, + "step": 2005, + "token_acc": 0.9723266142808337, + "train_speed(iter/s)": 0.205546 + }, + { + "epoch": 0.36094276094276095, + "grad_norm": 0.43226101994514465, + "learning_rate": 7.581848581535568e-06, + "loss": 0.049455440044403075, + "memory(GiB)": 17.38, + "step": 2010, + "token_acc": 0.9651639344262295, + "train_speed(iter/s)": 0.205625 + }, + { + "epoch": 0.36184062850729515, + "grad_norm": 0.7718700170516968, + "learning_rate": 7.569120479006113e-06, + "loss": 0.06295598149299622, + "memory(GiB)": 17.38, + "step": 2015, + "token_acc": 0.9708333333333333, + "train_speed(iter/s)": 0.205702 + }, + { + "epoch": 0.3627384960718294, + "grad_norm": 1.1958757638931274, + "learning_rate": 7.5563697155966175e-06, + "loss": 0.05034412741661072, + "memory(GiB)": 17.38, + "step": 2020, + "token_acc": 0.9810126582278481, + "train_speed(iter/s)": 0.205776 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 0.9355653524398804, + "learning_rate": 7.543596403774955e-06, + "loss": 0.04434739351272583, + "memory(GiB)": 17.38, + "step": 2025, + "token_acc": 0.9819277108433735, + "train_speed(iter/s)": 0.205855 + }, + { + "epoch": 0.36453423120089784, + "grad_norm": 0.3606739938259125, + "learning_rate": 7.530800656207888e-06, + "loss": 0.04035702049732208, + "memory(GiB)": 17.38, + "step": 2030, + "token_acc": 0.9854166666666667, + "train_speed(iter/s)": 0.20593 + }, + { + "epoch": 0.3654320987654321, + "grad_norm": 0.739323616027832, + "learning_rate": 7.5179825857600745e-06, + "loss": 0.05430004596710205, + "memory(GiB)": 17.38, + "step": 2035, + "token_acc": 0.9684873949579832, + "train_speed(iter/s)": 0.206005 + }, + { + "epoch": 0.36632996632996634, + "grad_norm": 1.0498813390731812, + "learning_rate": 7.505142305493067e-06, + "loss": 0.05211538076400757, + "memory(GiB)": 17.38, + "step": 2040, + "token_acc": 0.9733606557377049, + "train_speed(iter/s)": 0.20608 + }, + { + "epoch": 0.3672278338945006, + "grad_norm": 1.0236821174621582, + "learning_rate": 7.492279928664323e-06, + "loss": 0.04863170683383942, + "memory(GiB)": 17.38, + "step": 2045, + "token_acc": 0.979253112033195, + "train_speed(iter/s)": 0.206155 + }, + { + "epoch": 0.3681257014590348, + "grad_norm": 0.5476414561271667, + "learning_rate": 7.4793955687262045e-06, + "loss": 0.050250005722045896, + "memory(GiB)": 17.38, + "step": 2050, + "token_acc": 0.9708333333333333, + "train_speed(iter/s)": 0.206229 + }, + { + "epoch": 0.36902356902356903, + "grad_norm": 1.2242101430892944, + "learning_rate": 7.466489339324967e-06, + "loss": 0.04730603396892548, + "memory(GiB)": 17.38, + "step": 2055, + "token_acc": 0.9790794979079498, + "train_speed(iter/s)": 0.206301 + }, + { + "epoch": 0.3699214365881033, + "grad_norm": 0.736853837966919, + "learning_rate": 7.453561354299775e-06, + "loss": 0.04943092167377472, + "memory(GiB)": 17.38, + "step": 2060, + "token_acc": 0.9730290456431535, + "train_speed(iter/s)": 0.206375 + }, + { + "epoch": 0.3708193041526375, + "grad_norm": 0.7283698320388794, + "learning_rate": 7.440611727681677e-06, + "loss": 0.04298454225063324, + "memory(GiB)": 17.38, + "step": 2065, + "token_acc": 0.9752066115702479, + "train_speed(iter/s)": 0.206451 + }, + { + "epoch": 0.3717171717171717, + "grad_norm": 1.2035702466964722, + "learning_rate": 7.427640573692622e-06, + "loss": 0.04201290011405945, + "memory(GiB)": 17.38, + "step": 2070, + "token_acc": 0.9752066115702479, + "train_speed(iter/s)": 0.206522 + }, + { + "epoch": 0.372615039281706, + "grad_norm": 0.5028029680252075, + "learning_rate": 7.4146480067444335e-06, + "loss": 0.048489111661911014, + "memory(GiB)": 17.38, + "step": 2075, + "token_acc": 0.9795918367346939, + "train_speed(iter/s)": 0.206593 + }, + { + "epoch": 0.37351290684624017, + "grad_norm": 0.6115928292274475, + "learning_rate": 7.401634141437812e-06, + "loss": 0.043162816762924196, + "memory(GiB)": 17.38, + "step": 2080, + "token_acc": 0.9686192468619247, + "train_speed(iter/s)": 0.206667 + }, + { + "epoch": 0.3744107744107744, + "grad_norm": 0.5469858646392822, + "learning_rate": 7.388599092561315e-06, + "loss": 0.04374293386936188, + "memory(GiB)": 17.38, + "step": 2085, + "token_acc": 0.9753086419753086, + "train_speed(iter/s)": 0.206742 + }, + { + "epoch": 0.37530864197530867, + "grad_norm": 0.9803187251091003, + "learning_rate": 7.375542975090354e-06, + "loss": 0.047701552510261536, + "memory(GiB)": 17.38, + "step": 2090, + "token_acc": 0.9703389830508474, + "train_speed(iter/s)": 0.206815 + }, + { + "epoch": 0.37620650953984286, + "grad_norm": 0.6424798965454102, + "learning_rate": 7.362465904186171e-06, + "loss": 0.054633927345275876, + "memory(GiB)": 17.38, + "step": 2095, + "token_acc": 0.9767932489451476, + "train_speed(iter/s)": 0.206885 + }, + { + "epoch": 0.3771043771043771, + "grad_norm": 0.9060097336769104, + "learning_rate": 7.349367995194835e-06, + "loss": 0.047419339418411255, + "memory(GiB)": 17.38, + "step": 2100, + "token_acc": 0.9725738396624473, + "train_speed(iter/s)": 0.206957 + }, + { + "epoch": 0.3771043771043771, + "eval_loss": 0.04444968327879906, + "eval_runtime": 61.8277, + "eval_samples_per_second": 14.557, + "eval_steps_per_second": 7.278, + "eval_token_acc": 0.9756436377106872, + "step": 2100 + }, + { + "epoch": 0.37800224466891136, + "grad_norm": 0.44990283250808716, + "learning_rate": 7.336249363646206e-06, + "loss": 0.0417256623506546, + "memory(GiB)": 17.38, + "step": 2105, + "token_acc": 0.9757347915242652, + "train_speed(iter/s)": 0.205635 + }, + { + "epoch": 0.37890011223344555, + "grad_norm": 1.9681529998779297, + "learning_rate": 7.3231101252529355e-06, + "loss": 0.048003393411636355, + "memory(GiB)": 17.38, + "step": 2110, + "token_acc": 0.9809322033898306, + "train_speed(iter/s)": 0.205742 + }, + { + "epoch": 0.3797979797979798, + "grad_norm": 0.3652014434337616, + "learning_rate": 7.309950395909436e-06, + "loss": 0.040440842509269714, + "memory(GiB)": 17.38, + "step": 2115, + "token_acc": 0.9769874476987448, + "train_speed(iter/s)": 0.205849 + }, + { + "epoch": 0.38069584736251405, + "grad_norm": 0.6151148080825806, + "learning_rate": 7.296770291690855e-06, + "loss": 0.059120213985443114, + "memory(GiB)": 17.38, + "step": 2120, + "token_acc": 0.9573170731707317, + "train_speed(iter/s)": 0.20592 + }, + { + "epoch": 0.38159371492704824, + "grad_norm": 0.878516435623169, + "learning_rate": 7.283569928852064e-06, + "loss": 0.05222681760787964, + "memory(GiB)": 17.38, + "step": 2125, + "token_acc": 0.9645833333333333, + "train_speed(iter/s)": 0.205992 + }, + { + "epoch": 0.3824915824915825, + "grad_norm": 0.3211354613304138, + "learning_rate": 7.270349423826619e-06, + "loss": 0.04556797444820404, + "memory(GiB)": 17.38, + "step": 2130, + "token_acc": 0.9752066115702479, + "train_speed(iter/s)": 0.206064 + }, + { + "epoch": 0.38338945005611674, + "grad_norm": 0.524276077747345, + "learning_rate": 7.257108893225741e-06, + "loss": 0.04268750548362732, + "memory(GiB)": 17.38, + "step": 2135, + "token_acc": 0.9742063492063492, + "train_speed(iter/s)": 0.206136 + }, + { + "epoch": 0.38428731762065094, + "grad_norm": 0.6066728830337524, + "learning_rate": 7.24384845383729e-06, + "loss": 0.042040497064590454, + "memory(GiB)": 17.38, + "step": 2140, + "token_acc": 0.9745762711864406, + "train_speed(iter/s)": 0.20621 + }, + { + "epoch": 0.3851851851851852, + "grad_norm": 0.7198181748390198, + "learning_rate": 7.230568222624727e-06, + "loss": 0.0421042263507843, + "memory(GiB)": 17.38, + "step": 2145, + "token_acc": 0.9747899159663865, + "train_speed(iter/s)": 0.206303 + }, + { + "epoch": 0.38608305274971944, + "grad_norm": 0.8296064138412476, + "learning_rate": 7.217268316726086e-06, + "loss": 0.047165513038635254, + "memory(GiB)": 17.38, + "step": 2150, + "token_acc": 0.9591836734693877, + "train_speed(iter/s)": 0.206407 + }, + { + "epoch": 0.38698092031425363, + "grad_norm": 1.414443016052246, + "learning_rate": 7.203948853452946e-06, + "loss": 0.04513181447982788, + "memory(GiB)": 17.38, + "step": 2155, + "token_acc": 0.9732510288065843, + "train_speed(iter/s)": 0.206511 + }, + { + "epoch": 0.3878787878787879, + "grad_norm": 1.2538434267044067, + "learning_rate": 7.1906099502893855e-06, + "loss": 0.06422624588012696, + "memory(GiB)": 17.38, + "step": 2160, + "token_acc": 0.9715447154471545, + "train_speed(iter/s)": 0.206613 + }, + { + "epoch": 0.38877665544332213, + "grad_norm": 0.3281596004962921, + "learning_rate": 7.177251724890957e-06, + "loss": 0.04669368267059326, + "memory(GiB)": 17.38, + "step": 2165, + "token_acc": 0.9774590163934426, + "train_speed(iter/s)": 0.206714 + }, + { + "epoch": 0.3896745230078563, + "grad_norm": 0.33012834191322327, + "learning_rate": 7.1638742950836426e-06, + "loss": 0.050878942012786865, + "memory(GiB)": 17.38, + "step": 2170, + "token_acc": 0.9713114754098361, + "train_speed(iter/s)": 0.206817 + }, + { + "epoch": 0.39057239057239057, + "grad_norm": 0.23498868942260742, + "learning_rate": 7.150477778862814e-06, + "loss": 0.0474587619304657, + "memory(GiB)": 17.38, + "step": 2175, + "token_acc": 0.9815573770491803, + "train_speed(iter/s)": 0.206918 + }, + { + "epoch": 0.3914702581369248, + "grad_norm": 0.4933842122554779, + "learning_rate": 7.1370622943922004e-06, + "loss": 0.04833953380584717, + "memory(GiB)": 17.38, + "step": 2180, + "token_acc": 0.9688796680497925, + "train_speed(iter/s)": 0.207019 + }, + { + "epoch": 0.392368125701459, + "grad_norm": 1.5294967889785767, + "learning_rate": 7.123627960002834e-06, + "loss": 0.04325726330280304, + "memory(GiB)": 17.38, + "step": 2185, + "token_acc": 0.9707112970711297, + "train_speed(iter/s)": 0.207121 + }, + { + "epoch": 0.39326599326599326, + "grad_norm": 0.6287378072738647, + "learning_rate": 7.110174894192014e-06, + "loss": 0.051592016220092775, + "memory(GiB)": 17.38, + "step": 2190, + "token_acc": 0.96900826446281, + "train_speed(iter/s)": 0.207224 + }, + { + "epoch": 0.3941638608305275, + "grad_norm": 0.5986263155937195, + "learning_rate": 7.096703215622262e-06, + "loss": 0.046039119362831116, + "memory(GiB)": 17.38, + "step": 2195, + "token_acc": 0.9744680851063829, + "train_speed(iter/s)": 0.207326 + }, + { + "epoch": 0.3950617283950617, + "grad_norm": 0.44889023900032043, + "learning_rate": 7.083213043120272e-06, + "loss": 0.041845005750656125, + "memory(GiB)": 17.38, + "step": 2200, + "token_acc": 0.9771784232365145, + "train_speed(iter/s)": 0.207424 + }, + { + "epoch": 0.3950617283950617, + "eval_loss": 0.043757688254117966, + "eval_runtime": 59.3307, + "eval_samples_per_second": 15.169, + "eval_steps_per_second": 7.585, + "eval_token_acc": 0.9751805889979626, + "step": 2200 + }, + { + "epoch": 0.39595959595959596, + "grad_norm": 1.2163937091827393, + "learning_rate": 7.069704495675862e-06, + "loss": 0.044440239667892456, + "memory(GiB)": 17.38, + "step": 2205, + "token_acc": 0.9772804919713016, + "train_speed(iter/s)": 0.206201 + }, + { + "epoch": 0.3968574635241302, + "grad_norm": 1.7640342712402344, + "learning_rate": 7.056177692440927e-06, + "loss": 0.05539994239807129, + "memory(GiB)": 17.38, + "step": 2210, + "token_acc": 0.9710743801652892, + "train_speed(iter/s)": 0.206298 + }, + { + "epoch": 0.3977553310886644, + "grad_norm": 0.5201627016067505, + "learning_rate": 7.042632752728387e-06, + "loss": 0.04197717308998108, + "memory(GiB)": 17.38, + "step": 2215, + "token_acc": 0.9666666666666667, + "train_speed(iter/s)": 0.206397 + }, + { + "epoch": 0.39865319865319865, + "grad_norm": 0.7768644690513611, + "learning_rate": 7.029069796011135e-06, + "loss": 0.0394055962562561, + "memory(GiB)": 17.38, + "step": 2220, + "token_acc": 0.9705882352941176, + "train_speed(iter/s)": 0.206495 + }, + { + "epoch": 0.3995510662177329, + "grad_norm": 0.9082227349281311, + "learning_rate": 7.01548894192098e-06, + "loss": 0.03776232898235321, + "memory(GiB)": 17.38, + "step": 2225, + "token_acc": 0.9794238683127572, + "train_speed(iter/s)": 0.206597 + }, + { + "epoch": 0.4004489337822671, + "grad_norm": 1.6319297552108765, + "learning_rate": 7.001890310247597e-06, + "loss": 0.0462755024433136, + "memory(GiB)": 17.38, + "step": 2230, + "token_acc": 0.9796747967479674, + "train_speed(iter/s)": 0.206694 + }, + { + "epoch": 0.40134680134680134, + "grad_norm": 2.149829864501953, + "learning_rate": 6.988274020937469e-06, + "loss": 0.040911251306533815, + "memory(GiB)": 17.38, + "step": 2235, + "token_acc": 0.9875, + "train_speed(iter/s)": 0.206795 + }, + { + "epoch": 0.4022446689113356, + "grad_norm": 0.8278529047966003, + "learning_rate": 6.974640194092823e-06, + "loss": 0.038359200954437254, + "memory(GiB)": 17.38, + "step": 2240, + "token_acc": 0.9705882352941176, + "train_speed(iter/s)": 0.206893 + }, + { + "epoch": 0.4031425364758698, + "grad_norm": 0.7873987555503845, + "learning_rate": 6.96098894997058e-06, + "loss": 0.03981956243515015, + "memory(GiB)": 17.38, + "step": 2245, + "token_acc": 0.9754098360655737, + "train_speed(iter/s)": 0.206988 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 1.3110383749008179, + "learning_rate": 6.947320408981288e-06, + "loss": 0.04616835117340088, + "memory(GiB)": 17.38, + "step": 2250, + "token_acc": 0.9728033472803347, + "train_speed(iter/s)": 0.207085 + }, + { + "epoch": 0.4049382716049383, + "grad_norm": 0.6646740436553955, + "learning_rate": 6.933634691688062e-06, + "loss": 0.04807235598564148, + "memory(GiB)": 17.38, + "step": 2255, + "token_acc": 0.9704641350210971, + "train_speed(iter/s)": 0.207183 + }, + { + "epoch": 0.4058361391694725, + "grad_norm": 0.4680635631084442, + "learning_rate": 6.91993191880552e-06, + "loss": 0.03745869398117065, + "memory(GiB)": 17.38, + "step": 2260, + "token_acc": 0.9805194805194806, + "train_speed(iter/s)": 0.207278 + }, + { + "epoch": 0.4067340067340067, + "grad_norm": 0.7591114640235901, + "learning_rate": 6.906212211198722e-06, + "loss": 0.03965793251991272, + "memory(GiB)": 17.38, + "step": 2265, + "token_acc": 0.9723404255319149, + "train_speed(iter/s)": 0.207372 + }, + { + "epoch": 0.407631874298541, + "grad_norm": 0.8961293697357178, + "learning_rate": 6.892475689882092e-06, + "loss": 0.0416390061378479, + "memory(GiB)": 17.38, + "step": 2270, + "token_acc": 0.9662447257383966, + "train_speed(iter/s)": 0.207469 + }, + { + "epoch": 0.40852974186307517, + "grad_norm": 0.7805488109588623, + "learning_rate": 6.878722476018367e-06, + "loss": 0.03906413316726685, + "memory(GiB)": 17.38, + "step": 2275, + "token_acc": 0.979253112033195, + "train_speed(iter/s)": 0.207564 + }, + { + "epoch": 0.4094276094276094, + "grad_norm": 0.6274279952049255, + "learning_rate": 6.864952690917517e-06, + "loss": 0.032743868231773374, + "memory(GiB)": 17.38, + "step": 2280, + "token_acc": 0.9833333333333333, + "train_speed(iter/s)": 0.207661 + }, + { + "epoch": 0.41032547699214367, + "grad_norm": 0.7028347253799438, + "learning_rate": 6.851166456035678e-06, + "loss": 0.046334341168403625, + "memory(GiB)": 17.38, + "step": 2285, + "token_acc": 0.9680851063829787, + "train_speed(iter/s)": 0.207756 + }, + { + "epoch": 0.4112233445566779, + "grad_norm": 1.123255968093872, + "learning_rate": 6.8373638929740835e-06, + "loss": 0.05282944440841675, + "memory(GiB)": 17.38, + "step": 2290, + "token_acc": 0.9793388429752066, + "train_speed(iter/s)": 0.207849 + }, + { + "epoch": 0.4121212121212121, + "grad_norm": 0.6760699152946472, + "learning_rate": 6.823545123477987e-06, + "loss": 0.05010502934455872, + "memory(GiB)": 17.38, + "step": 2295, + "token_acc": 0.9705882352941176, + "train_speed(iter/s)": 0.207942 + }, + { + "epoch": 0.41301907968574636, + "grad_norm": 0.8715448379516602, + "learning_rate": 6.80971026943559e-06, + "loss": 0.047029736638069156, + "memory(GiB)": 17.38, + "step": 2300, + "token_acc": 0.975, + "train_speed(iter/s)": 0.208036 + }, + { + "epoch": 0.41301907968574636, + "eval_loss": 0.04449711740016937, + "eval_runtime": 59.8096, + "eval_samples_per_second": 15.048, + "eval_steps_per_second": 7.524, + "eval_token_acc": 0.9733283941470643, + "step": 2300 + }, + { + "epoch": 0.4139169472502806, + "grad_norm": 0.8121962547302246, + "learning_rate": 6.795859452876972e-06, + "loss": 0.03931237459182739, + "memory(GiB)": 17.38, + "step": 2305, + "token_acc": 0.9732081911262799, + "train_speed(iter/s)": 0.206845 + }, + { + "epoch": 0.4148148148148148, + "grad_norm": 0.37205252051353455, + "learning_rate": 6.781992795973003e-06, + "loss": 0.04309642314910889, + "memory(GiB)": 17.38, + "step": 2310, + "token_acc": 0.9711934156378601, + "train_speed(iter/s)": 0.206941 + }, + { + "epoch": 0.41571268237934905, + "grad_norm": 0.5727130174636841, + "learning_rate": 6.768110421034276e-06, + "loss": 0.044375211000442505, + "memory(GiB)": 17.38, + "step": 2315, + "token_acc": 0.9718614718614719, + "train_speed(iter/s)": 0.207037 + }, + { + "epoch": 0.4166105499438833, + "grad_norm": 0.9681130647659302, + "learning_rate": 6.754212450510026e-06, + "loss": 0.047539955377578734, + "memory(GiB)": 17.38, + "step": 2320, + "token_acc": 0.9723404255319149, + "train_speed(iter/s)": 0.207126 + }, + { + "epoch": 0.4175084175084175, + "grad_norm": 0.7969462275505066, + "learning_rate": 6.7402990069870454e-06, + "loss": 0.040248429775238036, + "memory(GiB)": 17.38, + "step": 2325, + "token_acc": 0.9852941176470589, + "train_speed(iter/s)": 0.207216 + }, + { + "epoch": 0.41840628507295174, + "grad_norm": 0.5914021134376526, + "learning_rate": 6.726370213188609e-06, + "loss": 0.0341882199048996, + "memory(GiB)": 17.38, + "step": 2330, + "token_acc": 0.9806034482758621, + "train_speed(iter/s)": 0.207307 + }, + { + "epoch": 0.419304152637486, + "grad_norm": 1.8980515003204346, + "learning_rate": 6.712426191973387e-06, + "loss": 0.039409366250038144, + "memory(GiB)": 17.38, + "step": 2335, + "token_acc": 0.9746835443037974, + "train_speed(iter/s)": 0.2074 + }, + { + "epoch": 0.4202020202020202, + "grad_norm": 0.9412928223609924, + "learning_rate": 6.698467066334361e-06, + "loss": 0.04413142800331116, + "memory(GiB)": 17.38, + "step": 2340, + "token_acc": 0.9896694214876033, + "train_speed(iter/s)": 0.20749 + }, + { + "epoch": 0.42109988776655444, + "grad_norm": 1.8393278121948242, + "learning_rate": 6.6844929593977435e-06, + "loss": 0.04835282564163208, + "memory(GiB)": 17.38, + "step": 2345, + "token_acc": 0.9745762711864406, + "train_speed(iter/s)": 0.207583 + }, + { + "epoch": 0.4219977553310887, + "grad_norm": 0.7256748676300049, + "learning_rate": 6.670503994421888e-06, + "loss": 0.043755143880844116, + "memory(GiB)": 17.38, + "step": 2350, + "token_acc": 0.9795918367346939, + "train_speed(iter/s)": 0.207678 + }, + { + "epoch": 0.4228956228956229, + "grad_norm": 1.2499632835388184, + "learning_rate": 6.656500294796205e-06, + "loss": 0.04659755825996399, + "memory(GiB)": 17.38, + "step": 2355, + "token_acc": 0.9754098360655737, + "train_speed(iter/s)": 0.207771 + }, + { + "epoch": 0.42379349046015713, + "grad_norm": 0.6871971487998962, + "learning_rate": 6.642481984040067e-06, + "loss": 0.045611506700515746, + "memory(GiB)": 17.38, + "step": 2360, + "token_acc": 0.9722222222222222, + "train_speed(iter/s)": 0.207865 + }, + { + "epoch": 0.4246913580246914, + "grad_norm": 0.5366142988204956, + "learning_rate": 6.628449185801729e-06, + "loss": 0.044897323846817015, + "memory(GiB)": 17.38, + "step": 2365, + "token_acc": 0.9786324786324786, + "train_speed(iter/s)": 0.207959 + }, + { + "epoch": 0.4255892255892256, + "grad_norm": 0.7094274759292603, + "learning_rate": 6.614402023857231e-06, + "loss": 0.04476945698261261, + "memory(GiB)": 17.38, + "step": 2370, + "token_acc": 0.975103734439834, + "train_speed(iter/s)": 0.20805 + }, + { + "epoch": 0.4264870931537598, + "grad_norm": 0.8505343794822693, + "learning_rate": 6.600340622109306e-06, + "loss": 0.048710429668426515, + "memory(GiB)": 17.38, + "step": 2375, + "token_acc": 0.970954356846473, + "train_speed(iter/s)": 0.208142 + }, + { + "epoch": 0.42738496071829407, + "grad_norm": 0.3232537508010864, + "learning_rate": 6.5862651045862895e-06, + "loss": 0.03807567954063416, + "memory(GiB)": 17.38, + "step": 2380, + "token_acc": 0.9873417721518988, + "train_speed(iter/s)": 0.208235 + }, + { + "epoch": 0.42828282828282827, + "grad_norm": 0.9937730431556702, + "learning_rate": 6.572175595441026e-06, + "loss": 0.03878996074199677, + "memory(GiB)": 17.38, + "step": 2385, + "token_acc": 0.9794238683127572, + "train_speed(iter/s)": 0.208327 + }, + { + "epoch": 0.4291806958473625, + "grad_norm": 0.9780154824256897, + "learning_rate": 6.558072218949773e-06, + "loss": 0.037696027755737306, + "memory(GiB)": 17.38, + "step": 2390, + "token_acc": 0.9808510638297873, + "train_speed(iter/s)": 0.20842 + }, + { + "epoch": 0.43007856341189676, + "grad_norm": 0.6990713477134705, + "learning_rate": 6.543955099511103e-06, + "loss": 0.04573353826999664, + "memory(GiB)": 17.38, + "step": 2395, + "token_acc": 0.9771784232365145, + "train_speed(iter/s)": 0.208512 + }, + { + "epoch": 0.43097643097643096, + "grad_norm": 0.7605579495429993, + "learning_rate": 6.52982436164481e-06, + "loss": 0.041476625204086306, + "memory(GiB)": 17.38, + "step": 2400, + "token_acc": 0.9728033472803347, + "train_speed(iter/s)": 0.2086 + }, + { + "epoch": 0.43097643097643096, + "eval_loss": 0.040721289813518524, + "eval_runtime": 60.5017, + "eval_samples_per_second": 14.876, + "eval_steps_per_second": 7.438, + "eval_token_acc": 0.9784219299870346, + "step": 2400 + }, + { + "epoch": 0.4318742985409652, + "grad_norm": 6.684725284576416, + "learning_rate": 6.515680129990807e-06, + "loss": 0.05723460912704468, + "memory(GiB)": 17.38, + "step": 2405, + "token_acc": 0.9789599726308587, + "train_speed(iter/s)": 0.20744 + }, + { + "epoch": 0.43277216610549946, + "grad_norm": 0.4514862298965454, + "learning_rate": 6.5015225293080305e-06, + "loss": 0.040979838371276854, + "memory(GiB)": 17.38, + "step": 2410, + "token_acc": 0.9811715481171548, + "train_speed(iter/s)": 0.20753 + }, + { + "epoch": 0.43367003367003365, + "grad_norm": 0.9881991147994995, + "learning_rate": 6.487351684473338e-06, + "loss": 0.03439622819423675, + "memory(GiB)": 17.38, + "step": 2415, + "token_acc": 0.9868995633187773, + "train_speed(iter/s)": 0.207622 + }, + { + "epoch": 0.4345679012345679, + "grad_norm": 0.7329021692276001, + "learning_rate": 6.473167720480403e-06, + "loss": 0.04383282959461212, + "memory(GiB)": 17.38, + "step": 2420, + "token_acc": 0.9713114754098361, + "train_speed(iter/s)": 0.207711 + }, + { + "epoch": 0.43546576879910215, + "grad_norm": 0.5319974422454834, + "learning_rate": 6.458970762438626e-06, + "loss": 0.04468891620635986, + "memory(GiB)": 17.38, + "step": 2425, + "token_acc": 0.9791666666666666, + "train_speed(iter/s)": 0.207802 + }, + { + "epoch": 0.43636363636363634, + "grad_norm": 0.723374605178833, + "learning_rate": 6.444760935572009e-06, + "loss": 0.04997303783893585, + "memory(GiB)": 17.38, + "step": 2430, + "token_acc": 0.966804979253112, + "train_speed(iter/s)": 0.207893 + }, + { + "epoch": 0.4372615039281706, + "grad_norm": 3.7178099155426025, + "learning_rate": 6.430538365218071e-06, + "loss": 0.04470430016517639, + "memory(GiB)": 17.38, + "step": 2435, + "token_acc": 0.9771784232365145, + "train_speed(iter/s)": 0.207983 + }, + { + "epoch": 0.43815937149270484, + "grad_norm": 0.6523065567016602, + "learning_rate": 6.416303176826735e-06, + "loss": 0.041749882698059085, + "memory(GiB)": 17.38, + "step": 2440, + "token_acc": 0.9849137931034483, + "train_speed(iter/s)": 0.208073 + }, + { + "epoch": 0.43905723905723903, + "grad_norm": 0.6325268745422363, + "learning_rate": 6.402055495959217e-06, + "loss": 0.04349135756492615, + "memory(GiB)": 17.38, + "step": 2445, + "token_acc": 0.9853556485355649, + "train_speed(iter/s)": 0.208162 + }, + { + "epoch": 0.4399551066217733, + "grad_norm": 1.2231210470199585, + "learning_rate": 6.387795448286923e-06, + "loss": 0.0460403710603714, + "memory(GiB)": 17.38, + "step": 2450, + "token_acc": 0.9721030042918455, + "train_speed(iter/s)": 0.208251 + }, + { + "epoch": 0.44085297418630753, + "grad_norm": 0.5724068880081177, + "learning_rate": 6.373523159590345e-06, + "loss": 0.04551941156387329, + "memory(GiB)": 17.38, + "step": 2455, + "token_acc": 0.970954356846473, + "train_speed(iter/s)": 0.208338 + }, + { + "epoch": 0.4417508417508417, + "grad_norm": 0.6074643135070801, + "learning_rate": 6.3592387557579404e-06, + "loss": 0.041909605264663696, + "memory(GiB)": 17.38, + "step": 2460, + "token_acc": 0.9818548387096774, + "train_speed(iter/s)": 0.208426 + }, + { + "epoch": 0.442648709315376, + "grad_norm": 0.8428773880004883, + "learning_rate": 6.344942362785034e-06, + "loss": 0.03988522887229919, + "memory(GiB)": 17.38, + "step": 2465, + "token_acc": 0.983402489626556, + "train_speed(iter/s)": 0.208516 + }, + { + "epoch": 0.4435465768799102, + "grad_norm": 0.7340371012687683, + "learning_rate": 6.330634106772696e-06, + "loss": 0.035030472278594973, + "memory(GiB)": 17.38, + "step": 2470, + "token_acc": 0.9790794979079498, + "train_speed(iter/s)": 0.208605 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.985042929649353, + "learning_rate": 6.316314113926637e-06, + "loss": 0.037481766939163205, + "memory(GiB)": 17.38, + "step": 2475, + "token_acc": 0.9815573770491803, + "train_speed(iter/s)": 0.208693 + }, + { + "epoch": 0.44534231200897867, + "grad_norm": 0.7302332520484924, + "learning_rate": 6.301982510556091e-06, + "loss": 0.045238193869590757, + "memory(GiB)": 17.38, + "step": 2480, + "token_acc": 0.9715447154471545, + "train_speed(iter/s)": 0.208782 + }, + { + "epoch": 0.4462401795735129, + "grad_norm": 0.5465536713600159, + "learning_rate": 6.287639423072701e-06, + "loss": 0.031100502610206603, + "memory(GiB)": 17.38, + "step": 2485, + "token_acc": 0.98068669527897, + "train_speed(iter/s)": 0.208867 + }, + { + "epoch": 0.4471380471380471, + "grad_norm": 0.7432754635810852, + "learning_rate": 6.2732849779894105e-06, + "loss": 0.045597967505455014, + "memory(GiB)": 17.38, + "step": 2490, + "token_acc": 0.9707112970711297, + "train_speed(iter/s)": 0.20895 + }, + { + "epoch": 0.44803591470258136, + "grad_norm": 0.7030917406082153, + "learning_rate": 6.258919301919334e-06, + "loss": 0.043600338697433474, + "memory(GiB)": 17.38, + "step": 2495, + "token_acc": 0.98125, + "train_speed(iter/s)": 0.209034 + }, + { + "epoch": 0.4489337822671156, + "grad_norm": 0.7413378953933716, + "learning_rate": 6.244542521574657e-06, + "loss": 0.04475947916507721, + "memory(GiB)": 17.38, + "step": 2500, + "token_acc": 0.9705882352941176, + "train_speed(iter/s)": 0.20912 + }, + { + "epoch": 0.4489337822671156, + "eval_loss": 0.041380707174539566, + "eval_runtime": 59.6668, + "eval_samples_per_second": 15.084, + "eval_steps_per_second": 7.542, + "eval_token_acc": 0.9755510279681423, + "step": 2500 + }, + { + "epoch": 0.4498316498316498, + "grad_norm": 1.3256155252456665, + "learning_rate": 6.230154763765506e-06, + "loss": 0.03763327598571777, + "memory(GiB)": 17.38, + "step": 2505, + "token_acc": 0.9767600820232399, + "train_speed(iter/s)": 0.208022 + }, + { + "epoch": 0.45072951739618405, + "grad_norm": 0.6078214645385742, + "learning_rate": 6.215756155398834e-06, + "loss": 0.037750205397605895, + "memory(GiB)": 17.38, + "step": 2510, + "token_acc": 0.981404958677686, + "train_speed(iter/s)": 0.20811 + }, + { + "epoch": 0.4516273849607183, + "grad_norm": 1.3939481973648071, + "learning_rate": 6.2013468234773034e-06, + "loss": 0.049828732013702394, + "memory(GiB)": 17.38, + "step": 2515, + "token_acc": 0.9670781893004116, + "train_speed(iter/s)": 0.208197 + }, + { + "epoch": 0.45252525252525255, + "grad_norm": 0.8786147832870483, + "learning_rate": 6.186926895098162e-06, + "loss": 0.03286179006099701, + "memory(GiB)": 17.38, + "step": 2520, + "token_acc": 0.9896265560165975, + "train_speed(iter/s)": 0.208285 + }, + { + "epoch": 0.45342312008978675, + "grad_norm": 1.1058120727539062, + "learning_rate": 6.172496497452122e-06, + "loss": 0.04316239356994629, + "memory(GiB)": 17.38, + "step": 2525, + "token_acc": 0.9710743801652892, + "train_speed(iter/s)": 0.20837 + }, + { + "epoch": 0.454320987654321, + "grad_norm": 0.7172059416770935, + "learning_rate": 6.158055757822241e-06, + "loss": 0.04646519124507904, + "memory(GiB)": 17.38, + "step": 2530, + "token_acc": 0.975103734439834, + "train_speed(iter/s)": 0.208457 + }, + { + "epoch": 0.45521885521885525, + "grad_norm": 0.4345340132713318, + "learning_rate": 6.143604803582799e-06, + "loss": 0.051696008443832396, + "memory(GiB)": 17.38, + "step": 2535, + "token_acc": 0.9789029535864979, + "train_speed(iter/s)": 0.208541 + }, + { + "epoch": 0.45611672278338944, + "grad_norm": 1.0206499099731445, + "learning_rate": 6.129143762198172e-06, + "loss": 0.032700496912002566, + "memory(GiB)": 17.38, + "step": 2540, + "token_acc": 0.9829787234042553, + "train_speed(iter/s)": 0.208624 + }, + { + "epoch": 0.4570145903479237, + "grad_norm": 1.1213144063949585, + "learning_rate": 6.1146727612217116e-06, + "loss": 0.04223711490631103, + "memory(GiB)": 17.38, + "step": 2545, + "token_acc": 0.9769874476987448, + "train_speed(iter/s)": 0.208706 + }, + { + "epoch": 0.45791245791245794, + "grad_norm": 0.8084053993225098, + "learning_rate": 6.100191928294614e-06, + "loss": 0.033938392996788025, + "memory(GiB)": 17.38, + "step": 2550, + "token_acc": 0.977366255144033, + "train_speed(iter/s)": 0.208788 + }, + { + "epoch": 0.45881032547699213, + "grad_norm": 0.7507816553115845, + "learning_rate": 6.085701391144802e-06, + "loss": 0.03840683698654175, + "memory(GiB)": 17.38, + "step": 2555, + "token_acc": 0.9857142857142858, + "train_speed(iter/s)": 0.208867 + }, + { + "epoch": 0.4597081930415264, + "grad_norm": 1.502113699913025, + "learning_rate": 6.071201277585792e-06, + "loss": 0.03366389274597168, + "memory(GiB)": 17.38, + "step": 2560, + "token_acc": 0.9857142857142858, + "train_speed(iter/s)": 0.208948 + }, + { + "epoch": 0.46060606060606063, + "grad_norm": 1.3176274299621582, + "learning_rate": 6.056691715515569e-06, + "loss": 0.043088209629058835, + "memory(GiB)": 17.38, + "step": 2565, + "token_acc": 0.9736842105263158, + "train_speed(iter/s)": 0.209031 + }, + { + "epoch": 0.4615039281705948, + "grad_norm": 1.0873850584030151, + "learning_rate": 6.042172832915461e-06, + "loss": 0.046823954582214354, + "memory(GiB)": 17.38, + "step": 2570, + "token_acc": 0.9835390946502057, + "train_speed(iter/s)": 0.209109 + }, + { + "epoch": 0.4624017957351291, + "grad_norm": 2.3708643913269043, + "learning_rate": 6.027644757849004e-06, + "loss": 0.04343923330307007, + "memory(GiB)": 17.38, + "step": 2575, + "token_acc": 0.977366255144033, + "train_speed(iter/s)": 0.209182 + }, + { + "epoch": 0.4632996632996633, + "grad_norm": 0.4276960790157318, + "learning_rate": 6.013107618460818e-06, + "loss": 0.038988494873046876, + "memory(GiB)": 17.38, + "step": 2580, + "token_acc": 0.9708333333333333, + "train_speed(iter/s)": 0.209259 + }, + { + "epoch": 0.4641975308641975, + "grad_norm": 0.33236658573150635, + "learning_rate": 5.998561542975472e-06, + "loss": 0.04181536436080933, + "memory(GiB)": 17.38, + "step": 2585, + "token_acc": 0.9767932489451476, + "train_speed(iter/s)": 0.209338 + }, + { + "epoch": 0.46509539842873177, + "grad_norm": 0.27041783928871155, + "learning_rate": 5.984006659696362e-06, + "loss": 0.03674737811088562, + "memory(GiB)": 17.38, + "step": 2590, + "token_acc": 0.9702127659574468, + "train_speed(iter/s)": 0.209421 + }, + { + "epoch": 0.465993265993266, + "grad_norm": 0.4284606873989105, + "learning_rate": 5.9694430970045654e-06, + "loss": 0.03759410083293915, + "memory(GiB)": 17.38, + "step": 2595, + "token_acc": 0.9795081967213115, + "train_speed(iter/s)": 0.209501 + }, + { + "epoch": 0.4668911335578002, + "grad_norm": 1.4704172611236572, + "learning_rate": 5.954870983357722e-06, + "loss": 0.03459267616271973, + "memory(GiB)": 17.38, + "step": 2600, + "token_acc": 0.98125, + "train_speed(iter/s)": 0.209583 + }, + { + "epoch": 0.4668911335578002, + "eval_loss": 0.03899447247385979, + "eval_runtime": 59.6345, + "eval_samples_per_second": 15.092, + "eval_steps_per_second": 7.546, + "eval_token_acc": 0.9798110761252083, + "step": 2600 + }, + { + "epoch": 0.46778900112233446, + "grad_norm": 1.949906587600708, + "learning_rate": 5.940290447288891e-06, + "loss": 0.03979158103466034, + "memory(GiB)": 17.38, + "step": 2605, + "token_acc": 0.9799794661190965, + "train_speed(iter/s)": 0.208514 + }, + { + "epoch": 0.4686868686868687, + "grad_norm": 0.7221953868865967, + "learning_rate": 5.925701617405425e-06, + "loss": 0.03356524109840393, + "memory(GiB)": 17.38, + "step": 2610, + "token_acc": 0.9848484848484849, + "train_speed(iter/s)": 0.208598 + }, + { + "epoch": 0.4695847362514029, + "grad_norm": 0.926184356212616, + "learning_rate": 5.91110462238783e-06, + "loss": 0.04912663102149963, + "memory(GiB)": 17.38, + "step": 2615, + "token_acc": 0.9741379310344828, + "train_speed(iter/s)": 0.208681 + }, + { + "epoch": 0.47048260381593715, + "grad_norm": 0.5160276293754578, + "learning_rate": 5.896499590988631e-06, + "loss": 0.029301244020462035, + "memory(GiB)": 17.38, + "step": 2620, + "token_acc": 0.9915254237288136, + "train_speed(iter/s)": 0.208763 + }, + { + "epoch": 0.4713804713804714, + "grad_norm": 0.584625780582428, + "learning_rate": 5.881886652031241e-06, + "loss": 0.049494200944900514, + "memory(GiB)": 17.38, + "step": 2625, + "token_acc": 0.9734693877551021, + "train_speed(iter/s)": 0.208843 + }, + { + "epoch": 0.4722783389450056, + "grad_norm": 0.37072888016700745, + "learning_rate": 5.867265934408819e-06, + "loss": 0.038386750221252444, + "memory(GiB)": 17.38, + "step": 2630, + "token_acc": 0.9725738396624473, + "train_speed(iter/s)": 0.208924 + }, + { + "epoch": 0.47317620650953984, + "grad_norm": 0.8629380464553833, + "learning_rate": 5.852637567083137e-06, + "loss": 0.04185566306114197, + "memory(GiB)": 17.38, + "step": 2635, + "token_acc": 0.9753086419753086, + "train_speed(iter/s)": 0.209003 + }, + { + "epoch": 0.4740740740740741, + "grad_norm": 1.4146498441696167, + "learning_rate": 5.838001679083442e-06, + "loss": 0.045223063230514525, + "memory(GiB)": 17.38, + "step": 2640, + "token_acc": 0.9775510204081632, + "train_speed(iter/s)": 0.209085 + }, + { + "epoch": 0.4749719416386083, + "grad_norm": 1.115925669670105, + "learning_rate": 5.823358399505312e-06, + "loss": 0.04004613161087036, + "memory(GiB)": 17.38, + "step": 2645, + "token_acc": 0.9787234042553191, + "train_speed(iter/s)": 0.209167 + }, + { + "epoch": 0.47586980920314254, + "grad_norm": 0.6182511448860168, + "learning_rate": 5.808707857509529e-06, + "loss": 0.03453727960586548, + "memory(GiB)": 17.38, + "step": 2650, + "token_acc": 0.9875518672199171, + "train_speed(iter/s)": 0.209248 + }, + { + "epoch": 0.4767676767676768, + "grad_norm": 1.0996288061141968, + "learning_rate": 5.79405018232093e-06, + "loss": 0.0355169951915741, + "memory(GiB)": 17.38, + "step": 2655, + "token_acc": 0.9849785407725322, + "train_speed(iter/s)": 0.209331 + }, + { + "epoch": 0.477665544332211, + "grad_norm": 1.1120131015777588, + "learning_rate": 5.779385503227266e-06, + "loss": 0.045806384086608885, + "memory(GiB)": 17.38, + "step": 2660, + "token_acc": 0.9793388429752066, + "train_speed(iter/s)": 0.209412 + }, + { + "epoch": 0.4785634118967452, + "grad_norm": 0.9793822169303894, + "learning_rate": 5.764713949578074e-06, + "loss": 0.03301399350166321, + "memory(GiB)": 17.38, + "step": 2665, + "token_acc": 0.9770833333333333, + "train_speed(iter/s)": 0.20949 + }, + { + "epoch": 0.4794612794612795, + "grad_norm": 0.8665223121643066, + "learning_rate": 5.750035650783523e-06, + "loss": 0.04569063782691955, + "memory(GiB)": 17.38, + "step": 2670, + "token_acc": 0.975103734439834, + "train_speed(iter/s)": 0.209567 + }, + { + "epoch": 0.48035914702581367, + "grad_norm": 1.3416739702224731, + "learning_rate": 5.735350736313275e-06, + "loss": 0.031667521595954894, + "memory(GiB)": 17.38, + "step": 2675, + "token_acc": 0.9838709677419355, + "train_speed(iter/s)": 0.209647 + }, + { + "epoch": 0.4812570145903479, + "grad_norm": 0.5596939325332642, + "learning_rate": 5.7206593356953544e-06, + "loss": 0.03234308958053589, + "memory(GiB)": 17.38, + "step": 2680, + "token_acc": 0.9853556485355649, + "train_speed(iter/s)": 0.209726 + }, + { + "epoch": 0.48215488215488217, + "grad_norm": 0.7293614149093628, + "learning_rate": 5.705961578514986e-06, + "loss": 0.031192868947982788, + "memory(GiB)": 17.38, + "step": 2685, + "token_acc": 0.9831223628691983, + "train_speed(iter/s)": 0.209806 + }, + { + "epoch": 0.48305274971941636, + "grad_norm": 1.0724992752075195, + "learning_rate": 5.691257594413468e-06, + "loss": 0.03985053300857544, + "memory(GiB)": 17.38, + "step": 2690, + "token_acc": 0.9817813765182186, + "train_speed(iter/s)": 0.209885 + }, + { + "epoch": 0.4839506172839506, + "grad_norm": 1.1715577840805054, + "learning_rate": 5.676547513087024e-06, + "loss": 0.0457848995923996, + "memory(GiB)": 17.38, + "step": 2695, + "token_acc": 0.9770833333333333, + "train_speed(iter/s)": 0.209964 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 0.4473322629928589, + "learning_rate": 5.6618314642856554e-06, + "loss": 0.04222930371761322, + "memory(GiB)": 17.38, + "step": 2700, + "token_acc": 0.9767932489451476, + "train_speed(iter/s)": 0.210042 + }, + { + "epoch": 0.48484848484848486, + "eval_loss": 0.04132239520549774, + "eval_runtime": 59.4395, + "eval_samples_per_second": 15.141, + "eval_steps_per_second": 7.571, + "eval_token_acc": 0.9754584182255973, + "step": 2700 + }, + { + "epoch": 0.48574635241301906, + "grad_norm": 0.33762404322624207, + "learning_rate": 5.647109577812001e-06, + "loss": 0.039106807112693785, + "memory(GiB)": 17.38, + "step": 2705, + "token_acc": 0.9773972602739726, + "train_speed(iter/s)": 0.209018 + }, + { + "epoch": 0.4866442199775533, + "grad_norm": 0.3893556594848633, + "learning_rate": 5.63238198352019e-06, + "loss": 0.04239853024482727, + "memory(GiB)": 17.38, + "step": 2710, + "token_acc": 0.9757085020242915, + "train_speed(iter/s)": 0.209098 + }, + { + "epoch": 0.48754208754208755, + "grad_norm": 0.47717538475990295, + "learning_rate": 5.617648811314697e-06, + "loss": 0.04036793410778046, + "memory(GiB)": 17.38, + "step": 2715, + "token_acc": 0.9743589743589743, + "train_speed(iter/s)": 0.209178 + }, + { + "epoch": 0.48843995510662175, + "grad_norm": 0.3555564880371094, + "learning_rate": 5.602910191149198e-06, + "loss": 0.03995971083641052, + "memory(GiB)": 17.38, + "step": 2720, + "token_acc": 0.9814814814814815, + "train_speed(iter/s)": 0.209256 + }, + { + "epoch": 0.489337822671156, + "grad_norm": 0.3762608766555786, + "learning_rate": 5.588166253025421e-06, + "loss": 0.04338297247886658, + "memory(GiB)": 17.38, + "step": 2725, + "token_acc": 0.9620253164556962, + "train_speed(iter/s)": 0.209334 + }, + { + "epoch": 0.49023569023569025, + "grad_norm": 0.3586757183074951, + "learning_rate": 5.573417126992004e-06, + "loss": 0.034859451651573184, + "memory(GiB)": 17.38, + "step": 2730, + "token_acc": 0.9755102040816327, + "train_speed(iter/s)": 0.209412 + }, + { + "epoch": 0.49113355780022444, + "grad_norm": 0.7197408676147461, + "learning_rate": 5.558662943143338e-06, + "loss": 0.034445399045944215, + "memory(GiB)": 17.38, + "step": 2735, + "token_acc": 0.9789029535864979, + "train_speed(iter/s)": 0.209487 + }, + { + "epoch": 0.4920314253647587, + "grad_norm": 1.078397274017334, + "learning_rate": 5.5439038316184344e-06, + "loss": 0.03632235527038574, + "memory(GiB)": 17.38, + "step": 2740, + "token_acc": 0.9892703862660944, + "train_speed(iter/s)": 0.209563 + }, + { + "epoch": 0.49292929292929294, + "grad_norm": 0.4836512804031372, + "learning_rate": 5.529139922599765e-06, + "loss": 0.03440414667129517, + "memory(GiB)": 17.38, + "step": 2745, + "token_acc": 0.9816326530612245, + "train_speed(iter/s)": 0.20964 + }, + { + "epoch": 0.49382716049382713, + "grad_norm": 12.24158000946045, + "learning_rate": 5.514371346312118e-06, + "loss": 0.048089760541915896, + "memory(GiB)": 17.38, + "step": 2750, + "token_acc": 0.977366255144033, + "train_speed(iter/s)": 0.209717 + }, + { + "epoch": 0.4947250280583614, + "grad_norm": 0.3620738685131073, + "learning_rate": 5.499598233021451e-06, + "loss": 0.031771937012672426, + "memory(GiB)": 17.38, + "step": 2755, + "token_acc": 0.9853556485355649, + "train_speed(iter/s)": 0.209794 + }, + { + "epoch": 0.49562289562289563, + "grad_norm": 0.7418166995048523, + "learning_rate": 5.484820713033736e-06, + "loss": 0.03597655296325684, + "memory(GiB)": 17.38, + "step": 2760, + "token_acc": 0.976, + "train_speed(iter/s)": 0.20987 + }, + { + "epoch": 0.4965207631874299, + "grad_norm": 1.3017184734344482, + "learning_rate": 5.4700389166938185e-06, + "loss": 0.04800904393196106, + "memory(GiB)": 17.38, + "step": 2765, + "token_acc": 0.9586776859504132, + "train_speed(iter/s)": 0.209945 + }, + { + "epoch": 0.4974186307519641, + "grad_norm": 1.3031936883926392, + "learning_rate": 5.455252974384261e-06, + "loss": 0.03634591996669769, + "memory(GiB)": 17.38, + "step": 2770, + "token_acc": 0.9811715481171548, + "train_speed(iter/s)": 0.210019 + }, + { + "epoch": 0.4983164983164983, + "grad_norm": 0.5656425356864929, + "learning_rate": 5.440463016524198e-06, + "loss": 0.04363555312156677, + "memory(GiB)": 17.38, + "step": 2775, + "token_acc": 0.9708333333333333, + "train_speed(iter/s)": 0.210095 + }, + { + "epoch": 0.4992143658810326, + "grad_norm": 0.9590857028961182, + "learning_rate": 5.425669173568179e-06, + "loss": 0.03879062831401825, + "memory(GiB)": 17.38, + "step": 2780, + "token_acc": 0.981404958677686, + "train_speed(iter/s)": 0.210172 + }, + { + "epoch": 0.5001122334455668, + "grad_norm": 0.5146193504333496, + "learning_rate": 5.410871576005027e-06, + "loss": 0.03509630560874939, + "memory(GiB)": 17.38, + "step": 2785, + "token_acc": 0.9836734693877551, + "train_speed(iter/s)": 0.210249 + }, + { + "epoch": 0.501010101010101, + "grad_norm": 0.7044083476066589, + "learning_rate": 5.396070354356678e-06, + "loss": 0.03525749444961548, + "memory(GiB)": 17.38, + "step": 2790, + "token_acc": 0.9765957446808511, + "train_speed(iter/s)": 0.210324 + }, + { + "epoch": 0.5019079685746353, + "grad_norm": 1.4625669717788696, + "learning_rate": 5.381265639177035e-06, + "loss": 0.029810887575149537, + "memory(GiB)": 17.38, + "step": 2795, + "token_acc": 0.9771784232365145, + "train_speed(iter/s)": 0.2104 + }, + { + "epoch": 0.5028058361391695, + "grad_norm": 0.7489975094795227, + "learning_rate": 5.366457561050819e-06, + "loss": 0.03495717644691467, + "memory(GiB)": 17.38, + "step": 2800, + "token_acc": 0.9801762114537445, + "train_speed(iter/s)": 0.210476 + }, + { + "epoch": 0.5028058361391695, + "eval_loss": 0.036955345422029495, + "eval_runtime": 59.2315, + "eval_samples_per_second": 15.195, + "eval_steps_per_second": 7.597, + "eval_token_acc": 0.980274124837933, + "step": 2800 + }, + { + "epoch": 0.5037037037037037, + "grad_norm": 2.799170732498169, + "learning_rate": 5.35164625059241e-06, + "loss": 0.038314640522003174, + "memory(GiB)": 17.38, + "step": 2805, + "token_acc": 0.9827291381668947, + "train_speed(iter/s)": 0.20948 + }, + { + "epoch": 0.5046015712682379, + "grad_norm": 0.6724898219108582, + "learning_rate": 5.336831838444701e-06, + "loss": 0.045146864652633664, + "memory(GiB)": 17.38, + "step": 2810, + "token_acc": 0.9658634538152611, + "train_speed(iter/s)": 0.209556 + }, + { + "epoch": 0.5054994388327722, + "grad_norm": 0.3283923864364624, + "learning_rate": 5.322014455277942e-06, + "loss": 0.03479876220226288, + "memory(GiB)": 17.38, + "step": 2815, + "token_acc": 0.983402489626556, + "train_speed(iter/s)": 0.209631 + }, + { + "epoch": 0.5063973063973064, + "grad_norm": 0.25968095660209656, + "learning_rate": 5.307194231788587e-06, + "loss": 0.03254989385604858, + "memory(GiB)": 17.38, + "step": 2820, + "token_acc": 0.9827586206896551, + "train_speed(iter/s)": 0.209706 + }, + { + "epoch": 0.5072951739618407, + "grad_norm": 0.5001474618911743, + "learning_rate": 5.292371298698149e-06, + "loss": 0.04186424314975738, + "memory(GiB)": 17.38, + "step": 2825, + "token_acc": 0.9772727272727273, + "train_speed(iter/s)": 0.209782 + }, + { + "epoch": 0.5081930415263749, + "grad_norm": 0.5631856322288513, + "learning_rate": 5.277545786752037e-06, + "loss": 0.04390389323234558, + "memory(GiB)": 17.38, + "step": 2830, + "token_acc": 0.9816326530612245, + "train_speed(iter/s)": 0.209857 + }, + { + "epoch": 0.509090909090909, + "grad_norm": 1.3655097484588623, + "learning_rate": 5.262717826718406e-06, + "loss": 0.03938818573951721, + "memory(GiB)": 17.38, + "step": 2835, + "token_acc": 0.9713114754098361, + "train_speed(iter/s)": 0.20993 + }, + { + "epoch": 0.5099887766554433, + "grad_norm": 0.6644555926322937, + "learning_rate": 5.247887549387003e-06, + "loss": 0.04027451872825623, + "memory(GiB)": 17.38, + "step": 2840, + "token_acc": 0.9764957264957265, + "train_speed(iter/s)": 0.210005 + }, + { + "epoch": 0.5108866442199775, + "grad_norm": 0.4988654851913452, + "learning_rate": 5.2330550855680215e-06, + "loss": 0.03327443599700928, + "memory(GiB)": 17.38, + "step": 2845, + "token_acc": 0.9790794979079498, + "train_speed(iter/s)": 0.21008 + }, + { + "epoch": 0.5117845117845118, + "grad_norm": 1.1400690078735352, + "learning_rate": 5.218220566090934e-06, + "loss": 0.0381145179271698, + "memory(GiB)": 17.38, + "step": 2850, + "token_acc": 0.9781659388646288, + "train_speed(iter/s)": 0.210152 + }, + { + "epoch": 0.512682379349046, + "grad_norm": 0.723612368106842, + "learning_rate": 5.203384121803347e-06, + "loss": 0.05521612167358399, + "memory(GiB)": 17.38, + "step": 2855, + "token_acc": 0.9776422764227642, + "train_speed(iter/s)": 0.210226 + }, + { + "epoch": 0.5135802469135803, + "grad_norm": 0.5082634091377258, + "learning_rate": 5.188545883569844e-06, + "loss": 0.04100078642368317, + "memory(GiB)": 17.38, + "step": 2860, + "token_acc": 0.9813278008298755, + "train_speed(iter/s)": 0.210301 + }, + { + "epoch": 0.5144781144781144, + "grad_norm": 1.1570026874542236, + "learning_rate": 5.173705982270837e-06, + "loss": 0.03345622718334198, + "memory(GiB)": 17.38, + "step": 2865, + "token_acc": 0.9827586206896551, + "train_speed(iter/s)": 0.21037 + }, + { + "epoch": 0.5153759820426487, + "grad_norm": 1.248317003250122, + "learning_rate": 5.158864548801401e-06, + "loss": 0.03739322423934936, + "memory(GiB)": 17.38, + "step": 2870, + "token_acc": 0.9807692307692307, + "train_speed(iter/s)": 0.210443 + }, + { + "epoch": 0.5162738496071829, + "grad_norm": 1.8555207252502441, + "learning_rate": 5.1440217140701275e-06, + "loss": 0.0388792097568512, + "memory(GiB)": 17.38, + "step": 2875, + "token_acc": 0.9798387096774194, + "train_speed(iter/s)": 0.210516 + }, + { + "epoch": 0.5171717171717172, + "grad_norm": 0.44100362062454224, + "learning_rate": 5.129177608997968e-06, + "loss": 0.03348960280418396, + "memory(GiB)": 17.38, + "step": 2880, + "token_acc": 0.981404958677686, + "train_speed(iter/s)": 0.210589 + }, + { + "epoch": 0.5180695847362514, + "grad_norm": 0.9781392812728882, + "learning_rate": 5.1143323645170784e-06, + "loss": 0.04089007675647736, + "memory(GiB)": 17.38, + "step": 2885, + "token_acc": 0.979253112033195, + "train_speed(iter/s)": 0.21066 + }, + { + "epoch": 0.5189674523007857, + "grad_norm": 0.632527232170105, + "learning_rate": 5.0994861115696646e-06, + "loss": 0.04224309921264648, + "memory(GiB)": 17.38, + "step": 2890, + "token_acc": 0.9731404958677686, + "train_speed(iter/s)": 0.210732 + }, + { + "epoch": 0.5198653198653199, + "grad_norm": 0.47493231296539307, + "learning_rate": 5.084638981106831e-06, + "loss": 0.04239720106124878, + "memory(GiB)": 17.38, + "step": 2895, + "token_acc": 0.970954356846473, + "train_speed(iter/s)": 0.210804 + }, + { + "epoch": 0.5207631874298541, + "grad_norm": 0.43635794520378113, + "learning_rate": 5.069791104087417e-06, + "loss": 0.02833125591278076, + "memory(GiB)": 17.38, + "step": 2900, + "token_acc": 0.9833333333333333, + "train_speed(iter/s)": 0.210878 + }, + { + "epoch": 0.5207631874298541, + "eval_loss": 0.03555697947740555, + "eval_runtime": 60.1363, + "eval_samples_per_second": 14.966, + "eval_steps_per_second": 7.483, + "eval_token_acc": 0.9805519540655677, + "step": 2900 + }, + { + "epoch": 0.5216610549943883, + "grad_norm": 0.9657431244850159, + "learning_rate": 5.054942611476849e-06, + "loss": 0.031498870253562926, + "memory(GiB)": 17.38, + "step": 2905, + "token_acc": 0.9810191518467852, + "train_speed(iter/s)": 0.209905 + }, + { + "epoch": 0.5225589225589226, + "grad_norm": 0.6496365070343018, + "learning_rate": 5.0400936342459875e-06, + "loss": 0.03288198709487915, + "memory(GiB)": 17.38, + "step": 2910, + "token_acc": 0.98559670781893, + "train_speed(iter/s)": 0.209978 + }, + { + "epoch": 0.5234567901234568, + "grad_norm": 1.7576866149902344, + "learning_rate": 5.025244303369959e-06, + "loss": 0.03778642416000366, + "memory(GiB)": 17.38, + "step": 2915, + "token_acc": 0.9787234042553191, + "train_speed(iter/s)": 0.210052 + }, + { + "epoch": 0.524354657687991, + "grad_norm": 1.7906994819641113, + "learning_rate": 5.010394749827016e-06, + "loss": 0.03218943178653717, + "memory(GiB)": 17.38, + "step": 2920, + "token_acc": 0.9873417721518988, + "train_speed(iter/s)": 0.210124 + }, + { + "epoch": 0.5252525252525253, + "grad_norm": 0.3959713280200958, + "learning_rate": 4.995545104597372e-06, + "loss": 0.02817581295967102, + "memory(GiB)": 17.38, + "step": 2925, + "token_acc": 0.9772727272727273, + "train_speed(iter/s)": 0.210196 + }, + { + "epoch": 0.5261503928170594, + "grad_norm": 1.376173734664917, + "learning_rate": 4.980695498662051e-06, + "loss": 0.03809363543987274, + "memory(GiB)": 17.38, + "step": 2930, + "token_acc": 0.9777327935222672, + "train_speed(iter/s)": 0.210267 + }, + { + "epoch": 0.5270482603815937, + "grad_norm": 1.1541138887405396, + "learning_rate": 4.96584606300173e-06, + "loss": 0.034730464220047, + "memory(GiB)": 17.38, + "step": 2935, + "token_acc": 0.9771784232365145, + "train_speed(iter/s)": 0.210338 + }, + { + "epoch": 0.5279461279461279, + "grad_norm": 0.5953764915466309, + "learning_rate": 4.9509969285955865e-06, + "loss": 0.031148940324783325, + "memory(GiB)": 17.38, + "step": 2940, + "token_acc": 0.9833333333333333, + "train_speed(iter/s)": 0.21041 + }, + { + "epoch": 0.5288439955106622, + "grad_norm": 1.2990620136260986, + "learning_rate": 4.936148226420133e-06, + "loss": 0.03804878890514374, + "memory(GiB)": 17.38, + "step": 2945, + "token_acc": 0.9833333333333333, + "train_speed(iter/s)": 0.210482 + }, + { + "epoch": 0.5297418630751964, + "grad_norm": 0.6265944838523865, + "learning_rate": 4.921300087448076e-06, + "loss": 0.029886150360107423, + "memory(GiB)": 17.38, + "step": 2950, + "token_acc": 0.981404958677686, + "train_speed(iter/s)": 0.210555 + }, + { + "epoch": 0.5306397306397307, + "grad_norm": 0.5224615335464478, + "learning_rate": 4.906452642647154e-06, + "loss": 0.04189828634262085, + "memory(GiB)": 17.38, + "step": 2955, + "token_acc": 0.9808510638297873, + "train_speed(iter/s)": 0.210625 + }, + { + "epoch": 0.5315375982042648, + "grad_norm": 0.4404231309890747, + "learning_rate": 4.89160602297898e-06, + "loss": 0.0386346310377121, + "memory(GiB)": 17.38, + "step": 2960, + "token_acc": 0.9811715481171548, + "train_speed(iter/s)": 0.210697 + }, + { + "epoch": 0.5324354657687991, + "grad_norm": 1.133245825767517, + "learning_rate": 4.8767603593978915e-06, + "loss": 0.03626513779163361, + "memory(GiB)": 17.38, + "step": 2965, + "token_acc": 0.9789029535864979, + "train_speed(iter/s)": 0.210767 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.5238416194915771, + "learning_rate": 4.861915782849794e-06, + "loss": 0.03348979651927948, + "memory(GiB)": 17.38, + "step": 2970, + "token_acc": 0.9854771784232366, + "train_speed(iter/s)": 0.210839 + }, + { + "epoch": 0.5342312008978676, + "grad_norm": 0.5781230926513672, + "learning_rate": 4.847072424270998e-06, + "loss": 0.04255947172641754, + "memory(GiB)": 17.38, + "step": 2975, + "token_acc": 0.9788135593220338, + "train_speed(iter/s)": 0.21091 + }, + { + "epoch": 0.5351290684624018, + "grad_norm": 1.1504417657852173, + "learning_rate": 4.8322304145870765e-06, + "loss": 0.03373274207115173, + "memory(GiB)": 17.38, + "step": 2980, + "token_acc": 0.9877551020408163, + "train_speed(iter/s)": 0.21098 + }, + { + "epoch": 0.5360269360269361, + "grad_norm": 0.7020504474639893, + "learning_rate": 4.817389884711706e-06, + "loss": 0.03762509822845459, + "memory(GiB)": 17.38, + "step": 2985, + "token_acc": 0.9788135593220338, + "train_speed(iter/s)": 0.211047 + }, + { + "epoch": 0.5369248035914702, + "grad_norm": 0.5911434292793274, + "learning_rate": 4.8025509655455065e-06, + "loss": 0.030811995267868042, + "memory(GiB)": 17.38, + "step": 2990, + "token_acc": 0.9829787234042553, + "train_speed(iter/s)": 0.211116 + }, + { + "epoch": 0.5378226711560045, + "grad_norm": 0.4149399697780609, + "learning_rate": 4.7877137879748935e-06, + "loss": 0.02650405764579773, + "memory(GiB)": 17.38, + "step": 2995, + "token_acc": 0.991701244813278, + "train_speed(iter/s)": 0.211186 + }, + { + "epoch": 0.5387205387205387, + "grad_norm": 1.455705165863037, + "learning_rate": 4.772878482870916e-06, + "loss": 0.03337220251560211, + "memory(GiB)": 17.38, + "step": 3000, + "token_acc": 0.983739837398374, + "train_speed(iter/s)": 0.211254 + }, + { + "epoch": 0.5387205387205387, + "eval_loss": 0.03923524543642998, + "eval_runtime": 59.5008, + "eval_samples_per_second": 15.126, + "eval_steps_per_second": 7.563, + "eval_token_acc": 0.9787923689572143, + "step": 3000 + }, + { + "epoch": 0.539618406285073, + "grad_norm": 0.8941336274147034, + "learning_rate": 4.758045181088112e-06, + "loss": 0.03463448286056518, + "memory(GiB)": 17.38, + "step": 3005, + "token_acc": 0.9793303723949436, + "train_speed(iter/s)": 0.210319 + }, + { + "epoch": 0.5405162738496072, + "grad_norm": 1.6975080966949463, + "learning_rate": 4.7432140134633495e-06, + "loss": 0.0446954607963562, + "memory(GiB)": 17.38, + "step": 3010, + "token_acc": 0.966804979253112, + "train_speed(iter/s)": 0.21039 + }, + { + "epoch": 0.5414141414141415, + "grad_norm": 0.5025302171707153, + "learning_rate": 4.7283851108146686e-06, + "loss": 0.037160784006118774, + "memory(GiB)": 17.38, + "step": 3015, + "token_acc": 0.98125, + "train_speed(iter/s)": 0.210462 + }, + { + "epoch": 0.5423120089786756, + "grad_norm": 0.8865044116973877, + "learning_rate": 4.7135586039401345e-06, + "loss": 0.037977713346481326, + "memory(GiB)": 17.38, + "step": 3020, + "token_acc": 0.9799196787148594, + "train_speed(iter/s)": 0.210533 + }, + { + "epoch": 0.5432098765432098, + "grad_norm": 0.617268979549408, + "learning_rate": 4.69873462361668e-06, + "loss": 0.035759395360946654, + "memory(GiB)": 17.38, + "step": 3025, + "token_acc": 0.9789029535864979, + "train_speed(iter/s)": 0.210602 + }, + { + "epoch": 0.5441077441077441, + "grad_norm": 0.6682493090629578, + "learning_rate": 4.683913300598947e-06, + "loss": 0.03950292766094208, + "memory(GiB)": 17.38, + "step": 3030, + "token_acc": 0.9710743801652892, + "train_speed(iter/s)": 0.210671 + }, + { + "epoch": 0.5450056116722783, + "grad_norm": 0.9522141814231873, + "learning_rate": 4.6690947656181455e-06, + "loss": 0.03798787593841553, + "memory(GiB)": 17.38, + "step": 3035, + "token_acc": 0.9745762711864406, + "train_speed(iter/s)": 0.21074 + }, + { + "epoch": 0.5459034792368126, + "grad_norm": 0.7488031387329102, + "learning_rate": 4.654279149380892e-06, + "loss": 0.04434499740600586, + "memory(GiB)": 17.38, + "step": 3040, + "token_acc": 0.9789915966386554, + "train_speed(iter/s)": 0.210808 + }, + { + "epoch": 0.5468013468013468, + "grad_norm": 0.6428148746490479, + "learning_rate": 4.639466582568058e-06, + "loss": 0.03335587680339813, + "memory(GiB)": 17.38, + "step": 3045, + "token_acc": 0.9877049180327869, + "train_speed(iter/s)": 0.210877 + }, + { + "epoch": 0.547699214365881, + "grad_norm": 0.494208961725235, + "learning_rate": 4.624657195833618e-06, + "loss": 0.03471240997314453, + "memory(GiB)": 17.38, + "step": 3050, + "token_acc": 0.9733606557377049, + "train_speed(iter/s)": 0.210946 + }, + { + "epoch": 0.5485970819304152, + "grad_norm": 0.5969663262367249, + "learning_rate": 4.609851119803494e-06, + "loss": 0.03831402063369751, + "memory(GiB)": 17.38, + "step": 3055, + "token_acc": 0.9754098360655737, + "train_speed(iter/s)": 0.211016 + }, + { + "epoch": 0.5494949494949495, + "grad_norm": 1.2252219915390015, + "learning_rate": 4.595048485074406e-06, + "loss": 0.04612976610660553, + "memory(GiB)": 17.38, + "step": 3060, + "token_acc": 0.977366255144033, + "train_speed(iter/s)": 0.211085 + }, + { + "epoch": 0.5503928170594837, + "grad_norm": 0.47977834939956665, + "learning_rate": 4.580249422212726e-06, + "loss": 0.03429934680461884, + "memory(GiB)": 17.38, + "step": 3065, + "token_acc": 0.9815573770491803, + "train_speed(iter/s)": 0.211153 + }, + { + "epoch": 0.551290684624018, + "grad_norm": 1.0602229833602905, + "learning_rate": 4.5654540617533145e-06, + "loss": 0.03894851505756378, + "memory(GiB)": 17.38, + "step": 3070, + "token_acc": 0.9730290456431535, + "train_speed(iter/s)": 0.211222 + }, + { + "epoch": 0.5521885521885522, + "grad_norm": 0.3010494112968445, + "learning_rate": 4.55066253419838e-06, + "loss": 0.029984426498413087, + "memory(GiB)": 17.38, + "step": 3075, + "token_acc": 0.9831223628691983, + "train_speed(iter/s)": 0.211291 + }, + { + "epoch": 0.5530864197530864, + "grad_norm": 0.9391909241676331, + "learning_rate": 4.535874970016315e-06, + "loss": 0.030388161540031433, + "memory(GiB)": 17.38, + "step": 3080, + "token_acc": 0.9893617021276596, + "train_speed(iter/s)": 0.211361 + }, + { + "epoch": 0.5539842873176206, + "grad_norm": 0.5286198854446411, + "learning_rate": 4.52109149964056e-06, + "loss": 0.03878920972347259, + "memory(GiB)": 17.38, + "step": 3085, + "token_acc": 0.9875518672199171, + "train_speed(iter/s)": 0.21143 + }, + { + "epoch": 0.5548821548821549, + "grad_norm": 2.599327325820923, + "learning_rate": 4.506312253468445e-06, + "loss": 0.0375044047832489, + "memory(GiB)": 17.38, + "step": 3090, + "token_acc": 0.98125, + "train_speed(iter/s)": 0.211499 + }, + { + "epoch": 0.5557800224466891, + "grad_norm": 0.8950568437576294, + "learning_rate": 4.491537361860038e-06, + "loss": 0.024225854873657228, + "memory(GiB)": 17.38, + "step": 3095, + "token_acc": 0.9873417721518988, + "train_speed(iter/s)": 0.211566 + }, + { + "epoch": 0.5566778900112234, + "grad_norm": 0.7307533621788025, + "learning_rate": 4.4767669551370005e-06, + "loss": 0.043083444237709045, + "memory(GiB)": 17.38, + "step": 3100, + "token_acc": 0.979253112033195, + "train_speed(iter/s)": 0.211632 + }, + { + "epoch": 0.5566778900112234, + "eval_loss": 0.03592285141348839, + "eval_runtime": 59.4654, + "eval_samples_per_second": 15.135, + "eval_steps_per_second": 7.567, + "eval_token_acc": 0.9822189294313762, + "step": 3100 + }, + { + "epoch": 0.5575757575757576, + "grad_norm": 0.8084725737571716, + "learning_rate": 4.4620011635814355e-06, + "loss": 0.03740071654319763, + "memory(GiB)": 17.38, + "step": 3105, + "token_acc": 0.9822465005121201, + "train_speed(iter/s)": 0.210717 + }, + { + "epoch": 0.5584736251402918, + "grad_norm": 0.7775259017944336, + "learning_rate": 4.447240117434733e-06, + "loss": 0.03527735471725464, + "memory(GiB)": 17.38, + "step": 3110, + "token_acc": 0.98125, + "train_speed(iter/s)": 0.210785 + }, + { + "epoch": 0.559371492704826, + "grad_norm": 1.0231081247329712, + "learning_rate": 4.432483946896432e-06, + "loss": 0.024633996188640594, + "memory(GiB)": 17.38, + "step": 3115, + "token_acc": 0.9854166666666667, + "train_speed(iter/s)": 0.210852 + }, + { + "epoch": 0.5602693602693603, + "grad_norm": 0.7937242984771729, + "learning_rate": 4.4177327821230635e-06, + "loss": 0.02747095227241516, + "memory(GiB)": 17.38, + "step": 3120, + "token_acc": 0.9915966386554622, + "train_speed(iter/s)": 0.21092 + }, + { + "epoch": 0.5611672278338945, + "grad_norm": 0.6463097333908081, + "learning_rate": 4.4029867532270046e-06, + "loss": 0.0343576043844223, + "memory(GiB)": 17.38, + "step": 3125, + "token_acc": 0.9897119341563786, + "train_speed(iter/s)": 0.210986 + }, + { + "epoch": 0.5620650953984287, + "grad_norm": 0.6784676313400269, + "learning_rate": 4.388245990275337e-06, + "loss": 0.041317546367645265, + "memory(GiB)": 17.38, + "step": 3130, + "token_acc": 0.979253112033195, + "train_speed(iter/s)": 0.211053 + }, + { + "epoch": 0.562962962962963, + "grad_norm": 1.1222407817840576, + "learning_rate": 4.373510623288684e-06, + "loss": 0.0380826860666275, + "memory(GiB)": 17.38, + "step": 3135, + "token_acc": 0.9733606557377049, + "train_speed(iter/s)": 0.211119 + }, + { + "epoch": 0.5638608305274972, + "grad_norm": 1.0174131393432617, + "learning_rate": 4.358780782240082e-06, + "loss": 0.05013369917869568, + "memory(GiB)": 17.38, + "step": 3140, + "token_acc": 0.9830508474576272, + "train_speed(iter/s)": 0.211185 + }, + { + "epoch": 0.5647586980920314, + "grad_norm": 0.9281760454177856, + "learning_rate": 4.3440565970538235e-06, + "loss": 0.043619048595428464, + "memory(GiB)": 17.38, + "step": 3145, + "token_acc": 0.9715447154471545, + "train_speed(iter/s)": 0.211251 + }, + { + "epoch": 0.5656565656565656, + "grad_norm": 0.8259347081184387, + "learning_rate": 4.3293381976043146e-06, + "loss": 0.04000493586063385, + "memory(GiB)": 17.38, + "step": 3150, + "token_acc": 0.9779116465863453, + "train_speed(iter/s)": 0.211315 + }, + { + "epoch": 0.5665544332210999, + "grad_norm": 0.6491321921348572, + "learning_rate": 4.3146257137149286e-06, + "loss": 0.03730204701423645, + "memory(GiB)": 17.38, + "step": 3155, + "token_acc": 0.98125, + "train_speed(iter/s)": 0.211378 + }, + { + "epoch": 0.5674523007856341, + "grad_norm": 0.3386561870574951, + "learning_rate": 4.299919275156857e-06, + "loss": 0.029863068461418153, + "memory(GiB)": 17.38, + "step": 3160, + "token_acc": 0.9853556485355649, + "train_speed(iter/s)": 0.211444 + }, + { + "epoch": 0.5683501683501684, + "grad_norm": 0.31830787658691406, + "learning_rate": 4.2852190116479706e-06, + "loss": 0.03299294114112854, + "memory(GiB)": 17.38, + "step": 3165, + "token_acc": 0.9891774891774892, + "train_speed(iter/s)": 0.21151 + }, + { + "epoch": 0.5692480359147026, + "grad_norm": 1.0402253866195679, + "learning_rate": 4.270525052851677e-06, + "loss": 0.030310297012329103, + "memory(GiB)": 17.38, + "step": 3170, + "token_acc": 0.9787234042553191, + "train_speed(iter/s)": 0.211574 + }, + { + "epoch": 0.5701459034792368, + "grad_norm": 1.4358203411102295, + "learning_rate": 4.255837528375768e-06, + "loss": 0.04616828262805939, + "memory(GiB)": 17.38, + "step": 3175, + "token_acc": 0.9852941176470589, + "train_speed(iter/s)": 0.211641 + }, + { + "epoch": 0.571043771043771, + "grad_norm": 1.466363549232483, + "learning_rate": 4.241156567771285e-06, + "loss": 0.050893133878707884, + "memory(GiB)": 17.38, + "step": 3180, + "token_acc": 0.9734693877551021, + "train_speed(iter/s)": 0.211705 + }, + { + "epoch": 0.5719416386083053, + "grad_norm": 0.5825255513191223, + "learning_rate": 4.226482300531372e-06, + "loss": 0.028993701934814452, + "memory(GiB)": 17.38, + "step": 3185, + "token_acc": 0.98125, + "train_speed(iter/s)": 0.21177 + }, + { + "epoch": 0.5728395061728395, + "grad_norm": 0.410525918006897, + "learning_rate": 4.2118148560901325e-06, + "loss": 0.03415879905223847, + "memory(GiB)": 17.38, + "step": 3190, + "token_acc": 0.9788135593220338, + "train_speed(iter/s)": 0.211836 + }, + { + "epoch": 0.5737373737373738, + "grad_norm": 1.2232425212860107, + "learning_rate": 4.1971543638214915e-06, + "loss": 0.039345848560333255, + "memory(GiB)": 17.38, + "step": 3195, + "token_acc": 0.9836734693877551, + "train_speed(iter/s)": 0.2119 + }, + { + "epoch": 0.574635241301908, + "grad_norm": 0.9477332234382629, + "learning_rate": 4.182500953038052e-06, + "loss": 0.04104136824607849, + "memory(GiB)": 17.38, + "step": 3200, + "token_acc": 0.9736842105263158, + "train_speed(iter/s)": 0.211964 + }, + { + "epoch": 0.574635241301908, + "eval_loss": 0.03428991511464119, + "eval_runtime": 59.5163, + "eval_samples_per_second": 15.122, + "eval_steps_per_second": 7.561, + "eval_token_acc": 0.9826819781441007, + "step": 3200 + }, + { + "epoch": 0.5755331088664422, + "grad_norm": 0.9450451135635376, + "learning_rate": 4.167854752989954e-06, + "loss": 0.03343685567378998, + "memory(GiB)": 17.38, + "step": 3205, + "token_acc": 0.9844497607655502, + "train_speed(iter/s)": 0.211073 + }, + { + "epoch": 0.5764309764309764, + "grad_norm": 0.4707873463630676, + "learning_rate": 4.15321589286374e-06, + "loss": 0.03108985722064972, + "memory(GiB)": 17.38, + "step": 3210, + "token_acc": 0.9872881355932204, + "train_speed(iter/s)": 0.211138 + }, + { + "epoch": 0.5773288439955107, + "grad_norm": 0.5734744668006897, + "learning_rate": 4.138584501781203e-06, + "loss": 0.03519664406776428, + "memory(GiB)": 17.38, + "step": 3215, + "token_acc": 0.97165991902834, + "train_speed(iter/s)": 0.211202 + }, + { + "epoch": 0.5782267115600449, + "grad_norm": 0.8017730712890625, + "learning_rate": 4.12396070879826e-06, + "loss": 0.035449111461639406, + "memory(GiB)": 17.38, + "step": 3220, + "token_acc": 0.98125, + "train_speed(iter/s)": 0.211266 + }, + { + "epoch": 0.5791245791245792, + "grad_norm": 1.0020838975906372, + "learning_rate": 4.10934464290381e-06, + "loss": 0.04336810111999512, + "memory(GiB)": 17.38, + "step": 3225, + "token_acc": 0.9818548387096774, + "train_speed(iter/s)": 0.211331 + }, + { + "epoch": 0.5800224466891134, + "grad_norm": 0.6461151242256165, + "learning_rate": 4.0947364330185935e-06, + "loss": 0.029209265112876893, + "memory(GiB)": 17.38, + "step": 3230, + "token_acc": 0.9810924369747899, + "train_speed(iter/s)": 0.211394 + }, + { + "epoch": 0.5809203142536475, + "grad_norm": 0.5174492001533508, + "learning_rate": 4.080136207994058e-06, + "loss": 0.028296151757240297, + "memory(GiB)": 17.38, + "step": 3235, + "token_acc": 0.9916317991631799, + "train_speed(iter/s)": 0.211458 + }, + { + "epoch": 0.5818181818181818, + "grad_norm": 0.8750156760215759, + "learning_rate": 4.065544096611222e-06, + "loss": 0.03008807897567749, + "memory(GiB)": 17.38, + "step": 3240, + "token_acc": 0.981404958677686, + "train_speed(iter/s)": 0.211523 + }, + { + "epoch": 0.582716049382716, + "grad_norm": 1.040645956993103, + "learning_rate": 4.050960227579532e-06, + "loss": 0.03442961871623993, + "memory(GiB)": 17.38, + "step": 3245, + "token_acc": 0.9857723577235772, + "train_speed(iter/s)": 0.211586 + }, + { + "epoch": 0.5836139169472503, + "grad_norm": 0.648860514163971, + "learning_rate": 4.036384729535741e-06, + "loss": 0.03954000473022461, + "memory(GiB)": 17.38, + "step": 3250, + "token_acc": 0.9831223628691983, + "train_speed(iter/s)": 0.21165 + }, + { + "epoch": 0.5845117845117845, + "grad_norm": 1.4912347793579102, + "learning_rate": 4.02181773104276e-06, + "loss": 0.04096723198890686, + "memory(GiB)": 17.38, + "step": 3255, + "token_acc": 0.9847161572052402, + "train_speed(iter/s)": 0.211713 + }, + { + "epoch": 0.5854096520763188, + "grad_norm": 1.6548627614974976, + "learning_rate": 4.007259360588533e-06, + "loss": 0.03256728053092957, + "memory(GiB)": 17.38, + "step": 3260, + "token_acc": 0.9853556485355649, + "train_speed(iter/s)": 0.211776 + }, + { + "epoch": 0.5863075196408529, + "grad_norm": 0.7206517457962036, + "learning_rate": 3.992709746584901e-06, + "loss": 0.04086422920227051, + "memory(GiB)": 17.38, + "step": 3265, + "token_acc": 0.9686192468619247, + "train_speed(iter/s)": 0.211838 + }, + { + "epoch": 0.5872053872053872, + "grad_norm": 1.822160243988037, + "learning_rate": 3.978169017366462e-06, + "loss": 0.034605208039283755, + "memory(GiB)": 17.38, + "step": 3270, + "token_acc": 0.9788135593220338, + "train_speed(iter/s)": 0.2119 + }, + { + "epoch": 0.5881032547699214, + "grad_norm": 0.3073776662349701, + "learning_rate": 3.963637301189456e-06, + "loss": 0.029543408751487733, + "memory(GiB)": 17.38, + "step": 3275, + "token_acc": 0.9872881355932204, + "train_speed(iter/s)": 0.211964 + }, + { + "epoch": 0.5890011223344557, + "grad_norm": 1.9872992038726807, + "learning_rate": 3.949114726230618e-06, + "loss": 0.03377828598022461, + "memory(GiB)": 17.38, + "step": 3280, + "token_acc": 0.9838709677419355, + "train_speed(iter/s)": 0.212026 + }, + { + "epoch": 0.5898989898989899, + "grad_norm": 1.9097858667373657, + "learning_rate": 3.934601420586052e-06, + "loss": 0.04517065286636353, + "memory(GiB)": 17.38, + "step": 3285, + "token_acc": 0.9693877551020408, + "train_speed(iter/s)": 0.21209 + }, + { + "epoch": 0.5907968574635242, + "grad_norm": 2.1709792613983154, + "learning_rate": 3.920097512270108e-06, + "loss": 0.0337420254945755, + "memory(GiB)": 17.38, + "step": 3290, + "token_acc": 0.9855371900826446, + "train_speed(iter/s)": 0.212154 + }, + { + "epoch": 0.5916947250280583, + "grad_norm": 1.076111078262329, + "learning_rate": 3.9056031292142385e-06, + "loss": 0.03165815472602844, + "memory(GiB)": 17.38, + "step": 3295, + "token_acc": 0.9816326530612245, + "train_speed(iter/s)": 0.212216 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 0.8962927460670471, + "learning_rate": 3.8911183992658864e-06, + "loss": 0.03720520138740539, + "memory(GiB)": 17.38, + "step": 3300, + "token_acc": 0.9788135593220338, + "train_speed(iter/s)": 0.212277 + }, + { + "epoch": 0.5925925925925926, + "eval_loss": 0.032386187463998795, + "eval_runtime": 59.6033, + "eval_samples_per_second": 15.1, + "eval_steps_per_second": 7.55, + "eval_token_acc": 0.983515465827005, + "step": 3300 + }, + { + "epoch": 0.5934904601571268, + "grad_norm": 0.7111935019493103, + "learning_rate": 3.876643450187344e-06, + "loss": 0.03466266691684723, + "memory(GiB)": 17.38, + "step": 3305, + "token_acc": 0.9830537487161931, + "train_speed(iter/s)": 0.211405 + }, + { + "epoch": 0.5943883277216611, + "grad_norm": 0.5044178366661072, + "learning_rate": 3.862178409654637e-06, + "loss": 0.029964250326156617, + "memory(GiB)": 17.38, + "step": 3310, + "token_acc": 0.9872340425531915, + "train_speed(iter/s)": 0.211467 + }, + { + "epoch": 0.5952861952861953, + "grad_norm": 0.7595324516296387, + "learning_rate": 3.847723405256388e-06, + "loss": 0.0389423668384552, + "memory(GiB)": 17.38, + "step": 3315, + "token_acc": 0.9763948497854077, + "train_speed(iter/s)": 0.211529 + }, + { + "epoch": 0.5961840628507296, + "grad_norm": 0.5055687427520752, + "learning_rate": 3.8332785644927e-06, + "loss": 0.03235511183738708, + "memory(GiB)": 17.38, + "step": 3320, + "token_acc": 0.9767932489451476, + "train_speed(iter/s)": 0.211589 + }, + { + "epoch": 0.5970819304152637, + "grad_norm": 0.6402701735496521, + "learning_rate": 3.818844014774023e-06, + "loss": 0.030745026469230653, + "memory(GiB)": 17.38, + "step": 3325, + "token_acc": 0.9877049180327869, + "train_speed(iter/s)": 0.211652 + }, + { + "epoch": 0.597979797979798, + "grad_norm": 1.2706646919250488, + "learning_rate": 3.804419883420039e-06, + "loss": 0.03447930216789245, + "memory(GiB)": 17.38, + "step": 3330, + "token_acc": 0.98046875, + "train_speed(iter/s)": 0.211714 + }, + { + "epoch": 0.5988776655443322, + "grad_norm": 0.7180752158164978, + "learning_rate": 3.790006297658535e-06, + "loss": 0.031576281785964964, + "memory(GiB)": 17.38, + "step": 3335, + "token_acc": 0.9772727272727273, + "train_speed(iter/s)": 0.211778 + }, + { + "epoch": 0.5997755331088664, + "grad_norm": 0.5540750026702881, + "learning_rate": 3.775603384624278e-06, + "loss": 0.032175496220588684, + "memory(GiB)": 17.38, + "step": 3340, + "token_acc": 0.9813278008298755, + "train_speed(iter/s)": 0.211837 + }, + { + "epoch": 0.6006734006734007, + "grad_norm": 1.2299844026565552, + "learning_rate": 3.7612112713579006e-06, + "loss": 0.036039122939109804, + "memory(GiB)": 17.38, + "step": 3345, + "token_acc": 0.9829787234042553, + "train_speed(iter/s)": 0.211898 + }, + { + "epoch": 0.6015712682379349, + "grad_norm": 0.7803512215614319, + "learning_rate": 3.7468300848047712e-06, + "loss": 0.0281095415353775, + "memory(GiB)": 17.38, + "step": 3350, + "token_acc": 0.9892703862660944, + "train_speed(iter/s)": 0.211959 + }, + { + "epoch": 0.6024691358024692, + "grad_norm": 0.9968711733818054, + "learning_rate": 3.7324599518138826e-06, + "loss": 0.028966400027275085, + "memory(GiB)": 17.38, + "step": 3355, + "token_acc": 0.9859437751004017, + "train_speed(iter/s)": 0.21202 + }, + { + "epoch": 0.6033670033670033, + "grad_norm": 1.4379856586456299, + "learning_rate": 3.718100999136731e-06, + "loss": 0.034907829761505124, + "memory(GiB)": 17.38, + "step": 3360, + "token_acc": 0.9795918367346939, + "train_speed(iter/s)": 0.212079 + }, + { + "epoch": 0.6042648709315376, + "grad_norm": 1.5501691102981567, + "learning_rate": 3.7037533534261928e-06, + "loss": 0.05221737027168274, + "memory(GiB)": 17.38, + "step": 3365, + "token_acc": 0.9754098360655737, + "train_speed(iter/s)": 0.212143 + }, + { + "epoch": 0.6051627384960718, + "grad_norm": 0.7675436735153198, + "learning_rate": 3.689417141235414e-06, + "loss": 0.03333545327186584, + "memory(GiB)": 17.38, + "step": 3370, + "token_acc": 0.9855371900826446, + "train_speed(iter/s)": 0.212204 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.609959065914154, + "learning_rate": 3.675092489016693e-06, + "loss": 0.031475108861923215, + "memory(GiB)": 17.38, + "step": 3375, + "token_acc": 0.9873949579831933, + "train_speed(iter/s)": 0.212265 + }, + { + "epoch": 0.6069584736251403, + "grad_norm": 0.8781528472900391, + "learning_rate": 3.6607795231203584e-06, + "loss": 0.032167309522628786, + "memory(GiB)": 17.38, + "step": 3380, + "token_acc": 0.9853556485355649, + "train_speed(iter/s)": 0.212326 + }, + { + "epoch": 0.6078563411896746, + "grad_norm": 0.4406980276107788, + "learning_rate": 3.6464783697936656e-06, + "loss": 0.031094563007354737, + "memory(GiB)": 17.38, + "step": 3385, + "token_acc": 0.9815573770491803, + "train_speed(iter/s)": 0.212387 + }, + { + "epoch": 0.6087542087542087, + "grad_norm": 0.8502365946769714, + "learning_rate": 3.6321891551796755e-06, + "loss": 0.037125617265701294, + "memory(GiB)": 17.38, + "step": 3390, + "token_acc": 0.9790794979079498, + "train_speed(iter/s)": 0.212448 + }, + { + "epoch": 0.609652076318743, + "grad_norm": 1.190455436706543, + "learning_rate": 3.617912005316142e-06, + "loss": 0.04733949601650238, + "memory(GiB)": 17.38, + "step": 3395, + "token_acc": 0.978448275862069, + "train_speed(iter/s)": 0.212509 + }, + { + "epoch": 0.6105499438832772, + "grad_norm": 1.0460447072982788, + "learning_rate": 3.6036470461344062e-06, + "loss": 0.03917270302772522, + "memory(GiB)": 17.38, + "step": 3400, + "token_acc": 0.9729166666666667, + "train_speed(iter/s)": 0.212571 + }, + { + "epoch": 0.6105499438832772, + "eval_loss": 0.0334014892578125, + "eval_runtime": 59.5346, + "eval_samples_per_second": 15.117, + "eval_steps_per_second": 7.559, + "eval_token_acc": 0.9831450268568254, + "step": 3400 + }, + { + "epoch": 0.6114478114478115, + "grad_norm": 0.5554693937301636, + "learning_rate": 3.5893944034582758e-06, + "loss": 0.033488065004348755, + "memory(GiB)": 17.38, + "step": 3405, + "token_acc": 0.9852890865549093, + "train_speed(iter/s)": 0.211729 + }, + { + "epoch": 0.6123456790123457, + "grad_norm": 0.6041802763938904, + "learning_rate": 3.575154203002924e-06, + "loss": 0.02629697322845459, + "memory(GiB)": 17.38, + "step": 3410, + "token_acc": 0.9876543209876543, + "train_speed(iter/s)": 0.211788 + }, + { + "epoch": 0.61324354657688, + "grad_norm": 0.6161660552024841, + "learning_rate": 3.5609265703737784e-06, + "loss": 0.03911909461021423, + "memory(GiB)": 17.38, + "step": 3415, + "token_acc": 0.9728033472803347, + "train_speed(iter/s)": 0.211849 + }, + { + "epoch": 0.6141414141414141, + "grad_norm": 2.65439772605896, + "learning_rate": 3.5467116310654115e-06, + "loss": 0.03089310824871063, + "memory(GiB)": 17.38, + "step": 3420, + "token_acc": 0.9915611814345991, + "train_speed(iter/s)": 0.211909 + }, + { + "epoch": 0.6150392817059483, + "grad_norm": 0.6883147954940796, + "learning_rate": 3.5325095104604374e-06, + "loss": 0.031556525826454164, + "memory(GiB)": 17.38, + "step": 3425, + "token_acc": 0.9872340425531915, + "train_speed(iter/s)": 0.211969 + }, + { + "epoch": 0.6159371492704826, + "grad_norm": 0.4769245982170105, + "learning_rate": 3.5183203338283955e-06, + "loss": 0.025775331258773803, + "memory(GiB)": 17.38, + "step": 3430, + "token_acc": 0.987603305785124, + "train_speed(iter/s)": 0.212028 + }, + { + "epoch": 0.6168350168350168, + "grad_norm": 3.377089500427246, + "learning_rate": 3.50414422632466e-06, + "loss": 0.03226297497749329, + "memory(GiB)": 17.38, + "step": 3435, + "token_acc": 0.9857723577235772, + "train_speed(iter/s)": 0.212088 + }, + { + "epoch": 0.6177328843995511, + "grad_norm": 0.757219672203064, + "learning_rate": 3.489981312989327e-06, + "loss": 0.02656465768814087, + "memory(GiB)": 17.38, + "step": 3440, + "token_acc": 0.9848484848484849, + "train_speed(iter/s)": 0.212149 + }, + { + "epoch": 0.6186307519640853, + "grad_norm": 1.3921219110488892, + "learning_rate": 3.475831718746114e-06, + "loss": 0.032429558038711545, + "memory(GiB)": 17.38, + "step": 3445, + "token_acc": 0.979253112033195, + "train_speed(iter/s)": 0.212208 + }, + { + "epoch": 0.6195286195286195, + "grad_norm": 0.8434154987335205, + "learning_rate": 3.4616955684012567e-06, + "loss": 0.03241931200027466, + "memory(GiB)": 17.38, + "step": 3450, + "token_acc": 0.9753086419753086, + "train_speed(iter/s)": 0.212267 + }, + { + "epoch": 0.6204264870931537, + "grad_norm": 0.9752683043479919, + "learning_rate": 3.4475729866424125e-06, + "loss": 0.030986031889915465, + "memory(GiB)": 17.38, + "step": 3455, + "token_acc": 0.9852320675105485, + "train_speed(iter/s)": 0.212327 + }, + { + "epoch": 0.621324354657688, + "grad_norm": 0.6854255795478821, + "learning_rate": 3.4334640980375505e-06, + "loss": 0.03165216445922851, + "memory(GiB)": 17.38, + "step": 3460, + "token_acc": 0.9817073170731707, + "train_speed(iter/s)": 0.212388 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 0.815204381942749, + "learning_rate": 3.4193690270338655e-06, + "loss": 0.02716798484325409, + "memory(GiB)": 17.38, + "step": 3465, + "token_acc": 0.9877049180327869, + "train_speed(iter/s)": 0.212449 + }, + { + "epoch": 0.6231200897867565, + "grad_norm": 1.1259046792984009, + "learning_rate": 3.4052878979566727e-06, + "loss": 0.03220363259315491, + "memory(GiB)": 17.38, + "step": 3470, + "token_acc": 0.9854166666666667, + "train_speed(iter/s)": 0.212507 + }, + { + "epoch": 0.6240179573512907, + "grad_norm": 0.7726863622665405, + "learning_rate": 3.3912208350083133e-06, + "loss": 0.03851915895938873, + "memory(GiB)": 17.38, + "step": 3475, + "token_acc": 0.9836734693877551, + "train_speed(iter/s)": 0.212566 + }, + { + "epoch": 0.6249158249158249, + "grad_norm": 0.6608043313026428, + "learning_rate": 3.3771679622670586e-06, + "loss": 0.02346276491880417, + "memory(GiB)": 17.38, + "step": 3480, + "token_acc": 0.9895833333333334, + "train_speed(iter/s)": 0.212626 + }, + { + "epoch": 0.6258136924803591, + "grad_norm": 1.8276649713516235, + "learning_rate": 3.3631294036860116e-06, + "loss": 0.04756525456905365, + "memory(GiB)": 17.38, + "step": 3485, + "token_acc": 0.975103734439834, + "train_speed(iter/s)": 0.212684 + }, + { + "epoch": 0.6267115600448934, + "grad_norm": 1.8423209190368652, + "learning_rate": 3.3491052830920234e-06, + "loss": 0.04078681468963623, + "memory(GiB)": 17.38, + "step": 3490, + "token_acc": 0.9782608695652174, + "train_speed(iter/s)": 0.212743 + }, + { + "epoch": 0.6276094276094276, + "grad_norm": 0.8537706732749939, + "learning_rate": 3.335095724184591e-06, + "loss": 0.03249537944793701, + "memory(GiB)": 17.38, + "step": 3495, + "token_acc": 0.9897959183673469, + "train_speed(iter/s)": 0.212803 + }, + { + "epoch": 0.6285072951739619, + "grad_norm": 1.1131573915481567, + "learning_rate": 3.321100850534774e-06, + "loss": 0.03443157970905304, + "memory(GiB)": 17.38, + "step": 3500, + "token_acc": 0.9767932489451476, + "train_speed(iter/s)": 0.212862 + }, + { + "epoch": 0.6285072951739619, + "eval_loss": 0.0318644680082798, + "eval_runtime": 59.2426, + "eval_samples_per_second": 15.192, + "eval_steps_per_second": 7.596, + "eval_token_acc": 0.9834228560844601, + "step": 3500 + }, + { + "epoch": 0.6294051627384961, + "grad_norm": 0.9648253917694092, + "learning_rate": 3.3071207855840957e-06, + "loss": 0.041373956203460696, + "memory(GiB)": 17.38, + "step": 3505, + "token_acc": 0.9849418206707734, + "train_speed(iter/s)": 0.212043 + }, + { + "epoch": 0.6303030303030303, + "grad_norm": 0.4351541996002197, + "learning_rate": 3.293155652643464e-06, + "loss": 0.03619065284729004, + "memory(GiB)": 17.38, + "step": 3510, + "token_acc": 0.9832635983263598, + "train_speed(iter/s)": 0.212103 + }, + { + "epoch": 0.6312008978675645, + "grad_norm": 0.6683384776115417, + "learning_rate": 3.279205574892077e-06, + "loss": 0.04454144239425659, + "memory(GiB)": 17.38, + "step": 3515, + "token_acc": 0.9742063492063492, + "train_speed(iter/s)": 0.212162 + }, + { + "epoch": 0.6320987654320988, + "grad_norm": 0.6023651361465454, + "learning_rate": 3.2652706753763396e-06, + "loss": 0.034640201926231386, + "memory(GiB)": 17.38, + "step": 3520, + "token_acc": 0.9831932773109243, + "train_speed(iter/s)": 0.212218 + }, + { + "epoch": 0.632996632996633, + "grad_norm": 0.663701057434082, + "learning_rate": 3.2513510770087776e-06, + "loss": 0.03685192465782165, + "memory(GiB)": 17.38, + "step": 3525, + "token_acc": 0.9813278008298755, + "train_speed(iter/s)": 0.212274 + }, + { + "epoch": 0.6338945005611673, + "grad_norm": 0.8185016512870789, + "learning_rate": 3.2374469025669533e-06, + "loss": 0.03878421187400818, + "memory(GiB)": 17.38, + "step": 3530, + "token_acc": 0.9746835443037974, + "train_speed(iter/s)": 0.212332 + }, + { + "epoch": 0.6347923681257015, + "grad_norm": 0.9945972561836243, + "learning_rate": 3.2235582746923833e-06, + "loss": 0.03368375301361084, + "memory(GiB)": 17.38, + "step": 3535, + "token_acc": 0.9833333333333333, + "train_speed(iter/s)": 0.21239 + }, + { + "epoch": 0.6356902356902356, + "grad_norm": 0.7573075890541077, + "learning_rate": 3.2096853158894512e-06, + "loss": 0.03204033672809601, + "memory(GiB)": 17.38, + "step": 3540, + "token_acc": 0.989451476793249, + "train_speed(iter/s)": 0.212449 + }, + { + "epoch": 0.6365881032547699, + "grad_norm": 0.9557067155838013, + "learning_rate": 3.195828148524338e-06, + "loss": 0.03392746746540069, + "memory(GiB)": 17.38, + "step": 3545, + "token_acc": 0.9896265560165975, + "train_speed(iter/s)": 0.212507 + }, + { + "epoch": 0.6374859708193041, + "grad_norm": 0.6412433981895447, + "learning_rate": 3.181986894823935e-06, + "loss": 0.039027339220046996, + "memory(GiB)": 17.38, + "step": 3550, + "token_acc": 0.9767932489451476, + "train_speed(iter/s)": 0.212564 + }, + { + "epoch": 0.6383838383838384, + "grad_norm": 0.5774496793746948, + "learning_rate": 3.1681616768747647e-06, + "loss": 0.037918204069137575, + "memory(GiB)": 17.38, + "step": 3555, + "token_acc": 0.9918032786885246, + "train_speed(iter/s)": 0.212621 + }, + { + "epoch": 0.6392817059483726, + "grad_norm": 0.7825069427490234, + "learning_rate": 3.1543526166219105e-06, + "loss": 0.030402606725692748, + "memory(GiB)": 17.38, + "step": 3560, + "token_acc": 0.9831223628691983, + "train_speed(iter/s)": 0.212679 + }, + { + "epoch": 0.6401795735129069, + "grad_norm": 1.773136019706726, + "learning_rate": 3.1405598358679328e-06, + "loss": 0.034509477019309995, + "memory(GiB)": 17.38, + "step": 3565, + "token_acc": 0.9769874476987448, + "train_speed(iter/s)": 0.212737 + }, + { + "epoch": 0.641077441077441, + "grad_norm": 0.9002499580383301, + "learning_rate": 3.1267834562718014e-06, + "loss": 0.026789945363998414, + "memory(GiB)": 17.38, + "step": 3570, + "token_acc": 0.9893162393162394, + "train_speed(iter/s)": 0.212795 + }, + { + "epoch": 0.6419753086419753, + "grad_norm": 1.5610922574996948, + "learning_rate": 3.11302359934782e-06, + "loss": 0.03017086088657379, + "memory(GiB)": 17.38, + "step": 3575, + "token_acc": 0.9831223628691983, + "train_speed(iter/s)": 0.212851 + }, + { + "epoch": 0.6428731762065095, + "grad_norm": 0.8957364559173584, + "learning_rate": 3.0992803864645543e-06, + "loss": 0.037738239765167235, + "memory(GiB)": 17.38, + "step": 3580, + "token_acc": 0.9877551020408163, + "train_speed(iter/s)": 0.212907 + }, + { + "epoch": 0.6437710437710438, + "grad_norm": 0.5978476405143738, + "learning_rate": 3.085553938843762e-06, + "loss": 0.03536413311958313, + "memory(GiB)": 17.38, + "step": 3585, + "token_acc": 0.9789915966386554, + "train_speed(iter/s)": 0.212963 + }, + { + "epoch": 0.644668911335578, + "grad_norm": 0.379538893699646, + "learning_rate": 3.0718443775593233e-06, + "loss": 0.022173571586608886, + "memory(GiB)": 17.38, + "step": 3590, + "token_acc": 0.9915611814345991, + "train_speed(iter/s)": 0.213019 + }, + { + "epoch": 0.6455667789001123, + "grad_norm": 1.6848992109298706, + "learning_rate": 3.0581518235361685e-06, + "loss": 0.03449346423149109, + "memory(GiB)": 17.38, + "step": 3595, + "token_acc": 0.979757085020243, + "train_speed(iter/s)": 0.213076 + }, + { + "epoch": 0.6464646464646465, + "grad_norm": 1.024051547050476, + "learning_rate": 3.044476397549221e-06, + "loss": 0.034063678979873654, + "memory(GiB)": 17.38, + "step": 3600, + "token_acc": 0.9809322033898306, + "train_speed(iter/s)": 0.213131 + }, + { + "epoch": 0.6464646464646465, + "eval_loss": 0.031629472970962524, + "eval_runtime": 59.5535, + "eval_samples_per_second": 15.112, + "eval_steps_per_second": 7.556, + "eval_token_acc": 0.9849972217077236, + "step": 3600 + }, + { + "epoch": 0.6473625140291807, + "grad_norm": 1.997093915939331, + "learning_rate": 3.030818220222325e-06, + "loss": 0.03469839692115784, + "memory(GiB)": 17.38, + "step": 3605, + "token_acc": 0.9847654912701129, + "train_speed(iter/s)": 0.212335 + }, + { + "epoch": 0.6482603815937149, + "grad_norm": 3.3116769790649414, + "learning_rate": 3.0171774120271825e-06, + "loss": 0.03560910820960998, + "memory(GiB)": 17.38, + "step": 3610, + "token_acc": 0.9731404958677686, + "train_speed(iter/s)": 0.212393 + }, + { + "epoch": 0.6491582491582492, + "grad_norm": 0.943142294883728, + "learning_rate": 3.003554093282294e-06, + "loss": 0.03705155849456787, + "memory(GiB)": 17.38, + "step": 3615, + "token_acc": 0.9817813765182186, + "train_speed(iter/s)": 0.21245 + }, + { + "epoch": 0.6500561167227834, + "grad_norm": 1.5499387979507446, + "learning_rate": 2.9899483841518884e-06, + "loss": 0.02866685390472412, + "memory(GiB)": 17.38, + "step": 3620, + "token_acc": 0.9895397489539749, + "train_speed(iter/s)": 0.212506 + }, + { + "epoch": 0.6509539842873177, + "grad_norm": 0.9049564003944397, + "learning_rate": 2.976360404644877e-06, + "loss": 0.025727853178977966, + "memory(GiB)": 17.38, + "step": 3625, + "token_acc": 0.9916317991631799, + "train_speed(iter/s)": 0.212559 + }, + { + "epoch": 0.6518518518518519, + "grad_norm": 1.9606379270553589, + "learning_rate": 2.9627902746137816e-06, + "loss": 0.03221776783466339, + "memory(GiB)": 17.38, + "step": 3630, + "token_acc": 0.9894957983193278, + "train_speed(iter/s)": 0.212614 + }, + { + "epoch": 0.652749719416386, + "grad_norm": 0.9224356412887573, + "learning_rate": 2.9492381137536863e-06, + "loss": 0.027673569321632386, + "memory(GiB)": 17.38, + "step": 3635, + "token_acc": 0.9835390946502057, + "train_speed(iter/s)": 0.212671 + }, + { + "epoch": 0.6536475869809203, + "grad_norm": 1.3812013864517212, + "learning_rate": 2.9357040416011785e-06, + "loss": 0.03246560394763946, + "memory(GiB)": 17.38, + "step": 3640, + "token_acc": 0.9894957983193278, + "train_speed(iter/s)": 0.212727 + }, + { + "epoch": 0.6545454545454545, + "grad_norm": 1.1157524585723877, + "learning_rate": 2.9221881775332906e-06, + "loss": 0.03707281947135925, + "memory(GiB)": 17.38, + "step": 3645, + "token_acc": 0.9787234042553191, + "train_speed(iter/s)": 0.212783 + }, + { + "epoch": 0.6554433221099888, + "grad_norm": 0.9900431632995605, + "learning_rate": 2.908690640766454e-06, + "loss": 0.025664320588111876, + "memory(GiB)": 17.38, + "step": 3650, + "token_acc": 0.9854771784232366, + "train_speed(iter/s)": 0.212839 + }, + { + "epoch": 0.656341189674523, + "grad_norm": 1.030234694480896, + "learning_rate": 2.8952115503554455e-06, + "loss": 0.031166458129882814, + "memory(GiB)": 17.38, + "step": 3655, + "token_acc": 0.989406779661017, + "train_speed(iter/s)": 0.212894 + }, + { + "epoch": 0.6572390572390573, + "grad_norm": 0.7934268712997437, + "learning_rate": 2.8817510251923326e-06, + "loss": 0.029421770572662355, + "memory(GiB)": 17.38, + "step": 3660, + "token_acc": 0.9872881355932204, + "train_speed(iter/s)": 0.21295 + }, + { + "epoch": 0.6581369248035914, + "grad_norm": 1.025406002998352, + "learning_rate": 2.8683091840054333e-06, + "loss": 0.04097101986408234, + "memory(GiB)": 17.38, + "step": 3665, + "token_acc": 0.9875, + "train_speed(iter/s)": 0.213004 + }, + { + "epoch": 0.6590347923681257, + "grad_norm": 1.4878090620040894, + "learning_rate": 2.8548861453582606e-06, + "loss": 0.036505895853042605, + "memory(GiB)": 17.38, + "step": 3670, + "token_acc": 0.9794238683127572, + "train_speed(iter/s)": 0.213058 + }, + { + "epoch": 0.6599326599326599, + "grad_norm": 0.8448243737220764, + "learning_rate": 2.8414820276484765e-06, + "loss": 0.03731580376625061, + "memory(GiB)": 17.38, + "step": 3675, + "token_acc": 0.9894957983193278, + "train_speed(iter/s)": 0.213113 + }, + { + "epoch": 0.6608305274971942, + "grad_norm": 0.518051028251648, + "learning_rate": 2.828096949106862e-06, + "loss": 0.038469964265823366, + "memory(GiB)": 17.38, + "step": 3680, + "token_acc": 0.9829787234042553, + "train_speed(iter/s)": 0.213168 + }, + { + "epoch": 0.6617283950617284, + "grad_norm": 1.07236647605896, + "learning_rate": 2.8147310277962524e-06, + "loss": 0.03246802389621735, + "memory(GiB)": 17.38, + "step": 3685, + "token_acc": 0.9853556485355649, + "train_speed(iter/s)": 0.213222 + }, + { + "epoch": 0.6626262626262627, + "grad_norm": 0.9013633131980896, + "learning_rate": 2.80138438161051e-06, + "loss": 0.02626926898956299, + "memory(GiB)": 17.38, + "step": 3690, + "token_acc": 0.9894957983193278, + "train_speed(iter/s)": 0.213278 + }, + { + "epoch": 0.6635241301907968, + "grad_norm": 0.9525784254074097, + "learning_rate": 2.788057128273487e-06, + "loss": 0.033724480867385866, + "memory(GiB)": 17.38, + "step": 3695, + "token_acc": 0.9830508474576272, + "train_speed(iter/s)": 0.213333 + }, + { + "epoch": 0.6644219977553311, + "grad_norm": 0.3779265284538269, + "learning_rate": 2.774749385337975e-06, + "loss": 0.03583263158798218, + "memory(GiB)": 17.38, + "step": 3700, + "token_acc": 0.9876543209876543, + "train_speed(iter/s)": 0.213389 + }, + { + "epoch": 0.6644219977553311, + "eval_loss": 0.031091248616576195, + "eval_runtime": 59.2925, + "eval_samples_per_second": 15.179, + "eval_steps_per_second": 7.589, + "eval_token_acc": 0.983515465827005, + "step": 3700 + }, + { + "epoch": 0.6653198653198653, + "grad_norm": 0.5474767684936523, + "learning_rate": 2.761461270184673e-06, + "loss": 0.0344696044921875, + "memory(GiB)": 17.38, + "step": 3705, + "token_acc": 0.9836345039209001, + "train_speed(iter/s)": 0.212619 + }, + { + "epoch": 0.6662177328843996, + "grad_norm": 1.8591980934143066, + "learning_rate": 2.7481929000211626e-06, + "loss": 0.030002924799919128, + "memory(GiB)": 17.38, + "step": 3710, + "token_acc": 0.9838709677419355, + "train_speed(iter/s)": 0.212674 + }, + { + "epoch": 0.6671156004489338, + "grad_norm": 0.7611818909645081, + "learning_rate": 2.734944391880855e-06, + "loss": 0.03487918376922607, + "memory(GiB)": 17.38, + "step": 3715, + "token_acc": 0.9771784232365145, + "train_speed(iter/s)": 0.212729 + }, + { + "epoch": 0.6680134680134681, + "grad_norm": 0.6550134420394897, + "learning_rate": 2.7217158626219783e-06, + "loss": 0.03286608457565308, + "memory(GiB)": 17.38, + "step": 3720, + "token_acc": 0.9793388429752066, + "train_speed(iter/s)": 0.212785 + }, + { + "epoch": 0.6689113355780022, + "grad_norm": 1.7015101909637451, + "learning_rate": 2.7085074289265247e-06, + "loss": 0.03078613579273224, + "memory(GiB)": 17.38, + "step": 3725, + "token_acc": 0.9847161572052402, + "train_speed(iter/s)": 0.212839 + }, + { + "epoch": 0.6698092031425364, + "grad_norm": 1.7195833921432495, + "learning_rate": 2.6953192072992467e-06, + "loss": 0.04355582594871521, + "memory(GiB)": 17.38, + "step": 3730, + "token_acc": 0.9811715481171548, + "train_speed(iter/s)": 0.212896 + }, + { + "epoch": 0.6707070707070707, + "grad_norm": 1.2490787506103516, + "learning_rate": 2.682151314066608e-06, + "loss": 0.038646084070205686, + "memory(GiB)": 17.38, + "step": 3735, + "token_acc": 0.9811715481171548, + "train_speed(iter/s)": 0.212951 + }, + { + "epoch": 0.671604938271605, + "grad_norm": 0.6383852362632751, + "learning_rate": 2.669003865375773e-06, + "loss": 0.02468145191669464, + "memory(GiB)": 17.38, + "step": 3740, + "token_acc": 0.987603305785124, + "train_speed(iter/s)": 0.213005 + }, + { + "epoch": 0.6725028058361392, + "grad_norm": 0.9678915143013, + "learning_rate": 2.655876977193567e-06, + "loss": 0.03285512328147888, + "memory(GiB)": 17.38, + "step": 3745, + "token_acc": 0.9810924369747899, + "train_speed(iter/s)": 0.213059 + }, + { + "epoch": 0.6734006734006734, + "grad_norm": 1.2631752490997314, + "learning_rate": 2.6427707653054728e-06, + "loss": 0.031785255670547484, + "memory(GiB)": 17.38, + "step": 3750, + "token_acc": 0.9893617021276596, + "train_speed(iter/s)": 0.213114 + }, + { + "epoch": 0.6742985409652076, + "grad_norm": 0.6728029251098633, + "learning_rate": 2.6296853453145843e-06, + "loss": 0.022910813987255096, + "memory(GiB)": 17.38, + "step": 3755, + "token_acc": 0.9877551020408163, + "train_speed(iter/s)": 0.213169 + }, + { + "epoch": 0.6751964085297418, + "grad_norm": 1.2156270742416382, + "learning_rate": 2.616620832640613e-06, + "loss": 0.034191733598709105, + "memory(GiB)": 17.38, + "step": 3760, + "token_acc": 0.98, + "train_speed(iter/s)": 0.213223 + }, + { + "epoch": 0.6760942760942761, + "grad_norm": 0.5939570069313049, + "learning_rate": 2.603577342518851e-06, + "loss": 0.02353179156780243, + "memory(GiB)": 17.38, + "step": 3765, + "token_acc": 0.9895833333333334, + "train_speed(iter/s)": 0.213277 + }, + { + "epoch": 0.6769921436588103, + "grad_norm": 0.7344240546226501, + "learning_rate": 2.590554989999166e-06, + "loss": 0.021662503480911255, + "memory(GiB)": 17.38, + "step": 3770, + "token_acc": 0.9896694214876033, + "train_speed(iter/s)": 0.213331 + }, + { + "epoch": 0.6778900112233446, + "grad_norm": 0.42172756791114807, + "learning_rate": 2.5775538899449788e-06, + "loss": 0.026797187328338624, + "memory(GiB)": 17.38, + "step": 3775, + "token_acc": 0.9834710743801653, + "train_speed(iter/s)": 0.213384 + }, + { + "epoch": 0.6787878787878788, + "grad_norm": 0.732586145401001, + "learning_rate": 2.5645741570322526e-06, + "loss": 0.01975376009941101, + "memory(GiB)": 17.38, + "step": 3780, + "token_acc": 0.9917695473251029, + "train_speed(iter/s)": 0.213438 + }, + { + "epoch": 0.679685746352413, + "grad_norm": 1.0245758295059204, + "learning_rate": 2.551615905748482e-06, + "loss": 0.029313117265701294, + "memory(GiB)": 17.38, + "step": 3785, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 0.21349 + }, + { + "epoch": 0.6805836139169472, + "grad_norm": 2.9810142517089844, + "learning_rate": 2.538679250391689e-06, + "loss": 0.03837526440620422, + "memory(GiB)": 17.38, + "step": 3790, + "token_acc": 0.9791666666666666, + "train_speed(iter/s)": 0.213543 + }, + { + "epoch": 0.6814814814814815, + "grad_norm": 0.6563799381256104, + "learning_rate": 2.5257643050694004e-06, + "loss": 0.020239931344985963, + "memory(GiB)": 17.38, + "step": 3795, + "token_acc": 0.9852320675105485, + "train_speed(iter/s)": 0.213596 + }, + { + "epoch": 0.6823793490460157, + "grad_norm": 0.9936453700065613, + "learning_rate": 2.512871183697658e-06, + "loss": 0.029112890362739563, + "memory(GiB)": 17.38, + "step": 3800, + "token_acc": 0.9789029535864979, + "train_speed(iter/s)": 0.213648 + }, + { + "epoch": 0.6823793490460157, + "eval_loss": 0.030899595469236374, + "eval_runtime": 59.2412, + "eval_samples_per_second": 15.192, + "eval_steps_per_second": 7.596, + "eval_token_acc": 0.9841637340248194, + "step": 3800 + }, + { + "epoch": 0.68327721661055, + "grad_norm": 0.9640054106712341, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.024901887774467467, + "memory(GiB)": 17.38, + "step": 3805, + "token_acc": 0.984781121751026, + "train_speed(iter/s)": 0.212886 + }, + { + "epoch": 0.6841750841750842, + "grad_norm": 1.1950197219848633, + "learning_rate": 2.487150867506467e-06, + "loss": 0.028625237941741943, + "memory(GiB)": 17.38, + "step": 3810, + "token_acc": 0.991701244813278, + "train_speed(iter/s)": 0.21294 + }, + { + "epoch": 0.6850729517396184, + "grad_norm": 1.1464393138885498, + "learning_rate": 2.4743238995525964e-06, + "loss": 0.0322964608669281, + "memory(GiB)": 17.38, + "step": 3815, + "token_acc": 0.9897540983606558, + "train_speed(iter/s)": 0.212992 + }, + { + "epoch": 0.6859708193041526, + "grad_norm": 1.1833844184875488, + "learning_rate": 2.4615192092784225e-06, + "loss": 0.035601601004600525, + "memory(GiB)": 17.38, + "step": 3820, + "token_acc": 0.9772727272727273, + "train_speed(iter/s)": 0.213044 + }, + { + "epoch": 0.6868686868686869, + "grad_norm": 1.248120665550232, + "learning_rate": 2.4487369096274828e-06, + "loss": 0.02551449239253998, + "memory(GiB)": 17.38, + "step": 3825, + "token_acc": 0.9875, + "train_speed(iter/s)": 0.213096 + }, + { + "epoch": 0.6877665544332211, + "grad_norm": 2.2551465034484863, + "learning_rate": 2.435977113345816e-06, + "loss": 0.037439900636672976, + "memory(GiB)": 17.38, + "step": 3830, + "token_acc": 0.9814814814814815, + "train_speed(iter/s)": 0.213148 + }, + { + "epoch": 0.6886644219977553, + "grad_norm": 0.6242346167564392, + "learning_rate": 2.4232399329809704e-06, + "loss": 0.027642077207565306, + "memory(GiB)": 17.38, + "step": 3835, + "token_acc": 0.9896265560165975, + "train_speed(iter/s)": 0.2132 + }, + { + "epoch": 0.6895622895622896, + "grad_norm": 0.9131051301956177, + "learning_rate": 2.410525480881011e-06, + "loss": 0.026515629887580872, + "memory(GiB)": 17.38, + "step": 3840, + "token_acc": 0.9892241379310345, + "train_speed(iter/s)": 0.213254 + }, + { + "epoch": 0.6904601571268238, + "grad_norm": 0.9041575193405151, + "learning_rate": 2.3978338691935314e-06, + "loss": 0.03230937123298645, + "memory(GiB)": 17.38, + "step": 3845, + "token_acc": 0.985655737704918, + "train_speed(iter/s)": 0.213307 + }, + { + "epoch": 0.691358024691358, + "grad_norm": 2.352832078933716, + "learning_rate": 2.385165209864657e-06, + "loss": 0.03141814470291138, + "memory(GiB)": 17.38, + "step": 3850, + "token_acc": 0.9872340425531915, + "train_speed(iter/s)": 0.213358 + }, + { + "epoch": 0.6922558922558922, + "grad_norm": 1.1402956247329712, + "learning_rate": 2.3725196146380703e-06, + "loss": 0.02831551432609558, + "memory(GiB)": 17.38, + "step": 3855, + "token_acc": 0.9858299595141701, + "train_speed(iter/s)": 0.213411 + }, + { + "epoch": 0.6931537598204265, + "grad_norm": 0.9474450349807739, + "learning_rate": 2.3598971950540044e-06, + "loss": 0.03286239802837372, + "memory(GiB)": 17.38, + "step": 3860, + "token_acc": 0.981404958677686, + "train_speed(iter/s)": 0.213461 + }, + { + "epoch": 0.6940516273849607, + "grad_norm": 0.918646514415741, + "learning_rate": 2.3472980624482853e-06, + "loss": 0.026221451163291932, + "memory(GiB)": 17.38, + "step": 3865, + "token_acc": 0.9937759336099585, + "train_speed(iter/s)": 0.213513 + }, + { + "epoch": 0.694949494949495, + "grad_norm": 0.6500848531723022, + "learning_rate": 2.334722327951327e-06, + "loss": 0.022830140590667725, + "memory(GiB)": 17.38, + "step": 3870, + "token_acc": 0.9853556485355649, + "train_speed(iter/s)": 0.213564 + }, + { + "epoch": 0.6958473625140292, + "grad_norm": 0.4691565930843353, + "learning_rate": 2.322170102487169e-06, + "loss": 0.027497890591621398, + "memory(GiB)": 17.38, + "step": 3875, + "token_acc": 0.9893617021276596, + "train_speed(iter/s)": 0.213615 + }, + { + "epoch": 0.6967452300785634, + "grad_norm": 1.3470033407211304, + "learning_rate": 2.309641496772481e-06, + "loss": 0.026915568113327026, + "memory(GiB)": 17.38, + "step": 3880, + "token_acc": 0.9876543209876543, + "train_speed(iter/s)": 0.213668 + }, + { + "epoch": 0.6976430976430976, + "grad_norm": 0.4571993947029114, + "learning_rate": 2.2971366213156063e-06, + "loss": 0.025150349736213683, + "memory(GiB)": 17.38, + "step": 3885, + "token_acc": 0.9872340425531915, + "train_speed(iter/s)": 0.213719 + }, + { + "epoch": 0.6985409652076319, + "grad_norm": 5.67140531539917, + "learning_rate": 2.2846555864155624e-06, + "loss": 0.038307324051856995, + "memory(GiB)": 17.38, + "step": 3890, + "token_acc": 0.9877551020408163, + "train_speed(iter/s)": 0.21377 + }, + { + "epoch": 0.6994388327721661, + "grad_norm": 1.2348878383636475, + "learning_rate": 2.2721985021610933e-06, + "loss": 0.033891907334327696, + "memory(GiB)": 17.38, + "step": 3895, + "token_acc": 0.9832635983263598, + "train_speed(iter/s)": 0.213821 + }, + { + "epoch": 0.7003367003367004, + "grad_norm": 1.22897469997406, + "learning_rate": 2.259765478429678e-06, + "loss": 0.03174541592597961, + "memory(GiB)": 17.38, + "step": 3900, + "token_acc": 0.9793388429752066, + "train_speed(iter/s)": 0.213873 + }, + { + "epoch": 0.7003367003367004, + "eval_loss": 0.030476640909910202, + "eval_runtime": 59.3845, + "eval_samples_per_second": 15.155, + "eval_steps_per_second": 7.578, + "eval_token_acc": 0.9848120022226339, + "step": 3900 + }, + { + "epoch": 0.7012345679012346, + "grad_norm": 0.6043987274169922, + "learning_rate": 2.2473566248865784e-06, + "loss": 0.01907208263874054, + "memory(GiB)": 17.38, + "step": 3905, + "token_acc": 0.984957264957265, + "train_speed(iter/s)": 0.213121 + }, + { + "epoch": 0.7021324354657688, + "grad_norm": 0.6262363195419312, + "learning_rate": 2.2349720509838587e-06, + "loss": 0.03486924171447754, + "memory(GiB)": 17.38, + "step": 3910, + "token_acc": 0.9852941176470589, + "train_speed(iter/s)": 0.213174 + }, + { + "epoch": 0.703030303030303, + "grad_norm": 1.498445987701416, + "learning_rate": 2.2226118659594235e-06, + "loss": 0.027996140718460082, + "memory(GiB)": 17.38, + "step": 3915, + "token_acc": 0.9828326180257511, + "train_speed(iter/s)": 0.213226 + }, + { + "epoch": 0.7039281705948373, + "grad_norm": 3.017045736312866, + "learning_rate": 2.2102761788360584e-06, + "loss": 0.029009860754013062, + "memory(GiB)": 17.38, + "step": 3920, + "token_acc": 0.9854771784232366, + "train_speed(iter/s)": 0.213277 + }, + { + "epoch": 0.7048260381593715, + "grad_norm": 1.0479998588562012, + "learning_rate": 2.197965098420467e-06, + "loss": 0.03738461136817932, + "memory(GiB)": 17.38, + "step": 3925, + "token_acc": 0.9815573770491803, + "train_speed(iter/s)": 0.213329 + }, + { + "epoch": 0.7057239057239058, + "grad_norm": 0.8023545742034912, + "learning_rate": 2.185678733302306e-06, + "loss": 0.0231161504983902, + "memory(GiB)": 17.38, + "step": 3930, + "token_acc": 0.9895397489539749, + "train_speed(iter/s)": 0.21338 + }, + { + "epoch": 0.70662177328844, + "grad_norm": 0.771193265914917, + "learning_rate": 2.1734171918532366e-06, + "loss": 0.02497338652610779, + "memory(GiB)": 17.38, + "step": 3935, + "token_acc": 0.9872881355932204, + "train_speed(iter/s)": 0.21343 + }, + { + "epoch": 0.7075196408529741, + "grad_norm": 0.5781421661376953, + "learning_rate": 2.1611805822259586e-06, + "loss": 0.029178148508071898, + "memory(GiB)": 17.38, + "step": 3940, + "token_acc": 0.9854771784232366, + "train_speed(iter/s)": 0.213482 + }, + { + "epoch": 0.7084175084175084, + "grad_norm": 1.3848005533218384, + "learning_rate": 2.14896901235326e-06, + "loss": 0.026811087131500246, + "memory(GiB)": 17.38, + "step": 3945, + "token_acc": 0.9832635983263598, + "train_speed(iter/s)": 0.213531 + }, + { + "epoch": 0.7093153759820426, + "grad_norm": 2.1242289543151855, + "learning_rate": 2.1367825899470733e-06, + "loss": 0.037152862548828124, + "memory(GiB)": 17.38, + "step": 3950, + "token_acc": 0.9796747967479674, + "train_speed(iter/s)": 0.213584 + }, + { + "epoch": 0.7102132435465769, + "grad_norm": 1.2256877422332764, + "learning_rate": 2.1246214224975103e-06, + "loss": 0.029462599754333497, + "memory(GiB)": 17.38, + "step": 3955, + "token_acc": 0.9836734693877551, + "train_speed(iter/s)": 0.213634 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.6322500705718994, + "learning_rate": 2.1124856172719284e-06, + "loss": 0.02801285982131958, + "memory(GiB)": 17.38, + "step": 3960, + "token_acc": 0.9785407725321889, + "train_speed(iter/s)": 0.213685 + }, + { + "epoch": 0.7120089786756454, + "grad_norm": 1.2267003059387207, + "learning_rate": 2.100375281313972e-06, + "loss": 0.026802918314933775, + "memory(GiB)": 17.38, + "step": 3965, + "token_acc": 0.9834710743801653, + "train_speed(iter/s)": 0.213735 + }, + { + "epoch": 0.7129068462401795, + "grad_norm": 4.921694278717041, + "learning_rate": 2.0882905214426397e-06, + "loss": 0.03181566894054413, + "memory(GiB)": 17.38, + "step": 3970, + "token_acc": 0.9835390946502057, + "train_speed(iter/s)": 0.213785 + }, + { + "epoch": 0.7138047138047138, + "grad_norm": 0.7308579087257385, + "learning_rate": 2.07623144425133e-06, + "loss": 0.0353930652141571, + "memory(GiB)": 17.38, + "step": 3975, + "token_acc": 0.9897119341563786, + "train_speed(iter/s)": 0.213835 + }, + { + "epoch": 0.714702581369248, + "grad_norm": 1.8971703052520752, + "learning_rate": 2.0641981561069173e-06, + "loss": 0.026546701788902283, + "memory(GiB)": 17.38, + "step": 3980, + "token_acc": 0.983739837398374, + "train_speed(iter/s)": 0.213886 + }, + { + "epoch": 0.7156004489337823, + "grad_norm": 1.5146113634109497, + "learning_rate": 2.052190763148794e-06, + "loss": 0.022233220934867858, + "memory(GiB)": 17.38, + "step": 3985, + "token_acc": 0.9937238493723849, + "train_speed(iter/s)": 0.213937 + }, + { + "epoch": 0.7164983164983165, + "grad_norm": 0.8526687026023865, + "learning_rate": 2.0402093712879562e-06, + "loss": 0.034707164764404295, + "memory(GiB)": 17.38, + "step": 3990, + "token_acc": 0.9875, + "train_speed(iter/s)": 0.213986 + }, + { + "epoch": 0.7173961840628508, + "grad_norm": 1.2056403160095215, + "learning_rate": 2.028254086206042e-06, + "loss": 0.026093658804893494, + "memory(GiB)": 17.38, + "step": 3995, + "token_acc": 0.9854771784232366, + "train_speed(iter/s)": 0.214036 + }, + { + "epoch": 0.7182940516273849, + "grad_norm": 1.707141399383545, + "learning_rate": 2.0163250133544304e-06, + "loss": 0.037354519963264464, + "memory(GiB)": 17.38, + "step": 4000, + "token_acc": 0.9795918367346939, + "train_speed(iter/s)": 0.214087 + }, + { + "epoch": 0.7182940516273849, + "eval_loss": 0.029174668714404106, + "eval_runtime": 59.2838, + "eval_samples_per_second": 15.181, + "eval_steps_per_second": 7.591, + "eval_token_acc": 0.9844415632524541, + "step": 4000 + }, + { + "epoch": 0.7191919191919192, + "grad_norm": 0.25415515899658203, + "learning_rate": 2.0044222579532864e-06, + "loss": 0.02827286720275879, + "memory(GiB)": 17.38, + "step": 4005, + "token_acc": 0.9835560123329907, + "train_speed(iter/s)": 0.213362 + }, + { + "epoch": 0.7200897867564534, + "grad_norm": 1.5875715017318726, + "learning_rate": 1.9925459249906488e-06, + "loss": 0.03303536474704742, + "memory(GiB)": 17.38, + "step": 4010, + "token_acc": 0.985655737704918, + "train_speed(iter/s)": 0.213411 + }, + { + "epoch": 0.7209876543209877, + "grad_norm": 0.27589622139930725, + "learning_rate": 1.98069611922149e-06, + "loss": 0.025451222062110902, + "memory(GiB)": 17.38, + "step": 4015, + "token_acc": 0.9830508474576272, + "train_speed(iter/s)": 0.213462 + }, + { + "epoch": 0.7218855218855219, + "grad_norm": 0.5195873379707336, + "learning_rate": 1.9688729451668116e-06, + "loss": 0.02447448968887329, + "memory(GiB)": 17.38, + "step": 4020, + "token_acc": 0.9813278008298755, + "train_speed(iter/s)": 0.213512 + }, + { + "epoch": 0.7227833894500562, + "grad_norm": 1.7163043022155762, + "learning_rate": 1.957076507112695e-06, + "loss": 0.032391208410263064, + "memory(GiB)": 17.38, + "step": 4025, + "token_acc": 0.9835390946502057, + "train_speed(iter/s)": 0.213562 + }, + { + "epoch": 0.7236812570145903, + "grad_norm": 2.068011522293091, + "learning_rate": 1.945306909109411e-06, + "loss": 0.02684341073036194, + "memory(GiB)": 17.38, + "step": 4030, + "token_acc": 0.9831223628691983, + "train_speed(iter/s)": 0.213612 + }, + { + "epoch": 0.7245791245791245, + "grad_norm": 1.6540495157241821, + "learning_rate": 1.9335642549704797e-06, + "loss": 0.044519555568695066, + "memory(GiB)": 17.38, + "step": 4035, + "token_acc": 0.9795918367346939, + "train_speed(iter/s)": 0.213661 + }, + { + "epoch": 0.7254769921436588, + "grad_norm": 1.2645758390426636, + "learning_rate": 1.9218486482717695e-06, + "loss": 0.028735148906707763, + "memory(GiB)": 17.38, + "step": 4040, + "token_acc": 0.9853556485355649, + "train_speed(iter/s)": 0.21371 + }, + { + "epoch": 0.726374859708193, + "grad_norm": 0.8257729411125183, + "learning_rate": 1.910160192350573e-06, + "loss": 0.0260012686252594, + "memory(GiB)": 17.38, + "step": 4045, + "token_acc": 0.9853556485355649, + "train_speed(iter/s)": 0.213759 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 1.1504757404327393, + "learning_rate": 1.898498990304699e-06, + "loss": 0.03628788590431213, + "memory(GiB)": 17.38, + "step": 4050, + "token_acc": 0.9834710743801653, + "train_speed(iter/s)": 0.213807 + }, + { + "epoch": 0.7281705948372615, + "grad_norm": 0.4770936369895935, + "learning_rate": 1.8868651449915703e-06, + "loss": 0.025467273592948914, + "memory(GiB)": 17.38, + "step": 4055, + "token_acc": 0.9918032786885246, + "train_speed(iter/s)": 0.213853 + }, + { + "epoch": 0.7290684624017957, + "grad_norm": 0.9146988391876221, + "learning_rate": 1.8752587590273035e-06, + "loss": 0.032961055636405945, + "memory(GiB)": 17.38, + "step": 4060, + "token_acc": 0.9959016393442623, + "train_speed(iter/s)": 0.2139 + }, + { + "epoch": 0.7299663299663299, + "grad_norm": 0.921874463558197, + "learning_rate": 1.8636799347858114e-06, + "loss": 0.03146983385086059, + "memory(GiB)": 17.38, + "step": 4065, + "token_acc": 0.9831932773109243, + "train_speed(iter/s)": 0.213948 + }, + { + "epoch": 0.7308641975308642, + "grad_norm": 1.4959501028060913, + "learning_rate": 1.852128774397905e-06, + "loss": 0.025609686970710754, + "memory(GiB)": 17.38, + "step": 4070, + "token_acc": 0.9935344827586207, + "train_speed(iter/s)": 0.213996 + }, + { + "epoch": 0.7317620650953984, + "grad_norm": 1.2349838018417358, + "learning_rate": 1.8406053797503799e-06, + "loss": 0.029957374930381774, + "memory(GiB)": 17.38, + "step": 4075, + "token_acc": 0.983739837398374, + "train_speed(iter/s)": 0.214044 + }, + { + "epoch": 0.7326599326599327, + "grad_norm": 1.0551804304122925, + "learning_rate": 1.8291098524851258e-06, + "loss": 0.027149060368537904, + "memory(GiB)": 17.38, + "step": 4080, + "token_acc": 0.9872881355932204, + "train_speed(iter/s)": 0.214092 + }, + { + "epoch": 0.7335578002244669, + "grad_norm": 0.3519715666770935, + "learning_rate": 1.8176422939982336e-06, + "loss": 0.025020474195480348, + "memory(GiB)": 17.38, + "step": 4085, + "token_acc": 0.983739837398374, + "train_speed(iter/s)": 0.214141 + }, + { + "epoch": 0.7344556677890012, + "grad_norm": 1.4062516689300537, + "learning_rate": 1.8062028054390884e-06, + "loss": 0.031118446588516237, + "memory(GiB)": 17.38, + "step": 4090, + "token_acc": 0.9813278008298755, + "train_speed(iter/s)": 0.214189 + }, + { + "epoch": 0.7353535353535353, + "grad_norm": 2.273259401321411, + "learning_rate": 1.7947914877094935e-06, + "loss": 0.03438262343406677, + "memory(GiB)": 17.38, + "step": 4095, + "token_acc": 0.9831223628691983, + "train_speed(iter/s)": 0.214237 + }, + { + "epoch": 0.7362514029180696, + "grad_norm": 1.6719846725463867, + "learning_rate": 1.783408441462765e-06, + "loss": 0.028487473726272583, + "memory(GiB)": 17.38, + "step": 4100, + "token_acc": 0.9852320675105485, + "train_speed(iter/s)": 0.214285 + }, + { + "epoch": 0.7362514029180696, + "eval_loss": 0.029622850939631462, + "eval_runtime": 59.5749, + "eval_samples_per_second": 15.107, + "eval_steps_per_second": 7.554, + "eval_token_acc": 0.9849046119651788, + "step": 4100 + }, + { + "epoch": 0.7371492704826038, + "grad_norm": 0.6342797875404358, + "learning_rate": 1.7720537671028538e-06, + "loss": 0.04141045808792114, + "memory(GiB)": 17.38, + "step": 4105, + "token_acc": 0.9849418206707734, + "train_speed(iter/s)": 0.213567 + }, + { + "epoch": 0.7380471380471381, + "grad_norm": 0.6199458837509155, + "learning_rate": 1.7607275647834548e-06, + "loss": 0.023208871483802795, + "memory(GiB)": 17.38, + "step": 4110, + "token_acc": 0.9918367346938776, + "train_speed(iter/s)": 0.213617 + }, + { + "epoch": 0.7389450056116723, + "grad_norm": 0.6470329165458679, + "learning_rate": 1.7494299344071314e-06, + "loss": 0.027367407083511354, + "memory(GiB)": 17.38, + "step": 4115, + "token_acc": 0.9936170212765958, + "train_speed(iter/s)": 0.213666 + }, + { + "epoch": 0.7398428731762066, + "grad_norm": 0.9919260740280151, + "learning_rate": 1.7381609756244216e-06, + "loss": 0.022349117696285246, + "memory(GiB)": 17.38, + "step": 4120, + "token_acc": 0.983402489626556, + "train_speed(iter/s)": 0.213715 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.6906048655509949, + "learning_rate": 1.7269207878329746e-06, + "loss": 0.02090343087911606, + "memory(GiB)": 17.38, + "step": 4125, + "token_acc": 0.9916666666666667, + "train_speed(iter/s)": 0.213763 + }, + { + "epoch": 0.741638608305275, + "grad_norm": 0.7965125441551208, + "learning_rate": 1.7157094701766542e-06, + "loss": 0.038212424516677855, + "memory(GiB)": 17.38, + "step": 4130, + "token_acc": 0.9791666666666666, + "train_speed(iter/s)": 0.213811 + }, + { + "epoch": 0.7425364758698092, + "grad_norm": 0.9114153981208801, + "learning_rate": 1.7045271215446862e-06, + "loss": 0.02577747106552124, + "memory(GiB)": 17.38, + "step": 4135, + "token_acc": 0.9896265560165975, + "train_speed(iter/s)": 0.21386 + }, + { + "epoch": 0.7434343434343434, + "grad_norm": 1.1752060651779175, + "learning_rate": 1.6933738405707688e-06, + "loss": 0.03554680049419403, + "memory(GiB)": 17.38, + "step": 4140, + "token_acc": 0.9854771784232366, + "train_speed(iter/s)": 0.213908 + }, + { + "epoch": 0.7443322109988777, + "grad_norm": 0.8617966175079346, + "learning_rate": 1.682249725632215e-06, + "loss": 0.025488993525505065, + "memory(GiB)": 17.38, + "step": 4145, + "token_acc": 0.9836065573770492, + "train_speed(iter/s)": 0.213956 + }, + { + "epoch": 0.745230078563412, + "grad_norm": 1.0416406393051147, + "learning_rate": 1.671154874849074e-06, + "loss": 0.030037355422973634, + "memory(GiB)": 17.38, + "step": 4150, + "token_acc": 0.9854166666666667, + "train_speed(iter/s)": 0.214004 + }, + { + "epoch": 0.7461279461279461, + "grad_norm": 2.6986441612243652, + "learning_rate": 1.6600893860832778e-06, + "loss": 0.034278661012649536, + "memory(GiB)": 17.38, + "step": 4155, + "token_acc": 0.9852320675105485, + "train_speed(iter/s)": 0.214052 + }, + { + "epoch": 0.7470258136924803, + "grad_norm": 1.2279298305511475, + "learning_rate": 1.649053356937761e-06, + "loss": 0.033626073598861696, + "memory(GiB)": 17.38, + "step": 4160, + "token_acc": 0.9855371900826446, + "train_speed(iter/s)": 0.214094 + }, + { + "epoch": 0.7479236812570146, + "grad_norm": 2.443080425262451, + "learning_rate": 1.6380468847556202e-06, + "loss": 0.03830733299255371, + "memory(GiB)": 17.38, + "step": 4165, + "token_acc": 0.9898785425101214, + "train_speed(iter/s)": 0.214141 + }, + { + "epoch": 0.7488215488215488, + "grad_norm": 1.1768028736114502, + "learning_rate": 1.627070066619239e-06, + "loss": 0.0234613299369812, + "memory(GiB)": 17.38, + "step": 4170, + "token_acc": 0.9917355371900827, + "train_speed(iter/s)": 0.214189 + }, + { + "epoch": 0.7497194163860831, + "grad_norm": 0.9198989868164062, + "learning_rate": 1.6161229993494426e-06, + "loss": 0.026802051067352294, + "memory(GiB)": 17.38, + "step": 4175, + "token_acc": 0.983402489626556, + "train_speed(iter/s)": 0.214236 + }, + { + "epoch": 0.7506172839506173, + "grad_norm": 2.1885592937469482, + "learning_rate": 1.6052057795046366e-06, + "loss": 0.02564775347709656, + "memory(GiB)": 17.38, + "step": 4180, + "token_acc": 0.9829787234042553, + "train_speed(iter/s)": 0.214284 + }, + { + "epoch": 0.7515151515151515, + "grad_norm": 1.6582032442092896, + "learning_rate": 1.5943185033799557e-06, + "loss": 0.023617088794708252, + "memory(GiB)": 17.38, + "step": 4185, + "token_acc": 0.985655737704918, + "train_speed(iter/s)": 0.214331 + }, + { + "epoch": 0.7524130190796857, + "grad_norm": 0.5812045931816101, + "learning_rate": 1.5834612670064226e-06, + "loss": 0.024746993184089662, + "memory(GiB)": 17.38, + "step": 4190, + "token_acc": 0.9829787234042553, + "train_speed(iter/s)": 0.21438 + }, + { + "epoch": 0.75331088664422, + "grad_norm": 1.1238276958465576, + "learning_rate": 1.5726341661500898e-06, + "loss": 0.031463271379470824, + "memory(GiB)": 17.38, + "step": 4195, + "token_acc": 0.9811715481171548, + "train_speed(iter/s)": 0.214427 + }, + { + "epoch": 0.7542087542087542, + "grad_norm": 0.4520304203033447, + "learning_rate": 1.561837296311201e-06, + "loss": 0.024458497762680054, + "memory(GiB)": 17.38, + "step": 4200, + "token_acc": 0.9873417721518988, + "train_speed(iter/s)": 0.214473 + }, + { + "epoch": 0.7542087542087542, + "eval_loss": 0.03125369921326637, + "eval_runtime": 60.0219, + "eval_samples_per_second": 14.995, + "eval_steps_per_second": 7.497, + "eval_token_acc": 0.9859233191331728, + "step": 4200 + }, + { + "epoch": 0.7551066217732885, + "grad_norm": 1.6875200271606445, + "learning_rate": 1.5510707527233503e-06, + "loss": 0.02725319266319275, + "memory(GiB)": 17.38, + "step": 4205, + "token_acc": 0.9866984993178718, + "train_speed(iter/s)": 0.213776 + }, + { + "epoch": 0.7560044893378227, + "grad_norm": 0.5820701122283936, + "learning_rate": 1.5403346303526373e-06, + "loss": 0.02713656723499298, + "memory(GiB)": 17.38, + "step": 4210, + "token_acc": 0.9788135593220338, + "train_speed(iter/s)": 0.213825 + }, + { + "epoch": 0.7569023569023569, + "grad_norm": 0.36407163739204407, + "learning_rate": 1.5296290238968303e-06, + "loss": 0.026985323429107665, + "memory(GiB)": 17.38, + "step": 4215, + "token_acc": 0.9875, + "train_speed(iter/s)": 0.213873 + }, + { + "epoch": 0.7578002244668911, + "grad_norm": 1.2440131902694702, + "learning_rate": 1.5189540277845382e-06, + "loss": 0.026060250401496888, + "memory(GiB)": 17.38, + "step": 4220, + "token_acc": 0.9895833333333334, + "train_speed(iter/s)": 0.21392 + }, + { + "epoch": 0.7586980920314254, + "grad_norm": 1.9274588823318481, + "learning_rate": 1.5083097361743649e-06, + "loss": 0.030699992179870607, + "memory(GiB)": 17.38, + "step": 4225, + "token_acc": 0.9854771784232366, + "train_speed(iter/s)": 0.213967 + }, + { + "epoch": 0.7595959595959596, + "grad_norm": 0.633176863193512, + "learning_rate": 1.497696242954092e-06, + "loss": 0.026253151893615722, + "memory(GiB)": 17.38, + "step": 4230, + "token_acc": 0.9871794871794872, + "train_speed(iter/s)": 0.214016 + }, + { + "epoch": 0.7604938271604939, + "grad_norm": 0.8920471668243408, + "learning_rate": 1.4871136417398407e-06, + "loss": 0.028958433866500856, + "memory(GiB)": 17.38, + "step": 4235, + "token_acc": 0.9873949579831933, + "train_speed(iter/s)": 0.214063 + }, + { + "epoch": 0.7613916947250281, + "grad_norm": 1.1698507070541382, + "learning_rate": 1.4765620258752505e-06, + "loss": 0.027630111575126647, + "memory(GiB)": 17.38, + "step": 4240, + "token_acc": 0.9893617021276596, + "train_speed(iter/s)": 0.214111 + }, + { + "epoch": 0.7622895622895622, + "grad_norm": 1.349119782447815, + "learning_rate": 1.466041488430654e-06, + "loss": 0.04834521114826203, + "memory(GiB)": 17.38, + "step": 4245, + "token_acc": 0.981404958677686, + "train_speed(iter/s)": 0.214159 + }, + { + "epoch": 0.7631874298540965, + "grad_norm": 0.8922721147537231, + "learning_rate": 1.4555521222022618e-06, + "loss": 0.034823456406593324, + "memory(GiB)": 17.38, + "step": 4250, + "token_acc": 0.9809322033898306, + "train_speed(iter/s)": 0.214207 + }, + { + "epoch": 0.7640852974186307, + "grad_norm": 0.7039507031440735, + "learning_rate": 1.4450940197113333e-06, + "loss": 0.027701443433761595, + "memory(GiB)": 17.38, + "step": 4255, + "token_acc": 0.9816326530612245, + "train_speed(iter/s)": 0.214254 + }, + { + "epoch": 0.764983164983165, + "grad_norm": 0.9450100660324097, + "learning_rate": 1.4346672732033739e-06, + "loss": 0.032166242599487305, + "memory(GiB)": 17.38, + "step": 4260, + "token_acc": 0.9817073170731707, + "train_speed(iter/s)": 0.2143 + }, + { + "epoch": 0.7658810325476992, + "grad_norm": 0.6703555583953857, + "learning_rate": 1.424271974647304e-06, + "loss": 0.027309167385101318, + "memory(GiB)": 17.38, + "step": 4265, + "token_acc": 0.9877551020408163, + "train_speed(iter/s)": 0.214347 + }, + { + "epoch": 0.7667789001122335, + "grad_norm": 3.1309144496917725, + "learning_rate": 1.4139082157346685e-06, + "loss": 0.033643665909767154, + "memory(GiB)": 17.38, + "step": 4270, + "token_acc": 0.9810924369747899, + "train_speed(iter/s)": 0.214394 + }, + { + "epoch": 0.7676767676767676, + "grad_norm": 1.9825454950332642, + "learning_rate": 1.4035760878788091e-06, + "loss": 0.026664549112319948, + "memory(GiB)": 17.38, + "step": 4275, + "token_acc": 0.9875518672199171, + "train_speed(iter/s)": 0.21444 + }, + { + "epoch": 0.7685746352413019, + "grad_norm": 0.7164722681045532, + "learning_rate": 1.3932756822140731e-06, + "loss": 0.03173021376132965, + "memory(GiB)": 17.38, + "step": 4280, + "token_acc": 0.987603305785124, + "train_speed(iter/s)": 0.214486 + }, + { + "epoch": 0.7694725028058361, + "grad_norm": 1.7024521827697754, + "learning_rate": 1.3830070895949988e-06, + "loss": 0.031912803649902344, + "memory(GiB)": 17.38, + "step": 4285, + "token_acc": 0.9895833333333334, + "train_speed(iter/s)": 0.214532 + }, + { + "epoch": 0.7703703703703704, + "grad_norm": 1.646498203277588, + "learning_rate": 1.3727704005955177e-06, + "loss": 0.028366243839263915, + "memory(GiB)": 17.38, + "step": 4290, + "token_acc": 0.98559670781893, + "train_speed(iter/s)": 0.214579 + }, + { + "epoch": 0.7712682379349046, + "grad_norm": 0.8475176692008972, + "learning_rate": 1.3625657055081576e-06, + "loss": 0.02616657018661499, + "memory(GiB)": 17.38, + "step": 4295, + "token_acc": 0.9849137931034483, + "train_speed(iter/s)": 0.214624 + }, + { + "epoch": 0.7721661054994389, + "grad_norm": 1.7246657609939575, + "learning_rate": 1.3523930943432456e-06, + "loss": 0.026334071159362794, + "memory(GiB)": 17.38, + "step": 4300, + "token_acc": 0.9855371900826446, + "train_speed(iter/s)": 0.214671 + }, + { + "epoch": 0.7721661054994389, + "eval_loss": 0.029096296057105064, + "eval_runtime": 60.0255, + "eval_samples_per_second": 14.994, + "eval_steps_per_second": 7.497, + "eval_token_acc": 0.9844415632524541, + "step": 4300 + }, + { + "epoch": 0.773063973063973, + "grad_norm": 0.7595803141593933, + "learning_rate": 1.3422526568281096e-06, + "loss": 0.025106793642044066, + "memory(GiB)": 17.38, + "step": 4305, + "token_acc": 0.9857925368024649, + "train_speed(iter/s)": 0.213991 + }, + { + "epoch": 0.7739618406285073, + "grad_norm": 1.4294860363006592, + "learning_rate": 1.3321444824062956e-06, + "loss": 0.03754590451717377, + "memory(GiB)": 17.38, + "step": 4310, + "token_acc": 0.9806034482758621, + "train_speed(iter/s)": 0.214039 + }, + { + "epoch": 0.7748597081930415, + "grad_norm": 0.9175985455513, + "learning_rate": 1.3220686602367688e-06, + "loss": 0.030617910623550414, + "memory(GiB)": 17.38, + "step": 4315, + "token_acc": 0.9872340425531915, + "train_speed(iter/s)": 0.214084 + }, + { + "epoch": 0.7757575757575758, + "grad_norm": 0.7042086720466614, + "learning_rate": 1.3120252791931326e-06, + "loss": 0.02365140914916992, + "memory(GiB)": 17.38, + "step": 4320, + "token_acc": 0.989406779661017, + "train_speed(iter/s)": 0.214131 + }, + { + "epoch": 0.77665544332211, + "grad_norm": 1.0479687452316284, + "learning_rate": 1.3020144278628478e-06, + "loss": 0.03161399364471436, + "memory(GiB)": 17.38, + "step": 4325, + "token_acc": 0.9854166666666667, + "train_speed(iter/s)": 0.214178 + }, + { + "epoch": 0.7775533108866443, + "grad_norm": 1.6007845401763916, + "learning_rate": 1.2920361945464422e-06, + "loss": 0.02449537217617035, + "memory(GiB)": 17.38, + "step": 4330, + "token_acc": 0.989406779661017, + "train_speed(iter/s)": 0.214224 + }, + { + "epoch": 0.7784511784511785, + "grad_norm": 1.5079424381256104, + "learning_rate": 1.282090667256743e-06, + "loss": 0.03528004288673401, + "memory(GiB)": 17.38, + "step": 4335, + "token_acc": 0.9819277108433735, + "train_speed(iter/s)": 0.214271 + }, + { + "epoch": 0.7793490460157126, + "grad_norm": 1.1626056432724, + "learning_rate": 1.2721779337180885e-06, + "loss": 0.024642500281333923, + "memory(GiB)": 17.38, + "step": 4340, + "token_acc": 0.9833333333333333, + "train_speed(iter/s)": 0.214317 + }, + { + "epoch": 0.7802469135802469, + "grad_norm": 0.5241588354110718, + "learning_rate": 1.2622980813655639e-06, + "loss": 0.02092319428920746, + "memory(GiB)": 17.38, + "step": 4345, + "token_acc": 0.9849785407725322, + "train_speed(iter/s)": 0.214362 + }, + { + "epoch": 0.7811447811447811, + "grad_norm": 0.5101640224456787, + "learning_rate": 1.2524511973442238e-06, + "loss": 0.019632357358932494, + "memory(GiB)": 17.38, + "step": 4350, + "token_acc": 0.9875518672199171, + "train_speed(iter/s)": 0.214407 + }, + { + "epoch": 0.7820426487093154, + "grad_norm": 0.7707279920578003, + "learning_rate": 1.2426373685083316e-06, + "loss": 0.023081228137016296, + "memory(GiB)": 17.38, + "step": 4355, + "token_acc": 0.9854771784232366, + "train_speed(iter/s)": 0.214453 + }, + { + "epoch": 0.7829405162738496, + "grad_norm": 2.153534173965454, + "learning_rate": 1.2328566814205806e-06, + "loss": 0.03221791982650757, + "memory(GiB)": 17.38, + "step": 4360, + "token_acc": 0.979253112033195, + "train_speed(iter/s)": 0.214496 + }, + { + "epoch": 0.7838383838383839, + "grad_norm": 1.0108195543289185, + "learning_rate": 1.2231092223513452e-06, + "loss": 0.02405703067779541, + "memory(GiB)": 17.38, + "step": 4365, + "token_acc": 0.9854166666666667, + "train_speed(iter/s)": 0.214542 + }, + { + "epoch": 0.784736251402918, + "grad_norm": 1.7818634510040283, + "learning_rate": 1.2133950772779074e-06, + "loss": 0.03908553123474121, + "memory(GiB)": 17.38, + "step": 4370, + "token_acc": 0.9835390946502057, + "train_speed(iter/s)": 0.214588 + }, + { + "epoch": 0.7856341189674523, + "grad_norm": 0.982045590877533, + "learning_rate": 1.2037143318837059e-06, + "loss": 0.027974212169647218, + "memory(GiB)": 17.38, + "step": 4375, + "token_acc": 0.983402489626556, + "train_speed(iter/s)": 0.214633 + }, + { + "epoch": 0.7865319865319865, + "grad_norm": 0.9785454273223877, + "learning_rate": 1.1940670715575764e-06, + "loss": 0.03869582712650299, + "memory(GiB)": 17.38, + "step": 4380, + "token_acc": 0.9854166666666667, + "train_speed(iter/s)": 0.214678 + }, + { + "epoch": 0.7874298540965208, + "grad_norm": 2.980783224105835, + "learning_rate": 1.184453381393005e-06, + "loss": 0.03085460364818573, + "memory(GiB)": 17.38, + "step": 4385, + "token_acc": 0.98125, + "train_speed(iter/s)": 0.214723 + }, + { + "epoch": 0.788327721661055, + "grad_norm": 0.6352113485336304, + "learning_rate": 1.1748733461873652e-06, + "loss": 0.029062888026237486, + "memory(GiB)": 17.38, + "step": 4390, + "token_acc": 0.9917695473251029, + "train_speed(iter/s)": 0.214768 + }, + { + "epoch": 0.7892255892255893, + "grad_norm": 0.33760499954223633, + "learning_rate": 1.1653270504411868e-06, + "loss": 0.029608726501464844, + "memory(GiB)": 17.38, + "step": 4395, + "token_acc": 0.9832635983263598, + "train_speed(iter/s)": 0.214811 + }, + { + "epoch": 0.7901234567901234, + "grad_norm": 0.7048406004905701, + "learning_rate": 1.1558145783573905e-06, + "loss": 0.023440904915332794, + "memory(GiB)": 17.38, + "step": 4400, + "token_acc": 0.9896265560165975, + "train_speed(iter/s)": 0.214855 + }, + { + "epoch": 0.7901234567901234, + "eval_loss": 0.02855849638581276, + "eval_runtime": 59.4489, + "eval_samples_per_second": 15.139, + "eval_steps_per_second": 7.57, + "eval_token_acc": 0.9849046119651788, + "step": 4400 + }, + { + "epoch": 0.7910213243546577, + "grad_norm": 0.7038957476615906, + "learning_rate": 1.1463360138405654e-06, + "loss": 0.01701013743877411, + "memory(GiB)": 17.38, + "step": 4405, + "token_acc": 0.9863013698630136, + "train_speed(iter/s)": 0.214198 + }, + { + "epoch": 0.7919191919191919, + "grad_norm": 0.7166315317153931, + "learning_rate": 1.1368914404962145e-06, + "loss": 0.03113364577293396, + "memory(GiB)": 17.38, + "step": 4410, + "token_acc": 0.9814814814814815, + "train_speed(iter/s)": 0.214243 + }, + { + "epoch": 0.7928170594837262, + "grad_norm": 0.8743456602096558, + "learning_rate": 1.1274809416300252e-06, + "loss": 0.030266988277435302, + "memory(GiB)": 17.38, + "step": 4415, + "token_acc": 0.9817073170731707, + "train_speed(iter/s)": 0.214289 + }, + { + "epoch": 0.7937149270482604, + "grad_norm": 0.7038944363594055, + "learning_rate": 1.1181046002471292e-06, + "loss": 0.024662519991397857, + "memory(GiB)": 17.38, + "step": 4420, + "token_acc": 0.9810126582278481, + "train_speed(iter/s)": 0.214335 + }, + { + "epoch": 0.7946127946127947, + "grad_norm": 0.3838299512863159, + "learning_rate": 1.1087624990513735e-06, + "loss": 0.032780641317367555, + "memory(GiB)": 17.38, + "step": 4425, + "token_acc": 0.9916317991631799, + "train_speed(iter/s)": 0.214381 + }, + { + "epoch": 0.7955106621773288, + "grad_norm": 0.7157024145126343, + "learning_rate": 1.0994547204445893e-06, + "loss": 0.027820771932601927, + "memory(GiB)": 17.38, + "step": 4430, + "token_acc": 0.9898373983739838, + "train_speed(iter/s)": 0.214426 + }, + { + "epoch": 0.796408529741863, + "grad_norm": 1.7827956676483154, + "learning_rate": 1.0901813465258688e-06, + "loss": 0.028691038489341736, + "memory(GiB)": 17.38, + "step": 4435, + "token_acc": 0.9831932773109243, + "train_speed(iter/s)": 0.214471 + }, + { + "epoch": 0.7973063973063973, + "grad_norm": 1.6897979974746704, + "learning_rate": 1.0809424590908346e-06, + "loss": 0.024974721670150756, + "memory(GiB)": 17.38, + "step": 4440, + "token_acc": 0.9895833333333334, + "train_speed(iter/s)": 0.214517 + }, + { + "epoch": 0.7982042648709315, + "grad_norm": 1.1603997945785522, + "learning_rate": 1.0717381396309256e-06, + "loss": 0.0259869247674942, + "memory(GiB)": 17.38, + "step": 4445, + "token_acc": 0.9895833333333334, + "train_speed(iter/s)": 0.214563 + }, + { + "epoch": 0.7991021324354658, + "grad_norm": 1.2864385843276978, + "learning_rate": 1.0625684693326727e-06, + "loss": 0.034084361791610715, + "memory(GiB)": 17.38, + "step": 4450, + "token_acc": 0.9774590163934426, + "train_speed(iter/s)": 0.214607 + }, + { + "epoch": 0.8, + "grad_norm": 1.2499374151229858, + "learning_rate": 1.053433529076982e-06, + "loss": 0.021215659379959107, + "memory(GiB)": 17.38, + "step": 4455, + "token_acc": 0.993801652892562, + "train_speed(iter/s)": 0.214653 + }, + { + "epoch": 0.8008978675645342, + "grad_norm": 1.0336294174194336, + "learning_rate": 1.0443333994384298e-06, + "loss": 0.02553292512893677, + "memory(GiB)": 17.38, + "step": 4460, + "token_acc": 0.9898373983739838, + "train_speed(iter/s)": 0.214697 + }, + { + "epoch": 0.8017957351290684, + "grad_norm": 0.5307586789131165, + "learning_rate": 1.0352681606845394e-06, + "loss": 0.024311122298240662, + "memory(GiB)": 17.38, + "step": 4465, + "token_acc": 0.9899598393574297, + "train_speed(iter/s)": 0.214741 + }, + { + "epoch": 0.8026936026936027, + "grad_norm": 1.1317822933197021, + "learning_rate": 1.0262378927750854e-06, + "loss": 0.027873358130455016, + "memory(GiB)": 17.38, + "step": 4470, + "token_acc": 0.9895833333333334, + "train_speed(iter/s)": 0.214783 + }, + { + "epoch": 0.8035914702581369, + "grad_norm": 0.9324398040771484, + "learning_rate": 1.0172426753613801e-06, + "loss": 0.0344790518283844, + "memory(GiB)": 17.38, + "step": 4475, + "token_acc": 0.9858299595141701, + "train_speed(iter/s)": 0.214828 + }, + { + "epoch": 0.8044893378226712, + "grad_norm": 1.6751203536987305, + "learning_rate": 1.0082825877855718e-06, + "loss": 0.029978978633880615, + "memory(GiB)": 17.38, + "step": 4480, + "token_acc": 0.9872340425531915, + "train_speed(iter/s)": 0.21487 + }, + { + "epoch": 0.8053872053872054, + "grad_norm": 1.0042208433151245, + "learning_rate": 9.993577090799488e-07, + "loss": 0.022933849692344667, + "memory(GiB)": 17.38, + "step": 4485, + "token_acc": 0.9895397489539749, + "train_speed(iter/s)": 0.214915 + }, + { + "epoch": 0.8062850729517396, + "grad_norm": 1.4133613109588623, + "learning_rate": 9.904681179662418e-07, + "loss": 0.028456294536590578, + "memory(GiB)": 17.38, + "step": 4490, + "token_acc": 0.9875, + "train_speed(iter/s)": 0.214958 + }, + { + "epoch": 0.8071829405162738, + "grad_norm": 1.4486221075057983, + "learning_rate": 9.81613892854923e-07, + "loss": 0.028815671801567078, + "memory(GiB)": 17.38, + "step": 4495, + "token_acc": 0.9832635983263598, + "train_speed(iter/s)": 0.215 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.8121383190155029, + "learning_rate": 9.727951118445278e-07, + "loss": 0.02200884521007538, + "memory(GiB)": 17.38, + "step": 4500, + "token_acc": 0.9896265560165975, + "train_speed(iter/s)": 0.215043 + }, + { + "epoch": 0.8080808080808081, + "eval_loss": 0.02846209704875946, + "eval_runtime": 59.8679, + "eval_samples_per_second": 15.033, + "eval_steps_per_second": 7.517, + "eval_token_acc": 0.9862937581033525, + "step": 4500 + }, + { + "epoch": 0.8089786756453423, + "grad_norm": 1.1521605253219604, + "learning_rate": 9.640118527209457e-07, + "loss": 0.030388808250427245, + "memory(GiB)": 17.38, + "step": 4505, + "token_acc": 0.986472602739726, + "train_speed(iter/s)": 0.214394 + }, + { + "epoch": 0.8098765432098766, + "grad_norm": 2.259798049926758, + "learning_rate": 9.55264192956758e-07, + "loss": 0.03616534173488617, + "memory(GiB)": 17.38, + "step": 4510, + "token_acc": 0.9770833333333333, + "train_speed(iter/s)": 0.214437 + }, + { + "epoch": 0.8107744107744108, + "grad_norm": 0.3351824879646301, + "learning_rate": 9.465522097105329e-07, + "loss": 0.026586857438087464, + "memory(GiB)": 17.38, + "step": 4515, + "token_acc": 0.9876543209876543, + "train_speed(iter/s)": 0.214479 + }, + { + "epoch": 0.811672278338945, + "grad_norm": 1.8978947401046753, + "learning_rate": 9.378759798261622e-07, + "loss": 0.030919963121414186, + "memory(GiB)": 17.38, + "step": 4520, + "token_acc": 0.9875, + "train_speed(iter/s)": 0.214523 + }, + { + "epoch": 0.8125701459034792, + "grad_norm": 1.0190826654434204, + "learning_rate": 9.292355798321701e-07, + "loss": 0.02850286066532135, + "memory(GiB)": 17.38, + "step": 4525, + "token_acc": 0.9836065573770492, + "train_speed(iter/s)": 0.214565 + }, + { + "epoch": 0.8134680134680135, + "grad_norm": 1.0378633737564087, + "learning_rate": 9.206310859410494e-07, + "loss": 0.017400476336479186, + "memory(GiB)": 17.38, + "step": 4530, + "token_acc": 0.9939024390243902, + "train_speed(iter/s)": 0.214608 + }, + { + "epoch": 0.8143658810325477, + "grad_norm": 1.526329755783081, + "learning_rate": 9.120625740485762e-07, + "loss": 0.02371554374694824, + "memory(GiB)": 17.38, + "step": 4535, + "token_acc": 0.9891774891774892, + "train_speed(iter/s)": 0.21465 + }, + { + "epoch": 0.815263748597082, + "grad_norm": 1.9005310535430908, + "learning_rate": 9.035301197331553e-07, + "loss": 0.03473139107227326, + "memory(GiB)": 17.38, + "step": 4540, + "token_acc": 0.9774590163934426, + "train_speed(iter/s)": 0.214693 + }, + { + "epoch": 0.8161616161616162, + "grad_norm": 0.4489269554615021, + "learning_rate": 8.95033798255141e-07, + "loss": 0.02181600034236908, + "memory(GiB)": 17.38, + "step": 4545, + "token_acc": 0.9897119341563786, + "train_speed(iter/s)": 0.214734 + }, + { + "epoch": 0.8170594837261503, + "grad_norm": 0.5243973731994629, + "learning_rate": 8.865736845561845e-07, + "loss": 0.027533432841300963, + "memory(GiB)": 17.38, + "step": 4550, + "token_acc": 0.9891774891774892, + "train_speed(iter/s)": 0.214776 + }, + { + "epoch": 0.8179573512906846, + "grad_norm": 2.0725953578948975, + "learning_rate": 8.781498532585614e-07, + "loss": 0.031092095375061034, + "memory(GiB)": 17.38, + "step": 4555, + "token_acc": 0.9852941176470589, + "train_speed(iter/s)": 0.214817 + }, + { + "epoch": 0.8188552188552188, + "grad_norm": 1.067928433418274, + "learning_rate": 8.697623786645198e-07, + "loss": 0.024662891030311586, + "memory(GiB)": 17.38, + "step": 4560, + "token_acc": 0.9875, + "train_speed(iter/s)": 0.214858 + }, + { + "epoch": 0.8197530864197531, + "grad_norm": 0.6206085085868835, + "learning_rate": 8.614113347556285e-07, + "loss": 0.0335844874382019, + "memory(GiB)": 17.38, + "step": 4565, + "token_acc": 0.9854771784232366, + "train_speed(iter/s)": 0.214897 + }, + { + "epoch": 0.8206509539842873, + "grad_norm": 1.1404248476028442, + "learning_rate": 8.530967951921149e-07, + "loss": 0.02872735559940338, + "memory(GiB)": 17.38, + "step": 4570, + "token_acc": 0.9851694915254238, + "train_speed(iter/s)": 0.214936 + }, + { + "epoch": 0.8215488215488216, + "grad_norm": 0.7243728637695312, + "learning_rate": 8.44818833312222e-07, + "loss": 0.021016553044319153, + "memory(GiB)": 17.38, + "step": 4575, + "token_acc": 0.9936974789915967, + "train_speed(iter/s)": 0.214977 + }, + { + "epoch": 0.8224466891133558, + "grad_norm": 2.3505282402038574, + "learning_rate": 8.365775221315636e-07, + "loss": 0.023282404243946075, + "memory(GiB)": 17.38, + "step": 4580, + "token_acc": 0.9895397489539749, + "train_speed(iter/s)": 0.215018 + }, + { + "epoch": 0.82334455667789, + "grad_norm": 1.579245924949646, + "learning_rate": 8.283729343424718e-07, + "loss": 0.028078573942184448, + "memory(GiB)": 17.38, + "step": 4585, + "token_acc": 0.987603305785124, + "train_speed(iter/s)": 0.215058 + }, + { + "epoch": 0.8242424242424242, + "grad_norm": 3.101959228515625, + "learning_rate": 8.202051423133611e-07, + "loss": 0.0243370920419693, + "memory(GiB)": 17.38, + "step": 4590, + "token_acc": 0.9854771784232366, + "train_speed(iter/s)": 0.215099 + }, + { + "epoch": 0.8251402918069585, + "grad_norm": 1.8192346096038818, + "learning_rate": 8.120742180880947e-07, + "loss": 0.03095346689224243, + "memory(GiB)": 17.38, + "step": 4595, + "token_acc": 0.9834710743801653, + "train_speed(iter/s)": 0.215141 + }, + { + "epoch": 0.8260381593714927, + "grad_norm": 1.3214001655578613, + "learning_rate": 8.039802333853369e-07, + "loss": 0.03128174841403961, + "memory(GiB)": 17.38, + "step": 4600, + "token_acc": 0.9915966386554622, + "train_speed(iter/s)": 0.215183 + }, + { + "epoch": 0.8260381593714927, + "eval_loss": 0.028019437566399574, + "eval_runtime": 60.4033, + "eval_samples_per_second": 14.9, + "eval_steps_per_second": 7.45, + "eval_token_acc": 0.9858307093906279, + "step": 4600 + }, + { + "epoch": 0.826936026936027, + "grad_norm": 0.9889150857925415, + "learning_rate": 7.959232595979349e-07, + "loss": 0.03575218915939331, + "memory(GiB)": 17.38, + "step": 4605, + "token_acc": 0.987012987012987, + "train_speed(iter/s)": 0.214539 + }, + { + "epoch": 0.8278338945005612, + "grad_norm": 0.3053978681564331, + "learning_rate": 7.879033677922759e-07, + "loss": 0.015581905841827393, + "memory(GiB)": 17.38, + "step": 4610, + "token_acc": 0.9918367346938776, + "train_speed(iter/s)": 0.214583 + }, + { + "epoch": 0.8287317620650954, + "grad_norm": 0.9240614175796509, + "learning_rate": 7.799206287076694e-07, + "loss": 0.02756223678588867, + "memory(GiB)": 17.38, + "step": 4615, + "token_acc": 0.9859437751004017, + "train_speed(iter/s)": 0.214625 + }, + { + "epoch": 0.8296296296296296, + "grad_norm": 1.5516794919967651, + "learning_rate": 7.719751127557168e-07, + "loss": 0.039028877019882204, + "memory(GiB)": 17.38, + "step": 4620, + "token_acc": 0.9770833333333333, + "train_speed(iter/s)": 0.214669 + }, + { + "epoch": 0.8305274971941639, + "grad_norm": 0.640664279460907, + "learning_rate": 7.640668900196985e-07, + "loss": 0.02044615149497986, + "memory(GiB)": 17.38, + "step": 4625, + "token_acc": 0.9935897435897436, + "train_speed(iter/s)": 0.214712 + }, + { + "epoch": 0.8314253647586981, + "grad_norm": 1.3384597301483154, + "learning_rate": 7.561960302539461e-07, + "loss": 0.024951864778995515, + "memory(GiB)": 17.38, + "step": 4630, + "token_acc": 0.9871244635193133, + "train_speed(iter/s)": 0.214755 + }, + { + "epoch": 0.8323232323232324, + "grad_norm": 1.5596963167190552, + "learning_rate": 7.483626028832369e-07, + "loss": 0.031386613845825195, + "memory(GiB)": 17.38, + "step": 4635, + "token_acc": 0.9978991596638656, + "train_speed(iter/s)": 0.214798 + }, + { + "epoch": 0.8332210998877666, + "grad_norm": 1.8345504999160767, + "learning_rate": 7.405666770021686e-07, + "loss": 0.025643932819366454, + "memory(GiB)": 17.38, + "step": 4640, + "token_acc": 0.9872881355932204, + "train_speed(iter/s)": 0.21484 + }, + { + "epoch": 0.8341189674523007, + "grad_norm": 1.3681524991989136, + "learning_rate": 7.328083213745657e-07, + "loss": 0.016640958189964295, + "memory(GiB)": 17.38, + "step": 4645, + "token_acc": 0.9914893617021276, + "train_speed(iter/s)": 0.214881 + }, + { + "epoch": 0.835016835016835, + "grad_norm": 0.7963857650756836, + "learning_rate": 7.250876044328598e-07, + "loss": 0.016118553280830384, + "memory(GiB)": 17.38, + "step": 4650, + "token_acc": 0.9956709956709957, + "train_speed(iter/s)": 0.214922 + }, + { + "epoch": 0.8359147025813692, + "grad_norm": 0.7750681638717651, + "learning_rate": 7.174045942774949e-07, + "loss": 0.03155998587608337, + "memory(GiB)": 17.38, + "step": 4655, + "token_acc": 0.9874476987447699, + "train_speed(iter/s)": 0.214966 + }, + { + "epoch": 0.8368125701459035, + "grad_norm": 1.3200747966766357, + "learning_rate": 7.097593586763186e-07, + "loss": 0.020320066809654237, + "memory(GiB)": 17.38, + "step": 4660, + "token_acc": 0.9938524590163934, + "train_speed(iter/s)": 0.215008 + }, + { + "epoch": 0.8377104377104377, + "grad_norm": 2.1913084983825684, + "learning_rate": 7.021519650639952e-07, + "loss": 0.03614517450332642, + "memory(GiB)": 17.38, + "step": 4665, + "token_acc": 0.9836065573770492, + "train_speed(iter/s)": 0.215048 + }, + { + "epoch": 0.838608305274972, + "grad_norm": 0.849054217338562, + "learning_rate": 6.945824805413959e-07, + "loss": 0.025873470306396484, + "memory(GiB)": 17.38, + "step": 4670, + "token_acc": 0.9915611814345991, + "train_speed(iter/s)": 0.215089 + }, + { + "epoch": 0.8395061728395061, + "grad_norm": 1.2689311504364014, + "learning_rate": 6.870509718750229e-07, + "loss": 0.03429421186447144, + "memory(GiB)": 17.38, + "step": 4675, + "token_acc": 0.9834710743801653, + "train_speed(iter/s)": 0.215128 + }, + { + "epoch": 0.8404040404040404, + "grad_norm": 0.36143457889556885, + "learning_rate": 6.795575054964088e-07, + "loss": 0.024675777554512023, + "memory(GiB)": 17.38, + "step": 4680, + "token_acc": 0.9891774891774892, + "train_speed(iter/s)": 0.215169 + }, + { + "epoch": 0.8413019079685746, + "grad_norm": 1.2493942975997925, + "learning_rate": 6.721021475015377e-07, + "loss": 0.02936035394668579, + "memory(GiB)": 17.38, + "step": 4685, + "token_acc": 0.9814814814814815, + "train_speed(iter/s)": 0.215211 + }, + { + "epoch": 0.8421997755331089, + "grad_norm": 1.1786699295043945, + "learning_rate": 6.646849636502567e-07, + "loss": 0.022340220212936402, + "memory(GiB)": 17.38, + "step": 4690, + "token_acc": 0.9877049180327869, + "train_speed(iter/s)": 0.215254 + }, + { + "epoch": 0.8430976430976431, + "grad_norm": 0.34499016404151917, + "learning_rate": 6.573060193657e-07, + "loss": 0.0314663976430893, + "memory(GiB)": 17.38, + "step": 4695, + "token_acc": 0.9831932773109243, + "train_speed(iter/s)": 0.215296 + }, + { + "epoch": 0.8439955106621774, + "grad_norm": 0.9373217821121216, + "learning_rate": 6.499653797337107e-07, + "loss": 0.029464873671531677, + "memory(GiB)": 17.38, + "step": 4700, + "token_acc": 0.9774590163934426, + "train_speed(iter/s)": 0.215337 + }, + { + "epoch": 0.8439955106621774, + "eval_loss": 0.028368979692459106, + "eval_runtime": 60.3566, + "eval_samples_per_second": 14.911, + "eval_steps_per_second": 7.456, + "eval_token_acc": 0.9851824411928135, + "step": 4700 + }, + { + "epoch": 0.8448933782267115, + "grad_norm": 0.8249322772026062, + "learning_rate": 6.426631095022667e-07, + "loss": 0.02286662459373474, + "memory(GiB)": 17.38, + "step": 4705, + "token_acc": 0.9861396303901437, + "train_speed(iter/s)": 0.214705 + }, + { + "epoch": 0.8457912457912458, + "grad_norm": 0.5536770820617676, + "learning_rate": 6.353992730809061e-07, + "loss": 0.020447969436645508, + "memory(GiB)": 17.38, + "step": 4710, + "token_acc": 0.9900398406374502, + "train_speed(iter/s)": 0.214748 + }, + { + "epoch": 0.84668911335578, + "grad_norm": 0.819101870059967, + "learning_rate": 6.281739345401677e-07, + "loss": 0.022961407899856567, + "memory(GiB)": 17.38, + "step": 4715, + "token_acc": 0.9937759336099585, + "train_speed(iter/s)": 0.214789 + }, + { + "epoch": 0.8475869809203143, + "grad_norm": 1.0500142574310303, + "learning_rate": 6.209871576110161e-07, + "loss": 0.024665701389312743, + "memory(GiB)": 17.38, + "step": 4720, + "token_acc": 0.9871244635193133, + "train_speed(iter/s)": 0.214831 + }, + { + "epoch": 0.8484848484848485, + "grad_norm": 0.9561021327972412, + "learning_rate": 6.138390056842847e-07, + "loss": 0.025862562656402587, + "memory(GiB)": 17.38, + "step": 4725, + "token_acc": 0.99581589958159, + "train_speed(iter/s)": 0.214872 + }, + { + "epoch": 0.8493827160493828, + "grad_norm": 1.6456671953201294, + "learning_rate": 6.067295418101176e-07, + "loss": 0.04175513982772827, + "memory(GiB)": 17.38, + "step": 4730, + "token_acc": 0.986, + "train_speed(iter/s)": 0.214913 + }, + { + "epoch": 0.8502805836139169, + "grad_norm": 1.2079863548278809, + "learning_rate": 5.996588286974092e-07, + "loss": 0.019708405435085296, + "memory(GiB)": 17.38, + "step": 4735, + "token_acc": 0.9894957983193278, + "train_speed(iter/s)": 0.214952 + }, + { + "epoch": 0.8511784511784511, + "grad_norm": 0.8447543978691101, + "learning_rate": 5.926269287132569e-07, + "loss": 0.02823741137981415, + "memory(GiB)": 17.38, + "step": 4740, + "token_acc": 0.9894957983193278, + "train_speed(iter/s)": 0.214992 + }, + { + "epoch": 0.8520763187429854, + "grad_norm": 1.1406478881835938, + "learning_rate": 5.856339038824038e-07, + "loss": 0.01721465289592743, + "memory(GiB)": 17.38, + "step": 4745, + "token_acc": 0.99375, + "train_speed(iter/s)": 0.215029 + }, + { + "epoch": 0.8529741863075196, + "grad_norm": 1.6712257862091064, + "learning_rate": 5.786798158866969e-07, + "loss": 0.032575753331184384, + "memory(GiB)": 17.38, + "step": 4750, + "token_acc": 0.9835390946502057, + "train_speed(iter/s)": 0.215068 + }, + { + "epoch": 0.8538720538720539, + "grad_norm": 1.619740605354309, + "learning_rate": 5.71764726064542e-07, + "loss": 0.030162209272384645, + "memory(GiB)": 17.38, + "step": 4755, + "token_acc": 0.9895397489539749, + "train_speed(iter/s)": 0.215105 + }, + { + "epoch": 0.8547699214365881, + "grad_norm": 1.3289310932159424, + "learning_rate": 5.648886954103627e-07, + "loss": 0.03657462596893311, + "memory(GiB)": 17.38, + "step": 4760, + "token_acc": 0.9831932773109243, + "train_speed(iter/s)": 0.215142 + }, + { + "epoch": 0.8556677890011223, + "grad_norm": 0.9925198554992676, + "learning_rate": 5.580517845740602e-07, + "loss": 0.024964751303195955, + "memory(GiB)": 17.38, + "step": 4765, + "token_acc": 0.9831932773109243, + "train_speed(iter/s)": 0.215181 + }, + { + "epoch": 0.8565656565656565, + "grad_norm": 11.695115089416504, + "learning_rate": 5.512540538604833e-07, + "loss": 0.027210599184036253, + "memory(GiB)": 17.38, + "step": 4770, + "token_acc": 0.9830508474576272, + "train_speed(iter/s)": 0.215219 + }, + { + "epoch": 0.8574635241301908, + "grad_norm": 1.5092613697052002, + "learning_rate": 5.444955632288873e-07, + "loss": 0.023456774652004242, + "memory(GiB)": 17.38, + "step": 4775, + "token_acc": 0.9836065573770492, + "train_speed(iter/s)": 0.215255 + }, + { + "epoch": 0.858361391694725, + "grad_norm": 1.4480104446411133, + "learning_rate": 5.377763722924184e-07, + "loss": 0.02511906623840332, + "memory(GiB)": 17.38, + "step": 4780, + "token_acc": 0.9918367346938776, + "train_speed(iter/s)": 0.215293 + }, + { + "epoch": 0.8592592592592593, + "grad_norm": 1.1284476518630981, + "learning_rate": 5.310965403175739e-07, + "loss": 0.027672219276428222, + "memory(GiB)": 17.38, + "step": 4785, + "token_acc": 0.9917695473251029, + "train_speed(iter/s)": 0.21533 + }, + { + "epoch": 0.8601571268237935, + "grad_norm": 0.9012276530265808, + "learning_rate": 5.244561262236913e-07, + "loss": 0.024995729327201843, + "memory(GiB)": 17.38, + "step": 4790, + "token_acc": 0.9894957983193278, + "train_speed(iter/s)": 0.215368 + }, + { + "epoch": 0.8610549943883278, + "grad_norm": 1.7996430397033691, + "learning_rate": 5.178551885824201e-07, + "loss": 0.02610649764537811, + "memory(GiB)": 17.38, + "step": 4795, + "token_acc": 0.9869565217391304, + "train_speed(iter/s)": 0.215405 + }, + { + "epoch": 0.8619528619528619, + "grad_norm": 0.7965891361236572, + "learning_rate": 5.112937856172124e-07, + "loss": 0.02759631872177124, + "memory(GiB)": 17.38, + "step": 4800, + "token_acc": 0.9916666666666667, + "train_speed(iter/s)": 0.215442 + }, + { + "epoch": 0.8619528619528619, + "eval_loss": 0.028012992814183235, + "eval_runtime": 62.7828, + "eval_samples_per_second": 14.335, + "eval_steps_per_second": 7.168, + "eval_token_acc": 0.9858307093906279, + "step": 4800 + }, + { + "epoch": 0.8628507295173962, + "grad_norm": 2.1714131832122803, + "learning_rate": 5.04771975202798e-07, + "loss": 0.030586045980453492, + "memory(GiB)": 17.38, + "step": 4805, + "token_acc": 0.9864911080711354, + "train_speed(iter/s)": 0.214785 + }, + { + "epoch": 0.8637485970819304, + "grad_norm": 1.6416758298873901, + "learning_rate": 4.982898148646892e-07, + "loss": 0.0372624933719635, + "memory(GiB)": 17.38, + "step": 4810, + "token_acc": 0.9790794979079498, + "train_speed(iter/s)": 0.214819 + }, + { + "epoch": 0.8646464646464647, + "grad_norm": 0.7899372577667236, + "learning_rate": 4.918473617786613e-07, + "loss": 0.022367826104164122, + "memory(GiB)": 17.38, + "step": 4815, + "token_acc": 0.9957627118644068, + "train_speed(iter/s)": 0.214859 + }, + { + "epoch": 0.8655443322109989, + "grad_norm": 0.3184243142604828, + "learning_rate": 4.854446727702538e-07, + "loss": 0.0226868137717247, + "memory(GiB)": 17.38, + "step": 4820, + "token_acc": 0.9938524590163934, + "train_speed(iter/s)": 0.214899 + }, + { + "epoch": 0.8664421997755332, + "grad_norm": 0.696887731552124, + "learning_rate": 4.790818043142681e-07, + "loss": 0.027195435762405396, + "memory(GiB)": 17.38, + "step": 4825, + "token_acc": 0.9813278008298755, + "train_speed(iter/s)": 0.214939 + }, + { + "epoch": 0.8673400673400673, + "grad_norm": 2.242636203765869, + "learning_rate": 4.727588125342669e-07, + "loss": 0.03733437657356262, + "memory(GiB)": 17.38, + "step": 4830, + "token_acc": 0.981404958677686, + "train_speed(iter/s)": 0.214977 + }, + { + "epoch": 0.8682379349046015, + "grad_norm": 0.6172637939453125, + "learning_rate": 4.6647575320208526e-07, + "loss": 0.02494046688079834, + "memory(GiB)": 17.38, + "step": 4835, + "token_acc": 0.989406779661017, + "train_speed(iter/s)": 0.215012 + }, + { + "epoch": 0.8691358024691358, + "grad_norm": 1.2297143936157227, + "learning_rate": 4.6023268173733093e-07, + "loss": 0.025543856620788574, + "memory(GiB)": 17.38, + "step": 4840, + "token_acc": 0.987603305785124, + "train_speed(iter/s)": 0.215047 + }, + { + "epoch": 0.87003367003367, + "grad_norm": 0.9486376643180847, + "learning_rate": 4.5402965320690326e-07, + "loss": 0.0196555033326149, + "memory(GiB)": 17.38, + "step": 4845, + "token_acc": 0.9914163090128756, + "train_speed(iter/s)": 0.215084 + }, + { + "epoch": 0.8709315375982043, + "grad_norm": 2.3950729370117188, + "learning_rate": 4.4786672232450035e-07, + "loss": 0.026487797498703003, + "memory(GiB)": 17.38, + "step": 4850, + "token_acc": 0.9829059829059829, + "train_speed(iter/s)": 0.215124 + }, + { + "epoch": 0.8718294051627385, + "grad_norm": 0.8147628307342529, + "learning_rate": 4.4174394345014083e-07, + "loss": 0.02194797098636627, + "memory(GiB)": 17.38, + "step": 4855, + "token_acc": 0.9877049180327869, + "train_speed(iter/s)": 0.215165 + }, + { + "epoch": 0.8727272727272727, + "grad_norm": 0.8070253133773804, + "learning_rate": 4.356613705896828e-07, + "loss": 0.02982144057750702, + "memory(GiB)": 17.38, + "step": 4860, + "token_acc": 0.9811715481171548, + "train_speed(iter/s)": 0.215205 + }, + { + "epoch": 0.8736251402918069, + "grad_norm": 1.1290311813354492, + "learning_rate": 4.296190573943504e-07, + "loss": 0.01738460510969162, + "memory(GiB)": 17.38, + "step": 4865, + "token_acc": 0.989451476793249, + "train_speed(iter/s)": 0.215241 + }, + { + "epoch": 0.8745230078563412, + "grad_norm": 1.0240559577941895, + "learning_rate": 4.2361705716025434e-07, + "loss": 0.024514345824718474, + "memory(GiB)": 17.38, + "step": 4870, + "token_acc": 0.9893617021276596, + "train_speed(iter/s)": 0.215281 + }, + { + "epoch": 0.8754208754208754, + "grad_norm": 1.3408843278884888, + "learning_rate": 4.176554228279289e-07, + "loss": 0.027880007028579713, + "memory(GiB)": 17.38, + "step": 4875, + "token_acc": 0.9777327935222672, + "train_speed(iter/s)": 0.215322 + }, + { + "epoch": 0.8763187429854097, + "grad_norm": 0.7241111993789673, + "learning_rate": 4.1173420698186027e-07, + "loss": 0.023846164345741272, + "memory(GiB)": 17.38, + "step": 4880, + "token_acc": 0.9807692307692307, + "train_speed(iter/s)": 0.215362 + }, + { + "epoch": 0.8772166105499439, + "grad_norm": 1.3932455778121948, + "learning_rate": 4.058534618500237e-07, + "loss": 0.023435330390930174, + "memory(GiB)": 17.38, + "step": 4885, + "token_acc": 0.9851694915254238, + "train_speed(iter/s)": 0.215401 + }, + { + "epoch": 0.8781144781144781, + "grad_norm": 1.746880054473877, + "learning_rate": 4.0001323930342286e-07, + "loss": 0.02412838190793991, + "memory(GiB)": 17.38, + "step": 4890, + "token_acc": 0.9851694915254238, + "train_speed(iter/s)": 0.215442 + }, + { + "epoch": 0.8790123456790123, + "grad_norm": 0.48470231890678406, + "learning_rate": 3.942135908556355e-07, + "loss": 0.026909294724464416, + "memory(GiB)": 17.38, + "step": 4895, + "token_acc": 0.9815573770491803, + "train_speed(iter/s)": 0.215481 + }, + { + "epoch": 0.8799102132435466, + "grad_norm": 0.48904848098754883, + "learning_rate": 3.8845456766235246e-07, + "loss": 0.024153578281402587, + "memory(GiB)": 17.38, + "step": 4900, + "token_acc": 0.9874476987447699, + "train_speed(iter/s)": 0.215521 + }, + { + "epoch": 0.8799102132435466, + "eval_loss": 0.028149865567684174, + "eval_runtime": 59.1716, + "eval_samples_per_second": 15.21, + "eval_steps_per_second": 7.605, + "eval_token_acc": 0.9854602704204483, + "step": 4900 + }, + { + "epoch": 0.8808080808080808, + "grad_norm": 0.9238405227661133, + "learning_rate": 3.827362205209345e-07, + "loss": 0.020637691020965576, + "memory(GiB)": 17.38, + "step": 4905, + "token_acc": 0.9858071135430917, + "train_speed(iter/s)": 0.214924 + }, + { + "epoch": 0.8817059483726151, + "grad_norm": 0.7521107196807861, + "learning_rate": 3.770585998699544e-07, + "loss": 0.01900961101055145, + "memory(GiB)": 17.38, + "step": 4910, + "token_acc": 0.9917355371900827, + "train_speed(iter/s)": 0.214963 + }, + { + "epoch": 0.8826038159371493, + "grad_norm": 2.4239983558654785, + "learning_rate": 3.7142175578876395e-07, + "loss": 0.03327087163925171, + "memory(GiB)": 17.38, + "step": 4915, + "token_acc": 0.9764957264957265, + "train_speed(iter/s)": 0.215002 + }, + { + "epoch": 0.8835016835016835, + "grad_norm": 0.4241452217102051, + "learning_rate": 3.6582573799704035e-07, + "loss": 0.02217492163181305, + "memory(GiB)": 17.38, + "step": 4920, + "token_acc": 0.9957805907172996, + "train_speed(iter/s)": 0.215043 + }, + { + "epoch": 0.8843995510662177, + "grad_norm": 1.0619988441467285, + "learning_rate": 3.6027059585435785e-07, + "loss": 0.020509295165538788, + "memory(GiB)": 17.38, + "step": 4925, + "token_acc": 0.9874476987447699, + "train_speed(iter/s)": 0.215081 + }, + { + "epoch": 0.885297418630752, + "grad_norm": 0.27455267310142517, + "learning_rate": 3.5475637835974466e-07, + "loss": 0.01841064989566803, + "memory(GiB)": 17.38, + "step": 4930, + "token_acc": 0.9893617021276596, + "train_speed(iter/s)": 0.215121 + }, + { + "epoch": 0.8861952861952862, + "grad_norm": 0.8495221138000488, + "learning_rate": 3.492831341512565e-07, + "loss": 0.03511955738067627, + "memory(GiB)": 17.38, + "step": 4935, + "token_acc": 0.976890756302521, + "train_speed(iter/s)": 0.215163 + }, + { + "epoch": 0.8870931537598205, + "grad_norm": 3.8337371349334717, + "learning_rate": 3.4385091150554105e-07, + "loss": 0.030250746011734008, + "memory(GiB)": 17.38, + "step": 4940, + "token_acc": 0.9854166666666667, + "train_speed(iter/s)": 0.215203 + }, + { + "epoch": 0.8879910213243547, + "grad_norm": 1.1767065525054932, + "learning_rate": 3.384597583374211e-07, + "loss": 0.017814779281616212, + "memory(GiB)": 17.38, + "step": 4945, + "token_acc": 0.9896265560165975, + "train_speed(iter/s)": 0.215243 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.34028780460357666, + "learning_rate": 3.331097221994617e-07, + "loss": 0.022717326879501343, + "memory(GiB)": 17.38, + "step": 4950, + "token_acc": 0.9857142857142858, + "train_speed(iter/s)": 0.215283 + }, + { + "epoch": 0.8897867564534231, + "grad_norm": 1.8622692823410034, + "learning_rate": 3.278008502815605e-07, + "loss": 0.03485904335975647, + "memory(GiB)": 17.38, + "step": 4955, + "token_acc": 0.9857142857142858, + "train_speed(iter/s)": 0.215322 + }, + { + "epoch": 0.8906846240179573, + "grad_norm": 1.4286247491836548, + "learning_rate": 3.225331894105238e-07, + "loss": 0.029091286659240722, + "memory(GiB)": 17.38, + "step": 4960, + "token_acc": 0.9872340425531915, + "train_speed(iter/s)": 0.215359 + }, + { + "epoch": 0.8915824915824916, + "grad_norm": 0.63150554895401, + "learning_rate": 3.1730678604965706e-07, + "loss": 0.02178904414176941, + "memory(GiB)": 17.38, + "step": 4965, + "token_acc": 0.9875518672199171, + "train_speed(iter/s)": 0.215397 + }, + { + "epoch": 0.8924803591470258, + "grad_norm": 1.714266061782837, + "learning_rate": 3.1212168629835724e-07, + "loss": 0.030321192741394044, + "memory(GiB)": 17.38, + "step": 4970, + "token_acc": 0.9793388429752066, + "train_speed(iter/s)": 0.215437 + }, + { + "epoch": 0.8933782267115601, + "grad_norm": 0.5082640051841736, + "learning_rate": 3.069779358916991e-07, + "loss": 0.028776955604553223, + "memory(GiB)": 17.38, + "step": 4975, + "token_acc": 0.9853556485355649, + "train_speed(iter/s)": 0.215476 + }, + { + "epoch": 0.8942760942760942, + "grad_norm": 1.2514303922653198, + "learning_rate": 3.0187558020004113e-07, + "loss": 0.02973783016204834, + "memory(GiB)": 17.38, + "step": 4980, + "token_acc": 0.9748953974895398, + "train_speed(iter/s)": 0.215515 + }, + { + "epoch": 0.8951739618406285, + "grad_norm": 1.8983955383300781, + "learning_rate": 2.9681466422861684e-07, + "loss": 0.03358994126319885, + "memory(GiB)": 17.38, + "step": 4985, + "token_acc": 0.9782608695652174, + "train_speed(iter/s)": 0.215554 + }, + { + "epoch": 0.8960718294051627, + "grad_norm": 2.0785417556762695, + "learning_rate": 2.917952326171425e-07, + "loss": 0.0280322402715683, + "memory(GiB)": 17.38, + "step": 4990, + "token_acc": 0.991701244813278, + "train_speed(iter/s)": 0.21559 + }, + { + "epoch": 0.896969696969697, + "grad_norm": 2.5344104766845703, + "learning_rate": 2.8681732963942223e-07, + "loss": 0.02883433997631073, + "memory(GiB)": 17.38, + "step": 4995, + "token_acc": 0.9813278008298755, + "train_speed(iter/s)": 0.215622 + }, + { + "epoch": 0.8978675645342312, + "grad_norm": 0.7847800850868225, + "learning_rate": 2.818809992029592e-07, + "loss": 0.027084237337112425, + "memory(GiB)": 17.38, + "step": 5000, + "token_acc": 0.9916317991631799, + "train_speed(iter/s)": 0.215654 + }, + { + "epoch": 0.8978675645342312, + "eval_loss": 0.027972938492894173, + "eval_runtime": 60.978, + "eval_samples_per_second": 14.759, + "eval_steps_per_second": 7.38, + "eval_token_acc": 0.985738099648083, + "step": 5000 + }, + { + "epoch": 0.8987654320987655, + "grad_norm": 1.3210762739181519, + "learning_rate": 2.769862848485638e-07, + "loss": 0.026473921537399293, + "memory(GiB)": 17.38, + "step": 5005, + "token_acc": 0.9873547505126452, + "train_speed(iter/s)": 0.215054 + }, + { + "epoch": 0.8996632996632996, + "grad_norm": 1.3319097757339478, + "learning_rate": 2.7213322974997626e-07, + "loss": 0.028168869018554688, + "memory(GiB)": 17.38, + "step": 5010, + "token_acc": 0.9915611814345991, + "train_speed(iter/s)": 0.215093 + }, + { + "epoch": 0.9005611672278339, + "grad_norm": 1.6646956205368042, + "learning_rate": 2.673218767134783e-07, + "loss": 0.023626665771007537, + "memory(GiB)": 17.38, + "step": 5015, + "token_acc": 0.9935622317596566, + "train_speed(iter/s)": 0.215132 + }, + { + "epoch": 0.9014590347923681, + "grad_norm": 0.8774084448814392, + "learning_rate": 2.6255226817752164e-07, + "loss": 0.027046987414360048, + "memory(GiB)": 17.38, + "step": 5020, + "token_acc": 0.9851063829787234, + "train_speed(iter/s)": 0.21517 + }, + { + "epoch": 0.9023569023569024, + "grad_norm": 0.4828176200389862, + "learning_rate": 2.5782444621235026e-07, + "loss": 0.02829151749610901, + "memory(GiB)": 17.38, + "step": 5025, + "token_acc": 0.9873417721518988, + "train_speed(iter/s)": 0.215208 + }, + { + "epoch": 0.9032547699214366, + "grad_norm": 0.5960500240325928, + "learning_rate": 2.531384525196323e-07, + "loss": 0.020842486619949342, + "memory(GiB)": 17.38, + "step": 5030, + "token_acc": 0.9894957983193278, + "train_speed(iter/s)": 0.215246 + }, + { + "epoch": 0.9041526374859709, + "grad_norm": 0.7326636910438538, + "learning_rate": 2.4849432843208786e-07, + "loss": 0.029288771748542785, + "memory(GiB)": 17.38, + "step": 5035, + "token_acc": 0.9916666666666667, + "train_speed(iter/s)": 0.215284 + }, + { + "epoch": 0.9050505050505051, + "grad_norm": 2.295067310333252, + "learning_rate": 2.4389211491313015e-07, + "loss": 0.03236299753189087, + "memory(GiB)": 17.38, + "step": 5040, + "token_acc": 0.9795918367346939, + "train_speed(iter/s)": 0.215321 + }, + { + "epoch": 0.9059483726150392, + "grad_norm": 3.135751962661743, + "learning_rate": 2.3933185255649637e-07, + "loss": 0.02853953540325165, + "memory(GiB)": 17.38, + "step": 5045, + "token_acc": 0.9873417721518988, + "train_speed(iter/s)": 0.215359 + }, + { + "epoch": 0.9068462401795735, + "grad_norm": 0.39786943793296814, + "learning_rate": 2.3481358158589874e-07, + "loss": 0.023338253796100616, + "memory(GiB)": 17.38, + "step": 5050, + "token_acc": 0.9937759336099585, + "train_speed(iter/s)": 0.215398 + }, + { + "epoch": 0.9077441077441077, + "grad_norm": 0.5368845462799072, + "learning_rate": 2.3033734185466283e-07, + "loss": 0.023724550008773805, + "memory(GiB)": 17.38, + "step": 5055, + "token_acc": 0.9871244635193133, + "train_speed(iter/s)": 0.215434 + }, + { + "epoch": 0.908641975308642, + "grad_norm": 0.9217173457145691, + "learning_rate": 2.2590317284537922e-07, + "loss": 0.027547287940979003, + "memory(GiB)": 17.38, + "step": 5060, + "token_acc": 0.9915254237288136, + "train_speed(iter/s)": 0.215473 + }, + { + "epoch": 0.9095398428731762, + "grad_norm": 1.3524061441421509, + "learning_rate": 2.2151111366955468e-07, + "loss": 0.028666403889656068, + "memory(GiB)": 17.38, + "step": 5065, + "token_acc": 0.9795081967213115, + "train_speed(iter/s)": 0.21551 + }, + { + "epoch": 0.9104377104377105, + "grad_norm": 0.884425938129425, + "learning_rate": 2.1716120306726595e-07, + "loss": 0.024917131662368773, + "memory(GiB)": 17.38, + "step": 5070, + "token_acc": 0.9895397489539749, + "train_speed(iter/s)": 0.215537 + }, + { + "epoch": 0.9113355780022446, + "grad_norm": 0.8265371322631836, + "learning_rate": 2.1285347940682045e-07, + "loss": 0.02539147138595581, + "memory(GiB)": 17.38, + "step": 5075, + "token_acc": 0.9895833333333334, + "train_speed(iter/s)": 0.215564 + }, + { + "epoch": 0.9122334455667789, + "grad_norm": 2.3108794689178467, + "learning_rate": 2.0858798068441487e-07, + "loss": 0.025833079218864442, + "memory(GiB)": 17.38, + "step": 5080, + "token_acc": 0.9852320675105485, + "train_speed(iter/s)": 0.215595 + }, + { + "epoch": 0.9131313131313131, + "grad_norm": 1.288081407546997, + "learning_rate": 2.0436474452380228e-07, + "loss": 0.03718141913414001, + "memory(GiB)": 17.38, + "step": 5085, + "token_acc": 0.9794238683127572, + "train_speed(iter/s)": 0.215631 + }, + { + "epoch": 0.9140291806958474, + "grad_norm": 0.5227884650230408, + "learning_rate": 2.001838081759605e-07, + "loss": 0.020656655728816985, + "memory(GiB)": 17.38, + "step": 5090, + "token_acc": 0.9936170212765958, + "train_speed(iter/s)": 0.215669 + }, + { + "epoch": 0.9149270482603816, + "grad_norm": 1.7272427082061768, + "learning_rate": 1.9604520851876196e-07, + "loss": 0.01716717779636383, + "memory(GiB)": 17.38, + "step": 5095, + "token_acc": 0.9896265560165975, + "train_speed(iter/s)": 0.215707 + }, + { + "epoch": 0.9158249158249159, + "grad_norm": 2.112581253051758, + "learning_rate": 1.919489820566478e-07, + "loss": 0.027463805675506592, + "memory(GiB)": 17.38, + "step": 5100, + "token_acc": 0.9934497816593887, + "train_speed(iter/s)": 0.215746 + }, + { + "epoch": 0.9158249158249159, + "eval_loss": 0.028000257909297943, + "eval_runtime": 59.1168, + "eval_samples_per_second": 15.224, + "eval_steps_per_second": 7.612, + "eval_token_acc": 0.9859233191331728, + "step": 5100 + }, + { + "epoch": 0.91672278338945, + "grad_norm": 2.3127872943878174, + "learning_rate": 1.8789516492030978e-07, + "loss": 0.03171263933181763, + "memory(GiB)": 17.38, + "step": 5105, + "token_acc": 0.9863013698630136, + "train_speed(iter/s)": 0.215172 + }, + { + "epoch": 0.9176206509539843, + "grad_norm": 0.7946669459342957, + "learning_rate": 1.8388379286636615e-07, + "loss": 0.021996480226516724, + "memory(GiB)": 17.38, + "step": 5110, + "token_acc": 0.9938271604938271, + "train_speed(iter/s)": 0.215211 + }, + { + "epoch": 0.9185185185185185, + "grad_norm": 1.3478219509124756, + "learning_rate": 1.799149012770518e-07, + "loss": 0.019972196221351622, + "memory(GiB)": 17.38, + "step": 5115, + "token_acc": 0.9874476987447699, + "train_speed(iter/s)": 0.215249 + }, + { + "epoch": 0.9194163860830528, + "grad_norm": 0.9864323735237122, + "learning_rate": 1.7598852515990195e-07, + "loss": 0.020311808586120604, + "memory(GiB)": 17.38, + "step": 5120, + "token_acc": 0.9869565217391304, + "train_speed(iter/s)": 0.215287 + }, + { + "epoch": 0.920314253647587, + "grad_norm": 0.8238934874534607, + "learning_rate": 1.7210469914744455e-07, + "loss": 0.019791373610496522, + "memory(GiB)": 17.38, + "step": 5125, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 0.215325 + }, + { + "epoch": 0.9212121212121213, + "grad_norm": 1.4369577169418335, + "learning_rate": 1.6826345749689554e-07, + "loss": 0.033382588624954225, + "memory(GiB)": 17.38, + "step": 5130, + "token_acc": 0.9789915966386554, + "train_speed(iter/s)": 0.215363 + }, + { + "epoch": 0.9221099887766554, + "grad_norm": 1.4463626146316528, + "learning_rate": 1.6446483408985693e-07, + "loss": 0.0185105562210083, + "memory(GiB)": 17.38, + "step": 5135, + "token_acc": 0.9896265560165975, + "train_speed(iter/s)": 0.215402 + }, + { + "epoch": 0.9230078563411896, + "grad_norm": 1.4056111574172974, + "learning_rate": 1.6070886243201633e-07, + "loss": 0.02868930697441101, + "memory(GiB)": 17.38, + "step": 5140, + "token_acc": 0.9808510638297873, + "train_speed(iter/s)": 0.21544 + }, + { + "epoch": 0.9239057239057239, + "grad_norm": 1.1676428318023682, + "learning_rate": 1.5699557565285406e-07, + "loss": 0.029277276992797852, + "memory(GiB)": 17.38, + "step": 5145, + "token_acc": 0.983402489626556, + "train_speed(iter/s)": 0.215478 + }, + { + "epoch": 0.9248035914702581, + "grad_norm": 0.6323414444923401, + "learning_rate": 1.5332500650534655e-07, + "loss": 0.023776301741600038, + "memory(GiB)": 17.38, + "step": 5150, + "token_acc": 0.9914893617021276, + "train_speed(iter/s)": 0.215517 + }, + { + "epoch": 0.9257014590347924, + "grad_norm": 1.7070456743240356, + "learning_rate": 1.4969718736568216e-07, + "loss": 0.03851460218429566, + "memory(GiB)": 17.38, + "step": 5155, + "token_acc": 0.9755102040816327, + "train_speed(iter/s)": 0.215555 + }, + { + "epoch": 0.9265993265993266, + "grad_norm": 0.995628297328949, + "learning_rate": 1.4611215023297265e-07, + "loss": 0.02340541034936905, + "memory(GiB)": 17.38, + "step": 5160, + "token_acc": 0.9875518672199171, + "train_speed(iter/s)": 0.215593 + }, + { + "epoch": 0.9274971941638608, + "grad_norm": 0.9654150009155273, + "learning_rate": 1.4256992672897319e-07, + "loss": 0.017953529953956604, + "memory(GiB)": 17.38, + "step": 5165, + "token_acc": 0.9937759336099585, + "train_speed(iter/s)": 0.215631 + }, + { + "epoch": 0.928395061728395, + "grad_norm": 0.6531078815460205, + "learning_rate": 1.3907054809779995e-07, + "loss": 0.023590242862701415, + "memory(GiB)": 17.38, + "step": 5170, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 0.215667 + }, + { + "epoch": 0.9292929292929293, + "grad_norm": 1.3188893795013428, + "learning_rate": 1.356140452056598e-07, + "loss": 0.025275450944900513, + "memory(GiB)": 17.38, + "step": 5175, + "token_acc": 0.9897959183673469, + "train_speed(iter/s)": 0.215698 + }, + { + "epoch": 0.9301907968574635, + "grad_norm": 1.818155288696289, + "learning_rate": 1.322004485405709e-07, + "loss": 0.020531085133552552, + "memory(GiB)": 17.38, + "step": 5180, + "token_acc": 0.9915966386554622, + "train_speed(iter/s)": 0.215727 + }, + { + "epoch": 0.9310886644219978, + "grad_norm": 2.120398759841919, + "learning_rate": 1.2882978821210147e-07, + "loss": 0.026939159631729125, + "memory(GiB)": 17.38, + "step": 5185, + "token_acc": 0.9832635983263598, + "train_speed(iter/s)": 0.215755 + }, + { + "epoch": 0.931986531986532, + "grad_norm": 1.332161545753479, + "learning_rate": 1.255020939510976e-07, + "loss": 0.01830718219280243, + "memory(GiB)": 17.38, + "step": 5190, + "token_acc": 0.9919028340080972, + "train_speed(iter/s)": 0.215786 + }, + { + "epoch": 0.9328843995510662, + "grad_norm": 1.697027325630188, + "learning_rate": 1.2221739510942744e-07, + "loss": 0.024323096871376036, + "memory(GiB)": 17.38, + "step": 5195, + "token_acc": 0.985655737704918, + "train_speed(iter/s)": 0.215816 + }, + { + "epoch": 0.9337822671156004, + "grad_norm": 2.4368162155151367, + "learning_rate": 1.1897572065971586e-07, + "loss": 0.02372613251209259, + "memory(GiB)": 17.38, + "step": 5200, + "token_acc": 0.9935897435897436, + "train_speed(iter/s)": 0.215844 + }, + { + "epoch": 0.9337822671156004, + "eval_loss": 0.027619920670986176, + "eval_runtime": 70.0454, + "eval_samples_per_second": 12.849, + "eval_steps_per_second": 6.424, + "eval_token_acc": 0.9866641970735321, + "step": 5200 + }, + { + "epoch": 0.9346801346801347, + "grad_norm": 1.2210580110549927, + "learning_rate": 1.157770991950924e-07, + "loss": 0.022353368997573852, + "memory(GiB)": 17.38, + "step": 5205, + "token_acc": 0.9868376068376068, + "train_speed(iter/s)": 0.215164 + }, + { + "epoch": 0.9355780022446689, + "grad_norm": 1.1964539289474487, + "learning_rate": 1.1262155892893977e-07, + "loss": 0.027189862728118897, + "memory(GiB)": 17.38, + "step": 5210, + "token_acc": 0.9914163090128756, + "train_speed(iter/s)": 0.2152 + }, + { + "epoch": 0.9364758698092032, + "grad_norm": 0.8958919048309326, + "learning_rate": 1.095091276946425e-07, + "loss": 0.02762342691421509, + "memory(GiB)": 17.38, + "step": 5215, + "token_acc": 0.9870689655172413, + "train_speed(iter/s)": 0.215231 + }, + { + "epoch": 0.9373737373737374, + "grad_norm": 0.5203025937080383, + "learning_rate": 1.064398329453431e-07, + "loss": 0.027194255590438844, + "memory(GiB)": 17.38, + "step": 5220, + "token_acc": 0.9826839826839827, + "train_speed(iter/s)": 0.215259 + }, + { + "epoch": 0.9382716049382716, + "grad_norm": 0.787577211856842, + "learning_rate": 1.0341370175370069e-07, + "loss": 0.025628811120986937, + "memory(GiB)": 17.38, + "step": 5225, + "token_acc": 0.9854771784232366, + "train_speed(iter/s)": 0.215288 + }, + { + "epoch": 0.9391694725028058, + "grad_norm": 1.2397444248199463, + "learning_rate": 1.0043076081164838e-07, + "loss": 0.026222583651542664, + "memory(GiB)": 17.38, + "step": 5230, + "token_acc": 0.9917355371900827, + "train_speed(iter/s)": 0.215322 + }, + { + "epoch": 0.94006734006734, + "grad_norm": 1.1216204166412354, + "learning_rate": 9.749103643016288e-08, + "loss": 0.026987135410308838, + "memory(GiB)": 17.38, + "step": 5235, + "token_acc": 0.9853556485355649, + "train_speed(iter/s)": 0.215355 + }, + { + "epoch": 0.9409652076318743, + "grad_norm": 1.8150118589401245, + "learning_rate": 9.459455453902866e-08, + "loss": 0.02262101471424103, + "memory(GiB)": 17.38, + "step": 5240, + "token_acc": 0.9893617021276596, + "train_speed(iter/s)": 0.21539 + }, + { + "epoch": 0.9418630751964086, + "grad_norm": 0.6968724131584167, + "learning_rate": 9.174134068661133e-08, + "loss": 0.02112298607826233, + "memory(GiB)": 17.38, + "step": 5245, + "token_acc": 0.987603305785124, + "train_speed(iter/s)": 0.215426 + }, + { + "epoch": 0.9427609427609428, + "grad_norm": 1.8783295154571533, + "learning_rate": 8.893142003963184e-08, + "loss": 0.022644573450088502, + "memory(GiB)": 17.38, + "step": 5250, + "token_acc": 0.9938524590163934, + "train_speed(iter/s)": 0.21546 + }, + { + "epoch": 0.9436588103254769, + "grad_norm": 0.5992938280105591, + "learning_rate": 8.616481738294324e-08, + "loss": 0.027608248591423034, + "memory(GiB)": 17.38, + "step": 5255, + "token_acc": 0.9810924369747899, + "train_speed(iter/s)": 0.215495 + }, + { + "epoch": 0.9445566778900112, + "grad_norm": 0.8126539587974548, + "learning_rate": 8.344155711931368e-08, + "loss": 0.014930185675621033, + "memory(GiB)": 17.38, + "step": 5260, + "token_acc": 0.992, + "train_speed(iter/s)": 0.215531 + }, + { + "epoch": 0.9454545454545454, + "grad_norm": 0.472036212682724, + "learning_rate": 8.07616632692093e-08, + "loss": 0.022065025568008424, + "memory(GiB)": 17.38, + "step": 5265, + "token_acc": 0.9898373983739838, + "train_speed(iter/s)": 0.215565 + }, + { + "epoch": 0.9463524130190797, + "grad_norm": 1.5350534915924072, + "learning_rate": 7.812515947058619e-08, + "loss": 0.02966037392616272, + "memory(GiB)": 17.38, + "step": 5270, + "token_acc": 0.987603305785124, + "train_speed(iter/s)": 0.2156 + }, + { + "epoch": 0.9472502805836139, + "grad_norm": 1.8831390142440796, + "learning_rate": 7.553206897867649e-08, + "loss": 0.026520195603370666, + "memory(GiB)": 17.38, + "step": 5275, + "token_acc": 0.989406779661017, + "train_speed(iter/s)": 0.215631 + }, + { + "epoch": 0.9481481481481482, + "grad_norm": 0.809588611125946, + "learning_rate": 7.29824146657887e-08, + "loss": 0.027678117156028748, + "memory(GiB)": 17.38, + "step": 5280, + "token_acc": 0.9872881355932204, + "train_speed(iter/s)": 0.215656 + }, + { + "epoch": 0.9490460157126824, + "grad_norm": 1.0526882410049438, + "learning_rate": 7.047621902110057e-08, + "loss": 0.022055207192897795, + "memory(GiB)": 17.38, + "step": 5285, + "token_acc": 0.9855371900826446, + "train_speed(iter/s)": 0.215684 + }, + { + "epoch": 0.9499438832772166, + "grad_norm": 1.3952608108520508, + "learning_rate": 6.801350415046482e-08, + "loss": 0.028878217935562132, + "memory(GiB)": 17.38, + "step": 5290, + "token_acc": 0.9854771784232366, + "train_speed(iter/s)": 0.215713 + }, + { + "epoch": 0.9508417508417508, + "grad_norm": 1.0611093044281006, + "learning_rate": 6.559429177621368e-08, + "loss": 0.0296240359544754, + "memory(GiB)": 17.38, + "step": 5295, + "token_acc": 0.987603305785124, + "train_speed(iter/s)": 0.215738 + }, + { + "epoch": 0.9517396184062851, + "grad_norm": 1.4099164009094238, + "learning_rate": 6.321860323696471e-08, + "loss": 0.01611938178539276, + "memory(GiB)": 17.38, + "step": 5300, + "token_acc": 0.9957081545064378, + "train_speed(iter/s)": 0.215762 + }, + { + "epoch": 0.9517396184062851, + "eval_loss": 0.02776762843132019, + "eval_runtime": 62.6527, + "eval_samples_per_second": 14.365, + "eval_steps_per_second": 7.182, + "eval_token_acc": 0.9859233191331728, + "step": 5300 + }, + { + "epoch": 0.9526374859708193, + "grad_norm": 0.8847435712814331, + "learning_rate": 6.088645948743532e-08, + "loss": 0.021325305104255676, + "memory(GiB)": 17.38, + "step": 5305, + "token_acc": 0.9863481228668942, + "train_speed(iter/s)": 0.215156 + }, + { + "epoch": 0.9535353535353536, + "grad_norm": 1.3806735277175903, + "learning_rate": 5.8597881098257924e-08, + "loss": 0.03027437925338745, + "memory(GiB)": 17.38, + "step": 5310, + "token_acc": 0.98125, + "train_speed(iter/s)": 0.215188 + }, + { + "epoch": 0.9544332210998878, + "grad_norm": 0.39451703429222107, + "learning_rate": 5.6352888255793434e-08, + "loss": 0.020201580226421358, + "memory(GiB)": 17.38, + "step": 5315, + "token_acc": 0.9916317991631799, + "train_speed(iter/s)": 0.21522 + }, + { + "epoch": 0.955331088664422, + "grad_norm": 1.7539409399032593, + "learning_rate": 5.415150076196252e-08, + "loss": 0.030578690767288207, + "memory(GiB)": 17.38, + "step": 5320, + "token_acc": 0.9895833333333334, + "train_speed(iter/s)": 0.215257 + }, + { + "epoch": 0.9562289562289562, + "grad_norm": 0.8873705267906189, + "learning_rate": 5.1993738034061827e-08, + "loss": 0.020966407656669617, + "memory(GiB)": 17.38, + "step": 5325, + "token_acc": 0.9937238493723849, + "train_speed(iter/s)": 0.215294 + }, + { + "epoch": 0.9571268237934905, + "grad_norm": 1.5140655040740967, + "learning_rate": 4.98796191045986e-08, + "loss": 0.02478059083223343, + "memory(GiB)": 17.38, + "step": 5330, + "token_acc": 0.9836065573770492, + "train_speed(iter/s)": 0.215331 + }, + { + "epoch": 0.9580246913580247, + "grad_norm": 1.0895894765853882, + "learning_rate": 4.780916262112023e-08, + "loss": 0.025377148389816286, + "memory(GiB)": 17.38, + "step": 5335, + "token_acc": 0.9873417721518988, + "train_speed(iter/s)": 0.215367 + }, + { + "epoch": 0.958922558922559, + "grad_norm": 1.9714152812957764, + "learning_rate": 4.578238684604886e-08, + "loss": 0.03129986524581909, + "memory(GiB)": 17.38, + "step": 5340, + "token_acc": 0.9817073170731707, + "train_speed(iter/s)": 0.215402 + }, + { + "epoch": 0.9598204264870932, + "grad_norm": 1.5770317316055298, + "learning_rate": 4.3799309656524236e-08, + "loss": 0.023345765471458436, + "memory(GiB)": 17.38, + "step": 5345, + "token_acc": 0.9852320675105485, + "train_speed(iter/s)": 0.215439 + }, + { + "epoch": 0.9607182940516273, + "grad_norm": 2.5379750728607178, + "learning_rate": 4.185994854424169e-08, + "loss": 0.03656173348426819, + "memory(GiB)": 17.38, + "step": 5350, + "token_acc": 0.9871794871794872, + "train_speed(iter/s)": 0.215476 + }, + { + "epoch": 0.9616161616161616, + "grad_norm": 0.9468285441398621, + "learning_rate": 3.996432061530109e-08, + "loss": 0.020934942364692687, + "memory(GiB)": 17.38, + "step": 5355, + "token_acc": 0.9853556485355649, + "train_speed(iter/s)": 0.215512 + }, + { + "epoch": 0.9625140291806958, + "grad_norm": 1.9496392011642456, + "learning_rate": 3.811244259005309e-08, + "loss": 0.031652253866195676, + "memory(GiB)": 17.38, + "step": 5360, + "token_acc": 0.9853556485355649, + "train_speed(iter/s)": 0.215548 + }, + { + "epoch": 0.9634118967452301, + "grad_norm": 0.6875377893447876, + "learning_rate": 3.630433080295426e-08, + "loss": 0.017298415303230286, + "memory(GiB)": 17.38, + "step": 5365, + "token_acc": 0.9916666666666667, + "train_speed(iter/s)": 0.215585 + }, + { + "epoch": 0.9643097643097643, + "grad_norm": 4.830986499786377, + "learning_rate": 3.454000120242051e-08, + "loss": 0.028898942470550536, + "memory(GiB)": 17.38, + "step": 5370, + "token_acc": 0.9892703862660944, + "train_speed(iter/s)": 0.21562 + }, + { + "epoch": 0.9652076318742986, + "grad_norm": 1.5479828119277954, + "learning_rate": 3.281946935069003e-08, + "loss": 0.02056848704814911, + "memory(GiB)": 17.38, + "step": 5375, + "token_acc": 0.9877049180327869, + "train_speed(iter/s)": 0.215657 + }, + { + "epoch": 0.9661054994388327, + "grad_norm": 0.926124632358551, + "learning_rate": 3.114275042368109e-08, + "loss": 0.028106117248535158, + "memory(GiB)": 17.38, + "step": 5380, + "token_acc": 0.9794238683127572, + "train_speed(iter/s)": 0.215693 + }, + { + "epoch": 0.967003367003367, + "grad_norm": 2.0329346656799316, + "learning_rate": 2.950985921086391e-08, + "loss": 0.032183319330215454, + "memory(GiB)": 17.38, + "step": 5385, + "token_acc": 0.9877049180327869, + "train_speed(iter/s)": 0.215728 + }, + { + "epoch": 0.9679012345679012, + "grad_norm": 0.7273510694503784, + "learning_rate": 2.792081011512404e-08, + "loss": 0.023776063323020936, + "memory(GiB)": 17.38, + "step": 5390, + "token_acc": 0.9873417721518988, + "train_speed(iter/s)": 0.21576 + }, + { + "epoch": 0.9687991021324355, + "grad_norm": 1.3573698997497559, + "learning_rate": 2.6375617152640255e-08, + "loss": 0.02243104577064514, + "memory(GiB)": 17.38, + "step": 5395, + "token_acc": 0.9875518672199171, + "train_speed(iter/s)": 0.21579 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 3.3467459678649902, + "learning_rate": 2.487429395275909e-08, + "loss": 0.03203926384449005, + "memory(GiB)": 17.38, + "step": 5400, + "token_acc": 0.9916317991631799, + "train_speed(iter/s)": 0.215819 + }, + { + "epoch": 0.9696969696969697, + "eval_loss": 0.027687082067131996, + "eval_runtime": 62.1666, + "eval_samples_per_second": 14.477, + "eval_steps_per_second": 7.239, + "eval_token_acc": 0.9862011483608075, + "step": 5400 + }, + { + "epoch": 0.970594837261504, + "grad_norm": 1.440702199935913, + "learning_rate": 2.3416853757873836e-08, + "loss": 0.027758511900901794, + "memory(GiB)": 17.38, + "step": 5405, + "token_acc": 0.9871882473522378, + "train_speed(iter/s)": 0.215245 + }, + { + "epoch": 0.9714927048260381, + "grad_norm": 2.0480434894561768, + "learning_rate": 2.2003309423309084e-08, + "loss": 0.030437469482421875, + "memory(GiB)": 17.38, + "step": 5410, + "token_acc": 0.9786324786324786, + "train_speed(iter/s)": 0.215277 + }, + { + "epoch": 0.9723905723905724, + "grad_norm": 0.6995311379432678, + "learning_rate": 2.0633673417207458e-08, + "loss": 0.022008776664733887, + "memory(GiB)": 17.38, + "step": 5415, + "token_acc": 0.9978813559322034, + "train_speed(iter/s)": 0.215313 + }, + { + "epoch": 0.9732884399551066, + "grad_norm": 1.6735602617263794, + "learning_rate": 1.930795782041639e-08, + "loss": 0.027090591192245484, + "memory(GiB)": 17.38, + "step": 5420, + "token_acc": 0.9871794871794872, + "train_speed(iter/s)": 0.215347 + }, + { + "epoch": 0.9741863075196409, + "grad_norm": 0.6113275289535522, + "learning_rate": 1.8026174326387647e-08, + "loss": 0.01828034222126007, + "memory(GiB)": 17.38, + "step": 5425, + "token_acc": 0.9936708860759493, + "train_speed(iter/s)": 0.215381 + }, + { + "epoch": 0.9750841750841751, + "grad_norm": 1.024767518043518, + "learning_rate": 1.6788334241068517e-08, + "loss": 0.02418099045753479, + "memory(GiB)": 17.38, + "step": 5430, + "token_acc": 0.9852941176470589, + "train_speed(iter/s)": 0.215411 + }, + { + "epoch": 0.9759820426487094, + "grad_norm": 2.1362531185150146, + "learning_rate": 1.5594448482804668e-08, + "loss": 0.02927337884902954, + "memory(GiB)": 17.38, + "step": 5435, + "token_acc": 0.9781746031746031, + "train_speed(iter/s)": 0.215439 + }, + { + "epoch": 0.9768799102132435, + "grad_norm": 1.3342387676239014, + "learning_rate": 1.4444527582243572e-08, + "loss": 0.03483619391918182, + "memory(GiB)": 17.38, + "step": 5440, + "token_acc": 0.9878542510121457, + "train_speed(iter/s)": 0.215471 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 1.162458896636963, + "learning_rate": 1.333858168224178e-08, + "loss": 0.024628420174121857, + "memory(GiB)": 17.38, + "step": 5445, + "token_acc": 0.9852320675105485, + "train_speed(iter/s)": 0.215507 + }, + { + "epoch": 0.978675645342312, + "grad_norm": 1.6054964065551758, + "learning_rate": 1.2276620537773899e-08, + "loss": 0.023460222780704497, + "memory(GiB)": 17.38, + "step": 5450, + "token_acc": 0.9874476987447699, + "train_speed(iter/s)": 0.215542 + }, + { + "epoch": 0.9795735129068462, + "grad_norm": 0.9571593403816223, + "learning_rate": 1.1258653515849871e-08, + "loss": 0.027859655022621155, + "memory(GiB)": 17.38, + "step": 5455, + "token_acc": 0.9831932773109243, + "train_speed(iter/s)": 0.215578 + }, + { + "epoch": 0.9804713804713805, + "grad_norm": 1.6298872232437134, + "learning_rate": 1.0284689595427833e-08, + "loss": 0.018769088387489318, + "memory(GiB)": 17.38, + "step": 5460, + "token_acc": 0.9936170212765958, + "train_speed(iter/s)": 0.215613 + }, + { + "epoch": 0.9813692480359147, + "grad_norm": 2.3331682682037354, + "learning_rate": 9.354737367340271e-09, + "loss": 0.02891467809677124, + "memory(GiB)": 17.38, + "step": 5465, + "token_acc": 0.983402489626556, + "train_speed(iter/s)": 0.215648 + }, + { + "epoch": 0.9822671156004489, + "grad_norm": 0.9569219946861267, + "learning_rate": 8.468805034212435e-09, + "loss": 0.02721492648124695, + "memory(GiB)": 17.38, + "step": 5470, + "token_acc": 0.9919354838709677, + "train_speed(iter/s)": 0.215683 + }, + { + "epoch": 0.9831649831649831, + "grad_norm": 2.5574381351470947, + "learning_rate": 7.626900410394045e-09, + "loss": 0.026777103543281555, + "memory(GiB)": 17.38, + "step": 5475, + "token_acc": 0.9854771784232366, + "train_speed(iter/s)": 0.215717 + }, + { + "epoch": 0.9840628507295174, + "grad_norm": 1.2590869665145874, + "learning_rate": 6.8290309218910225e-09, + "loss": 0.032957214117050174, + "memory(GiB)": 17.38, + "step": 5480, + "token_acc": 0.9833333333333333, + "train_speed(iter/s)": 0.215753 + }, + { + "epoch": 0.9849607182940516, + "grad_norm": 0.5552598834037781, + "learning_rate": 6.075203606294433e-09, + "loss": 0.020178470015525817, + "memory(GiB)": 17.38, + "step": 5485, + "token_acc": 0.9874476987447699, + "train_speed(iter/s)": 0.215788 + }, + { + "epoch": 0.9858585858585859, + "grad_norm": 0.6694216132164001, + "learning_rate": 5.365425112726086e-09, + "loss": 0.03327462673187256, + "memory(GiB)": 17.38, + "step": 5490, + "token_acc": 0.9814814814814815, + "train_speed(iter/s)": 0.215823 + }, + { + "epoch": 0.9867564534231201, + "grad_norm": 4.890697956085205, + "learning_rate": 4.699701701773584e-09, + "loss": 0.02292284071445465, + "memory(GiB)": 17.38, + "step": 5495, + "token_acc": 0.9858870967741935, + "train_speed(iter/s)": 0.215859 + }, + { + "epoch": 0.9876543209876543, + "grad_norm": 1.9162883758544922, + "learning_rate": 4.078039245437593e-09, + "loss": 0.031007373332977296, + "memory(GiB)": 17.38, + "step": 5500, + "token_acc": 0.9896265560165975, + "train_speed(iter/s)": 0.215894 + }, + { + "epoch": 0.9876543209876543, + "eval_loss": 0.027690263465046883, + "eval_runtime": 60.8965, + "eval_samples_per_second": 14.779, + "eval_steps_per_second": 7.39, + "eval_token_acc": 0.985645489905538, + "step": 5500 + }, + { + "epoch": 0.9885521885521885, + "grad_norm": 2.1794795989990234, + "learning_rate": 3.500443227080763e-09, + "loss": 0.02626202702522278, + "memory(GiB)": 17.38, + "step": 5505, + "token_acc": 0.9858022579541567, + "train_speed(iter/s)": 0.215331 + }, + { + "epoch": 0.9894500561167228, + "grad_norm": 0.6441231369972229, + "learning_rate": 2.9669187413777778e-09, + "loss": 0.018758746981620788, + "memory(GiB)": 17.38, + "step": 5510, + "token_acc": 0.9979423868312757, + "train_speed(iter/s)": 0.215363 + }, + { + "epoch": 0.990347923681257, + "grad_norm": 1.0357253551483154, + "learning_rate": 2.4774704942720496e-09, + "loss": 0.024141253530979158, + "memory(GiB)": 17.38, + "step": 5515, + "token_acc": 0.987603305785124, + "train_speed(iter/s)": 0.215396 + }, + { + "epoch": 0.9912457912457913, + "grad_norm": 0.8636681437492371, + "learning_rate": 2.0321028029329782e-09, + "loss": 0.02414155602455139, + "memory(GiB)": 17.38, + "step": 5520, + "token_acc": 0.98125, + "train_speed(iter/s)": 0.215428 + }, + { + "epoch": 0.9921436588103255, + "grad_norm": 2.009631633758545, + "learning_rate": 1.6308195957182028e-09, + "loss": 0.027130979299545287, + "memory(GiB)": 17.38, + "step": 5525, + "token_acc": 0.9877049180327869, + "train_speed(iter/s)": 0.215461 + }, + { + "epoch": 0.9930415263748598, + "grad_norm": 0.7460534572601318, + "learning_rate": 1.2736244121380747e-09, + "loss": 0.02460155189037323, + "memory(GiB)": 17.38, + "step": 5530, + "token_acc": 0.987603305785124, + "train_speed(iter/s)": 0.215494 + }, + { + "epoch": 0.9939393939393939, + "grad_norm": 2.1745049953460693, + "learning_rate": 9.605204028262371e-10, + "loss": 0.016963377594947815, + "memory(GiB)": 17.38, + "step": 5535, + "token_acc": 0.9957264957264957, + "train_speed(iter/s)": 0.215527 + }, + { + "epoch": 0.9948372615039281, + "grad_norm": 2.3120298385620117, + "learning_rate": 6.915103295118686e-10, + "loss": 0.021723246574401854, + "memory(GiB)": 17.38, + "step": 5540, + "token_acc": 0.9936440677966102, + "train_speed(iter/s)": 0.215558 + }, + { + "epoch": 0.9957351290684624, + "grad_norm": 1.2359997034072876, + "learning_rate": 4.66596564992483e-10, + "loss": 0.0227289080619812, + "memory(GiB)": 17.38, + "step": 5545, + "token_acc": 0.9858870967741935, + "train_speed(iter/s)": 0.21559 + }, + { + "epoch": 0.9966329966329966, + "grad_norm": 0.5378426313400269, + "learning_rate": 2.8578109311672154e-10, + "loss": 0.022391429543495177, + "memory(GiB)": 17.38, + "step": 5550, + "token_acc": 0.9895397489539749, + "train_speed(iter/s)": 0.215622 + }, + { + "epoch": 0.9975308641975309, + "grad_norm": 3.8460347652435303, + "learning_rate": 1.4906550876270243e-10, + "loss": 0.02976270318031311, + "memory(GiB)": 17.38, + "step": 5555, + "token_acc": 0.9896265560165975, + "train_speed(iter/s)": 0.215656 + }, + { + "epoch": 0.9984287317620651, + "grad_norm": 0.6513814926147461, + "learning_rate": 5.645101782913997e-11, + "loss": 0.019312313199043273, + "memory(GiB)": 17.38, + "step": 5560, + "token_acc": 0.9896694214876033, + "train_speed(iter/s)": 0.21569 + }, + { + "epoch": 0.9993265993265993, + "grad_norm": 1.5546852350234985, + "learning_rate": 7.93843721924592e-12, + "loss": 0.024603912234306337, + "memory(GiB)": 17.38, + "step": 5565, + "token_acc": 0.9914529914529915, + "train_speed(iter/s)": 0.215723 + }, + { + "epoch": 0.9998653198653199, + "eval_loss": 0.027694087475538254, + "eval_runtime": 61.6953, + "eval_samples_per_second": 14.588, + "eval_steps_per_second": 7.294, + "eval_token_acc": 0.9860159288757178, + "step": 5568 + } + ], + "logging_steps": 5, + "max_steps": 5568, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.1991779098257e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}