{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.717391304347826, "eval_steps": 50, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010869565217391304, "grad_norm": 2.4570870399475098, "learning_rate": 4.981884057971015e-05, "loss": 2.3612, "step": 10 }, { "epoch": 0.021739130434782608, "grad_norm": 1.2042421102523804, "learning_rate": 4.963768115942029e-05, "loss": 1.1915, "step": 20 }, { "epoch": 0.03260869565217391, "grad_norm": 1.2019174098968506, "learning_rate": 4.945652173913044e-05, "loss": 0.8992, "step": 30 }, { "epoch": 0.043478260869565216, "grad_norm": 1.0546120405197144, "learning_rate": 4.9275362318840584e-05, "loss": 0.7276, "step": 40 }, { "epoch": 0.05434782608695652, "grad_norm": 0.754091203212738, "learning_rate": 4.909420289855073e-05, "loss": 0.6419, "step": 50 }, { "epoch": 0.05434782608695652, "eval_loss": 0.7905128598213196, "eval_runtime": 10.8418, "eval_samples_per_second": 44.273, "eval_steps_per_second": 2.767, "step": 50 }, { "epoch": 0.06521739130434782, "grad_norm": 0.7751966714859009, "learning_rate": 4.891304347826087e-05, "loss": 0.5971, "step": 60 }, { "epoch": 0.07608695652173914, "grad_norm": 0.6874057650566101, "learning_rate": 4.873188405797102e-05, "loss": 0.608, "step": 70 }, { "epoch": 0.08695652173913043, "grad_norm": 0.7145748734474182, "learning_rate": 4.855072463768116e-05, "loss": 0.6111, "step": 80 }, { "epoch": 0.09782608695652174, "grad_norm": 0.8841484189033508, "learning_rate": 4.836956521739131e-05, "loss": 0.5848, "step": 90 }, { "epoch": 0.10869565217391304, "grad_norm": 0.7937784790992737, "learning_rate": 4.818840579710145e-05, "loss": 0.5699, "step": 100 }, { "epoch": 0.10869565217391304, "eval_loss": 0.6956210732460022, "eval_runtime": 10.863, "eval_samples_per_second": 44.187, "eval_steps_per_second": 2.762, "step": 100 }, { "epoch": 0.11956521739130435, "grad_norm": 0.7818441987037659, "learning_rate": 4.80072463768116e-05, "loss": 0.5946, "step": 110 }, { "epoch": 0.13043478260869565, "grad_norm": 0.8666340112686157, "learning_rate": 4.782608695652174e-05, "loss": 0.5469, "step": 120 }, { "epoch": 0.14130434782608695, "grad_norm": 0.7637468576431274, "learning_rate": 4.764492753623189e-05, "loss": 0.5307, "step": 130 }, { "epoch": 0.15217391304347827, "grad_norm": 0.8282362222671509, "learning_rate": 4.746376811594203e-05, "loss": 0.5312, "step": 140 }, { "epoch": 0.16304347826086957, "grad_norm": 0.9675197601318359, "learning_rate": 4.7282608695652177e-05, "loss": 0.5486, "step": 150 }, { "epoch": 0.16304347826086957, "eval_loss": 0.6567058563232422, "eval_runtime": 10.8376, "eval_samples_per_second": 44.29, "eval_steps_per_second": 2.768, "step": 150 }, { "epoch": 0.17391304347826086, "grad_norm": 0.7035924792289734, "learning_rate": 4.710144927536232e-05, "loss": 0.5553, "step": 160 }, { "epoch": 0.18478260869565216, "grad_norm": 0.8957257866859436, "learning_rate": 4.6920289855072464e-05, "loss": 0.54, "step": 170 }, { "epoch": 0.1956521739130435, "grad_norm": 0.8544663190841675, "learning_rate": 4.673913043478261e-05, "loss": 0.55, "step": 180 }, { "epoch": 0.20652173913043478, "grad_norm": 0.7599456310272217, "learning_rate": 4.655797101449276e-05, "loss": 0.5102, "step": 190 }, { "epoch": 0.21739130434782608, "grad_norm": 0.9151259064674377, "learning_rate": 4.63768115942029e-05, "loss": 0.5372, "step": 200 }, { "epoch": 0.21739130434782608, "eval_loss": 0.6319016218185425, "eval_runtime": 10.8237, "eval_samples_per_second": 44.347, "eval_steps_per_second": 2.772, "step": 200 }, { "epoch": 0.22826086956521738, "grad_norm": 0.911018431186676, "learning_rate": 4.6195652173913046e-05, "loss": 0.5354, "step": 210 }, { "epoch": 0.2391304347826087, "grad_norm": 0.7792191505432129, "learning_rate": 4.601449275362319e-05, "loss": 0.5297, "step": 220 }, { "epoch": 0.25, "grad_norm": 0.7631202340126038, "learning_rate": 4.5833333333333334e-05, "loss": 0.5258, "step": 230 }, { "epoch": 0.2608695652173913, "grad_norm": 0.7436497211456299, "learning_rate": 4.565217391304348e-05, "loss": 0.4849, "step": 240 }, { "epoch": 0.2717391304347826, "grad_norm": 0.8372677564620972, "learning_rate": 4.547101449275363e-05, "loss": 0.5131, "step": 250 }, { "epoch": 0.2717391304347826, "eval_loss": 0.6155125498771667, "eval_runtime": 10.8327, "eval_samples_per_second": 44.31, "eval_steps_per_second": 2.769, "step": 250 }, { "epoch": 0.2826086956521739, "grad_norm": 0.8056390285491943, "learning_rate": 4.528985507246377e-05, "loss": 0.4675, "step": 260 }, { "epoch": 0.29347826086956524, "grad_norm": 0.8015912175178528, "learning_rate": 4.5108695652173916e-05, "loss": 0.4942, "step": 270 }, { "epoch": 0.30434782608695654, "grad_norm": 0.8316887021064758, "learning_rate": 4.492753623188406e-05, "loss": 0.5025, "step": 280 }, { "epoch": 0.31521739130434784, "grad_norm": 0.8440260887145996, "learning_rate": 4.4746376811594203e-05, "loss": 0.5112, "step": 290 }, { "epoch": 0.32608695652173914, "grad_norm": 0.8523954153060913, "learning_rate": 4.456521739130435e-05, "loss": 0.4973, "step": 300 }, { "epoch": 0.32608695652173914, "eval_loss": 0.6024141907691956, "eval_runtime": 10.839, "eval_samples_per_second": 44.284, "eval_steps_per_second": 2.768, "step": 300 }, { "epoch": 0.33695652173913043, "grad_norm": 0.8427216410636902, "learning_rate": 4.438405797101449e-05, "loss": 0.5271, "step": 310 }, { "epoch": 0.34782608695652173, "grad_norm": 0.8126628398895264, "learning_rate": 4.4202898550724645e-05, "loss": 0.4608, "step": 320 }, { "epoch": 0.358695652173913, "grad_norm": 0.9947460293769836, "learning_rate": 4.4021739130434786e-05, "loss": 0.4762, "step": 330 }, { "epoch": 0.3695652173913043, "grad_norm": 0.7744721174240112, "learning_rate": 4.384057971014493e-05, "loss": 0.4993, "step": 340 }, { "epoch": 0.3804347826086957, "grad_norm": 0.9483833909034729, "learning_rate": 4.365942028985507e-05, "loss": 0.497, "step": 350 }, { "epoch": 0.3804347826086957, "eval_loss": 0.596432626247406, "eval_runtime": 10.8191, "eval_samples_per_second": 44.366, "eval_steps_per_second": 2.773, "step": 350 }, { "epoch": 0.391304347826087, "grad_norm": 0.7573915123939514, "learning_rate": 4.347826086956522e-05, "loss": 0.469, "step": 360 }, { "epoch": 0.40217391304347827, "grad_norm": 0.8013887405395508, "learning_rate": 4.329710144927536e-05, "loss": 0.4704, "step": 370 }, { "epoch": 0.41304347826086957, "grad_norm": 0.8412826657295227, "learning_rate": 4.3115942028985515e-05, "loss": 0.5033, "step": 380 }, { "epoch": 0.42391304347826086, "grad_norm": 0.8203413486480713, "learning_rate": 4.2934782608695655e-05, "loss": 0.5063, "step": 390 }, { "epoch": 0.43478260869565216, "grad_norm": 0.7952508926391602, "learning_rate": 4.27536231884058e-05, "loss": 0.4706, "step": 400 }, { "epoch": 0.43478260869565216, "eval_loss": 0.5875406861305237, "eval_runtime": 10.8328, "eval_samples_per_second": 44.31, "eval_steps_per_second": 2.769, "step": 400 }, { "epoch": 0.44565217391304346, "grad_norm": 0.9020015597343445, "learning_rate": 4.257246376811594e-05, "loss": 0.4786, "step": 410 }, { "epoch": 0.45652173913043476, "grad_norm": 0.8588363528251648, "learning_rate": 4.239130434782609e-05, "loss": 0.482, "step": 420 }, { "epoch": 0.4673913043478261, "grad_norm": 0.816198468208313, "learning_rate": 4.221014492753623e-05, "loss": 0.4683, "step": 430 }, { "epoch": 0.4782608695652174, "grad_norm": 0.8713563680648804, "learning_rate": 4.202898550724638e-05, "loss": 0.4422, "step": 440 }, { "epoch": 0.4891304347826087, "grad_norm": 0.8633397817611694, "learning_rate": 4.1847826086956525e-05, "loss": 0.45, "step": 450 }, { "epoch": 0.4891304347826087, "eval_loss": 0.5812788605690002, "eval_runtime": 10.8048, "eval_samples_per_second": 44.425, "eval_steps_per_second": 2.777, "step": 450 }, { "epoch": 0.5, "grad_norm": 0.8244722485542297, "learning_rate": 4.166666666666667e-05, "loss": 0.4911, "step": 460 }, { "epoch": 0.5108695652173914, "grad_norm": 0.8201929926872253, "learning_rate": 4.148550724637681e-05, "loss": 0.4611, "step": 470 }, { "epoch": 0.5217391304347826, "grad_norm": 0.9646862149238586, "learning_rate": 4.130434782608696e-05, "loss": 0.4738, "step": 480 }, { "epoch": 0.532608695652174, "grad_norm": 0.9149807095527649, "learning_rate": 4.11231884057971e-05, "loss": 0.4603, "step": 490 }, { "epoch": 0.5434782608695652, "grad_norm": 0.8054277896881104, "learning_rate": 4.094202898550725e-05, "loss": 0.466, "step": 500 }, { "epoch": 0.5434782608695652, "eval_loss": 0.573806881904602, "eval_runtime": 10.7906, "eval_samples_per_second": 44.483, "eval_steps_per_second": 2.78, "step": 500 }, { "epoch": 0.5543478260869565, "grad_norm": 0.9110711812973022, "learning_rate": 4.076086956521739e-05, "loss": 0.4777, "step": 510 }, { "epoch": 0.5652173913043478, "grad_norm": 0.8030044436454773, "learning_rate": 4.057971014492754e-05, "loss": 0.4965, "step": 520 }, { "epoch": 0.5760869565217391, "grad_norm": 0.8847204446792603, "learning_rate": 4.039855072463768e-05, "loss": 0.4843, "step": 530 }, { "epoch": 0.5869565217391305, "grad_norm": 0.9041047692298889, "learning_rate": 4.021739130434783e-05, "loss": 0.4548, "step": 540 }, { "epoch": 0.5978260869565217, "grad_norm": 0.8136462569236755, "learning_rate": 4.003623188405797e-05, "loss": 0.4543, "step": 550 }, { "epoch": 0.5978260869565217, "eval_loss": 0.5662083029747009, "eval_runtime": 10.7748, "eval_samples_per_second": 44.549, "eval_steps_per_second": 2.784, "step": 550 }, { "epoch": 0.6086956521739131, "grad_norm": 0.9470314383506775, "learning_rate": 3.985507246376812e-05, "loss": 0.4852, "step": 560 }, { "epoch": 0.6195652173913043, "grad_norm": 0.9280235171318054, "learning_rate": 3.9673913043478264e-05, "loss": 0.4655, "step": 570 }, { "epoch": 0.6304347826086957, "grad_norm": 0.9118008017539978, "learning_rate": 3.9492753623188405e-05, "loss": 0.449, "step": 580 }, { "epoch": 0.6413043478260869, "grad_norm": 0.9286589026451111, "learning_rate": 3.931159420289855e-05, "loss": 0.448, "step": 590 }, { "epoch": 0.6521739130434783, "grad_norm": 0.7761940956115723, "learning_rate": 3.91304347826087e-05, "loss": 0.4363, "step": 600 }, { "epoch": 0.6521739130434783, "eval_loss": 0.5645204186439514, "eval_runtime": 10.7763, "eval_samples_per_second": 44.542, "eval_steps_per_second": 2.784, "step": 600 }, { "epoch": 0.6630434782608695, "grad_norm": 0.9244002103805542, "learning_rate": 3.8949275362318846e-05, "loss": 0.4354, "step": 610 }, { "epoch": 0.6739130434782609, "grad_norm": 0.8263306021690369, "learning_rate": 3.876811594202899e-05, "loss": 0.4951, "step": 620 }, { "epoch": 0.6847826086956522, "grad_norm": 0.8908408284187317, "learning_rate": 3.8586956521739134e-05, "loss": 0.4401, "step": 630 }, { "epoch": 0.6956521739130435, "grad_norm": 0.8497889637947083, "learning_rate": 3.8405797101449274e-05, "loss": 0.4514, "step": 640 }, { "epoch": 0.7065217391304348, "grad_norm": 0.8306752443313599, "learning_rate": 3.822463768115942e-05, "loss": 0.4503, "step": 650 }, { "epoch": 0.7065217391304348, "eval_loss": 0.5576011538505554, "eval_runtime": 10.7704, "eval_samples_per_second": 44.567, "eval_steps_per_second": 2.785, "step": 650 }, { "epoch": 0.717391304347826, "grad_norm": 0.898961067199707, "learning_rate": 3.804347826086957e-05, "loss": 0.4185, "step": 660 }, { "epoch": 0.7282608695652174, "grad_norm": 0.8870590925216675, "learning_rate": 3.7862318840579716e-05, "loss": 0.4303, "step": 670 }, { "epoch": 0.7391304347826086, "grad_norm": 0.8863015174865723, "learning_rate": 3.7681159420289856e-05, "loss": 0.4356, "step": 680 }, { "epoch": 0.75, "grad_norm": 0.852130651473999, "learning_rate": 3.7500000000000003e-05, "loss": 0.4756, "step": 690 }, { "epoch": 0.7608695652173914, "grad_norm": 0.8342716097831726, "learning_rate": 3.7318840579710144e-05, "loss": 0.4955, "step": 700 }, { "epoch": 0.7608695652173914, "eval_loss": 0.5521832704544067, "eval_runtime": 10.7828, "eval_samples_per_second": 44.515, "eval_steps_per_second": 2.782, "step": 700 }, { "epoch": 0.7717391304347826, "grad_norm": 0.8382502794265747, "learning_rate": 3.713768115942029e-05, "loss": 0.4435, "step": 710 }, { "epoch": 0.782608695652174, "grad_norm": 0.9789420962333679, "learning_rate": 3.695652173913043e-05, "loss": 0.3957, "step": 720 }, { "epoch": 0.7934782608695652, "grad_norm": 0.8945524096488953, "learning_rate": 3.6775362318840586e-05, "loss": 0.4565, "step": 730 }, { "epoch": 0.8043478260869565, "grad_norm": 0.9632206559181213, "learning_rate": 3.6594202898550726e-05, "loss": 0.4312, "step": 740 }, { "epoch": 0.8152173913043478, "grad_norm": 0.8000399470329285, "learning_rate": 3.641304347826087e-05, "loss": 0.429, "step": 750 }, { "epoch": 0.8152173913043478, "eval_loss": 0.550889790058136, "eval_runtime": 10.7987, "eval_samples_per_second": 44.45, "eval_steps_per_second": 2.778, "step": 750 }, { "epoch": 0.8260869565217391, "grad_norm": 0.8817518353462219, "learning_rate": 3.6231884057971014e-05, "loss": 0.4464, "step": 760 }, { "epoch": 0.8369565217391305, "grad_norm": 0.9800861477851868, "learning_rate": 3.605072463768116e-05, "loss": 0.4577, "step": 770 }, { "epoch": 0.8478260869565217, "grad_norm": 0.846571683883667, "learning_rate": 3.58695652173913e-05, "loss": 0.4169, "step": 780 }, { "epoch": 0.8586956521739131, "grad_norm": 0.8653192520141602, "learning_rate": 3.568840579710145e-05, "loss": 0.4195, "step": 790 }, { "epoch": 0.8695652173913043, "grad_norm": 0.8515974283218384, "learning_rate": 3.5507246376811596e-05, "loss": 0.4366, "step": 800 }, { "epoch": 0.8695652173913043, "eval_loss": 0.543337881565094, "eval_runtime": 10.8166, "eval_samples_per_second": 44.376, "eval_steps_per_second": 2.774, "step": 800 }, { "epoch": 0.8804347826086957, "grad_norm": 0.947827935218811, "learning_rate": 3.532608695652174e-05, "loss": 0.4498, "step": 810 }, { "epoch": 0.8913043478260869, "grad_norm": 0.866033136844635, "learning_rate": 3.514492753623188e-05, "loss": 0.4495, "step": 820 }, { "epoch": 0.9021739130434783, "grad_norm": 0.8692104816436768, "learning_rate": 3.496376811594203e-05, "loss": 0.4502, "step": 830 }, { "epoch": 0.9130434782608695, "grad_norm": 0.8841784596443176, "learning_rate": 3.478260869565218e-05, "loss": 0.4577, "step": 840 }, { "epoch": 0.9239130434782609, "grad_norm": 0.9257445335388184, "learning_rate": 3.460144927536232e-05, "loss": 0.4329, "step": 850 }, { "epoch": 0.9239130434782609, "eval_loss": 0.5429030060768127, "eval_runtime": 10.8284, "eval_samples_per_second": 44.328, "eval_steps_per_second": 2.77, "step": 850 }, { "epoch": 0.9347826086956522, "grad_norm": 0.8753738403320312, "learning_rate": 3.4420289855072465e-05, "loss": 0.422, "step": 860 }, { "epoch": 0.9456521739130435, "grad_norm": 0.8853139877319336, "learning_rate": 3.423913043478261e-05, "loss": 0.4357, "step": 870 }, { "epoch": 0.9565217391304348, "grad_norm": 0.8637978434562683, "learning_rate": 3.405797101449276e-05, "loss": 0.4254, "step": 880 }, { "epoch": 0.967391304347826, "grad_norm": 0.9090889096260071, "learning_rate": 3.38768115942029e-05, "loss": 0.4526, "step": 890 }, { "epoch": 0.9782608695652174, "grad_norm": 0.8591863512992859, "learning_rate": 3.369565217391305e-05, "loss": 0.4566, "step": 900 }, { "epoch": 0.9782608695652174, "eval_loss": 0.5399300456047058, "eval_runtime": 10.8533, "eval_samples_per_second": 44.226, "eval_steps_per_second": 2.764, "step": 900 }, { "epoch": 0.9891304347826086, "grad_norm": 0.8273764848709106, "learning_rate": 3.351449275362319e-05, "loss": 0.4182, "step": 910 }, { "epoch": 1.0, "grad_norm": 1.5736610889434814, "learning_rate": 3.3333333333333335e-05, "loss": 0.4548, "step": 920 }, { "epoch": 1.0108695652173914, "grad_norm": 0.8933672904968262, "learning_rate": 3.3152173913043475e-05, "loss": 0.3834, "step": 930 }, { "epoch": 1.0217391304347827, "grad_norm": 0.9641056060791016, "learning_rate": 3.297101449275363e-05, "loss": 0.3962, "step": 940 }, { "epoch": 1.0326086956521738, "grad_norm": 0.8840826153755188, "learning_rate": 3.278985507246377e-05, "loss": 0.3886, "step": 950 }, { "epoch": 1.0326086956521738, "eval_loss": 0.5417191982269287, "eval_runtime": 10.828, "eval_samples_per_second": 44.33, "eval_steps_per_second": 2.771, "step": 950 }, { "epoch": 1.0434782608695652, "grad_norm": 0.9005197286605835, "learning_rate": 3.260869565217392e-05, "loss": 0.3887, "step": 960 }, { "epoch": 1.0543478260869565, "grad_norm": 1.0132256746292114, "learning_rate": 3.242753623188406e-05, "loss": 0.4026, "step": 970 }, { "epoch": 1.065217391304348, "grad_norm": 1.1139558553695679, "learning_rate": 3.2246376811594205e-05, "loss": 0.3703, "step": 980 }, { "epoch": 1.0760869565217392, "grad_norm": 0.9440574049949646, "learning_rate": 3.2065217391304345e-05, "loss": 0.3766, "step": 990 }, { "epoch": 1.0869565217391304, "grad_norm": 1.0028133392333984, "learning_rate": 3.188405797101449e-05, "loss": 0.4038, "step": 1000 }, { "epoch": 1.0869565217391304, "eval_loss": 0.5413796901702881, "eval_runtime": 10.822, "eval_samples_per_second": 44.354, "eval_steps_per_second": 2.772, "step": 1000 }, { "epoch": 1.0978260869565217, "grad_norm": 0.9884430766105652, "learning_rate": 3.170289855072464e-05, "loss": 0.4134, "step": 1010 }, { "epoch": 1.108695652173913, "grad_norm": 0.920870840549469, "learning_rate": 3.152173913043479e-05, "loss": 0.3837, "step": 1020 }, { "epoch": 1.1195652173913044, "grad_norm": 0.942890465259552, "learning_rate": 3.134057971014493e-05, "loss": 0.38, "step": 1030 }, { "epoch": 1.1304347826086956, "grad_norm": 1.0265600681304932, "learning_rate": 3.1159420289855074e-05, "loss": 0.3988, "step": 1040 }, { "epoch": 1.141304347826087, "grad_norm": 0.9994137287139893, "learning_rate": 3.0978260869565215e-05, "loss": 0.3649, "step": 1050 }, { "epoch": 1.141304347826087, "eval_loss": 0.537377655506134, "eval_runtime": 10.7968, "eval_samples_per_second": 44.458, "eval_steps_per_second": 2.779, "step": 1050 }, { "epoch": 1.1521739130434783, "grad_norm": 0.9697039723396301, "learning_rate": 3.079710144927536e-05, "loss": 0.393, "step": 1060 }, { "epoch": 1.1630434782608696, "grad_norm": 0.9805816411972046, "learning_rate": 3.061594202898551e-05, "loss": 0.3594, "step": 1070 }, { "epoch": 1.1739130434782608, "grad_norm": 0.9799211621284485, "learning_rate": 3.0434782608695656e-05, "loss": 0.387, "step": 1080 }, { "epoch": 1.184782608695652, "grad_norm": 0.9287502765655518, "learning_rate": 3.02536231884058e-05, "loss": 0.3889, "step": 1090 }, { "epoch": 1.1956521739130435, "grad_norm": 1.042643666267395, "learning_rate": 3.0072463768115944e-05, "loss": 0.3894, "step": 1100 }, { "epoch": 1.1956521739130435, "eval_loss": 0.5378063917160034, "eval_runtime": 10.77, "eval_samples_per_second": 44.568, "eval_steps_per_second": 2.786, "step": 1100 }, { "epoch": 1.2065217391304348, "grad_norm": 0.963258683681488, "learning_rate": 2.9891304347826088e-05, "loss": 0.3824, "step": 1110 }, { "epoch": 1.2173913043478262, "grad_norm": 0.939275860786438, "learning_rate": 2.971014492753623e-05, "loss": 0.3898, "step": 1120 }, { "epoch": 1.2282608695652173, "grad_norm": 1.1144332885742188, "learning_rate": 2.9528985507246375e-05, "loss": 0.3754, "step": 1130 }, { "epoch": 1.2391304347826086, "grad_norm": 0.999047577381134, "learning_rate": 2.9347826086956526e-05, "loss": 0.3876, "step": 1140 }, { "epoch": 1.25, "grad_norm": 1.024510383605957, "learning_rate": 2.916666666666667e-05, "loss": 0.3782, "step": 1150 }, { "epoch": 1.25, "eval_loss": 0.5348225831985474, "eval_runtime": 10.7748, "eval_samples_per_second": 44.548, "eval_steps_per_second": 2.784, "step": 1150 }, { "epoch": 1.2608695652173914, "grad_norm": 1.0736693143844604, "learning_rate": 2.8985507246376814e-05, "loss": 0.3895, "step": 1160 }, { "epoch": 1.2717391304347827, "grad_norm": 0.9759789705276489, "learning_rate": 2.8804347826086957e-05, "loss": 0.3989, "step": 1170 }, { "epoch": 1.2826086956521738, "grad_norm": 0.8532370328903198, "learning_rate": 2.86231884057971e-05, "loss": 0.3814, "step": 1180 }, { "epoch": 1.2934782608695652, "grad_norm": 0.9838298559188843, "learning_rate": 2.8442028985507245e-05, "loss": 0.3864, "step": 1190 }, { "epoch": 1.3043478260869565, "grad_norm": 0.8785775899887085, "learning_rate": 2.826086956521739e-05, "loss": 0.3676, "step": 1200 }, { "epoch": 1.3043478260869565, "eval_loss": 0.5335877537727356, "eval_runtime": 10.7959, "eval_samples_per_second": 44.461, "eval_steps_per_second": 2.779, "step": 1200 }, { "epoch": 1.315217391304348, "grad_norm": 0.9719653129577637, "learning_rate": 2.807971014492754e-05, "loss": 0.3913, "step": 1210 }, { "epoch": 1.3260869565217392, "grad_norm": 0.9099355340003967, "learning_rate": 2.7898550724637683e-05, "loss": 0.4019, "step": 1220 }, { "epoch": 1.3369565217391304, "grad_norm": 0.9192841649055481, "learning_rate": 2.7717391304347827e-05, "loss": 0.3921, "step": 1230 }, { "epoch": 1.3478260869565217, "grad_norm": 1.1005280017852783, "learning_rate": 2.753623188405797e-05, "loss": 0.3892, "step": 1240 }, { "epoch": 1.358695652173913, "grad_norm": 0.9969606995582581, "learning_rate": 2.7355072463768118e-05, "loss": 0.3756, "step": 1250 }, { "epoch": 1.358695652173913, "eval_loss": 0.5348997712135315, "eval_runtime": 10.8018, "eval_samples_per_second": 44.437, "eval_steps_per_second": 2.777, "step": 1250 }, { "epoch": 1.3695652173913042, "grad_norm": 1.1843461990356445, "learning_rate": 2.7173913043478262e-05, "loss": 0.3751, "step": 1260 }, { "epoch": 1.3804347826086958, "grad_norm": 0.9097041487693787, "learning_rate": 2.6992753623188406e-05, "loss": 0.3852, "step": 1270 }, { "epoch": 1.391304347826087, "grad_norm": 0.9780197739601135, "learning_rate": 2.6811594202898553e-05, "loss": 0.3776, "step": 1280 }, { "epoch": 1.4021739130434783, "grad_norm": 1.0949311256408691, "learning_rate": 2.66304347826087e-05, "loss": 0.3517, "step": 1290 }, { "epoch": 1.4130434782608696, "grad_norm": 1.0105839967727661, "learning_rate": 2.6449275362318844e-05, "loss": 0.384, "step": 1300 }, { "epoch": 1.4130434782608696, "eval_loss": 0.5327685475349426, "eval_runtime": 10.8023, "eval_samples_per_second": 44.435, "eval_steps_per_second": 2.777, "step": 1300 }, { "epoch": 1.4239130434782608, "grad_norm": 1.0113478899002075, "learning_rate": 2.6268115942028988e-05, "loss": 0.3774, "step": 1310 }, { "epoch": 1.434782608695652, "grad_norm": 1.0456714630126953, "learning_rate": 2.608695652173913e-05, "loss": 0.3618, "step": 1320 }, { "epoch": 1.4456521739130435, "grad_norm": 1.048971176147461, "learning_rate": 2.5905797101449275e-05, "loss": 0.3707, "step": 1330 }, { "epoch": 1.4565217391304348, "grad_norm": 0.9918339252471924, "learning_rate": 2.572463768115942e-05, "loss": 0.3979, "step": 1340 }, { "epoch": 1.4673913043478262, "grad_norm": 0.8943142890930176, "learning_rate": 2.554347826086957e-05, "loss": 0.3647, "step": 1350 }, { "epoch": 1.4673913043478262, "eval_loss": 0.530137836933136, "eval_runtime": 10.8212, "eval_samples_per_second": 44.357, "eval_steps_per_second": 2.772, "step": 1350 }, { "epoch": 1.4782608695652173, "grad_norm": 1.1170648336410522, "learning_rate": 2.5362318840579714e-05, "loss": 0.3752, "step": 1360 }, { "epoch": 1.4891304347826086, "grad_norm": 1.0574833154678345, "learning_rate": 2.5181159420289857e-05, "loss": 0.4005, "step": 1370 }, { "epoch": 1.5, "grad_norm": 1.0728522539138794, "learning_rate": 2.5e-05, "loss": 0.404, "step": 1380 }, { "epoch": 1.5108695652173914, "grad_norm": 1.0159659385681152, "learning_rate": 2.4818840579710145e-05, "loss": 0.4051, "step": 1390 }, { "epoch": 1.5217391304347827, "grad_norm": 0.9714246392250061, "learning_rate": 2.4637681159420292e-05, "loss": 0.3775, "step": 1400 }, { "epoch": 1.5217391304347827, "eval_loss": 0.528946578502655, "eval_runtime": 10.8248, "eval_samples_per_second": 44.343, "eval_steps_per_second": 2.771, "step": 1400 }, { "epoch": 1.5326086956521738, "grad_norm": 1.0848588943481445, "learning_rate": 2.4456521739130436e-05, "loss": 0.3703, "step": 1410 }, { "epoch": 1.5434782608695652, "grad_norm": 1.0333784818649292, "learning_rate": 2.427536231884058e-05, "loss": 0.3692, "step": 1420 }, { "epoch": 1.5543478260869565, "grad_norm": 0.9682796597480774, "learning_rate": 2.4094202898550724e-05, "loss": 0.3722, "step": 1430 }, { "epoch": 1.5652173913043477, "grad_norm": 1.0484014749526978, "learning_rate": 2.391304347826087e-05, "loss": 0.3763, "step": 1440 }, { "epoch": 1.5760869565217392, "grad_norm": 0.9075008630752563, "learning_rate": 2.3731884057971015e-05, "loss": 0.3594, "step": 1450 }, { "epoch": 1.5760869565217392, "eval_loss": 0.5278663039207458, "eval_runtime": 10.831, "eval_samples_per_second": 44.317, "eval_steps_per_second": 2.77, "step": 1450 }, { "epoch": 1.5869565217391304, "grad_norm": 0.9710797667503357, "learning_rate": 2.355072463768116e-05, "loss": 0.3821, "step": 1460 }, { "epoch": 1.5978260869565217, "grad_norm": 1.0220218896865845, "learning_rate": 2.3369565217391306e-05, "loss": 0.3623, "step": 1470 }, { "epoch": 1.608695652173913, "grad_norm": 1.0994117259979248, "learning_rate": 2.318840579710145e-05, "loss": 0.3735, "step": 1480 }, { "epoch": 1.6195652173913042, "grad_norm": 0.9513503313064575, "learning_rate": 2.3007246376811593e-05, "loss": 0.3743, "step": 1490 }, { "epoch": 1.6304347826086958, "grad_norm": 0.9903764128684998, "learning_rate": 2.282608695652174e-05, "loss": 0.3661, "step": 1500 }, { "epoch": 1.6304347826086958, "eval_loss": 0.5267059803009033, "eval_runtime": 10.8461, "eval_samples_per_second": 44.256, "eval_steps_per_second": 2.766, "step": 1500 }, { "epoch": 1.641304347826087, "grad_norm": 0.9712284207344055, "learning_rate": 2.2644927536231884e-05, "loss": 0.3662, "step": 1510 }, { "epoch": 1.6521739130434783, "grad_norm": 0.9667128920555115, "learning_rate": 2.246376811594203e-05, "loss": 0.3767, "step": 1520 }, { "epoch": 1.6630434782608696, "grad_norm": 0.9828950762748718, "learning_rate": 2.2282608695652175e-05, "loss": 0.3766, "step": 1530 }, { "epoch": 1.6739130434782608, "grad_norm": 1.0777678489685059, "learning_rate": 2.2101449275362323e-05, "loss": 0.3791, "step": 1540 }, { "epoch": 1.6847826086956523, "grad_norm": 1.006035566329956, "learning_rate": 2.1920289855072466e-05, "loss": 0.3961, "step": 1550 }, { "epoch": 1.6847826086956523, "eval_loss": 0.5241792798042297, "eval_runtime": 10.8288, "eval_samples_per_second": 44.326, "eval_steps_per_second": 2.77, "step": 1550 }, { "epoch": 1.6956521739130435, "grad_norm": 0.9397144913673401, "learning_rate": 2.173913043478261e-05, "loss": 0.3659, "step": 1560 }, { "epoch": 1.7065217391304348, "grad_norm": 1.1106013059616089, "learning_rate": 2.1557971014492757e-05, "loss": 0.381, "step": 1570 }, { "epoch": 1.7173913043478262, "grad_norm": 0.9626741409301758, "learning_rate": 2.13768115942029e-05, "loss": 0.3751, "step": 1580 }, { "epoch": 1.7282608695652173, "grad_norm": 1.139076828956604, "learning_rate": 2.1195652173913045e-05, "loss": 0.3888, "step": 1590 }, { "epoch": 1.7391304347826086, "grad_norm": 1.0335912704467773, "learning_rate": 2.101449275362319e-05, "loss": 0.3758, "step": 1600 }, { "epoch": 1.7391304347826086, "eval_loss": 0.5224525332450867, "eval_runtime": 10.8033, "eval_samples_per_second": 44.431, "eval_steps_per_second": 2.777, "step": 1600 }, { "epoch": 1.75, "grad_norm": 1.0831104516983032, "learning_rate": 2.0833333333333336e-05, "loss": 0.3429, "step": 1610 }, { "epoch": 1.7608695652173914, "grad_norm": 1.0405676364898682, "learning_rate": 2.065217391304348e-05, "loss": 0.3506, "step": 1620 }, { "epoch": 1.7717391304347827, "grad_norm": 0.9948307275772095, "learning_rate": 2.0471014492753624e-05, "loss": 0.3528, "step": 1630 }, { "epoch": 1.7826086956521738, "grad_norm": 1.103110671043396, "learning_rate": 2.028985507246377e-05, "loss": 0.3779, "step": 1640 }, { "epoch": 1.7934782608695652, "grad_norm": 1.0501149892807007, "learning_rate": 2.0108695652173915e-05, "loss": 0.3957, "step": 1650 }, { "epoch": 1.7934782608695652, "eval_loss": 0.5198243260383606, "eval_runtime": 10.7809, "eval_samples_per_second": 44.523, "eval_steps_per_second": 2.783, "step": 1650 }, { "epoch": 1.8043478260869565, "grad_norm": 1.0968273878097534, "learning_rate": 1.992753623188406e-05, "loss": 0.3746, "step": 1660 }, { "epoch": 1.8152173913043477, "grad_norm": 1.0902262926101685, "learning_rate": 1.9746376811594202e-05, "loss": 0.3688, "step": 1670 }, { "epoch": 1.8260869565217392, "grad_norm": 1.1077326536178589, "learning_rate": 1.956521739130435e-05, "loss": 0.3274, "step": 1680 }, { "epoch": 1.8369565217391304, "grad_norm": 1.005223035812378, "learning_rate": 1.9384057971014493e-05, "loss": 0.3858, "step": 1690 }, { "epoch": 1.8478260869565217, "grad_norm": 1.131982445716858, "learning_rate": 1.9202898550724637e-05, "loss": 0.3676, "step": 1700 }, { "epoch": 1.8478260869565217, "eval_loss": 0.5192354321479797, "eval_runtime": 10.7892, "eval_samples_per_second": 44.489, "eval_steps_per_second": 2.781, "step": 1700 }, { "epoch": 1.858695652173913, "grad_norm": 1.0396854877471924, "learning_rate": 1.9021739130434784e-05, "loss": 0.3738, "step": 1710 }, { "epoch": 1.8695652173913042, "grad_norm": 1.1090553998947144, "learning_rate": 1.8840579710144928e-05, "loss": 0.3695, "step": 1720 }, { "epoch": 1.8804347826086958, "grad_norm": 1.081115484237671, "learning_rate": 1.8659420289855072e-05, "loss": 0.3549, "step": 1730 }, { "epoch": 1.891304347826087, "grad_norm": 1.1276469230651855, "learning_rate": 1.8478260869565216e-05, "loss": 0.364, "step": 1740 }, { "epoch": 1.9021739130434783, "grad_norm": 1.0103682279586792, "learning_rate": 1.8297101449275363e-05, "loss": 0.3781, "step": 1750 }, { "epoch": 1.9021739130434783, "eval_loss": 0.5211134552955627, "eval_runtime": 10.7922, "eval_samples_per_second": 44.477, "eval_steps_per_second": 2.78, "step": 1750 }, { "epoch": 1.9130434782608696, "grad_norm": 0.9383260011672974, "learning_rate": 1.8115942028985507e-05, "loss": 0.3469, "step": 1760 }, { "epoch": 1.9239130434782608, "grad_norm": 1.095609188079834, "learning_rate": 1.793478260869565e-05, "loss": 0.3747, "step": 1770 }, { "epoch": 1.9347826086956523, "grad_norm": 1.0580706596374512, "learning_rate": 1.7753623188405798e-05, "loss": 0.3751, "step": 1780 }, { "epoch": 1.9456521739130435, "grad_norm": 0.9539552927017212, "learning_rate": 1.757246376811594e-05, "loss": 0.3638, "step": 1790 }, { "epoch": 1.9565217391304348, "grad_norm": 1.0025752782821655, "learning_rate": 1.739130434782609e-05, "loss": 0.3851, "step": 1800 }, { "epoch": 1.9565217391304348, "eval_loss": 0.516950249671936, "eval_runtime": 10.7904, "eval_samples_per_second": 44.484, "eval_steps_per_second": 2.78, "step": 1800 }, { "epoch": 1.9673913043478262, "grad_norm": 1.0852687358856201, "learning_rate": 1.7210144927536233e-05, "loss": 0.3472, "step": 1810 }, { "epoch": 1.9782608695652173, "grad_norm": 1.1232205629348755, "learning_rate": 1.702898550724638e-05, "loss": 0.3591, "step": 1820 }, { "epoch": 1.9891304347826086, "grad_norm": 1.0539883375167847, "learning_rate": 1.6847826086956524e-05, "loss": 0.3657, "step": 1830 }, { "epoch": 2.0, "grad_norm": 2.387427806854248, "learning_rate": 1.6666666666666667e-05, "loss": 0.393, "step": 1840 }, { "epoch": 2.010869565217391, "grad_norm": 1.0311148166656494, "learning_rate": 1.6485507246376815e-05, "loss": 0.3364, "step": 1850 }, { "epoch": 2.010869565217391, "eval_loss": 0.5237263441085815, "eval_runtime": 10.8116, "eval_samples_per_second": 44.397, "eval_steps_per_second": 2.775, "step": 1850 }, { "epoch": 2.0217391304347827, "grad_norm": 1.2049050331115723, "learning_rate": 1.630434782608696e-05, "loss": 0.336, "step": 1860 }, { "epoch": 2.032608695652174, "grad_norm": 1.0490036010742188, "learning_rate": 1.6123188405797102e-05, "loss": 0.3243, "step": 1870 }, { "epoch": 2.0434782608695654, "grad_norm": 1.131861686706543, "learning_rate": 1.5942028985507246e-05, "loss": 0.3311, "step": 1880 }, { "epoch": 2.0543478260869565, "grad_norm": 1.0843744277954102, "learning_rate": 1.5760869565217393e-05, "loss": 0.3107, "step": 1890 }, { "epoch": 2.0652173913043477, "grad_norm": 1.1070170402526855, "learning_rate": 1.5579710144927537e-05, "loss": 0.3357, "step": 1900 }, { "epoch": 2.0652173913043477, "eval_loss": 0.5324747562408447, "eval_runtime": 10.8165, "eval_samples_per_second": 44.377, "eval_steps_per_second": 2.774, "step": 1900 }, { "epoch": 2.0760869565217392, "grad_norm": 1.1747304201126099, "learning_rate": 1.539855072463768e-05, "loss": 0.3301, "step": 1910 }, { "epoch": 2.0869565217391304, "grad_norm": 1.151397943496704, "learning_rate": 1.5217391304347828e-05, "loss": 0.3313, "step": 1920 }, { "epoch": 2.097826086956522, "grad_norm": 1.1628749370574951, "learning_rate": 1.5036231884057972e-05, "loss": 0.322, "step": 1930 }, { "epoch": 2.108695652173913, "grad_norm": 1.1999317407608032, "learning_rate": 1.4855072463768116e-05, "loss": 0.3174, "step": 1940 }, { "epoch": 2.119565217391304, "grad_norm": 1.1956512928009033, "learning_rate": 1.4673913043478263e-05, "loss": 0.3281, "step": 1950 }, { "epoch": 2.119565217391304, "eval_loss": 0.5320965647697449, "eval_runtime": 10.8261, "eval_samples_per_second": 44.337, "eval_steps_per_second": 2.771, "step": 1950 }, { "epoch": 2.130434782608696, "grad_norm": 1.0482760667800903, "learning_rate": 1.4492753623188407e-05, "loss": 0.3193, "step": 1960 }, { "epoch": 2.141304347826087, "grad_norm": 1.72954261302948, "learning_rate": 1.431159420289855e-05, "loss": 0.3418, "step": 1970 }, { "epoch": 2.1521739130434785, "grad_norm": 1.2846759557724, "learning_rate": 1.4130434782608694e-05, "loss": 0.3274, "step": 1980 }, { "epoch": 2.1630434782608696, "grad_norm": 1.2016099691390991, "learning_rate": 1.3949275362318842e-05, "loss": 0.3305, "step": 1990 }, { "epoch": 2.1739130434782608, "grad_norm": 1.1600474119186401, "learning_rate": 1.3768115942028985e-05, "loss": 0.3424, "step": 2000 }, { "epoch": 2.1739130434782608, "eval_loss": 0.5298786163330078, "eval_runtime": 10.8329, "eval_samples_per_second": 44.31, "eval_steps_per_second": 2.769, "step": 2000 }, { "epoch": 2.1847826086956523, "grad_norm": 1.2170861959457397, "learning_rate": 1.3586956521739131e-05, "loss": 0.3024, "step": 2010 }, { "epoch": 2.1956521739130435, "grad_norm": 1.1393592357635498, "learning_rate": 1.3405797101449276e-05, "loss": 0.3211, "step": 2020 }, { "epoch": 2.2065217391304346, "grad_norm": 1.1940656900405884, "learning_rate": 1.3224637681159422e-05, "loss": 0.3157, "step": 2030 }, { "epoch": 2.217391304347826, "grad_norm": 1.2777471542358398, "learning_rate": 1.3043478260869566e-05, "loss": 0.3382, "step": 2040 }, { "epoch": 2.2282608695652173, "grad_norm": 1.1014587879180908, "learning_rate": 1.286231884057971e-05, "loss": 0.3198, "step": 2050 }, { "epoch": 2.2282608695652173, "eval_loss": 0.5298696160316467, "eval_runtime": 10.844, "eval_samples_per_second": 44.264, "eval_steps_per_second": 2.767, "step": 2050 }, { "epoch": 2.239130434782609, "grad_norm": 1.056643009185791, "learning_rate": 1.2681159420289857e-05, "loss": 0.317, "step": 2060 }, { "epoch": 2.25, "grad_norm": 1.1301910877227783, "learning_rate": 1.25e-05, "loss": 0.3266, "step": 2070 }, { "epoch": 2.260869565217391, "grad_norm": 1.2433290481567383, "learning_rate": 1.2318840579710146e-05, "loss": 0.3313, "step": 2080 }, { "epoch": 2.2717391304347827, "grad_norm": 1.2331655025482178, "learning_rate": 1.213768115942029e-05, "loss": 0.3353, "step": 2090 }, { "epoch": 2.282608695652174, "grad_norm": 1.2137727737426758, "learning_rate": 1.1956521739130435e-05, "loss": 0.3183, "step": 2100 }, { "epoch": 2.282608695652174, "eval_loss": 0.5301353335380554, "eval_runtime": 10.829, "eval_samples_per_second": 44.326, "eval_steps_per_second": 2.77, "step": 2100 }, { "epoch": 2.2934782608695654, "grad_norm": 1.0960383415222168, "learning_rate": 1.177536231884058e-05, "loss": 0.3083, "step": 2110 }, { "epoch": 2.3043478260869565, "grad_norm": 1.2978132963180542, "learning_rate": 1.1594202898550725e-05, "loss": 0.3218, "step": 2120 }, { "epoch": 2.3152173913043477, "grad_norm": 1.1214746236801147, "learning_rate": 1.141304347826087e-05, "loss": 0.3334, "step": 2130 }, { "epoch": 2.3260869565217392, "grad_norm": 1.2013393640518188, "learning_rate": 1.1231884057971016e-05, "loss": 0.3017, "step": 2140 }, { "epoch": 2.3369565217391304, "grad_norm": 1.2782713174819946, "learning_rate": 1.1050724637681161e-05, "loss": 0.3272, "step": 2150 }, { "epoch": 2.3369565217391304, "eval_loss": 0.5313724279403687, "eval_runtime": 10.8265, "eval_samples_per_second": 44.336, "eval_steps_per_second": 2.771, "step": 2150 }, { "epoch": 2.3478260869565215, "grad_norm": 1.1971640586853027, "learning_rate": 1.0869565217391305e-05, "loss": 0.3295, "step": 2160 }, { "epoch": 2.358695652173913, "grad_norm": 1.1936053037643433, "learning_rate": 1.068840579710145e-05, "loss": 0.3132, "step": 2170 }, { "epoch": 2.369565217391304, "grad_norm": 1.0900537967681885, "learning_rate": 1.0507246376811594e-05, "loss": 0.3212, "step": 2180 }, { "epoch": 2.380434782608696, "grad_norm": 1.4006975889205933, "learning_rate": 1.032608695652174e-05, "loss": 0.3258, "step": 2190 }, { "epoch": 2.391304347826087, "grad_norm": 1.3683350086212158, "learning_rate": 1.0144927536231885e-05, "loss": 0.3376, "step": 2200 }, { "epoch": 2.391304347826087, "eval_loss": 0.5328701734542847, "eval_runtime": 10.8097, "eval_samples_per_second": 44.404, "eval_steps_per_second": 2.775, "step": 2200 }, { "epoch": 2.4021739130434785, "grad_norm": 1.120539903640747, "learning_rate": 9.96376811594203e-06, "loss": 0.3132, "step": 2210 }, { "epoch": 2.4130434782608696, "grad_norm": 1.1731290817260742, "learning_rate": 9.782608695652175e-06, "loss": 0.329, "step": 2220 }, { "epoch": 2.4239130434782608, "grad_norm": 1.2622851133346558, "learning_rate": 9.601449275362319e-06, "loss": 0.3181, "step": 2230 }, { "epoch": 2.4347826086956523, "grad_norm": 1.2982534170150757, "learning_rate": 9.420289855072464e-06, "loss": 0.3177, "step": 2240 }, { "epoch": 2.4456521739130435, "grad_norm": 1.2450315952301025, "learning_rate": 9.239130434782608e-06, "loss": 0.3361, "step": 2250 }, { "epoch": 2.4456521739130435, "eval_loss": 0.5312691330909729, "eval_runtime": 10.7859, "eval_samples_per_second": 44.502, "eval_steps_per_second": 2.781, "step": 2250 }, { "epoch": 2.4565217391304346, "grad_norm": 1.2270658016204834, "learning_rate": 9.057971014492753e-06, "loss": 0.3392, "step": 2260 }, { "epoch": 2.467391304347826, "grad_norm": 1.313056468963623, "learning_rate": 8.876811594202899e-06, "loss": 0.3065, "step": 2270 }, { "epoch": 2.4782608695652173, "grad_norm": 1.255582332611084, "learning_rate": 8.695652173913044e-06, "loss": 0.3224, "step": 2280 }, { "epoch": 2.489130434782609, "grad_norm": 1.354356050491333, "learning_rate": 8.51449275362319e-06, "loss": 0.3307, "step": 2290 }, { "epoch": 2.5, "grad_norm": 1.2218225002288818, "learning_rate": 8.333333333333334e-06, "loss": 0.3146, "step": 2300 }, { "epoch": 2.5, "eval_loss": 0.5301942825317383, "eval_runtime": 10.778, "eval_samples_per_second": 44.535, "eval_steps_per_second": 2.783, "step": 2300 }, { "epoch": 2.5108695652173916, "grad_norm": 1.2889074087142944, "learning_rate": 8.15217391304348e-06, "loss": 0.3135, "step": 2310 }, { "epoch": 2.5217391304347827, "grad_norm": 1.1224515438079834, "learning_rate": 7.971014492753623e-06, "loss": 0.3227, "step": 2320 }, { "epoch": 2.532608695652174, "grad_norm": 1.1914352178573608, "learning_rate": 7.789855072463769e-06, "loss": 0.3138, "step": 2330 }, { "epoch": 2.5434782608695654, "grad_norm": 1.3238486051559448, "learning_rate": 7.608695652173914e-06, "loss": 0.3168, "step": 2340 }, { "epoch": 2.5543478260869565, "grad_norm": 1.3031419515609741, "learning_rate": 7.427536231884058e-06, "loss": 0.3253, "step": 2350 }, { "epoch": 2.5543478260869565, "eval_loss": 0.5280157327651978, "eval_runtime": 10.8003, "eval_samples_per_second": 44.443, "eval_steps_per_second": 2.778, "step": 2350 }, { "epoch": 2.5652173913043477, "grad_norm": 1.2570607662200928, "learning_rate": 7.246376811594203e-06, "loss": 0.3246, "step": 2360 }, { "epoch": 2.5760869565217392, "grad_norm": 1.1971008777618408, "learning_rate": 7.065217391304347e-06, "loss": 0.3142, "step": 2370 }, { "epoch": 2.5869565217391304, "grad_norm": 1.2740592956542969, "learning_rate": 6.884057971014493e-06, "loss": 0.3263, "step": 2380 }, { "epoch": 2.5978260869565215, "grad_norm": 1.3141279220581055, "learning_rate": 6.702898550724638e-06, "loss": 0.3141, "step": 2390 }, { "epoch": 2.608695652173913, "grad_norm": 1.3131691217422485, "learning_rate": 6.521739130434783e-06, "loss": 0.3204, "step": 2400 }, { "epoch": 2.608695652173913, "eval_loss": 0.529800295829773, "eval_runtime": 10.7985, "eval_samples_per_second": 44.451, "eval_steps_per_second": 2.778, "step": 2400 }, { "epoch": 2.619565217391304, "grad_norm": 1.2273128032684326, "learning_rate": 6.340579710144928e-06, "loss": 0.3089, "step": 2410 }, { "epoch": 2.630434782608696, "grad_norm": 1.304915428161621, "learning_rate": 6.159420289855073e-06, "loss": 0.3211, "step": 2420 }, { "epoch": 2.641304347826087, "grad_norm": 1.261481761932373, "learning_rate": 5.978260869565218e-06, "loss": 0.3178, "step": 2430 }, { "epoch": 2.6521739130434785, "grad_norm": 1.2726097106933594, "learning_rate": 5.797101449275362e-06, "loss": 0.3342, "step": 2440 }, { "epoch": 2.6630434782608696, "grad_norm": 1.3216400146484375, "learning_rate": 5.615942028985508e-06, "loss": 0.3245, "step": 2450 }, { "epoch": 2.6630434782608696, "eval_loss": 0.5288810133934021, "eval_runtime": 10.8043, "eval_samples_per_second": 44.427, "eval_steps_per_second": 2.777, "step": 2450 }, { "epoch": 2.6739130434782608, "grad_norm": 1.2873594760894775, "learning_rate": 5.4347826086956525e-06, "loss": 0.3137, "step": 2460 }, { "epoch": 2.6847826086956523, "grad_norm": 1.2318347692489624, "learning_rate": 5.253623188405797e-06, "loss": 0.3258, "step": 2470 }, { "epoch": 2.6956521739130435, "grad_norm": 1.2132396697998047, "learning_rate": 5.072463768115943e-06, "loss": 0.3168, "step": 2480 }, { "epoch": 2.7065217391304346, "grad_norm": 1.3572018146514893, "learning_rate": 4.891304347826087e-06, "loss": 0.3187, "step": 2490 }, { "epoch": 2.717391304347826, "grad_norm": 1.2623666524887085, "learning_rate": 4.710144927536232e-06, "loss": 0.3257, "step": 2500 }, { "epoch": 2.717391304347826, "eval_loss": 0.5298188924789429, "eval_runtime": 10.8163, "eval_samples_per_second": 44.377, "eval_steps_per_second": 2.774, "step": 2500 } ], "logging_steps": 10, "max_steps": 2760, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.2027066610132582e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }