|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.717391304347826, |
|
"eval_steps": 50, |
|
"global_step": 2500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.010869565217391304, |
|
"grad_norm": 2.4570870399475098, |
|
"learning_rate": 4.981884057971015e-05, |
|
"loss": 2.3612, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.021739130434782608, |
|
"grad_norm": 1.2042421102523804, |
|
"learning_rate": 4.963768115942029e-05, |
|
"loss": 1.1915, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03260869565217391, |
|
"grad_norm": 1.2019174098968506, |
|
"learning_rate": 4.945652173913044e-05, |
|
"loss": 0.8992, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.043478260869565216, |
|
"grad_norm": 1.0546120405197144, |
|
"learning_rate": 4.9275362318840584e-05, |
|
"loss": 0.7276, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05434782608695652, |
|
"grad_norm": 0.754091203212738, |
|
"learning_rate": 4.909420289855073e-05, |
|
"loss": 0.6419, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05434782608695652, |
|
"eval_loss": 0.7905128598213196, |
|
"eval_runtime": 10.8418, |
|
"eval_samples_per_second": 44.273, |
|
"eval_steps_per_second": 2.767, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06521739130434782, |
|
"grad_norm": 0.7751966714859009, |
|
"learning_rate": 4.891304347826087e-05, |
|
"loss": 0.5971, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07608695652173914, |
|
"grad_norm": 0.6874057650566101, |
|
"learning_rate": 4.873188405797102e-05, |
|
"loss": 0.608, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.08695652173913043, |
|
"grad_norm": 0.7145748734474182, |
|
"learning_rate": 4.855072463768116e-05, |
|
"loss": 0.6111, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.09782608695652174, |
|
"grad_norm": 0.8841484189033508, |
|
"learning_rate": 4.836956521739131e-05, |
|
"loss": 0.5848, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.10869565217391304, |
|
"grad_norm": 0.7937784790992737, |
|
"learning_rate": 4.818840579710145e-05, |
|
"loss": 0.5699, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.10869565217391304, |
|
"eval_loss": 0.6956210732460022, |
|
"eval_runtime": 10.863, |
|
"eval_samples_per_second": 44.187, |
|
"eval_steps_per_second": 2.762, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11956521739130435, |
|
"grad_norm": 0.7818441987037659, |
|
"learning_rate": 4.80072463768116e-05, |
|
"loss": 0.5946, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.13043478260869565, |
|
"grad_norm": 0.8666340112686157, |
|
"learning_rate": 4.782608695652174e-05, |
|
"loss": 0.5469, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.14130434782608695, |
|
"grad_norm": 0.7637468576431274, |
|
"learning_rate": 4.764492753623189e-05, |
|
"loss": 0.5307, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.15217391304347827, |
|
"grad_norm": 0.8282362222671509, |
|
"learning_rate": 4.746376811594203e-05, |
|
"loss": 0.5312, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.16304347826086957, |
|
"grad_norm": 0.9675197601318359, |
|
"learning_rate": 4.7282608695652177e-05, |
|
"loss": 0.5486, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.16304347826086957, |
|
"eval_loss": 0.6567058563232422, |
|
"eval_runtime": 10.8376, |
|
"eval_samples_per_second": 44.29, |
|
"eval_steps_per_second": 2.768, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.17391304347826086, |
|
"grad_norm": 0.7035924792289734, |
|
"learning_rate": 4.710144927536232e-05, |
|
"loss": 0.5553, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.18478260869565216, |
|
"grad_norm": 0.8957257866859436, |
|
"learning_rate": 4.6920289855072464e-05, |
|
"loss": 0.54, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.1956521739130435, |
|
"grad_norm": 0.8544663190841675, |
|
"learning_rate": 4.673913043478261e-05, |
|
"loss": 0.55, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.20652173913043478, |
|
"grad_norm": 0.7599456310272217, |
|
"learning_rate": 4.655797101449276e-05, |
|
"loss": 0.5102, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.21739130434782608, |
|
"grad_norm": 0.9151259064674377, |
|
"learning_rate": 4.63768115942029e-05, |
|
"loss": 0.5372, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.21739130434782608, |
|
"eval_loss": 0.6319016218185425, |
|
"eval_runtime": 10.8237, |
|
"eval_samples_per_second": 44.347, |
|
"eval_steps_per_second": 2.772, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.22826086956521738, |
|
"grad_norm": 0.911018431186676, |
|
"learning_rate": 4.6195652173913046e-05, |
|
"loss": 0.5354, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.2391304347826087, |
|
"grad_norm": 0.7792191505432129, |
|
"learning_rate": 4.601449275362319e-05, |
|
"loss": 0.5297, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.7631202340126038, |
|
"learning_rate": 4.5833333333333334e-05, |
|
"loss": 0.5258, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.2608695652173913, |
|
"grad_norm": 0.7436497211456299, |
|
"learning_rate": 4.565217391304348e-05, |
|
"loss": 0.4849, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.2717391304347826, |
|
"grad_norm": 0.8372677564620972, |
|
"learning_rate": 4.547101449275363e-05, |
|
"loss": 0.5131, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.2717391304347826, |
|
"eval_loss": 0.6155125498771667, |
|
"eval_runtime": 10.8327, |
|
"eval_samples_per_second": 44.31, |
|
"eval_steps_per_second": 2.769, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.2826086956521739, |
|
"grad_norm": 0.8056390285491943, |
|
"learning_rate": 4.528985507246377e-05, |
|
"loss": 0.4675, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.29347826086956524, |
|
"grad_norm": 0.8015912175178528, |
|
"learning_rate": 4.5108695652173916e-05, |
|
"loss": 0.4942, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.30434782608695654, |
|
"grad_norm": 0.8316887021064758, |
|
"learning_rate": 4.492753623188406e-05, |
|
"loss": 0.5025, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.31521739130434784, |
|
"grad_norm": 0.8440260887145996, |
|
"learning_rate": 4.4746376811594203e-05, |
|
"loss": 0.5112, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.32608695652173914, |
|
"grad_norm": 0.8523954153060913, |
|
"learning_rate": 4.456521739130435e-05, |
|
"loss": 0.4973, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.32608695652173914, |
|
"eval_loss": 0.6024141907691956, |
|
"eval_runtime": 10.839, |
|
"eval_samples_per_second": 44.284, |
|
"eval_steps_per_second": 2.768, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.33695652173913043, |
|
"grad_norm": 0.8427216410636902, |
|
"learning_rate": 4.438405797101449e-05, |
|
"loss": 0.5271, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.34782608695652173, |
|
"grad_norm": 0.8126628398895264, |
|
"learning_rate": 4.4202898550724645e-05, |
|
"loss": 0.4608, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.358695652173913, |
|
"grad_norm": 0.9947460293769836, |
|
"learning_rate": 4.4021739130434786e-05, |
|
"loss": 0.4762, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.3695652173913043, |
|
"grad_norm": 0.7744721174240112, |
|
"learning_rate": 4.384057971014493e-05, |
|
"loss": 0.4993, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.3804347826086957, |
|
"grad_norm": 0.9483833909034729, |
|
"learning_rate": 4.365942028985507e-05, |
|
"loss": 0.497, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3804347826086957, |
|
"eval_loss": 0.596432626247406, |
|
"eval_runtime": 10.8191, |
|
"eval_samples_per_second": 44.366, |
|
"eval_steps_per_second": 2.773, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.391304347826087, |
|
"grad_norm": 0.7573915123939514, |
|
"learning_rate": 4.347826086956522e-05, |
|
"loss": 0.469, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.40217391304347827, |
|
"grad_norm": 0.8013887405395508, |
|
"learning_rate": 4.329710144927536e-05, |
|
"loss": 0.4704, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.41304347826086957, |
|
"grad_norm": 0.8412826657295227, |
|
"learning_rate": 4.3115942028985515e-05, |
|
"loss": 0.5033, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.42391304347826086, |
|
"grad_norm": 0.8203413486480713, |
|
"learning_rate": 4.2934782608695655e-05, |
|
"loss": 0.5063, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.43478260869565216, |
|
"grad_norm": 0.7952508926391602, |
|
"learning_rate": 4.27536231884058e-05, |
|
"loss": 0.4706, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.43478260869565216, |
|
"eval_loss": 0.5875406861305237, |
|
"eval_runtime": 10.8328, |
|
"eval_samples_per_second": 44.31, |
|
"eval_steps_per_second": 2.769, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.44565217391304346, |
|
"grad_norm": 0.9020015597343445, |
|
"learning_rate": 4.257246376811594e-05, |
|
"loss": 0.4786, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.45652173913043476, |
|
"grad_norm": 0.8588363528251648, |
|
"learning_rate": 4.239130434782609e-05, |
|
"loss": 0.482, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.4673913043478261, |
|
"grad_norm": 0.816198468208313, |
|
"learning_rate": 4.221014492753623e-05, |
|
"loss": 0.4683, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.4782608695652174, |
|
"grad_norm": 0.8713563680648804, |
|
"learning_rate": 4.202898550724638e-05, |
|
"loss": 0.4422, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.4891304347826087, |
|
"grad_norm": 0.8633397817611694, |
|
"learning_rate": 4.1847826086956525e-05, |
|
"loss": 0.45, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.4891304347826087, |
|
"eval_loss": 0.5812788605690002, |
|
"eval_runtime": 10.8048, |
|
"eval_samples_per_second": 44.425, |
|
"eval_steps_per_second": 2.777, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.8244722485542297, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 0.4911, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.5108695652173914, |
|
"grad_norm": 0.8201929926872253, |
|
"learning_rate": 4.148550724637681e-05, |
|
"loss": 0.4611, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.5217391304347826, |
|
"grad_norm": 0.9646862149238586, |
|
"learning_rate": 4.130434782608696e-05, |
|
"loss": 0.4738, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.532608695652174, |
|
"grad_norm": 0.9149807095527649, |
|
"learning_rate": 4.11231884057971e-05, |
|
"loss": 0.4603, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.5434782608695652, |
|
"grad_norm": 0.8054277896881104, |
|
"learning_rate": 4.094202898550725e-05, |
|
"loss": 0.466, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5434782608695652, |
|
"eval_loss": 0.573806881904602, |
|
"eval_runtime": 10.7906, |
|
"eval_samples_per_second": 44.483, |
|
"eval_steps_per_second": 2.78, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5543478260869565, |
|
"grad_norm": 0.9110711812973022, |
|
"learning_rate": 4.076086956521739e-05, |
|
"loss": 0.4777, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.5652173913043478, |
|
"grad_norm": 0.8030044436454773, |
|
"learning_rate": 4.057971014492754e-05, |
|
"loss": 0.4965, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.5760869565217391, |
|
"grad_norm": 0.8847204446792603, |
|
"learning_rate": 4.039855072463768e-05, |
|
"loss": 0.4843, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.5869565217391305, |
|
"grad_norm": 0.9041047692298889, |
|
"learning_rate": 4.021739130434783e-05, |
|
"loss": 0.4548, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.5978260869565217, |
|
"grad_norm": 0.8136462569236755, |
|
"learning_rate": 4.003623188405797e-05, |
|
"loss": 0.4543, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.5978260869565217, |
|
"eval_loss": 0.5662083029747009, |
|
"eval_runtime": 10.7748, |
|
"eval_samples_per_second": 44.549, |
|
"eval_steps_per_second": 2.784, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6086956521739131, |
|
"grad_norm": 0.9470314383506775, |
|
"learning_rate": 3.985507246376812e-05, |
|
"loss": 0.4852, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.6195652173913043, |
|
"grad_norm": 0.9280235171318054, |
|
"learning_rate": 3.9673913043478264e-05, |
|
"loss": 0.4655, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.6304347826086957, |
|
"grad_norm": 0.9118008017539978, |
|
"learning_rate": 3.9492753623188405e-05, |
|
"loss": 0.449, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.6413043478260869, |
|
"grad_norm": 0.9286589026451111, |
|
"learning_rate": 3.931159420289855e-05, |
|
"loss": 0.448, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.6521739130434783, |
|
"grad_norm": 0.7761940956115723, |
|
"learning_rate": 3.91304347826087e-05, |
|
"loss": 0.4363, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6521739130434783, |
|
"eval_loss": 0.5645204186439514, |
|
"eval_runtime": 10.7763, |
|
"eval_samples_per_second": 44.542, |
|
"eval_steps_per_second": 2.784, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6630434782608695, |
|
"grad_norm": 0.9244002103805542, |
|
"learning_rate": 3.8949275362318846e-05, |
|
"loss": 0.4354, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.6739130434782609, |
|
"grad_norm": 0.8263306021690369, |
|
"learning_rate": 3.876811594202899e-05, |
|
"loss": 0.4951, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.6847826086956522, |
|
"grad_norm": 0.8908408284187317, |
|
"learning_rate": 3.8586956521739134e-05, |
|
"loss": 0.4401, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.6956521739130435, |
|
"grad_norm": 0.8497889637947083, |
|
"learning_rate": 3.8405797101449274e-05, |
|
"loss": 0.4514, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.7065217391304348, |
|
"grad_norm": 0.8306752443313599, |
|
"learning_rate": 3.822463768115942e-05, |
|
"loss": 0.4503, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.7065217391304348, |
|
"eval_loss": 0.5576011538505554, |
|
"eval_runtime": 10.7704, |
|
"eval_samples_per_second": 44.567, |
|
"eval_steps_per_second": 2.785, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.717391304347826, |
|
"grad_norm": 0.898961067199707, |
|
"learning_rate": 3.804347826086957e-05, |
|
"loss": 0.4185, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.7282608695652174, |
|
"grad_norm": 0.8870590925216675, |
|
"learning_rate": 3.7862318840579716e-05, |
|
"loss": 0.4303, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.7391304347826086, |
|
"grad_norm": 0.8863015174865723, |
|
"learning_rate": 3.7681159420289856e-05, |
|
"loss": 0.4356, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.852130651473999, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 0.4756, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.7608695652173914, |
|
"grad_norm": 0.8342716097831726, |
|
"learning_rate": 3.7318840579710144e-05, |
|
"loss": 0.4955, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7608695652173914, |
|
"eval_loss": 0.5521832704544067, |
|
"eval_runtime": 10.7828, |
|
"eval_samples_per_second": 44.515, |
|
"eval_steps_per_second": 2.782, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7717391304347826, |
|
"grad_norm": 0.8382502794265747, |
|
"learning_rate": 3.713768115942029e-05, |
|
"loss": 0.4435, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.782608695652174, |
|
"grad_norm": 0.9789420962333679, |
|
"learning_rate": 3.695652173913043e-05, |
|
"loss": 0.3957, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.7934782608695652, |
|
"grad_norm": 0.8945524096488953, |
|
"learning_rate": 3.6775362318840586e-05, |
|
"loss": 0.4565, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.8043478260869565, |
|
"grad_norm": 0.9632206559181213, |
|
"learning_rate": 3.6594202898550726e-05, |
|
"loss": 0.4312, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.8152173913043478, |
|
"grad_norm": 0.8000399470329285, |
|
"learning_rate": 3.641304347826087e-05, |
|
"loss": 0.429, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.8152173913043478, |
|
"eval_loss": 0.550889790058136, |
|
"eval_runtime": 10.7987, |
|
"eval_samples_per_second": 44.45, |
|
"eval_steps_per_second": 2.778, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.8260869565217391, |
|
"grad_norm": 0.8817518353462219, |
|
"learning_rate": 3.6231884057971014e-05, |
|
"loss": 0.4464, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.8369565217391305, |
|
"grad_norm": 0.9800861477851868, |
|
"learning_rate": 3.605072463768116e-05, |
|
"loss": 0.4577, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.8478260869565217, |
|
"grad_norm": 0.846571683883667, |
|
"learning_rate": 3.58695652173913e-05, |
|
"loss": 0.4169, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.8586956521739131, |
|
"grad_norm": 0.8653192520141602, |
|
"learning_rate": 3.568840579710145e-05, |
|
"loss": 0.4195, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.8695652173913043, |
|
"grad_norm": 0.8515974283218384, |
|
"learning_rate": 3.5507246376811596e-05, |
|
"loss": 0.4366, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.8695652173913043, |
|
"eval_loss": 0.543337881565094, |
|
"eval_runtime": 10.8166, |
|
"eval_samples_per_second": 44.376, |
|
"eval_steps_per_second": 2.774, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.8804347826086957, |
|
"grad_norm": 0.947827935218811, |
|
"learning_rate": 3.532608695652174e-05, |
|
"loss": 0.4498, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.8913043478260869, |
|
"grad_norm": 0.866033136844635, |
|
"learning_rate": 3.514492753623188e-05, |
|
"loss": 0.4495, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.9021739130434783, |
|
"grad_norm": 0.8692104816436768, |
|
"learning_rate": 3.496376811594203e-05, |
|
"loss": 0.4502, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.9130434782608695, |
|
"grad_norm": 0.8841784596443176, |
|
"learning_rate": 3.478260869565218e-05, |
|
"loss": 0.4577, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.9239130434782609, |
|
"grad_norm": 0.9257445335388184, |
|
"learning_rate": 3.460144927536232e-05, |
|
"loss": 0.4329, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.9239130434782609, |
|
"eval_loss": 0.5429030060768127, |
|
"eval_runtime": 10.8284, |
|
"eval_samples_per_second": 44.328, |
|
"eval_steps_per_second": 2.77, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.9347826086956522, |
|
"grad_norm": 0.8753738403320312, |
|
"learning_rate": 3.4420289855072465e-05, |
|
"loss": 0.422, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.9456521739130435, |
|
"grad_norm": 0.8853139877319336, |
|
"learning_rate": 3.423913043478261e-05, |
|
"loss": 0.4357, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.9565217391304348, |
|
"grad_norm": 0.8637978434562683, |
|
"learning_rate": 3.405797101449276e-05, |
|
"loss": 0.4254, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.967391304347826, |
|
"grad_norm": 0.9090889096260071, |
|
"learning_rate": 3.38768115942029e-05, |
|
"loss": 0.4526, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.9782608695652174, |
|
"grad_norm": 0.8591863512992859, |
|
"learning_rate": 3.369565217391305e-05, |
|
"loss": 0.4566, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9782608695652174, |
|
"eval_loss": 0.5399300456047058, |
|
"eval_runtime": 10.8533, |
|
"eval_samples_per_second": 44.226, |
|
"eval_steps_per_second": 2.764, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9891304347826086, |
|
"grad_norm": 0.8273764848709106, |
|
"learning_rate": 3.351449275362319e-05, |
|
"loss": 0.4182, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.5736610889434814, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 0.4548, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.0108695652173914, |
|
"grad_norm": 0.8933672904968262, |
|
"learning_rate": 3.3152173913043475e-05, |
|
"loss": 0.3834, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.0217391304347827, |
|
"grad_norm": 0.9641056060791016, |
|
"learning_rate": 3.297101449275363e-05, |
|
"loss": 0.3962, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.0326086956521738, |
|
"grad_norm": 0.8840826153755188, |
|
"learning_rate": 3.278985507246377e-05, |
|
"loss": 0.3886, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.0326086956521738, |
|
"eval_loss": 0.5417191982269287, |
|
"eval_runtime": 10.828, |
|
"eval_samples_per_second": 44.33, |
|
"eval_steps_per_second": 2.771, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.0434782608695652, |
|
"grad_norm": 0.9005197286605835, |
|
"learning_rate": 3.260869565217392e-05, |
|
"loss": 0.3887, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.0543478260869565, |
|
"grad_norm": 1.0132256746292114, |
|
"learning_rate": 3.242753623188406e-05, |
|
"loss": 0.4026, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.065217391304348, |
|
"grad_norm": 1.1139558553695679, |
|
"learning_rate": 3.2246376811594205e-05, |
|
"loss": 0.3703, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.0760869565217392, |
|
"grad_norm": 0.9440574049949646, |
|
"learning_rate": 3.2065217391304345e-05, |
|
"loss": 0.3766, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.0869565217391304, |
|
"grad_norm": 1.0028133392333984, |
|
"learning_rate": 3.188405797101449e-05, |
|
"loss": 0.4038, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.0869565217391304, |
|
"eval_loss": 0.5413796901702881, |
|
"eval_runtime": 10.822, |
|
"eval_samples_per_second": 44.354, |
|
"eval_steps_per_second": 2.772, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.0978260869565217, |
|
"grad_norm": 0.9884430766105652, |
|
"learning_rate": 3.170289855072464e-05, |
|
"loss": 0.4134, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.108695652173913, |
|
"grad_norm": 0.920870840549469, |
|
"learning_rate": 3.152173913043479e-05, |
|
"loss": 0.3837, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.1195652173913044, |
|
"grad_norm": 0.942890465259552, |
|
"learning_rate": 3.134057971014493e-05, |
|
"loss": 0.38, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.1304347826086956, |
|
"grad_norm": 1.0265600681304932, |
|
"learning_rate": 3.1159420289855074e-05, |
|
"loss": 0.3988, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.141304347826087, |
|
"grad_norm": 0.9994137287139893, |
|
"learning_rate": 3.0978260869565215e-05, |
|
"loss": 0.3649, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.141304347826087, |
|
"eval_loss": 0.537377655506134, |
|
"eval_runtime": 10.7968, |
|
"eval_samples_per_second": 44.458, |
|
"eval_steps_per_second": 2.779, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.1521739130434783, |
|
"grad_norm": 0.9697039723396301, |
|
"learning_rate": 3.079710144927536e-05, |
|
"loss": 0.393, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.1630434782608696, |
|
"grad_norm": 0.9805816411972046, |
|
"learning_rate": 3.061594202898551e-05, |
|
"loss": 0.3594, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.1739130434782608, |
|
"grad_norm": 0.9799211621284485, |
|
"learning_rate": 3.0434782608695656e-05, |
|
"loss": 0.387, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.184782608695652, |
|
"grad_norm": 0.9287502765655518, |
|
"learning_rate": 3.02536231884058e-05, |
|
"loss": 0.3889, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.1956521739130435, |
|
"grad_norm": 1.042643666267395, |
|
"learning_rate": 3.0072463768115944e-05, |
|
"loss": 0.3894, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.1956521739130435, |
|
"eval_loss": 0.5378063917160034, |
|
"eval_runtime": 10.77, |
|
"eval_samples_per_second": 44.568, |
|
"eval_steps_per_second": 2.786, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.2065217391304348, |
|
"grad_norm": 0.963258683681488, |
|
"learning_rate": 2.9891304347826088e-05, |
|
"loss": 0.3824, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.2173913043478262, |
|
"grad_norm": 0.939275860786438, |
|
"learning_rate": 2.971014492753623e-05, |
|
"loss": 0.3898, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.2282608695652173, |
|
"grad_norm": 1.1144332885742188, |
|
"learning_rate": 2.9528985507246375e-05, |
|
"loss": 0.3754, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.2391304347826086, |
|
"grad_norm": 0.999047577381134, |
|
"learning_rate": 2.9347826086956526e-05, |
|
"loss": 0.3876, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.024510383605957, |
|
"learning_rate": 2.916666666666667e-05, |
|
"loss": 0.3782, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_loss": 0.5348225831985474, |
|
"eval_runtime": 10.7748, |
|
"eval_samples_per_second": 44.548, |
|
"eval_steps_per_second": 2.784, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.2608695652173914, |
|
"grad_norm": 1.0736693143844604, |
|
"learning_rate": 2.8985507246376814e-05, |
|
"loss": 0.3895, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.2717391304347827, |
|
"grad_norm": 0.9759789705276489, |
|
"learning_rate": 2.8804347826086957e-05, |
|
"loss": 0.3989, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.2826086956521738, |
|
"grad_norm": 0.8532370328903198, |
|
"learning_rate": 2.86231884057971e-05, |
|
"loss": 0.3814, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.2934782608695652, |
|
"grad_norm": 0.9838298559188843, |
|
"learning_rate": 2.8442028985507245e-05, |
|
"loss": 0.3864, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.3043478260869565, |
|
"grad_norm": 0.8785775899887085, |
|
"learning_rate": 2.826086956521739e-05, |
|
"loss": 0.3676, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.3043478260869565, |
|
"eval_loss": 0.5335877537727356, |
|
"eval_runtime": 10.7959, |
|
"eval_samples_per_second": 44.461, |
|
"eval_steps_per_second": 2.779, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.315217391304348, |
|
"grad_norm": 0.9719653129577637, |
|
"learning_rate": 2.807971014492754e-05, |
|
"loss": 0.3913, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.3260869565217392, |
|
"grad_norm": 0.9099355340003967, |
|
"learning_rate": 2.7898550724637683e-05, |
|
"loss": 0.4019, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.3369565217391304, |
|
"grad_norm": 0.9192841649055481, |
|
"learning_rate": 2.7717391304347827e-05, |
|
"loss": 0.3921, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.3478260869565217, |
|
"grad_norm": 1.1005280017852783, |
|
"learning_rate": 2.753623188405797e-05, |
|
"loss": 0.3892, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.358695652173913, |
|
"grad_norm": 0.9969606995582581, |
|
"learning_rate": 2.7355072463768118e-05, |
|
"loss": 0.3756, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.358695652173913, |
|
"eval_loss": 0.5348997712135315, |
|
"eval_runtime": 10.8018, |
|
"eval_samples_per_second": 44.437, |
|
"eval_steps_per_second": 2.777, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.3695652173913042, |
|
"grad_norm": 1.1843461990356445, |
|
"learning_rate": 2.7173913043478262e-05, |
|
"loss": 0.3751, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.3804347826086958, |
|
"grad_norm": 0.9097041487693787, |
|
"learning_rate": 2.6992753623188406e-05, |
|
"loss": 0.3852, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.391304347826087, |
|
"grad_norm": 0.9780197739601135, |
|
"learning_rate": 2.6811594202898553e-05, |
|
"loss": 0.3776, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.4021739130434783, |
|
"grad_norm": 1.0949311256408691, |
|
"learning_rate": 2.66304347826087e-05, |
|
"loss": 0.3517, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.4130434782608696, |
|
"grad_norm": 1.0105839967727661, |
|
"learning_rate": 2.6449275362318844e-05, |
|
"loss": 0.384, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.4130434782608696, |
|
"eval_loss": 0.5327685475349426, |
|
"eval_runtime": 10.8023, |
|
"eval_samples_per_second": 44.435, |
|
"eval_steps_per_second": 2.777, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.4239130434782608, |
|
"grad_norm": 1.0113478899002075, |
|
"learning_rate": 2.6268115942028988e-05, |
|
"loss": 0.3774, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.434782608695652, |
|
"grad_norm": 1.0456714630126953, |
|
"learning_rate": 2.608695652173913e-05, |
|
"loss": 0.3618, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.4456521739130435, |
|
"grad_norm": 1.048971176147461, |
|
"learning_rate": 2.5905797101449275e-05, |
|
"loss": 0.3707, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.4565217391304348, |
|
"grad_norm": 0.9918339252471924, |
|
"learning_rate": 2.572463768115942e-05, |
|
"loss": 0.3979, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.4673913043478262, |
|
"grad_norm": 0.8943142890930176, |
|
"learning_rate": 2.554347826086957e-05, |
|
"loss": 0.3647, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.4673913043478262, |
|
"eval_loss": 0.530137836933136, |
|
"eval_runtime": 10.8212, |
|
"eval_samples_per_second": 44.357, |
|
"eval_steps_per_second": 2.772, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.4782608695652173, |
|
"grad_norm": 1.1170648336410522, |
|
"learning_rate": 2.5362318840579714e-05, |
|
"loss": 0.3752, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.4891304347826086, |
|
"grad_norm": 1.0574833154678345, |
|
"learning_rate": 2.5181159420289857e-05, |
|
"loss": 0.4005, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 1.0728522539138794, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.404, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.5108695652173914, |
|
"grad_norm": 1.0159659385681152, |
|
"learning_rate": 2.4818840579710145e-05, |
|
"loss": 0.4051, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.5217391304347827, |
|
"grad_norm": 0.9714246392250061, |
|
"learning_rate": 2.4637681159420292e-05, |
|
"loss": 0.3775, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.5217391304347827, |
|
"eval_loss": 0.528946578502655, |
|
"eval_runtime": 10.8248, |
|
"eval_samples_per_second": 44.343, |
|
"eval_steps_per_second": 2.771, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.5326086956521738, |
|
"grad_norm": 1.0848588943481445, |
|
"learning_rate": 2.4456521739130436e-05, |
|
"loss": 0.3703, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.5434782608695652, |
|
"grad_norm": 1.0333784818649292, |
|
"learning_rate": 2.427536231884058e-05, |
|
"loss": 0.3692, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.5543478260869565, |
|
"grad_norm": 0.9682796597480774, |
|
"learning_rate": 2.4094202898550724e-05, |
|
"loss": 0.3722, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.5652173913043477, |
|
"grad_norm": 1.0484014749526978, |
|
"learning_rate": 2.391304347826087e-05, |
|
"loss": 0.3763, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.5760869565217392, |
|
"grad_norm": 0.9075008630752563, |
|
"learning_rate": 2.3731884057971015e-05, |
|
"loss": 0.3594, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.5760869565217392, |
|
"eval_loss": 0.5278663039207458, |
|
"eval_runtime": 10.831, |
|
"eval_samples_per_second": 44.317, |
|
"eval_steps_per_second": 2.77, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.5869565217391304, |
|
"grad_norm": 0.9710797667503357, |
|
"learning_rate": 2.355072463768116e-05, |
|
"loss": 0.3821, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.5978260869565217, |
|
"grad_norm": 1.0220218896865845, |
|
"learning_rate": 2.3369565217391306e-05, |
|
"loss": 0.3623, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.608695652173913, |
|
"grad_norm": 1.0994117259979248, |
|
"learning_rate": 2.318840579710145e-05, |
|
"loss": 0.3735, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.6195652173913042, |
|
"grad_norm": 0.9513503313064575, |
|
"learning_rate": 2.3007246376811593e-05, |
|
"loss": 0.3743, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.6304347826086958, |
|
"grad_norm": 0.9903764128684998, |
|
"learning_rate": 2.282608695652174e-05, |
|
"loss": 0.3661, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.6304347826086958, |
|
"eval_loss": 0.5267059803009033, |
|
"eval_runtime": 10.8461, |
|
"eval_samples_per_second": 44.256, |
|
"eval_steps_per_second": 2.766, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.641304347826087, |
|
"grad_norm": 0.9712284207344055, |
|
"learning_rate": 2.2644927536231884e-05, |
|
"loss": 0.3662, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.6521739130434783, |
|
"grad_norm": 0.9667128920555115, |
|
"learning_rate": 2.246376811594203e-05, |
|
"loss": 0.3767, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.6630434782608696, |
|
"grad_norm": 0.9828950762748718, |
|
"learning_rate": 2.2282608695652175e-05, |
|
"loss": 0.3766, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.6739130434782608, |
|
"grad_norm": 1.0777678489685059, |
|
"learning_rate": 2.2101449275362323e-05, |
|
"loss": 0.3791, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.6847826086956523, |
|
"grad_norm": 1.006035566329956, |
|
"learning_rate": 2.1920289855072466e-05, |
|
"loss": 0.3961, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.6847826086956523, |
|
"eval_loss": 0.5241792798042297, |
|
"eval_runtime": 10.8288, |
|
"eval_samples_per_second": 44.326, |
|
"eval_steps_per_second": 2.77, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.6956521739130435, |
|
"grad_norm": 0.9397144913673401, |
|
"learning_rate": 2.173913043478261e-05, |
|
"loss": 0.3659, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.7065217391304348, |
|
"grad_norm": 1.1106013059616089, |
|
"learning_rate": 2.1557971014492757e-05, |
|
"loss": 0.381, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.7173913043478262, |
|
"grad_norm": 0.9626741409301758, |
|
"learning_rate": 2.13768115942029e-05, |
|
"loss": 0.3751, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.7282608695652173, |
|
"grad_norm": 1.139076828956604, |
|
"learning_rate": 2.1195652173913045e-05, |
|
"loss": 0.3888, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.7391304347826086, |
|
"grad_norm": 1.0335912704467773, |
|
"learning_rate": 2.101449275362319e-05, |
|
"loss": 0.3758, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.7391304347826086, |
|
"eval_loss": 0.5224525332450867, |
|
"eval_runtime": 10.8033, |
|
"eval_samples_per_second": 44.431, |
|
"eval_steps_per_second": 2.777, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 1.0831104516983032, |
|
"learning_rate": 2.0833333333333336e-05, |
|
"loss": 0.3429, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.7608695652173914, |
|
"grad_norm": 1.0405676364898682, |
|
"learning_rate": 2.065217391304348e-05, |
|
"loss": 0.3506, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.7717391304347827, |
|
"grad_norm": 0.9948307275772095, |
|
"learning_rate": 2.0471014492753624e-05, |
|
"loss": 0.3528, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.7826086956521738, |
|
"grad_norm": 1.103110671043396, |
|
"learning_rate": 2.028985507246377e-05, |
|
"loss": 0.3779, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.7934782608695652, |
|
"grad_norm": 1.0501149892807007, |
|
"learning_rate": 2.0108695652173915e-05, |
|
"loss": 0.3957, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.7934782608695652, |
|
"eval_loss": 0.5198243260383606, |
|
"eval_runtime": 10.7809, |
|
"eval_samples_per_second": 44.523, |
|
"eval_steps_per_second": 2.783, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.8043478260869565, |
|
"grad_norm": 1.0968273878097534, |
|
"learning_rate": 1.992753623188406e-05, |
|
"loss": 0.3746, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.8152173913043477, |
|
"grad_norm": 1.0902262926101685, |
|
"learning_rate": 1.9746376811594202e-05, |
|
"loss": 0.3688, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.8260869565217392, |
|
"grad_norm": 1.1077326536178589, |
|
"learning_rate": 1.956521739130435e-05, |
|
"loss": 0.3274, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.8369565217391304, |
|
"grad_norm": 1.005223035812378, |
|
"learning_rate": 1.9384057971014493e-05, |
|
"loss": 0.3858, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.8478260869565217, |
|
"grad_norm": 1.131982445716858, |
|
"learning_rate": 1.9202898550724637e-05, |
|
"loss": 0.3676, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.8478260869565217, |
|
"eval_loss": 0.5192354321479797, |
|
"eval_runtime": 10.7892, |
|
"eval_samples_per_second": 44.489, |
|
"eval_steps_per_second": 2.781, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.858695652173913, |
|
"grad_norm": 1.0396854877471924, |
|
"learning_rate": 1.9021739130434784e-05, |
|
"loss": 0.3738, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.8695652173913042, |
|
"grad_norm": 1.1090553998947144, |
|
"learning_rate": 1.8840579710144928e-05, |
|
"loss": 0.3695, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.8804347826086958, |
|
"grad_norm": 1.081115484237671, |
|
"learning_rate": 1.8659420289855072e-05, |
|
"loss": 0.3549, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.891304347826087, |
|
"grad_norm": 1.1276469230651855, |
|
"learning_rate": 1.8478260869565216e-05, |
|
"loss": 0.364, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.9021739130434783, |
|
"grad_norm": 1.0103682279586792, |
|
"learning_rate": 1.8297101449275363e-05, |
|
"loss": 0.3781, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.9021739130434783, |
|
"eval_loss": 0.5211134552955627, |
|
"eval_runtime": 10.7922, |
|
"eval_samples_per_second": 44.477, |
|
"eval_steps_per_second": 2.78, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.9130434782608696, |
|
"grad_norm": 0.9383260011672974, |
|
"learning_rate": 1.8115942028985507e-05, |
|
"loss": 0.3469, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.9239130434782608, |
|
"grad_norm": 1.095609188079834, |
|
"learning_rate": 1.793478260869565e-05, |
|
"loss": 0.3747, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.9347826086956523, |
|
"grad_norm": 1.0580706596374512, |
|
"learning_rate": 1.7753623188405798e-05, |
|
"loss": 0.3751, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.9456521739130435, |
|
"grad_norm": 0.9539552927017212, |
|
"learning_rate": 1.757246376811594e-05, |
|
"loss": 0.3638, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.9565217391304348, |
|
"grad_norm": 1.0025752782821655, |
|
"learning_rate": 1.739130434782609e-05, |
|
"loss": 0.3851, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.9565217391304348, |
|
"eval_loss": 0.516950249671936, |
|
"eval_runtime": 10.7904, |
|
"eval_samples_per_second": 44.484, |
|
"eval_steps_per_second": 2.78, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.9673913043478262, |
|
"grad_norm": 1.0852687358856201, |
|
"learning_rate": 1.7210144927536233e-05, |
|
"loss": 0.3472, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.9782608695652173, |
|
"grad_norm": 1.1232205629348755, |
|
"learning_rate": 1.702898550724638e-05, |
|
"loss": 0.3591, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.9891304347826086, |
|
"grad_norm": 1.0539883375167847, |
|
"learning_rate": 1.6847826086956524e-05, |
|
"loss": 0.3657, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.387427806854248, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.393, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.010869565217391, |
|
"grad_norm": 1.0311148166656494, |
|
"learning_rate": 1.6485507246376815e-05, |
|
"loss": 0.3364, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.010869565217391, |
|
"eval_loss": 0.5237263441085815, |
|
"eval_runtime": 10.8116, |
|
"eval_samples_per_second": 44.397, |
|
"eval_steps_per_second": 2.775, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.0217391304347827, |
|
"grad_norm": 1.2049050331115723, |
|
"learning_rate": 1.630434782608696e-05, |
|
"loss": 0.336, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.032608695652174, |
|
"grad_norm": 1.0490036010742188, |
|
"learning_rate": 1.6123188405797102e-05, |
|
"loss": 0.3243, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 2.0434782608695654, |
|
"grad_norm": 1.131861686706543, |
|
"learning_rate": 1.5942028985507246e-05, |
|
"loss": 0.3311, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 2.0543478260869565, |
|
"grad_norm": 1.0843744277954102, |
|
"learning_rate": 1.5760869565217393e-05, |
|
"loss": 0.3107, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 2.0652173913043477, |
|
"grad_norm": 1.1070170402526855, |
|
"learning_rate": 1.5579710144927537e-05, |
|
"loss": 0.3357, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.0652173913043477, |
|
"eval_loss": 0.5324747562408447, |
|
"eval_runtime": 10.8165, |
|
"eval_samples_per_second": 44.377, |
|
"eval_steps_per_second": 2.774, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.0760869565217392, |
|
"grad_norm": 1.1747304201126099, |
|
"learning_rate": 1.539855072463768e-05, |
|
"loss": 0.3301, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 2.0869565217391304, |
|
"grad_norm": 1.151397943496704, |
|
"learning_rate": 1.5217391304347828e-05, |
|
"loss": 0.3313, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 2.097826086956522, |
|
"grad_norm": 1.1628749370574951, |
|
"learning_rate": 1.5036231884057972e-05, |
|
"loss": 0.322, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 2.108695652173913, |
|
"grad_norm": 1.1999317407608032, |
|
"learning_rate": 1.4855072463768116e-05, |
|
"loss": 0.3174, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 2.119565217391304, |
|
"grad_norm": 1.1956512928009033, |
|
"learning_rate": 1.4673913043478263e-05, |
|
"loss": 0.3281, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.119565217391304, |
|
"eval_loss": 0.5320965647697449, |
|
"eval_runtime": 10.8261, |
|
"eval_samples_per_second": 44.337, |
|
"eval_steps_per_second": 2.771, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.130434782608696, |
|
"grad_norm": 1.0482760667800903, |
|
"learning_rate": 1.4492753623188407e-05, |
|
"loss": 0.3193, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 2.141304347826087, |
|
"grad_norm": 1.72954261302948, |
|
"learning_rate": 1.431159420289855e-05, |
|
"loss": 0.3418, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 2.1521739130434785, |
|
"grad_norm": 1.2846759557724, |
|
"learning_rate": 1.4130434782608694e-05, |
|
"loss": 0.3274, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 2.1630434782608696, |
|
"grad_norm": 1.2016099691390991, |
|
"learning_rate": 1.3949275362318842e-05, |
|
"loss": 0.3305, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 2.1739130434782608, |
|
"grad_norm": 1.1600474119186401, |
|
"learning_rate": 1.3768115942028985e-05, |
|
"loss": 0.3424, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.1739130434782608, |
|
"eval_loss": 0.5298786163330078, |
|
"eval_runtime": 10.8329, |
|
"eval_samples_per_second": 44.31, |
|
"eval_steps_per_second": 2.769, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.1847826086956523, |
|
"grad_norm": 1.2170861959457397, |
|
"learning_rate": 1.3586956521739131e-05, |
|
"loss": 0.3024, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 2.1956521739130435, |
|
"grad_norm": 1.1393592357635498, |
|
"learning_rate": 1.3405797101449276e-05, |
|
"loss": 0.3211, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 2.2065217391304346, |
|
"grad_norm": 1.1940656900405884, |
|
"learning_rate": 1.3224637681159422e-05, |
|
"loss": 0.3157, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 2.217391304347826, |
|
"grad_norm": 1.2777471542358398, |
|
"learning_rate": 1.3043478260869566e-05, |
|
"loss": 0.3382, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 2.2282608695652173, |
|
"grad_norm": 1.1014587879180908, |
|
"learning_rate": 1.286231884057971e-05, |
|
"loss": 0.3198, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.2282608695652173, |
|
"eval_loss": 0.5298696160316467, |
|
"eval_runtime": 10.844, |
|
"eval_samples_per_second": 44.264, |
|
"eval_steps_per_second": 2.767, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.239130434782609, |
|
"grad_norm": 1.056643009185791, |
|
"learning_rate": 1.2681159420289857e-05, |
|
"loss": 0.317, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 1.1301910877227783, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.3266, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 2.260869565217391, |
|
"grad_norm": 1.2433290481567383, |
|
"learning_rate": 1.2318840579710146e-05, |
|
"loss": 0.3313, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 2.2717391304347827, |
|
"grad_norm": 1.2331655025482178, |
|
"learning_rate": 1.213768115942029e-05, |
|
"loss": 0.3353, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 2.282608695652174, |
|
"grad_norm": 1.2137727737426758, |
|
"learning_rate": 1.1956521739130435e-05, |
|
"loss": 0.3183, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.282608695652174, |
|
"eval_loss": 0.5301353335380554, |
|
"eval_runtime": 10.829, |
|
"eval_samples_per_second": 44.326, |
|
"eval_steps_per_second": 2.77, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.2934782608695654, |
|
"grad_norm": 1.0960383415222168, |
|
"learning_rate": 1.177536231884058e-05, |
|
"loss": 0.3083, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 2.3043478260869565, |
|
"grad_norm": 1.2978132963180542, |
|
"learning_rate": 1.1594202898550725e-05, |
|
"loss": 0.3218, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 2.3152173913043477, |
|
"grad_norm": 1.1214746236801147, |
|
"learning_rate": 1.141304347826087e-05, |
|
"loss": 0.3334, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 2.3260869565217392, |
|
"grad_norm": 1.2013393640518188, |
|
"learning_rate": 1.1231884057971016e-05, |
|
"loss": 0.3017, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 2.3369565217391304, |
|
"grad_norm": 1.2782713174819946, |
|
"learning_rate": 1.1050724637681161e-05, |
|
"loss": 0.3272, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.3369565217391304, |
|
"eval_loss": 0.5313724279403687, |
|
"eval_runtime": 10.8265, |
|
"eval_samples_per_second": 44.336, |
|
"eval_steps_per_second": 2.771, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.3478260869565215, |
|
"grad_norm": 1.1971640586853027, |
|
"learning_rate": 1.0869565217391305e-05, |
|
"loss": 0.3295, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 2.358695652173913, |
|
"grad_norm": 1.1936053037643433, |
|
"learning_rate": 1.068840579710145e-05, |
|
"loss": 0.3132, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 2.369565217391304, |
|
"grad_norm": 1.0900537967681885, |
|
"learning_rate": 1.0507246376811594e-05, |
|
"loss": 0.3212, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 2.380434782608696, |
|
"grad_norm": 1.4006975889205933, |
|
"learning_rate": 1.032608695652174e-05, |
|
"loss": 0.3258, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 2.391304347826087, |
|
"grad_norm": 1.3683350086212158, |
|
"learning_rate": 1.0144927536231885e-05, |
|
"loss": 0.3376, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.391304347826087, |
|
"eval_loss": 0.5328701734542847, |
|
"eval_runtime": 10.8097, |
|
"eval_samples_per_second": 44.404, |
|
"eval_steps_per_second": 2.775, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.4021739130434785, |
|
"grad_norm": 1.120539903640747, |
|
"learning_rate": 9.96376811594203e-06, |
|
"loss": 0.3132, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 2.4130434782608696, |
|
"grad_norm": 1.1731290817260742, |
|
"learning_rate": 9.782608695652175e-06, |
|
"loss": 0.329, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 2.4239130434782608, |
|
"grad_norm": 1.2622851133346558, |
|
"learning_rate": 9.601449275362319e-06, |
|
"loss": 0.3181, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 2.4347826086956523, |
|
"grad_norm": 1.2982534170150757, |
|
"learning_rate": 9.420289855072464e-06, |
|
"loss": 0.3177, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 2.4456521739130435, |
|
"grad_norm": 1.2450315952301025, |
|
"learning_rate": 9.239130434782608e-06, |
|
"loss": 0.3361, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.4456521739130435, |
|
"eval_loss": 0.5312691330909729, |
|
"eval_runtime": 10.7859, |
|
"eval_samples_per_second": 44.502, |
|
"eval_steps_per_second": 2.781, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.4565217391304346, |
|
"grad_norm": 1.2270658016204834, |
|
"learning_rate": 9.057971014492753e-06, |
|
"loss": 0.3392, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 2.467391304347826, |
|
"grad_norm": 1.313056468963623, |
|
"learning_rate": 8.876811594202899e-06, |
|
"loss": 0.3065, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 2.4782608695652173, |
|
"grad_norm": 1.255582332611084, |
|
"learning_rate": 8.695652173913044e-06, |
|
"loss": 0.3224, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 2.489130434782609, |
|
"grad_norm": 1.354356050491333, |
|
"learning_rate": 8.51449275362319e-06, |
|
"loss": 0.3307, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 1.2218225002288818, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 0.3146, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_loss": 0.5301942825317383, |
|
"eval_runtime": 10.778, |
|
"eval_samples_per_second": 44.535, |
|
"eval_steps_per_second": 2.783, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.5108695652173916, |
|
"grad_norm": 1.2889074087142944, |
|
"learning_rate": 8.15217391304348e-06, |
|
"loss": 0.3135, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 2.5217391304347827, |
|
"grad_norm": 1.1224515438079834, |
|
"learning_rate": 7.971014492753623e-06, |
|
"loss": 0.3227, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 2.532608695652174, |
|
"grad_norm": 1.1914352178573608, |
|
"learning_rate": 7.789855072463769e-06, |
|
"loss": 0.3138, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 2.5434782608695654, |
|
"grad_norm": 1.3238486051559448, |
|
"learning_rate": 7.608695652173914e-06, |
|
"loss": 0.3168, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 2.5543478260869565, |
|
"grad_norm": 1.3031419515609741, |
|
"learning_rate": 7.427536231884058e-06, |
|
"loss": 0.3253, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.5543478260869565, |
|
"eval_loss": 0.5280157327651978, |
|
"eval_runtime": 10.8003, |
|
"eval_samples_per_second": 44.443, |
|
"eval_steps_per_second": 2.778, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.5652173913043477, |
|
"grad_norm": 1.2570607662200928, |
|
"learning_rate": 7.246376811594203e-06, |
|
"loss": 0.3246, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 2.5760869565217392, |
|
"grad_norm": 1.1971008777618408, |
|
"learning_rate": 7.065217391304347e-06, |
|
"loss": 0.3142, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 2.5869565217391304, |
|
"grad_norm": 1.2740592956542969, |
|
"learning_rate": 6.884057971014493e-06, |
|
"loss": 0.3263, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 2.5978260869565215, |
|
"grad_norm": 1.3141279220581055, |
|
"learning_rate": 6.702898550724638e-06, |
|
"loss": 0.3141, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 2.608695652173913, |
|
"grad_norm": 1.3131691217422485, |
|
"learning_rate": 6.521739130434783e-06, |
|
"loss": 0.3204, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.608695652173913, |
|
"eval_loss": 0.529800295829773, |
|
"eval_runtime": 10.7985, |
|
"eval_samples_per_second": 44.451, |
|
"eval_steps_per_second": 2.778, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.619565217391304, |
|
"grad_norm": 1.2273128032684326, |
|
"learning_rate": 6.340579710144928e-06, |
|
"loss": 0.3089, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 2.630434782608696, |
|
"grad_norm": 1.304915428161621, |
|
"learning_rate": 6.159420289855073e-06, |
|
"loss": 0.3211, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 2.641304347826087, |
|
"grad_norm": 1.261481761932373, |
|
"learning_rate": 5.978260869565218e-06, |
|
"loss": 0.3178, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 2.6521739130434785, |
|
"grad_norm": 1.2726097106933594, |
|
"learning_rate": 5.797101449275362e-06, |
|
"loss": 0.3342, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 2.6630434782608696, |
|
"grad_norm": 1.3216400146484375, |
|
"learning_rate": 5.615942028985508e-06, |
|
"loss": 0.3245, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.6630434782608696, |
|
"eval_loss": 0.5288810133934021, |
|
"eval_runtime": 10.8043, |
|
"eval_samples_per_second": 44.427, |
|
"eval_steps_per_second": 2.777, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.6739130434782608, |
|
"grad_norm": 1.2873594760894775, |
|
"learning_rate": 5.4347826086956525e-06, |
|
"loss": 0.3137, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 2.6847826086956523, |
|
"grad_norm": 1.2318347692489624, |
|
"learning_rate": 5.253623188405797e-06, |
|
"loss": 0.3258, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 2.6956521739130435, |
|
"grad_norm": 1.2132396697998047, |
|
"learning_rate": 5.072463768115943e-06, |
|
"loss": 0.3168, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 2.7065217391304346, |
|
"grad_norm": 1.3572018146514893, |
|
"learning_rate": 4.891304347826087e-06, |
|
"loss": 0.3187, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 2.717391304347826, |
|
"grad_norm": 1.2623666524887085, |
|
"learning_rate": 4.710144927536232e-06, |
|
"loss": 0.3257, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.717391304347826, |
|
"eval_loss": 0.5298188924789429, |
|
"eval_runtime": 10.8163, |
|
"eval_samples_per_second": 44.377, |
|
"eval_steps_per_second": 2.774, |
|
"step": 2500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2760, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.2027066610132582e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|