{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.070063694267516, "eval_steps": 500, "global_step": 1300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01592356687898089, "grad_norm": 5.938978672027588, "learning_rate": 1.0582010582010582e-06, "loss": 0.5588, "step": 10 }, { "epoch": 0.03184713375796178, "grad_norm": 2.5084874629974365, "learning_rate": 2.1164021164021164e-06, "loss": 0.4422, "step": 20 }, { "epoch": 0.04777070063694268, "grad_norm": 0.9098156690597534, "learning_rate": 3.1746031746031746e-06, "loss": 0.3842, "step": 30 }, { "epoch": 0.06369426751592357, "grad_norm": 0.7925612330436707, "learning_rate": 4.232804232804233e-06, "loss": 0.3481, "step": 40 }, { "epoch": 0.07961783439490445, "grad_norm": 0.7580339312553406, "learning_rate": 5.291005291005291e-06, "loss": 0.3313, "step": 50 }, { "epoch": 0.09554140127388536, "grad_norm": 1.1035263538360596, "learning_rate": 6.349206349206349e-06, "loss": 0.3168, "step": 60 }, { "epoch": 0.11146496815286625, "grad_norm": 0.576166033744812, "learning_rate": 7.4074074074074075e-06, "loss": 0.3109, "step": 70 }, { "epoch": 0.12738853503184713, "grad_norm": 0.6099635362625122, "learning_rate": 8.465608465608466e-06, "loss": 0.3068, "step": 80 }, { "epoch": 0.14331210191082802, "grad_norm": 0.7660859823226929, "learning_rate": 9.523809523809525e-06, "loss": 0.302, "step": 90 }, { "epoch": 0.1592356687898089, "grad_norm": 1.0273711681365967, "learning_rate": 1.0582010582010582e-05, "loss": 0.3053, "step": 100 }, { "epoch": 0.1751592356687898, "grad_norm": 0.7934479713439941, "learning_rate": 1.1640211640211641e-05, "loss": 0.3055, "step": 110 }, { "epoch": 0.1910828025477707, "grad_norm": 1.4802435636520386, "learning_rate": 1.2698412698412699e-05, "loss": 0.3012, "step": 120 }, { "epoch": 0.2070063694267516, "grad_norm": 1.150119423866272, "learning_rate": 1.3756613756613758e-05, "loss": 0.3041, "step": 130 }, { "epoch": 0.2229299363057325, "grad_norm": 1.2672383785247803, "learning_rate": 1.4814814814814815e-05, "loss": 0.3048, "step": 140 }, { "epoch": 0.23885350318471338, "grad_norm": 0.7314700484275818, "learning_rate": 1.5873015873015872e-05, "loss": 0.2997, "step": 150 }, { "epoch": 0.25477707006369427, "grad_norm": 1.1569265127182007, "learning_rate": 1.693121693121693e-05, "loss": 0.3017, "step": 160 }, { "epoch": 0.27070063694267515, "grad_norm": 0.8508642911911011, "learning_rate": 1.798941798941799e-05, "loss": 0.2936, "step": 170 }, { "epoch": 0.28662420382165604, "grad_norm": 0.8888178467750549, "learning_rate": 1.904761904761905e-05, "loss": 0.2959, "step": 180 }, { "epoch": 0.30254777070063693, "grad_norm": 0.9090297818183899, "learning_rate": 1.9999982823676135e-05, "loss": 0.293, "step": 190 }, { "epoch": 0.3184713375796178, "grad_norm": 1.0700126886367798, "learning_rate": 1.9997921736207435e-05, "loss": 0.2927, "step": 200 }, { "epoch": 0.3343949044585987, "grad_norm": 1.2329490184783936, "learning_rate": 1.9992426195241246e-05, "loss": 0.2935, "step": 210 }, { "epoch": 0.3503184713375796, "grad_norm": 0.6014856100082397, "learning_rate": 1.9983498088587904e-05, "loss": 0.2875, "step": 220 }, { "epoch": 0.3662420382165605, "grad_norm": 0.5173149108886719, "learning_rate": 1.9971140483201507e-05, "loss": 0.2865, "step": 230 }, { "epoch": 0.3821656050955414, "grad_norm": 0.4913546144962311, "learning_rate": 1.995535762412639e-05, "loss": 0.2842, "step": 240 }, { "epoch": 0.3980891719745223, "grad_norm": 0.6859982013702393, "learning_rate": 1.9936154933038882e-05, "loss": 0.2825, "step": 250 }, { "epoch": 0.4140127388535032, "grad_norm": 0.836473286151886, "learning_rate": 1.991353900638485e-05, "loss": 0.2807, "step": 260 }, { "epoch": 0.4299363057324841, "grad_norm": 0.7169066667556763, "learning_rate": 1.988751761311373e-05, "loss": 0.2776, "step": 270 }, { "epoch": 0.445859872611465, "grad_norm": 0.7342332601547241, "learning_rate": 1.9858099692009746e-05, "loss": 0.2768, "step": 280 }, { "epoch": 0.46178343949044587, "grad_norm": 0.5076990723609924, "learning_rate": 1.9825295348621303e-05, "loss": 0.2736, "step": 290 }, { "epoch": 0.47770700636942676, "grad_norm": 0.4993409514427185, "learning_rate": 1.978911585178955e-05, "loss": 0.2687, "step": 300 }, { "epoch": 0.49363057324840764, "grad_norm": 0.8263068795204163, "learning_rate": 1.974957362977735e-05, "loss": 0.2698, "step": 310 }, { "epoch": 0.5095541401273885, "grad_norm": 0.5373275279998779, "learning_rate": 1.970668226599997e-05, "loss": 0.2669, "step": 320 }, { "epoch": 0.5254777070063694, "grad_norm": 0.7009662985801697, "learning_rate": 1.966045649435893e-05, "loss": 0.2644, "step": 330 }, { "epoch": 0.5414012738853503, "grad_norm": 0.7370754480361938, "learning_rate": 1.9610912194180685e-05, "loss": 0.264, "step": 340 }, { "epoch": 0.5573248407643312, "grad_norm": 0.5792177319526672, "learning_rate": 1.9558066384761794e-05, "loss": 0.259, "step": 350 }, { "epoch": 0.5732484076433121, "grad_norm": 0.7904641032218933, "learning_rate": 1.950193721952252e-05, "loss": 0.259, "step": 360 }, { "epoch": 0.589171974522293, "grad_norm": 0.57305908203125, "learning_rate": 1.944254397977081e-05, "loss": 0.2527, "step": 370 }, { "epoch": 0.6050955414012739, "grad_norm": 0.6305221319198608, "learning_rate": 1.9379907068078857e-05, "loss": 0.2516, "step": 380 }, { "epoch": 0.6210191082802548, "grad_norm": 0.5892254710197449, "learning_rate": 1.931404800127447e-05, "loss": 0.252, "step": 390 }, { "epoch": 0.6369426751592356, "grad_norm": 0.5180689692497253, "learning_rate": 1.9244989403049677e-05, "loss": 0.2459, "step": 400 }, { "epoch": 0.6528662420382165, "grad_norm": 0.5340262055397034, "learning_rate": 1.9172754996189125e-05, "loss": 0.2427, "step": 410 }, { "epoch": 0.6687898089171974, "grad_norm": 0.6435000896453857, "learning_rate": 1.9097369594420888e-05, "loss": 0.2387, "step": 420 }, { "epoch": 0.6847133757961783, "grad_norm": 0.9354584217071533, "learning_rate": 1.9018859093892538e-05, "loss": 0.2366, "step": 430 }, { "epoch": 0.7006369426751592, "grad_norm": 0.7087671160697937, "learning_rate": 1.8937250464275407e-05, "loss": 0.2315, "step": 440 }, { "epoch": 0.7165605095541401, "grad_norm": 1.0769435167312622, "learning_rate": 1.8852571739500025e-05, "loss": 0.2279, "step": 450 }, { "epoch": 0.732484076433121, "grad_norm": 1.0389244556427002, "learning_rate": 1.8764852008126e-05, "loss": 0.2259, "step": 460 }, { "epoch": 0.7484076433121019, "grad_norm": 0.6885454058647156, "learning_rate": 1.8674121403349634e-05, "loss": 0.2199, "step": 470 }, { "epoch": 0.7643312101910829, "grad_norm": 1.4660230875015259, "learning_rate": 1.858041109265264e-05, "loss": 0.215, "step": 480 }, { "epoch": 0.7802547770700637, "grad_norm": 0.8413995504379272, "learning_rate": 1.8483753267095606e-05, "loss": 0.2134, "step": 490 }, { "epoch": 0.7961783439490446, "grad_norm": 0.687028169631958, "learning_rate": 1.8384181130259814e-05, "loss": 0.207, "step": 500 }, { "epoch": 0.8121019108280255, "grad_norm": 0.7690938115119934, "learning_rate": 1.8281728886841258e-05, "loss": 0.2009, "step": 510 }, { "epoch": 0.8280254777070064, "grad_norm": 0.882749080657959, "learning_rate": 1.8176431730900772e-05, "loss": 0.1951, "step": 520 }, { "epoch": 0.8439490445859873, "grad_norm": 0.6659705638885498, "learning_rate": 1.8068325833774262e-05, "loss": 0.194, "step": 530 }, { "epoch": 0.8598726114649682, "grad_norm": 0.7047539949417114, "learning_rate": 1.7957448331647253e-05, "loss": 0.1857, "step": 540 }, { "epoch": 0.8757961783439491, "grad_norm": 0.7337578535079956, "learning_rate": 1.784383731279799e-05, "loss": 0.1829, "step": 550 }, { "epoch": 0.89171974522293, "grad_norm": 0.8769313097000122, "learning_rate": 1.7727531804513483e-05, "loss": 0.1746, "step": 560 }, { "epoch": 0.9076433121019108, "grad_norm": 0.7639766335487366, "learning_rate": 1.7608571759682963e-05, "loss": 0.1736, "step": 570 }, { "epoch": 0.9235668789808917, "grad_norm": 0.8756518363952637, "learning_rate": 1.748699804307341e-05, "loss": 0.1675, "step": 580 }, { "epoch": 0.9394904458598726, "grad_norm": 0.7969843745231628, "learning_rate": 1.7362852417291813e-05, "loss": 0.1645, "step": 590 }, { "epoch": 0.9554140127388535, "grad_norm": 1.0977206230163574, "learning_rate": 1.7236177528439027e-05, "loss": 0.1581, "step": 600 }, { "epoch": 0.9713375796178344, "grad_norm": 1.1576684713363647, "learning_rate": 1.7107016891460118e-05, "loss": 0.1563, "step": 610 }, { "epoch": 0.9872611464968153, "grad_norm": 0.8371659517288208, "learning_rate": 1.697541487519623e-05, "loss": 0.1475, "step": 620 }, { "epoch": 1.0031847133757963, "grad_norm": 0.7080691456794739, "learning_rate": 1.684141668714318e-05, "loss": 0.1397, "step": 630 }, { "epoch": 1.019108280254777, "grad_norm": 0.9583333730697632, "learning_rate": 1.6705068357921913e-05, "loss": 0.1137, "step": 640 }, { "epoch": 1.035031847133758, "grad_norm": 0.6917926073074341, "learning_rate": 1.6566416725466198e-05, "loss": 0.1108, "step": 650 }, { "epoch": 1.0509554140127388, "grad_norm": 0.7164482474327087, "learning_rate": 1.6425509418933038e-05, "loss": 0.1106, "step": 660 }, { "epoch": 1.0668789808917198, "grad_norm": 0.6723839044570923, "learning_rate": 1.6282394842341234e-05, "loss": 0.1068, "step": 670 }, { "epoch": 1.0828025477707006, "grad_norm": 0.7010757327079773, "learning_rate": 1.613712215794381e-05, "loss": 0.1034, "step": 680 }, { "epoch": 1.0987261146496816, "grad_norm": 0.7355757355690002, "learning_rate": 1.5989741269339956e-05, "loss": 0.1011, "step": 690 }, { "epoch": 1.1146496815286624, "grad_norm": 0.720433235168457, "learning_rate": 1.5840302804332295e-05, "loss": 0.1001, "step": 700 }, { "epoch": 1.1305732484076434, "grad_norm": 0.7248522639274597, "learning_rate": 1.5688858097535403e-05, "loss": 0.0948, "step": 710 }, { "epoch": 1.1464968152866242, "grad_norm": 0.7236132025718689, "learning_rate": 1.5535459172741475e-05, "loss": 0.0906, "step": 720 }, { "epoch": 1.1624203821656052, "grad_norm": 0.6846908330917358, "learning_rate": 1.538015872504933e-05, "loss": 0.0881, "step": 730 }, { "epoch": 1.178343949044586, "grad_norm": 0.6623250246047974, "learning_rate": 1.5223010102762725e-05, "loss": 0.0848, "step": 740 }, { "epoch": 1.194267515923567, "grad_norm": 0.807305097579956, "learning_rate": 1.5064067289064332e-05, "loss": 0.0819, "step": 750 }, { "epoch": 1.2101910828025477, "grad_norm": 0.6570079326629639, "learning_rate": 1.4903384883471608e-05, "loss": 0.0821, "step": 760 }, { "epoch": 1.2261146496815287, "grad_norm": 0.6224232912063599, "learning_rate": 1.474101808308096e-05, "loss": 0.0781, "step": 770 }, { "epoch": 1.2420382165605095, "grad_norm": 0.6488759517669678, "learning_rate": 1.4577022663606592e-05, "loss": 0.0756, "step": 780 }, { "epoch": 1.2579617834394905, "grad_norm": 0.6796790957450867, "learning_rate": 1.4411454960220647e-05, "loss": 0.0739, "step": 790 }, { "epoch": 1.2738853503184713, "grad_norm": 0.6596919298171997, "learning_rate": 1.42443718482011e-05, "loss": 0.0691, "step": 800 }, { "epoch": 1.2898089171974523, "grad_norm": 0.6160626411437988, "learning_rate": 1.4075830723394164e-05, "loss": 0.0676, "step": 810 }, { "epoch": 1.305732484076433, "grad_norm": 0.5704523324966431, "learning_rate": 1.3905889482497858e-05, "loss": 0.0647, "step": 820 }, { "epoch": 1.321656050955414, "grad_norm": 0.5561860203742981, "learning_rate": 1.3734606503173522e-05, "loss": 0.0623, "step": 830 }, { "epoch": 1.3375796178343948, "grad_norm": 0.681788444519043, "learning_rate": 1.3562040623992092e-05, "loss": 0.0611, "step": 840 }, { "epoch": 1.3535031847133758, "grad_norm": 0.6568078994750977, "learning_rate": 1.3388251124222104e-05, "loss": 0.0588, "step": 850 }, { "epoch": 1.3694267515923566, "grad_norm": 0.5263776183128357, "learning_rate": 1.3213297703466237e-05, "loss": 0.0564, "step": 860 }, { "epoch": 1.3853503184713376, "grad_norm": 0.5988286733627319, "learning_rate": 1.303724046115352e-05, "loss": 0.0543, "step": 870 }, { "epoch": 1.4012738853503186, "grad_norm": 0.5688588619232178, "learning_rate": 1.2860139875894163e-05, "loss": 0.0574, "step": 880 }, { "epoch": 1.4171974522292994, "grad_norm": 0.6649556159973145, "learning_rate": 1.2682056784704151e-05, "loss": 0.0529, "step": 890 }, { "epoch": 1.4331210191082802, "grad_norm": 0.5984866619110107, "learning_rate": 1.2503052362106698e-05, "loss": 0.0517, "step": 900 }, { "epoch": 1.4490445859872612, "grad_norm": 0.5580580234527588, "learning_rate": 1.2323188099117791e-05, "loss": 0.0474, "step": 910 }, { "epoch": 1.4649681528662422, "grad_norm": 0.5219546556472778, "learning_rate": 1.2142525782122974e-05, "loss": 0.0464, "step": 920 }, { "epoch": 1.480891719745223, "grad_norm": 0.5171259641647339, "learning_rate": 1.196112747165271e-05, "loss": 0.0444, "step": 930 }, { "epoch": 1.4968152866242037, "grad_norm": 0.5057135224342346, "learning_rate": 1.1779055481063545e-05, "loss": 0.042, "step": 940 }, { "epoch": 1.5127388535031847, "grad_norm": 0.5676354765892029, "learning_rate": 1.1596372355132422e-05, "loss": 0.0405, "step": 950 }, { "epoch": 1.5286624203821657, "grad_norm": 0.5014776587486267, "learning_rate": 1.1413140848571495e-05, "loss": 0.0409, "step": 960 }, { "epoch": 1.5445859872611465, "grad_norm": 0.4629918038845062, "learning_rate": 1.1229423904470864e-05, "loss": 0.0405, "step": 970 }, { "epoch": 1.5605095541401273, "grad_norm": 0.562461256980896, "learning_rate": 1.1045284632676535e-05, "loss": 0.0401, "step": 980 }, { "epoch": 1.5764331210191083, "grad_norm": 0.5296617150306702, "learning_rate": 1.0860786288111158e-05, "loss": 0.0386, "step": 990 }, { "epoch": 1.5923566878980893, "grad_norm": 0.4665983021259308, "learning_rate": 1.0675992249044882e-05, "loss": 0.0352, "step": 1000 }, { "epoch": 1.60828025477707, "grad_norm": 0.5495542287826538, "learning_rate": 1.0490965995323879e-05, "loss": 0.0339, "step": 1010 }, { "epoch": 1.6242038216560508, "grad_norm": 0.47773268818855286, "learning_rate": 1.0305771086563927e-05, "loss": 0.032, "step": 1020 }, { "epoch": 1.6401273885350318, "grad_norm": 0.43807661533355713, "learning_rate": 1.0120471140316647e-05, "loss": 0.0346, "step": 1030 }, { "epoch": 1.6560509554140128, "grad_norm": 0.4941099286079407, "learning_rate": 9.935129810215793e-06, "loss": 0.0329, "step": 1040 }, { "epoch": 1.6719745222929936, "grad_norm": 0.4290195405483246, "learning_rate": 9.749810764111156e-06, "loss": 0.0317, "step": 1050 }, { "epoch": 1.6878980891719744, "grad_norm": 0.4538203477859497, "learning_rate": 9.564577662197612e-06, "loss": 0.0325, "step": 1060 }, { "epoch": 1.7038216560509554, "grad_norm": 0.3967364430427551, "learning_rate": 9.379494135146769e-06, "loss": 0.0307, "step": 1070 }, { "epoch": 1.7197452229299364, "grad_norm": 0.4934009611606598, "learning_rate": 9.194623762248755e-06, "loss": 0.0309, "step": 1080 }, { "epoch": 1.7356687898089171, "grad_norm": 0.4300677478313446, "learning_rate": 9.010030049571686e-06, "loss": 0.0298, "step": 1090 }, { "epoch": 1.7515923566878981, "grad_norm": 0.42275470495224, "learning_rate": 8.825776408146262e-06, "loss": 0.0289, "step": 1100 }, { "epoch": 1.767515923566879, "grad_norm": 0.43387818336486816, "learning_rate": 8.641926132183005e-06, "loss": 0.0275, "step": 1110 }, { "epoch": 1.78343949044586, "grad_norm": 0.4539569616317749, "learning_rate": 8.458542377329661e-06, "loss": 0.0267, "step": 1120 }, { "epoch": 1.799363057324841, "grad_norm": 0.4254033863544464, "learning_rate": 8.275688138976151e-06, "loss": 0.0257, "step": 1130 }, { "epoch": 1.8152866242038217, "grad_norm": 0.3466563820838928, "learning_rate": 8.093426230614627e-06, "loss": 0.0256, "step": 1140 }, { "epoch": 1.8312101910828025, "grad_norm": 0.46024608612060547, "learning_rate": 7.911819262261961e-06, "loss": 0.0253, "step": 1150 }, { "epoch": 1.8471337579617835, "grad_norm": 0.3775930404663086, "learning_rate": 7.73092961895217e-06, "loss": 0.0249, "step": 1160 }, { "epoch": 1.8630573248407645, "grad_norm": 0.4123600125312805, "learning_rate": 7.550819439306113e-06, "loss": 0.0245, "step": 1170 }, { "epoch": 1.8789808917197452, "grad_norm": 0.3415584862232208, "learning_rate": 7.37155059418583e-06, "loss": 0.0234, "step": 1180 }, { "epoch": 1.894904458598726, "grad_norm": 0.33297714591026306, "learning_rate": 7.193184665440865e-06, "loss": 0.0209, "step": 1190 }, { "epoch": 1.910828025477707, "grad_norm": 0.39130252599716187, "learning_rate": 7.015782924753871e-06, "loss": 0.0228, "step": 1200 }, { "epoch": 1.926751592356688, "grad_norm": 0.32679814100265503, "learning_rate": 6.839406312592755e-06, "loss": 0.0206, "step": 1210 }, { "epoch": 1.9426751592356688, "grad_norm": 0.40562936663627625, "learning_rate": 6.664115417276629e-06, "loss": 0.0202, "step": 1220 }, { "epoch": 1.9585987261146496, "grad_norm": 0.4119989275932312, "learning_rate": 6.489970454162677e-06, "loss": 0.0181, "step": 1230 }, { "epoch": 1.9745222929936306, "grad_norm": 0.31588155031204224, "learning_rate": 6.317031244961193e-06, "loss": 0.0186, "step": 1240 }, { "epoch": 1.9904458598726116, "grad_norm": 0.3445233702659607, "learning_rate": 6.1453571971858095e-06, "loss": 0.0189, "step": 1250 }, { "epoch": 2.0063694267515926, "grad_norm": 0.18995656073093414, "learning_rate": 5.9750072837460436e-06, "loss": 0.0134, "step": 1260 }, { "epoch": 2.022292993630573, "grad_norm": 0.21353577077388763, "learning_rate": 5.806040022689083e-06, "loss": 0.0067, "step": 1270 }, { "epoch": 2.038216560509554, "grad_norm": 0.2473672330379486, "learning_rate": 5.638513457097887e-06, "loss": 0.0067, "step": 1280 }, { "epoch": 2.054140127388535, "grad_norm": 0.25051358342170715, "learning_rate": 5.472485135152391e-06, "loss": 0.0063, "step": 1290 }, { "epoch": 2.070063694267516, "grad_norm": 0.2428949475288391, "learning_rate": 5.3080120903607635e-06, "loss": 0.0061, "step": 1300 } ], "logging_steps": 10, "max_steps": 1884, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3672926132234617e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }