{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1884, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01592356687898089, "grad_norm": 5.938978672027588, "learning_rate": 1.0582010582010582e-06, "loss": 0.5588, "step": 10 }, { "epoch": 0.03184713375796178, "grad_norm": 2.5084874629974365, "learning_rate": 2.1164021164021164e-06, "loss": 0.4422, "step": 20 }, { "epoch": 0.04777070063694268, "grad_norm": 0.9098156690597534, "learning_rate": 3.1746031746031746e-06, "loss": 0.3842, "step": 30 }, { "epoch": 0.06369426751592357, "grad_norm": 0.7925612330436707, "learning_rate": 4.232804232804233e-06, "loss": 0.3481, "step": 40 }, { "epoch": 0.07961783439490445, "grad_norm": 0.7580339312553406, "learning_rate": 5.291005291005291e-06, "loss": 0.3313, "step": 50 }, { "epoch": 0.09554140127388536, "grad_norm": 1.1035263538360596, "learning_rate": 6.349206349206349e-06, "loss": 0.3168, "step": 60 }, { "epoch": 0.11146496815286625, "grad_norm": 0.576166033744812, "learning_rate": 7.4074074074074075e-06, "loss": 0.3109, "step": 70 }, { "epoch": 0.12738853503184713, "grad_norm": 0.6099635362625122, "learning_rate": 8.465608465608466e-06, "loss": 0.3068, "step": 80 }, { "epoch": 0.14331210191082802, "grad_norm": 0.7660859823226929, "learning_rate": 9.523809523809525e-06, "loss": 0.302, "step": 90 }, { "epoch": 0.1592356687898089, "grad_norm": 1.0273711681365967, "learning_rate": 1.0582010582010582e-05, "loss": 0.3053, "step": 100 }, { "epoch": 0.1751592356687898, "grad_norm": 0.7934479713439941, "learning_rate": 1.1640211640211641e-05, "loss": 0.3055, "step": 110 }, { "epoch": 0.1910828025477707, "grad_norm": 1.4802435636520386, "learning_rate": 1.2698412698412699e-05, "loss": 0.3012, "step": 120 }, { "epoch": 0.2070063694267516, "grad_norm": 1.150119423866272, "learning_rate": 1.3756613756613758e-05, "loss": 0.3041, "step": 130 }, { "epoch": 0.2229299363057325, "grad_norm": 1.2672383785247803, "learning_rate": 1.4814814814814815e-05, "loss": 0.3048, "step": 140 }, { "epoch": 0.23885350318471338, "grad_norm": 0.7314700484275818, "learning_rate": 1.5873015873015872e-05, "loss": 0.2997, "step": 150 }, { "epoch": 0.25477707006369427, "grad_norm": 1.1569265127182007, "learning_rate": 1.693121693121693e-05, "loss": 0.3017, "step": 160 }, { "epoch": 0.27070063694267515, "grad_norm": 0.8508642911911011, "learning_rate": 1.798941798941799e-05, "loss": 0.2936, "step": 170 }, { "epoch": 0.28662420382165604, "grad_norm": 0.8888178467750549, "learning_rate": 1.904761904761905e-05, "loss": 0.2959, "step": 180 }, { "epoch": 0.30254777070063693, "grad_norm": 0.9090297818183899, "learning_rate": 1.9999982823676135e-05, "loss": 0.293, "step": 190 }, { "epoch": 0.3184713375796178, "grad_norm": 1.0700126886367798, "learning_rate": 1.9997921736207435e-05, "loss": 0.2927, "step": 200 }, { "epoch": 0.3343949044585987, "grad_norm": 1.2329490184783936, "learning_rate": 1.9992426195241246e-05, "loss": 0.2935, "step": 210 }, { "epoch": 0.3503184713375796, "grad_norm": 0.6014856100082397, "learning_rate": 1.9983498088587904e-05, "loss": 0.2875, "step": 220 }, { "epoch": 0.3662420382165605, "grad_norm": 0.5173149108886719, "learning_rate": 1.9971140483201507e-05, "loss": 0.2865, "step": 230 }, { "epoch": 0.3821656050955414, "grad_norm": 0.4913546144962311, "learning_rate": 1.995535762412639e-05, "loss": 0.2842, "step": 240 }, { "epoch": 0.3980891719745223, "grad_norm": 0.6859982013702393, "learning_rate": 1.9936154933038882e-05, "loss": 0.2825, "step": 250 }, { "epoch": 0.4140127388535032, "grad_norm": 0.836473286151886, "learning_rate": 1.991353900638485e-05, "loss": 0.2807, "step": 260 }, { "epoch": 0.4299363057324841, "grad_norm": 0.7169066667556763, "learning_rate": 1.988751761311373e-05, "loss": 0.2776, "step": 270 }, { "epoch": 0.445859872611465, "grad_norm": 0.7342332601547241, "learning_rate": 1.9858099692009746e-05, "loss": 0.2768, "step": 280 }, { "epoch": 0.46178343949044587, "grad_norm": 0.5076990723609924, "learning_rate": 1.9825295348621303e-05, "loss": 0.2736, "step": 290 }, { "epoch": 0.47770700636942676, "grad_norm": 0.4993409514427185, "learning_rate": 1.978911585178955e-05, "loss": 0.2687, "step": 300 }, { "epoch": 0.49363057324840764, "grad_norm": 0.8263068795204163, "learning_rate": 1.974957362977735e-05, "loss": 0.2698, "step": 310 }, { "epoch": 0.5095541401273885, "grad_norm": 0.5373275279998779, "learning_rate": 1.970668226599997e-05, "loss": 0.2669, "step": 320 }, { "epoch": 0.5254777070063694, "grad_norm": 0.7009662985801697, "learning_rate": 1.966045649435893e-05, "loss": 0.2644, "step": 330 }, { "epoch": 0.5414012738853503, "grad_norm": 0.7370754480361938, "learning_rate": 1.9610912194180685e-05, "loss": 0.264, "step": 340 }, { "epoch": 0.5573248407643312, "grad_norm": 0.5792177319526672, "learning_rate": 1.9558066384761794e-05, "loss": 0.259, "step": 350 }, { "epoch": 0.5732484076433121, "grad_norm": 0.7904641032218933, "learning_rate": 1.950193721952252e-05, "loss": 0.259, "step": 360 }, { "epoch": 0.589171974522293, "grad_norm": 0.57305908203125, "learning_rate": 1.944254397977081e-05, "loss": 0.2527, "step": 370 }, { "epoch": 0.6050955414012739, "grad_norm": 0.6305221319198608, "learning_rate": 1.9379907068078857e-05, "loss": 0.2516, "step": 380 }, { "epoch": 0.6210191082802548, "grad_norm": 0.5892254710197449, "learning_rate": 1.931404800127447e-05, "loss": 0.252, "step": 390 }, { "epoch": 0.6369426751592356, "grad_norm": 0.5180689692497253, "learning_rate": 1.9244989403049677e-05, "loss": 0.2459, "step": 400 }, { "epoch": 0.6528662420382165, "grad_norm": 0.5340262055397034, "learning_rate": 1.9172754996189125e-05, "loss": 0.2427, "step": 410 }, { "epoch": 0.6687898089171974, "grad_norm": 0.6435000896453857, "learning_rate": 1.9097369594420888e-05, "loss": 0.2387, "step": 420 }, { "epoch": 0.6847133757961783, "grad_norm": 0.9354584217071533, "learning_rate": 1.9018859093892538e-05, "loss": 0.2366, "step": 430 }, { "epoch": 0.7006369426751592, "grad_norm": 0.7087671160697937, "learning_rate": 1.8937250464275407e-05, "loss": 0.2315, "step": 440 }, { "epoch": 0.7165605095541401, "grad_norm": 1.0769435167312622, "learning_rate": 1.8852571739500025e-05, "loss": 0.2279, "step": 450 }, { "epoch": 0.732484076433121, "grad_norm": 1.0389244556427002, "learning_rate": 1.8764852008126e-05, "loss": 0.2259, "step": 460 }, { "epoch": 0.7484076433121019, "grad_norm": 0.6885454058647156, "learning_rate": 1.8674121403349634e-05, "loss": 0.2199, "step": 470 }, { "epoch": 0.7643312101910829, "grad_norm": 1.4660230875015259, "learning_rate": 1.858041109265264e-05, "loss": 0.215, "step": 480 }, { "epoch": 0.7802547770700637, "grad_norm": 0.8413995504379272, "learning_rate": 1.8483753267095606e-05, "loss": 0.2134, "step": 490 }, { "epoch": 0.7961783439490446, "grad_norm": 0.687028169631958, "learning_rate": 1.8384181130259814e-05, "loss": 0.207, "step": 500 }, { "epoch": 0.8121019108280255, "grad_norm": 0.7690938115119934, "learning_rate": 1.8281728886841258e-05, "loss": 0.2009, "step": 510 }, { "epoch": 0.8280254777070064, "grad_norm": 0.882749080657959, "learning_rate": 1.8176431730900772e-05, "loss": 0.1951, "step": 520 }, { "epoch": 0.8439490445859873, "grad_norm": 0.6659705638885498, "learning_rate": 1.8068325833774262e-05, "loss": 0.194, "step": 530 }, { "epoch": 0.8598726114649682, "grad_norm": 0.7047539949417114, "learning_rate": 1.7957448331647253e-05, "loss": 0.1857, "step": 540 }, { "epoch": 0.8757961783439491, "grad_norm": 0.7337578535079956, "learning_rate": 1.784383731279799e-05, "loss": 0.1829, "step": 550 }, { "epoch": 0.89171974522293, "grad_norm": 0.8769313097000122, "learning_rate": 1.7727531804513483e-05, "loss": 0.1746, "step": 560 }, { "epoch": 0.9076433121019108, "grad_norm": 0.7639766335487366, "learning_rate": 1.7608571759682963e-05, "loss": 0.1736, "step": 570 }, { "epoch": 0.9235668789808917, "grad_norm": 0.8756518363952637, "learning_rate": 1.748699804307341e-05, "loss": 0.1675, "step": 580 }, { "epoch": 0.9394904458598726, "grad_norm": 0.7969843745231628, "learning_rate": 1.7362852417291813e-05, "loss": 0.1645, "step": 590 }, { "epoch": 0.9554140127388535, "grad_norm": 1.0977206230163574, "learning_rate": 1.7236177528439027e-05, "loss": 0.1581, "step": 600 }, { "epoch": 0.9713375796178344, "grad_norm": 1.1576684713363647, "learning_rate": 1.7107016891460118e-05, "loss": 0.1563, "step": 610 }, { "epoch": 0.9872611464968153, "grad_norm": 0.8371659517288208, "learning_rate": 1.697541487519623e-05, "loss": 0.1475, "step": 620 }, { "epoch": 1.0031847133757963, "grad_norm": 0.7080691456794739, "learning_rate": 1.684141668714318e-05, "loss": 0.1397, "step": 630 }, { "epoch": 1.019108280254777, "grad_norm": 0.9583333730697632, "learning_rate": 1.6705068357921913e-05, "loss": 0.1137, "step": 640 }, { "epoch": 1.035031847133758, "grad_norm": 0.6917926073074341, "learning_rate": 1.6566416725466198e-05, "loss": 0.1108, "step": 650 }, { "epoch": 1.0509554140127388, "grad_norm": 0.7164482474327087, "learning_rate": 1.6425509418933038e-05, "loss": 0.1106, "step": 660 }, { "epoch": 1.0668789808917198, "grad_norm": 0.6723839044570923, "learning_rate": 1.6282394842341234e-05, "loss": 0.1068, "step": 670 }, { "epoch": 1.0828025477707006, "grad_norm": 0.7010757327079773, "learning_rate": 1.613712215794381e-05, "loss": 0.1034, "step": 680 }, { "epoch": 1.0987261146496816, "grad_norm": 0.7355757355690002, "learning_rate": 1.5989741269339956e-05, "loss": 0.1011, "step": 690 }, { "epoch": 1.1146496815286624, "grad_norm": 0.720433235168457, "learning_rate": 1.5840302804332295e-05, "loss": 0.1001, "step": 700 }, { "epoch": 1.1305732484076434, "grad_norm": 0.7248522639274597, "learning_rate": 1.5688858097535403e-05, "loss": 0.0948, "step": 710 }, { "epoch": 1.1464968152866242, "grad_norm": 0.7236132025718689, "learning_rate": 1.5535459172741475e-05, "loss": 0.0906, "step": 720 }, { "epoch": 1.1624203821656052, "grad_norm": 0.6846908330917358, "learning_rate": 1.538015872504933e-05, "loss": 0.0881, "step": 730 }, { "epoch": 1.178343949044586, "grad_norm": 0.6623250246047974, "learning_rate": 1.5223010102762725e-05, "loss": 0.0848, "step": 740 }, { "epoch": 1.194267515923567, "grad_norm": 0.807305097579956, "learning_rate": 1.5064067289064332e-05, "loss": 0.0819, "step": 750 }, { "epoch": 1.2101910828025477, "grad_norm": 0.6570079326629639, "learning_rate": 1.4903384883471608e-05, "loss": 0.0821, "step": 760 }, { "epoch": 1.2261146496815287, "grad_norm": 0.6224232912063599, "learning_rate": 1.474101808308096e-05, "loss": 0.0781, "step": 770 }, { "epoch": 1.2420382165605095, "grad_norm": 0.6488759517669678, "learning_rate": 1.4577022663606592e-05, "loss": 0.0756, "step": 780 }, { "epoch": 1.2579617834394905, "grad_norm": 0.6796790957450867, "learning_rate": 1.4411454960220647e-05, "loss": 0.0739, "step": 790 }, { "epoch": 1.2738853503184713, "grad_norm": 0.6596919298171997, "learning_rate": 1.42443718482011e-05, "loss": 0.0691, "step": 800 }, { "epoch": 1.2898089171974523, "grad_norm": 0.6160626411437988, "learning_rate": 1.4075830723394164e-05, "loss": 0.0676, "step": 810 }, { "epoch": 1.305732484076433, "grad_norm": 0.5704523324966431, "learning_rate": 1.3905889482497858e-05, "loss": 0.0647, "step": 820 }, { "epoch": 1.321656050955414, "grad_norm": 0.5561860203742981, "learning_rate": 1.3734606503173522e-05, "loss": 0.0623, "step": 830 }, { "epoch": 1.3375796178343948, "grad_norm": 0.681788444519043, "learning_rate": 1.3562040623992092e-05, "loss": 0.0611, "step": 840 }, { "epoch": 1.3535031847133758, "grad_norm": 0.6568078994750977, "learning_rate": 1.3388251124222104e-05, "loss": 0.0588, "step": 850 }, { "epoch": 1.3694267515923566, "grad_norm": 0.5263776183128357, "learning_rate": 1.3213297703466237e-05, "loss": 0.0564, "step": 860 }, { "epoch": 1.3853503184713376, "grad_norm": 0.5988286733627319, "learning_rate": 1.303724046115352e-05, "loss": 0.0543, "step": 870 }, { "epoch": 1.4012738853503186, "grad_norm": 0.5688588619232178, "learning_rate": 1.2860139875894163e-05, "loss": 0.0574, "step": 880 }, { "epoch": 1.4171974522292994, "grad_norm": 0.6649556159973145, "learning_rate": 1.2682056784704151e-05, "loss": 0.0529, "step": 890 }, { "epoch": 1.4331210191082802, "grad_norm": 0.5984866619110107, "learning_rate": 1.2503052362106698e-05, "loss": 0.0517, "step": 900 }, { "epoch": 1.4490445859872612, "grad_norm": 0.5580580234527588, "learning_rate": 1.2323188099117791e-05, "loss": 0.0474, "step": 910 }, { "epoch": 1.4649681528662422, "grad_norm": 0.5219546556472778, "learning_rate": 1.2142525782122974e-05, "loss": 0.0464, "step": 920 }, { "epoch": 1.480891719745223, "grad_norm": 0.5171259641647339, "learning_rate": 1.196112747165271e-05, "loss": 0.0444, "step": 930 }, { "epoch": 1.4968152866242037, "grad_norm": 0.5057135224342346, "learning_rate": 1.1779055481063545e-05, "loss": 0.042, "step": 940 }, { "epoch": 1.5127388535031847, "grad_norm": 0.5676354765892029, "learning_rate": 1.1596372355132422e-05, "loss": 0.0405, "step": 950 }, { "epoch": 1.5286624203821657, "grad_norm": 0.5014776587486267, "learning_rate": 1.1413140848571495e-05, "loss": 0.0409, "step": 960 }, { "epoch": 1.5445859872611465, "grad_norm": 0.4629918038845062, "learning_rate": 1.1229423904470864e-05, "loss": 0.0405, "step": 970 }, { "epoch": 1.5605095541401273, "grad_norm": 0.562461256980896, "learning_rate": 1.1045284632676535e-05, "loss": 0.0401, "step": 980 }, { "epoch": 1.5764331210191083, "grad_norm": 0.5296617150306702, "learning_rate": 1.0860786288111158e-05, "loss": 0.0386, "step": 990 }, { "epoch": 1.5923566878980893, "grad_norm": 0.4665983021259308, "learning_rate": 1.0675992249044882e-05, "loss": 0.0352, "step": 1000 }, { "epoch": 1.60828025477707, "grad_norm": 0.5495542287826538, "learning_rate": 1.0490965995323879e-05, "loss": 0.0339, "step": 1010 }, { "epoch": 1.6242038216560508, "grad_norm": 0.47773268818855286, "learning_rate": 1.0305771086563927e-05, "loss": 0.032, "step": 1020 }, { "epoch": 1.6401273885350318, "grad_norm": 0.43807661533355713, "learning_rate": 1.0120471140316647e-05, "loss": 0.0346, "step": 1030 }, { "epoch": 1.6560509554140128, "grad_norm": 0.4941099286079407, "learning_rate": 9.935129810215793e-06, "loss": 0.0329, "step": 1040 }, { "epoch": 1.6719745222929936, "grad_norm": 0.4290195405483246, "learning_rate": 9.749810764111156e-06, "loss": 0.0317, "step": 1050 }, { "epoch": 1.6878980891719744, "grad_norm": 0.4538203477859497, "learning_rate": 9.564577662197612e-06, "loss": 0.0325, "step": 1060 }, { "epoch": 1.7038216560509554, "grad_norm": 0.3967364430427551, "learning_rate": 9.379494135146769e-06, "loss": 0.0307, "step": 1070 }, { "epoch": 1.7197452229299364, "grad_norm": 0.4934009611606598, "learning_rate": 9.194623762248755e-06, "loss": 0.0309, "step": 1080 }, { "epoch": 1.7356687898089171, "grad_norm": 0.4300677478313446, "learning_rate": 9.010030049571686e-06, "loss": 0.0298, "step": 1090 }, { "epoch": 1.7515923566878981, "grad_norm": 0.42275470495224, "learning_rate": 8.825776408146262e-06, "loss": 0.0289, "step": 1100 }, { "epoch": 1.767515923566879, "grad_norm": 0.43387818336486816, "learning_rate": 8.641926132183005e-06, "loss": 0.0275, "step": 1110 }, { "epoch": 1.78343949044586, "grad_norm": 0.4539569616317749, "learning_rate": 8.458542377329661e-06, "loss": 0.0267, "step": 1120 }, { "epoch": 1.799363057324841, "grad_norm": 0.4254033863544464, "learning_rate": 8.275688138976151e-06, "loss": 0.0257, "step": 1130 }, { "epoch": 1.8152866242038217, "grad_norm": 0.3466563820838928, "learning_rate": 8.093426230614627e-06, "loss": 0.0256, "step": 1140 }, { "epoch": 1.8312101910828025, "grad_norm": 0.46024608612060547, "learning_rate": 7.911819262261961e-06, "loss": 0.0253, "step": 1150 }, { "epoch": 1.8471337579617835, "grad_norm": 0.3775930404663086, "learning_rate": 7.73092961895217e-06, "loss": 0.0249, "step": 1160 }, { "epoch": 1.8630573248407645, "grad_norm": 0.4123600125312805, "learning_rate": 7.550819439306113e-06, "loss": 0.0245, "step": 1170 }, { "epoch": 1.8789808917197452, "grad_norm": 0.3415584862232208, "learning_rate": 7.37155059418583e-06, "loss": 0.0234, "step": 1180 }, { "epoch": 1.894904458598726, "grad_norm": 0.33297714591026306, "learning_rate": 7.193184665440865e-06, "loss": 0.0209, "step": 1190 }, { "epoch": 1.910828025477707, "grad_norm": 0.39130252599716187, "learning_rate": 7.015782924753871e-06, "loss": 0.0228, "step": 1200 }, { "epoch": 1.926751592356688, "grad_norm": 0.32679814100265503, "learning_rate": 6.839406312592755e-06, "loss": 0.0206, "step": 1210 }, { "epoch": 1.9426751592356688, "grad_norm": 0.40562936663627625, "learning_rate": 6.664115417276629e-06, "loss": 0.0202, "step": 1220 }, { "epoch": 1.9585987261146496, "grad_norm": 0.4119989275932312, "learning_rate": 6.489970454162677e-06, "loss": 0.0181, "step": 1230 }, { "epoch": 1.9745222929936306, "grad_norm": 0.31588155031204224, "learning_rate": 6.317031244961193e-06, "loss": 0.0186, "step": 1240 }, { "epoch": 1.9904458598726116, "grad_norm": 0.3445233702659607, "learning_rate": 6.1453571971858095e-06, "loss": 0.0189, "step": 1250 }, { "epoch": 2.0063694267515926, "grad_norm": 0.18995656073093414, "learning_rate": 5.9750072837460436e-06, "loss": 0.0134, "step": 1260 }, { "epoch": 2.022292993630573, "grad_norm": 0.21353577077388763, "learning_rate": 5.806040022689083e-06, "loss": 0.0067, "step": 1270 }, { "epoch": 2.038216560509554, "grad_norm": 0.2473672330379486, "learning_rate": 5.638513457097887e-06, "loss": 0.0067, "step": 1280 }, { "epoch": 2.054140127388535, "grad_norm": 0.25051358342170715, "learning_rate": 5.472485135152391e-06, "loss": 0.0063, "step": 1290 }, { "epoch": 2.070063694267516, "grad_norm": 0.2428949475288391, "learning_rate": 5.3080120903607635e-06, "loss": 0.0061, "step": 1300 }, { "epoch": 2.0859872611464967, "grad_norm": 0.2344033420085907, "learning_rate": 5.14515082196743e-06, "loss": 0.0065, "step": 1310 }, { "epoch": 2.1019108280254777, "grad_norm": 0.18941205739974976, "learning_rate": 4.983957275544645e-06, "loss": 0.0059, "step": 1320 }, { "epoch": 2.1178343949044587, "grad_norm": 0.312821626663208, "learning_rate": 4.8244868237742325e-06, "loss": 0.006, "step": 1330 }, { "epoch": 2.1337579617834397, "grad_norm": 0.278249591588974, "learning_rate": 4.666794247426169e-06, "loss": 0.0058, "step": 1340 }, { "epoch": 2.1496815286624202, "grad_norm": 0.23995652794837952, "learning_rate": 4.510933716540459e-06, "loss": 0.0055, "step": 1350 }, { "epoch": 2.1656050955414012, "grad_norm": 0.27266427874565125, "learning_rate": 4.356958771818804e-06, "loss": 0.0058, "step": 1360 }, { "epoch": 2.1815286624203822, "grad_norm": 0.15857194364070892, "learning_rate": 4.204922306232508e-06, "loss": 0.0046, "step": 1370 }, { "epoch": 2.1974522292993632, "grad_norm": 0.22888706624507904, "learning_rate": 4.054876546852832e-06, "loss": 0.0054, "step": 1380 }, { "epoch": 2.213375796178344, "grad_norm": 0.2679773271083832, "learning_rate": 3.906873036910128e-06, "loss": 0.0061, "step": 1390 }, { "epoch": 2.229299363057325, "grad_norm": 0.20967616140842438, "learning_rate": 3.760962618087871e-06, "loss": 0.0054, "step": 1400 }, { "epoch": 2.245222929936306, "grad_norm": 0.24804317951202393, "learning_rate": 3.6171954130577004e-06, "loss": 0.0048, "step": 1410 }, { "epoch": 2.261146496815287, "grad_norm": 0.2076551765203476, "learning_rate": 3.4756208082614375e-06, "loss": 0.005, "step": 1420 }, { "epoch": 2.2770700636942673, "grad_norm": 0.19931840896606445, "learning_rate": 3.3362874369460154e-06, "loss": 0.0044, "step": 1430 }, { "epoch": 2.2929936305732483, "grad_norm": 0.18009555339813232, "learning_rate": 3.1992431624571364e-06, "loss": 0.0045, "step": 1440 }, { "epoch": 2.3089171974522293, "grad_norm": 0.3000176250934601, "learning_rate": 3.06453506179743e-06, "loss": 0.0053, "step": 1450 }, { "epoch": 2.3248407643312103, "grad_norm": 0.28960147500038147, "learning_rate": 2.9322094094547006e-06, "loss": 0.0046, "step": 1460 }, { "epoch": 2.340764331210191, "grad_norm": 0.1789318174123764, "learning_rate": 2.802311661505873e-06, "loss": 0.0051, "step": 1470 }, { "epoch": 2.356687898089172, "grad_norm": 0.21748697757720947, "learning_rate": 2.6748864400020557e-06, "loss": 0.0048, "step": 1480 }, { "epoch": 2.372611464968153, "grad_norm": 0.27882322669029236, "learning_rate": 2.54997751764014e-06, "loss": 0.0049, "step": 1490 }, { "epoch": 2.388535031847134, "grad_norm": 0.2235172539949417, "learning_rate": 2.42762780272613e-06, "loss": 0.0044, "step": 1500 }, { "epoch": 2.404458598726115, "grad_norm": 0.26929816603660583, "learning_rate": 2.3078793244354303e-06, "loss": 0.0049, "step": 1510 }, { "epoch": 2.4203821656050954, "grad_norm": 0.19245010614395142, "learning_rate": 2.1907732183751084e-06, "loss": 0.0041, "step": 1520 }, { "epoch": 2.4363057324840764, "grad_norm": 0.21086634695529938, "learning_rate": 2.076349712453141e-06, "loss": 0.0045, "step": 1530 }, { "epoch": 2.4522292993630574, "grad_norm": 0.21319827437400818, "learning_rate": 1.964648113059442e-06, "loss": 0.0037, "step": 1540 }, { "epoch": 2.468152866242038, "grad_norm": 0.22776374220848083, "learning_rate": 1.8557067915634707e-06, "loss": 0.0039, "step": 1550 }, { "epoch": 2.484076433121019, "grad_norm": 0.24752478301525116, "learning_rate": 1.7495631711330052e-06, "loss": 0.0044, "step": 1560 }, { "epoch": 2.5, "grad_norm": 0.19484113156795502, "learning_rate": 1.6462537138786793e-06, "loss": 0.0044, "step": 1570 }, { "epoch": 2.515923566878981, "grad_norm": 0.16486898064613342, "learning_rate": 1.545813908328624e-06, "loss": 0.0036, "step": 1580 }, { "epoch": 2.531847133757962, "grad_norm": 0.15153397619724274, "learning_rate": 1.4482782572375698e-06, "loss": 0.0044, "step": 1590 }, { "epoch": 2.5477707006369426, "grad_norm": 0.15440870821475983, "learning_rate": 1.353680265734565e-06, "loss": 0.0038, "step": 1600 }, { "epoch": 2.5636942675159236, "grad_norm": 0.17342127859592438, "learning_rate": 1.2620524298134117e-06, "loss": 0.0044, "step": 1610 }, { "epoch": 2.5796178343949046, "grad_norm": 0.18218614161014557, "learning_rate": 1.1734262251697458e-06, "loss": 0.0038, "step": 1620 }, { "epoch": 2.595541401273885, "grad_norm": 0.19614370167255402, "learning_rate": 1.087832096388608e-06, "loss": 0.0039, "step": 1630 }, { "epoch": 2.611464968152866, "grad_norm": 0.12302397936582565, "learning_rate": 1.005299446486211e-06, "loss": 0.0036, "step": 1640 }, { "epoch": 2.627388535031847, "grad_norm": 0.2053324580192566, "learning_rate": 9.258566268095281e-07, "loss": 0.0039, "step": 1650 }, { "epoch": 2.643312101910828, "grad_norm": 0.17470555007457733, "learning_rate": 8.495309272971175e-07, "loss": 0.0041, "step": 1660 }, { "epoch": 2.659235668789809, "grad_norm": 0.13682498037815094, "learning_rate": 7.763485671045723e-07, "loss": 0.0034, "step": 1670 }, { "epoch": 2.6751592356687897, "grad_norm": 0.24173220992088318, "learning_rate": 7.063346855978092e-07, "loss": 0.0036, "step": 1680 }, { "epoch": 2.6910828025477707, "grad_norm": 0.19215670228004456, "learning_rate": 6.395133337172699e-07, "loss": 0.0039, "step": 1690 }, { "epoch": 2.7070063694267517, "grad_norm": 0.15766797959804535, "learning_rate": 5.759074657160213e-07, "loss": 0.0033, "step": 1700 }, { "epoch": 2.722929936305732, "grad_norm": 0.17200110852718353, "learning_rate": 5.15538931274584e-07, "loss": 0.0036, "step": 1710 }, { "epoch": 2.738853503184713, "grad_norm": 0.20934487879276276, "learning_rate": 4.5842846799520337e-07, "loss": 0.0039, "step": 1720 }, { "epoch": 2.754777070063694, "grad_norm": 0.18075595796108246, "learning_rate": 4.0459569427813593e-07, "loss": 0.0038, "step": 1730 }, { "epoch": 2.770700636942675, "grad_norm": 0.22637341916561127, "learning_rate": 3.540591025824025e-07, "loss": 0.0037, "step": 1740 }, { "epoch": 2.786624203821656, "grad_norm": 0.4421471357345581, "learning_rate": 3.0683605307331053e-07, "loss": 0.0038, "step": 1750 }, { "epoch": 2.802547770700637, "grad_norm": 0.18272390961647034, "learning_rate": 2.6294276765895577e-07, "loss": 0.0035, "step": 1760 }, { "epoch": 2.8184713375796178, "grad_norm": 0.1819288283586502, "learning_rate": 2.223943244177207e-07, "loss": 0.0035, "step": 1770 }, { "epoch": 2.8343949044585988, "grad_norm": 0.20315492153167725, "learning_rate": 1.852046524187068e-07, "loss": 0.0037, "step": 1780 }, { "epoch": 2.8503184713375798, "grad_norm": 0.12796497344970703, "learning_rate": 1.5138652693685751e-07, "loss": 0.0027, "step": 1790 }, { "epoch": 2.8662420382165603, "grad_norm": 0.2990235984325409, "learning_rate": 1.209515650644455e-07, "loss": 0.0032, "step": 1800 }, { "epoch": 2.8821656050955413, "grad_norm": 0.11842101812362671, "learning_rate": 9.391022172040243e-08, "loss": 0.0034, "step": 1810 }, { "epoch": 2.8980891719745223, "grad_norm": 0.23246175050735474, "learning_rate": 7.02717860588853e-08, "loss": 0.0037, "step": 1820 }, { "epoch": 2.9140127388535033, "grad_norm": 0.14524857699871063, "learning_rate": 5.0044378278286675e-08, "loss": 0.0035, "step": 1830 }, { "epoch": 2.9299363057324843, "grad_norm": 0.21137791872024536, "learning_rate": 3.323494683181716e-08, "loss": 0.0039, "step": 1840 }, { "epoch": 2.945859872611465, "grad_norm": 0.14681434631347656, "learning_rate": 1.9849266040596893e-08, "loss": 0.0035, "step": 1850 }, { "epoch": 2.961783439490446, "grad_norm": 0.1578862965106964, "learning_rate": 9.891934110075606e-09, "loss": 0.0034, "step": 1860 }, { "epoch": 2.977707006369427, "grad_norm": 0.16527986526489258, "learning_rate": 3.366371550470593e-09, "loss": 0.0031, "step": 1870 }, { "epoch": 2.9936305732484074, "grad_norm": 0.15652863681316376, "learning_rate": 2.74820001772147e-10, "loss": 0.0031, "step": 1880 } ], "logging_steps": 10, "max_steps": 1884, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.981483631879022e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }