{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9962034927866363, "eval_steps": 2000000, "global_step": 164, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 803.0022583007812, "epoch": 0.006074411541381929, "grad_norm": 0.14674668166353436, "kl": 0.0, "learning_rate": 5.88235294117647e-08, "loss": 0.0479, "num_tokens": 918402.0, "reward": 0.9815848618745804, "reward_std": 0.23756355978548527, "rewards/accuracy_reward": 0.4933035746216774, "rewards/format_reward": 0.9765624925494194, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 734.3027648925781, "epoch": 0.030372057706909643, "grad_norm": 0.13057922454955487, "kl": 5.84721565246582e-05, "learning_rate": 2.941176470588235e-07, "loss": 0.037, "num_tokens": 4258679.0, "reward": 1.0901228114962578, "reward_std": 0.23264290555380285, "rewards/accuracy_reward": 0.5965401763096452, "rewards/format_reward": 0.9871651735156775, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 743.0277114868164, "epoch": 0.060744115413819286, "grad_norm": 0.1343510023679506, "kl": 8.401274681091309e-05, "learning_rate": 5.88235294117647e-07, "loss": 0.0465, "num_tokens": 8471139.0, "reward": 1.062276841700077, "reward_std": 0.23240854553878307, "rewards/accuracy_reward": 0.5718749992549419, "rewards/format_reward": 0.9808035627007484, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 739.1007049560546, "epoch": 0.09111617312072894, "grad_norm": 0.738570752534254, "kl": 0.00017675161361694336, "learning_rate": 8.823529411764705e-07, "loss": 0.0358, "num_tokens": 12734494.0, "reward": 1.0722098737955092, "reward_std": 0.2205923892557621, "rewards/accuracy_reward": 0.5785714268684388, "rewards/format_reward": 0.9872767761349678, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 745.5227966308594, "epoch": 0.12148823082763857, "grad_norm": 0.30158627332435983, "kl": 0.0026874780654907227, "learning_rate": 9.989726963751682e-07, "loss": 0.037, "num_tokens": 16990620.0, "reward": 1.0786830827593803, "reward_std": 0.23385403044521808, "rewards/accuracy_reward": 0.5861607141792774, "rewards/format_reward": 0.9850446373224259, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 731.8799407958984, "epoch": 0.15186028853454822, "grad_norm": 0.10368791276497527, "kl": 0.0003843784332275391, "learning_rate": 9.927100106776212e-07, "loss": 0.0351, "num_tokens": 21176106.0, "reward": 1.106584869325161, "reward_std": 0.21388941686600446, "rewards/accuracy_reward": 0.6142857171595096, "rewards/format_reward": 0.9845982104539871, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 716.5080688476562, "epoch": 0.18223234624145787, "grad_norm": 0.24334246130754558, "kl": 0.0008969306945800781, "learning_rate": 9.808267184205181e-07, "loss": 0.0203, "num_tokens": 25301046.0, "reward": 1.0974330857396126, "reward_std": 0.21846173331141472, "rewards/accuracy_reward": 0.6022321447730065, "rewards/format_reward": 0.9904017791152, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 718.258511352539, "epoch": 0.2126044039483675, "grad_norm": 0.13060657109261103, "kl": 0.0011320114135742188, "learning_rate": 9.634583786730108e-07, "loss": 0.0247, "num_tokens": 29447476.0, "reward": 1.116071480512619, "reward_std": 0.21333869993686677, "rewards/accuracy_reward": 0.620758930593729, "rewards/format_reward": 0.9906249955296517, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 734.5969055175781, "epoch": 
0.24297646165527714, "grad_norm": 0.14790799582315342, "kl": 0.0016210556030273437, "learning_rate": 9.408031213740044e-07, "loss": 0.0307, "num_tokens": 33678894.0, "reward": 1.0677455827593803, "reward_std": 0.2126396529376507, "rewards/accuracy_reward": 0.5736607156693936, "rewards/format_reward": 0.9881696373224258, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 724.3696716308593, "epoch": 0.2733485193621868, "grad_norm": 0.1235294567362974, "kl": 0.0030605316162109373, "learning_rate": 9.131193871579974e-07, "loss": 0.0288, "num_tokens": 37860574.0, "reward": 1.0801339834928512, "reward_std": 0.22113933004438877, "rewards/accuracy_reward": 0.5854910694062709, "rewards/format_reward": 0.9892857074737549, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 719.0857437133789, "epoch": 0.30372057706909644, "grad_norm": 0.11260442217805001, "kl": 0.004908370971679688, "learning_rate": 8.807229791845671e-07, "loss": 0.0309, "num_tokens": 42021414.0, "reward": 1.1001116633415222, "reward_std": 0.2082567172124982, "rewards/accuracy_reward": 0.6053571477532387, "rewards/format_reward": 0.9895089223980904, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 710.4136505126953, "epoch": 0.3340926347760061, "grad_norm": 0.12446746182181773, "kl": 0.00711669921875, "learning_rate": 8.439834606028593e-07, "loss": 0.03, "num_tokens": 46149299.0, "reward": 1.1018973752856254, "reward_std": 0.20198939852416514, "rewards/accuracy_reward": 0.6075892850756646, "rewards/format_reward": 0.9886160641908646, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 711.5286056518555, "epoch": 0.36446469248291574, "grad_norm": 0.13140372700302483, "kl": 0.01026763916015625, "learning_rate": 8.033199387471276e-07, "loss": 0.0255, "num_tokens": 50248419.0, "reward": 1.0939732655882835, "reward_std": 0.20504674576222898, "rewards/accuracy_reward": 0.6002232126891613, "rewards/format_reward": 0.9874999925494194, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 693.0864105224609, "epoch": 0.39483675018982534, "grad_norm": 5.84593208956697, "kl": 0.01532440185546875, "learning_rate": 7.591962841552626e-07, "loss": 0.0221, "num_tokens": 54273302.0, "reward": 1.102567011117935, "reward_std": 0.19313923437148334, "rewards/accuracy_reward": 0.6060267843306064, "rewards/format_reward": 0.9930803507566452, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 698.3335159301757, "epoch": 0.425208807896735, "grad_norm": 0.18797262379226592, "kl": 0.0115875244140625, "learning_rate": 7.121158389495185e-07, "loss": 0.0308, "num_tokens": 58317828.0, "reward": 1.0886161223053932, "reward_std": 0.2050962893292308, "rewards/accuracy_reward": 0.5941964283585548, "rewards/format_reward": 0.9888392791152001, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 680.4533798217774, "epoch": 0.45558086560364464, "grad_norm": 0.12462934113614446, "kl": 0.01414947509765625, "learning_rate": 6.626156749437736e-07, "loss": 0.0258, "num_tokens": 62259643.0, "reward": 1.1068080976605414, "reward_std": 0.1903689544647932, "rewards/accuracy_reward": 0.6107142798602581, "rewards/format_reward": 0.9921874910593033, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 671.9855209350586, "epoch": 0.4859529233105543, "grad_norm": 0.12209526567257212, "kl": 0.01604156494140625, "learning_rate": 6.112604669781572e-07, "loss": 0.0134, "num_tokens": 66175850.0, "reward": 1.0722098708152772, "reward_std": 0.1989122748374939, "rewards/accuracy_reward": 0.5743303574621678, "rewards/format_reward": 0.9957589223980904, "step": 
80 }, { "clip_ratio": 0.0, "completion_length": 655.3560516357422, "epoch": 0.5163249810174639, "grad_norm": 0.12987027750323757, "kl": 0.017596435546875, "learning_rate": 5.586360513712009e-07, "loss": 0.0202, "num_tokens": 70039477.0, "reward": 1.1373884499073028, "reward_std": 0.17809431692585348, "rewards/accuracy_reward": 0.6401785723865032, "rewards/format_reward": 0.9944196373224259, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 679.969448852539, "epoch": 0.5466970387243736, "grad_norm": 0.11569644648116158, "kl": 0.01629638671875, "learning_rate": 5.053427429716866e-07, "loss": 0.0261, "num_tokens": 74010748.0, "reward": 1.1206473752856254, "reward_std": 0.16938802655786275, "rewards/accuracy_reward": 0.62518887296319, "rewards/format_reward": 0.9930803492665291, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 670.8172203063965, "epoch": 0.5770690964312832, "grad_norm": 0.14653189798827554, "kl": 0.01604461669921875, "learning_rate": 4.519884870461591e-07, "loss": 0.0062, "num_tokens": 77924425.0, "reward": 1.1319196939468383, "reward_std": 0.18193732015788555, "rewards/accuracy_reward": 0.6334821462631226, "rewards/format_reward": 0.9968749970197678, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 689.6049453735352, "epoch": 0.6074411541381929, "grad_norm": 0.1373443920915848, "kl": 0.0151153564453125, "learning_rate": 3.991819241221835e-07, "loss": 0.0202, "num_tokens": 81937855.0, "reward": 1.1222098782658576, "reward_std": 0.1860942555591464, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.9944196373224259, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 675.8683364868164, "epoch": 0.6378132118451025, "grad_norm": 0.14762350547769187, "kl": 0.015521240234375, "learning_rate": 3.4752544690038643e-07, "loss": 0.0151, "num_tokens": 85899921.0, "reward": 1.128125049173832, "reward_std": 0.1950968151912093, "rewards/accuracy_reward": 0.6305803559720516, "rewards/format_reward": 0.9950892791152001, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 696.7239181518555, "epoch": 0.6681852695520122, "grad_norm": 0.12897190097851505, "kl": 0.0144561767578125, "learning_rate": 2.976083284388031e-07, "loss": 0.0225, "num_tokens": 89967132.0, "reward": 1.0918527334928512, "reward_std": 0.1823650782927871, "rewards/accuracy_reward": 0.5953125059604645, "rewards/format_reward": 0.9930803492665291, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 719.4877540588379, "epoch": 0.6985573272589218, "grad_norm": 0.12020301457071197, "kl": 0.014898681640625, "learning_rate": 2.500000000000001e-07, "loss": 0.0153, "num_tokens": 94129389.0, "reward": 1.1255580872297286, "reward_std": 0.19005396589636803, "rewards/accuracy_reward": 0.6283482149243355, "rewards/format_reward": 0.9944196343421936, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 681.8125282287598, "epoch": 0.7289293849658315, "grad_norm": 0.1344831996031667, "kl": 0.01582489013671875, "learning_rate": 2.0524355524417015e-07, "loss": 0.0192, "num_tokens": 98103221.0, "reward": 1.1333705812692643, "reward_std": 0.20433492437005044, "rewards/accuracy_reward": 0.6354910746216774, "rewards/format_reward": 0.9957589209079742, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 681.395115661621, "epoch": 0.7593014426727411, "grad_norm": 0.13865803911159388, "kl": 0.0163787841796875, "learning_rate": 1.6384955486934154e-07, "loss": 0.0122, "num_tokens": 102096279.0, "reward": 1.1371652349829673, "reward_std": 0.18053851332515478, "rewards/accuracy_reward": 
0.6404017873108387, "rewards/format_reward": 0.9935267820954323, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 702.8656555175781, "epoch": 0.7896735003796507, "grad_norm": 0.10840464189916307, "kl": 0.01602935791015625, "learning_rate": 1.262902023724824e-07, "loss": 0.0162, "num_tokens": 106168389.0, "reward": 1.1045759484171866, "reward_std": 0.18122291592881085, "rewards/accuracy_reward": 0.6062500029802322, "rewards/format_reward": 0.9966517820954323, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 696.1946792602539, "epoch": 0.8200455580865603, "grad_norm": 0.1591189816361499, "kl": 0.01680145263671875, "learning_rate": 9.299395737170757e-08, "loss": 0.0167, "num_tokens": 110246405.0, "reward": 1.0947545170783997, "reward_std": 0.20664523243904115, "rewards/accuracy_reward": 0.5973214291036129, "rewards/format_reward": 0.9948660656809807, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 698.2578453063965, "epoch": 0.85041761579347, "grad_norm": 0.13052581661637774, "kl": 0.0175323486328125, "learning_rate": 6.43406479383053e-08, "loss": 0.0206, "num_tokens": 114326168.0, "reward": 1.1349330857396125, "reward_std": 0.19575350042432546, "rewards/accuracy_reward": 0.6372767858207226, "rewards/format_reward": 0.9953124925494194, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 678.3460144042969, "epoch": 0.8807896735003796, "grad_norm": 0.12342327662368928, "kl": 0.01726837158203125, "learning_rate": 4.065713769482082e-08, "loss": 0.0217, "num_tokens": 118268150.0, "reward": 1.131250050663948, "reward_std": 0.19892821311950684, "rewards/accuracy_reward": 0.6337053582072258, "rewards/format_reward": 0.9950892806053162, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 699.1832885742188, "epoch": 0.9111617312072893, "grad_norm": 0.11347140306400243, "kl": 0.01734161376953125, "learning_rate": 2.2213597106929605e-08, "loss": 0.0206, "num_tokens": 122323331.0, "reward": 1.123325940966606, "reward_std": 0.19308128226548432, "rewards/accuracy_reward": 0.625669640302658, "rewards/format_reward": 0.9953124925494194, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 699.7196716308594, "epoch": 0.9415337889141989, "grad_norm": 0.1475830222889839, "kl": 0.01719207763671875, "learning_rate": 9.22042150446728e-09, "loss": 0.0185, "num_tokens": 126366451.0, "reward": 1.154241117835045, "reward_std": 0.1731583815999329, "rewards/accuracy_reward": 0.6560267820954323, "rewards/format_reward": 0.9964285656809807, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 695.9060577392578, "epoch": 0.9719058466211086, "grad_norm": 0.13967302854618574, "kl": 0.01683197021484375, "learning_rate": 1.8258309893965374e-09, "loss": 0.0191, "num_tokens": 130430318.0, "reward": 1.1508929118514062, "reward_std": 0.20208097249269485, "rewards/accuracy_reward": 0.6546875029802323, "rewards/format_reward": 0.9924107119441032, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 684.7267169952393, "epoch": 0.9962034927866363, "kl": 0.017988204956054688, "num_tokens": 133627034.0, "reward": 1.118443138897419, "reward_std": 0.1834622365422547, "rewards/accuracy_reward": 0.6226820051670074, "rewards/format_reward": 0.9946986511349678, "step": 164, "total_flos": 0.0, "train_loss": 0.024119453254814554, "train_runtime": 33084.5686, "train_samples_per_second": 0.557, "train_steps_per_second": 0.005 } ], "logging_steps": 5, "max_steps": 164, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { 
"should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }