{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9898666666666667, "eval_steps": 500, "global_step": 58, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 582.459849357605, "epoch": 0.017066666666666667, "grad_norm": 0.31589704751968384, "learning_rate": 0.0, "loss": -0.1186, "num_tokens": 653232.0, "reward": 0.3515625159488991, "reward_std": 0.3448667535558343, "rewards/accuracy_reward": 0.35156250034924597, "sce_advantage": 29.60546875, "sce_advantage_std": 1.6962890625, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 609.3683314323425, "epoch": 0.08533333333333333, "grad_norm": 0.3136809468269348, "learning_rate": 2e-06, "loss": -0.1096, "num_tokens": 3367024.0, "reward": 0.345982157450635, "reward_std": 0.342531890142709, "rewards/accuracy_reward": 0.34598214336438105, "sce_advantage": 29.724609375, "sce_advantage_std": 1.6939697265625, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 647.798466873169, "epoch": 0.17066666666666666, "grad_norm": 0.31411269307136536, "learning_rate": 2.9754298604207156e-06, "loss": -0.0326, "num_tokens": 6928708.0, "reward": 0.5000000235857442, "reward_std": 0.31452907202765346, "rewards/accuracy_reward": 0.500744047272019, "sce_advantage": 32.82578125, "sce_advantage_std": 1.568798828125, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 762.5181144714355, "epoch": 0.256, "grad_norm": 0.42305925488471985, "learning_rate": 2.8281840384798147e-06, "loss": 0.0293, "num_tokens": 11004595.0, "reward": 0.6053571700351312, "reward_std": 0.21672922340221704, "rewards/accuracy_reward": 0.605357142095454, "sce_advantage": 38.546875, "sce_advantage_std": 1.0988525390625, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 825.0319566726685, "epoch": 0.3413333333333333, "grad_norm": 0.12070361524820328, "learning_rate": 2.5606601717798212e-06, "loss": 0.0629, "num_tokens": 15349561.0, "reward": 0.6325893149245531, "reward_std": 0.21432666098698974, "rewards/accuracy_reward": 0.6325892861466855, "sce_advantage": 41.025, "sce_advantage_std": 1.011767578125, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 832.1634309768676, "epoch": 0.4266666666666667, "grad_norm": 0.11953950673341751, "learning_rate": 2.1970847580656528e-06, "loss": 0.0529, "num_tokens": 19747280.0, "reward": 0.6272321719443426, "reward_std": 0.20688999248668552, "rewards/accuracy_reward": 0.6272321433527395, "sce_advantage": 42.6671875, "sce_advantage_std": 0.94627685546875, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 824.6692317962646, "epoch": 0.512, "grad_norm": 0.12637147307395935, "learning_rate": 1.7703825567208588e-06, "loss": 0.0526, "num_tokens": 24100554.0, "reward": 0.6368303868919611, "reward_std": 0.19028520658612252, "rewards/accuracy_reward": 0.6368303560186177, "sce_advantage": 43.525, "sce_advantage_std": 0.9552490234375, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 848.5489213943481, "epoch": 0.5973333333333334, "grad_norm": 0.12432567030191422, "learning_rate": 1.3191949796170155e-06, "loss": 0.0577, "num_tokens": 28565891.0, "reward": 0.5912946681957691, "reward_std": 0.19803369138389826, "rewards/accuracy_reward": 0.591294644260779, "sce_advantage": 43.93125, "sce_advantage_std": 0.91083984375, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 837.3781648635864, "epoch": 0.6826666666666666, "grad_norm": 0.5495607256889343, "learning_rate": 8.843807918208651e-07, "loss": 0.0474, "num_tokens": 32990269.0, "reward": 0.6285714600235224, "reward_std": 0.1882201772648841, "rewards/accuracy_reward": 0.6285714281257242, "sce_advantage": 44.7953125, "sce_advantage_std": 0.8768310546875, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 813.5683387756347, "epoch": 0.768, "grad_norm": 0.13694342970848083, "learning_rate": 5.053160126388076e-07, "loss": 0.0488, "num_tokens": 37331289.0, "reward": 0.6265625279862433, "reward_std": 0.18500199196860195, "rewards/accuracy_reward": 0.6265624999534338, "sce_advantage": 45.015625, "sce_advantage_std": 0.9173583984375, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 829.90628490448, "epoch": 0.8533333333333334, "grad_norm": 0.6106674671173096, "learning_rate": 2.163280915478289e-07, "loss": 0.0456, "num_tokens": 41709642.0, "reward": 0.6316964592551813, "reward_std": 0.21558557036332787, "rewards/accuracy_reward": 0.6316964287543669, "sce_advantage": 45.315625, "sce_advantage_std": 0.8802734375, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 836.8087438583374, "epoch": 0.9386666666666666, "grad_norm": 0.17513859272003174, "learning_rate": 4.358727386092198e-08, "loss": 0.0335, "num_tokens": 46128522.0, "reward": 0.5850446717813611, "reward_std": 0.20032540340907873, "rewards/accuracy_reward": 0.5865327379666269, "sce_advantage": 45.3296875, "sce_advantage_std": 0.87066650390625, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 833.650455156962, "epoch": 0.9898666666666667, "num_tokens": 48759009.0, "reward": 0.5859375276292363, "reward_std": 0.2100959747719268, "rewards/accuracy_reward": 0.5859375, "sce_advantage": 45.328125, "sce_advantage_std": 0.9546305338541666, "step": 58, "total_flos": 0.0, "train_loss": 0.024251502135704303, "train_runtime": 40823.4006, "train_samples_per_second": 0.184, "train_steps_per_second": 0.001 } ], "logging_steps": 5, "max_steps": 58, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }