calculator_agent_qwen2.5_3b / trainer_state.json
Dan-AiTuning's picture
model commit
0a6fb3c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0025945808738470078,
"completion_length": 170.85416666666666,
"epoch": 0.0033333333333333335,
"grad_norm": 93.89875220655702,
"kl": 385.1,
"learning_rate": 4e-07,
"loss": 0.7721,
"reward": 0.6091666718324026,
"reward_std": 0.11040117839972179,
"rewards/judge_tool_use": 0.6104166805744171,
"rewards/judge_tool_use/std": 0.23237329721450806,
"rewards/verify_correctness": 0.6041666666666666,
"rewards/verify_correctness/std": 0.331703782081604,
"step": 5
},
{
"clip_ratio": 0.0060518977232277395,
"completion_length": 177.1875,
"epoch": 0.006666666666666667,
"grad_norm": 9.776842338966992,
"kl": 112.5,
"learning_rate": 9e-07,
"loss": 0.2337,
"reward": 0.5799999833106995,
"reward_std": 0.06415978074073792,
"rewards/judge_tool_use": 0.5531249940395355,
"rewards/judge_tool_use/std": 0.42493732273578644,
"rewards/verify_correctness": 0.6875,
"rewards/verify_correctness/std": 0.457730233669281,
"step": 10
},
{
"clip_ratio": 0.0026892464607954024,
"completion_length": 93.39583333333333,
"epoch": 0.01,
"grad_norm": 9.977887566458119,
"kl": 118.95,
"learning_rate": 1e-06,
"loss": 0.2916,
"reward": 0.8125,
"reward_std": 0.13529950194060802,
"rewards/judge_tool_use": 0.843750019868215,
"rewards/judge_tool_use/std": 0.2237425111234188,
"rewards/verify_correctness": 0.6875,
"rewards/verify_correctness/std": 0.3429151177406311,
"step": 15
},
{
"clip_ratio": 0.006966326106339693,
"completion_length": 102.1875,
"epoch": 0.013333333333333334,
"grad_norm": 12.06423951719899,
"kl": 125.55,
"learning_rate": 1e-06,
"loss": 0.2913,
"reward": 0.7787500023841858,
"reward_std": 0.18160327523946762,
"rewards/judge_tool_use": 0.778124988079071,
"rewards/judge_tool_use/std": 0.3228672966361046,
"rewards/verify_correctness": 0.78125,
"rewards/verify_correctness/std": 0.375,
"step": 20
},
{
"clip_ratio": 0.0011292789597064257,
"completion_length": 124.3125,
"epoch": 0.016666666666666666,
"grad_norm": 8.195116496059761,
"kl": 61.559375,
"learning_rate": 1e-06,
"loss": 0.1389,
"reward": 0.8116666873296102,
"reward_std": 0.11048179492354393,
"rewards/judge_tool_use": 0.8270833293596903,
"rewards/judge_tool_use/std": 0.24461634953816733,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.39984261989593506,
"step": 25
},
{
"clip_ratio": 0.0029064802452921867,
"completion_length": 59.375,
"epoch": 0.02,
"grad_norm": 0.5796629563759274,
"kl": 4.3453125,
"learning_rate": 1e-06,
"loss": 0.0212,
"reward": 0.887499988079071,
"reward_std": 0.10905145853757858,
"rewards/judge_tool_use": 0.890625,
"rewards/judge_tool_use/std": 0.1675497591495514,
"rewards/verify_correctness": 0.875,
"rewards/verify_correctness/std": 0.22360680997371674,
"step": 30
},
{
"clip_ratio": 0.007409381680190563,
"completion_length": 98.375,
"epoch": 0.023333333333333334,
"grad_norm": 0.8606388492244584,
"kl": 6.91875,
"learning_rate": 1e-06,
"loss": 0.0811,
"reward": 0.8641666372617086,
"reward_std": 0.12008555854360263,
"rewards/judge_tool_use": 0.9083333412806193,
"rewards/judge_tool_use/std": 0.1753722901145617,
"rewards/verify_correctness": 0.6875,
"rewards/verify_correctness/std": 0.42213259140650433,
"step": 35
},
{
"clip_ratio": 0.004702709428966045,
"completion_length": 101.34375,
"epoch": 0.02666666666666667,
"grad_norm": 9.62952082324332,
"kl": 2.60703125,
"learning_rate": 1e-06,
"loss": 0.0232,
"reward": 0.8199999928474426,
"reward_std": 0.22975663095712662,
"rewards/judge_tool_use": 0.8531250059604645,
"rewards/judge_tool_use/std": 0.28422578424215317,
"rewards/verify_correctness": 0.6875,
"rewards/verify_correctness/std": 0.3811737895011902,
"step": 40
},
{
"clip_ratio": 0.0037392981350421904,
"completion_length": 115.72916666666667,
"epoch": 0.03,
"grad_norm": 0.41001786033101184,
"kl": 7.5859375,
"learning_rate": 1e-06,
"loss": 0.0704,
"reward": 0.812499980131785,
"reward_std": 0.11250268605848153,
"rewards/judge_tool_use": 0.84375,
"rewards/judge_tool_use/std": 0.22761926551659903,
"rewards/verify_correctness": 0.6875,
"rewards/verify_correctness/std": 0.3429151177406311,
"step": 45
},
{
"clip_ratio": 0.0033889828715473413,
"completion_length": 100.75,
"epoch": 0.03333333333333333,
"grad_norm": 3.0378548631776696,
"kl": 0.6390625,
"learning_rate": 1e-06,
"loss": 0.0521,
"reward": 0.6862499862909317,
"reward_std": 0.11575066298246384,
"rewards/judge_tool_use": 0.6937499940395355,
"rewards/judge_tool_use/std": 0.22235235385596752,
"rewards/verify_correctness": 0.65625,
"rewards/verify_correctness/std": 0.23935678601264954,
"step": 50
},
{
"clip_ratio": 0.0024284129962325097,
"completion_length": 119.375,
"epoch": 0.03666666666666667,
"grad_norm": 20.786314639465385,
"kl": 1.7916015625,
"learning_rate": 1e-06,
"loss": 0.0271,
"reward": 0.8674999872843424,
"reward_std": 0.12038270942866802,
"rewards/judge_tool_use": 0.912500003973643,
"rewards/judge_tool_use/std": 0.1864920680721601,
"rewards/verify_correctness": 0.6875,
"rewards/verify_correctness/std": 0.42213259140650433,
"step": 55
},
{
"clip_ratio": 0.006763660744763911,
"completion_length": 110.09375,
"epoch": 0.04,
"grad_norm": 109.33119098699642,
"kl": 47.1625,
"learning_rate": 1e-06,
"loss": 0.1779,
"reward": 0.7075000107288361,
"reward_std": 0.13642948493361473,
"rewards/judge_tool_use": 0.7125000059604645,
"rewards/judge_tool_use/std": 0.3916912078857422,
"rewards/verify_correctness": 0.6875,
"rewards/verify_correctness/std": 0.42898140847682953,
"step": 60
},
{
"clip_ratio": 0.0026321998797357083,
"completion_length": 103.39583333333333,
"epoch": 0.043333333333333335,
"grad_norm": 1.4069264237823462,
"kl": 2.836328125,
"learning_rate": 1e-06,
"loss": 0.0596,
"reward": 0.5699999978144964,
"reward_std": 0.1616679678360621,
"rewards/judge_tool_use": 0.5875000009934107,
"rewards/judge_tool_use/std": 0.29024802645047504,
"rewards/verify_correctness": 0.5,
"rewards/verify_correctness/std": 0.25411585966746014,
"step": 65
},
{
"clip_ratio": 0.0048656010068953036,
"completion_length": 140.875,
"epoch": 0.04666666666666667,
"grad_norm": 0.05496390689490212,
"kl": 0.383984375,
"learning_rate": 1e-06,
"loss": 0.0039,
"reward": 0.7750000059604645,
"reward_std": 0.009258201345801353,
"rewards/judge_tool_use": 0.78125,
"rewards/judge_tool_use/std": 0.22647663950920105,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 70
},
{
"clip_ratio": 0.0005703422240912915,
"completion_length": 101.45833333333333,
"epoch": 0.05,
"grad_norm": 4.7434453029775785,
"kl": 0.85390625,
"learning_rate": 1e-06,
"loss": 0.0123,
"reward": 0.8791666626930237,
"reward_std": 0.11438154180844624,
"rewards/judge_tool_use": 0.90625,
"rewards/judge_tool_use/std": 0.1892575373252233,
"rewards/verify_correctness": 0.7708333333333334,
"rewards/verify_correctness/std": 0.32623785734176636,
"step": 75
},
{
"clip_ratio": 0.004394483286887407,
"completion_length": 115.125,
"epoch": 0.05333333333333334,
"grad_norm": 18.348889188180735,
"kl": 1.1921875,
"learning_rate": 1e-06,
"loss": 0.0491,
"reward": 0.8037499785423279,
"reward_std": 0.11402197554707527,
"rewards/judge_tool_use": 0.856249988079071,
"rewards/judge_tool_use/std": 0.22299936041235924,
"rewards/verify_correctness": 0.59375,
"rewards/verify_correctness/std": 0.497555673122406,
"step": 80
},
{
"clip_ratio": 0.0019086383283138276,
"completion_length": 72.72916666666667,
"epoch": 0.056666666666666664,
"grad_norm": 0.18350366354479203,
"kl": 0.076220703125,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 0.9666666785875956,
"reward_std": 0.03248864381263653,
"rewards/judge_tool_use": 0.9895833333333334,
"rewards/judge_tool_use/std": 0.03110433618227641,
"rewards/verify_correctness": 0.875,
"rewards/verify_correctness/std": 0.16666666666666666,
"step": 85
},
{
"clip_ratio": 0.003978083655238151,
"completion_length": 128.125,
"epoch": 0.06,
"grad_norm": 0.4888077045674276,
"kl": 0.452734375,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 0.887499988079071,
"reward_std": 0.0304714092053473,
"rewards/judge_tool_use": 0.984375,
"rewards/judge_tool_use/std": 0.04136751964688301,
"rewards/verify_correctness": 0.5,
"rewards/verify_correctness/std": 0.5163977742195129,
"step": 90
},
{
"clip_ratio": 0.004061844293028116,
"completion_length": 73.95833333333333,
"epoch": 0.06333333333333334,
"grad_norm": 1.8106715962150735,
"kl": 0.293359375,
"learning_rate": 1e-06,
"loss": -0.001,
"reward": 0.850000003973643,
"reward_std": 0.03771235949049393,
"rewards/judge_tool_use": 0.9375,
"rewards/judge_tool_use/std": 0.07327633599440257,
"rewards/verify_correctness": 0.5,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 95
},
{
"clip_ratio": 0.0005249343812465668,
"completion_length": 116.4375,
"epoch": 0.06666666666666667,
"grad_norm": 0.8380959397252633,
"kl": 0.251953125,
"learning_rate": 1e-06,
"loss": 0.008,
"reward": 0.7874999940395355,
"reward_std": 0.05897941440343857,
"rewards/judge_tool_use": 0.796875,
"rewards/judge_tool_use/std": 0.2327149659395218,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 100
},
{
"clip_ratio": 0.00741090327501297,
"completion_length": 75.02083333333333,
"epoch": 0.07,
"grad_norm": 10.30093582857556,
"kl": 0.8576171875,
"learning_rate": 1e-06,
"loss": 0.0212,
"reward": 0.6974999904632568,
"reward_std": 0.15869482358296713,
"rewards/judge_tool_use": 0.7416666547457377,
"rewards/judge_tool_use/std": 0.22486203908920288,
"rewards/verify_correctness": 0.5208333333333334,
"rewards/verify_correctness/std": 0.5150477091471354,
"step": 105
},
{
"clip_ratio": 0.0045037418603897095,
"completion_length": 107.53125,
"epoch": 0.07333333333333333,
"grad_norm": 1.2618789359641311,
"kl": 0.459375,
"learning_rate": 1e-06,
"loss": 0.02,
"reward": 0.918749988079071,
"reward_std": 0.10914891492575407,
"rewards/judge_tool_use": 0.953125,
"rewards/judge_tool_use/std": 0.10675819590687752,
"rewards/verify_correctness": 0.78125,
"rewards/verify_correctness/std": 0.2561737895011902,
"step": 110
},
{
"clip_ratio": 0.0013080392964184284,
"completion_length": 83.29166666666667,
"epoch": 0.07666666666666666,
"grad_norm": 1.3187929435904935,
"kl": 6.78203125,
"learning_rate": 1e-06,
"loss": 0.0404,
"reward": 0.8991666634877523,
"reward_std": 0.10660337905089061,
"rewards/judge_tool_use": 0.931249996026357,
"rewards/judge_tool_use/std": 0.17414189875125885,
"rewards/verify_correctness": 0.7708333333333334,
"rewards/verify_correctness/std": 0.3198537329832713,
"step": 115
},
{
"clip_ratio": 0.001383163803257048,
"completion_length": 127.46875,
"epoch": 0.08,
"grad_norm": 0.07051598250723173,
"kl": 0.60703125,
"learning_rate": 1e-06,
"loss": -0.005,
"reward": 0.7400000095367432,
"reward_std": 0.05770984524860978,
"rewards/judge_tool_use": 0.7999999821186066,
"rewards/judge_tool_use/std": 0.22889777272939682,
"rewards/verify_correctness": 0.5,
"rewards/verify_correctness/std": 0.5163977742195129,
"step": 120
},
{
"clip_ratio": 0.0007298925891518592,
"completion_length": 92.72916666666667,
"epoch": 0.08333333333333333,
"grad_norm": 0.17549127232311668,
"kl": 0.2087890625,
"learning_rate": 1e-06,
"loss": 0.0009,
"reward": 0.9616666634877523,
"reward_std": 0.010886183939874172,
"rewards/judge_tool_use": 0.993749996026357,
"rewards/judge_tool_use/std": 0.01971883823474248,
"rewards/verify_correctness": 0.8333333333333334,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 125
},
{
"clip_ratio": 0.002933995798230171,
"completion_length": 81.46875,
"epoch": 0.08666666666666667,
"grad_norm": 0.4833514179614044,
"kl": 0.1775390625,
"learning_rate": 1e-06,
"loss": 0.0003,
"reward": 0.9350000023841858,
"reward_std": 0.017422057688236237,
"rewards/judge_tool_use": 0.981249988079071,
"rewards/judge_tool_use/std": 0.025000007823109627,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 130
},
{
"clip_ratio": 0.001261868537403643,
"completion_length": 122.97916666666667,
"epoch": 0.09,
"grad_norm": 0.3241107117396865,
"kl": 0.23095703125,
"learning_rate": 1e-06,
"loss": 0.0102,
"reward": 0.8791666626930237,
"reward_std": 0.057309987023472786,
"rewards/judge_tool_use": 0.9270833333333334,
"rewards/judge_tool_use/std": 0.1284285510579745,
"rewards/verify_correctness": 0.6875,
"rewards/verify_correctness/std": 0.3429151177406311,
"step": 135
},
{
"clip_ratio": 0.0008610086515545845,
"completion_length": 84.0,
"epoch": 0.09333333333333334,
"grad_norm": 7.900768391435355,
"kl": 0.234375,
"learning_rate": 1e-06,
"loss": 0.0296,
"reward": 0.918749988079071,
"reward_std": 0.11230538040399551,
"rewards/judge_tool_use": 0.921875,
"rewards/judge_tool_use/std": 0.16829413175582886,
"rewards/verify_correctness": 0.90625,
"rewards/verify_correctness/std": 0.20155644416809082,
"step": 140
},
{
"clip_ratio": 0.002098301984369755,
"completion_length": 97.4375,
"epoch": 0.09666666666666666,
"grad_norm": 116546485.23512569,
"kl": 2857376.061328125,
"learning_rate": 1e-06,
"loss": 5723.877,
"reward": 0.7324999968210856,
"reward_std": 0.1401955665399631,
"rewards/judge_tool_use": 0.8374999761581421,
"rewards/judge_tool_use/std": 0.21938102692365646,
"rewards/verify_correctness": 0.3125,
"rewards/verify_correctness/std": 0.3429151177406311,
"step": 145
},
{
"clip_ratio": 0.00661008469760418,
"completion_length": 91.9375,
"epoch": 0.1,
"grad_norm": 0.15506303872115793,
"kl": 1082.3681640625,
"learning_rate": 1e-06,
"loss": 2.2091,
"reward": 0.875,
"reward_std": 0.08185647381469607,
"rewards/judge_tool_use": 0.90625,
"rewards/judge_tool_use/std": 0.17084430158138275,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 94.04166666666667,
"epoch": 0.10333333333333333,
"grad_norm": 1.1559868262365864,
"kl": 37.68232421875,
"learning_rate": 1e-06,
"loss": 0.0821,
"reward": 0.9233333269755045,
"reward_std": 0.03246487428744634,
"rewards/judge_tool_use": 0.9458333253860474,
"rewards/judge_tool_use/std": 0.08484516913692157,
"rewards/verify_correctness": 0.8333333333333334,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 155
},
{
"clip_ratio": 0.001581813069060445,
"completion_length": 118.40625,
"epoch": 0.10666666666666667,
"grad_norm": 6.97639562650297,
"kl": 1.0310546875,
"learning_rate": 1e-06,
"loss": 0.0379,
"reward": 0.7549999952316284,
"reward_std": 0.14613103866577148,
"rewards/judge_tool_use": 0.8187499940395355,
"rewards/judge_tool_use/std": 0.20402206480503082,
"rewards/verify_correctness": 0.5,
"rewards/verify_correctness/std": 0.0,
"step": 160
},
{
"clip_ratio": 0.0015904867090284825,
"completion_length": 69.1875,
"epoch": 0.11,
"grad_norm": 0.03440563598329304,
"kl": 0.9119140625,
"learning_rate": 1e-06,
"loss": 0.0055,
"reward": 0.8925000031789144,
"reward_std": 0.06749333689610164,
"rewards/judge_tool_use": 0.9333333174387614,
"rewards/judge_tool_use/std": 0.11831362545490265,
"rewards/verify_correctness": 0.7291666666666666,
"rewards/verify_correctness/std": 0.331703782081604,
"step": 165
},
{
"clip_ratio": 0.002113501913845539,
"completion_length": 80.90625,
"epoch": 0.11333333333333333,
"grad_norm": 0.2535518240520116,
"kl": 0.256640625,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 0.9674999713897705,
"reward_std": 0.027019730769097805,
"rewards/judge_tool_use": 0.9593749940395355,
"rewards/judge_tool_use/std": 0.041013939306139946,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 170
},
{
"clip_ratio": 0.0027771067805588247,
"completion_length": 85.10416666666667,
"epoch": 0.11666666666666667,
"grad_norm": 1.4643118560278143,
"kl": 14.665625,
"learning_rate": 1e-06,
"loss": 0.0287,
"reward": 0.9225000143051147,
"reward_std": 0.08353264754017194,
"rewards/judge_tool_use": 0.981250007947286,
"rewards/judge_tool_use/std": 0.06663193802038829,
"rewards/verify_correctness": 0.6875,
"rewards/verify_correctness/std": 0.3462595542271932,
"step": 175
},
{
"clip_ratio": 0.010677224583923816,
"completion_length": 79.40625,
"epoch": 0.12,
"grad_norm": 23.600689362585086,
"kl": 22.570703125,
"learning_rate": 1e-06,
"loss": 0.0499,
"reward": 0.7824999988079071,
"reward_std": 0.0866054892539978,
"rewards/judge_tool_use": 0.9156249761581421,
"rewards/judge_tool_use/std": 0.13062315434217453,
"rewards/verify_correctness": 0.25,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 180
},
{
"clip_ratio": 0.002086438238620758,
"completion_length": 72.97916666666667,
"epoch": 0.12333333333333334,
"grad_norm": 0.3641677909401427,
"kl": 0.2119140625,
"learning_rate": 1e-06,
"loss": 0.0003,
"reward": 0.8666666547457377,
"reward_std": 0.049736435214678444,
"rewards/judge_tool_use": 0.90625,
"rewards/judge_tool_use/std": 0.11474608878294627,
"rewards/verify_correctness": 0.7083333333333334,
"rewards/verify_correctness/std": 0.21770429611206055,
"step": 185
},
{
"clip_ratio": 0.0038325218483805656,
"completion_length": 49.4375,
"epoch": 0.12666666666666668,
"grad_norm": 0.21086652879123133,
"kl": 13.8916015625,
"learning_rate": 1e-06,
"loss": 0.0143,
"reward": 0.949999988079071,
"reward_std": 0.11195331811904907,
"rewards/judge_tool_use": 0.953125,
"rewards/judge_tool_use/std": 0.12445715814828873,
"rewards/verify_correctness": 0.9375,
"rewards/verify_correctness/std": 0.17078252136707306,
"step": 190
},
{
"clip_ratio": 0.0009160305373370648,
"completion_length": 68.39583333333333,
"epoch": 0.13,
"grad_norm": 41.93084775556413,
"kl": 4.29609375,
"learning_rate": 1e-06,
"loss": 0.007,
"reward": 0.8799999952316284,
"reward_std": 0.012344265977541605,
"rewards/judge_tool_use": 0.975000003973643,
"rewards/judge_tool_use/std": 0.03998426472147306,
"rewards/verify_correctness": 0.5,
"rewards/verify_correctness/std": 0.5163977742195129,
"step": 195
},
{
"clip_ratio": 0.0023942343890666963,
"completion_length": 104.5,
"epoch": 0.13333333333333333,
"grad_norm": 11.56654747316538,
"kl": 9.48359375,
"learning_rate": 1e-06,
"loss": 0.0082,
"reward": 0.7549999952316284,
"reward_std": 0.06465663947165012,
"rewards/judge_tool_use": 0.8031249940395355,
"rewards/judge_tool_use/std": 0.22497396357357502,
"rewards/verify_correctness": 0.5625,
"rewards/verify_correctness/std": 0.5123475790023804,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 55.666666666666664,
"epoch": 0.13666666666666666,
"grad_norm": 0.044316960076541095,
"kl": 0.21640625,
"learning_rate": 1e-06,
"loss": -0.07,
"reward": 0.8816666603088379,
"reward_std": 0.052872808650135994,
"rewards/judge_tool_use": 0.8520833452542623,
"rewards/judge_tool_use/std": 0.179916741947333,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 205
},
{
"clip_ratio": 0.00298199572134763,
"completion_length": 87.1875,
"epoch": 0.14,
"grad_norm": 0.5961166297628009,
"kl": 0.8013671875,
"learning_rate": 1e-06,
"loss": 0.0512,
"reward": 0.9087499976158142,
"reward_std": 0.10966175608336926,
"rewards/judge_tool_use": 0.9406249821186066,
"rewards/judge_tool_use/std": 0.16376879438757896,
"rewards/verify_correctness": 0.78125,
"rewards/verify_correctness/std": 0.375,
"step": 210
},
{
"clip_ratio": 0.0016499520279467106,
"completion_length": 90.83333333333333,
"epoch": 0.14333333333333334,
"grad_norm": 4.675832955995631,
"kl": 1.3087890625,
"learning_rate": 1e-06,
"loss": 0.041,
"reward": 0.9633333285649618,
"reward_std": 0.06979535892605782,
"rewards/judge_tool_use": 0.975000003973643,
"rewards/judge_tool_use/std": 0.07252075274785359,
"rewards/verify_correctness": 0.9166666666666666,
"rewards/verify_correctness/std": 0.22771002848943075,
"step": 215
},
{
"clip_ratio": 0.0009900989942252636,
"completion_length": 57.9375,
"epoch": 0.14666666666666667,
"grad_norm": 11.064991822281403,
"kl": 14.7765625,
"learning_rate": 1e-06,
"loss": -0.0021,
"reward": 0.9149999916553497,
"reward_std": 0.10488088428974152,
"rewards/judge_tool_use": 0.925000011920929,
"rewards/judge_tool_use/std": 0.1612451672554016,
"rewards/verify_correctness": 0.875,
"rewards/verify_correctness/std": 0.22360680997371674,
"step": 220
},
{
"clip_ratio": 0.006531355949118733,
"completion_length": 75.72916666666667,
"epoch": 0.15,
"grad_norm": 1721.9681952666701,
"kl": 28.7462890625,
"learning_rate": 1e-06,
"loss": 0.046,
"reward": 0.712499996026357,
"reward_std": 0.20226633052031198,
"rewards/judge_tool_use": 0.6875,
"rewards/judge_tool_use/std": 0.2489463413755099,
"rewards/verify_correctness": 0.8125,
"rewards/verify_correctness/std": 0.28463754057884216,
"step": 225
},
{
"clip_ratio": 0.001966949412599206,
"completion_length": 78.625,
"epoch": 0.15333333333333332,
"grad_norm": 0.43853112182902554,
"kl": 4.1453125,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 0.679999977350235,
"reward_std": 0.14663636311888695,
"rewards/judge_tool_use": 0.6781250238418579,
"rewards/judge_tool_use/std": 0.4121476113796234,
"rewards/verify_correctness": 0.6875,
"rewards/verify_correctness/std": 0.42898140847682953,
"step": 230
},
{
"clip_ratio": 0.0018659377470612525,
"completion_length": 114.02083333333333,
"epoch": 0.15666666666666668,
"grad_norm": 3.100123205393985,
"kl": 3.02109375,
"learning_rate": 1e-06,
"loss": 0.0543,
"reward": 0.809166669845581,
"reward_std": 0.1575567809243997,
"rewards/judge_tool_use": 0.881250003973643,
"rewards/judge_tool_use/std": 0.19285393754641214,
"rewards/verify_correctness": 0.5208333333333334,
"rewards/verify_correctness/std": 0.36932093898455304,
"step": 235
},
{
"clip_ratio": 0.0018544910941272973,
"completion_length": 81.125,
"epoch": 0.16,
"grad_norm": 2.424569092207152,
"kl": 0.627734375,
"learning_rate": 1e-06,
"loss": 0.0332,
"reward": 0.7649999856948853,
"reward_std": 0.18427922576665878,
"rewards/judge_tool_use": 0.815625011920929,
"rewards/judge_tool_use/std": 0.2867114394903183,
"rewards/verify_correctness": 0.5625,
"rewards/verify_correctness/std": 0.5081988871097565,
"step": 240
},
{
"clip_ratio": 0.0013313586998265237,
"completion_length": 96.47916666666667,
"epoch": 0.16333333333333333,
"grad_norm": 1.905174844858326,
"kl": 0.31015625,
"learning_rate": 1e-06,
"loss": 0.0109,
"reward": 0.856666644414266,
"reward_std": 0.09935928011933963,
"rewards/judge_tool_use": 0.9145833253860474,
"rewards/judge_tool_use/std": 0.15078541884819666,
"rewards/verify_correctness": 0.625,
"rewards/verify_correctness/std": 0.4878704647223155,
"step": 245
},
{
"clip_ratio": 0.0009706525830551982,
"completion_length": 72.53125,
"epoch": 0.16666666666666666,
"grad_norm": 0.6805601508978978,
"kl": 0.573828125,
"learning_rate": 1e-06,
"loss": 0.0431,
"reward": 0.9312500059604645,
"reward_std": 0.06104740500450134,
"rewards/judge_tool_use": 0.96875,
"rewards/judge_tool_use/std": 0.09979145228862762,
"rewards/verify_correctness": 0.78125,
"rewards/verify_correctness/std": 0.2561737895011902,
"step": 250
},
{
"clip_ratio": 0.001953125,
"completion_length": 80.35416666666667,
"epoch": 0.17,
"grad_norm": 0.12919899479086885,
"kl": 0.260546875,
"learning_rate": 1e-06,
"loss": 0.0003,
"reward": 0.940833330154419,
"reward_std": 0.028137820462385815,
"rewards/judge_tool_use": 0.9937500158945719,
"rewards/judge_tool_use/std": 0.013437099754810333,
"rewards/verify_correctness": 0.7291666666666666,
"rewards/verify_correctness/std": 0.331703782081604,
"step": 255
},
{
"clip_ratio": 0.005703368596732617,
"completion_length": 83.21875,
"epoch": 0.17333333333333334,
"grad_norm": 1.946076353704121,
"kl": 0.2544921875,
"learning_rate": 1e-06,
"loss": -0.0004,
"reward": 0.9000000059604645,
"reward_std": 0.05235438700765371,
"rewards/judge_tool_use": 0.9375,
"rewards/judge_tool_use/std": 0.11046760901808739,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 83.70833333333333,
"epoch": 0.17666666666666667,
"grad_norm": 5.214532115094087,
"kl": 50.934375,
"learning_rate": 1e-06,
"loss": 0.0987,
"reward": 0.9424999952316284,
"reward_std": 0.05198417603969574,
"rewards/judge_tool_use": 0.9749999841054281,
"rewards/judge_tool_use/std": 0.06610877811908722,
"rewards/verify_correctness": 0.8125,
"rewards/verify_correctness/std": 0.1707825263341268,
"step": 265
},
{
"clip_ratio": 0.0017290424089878797,
"completion_length": 84.71875,
"epoch": 0.18,
"grad_norm": 0.2231498531869626,
"kl": 0.3408203125,
"learning_rate": 1e-06,
"loss": -0.0023,
"reward": 0.9437499940395355,
"reward_std": 0.01767767034471035,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 0.71875,
"rewards/verify_correctness/std": 0.38319888710975647,
"step": 270
},
{
"clip_ratio": 0.0024591220542788507,
"completion_length": 99.5,
"epoch": 0.18333333333333332,
"grad_norm": 0.6587571271446878,
"kl": 1.0974609375,
"learning_rate": 1e-06,
"loss": -0.0007,
"reward": 0.7808333237965902,
"reward_std": 0.12485803912083308,
"rewards/judge_tool_use": 0.887500007947286,
"rewards/judge_tool_use/std": 0.22067607939243317,
"rewards/verify_correctness": 0.3541666666666667,
"rewards/verify_correctness/std": 0.3429151177406311,
"step": 275
},
{
"clip_ratio": 0.0030754867941141127,
"completion_length": 77.15625,
"epoch": 0.18666666666666668,
"grad_norm": 0.19350548920635696,
"kl": 0.34140625,
"learning_rate": 1e-06,
"loss": 0.0008,
"reward": 0.8725000023841858,
"reward_std": 0.055471993051469326,
"rewards/judge_tool_use": 0.949999988079071,
"rewards/judge_tool_use/std": 0.08520798571407795,
"rewards/verify_correctness": 0.5625,
"rewards/verify_correctness/std": 0.5081988871097565,
"step": 280
},
{
"clip_ratio": 0.002323679253458977,
"completion_length": 105.39583333333333,
"epoch": 0.19,
"grad_norm": 0.3062492173104658,
"kl": 373.6810546875,
"learning_rate": 1e-06,
"loss": 0.769,
"reward": 0.9183333317438761,
"reward_std": 0.08563666356106599,
"rewards/judge_tool_use": 0.9604166746139526,
"rewards/judge_tool_use/std": 0.10187822952866554,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.305153489112854,
"step": 285
},
{
"clip_ratio": 0.0027462080586701633,
"completion_length": 64.71875,
"epoch": 0.19333333333333333,
"grad_norm": 1.907689788676722,
"kl": 3.71171875,
"learning_rate": 1e-06,
"loss": 0.0063,
"reward": 0.9237499833106995,
"reward_std": 0.14122167974710464,
"rewards/judge_tool_use": 0.9281249940395355,
"rewards/judge_tool_use/std": 0.19741450250148773,
"rewards/verify_correctness": 0.90625,
"rewards/verify_correctness/std": 0.29578252136707306,
"step": 290
},
{
"clip_ratio": 0.0014276792760938406,
"completion_length": 110.125,
"epoch": 0.19666666666666666,
"grad_norm": 5.415532630231474,
"kl": 0.4234375,
"learning_rate": 1e-06,
"loss": -0.0077,
"reward": 0.8458333015441895,
"reward_std": 0.2074962705373764,
"rewards/judge_tool_use": 0.875,
"rewards/judge_tool_use/std": 0.2852979749441147,
"rewards/verify_correctness": 0.7291666666666666,
"rewards/verify_correctness/std": 0.4045371313889821,
"step": 295
},
{
"clip_ratio": 0.001273171789944172,
"completion_length": 84.8125,
"epoch": 0.2,
"grad_norm": 0.7050928593838439,
"kl": 0.2220703125,
"learning_rate": 1e-06,
"loss": 0.0415,
"reward": 0.9537499845027924,
"reward_std": 0.09016102831810713,
"rewards/judge_tool_use": 0.949999988079071,
"rewards/judge_tool_use/std": 0.13606470078229904,
"rewards/verify_correctness": 0.96875,
"rewards/verify_correctness/std": 0.125,
"step": 300
},
{
"clip_ratio": 0.00012106538051739335,
"completion_length": 71.79166666666667,
"epoch": 0.20333333333333334,
"grad_norm": 0.3670228135757323,
"kl": 1.64921875,
"learning_rate": 1e-06,
"loss": 0.0174,
"reward": 0.9508333206176758,
"reward_std": 0.07014790053168933,
"rewards/judge_tool_use": 0.975000003973643,
"rewards/judge_tool_use/std": 0.08843709776798885,
"rewards/verify_correctness": 0.8541666666666666,
"rewards/verify_correctness/std": 0.25,
"step": 305
},
{
"clip_ratio": 0.0013387146405875682,
"completion_length": 55.5625,
"epoch": 0.20666666666666667,
"grad_norm": 0.16146643171622213,
"kl": 0.2376953125,
"learning_rate": 1e-06,
"loss": -0.0024,
"reward": 0.9587499797344208,
"reward_std": 0.0969240814447403,
"rewards/judge_tool_use": 0.971875011920929,
"rewards/judge_tool_use/std": 0.09994790703058243,
"rewards/verify_correctness": 0.90625,
"rewards/verify_correctness/std": 0.20155644416809082,
"step": 310
},
{
"clip_ratio": 0.0017094017937779426,
"completion_length": 63.395833333333336,
"epoch": 0.21,
"grad_norm": 1.5235387274657526,
"kl": 76.58203125,
"learning_rate": 1e-06,
"loss": 0.1523,
"reward": 0.9824999968210856,
"reward_std": 0.049497475226720176,
"rewards/judge_tool_use": 0.9833333293596903,
"rewards/judge_tool_use/std": 0.06666666766007741,
"rewards/verify_correctness": 0.9791666666666666,
"rewards/verify_correctness/std": 0.08333333333333333,
"step": 315
},
{
"clip_ratio": 0.0032774390652775764,
"completion_length": 68.6875,
"epoch": 0.21333333333333335,
"grad_norm": 2612.164532527323,
"kl": 182.58828125,
"learning_rate": 1e-06,
"loss": 0.373,
"reward": 0.9112499952316284,
"reward_std": 0.09157592756673694,
"rewards/judge_tool_use": 0.9437499940395355,
"rewards/judge_tool_use/std": 0.15795189142227173,
"rewards/verify_correctness": 0.78125,
"rewards/verify_correctness/std": 0.2561737895011902,
"step": 320
},
{
"clip_ratio": 0.0010407064110040665,
"completion_length": 76.45833333333333,
"epoch": 0.21666666666666667,
"grad_norm": 2.421529558975116,
"kl": 0.40625,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 0.8883333404858907,
"reward_std": 0.043602497316896915,
"rewards/judge_tool_use": 0.9437499841054281,
"rewards/judge_tool_use/std": 0.08793675154447556,
"rewards/verify_correctness": 0.6666666666666666,
"rewards/verify_correctness/std": 0.3442651828130086,
"step": 325
},
{
"clip_ratio": 0.005306883063167333,
"completion_length": 79.75,
"epoch": 0.22,
"grad_norm": 15.808736875255741,
"kl": 5.0265625,
"learning_rate": 1e-06,
"loss": 0.0468,
"reward": 0.7362499833106995,
"reward_std": 0.2673564925789833,
"rewards/judge_tool_use": 0.7718749940395355,
"rewards/judge_tool_use/std": 0.3598140925168991,
"rewards/verify_correctness": 0.59375,
"rewards/verify_correctness/std": 0.4515564441680908,
"step": 330
},
{
"clip_ratio": 0.0012897307053208352,
"completion_length": 92.375,
"epoch": 0.22333333333333333,
"grad_norm": 0.8262876073255261,
"kl": 0.42890625,
"learning_rate": 1e-06,
"loss": 0.0047,
"reward": 0.8600000143051147,
"reward_std": 0.08047416061162949,
"rewards/judge_tool_use": 0.9499999682108561,
"rewards/judge_tool_use/std": 0.13221755623817444,
"rewards/verify_correctness": 0.5,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 335
},
{
"clip_ratio": 0.0010268327314406633,
"completion_length": 79.5625,
"epoch": 0.22666666666666666,
"grad_norm": 0.14720074644097367,
"kl": 0.2916015625,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 0.9862499833106995,
"reward_std": 0.0176776722073555,
"rewards/judge_tool_use": 0.9906249940395355,
"rewards/judge_tool_use/std": 0.0201556496322155,
"rewards/verify_correctness": 0.96875,
"rewards/verify_correctness/std": 0.125,
"step": 340
},
{
"clip_ratio": 0.0009512485004961491,
"completion_length": 95.85416666666667,
"epoch": 0.23,
"grad_norm": 0.05737038924876427,
"kl": 0.54521484375,
"learning_rate": 1e-06,
"loss": 0.0011,
"reward": 0.9416666825612386,
"reward_std": 0.01843047762910525,
"rewards/judge_tool_use": 0.9791666666666666,
"rewards/judge_tool_use/std": 0.028598766773939133,
"rewards/verify_correctness": 0.7916666666666666,
"rewards/verify_correctness/std": 0.28598760565121967,
"step": 345
},
{
"clip_ratio": 0.0005090909078717231,
"completion_length": 84.53125,
"epoch": 0.23333333333333334,
"grad_norm": 0.10503333394399555,
"kl": 0.4201171875,
"learning_rate": 1e-06,
"loss": 0.0011,
"reward": 0.9537500143051147,
"reward_std": 0.019955309107899666,
"rewards/judge_tool_use": 0.996874988079071,
"rewards/judge_tool_use/std": 0.012500002980232239,
"rewards/verify_correctness": 0.78125,
"rewards/verify_correctness/std": 0.2561737895011902,
"step": 350
},
{
"clip_ratio": 0.00019607844296842813,
"completion_length": 108.35416666666667,
"epoch": 0.23666666666666666,
"grad_norm": 1.7884752441260985,
"kl": 0.4283203125,
"learning_rate": 1e-06,
"loss": 0.0359,
"reward": 0.9358333547910055,
"reward_std": 0.08353088547786076,
"rewards/judge_tool_use": 0.956249992052714,
"rewards/judge_tool_use/std": 0.08920949697494507,
"rewards/verify_correctness": 0.8541666666666666,
"rewards/verify_correctness/std": 0.3462595542271932,
"step": 355
},
{
"clip_ratio": 0.0012887715362012385,
"completion_length": 85.84375,
"epoch": 0.24,
"grad_norm": 2.9967592204893183,
"kl": 2.588671875,
"learning_rate": 1e-06,
"loss": 0.0506,
"reward": 0.875,
"reward_std": 0.09827076643705368,
"rewards/judge_tool_use": 0.890625,
"rewards/judge_tool_use/std": 0.19682374596595764,
"rewards/verify_correctness": 0.8125,
"rewards/verify_correctness/std": 0.25,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 53.791666666666664,
"epoch": 0.24333333333333335,
"grad_norm": 0.05438651543966187,
"kl": 0.29921875,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 1.0,
"reward_std": 0.0,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 365
},
{
"clip_ratio": 0.0004950494971126318,
"completion_length": 79.75,
"epoch": 0.24666666666666667,
"grad_norm": 0.10471062504755536,
"kl": 0.2140625,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 0.9950000047683716,
"reward_std": 0.009258206002414227,
"rewards/judge_tool_use": 0.9937500059604645,
"rewards/judge_tool_use/std": 0.017078254371881485,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 370
},
{
"clip_ratio": 0.0008534850552678108,
"completion_length": 71.75,
"epoch": 0.25,
"grad_norm": 0.05059977006337808,
"kl": 0.28515625,
"learning_rate": 1e-06,
"loss": 0.0011,
"reward": 0.9599999984105428,
"reward_std": 0.007126967112223308,
"rewards/judge_tool_use": 0.9916666746139526,
"rewards/judge_tool_use/std": 0.01490712414185206,
"rewards/verify_correctness": 0.8333333333333334,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 375
},
{
"clip_ratio": 0.0021406911546364427,
"completion_length": 131.09375,
"epoch": 0.25333333333333335,
"grad_norm": 0.2406966009370665,
"kl": 8.698828125,
"learning_rate": 1e-06,
"loss": 0.0463,
"reward": 0.8287499845027924,
"reward_std": 0.10335364565253258,
"rewards/judge_tool_use": 0.856249988079071,
"rewards/judge_tool_use/std": 0.17525622248649597,
"rewards/verify_correctness": 0.71875,
"rewards/verify_correctness/std": 0.42695631086826324,
"step": 380
},
{
"clip_ratio": 0.0003087012562900782,
"completion_length": 81.6875,
"epoch": 0.25666666666666665,
"grad_norm": 0.07653342987035294,
"kl": 0.242578125,
"learning_rate": 1e-06,
"loss": -0.0002,
"reward": 0.9124999841054281,
"reward_std": 0.04729796418299278,
"rewards/judge_tool_use": 0.96875,
"rewards/judge_tool_use/std": 0.0737380584081014,
"rewards/verify_correctness": 0.6875,
"rewards/verify_correctness/std": 0.08333333333333333,
"step": 385
},
{
"clip_ratio": 0.00035750765819102525,
"completion_length": 107.46875,
"epoch": 0.26,
"grad_norm": 0.06597534204621332,
"kl": 0.22822265625,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 0.9474999904632568,
"reward_std": 0.007071072701364756,
"rewards/judge_tool_use": 0.996874988079071,
"rewards/judge_tool_use/std": 0.012500002980232239,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 390
},
{
"clip_ratio": 0.0005074221640825272,
"completion_length": 74.77083333333333,
"epoch": 0.2633333333333333,
"grad_norm": 0.23933590665982507,
"kl": 0.2693359375,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 0.9716666539510092,
"reward_std": 0.023570228057603042,
"rewards/judge_tool_use": 0.9854166507720947,
"rewards/judge_tool_use/std": 0.02500000720222791,
"rewards/verify_correctness": 0.9166666666666666,
"rewards/verify_correctness/std": 0.1490712066491445,
"step": 395
},
{
"clip_ratio": 0.0010744871804490685,
"completion_length": 89.375,
"epoch": 0.26666666666666666,
"grad_norm": 1.1698863583293904,
"kl": 0.276171875,
"learning_rate": 1e-06,
"loss": 0.0256,
"reward": 0.9025000035762787,
"reward_std": 0.08742741448804736,
"rewards/judge_tool_use": 0.9249999821186066,
"rewards/judge_tool_use/std": 0.1527007520198822,
"rewards/verify_correctness": 0.8125,
"rewards/verify_correctness/std": 0.25,
"step": 400
},
{
"clip_ratio": 0.0018488712608814239,
"completion_length": 80.66666666666667,
"epoch": 0.27,
"grad_norm": 1.4081894468368357,
"kl": 0.41796875,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 0.8766666650772095,
"reward_std": 0.05552822661896547,
"rewards/judge_tool_use": 0.9291666746139526,
"rewards/judge_tool_use/std": 0.07800610611836116,
"rewards/verify_correctness": 0.6666666666666666,
"rewards/verify_correctness/std": 0.0,
"step": 405
},
{
"clip_ratio": 0.00013927576364949345,
"completion_length": 79.5625,
"epoch": 0.2733333333333333,
"grad_norm": 0.36461709597810504,
"kl": 0.19462890625,
"learning_rate": 1e-06,
"loss": -0.0021,
"reward": 0.9662500023841858,
"reward_std": 0.02875388413667679,
"rewards/judge_tool_use": 0.996874988079071,
"rewards/judge_tool_use/std": 0.012500002980232239,
"rewards/verify_correctness": 0.84375,
"rewards/verify_correctness/std": 0.23935678601264954,
"step": 410
},
{
"clip_ratio": 0.0013018524972721935,
"completion_length": 113.125,
"epoch": 0.27666666666666667,
"grad_norm": 0.3280852961847766,
"kl": 4.0763671875,
"learning_rate": 1e-06,
"loss": 0.0044,
"reward": 0.8483333190282186,
"reward_std": 0.150167316198349,
"rewards/judge_tool_use": 0.9041666388511658,
"rewards/judge_tool_use/std": 0.16761433954040209,
"rewards/verify_correctness": 0.625,
"rewards/verify_correctness/std": 0.4262484510739644,
"step": 415
},
{
"clip_ratio": 0.001338357198983431,
"completion_length": 90.0,
"epoch": 0.28,
"grad_norm": 0.6117562389637614,
"kl": 0.2181640625,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 0.9912500083446503,
"reward_std": 0.02474873699247837,
"rewards/judge_tool_use": 0.996874988079071,
"rewards/judge_tool_use/std": 0.012500002980232239,
"rewards/verify_correctness": 0.96875,
"rewards/verify_correctness/std": 0.125,
"step": 420
},
{
"clip_ratio": 0.0008633675985038281,
"completion_length": 335.8541666666667,
"epoch": 0.2833333333333333,
"grad_norm": 0.8108468649324869,
"kl": 0.4734375,
"learning_rate": 1e-06,
"loss": 0.0387,
"reward": 0.6941666603088379,
"reward_std": 0.24669699867566428,
"rewards/judge_tool_use": 0.7895833253860474,
"rewards/judge_tool_use/std": 0.2617962161699931,
"rewards/verify_correctness": 0.3125,
"rewards/verify_correctness/std": 0.3429151177406311,
"step": 425
},
{
"clip_ratio": 0.004640390491113066,
"completion_length": 123.375,
"epoch": 0.2866666666666667,
"grad_norm": 0.4713406045119873,
"kl": 1.093359375,
"learning_rate": 1e-06,
"loss": -0.0019,
"reward": 0.7487500011920929,
"reward_std": 0.2284143902361393,
"rewards/judge_tool_use": 0.7406249940395355,
"rewards/judge_tool_use/std": 0.35302741825580597,
"rewards/verify_correctness": 0.78125,
"rewards/verify_correctness/std": 0.375,
"step": 430
},
{
"clip_ratio": 0.0008070833748206497,
"completion_length": 153.9375,
"epoch": 0.29,
"grad_norm": 1.9592742170228072,
"kl": 1.117578125,
"learning_rate": 1e-06,
"loss": 0.0878,
"reward": 0.9316666523615519,
"reward_std": 0.15917644401391348,
"rewards/judge_tool_use": 0.9354166587193807,
"rewards/judge_tool_use/std": 0.2003726214170456,
"rewards/verify_correctness": 0.9166666666666666,
"rewards/verify_correctness/std": 0.21770429611206055,
"step": 435
},
{
"clip_ratio": 0.0018505133455619216,
"completion_length": 173.0,
"epoch": 0.29333333333333333,
"grad_norm": 1.383857692998585,
"kl": 0.4033203125,
"learning_rate": 1e-06,
"loss": 0.031,
"reward": 0.7675000131130219,
"reward_std": 0.1635022610425949,
"rewards/judge_tool_use": 0.8499999940395355,
"rewards/judge_tool_use/std": 0.2784065455198288,
"rewards/verify_correctness": 0.4375,
"rewards/verify_correctness/std": 0.47360680997371674,
"step": 440
},
{
"clip_ratio": 0.0004979253280907869,
"completion_length": 117.91666666666667,
"epoch": 0.2966666666666667,
"grad_norm": 14.897280067316444,
"kl": 1.298046875,
"learning_rate": 1e-06,
"loss": 0.0177,
"reward": 0.9633333285649618,
"reward_std": 0.06114211150755485,
"rewards/judge_tool_use": 0.9645833174387614,
"rewards/judge_tool_use/std": 0.09941734870274861,
"rewards/verify_correctness": 0.9583333333333334,
"rewards/verify_correctness/std": 0.11385501424471538,
"step": 445
},
{
"clip_ratio": 0.001424700953066349,
"completion_length": 78.5625,
"epoch": 0.3,
"grad_norm": 0.046388448519732395,
"kl": 0.276171875,
"learning_rate": 1e-06,
"loss": 0.0166,
"reward": 0.9975000023841858,
"reward_std": 0.007071069907397032,
"rewards/judge_tool_use": 0.996874988079071,
"rewards/judge_tool_use/std": 0.012500002980232239,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 450
},
{
"clip_ratio": 0.0008928571827709675,
"completion_length": 70.1875,
"epoch": 0.30333333333333334,
"grad_norm": 13.949856199547733,
"kl": 0.98984375,
"learning_rate": 1e-06,
"loss": 0.0114,
"reward": 0.9741666714350382,
"reward_std": 0.04652188221613566,
"rewards/judge_tool_use": 0.9729166626930237,
"rewards/judge_tool_use/std": 0.06579288840293884,
"rewards/verify_correctness": 0.9791666666666666,
"rewards/verify_correctness/std": 0.08333333333333333,
"step": 455
},
{
"clip_ratio": 0.001224489789456129,
"completion_length": 77.1875,
"epoch": 0.30666666666666664,
"grad_norm": 0.08400978434221483,
"kl": 0.5431640625,
"learning_rate": 1e-06,
"loss": 0.0005,
"reward": 0.9350000023841858,
"reward_std": 0.009258206002414227,
"rewards/judge_tool_use": 0.981249988079071,
"rewards/judge_tool_use/std": 0.025000007823109627,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 460
},
{
"clip_ratio": 0.0007088846992701292,
"completion_length": 106.10416666666667,
"epoch": 0.31,
"grad_norm": 1.2873129844422142,
"kl": 0.703125,
"learning_rate": 1e-06,
"loss": 0.0265,
"reward": 0.8833333253860474,
"reward_std": 0.10906451940536499,
"rewards/judge_tool_use": 0.9166666666666666,
"rewards/judge_tool_use/std": 0.185252716143926,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.3303537170092265,
"step": 465
},
{
"clip_ratio": 0.0028592190705239774,
"completion_length": 101.875,
"epoch": 0.31333333333333335,
"grad_norm": 0.20363038570520958,
"kl": 0.36171875,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 0.9199999868869781,
"reward_std": 0.017422062810510397,
"rewards/judge_tool_use": 0.9624999761581421,
"rewards/judge_tool_use/std": 0.04955306649208069,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 470
},
{
"clip_ratio": 0.0005361930467188359,
"completion_length": 84.14583333333333,
"epoch": 0.31666666666666665,
"grad_norm": 0.8174753094540738,
"kl": 0.2626953125,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 0.9816666841506958,
"reward_std": 0.028729441886146862,
"rewards/judge_tool_use": 0.987500011920929,
"rewards/judge_tool_use/std": 0.026292627056439716,
"rewards/verify_correctness": 0.9583333333333334,
"rewards/verify_correctness/std": 0.11385501424471538,
"step": 475
},
{
"clip_ratio": 0.0010484508238732815,
"completion_length": 83.1875,
"epoch": 0.32,
"grad_norm": 4.869255272848249,
"kl": 0.59296875,
"learning_rate": 1e-06,
"loss": -0.0054,
"reward": 0.918749988079071,
"reward_std": 0.07140177488327026,
"rewards/judge_tool_use": 0.9375,
"rewards/judge_tool_use/std": 0.11180340498685837,
"rewards/verify_correctness": 0.84375,
"rewards/verify_correctness/std": 0.23935678601264954,
"step": 480
},
{
"clip_ratio": 0.0041505326051265,
"completion_length": 104.0625,
"epoch": 0.3233333333333333,
"grad_norm": 0.7259306381788916,
"kl": 0.31796875,
"learning_rate": 1e-06,
"loss": -0.0027,
"reward": 0.9758333365122477,
"reward_std": 0.028907646735509235,
"rewards/judge_tool_use": 0.9854166706403097,
"rewards/judge_tool_use/std": 0.032623790204524994,
"rewards/verify_correctness": 0.9375,
"rewards/verify_correctness/std": 0.1343709627787272,
"step": 485
},
{
"clip_ratio": 0.002794364234432578,
"completion_length": 167.9375,
"epoch": 0.32666666666666666,
"grad_norm": 0.07267427706403441,
"kl": 0.27734375,
"learning_rate": 1e-06,
"loss": 0.0062,
"reward": 0.9162499904632568,
"reward_std": 0.0662735546939075,
"rewards/judge_tool_use": 0.965624988079071,
"rewards/judge_tool_use/std": 0.056753065437078476,
"rewards/verify_correctness": 0.71875,
"rewards/verify_correctness/std": 0.2561737895011902,
"step": 490
},
{
"clip_ratio": 0.00040650404989719393,
"completion_length": 100.45833333333333,
"epoch": 0.33,
"grad_norm": 0.2878900699504781,
"kl": 0.34296875,
"learning_rate": 1e-06,
"loss": 0.0032,
"reward": 0.9416666626930237,
"reward_std": 0.07133257389068604,
"rewards/judge_tool_use": 0.9479166666666666,
"rewards/judge_tool_use/std": 0.10411662111679713,
"rewards/verify_correctness": 0.9166666666666666,
"rewards/verify_correctness/std": 0.21770429611206055,
"step": 495
},
{
"clip_ratio": 0.003871983336284757,
"completion_length": 90.59375,
"epoch": 0.3333333333333333,
"grad_norm": 0.774210737681418,
"kl": 0.2849609375,
"learning_rate": 1e-06,
"loss": 0.0051,
"reward": 0.9049999713897705,
"reward_std": 0.05290384031832218,
"rewards/judge_tool_use": 0.9749999642372131,
"rewards/judge_tool_use/std": 0.038117386400699615,
"rewards/verify_correctness": 0.625,
"rewards/verify_correctness/std": 0.36435678601264954,
"step": 500
},
{
"clip_ratio": 0.0014365109149366618,
"completion_length": 163.0,
"epoch": 0.33666666666666667,
"grad_norm": 5.746402974167011,
"kl": 0.805078125,
"learning_rate": 1e-06,
"loss": 0.0094,
"reward": 0.815833330154419,
"reward_std": 0.11187712320437034,
"rewards/judge_tool_use": 0.8583333094914755,
"rewards/judge_tool_use/std": 0.2133141408363978,
"rewards/verify_correctness": 0.6458333333333334,
"rewards/verify_correctness/std": 0.3429151177406311,
"step": 505
},
{
"clip_ratio": 0.000518302945420146,
"completion_length": 86.75,
"epoch": 0.34,
"grad_norm": 0.0989351605838229,
"kl": 0.43984375,
"learning_rate": 1e-06,
"loss": 0.0042,
"reward": 1.0,
"reward_std": 0.0,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 510
},
{
"clip_ratio": 0.0005924170836806297,
"completion_length": 62.0,
"epoch": 0.3433333333333333,
"grad_norm": 0.78032389511651,
"kl": 0.367578125,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 0.9866666595141093,
"reward_std": 0.02108185241619746,
"rewards/judge_tool_use": 0.993749996026357,
"rewards/judge_tool_use/std": 0.013437099754810333,
"rewards/verify_correctness": 0.9583333333333334,
"rewards/verify_correctness/std": 0.11385501424471538,
"step": 515
},
{
"clip_ratio": 0.004682651488110423,
"completion_length": 106.53125,
"epoch": 0.3466666666666667,
"grad_norm": 1.1243616090267425,
"kl": 0.57734375,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 0.9049999713897705,
"reward_std": 0.060222613625228405,
"rewards/judge_tool_use": 0.9593749940395355,
"rewards/judge_tool_use/std": 0.049755578860640526,
"rewards/verify_correctness": 0.6875,
"rewards/verify_correctness/std": 0.25,
"step": 520
},
{
"clip_ratio": 8.077544625848532e-05,
"completion_length": 90.0625,
"epoch": 0.35,
"grad_norm": 0.9496683027092206,
"kl": 0.254296875,
"learning_rate": 1e-06,
"loss": 0.0041,
"reward": 0.8733333349227905,
"reward_std": 0.10902170836925507,
"rewards/judge_tool_use": 0.9145833253860474,
"rewards/judge_tool_use/std": 0.13846262296040854,
"rewards/verify_correctness": 0.7083333333333334,
"rewards/verify_correctness/std": 0.2939421534538269,
"step": 525
},
{
"clip_ratio": 0.0037049442529678346,
"completion_length": 77.6875,
"epoch": 0.35333333333333333,
"grad_norm": 0.04794774304730862,
"kl": 0.398828125,
"learning_rate": 1e-06,
"loss": -0.0013,
"reward": 0.9287499785423279,
"reward_std": 0.033716630190610886,
"rewards/judge_tool_use": 0.965624988079071,
"rewards/judge_tool_use/std": 0.023935683071613312,
"rewards/verify_correctness": 0.78125,
"rewards/verify_correctness/std": 0.2561737895011902,
"step": 530
},
{
"clip_ratio": 0.0026162526570260524,
"completion_length": 79.64583333333333,
"epoch": 0.3566666666666667,
"grad_norm": 0.41535999227857473,
"kl": 155.844140625,
"learning_rate": 1e-06,
"loss": 0.3488,
"reward": 0.9000000158945719,
"reward_std": 0.08795289571086566,
"rewards/judge_tool_use": 0.96875,
"rewards/judge_tool_use/std": 0.09092239538828532,
"rewards/verify_correctness": 0.625,
"rewards/verify_correctness/std": 0.43299739559491474,
"step": 535
},
{
"clip_ratio": 0.00012376237427815794,
"completion_length": 66.25,
"epoch": 0.36,
"grad_norm": 0.30713351970070757,
"kl": 0.346875,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 0.9937500059604645,
"reward_std": 0.01767767034471035,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 0.96875,
"rewards/verify_correctness/std": 0.125,
"step": 540
},
{
"clip_ratio": 0.0,
"completion_length": 71.10416666666667,
"epoch": 0.36333333333333334,
"grad_norm": 0.09672611146866233,
"kl": 0.390234375,
"learning_rate": 1e-06,
"loss": 0.0008,
"reward": 0.8599999944368998,
"reward_std": 0.0,
"rewards/judge_tool_use": 0.8666666746139526,
"rewards/judge_tool_use/std": 0.13770607113838196,
"rewards/verify_correctness": 0.8333333333333334,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 545
},
{
"clip_ratio": 0.0,
"completion_length": 75.0625,
"epoch": 0.36666666666666664,
"grad_norm": 0.06902490237203746,
"kl": 0.25,
"learning_rate": 1e-06,
"loss": 0.0005,
"reward": 0.925000011920929,
"reward_std": 0.009258206002414227,
"rewards/judge_tool_use": 0.96875,
"rewards/judge_tool_use/std": 0.0428981501609087,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 550
},
{
"clip_ratio": 0.0013333333656191826,
"completion_length": 81.02083333333333,
"epoch": 0.37,
"grad_norm": 0.07471788985105354,
"kl": 1.6287109375,
"learning_rate": 1e-06,
"loss": 0.0378,
"reward": 0.8991666634877523,
"reward_std": 0.1171679353962342,
"rewards/judge_tool_use": 0.9208333492279053,
"rewards/judge_tool_use/std": 0.12504171580076218,
"rewards/verify_correctness": 0.8125,
"rewards/verify_correctness/std": 0.1707825263341268,
"step": 555
},
{
"clip_ratio": 0.0011577558820135892,
"completion_length": 93.5,
"epoch": 0.37333333333333335,
"grad_norm": 0.6806153493761989,
"kl": 0.3818359375,
"learning_rate": 1e-06,
"loss": -0.0049,
"reward": 0.9262500107288361,
"reward_std": 0.07174841035157442,
"rewards/judge_tool_use": 0.9625000059604645,
"rewards/judge_tool_use/std": 0.12010355666279793,
"rewards/verify_correctness": 0.78125,
"rewards/verify_correctness/std": 0.2561737895011902,
"step": 560
},
{
"clip_ratio": 0.00013755158288404347,
"completion_length": 102.10416666666667,
"epoch": 0.37666666666666665,
"grad_norm": 0.3453755330616684,
"kl": 0.19248046875,
"learning_rate": 1e-06,
"loss": 0.0001,
"reward": 0.9516666531562805,
"reward_std": 0.03459723728398482,
"rewards/judge_tool_use": 0.981250007947286,
"rewards/judge_tool_use/std": 0.03086424618959427,
"rewards/verify_correctness": 0.8333333333333334,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 565
},
{
"clip_ratio": 0.0005459312233142555,
"completion_length": 64.78125,
"epoch": 0.38,
"grad_norm": 0.106962339918945,
"kl": 0.3599609375,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 0.9824999868869781,
"reward_std": 0.007071069907397032,
"rewards/judge_tool_use": 0.9781249761581421,
"rewards/judge_tool_use/std": 0.025617383420467377,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 570
},
{
"clip_ratio": 0.003975985199213028,
"completion_length": 79.72916666666667,
"epoch": 0.38333333333333336,
"grad_norm": 31.87547237303355,
"kl": 1.836328125,
"learning_rate": 1e-06,
"loss": 0.0061,
"reward": 0.9599999984105428,
"reward_std": 0.08956686376283567,
"rewards/judge_tool_use": 0.9708333214124044,
"rewards/judge_tool_use/std": 0.08607227355241776,
"rewards/verify_correctness": 0.9166666666666666,
"rewards/verify_correctness/std": 0.22771002848943075,
"step": 575
},
{
"clip_ratio": 0.0022586936596781016,
"completion_length": 89.4375,
"epoch": 0.38666666666666666,
"grad_norm": 0.18459174965007472,
"kl": 2.7267578125,
"learning_rate": 1e-06,
"loss": 0.0178,
"reward": 0.9512500166893005,
"reward_std": 0.08859837148338556,
"rewards/judge_tool_use": 0.9468750059604645,
"rewards/judge_tool_use/std": 0.1337440349161625,
"rewards/verify_correctness": 0.96875,
"rewards/verify_correctness/std": 0.125,
"step": 580
},
{
"clip_ratio": 0.0009126466698944569,
"completion_length": 66.64583333333333,
"epoch": 0.39,
"grad_norm": 0.47653096528197164,
"kl": 0.923828125,
"learning_rate": 1e-06,
"loss": 0.009,
"reward": 0.9541666507720947,
"reward_std": 0.07951843117674191,
"rewards/judge_tool_use": 0.9583333532015482,
"rewards/judge_tool_use/std": 0.1015499656399091,
"rewards/verify_correctness": 0.9375,
"rewards/verify_correctness/std": 0.1971883475780487,
"step": 585
},
{
"clip_ratio": 0.0015760843292810024,
"completion_length": 92.34375,
"epoch": 0.3933333333333333,
"grad_norm": 0.16495067647216727,
"kl": 0.2921875,
"learning_rate": 1e-06,
"loss": 0.0008,
"reward": 0.9399999976158142,
"reward_std": 0.010690455324947834,
"rewards/judge_tool_use": 0.987500011920929,
"rewards/judge_tool_use/std": 0.02236068621277809,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 590
},
{
"clip_ratio": 0.0009520398220047354,
"completion_length": 90.22916666666667,
"epoch": 0.39666666666666667,
"grad_norm": 150.32526217473026,
"kl": 10.976953125,
"learning_rate": 1e-06,
"loss": 0.0411,
"reward": 0.9608333309491476,
"reward_std": 0.06940593632558982,
"rewards/judge_tool_use": 0.9666666587193807,
"rewards/judge_tool_use/std": 0.08551172663768132,
"rewards/verify_correctness": 0.9375,
"rewards/verify_correctness/std": 0.1971883475780487,
"step": 595
},
{
"clip_ratio": 0.0018051420571282505,
"completion_length": 65.28125,
"epoch": 0.4,
"grad_norm": 0.5706420371896888,
"kl": 2.0546875,
"learning_rate": 1e-06,
"loss": 0.0248,
"reward": 0.8949999809265137,
"reward_std": 0.014142142608761787,
"rewards/judge_tool_use": 0.9937499761581421,
"rewards/judge_tool_use/std": 0.025000005960464478,
"rewards/verify_correctness": 0.5,
"rewards/verify_correctness/std": 0.5163977742195129,
"step": 600
},
{
"clip_ratio": 0.0,
"completion_length": 64.75,
"epoch": 0.4033333333333333,
"grad_norm": 0.11139257485411902,
"kl": 0.1998046875,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 1.0,
"reward_std": 0.0,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 605
},
{
"clip_ratio": 0.0,
"completion_length": 75.9375,
"epoch": 0.4066666666666667,
"grad_norm": 0.05923647506926399,
"kl": 0.2099609375,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 1.0,
"reward_std": 0.0,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 610
},
{
"clip_ratio": 0.001606425642967224,
"completion_length": 69.27083333333333,
"epoch": 0.41,
"grad_norm": 0.4111915365282135,
"kl": 2.0720703125,
"learning_rate": 1e-06,
"loss": 0.0038,
"reward": 0.9674999912579855,
"reward_std": 0.04904646178086599,
"rewards/judge_tool_use": 0.9645833373069763,
"rewards/judge_tool_use/std": 0.08384520187973976,
"rewards/verify_correctness": 0.9791666666666666,
"rewards/verify_correctness/std": 0.08333333333333333,
"step": 615
},
{
"clip_ratio": 0.0,
"completion_length": 69.5,
"epoch": 0.41333333333333333,
"grad_norm": 0.022696732599213294,
"kl": 0.2666015625,
"learning_rate": 1e-06,
"loss": 0.0005,
"reward": 0.9799999892711639,
"reward_std": 0.0,
"rewards/judge_tool_use": 0.9749999940395355,
"rewards/judge_tool_use/std": 0.025819895789027214,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 620
},
{
"clip_ratio": 0.000514579750597477,
"completion_length": 92.0625,
"epoch": 0.4166666666666667,
"grad_norm": 11.520531811733376,
"kl": 1.763671875,
"learning_rate": 1e-06,
"loss": 0.0042,
"reward": 0.9300000071525574,
"reward_std": 0.04443951385716597,
"rewards/judge_tool_use": 0.975000003973643,
"rewards/judge_tool_use/std": 0.043299747010072075,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.280521680911382,
"step": 625
},
{
"clip_ratio": 0.001496715540997684,
"completion_length": 137.84375,
"epoch": 0.42,
"grad_norm": 0.5816584419225785,
"kl": 1.04296875,
"learning_rate": 1e-06,
"loss": 0.013,
"reward": 0.8637500107288361,
"reward_std": 0.10849415510892868,
"rewards/judge_tool_use": 0.9312499761581421,
"rewards/judge_tool_use/std": 0.15823806822299957,
"rewards/verify_correctness": 0.59375,
"rewards/verify_correctness/std": 0.497555673122406,
"step": 630
},
{
"clip_ratio": 0.0002162941498681903,
"completion_length": 104.9375,
"epoch": 0.42333333333333334,
"grad_norm": 1.037480255167932,
"kl": 0.4396484375,
"learning_rate": 1e-06,
"loss": 0.0271,
"reward": 0.9183333118756613,
"reward_std": 0.06705306967099507,
"rewards/judge_tool_use": 0.949999988079071,
"rewards/judge_tool_use/std": 0.09021268288294475,
"rewards/verify_correctness": 0.7916666666666666,
"rewards/verify_correctness/std": 0.305153489112854,
"step": 635
},
{
"clip_ratio": 0.004081117268651724,
"completion_length": 61.25,
"epoch": 0.4266666666666667,
"grad_norm": 0.07253261510235819,
"kl": 7.5224609375,
"learning_rate": 1e-06,
"loss": 0.0144,
"reward": 0.9799999892711639,
"reward_std": 0.014142139814794064,
"rewards/judge_tool_use": 0.9749999642372131,
"rewards/judge_tool_use/std": 0.038117386400699615,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 640
},
{
"clip_ratio": 0.0,
"completion_length": 75.10416666666667,
"epoch": 0.43,
"grad_norm": 1.233226781331498,
"kl": 18.0759765625,
"learning_rate": 1e-06,
"loss": 0.0359,
"reward": 0.9791666666666666,
"reward_std": 0.021593637764453888,
"rewards/judge_tool_use": 0.9895833333333334,
"rewards/judge_tool_use/std": 0.015957123289505642,
"rewards/verify_correctness": 0.9375,
"rewards/verify_correctness/std": 0.1343709627787272,
"step": 645
},
{
"clip_ratio": 0.0022598175797611474,
"completion_length": 72.46875,
"epoch": 0.43333333333333335,
"grad_norm": 0.35274976474922665,
"kl": 0.51875,
"learning_rate": 1e-06,
"loss": 0.0009,
"reward": 0.9975000023841858,
"reward_std": 0.007071069907397032,
"rewards/judge_tool_use": 0.996874988079071,
"rewards/judge_tool_use/std": 0.012500002980232239,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 650
},
{
"clip_ratio": 0.00019841270986944436,
"completion_length": 88.0625,
"epoch": 0.43666666666666665,
"grad_norm": 2.1379059746819324,
"kl": 0.41953125,
"learning_rate": 1e-06,
"loss": 0.0001,
"reward": 0.9150000015894572,
"reward_std": 0.07097954737643401,
"rewards/judge_tool_use": 0.9458333253860474,
"rewards/judge_tool_use/std": 0.11968278015653293,
"rewards/verify_correctness": 0.7916666666666666,
"rewards/verify_correctness/std": 0.28598760565121967,
"step": 655
},
{
"clip_ratio": 0.0004010346601717174,
"completion_length": 83.1875,
"epoch": 0.44,
"grad_norm": 0.0809447830533763,
"kl": 0.698828125,
"learning_rate": 1e-06,
"loss": 0.0012,
"reward": 0.9275000095367432,
"reward_std": 0.05599744990468025,
"rewards/judge_tool_use": 0.971875011920929,
"rewards/judge_tool_use/std": 0.09994790703058243,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 660
},
{
"clip_ratio": 0.0002642008010298014,
"completion_length": 64.08333333333333,
"epoch": 0.44333333333333336,
"grad_norm": 0.12353014311423977,
"kl": 1.115234375,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 0.9483333230018616,
"reward_std": 0.011841016821563244,
"rewards/judge_tool_use": 0.9770833253860474,
"rewards/judge_tool_use/std": 0.03198537975549698,
"rewards/verify_correctness": 0.8333333333333334,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 665
},
{
"clip_ratio": 0.000147058826405555,
"completion_length": 60.0625,
"epoch": 0.44666666666666666,
"grad_norm": 0.033734607078496216,
"kl": 0.28359375,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 1.0,
"reward_std": 0.0,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 670
},
{
"clip_ratio": 0.0007290652487426996,
"completion_length": 75.3125,
"epoch": 0.45,
"grad_norm": 0.28937763507235525,
"kl": 0.41787109375,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 0.9925000071525574,
"reward_std": 0.01676552618543307,
"rewards/judge_tool_use": 0.9958333373069763,
"rewards/judge_tool_use/std": 0.011385502914587656,
"rewards/verify_correctness": 0.9791666666666666,
"rewards/verify_correctness/std": 0.08333333333333333,
"step": 675
},
{
"clip_ratio": 0.0007440476212650538,
"completion_length": 101.75,
"epoch": 0.4533333333333333,
"grad_norm": 0.21986652830727627,
"kl": 0.30478515625,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 0.9925000071525574,
"reward_std": 0.010350990109145641,
"rewards/judge_tool_use": 0.9906249940395355,
"rewards/judge_tool_use/std": 0.0201556496322155,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 680
},
{
"clip_ratio": 0.0001519756857305765,
"completion_length": 72.25,
"epoch": 0.45666666666666667,
"grad_norm": 0.7866833774635168,
"kl": 0.2416015625,
"learning_rate": 1e-06,
"loss": 0.0008,
"reward": 0.9599999984105428,
"reward_std": 0.02014437907685836,
"rewards/judge_tool_use": 0.981249988079071,
"rewards/judge_tool_use/std": 0.025546599179506302,
"rewards/verify_correctness": 0.875,
"rewards/verify_correctness/std": 0.16666666666666666,
"step": 685
},
{
"clip_ratio": 0.003282261826097965,
"completion_length": 80.8125,
"epoch": 0.46,
"grad_norm": 0.12656362296920962,
"kl": 0.209375,
"learning_rate": 1e-06,
"loss": -0.0001,
"reward": 0.9350000023841858,
"reward_std": 0.017422057688236237,
"rewards/judge_tool_use": 0.981249988079071,
"rewards/judge_tool_use/std": 0.025000007823109627,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 690
},
{
"clip_ratio": 0.0008928571827709675,
"completion_length": 81.95833333333333,
"epoch": 0.4633333333333333,
"grad_norm": 0.07893765556745304,
"kl": 0.784375,
"learning_rate": 1e-06,
"loss": 0.0017,
"reward": 0.9733333190282186,
"reward_std": 0.01708986610174179,
"rewards/judge_tool_use": 0.9979166587193807,
"rewards/judge_tool_use/std": 0.008333335320154825,
"rewards/verify_correctness": 0.875,
"rewards/verify_correctness/std": 0.16666666666666666,
"step": 695
},
{
"clip_ratio": 0.0006349206436425447,
"completion_length": 68.3125,
"epoch": 0.4666666666666667,
"grad_norm": 1.2625190305994096,
"kl": 1.878515625,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 0.9900000095367432,
"reward_std": 0.010690455324947834,
"rewards/judge_tool_use": 0.987500011920929,
"rewards/judge_tool_use/std": 0.02236068621277809,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 700
},
{
"clip_ratio": 0.0009756097570061684,
"completion_length": 67.77083333333333,
"epoch": 0.47,
"grad_norm": 0.10890212362336826,
"kl": 0.26640625,
"learning_rate": 1e-06,
"loss": 0.0003,
"reward": 0.978333314259847,
"reward_std": 0.015430334955453873,
"rewards/judge_tool_use": 0.9833333293596903,
"rewards/judge_tool_use/std": 0.017213263859351475,
"rewards/verify_correctness": 0.9583333333333334,
"rewards/verify_correctness/std": 0.11385501424471538,
"step": 705
},
{
"clip_ratio": 0.0007211538497358561,
"completion_length": 92.8125,
"epoch": 0.47333333333333333,
"grad_norm": 0.027587471479060908,
"kl": 0.21982421875,
"learning_rate": 1e-06,
"loss": 0.0005,
"reward": 0.9712499976158142,
"reward_std": 0.018077218905091286,
"rewards/judge_tool_use": 0.9718749821186066,
"rewards/judge_tool_use/std": 0.03831989876925945,
"rewards/verify_correctness": 0.96875,
"rewards/verify_correctness/std": 0.125,
"step": 710
},
{
"clip_ratio": 0.0009474414400756359,
"completion_length": 108.08333333333333,
"epoch": 0.4766666666666667,
"grad_norm": 0.19022699842586205,
"kl": 0.237890625,
"learning_rate": 1e-06,
"loss": -0.0022,
"reward": 0.9325000047683716,
"reward_std": 0.03303187837203344,
"rewards/judge_tool_use": 0.9833333293596903,
"rewards/judge_tool_use/std": 0.02805217479666074,
"rewards/verify_correctness": 0.7291666666666666,
"rewards/verify_correctness/std": 0.33744919300079346,
"step": 715
},
{
"clip_ratio": 0.0005668934434652328,
"completion_length": 80.4375,
"epoch": 0.48,
"grad_norm": 0.0847895586797689,
"kl": 0.2830078125,
"learning_rate": 1e-06,
"loss": 0.0009,
"reward": 0.9925000071525574,
"reward_std": 0.010350990109145641,
"rewards/judge_tool_use": 0.9906249940395355,
"rewards/judge_tool_use/std": 0.0201556496322155,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 720
},
{
"clip_ratio": 0.0010113780386745929,
"completion_length": 98.5625,
"epoch": 0.48333333333333334,
"grad_norm": 393.7775350754493,
"kl": 13.9048828125,
"learning_rate": 1e-06,
"loss": 0.0279,
"reward": 0.887499988079071,
"reward_std": 0.05597590561956167,
"rewards/judge_tool_use": 0.9375,
"rewards/judge_tool_use/std": 0.1213516891002655,
"rewards/verify_correctness": 0.6875,
"rewards/verify_correctness/std": 0.3429151177406311,
"step": 725
},
{
"clip_ratio": 0.00041820945334620775,
"completion_length": 102.9375,
"epoch": 0.4866666666666667,
"grad_norm": 0.09728385322995636,
"kl": 9.962890625,
"learning_rate": 1e-06,
"loss": 0.0203,
"reward": 0.9887500107288361,
"reward_std": 0.0318198068998754,
"rewards/judge_tool_use": 0.9937499761581421,
"rewards/judge_tool_use/std": 0.025000005960464478,
"rewards/verify_correctness": 0.96875,
"rewards/verify_correctness/std": 0.125,
"step": 730
},
{
"clip_ratio": 0.0006047413335181773,
"completion_length": 85.3125,
"epoch": 0.49,
"grad_norm": 0.16058354971882016,
"kl": 8755.388671875,
"learning_rate": 1e-06,
"loss": 17.4619,
"reward": 0.9358333349227905,
"reward_std": 0.05956774204969406,
"rewards/judge_tool_use": 0.9770833452542623,
"rewards/judge_tool_use/std": 0.07801744093497594,
"rewards/verify_correctness": 0.7708333333333334,
"rewards/verify_correctness/std": 0.3065035541852315,
"step": 735
},
{
"clip_ratio": 0.0,
"completion_length": 68.4375,
"epoch": 0.49333333333333335,
"grad_norm": 0.16518417296961413,
"kl": 0.1482421875,
"learning_rate": 1e-06,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 740
},
{
"clip_ratio": 0.00011098779505118728,
"completion_length": 96.8125,
"epoch": 0.49666666666666665,
"grad_norm": 0.06250702594297985,
"kl": 0.348828125,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 0.9074999690055847,
"reward_std": 0.01324320025742054,
"rewards/judge_tool_use": 0.9729166428248087,
"rewards/judge_tool_use/std": 0.033744927495718,
"rewards/verify_correctness": 0.6458333333333334,
"rewards/verify_correctness/std": 0.3429151177406311,
"step": 745
},
{
"clip_ratio": 0.0010483485879376532,
"completion_length": 83.09375,
"epoch": 0.5,
"grad_norm": 7.418537851126552,
"kl": 20.103515625,
"learning_rate": 1e-06,
"loss": 0.0342,
"reward": 0.8912499845027924,
"reward_std": 0.12655025720596313,
"rewards/judge_tool_use": 0.934374988079071,
"rewards/judge_tool_use/std": 0.1986893266439438,
"rewards/verify_correctness": 0.71875,
"rewards/verify_correctness/std": 0.38319888710975647,
"step": 750
},
{
"clip_ratio": 0.00021881838329136373,
"completion_length": 96.47916666666667,
"epoch": 0.5033333333333333,
"grad_norm": 0.329185897503142,
"kl": 0.50859375,
"learning_rate": 1e-06,
"loss": 0.0142,
"reward": 0.9466666579246521,
"reward_std": 0.068666722625494,
"rewards/judge_tool_use": 0.9645833174387614,
"rewards/judge_tool_use/std": 0.08136301239331563,
"rewards/verify_correctness": 0.875,
"rewards/verify_correctness/std": 0.24290452400843301,
"step": 755
},
{
"clip_ratio": 0.0016204309416934849,
"completion_length": 79.5,
"epoch": 0.5066666666666667,
"grad_norm": 1.5018746267117502,
"kl": 0.2720703125,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 0.9550000131130219,
"reward_std": 0.0481070326641202,
"rewards/judge_tool_use": 0.9750000238418579,
"rewards/judge_tool_use/std": 0.04472137242555618,
"rewards/verify_correctness": 0.875,
"rewards/verify_correctness/std": 0.22360680997371674,
"step": 760
},
{
"clip_ratio": 6.493506371043622e-05,
"completion_length": 92.39583333333333,
"epoch": 0.51,
"grad_norm": 0.09973169012901163,
"kl": 0.24765625,
"learning_rate": 1e-06,
"loss": 0.0002,
"reward": 0.9900000095367432,
"reward_std": 0.01655506311605374,
"rewards/judge_tool_use": 0.987499992052714,
"rewards/judge_tool_use/std": 0.024290458609660465,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 765
},
{
"clip_ratio": 0.0022681955248117446,
"completion_length": 70.5,
"epoch": 0.5133333333333333,
"grad_norm": 0.05997101159912667,
"kl": 0.303125,
"learning_rate": 1e-06,
"loss": 0.0005,
"reward": 1.0,
"reward_std": 0.0,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 770
},
{
"clip_ratio": 0.0,
"completion_length": 83.33333333333333,
"epoch": 0.5166666666666667,
"grad_norm": 0.6264206985392197,
"kl": 0.315234375,
"learning_rate": 1e-06,
"loss": -0.0003,
"reward": 0.9091666539510092,
"reward_std": 0.05015302697817484,
"rewards/judge_tool_use": 0.9645833174387614,
"rewards/judge_tool_use/std": 0.043106683840354286,
"rewards/verify_correctness": 0.6875,
"rewards/verify_correctness/std": 0.33744919300079346,
"step": 775
},
{
"clip_ratio": 0.0017392298206686974,
"completion_length": 93.53125,
"epoch": 0.52,
"grad_norm": 0.524085431110826,
"kl": 0.4203125,
"learning_rate": 1e-06,
"loss": -0.0004,
"reward": 0.9474999904632568,
"reward_std": 0.03535534022375941,
"rewards/judge_tool_use": 0.965624988079071,
"rewards/judge_tool_use/std": 0.037500010803341866,
"rewards/verify_correctness": 0.875,
"rewards/verify_correctness/std": 0.22360680997371674,
"step": 780
},
{
"clip_ratio": 0.0006958305835723877,
"completion_length": 102.04166666666667,
"epoch": 0.5233333333333333,
"grad_norm": 0.10557989588862941,
"kl": 0.227734375,
"learning_rate": 1e-06,
"loss": -0.0011,
"reward": 0.9399999777475992,
"reward_std": 0.04806827505429586,
"rewards/judge_tool_use": 0.9666666587193807,
"rewards/judge_tool_use/std": 0.07302967707316081,
"rewards/verify_correctness": 0.8333333333333334,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 785
},
{
"clip_ratio": 0.0,
"completion_length": 69.15625,
"epoch": 0.5266666666666666,
"grad_norm": 0.02144329361599325,
"kl": 0.2107421875,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 0.8799999952316284,
"reward_std": 0.0,
"rewards/judge_tool_use": 0.9749999940395355,
"rewards/judge_tool_use/std": 0.025819895789027214,
"rewards/verify_correctness": 0.5,
"rewards/verify_correctness/std": 0.5163977742195129,
"step": 790
},
{
"clip_ratio": 0.001966528873890638,
"completion_length": 102.5,
"epoch": 0.53,
"grad_norm": 5.014929981150802,
"kl": 0.689453125,
"learning_rate": 1e-06,
"loss": 0.0161,
"reward": 0.8583333293596903,
"reward_std": 0.13183549232780933,
"rewards/judge_tool_use": 0.9270833532015482,
"rewards/judge_tool_use/std": 0.16641392558813095,
"rewards/verify_correctness": 0.5833333333333334,
"rewards/verify_correctness/std": 0.3303537170092265,
"step": 795
},
{
"clip_ratio": 0.0018077531363815068,
"completion_length": 65.125,
"epoch": 0.5333333333333333,
"grad_norm": 2.4449889650147725,
"kl": 0.616796875,
"learning_rate": 1e-06,
"loss": 0.0159,
"reward": 0.9699999988079071,
"reward_std": 0.032513730227947235,
"rewards/judge_tool_use": 0.9781249761581421,
"rewards/judge_tool_use/std": 0.03145764768123627,
"rewards/verify_correctness": 0.9375,
"rewards/verify_correctness/std": 0.17078252136707306,
"step": 800
},
{
"clip_ratio": 0.0011476841289550066,
"completion_length": 120.14583333333333,
"epoch": 0.5366666666666666,
"grad_norm": 0.5110356082388042,
"kl": 65.94375,
"learning_rate": 1e-06,
"loss": 0.1173,
"reward": 0.9258333444595337,
"reward_std": 0.09278637667496999,
"rewards/judge_tool_use": 0.975000003973643,
"rewards/judge_tool_use/std": 0.09471883624792099,
"rewards/verify_correctness": 0.7291666666666666,
"rewards/verify_correctness/std": 0.4166666666666667,
"step": 805
},
{
"clip_ratio": 0.0009965144796296953,
"completion_length": 104.90625,
"epoch": 0.54,
"grad_norm": 0.04881719966559261,
"kl": 0.297265625,
"learning_rate": 1e-06,
"loss": 0.0012,
"reward": 0.9224999845027924,
"reward_std": 0.031052968464791775,
"rewards/judge_tool_use": 0.965624988079071,
"rewards/judge_tool_use/std": 0.045975545421242714,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 810
},
{
"clip_ratio": 0.0006250000093132258,
"completion_length": 88.6875,
"epoch": 0.5433333333333333,
"grad_norm": 0.15076410059985984,
"kl": 0.3453125,
"learning_rate": 1e-06,
"loss": -0.0169,
"reward": 0.8958333333333334,
"reward_std": 0.06509770142535369,
"rewards/judge_tool_use": 0.9583333333333334,
"rewards/judge_tool_use/std": 0.10000000521540642,
"rewards/verify_correctness": 0.6458333333333334,
"rewards/verify_correctness/std": 0.427598516146342,
"step": 815
},
{
"clip_ratio": 0.0023800658993422983,
"completion_length": 93.53125,
"epoch": 0.5466666666666666,
"grad_norm": 0.13059834615215846,
"kl": 221.2806640625,
"learning_rate": 1e-06,
"loss": 0.4431,
"reward": 0.9950000047683716,
"reward_std": 0.014142139814794064,
"rewards/judge_tool_use": 0.9937500059604645,
"rewards/judge_tool_use/std": 0.017078254371881485,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 820
},
{
"clip_ratio": 0.0,
"completion_length": 64.16666666666667,
"epoch": 0.55,
"grad_norm": 0.21646392402860518,
"kl": 179.3978515625,
"learning_rate": 1e-06,
"loss": 0.358,
"reward": 0.9908333222071329,
"reward_std": 0.011785114804903666,
"rewards/judge_tool_use": 0.993749996026357,
"rewards/judge_tool_use/std": 0.013437099754810333,
"rewards/verify_correctness": 0.9791666666666666,
"rewards/verify_correctness/std": 0.08333333333333333,
"step": 825
},
{
"clip_ratio": 0.0006482481141574681,
"completion_length": 70.6875,
"epoch": 0.5533333333333333,
"grad_norm": 1.2886721610742002,
"kl": 0.33203125,
"learning_rate": 1e-06,
"loss": 0.01,
"reward": 0.8974999785423279,
"reward_std": 0.060415223240852356,
"rewards/judge_tool_use": 0.934374988079071,
"rewards/judge_tool_use/std": 0.12344871461391449,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 830
},
{
"clip_ratio": 0.0,
"completion_length": 66.39583333333333,
"epoch": 0.5566666666666666,
"grad_norm": 0.2650259970239174,
"kl": 0.312109375,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 1.0,
"reward_std": 0.0,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 835
},
{
"clip_ratio": 0.0015576324425637723,
"completion_length": 62.5625,
"epoch": 0.56,
"grad_norm": 534.346335825647,
"kl": 87.0494140625,
"learning_rate": 1e-06,
"loss": 0.2192,
"reward": 0.9300000071525574,
"reward_std": 0.05656854063272476,
"rewards/judge_tool_use": 0.9749999940395355,
"rewards/judge_tool_use/std": 0.10000000149011612,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 840
},
{
"clip_ratio": 6.523157353512942e-05,
"completion_length": 89.6875,
"epoch": 0.5633333333333334,
"grad_norm": 3.527265824573338,
"kl": 0.693359375,
"learning_rate": 1e-06,
"loss": 0.0109,
"reward": 0.9291666746139526,
"reward_std": 0.06318580235044162,
"rewards/judge_tool_use": 0.96875,
"rewards/judge_tool_use/std": 0.09166666989525159,
"rewards/verify_correctness": 0.7708333333333334,
"rewards/verify_correctness/std": 0.3065035541852315,
"step": 845
},
{
"clip_ratio": 0.0012977312318980694,
"completion_length": 70.15625,
"epoch": 0.5666666666666667,
"grad_norm": 0.1420108574253742,
"kl": 0.4056640625,
"learning_rate": 1e-06,
"loss": 0.0112,
"reward": 0.9874999821186066,
"reward_std": 0.010350990109145641,
"rewards/judge_tool_use": 0.984375,
"rewards/judge_tool_use/std": 0.023935683071613312,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 850
},
{
"clip_ratio": 0.001287515088915825,
"completion_length": 65.5,
"epoch": 0.57,
"grad_norm": 0.4405911139980202,
"kl": 0.929296875,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 0.9458333055178324,
"reward_std": 0.03568211570382118,
"rewards/judge_tool_use": 0.9791666666666666,
"rewards/judge_tool_use/std": 0.028598766773939133,
"rewards/verify_correctness": 0.8125,
"rewards/verify_correctness/std": 0.30103762944539386,
"step": 855
},
{
"clip_ratio": 0.0007213706150650979,
"completion_length": 99.4375,
"epoch": 0.5733333333333334,
"grad_norm": 0.6950772748452339,
"kl": 0.4955078125,
"learning_rate": 1e-06,
"loss": -0.0017,
"reward": 0.8187499940395355,
"reward_std": 0.11091133579611778,
"rewards/judge_tool_use": 0.84375,
"rewards/judge_tool_use/std": 0.21456576697528362,
"rewards/verify_correctness": 0.71875,
"rewards/verify_correctness/std": 0.38319888710975647,
"step": 860
},
{
"clip_ratio": 0.00097902100533247,
"completion_length": 70.625,
"epoch": 0.5766666666666667,
"grad_norm": 0.08707789007037814,
"kl": 0.311328125,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 0.9916666746139526,
"reward_std": 0.01307279740770658,
"rewards/judge_tool_use": 0.9895833333333334,
"rewards/judge_tool_use/std": 0.02482260266939799,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 865
},
{
"clip_ratio": 0.0026898022275418042,
"completion_length": 145.34375,
"epoch": 0.58,
"grad_norm": 0.12277957926584093,
"kl": 16.7296875,
"learning_rate": 1e-06,
"loss": 0.0335,
"reward": 0.6724999994039536,
"reward_std": 0.024493126198649406,
"rewards/judge_tool_use": 0.7781250178813934,
"rewards/judge_tool_use/std": 0.24047533050179482,
"rewards/verify_correctness": 0.25,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 870
},
{
"clip_ratio": 0.00219599399715662,
"completion_length": 76.33333333333333,
"epoch": 0.5833333333333334,
"grad_norm": 2.725601337016608,
"kl": 0.30625,
"learning_rate": 1e-06,
"loss": -0.0028,
"reward": 0.8441666563351949,
"reward_std": 0.09498834641029437,
"rewards/judge_tool_use": 0.9041666587193807,
"rewards/judge_tool_use/std": 0.15194741388161978,
"rewards/verify_correctness": 0.6041666666666666,
"rewards/verify_correctness/std": 0.4983704487482707,
"step": 875
},
{
"clip_ratio": 0.0008220872841775417,
"completion_length": 100.59375,
"epoch": 0.5866666666666667,
"grad_norm": 0.3521652411691662,
"kl": 0.470703125,
"learning_rate": 1e-06,
"loss": 0.0135,
"reward": 0.8425000011920929,
"reward_std": 0.12523019313812256,
"rewards/judge_tool_use": 0.8812500238418579,
"rewards/judge_tool_use/std": 0.19585155323147774,
"rewards/verify_correctness": 0.6875,
"rewards/verify_correctness/std": 0.42898140847682953,
"step": 880
},
{
"clip_ratio": 0.0,
"completion_length": 82.3125,
"epoch": 0.59,
"grad_norm": 1.7981311946360221,
"kl": 0.292578125,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 0.9399999777475992,
"reward_std": 0.0,
"rewards/judge_tool_use": 0.9666666587193807,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 0.8333333333333334,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 885
},
{
"clip_ratio": 0.001058566861320287,
"completion_length": 79.03125,
"epoch": 0.5933333333333334,
"grad_norm": 1.347847810134541,
"kl": 0.2482421875,
"learning_rate": 1e-06,
"loss": -0.0024,
"reward": 0.9225000143051147,
"reward_std": 0.06236250279471278,
"rewards/judge_tool_use": 0.965624988079071,
"rewards/judge_tool_use/std": 0.11229145526885986,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 890
},
{
"clip_ratio": 0.0004772079642862082,
"completion_length": 75.58333333333333,
"epoch": 0.5966666666666667,
"grad_norm": 38.987008642448025,
"kl": 6.492578125,
"learning_rate": 1e-06,
"loss": 0.0146,
"reward": 0.7949999769528707,
"reward_std": 0.10766563316186269,
"rewards/judge_tool_use": 0.9104166825612386,
"rewards/judge_tool_use/std": 0.14955327411492667,
"rewards/verify_correctness": 0.3333333333333333,
"rewards/verify_correctness/std": 0.0,
"step": 895
},
{
"clip_ratio": 0.0026460913009941577,
"completion_length": 61.3125,
"epoch": 0.6,
"grad_norm": 0.13988761849808656,
"kl": 1.6234375,
"learning_rate": 1e-06,
"loss": 0.0046,
"reward": 0.9762499928474426,
"reward_std": 0.024748740252107382,
"rewards/judge_tool_use": 0.9781249761581421,
"rewards/judge_tool_use/std": 0.025617383420467377,
"rewards/verify_correctness": 0.96875,
"rewards/verify_correctness/std": 0.125,
"step": 900
},
{
"clip_ratio": 0.0007113821338862181,
"completion_length": 81.04166666666667,
"epoch": 0.6033333333333334,
"grad_norm": 0.07659308560187726,
"kl": 0.3126953125,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 0.9649999936421713,
"reward_std": 0.0047140466049313545,
"rewards/judge_tool_use": 0.9979166587193807,
"rewards/judge_tool_use/std": 0.008333335320154825,
"rewards/verify_correctness": 0.8333333333333334,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 905
},
{
"clip_ratio": 0.001008645538240671,
"completion_length": 67.6875,
"epoch": 0.6066666666666667,
"grad_norm": 0.26869976791962935,
"kl": 0.2265625,
"learning_rate": 1e-06,
"loss": 0.0005,
"reward": 0.9975000023841858,
"reward_std": 0.007071069907397032,
"rewards/judge_tool_use": 0.996874988079071,
"rewards/judge_tool_use/std": 0.012500002980232239,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 910
},
{
"clip_ratio": 0.0,
"completion_length": 60.125,
"epoch": 0.61,
"grad_norm": 0.07923399412772805,
"kl": 0.25546875,
"learning_rate": 1e-06,
"loss": -0.0016,
"reward": 0.9566666682561239,
"reward_std": 0.018856181452671688,
"rewards/judge_tool_use": 0.9979166587193807,
"rewards/judge_tool_use/std": 0.008333335320154825,
"rewards/verify_correctness": 0.7916666666666666,
"rewards/verify_correctness/std": 0.28598760565121967,
"step": 915
},
{
"clip_ratio": 0.0,
"completion_length": 90.375,
"epoch": 0.6133333333333333,
"grad_norm": 1.5125503684257633,
"kl": 0.2650390625,
"learning_rate": 1e-06,
"loss": -0.0057,
"reward": 0.9162499904632568,
"reward_std": 0.09739170409739017,
"rewards/judge_tool_use": 0.949999988079071,
"rewards/judge_tool_use/std": 0.09660918265581131,
"rewards/verify_correctness": 0.78125,
"rewards/verify_correctness/std": 0.375,
"step": 920
},
{
"clip_ratio": 0.0003249390749260783,
"completion_length": 83.20833333333333,
"epoch": 0.6166666666666667,
"grad_norm": 0.0376142648380269,
"kl": 0.437109375,
"learning_rate": 1e-06,
"loss": 0.0011,
"reward": 0.9416666626930237,
"reward_std": 0.016618976990381878,
"rewards/judge_tool_use": 0.96875,
"rewards/judge_tool_use/std": 0.0429793248573939,
"rewards/verify_correctness": 0.8333333333333334,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 925
},
{
"clip_ratio": 0.0,
"completion_length": 61.34375,
"epoch": 0.62,
"grad_norm": 0.017493888934549447,
"kl": 0.205078125,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 0.949999988079071,
"reward_std": 0.0,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 930
},
{
"clip_ratio": 0.001988636329770088,
"completion_length": 86.3125,
"epoch": 0.6233333333333333,
"grad_norm": 0.3529719981491763,
"kl": 0.167578125,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 0.9683333436648051,
"reward_std": 0.0246026162058115,
"rewards/judge_tool_use": 0.9916666746139526,
"rewards/judge_tool_use/std": 0.022771005829175312,
"rewards/verify_correctness": 0.875,
"rewards/verify_correctness/std": 0.16666666666666666,
"step": 935
},
{
"clip_ratio": 0.0003450655611231923,
"completion_length": 68.0,
"epoch": 0.6266666666666667,
"grad_norm": 0.0932730656399196,
"kl": 0.286328125,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 1.0,
"reward_std": 0.0,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 940
},
{
"clip_ratio": 0.001211305521428585,
"completion_length": 62.854166666666664,
"epoch": 0.63,
"grad_norm": 0.08491446176113226,
"kl": 0.21591796875,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 0.9583333333333334,
"reward_std": 0.006900658831000328,
"rewards/judge_tool_use": 0.9895833333333334,
"rewards/judge_tool_use/std": 0.015957122047742207,
"rewards/verify_correctness": 0.8333333333333334,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 945
},
{
"clip_ratio": 0.0,
"completion_length": 54.84375,
"epoch": 0.6333333333333333,
"grad_norm": 0.08445845606617904,
"kl": 0.37021484375,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 1.0,
"reward_std": 0.0,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 950
},
{
"clip_ratio": 0.0009750896133482456,
"completion_length": 93.54166666666667,
"epoch": 0.6366666666666667,
"grad_norm": 9.5445004824132,
"kl": 0.745703125,
"learning_rate": 1e-06,
"loss": 0.0385,
"reward": 0.9033333460489908,
"reward_std": 0.06474923094113667,
"rewards/judge_tool_use": 0.962499996026357,
"rewards/judge_tool_use/std": 0.10911189516385396,
"rewards/verify_correctness": 0.6666666666666666,
"rewards/verify_correctness/std": 0.3442651828130086,
"step": 955
},
{
"clip_ratio": 0.0023815435823053123,
"completion_length": 57.875,
"epoch": 0.64,
"grad_norm": 0.3188163340929078,
"kl": 0.2923828125,
"learning_rate": 1e-06,
"loss": 0.023,
"reward": 0.9975000023841858,
"reward_std": 0.007071069907397032,
"rewards/judge_tool_use": 0.996874988079071,
"rewards/judge_tool_use/std": 0.012500002980232239,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 960
},
{
"clip_ratio": 0.0010169491171836853,
"completion_length": 79.75,
"epoch": 0.6433333333333333,
"grad_norm": 0.047849220688192536,
"kl": 0.3609375,
"learning_rate": 1e-06,
"loss": 0.0009,
"reward": 0.9450000127156576,
"reward_std": 0.015430334955453873,
"rewards/judge_tool_use": 0.9833333293596903,
"rewards/judge_tool_use/std": 0.017213263859351475,
"rewards/verify_correctness": 0.7916666666666666,
"rewards/verify_correctness/std": 0.28598760565121967,
"step": 965
},
{
"clip_ratio": 0.0007624854100868106,
"completion_length": 99.40625,
"epoch": 0.6466666666666666,
"grad_norm": 1.6765942241870033,
"kl": 0.3326171875,
"learning_rate": 1e-06,
"loss": 0.0178,
"reward": 0.7712499797344208,
"reward_std": 0.2565469294786453,
"rewards/judge_tool_use": 0.815625011920929,
"rewards/judge_tool_use/std": 0.3259209841489792,
"rewards/verify_correctness": 0.59375,
"rewards/verify_correctness/std": 0.497555673122406,
"step": 970
},
{
"clip_ratio": 0.0,
"completion_length": 73.47916666666667,
"epoch": 0.65,
"grad_norm": 0.06349338696726389,
"kl": 0.341796875,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 0.950000007947286,
"reward_std": 0.006172137334942818,
"rewards/judge_tool_use": 0.9791666666666666,
"rewards/judge_tool_use/std": 0.028598766773939133,
"rewards/verify_correctness": 0.8333333333333334,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 975
},
{
"clip_ratio": 0.0012797552859410643,
"completion_length": 63.3125,
"epoch": 0.6533333333333333,
"grad_norm": 0.28318230460197835,
"kl": 0.2015625,
"learning_rate": 1e-06,
"loss": -0.0,
"reward": 0.9862500131130219,
"reward_std": 0.0340069429948926,
"rewards/judge_tool_use": 0.9906249940395355,
"rewards/judge_tool_use/std": 0.029578257352113724,
"rewards/verify_correctness": 0.96875,
"rewards/verify_correctness/std": 0.125,
"step": 980
},
{
"clip_ratio": 0.00012330455938354136,
"completion_length": 65.1875,
"epoch": 0.6566666666666666,
"grad_norm": 0.040520629760965425,
"kl": 0.12958984375,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 0.9691666563351949,
"reward_std": 0.022236108779907227,
"rewards/judge_tool_use": 0.987499992052714,
"rewards/judge_tool_use/std": 0.016666671882073086,
"rewards/verify_correctness": 0.8958333333333334,
"rewards/verify_correctness/std": 0.1595711906750997,
"step": 985
},
{
"clip_ratio": 0.0002617801073938608,
"completion_length": 57.9375,
"epoch": 0.66,
"grad_norm": 0.08013232055388929,
"kl": 0.2423828125,
"learning_rate": 1e-06,
"loss": 0.0005,
"reward": 0.9375,
"reward_std": 0.010350988246500492,
"rewards/judge_tool_use": 0.984375,
"rewards/judge_tool_use/std": 0.023935683071613312,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 990
},
{
"clip_ratio": 0.0005273823626339436,
"completion_length": 125.39583333333333,
"epoch": 0.6633333333333333,
"grad_norm": 0.21519543469410263,
"kl": 0.4150390625,
"learning_rate": 1e-06,
"loss": -0.0002,
"reward": 0.9150000214576721,
"reward_std": 0.06592664029449224,
"rewards/judge_tool_use": 0.9770833253860474,
"rewards/judge_tool_use/std": 0.050411589443683624,
"rewards/verify_correctness": 0.6666666666666666,
"rewards/verify_correctness/std": 0.45265427231788635,
"step": 995
},
{
"clip_ratio": 0.0036985486280173064,
"completion_length": 77.15625,
"epoch": 0.6666666666666666,
"grad_norm": 0.13812841795509134,
"kl": 0.2712890625,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 0.9699999690055847,
"reward_std": 0.017422060016542673,
"rewards/judge_tool_use": 0.9624999761581421,
"rewards/judge_tool_use/std": 0.04955306649208069,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1000
},
{
"clip_ratio": 0.001371738500893116,
"completion_length": 80.89583333333333,
"epoch": 0.67,
"grad_norm": 0.026859045422407482,
"kl": 0.25078125,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 0.9274999896685282,
"reward_std": 0.048885335214436054,
"rewards/judge_tool_use": 0.9666666587193807,
"rewards/judge_tool_use/std": 0.03929029653469721,
"rewards/verify_correctness": 0.7708333333333334,
"rewards/verify_correctness/std": 0.1595711906750997,
"step": 1005
},
{
"clip_ratio": 0.0007334963418543338,
"completion_length": 73.1875,
"epoch": 0.6733333333333333,
"grad_norm": 0.029137916766353474,
"kl": 0.215625,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 0.9925000071525574,
"reward_std": 0.010350990109145641,
"rewards/judge_tool_use": 0.9906249940395355,
"rewards/judge_tool_use/std": 0.0201556496322155,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1010
},
{
"clip_ratio": 0.001794871874153614,
"completion_length": 55.5,
"epoch": 0.6766666666666666,
"grad_norm": 0.05424116695206805,
"kl": 0.14228515625,
"learning_rate": 1e-06,
"loss": 0.0003,
"reward": 0.9900000095367432,
"reward_std": 0.013801320145527521,
"rewards/judge_tool_use": 0.987499992052714,
"rewards/judge_tool_use/std": 0.026874199509620667,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1015
},
{
"clip_ratio": 0.0038688791915774345,
"completion_length": 83.125,
"epoch": 0.68,
"grad_norm": 0.6002387040564632,
"kl": 0.71337890625,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 0.9775000214576721,
"reward_std": 0.03518358897417784,
"rewards/judge_tool_use": 0.9718749821186066,
"rewards/judge_tool_use/std": 0.04269563965499401,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1020
},
{
"clip_ratio": 0.00015748031437397004,
"completion_length": 73.08333333333333,
"epoch": 0.6833333333333333,
"grad_norm": 0.17152579955328484,
"kl": 0.23369140625,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 0.9566666682561239,
"reward_std": 0.011614705435931683,
"rewards/judge_tool_use": 0.987499992052714,
"rewards/judge_tool_use/std": 0.024290457367897034,
"rewards/verify_correctness": 0.8333333333333334,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 1025
},
{
"clip_ratio": 0.0013904837891459466,
"completion_length": 82.09375,
"epoch": 0.6866666666666666,
"grad_norm": 0.06465280465321986,
"kl": 0.1673828125,
"learning_rate": 1e-06,
"loss": -0.0003,
"reward": 0.9874999821186066,
"reward_std": 0.010350990109145641,
"rewards/judge_tool_use": 0.984375,
"rewards/judge_tool_use/std": 0.023935683071613312,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1030
},
{
"clip_ratio": 0.0010101010091602802,
"completion_length": 95.77083333333333,
"epoch": 0.69,
"grad_norm": 0.0497496702632166,
"kl": 0.26328125,
"learning_rate": 1e-06,
"loss": 0.0005,
"reward": 0.9883333245913187,
"reward_std": 0.010886183629433313,
"rewards/judge_tool_use": 0.9854166507720947,
"rewards/judge_tool_use/std": 0.017078256855408352,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1035
},
{
"clip_ratio": 0.0,
"completion_length": 89.5625,
"epoch": 0.6933333333333334,
"grad_norm": 0.05805638933980568,
"kl": 0.33515625,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 0.9474999904632568,
"reward_std": 0.007071072701364756,
"rewards/judge_tool_use": 0.996874988079071,
"rewards/judge_tool_use/std": 0.012500002980232239,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 1040
},
{
"clip_ratio": 0.0005376344081014395,
"completion_length": 69.0,
"epoch": 0.6966666666666667,
"grad_norm": 0.057741287045645795,
"kl": 0.23125,
"learning_rate": 1e-06,
"loss": 0.0005,
"reward": 0.9933333396911621,
"reward_std": 0.007126972700158755,
"rewards/judge_tool_use": 0.9916666746139526,
"rewards/judge_tool_use/std": 0.01490712414185206,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1045
},
{
"clip_ratio": 0.0020588235929608345,
"completion_length": 89.0,
"epoch": 0.7,
"grad_norm": 0.11086366545110844,
"kl": 0.2451171875,
"learning_rate": 1e-06,
"loss": 0.0005,
"reward": 0.9975000023841858,
"reward_std": 0.007071069907397032,
"rewards/judge_tool_use": 0.996874988079071,
"rewards/judge_tool_use/std": 0.012500002980232239,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1050
},
{
"clip_ratio": 0.002628726325929165,
"completion_length": 75.72916666666667,
"epoch": 0.7033333333333334,
"grad_norm": 1.1943993692559909,
"kl": 11.296875,
"learning_rate": 1e-06,
"loss": 0.0049,
"reward": 0.8274999856948853,
"reward_std": 0.11860653261343639,
"rewards/judge_tool_use": 0.8416666587193807,
"rewards/judge_tool_use/std": 0.2197331190109253,
"rewards/verify_correctness": 0.7708333333333334,
"rewards/verify_correctness/std": 0.3198537329832713,
"step": 1055
},
{
"clip_ratio": 0.0027464469894766808,
"completion_length": 76.46875,
"epoch": 0.7066666666666667,
"grad_norm": 0.09154251338419422,
"kl": 0.7396484375,
"learning_rate": 1e-06,
"loss": 0.0038,
"reward": 0.9200000166893005,
"reward_std": 0.029206860810518265,
"rewards/judge_tool_use": 0.9624999761581421,
"rewards/judge_tool_use/std": 0.03872983902692795,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 1060
},
{
"clip_ratio": 0.0025510898791253566,
"completion_length": 85.54166666666667,
"epoch": 0.71,
"grad_norm": 7.4396963421241615,
"kl": 1.466015625,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 0.9608333309491476,
"reward_std": 0.016499162030716736,
"rewards/judge_tool_use": 0.9979166587193807,
"rewards/judge_tool_use/std": 0.008333335320154825,
"rewards/verify_correctness": 0.8125,
"rewards/verify_correctness/std": 0.25546592473983765,
"step": 1065
},
{
"clip_ratio": 0.00016474464209750294,
"completion_length": 65.46875,
"epoch": 0.7133333333333334,
"grad_norm": 0.38680025327426837,
"kl": 0.7421875,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 0.9950000047683716,
"reward_std": 0.014142131432890892,
"rewards/judge_tool_use": 0.9937500059604645,
"rewards/judge_tool_use/std": 0.02500000037252903,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1070
},
{
"clip_ratio": 0.0,
"completion_length": 79.91666666666667,
"epoch": 0.7166666666666667,
"grad_norm": 0.040315040411472755,
"kl": 0.2189453125,
"learning_rate": 1e-06,
"loss": 0.0005,
"reward": 0.9933333396911621,
"reward_std": 0.007126972700158755,
"rewards/judge_tool_use": 0.9916666746139526,
"rewards/judge_tool_use/std": 0.01490712414185206,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1075
},
{
"clip_ratio": 0.0,
"completion_length": 68.0625,
"epoch": 0.72,
"grad_norm": 1.7493474554433537,
"kl": 0.369140625,
"learning_rate": 1e-06,
"loss": -0.0001,
"reward": 0.987500011920929,
"reward_std": 0.02314550243318081,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 0.9375,
"rewards/verify_correctness/std": 0.17078252136707306,
"step": 1080
},
{
"clip_ratio": 0.001583357620984316,
"completion_length": 86.85416666666667,
"epoch": 0.7233333333333334,
"grad_norm": 44.87687812229357,
"kl": 8.5908203125,
"learning_rate": 1e-06,
"loss": 0.0171,
"reward": 0.9758333166440328,
"reward_std": 0.029838324524462223,
"rewards/judge_tool_use": 0.9749999841054281,
"rewards/judge_tool_use/std": 0.0367970938483874,
"rewards/verify_correctness": 0.9791666666666666,
"rewards/verify_correctness/std": 0.08333333333333333,
"step": 1085
},
{
"clip_ratio": 0.0002669311594218016,
"completion_length": 75.15625,
"epoch": 0.7266666666666667,
"grad_norm": 0.9238188968217804,
"kl": 2.844921875,
"learning_rate": 1e-06,
"loss": 0.005,
"reward": 0.9012499749660492,
"reward_std": 0.05243951827287674,
"rewards/judge_tool_use": 0.9468749761581421,
"rewards/judge_tool_use/std": 0.02212652750313282,
"rewards/verify_correctness": 0.71875,
"rewards/verify_correctness/std": 0.2561737895011902,
"step": 1090
},
{
"clip_ratio": 0.000481430534273386,
"completion_length": 77.95833333333333,
"epoch": 0.73,
"grad_norm": 0.7345981685759967,
"kl": 0.368359375,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 0.9983333349227905,
"reward_std": 0.0047140466049313545,
"rewards/judge_tool_use": 0.9979166587193807,
"rewards/judge_tool_use/std": 0.008333335320154825,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1095
},
{
"clip_ratio": 0.002522681839764118,
"completion_length": 106.15625,
"epoch": 0.7333333333333333,
"grad_norm": 0.14338628771134002,
"kl": 0.305078125,
"learning_rate": 1e-06,
"loss": 0.0012,
"reward": 0.9387499988079071,
"reward_std": 0.03364227432757616,
"rewards/judge_tool_use": 0.9781250059604645,
"rewards/judge_tool_use/std": 0.04251633584499359,
"rewards/verify_correctness": 0.78125,
"rewards/verify_correctness/std": 0.2561737895011902,
"step": 1100
},
{
"clip_ratio": 0.0025735294446349146,
"completion_length": 59.166666666666664,
"epoch": 0.7366666666666667,
"grad_norm": 0.036437347174280016,
"kl": 0.2404296875,
"learning_rate": 1e-06,
"loss": 0.0005,
"reward": 0.9649999936421713,
"reward_std": 0.004714048467576504,
"rewards/judge_tool_use": 0.9979166587193807,
"rewards/judge_tool_use/std": 0.008333335320154825,
"rewards/verify_correctness": 0.8333333333333334,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 1105
},
{
"clip_ratio": 0.00031796502880752084,
"completion_length": 78.375,
"epoch": 0.74,
"grad_norm": 0.057036900120037294,
"kl": 0.21328125,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 0.9462500214576721,
"reward_std": 0.08264750707894564,
"rewards/judge_tool_use": 0.9406249821186066,
"rewards/judge_tool_use/std": 0.15678166970610619,
"rewards/verify_correctness": 0.96875,
"rewards/verify_correctness/std": 0.125,
"step": 1110
},
{
"clip_ratio": 0.0009749303571879864,
"completion_length": 77.10416666666667,
"epoch": 0.7433333333333333,
"grad_norm": 0.09765470973201737,
"kl": 0.31328125,
"learning_rate": 1e-06,
"loss": 0.0009,
"reward": 0.9383333325386047,
"reward_std": 0.01632875545571248,
"rewards/judge_tool_use": 0.9645833174387614,
"rewards/judge_tool_use/std": 0.033744927495718,
"rewards/verify_correctness": 0.8333333333333334,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 1115
},
{
"clip_ratio": 0.005039867106825113,
"completion_length": 96.15625,
"epoch": 0.7466666666666667,
"grad_norm": 0.39855344737975584,
"kl": 0.3517578125,
"learning_rate": 1e-06,
"loss": -0.005,
"reward": 0.8637499809265137,
"reward_std": 0.09736945712938905,
"rewards/judge_tool_use": 0.9312499761581421,
"rewards/judge_tool_use/std": 0.1450628936290741,
"rewards/verify_correctness": 0.59375,
"rewards/verify_correctness/std": 0.497555673122406,
"step": 1120
},
{
"clip_ratio": 0.000663868710398674,
"completion_length": 88.70833333333333,
"epoch": 0.75,
"grad_norm": 0.03425710766538615,
"kl": 0.230078125,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 0.9458333253860474,
"reward_std": 0.05892556874702374,
"rewards/judge_tool_use": 0.9791666666666666,
"rewards/judge_tool_use/std": 0.0749652733405431,
"rewards/verify_correctness": 0.8125,
"rewards/verify_correctness/std": 0.25546592473983765,
"step": 1125
},
{
"clip_ratio": 0.0,
"completion_length": 74.4375,
"epoch": 0.7533333333333333,
"grad_norm": 0.061636357119929305,
"kl": 0.26015625,
"learning_rate": 1e-06,
"loss": 0.0005,
"reward": 1.0,
"reward_std": 0.0,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1130
},
{
"clip_ratio": 0.0007194244768470526,
"completion_length": 72.625,
"epoch": 0.7566666666666667,
"grad_norm": 0.054919383958076454,
"kl": 0.378515625,
"learning_rate": 1e-06,
"loss": 0.0009,
"reward": 0.9599999984105428,
"reward_std": 0.007126967112223308,
"rewards/judge_tool_use": 0.9916666746139526,
"rewards/judge_tool_use/std": 0.01490712414185206,
"rewards/verify_correctness": 0.8333333333333334,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 1135
},
{
"clip_ratio": 0.0003389830468222499,
"completion_length": 70.875,
"epoch": 0.76,
"grad_norm": 0.2643017191768637,
"kl": 0.21484375,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 0.9824999868869781,
"reward_std": 0.027645720168948174,
"rewards/judge_tool_use": 0.9937500059604645,
"rewards/judge_tool_use/std": 0.017078254371881485,
"rewards/verify_correctness": 0.9375,
"rewards/verify_correctness/std": 0.17078252136707306,
"step": 1140
},
{
"clip_ratio": 0.00042253523133695126,
"completion_length": 86.875,
"epoch": 0.7633333333333333,
"grad_norm": 0.05569328969548695,
"kl": 0.20859375,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 0.9983333349227905,
"reward_std": 0.0047140466049313545,
"rewards/judge_tool_use": 0.9979166587193807,
"rewards/judge_tool_use/std": 0.008333335320154825,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1145
},
{
"clip_ratio": 0.0009957325644791126,
"completion_length": 70.15625,
"epoch": 0.7666666666666667,
"grad_norm": 0.04901324608109469,
"kl": 0.327734375,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 0.9474999904632568,
"reward_std": 0.007071069907397032,
"rewards/judge_tool_use": 0.996874988079071,
"rewards/judge_tool_use/std": 0.012500002980232239,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 1150
},
{
"clip_ratio": 0.0023318555206060408,
"completion_length": 75.5625,
"epoch": 0.77,
"grad_norm": 0.04106910726707159,
"kl": 0.2216796875,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 0.9566666682561239,
"reward_std": 0.011614705125490824,
"rewards/judge_tool_use": 0.987499992052714,
"rewards/judge_tool_use/std": 0.016666671882073086,
"rewards/verify_correctness": 0.8333333333333334,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 1155
},
{
"clip_ratio": 0.0005040322430431843,
"completion_length": 83.625,
"epoch": 0.7733333333333333,
"grad_norm": 2.987988625745358,
"kl": 0.309765625,
"learning_rate": 1e-06,
"loss": -0.0085,
"reward": 0.8725000023841858,
"reward_std": 0.08379396051168442,
"rewards/judge_tool_use": 0.903124988079071,
"rewards/judge_tool_use/std": 0.17461267113685608,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 1160
},
{
"clip_ratio": 0.0,
"completion_length": 103.64583333333333,
"epoch": 0.7766666666666666,
"grad_norm": 0.02629597807895239,
"kl": 0.228125,
"learning_rate": 1e-06,
"loss": 0.0005,
"reward": 0.9649999936421713,
"reward_std": 0.0047140466049313545,
"rewards/judge_tool_use": 0.9979166587193807,
"rewards/judge_tool_use/std": 0.008333335320154825,
"rewards/verify_correctness": 0.8333333333333334,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 1165
},
{
"clip_ratio": 0.0,
"completion_length": 57.96875,
"epoch": 0.78,
"grad_norm": 0.07410127643745398,
"kl": 0.1939453125,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 1.0,
"reward_std": 0.0,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1170
},
{
"clip_ratio": 0.0006822766736149788,
"completion_length": 84.10416666666667,
"epoch": 0.7833333333333333,
"grad_norm": 0.15022605399359662,
"kl": 0.499609375,
"learning_rate": 1e-06,
"loss": 0.0053,
"reward": 0.8541666666666666,
"reward_std": 0.08558030892163515,
"rewards/judge_tool_use": 0.875,
"rewards/judge_tool_use/std": 0.16530899827679,
"rewards/verify_correctness": 0.7708333333333334,
"rewards/verify_correctness/std": 0.3065035541852315,
"step": 1175
},
{
"clip_ratio": 0.0013619335135445,
"completion_length": 59.5,
"epoch": 0.7866666666666666,
"grad_norm": 0.5652312034577931,
"kl": 0.7025390625,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 0.956250011920929,
"reward_std": 0.04082316905260086,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 0.78125,
"rewards/verify_correctness/std": 0.375,
"step": 1180
},
{
"clip_ratio": 0.00011415524641051888,
"completion_length": 81.60416666666667,
"epoch": 0.79,
"grad_norm": 0.12620412707671058,
"kl": 0.3166015625,
"learning_rate": 1e-06,
"loss": -0.0005,
"reward": 0.9799999992052714,
"reward_std": 0.021484845007459324,
"rewards/judge_tool_use": 0.9854166706403097,
"rewards/judge_tool_use/std": 0.027342626204093296,
"rewards/verify_correctness": 0.9583333333333334,
"rewards/verify_correctness/std": 0.11385501424471538,
"step": 1185
},
{
"clip_ratio": 0.0010737302247434855,
"completion_length": 110.46875,
"epoch": 0.7933333333333333,
"grad_norm": 1.7918621578186396,
"kl": 0.730078125,
"learning_rate": 1e-06,
"loss": 0.0108,
"reward": 0.78125,
"reward_std": 0.20286056958138943,
"rewards/judge_tool_use": 0.859375,
"rewards/judge_tool_use/std": 0.23376120440661907,
"rewards/verify_correctness": 0.46875,
"rewards/verify_correctness/std": 0.5143726766109467,
"step": 1190
},
{
"clip_ratio": 6.253908504731953e-05,
"completion_length": 111.41666666666667,
"epoch": 0.7966666666666666,
"grad_norm": 0.03905890135125302,
"kl": 0.242578125,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 0.9733333190282186,
"reward_std": 0.05249338845411936,
"rewards/judge_tool_use": 0.9770833253860474,
"rewards/judge_tool_use/std": 0.06962200005849202,
"rewards/verify_correctness": 0.9583333333333334,
"rewards/verify_correctness/std": 0.11385501424471538,
"step": 1195
},
{
"clip_ratio": 0.001590106077492237,
"completion_length": 140.0,
"epoch": 0.8,
"grad_norm": 0.40701520002938785,
"kl": 0.385546875,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 0.9937500059604645,
"reward_std": 0.01767767034471035,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 0.96875,
"rewards/verify_correctness/std": 0.125,
"step": 1200
},
{
"clip_ratio": 0.0020255075418390335,
"completion_length": 88.85416666666667,
"epoch": 0.8033333333333333,
"grad_norm": 0.05096213910492859,
"kl": 6.8375,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 0.9675000111262003,
"reward_std": 0.06057482430090507,
"rewards/judge_tool_use": 0.9749999841054281,
"rewards/judge_tool_use/std": 0.0746867706378301,
"rewards/verify_correctness": 0.9375,
"rewards/verify_correctness/std": 0.1343709627787272,
"step": 1205
},
{
"clip_ratio": 0.002165872976183891,
"completion_length": 113.46875,
"epoch": 0.8066666666666666,
"grad_norm": 0.038572489497010296,
"kl": 1.9291015625,
"learning_rate": 1e-06,
"loss": -0.0044,
"reward": 0.9662500023841858,
"reward_std": 0.08459719270467758,
"rewards/judge_tool_use": 0.965624988079071,
"rewards/judge_tool_use/std": 0.09953015297651291,
"rewards/verify_correctness": 0.96875,
"rewards/verify_correctness/std": 0.125,
"step": 1210
},
{
"clip_ratio": 0.002159976586699486,
"completion_length": 123.64583333333333,
"epoch": 0.81,
"grad_norm": 358.62576549649833,
"kl": 6.189453125,
"learning_rate": 1e-06,
"loss": 0.0137,
"reward": 0.9066666563351949,
"reward_std": 0.07383281799654166,
"rewards/judge_tool_use": 0.956250011920929,
"rewards/judge_tool_use/std": 0.09136871124307315,
"rewards/verify_correctness": 0.7083333333333334,
"rewards/verify_correctness/std": 0.4150371154149373,
"step": 1215
},
{
"clip_ratio": 0.0018026274861767887,
"completion_length": 104.34375,
"epoch": 0.8133333333333334,
"grad_norm": 3.067095761854335,
"kl": 1.104296875,
"learning_rate": 1e-06,
"loss": -0.0106,
"reward": 0.9012500047683716,
"reward_std": 0.19770433753728867,
"rewards/judge_tool_use": 0.9156250059604645,
"rewards/judge_tool_use/std": 0.22225218266248703,
"rewards/verify_correctness": 0.84375,
"rewards/verify_correctness/std": 0.34860680997371674,
"step": 1220
},
{
"clip_ratio": 0.0,
"completion_length": 94.08333333333333,
"epoch": 0.8166666666666667,
"grad_norm": 2.5780187257288305,
"kl": 0.45625,
"learning_rate": 1e-06,
"loss": -0.0012,
"reward": 0.9458333253860474,
"reward_std": 0.04852588474750519,
"rewards/judge_tool_use": 0.9791666666666666,
"rewards/judge_tool_use/std": 0.06652763485908508,
"rewards/verify_correctness": 0.8125,
"rewards/verify_correctness/std": 0.25546592473983765,
"step": 1225
},
{
"clip_ratio": 0.0011104722507297993,
"completion_length": 113.84375,
"epoch": 0.82,
"grad_norm": 121.8815400237713,
"kl": 11.90546875,
"learning_rate": 1e-06,
"loss": 0.0468,
"reward": 0.8700000047683716,
"reward_std": 0.2571648806333542,
"rewards/judge_tool_use": 0.8843749761581421,
"rewards/judge_tool_use/std": 0.25856370478868484,
"rewards/verify_correctness": 0.8125,
"rewards/verify_correctness/std": 0.3943893313407898,
"step": 1230
},
{
"clip_ratio": 0.0026565464213490486,
"completion_length": 93.3125,
"epoch": 0.8233333333333334,
"grad_norm": 0.10321332519745828,
"kl": 0.405078125,
"learning_rate": 1e-06,
"loss": -0.0093,
"reward": 0.9666666587193807,
"reward_std": 0.05114200680206219,
"rewards/judge_tool_use": 0.9791666666666666,
"rewards/judge_tool_use/std": 0.0749652733405431,
"rewards/verify_correctness": 0.9166666666666666,
"rewards/verify_correctness/std": 0.1490712066491445,
"step": 1235
},
{
"clip_ratio": 0.0024265490006655454,
"completion_length": 115.21875,
"epoch": 0.8266666666666667,
"grad_norm": 0.23216984855971948,
"kl": 0.3140625,
"learning_rate": 1e-06,
"loss": 0.004,
"reward": 0.8787499964237213,
"reward_std": 0.09615195170044899,
"rewards/judge_tool_use": 0.9500000178813934,
"rewards/judge_tool_use/std": 0.11884498223662376,
"rewards/verify_correctness": 0.59375,
"rewards/verify_correctness/std": 0.4797805994749069,
"step": 1240
},
{
"clip_ratio": 0.0017241379246115685,
"completion_length": 83.3125,
"epoch": 0.83,
"grad_norm": 99.5968858061048,
"kl": 7.05859375,
"learning_rate": 1e-06,
"loss": 0.0057,
"reward": 0.8924999833106995,
"reward_std": 0.12001222868760426,
"rewards/judge_tool_use": 0.8916666706403097,
"rewards/judge_tool_use/std": 0.19258573154608408,
"rewards/verify_correctness": 0.8958333333333334,
"rewards/verify_correctness/std": 0.23240453998247781,
"step": 1245
},
{
"clip_ratio": 0.00025445292703807354,
"completion_length": 107.09375,
"epoch": 0.8333333333333334,
"grad_norm": 0.03256099733878858,
"kl": 4.037109375,
"learning_rate": 1e-06,
"loss": 0.0071,
"reward": 0.9849999845027924,
"reward_std": 0.009258206002414227,
"rewards/judge_tool_use": 0.981249988079071,
"rewards/judge_tool_use/std": 0.025000007823109627,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1250
},
{
"clip_ratio": 0.0017938947305083275,
"completion_length": 94.35416666666667,
"epoch": 0.8366666666666667,
"grad_norm": 0.06251358849592821,
"kl": 0.41484375,
"learning_rate": 1e-06,
"loss": 0.0011,
"reward": 0.8608333269755045,
"reward_std": 0.031753117529054485,
"rewards/judge_tool_use": 0.893749992052714,
"rewards/judge_tool_use/std": 0.10712230205535889,
"rewards/verify_correctness": 0.7291666666666666,
"rewards/verify_correctness/std": 0.331703782081604,
"step": 1255
},
{
"clip_ratio": 0.000598287198226899,
"completion_length": 80.0,
"epoch": 0.84,
"grad_norm": 0.05840859691708302,
"kl": 0.53984375,
"learning_rate": 1e-06,
"loss": -0.0014,
"reward": 0.9712499976158142,
"reward_std": 0.07356970012187958,
"rewards/judge_tool_use": 0.971875011920929,
"rewards/judge_tool_use/std": 0.09994790703058243,
"rewards/verify_correctness": 0.96875,
"rewards/verify_correctness/std": 0.125,
"step": 1260
},
{
"clip_ratio": 0.0018959624227136373,
"completion_length": 147.64583333333334,
"epoch": 0.8433333333333334,
"grad_norm": 0.03529068907540189,
"kl": 0.275390625,
"learning_rate": 1e-06,
"loss": -0.0002,
"reward": 0.949999988079071,
"reward_std": 0.02855063695460558,
"rewards/judge_tool_use": 0.9791666467984518,
"rewards/judge_tool_use/std": 0.038437106957038246,
"rewards/verify_correctness": 0.8333333333333334,
"rewards/verify_correctness/std": 0.25411585966746014,
"step": 1265
},
{
"clip_ratio": 0.0008389623428229243,
"completion_length": 90.15625,
"epoch": 0.8466666666666667,
"grad_norm": 1.2714158856293196,
"kl": 1.3701171875,
"learning_rate": 1e-06,
"loss": 0.0127,
"reward": 0.913750022649765,
"reward_std": 0.08004017360508442,
"rewards/judge_tool_use": 0.9625000059604645,
"rewards/judge_tool_use/std": 0.11686970666050911,
"rewards/verify_correctness": 0.71875,
"rewards/verify_correctness/std": 0.38319888710975647,
"step": 1270
},
{
"clip_ratio": 0.0019859145628288387,
"completion_length": 88.08333333333333,
"epoch": 0.85,
"grad_norm": 0.06215294502600355,
"kl": 0.309765625,
"learning_rate": 1e-06,
"loss": 0.0012,
"reward": 0.9799999793370565,
"reward_std": 0.012344274669885635,
"rewards/judge_tool_use": 0.9749999841054281,
"rewards/judge_tool_use/std": 0.03333334376414617,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1275
},
{
"clip_ratio": 0.00028956440510228274,
"completion_length": 125.1875,
"epoch": 0.8533333333333334,
"grad_norm": 0.35612576827223785,
"kl": 0.3298828125,
"learning_rate": 1e-06,
"loss": 0.0075,
"reward": 0.8712500035762787,
"reward_std": 0.08317051082849503,
"rewards/judge_tool_use": 0.925000011920929,
"rewards/judge_tool_use/std": 0.1485760398209095,
"rewards/verify_correctness": 0.65625,
"rewards/verify_correctness/std": 0.4797805994749069,
"step": 1280
},
{
"clip_ratio": 0.000690448796376586,
"completion_length": 77.375,
"epoch": 0.8566666666666667,
"grad_norm": 0.08452869378749826,
"kl": 2.195703125,
"learning_rate": 1e-06,
"loss": 0.0217,
"reward": 0.9083333412806193,
"reward_std": 0.06206287909299135,
"rewards/judge_tool_use": 0.9270833333333334,
"rewards/judge_tool_use/std": 0.13058080275853476,
"rewards/verify_correctness": 0.8333333333333334,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 1285
},
{
"clip_ratio": 0.0005081300623714924,
"completion_length": 94.1875,
"epoch": 0.86,
"grad_norm": 0.1776444100641673,
"kl": 0.2853515625,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 0.9737499952316284,
"reward_std": 0.02722262777388096,
"rewards/judge_tool_use": 0.9906249940395355,
"rewards/judge_tool_use/std": 0.0201556496322155,
"rewards/verify_correctness": 0.90625,
"rewards/verify_correctness/std": 0.20155644416809082,
"step": 1290
},
{
"clip_ratio": 0.0010418544989079237,
"completion_length": 80.91666666666667,
"epoch": 0.8633333333333333,
"grad_norm": 0.0610474709944994,
"kl": 0.6662109375,
"learning_rate": 1e-06,
"loss": 0.0053,
"reward": 0.962499996026357,
"reward_std": 0.055602967428664364,
"rewards/judge_tool_use": 0.96875,
"rewards/judge_tool_use/std": 0.0822451909383138,
"rewards/verify_correctness": 0.9375,
"rewards/verify_correctness/std": 0.1343709627787272,
"step": 1295
},
{
"clip_ratio": 0.0009510869160294533,
"completion_length": 77.75,
"epoch": 0.8666666666666667,
"grad_norm": 0.9107590450435429,
"kl": 0.2291015625,
"learning_rate": 1e-06,
"loss": -0.0002,
"reward": 0.9449999630451202,
"reward_std": 0.05099019035696983,
"rewards/judge_tool_use": 0.9312500059604645,
"rewards/judge_tool_use/std": 0.12392698042094707,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1300
},
{
"clip_ratio": 0.0005154639016836882,
"completion_length": 69.60416666666667,
"epoch": 0.87,
"grad_norm": 1.2573438996166055,
"kl": 0.34296875,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 0.9675000111262003,
"reward_std": 0.07550223357975483,
"rewards/judge_tool_use": 0.975000003973643,
"rewards/judge_tool_use/std": 0.08629285047451656,
"rewards/verify_correctness": 0.9375,
"rewards/verify_correctness/std": 0.1343709627787272,
"step": 1305
},
{
"clip_ratio": 0.0013909014873206616,
"completion_length": 92.96875,
"epoch": 0.8733333333333333,
"grad_norm": 0.022891900254080604,
"kl": 0.308984375,
"learning_rate": 1e-06,
"loss": 0.0001,
"reward": 0.9925000071525574,
"reward_std": 0.01632927590981126,
"rewards/judge_tool_use": 0.9906249940395355,
"rewards/judge_tool_use/std": 0.029578257352113724,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1310
},
{
"clip_ratio": 0.0,
"completion_length": 56.0625,
"epoch": 0.8766666666666667,
"grad_norm": 0.6528272207623552,
"kl": 0.2763671875,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 0.9950000047683716,
"reward_std": 0.006900660072763761,
"rewards/judge_tool_use": 0.993749996026357,
"rewards/judge_tool_use/std": 0.013437099754810333,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1315
},
{
"clip_ratio": 0.002574713109061122,
"completion_length": 93.40625,
"epoch": 0.88,
"grad_norm": 2.691428684412783,
"kl": 0.2265625,
"learning_rate": 1e-06,
"loss": 0.0205,
"reward": 0.9312499761581421,
"reward_std": 0.09005174040794373,
"rewards/judge_tool_use": 0.921875,
"rewards/judge_tool_use/std": 0.12776117026805878,
"rewards/verify_correctness": 0.96875,
"rewards/verify_correctness/std": 0.125,
"step": 1320
},
{
"clip_ratio": 0.0005747126415371895,
"completion_length": 74.3125,
"epoch": 0.8833333333333333,
"grad_norm": 1.4432230028882391,
"kl": 0.637890625,
"learning_rate": 1e-06,
"loss": 0.0009,
"reward": 0.9583333333333334,
"reward_std": 0.006900658831000328,
"rewards/judge_tool_use": 0.9895833333333334,
"rewards/judge_tool_use/std": 0.015957122047742207,
"rewards/verify_correctness": 0.8333333333333334,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 1325
},
{
"clip_ratio": 0.0,
"completion_length": 70.53125,
"epoch": 0.8866666666666667,
"grad_norm": 0.054306836971010644,
"kl": 0.4212890625,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 0.9912500083446503,
"reward_std": 0.02474873699247837,
"rewards/judge_tool_use": 0.996874988079071,
"rewards/judge_tool_use/std": 0.012500002980232239,
"rewards/verify_correctness": 0.96875,
"rewards/verify_correctness/std": 0.125,
"step": 1330
},
{
"clip_ratio": 0.0010802339063957333,
"completion_length": 88.89583333333333,
"epoch": 0.89,
"grad_norm": 1.275564433705307,
"kl": 0.525390625,
"learning_rate": 1e-06,
"loss": -0.0011,
"reward": 0.7258332967758179,
"reward_std": 0.11929505753020446,
"rewards/judge_tool_use": 0.7874999841054281,
"rewards/judge_tool_use/std": 0.17958564311265945,
"rewards/verify_correctness": 0.4791666666666667,
"rewards/verify_correctness/std": 0.25546592473983765,
"step": 1335
},
{
"clip_ratio": 0.0011502533918246627,
"completion_length": 77.8125,
"epoch": 0.8933333333333333,
"grad_norm": 0.09495443439677693,
"kl": 0.267578125,
"learning_rate": 1e-06,
"loss": -0.0008,
"reward": 0.9325000047683716,
"reward_std": 0.024799177423119545,
"rewards/judge_tool_use": 0.9781249761581421,
"rewards/judge_tool_use/std": 0.036371923983097076,
"rewards/verify_correctness": 0.75,
"rewards/verify_correctness/std": 0.25819888710975647,
"step": 1340
},
{
"clip_ratio": 0.0015889291651546955,
"completion_length": 64.45833333333333,
"epoch": 0.8966666666666666,
"grad_norm": 0.14989201200913477,
"kl": 0.2818359375,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 0.9816666841506958,
"reward_std": 0.028729444990555447,
"rewards/judge_tool_use": 0.987500011920929,
"rewards/judge_tool_use/std": 0.026292627056439716,
"rewards/verify_correctness": 0.9583333333333334,
"rewards/verify_correctness/std": 0.11385501424471538,
"step": 1345
},
{
"clip_ratio": 0.0019876956939697266,
"completion_length": 68.3125,
"epoch": 0.9,
"grad_norm": 0.1295069839584885,
"kl": 0.2017578125,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 0.9975000023841858,
"reward_std": 0.007071069907397032,
"rewards/judge_tool_use": 0.996874988079071,
"rewards/judge_tool_use/std": 0.012500002980232239,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1350
},
{
"clip_ratio": 0.001906401664018631,
"completion_length": 78.83333333333333,
"epoch": 0.9033333333333333,
"grad_norm": 0.3518791950466458,
"kl": 0.2375,
"learning_rate": 1e-06,
"loss": 0.0066,
"reward": 0.8791666626930237,
"reward_std": 0.03298428406318029,
"rewards/judge_tool_use": 0.8854166666666666,
"rewards/judge_tool_use/std": 0.10253725449244182,
"rewards/verify_correctness": 0.8541666666666666,
"rewards/verify_correctness/std": 0.1707825263341268,
"step": 1355
},
{
"clip_ratio": 0.0007567567750811577,
"completion_length": 82.03125,
"epoch": 0.9066666666666666,
"grad_norm": 0.021035024825950537,
"kl": 0.8796875,
"learning_rate": 1e-06,
"loss": 0.0294,
"reward": 0.9662500023841858,
"reward_std": 0.07981003820896149,
"rewards/judge_tool_use": 0.965624988079071,
"rewards/judge_tool_use/std": 0.11212901771068573,
"rewards/verify_correctness": 0.96875,
"rewards/verify_correctness/std": 0.125,
"step": 1360
},
{
"clip_ratio": 0.0,
"completion_length": 60.5625,
"epoch": 0.91,
"grad_norm": 0.05787722591961142,
"kl": 0.2779296875,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 0.8633333245913187,
"reward_std": 0.006172137334942818,
"rewards/judge_tool_use": 0.8708333174387614,
"rewards/judge_tool_use/std": 0.1371594878534476,
"rewards/verify_correctness": 0.8333333333333334,
"rewards/verify_correctness/std": 0.1721325914065043,
"step": 1365
},
{
"clip_ratio": 0.0007437300402671099,
"completion_length": 75.75,
"epoch": 0.9133333333333333,
"grad_norm": 0.26276877013042266,
"kl": 0.25859375,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 0.9850000143051147,
"reward_std": 0.01994866505265236,
"rewards/judge_tool_use": 0.9812500178813934,
"rewards/judge_tool_use/std": 0.039438940584659576,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1370
},
{
"clip_ratio": 0.001213064044713974,
"completion_length": 94.8125,
"epoch": 0.9166666666666666,
"grad_norm": 0.17680777202915993,
"kl": 0.333984375,
"learning_rate": 1e-06,
"loss": 0.0012,
"reward": 0.8958333333333334,
"reward_std": 0.06000414118170738,
"rewards/judge_tool_use": 0.9479166467984518,
"rewards/judge_tool_use/std": 0.10853513081868489,
"rewards/verify_correctness": 0.6875,
"rewards/verify_correctness/std": 0.3429151177406311,
"step": 1375
},
{
"clip_ratio": 0.000994318164885044,
"completion_length": 47.0625,
"epoch": 0.92,
"grad_norm": 0.0472238380868765,
"kl": 0.282421875,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 1.0,
"reward_std": 0.0,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1380
},
{
"clip_ratio": 0.0006648936308920383,
"completion_length": 64.58333333333333,
"epoch": 0.9233333333333333,
"grad_norm": 0.03587334591120667,
"kl": 0.17080078125,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 0.9924999872843424,
"reward_std": 0.012051478028297424,
"rewards/judge_tool_use": 0.9958333373069763,
"rewards/judge_tool_use/std": 0.011385502914587656,
"rewards/verify_correctness": 0.9791666666666666,
"rewards/verify_correctness/std": 0.08333333333333333,
"step": 1385
},
{
"clip_ratio": 0.0025167843326926232,
"completion_length": 76.875,
"epoch": 0.9266666666666666,
"grad_norm": 0.3856157588789181,
"kl": 0.3412109375,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 0.9900000095367432,
"reward_std": 0.017422059550881386,
"rewards/judge_tool_use": 0.987500011920929,
"rewards/judge_tool_use/std": 0.02236068621277809,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1390
},
{
"clip_ratio": 0.0,
"completion_length": 65.04166666666667,
"epoch": 0.93,
"grad_norm": 0.1934572773836806,
"kl": 0.3046875,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 1.0,
"reward_std": 0.0,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1395
},
{
"clip_ratio": 0.001198182231746614,
"completion_length": 89.0625,
"epoch": 0.9333333333333333,
"grad_norm": 0.675230815999025,
"kl": 0.3251953125,
"learning_rate": 1e-06,
"loss": 0.0002,
"reward": 0.8862500190734863,
"reward_std": 0.035135677084326744,
"rewards/judge_tool_use": 0.9749999940395355,
"rewards/judge_tool_use/std": 0.04409133270382881,
"rewards/verify_correctness": 0.53125,
"rewards/verify_correctness/std": 0.5143726766109467,
"step": 1400
},
{
"clip_ratio": 0.0,
"completion_length": 78.58333333333333,
"epoch": 0.9366666666666666,
"grad_norm": 0.04618812246067894,
"kl": 0.31796875,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 1.0,
"reward_std": 0.0,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1405
},
{
"clip_ratio": 0.0006472928449511528,
"completion_length": 104.78125,
"epoch": 0.94,
"grad_norm": 0.2027750965971443,
"kl": 0.1837890625,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 0.9512499868869781,
"reward_std": 0.03955394588410854,
"rewards/judge_tool_use": 0.9937500059604645,
"rewards/judge_tool_use/std": 0.017078254371881485,
"rewards/verify_correctness": 0.78125,
"rewards/verify_correctness/std": 0.375,
"step": 1410
},
{
"clip_ratio": 0.0008242387557402253,
"completion_length": 98.375,
"epoch": 0.9433333333333334,
"grad_norm": 0.08416311072246639,
"kl": 0.3388671875,
"learning_rate": 1e-06,
"loss": -0.0008,
"reward": 0.8683333198229471,
"reward_std": 0.14452426508069038,
"rewards/judge_tool_use": 0.918750007947286,
"rewards/judge_tool_use/std": 0.17317061002055803,
"rewards/verify_correctness": 0.6666666666666666,
"rewards/verify_correctness/std": 0.46607474486033124,
"step": 1415
},
{
"clip_ratio": 0.001800605608150363,
"completion_length": 84.84375,
"epoch": 0.9466666666666667,
"grad_norm": 0.06391568829388007,
"kl": 0.3146484375,
"learning_rate": 1e-06,
"loss": -0.0092,
"reward": 0.7537499964237213,
"reward_std": 0.2096980162896216,
"rewards/judge_tool_use": 0.809374988079071,
"rewards/judge_tool_use/std": 0.2246776893734932,
"rewards/verify_correctness": 0.53125,
"rewards/verify_correctness/std": 0.5143726766109467,
"step": 1420
},
{
"clip_ratio": 0.00038022813387215135,
"completion_length": 60.083333333333336,
"epoch": 0.95,
"grad_norm": 0.03794355903634634,
"kl": 0.2689453125,
"learning_rate": 1e-06,
"loss": -0.002,
"reward": 0.9783333341280619,
"reward_std": 0.04257579147815704,
"rewards/judge_tool_use": 0.9833333293596903,
"rewards/judge_tool_use/std": 0.0516397754351298,
"rewards/verify_correctness": 0.9583333333333334,
"rewards/verify_correctness/std": 0.11385501424471538,
"step": 1425
},
{
"clip_ratio": 0.0006096576456911862,
"completion_length": 71.125,
"epoch": 0.9533333333333334,
"grad_norm": 0.30796597977550466,
"kl": 0.2615234375,
"learning_rate": 1e-06,
"loss": 0.0005,
"reward": 0.9749999940395355,
"reward_std": 0.017422061879187822,
"rewards/judge_tool_use": 0.9687499701976776,
"rewards/judge_tool_use/std": 0.04577303305268288,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1430
},
{
"clip_ratio": 0.0024035727605223657,
"completion_length": 72.47916666666667,
"epoch": 0.9566666666666667,
"grad_norm": 1.22539179930226,
"kl": 0.416015625,
"learning_rate": 1e-06,
"loss": 0.0009,
"reward": 0.8508333365122477,
"reward_std": 0.01868577239414056,
"rewards/judge_tool_use": 0.8604166706403097,
"rewards/judge_tool_use/std": 0.1511431708931923,
"rewards/verify_correctness": 0.8125,
"rewards/verify_correctness/std": 0.25546592473983765,
"step": 1435
},
{
"clip_ratio": 0.0,
"completion_length": 54.0,
"epoch": 0.96,
"grad_norm": 0.0669772138366113,
"kl": 0.2806640625,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 1.0,
"reward_std": 0.0,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1440
},
{
"clip_ratio": 0.0001488095265813172,
"completion_length": 71.04166666666667,
"epoch": 0.9633333333333334,
"grad_norm": 0.03299226460421629,
"kl": 0.237109375,
"learning_rate": 1e-06,
"loss": 0.0005,
"reward": 0.9983333349227905,
"reward_std": 0.0047140466049313545,
"rewards/judge_tool_use": 0.9979166587193807,
"rewards/judge_tool_use/std": 0.008333335320154825,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1445
},
{
"clip_ratio": 0.0005509641952812672,
"completion_length": 62.375,
"epoch": 0.9666666666666667,
"grad_norm": 0.169618332875564,
"kl": 0.39140625,
"learning_rate": 1e-06,
"loss": 0.001,
"reward": 0.9362500011920929,
"reward_std": 0.0176776684820652,
"rewards/judge_tool_use": 0.9749999940395355,
"rewards/judge_tool_use/std": 0.025819895789027214,
"rewards/verify_correctness": 0.78125,
"rewards/verify_correctness/std": 0.2561737895011902,
"step": 1450
},
{
"clip_ratio": 0.0,
"completion_length": 69.625,
"epoch": 0.97,
"grad_norm": 0.11741331911619839,
"kl": 0.2830078125,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 0.9983333349227905,
"reward_std": 0.0047140466049313545,
"rewards/judge_tool_use": 0.9979166587193807,
"rewards/judge_tool_use/std": 0.008333335320154825,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1455
},
{
"clip_ratio": 0.0,
"completion_length": 63.875,
"epoch": 0.9733333333333334,
"grad_norm": 0.03231880152902591,
"kl": 0.33046875,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 0.9799999892711639,
"reward_std": 0.0,
"rewards/judge_tool_use": 0.9749999940395355,
"rewards/judge_tool_use/std": 0.025819895789027214,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1460
},
{
"clip_ratio": 0.002763367397710681,
"completion_length": 88.83333333333333,
"epoch": 0.9766666666666667,
"grad_norm": 1.198685689035818,
"kl": 29510.626171875,
"learning_rate": 1e-06,
"loss": 59.2647,
"reward": 0.9008333086967468,
"reward_std": 0.09250006452202797,
"rewards/judge_tool_use": 0.9229166507720947,
"rewards/judge_tool_use/std": 0.12078534811735153,
"rewards/verify_correctness": 0.8125,
"rewards/verify_correctness/std": 0.28463754057884216,
"step": 1465
},
{
"clip_ratio": 0.000631313119083643,
"completion_length": 65.375,
"epoch": 0.98,
"grad_norm": 0.055429824214890476,
"kl": 0.474609375,
"learning_rate": 1e-06,
"loss": -0.0002,
"reward": 0.9625000059604645,
"reward_std": 0.02314549870789051,
"rewards/judge_tool_use": 1.0,
"rewards/judge_tool_use/std": 0.0,
"rewards/verify_correctness": 0.8125,
"rewards/verify_correctness/std": 0.25,
"step": 1470
},
{
"clip_ratio": 0.0,
"completion_length": 88.20833333333333,
"epoch": 0.9833333333333333,
"grad_norm": 0.016207461931570542,
"kl": 0.230078125,
"learning_rate": 1e-06,
"loss": 0.0005,
"reward": 0.9849999944368998,
"reward_std": 0.0047140466049313545,
"rewards/judge_tool_use": 0.981249988079071,
"rewards/judge_tool_use/std": 0.025546599179506302,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1475
},
{
"clip_ratio": 0.0016450773924589156,
"completion_length": 79.625,
"epoch": 0.9866666666666667,
"grad_norm": 1.7797350058464707,
"kl": 0.271875,
"learning_rate": 1e-06,
"loss": 0.0002,
"reward": 0.887499988079071,
"reward_std": 0.08430386334657669,
"rewards/judge_tool_use": 0.90625,
"rewards/judge_tool_use/std": 0.15478479862213135,
"rewards/verify_correctness": 0.8125,
"rewards/verify_correctness/std": 0.25,
"step": 1480
},
{
"clip_ratio": 0.0,
"completion_length": 75.75,
"epoch": 0.99,
"grad_norm": 0.03087383599727828,
"kl": 0.184765625,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 0.996666669845581,
"reward_std": 0.006172137334942818,
"rewards/judge_tool_use": 0.9958333373069763,
"rewards/judge_tool_use/std": 0.011385502914587656,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1485
},
{
"clip_ratio": 0.001946034946013242,
"completion_length": 74.875,
"epoch": 0.9933333333333333,
"grad_norm": 0.17294764604787113,
"kl": 0.408203125,
"learning_rate": 1e-06,
"loss": 0.0,
"reward": 0.9637499749660492,
"reward_std": 0.03092945460230112,
"rewards/judge_tool_use": 0.9624999761581421,
"rewards/judge_tool_use/std": 0.050000015646219254,
"rewards/verify_correctness": 0.96875,
"rewards/verify_correctness/std": 0.125,
"step": 1490
},
{
"clip_ratio": 0.001576576568186283,
"completion_length": 77.97916666666667,
"epoch": 0.9966666666666667,
"grad_norm": 0.26066361310457414,
"kl": 0.3224609375,
"learning_rate": 1e-06,
"loss": 0.0011,
"reward": 0.971666673819224,
"reward_std": 0.03252020105719566,
"rewards/judge_tool_use": 0.9854166507720947,
"rewards/judge_tool_use/std": 0.017078255613644917,
"rewards/verify_correctness": 0.9166666666666666,
"rewards/verify_correctness/std": 0.22771002848943075,
"step": 1495
},
{
"clip_ratio": 0.0008512710221111775,
"completion_length": 57.6875,
"epoch": 1.0,
"grad_norm": 0.30129414295649587,
"kl": 0.2619140625,
"learning_rate": 1e-06,
"loss": 0.001,
"reward": 0.9925000071525574,
"reward_std": 0.010350990109145641,
"rewards/judge_tool_use": 0.9906249940395355,
"rewards/judge_tool_use/std": 0.0201556496322155,
"rewards/verify_correctness": 1.0,
"rewards/verify_correctness/std": 0.0,
"step": 1500
}
],
"logging_steps": 5,
"max_steps": 1500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}