|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 1500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0025945808738470078, |
|
"completion_length": 170.85416666666666, |
|
"epoch": 0.0033333333333333335, |
|
"grad_norm": 93.89875220655702, |
|
"kl": 385.1, |
|
"learning_rate": 4e-07, |
|
"loss": 0.7721, |
|
"reward": 0.6091666718324026, |
|
"reward_std": 0.11040117839972179, |
|
"rewards/judge_tool_use": 0.6104166805744171, |
|
"rewards/judge_tool_use/std": 0.23237329721450806, |
|
"rewards/verify_correctness": 0.6041666666666666, |
|
"rewards/verify_correctness/std": 0.331703782081604, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0060518977232277395, |
|
"completion_length": 177.1875, |
|
"epoch": 0.006666666666666667, |
|
"grad_norm": 9.776842338966992, |
|
"kl": 112.5, |
|
"learning_rate": 9e-07, |
|
"loss": 0.2337, |
|
"reward": 0.5799999833106995, |
|
"reward_std": 0.06415978074073792, |
|
"rewards/judge_tool_use": 0.5531249940395355, |
|
"rewards/judge_tool_use/std": 0.42493732273578644, |
|
"rewards/verify_correctness": 0.6875, |
|
"rewards/verify_correctness/std": 0.457730233669281, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0026892464607954024, |
|
"completion_length": 93.39583333333333, |
|
"epoch": 0.01, |
|
"grad_norm": 9.977887566458119, |
|
"kl": 118.95, |
|
"learning_rate": 1e-06, |
|
"loss": 0.2916, |
|
"reward": 0.8125, |
|
"reward_std": 0.13529950194060802, |
|
"rewards/judge_tool_use": 0.843750019868215, |
|
"rewards/judge_tool_use/std": 0.2237425111234188, |
|
"rewards/verify_correctness": 0.6875, |
|
"rewards/verify_correctness/std": 0.3429151177406311, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.006966326106339693, |
|
"completion_length": 102.1875, |
|
"epoch": 0.013333333333333334, |
|
"grad_norm": 12.06423951719899, |
|
"kl": 125.55, |
|
"learning_rate": 1e-06, |
|
"loss": 0.2913, |
|
"reward": 0.7787500023841858, |
|
"reward_std": 0.18160327523946762, |
|
"rewards/judge_tool_use": 0.778124988079071, |
|
"rewards/judge_tool_use/std": 0.3228672966361046, |
|
"rewards/verify_correctness": 0.78125, |
|
"rewards/verify_correctness/std": 0.375, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0011292789597064257, |
|
"completion_length": 124.3125, |
|
"epoch": 0.016666666666666666, |
|
"grad_norm": 8.195116496059761, |
|
"kl": 61.559375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.1389, |
|
"reward": 0.8116666873296102, |
|
"reward_std": 0.11048179492354393, |
|
"rewards/judge_tool_use": 0.8270833293596903, |
|
"rewards/judge_tool_use/std": 0.24461634953816733, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.39984261989593506, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0029064802452921867, |
|
"completion_length": 59.375, |
|
"epoch": 0.02, |
|
"grad_norm": 0.5796629563759274, |
|
"kl": 4.3453125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0212, |
|
"reward": 0.887499988079071, |
|
"reward_std": 0.10905145853757858, |
|
"rewards/judge_tool_use": 0.890625, |
|
"rewards/judge_tool_use/std": 0.1675497591495514, |
|
"rewards/verify_correctness": 0.875, |
|
"rewards/verify_correctness/std": 0.22360680997371674, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.007409381680190563, |
|
"completion_length": 98.375, |
|
"epoch": 0.023333333333333334, |
|
"grad_norm": 0.8606388492244584, |
|
"kl": 6.91875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0811, |
|
"reward": 0.8641666372617086, |
|
"reward_std": 0.12008555854360263, |
|
"rewards/judge_tool_use": 0.9083333412806193, |
|
"rewards/judge_tool_use/std": 0.1753722901145617, |
|
"rewards/verify_correctness": 0.6875, |
|
"rewards/verify_correctness/std": 0.42213259140650433, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.004702709428966045, |
|
"completion_length": 101.34375, |
|
"epoch": 0.02666666666666667, |
|
"grad_norm": 9.62952082324332, |
|
"kl": 2.60703125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0232, |
|
"reward": 0.8199999928474426, |
|
"reward_std": 0.22975663095712662, |
|
"rewards/judge_tool_use": 0.8531250059604645, |
|
"rewards/judge_tool_use/std": 0.28422578424215317, |
|
"rewards/verify_correctness": 0.6875, |
|
"rewards/verify_correctness/std": 0.3811737895011902, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0037392981350421904, |
|
"completion_length": 115.72916666666667, |
|
"epoch": 0.03, |
|
"grad_norm": 0.41001786033101184, |
|
"kl": 7.5859375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0704, |
|
"reward": 0.812499980131785, |
|
"reward_std": 0.11250268605848153, |
|
"rewards/judge_tool_use": 0.84375, |
|
"rewards/judge_tool_use/std": 0.22761926551659903, |
|
"rewards/verify_correctness": 0.6875, |
|
"rewards/verify_correctness/std": 0.3429151177406311, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0033889828715473413, |
|
"completion_length": 100.75, |
|
"epoch": 0.03333333333333333, |
|
"grad_norm": 3.0378548631776696, |
|
"kl": 0.6390625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0521, |
|
"reward": 0.6862499862909317, |
|
"reward_std": 0.11575066298246384, |
|
"rewards/judge_tool_use": 0.6937499940395355, |
|
"rewards/judge_tool_use/std": 0.22235235385596752, |
|
"rewards/verify_correctness": 0.65625, |
|
"rewards/verify_correctness/std": 0.23935678601264954, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0024284129962325097, |
|
"completion_length": 119.375, |
|
"epoch": 0.03666666666666667, |
|
"grad_norm": 20.786314639465385, |
|
"kl": 1.7916015625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0271, |
|
"reward": 0.8674999872843424, |
|
"reward_std": 0.12038270942866802, |
|
"rewards/judge_tool_use": 0.912500003973643, |
|
"rewards/judge_tool_use/std": 0.1864920680721601, |
|
"rewards/verify_correctness": 0.6875, |
|
"rewards/verify_correctness/std": 0.42213259140650433, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.006763660744763911, |
|
"completion_length": 110.09375, |
|
"epoch": 0.04, |
|
"grad_norm": 109.33119098699642, |
|
"kl": 47.1625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.1779, |
|
"reward": 0.7075000107288361, |
|
"reward_std": 0.13642948493361473, |
|
"rewards/judge_tool_use": 0.7125000059604645, |
|
"rewards/judge_tool_use/std": 0.3916912078857422, |
|
"rewards/verify_correctness": 0.6875, |
|
"rewards/verify_correctness/std": 0.42898140847682953, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0026321998797357083, |
|
"completion_length": 103.39583333333333, |
|
"epoch": 0.043333333333333335, |
|
"grad_norm": 1.4069264237823462, |
|
"kl": 2.836328125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0596, |
|
"reward": 0.5699999978144964, |
|
"reward_std": 0.1616679678360621, |
|
"rewards/judge_tool_use": 0.5875000009934107, |
|
"rewards/judge_tool_use/std": 0.29024802645047504, |
|
"rewards/verify_correctness": 0.5, |
|
"rewards/verify_correctness/std": 0.25411585966746014, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0048656010068953036, |
|
"completion_length": 140.875, |
|
"epoch": 0.04666666666666667, |
|
"grad_norm": 0.05496390689490212, |
|
"kl": 0.383984375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0039, |
|
"reward": 0.7750000059604645, |
|
"reward_std": 0.009258201345801353, |
|
"rewards/judge_tool_use": 0.78125, |
|
"rewards/judge_tool_use/std": 0.22647663950920105, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0005703422240912915, |
|
"completion_length": 101.45833333333333, |
|
"epoch": 0.05, |
|
"grad_norm": 4.7434453029775785, |
|
"kl": 0.85390625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0123, |
|
"reward": 0.8791666626930237, |
|
"reward_std": 0.11438154180844624, |
|
"rewards/judge_tool_use": 0.90625, |
|
"rewards/judge_tool_use/std": 0.1892575373252233, |
|
"rewards/verify_correctness": 0.7708333333333334, |
|
"rewards/verify_correctness/std": 0.32623785734176636, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.004394483286887407, |
|
"completion_length": 115.125, |
|
"epoch": 0.05333333333333334, |
|
"grad_norm": 18.348889188180735, |
|
"kl": 1.1921875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0491, |
|
"reward": 0.8037499785423279, |
|
"reward_std": 0.11402197554707527, |
|
"rewards/judge_tool_use": 0.856249988079071, |
|
"rewards/judge_tool_use/std": 0.22299936041235924, |
|
"rewards/verify_correctness": 0.59375, |
|
"rewards/verify_correctness/std": 0.497555673122406, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0019086383283138276, |
|
"completion_length": 72.72916666666667, |
|
"epoch": 0.056666666666666664, |
|
"grad_norm": 0.18350366354479203, |
|
"kl": 0.076220703125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0004, |
|
"reward": 0.9666666785875956, |
|
"reward_std": 0.03248864381263653, |
|
"rewards/judge_tool_use": 0.9895833333333334, |
|
"rewards/judge_tool_use/std": 0.03110433618227641, |
|
"rewards/verify_correctness": 0.875, |
|
"rewards/verify_correctness/std": 0.16666666666666666, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.003978083655238151, |
|
"completion_length": 128.125, |
|
"epoch": 0.06, |
|
"grad_norm": 0.4888077045674276, |
|
"kl": 0.452734375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0035, |
|
"reward": 0.887499988079071, |
|
"reward_std": 0.0304714092053473, |
|
"rewards/judge_tool_use": 0.984375, |
|
"rewards/judge_tool_use/std": 0.04136751964688301, |
|
"rewards/verify_correctness": 0.5, |
|
"rewards/verify_correctness/std": 0.5163977742195129, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.004061844293028116, |
|
"completion_length": 73.95833333333333, |
|
"epoch": 0.06333333333333334, |
|
"grad_norm": 1.8106715962150735, |
|
"kl": 0.293359375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.001, |
|
"reward": 0.850000003973643, |
|
"reward_std": 0.03771235949049393, |
|
"rewards/judge_tool_use": 0.9375, |
|
"rewards/judge_tool_use/std": 0.07327633599440257, |
|
"rewards/verify_correctness": 0.5, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio": 0.0005249343812465668, |
|
"completion_length": 116.4375, |
|
"epoch": 0.06666666666666667, |
|
"grad_norm": 0.8380959397252633, |
|
"kl": 0.251953125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.008, |
|
"reward": 0.7874999940395355, |
|
"reward_std": 0.05897941440343857, |
|
"rewards/judge_tool_use": 0.796875, |
|
"rewards/judge_tool_use/std": 0.2327149659395218, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.00741090327501297, |
|
"completion_length": 75.02083333333333, |
|
"epoch": 0.07, |
|
"grad_norm": 10.30093582857556, |
|
"kl": 0.8576171875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0212, |
|
"reward": 0.6974999904632568, |
|
"reward_std": 0.15869482358296713, |
|
"rewards/judge_tool_use": 0.7416666547457377, |
|
"rewards/judge_tool_use/std": 0.22486203908920288, |
|
"rewards/verify_correctness": 0.5208333333333334, |
|
"rewards/verify_correctness/std": 0.5150477091471354, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0045037418603897095, |
|
"completion_length": 107.53125, |
|
"epoch": 0.07333333333333333, |
|
"grad_norm": 1.2618789359641311, |
|
"kl": 0.459375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.02, |
|
"reward": 0.918749988079071, |
|
"reward_std": 0.10914891492575407, |
|
"rewards/judge_tool_use": 0.953125, |
|
"rewards/judge_tool_use/std": 0.10675819590687752, |
|
"rewards/verify_correctness": 0.78125, |
|
"rewards/verify_correctness/std": 0.2561737895011902, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0013080392964184284, |
|
"completion_length": 83.29166666666667, |
|
"epoch": 0.07666666666666666, |
|
"grad_norm": 1.3187929435904935, |
|
"kl": 6.78203125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0404, |
|
"reward": 0.8991666634877523, |
|
"reward_std": 0.10660337905089061, |
|
"rewards/judge_tool_use": 0.931249996026357, |
|
"rewards/judge_tool_use/std": 0.17414189875125885, |
|
"rewards/verify_correctness": 0.7708333333333334, |
|
"rewards/verify_correctness/std": 0.3198537329832713, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.001383163803257048, |
|
"completion_length": 127.46875, |
|
"epoch": 0.08, |
|
"grad_norm": 0.07051598250723173, |
|
"kl": 0.60703125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.005, |
|
"reward": 0.7400000095367432, |
|
"reward_std": 0.05770984524860978, |
|
"rewards/judge_tool_use": 0.7999999821186066, |
|
"rewards/judge_tool_use/std": 0.22889777272939682, |
|
"rewards/verify_correctness": 0.5, |
|
"rewards/verify_correctness/std": 0.5163977742195129, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0007298925891518592, |
|
"completion_length": 92.72916666666667, |
|
"epoch": 0.08333333333333333, |
|
"grad_norm": 0.17549127232311668, |
|
"kl": 0.2087890625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0009, |
|
"reward": 0.9616666634877523, |
|
"reward_std": 0.010886183939874172, |
|
"rewards/judge_tool_use": 0.993749996026357, |
|
"rewards/judge_tool_use/std": 0.01971883823474248, |
|
"rewards/verify_correctness": 0.8333333333333334, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.002933995798230171, |
|
"completion_length": 81.46875, |
|
"epoch": 0.08666666666666667, |
|
"grad_norm": 0.4833514179614044, |
|
"kl": 0.1775390625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0003, |
|
"reward": 0.9350000023841858, |
|
"reward_std": 0.017422057688236237, |
|
"rewards/judge_tool_use": 0.981249988079071, |
|
"rewards/judge_tool_use/std": 0.025000007823109627, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.001261868537403643, |
|
"completion_length": 122.97916666666667, |
|
"epoch": 0.09, |
|
"grad_norm": 0.3241107117396865, |
|
"kl": 0.23095703125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0102, |
|
"reward": 0.8791666626930237, |
|
"reward_std": 0.057309987023472786, |
|
"rewards/judge_tool_use": 0.9270833333333334, |
|
"rewards/judge_tool_use/std": 0.1284285510579745, |
|
"rewards/verify_correctness": 0.6875, |
|
"rewards/verify_correctness/std": 0.3429151177406311, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0008610086515545845, |
|
"completion_length": 84.0, |
|
"epoch": 0.09333333333333334, |
|
"grad_norm": 7.900768391435355, |
|
"kl": 0.234375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0296, |
|
"reward": 0.918749988079071, |
|
"reward_std": 0.11230538040399551, |
|
"rewards/judge_tool_use": 0.921875, |
|
"rewards/judge_tool_use/std": 0.16829413175582886, |
|
"rewards/verify_correctness": 0.90625, |
|
"rewards/verify_correctness/std": 0.20155644416809082, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.002098301984369755, |
|
"completion_length": 97.4375, |
|
"epoch": 0.09666666666666666, |
|
"grad_norm": 116546485.23512569, |
|
"kl": 2857376.061328125, |
|
"learning_rate": 1e-06, |
|
"loss": 5723.877, |
|
"reward": 0.7324999968210856, |
|
"reward_std": 0.1401955665399631, |
|
"rewards/judge_tool_use": 0.8374999761581421, |
|
"rewards/judge_tool_use/std": 0.21938102692365646, |
|
"rewards/verify_correctness": 0.3125, |
|
"rewards/verify_correctness/std": 0.3429151177406311, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.00661008469760418, |
|
"completion_length": 91.9375, |
|
"epoch": 0.1, |
|
"grad_norm": 0.15506303872115793, |
|
"kl": 1082.3681640625, |
|
"learning_rate": 1e-06, |
|
"loss": 2.2091, |
|
"reward": 0.875, |
|
"reward_std": 0.08185647381469607, |
|
"rewards/judge_tool_use": 0.90625, |
|
"rewards/judge_tool_use/std": 0.17084430158138275, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 94.04166666666667, |
|
"epoch": 0.10333333333333333, |
|
"grad_norm": 1.1559868262365864, |
|
"kl": 37.68232421875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0821, |
|
"reward": 0.9233333269755045, |
|
"reward_std": 0.03246487428744634, |
|
"rewards/judge_tool_use": 0.9458333253860474, |
|
"rewards/judge_tool_use/std": 0.08484516913692157, |
|
"rewards/verify_correctness": 0.8333333333333334, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.001581813069060445, |
|
"completion_length": 118.40625, |
|
"epoch": 0.10666666666666667, |
|
"grad_norm": 6.97639562650297, |
|
"kl": 1.0310546875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0379, |
|
"reward": 0.7549999952316284, |
|
"reward_std": 0.14613103866577148, |
|
"rewards/judge_tool_use": 0.8187499940395355, |
|
"rewards/judge_tool_use/std": 0.20402206480503082, |
|
"rewards/verify_correctness": 0.5, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0015904867090284825, |
|
"completion_length": 69.1875, |
|
"epoch": 0.11, |
|
"grad_norm": 0.03440563598329304, |
|
"kl": 0.9119140625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0055, |
|
"reward": 0.8925000031789144, |
|
"reward_std": 0.06749333689610164, |
|
"rewards/judge_tool_use": 0.9333333174387614, |
|
"rewards/judge_tool_use/std": 0.11831362545490265, |
|
"rewards/verify_correctness": 0.7291666666666666, |
|
"rewards/verify_correctness/std": 0.331703782081604, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.002113501913845539, |
|
"completion_length": 80.90625, |
|
"epoch": 0.11333333333333333, |
|
"grad_norm": 0.2535518240520116, |
|
"kl": 0.256640625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0007, |
|
"reward": 0.9674999713897705, |
|
"reward_std": 0.027019730769097805, |
|
"rewards/judge_tool_use": 0.9593749940395355, |
|
"rewards/judge_tool_use/std": 0.041013939306139946, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0027771067805588247, |
|
"completion_length": 85.10416666666667, |
|
"epoch": 0.11666666666666667, |
|
"grad_norm": 1.4643118560278143, |
|
"kl": 14.665625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0287, |
|
"reward": 0.9225000143051147, |
|
"reward_std": 0.08353264754017194, |
|
"rewards/judge_tool_use": 0.981250007947286, |
|
"rewards/judge_tool_use/std": 0.06663193802038829, |
|
"rewards/verify_correctness": 0.6875, |
|
"rewards/verify_correctness/std": 0.3462595542271932, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.010677224583923816, |
|
"completion_length": 79.40625, |
|
"epoch": 0.12, |
|
"grad_norm": 23.600689362585086, |
|
"kl": 22.570703125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0499, |
|
"reward": 0.7824999988079071, |
|
"reward_std": 0.0866054892539978, |
|
"rewards/judge_tool_use": 0.9156249761581421, |
|
"rewards/judge_tool_use/std": 0.13062315434217453, |
|
"rewards/verify_correctness": 0.25, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.002086438238620758, |
|
"completion_length": 72.97916666666667, |
|
"epoch": 0.12333333333333334, |
|
"grad_norm": 0.3641677909401427, |
|
"kl": 0.2119140625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0003, |
|
"reward": 0.8666666547457377, |
|
"reward_std": 0.049736435214678444, |
|
"rewards/judge_tool_use": 0.90625, |
|
"rewards/judge_tool_use/std": 0.11474608878294627, |
|
"rewards/verify_correctness": 0.7083333333333334, |
|
"rewards/verify_correctness/std": 0.21770429611206055, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0038325218483805656, |
|
"completion_length": 49.4375, |
|
"epoch": 0.12666666666666668, |
|
"grad_norm": 0.21086652879123133, |
|
"kl": 13.8916015625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0143, |
|
"reward": 0.949999988079071, |
|
"reward_std": 0.11195331811904907, |
|
"rewards/judge_tool_use": 0.953125, |
|
"rewards/judge_tool_use/std": 0.12445715814828873, |
|
"rewards/verify_correctness": 0.9375, |
|
"rewards/verify_correctness/std": 0.17078252136707306, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0009160305373370648, |
|
"completion_length": 68.39583333333333, |
|
"epoch": 0.13, |
|
"grad_norm": 41.93084775556413, |
|
"kl": 4.29609375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.007, |
|
"reward": 0.8799999952316284, |
|
"reward_std": 0.012344265977541605, |
|
"rewards/judge_tool_use": 0.975000003973643, |
|
"rewards/judge_tool_use/std": 0.03998426472147306, |
|
"rewards/verify_correctness": 0.5, |
|
"rewards/verify_correctness/std": 0.5163977742195129, |
|
"step": 195 |
|
}, |
|
{ |
|
"clip_ratio": 0.0023942343890666963, |
|
"completion_length": 104.5, |
|
"epoch": 0.13333333333333333, |
|
"grad_norm": 11.56654747316538, |
|
"kl": 9.48359375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0082, |
|
"reward": 0.7549999952316284, |
|
"reward_std": 0.06465663947165012, |
|
"rewards/judge_tool_use": 0.8031249940395355, |
|
"rewards/judge_tool_use/std": 0.22497396357357502, |
|
"rewards/verify_correctness": 0.5625, |
|
"rewards/verify_correctness/std": 0.5123475790023804, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 55.666666666666664, |
|
"epoch": 0.13666666666666666, |
|
"grad_norm": 0.044316960076541095, |
|
"kl": 0.21640625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.07, |
|
"reward": 0.8816666603088379, |
|
"reward_std": 0.052872808650135994, |
|
"rewards/judge_tool_use": 0.8520833452542623, |
|
"rewards/judge_tool_use/std": 0.179916741947333, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 205 |
|
}, |
|
{ |
|
"clip_ratio": 0.00298199572134763, |
|
"completion_length": 87.1875, |
|
"epoch": 0.14, |
|
"grad_norm": 0.5961166297628009, |
|
"kl": 0.8013671875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0512, |
|
"reward": 0.9087499976158142, |
|
"reward_std": 0.10966175608336926, |
|
"rewards/judge_tool_use": 0.9406249821186066, |
|
"rewards/judge_tool_use/std": 0.16376879438757896, |
|
"rewards/verify_correctness": 0.78125, |
|
"rewards/verify_correctness/std": 0.375, |
|
"step": 210 |
|
}, |
|
{ |
|
"clip_ratio": 0.0016499520279467106, |
|
"completion_length": 90.83333333333333, |
|
"epoch": 0.14333333333333334, |
|
"grad_norm": 4.675832955995631, |
|
"kl": 1.3087890625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.041, |
|
"reward": 0.9633333285649618, |
|
"reward_std": 0.06979535892605782, |
|
"rewards/judge_tool_use": 0.975000003973643, |
|
"rewards/judge_tool_use/std": 0.07252075274785359, |
|
"rewards/verify_correctness": 0.9166666666666666, |
|
"rewards/verify_correctness/std": 0.22771002848943075, |
|
"step": 215 |
|
}, |
|
{ |
|
"clip_ratio": 0.0009900989942252636, |
|
"completion_length": 57.9375, |
|
"epoch": 0.14666666666666667, |
|
"grad_norm": 11.064991822281403, |
|
"kl": 14.7765625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0021, |
|
"reward": 0.9149999916553497, |
|
"reward_std": 0.10488088428974152, |
|
"rewards/judge_tool_use": 0.925000011920929, |
|
"rewards/judge_tool_use/std": 0.1612451672554016, |
|
"rewards/verify_correctness": 0.875, |
|
"rewards/verify_correctness/std": 0.22360680997371674, |
|
"step": 220 |
|
}, |
|
{ |
|
"clip_ratio": 0.006531355949118733, |
|
"completion_length": 75.72916666666667, |
|
"epoch": 0.15, |
|
"grad_norm": 1721.9681952666701, |
|
"kl": 28.7462890625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.046, |
|
"reward": 0.712499996026357, |
|
"reward_std": 0.20226633052031198, |
|
"rewards/judge_tool_use": 0.6875, |
|
"rewards/judge_tool_use/std": 0.2489463413755099, |
|
"rewards/verify_correctness": 0.8125, |
|
"rewards/verify_correctness/std": 0.28463754057884216, |
|
"step": 225 |
|
}, |
|
{ |
|
"clip_ratio": 0.001966949412599206, |
|
"completion_length": 78.625, |
|
"epoch": 0.15333333333333332, |
|
"grad_norm": 0.43853112182902554, |
|
"kl": 4.1453125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0004, |
|
"reward": 0.679999977350235, |
|
"reward_std": 0.14663636311888695, |
|
"rewards/judge_tool_use": 0.6781250238418579, |
|
"rewards/judge_tool_use/std": 0.4121476113796234, |
|
"rewards/verify_correctness": 0.6875, |
|
"rewards/verify_correctness/std": 0.42898140847682953, |
|
"step": 230 |
|
}, |
|
{ |
|
"clip_ratio": 0.0018659377470612525, |
|
"completion_length": 114.02083333333333, |
|
"epoch": 0.15666666666666668, |
|
"grad_norm": 3.100123205393985, |
|
"kl": 3.02109375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0543, |
|
"reward": 0.809166669845581, |
|
"reward_std": 0.1575567809243997, |
|
"rewards/judge_tool_use": 0.881250003973643, |
|
"rewards/judge_tool_use/std": 0.19285393754641214, |
|
"rewards/verify_correctness": 0.5208333333333334, |
|
"rewards/verify_correctness/std": 0.36932093898455304, |
|
"step": 235 |
|
}, |
|
{ |
|
"clip_ratio": 0.0018544910941272973, |
|
"completion_length": 81.125, |
|
"epoch": 0.16, |
|
"grad_norm": 2.424569092207152, |
|
"kl": 0.627734375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0332, |
|
"reward": 0.7649999856948853, |
|
"reward_std": 0.18427922576665878, |
|
"rewards/judge_tool_use": 0.815625011920929, |
|
"rewards/judge_tool_use/std": 0.2867114394903183, |
|
"rewards/verify_correctness": 0.5625, |
|
"rewards/verify_correctness/std": 0.5081988871097565, |
|
"step": 240 |
|
}, |
|
{ |
|
"clip_ratio": 0.0013313586998265237, |
|
"completion_length": 96.47916666666667, |
|
"epoch": 0.16333333333333333, |
|
"grad_norm": 1.905174844858326, |
|
"kl": 0.31015625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0109, |
|
"reward": 0.856666644414266, |
|
"reward_std": 0.09935928011933963, |
|
"rewards/judge_tool_use": 0.9145833253860474, |
|
"rewards/judge_tool_use/std": 0.15078541884819666, |
|
"rewards/verify_correctness": 0.625, |
|
"rewards/verify_correctness/std": 0.4878704647223155, |
|
"step": 245 |
|
}, |
|
{ |
|
"clip_ratio": 0.0009706525830551982, |
|
"completion_length": 72.53125, |
|
"epoch": 0.16666666666666666, |
|
"grad_norm": 0.6805601508978978, |
|
"kl": 0.573828125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0431, |
|
"reward": 0.9312500059604645, |
|
"reward_std": 0.06104740500450134, |
|
"rewards/judge_tool_use": 0.96875, |
|
"rewards/judge_tool_use/std": 0.09979145228862762, |
|
"rewards/verify_correctness": 0.78125, |
|
"rewards/verify_correctness/std": 0.2561737895011902, |
|
"step": 250 |
|
}, |
|
{ |
|
"clip_ratio": 0.001953125, |
|
"completion_length": 80.35416666666667, |
|
"epoch": 0.17, |
|
"grad_norm": 0.12919899479086885, |
|
"kl": 0.260546875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0003, |
|
"reward": 0.940833330154419, |
|
"reward_std": 0.028137820462385815, |
|
"rewards/judge_tool_use": 0.9937500158945719, |
|
"rewards/judge_tool_use/std": 0.013437099754810333, |
|
"rewards/verify_correctness": 0.7291666666666666, |
|
"rewards/verify_correctness/std": 0.331703782081604, |
|
"step": 255 |
|
}, |
|
{ |
|
"clip_ratio": 0.005703368596732617, |
|
"completion_length": 83.21875, |
|
"epoch": 0.17333333333333334, |
|
"grad_norm": 1.946076353704121, |
|
"kl": 0.2544921875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0004, |
|
"reward": 0.9000000059604645, |
|
"reward_std": 0.05235438700765371, |
|
"rewards/judge_tool_use": 0.9375, |
|
"rewards/judge_tool_use/std": 0.11046760901808739, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 260 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 83.70833333333333, |
|
"epoch": 0.17666666666666667, |
|
"grad_norm": 5.214532115094087, |
|
"kl": 50.934375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0987, |
|
"reward": 0.9424999952316284, |
|
"reward_std": 0.05198417603969574, |
|
"rewards/judge_tool_use": 0.9749999841054281, |
|
"rewards/judge_tool_use/std": 0.06610877811908722, |
|
"rewards/verify_correctness": 0.8125, |
|
"rewards/verify_correctness/std": 0.1707825263341268, |
|
"step": 265 |
|
}, |
|
{ |
|
"clip_ratio": 0.0017290424089878797, |
|
"completion_length": 84.71875, |
|
"epoch": 0.18, |
|
"grad_norm": 0.2231498531869626, |
|
"kl": 0.3408203125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0023, |
|
"reward": 0.9437499940395355, |
|
"reward_std": 0.01767767034471035, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 0.71875, |
|
"rewards/verify_correctness/std": 0.38319888710975647, |
|
"step": 270 |
|
}, |
|
{ |
|
"clip_ratio": 0.0024591220542788507, |
|
"completion_length": 99.5, |
|
"epoch": 0.18333333333333332, |
|
"grad_norm": 0.6587571271446878, |
|
"kl": 1.0974609375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0007, |
|
"reward": 0.7808333237965902, |
|
"reward_std": 0.12485803912083308, |
|
"rewards/judge_tool_use": 0.887500007947286, |
|
"rewards/judge_tool_use/std": 0.22067607939243317, |
|
"rewards/verify_correctness": 0.3541666666666667, |
|
"rewards/verify_correctness/std": 0.3429151177406311, |
|
"step": 275 |
|
}, |
|
{ |
|
"clip_ratio": 0.0030754867941141127, |
|
"completion_length": 77.15625, |
|
"epoch": 0.18666666666666668, |
|
"grad_norm": 0.19350548920635696, |
|
"kl": 0.34140625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0008, |
|
"reward": 0.8725000023841858, |
|
"reward_std": 0.055471993051469326, |
|
"rewards/judge_tool_use": 0.949999988079071, |
|
"rewards/judge_tool_use/std": 0.08520798571407795, |
|
"rewards/verify_correctness": 0.5625, |
|
"rewards/verify_correctness/std": 0.5081988871097565, |
|
"step": 280 |
|
}, |
|
{ |
|
"clip_ratio": 0.002323679253458977, |
|
"completion_length": 105.39583333333333, |
|
"epoch": 0.19, |
|
"grad_norm": 0.3062492173104658, |
|
"kl": 373.6810546875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.769, |
|
"reward": 0.9183333317438761, |
|
"reward_std": 0.08563666356106599, |
|
"rewards/judge_tool_use": 0.9604166746139526, |
|
"rewards/judge_tool_use/std": 0.10187822952866554, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.305153489112854, |
|
"step": 285 |
|
}, |
|
{ |
|
"clip_ratio": 0.0027462080586701633, |
|
"completion_length": 64.71875, |
|
"epoch": 0.19333333333333333, |
|
"grad_norm": 1.907689788676722, |
|
"kl": 3.71171875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0063, |
|
"reward": 0.9237499833106995, |
|
"reward_std": 0.14122167974710464, |
|
"rewards/judge_tool_use": 0.9281249940395355, |
|
"rewards/judge_tool_use/std": 0.19741450250148773, |
|
"rewards/verify_correctness": 0.90625, |
|
"rewards/verify_correctness/std": 0.29578252136707306, |
|
"step": 290 |
|
}, |
|
{ |
|
"clip_ratio": 0.0014276792760938406, |
|
"completion_length": 110.125, |
|
"epoch": 0.19666666666666666, |
|
"grad_norm": 5.415532630231474, |
|
"kl": 0.4234375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0077, |
|
"reward": 0.8458333015441895, |
|
"reward_std": 0.2074962705373764, |
|
"rewards/judge_tool_use": 0.875, |
|
"rewards/judge_tool_use/std": 0.2852979749441147, |
|
"rewards/verify_correctness": 0.7291666666666666, |
|
"rewards/verify_correctness/std": 0.4045371313889821, |
|
"step": 295 |
|
}, |
|
{ |
|
"clip_ratio": 0.001273171789944172, |
|
"completion_length": 84.8125, |
|
"epoch": 0.2, |
|
"grad_norm": 0.7050928593838439, |
|
"kl": 0.2220703125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0415, |
|
"reward": 0.9537499845027924, |
|
"reward_std": 0.09016102831810713, |
|
"rewards/judge_tool_use": 0.949999988079071, |
|
"rewards/judge_tool_use/std": 0.13606470078229904, |
|
"rewards/verify_correctness": 0.96875, |
|
"rewards/verify_correctness/std": 0.125, |
|
"step": 300 |
|
}, |
|
{ |
|
"clip_ratio": 0.00012106538051739335, |
|
"completion_length": 71.79166666666667, |
|
"epoch": 0.20333333333333334, |
|
"grad_norm": 0.3670228135757323, |
|
"kl": 1.64921875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0174, |
|
"reward": 0.9508333206176758, |
|
"reward_std": 0.07014790053168933, |
|
"rewards/judge_tool_use": 0.975000003973643, |
|
"rewards/judge_tool_use/std": 0.08843709776798885, |
|
"rewards/verify_correctness": 0.8541666666666666, |
|
"rewards/verify_correctness/std": 0.25, |
|
"step": 305 |
|
}, |
|
{ |
|
"clip_ratio": 0.0013387146405875682, |
|
"completion_length": 55.5625, |
|
"epoch": 0.20666666666666667, |
|
"grad_norm": 0.16146643171622213, |
|
"kl": 0.2376953125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0024, |
|
"reward": 0.9587499797344208, |
|
"reward_std": 0.0969240814447403, |
|
"rewards/judge_tool_use": 0.971875011920929, |
|
"rewards/judge_tool_use/std": 0.09994790703058243, |
|
"rewards/verify_correctness": 0.90625, |
|
"rewards/verify_correctness/std": 0.20155644416809082, |
|
"step": 310 |
|
}, |
|
{ |
|
"clip_ratio": 0.0017094017937779426, |
|
"completion_length": 63.395833333333336, |
|
"epoch": 0.21, |
|
"grad_norm": 1.5235387274657526, |
|
"kl": 76.58203125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.1523, |
|
"reward": 0.9824999968210856, |
|
"reward_std": 0.049497475226720176, |
|
"rewards/judge_tool_use": 0.9833333293596903, |
|
"rewards/judge_tool_use/std": 0.06666666766007741, |
|
"rewards/verify_correctness": 0.9791666666666666, |
|
"rewards/verify_correctness/std": 0.08333333333333333, |
|
"step": 315 |
|
}, |
|
{ |
|
"clip_ratio": 0.0032774390652775764, |
|
"completion_length": 68.6875, |
|
"epoch": 0.21333333333333335, |
|
"grad_norm": 2612.164532527323, |
|
"kl": 182.58828125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.373, |
|
"reward": 0.9112499952316284, |
|
"reward_std": 0.09157592756673694, |
|
"rewards/judge_tool_use": 0.9437499940395355, |
|
"rewards/judge_tool_use/std": 0.15795189142227173, |
|
"rewards/verify_correctness": 0.78125, |
|
"rewards/verify_correctness/std": 0.2561737895011902, |
|
"step": 320 |
|
}, |
|
{ |
|
"clip_ratio": 0.0010407064110040665, |
|
"completion_length": 76.45833333333333, |
|
"epoch": 0.21666666666666667, |
|
"grad_norm": 2.421529558975116, |
|
"kl": 0.40625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0014, |
|
"reward": 0.8883333404858907, |
|
"reward_std": 0.043602497316896915, |
|
"rewards/judge_tool_use": 0.9437499841054281, |
|
"rewards/judge_tool_use/std": 0.08793675154447556, |
|
"rewards/verify_correctness": 0.6666666666666666, |
|
"rewards/verify_correctness/std": 0.3442651828130086, |
|
"step": 325 |
|
}, |
|
{ |
|
"clip_ratio": 0.005306883063167333, |
|
"completion_length": 79.75, |
|
"epoch": 0.22, |
|
"grad_norm": 15.808736875255741, |
|
"kl": 5.0265625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0468, |
|
"reward": 0.7362499833106995, |
|
"reward_std": 0.2673564925789833, |
|
"rewards/judge_tool_use": 0.7718749940395355, |
|
"rewards/judge_tool_use/std": 0.3598140925168991, |
|
"rewards/verify_correctness": 0.59375, |
|
"rewards/verify_correctness/std": 0.4515564441680908, |
|
"step": 330 |
|
}, |
|
{ |
|
"clip_ratio": 0.0012897307053208352, |
|
"completion_length": 92.375, |
|
"epoch": 0.22333333333333333, |
|
"grad_norm": 0.8262876073255261, |
|
"kl": 0.42890625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0047, |
|
"reward": 0.8600000143051147, |
|
"reward_std": 0.08047416061162949, |
|
"rewards/judge_tool_use": 0.9499999682108561, |
|
"rewards/judge_tool_use/std": 0.13221755623817444, |
|
"rewards/verify_correctness": 0.5, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 335 |
|
}, |
|
{ |
|
"clip_ratio": 0.0010268327314406633, |
|
"completion_length": 79.5625, |
|
"epoch": 0.22666666666666666, |
|
"grad_norm": 0.14720074644097367, |
|
"kl": 0.2916015625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0029, |
|
"reward": 0.9862499833106995, |
|
"reward_std": 0.0176776722073555, |
|
"rewards/judge_tool_use": 0.9906249940395355, |
|
"rewards/judge_tool_use/std": 0.0201556496322155, |
|
"rewards/verify_correctness": 0.96875, |
|
"rewards/verify_correctness/std": 0.125, |
|
"step": 340 |
|
}, |
|
{ |
|
"clip_ratio": 0.0009512485004961491, |
|
"completion_length": 95.85416666666667, |
|
"epoch": 0.23, |
|
"grad_norm": 0.05737038924876427, |
|
"kl": 0.54521484375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0011, |
|
"reward": 0.9416666825612386, |
|
"reward_std": 0.01843047762910525, |
|
"rewards/judge_tool_use": 0.9791666666666666, |
|
"rewards/judge_tool_use/std": 0.028598766773939133, |
|
"rewards/verify_correctness": 0.7916666666666666, |
|
"rewards/verify_correctness/std": 0.28598760565121967, |
|
"step": 345 |
|
}, |
|
{ |
|
"clip_ratio": 0.0005090909078717231, |
|
"completion_length": 84.53125, |
|
"epoch": 0.23333333333333334, |
|
"grad_norm": 0.10503333394399555, |
|
"kl": 0.4201171875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0011, |
|
"reward": 0.9537500143051147, |
|
"reward_std": 0.019955309107899666, |
|
"rewards/judge_tool_use": 0.996874988079071, |
|
"rewards/judge_tool_use/std": 0.012500002980232239, |
|
"rewards/verify_correctness": 0.78125, |
|
"rewards/verify_correctness/std": 0.2561737895011902, |
|
"step": 350 |
|
}, |
|
{ |
|
"clip_ratio": 0.00019607844296842813, |
|
"completion_length": 108.35416666666667, |
|
"epoch": 0.23666666666666666, |
|
"grad_norm": 1.7884752441260985, |
|
"kl": 0.4283203125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0359, |
|
"reward": 0.9358333547910055, |
|
"reward_std": 0.08353088547786076, |
|
"rewards/judge_tool_use": 0.956249992052714, |
|
"rewards/judge_tool_use/std": 0.08920949697494507, |
|
"rewards/verify_correctness": 0.8541666666666666, |
|
"rewards/verify_correctness/std": 0.3462595542271932, |
|
"step": 355 |
|
}, |
|
{ |
|
"clip_ratio": 0.0012887715362012385, |
|
"completion_length": 85.84375, |
|
"epoch": 0.24, |
|
"grad_norm": 2.9967592204893183, |
|
"kl": 2.588671875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0506, |
|
"reward": 0.875, |
|
"reward_std": 0.09827076643705368, |
|
"rewards/judge_tool_use": 0.890625, |
|
"rewards/judge_tool_use/std": 0.19682374596595764, |
|
"rewards/verify_correctness": 0.8125, |
|
"rewards/verify_correctness/std": 0.25, |
|
"step": 360 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 53.791666666666664, |
|
"epoch": 0.24333333333333335, |
|
"grad_norm": 0.05438651543966187, |
|
"kl": 0.29921875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"reward": 1.0, |
|
"reward_std": 0.0, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 365 |
|
}, |
|
{ |
|
"clip_ratio": 0.0004950494971126318, |
|
"completion_length": 79.75, |
|
"epoch": 0.24666666666666667, |
|
"grad_norm": 0.10471062504755536, |
|
"kl": 0.2140625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0004, |
|
"reward": 0.9950000047683716, |
|
"reward_std": 0.009258206002414227, |
|
"rewards/judge_tool_use": 0.9937500059604645, |
|
"rewards/judge_tool_use/std": 0.017078254371881485, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"clip_ratio": 0.0008534850552678108, |
|
"completion_length": 71.75, |
|
"epoch": 0.25, |
|
"grad_norm": 0.05059977006337808, |
|
"kl": 0.28515625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0011, |
|
"reward": 0.9599999984105428, |
|
"reward_std": 0.007126967112223308, |
|
"rewards/judge_tool_use": 0.9916666746139526, |
|
"rewards/judge_tool_use/std": 0.01490712414185206, |
|
"rewards/verify_correctness": 0.8333333333333334, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 375 |
|
}, |
|
{ |
|
"clip_ratio": 0.0021406911546364427, |
|
"completion_length": 131.09375, |
|
"epoch": 0.25333333333333335, |
|
"grad_norm": 0.2406966009370665, |
|
"kl": 8.698828125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0463, |
|
"reward": 0.8287499845027924, |
|
"reward_std": 0.10335364565253258, |
|
"rewards/judge_tool_use": 0.856249988079071, |
|
"rewards/judge_tool_use/std": 0.17525622248649597, |
|
"rewards/verify_correctness": 0.71875, |
|
"rewards/verify_correctness/std": 0.42695631086826324, |
|
"step": 380 |
|
}, |
|
{ |
|
"clip_ratio": 0.0003087012562900782, |
|
"completion_length": 81.6875, |
|
"epoch": 0.25666666666666665, |
|
"grad_norm": 0.07653342987035294, |
|
"kl": 0.242578125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0002, |
|
"reward": 0.9124999841054281, |
|
"reward_std": 0.04729796418299278, |
|
"rewards/judge_tool_use": 0.96875, |
|
"rewards/judge_tool_use/std": 0.0737380584081014, |
|
"rewards/verify_correctness": 0.6875, |
|
"rewards/verify_correctness/std": 0.08333333333333333, |
|
"step": 385 |
|
}, |
|
{ |
|
"clip_ratio": 0.00035750765819102525, |
|
"completion_length": 107.46875, |
|
"epoch": 0.26, |
|
"grad_norm": 0.06597534204621332, |
|
"kl": 0.22822265625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0004, |
|
"reward": 0.9474999904632568, |
|
"reward_std": 0.007071072701364756, |
|
"rewards/judge_tool_use": 0.996874988079071, |
|
"rewards/judge_tool_use/std": 0.012500002980232239, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 390 |
|
}, |
|
{ |
|
"clip_ratio": 0.0005074221640825272, |
|
"completion_length": 74.77083333333333, |
|
"epoch": 0.2633333333333333, |
|
"grad_norm": 0.23933590665982507, |
|
"kl": 0.2693359375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0016, |
|
"reward": 0.9716666539510092, |
|
"reward_std": 0.023570228057603042, |
|
"rewards/judge_tool_use": 0.9854166507720947, |
|
"rewards/judge_tool_use/std": 0.02500000720222791, |
|
"rewards/verify_correctness": 0.9166666666666666, |
|
"rewards/verify_correctness/std": 0.1490712066491445, |
|
"step": 395 |
|
}, |
|
{ |
|
"clip_ratio": 0.0010744871804490685, |
|
"completion_length": 89.375, |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 1.1698863583293904, |
|
"kl": 0.276171875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0256, |
|
"reward": 0.9025000035762787, |
|
"reward_std": 0.08742741448804736, |
|
"rewards/judge_tool_use": 0.9249999821186066, |
|
"rewards/judge_tool_use/std": 0.1527007520198822, |
|
"rewards/verify_correctness": 0.8125, |
|
"rewards/verify_correctness/std": 0.25, |
|
"step": 400 |
|
}, |
|
{ |
|
"clip_ratio": 0.0018488712608814239, |
|
"completion_length": 80.66666666666667, |
|
"epoch": 0.27, |
|
"grad_norm": 1.4081894468368357, |
|
"kl": 0.41796875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0022, |
|
"reward": 0.8766666650772095, |
|
"reward_std": 0.05552822661896547, |
|
"rewards/judge_tool_use": 0.9291666746139526, |
|
"rewards/judge_tool_use/std": 0.07800610611836116, |
|
"rewards/verify_correctness": 0.6666666666666666, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 405 |
|
}, |
|
{ |
|
"clip_ratio": 0.00013927576364949345, |
|
"completion_length": 79.5625, |
|
"epoch": 0.2733333333333333, |
|
"grad_norm": 0.36461709597810504, |
|
"kl": 0.19462890625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0021, |
|
"reward": 0.9662500023841858, |
|
"reward_std": 0.02875388413667679, |
|
"rewards/judge_tool_use": 0.996874988079071, |
|
"rewards/judge_tool_use/std": 0.012500002980232239, |
|
"rewards/verify_correctness": 0.84375, |
|
"rewards/verify_correctness/std": 0.23935678601264954, |
|
"step": 410 |
|
}, |
|
{ |
|
"clip_ratio": 0.0013018524972721935, |
|
"completion_length": 113.125, |
|
"epoch": 0.27666666666666667, |
|
"grad_norm": 0.3280852961847766, |
|
"kl": 4.0763671875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0044, |
|
"reward": 0.8483333190282186, |
|
"reward_std": 0.150167316198349, |
|
"rewards/judge_tool_use": 0.9041666388511658, |
|
"rewards/judge_tool_use/std": 0.16761433954040209, |
|
"rewards/verify_correctness": 0.625, |
|
"rewards/verify_correctness/std": 0.4262484510739644, |
|
"step": 415 |
|
}, |
|
{ |
|
"clip_ratio": 0.001338357198983431, |
|
"completion_length": 90.0, |
|
"epoch": 0.28, |
|
"grad_norm": 0.6117562389637614, |
|
"kl": 0.2181640625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0004, |
|
"reward": 0.9912500083446503, |
|
"reward_std": 0.02474873699247837, |
|
"rewards/judge_tool_use": 0.996874988079071, |
|
"rewards/judge_tool_use/std": 0.012500002980232239, |
|
"rewards/verify_correctness": 0.96875, |
|
"rewards/verify_correctness/std": 0.125, |
|
"step": 420 |
|
}, |
|
{ |
|
"clip_ratio": 0.0008633675985038281, |
|
"completion_length": 335.8541666666667, |
|
"epoch": 0.2833333333333333, |
|
"grad_norm": 0.8108468649324869, |
|
"kl": 0.4734375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0387, |
|
"reward": 0.6941666603088379, |
|
"reward_std": 0.24669699867566428, |
|
"rewards/judge_tool_use": 0.7895833253860474, |
|
"rewards/judge_tool_use/std": 0.2617962161699931, |
|
"rewards/verify_correctness": 0.3125, |
|
"rewards/verify_correctness/std": 0.3429151177406311, |
|
"step": 425 |
|
}, |
|
{ |
|
"clip_ratio": 0.004640390491113066, |
|
"completion_length": 123.375, |
|
"epoch": 0.2866666666666667, |
|
"grad_norm": 0.4713406045119873, |
|
"kl": 1.093359375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0019, |
|
"reward": 0.7487500011920929, |
|
"reward_std": 0.2284143902361393, |
|
"rewards/judge_tool_use": 0.7406249940395355, |
|
"rewards/judge_tool_use/std": 0.35302741825580597, |
|
"rewards/verify_correctness": 0.78125, |
|
"rewards/verify_correctness/std": 0.375, |
|
"step": 430 |
|
}, |
|
{ |
|
"clip_ratio": 0.0008070833748206497, |
|
"completion_length": 153.9375, |
|
"epoch": 0.29, |
|
"grad_norm": 1.9592742170228072, |
|
"kl": 1.117578125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0878, |
|
"reward": 0.9316666523615519, |
|
"reward_std": 0.15917644401391348, |
|
"rewards/judge_tool_use": 0.9354166587193807, |
|
"rewards/judge_tool_use/std": 0.2003726214170456, |
|
"rewards/verify_correctness": 0.9166666666666666, |
|
"rewards/verify_correctness/std": 0.21770429611206055, |
|
"step": 435 |
|
}, |
|
{ |
|
"clip_ratio": 0.0018505133455619216, |
|
"completion_length": 173.0, |
|
"epoch": 0.29333333333333333, |
|
"grad_norm": 1.383857692998585, |
|
"kl": 0.4033203125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.031, |
|
"reward": 0.7675000131130219, |
|
"reward_std": 0.1635022610425949, |
|
"rewards/judge_tool_use": 0.8499999940395355, |
|
"rewards/judge_tool_use/std": 0.2784065455198288, |
|
"rewards/verify_correctness": 0.4375, |
|
"rewards/verify_correctness/std": 0.47360680997371674, |
|
"step": 440 |
|
}, |
|
{ |
|
"clip_ratio": 0.0004979253280907869, |
|
"completion_length": 117.91666666666667, |
|
"epoch": 0.2966666666666667, |
|
"grad_norm": 14.897280067316444, |
|
"kl": 1.298046875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0177, |
|
"reward": 0.9633333285649618, |
|
"reward_std": 0.06114211150755485, |
|
"rewards/judge_tool_use": 0.9645833174387614, |
|
"rewards/judge_tool_use/std": 0.09941734870274861, |
|
"rewards/verify_correctness": 0.9583333333333334, |
|
"rewards/verify_correctness/std": 0.11385501424471538, |
|
"step": 445 |
|
}, |
|
{ |
|
"clip_ratio": 0.001424700953066349, |
|
"completion_length": 78.5625, |
|
"epoch": 0.3, |
|
"grad_norm": 0.046388448519732395, |
|
"kl": 0.276171875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0166, |
|
"reward": 0.9975000023841858, |
|
"reward_std": 0.007071069907397032, |
|
"rewards/judge_tool_use": 0.996874988079071, |
|
"rewards/judge_tool_use/std": 0.012500002980232239, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"clip_ratio": 0.0008928571827709675, |
|
"completion_length": 70.1875, |
|
"epoch": 0.30333333333333334, |
|
"grad_norm": 13.949856199547733, |
|
"kl": 0.98984375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0114, |
|
"reward": 0.9741666714350382, |
|
"reward_std": 0.04652188221613566, |
|
"rewards/judge_tool_use": 0.9729166626930237, |
|
"rewards/judge_tool_use/std": 0.06579288840293884, |
|
"rewards/verify_correctness": 0.9791666666666666, |
|
"rewards/verify_correctness/std": 0.08333333333333333, |
|
"step": 455 |
|
}, |
|
{ |
|
"clip_ratio": 0.001224489789456129, |
|
"completion_length": 77.1875, |
|
"epoch": 0.30666666666666664, |
|
"grad_norm": 0.08400978434221483, |
|
"kl": 0.5431640625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0005, |
|
"reward": 0.9350000023841858, |
|
"reward_std": 0.009258206002414227, |
|
"rewards/judge_tool_use": 0.981249988079071, |
|
"rewards/judge_tool_use/std": 0.025000007823109627, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 460 |
|
}, |
|
{ |
|
"clip_ratio": 0.0007088846992701292, |
|
"completion_length": 106.10416666666667, |
|
"epoch": 0.31, |
|
"grad_norm": 1.2873129844422142, |
|
"kl": 0.703125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0265, |
|
"reward": 0.8833333253860474, |
|
"reward_std": 0.10906451940536499, |
|
"rewards/judge_tool_use": 0.9166666666666666, |
|
"rewards/judge_tool_use/std": 0.185252716143926, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.3303537170092265, |
|
"step": 465 |
|
}, |
|
{ |
|
"clip_ratio": 0.0028592190705239774, |
|
"completion_length": 101.875, |
|
"epoch": 0.31333333333333335, |
|
"grad_norm": 0.20363038570520958, |
|
"kl": 0.36171875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0023, |
|
"reward": 0.9199999868869781, |
|
"reward_std": 0.017422062810510397, |
|
"rewards/judge_tool_use": 0.9624999761581421, |
|
"rewards/judge_tool_use/std": 0.04955306649208069, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 470 |
|
}, |
|
{ |
|
"clip_ratio": 0.0005361930467188359, |
|
"completion_length": 84.14583333333333, |
|
"epoch": 0.31666666666666665, |
|
"grad_norm": 0.8174753094540738, |
|
"kl": 0.2626953125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0007, |
|
"reward": 0.9816666841506958, |
|
"reward_std": 0.028729441886146862, |
|
"rewards/judge_tool_use": 0.987500011920929, |
|
"rewards/judge_tool_use/std": 0.026292627056439716, |
|
"rewards/verify_correctness": 0.9583333333333334, |
|
"rewards/verify_correctness/std": 0.11385501424471538, |
|
"step": 475 |
|
}, |
|
{ |
|
"clip_ratio": 0.0010484508238732815, |
|
"completion_length": 83.1875, |
|
"epoch": 0.32, |
|
"grad_norm": 4.869255272848249, |
|
"kl": 0.59296875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0054, |
|
"reward": 0.918749988079071, |
|
"reward_std": 0.07140177488327026, |
|
"rewards/judge_tool_use": 0.9375, |
|
"rewards/judge_tool_use/std": 0.11180340498685837, |
|
"rewards/verify_correctness": 0.84375, |
|
"rewards/verify_correctness/std": 0.23935678601264954, |
|
"step": 480 |
|
}, |
|
{ |
|
"clip_ratio": 0.0041505326051265, |
|
"completion_length": 104.0625, |
|
"epoch": 0.3233333333333333, |
|
"grad_norm": 0.7259306381788916, |
|
"kl": 0.31796875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0027, |
|
"reward": 0.9758333365122477, |
|
"reward_std": 0.028907646735509235, |
|
"rewards/judge_tool_use": 0.9854166706403097, |
|
"rewards/judge_tool_use/std": 0.032623790204524994, |
|
"rewards/verify_correctness": 0.9375, |
|
"rewards/verify_correctness/std": 0.1343709627787272, |
|
"step": 485 |
|
}, |
|
{ |
|
"clip_ratio": 0.002794364234432578, |
|
"completion_length": 167.9375, |
|
"epoch": 0.32666666666666666, |
|
"grad_norm": 0.07267427706403441, |
|
"kl": 0.27734375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0062, |
|
"reward": 0.9162499904632568, |
|
"reward_std": 0.0662735546939075, |
|
"rewards/judge_tool_use": 0.965624988079071, |
|
"rewards/judge_tool_use/std": 0.056753065437078476, |
|
"rewards/verify_correctness": 0.71875, |
|
"rewards/verify_correctness/std": 0.2561737895011902, |
|
"step": 490 |
|
}, |
|
{ |
|
"clip_ratio": 0.00040650404989719393, |
|
"completion_length": 100.45833333333333, |
|
"epoch": 0.33, |
|
"grad_norm": 0.2878900699504781, |
|
"kl": 0.34296875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0032, |
|
"reward": 0.9416666626930237, |
|
"reward_std": 0.07133257389068604, |
|
"rewards/judge_tool_use": 0.9479166666666666, |
|
"rewards/judge_tool_use/std": 0.10411662111679713, |
|
"rewards/verify_correctness": 0.9166666666666666, |
|
"rewards/verify_correctness/std": 0.21770429611206055, |
|
"step": 495 |
|
}, |
|
{ |
|
"clip_ratio": 0.003871983336284757, |
|
"completion_length": 90.59375, |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 0.774210737681418, |
|
"kl": 0.2849609375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0051, |
|
"reward": 0.9049999713897705, |
|
"reward_std": 0.05290384031832218, |
|
"rewards/judge_tool_use": 0.9749999642372131, |
|
"rewards/judge_tool_use/std": 0.038117386400699615, |
|
"rewards/verify_correctness": 0.625, |
|
"rewards/verify_correctness/std": 0.36435678601264954, |
|
"step": 500 |
|
}, |
|
{ |
|
"clip_ratio": 0.0014365109149366618, |
|
"completion_length": 163.0, |
|
"epoch": 0.33666666666666667, |
|
"grad_norm": 5.746402974167011, |
|
"kl": 0.805078125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0094, |
|
"reward": 0.815833330154419, |
|
"reward_std": 0.11187712320437034, |
|
"rewards/judge_tool_use": 0.8583333094914755, |
|
"rewards/judge_tool_use/std": 0.2133141408363978, |
|
"rewards/verify_correctness": 0.6458333333333334, |
|
"rewards/verify_correctness/std": 0.3429151177406311, |
|
"step": 505 |
|
}, |
|
{ |
|
"clip_ratio": 0.000518302945420146, |
|
"completion_length": 86.75, |
|
"epoch": 0.34, |
|
"grad_norm": 0.0989351605838229, |
|
"kl": 0.43984375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0042, |
|
"reward": 1.0, |
|
"reward_std": 0.0, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 510 |
|
}, |
|
{ |
|
"clip_ratio": 0.0005924170836806297, |
|
"completion_length": 62.0, |
|
"epoch": 0.3433333333333333, |
|
"grad_norm": 0.78032389511651, |
|
"kl": 0.367578125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0016, |
|
"reward": 0.9866666595141093, |
|
"reward_std": 0.02108185241619746, |
|
"rewards/judge_tool_use": 0.993749996026357, |
|
"rewards/judge_tool_use/std": 0.013437099754810333, |
|
"rewards/verify_correctness": 0.9583333333333334, |
|
"rewards/verify_correctness/std": 0.11385501424471538, |
|
"step": 515 |
|
}, |
|
{ |
|
"clip_ratio": 0.004682651488110423, |
|
"completion_length": 106.53125, |
|
"epoch": 0.3466666666666667, |
|
"grad_norm": 1.1243616090267425, |
|
"kl": 0.57734375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0022, |
|
"reward": 0.9049999713897705, |
|
"reward_std": 0.060222613625228405, |
|
"rewards/judge_tool_use": 0.9593749940395355, |
|
"rewards/judge_tool_use/std": 0.049755578860640526, |
|
"rewards/verify_correctness": 0.6875, |
|
"rewards/verify_correctness/std": 0.25, |
|
"step": 520 |
|
}, |
|
{ |
|
"clip_ratio": 8.077544625848532e-05, |
|
"completion_length": 90.0625, |
|
"epoch": 0.35, |
|
"grad_norm": 0.9496683027092206, |
|
"kl": 0.254296875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0041, |
|
"reward": 0.8733333349227905, |
|
"reward_std": 0.10902170836925507, |
|
"rewards/judge_tool_use": 0.9145833253860474, |
|
"rewards/judge_tool_use/std": 0.13846262296040854, |
|
"rewards/verify_correctness": 0.7083333333333334, |
|
"rewards/verify_correctness/std": 0.2939421534538269, |
|
"step": 525 |
|
}, |
|
{ |
|
"clip_ratio": 0.0037049442529678346, |
|
"completion_length": 77.6875, |
|
"epoch": 0.35333333333333333, |
|
"grad_norm": 0.04794774304730862, |
|
"kl": 0.398828125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0013, |
|
"reward": 0.9287499785423279, |
|
"reward_std": 0.033716630190610886, |
|
"rewards/judge_tool_use": 0.965624988079071, |
|
"rewards/judge_tool_use/std": 0.023935683071613312, |
|
"rewards/verify_correctness": 0.78125, |
|
"rewards/verify_correctness/std": 0.2561737895011902, |
|
"step": 530 |
|
}, |
|
{ |
|
"clip_ratio": 0.0026162526570260524, |
|
"completion_length": 79.64583333333333, |
|
"epoch": 0.3566666666666667, |
|
"grad_norm": 0.41535999227857473, |
|
"kl": 155.844140625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.3488, |
|
"reward": 0.9000000158945719, |
|
"reward_std": 0.08795289571086566, |
|
"rewards/judge_tool_use": 0.96875, |
|
"rewards/judge_tool_use/std": 0.09092239538828532, |
|
"rewards/verify_correctness": 0.625, |
|
"rewards/verify_correctness/std": 0.43299739559491474, |
|
"step": 535 |
|
}, |
|
{ |
|
"clip_ratio": 0.00012376237427815794, |
|
"completion_length": 66.25, |
|
"epoch": 0.36, |
|
"grad_norm": 0.30713351970070757, |
|
"kl": 0.346875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0013, |
|
"reward": 0.9937500059604645, |
|
"reward_std": 0.01767767034471035, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 0.96875, |
|
"rewards/verify_correctness/std": 0.125, |
|
"step": 540 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 71.10416666666667, |
|
"epoch": 0.36333333333333334, |
|
"grad_norm": 0.09672611146866233, |
|
"kl": 0.390234375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0008, |
|
"reward": 0.8599999944368998, |
|
"reward_std": 0.0, |
|
"rewards/judge_tool_use": 0.8666666746139526, |
|
"rewards/judge_tool_use/std": 0.13770607113838196, |
|
"rewards/verify_correctness": 0.8333333333333334, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 545 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.0625, |
|
"epoch": 0.36666666666666664, |
|
"grad_norm": 0.06902490237203746, |
|
"kl": 0.25, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0005, |
|
"reward": 0.925000011920929, |
|
"reward_std": 0.009258206002414227, |
|
"rewards/judge_tool_use": 0.96875, |
|
"rewards/judge_tool_use/std": 0.0428981501609087, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 550 |
|
}, |
|
{ |
|
"clip_ratio": 0.0013333333656191826, |
|
"completion_length": 81.02083333333333, |
|
"epoch": 0.37, |
|
"grad_norm": 0.07471788985105354, |
|
"kl": 1.6287109375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0378, |
|
"reward": 0.8991666634877523, |
|
"reward_std": 0.1171679353962342, |
|
"rewards/judge_tool_use": 0.9208333492279053, |
|
"rewards/judge_tool_use/std": 0.12504171580076218, |
|
"rewards/verify_correctness": 0.8125, |
|
"rewards/verify_correctness/std": 0.1707825263341268, |
|
"step": 555 |
|
}, |
|
{ |
|
"clip_ratio": 0.0011577558820135892, |
|
"completion_length": 93.5, |
|
"epoch": 0.37333333333333335, |
|
"grad_norm": 0.6806153493761989, |
|
"kl": 0.3818359375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0049, |
|
"reward": 0.9262500107288361, |
|
"reward_std": 0.07174841035157442, |
|
"rewards/judge_tool_use": 0.9625000059604645, |
|
"rewards/judge_tool_use/std": 0.12010355666279793, |
|
"rewards/verify_correctness": 0.78125, |
|
"rewards/verify_correctness/std": 0.2561737895011902, |
|
"step": 560 |
|
}, |
|
{ |
|
"clip_ratio": 0.00013755158288404347, |
|
"completion_length": 102.10416666666667, |
|
"epoch": 0.37666666666666665, |
|
"grad_norm": 0.3453755330616684, |
|
"kl": 0.19248046875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0001, |
|
"reward": 0.9516666531562805, |
|
"reward_std": 0.03459723728398482, |
|
"rewards/judge_tool_use": 0.981250007947286, |
|
"rewards/judge_tool_use/std": 0.03086424618959427, |
|
"rewards/verify_correctness": 0.8333333333333334, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 565 |
|
}, |
|
{ |
|
"clip_ratio": 0.0005459312233142555, |
|
"completion_length": 64.78125, |
|
"epoch": 0.38, |
|
"grad_norm": 0.106962339918945, |
|
"kl": 0.3599609375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0004, |
|
"reward": 0.9824999868869781, |
|
"reward_std": 0.007071069907397032, |
|
"rewards/judge_tool_use": 0.9781249761581421, |
|
"rewards/judge_tool_use/std": 0.025617383420467377, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 570 |
|
}, |
|
{ |
|
"clip_ratio": 0.003975985199213028, |
|
"completion_length": 79.72916666666667, |
|
"epoch": 0.38333333333333336, |
|
"grad_norm": 31.87547237303355, |
|
"kl": 1.836328125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0061, |
|
"reward": 0.9599999984105428, |
|
"reward_std": 0.08956686376283567, |
|
"rewards/judge_tool_use": 0.9708333214124044, |
|
"rewards/judge_tool_use/std": 0.08607227355241776, |
|
"rewards/verify_correctness": 0.9166666666666666, |
|
"rewards/verify_correctness/std": 0.22771002848943075, |
|
"step": 575 |
|
}, |
|
{ |
|
"clip_ratio": 0.0022586936596781016, |
|
"completion_length": 89.4375, |
|
"epoch": 0.38666666666666666, |
|
"grad_norm": 0.18459174965007472, |
|
"kl": 2.7267578125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0178, |
|
"reward": 0.9512500166893005, |
|
"reward_std": 0.08859837148338556, |
|
"rewards/judge_tool_use": 0.9468750059604645, |
|
"rewards/judge_tool_use/std": 0.1337440349161625, |
|
"rewards/verify_correctness": 0.96875, |
|
"rewards/verify_correctness/std": 0.125, |
|
"step": 580 |
|
}, |
|
{ |
|
"clip_ratio": 0.0009126466698944569, |
|
"completion_length": 66.64583333333333, |
|
"epoch": 0.39, |
|
"grad_norm": 0.47653096528197164, |
|
"kl": 0.923828125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.009, |
|
"reward": 0.9541666507720947, |
|
"reward_std": 0.07951843117674191, |
|
"rewards/judge_tool_use": 0.9583333532015482, |
|
"rewards/judge_tool_use/std": 0.1015499656399091, |
|
"rewards/verify_correctness": 0.9375, |
|
"rewards/verify_correctness/std": 0.1971883475780487, |
|
"step": 585 |
|
}, |
|
{ |
|
"clip_ratio": 0.0015760843292810024, |
|
"completion_length": 92.34375, |
|
"epoch": 0.3933333333333333, |
|
"grad_norm": 0.16495067647216727, |
|
"kl": 0.2921875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0008, |
|
"reward": 0.9399999976158142, |
|
"reward_std": 0.010690455324947834, |
|
"rewards/judge_tool_use": 0.987500011920929, |
|
"rewards/judge_tool_use/std": 0.02236068621277809, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 590 |
|
}, |
|
{ |
|
"clip_ratio": 0.0009520398220047354, |
|
"completion_length": 90.22916666666667, |
|
"epoch": 0.39666666666666667, |
|
"grad_norm": 150.32526217473026, |
|
"kl": 10.976953125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0411, |
|
"reward": 0.9608333309491476, |
|
"reward_std": 0.06940593632558982, |
|
"rewards/judge_tool_use": 0.9666666587193807, |
|
"rewards/judge_tool_use/std": 0.08551172663768132, |
|
"rewards/verify_correctness": 0.9375, |
|
"rewards/verify_correctness/std": 0.1971883475780487, |
|
"step": 595 |
|
}, |
|
{ |
|
"clip_ratio": 0.0018051420571282505, |
|
"completion_length": 65.28125, |
|
"epoch": 0.4, |
|
"grad_norm": 0.5706420371896888, |
|
"kl": 2.0546875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0248, |
|
"reward": 0.8949999809265137, |
|
"reward_std": 0.014142142608761787, |
|
"rewards/judge_tool_use": 0.9937499761581421, |
|
"rewards/judge_tool_use/std": 0.025000005960464478, |
|
"rewards/verify_correctness": 0.5, |
|
"rewards/verify_correctness/std": 0.5163977742195129, |
|
"step": 600 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 64.75, |
|
"epoch": 0.4033333333333333, |
|
"grad_norm": 0.11139257485411902, |
|
"kl": 0.1998046875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0004, |
|
"reward": 1.0, |
|
"reward_std": 0.0, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 605 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.9375, |
|
"epoch": 0.4066666666666667, |
|
"grad_norm": 0.05923647506926399, |
|
"kl": 0.2099609375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0004, |
|
"reward": 1.0, |
|
"reward_std": 0.0, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 610 |
|
}, |
|
{ |
|
"clip_ratio": 0.001606425642967224, |
|
"completion_length": 69.27083333333333, |
|
"epoch": 0.41, |
|
"grad_norm": 0.4111915365282135, |
|
"kl": 2.0720703125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0038, |
|
"reward": 0.9674999912579855, |
|
"reward_std": 0.04904646178086599, |
|
"rewards/judge_tool_use": 0.9645833373069763, |
|
"rewards/judge_tool_use/std": 0.08384520187973976, |
|
"rewards/verify_correctness": 0.9791666666666666, |
|
"rewards/verify_correctness/std": 0.08333333333333333, |
|
"step": 615 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 69.5, |
|
"epoch": 0.41333333333333333, |
|
"grad_norm": 0.022696732599213294, |
|
"kl": 0.2666015625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0005, |
|
"reward": 0.9799999892711639, |
|
"reward_std": 0.0, |
|
"rewards/judge_tool_use": 0.9749999940395355, |
|
"rewards/judge_tool_use/std": 0.025819895789027214, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 620 |
|
}, |
|
{ |
|
"clip_ratio": 0.000514579750597477, |
|
"completion_length": 92.0625, |
|
"epoch": 0.4166666666666667, |
|
"grad_norm": 11.520531811733376, |
|
"kl": 1.763671875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0042, |
|
"reward": 0.9300000071525574, |
|
"reward_std": 0.04443951385716597, |
|
"rewards/judge_tool_use": 0.975000003973643, |
|
"rewards/judge_tool_use/std": 0.043299747010072075, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.280521680911382, |
|
"step": 625 |
|
}, |
|
{ |
|
"clip_ratio": 0.001496715540997684, |
|
"completion_length": 137.84375, |
|
"epoch": 0.42, |
|
"grad_norm": 0.5816584419225785, |
|
"kl": 1.04296875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.013, |
|
"reward": 0.8637500107288361, |
|
"reward_std": 0.10849415510892868, |
|
"rewards/judge_tool_use": 0.9312499761581421, |
|
"rewards/judge_tool_use/std": 0.15823806822299957, |
|
"rewards/verify_correctness": 0.59375, |
|
"rewards/verify_correctness/std": 0.497555673122406, |
|
"step": 630 |
|
}, |
|
{ |
|
"clip_ratio": 0.0002162941498681903, |
|
"completion_length": 104.9375, |
|
"epoch": 0.42333333333333334, |
|
"grad_norm": 1.037480255167932, |
|
"kl": 0.4396484375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0271, |
|
"reward": 0.9183333118756613, |
|
"reward_std": 0.06705306967099507, |
|
"rewards/judge_tool_use": 0.949999988079071, |
|
"rewards/judge_tool_use/std": 0.09021268288294475, |
|
"rewards/verify_correctness": 0.7916666666666666, |
|
"rewards/verify_correctness/std": 0.305153489112854, |
|
"step": 635 |
|
}, |
|
{ |
|
"clip_ratio": 0.004081117268651724, |
|
"completion_length": 61.25, |
|
"epoch": 0.4266666666666667, |
|
"grad_norm": 0.07253261510235819, |
|
"kl": 7.5224609375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0144, |
|
"reward": 0.9799999892711639, |
|
"reward_std": 0.014142139814794064, |
|
"rewards/judge_tool_use": 0.9749999642372131, |
|
"rewards/judge_tool_use/std": 0.038117386400699615, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 640 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.10416666666667, |
|
"epoch": 0.43, |
|
"grad_norm": 1.233226781331498, |
|
"kl": 18.0759765625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0359, |
|
"reward": 0.9791666666666666, |
|
"reward_std": 0.021593637764453888, |
|
"rewards/judge_tool_use": 0.9895833333333334, |
|
"rewards/judge_tool_use/std": 0.015957123289505642, |
|
"rewards/verify_correctness": 0.9375, |
|
"rewards/verify_correctness/std": 0.1343709627787272, |
|
"step": 645 |
|
}, |
|
{ |
|
"clip_ratio": 0.0022598175797611474, |
|
"completion_length": 72.46875, |
|
"epoch": 0.43333333333333335, |
|
"grad_norm": 0.35274976474922665, |
|
"kl": 0.51875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0009, |
|
"reward": 0.9975000023841858, |
|
"reward_std": 0.007071069907397032, |
|
"rewards/judge_tool_use": 0.996874988079071, |
|
"rewards/judge_tool_use/std": 0.012500002980232239, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 650 |
|
}, |
|
{ |
|
"clip_ratio": 0.00019841270986944436, |
|
"completion_length": 88.0625, |
|
"epoch": 0.43666666666666665, |
|
"grad_norm": 2.1379059746819324, |
|
"kl": 0.41953125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0001, |
|
"reward": 0.9150000015894572, |
|
"reward_std": 0.07097954737643401, |
|
"rewards/judge_tool_use": 0.9458333253860474, |
|
"rewards/judge_tool_use/std": 0.11968278015653293, |
|
"rewards/verify_correctness": 0.7916666666666666, |
|
"rewards/verify_correctness/std": 0.28598760565121967, |
|
"step": 655 |
|
}, |
|
{ |
|
"clip_ratio": 0.0004010346601717174, |
|
"completion_length": 83.1875, |
|
"epoch": 0.44, |
|
"grad_norm": 0.0809447830533763, |
|
"kl": 0.698828125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0012, |
|
"reward": 0.9275000095367432, |
|
"reward_std": 0.05599744990468025, |
|
"rewards/judge_tool_use": 0.971875011920929, |
|
"rewards/judge_tool_use/std": 0.09994790703058243, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 660 |
|
}, |
|
{ |
|
"clip_ratio": 0.0002642008010298014, |
|
"completion_length": 64.08333333333333, |
|
"epoch": 0.44333333333333336, |
|
"grad_norm": 0.12353014311423977, |
|
"kl": 1.115234375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0024, |
|
"reward": 0.9483333230018616, |
|
"reward_std": 0.011841016821563244, |
|
"rewards/judge_tool_use": 0.9770833253860474, |
|
"rewards/judge_tool_use/std": 0.03198537975549698, |
|
"rewards/verify_correctness": 0.8333333333333334, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 665 |
|
}, |
|
{ |
|
"clip_ratio": 0.000147058826405555, |
|
"completion_length": 60.0625, |
|
"epoch": 0.44666666666666666, |
|
"grad_norm": 0.033734607078496216, |
|
"kl": 0.28359375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"reward": 1.0, |
|
"reward_std": 0.0, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 670 |
|
}, |
|
{ |
|
"clip_ratio": 0.0007290652487426996, |
|
"completion_length": 75.3125, |
|
"epoch": 0.45, |
|
"grad_norm": 0.28937763507235525, |
|
"kl": 0.41787109375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"reward": 0.9925000071525574, |
|
"reward_std": 0.01676552618543307, |
|
"rewards/judge_tool_use": 0.9958333373069763, |
|
"rewards/judge_tool_use/std": 0.011385502914587656, |
|
"rewards/verify_correctness": 0.9791666666666666, |
|
"rewards/verify_correctness/std": 0.08333333333333333, |
|
"step": 675 |
|
}, |
|
{ |
|
"clip_ratio": 0.0007440476212650538, |
|
"completion_length": 101.75, |
|
"epoch": 0.4533333333333333, |
|
"grad_norm": 0.21986652830727627, |
|
"kl": 0.30478515625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"reward": 0.9925000071525574, |
|
"reward_std": 0.010350990109145641, |
|
"rewards/judge_tool_use": 0.9906249940395355, |
|
"rewards/judge_tool_use/std": 0.0201556496322155, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 680 |
|
}, |
|
{ |
|
"clip_ratio": 0.0001519756857305765, |
|
"completion_length": 72.25, |
|
"epoch": 0.45666666666666667, |
|
"grad_norm": 0.7866833774635168, |
|
"kl": 0.2416015625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0008, |
|
"reward": 0.9599999984105428, |
|
"reward_std": 0.02014437907685836, |
|
"rewards/judge_tool_use": 0.981249988079071, |
|
"rewards/judge_tool_use/std": 0.025546599179506302, |
|
"rewards/verify_correctness": 0.875, |
|
"rewards/verify_correctness/std": 0.16666666666666666, |
|
"step": 685 |
|
}, |
|
{ |
|
"clip_ratio": 0.003282261826097965, |
|
"completion_length": 80.8125, |
|
"epoch": 0.46, |
|
"grad_norm": 0.12656362296920962, |
|
"kl": 0.209375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0001, |
|
"reward": 0.9350000023841858, |
|
"reward_std": 0.017422057688236237, |
|
"rewards/judge_tool_use": 0.981249988079071, |
|
"rewards/judge_tool_use/std": 0.025000007823109627, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 690 |
|
}, |
|
{ |
|
"clip_ratio": 0.0008928571827709675, |
|
"completion_length": 81.95833333333333, |
|
"epoch": 0.4633333333333333, |
|
"grad_norm": 0.07893765556745304, |
|
"kl": 0.784375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0017, |
|
"reward": 0.9733333190282186, |
|
"reward_std": 0.01708986610174179, |
|
"rewards/judge_tool_use": 0.9979166587193807, |
|
"rewards/judge_tool_use/std": 0.008333335320154825, |
|
"rewards/verify_correctness": 0.875, |
|
"rewards/verify_correctness/std": 0.16666666666666666, |
|
"step": 695 |
|
}, |
|
{ |
|
"clip_ratio": 0.0006349206436425447, |
|
"completion_length": 68.3125, |
|
"epoch": 0.4666666666666667, |
|
"grad_norm": 1.2625190305994096, |
|
"kl": 1.878515625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0037, |
|
"reward": 0.9900000095367432, |
|
"reward_std": 0.010690455324947834, |
|
"rewards/judge_tool_use": 0.987500011920929, |
|
"rewards/judge_tool_use/std": 0.02236068621277809, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 700 |
|
}, |
|
{ |
|
"clip_ratio": 0.0009756097570061684, |
|
"completion_length": 67.77083333333333, |
|
"epoch": 0.47, |
|
"grad_norm": 0.10890212362336826, |
|
"kl": 0.26640625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0003, |
|
"reward": 0.978333314259847, |
|
"reward_std": 0.015430334955453873, |
|
"rewards/judge_tool_use": 0.9833333293596903, |
|
"rewards/judge_tool_use/std": 0.017213263859351475, |
|
"rewards/verify_correctness": 0.9583333333333334, |
|
"rewards/verify_correctness/std": 0.11385501424471538, |
|
"step": 705 |
|
}, |
|
{ |
|
"clip_ratio": 0.0007211538497358561, |
|
"completion_length": 92.8125, |
|
"epoch": 0.47333333333333333, |
|
"grad_norm": 0.027587471479060908, |
|
"kl": 0.21982421875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0005, |
|
"reward": 0.9712499976158142, |
|
"reward_std": 0.018077218905091286, |
|
"rewards/judge_tool_use": 0.9718749821186066, |
|
"rewards/judge_tool_use/std": 0.03831989876925945, |
|
"rewards/verify_correctness": 0.96875, |
|
"rewards/verify_correctness/std": 0.125, |
|
"step": 710 |
|
}, |
|
{ |
|
"clip_ratio": 0.0009474414400756359, |
|
"completion_length": 108.08333333333333, |
|
"epoch": 0.4766666666666667, |
|
"grad_norm": 0.19022699842586205, |
|
"kl": 0.237890625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0022, |
|
"reward": 0.9325000047683716, |
|
"reward_std": 0.03303187837203344, |
|
"rewards/judge_tool_use": 0.9833333293596903, |
|
"rewards/judge_tool_use/std": 0.02805217479666074, |
|
"rewards/verify_correctness": 0.7291666666666666, |
|
"rewards/verify_correctness/std": 0.33744919300079346, |
|
"step": 715 |
|
}, |
|
{ |
|
"clip_ratio": 0.0005668934434652328, |
|
"completion_length": 80.4375, |
|
"epoch": 0.48, |
|
"grad_norm": 0.0847895586797689, |
|
"kl": 0.2830078125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0009, |
|
"reward": 0.9925000071525574, |
|
"reward_std": 0.010350990109145641, |
|
"rewards/judge_tool_use": 0.9906249940395355, |
|
"rewards/judge_tool_use/std": 0.0201556496322155, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 720 |
|
}, |
|
{ |
|
"clip_ratio": 0.0010113780386745929, |
|
"completion_length": 98.5625, |
|
"epoch": 0.48333333333333334, |
|
"grad_norm": 393.7775350754493, |
|
"kl": 13.9048828125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0279, |
|
"reward": 0.887499988079071, |
|
"reward_std": 0.05597590561956167, |
|
"rewards/judge_tool_use": 0.9375, |
|
"rewards/judge_tool_use/std": 0.1213516891002655, |
|
"rewards/verify_correctness": 0.6875, |
|
"rewards/verify_correctness/std": 0.3429151177406311, |
|
"step": 725 |
|
}, |
|
{ |
|
"clip_ratio": 0.00041820945334620775, |
|
"completion_length": 102.9375, |
|
"epoch": 0.4866666666666667, |
|
"grad_norm": 0.09728385322995636, |
|
"kl": 9.962890625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0203, |
|
"reward": 0.9887500107288361, |
|
"reward_std": 0.0318198068998754, |
|
"rewards/judge_tool_use": 0.9937499761581421, |
|
"rewards/judge_tool_use/std": 0.025000005960464478, |
|
"rewards/verify_correctness": 0.96875, |
|
"rewards/verify_correctness/std": 0.125, |
|
"step": 730 |
|
}, |
|
{ |
|
"clip_ratio": 0.0006047413335181773, |
|
"completion_length": 85.3125, |
|
"epoch": 0.49, |
|
"grad_norm": 0.16058354971882016, |
|
"kl": 8755.388671875, |
|
"learning_rate": 1e-06, |
|
"loss": 17.4619, |
|
"reward": 0.9358333349227905, |
|
"reward_std": 0.05956774204969406, |
|
"rewards/judge_tool_use": 0.9770833452542623, |
|
"rewards/judge_tool_use/std": 0.07801744093497594, |
|
"rewards/verify_correctness": 0.7708333333333334, |
|
"rewards/verify_correctness/std": 0.3065035541852315, |
|
"step": 735 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 68.4375, |
|
"epoch": 0.49333333333333335, |
|
"grad_norm": 0.16518417296961413, |
|
"kl": 0.1482421875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0003, |
|
"reward": 1.0, |
|
"reward_std": 0.0, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 740 |
|
}, |
|
{ |
|
"clip_ratio": 0.00011098779505118728, |
|
"completion_length": 96.8125, |
|
"epoch": 0.49666666666666665, |
|
"grad_norm": 0.06250702594297985, |
|
"kl": 0.348828125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"reward": 0.9074999690055847, |
|
"reward_std": 0.01324320025742054, |
|
"rewards/judge_tool_use": 0.9729166428248087, |
|
"rewards/judge_tool_use/std": 0.033744927495718, |
|
"rewards/verify_correctness": 0.6458333333333334, |
|
"rewards/verify_correctness/std": 0.3429151177406311, |
|
"step": 745 |
|
}, |
|
{ |
|
"clip_ratio": 0.0010483485879376532, |
|
"completion_length": 83.09375, |
|
"epoch": 0.5, |
|
"grad_norm": 7.418537851126552, |
|
"kl": 20.103515625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0342, |
|
"reward": 0.8912499845027924, |
|
"reward_std": 0.12655025720596313, |
|
"rewards/judge_tool_use": 0.934374988079071, |
|
"rewards/judge_tool_use/std": 0.1986893266439438, |
|
"rewards/verify_correctness": 0.71875, |
|
"rewards/verify_correctness/std": 0.38319888710975647, |
|
"step": 750 |
|
}, |
|
{ |
|
"clip_ratio": 0.00021881838329136373, |
|
"completion_length": 96.47916666666667, |
|
"epoch": 0.5033333333333333, |
|
"grad_norm": 0.329185897503142, |
|
"kl": 0.50859375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0142, |
|
"reward": 0.9466666579246521, |
|
"reward_std": 0.068666722625494, |
|
"rewards/judge_tool_use": 0.9645833174387614, |
|
"rewards/judge_tool_use/std": 0.08136301239331563, |
|
"rewards/verify_correctness": 0.875, |
|
"rewards/verify_correctness/std": 0.24290452400843301, |
|
"step": 755 |
|
}, |
|
{ |
|
"clip_ratio": 0.0016204309416934849, |
|
"completion_length": 79.5, |
|
"epoch": 0.5066666666666667, |
|
"grad_norm": 1.5018746267117502, |
|
"kl": 0.2720703125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0035, |
|
"reward": 0.9550000131130219, |
|
"reward_std": 0.0481070326641202, |
|
"rewards/judge_tool_use": 0.9750000238418579, |
|
"rewards/judge_tool_use/std": 0.04472137242555618, |
|
"rewards/verify_correctness": 0.875, |
|
"rewards/verify_correctness/std": 0.22360680997371674, |
|
"step": 760 |
|
}, |
|
{ |
|
"clip_ratio": 6.493506371043622e-05, |
|
"completion_length": 92.39583333333333, |
|
"epoch": 0.51, |
|
"grad_norm": 0.09973169012901163, |
|
"kl": 0.24765625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0002, |
|
"reward": 0.9900000095367432, |
|
"reward_std": 0.01655506311605374, |
|
"rewards/judge_tool_use": 0.987499992052714, |
|
"rewards/judge_tool_use/std": 0.024290458609660465, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 765 |
|
}, |
|
{ |
|
"clip_ratio": 0.0022681955248117446, |
|
"completion_length": 70.5, |
|
"epoch": 0.5133333333333333, |
|
"grad_norm": 0.05997101159912667, |
|
"kl": 0.303125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0005, |
|
"reward": 1.0, |
|
"reward_std": 0.0, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 770 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 83.33333333333333, |
|
"epoch": 0.5166666666666667, |
|
"grad_norm": 0.6264206985392197, |
|
"kl": 0.315234375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0003, |
|
"reward": 0.9091666539510092, |
|
"reward_std": 0.05015302697817484, |
|
"rewards/judge_tool_use": 0.9645833174387614, |
|
"rewards/judge_tool_use/std": 0.043106683840354286, |
|
"rewards/verify_correctness": 0.6875, |
|
"rewards/verify_correctness/std": 0.33744919300079346, |
|
"step": 775 |
|
}, |
|
{ |
|
"clip_ratio": 0.0017392298206686974, |
|
"completion_length": 93.53125, |
|
"epoch": 0.52, |
|
"grad_norm": 0.524085431110826, |
|
"kl": 0.4203125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0004, |
|
"reward": 0.9474999904632568, |
|
"reward_std": 0.03535534022375941, |
|
"rewards/judge_tool_use": 0.965624988079071, |
|
"rewards/judge_tool_use/std": 0.037500010803341866, |
|
"rewards/verify_correctness": 0.875, |
|
"rewards/verify_correctness/std": 0.22360680997371674, |
|
"step": 780 |
|
}, |
|
{ |
|
"clip_ratio": 0.0006958305835723877, |
|
"completion_length": 102.04166666666667, |
|
"epoch": 0.5233333333333333, |
|
"grad_norm": 0.10557989588862941, |
|
"kl": 0.227734375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0011, |
|
"reward": 0.9399999777475992, |
|
"reward_std": 0.04806827505429586, |
|
"rewards/judge_tool_use": 0.9666666587193807, |
|
"rewards/judge_tool_use/std": 0.07302967707316081, |
|
"rewards/verify_correctness": 0.8333333333333334, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 785 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 69.15625, |
|
"epoch": 0.5266666666666666, |
|
"grad_norm": 0.02144329361599325, |
|
"kl": 0.2107421875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0004, |
|
"reward": 0.8799999952316284, |
|
"reward_std": 0.0, |
|
"rewards/judge_tool_use": 0.9749999940395355, |
|
"rewards/judge_tool_use/std": 0.025819895789027214, |
|
"rewards/verify_correctness": 0.5, |
|
"rewards/verify_correctness/std": 0.5163977742195129, |
|
"step": 790 |
|
}, |
|
{ |
|
"clip_ratio": 0.001966528873890638, |
|
"completion_length": 102.5, |
|
"epoch": 0.53, |
|
"grad_norm": 5.014929981150802, |
|
"kl": 0.689453125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0161, |
|
"reward": 0.8583333293596903, |
|
"reward_std": 0.13183549232780933, |
|
"rewards/judge_tool_use": 0.9270833532015482, |
|
"rewards/judge_tool_use/std": 0.16641392558813095, |
|
"rewards/verify_correctness": 0.5833333333333334, |
|
"rewards/verify_correctness/std": 0.3303537170092265, |
|
"step": 795 |
|
}, |
|
{ |
|
"clip_ratio": 0.0018077531363815068, |
|
"completion_length": 65.125, |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 2.4449889650147725, |
|
"kl": 0.616796875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0159, |
|
"reward": 0.9699999988079071, |
|
"reward_std": 0.032513730227947235, |
|
"rewards/judge_tool_use": 0.9781249761581421, |
|
"rewards/judge_tool_use/std": 0.03145764768123627, |
|
"rewards/verify_correctness": 0.9375, |
|
"rewards/verify_correctness/std": 0.17078252136707306, |
|
"step": 800 |
|
}, |
|
{ |
|
"clip_ratio": 0.0011476841289550066, |
|
"completion_length": 120.14583333333333, |
|
"epoch": 0.5366666666666666, |
|
"grad_norm": 0.5110356082388042, |
|
"kl": 65.94375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.1173, |
|
"reward": 0.9258333444595337, |
|
"reward_std": 0.09278637667496999, |
|
"rewards/judge_tool_use": 0.975000003973643, |
|
"rewards/judge_tool_use/std": 0.09471883624792099, |
|
"rewards/verify_correctness": 0.7291666666666666, |
|
"rewards/verify_correctness/std": 0.4166666666666667, |
|
"step": 805 |
|
}, |
|
{ |
|
"clip_ratio": 0.0009965144796296953, |
|
"completion_length": 104.90625, |
|
"epoch": 0.54, |
|
"grad_norm": 0.04881719966559261, |
|
"kl": 0.297265625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0012, |
|
"reward": 0.9224999845027924, |
|
"reward_std": 0.031052968464791775, |
|
"rewards/judge_tool_use": 0.965624988079071, |
|
"rewards/judge_tool_use/std": 0.045975545421242714, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 810 |
|
}, |
|
{ |
|
"clip_ratio": 0.0006250000093132258, |
|
"completion_length": 88.6875, |
|
"epoch": 0.5433333333333333, |
|
"grad_norm": 0.15076410059985984, |
|
"kl": 0.3453125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0169, |
|
"reward": 0.8958333333333334, |
|
"reward_std": 0.06509770142535369, |
|
"rewards/judge_tool_use": 0.9583333333333334, |
|
"rewards/judge_tool_use/std": 0.10000000521540642, |
|
"rewards/verify_correctness": 0.6458333333333334, |
|
"rewards/verify_correctness/std": 0.427598516146342, |
|
"step": 815 |
|
}, |
|
{ |
|
"clip_ratio": 0.0023800658993422983, |
|
"completion_length": 93.53125, |
|
"epoch": 0.5466666666666666, |
|
"grad_norm": 0.13059834615215846, |
|
"kl": 221.2806640625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.4431, |
|
"reward": 0.9950000047683716, |
|
"reward_std": 0.014142139814794064, |
|
"rewards/judge_tool_use": 0.9937500059604645, |
|
"rewards/judge_tool_use/std": 0.017078254371881485, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 820 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 64.16666666666667, |
|
"epoch": 0.55, |
|
"grad_norm": 0.21646392402860518, |
|
"kl": 179.3978515625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.358, |
|
"reward": 0.9908333222071329, |
|
"reward_std": 0.011785114804903666, |
|
"rewards/judge_tool_use": 0.993749996026357, |
|
"rewards/judge_tool_use/std": 0.013437099754810333, |
|
"rewards/verify_correctness": 0.9791666666666666, |
|
"rewards/verify_correctness/std": 0.08333333333333333, |
|
"step": 825 |
|
}, |
|
{ |
|
"clip_ratio": 0.0006482481141574681, |
|
"completion_length": 70.6875, |
|
"epoch": 0.5533333333333333, |
|
"grad_norm": 1.2886721610742002, |
|
"kl": 0.33203125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.01, |
|
"reward": 0.8974999785423279, |
|
"reward_std": 0.060415223240852356, |
|
"rewards/judge_tool_use": 0.934374988079071, |
|
"rewards/judge_tool_use/std": 0.12344871461391449, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 830 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 66.39583333333333, |
|
"epoch": 0.5566666666666666, |
|
"grad_norm": 0.2650259970239174, |
|
"kl": 0.312109375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"reward": 1.0, |
|
"reward_std": 0.0, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 835 |
|
}, |
|
{ |
|
"clip_ratio": 0.0015576324425637723, |
|
"completion_length": 62.5625, |
|
"epoch": 0.56, |
|
"grad_norm": 534.346335825647, |
|
"kl": 87.0494140625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.2192, |
|
"reward": 0.9300000071525574, |
|
"reward_std": 0.05656854063272476, |
|
"rewards/judge_tool_use": 0.9749999940395355, |
|
"rewards/judge_tool_use/std": 0.10000000149011612, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 840 |
|
}, |
|
{ |
|
"clip_ratio": 6.523157353512942e-05, |
|
"completion_length": 89.6875, |
|
"epoch": 0.5633333333333334, |
|
"grad_norm": 3.527265824573338, |
|
"kl": 0.693359375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0109, |
|
"reward": 0.9291666746139526, |
|
"reward_std": 0.06318580235044162, |
|
"rewards/judge_tool_use": 0.96875, |
|
"rewards/judge_tool_use/std": 0.09166666989525159, |
|
"rewards/verify_correctness": 0.7708333333333334, |
|
"rewards/verify_correctness/std": 0.3065035541852315, |
|
"step": 845 |
|
}, |
|
{ |
|
"clip_ratio": 0.0012977312318980694, |
|
"completion_length": 70.15625, |
|
"epoch": 0.5666666666666667, |
|
"grad_norm": 0.1420108574253742, |
|
"kl": 0.4056640625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0112, |
|
"reward": 0.9874999821186066, |
|
"reward_std": 0.010350990109145641, |
|
"rewards/judge_tool_use": 0.984375, |
|
"rewards/judge_tool_use/std": 0.023935683071613312, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 850 |
|
}, |
|
{ |
|
"clip_ratio": 0.001287515088915825, |
|
"completion_length": 65.5, |
|
"epoch": 0.57, |
|
"grad_norm": 0.4405911139980202, |
|
"kl": 0.929296875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0015, |
|
"reward": 0.9458333055178324, |
|
"reward_std": 0.03568211570382118, |
|
"rewards/judge_tool_use": 0.9791666666666666, |
|
"rewards/judge_tool_use/std": 0.028598766773939133, |
|
"rewards/verify_correctness": 0.8125, |
|
"rewards/verify_correctness/std": 0.30103762944539386, |
|
"step": 855 |
|
}, |
|
{ |
|
"clip_ratio": 0.0007213706150650979, |
|
"completion_length": 99.4375, |
|
"epoch": 0.5733333333333334, |
|
"grad_norm": 0.6950772748452339, |
|
"kl": 0.4955078125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0017, |
|
"reward": 0.8187499940395355, |
|
"reward_std": 0.11091133579611778, |
|
"rewards/judge_tool_use": 0.84375, |
|
"rewards/judge_tool_use/std": 0.21456576697528362, |
|
"rewards/verify_correctness": 0.71875, |
|
"rewards/verify_correctness/std": 0.38319888710975647, |
|
"step": 860 |
|
}, |
|
{ |
|
"clip_ratio": 0.00097902100533247, |
|
"completion_length": 70.625, |
|
"epoch": 0.5766666666666667, |
|
"grad_norm": 0.08707789007037814, |
|
"kl": 0.311328125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0007, |
|
"reward": 0.9916666746139526, |
|
"reward_std": 0.01307279740770658, |
|
"rewards/judge_tool_use": 0.9895833333333334, |
|
"rewards/judge_tool_use/std": 0.02482260266939799, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 865 |
|
}, |
|
{ |
|
"clip_ratio": 0.0026898022275418042, |
|
"completion_length": 145.34375, |
|
"epoch": 0.58, |
|
"grad_norm": 0.12277957926584093, |
|
"kl": 16.7296875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0335, |
|
"reward": 0.6724999994039536, |
|
"reward_std": 0.024493126198649406, |
|
"rewards/judge_tool_use": 0.7781250178813934, |
|
"rewards/judge_tool_use/std": 0.24047533050179482, |
|
"rewards/verify_correctness": 0.25, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 870 |
|
}, |
|
{ |
|
"clip_ratio": 0.00219599399715662, |
|
"completion_length": 76.33333333333333, |
|
"epoch": 0.5833333333333334, |
|
"grad_norm": 2.725601337016608, |
|
"kl": 0.30625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0028, |
|
"reward": 0.8441666563351949, |
|
"reward_std": 0.09498834641029437, |
|
"rewards/judge_tool_use": 0.9041666587193807, |
|
"rewards/judge_tool_use/std": 0.15194741388161978, |
|
"rewards/verify_correctness": 0.6041666666666666, |
|
"rewards/verify_correctness/std": 0.4983704487482707, |
|
"step": 875 |
|
}, |
|
{ |
|
"clip_ratio": 0.0008220872841775417, |
|
"completion_length": 100.59375, |
|
"epoch": 0.5866666666666667, |
|
"grad_norm": 0.3521652411691662, |
|
"kl": 0.470703125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0135, |
|
"reward": 0.8425000011920929, |
|
"reward_std": 0.12523019313812256, |
|
"rewards/judge_tool_use": 0.8812500238418579, |
|
"rewards/judge_tool_use/std": 0.19585155323147774, |
|
"rewards/verify_correctness": 0.6875, |
|
"rewards/verify_correctness/std": 0.42898140847682953, |
|
"step": 880 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 82.3125, |
|
"epoch": 0.59, |
|
"grad_norm": 1.7981311946360221, |
|
"kl": 0.292578125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"reward": 0.9399999777475992, |
|
"reward_std": 0.0, |
|
"rewards/judge_tool_use": 0.9666666587193807, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 0.8333333333333334, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 885 |
|
}, |
|
{ |
|
"clip_ratio": 0.001058566861320287, |
|
"completion_length": 79.03125, |
|
"epoch": 0.5933333333333334, |
|
"grad_norm": 1.347847810134541, |
|
"kl": 0.2482421875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0024, |
|
"reward": 0.9225000143051147, |
|
"reward_std": 0.06236250279471278, |
|
"rewards/judge_tool_use": 0.965624988079071, |
|
"rewards/judge_tool_use/std": 0.11229145526885986, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 890 |
|
}, |
|
{ |
|
"clip_ratio": 0.0004772079642862082, |
|
"completion_length": 75.58333333333333, |
|
"epoch": 0.5966666666666667, |
|
"grad_norm": 38.987008642448025, |
|
"kl": 6.492578125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0146, |
|
"reward": 0.7949999769528707, |
|
"reward_std": 0.10766563316186269, |
|
"rewards/judge_tool_use": 0.9104166825612386, |
|
"rewards/judge_tool_use/std": 0.14955327411492667, |
|
"rewards/verify_correctness": 0.3333333333333333, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 895 |
|
}, |
|
{ |
|
"clip_ratio": 0.0026460913009941577, |
|
"completion_length": 61.3125, |
|
"epoch": 0.6, |
|
"grad_norm": 0.13988761849808656, |
|
"kl": 1.6234375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0046, |
|
"reward": 0.9762499928474426, |
|
"reward_std": 0.024748740252107382, |
|
"rewards/judge_tool_use": 0.9781249761581421, |
|
"rewards/judge_tool_use/std": 0.025617383420467377, |
|
"rewards/verify_correctness": 0.96875, |
|
"rewards/verify_correctness/std": 0.125, |
|
"step": 900 |
|
}, |
|
{ |
|
"clip_ratio": 0.0007113821338862181, |
|
"completion_length": 81.04166666666667, |
|
"epoch": 0.6033333333333334, |
|
"grad_norm": 0.07659308560187726, |
|
"kl": 0.3126953125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"reward": 0.9649999936421713, |
|
"reward_std": 0.0047140466049313545, |
|
"rewards/judge_tool_use": 0.9979166587193807, |
|
"rewards/judge_tool_use/std": 0.008333335320154825, |
|
"rewards/verify_correctness": 0.8333333333333334, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 905 |
|
}, |
|
{ |
|
"clip_ratio": 0.001008645538240671, |
|
"completion_length": 67.6875, |
|
"epoch": 0.6066666666666667, |
|
"grad_norm": 0.26869976791962935, |
|
"kl": 0.2265625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0005, |
|
"reward": 0.9975000023841858, |
|
"reward_std": 0.007071069907397032, |
|
"rewards/judge_tool_use": 0.996874988079071, |
|
"rewards/judge_tool_use/std": 0.012500002980232239, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 910 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 60.125, |
|
"epoch": 0.61, |
|
"grad_norm": 0.07923399412772805, |
|
"kl": 0.25546875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0016, |
|
"reward": 0.9566666682561239, |
|
"reward_std": 0.018856181452671688, |
|
"rewards/judge_tool_use": 0.9979166587193807, |
|
"rewards/judge_tool_use/std": 0.008333335320154825, |
|
"rewards/verify_correctness": 0.7916666666666666, |
|
"rewards/verify_correctness/std": 0.28598760565121967, |
|
"step": 915 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 90.375, |
|
"epoch": 0.6133333333333333, |
|
"grad_norm": 1.5125503684257633, |
|
"kl": 0.2650390625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0057, |
|
"reward": 0.9162499904632568, |
|
"reward_std": 0.09739170409739017, |
|
"rewards/judge_tool_use": 0.949999988079071, |
|
"rewards/judge_tool_use/std": 0.09660918265581131, |
|
"rewards/verify_correctness": 0.78125, |
|
"rewards/verify_correctness/std": 0.375, |
|
"step": 920 |
|
}, |
|
{ |
|
"clip_ratio": 0.0003249390749260783, |
|
"completion_length": 83.20833333333333, |
|
"epoch": 0.6166666666666667, |
|
"grad_norm": 0.0376142648380269, |
|
"kl": 0.437109375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0011, |
|
"reward": 0.9416666626930237, |
|
"reward_std": 0.016618976990381878, |
|
"rewards/judge_tool_use": 0.96875, |
|
"rewards/judge_tool_use/std": 0.0429793248573939, |
|
"rewards/verify_correctness": 0.8333333333333334, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 925 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 61.34375, |
|
"epoch": 0.62, |
|
"grad_norm": 0.017493888934549447, |
|
"kl": 0.205078125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0004, |
|
"reward": 0.949999988079071, |
|
"reward_std": 0.0, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 930 |
|
}, |
|
{ |
|
"clip_ratio": 0.001988636329770088, |
|
"completion_length": 86.3125, |
|
"epoch": 0.6233333333333333, |
|
"grad_norm": 0.3529719981491763, |
|
"kl": 0.167578125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0004, |
|
"reward": 0.9683333436648051, |
|
"reward_std": 0.0246026162058115, |
|
"rewards/judge_tool_use": 0.9916666746139526, |
|
"rewards/judge_tool_use/std": 0.022771005829175312, |
|
"rewards/verify_correctness": 0.875, |
|
"rewards/verify_correctness/std": 0.16666666666666666, |
|
"step": 935 |
|
}, |
|
{ |
|
"clip_ratio": 0.0003450655611231923, |
|
"completion_length": 68.0, |
|
"epoch": 0.6266666666666667, |
|
"grad_norm": 0.0932730656399196, |
|
"kl": 0.286328125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"reward": 1.0, |
|
"reward_std": 0.0, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 940 |
|
}, |
|
{ |
|
"clip_ratio": 0.001211305521428585, |
|
"completion_length": 62.854166666666664, |
|
"epoch": 0.63, |
|
"grad_norm": 0.08491446176113226, |
|
"kl": 0.21591796875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0004, |
|
"reward": 0.9583333333333334, |
|
"reward_std": 0.006900658831000328, |
|
"rewards/judge_tool_use": 0.9895833333333334, |
|
"rewards/judge_tool_use/std": 0.015957122047742207, |
|
"rewards/verify_correctness": 0.8333333333333334, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 945 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 54.84375, |
|
"epoch": 0.6333333333333333, |
|
"grad_norm": 0.08445845606617904, |
|
"kl": 0.37021484375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0007, |
|
"reward": 1.0, |
|
"reward_std": 0.0, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 950 |
|
}, |
|
{ |
|
"clip_ratio": 0.0009750896133482456, |
|
"completion_length": 93.54166666666667, |
|
"epoch": 0.6366666666666667, |
|
"grad_norm": 9.5445004824132, |
|
"kl": 0.745703125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0385, |
|
"reward": 0.9033333460489908, |
|
"reward_std": 0.06474923094113667, |
|
"rewards/judge_tool_use": 0.962499996026357, |
|
"rewards/judge_tool_use/std": 0.10911189516385396, |
|
"rewards/verify_correctness": 0.6666666666666666, |
|
"rewards/verify_correctness/std": 0.3442651828130086, |
|
"step": 955 |
|
}, |
|
{ |
|
"clip_ratio": 0.0023815435823053123, |
|
"completion_length": 57.875, |
|
"epoch": 0.64, |
|
"grad_norm": 0.3188163340929078, |
|
"kl": 0.2923828125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.023, |
|
"reward": 0.9975000023841858, |
|
"reward_std": 0.007071069907397032, |
|
"rewards/judge_tool_use": 0.996874988079071, |
|
"rewards/judge_tool_use/std": 0.012500002980232239, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 960 |
|
}, |
|
{ |
|
"clip_ratio": 0.0010169491171836853, |
|
"completion_length": 79.75, |
|
"epoch": 0.6433333333333333, |
|
"grad_norm": 0.047849220688192536, |
|
"kl": 0.3609375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0009, |
|
"reward": 0.9450000127156576, |
|
"reward_std": 0.015430334955453873, |
|
"rewards/judge_tool_use": 0.9833333293596903, |
|
"rewards/judge_tool_use/std": 0.017213263859351475, |
|
"rewards/verify_correctness": 0.7916666666666666, |
|
"rewards/verify_correctness/std": 0.28598760565121967, |
|
"step": 965 |
|
}, |
|
{ |
|
"clip_ratio": 0.0007624854100868106, |
|
"completion_length": 99.40625, |
|
"epoch": 0.6466666666666666, |
|
"grad_norm": 1.6765942241870033, |
|
"kl": 0.3326171875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0178, |
|
"reward": 0.7712499797344208, |
|
"reward_std": 0.2565469294786453, |
|
"rewards/judge_tool_use": 0.815625011920929, |
|
"rewards/judge_tool_use/std": 0.3259209841489792, |
|
"rewards/verify_correctness": 0.59375, |
|
"rewards/verify_correctness/std": 0.497555673122406, |
|
"step": 970 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 73.47916666666667, |
|
"epoch": 0.65, |
|
"grad_norm": 0.06349338696726389, |
|
"kl": 0.341796875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"reward": 0.950000007947286, |
|
"reward_std": 0.006172137334942818, |
|
"rewards/judge_tool_use": 0.9791666666666666, |
|
"rewards/judge_tool_use/std": 0.028598766773939133, |
|
"rewards/verify_correctness": 0.8333333333333334, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 975 |
|
}, |
|
{ |
|
"clip_ratio": 0.0012797552859410643, |
|
"completion_length": 63.3125, |
|
"epoch": 0.6533333333333333, |
|
"grad_norm": 0.28318230460197835, |
|
"kl": 0.2015625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0, |
|
"reward": 0.9862500131130219, |
|
"reward_std": 0.0340069429948926, |
|
"rewards/judge_tool_use": 0.9906249940395355, |
|
"rewards/judge_tool_use/std": 0.029578257352113724, |
|
"rewards/verify_correctness": 0.96875, |
|
"rewards/verify_correctness/std": 0.125, |
|
"step": 980 |
|
}, |
|
{ |
|
"clip_ratio": 0.00012330455938354136, |
|
"completion_length": 65.1875, |
|
"epoch": 0.6566666666666666, |
|
"grad_norm": 0.040520629760965425, |
|
"kl": 0.12958984375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0029, |
|
"reward": 0.9691666563351949, |
|
"reward_std": 0.022236108779907227, |
|
"rewards/judge_tool_use": 0.987499992052714, |
|
"rewards/judge_tool_use/std": 0.016666671882073086, |
|
"rewards/verify_correctness": 0.8958333333333334, |
|
"rewards/verify_correctness/std": 0.1595711906750997, |
|
"step": 985 |
|
}, |
|
{ |
|
"clip_ratio": 0.0002617801073938608, |
|
"completion_length": 57.9375, |
|
"epoch": 0.66, |
|
"grad_norm": 0.08013232055388929, |
|
"kl": 0.2423828125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0005, |
|
"reward": 0.9375, |
|
"reward_std": 0.010350988246500492, |
|
"rewards/judge_tool_use": 0.984375, |
|
"rewards/judge_tool_use/std": 0.023935683071613312, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 990 |
|
}, |
|
{ |
|
"clip_ratio": 0.0005273823626339436, |
|
"completion_length": 125.39583333333333, |
|
"epoch": 0.6633333333333333, |
|
"grad_norm": 0.21519543469410263, |
|
"kl": 0.4150390625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0002, |
|
"reward": 0.9150000214576721, |
|
"reward_std": 0.06592664029449224, |
|
"rewards/judge_tool_use": 0.9770833253860474, |
|
"rewards/judge_tool_use/std": 0.050411589443683624, |
|
"rewards/verify_correctness": 0.6666666666666666, |
|
"rewards/verify_correctness/std": 0.45265427231788635, |
|
"step": 995 |
|
}, |
|
{ |
|
"clip_ratio": 0.0036985486280173064, |
|
"completion_length": 77.15625, |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.13812841795509134, |
|
"kl": 0.2712890625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"reward": 0.9699999690055847, |
|
"reward_std": 0.017422060016542673, |
|
"rewards/judge_tool_use": 0.9624999761581421, |
|
"rewards/judge_tool_use/std": 0.04955306649208069, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1000 |
|
}, |
|
{ |
|
"clip_ratio": 0.001371738500893116, |
|
"completion_length": 80.89583333333333, |
|
"epoch": 0.67, |
|
"grad_norm": 0.026859045422407482, |
|
"kl": 0.25078125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0027, |
|
"reward": 0.9274999896685282, |
|
"reward_std": 0.048885335214436054, |
|
"rewards/judge_tool_use": 0.9666666587193807, |
|
"rewards/judge_tool_use/std": 0.03929029653469721, |
|
"rewards/verify_correctness": 0.7708333333333334, |
|
"rewards/verify_correctness/std": 0.1595711906750997, |
|
"step": 1005 |
|
}, |
|
{ |
|
"clip_ratio": 0.0007334963418543338, |
|
"completion_length": 73.1875, |
|
"epoch": 0.6733333333333333, |
|
"grad_norm": 0.029137916766353474, |
|
"kl": 0.215625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0004, |
|
"reward": 0.9925000071525574, |
|
"reward_std": 0.010350990109145641, |
|
"rewards/judge_tool_use": 0.9906249940395355, |
|
"rewards/judge_tool_use/std": 0.0201556496322155, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1010 |
|
}, |
|
{ |
|
"clip_ratio": 0.001794871874153614, |
|
"completion_length": 55.5, |
|
"epoch": 0.6766666666666666, |
|
"grad_norm": 0.05424116695206805, |
|
"kl": 0.14228515625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0003, |
|
"reward": 0.9900000095367432, |
|
"reward_std": 0.013801320145527521, |
|
"rewards/judge_tool_use": 0.987499992052714, |
|
"rewards/judge_tool_use/std": 0.026874199509620667, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1015 |
|
}, |
|
{ |
|
"clip_ratio": 0.0038688791915774345, |
|
"completion_length": 83.125, |
|
"epoch": 0.68, |
|
"grad_norm": 0.6002387040564632, |
|
"kl": 0.71337890625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0016, |
|
"reward": 0.9775000214576721, |
|
"reward_std": 0.03518358897417784, |
|
"rewards/judge_tool_use": 0.9718749821186066, |
|
"rewards/judge_tool_use/std": 0.04269563965499401, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1020 |
|
}, |
|
{ |
|
"clip_ratio": 0.00015748031437397004, |
|
"completion_length": 73.08333333333333, |
|
"epoch": 0.6833333333333333, |
|
"grad_norm": 0.17152579955328484, |
|
"kl": 0.23369140625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0004, |
|
"reward": 0.9566666682561239, |
|
"reward_std": 0.011614705435931683, |
|
"rewards/judge_tool_use": 0.987499992052714, |
|
"rewards/judge_tool_use/std": 0.024290457367897034, |
|
"rewards/verify_correctness": 0.8333333333333334, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 1025 |
|
}, |
|
{ |
|
"clip_ratio": 0.0013904837891459466, |
|
"completion_length": 82.09375, |
|
"epoch": 0.6866666666666666, |
|
"grad_norm": 0.06465280465321986, |
|
"kl": 0.1673828125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0003, |
|
"reward": 0.9874999821186066, |
|
"reward_std": 0.010350990109145641, |
|
"rewards/judge_tool_use": 0.984375, |
|
"rewards/judge_tool_use/std": 0.023935683071613312, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1030 |
|
}, |
|
{ |
|
"clip_ratio": 0.0010101010091602802, |
|
"completion_length": 95.77083333333333, |
|
"epoch": 0.69, |
|
"grad_norm": 0.0497496702632166, |
|
"kl": 0.26328125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0005, |
|
"reward": 0.9883333245913187, |
|
"reward_std": 0.010886183629433313, |
|
"rewards/judge_tool_use": 0.9854166507720947, |
|
"rewards/judge_tool_use/std": 0.017078256855408352, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1035 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 89.5625, |
|
"epoch": 0.6933333333333334, |
|
"grad_norm": 0.05805638933980568, |
|
"kl": 0.33515625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0007, |
|
"reward": 0.9474999904632568, |
|
"reward_std": 0.007071072701364756, |
|
"rewards/judge_tool_use": 0.996874988079071, |
|
"rewards/judge_tool_use/std": 0.012500002980232239, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 1040 |
|
}, |
|
{ |
|
"clip_ratio": 0.0005376344081014395, |
|
"completion_length": 69.0, |
|
"epoch": 0.6966666666666667, |
|
"grad_norm": 0.057741287045645795, |
|
"kl": 0.23125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0005, |
|
"reward": 0.9933333396911621, |
|
"reward_std": 0.007126972700158755, |
|
"rewards/judge_tool_use": 0.9916666746139526, |
|
"rewards/judge_tool_use/std": 0.01490712414185206, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1045 |
|
}, |
|
{ |
|
"clip_ratio": 0.0020588235929608345, |
|
"completion_length": 89.0, |
|
"epoch": 0.7, |
|
"grad_norm": 0.11086366545110844, |
|
"kl": 0.2451171875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0005, |
|
"reward": 0.9975000023841858, |
|
"reward_std": 0.007071069907397032, |
|
"rewards/judge_tool_use": 0.996874988079071, |
|
"rewards/judge_tool_use/std": 0.012500002980232239, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1050 |
|
}, |
|
{ |
|
"clip_ratio": 0.002628726325929165, |
|
"completion_length": 75.72916666666667, |
|
"epoch": 0.7033333333333334, |
|
"grad_norm": 1.1943993692559909, |
|
"kl": 11.296875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0049, |
|
"reward": 0.8274999856948853, |
|
"reward_std": 0.11860653261343639, |
|
"rewards/judge_tool_use": 0.8416666587193807, |
|
"rewards/judge_tool_use/std": 0.2197331190109253, |
|
"rewards/verify_correctness": 0.7708333333333334, |
|
"rewards/verify_correctness/std": 0.3198537329832713, |
|
"step": 1055 |
|
}, |
|
{ |
|
"clip_ratio": 0.0027464469894766808, |
|
"completion_length": 76.46875, |
|
"epoch": 0.7066666666666667, |
|
"grad_norm": 0.09154251338419422, |
|
"kl": 0.7396484375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0038, |
|
"reward": 0.9200000166893005, |
|
"reward_std": 0.029206860810518265, |
|
"rewards/judge_tool_use": 0.9624999761581421, |
|
"rewards/judge_tool_use/std": 0.03872983902692795, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 1060 |
|
}, |
|
{ |
|
"clip_ratio": 0.0025510898791253566, |
|
"completion_length": 85.54166666666667, |
|
"epoch": 0.71, |
|
"grad_norm": 7.4396963421241615, |
|
"kl": 1.466015625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0026, |
|
"reward": 0.9608333309491476, |
|
"reward_std": 0.016499162030716736, |
|
"rewards/judge_tool_use": 0.9979166587193807, |
|
"rewards/judge_tool_use/std": 0.008333335320154825, |
|
"rewards/verify_correctness": 0.8125, |
|
"rewards/verify_correctness/std": 0.25546592473983765, |
|
"step": 1065 |
|
}, |
|
{ |
|
"clip_ratio": 0.00016474464209750294, |
|
"completion_length": 65.46875, |
|
"epoch": 0.7133333333333334, |
|
"grad_norm": 0.38680025327426837, |
|
"kl": 0.7421875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0036, |
|
"reward": 0.9950000047683716, |
|
"reward_std": 0.014142131432890892, |
|
"rewards/judge_tool_use": 0.9937500059604645, |
|
"rewards/judge_tool_use/std": 0.02500000037252903, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1070 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 79.91666666666667, |
|
"epoch": 0.7166666666666667, |
|
"grad_norm": 0.040315040411472755, |
|
"kl": 0.2189453125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0005, |
|
"reward": 0.9933333396911621, |
|
"reward_std": 0.007126972700158755, |
|
"rewards/judge_tool_use": 0.9916666746139526, |
|
"rewards/judge_tool_use/std": 0.01490712414185206, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1075 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 68.0625, |
|
"epoch": 0.72, |
|
"grad_norm": 1.7493474554433537, |
|
"kl": 0.369140625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0001, |
|
"reward": 0.987500011920929, |
|
"reward_std": 0.02314550243318081, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 0.9375, |
|
"rewards/verify_correctness/std": 0.17078252136707306, |
|
"step": 1080 |
|
}, |
|
{ |
|
"clip_ratio": 0.001583357620984316, |
|
"completion_length": 86.85416666666667, |
|
"epoch": 0.7233333333333334, |
|
"grad_norm": 44.87687812229357, |
|
"kl": 8.5908203125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0171, |
|
"reward": 0.9758333166440328, |
|
"reward_std": 0.029838324524462223, |
|
"rewards/judge_tool_use": 0.9749999841054281, |
|
"rewards/judge_tool_use/std": 0.0367970938483874, |
|
"rewards/verify_correctness": 0.9791666666666666, |
|
"rewards/verify_correctness/std": 0.08333333333333333, |
|
"step": 1085 |
|
}, |
|
{ |
|
"clip_ratio": 0.0002669311594218016, |
|
"completion_length": 75.15625, |
|
"epoch": 0.7266666666666667, |
|
"grad_norm": 0.9238188968217804, |
|
"kl": 2.844921875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.005, |
|
"reward": 0.9012499749660492, |
|
"reward_std": 0.05243951827287674, |
|
"rewards/judge_tool_use": 0.9468749761581421, |
|
"rewards/judge_tool_use/std": 0.02212652750313282, |
|
"rewards/verify_correctness": 0.71875, |
|
"rewards/verify_correctness/std": 0.2561737895011902, |
|
"step": 1090 |
|
}, |
|
{ |
|
"clip_ratio": 0.000481430534273386, |
|
"completion_length": 77.95833333333333, |
|
"epoch": 0.73, |
|
"grad_norm": 0.7345981685759967, |
|
"kl": 0.368359375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0007, |
|
"reward": 0.9983333349227905, |
|
"reward_std": 0.0047140466049313545, |
|
"rewards/judge_tool_use": 0.9979166587193807, |
|
"rewards/judge_tool_use/std": 0.008333335320154825, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1095 |
|
}, |
|
{ |
|
"clip_ratio": 0.002522681839764118, |
|
"completion_length": 106.15625, |
|
"epoch": 0.7333333333333333, |
|
"grad_norm": 0.14338628771134002, |
|
"kl": 0.305078125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0012, |
|
"reward": 0.9387499988079071, |
|
"reward_std": 0.03364227432757616, |
|
"rewards/judge_tool_use": 0.9781250059604645, |
|
"rewards/judge_tool_use/std": 0.04251633584499359, |
|
"rewards/verify_correctness": 0.78125, |
|
"rewards/verify_correctness/std": 0.2561737895011902, |
|
"step": 1100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0025735294446349146, |
|
"completion_length": 59.166666666666664, |
|
"epoch": 0.7366666666666667, |
|
"grad_norm": 0.036437347174280016, |
|
"kl": 0.2404296875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0005, |
|
"reward": 0.9649999936421713, |
|
"reward_std": 0.004714048467576504, |
|
"rewards/judge_tool_use": 0.9979166587193807, |
|
"rewards/judge_tool_use/std": 0.008333335320154825, |
|
"rewards/verify_correctness": 0.8333333333333334, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 1105 |
|
}, |
|
{ |
|
"clip_ratio": 0.00031796502880752084, |
|
"completion_length": 78.375, |
|
"epoch": 0.74, |
|
"grad_norm": 0.057036900120037294, |
|
"kl": 0.21328125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0013, |
|
"reward": 0.9462500214576721, |
|
"reward_std": 0.08264750707894564, |
|
"rewards/judge_tool_use": 0.9406249821186066, |
|
"rewards/judge_tool_use/std": 0.15678166970610619, |
|
"rewards/verify_correctness": 0.96875, |
|
"rewards/verify_correctness/std": 0.125, |
|
"step": 1110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0009749303571879864, |
|
"completion_length": 77.10416666666667, |
|
"epoch": 0.7433333333333333, |
|
"grad_norm": 0.09765470973201737, |
|
"kl": 0.31328125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0009, |
|
"reward": 0.9383333325386047, |
|
"reward_std": 0.01632875545571248, |
|
"rewards/judge_tool_use": 0.9645833174387614, |
|
"rewards/judge_tool_use/std": 0.033744927495718, |
|
"rewards/verify_correctness": 0.8333333333333334, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 1115 |
|
}, |
|
{ |
|
"clip_ratio": 0.005039867106825113, |
|
"completion_length": 96.15625, |
|
"epoch": 0.7466666666666667, |
|
"grad_norm": 0.39855344737975584, |
|
"kl": 0.3517578125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.005, |
|
"reward": 0.8637499809265137, |
|
"reward_std": 0.09736945712938905, |
|
"rewards/judge_tool_use": 0.9312499761581421, |
|
"rewards/judge_tool_use/std": 0.1450628936290741, |
|
"rewards/verify_correctness": 0.59375, |
|
"rewards/verify_correctness/std": 0.497555673122406, |
|
"step": 1120 |
|
}, |
|
{ |
|
"clip_ratio": 0.000663868710398674, |
|
"completion_length": 88.70833333333333, |
|
"epoch": 0.75, |
|
"grad_norm": 0.03425710766538615, |
|
"kl": 0.230078125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0007, |
|
"reward": 0.9458333253860474, |
|
"reward_std": 0.05892556874702374, |
|
"rewards/judge_tool_use": 0.9791666666666666, |
|
"rewards/judge_tool_use/std": 0.0749652733405431, |
|
"rewards/verify_correctness": 0.8125, |
|
"rewards/verify_correctness/std": 0.25546592473983765, |
|
"step": 1125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 74.4375, |
|
"epoch": 0.7533333333333333, |
|
"grad_norm": 0.061636357119929305, |
|
"kl": 0.26015625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0005, |
|
"reward": 1.0, |
|
"reward_std": 0.0, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0007194244768470526, |
|
"completion_length": 72.625, |
|
"epoch": 0.7566666666666667, |
|
"grad_norm": 0.054919383958076454, |
|
"kl": 0.378515625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0009, |
|
"reward": 0.9599999984105428, |
|
"reward_std": 0.007126967112223308, |
|
"rewards/judge_tool_use": 0.9916666746139526, |
|
"rewards/judge_tool_use/std": 0.01490712414185206, |
|
"rewards/verify_correctness": 0.8333333333333334, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 1135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0003389830468222499, |
|
"completion_length": 70.875, |
|
"epoch": 0.76, |
|
"grad_norm": 0.2643017191768637, |
|
"kl": 0.21484375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"reward": 0.9824999868869781, |
|
"reward_std": 0.027645720168948174, |
|
"rewards/judge_tool_use": 0.9937500059604645, |
|
"rewards/judge_tool_use/std": 0.017078254371881485, |
|
"rewards/verify_correctness": 0.9375, |
|
"rewards/verify_correctness/std": 0.17078252136707306, |
|
"step": 1140 |
|
}, |
|
{ |
|
"clip_ratio": 0.00042253523133695126, |
|
"completion_length": 86.875, |
|
"epoch": 0.7633333333333333, |
|
"grad_norm": 0.05569328969548695, |
|
"kl": 0.20859375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0023, |
|
"reward": 0.9983333349227905, |
|
"reward_std": 0.0047140466049313545, |
|
"rewards/judge_tool_use": 0.9979166587193807, |
|
"rewards/judge_tool_use/std": 0.008333335320154825, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0009957325644791126, |
|
"completion_length": 70.15625, |
|
"epoch": 0.7666666666666667, |
|
"grad_norm": 0.04901324608109469, |
|
"kl": 0.327734375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0007, |
|
"reward": 0.9474999904632568, |
|
"reward_std": 0.007071069907397032, |
|
"rewards/judge_tool_use": 0.996874988079071, |
|
"rewards/judge_tool_use/std": 0.012500002980232239, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 1150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0023318555206060408, |
|
"completion_length": 75.5625, |
|
"epoch": 0.77, |
|
"grad_norm": 0.04106910726707159, |
|
"kl": 0.2216796875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0007, |
|
"reward": 0.9566666682561239, |
|
"reward_std": 0.011614705125490824, |
|
"rewards/judge_tool_use": 0.987499992052714, |
|
"rewards/judge_tool_use/std": 0.016666671882073086, |
|
"rewards/verify_correctness": 0.8333333333333334, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 1155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0005040322430431843, |
|
"completion_length": 83.625, |
|
"epoch": 0.7733333333333333, |
|
"grad_norm": 2.987988625745358, |
|
"kl": 0.309765625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0085, |
|
"reward": 0.8725000023841858, |
|
"reward_std": 0.08379396051168442, |
|
"rewards/judge_tool_use": 0.903124988079071, |
|
"rewards/judge_tool_use/std": 0.17461267113685608, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 1160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 103.64583333333333, |
|
"epoch": 0.7766666666666666, |
|
"grad_norm": 0.02629597807895239, |
|
"kl": 0.228125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0005, |
|
"reward": 0.9649999936421713, |
|
"reward_std": 0.0047140466049313545, |
|
"rewards/judge_tool_use": 0.9979166587193807, |
|
"rewards/judge_tool_use/std": 0.008333335320154825, |
|
"rewards/verify_correctness": 0.8333333333333334, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 1165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 57.96875, |
|
"epoch": 0.78, |
|
"grad_norm": 0.07410127643745398, |
|
"kl": 0.1939453125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0004, |
|
"reward": 1.0, |
|
"reward_std": 0.0, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0006822766736149788, |
|
"completion_length": 84.10416666666667, |
|
"epoch": 0.7833333333333333, |
|
"grad_norm": 0.15022605399359662, |
|
"kl": 0.499609375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0053, |
|
"reward": 0.8541666666666666, |
|
"reward_std": 0.08558030892163515, |
|
"rewards/judge_tool_use": 0.875, |
|
"rewards/judge_tool_use/std": 0.16530899827679, |
|
"rewards/verify_correctness": 0.7708333333333334, |
|
"rewards/verify_correctness/std": 0.3065035541852315, |
|
"step": 1175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0013619335135445, |
|
"completion_length": 59.5, |
|
"epoch": 0.7866666666666666, |
|
"grad_norm": 0.5652312034577931, |
|
"kl": 0.7025390625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0037, |
|
"reward": 0.956250011920929, |
|
"reward_std": 0.04082316905260086, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 0.78125, |
|
"rewards/verify_correctness/std": 0.375, |
|
"step": 1180 |
|
}, |
|
{ |
|
"clip_ratio": 0.00011415524641051888, |
|
"completion_length": 81.60416666666667, |
|
"epoch": 0.79, |
|
"grad_norm": 0.12620412707671058, |
|
"kl": 0.3166015625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0005, |
|
"reward": 0.9799999992052714, |
|
"reward_std": 0.021484845007459324, |
|
"rewards/judge_tool_use": 0.9854166706403097, |
|
"rewards/judge_tool_use/std": 0.027342626204093296, |
|
"rewards/verify_correctness": 0.9583333333333334, |
|
"rewards/verify_correctness/std": 0.11385501424471538, |
|
"step": 1185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0010737302247434855, |
|
"completion_length": 110.46875, |
|
"epoch": 0.7933333333333333, |
|
"grad_norm": 1.7918621578186396, |
|
"kl": 0.730078125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0108, |
|
"reward": 0.78125, |
|
"reward_std": 0.20286056958138943, |
|
"rewards/judge_tool_use": 0.859375, |
|
"rewards/judge_tool_use/std": 0.23376120440661907, |
|
"rewards/verify_correctness": 0.46875, |
|
"rewards/verify_correctness/std": 0.5143726766109467, |
|
"step": 1190 |
|
}, |
|
{ |
|
"clip_ratio": 6.253908504731953e-05, |
|
"completion_length": 111.41666666666667, |
|
"epoch": 0.7966666666666666, |
|
"grad_norm": 0.03905890135125302, |
|
"kl": 0.242578125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0035, |
|
"reward": 0.9733333190282186, |
|
"reward_std": 0.05249338845411936, |
|
"rewards/judge_tool_use": 0.9770833253860474, |
|
"rewards/judge_tool_use/std": 0.06962200005849202, |
|
"rewards/verify_correctness": 0.9583333333333334, |
|
"rewards/verify_correctness/std": 0.11385501424471538, |
|
"step": 1195 |
|
}, |
|
{ |
|
"clip_ratio": 0.001590106077492237, |
|
"completion_length": 140.0, |
|
"epoch": 0.8, |
|
"grad_norm": 0.40701520002938785, |
|
"kl": 0.385546875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0014, |
|
"reward": 0.9937500059604645, |
|
"reward_std": 0.01767767034471035, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 0.96875, |
|
"rewards/verify_correctness/std": 0.125, |
|
"step": 1200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0020255075418390335, |
|
"completion_length": 88.85416666666667, |
|
"epoch": 0.8033333333333333, |
|
"grad_norm": 0.05096213910492859, |
|
"kl": 6.8375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0026, |
|
"reward": 0.9675000111262003, |
|
"reward_std": 0.06057482430090507, |
|
"rewards/judge_tool_use": 0.9749999841054281, |
|
"rewards/judge_tool_use/std": 0.0746867706378301, |
|
"rewards/verify_correctness": 0.9375, |
|
"rewards/verify_correctness/std": 0.1343709627787272, |
|
"step": 1205 |
|
}, |
|
{ |
|
"clip_ratio": 0.002165872976183891, |
|
"completion_length": 113.46875, |
|
"epoch": 0.8066666666666666, |
|
"grad_norm": 0.038572489497010296, |
|
"kl": 1.9291015625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0044, |
|
"reward": 0.9662500023841858, |
|
"reward_std": 0.08459719270467758, |
|
"rewards/judge_tool_use": 0.965624988079071, |
|
"rewards/judge_tool_use/std": 0.09953015297651291, |
|
"rewards/verify_correctness": 0.96875, |
|
"rewards/verify_correctness/std": 0.125, |
|
"step": 1210 |
|
}, |
|
{ |
|
"clip_ratio": 0.002159976586699486, |
|
"completion_length": 123.64583333333333, |
|
"epoch": 0.81, |
|
"grad_norm": 358.62576549649833, |
|
"kl": 6.189453125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0137, |
|
"reward": 0.9066666563351949, |
|
"reward_std": 0.07383281799654166, |
|
"rewards/judge_tool_use": 0.956250011920929, |
|
"rewards/judge_tool_use/std": 0.09136871124307315, |
|
"rewards/verify_correctness": 0.7083333333333334, |
|
"rewards/verify_correctness/std": 0.4150371154149373, |
|
"step": 1215 |
|
}, |
|
{ |
|
"clip_ratio": 0.0018026274861767887, |
|
"completion_length": 104.34375, |
|
"epoch": 0.8133333333333334, |
|
"grad_norm": 3.067095761854335, |
|
"kl": 1.104296875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0106, |
|
"reward": 0.9012500047683716, |
|
"reward_std": 0.19770433753728867, |
|
"rewards/judge_tool_use": 0.9156250059604645, |
|
"rewards/judge_tool_use/std": 0.22225218266248703, |
|
"rewards/verify_correctness": 0.84375, |
|
"rewards/verify_correctness/std": 0.34860680997371674, |
|
"step": 1220 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 94.08333333333333, |
|
"epoch": 0.8166666666666667, |
|
"grad_norm": 2.5780187257288305, |
|
"kl": 0.45625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0012, |
|
"reward": 0.9458333253860474, |
|
"reward_std": 0.04852588474750519, |
|
"rewards/judge_tool_use": 0.9791666666666666, |
|
"rewards/judge_tool_use/std": 0.06652763485908508, |
|
"rewards/verify_correctness": 0.8125, |
|
"rewards/verify_correctness/std": 0.25546592473983765, |
|
"step": 1225 |
|
}, |
|
{ |
|
"clip_ratio": 0.0011104722507297993, |
|
"completion_length": 113.84375, |
|
"epoch": 0.82, |
|
"grad_norm": 121.8815400237713, |
|
"kl": 11.90546875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0468, |
|
"reward": 0.8700000047683716, |
|
"reward_std": 0.2571648806333542, |
|
"rewards/judge_tool_use": 0.8843749761581421, |
|
"rewards/judge_tool_use/std": 0.25856370478868484, |
|
"rewards/verify_correctness": 0.8125, |
|
"rewards/verify_correctness/std": 0.3943893313407898, |
|
"step": 1230 |
|
}, |
|
{ |
|
"clip_ratio": 0.0026565464213490486, |
|
"completion_length": 93.3125, |
|
"epoch": 0.8233333333333334, |
|
"grad_norm": 0.10321332519745828, |
|
"kl": 0.405078125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0093, |
|
"reward": 0.9666666587193807, |
|
"reward_std": 0.05114200680206219, |
|
"rewards/judge_tool_use": 0.9791666666666666, |
|
"rewards/judge_tool_use/std": 0.0749652733405431, |
|
"rewards/verify_correctness": 0.9166666666666666, |
|
"rewards/verify_correctness/std": 0.1490712066491445, |
|
"step": 1235 |
|
}, |
|
{ |
|
"clip_ratio": 0.0024265490006655454, |
|
"completion_length": 115.21875, |
|
"epoch": 0.8266666666666667, |
|
"grad_norm": 0.23216984855971948, |
|
"kl": 0.3140625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.004, |
|
"reward": 0.8787499964237213, |
|
"reward_std": 0.09615195170044899, |
|
"rewards/judge_tool_use": 0.9500000178813934, |
|
"rewards/judge_tool_use/std": 0.11884498223662376, |
|
"rewards/verify_correctness": 0.59375, |
|
"rewards/verify_correctness/std": 0.4797805994749069, |
|
"step": 1240 |
|
}, |
|
{ |
|
"clip_ratio": 0.0017241379246115685, |
|
"completion_length": 83.3125, |
|
"epoch": 0.83, |
|
"grad_norm": 99.5968858061048, |
|
"kl": 7.05859375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0057, |
|
"reward": 0.8924999833106995, |
|
"reward_std": 0.12001222868760426, |
|
"rewards/judge_tool_use": 0.8916666706403097, |
|
"rewards/judge_tool_use/std": 0.19258573154608408, |
|
"rewards/verify_correctness": 0.8958333333333334, |
|
"rewards/verify_correctness/std": 0.23240453998247781, |
|
"step": 1245 |
|
}, |
|
{ |
|
"clip_ratio": 0.00025445292703807354, |
|
"completion_length": 107.09375, |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 0.03256099733878858, |
|
"kl": 4.037109375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0071, |
|
"reward": 0.9849999845027924, |
|
"reward_std": 0.009258206002414227, |
|
"rewards/judge_tool_use": 0.981249988079071, |
|
"rewards/judge_tool_use/std": 0.025000007823109627, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1250 |
|
}, |
|
{ |
|
"clip_ratio": 0.0017938947305083275, |
|
"completion_length": 94.35416666666667, |
|
"epoch": 0.8366666666666667, |
|
"grad_norm": 0.06251358849592821, |
|
"kl": 0.41484375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0011, |
|
"reward": 0.8608333269755045, |
|
"reward_std": 0.031753117529054485, |
|
"rewards/judge_tool_use": 0.893749992052714, |
|
"rewards/judge_tool_use/std": 0.10712230205535889, |
|
"rewards/verify_correctness": 0.7291666666666666, |
|
"rewards/verify_correctness/std": 0.331703782081604, |
|
"step": 1255 |
|
}, |
|
{ |
|
"clip_ratio": 0.000598287198226899, |
|
"completion_length": 80.0, |
|
"epoch": 0.84, |
|
"grad_norm": 0.05840859691708302, |
|
"kl": 0.53984375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0014, |
|
"reward": 0.9712499976158142, |
|
"reward_std": 0.07356970012187958, |
|
"rewards/judge_tool_use": 0.971875011920929, |
|
"rewards/judge_tool_use/std": 0.09994790703058243, |
|
"rewards/verify_correctness": 0.96875, |
|
"rewards/verify_correctness/std": 0.125, |
|
"step": 1260 |
|
}, |
|
{ |
|
"clip_ratio": 0.0018959624227136373, |
|
"completion_length": 147.64583333333334, |
|
"epoch": 0.8433333333333334, |
|
"grad_norm": 0.03529068907540189, |
|
"kl": 0.275390625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0002, |
|
"reward": 0.949999988079071, |
|
"reward_std": 0.02855063695460558, |
|
"rewards/judge_tool_use": 0.9791666467984518, |
|
"rewards/judge_tool_use/std": 0.038437106957038246, |
|
"rewards/verify_correctness": 0.8333333333333334, |
|
"rewards/verify_correctness/std": 0.25411585966746014, |
|
"step": 1265 |
|
}, |
|
{ |
|
"clip_ratio": 0.0008389623428229243, |
|
"completion_length": 90.15625, |
|
"epoch": 0.8466666666666667, |
|
"grad_norm": 1.2714158856293196, |
|
"kl": 1.3701171875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0127, |
|
"reward": 0.913750022649765, |
|
"reward_std": 0.08004017360508442, |
|
"rewards/judge_tool_use": 0.9625000059604645, |
|
"rewards/judge_tool_use/std": 0.11686970666050911, |
|
"rewards/verify_correctness": 0.71875, |
|
"rewards/verify_correctness/std": 0.38319888710975647, |
|
"step": 1270 |
|
}, |
|
{ |
|
"clip_ratio": 0.0019859145628288387, |
|
"completion_length": 88.08333333333333, |
|
"epoch": 0.85, |
|
"grad_norm": 0.06215294502600355, |
|
"kl": 0.309765625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0012, |
|
"reward": 0.9799999793370565, |
|
"reward_std": 0.012344274669885635, |
|
"rewards/judge_tool_use": 0.9749999841054281, |
|
"rewards/judge_tool_use/std": 0.03333334376414617, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1275 |
|
}, |
|
{ |
|
"clip_ratio": 0.00028956440510228274, |
|
"completion_length": 125.1875, |
|
"epoch": 0.8533333333333334, |
|
"grad_norm": 0.35612576827223785, |
|
"kl": 0.3298828125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0075, |
|
"reward": 0.8712500035762787, |
|
"reward_std": 0.08317051082849503, |
|
"rewards/judge_tool_use": 0.925000011920929, |
|
"rewards/judge_tool_use/std": 0.1485760398209095, |
|
"rewards/verify_correctness": 0.65625, |
|
"rewards/verify_correctness/std": 0.4797805994749069, |
|
"step": 1280 |
|
}, |
|
{ |
|
"clip_ratio": 0.000690448796376586, |
|
"completion_length": 77.375, |
|
"epoch": 0.8566666666666667, |
|
"grad_norm": 0.08452869378749826, |
|
"kl": 2.195703125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0217, |
|
"reward": 0.9083333412806193, |
|
"reward_std": 0.06206287909299135, |
|
"rewards/judge_tool_use": 0.9270833333333334, |
|
"rewards/judge_tool_use/std": 0.13058080275853476, |
|
"rewards/verify_correctness": 0.8333333333333334, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 1285 |
|
}, |
|
{ |
|
"clip_ratio": 0.0005081300623714924, |
|
"completion_length": 94.1875, |
|
"epoch": 0.86, |
|
"grad_norm": 0.1776444100641673, |
|
"kl": 0.2853515625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0007, |
|
"reward": 0.9737499952316284, |
|
"reward_std": 0.02722262777388096, |
|
"rewards/judge_tool_use": 0.9906249940395355, |
|
"rewards/judge_tool_use/std": 0.0201556496322155, |
|
"rewards/verify_correctness": 0.90625, |
|
"rewards/verify_correctness/std": 0.20155644416809082, |
|
"step": 1290 |
|
}, |
|
{ |
|
"clip_ratio": 0.0010418544989079237, |
|
"completion_length": 80.91666666666667, |
|
"epoch": 0.8633333333333333, |
|
"grad_norm": 0.0610474709944994, |
|
"kl": 0.6662109375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0053, |
|
"reward": 0.962499996026357, |
|
"reward_std": 0.055602967428664364, |
|
"rewards/judge_tool_use": 0.96875, |
|
"rewards/judge_tool_use/std": 0.0822451909383138, |
|
"rewards/verify_correctness": 0.9375, |
|
"rewards/verify_correctness/std": 0.1343709627787272, |
|
"step": 1295 |
|
}, |
|
{ |
|
"clip_ratio": 0.0009510869160294533, |
|
"completion_length": 77.75, |
|
"epoch": 0.8666666666666667, |
|
"grad_norm": 0.9107590450435429, |
|
"kl": 0.2291015625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0002, |
|
"reward": 0.9449999630451202, |
|
"reward_std": 0.05099019035696983, |
|
"rewards/judge_tool_use": 0.9312500059604645, |
|
"rewards/judge_tool_use/std": 0.12392698042094707, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1300 |
|
}, |
|
{ |
|
"clip_ratio": 0.0005154639016836882, |
|
"completion_length": 69.60416666666667, |
|
"epoch": 0.87, |
|
"grad_norm": 1.2573438996166055, |
|
"kl": 0.34296875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"reward": 0.9675000111262003, |
|
"reward_std": 0.07550223357975483, |
|
"rewards/judge_tool_use": 0.975000003973643, |
|
"rewards/judge_tool_use/std": 0.08629285047451656, |
|
"rewards/verify_correctness": 0.9375, |
|
"rewards/verify_correctness/std": 0.1343709627787272, |
|
"step": 1305 |
|
}, |
|
{ |
|
"clip_ratio": 0.0013909014873206616, |
|
"completion_length": 92.96875, |
|
"epoch": 0.8733333333333333, |
|
"grad_norm": 0.022891900254080604, |
|
"kl": 0.308984375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0001, |
|
"reward": 0.9925000071525574, |
|
"reward_std": 0.01632927590981126, |
|
"rewards/judge_tool_use": 0.9906249940395355, |
|
"rewards/judge_tool_use/std": 0.029578257352113724, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1310 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 56.0625, |
|
"epoch": 0.8766666666666667, |
|
"grad_norm": 0.6528272207623552, |
|
"kl": 0.2763671875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"reward": 0.9950000047683716, |
|
"reward_std": 0.006900660072763761, |
|
"rewards/judge_tool_use": 0.993749996026357, |
|
"rewards/judge_tool_use/std": 0.013437099754810333, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1315 |
|
}, |
|
{ |
|
"clip_ratio": 0.002574713109061122, |
|
"completion_length": 93.40625, |
|
"epoch": 0.88, |
|
"grad_norm": 2.691428684412783, |
|
"kl": 0.2265625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0205, |
|
"reward": 0.9312499761581421, |
|
"reward_std": 0.09005174040794373, |
|
"rewards/judge_tool_use": 0.921875, |
|
"rewards/judge_tool_use/std": 0.12776117026805878, |
|
"rewards/verify_correctness": 0.96875, |
|
"rewards/verify_correctness/std": 0.125, |
|
"step": 1320 |
|
}, |
|
{ |
|
"clip_ratio": 0.0005747126415371895, |
|
"completion_length": 74.3125, |
|
"epoch": 0.8833333333333333, |
|
"grad_norm": 1.4432230028882391, |
|
"kl": 0.637890625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0009, |
|
"reward": 0.9583333333333334, |
|
"reward_std": 0.006900658831000328, |
|
"rewards/judge_tool_use": 0.9895833333333334, |
|
"rewards/judge_tool_use/std": 0.015957122047742207, |
|
"rewards/verify_correctness": 0.8333333333333334, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 1325 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 70.53125, |
|
"epoch": 0.8866666666666667, |
|
"grad_norm": 0.054306836971010644, |
|
"kl": 0.4212890625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0004, |
|
"reward": 0.9912500083446503, |
|
"reward_std": 0.02474873699247837, |
|
"rewards/judge_tool_use": 0.996874988079071, |
|
"rewards/judge_tool_use/std": 0.012500002980232239, |
|
"rewards/verify_correctness": 0.96875, |
|
"rewards/verify_correctness/std": 0.125, |
|
"step": 1330 |
|
}, |
|
{ |
|
"clip_ratio": 0.0010802339063957333, |
|
"completion_length": 88.89583333333333, |
|
"epoch": 0.89, |
|
"grad_norm": 1.275564433705307, |
|
"kl": 0.525390625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0011, |
|
"reward": 0.7258332967758179, |
|
"reward_std": 0.11929505753020446, |
|
"rewards/judge_tool_use": 0.7874999841054281, |
|
"rewards/judge_tool_use/std": 0.17958564311265945, |
|
"rewards/verify_correctness": 0.4791666666666667, |
|
"rewards/verify_correctness/std": 0.25546592473983765, |
|
"step": 1335 |
|
}, |
|
{ |
|
"clip_ratio": 0.0011502533918246627, |
|
"completion_length": 77.8125, |
|
"epoch": 0.8933333333333333, |
|
"grad_norm": 0.09495443439677693, |
|
"kl": 0.267578125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0008, |
|
"reward": 0.9325000047683716, |
|
"reward_std": 0.024799177423119545, |
|
"rewards/judge_tool_use": 0.9781249761581421, |
|
"rewards/judge_tool_use/std": 0.036371923983097076, |
|
"rewards/verify_correctness": 0.75, |
|
"rewards/verify_correctness/std": 0.25819888710975647, |
|
"step": 1340 |
|
}, |
|
{ |
|
"clip_ratio": 0.0015889291651546955, |
|
"completion_length": 64.45833333333333, |
|
"epoch": 0.8966666666666666, |
|
"grad_norm": 0.14989201200913477, |
|
"kl": 0.2818359375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0004, |
|
"reward": 0.9816666841506958, |
|
"reward_std": 0.028729444990555447, |
|
"rewards/judge_tool_use": 0.987500011920929, |
|
"rewards/judge_tool_use/std": 0.026292627056439716, |
|
"rewards/verify_correctness": 0.9583333333333334, |
|
"rewards/verify_correctness/std": 0.11385501424471538, |
|
"step": 1345 |
|
}, |
|
{ |
|
"clip_ratio": 0.0019876956939697266, |
|
"completion_length": 68.3125, |
|
"epoch": 0.9, |
|
"grad_norm": 0.1295069839584885, |
|
"kl": 0.2017578125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0004, |
|
"reward": 0.9975000023841858, |
|
"reward_std": 0.007071069907397032, |
|
"rewards/judge_tool_use": 0.996874988079071, |
|
"rewards/judge_tool_use/std": 0.012500002980232239, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1350 |
|
}, |
|
{ |
|
"clip_ratio": 0.001906401664018631, |
|
"completion_length": 78.83333333333333, |
|
"epoch": 0.9033333333333333, |
|
"grad_norm": 0.3518791950466458, |
|
"kl": 0.2375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0066, |
|
"reward": 0.8791666626930237, |
|
"reward_std": 0.03298428406318029, |
|
"rewards/judge_tool_use": 0.8854166666666666, |
|
"rewards/judge_tool_use/std": 0.10253725449244182, |
|
"rewards/verify_correctness": 0.8541666666666666, |
|
"rewards/verify_correctness/std": 0.1707825263341268, |
|
"step": 1355 |
|
}, |
|
{ |
|
"clip_ratio": 0.0007567567750811577, |
|
"completion_length": 82.03125, |
|
"epoch": 0.9066666666666666, |
|
"grad_norm": 0.021035024825950537, |
|
"kl": 0.8796875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0294, |
|
"reward": 0.9662500023841858, |
|
"reward_std": 0.07981003820896149, |
|
"rewards/judge_tool_use": 0.965624988079071, |
|
"rewards/judge_tool_use/std": 0.11212901771068573, |
|
"rewards/verify_correctness": 0.96875, |
|
"rewards/verify_correctness/std": 0.125, |
|
"step": 1360 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 60.5625, |
|
"epoch": 0.91, |
|
"grad_norm": 0.05787722591961142, |
|
"kl": 0.2779296875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"reward": 0.8633333245913187, |
|
"reward_std": 0.006172137334942818, |
|
"rewards/judge_tool_use": 0.8708333174387614, |
|
"rewards/judge_tool_use/std": 0.1371594878534476, |
|
"rewards/verify_correctness": 0.8333333333333334, |
|
"rewards/verify_correctness/std": 0.1721325914065043, |
|
"step": 1365 |
|
}, |
|
{ |
|
"clip_ratio": 0.0007437300402671099, |
|
"completion_length": 75.75, |
|
"epoch": 0.9133333333333333, |
|
"grad_norm": 0.26276877013042266, |
|
"kl": 0.25859375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"reward": 0.9850000143051147, |
|
"reward_std": 0.01994866505265236, |
|
"rewards/judge_tool_use": 0.9812500178813934, |
|
"rewards/judge_tool_use/std": 0.039438940584659576, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1370 |
|
}, |
|
{ |
|
"clip_ratio": 0.001213064044713974, |
|
"completion_length": 94.8125, |
|
"epoch": 0.9166666666666666, |
|
"grad_norm": 0.17680777202915993, |
|
"kl": 0.333984375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0012, |
|
"reward": 0.8958333333333334, |
|
"reward_std": 0.06000414118170738, |
|
"rewards/judge_tool_use": 0.9479166467984518, |
|
"rewards/judge_tool_use/std": 0.10853513081868489, |
|
"rewards/verify_correctness": 0.6875, |
|
"rewards/verify_correctness/std": 0.3429151177406311, |
|
"step": 1375 |
|
}, |
|
{ |
|
"clip_ratio": 0.000994318164885044, |
|
"completion_length": 47.0625, |
|
"epoch": 0.92, |
|
"grad_norm": 0.0472238380868765, |
|
"kl": 0.282421875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"reward": 1.0, |
|
"reward_std": 0.0, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1380 |
|
}, |
|
{ |
|
"clip_ratio": 0.0006648936308920383, |
|
"completion_length": 64.58333333333333, |
|
"epoch": 0.9233333333333333, |
|
"grad_norm": 0.03587334591120667, |
|
"kl": 0.17080078125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0013, |
|
"reward": 0.9924999872843424, |
|
"reward_std": 0.012051478028297424, |
|
"rewards/judge_tool_use": 0.9958333373069763, |
|
"rewards/judge_tool_use/std": 0.011385502914587656, |
|
"rewards/verify_correctness": 0.9791666666666666, |
|
"rewards/verify_correctness/std": 0.08333333333333333, |
|
"step": 1385 |
|
}, |
|
{ |
|
"clip_ratio": 0.0025167843326926232, |
|
"completion_length": 76.875, |
|
"epoch": 0.9266666666666666, |
|
"grad_norm": 0.3856157588789181, |
|
"kl": 0.3412109375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0007, |
|
"reward": 0.9900000095367432, |
|
"reward_std": 0.017422059550881386, |
|
"rewards/judge_tool_use": 0.987500011920929, |
|
"rewards/judge_tool_use/std": 0.02236068621277809, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1390 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 65.04166666666667, |
|
"epoch": 0.93, |
|
"grad_norm": 0.1934572773836806, |
|
"kl": 0.3046875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"reward": 1.0, |
|
"reward_std": 0.0, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1395 |
|
}, |
|
{ |
|
"clip_ratio": 0.001198182231746614, |
|
"completion_length": 89.0625, |
|
"epoch": 0.9333333333333333, |
|
"grad_norm": 0.675230815999025, |
|
"kl": 0.3251953125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0002, |
|
"reward": 0.8862500190734863, |
|
"reward_std": 0.035135677084326744, |
|
"rewards/judge_tool_use": 0.9749999940395355, |
|
"rewards/judge_tool_use/std": 0.04409133270382881, |
|
"rewards/verify_correctness": 0.53125, |
|
"rewards/verify_correctness/std": 0.5143726766109467, |
|
"step": 1400 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 78.58333333333333, |
|
"epoch": 0.9366666666666666, |
|
"grad_norm": 0.04618812246067894, |
|
"kl": 0.31796875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"reward": 1.0, |
|
"reward_std": 0.0, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1405 |
|
}, |
|
{ |
|
"clip_ratio": 0.0006472928449511528, |
|
"completion_length": 104.78125, |
|
"epoch": 0.94, |
|
"grad_norm": 0.2027750965971443, |
|
"kl": 0.1837890625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0015, |
|
"reward": 0.9512499868869781, |
|
"reward_std": 0.03955394588410854, |
|
"rewards/judge_tool_use": 0.9937500059604645, |
|
"rewards/judge_tool_use/std": 0.017078254371881485, |
|
"rewards/verify_correctness": 0.78125, |
|
"rewards/verify_correctness/std": 0.375, |
|
"step": 1410 |
|
}, |
|
{ |
|
"clip_ratio": 0.0008242387557402253, |
|
"completion_length": 98.375, |
|
"epoch": 0.9433333333333334, |
|
"grad_norm": 0.08416311072246639, |
|
"kl": 0.3388671875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0008, |
|
"reward": 0.8683333198229471, |
|
"reward_std": 0.14452426508069038, |
|
"rewards/judge_tool_use": 0.918750007947286, |
|
"rewards/judge_tool_use/std": 0.17317061002055803, |
|
"rewards/verify_correctness": 0.6666666666666666, |
|
"rewards/verify_correctness/std": 0.46607474486033124, |
|
"step": 1415 |
|
}, |
|
{ |
|
"clip_ratio": 0.001800605608150363, |
|
"completion_length": 84.84375, |
|
"epoch": 0.9466666666666667, |
|
"grad_norm": 0.06391568829388007, |
|
"kl": 0.3146484375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0092, |
|
"reward": 0.7537499964237213, |
|
"reward_std": 0.2096980162896216, |
|
"rewards/judge_tool_use": 0.809374988079071, |
|
"rewards/judge_tool_use/std": 0.2246776893734932, |
|
"rewards/verify_correctness": 0.53125, |
|
"rewards/verify_correctness/std": 0.5143726766109467, |
|
"step": 1420 |
|
}, |
|
{ |
|
"clip_ratio": 0.00038022813387215135, |
|
"completion_length": 60.083333333333336, |
|
"epoch": 0.95, |
|
"grad_norm": 0.03794355903634634, |
|
"kl": 0.2689453125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.002, |
|
"reward": 0.9783333341280619, |
|
"reward_std": 0.04257579147815704, |
|
"rewards/judge_tool_use": 0.9833333293596903, |
|
"rewards/judge_tool_use/std": 0.0516397754351298, |
|
"rewards/verify_correctness": 0.9583333333333334, |
|
"rewards/verify_correctness/std": 0.11385501424471538, |
|
"step": 1425 |
|
}, |
|
{ |
|
"clip_ratio": 0.0006096576456911862, |
|
"completion_length": 71.125, |
|
"epoch": 0.9533333333333334, |
|
"grad_norm": 0.30796597977550466, |
|
"kl": 0.2615234375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0005, |
|
"reward": 0.9749999940395355, |
|
"reward_std": 0.017422061879187822, |
|
"rewards/judge_tool_use": 0.9687499701976776, |
|
"rewards/judge_tool_use/std": 0.04577303305268288, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1430 |
|
}, |
|
{ |
|
"clip_ratio": 0.0024035727605223657, |
|
"completion_length": 72.47916666666667, |
|
"epoch": 0.9566666666666667, |
|
"grad_norm": 1.22539179930226, |
|
"kl": 0.416015625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0009, |
|
"reward": 0.8508333365122477, |
|
"reward_std": 0.01868577239414056, |
|
"rewards/judge_tool_use": 0.8604166706403097, |
|
"rewards/judge_tool_use/std": 0.1511431708931923, |
|
"rewards/verify_correctness": 0.8125, |
|
"rewards/verify_correctness/std": 0.25546592473983765, |
|
"step": 1435 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 54.0, |
|
"epoch": 0.96, |
|
"grad_norm": 0.0669772138366113, |
|
"kl": 0.2806640625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"reward": 1.0, |
|
"reward_std": 0.0, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1440 |
|
}, |
|
{ |
|
"clip_ratio": 0.0001488095265813172, |
|
"completion_length": 71.04166666666667, |
|
"epoch": 0.9633333333333334, |
|
"grad_norm": 0.03299226460421629, |
|
"kl": 0.237109375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0005, |
|
"reward": 0.9983333349227905, |
|
"reward_std": 0.0047140466049313545, |
|
"rewards/judge_tool_use": 0.9979166587193807, |
|
"rewards/judge_tool_use/std": 0.008333335320154825, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1445 |
|
}, |
|
{ |
|
"clip_ratio": 0.0005509641952812672, |
|
"completion_length": 62.375, |
|
"epoch": 0.9666666666666667, |
|
"grad_norm": 0.169618332875564, |
|
"kl": 0.39140625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.001, |
|
"reward": 0.9362500011920929, |
|
"reward_std": 0.0176776684820652, |
|
"rewards/judge_tool_use": 0.9749999940395355, |
|
"rewards/judge_tool_use/std": 0.025819895789027214, |
|
"rewards/verify_correctness": 0.78125, |
|
"rewards/verify_correctness/std": 0.2561737895011902, |
|
"step": 1450 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 69.625, |
|
"epoch": 0.97, |
|
"grad_norm": 0.11741331911619839, |
|
"kl": 0.2830078125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"reward": 0.9983333349227905, |
|
"reward_std": 0.0047140466049313545, |
|
"rewards/judge_tool_use": 0.9979166587193807, |
|
"rewards/judge_tool_use/std": 0.008333335320154825, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1455 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 63.875, |
|
"epoch": 0.9733333333333334, |
|
"grad_norm": 0.03231880152902591, |
|
"kl": 0.33046875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0007, |
|
"reward": 0.9799999892711639, |
|
"reward_std": 0.0, |
|
"rewards/judge_tool_use": 0.9749999940395355, |
|
"rewards/judge_tool_use/std": 0.025819895789027214, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1460 |
|
}, |
|
{ |
|
"clip_ratio": 0.002763367397710681, |
|
"completion_length": 88.83333333333333, |
|
"epoch": 0.9766666666666667, |
|
"grad_norm": 1.198685689035818, |
|
"kl": 29510.626171875, |
|
"learning_rate": 1e-06, |
|
"loss": 59.2647, |
|
"reward": 0.9008333086967468, |
|
"reward_std": 0.09250006452202797, |
|
"rewards/judge_tool_use": 0.9229166507720947, |
|
"rewards/judge_tool_use/std": 0.12078534811735153, |
|
"rewards/verify_correctness": 0.8125, |
|
"rewards/verify_correctness/std": 0.28463754057884216, |
|
"step": 1465 |
|
}, |
|
{ |
|
"clip_ratio": 0.000631313119083643, |
|
"completion_length": 65.375, |
|
"epoch": 0.98, |
|
"grad_norm": 0.055429824214890476, |
|
"kl": 0.474609375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0002, |
|
"reward": 0.9625000059604645, |
|
"reward_std": 0.02314549870789051, |
|
"rewards/judge_tool_use": 1.0, |
|
"rewards/judge_tool_use/std": 0.0, |
|
"rewards/verify_correctness": 0.8125, |
|
"rewards/verify_correctness/std": 0.25, |
|
"step": 1470 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 88.20833333333333, |
|
"epoch": 0.9833333333333333, |
|
"grad_norm": 0.016207461931570542, |
|
"kl": 0.230078125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0005, |
|
"reward": 0.9849999944368998, |
|
"reward_std": 0.0047140466049313545, |
|
"rewards/judge_tool_use": 0.981249988079071, |
|
"rewards/judge_tool_use/std": 0.025546599179506302, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1475 |
|
}, |
|
{ |
|
"clip_ratio": 0.0016450773924589156, |
|
"completion_length": 79.625, |
|
"epoch": 0.9866666666666667, |
|
"grad_norm": 1.7797350058464707, |
|
"kl": 0.271875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0002, |
|
"reward": 0.887499988079071, |
|
"reward_std": 0.08430386334657669, |
|
"rewards/judge_tool_use": 0.90625, |
|
"rewards/judge_tool_use/std": 0.15478479862213135, |
|
"rewards/verify_correctness": 0.8125, |
|
"rewards/verify_correctness/std": 0.25, |
|
"step": 1480 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.75, |
|
"epoch": 0.99, |
|
"grad_norm": 0.03087383599727828, |
|
"kl": 0.184765625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0004, |
|
"reward": 0.996666669845581, |
|
"reward_std": 0.006172137334942818, |
|
"rewards/judge_tool_use": 0.9958333373069763, |
|
"rewards/judge_tool_use/std": 0.011385502914587656, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1485 |
|
}, |
|
{ |
|
"clip_ratio": 0.001946034946013242, |
|
"completion_length": 74.875, |
|
"epoch": 0.9933333333333333, |
|
"grad_norm": 0.17294764604787113, |
|
"kl": 0.408203125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0, |
|
"reward": 0.9637499749660492, |
|
"reward_std": 0.03092945460230112, |
|
"rewards/judge_tool_use": 0.9624999761581421, |
|
"rewards/judge_tool_use/std": 0.050000015646219254, |
|
"rewards/verify_correctness": 0.96875, |
|
"rewards/verify_correctness/std": 0.125, |
|
"step": 1490 |
|
}, |
|
{ |
|
"clip_ratio": 0.001576576568186283, |
|
"completion_length": 77.97916666666667, |
|
"epoch": 0.9966666666666667, |
|
"grad_norm": 0.26066361310457414, |
|
"kl": 0.3224609375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0011, |
|
"reward": 0.971666673819224, |
|
"reward_std": 0.03252020105719566, |
|
"rewards/judge_tool_use": 0.9854166507720947, |
|
"rewards/judge_tool_use/std": 0.017078255613644917, |
|
"rewards/verify_correctness": 0.9166666666666666, |
|
"rewards/verify_correctness/std": 0.22771002848943075, |
|
"step": 1495 |
|
}, |
|
{ |
|
"clip_ratio": 0.0008512710221111775, |
|
"completion_length": 57.6875, |
|
"epoch": 1.0, |
|
"grad_norm": 0.30129414295649587, |
|
"kl": 0.2619140625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.001, |
|
"reward": 0.9925000071525574, |
|
"reward_std": 0.010350990109145641, |
|
"rewards/judge_tool_use": 0.9906249940395355, |
|
"rewards/judge_tool_use/std": 0.0201556496322155, |
|
"rewards/verify_correctness": 1.0, |
|
"rewards/verify_correctness/std": 0.0, |
|
"step": 1500 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|