|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.005813953488372093, |
|
"eval_steps": 500, |
|
"global_step": 240, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 5501.0, |
|
"completions/max_terminated_length": 5501.0, |
|
"completions/mean_length": 3558.09375, |
|
"completions/mean_terminated_length": 3558.09375, |
|
"completions/min_length": 2215.0, |
|
"completions/min_terminated_length": 2215.0, |
|
"epoch": 2.4224806201550387e-05, |
|
"grad_norm": 0.00640977891147893, |
|
"kl": 0.0007143020629882812, |
|
"learning_rate": 0.0, |
|
"loss": 0.0006, |
|
"num_tokens": 568407.0, |
|
"reward": 0.4926603138446808, |
|
"reward_std": 0.08448069542646408, |
|
"rewards/avg_thinking_length_func": 157.22222900390625, |
|
"rewards/confidence_score_reward_func": 0.7339284420013428, |
|
"rewards/correct_answer_reward_func": 0.640625, |
|
"rewards/efficient_thinking_reward_func": 0.9699548628723149, |
|
"rewards/format_and_efficient_reward_func": 0.5214560031890869, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.318666696548462, |
|
"rewards/tool_execution_reward_func": 1.983011245727539, |
|
"rewards/visit_tool_reward_func": 0.9305298328399658, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 4.8449612403100775e-05, |
|
"grad_norm": 0.0064083920341846115, |
|
"kl": 0.0007143020629882812, |
|
"learning_rate": 6.25e-08, |
|
"loss": 0.0006, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 7.267441860465116e-05, |
|
"grad_norm": 0.006447812260611595, |
|
"kl": 0.0007295608520507812, |
|
"learning_rate": 1.25e-07, |
|
"loss": 0.0006, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 9.689922480620155e-05, |
|
"grad_norm": 0.0066225031847143186, |
|
"kl": 0.0007305145263671875, |
|
"learning_rate": 1.875e-07, |
|
"loss": 0.0006, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7891.0, |
|
"completions/max_terminated_length": 7891.0, |
|
"completions/mean_length": 3465.828125, |
|
"completions/mean_terminated_length": 3465.828125, |
|
"completions/min_length": 1264.0, |
|
"completions/min_terminated_length": 1264.0, |
|
"epoch": 0.00012112403100775194, |
|
"grad_norm": 0.011221982806523546, |
|
"kl": 0.0008029937744140625, |
|
"learning_rate": 2.5e-07, |
|
"loss": 0.0003, |
|
"num_tokens": 1050218.0, |
|
"reward": 0.35228461027145386, |
|
"reward_std": 0.11903564631938934, |
|
"rewards/avg_thinking_length_func": 172.3975830078125, |
|
"rewards/confidence_score_reward_func": 0.7573737502098083, |
|
"rewards/correct_answer_reward_func": 0.453125, |
|
"rewards/efficient_thinking_reward_func": 0.8796035517984737, |
|
"rewards/format_and_efficient_reward_func": 0.3536693751811981, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.489912509918213, |
|
"rewards/tool_execution_reward_func": 1.9884867668151855, |
|
"rewards/visit_tool_reward_func": 0.9384097456932068, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00014534883720930232, |
|
"grad_norm": 0.011369566083514073, |
|
"kl": 0.0008258819580078125, |
|
"learning_rate": 3.1249999999999997e-07, |
|
"loss": 0.0003, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0001695736434108527, |
|
"grad_norm": 0.011325781329231437, |
|
"kl": 0.000820159912109375, |
|
"learning_rate": 3.75e-07, |
|
"loss": 0.0003, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0001937984496124031, |
|
"grad_norm": 0.011468177438620898, |
|
"kl": 0.0008134841918945312, |
|
"learning_rate": 4.375e-07, |
|
"loss": 0.0003, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 9790.0, |
|
"completions/max_terminated_length": 9790.0, |
|
"completions/mean_length": 4101.421875, |
|
"completions/mean_terminated_length": 4101.421875, |
|
"completions/min_length": 1141.0, |
|
"completions/min_terminated_length": 1141.0, |
|
"epoch": 0.00021802325581395349, |
|
"grad_norm": 0.008533015854175789, |
|
"kl": 0.00080108642578125, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0, |
|
"num_tokens": 1636681.0, |
|
"reward": 0.4183655381202698, |
|
"reward_std": 0.0931699275970459, |
|
"rewards/avg_thinking_length_func": 176.92233276367188, |
|
"rewards/confidence_score_reward_func": 0.7306747436523438, |
|
"rewards/correct_answer_reward_func": 0.546875, |
|
"rewards/efficient_thinking_reward_func": 0.8954936332818751, |
|
"rewards/format_and_efficient_reward_func": 0.4208581745624542, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.53083336353302, |
|
"rewards/tool_execution_reward_func": 1.9508955478668213, |
|
"rewards/visit_tool_reward_func": 0.8424738645553589, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00024224806201550387, |
|
"grad_norm": 0.009520985221391949, |
|
"kl": 0.0007925033569335938, |
|
"learning_rate": 5.625e-07, |
|
"loss": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00026647286821705426, |
|
"grad_norm": 0.010085270290120536, |
|
"kl": 0.0011835098266601562, |
|
"learning_rate": 6.249999999999999e-07, |
|
"loss": 0.0, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00029069767441860465, |
|
"grad_norm": 0.008472445513601271, |
|
"kl": 0.0008249282836914062, |
|
"learning_rate": 6.875e-07, |
|
"loss": 0.0, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7445.0, |
|
"completions/max_terminated_length": 7445.0, |
|
"completions/mean_length": 3379.234375, |
|
"completions/mean_terminated_length": 3379.234375, |
|
"completions/min_length": 1491.0, |
|
"completions/min_terminated_length": 1491.0, |
|
"epoch": 0.00031492248062015503, |
|
"grad_norm": 0.01258823765843166, |
|
"kl": 0.0009145736694335938, |
|
"learning_rate": 7.5e-07, |
|
"loss": -0.0001, |
|
"num_tokens": 2110165.0, |
|
"reward": 0.4067286550998688, |
|
"reward_std": 0.18041250109672546, |
|
"rewards/avg_thinking_length_func": 170.76950073242188, |
|
"rewards/confidence_score_reward_func": 0.763248085975647, |
|
"rewards/correct_answer_reward_func": 0.515625, |
|
"rewards/efficient_thinking_reward_func": 0.8802126246942265, |
|
"rewards/format_and_efficient_reward_func": 0.4241780936717987, |
|
"rewards/format_reward_func": 0.99958336353302, |
|
"rewards/num_xml_reward_func": 1.6099066734313965, |
|
"rewards/tool_execution_reward_func": 1.9751970767974854, |
|
"rewards/visit_tool_reward_func": 0.9391972422599792, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0003391472868217054, |
|
"grad_norm": 0.012500551984662189, |
|
"kl": 0.0009927749633789062, |
|
"learning_rate": 8.125e-07, |
|
"loss": -0.0001, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0003633720930232558, |
|
"grad_norm": 0.012416715432446976, |
|
"kl": 0.0010967254638671875, |
|
"learning_rate": 8.75e-07, |
|
"loss": -0.0001, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0003875968992248062, |
|
"grad_norm": 0.01288145978177755, |
|
"kl": 0.001140594482421875, |
|
"learning_rate": 9.374999999999999e-07, |
|
"loss": -0.0001, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 10205.0, |
|
"completions/max_terminated_length": 10205.0, |
|
"completions/mean_length": 4119.96875, |
|
"completions/mean_terminated_length": 4119.96875, |
|
"completions/min_length": 1159.0, |
|
"completions/min_terminated_length": 1159.0, |
|
"epoch": 0.0004118217054263566, |
|
"grad_norm": 0.009407611593031055, |
|
"kl": 0.0011768341064453125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0004, |
|
"num_tokens": 2691117.0, |
|
"reward": 0.4201432466506958, |
|
"reward_std": 0.0907188206911087, |
|
"rewards/avg_thinking_length_func": 171.4025115966797, |
|
"rewards/confidence_score_reward_func": 0.7308521270751953, |
|
"rewards/correct_answer_reward_func": 0.546875, |
|
"rewards/efficient_thinking_reward_func": 0.861229582956026, |
|
"rewards/format_and_efficient_reward_func": 0.37111079692840576, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.3203115463256836, |
|
"rewards/tool_execution_reward_func": 1.9717044830322266, |
|
"rewards/visit_tool_reward_func": 0.8859716653823853, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00043604651162790697, |
|
"grad_norm": 0.009347834781139657, |
|
"kl": 0.00139617919921875, |
|
"learning_rate": 1.0625e-06, |
|
"loss": 0.0004, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00046027131782945736, |
|
"grad_norm": 0.00928664951165006, |
|
"kl": 0.00167083740234375, |
|
"learning_rate": 1.125e-06, |
|
"loss": 0.0004, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00048449612403100775, |
|
"grad_norm": 0.009342230945576057, |
|
"kl": 0.00212860107421875, |
|
"learning_rate": 1.1874999999999999e-06, |
|
"loss": 0.0004, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7792.0, |
|
"completions/max_terminated_length": 7792.0, |
|
"completions/mean_length": 3474.78125, |
|
"completions/mean_terminated_length": 3474.78125, |
|
"completions/min_length": 1307.0, |
|
"completions/min_terminated_length": 1307.0, |
|
"epoch": 0.0005087209302325581, |
|
"grad_norm": 0.010737570621935045, |
|
"kl": 0.002620697021484375, |
|
"learning_rate": 1.2499999999999999e-06, |
|
"loss": -0.0, |
|
"num_tokens": 3182962.0, |
|
"reward": 0.3430381715297699, |
|
"reward_std": 0.15257038176059723, |
|
"rewards/avg_thinking_length_func": 163.67486572265625, |
|
"rewards/confidence_score_reward_func": 0.7590060234069824, |
|
"rewards/correct_answer_reward_func": 0.4375, |
|
"rewards/efficient_thinking_reward_func": 0.9094734969614356, |
|
"rewards/format_and_efficient_reward_func": 0.354397177696228, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.4648277759552002, |
|
"rewards/tool_execution_reward_func": 1.9753289222717285, |
|
"rewards/visit_tool_reward_func": 0.9633350968360901, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0005329457364341085, |
|
"grad_norm": 0.010610611287841326, |
|
"kl": 0.003204345703125, |
|
"learning_rate": 1.3125e-06, |
|
"loss": -0.0, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0005571705426356589, |
|
"grad_norm": 0.010883725821996518, |
|
"kl": 0.003814697265625, |
|
"learning_rate": 1.375e-06, |
|
"loss": -0.0, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0005813953488372093, |
|
"grad_norm": 0.010728950563018041, |
|
"kl": 0.00518798828125, |
|
"learning_rate": 1.4375e-06, |
|
"loss": -0.0, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 9082.0, |
|
"completions/max_terminated_length": 9082.0, |
|
"completions/mean_length": 4205.453125, |
|
"completions/mean_terminated_length": 4205.453125, |
|
"completions/min_length": 1188.0, |
|
"completions/min_terminated_length": 1188.0, |
|
"epoch": 0.0006056201550387597, |
|
"grad_norm": 0.011708703395331976, |
|
"kl": 0.0054779052734375, |
|
"learning_rate": 1.5e-06, |
|
"loss": 0.0016, |
|
"num_tokens": 3788330.0, |
|
"reward": 0.4100201725959778, |
|
"reward_std": 0.12962010502815247, |
|
"rewards/avg_thinking_length_func": 167.64987182617188, |
|
"rewards/confidence_score_reward_func": 0.7269817590713501, |
|
"rewards/correct_answer_reward_func": 0.53125, |
|
"rewards/efficient_thinking_reward_func": 0.894405090660734, |
|
"rewards/format_and_efficient_reward_func": 0.4734077453613281, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.2609543800354004, |
|
"rewards/tool_execution_reward_func": 1.9624817371368408, |
|
"rewards/visit_tool_reward_func": 0.8933978080749512, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0006298449612403101, |
|
"grad_norm": 0.012029441405275343, |
|
"kl": 0.00725555419921875, |
|
"learning_rate": 1.5624999999999999e-06, |
|
"loss": 0.0016, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0006540697674418605, |
|
"grad_norm": 0.011965973272488425, |
|
"kl": 0.010589599609375, |
|
"learning_rate": 1.625e-06, |
|
"loss": 0.0016, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0006782945736434108, |
|
"grad_norm": 0.018054158629818226, |
|
"kl": 0.017059326171875, |
|
"learning_rate": 1.6875e-06, |
|
"loss": 0.0016, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7466.0, |
|
"completions/max_terminated_length": 7466.0, |
|
"completions/mean_length": 3525.1875, |
|
"completions/mean_terminated_length": 3525.1875, |
|
"completions/min_length": 1458.0, |
|
"completions/min_terminated_length": 1458.0, |
|
"epoch": 0.0007025193798449612, |
|
"grad_norm": 0.011398719184495674, |
|
"kl": 0.013671875, |
|
"learning_rate": 1.75e-06, |
|
"loss": 0.0001, |
|
"num_tokens": 4289196.0, |
|
"reward": 0.3574071526527405, |
|
"reward_std": 0.09749965369701385, |
|
"rewards/avg_thinking_length_func": 163.65969848632812, |
|
"rewards/confidence_score_reward_func": 0.7581030130386353, |
|
"rewards/correct_answer_reward_func": 0.453125, |
|
"rewards/efficient_thinking_reward_func": 0.9089163330381327, |
|
"rewards/format_and_efficient_reward_func": 0.3653510808944702, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.4366014003753662, |
|
"rewards/tool_execution_reward_func": 1.9675538539886475, |
|
"rewards/visit_tool_reward_func": 0.960380494594574, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0007267441860465116, |
|
"grad_norm": 0.010833682609318612, |
|
"kl": 0.015838623046875, |
|
"learning_rate": 1.8125e-06, |
|
"loss": 0.0001, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.000750968992248062, |
|
"grad_norm": 0.0231097533212703, |
|
"kl": 0.022918701171875, |
|
"learning_rate": 1.8749999999999998e-06, |
|
"loss": 0.0001, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0007751937984496124, |
|
"grad_norm": 0.011257228008738334, |
|
"kl": 0.021575927734375, |
|
"learning_rate": 1.9375e-06, |
|
"loss": 0.0001, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 8888.0, |
|
"completions/max_terminated_length": 8888.0, |
|
"completions/mean_length": 3804.265625, |
|
"completions/mean_terminated_length": 3804.265625, |
|
"completions/min_length": 1067.0, |
|
"completions/min_terminated_length": 1067.0, |
|
"epoch": 0.0007994186046511628, |
|
"grad_norm": 0.02927961829217277, |
|
"kl": 0.030914306640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0005, |
|
"num_tokens": 4848644.0, |
|
"reward": 0.46360254287719727, |
|
"reward_std": 0.10140425711870193, |
|
"rewards/avg_thinking_length_func": 168.85345458984375, |
|
"rewards/confidence_score_reward_func": 0.7187485694885254, |
|
"rewards/correct_answer_reward_func": 0.609375, |
|
"rewards/efficient_thinking_reward_func": 0.8848315117904739, |
|
"rewards/format_and_efficient_reward_func": 0.46383440494537354, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.3748114109039307, |
|
"rewards/tool_execution_reward_func": 1.9836355447769165, |
|
"rewards/visit_tool_reward_func": 0.8981889486312866, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0008236434108527132, |
|
"grad_norm": 0.00984263633299767, |
|
"kl": 0.026763916015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0005, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0008478682170542636, |
|
"grad_norm": 0.022916321346866338, |
|
"kl": 0.03643798828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0005, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0008720930232558139, |
|
"grad_norm": 0.010968578899761567, |
|
"kl": 0.03680419921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0005, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7119.0, |
|
"completions/max_terminated_length": 7119.0, |
|
"completions/mean_length": 3045.9375, |
|
"completions/mean_terminated_length": 3045.9375, |
|
"completions/min_length": 1306.0, |
|
"completions/min_terminated_length": 1306.0, |
|
"epoch": 0.0008963178294573643, |
|
"grad_norm": 0.11556192957878203, |
|
"kl": 0.066314697265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0011, |
|
"num_tokens": 5286042.0, |
|
"reward": 0.38059696555137634, |
|
"reward_std": 0.20472648739814758, |
|
"rewards/avg_thinking_length_func": 171.9969024658203, |
|
"rewards/confidence_score_reward_func": 0.7361885905265808, |
|
"rewards/correct_answer_reward_func": 0.5, |
|
"rewards/efficient_thinking_reward_func": 0.8792661921309781, |
|
"rewards/format_and_efficient_reward_func": 0.4069961905479431, |
|
"rewards/format_reward_func": 0.9985389709472656, |
|
"rewards/num_xml_reward_func": 1.7584354877471924, |
|
"rewards/tool_execution_reward_func": 1.9876766204833984, |
|
"rewards/visit_tool_reward_func": 0.926859438419342, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0009205426356589147, |
|
"grad_norm": 0.013991455687567742, |
|
"kl": 0.034393310546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.001, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0009447674418604651, |
|
"grad_norm": 0.01433251116157902, |
|
"kl": 0.0352783203125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.001, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0009689922480620155, |
|
"grad_norm": 0.01682769595676241, |
|
"kl": 0.0426025390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.001, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 6696.0, |
|
"completions/max_terminated_length": 6696.0, |
|
"completions/mean_length": 3054.78125, |
|
"completions/mean_terminated_length": 3054.78125, |
|
"completions/min_length": 734.0, |
|
"completions/min_terminated_length": 734.0, |
|
"epoch": 0.0009932170542635659, |
|
"grad_norm": 0.034243564194623544, |
|
"kl": 0.0538330078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0003, |
|
"num_tokens": 5728827.0, |
|
"reward": 0.5321023464202881, |
|
"reward_std": 0.07992984354496002, |
|
"rewards/avg_thinking_length_func": 185.18777465820312, |
|
"rewards/confidence_score_reward_func": 0.699253261089325, |
|
"rewards/correct_answer_reward_func": 0.734375, |
|
"rewards/efficient_thinking_reward_func": 0.8423659179880447, |
|
"rewards/format_and_efficient_reward_func": 0.5654621124267578, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.836081624031067, |
|
"rewards/tool_execution_reward_func": 1.9795209169387817, |
|
"rewards/visit_tool_reward_func": 0.8331901431083679, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0010174418604651163, |
|
"grad_norm": 0.008357434600682397, |
|
"kl": 0.0467529296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0003, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0010416666666666667, |
|
"grad_norm": 0.009143109288946688, |
|
"kl": 0.05499267578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0003, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.001065891472868217, |
|
"grad_norm": 0.018383062802239766, |
|
"kl": 0.07135009765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0003, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 5912.0, |
|
"completions/max_terminated_length": 5912.0, |
|
"completions/mean_length": 2513.34375, |
|
"completions/mean_terminated_length": 2513.34375, |
|
"completions/min_length": 1085.0, |
|
"completions/min_terminated_length": 1085.0, |
|
"epoch": 0.0010901162790697674, |
|
"grad_norm": 1.2449457515517797, |
|
"kl": 0.5546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0011, |
|
"num_tokens": 6121552.0, |
|
"reward": 0.41406646370887756, |
|
"reward_std": 0.1448429971933365, |
|
"rewards/avg_thinking_length_func": 159.43849182128906, |
|
"rewards/confidence_score_reward_func": 0.7091017961502075, |
|
"rewards/correct_answer_reward_func": 0.5625, |
|
"rewards/efficient_thinking_reward_func": 0.9100999417514477, |
|
"rewards/format_and_efficient_reward_func": 0.40307265520095825, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.7179009914398193, |
|
"rewards/tool_execution_reward_func": 1.9982638359069824, |
|
"rewards/visit_tool_reward_func": 0.8534926772117615, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0011143410852713178, |
|
"grad_norm": 0.04725193167363872, |
|
"kl": 0.0830078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0007, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0011385658914728682, |
|
"grad_norm": 0.01076799271094143, |
|
"kl": 0.0728759765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0007, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0011627906976744186, |
|
"grad_norm": 1.2844930338395364, |
|
"kl": 0.594970703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0012, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 6441.0, |
|
"completions/max_terminated_length": 6441.0, |
|
"completions/mean_length": 2998.25, |
|
"completions/mean_terminated_length": 2998.25, |
|
"completions/min_length": 811.0, |
|
"completions/min_terminated_length": 811.0, |
|
"epoch": 0.001187015503875969, |
|
"grad_norm": 0.07335407885154412, |
|
"kl": 0.0946044921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0003, |
|
"num_tokens": 6632198.0, |
|
"reward": 0.4027416408061981, |
|
"reward_std": 0.18368688225746155, |
|
"rewards/avg_thinking_length_func": 144.1616668701172, |
|
"rewards/confidence_score_reward_func": 0.6523082852363586, |
|
"rewards/correct_answer_reward_func": 0.578125, |
|
"rewards/efficient_thinking_reward_func": 0.8715830269761213, |
|
"rewards/format_and_efficient_reward_func": 0.30888205766677856, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.2804265022277832, |
|
"rewards/tool_execution_reward_func": 1.9967105388641357, |
|
"rewards/visit_tool_reward_func": 0.777007520198822, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0012112403100775194, |
|
"grad_norm": 3382.951158532336, |
|
"kl": 386.0513916015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1955, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0012354651162790698, |
|
"grad_norm": 0.04763618115574692, |
|
"kl": 0.111083984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0003, |
|
"step": 51 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0012596899224806201, |
|
"grad_norm": 0.011361146003702229, |
|
"kl": 0.0693359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 5699.0, |
|
"completions/max_terminated_length": 5699.0, |
|
"completions/mean_length": 2702.609375, |
|
"completions/mean_terminated_length": 2702.609375, |
|
"completions/min_length": 929.0, |
|
"completions/min_terminated_length": 929.0, |
|
"epoch": 0.0012839147286821705, |
|
"grad_norm": 641.0279979496779, |
|
"kl": 340.21875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.3029, |
|
"num_tokens": 7071302.0, |
|
"reward": 0.38491296768188477, |
|
"reward_std": 0.20615670084953308, |
|
"rewards/avg_thinking_length_func": 144.03466796875, |
|
"rewards/confidence_score_reward_func": 0.6775128841400146, |
|
"rewards/correct_answer_reward_func": 0.546875, |
|
"rewards/efficient_thinking_reward_func": 0.8956235775099276, |
|
"rewards/format_and_efficient_reward_func": 0.298817902803421, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.156145691871643, |
|
"rewards/tool_execution_reward_func": 2.0, |
|
"rewards/visit_tool_reward_func": 0.8991793990135193, |
|
"step": 53 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.001308139534883721, |
|
"grad_norm": 10.07283016494114, |
|
"kl": 6.52294921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0054, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0013323643410852713, |
|
"grad_norm": 0.024178305719161252, |
|
"kl": 0.1031494140625, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0001, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0013565891472868217, |
|
"grad_norm": 0.010123659301215143, |
|
"kl": 0.0853271484375, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0001, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7974.0, |
|
"completions/max_terminated_length": 7974.0, |
|
"completions/mean_length": 3418.6875, |
|
"completions/mean_terminated_length": 3418.6875, |
|
"completions/min_length": 1001.0, |
|
"completions/min_terminated_length": 1001.0, |
|
"epoch": 0.001380813953488372, |
|
"grad_norm": 0.011918704634373778, |
|
"kl": 0.0814208984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0006, |
|
"num_tokens": 7618083.0, |
|
"reward": 0.33670923113822937, |
|
"reward_std": 0.2170744389295578, |
|
"rewards/avg_thinking_length_func": 162.87310791015625, |
|
"rewards/confidence_score_reward_func": 0.6380267143249512, |
|
"rewards/correct_answer_reward_func": 0.484375, |
|
"rewards/efficient_thinking_reward_func": 0.8769457565983968, |
|
"rewards/format_and_efficient_reward_func": 0.15387150645256042, |
|
"rewards/format_reward_func": 0.9937499761581421, |
|
"rewards/num_xml_reward_func": 0.7425504326820374, |
|
"rewards/tool_execution_reward_func": 1.984375, |
|
"rewards/visit_tool_reward_func": 0.7900611162185669, |
|
"step": 57 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0014050387596899225, |
|
"grad_norm": 0.012656826141930118, |
|
"kl": 0.0870361328125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0006, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0014292635658914728, |
|
"grad_norm": 0.01963879028272825, |
|
"kl": 0.102783203125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0006, |
|
"step": 59 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0014534883720930232, |
|
"grad_norm": 0.023803010795812877, |
|
"kl": 0.111328125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0006, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 6602.0, |
|
"completions/max_terminated_length": 6602.0, |
|
"completions/mean_length": 2937.375, |
|
"completions/mean_terminated_length": 2937.375, |
|
"completions/min_length": 1124.0, |
|
"completions/min_terminated_length": 1124.0, |
|
"epoch": 0.0014777131782945736, |
|
"grad_norm": 0.6399010033168665, |
|
"kl": 0.328857421875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0008, |
|
"num_tokens": 8081395.0, |
|
"reward": 0.41028502583503723, |
|
"reward_std": 0.1911381632089615, |
|
"rewards/avg_thinking_length_func": 154.1159210205078, |
|
"rewards/confidence_score_reward_func": 0.6654285192489624, |
|
"rewards/correct_answer_reward_func": 0.59375, |
|
"rewards/efficient_thinking_reward_func": 0.8800399071963756, |
|
"rewards/format_and_efficient_reward_func": 0.1847984343767166, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 0.921923816204071, |
|
"rewards/tool_execution_reward_func": 1.9983552694320679, |
|
"rewards/visit_tool_reward_func": 0.883500337600708, |
|
"step": 61 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.001501937984496124, |
|
"grad_norm": 0.011193139638749735, |
|
"kl": 0.091552734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0006, |
|
"step": 62 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0015261627906976744, |
|
"grad_norm": 0.010209194017758182, |
|
"kl": 0.086181640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0006, |
|
"step": 63 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0015503875968992248, |
|
"grad_norm": 0.14653936372168078, |
|
"kl": 0.1170654296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0006, |
|
"step": 64 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 8850.0, |
|
"completions/max_terminated_length": 8850.0, |
|
"completions/mean_length": 3542.96875, |
|
"completions/mean_terminated_length": 3542.96875, |
|
"completions/min_length": 880.0, |
|
"completions/min_terminated_length": 880.0, |
|
"epoch": 0.0015746124031007752, |
|
"grad_norm": 0.48126334443141955, |
|
"kl": 0.248046875, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0001, |
|
"num_tokens": 8636201.0, |
|
"reward": 0.39273786544799805, |
|
"reward_std": 0.12296080589294434, |
|
"rewards/avg_thinking_length_func": 150.4586639404297, |
|
"rewards/confidence_score_reward_func": 0.6261853575706482, |
|
"rewards/correct_answer_reward_func": 0.578125, |
|
"rewards/efficient_thinking_reward_func": 0.8429494490638886, |
|
"rewards/format_and_efficient_reward_func": 0.26941436529159546, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 0.9826317429542542, |
|
"rewards/tool_execution_reward_func": 1.9983552694320679, |
|
"rewards/visit_tool_reward_func": 0.8202804327011108, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0015988372093023256, |
|
"grad_norm": 0.011505230665385266, |
|
"kl": 0.087646484375, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0003, |
|
"step": 66 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.001623062015503876, |
|
"grad_norm": 0.011219221768431348, |
|
"kl": 0.0863037109375, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0003, |
|
"step": 67 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0016472868217054263, |
|
"grad_norm": 0.013493117517357446, |
|
"kl": 0.0845947265625, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0003, |
|
"step": 68 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 6164.0, |
|
"completions/max_terminated_length": 6164.0, |
|
"completions/mean_length": 3045.984375, |
|
"completions/mean_terminated_length": 3045.984375, |
|
"completions/min_length": 853.0, |
|
"completions/min_terminated_length": 853.0, |
|
"epoch": 0.0016715116279069767, |
|
"grad_norm": 0.015474671509878156, |
|
"kl": 0.085205078125, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0001, |
|
"num_tokens": 9080324.0, |
|
"reward": 0.3584170639514923, |
|
"reward_std": 0.2464786320924759, |
|
"rewards/avg_thinking_length_func": 171.05947875976562, |
|
"rewards/confidence_score_reward_func": 0.6698201298713684, |
|
"rewards/correct_answer_reward_func": 0.515625, |
|
"rewards/efficient_thinking_reward_func": 0.9022617067768229, |
|
"rewards/format_and_efficient_reward_func": 0.18420693278312683, |
|
"rewards/format_reward_func": 0.999218761920929, |
|
"rewards/num_xml_reward_func": 0.9476650953292847, |
|
"rewards/tool_execution_reward_func": 1.9967105388641357, |
|
"rewards/visit_tool_reward_func": 0.922633707523346, |
|
"step": 69 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.001695736434108527, |
|
"grad_norm": 0.01314183789857302, |
|
"kl": 0.082275390625, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0001, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0017199612403100775, |
|
"grad_norm": 0.012255008742171034, |
|
"kl": 0.0802001953125, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0001, |
|
"step": 71 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0017441860465116279, |
|
"grad_norm": 0.016022338448163764, |
|
"kl": 0.0791015625, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0001, |
|
"step": 72 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 9687.0, |
|
"completions/max_terminated_length": 9687.0, |
|
"completions/mean_length": 4153.765625, |
|
"completions/mean_terminated_length": 4153.765625, |
|
"completions/min_length": 1035.0, |
|
"completions/min_terminated_length": 1035.0, |
|
"epoch": 0.0017684108527131783, |
|
"grad_norm": 0.009771623312563241, |
|
"kl": 0.07470703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0005, |
|
"num_tokens": 9647713.0, |
|
"reward": 0.39447835087776184, |
|
"reward_std": 0.1022053211927414, |
|
"rewards/avg_thinking_length_func": 180.9823455810547, |
|
"rewards/confidence_score_reward_func": 0.6325613260269165, |
|
"rewards/correct_answer_reward_func": 0.578125, |
|
"rewards/efficient_thinking_reward_func": 0.8102246632773766, |
|
"rewards/format_and_efficient_reward_func": 0.31101614236831665, |
|
"rewards/format_reward_func": 0.9996874928474426, |
|
"rewards/num_xml_reward_func": 1.1014292240142822, |
|
"rewards/tool_execution_reward_func": 1.9983552694320679, |
|
"rewards/visit_tool_reward_func": 0.9176727533340454, |
|
"step": 73 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0017926356589147287, |
|
"grad_norm": 0.009518866209493148, |
|
"kl": 0.0743408203125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0005, |
|
"step": 74 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.001816860465116279, |
|
"grad_norm": 0.01107061263145856, |
|
"kl": 0.074462890625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0005, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0018410852713178294, |
|
"grad_norm": 0.010455700609646703, |
|
"kl": 0.0758056640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0005, |
|
"step": 76 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.015625, |
|
"completions/max_length": 6094.0, |
|
"completions/max_terminated_length": 6094.0, |
|
"completions/mean_length": 3365.65625, |
|
"completions/mean_terminated_length": 3386.5714285714284, |
|
"completions/min_length": 1457.0, |
|
"completions/min_terminated_length": 1457.0, |
|
"epoch": 0.0018653100775193798, |
|
"grad_norm": 0.010697094262847633, |
|
"kl": 0.07177734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"num_tokens": 10096064.0, |
|
"reward": 0.4402870833873749, |
|
"reward_std": 0.17748701572418213, |
|
"rewards/avg_thinking_length_func": 184.61854553222656, |
|
"rewards/confidence_score_reward_func": 0.6924824714660645, |
|
"rewards/correct_answer_reward_func": 0.625, |
|
"rewards/efficient_thinking_reward_func": 0.8674089768653666, |
|
"rewards/format_and_efficient_reward_func": 0.46700799465179443, |
|
"rewards/format_reward_func": 0.9821969866752625, |
|
"rewards/num_xml_reward_func": 1.4879558086395264, |
|
"rewards/tool_execution_reward_func": 1.9514802694320679, |
|
"rewards/visit_tool_reward_func": 0.9262524843215942, |
|
"step": 77 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0018895348837209302, |
|
"grad_norm": 0.010757613261067228, |
|
"kl": 0.0716552734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 78 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0019137596899224806, |
|
"grad_norm": 0.010687573666984099, |
|
"kl": 0.0711669921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 79 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.001937984496124031, |
|
"grad_norm": 0.010774872814522038, |
|
"kl": 0.07177734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 9500.0, |
|
"completions/max_terminated_length": 9500.0, |
|
"completions/mean_length": 4230.6875, |
|
"completions/mean_terminated_length": 4230.6875, |
|
"completions/min_length": 1095.0, |
|
"completions/min_terminated_length": 1095.0, |
|
"epoch": 0.0019622093023255816, |
|
"grad_norm": 46.52685366161902, |
|
"kl": 28.5504150390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0212, |
|
"num_tokens": 10633304.0, |
|
"reward": 0.4479905962944031, |
|
"reward_std": 0.11886347830295563, |
|
"rewards/avg_thinking_length_func": 196.62542724609375, |
|
"rewards/confidence_score_reward_func": 0.6686310768127441, |
|
"rewards/correct_answer_reward_func": 0.625, |
|
"rewards/efficient_thinking_reward_func": 0.8074578120916676, |
|
"rewards/format_and_efficient_reward_func": 0.4098377823829651, |
|
"rewards/format_reward_func": 0.9993749856948853, |
|
"rewards/num_xml_reward_func": 1.3076300621032715, |
|
"rewards/tool_execution_reward_func": 1.9934210777282715, |
|
"rewards/visit_tool_reward_func": 0.9281606674194336, |
|
"step": 81 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0019864341085271318, |
|
"grad_norm": 0.011024444662647613, |
|
"kl": 0.0682373046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0007, |
|
"step": 82 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0020106589147286824, |
|
"grad_norm": 0.0110905273039609, |
|
"kl": 0.0682373046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0007, |
|
"step": 83 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0020348837209302325, |
|
"grad_norm": 0.011161056303561772, |
|
"kl": 0.068359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0007, |
|
"step": 84 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7332.0, |
|
"completions/max_terminated_length": 7332.0, |
|
"completions/mean_length": 3184.28125, |
|
"completions/mean_terminated_length": 3184.28125, |
|
"completions/min_length": 1380.0, |
|
"completions/min_terminated_length": 1380.0, |
|
"epoch": 0.002059108527131783, |
|
"grad_norm": 0.007262566160814956, |
|
"kl": 0.0670166015625, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0, |
|
"num_tokens": 11072119.0, |
|
"reward": 0.48964226245880127, |
|
"reward_std": 0.09526845812797546, |
|
"rewards/avg_thinking_length_func": 183.27981567382812, |
|
"rewards/confidence_score_reward_func": 0.7107405066490173, |
|
"rewards/correct_answer_reward_func": 0.671875, |
|
"rewards/efficient_thinking_reward_func": 0.8552614079949872, |
|
"rewards/format_and_efficient_reward_func": 0.509292721748352, |
|
"rewards/format_reward_func": 0.9996874928474426, |
|
"rewards/num_xml_reward_func": 1.630164384841919, |
|
"rewards/tool_execution_reward_func": 1.9862616062164307, |
|
"rewards/visit_tool_reward_func": 0.9241018295288086, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0020833333333333333, |
|
"grad_norm": 0.007239493682926299, |
|
"kl": 0.0675048828125, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0, |
|
"step": 86 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.002107558139534884, |
|
"grad_norm": 0.007565680492649283, |
|
"kl": 0.06787109375, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0, |
|
"step": 87 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.002131782945736434, |
|
"grad_norm": 0.007407335837345995, |
|
"kl": 0.0682373046875, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0, |
|
"step": 88 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 9640.0, |
|
"completions/max_terminated_length": 9640.0, |
|
"completions/mean_length": 3956.140625, |
|
"completions/mean_terminated_length": 3956.140625, |
|
"completions/min_length": 987.0, |
|
"completions/min_terminated_length": 987.0, |
|
"epoch": 0.0021560077519379847, |
|
"grad_norm": 0.009630461090198177, |
|
"kl": 0.06591796875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0004, |
|
"num_tokens": 11589191.0, |
|
"reward": 0.4685676693916321, |
|
"reward_std": 0.08529931306838989, |
|
"rewards/avg_thinking_length_func": 185.34999084472656, |
|
"rewards/confidence_score_reward_func": 0.673518717288971, |
|
"rewards/correct_answer_reward_func": 0.65625, |
|
"rewards/efficient_thinking_reward_func": 0.8117772322905137, |
|
"rewards/format_and_efficient_reward_func": 0.4981999397277832, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.5498807430267334, |
|
"rewards/tool_execution_reward_func": 1.9884867668151855, |
|
"rewards/visit_tool_reward_func": 0.9419025182723999, |
|
"step": 89 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.002180232558139535, |
|
"grad_norm": 0.010035272389521673, |
|
"kl": 0.0660400390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0004, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0022044573643410855, |
|
"grad_norm": 0.009886020878154878, |
|
"kl": 0.0653076171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0004, |
|
"step": 91 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0022286821705426356, |
|
"grad_norm": 0.010179048111382292, |
|
"kl": 0.0648193359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0004, |
|
"step": 92 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 5889.0, |
|
"completions/max_terminated_length": 5889.0, |
|
"completions/mean_length": 3288.046875, |
|
"completions/mean_terminated_length": 3288.046875, |
|
"completions/min_length": 1106.0, |
|
"completions/min_terminated_length": 1106.0, |
|
"epoch": 0.0022529069767441862, |
|
"grad_norm": 0.36462018525457834, |
|
"kl": 0.1248779296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.001, |
|
"num_tokens": 12047117.0, |
|
"reward": 0.5035778284072876, |
|
"reward_std": 0.09110674262046814, |
|
"rewards/avg_thinking_length_func": 180.05084228515625, |
|
"rewards/confidence_score_reward_func": 0.7095786333084106, |
|
"rewards/correct_answer_reward_func": 0.6875, |
|
"rewards/efficient_thinking_reward_func": 0.865053232533276, |
|
"rewards/format_and_efficient_reward_func": 0.5739701986312866, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.6645023822784424, |
|
"rewards/tool_execution_reward_func": 1.9736841917037964, |
|
"rewards/visit_tool_reward_func": 0.9475066065788269, |
|
"step": 93 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0022771317829457364, |
|
"grad_norm": 0.010049427947341465, |
|
"kl": 0.0684814453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.001, |
|
"step": 94 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.002301356589147287, |
|
"grad_norm": 0.008406367137924373, |
|
"kl": 0.067138671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.001, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.002325581395348837, |
|
"grad_norm": 0.008646991679074768, |
|
"kl": 0.0682373046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.001, |
|
"step": 96 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 9717.0, |
|
"completions/max_terminated_length": 9717.0, |
|
"completions/mean_length": 4042.09375, |
|
"completions/mean_terminated_length": 4042.09375, |
|
"completions/min_length": 926.0, |
|
"completions/min_terminated_length": 926.0, |
|
"epoch": 0.0023498062015503878, |
|
"grad_norm": 0.36433347439984676, |
|
"kl": 0.266357421875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0004, |
|
"num_tokens": 12594675.0, |
|
"reward": 0.4354441165924072, |
|
"reward_std": 0.10702547430992126, |
|
"rewards/avg_thinking_length_func": 178.31576538085938, |
|
"rewards/confidence_score_reward_func": 0.6778514385223389, |
|
"rewards/correct_answer_reward_func": 0.59375, |
|
"rewards/efficient_thinking_reward_func": 0.8262231594607177, |
|
"rewards/format_and_efficient_reward_func": 0.4731639623641968, |
|
"rewards/format_reward_func": 0.9996874928474426, |
|
"rewards/num_xml_reward_func": 1.5230944156646729, |
|
"rewards/tool_execution_reward_func": 1.977658987045288, |
|
"rewards/visit_tool_reward_func": 0.90561443567276, |
|
"step": 97 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.002374031007751938, |
|
"grad_norm": 0.017653090743062046, |
|
"kl": 0.07763671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 98 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0023982558139534886, |
|
"grad_norm": 0.009650143183516308, |
|
"kl": 0.066650390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 99 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0024224806201550387, |
|
"grad_norm": 0.009666383934140476, |
|
"kl": 0.066650390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7009.0, |
|
"completions/max_terminated_length": 7009.0, |
|
"completions/mean_length": 3569.1875, |
|
"completions/mean_terminated_length": 3569.1875, |
|
"completions/min_length": 1350.0, |
|
"completions/min_terminated_length": 1350.0, |
|
"epoch": 0.0024467054263565893, |
|
"grad_norm": 0.012628187028225836, |
|
"kl": 0.0160369873046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0013, |
|
"num_tokens": 13095521.0, |
|
"reward": 0.4694232642650604, |
|
"reward_std": 0.11920525133609772, |
|
"rewards/avg_thinking_length_func": 166.68763732910156, |
|
"rewards/confidence_score_reward_func": 0.693173885345459, |
|
"rewards/correct_answer_reward_func": 0.640625, |
|
"rewards/efficient_thinking_reward_func": 0.8890269113384983, |
|
"rewards/format_and_efficient_reward_func": 0.52373868227005, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.4931187629699707, |
|
"rewards/tool_execution_reward_func": 1.9407894611358643, |
|
"rewards/visit_tool_reward_func": 0.9543420076370239, |
|
"step": 101 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0024709302325581395, |
|
"grad_norm": 0.013764666926511201, |
|
"kl": 0.016693115234375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0013, |
|
"step": 102 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00249515503875969, |
|
"grad_norm": 0.015582325932853322, |
|
"kl": 0.017730712890625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0013, |
|
"step": 103 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0025193798449612403, |
|
"grad_norm": 0.017864538067072777, |
|
"kl": 0.01995849609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0013, |
|
"step": 104 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 11718.0, |
|
"completions/max_terminated_length": 11718.0, |
|
"completions/mean_length": 4337.8125, |
|
"completions/mean_terminated_length": 4337.8125, |
|
"completions/min_length": 1402.0, |
|
"completions/min_terminated_length": 1402.0, |
|
"epoch": 0.002543604651162791, |
|
"grad_norm": 0.011715145428905095, |
|
"kl": 0.023681640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0003, |
|
"num_tokens": 13691037.0, |
|
"reward": 0.4581317901611328, |
|
"reward_std": 0.07780471444129944, |
|
"rewards/avg_thinking_length_func": 141.15011596679688, |
|
"rewards/confidence_score_reward_func": 0.6525664925575256, |
|
"rewards/correct_answer_reward_func": 0.65625, |
|
"rewards/efficient_thinking_reward_func": 0.7593332235923487, |
|
"rewards/format_and_efficient_reward_func": 0.45769202709198, |
|
"rewards/format_reward_func": 0.9993749856948853, |
|
"rewards/num_xml_reward_func": 1.3809731006622314, |
|
"rewards/tool_execution_reward_func": 1.9640991687774658, |
|
"rewards/visit_tool_reward_func": 0.9199192523956299, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.002567829457364341, |
|
"grad_norm": 0.012478280222631418, |
|
"kl": 0.03009033203125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0003, |
|
"step": 106 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0025920542635658917, |
|
"grad_norm": 0.013305867700430574, |
|
"kl": 0.0390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0004, |
|
"step": 107 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.002616279069767442, |
|
"grad_norm": 0.0183428412461533, |
|
"kl": 0.0509033203125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0004, |
|
"step": 108 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 6386.0, |
|
"completions/max_terminated_length": 6386.0, |
|
"completions/mean_length": 3297.5625, |
|
"completions/mean_terminated_length": 3297.5625, |
|
"completions/min_length": 1296.0, |
|
"completions/min_terminated_length": 1296.0, |
|
"epoch": 0.0026405038759689924, |
|
"grad_norm": 0.02337332236690807, |
|
"kl": 0.0616455078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.001, |
|
"num_tokens": 14184466.0, |
|
"reward": 0.40722835063934326, |
|
"reward_std": 0.14360609650611877, |
|
"rewards/avg_thinking_length_func": 138.28097534179688, |
|
"rewards/confidence_score_reward_func": 0.644202470779419, |
|
"rewards/correct_answer_reward_func": 0.59375, |
|
"rewards/efficient_thinking_reward_func": 0.7607926960767375, |
|
"rewards/format_and_efficient_reward_func": 0.46497124433517456, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.4057281017303467, |
|
"rewards/tool_execution_reward_func": 1.9434621334075928, |
|
"rewards/visit_tool_reward_func": 0.9184768199920654, |
|
"step": 109 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0026647286821705426, |
|
"grad_norm": 0.012698799447402773, |
|
"kl": 0.06781005859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.001, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.002688953488372093, |
|
"grad_norm": 0.012619226324675306, |
|
"kl": 0.0758056640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.001, |
|
"step": 111 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0027131782945736434, |
|
"grad_norm": 0.013347372933753418, |
|
"kl": 0.0892333984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.001, |
|
"step": 112 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 10254.0, |
|
"completions/max_terminated_length": 10254.0, |
|
"completions/mean_length": 4017.296875, |
|
"completions/mean_terminated_length": 4017.296875, |
|
"completions/min_length": 1163.0, |
|
"completions/min_terminated_length": 1163.0, |
|
"epoch": 0.002737403100775194, |
|
"grad_norm": 0.8482856229199331, |
|
"kl": 0.163818359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0003, |
|
"num_tokens": 14783302.0, |
|
"reward": 0.3793744742870331, |
|
"reward_std": 0.08317889273166656, |
|
"rewards/avg_thinking_length_func": 96.98873901367188, |
|
"rewards/confidence_score_reward_func": 0.5890461206436157, |
|
"rewards/correct_answer_reward_func": 0.578125, |
|
"rewards/efficient_thinking_reward_func": 0.4956153760102844, |
|
"rewards/format_and_efficient_reward_func": 0.3040567636489868, |
|
"rewards/format_reward_func": 0.991857647895813, |
|
"rewards/num_xml_reward_func": 0.9565892815589905, |
|
"rewards/tool_execution_reward_func": 1.883992075920105, |
|
"rewards/visit_tool_reward_func": 0.6309776306152344, |
|
"step": 113 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.002761627906976744, |
|
"grad_norm": 2.892668930951565, |
|
"kl": 0.87744140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.002, |
|
"step": 114 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0027858527131782948, |
|
"grad_norm": 0.11540075032392746, |
|
"kl": 0.258544921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0005, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.002810077519379845, |
|
"grad_norm": 0.03602102455529362, |
|
"kl": 0.205078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0003, |
|
"step": 116 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7956.0, |
|
"completions/max_terminated_length": 7956.0, |
|
"completions/mean_length": 3060.15625, |
|
"completions/mean_terminated_length": 3060.15625, |
|
"completions/min_length": 1178.0, |
|
"completions/min_terminated_length": 1178.0, |
|
"epoch": 0.0028343023255813955, |
|
"grad_norm": 0.011947818843430099, |
|
"kl": 0.1031494140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0013, |
|
"num_tokens": 15239738.0, |
|
"reward": 0.4257793724536896, |
|
"reward_std": 0.15445315837860107, |
|
"rewards/avg_thinking_length_func": 111.71697235107422, |
|
"rewards/confidence_score_reward_func": 0.6188951730728149, |
|
"rewards/correct_answer_reward_func": 0.671875, |
|
"rewards/efficient_thinking_reward_func": 0.7151743089595498, |
|
"rewards/format_and_efficient_reward_func": 0.3122476637363434, |
|
"rewards/format_reward_func": 0.9918689727783203, |
|
"rewards/num_xml_reward_func": 1.2823729515075684, |
|
"rewards/tool_execution_reward_func": 1.9500064849853516, |
|
"rewards/visit_tool_reward_func": 0.8597963452339172, |
|
"step": 117 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0028585271317829457, |
|
"grad_norm": 0.01184782503909529, |
|
"kl": 0.0999755859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0013, |
|
"step": 118 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0028827519379844963, |
|
"grad_norm": 0.01222442223239816, |
|
"kl": 0.099365234375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0013, |
|
"step": 119 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0029069767441860465, |
|
"grad_norm": 0.01288408566646706, |
|
"kl": 0.1002197265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0013, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 11579.0, |
|
"completions/max_terminated_length": 11579.0, |
|
"completions/mean_length": 3778.484375, |
|
"completions/mean_terminated_length": 3778.484375, |
|
"completions/min_length": 857.0, |
|
"completions/min_terminated_length": 857.0, |
|
"epoch": 0.002931201550387597, |
|
"grad_norm": 0.013127560986285324, |
|
"kl": 0.170654296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0006, |
|
"num_tokens": 15802870.0, |
|
"reward": 0.35960614681243896, |
|
"reward_std": 0.09336411207914352, |
|
"rewards/avg_thinking_length_func": 120.11376953125, |
|
"rewards/confidence_score_reward_func": 0.5505574941635132, |
|
"rewards/correct_answer_reward_func": 0.609375, |
|
"rewards/efficient_thinking_reward_func": 0.5848998658707487, |
|
"rewards/format_and_efficient_reward_func": 0.09069697558879852, |
|
"rewards/format_reward_func": 0.9635053873062134, |
|
"rewards/num_xml_reward_func": 0.6183948516845703, |
|
"rewards/tool_execution_reward_func": 1.921267032623291, |
|
"rewards/visit_tool_reward_func": 0.408791184425354, |
|
"step": 121 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0029554263565891472, |
|
"grad_norm": 0.011473925349363189, |
|
"kl": 0.173828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0006, |
|
"step": 122 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.002979651162790698, |
|
"grad_norm": 0.010667583254555548, |
|
"kl": 0.1767578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0006, |
|
"step": 123 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.003003875968992248, |
|
"grad_norm": 0.010839236682357098, |
|
"kl": 0.1826171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0006, |
|
"step": 124 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7543.0, |
|
"completions/max_terminated_length": 7543.0, |
|
"completions/mean_length": 3495.625, |
|
"completions/mean_terminated_length": 3495.625, |
|
"completions/min_length": 1272.0, |
|
"completions/min_terminated_length": 1272.0, |
|
"epoch": 0.0030281007751937986, |
|
"grad_norm": 0.014134486127382969, |
|
"kl": 0.136474609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0011, |
|
"num_tokens": 16267847.0, |
|
"reward": 0.40116244554519653, |
|
"reward_std": 0.11558952927589417, |
|
"rewards/avg_thinking_length_func": 171.4405975341797, |
|
"rewards/confidence_score_reward_func": 0.592523455619812, |
|
"rewards/correct_answer_reward_func": 0.65625, |
|
"rewards/efficient_thinking_reward_func": 0.78887382548876, |
|
"rewards/format_and_efficient_reward_func": -0.007415967993438244, |
|
"rewards/format_reward_func": 0.9569429159164429, |
|
"rewards/num_xml_reward_func": 0.533742368221283, |
|
"rewards/tool_execution_reward_func": 1.984920620918274, |
|
"rewards/visit_tool_reward_func": 0.8972762823104858, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.003052325581395349, |
|
"grad_norm": 0.01438304535919498, |
|
"kl": 0.140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0011, |
|
"step": 126 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0030765503875968994, |
|
"grad_norm": 0.014656756114246808, |
|
"kl": 0.14794921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0011, |
|
"step": 127 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0031007751937984496, |
|
"grad_norm": 0.015042904271731165, |
|
"kl": 0.15869140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0012, |
|
"step": 128 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 12404.0, |
|
"completions/max_terminated_length": 12404.0, |
|
"completions/mean_length": 4594.015625, |
|
"completions/mean_terminated_length": 4594.015625, |
|
"completions/min_length": 1214.0, |
|
"completions/min_terminated_length": 1214.0, |
|
"epoch": 0.003125, |
|
"grad_norm": 0.01590013423348445, |
|
"kl": 0.26904296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0009, |
|
"num_tokens": 16831957.0, |
|
"reward": 0.3612688183784485, |
|
"reward_std": 0.08134222030639648, |
|
"rewards/avg_thinking_length_func": 189.8800048828125, |
|
"rewards/confidence_score_reward_func": 0.5268421173095703, |
|
"rewards/correct_answer_reward_func": 0.625, |
|
"rewards/efficient_thinking_reward_func": 0.6692969275756135, |
|
"rewards/format_and_efficient_reward_func": -0.032691895961761475, |
|
"rewards/format_reward_func": 0.9466335773468018, |
|
"rewards/num_xml_reward_func": 0.4149753153324127, |
|
"rewards/tool_execution_reward_func": 1.9272011518478394, |
|
"rewards/visit_tool_reward_func": 0.7673778533935547, |
|
"step": 129 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0031492248062015503, |
|
"grad_norm": 0.01646874780720208, |
|
"kl": 0.29443359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0009, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.003173449612403101, |
|
"grad_norm": 0.01694506623714648, |
|
"kl": 0.314453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0009, |
|
"step": 131 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.003197674418604651, |
|
"grad_norm": 0.016867539615718644, |
|
"kl": 0.3271484375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.001, |
|
"step": 132 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 6944.0, |
|
"completions/max_terminated_length": 6944.0, |
|
"completions/mean_length": 2774.265625, |
|
"completions/mean_terminated_length": 2774.265625, |
|
"completions/min_length": 1204.0, |
|
"completions/min_terminated_length": 1204.0, |
|
"epoch": 0.0032218992248062017, |
|
"grad_norm": 0.022617229528507702, |
|
"kl": 0.26953125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"num_tokens": 17249404.0, |
|
"reward": 0.20413580536842346, |
|
"reward_std": 0.05481432378292084, |
|
"rewards/avg_thinking_length_func": 129.03866577148438, |
|
"rewards/confidence_score_reward_func": 0.49319422245025635, |
|
"rewards/correct_answer_reward_func": 0.34375, |
|
"rewards/efficient_thinking_reward_func": 0.7432039407243382, |
|
"rewards/format_and_efficient_reward_func": 0.17171993851661682, |
|
"rewards/format_reward_func": 0.9746097326278687, |
|
"rewards/num_xml_reward_func": 0.8615504503250122, |
|
"rewards/tool_execution_reward_func": 1.9303656816482544, |
|
"rewards/visit_tool_reward_func": 0.9013795852661133, |
|
"step": 133 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.003246124031007752, |
|
"grad_norm": 0.020143739711233816, |
|
"kl": 0.252685546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 134 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0032703488372093025, |
|
"grad_norm": 0.01785809415589292, |
|
"kl": 0.227294921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0032945736434108527, |
|
"grad_norm": 0.015380281270199666, |
|
"kl": 0.199462890625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 136 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7166.0, |
|
"completions/max_terminated_length": 7166.0, |
|
"completions/mean_length": 3239.453125, |
|
"completions/mean_terminated_length": 3239.453125, |
|
"completions/min_length": 1458.0, |
|
"completions/min_terminated_length": 1458.0, |
|
"epoch": 0.0033187984496124033, |
|
"grad_norm": 0.012800365899215092, |
|
"kl": 0.138427734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0004, |
|
"num_tokens": 17686794.0, |
|
"reward": 0.3108579218387604, |
|
"reward_std": 0.13888844847679138, |
|
"rewards/avg_thinking_length_func": 171.369384765625, |
|
"rewards/confidence_score_reward_func": 0.5435695648193359, |
|
"rewards/correct_answer_reward_func": 0.515625, |
|
"rewards/efficient_thinking_reward_func": 0.802592893497664, |
|
"rewards/format_and_efficient_reward_func": 0.2916308343410492, |
|
"rewards/format_reward_func": 0.9913173913955688, |
|
"rewards/num_xml_reward_func": 1.4043910503387451, |
|
"rewards/tool_execution_reward_func": 1.8357443809509277, |
|
"rewards/visit_tool_reward_func": 0.8753163814544678, |
|
"step": 137 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0033430232558139534, |
|
"grad_norm": 0.014285839855776115, |
|
"kl": 0.1318359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0004, |
|
"step": 138 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.003367248062015504, |
|
"grad_norm": 0.015433812962718682, |
|
"kl": 0.128173828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0004, |
|
"step": 139 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.003391472868217054, |
|
"grad_norm": 0.015720560114809618, |
|
"kl": 0.1229248046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0004, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 5424.0, |
|
"completions/max_terminated_length": 5424.0, |
|
"completions/mean_length": 3209.875, |
|
"completions/mean_terminated_length": 3209.875, |
|
"completions/min_length": 1301.0, |
|
"completions/min_terminated_length": 1301.0, |
|
"epoch": 0.003415697674418605, |
|
"grad_norm": 0.009160832793565006, |
|
"kl": 0.089599609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"num_tokens": 18156998.0, |
|
"reward": 0.2771710753440857, |
|
"reward_std": 0.10209451615810394, |
|
"rewards/avg_thinking_length_func": 144.3570556640625, |
|
"rewards/confidence_score_reward_func": 0.5883906483650208, |
|
"rewards/correct_answer_reward_func": 0.421875, |
|
"rewards/efficient_thinking_reward_func": 0.9227171305298694, |
|
"rewards/format_and_efficient_reward_func": 0.303905189037323, |
|
"rewards/format_reward_func": 0.9965387582778931, |
|
"rewards/num_xml_reward_func": 1.6496015787124634, |
|
"rewards/tool_execution_reward_func": 1.9101753234863281, |
|
"rewards/visit_tool_reward_func": 1.0097795724868774, |
|
"step": 141 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.003439922480620155, |
|
"grad_norm": 0.009348804877622782, |
|
"kl": 0.0853271484375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 142 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0034641472868217056, |
|
"grad_norm": 0.009332442022472659, |
|
"kl": 0.080322265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 143 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0034883720930232558, |
|
"grad_norm": 0.009512893821144673, |
|
"kl": 0.0767822265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 144 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7366.0, |
|
"completions/max_terminated_length": 7366.0, |
|
"completions/mean_length": 4435.890625, |
|
"completions/mean_terminated_length": 4435.890625, |
|
"completions/min_length": 1397.0, |
|
"completions/min_terminated_length": 1397.0, |
|
"epoch": 0.0035125968992248064, |
|
"grad_norm": 0.013329760690267301, |
|
"kl": 0.055419921875, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0002, |
|
"num_tokens": 18712707.0, |
|
"reward": 0.4343380331993103, |
|
"reward_std": 0.1319217085838318, |
|
"rewards/avg_thinking_length_func": 213.60223388671875, |
|
"rewards/confidence_score_reward_func": 0.6497268080711365, |
|
"rewards/correct_answer_reward_func": 0.625, |
|
"rewards/efficient_thinking_reward_func": 0.8139017177985812, |
|
"rewards/format_and_efficient_reward_func": 0.4802235960960388, |
|
"rewards/format_reward_func": 0.9989955425262451, |
|
"rewards/num_xml_reward_func": 1.751387119293213, |
|
"rewards/tool_execution_reward_func": 1.9038957357406616, |
|
"rewards/visit_tool_reward_func": 0.9324563145637512, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0035368217054263565, |
|
"grad_norm": 0.013975012703647748, |
|
"kl": 0.0540771484375, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0002, |
|
"step": 146 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.003561046511627907, |
|
"grad_norm": 0.014076489547319788, |
|
"kl": 0.0531005859375, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0002, |
|
"step": 147 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0035852713178294573, |
|
"grad_norm": 0.014165636449546886, |
|
"kl": 0.0531005859375, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0002, |
|
"step": 148 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 5721.0, |
|
"completions/max_terminated_length": 5721.0, |
|
"completions/mean_length": 3476.890625, |
|
"completions/mean_terminated_length": 3476.890625, |
|
"completions/min_length": 1375.0, |
|
"completions/min_terminated_length": 1375.0, |
|
"epoch": 0.003609496124031008, |
|
"grad_norm": 0.007532410662647794, |
|
"kl": 0.06439208984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"num_tokens": 19224629.0, |
|
"reward": 0.3097182512283325, |
|
"reward_std": 0.06608685851097107, |
|
"rewards/avg_thinking_length_func": 155.74346923828125, |
|
"rewards/confidence_score_reward_func": 0.6070291996002197, |
|
"rewards/correct_answer_reward_func": 0.453125, |
|
"rewards/efficient_thinking_reward_func": 0.9227627606272979, |
|
"rewards/format_and_efficient_reward_func": 0.3381012976169586, |
|
"rewards/format_reward_func": 0.9996874928474426, |
|
"rewards/num_xml_reward_func": 1.6836090087890625, |
|
"rewards/tool_execution_reward_func": 1.8510758876800537, |
|
"rewards/visit_tool_reward_func": 0.8944061994552612, |
|
"step": 149 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.003633720930232558, |
|
"grad_norm": 0.007379430347015788, |
|
"kl": 0.0645751953125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0036579457364341087, |
|
"grad_norm": 0.008138518366845196, |
|
"kl": 0.0657958984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 151 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.003682170542635659, |
|
"grad_norm": 0.008284296957527382, |
|
"kl": 0.0673828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 152 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 9618.0, |
|
"completions/max_terminated_length": 9618.0, |
|
"completions/mean_length": 4875.078125, |
|
"completions/mean_terminated_length": 4875.078125, |
|
"completions/min_length": 1847.0, |
|
"completions/min_terminated_length": 1847.0, |
|
"epoch": 0.0037063953488372095, |
|
"grad_norm": 0.014704297852595168, |
|
"kl": 0.05523681640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0015, |
|
"num_tokens": 19820623.0, |
|
"reward": 0.428906112909317, |
|
"reward_std": 0.16942133009433746, |
|
"rewards/avg_thinking_length_func": 210.6763916015625, |
|
"rewards/confidence_score_reward_func": 0.6548709869384766, |
|
"rewards/correct_answer_reward_func": 0.609375, |
|
"rewards/efficient_thinking_reward_func": 0.7212743512877299, |
|
"rewards/format_and_efficient_reward_func": 0.4301028251647949, |
|
"rewards/format_reward_func": 0.9975892305374146, |
|
"rewards/num_xml_reward_func": 1.4759665727615356, |
|
"rewards/tool_execution_reward_func": 1.8980989456176758, |
|
"rewards/visit_tool_reward_func": 0.9375091791152954, |
|
"step": 153 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0037306201550387596, |
|
"grad_norm": 0.015023473705486283, |
|
"kl": 0.0567626953125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0015, |
|
"step": 154 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0037548449612403102, |
|
"grad_norm": 0.015217500076281755, |
|
"kl": 0.05841064453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0015, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0037790697674418604, |
|
"grad_norm": 0.016114636489248848, |
|
"kl": 0.0614013671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0015, |
|
"step": 156 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 6823.0, |
|
"completions/max_terminated_length": 6823.0, |
|
"completions/mean_length": 4384.296875, |
|
"completions/mean_terminated_length": 4384.296875, |
|
"completions/min_length": 1697.0, |
|
"completions/min_terminated_length": 1697.0, |
|
"epoch": 0.003803294573643411, |
|
"grad_norm": 0.006588369322686691, |
|
"kl": 0.0643310546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0005, |
|
"num_tokens": 20425222.0, |
|
"reward": 0.34698012471199036, |
|
"reward_std": 0.03517330437898636, |
|
"rewards/avg_thinking_length_func": 178.83392333984375, |
|
"rewards/confidence_score_reward_func": 0.6313294172286987, |
|
"rewards/correct_answer_reward_func": 0.484375, |
|
"rewards/efficient_thinking_reward_func": 0.8650427095882729, |
|
"rewards/format_and_efficient_reward_func": 0.37807154655456543, |
|
"rewards/format_reward_func": 0.9995312690734863, |
|
"rewards/num_xml_reward_func": 1.323744297027588, |
|
"rewards/tool_execution_reward_func": 1.96144700050354, |
|
"rewards/visit_tool_reward_func": 0.9631377458572388, |
|
"step": 157 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.003827519379844961, |
|
"grad_norm": 0.006972139333963718, |
|
"kl": 0.0670166015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0005, |
|
"step": 158 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.003851744186046512, |
|
"grad_norm": 0.0071318562836598836, |
|
"kl": 0.06884765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0005, |
|
"step": 159 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.003875968992248062, |
|
"grad_norm": 0.007113091376284595, |
|
"kl": 0.06982421875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0005, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 11894.0, |
|
"completions/max_terminated_length": 11894.0, |
|
"completions/mean_length": 5685.0625, |
|
"completions/mean_terminated_length": 5685.0625, |
|
"completions/min_length": 1886.0, |
|
"completions/min_terminated_length": 1886.0, |
|
"epoch": 0.0039001937984496126, |
|
"grad_norm": 0.01558937344658329, |
|
"kl": 0.06414794921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0018, |
|
"num_tokens": 21086786.0, |
|
"reward": 0.4025996923446655, |
|
"reward_std": 0.13449470698833466, |
|
"rewards/avg_thinking_length_func": 254.32508850097656, |
|
"rewards/confidence_score_reward_func": 0.6495309472084045, |
|
"rewards/correct_answer_reward_func": 0.578125, |
|
"rewards/efficient_thinking_reward_func": 0.6637161596148502, |
|
"rewards/format_and_efficient_reward_func": 0.458422988653183, |
|
"rewards/format_reward_func": 0.9998437166213989, |
|
"rewards/num_xml_reward_func": 1.5073208808898926, |
|
"rewards/tool_execution_reward_func": 1.9572367668151855, |
|
"rewards/visit_tool_reward_func": 0.9573923349380493, |
|
"step": 161 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.003924418604651163, |
|
"grad_norm": 0.016638056430155885, |
|
"kl": 0.064453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0018, |
|
"step": 162 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.003948643410852713, |
|
"grad_norm": 0.01813854752521658, |
|
"kl": 0.06536865234375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0018, |
|
"step": 163 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0039728682170542635, |
|
"grad_norm": 0.01938490985845502, |
|
"kl": 0.06988525390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0019, |
|
"step": 164 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 9060.0, |
|
"completions/max_terminated_length": 9060.0, |
|
"completions/mean_length": 4403.21875, |
|
"completions/mean_terminated_length": 4403.21875, |
|
"completions/min_length": 1390.0, |
|
"completions/min_terminated_length": 1390.0, |
|
"epoch": 0.003997093023255814, |
|
"grad_norm": 0.005449273513524992, |
|
"kl": 0.0662841796875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"num_tokens": 21662894.0, |
|
"reward": 0.35001087188720703, |
|
"reward_std": 0.009927155449986458, |
|
"rewards/avg_thinking_length_func": 188.15765380859375, |
|
"rewards/confidence_score_reward_func": 0.6182008981704712, |
|
"rewards/correct_answer_reward_func": 0.5, |
|
"rewards/efficient_thinking_reward_func": 0.8001981107519069, |
|
"rewards/format_and_efficient_reward_func": 0.36673688888549805, |
|
"rewards/format_reward_func": 0.9998437166213989, |
|
"rewards/num_xml_reward_func": 1.4394086599349976, |
|
"rewards/tool_execution_reward_func": 1.993227481842041, |
|
"rewards/visit_tool_reward_func": 0.936252236366272, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004021317829457365, |
|
"grad_norm": 0.00568312787735846, |
|
"kl": 0.068115234375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 166 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0040455426356589145, |
|
"grad_norm": 0.005806971085578714, |
|
"kl": 0.069580078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 167 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004069767441860465, |
|
"grad_norm": 0.00592190722180043, |
|
"kl": 0.070556640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 168 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 9992.0, |
|
"completions/max_terminated_length": 9992.0, |
|
"completions/mean_length": 5377.4375, |
|
"completions/mean_terminated_length": 5377.4375, |
|
"completions/min_length": 1809.0, |
|
"completions/min_terminated_length": 1809.0, |
|
"epoch": 0.004093992248062016, |
|
"grad_norm": 0.359099649617951, |
|
"kl": 0.1207275390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0029, |
|
"num_tokens": 22286431.0, |
|
"reward": 0.40037134289741516, |
|
"reward_std": 0.12838459014892578, |
|
"rewards/avg_thinking_length_func": 245.9459228515625, |
|
"rewards/confidence_score_reward_func": 0.6141020059585571, |
|
"rewards/correct_answer_reward_func": 0.609375, |
|
"rewards/efficient_thinking_reward_func": 0.6361426555187852, |
|
"rewards/format_and_efficient_reward_func": 0.45017051696777344, |
|
"rewards/format_reward_func": 0.9981250166893005, |
|
"rewards/num_xml_reward_func": 1.532149076461792, |
|
"rewards/tool_execution_reward_func": 1.9983552694320679, |
|
"rewards/visit_tool_reward_func": 0.9713033437728882, |
|
"step": 169 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004118217054263566, |
|
"grad_norm": 0.0312847460920415, |
|
"kl": 0.08642578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0028, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004142441860465116, |
|
"grad_norm": 0.5587996108011728, |
|
"kl": 0.2386474609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.003, |
|
"step": 171 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004166666666666667, |
|
"grad_norm": 0.03228792794627183, |
|
"kl": 0.092529296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0028, |
|
"step": 172 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7094.0, |
|
"completions/max_terminated_length": 7094.0, |
|
"completions/mean_length": 4163.640625, |
|
"completions/mean_terminated_length": 4163.640625, |
|
"completions/min_length": 1385.0, |
|
"completions/min_terminated_length": 1385.0, |
|
"epoch": 0.004190891472868217, |
|
"grad_norm": 0.008141555436627606, |
|
"kl": 0.1014404296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0007, |
|
"num_tokens": 22851695.0, |
|
"reward": 0.31291523575782776, |
|
"reward_std": 0.0387241393327713, |
|
"rewards/avg_thinking_length_func": 150.9978485107422, |
|
"rewards/confidence_score_reward_func": 0.5685818195343018, |
|
"rewards/correct_answer_reward_func": 0.46875, |
|
"rewards/efficient_thinking_reward_func": 0.8065696148258371, |
|
"rewards/format_and_efficient_reward_func": 0.30031993985176086, |
|
"rewards/format_reward_func": 0.9996874928474426, |
|
"rewards/num_xml_reward_func": 1.2274867296218872, |
|
"rewards/tool_execution_reward_func": 1.9928336143493652, |
|
"rewards/visit_tool_reward_func": 0.9787203073501587, |
|
"step": 173 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004215116279069768, |
|
"grad_norm": 0.008733677069632446, |
|
"kl": 0.1131591796875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0007, |
|
"step": 174 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0042393410852713176, |
|
"grad_norm": 0.009638540295346257, |
|
"kl": 0.12744140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0007, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004263565891472868, |
|
"grad_norm": 0.010992556993855552, |
|
"kl": 0.142822265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0007, |
|
"step": 176 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 8227.0, |
|
"completions/max_terminated_length": 8227.0, |
|
"completions/mean_length": 4541.953125, |
|
"completions/mean_terminated_length": 4541.953125, |
|
"completions/min_length": 1507.0, |
|
"completions/min_terminated_length": 1507.0, |
|
"epoch": 0.004287790697674419, |
|
"grad_norm": 0.1409188461026278, |
|
"kl": 0.265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0037, |
|
"num_tokens": 23436250.0, |
|
"reward": 0.3243735730648041, |
|
"reward_std": 0.15356436371803284, |
|
"rewards/avg_thinking_length_func": 171.99826049804688, |
|
"rewards/confidence_score_reward_func": 0.5453901290893555, |
|
"rewards/correct_answer_reward_func": 0.53125, |
|
"rewards/efficient_thinking_reward_func": 0.6924963364887087, |
|
"rewards/format_and_efficient_reward_func": 0.3312879800796509, |
|
"rewards/format_reward_func": 0.998577356338501, |
|
"rewards/num_xml_reward_func": 1.3812510967254639, |
|
"rewards/tool_execution_reward_func": 1.9967105388641357, |
|
"rewards/visit_tool_reward_func": 0.9554424285888672, |
|
"step": 177 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004312015503875969, |
|
"grad_norm": 0.05228415250398885, |
|
"kl": 0.201904296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0037, |
|
"step": 178 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004336240310077519, |
|
"grad_norm": 0.060068767522700996, |
|
"kl": 0.2451171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0037, |
|
"step": 179 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00436046511627907, |
|
"grad_norm": 0.2730620784971272, |
|
"kl": 0.4716796875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0041, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 8120.0, |
|
"completions/max_terminated_length": 8120.0, |
|
"completions/mean_length": 3361.40625, |
|
"completions/mean_terminated_length": 3361.40625, |
|
"completions/min_length": 1075.0, |
|
"completions/min_terminated_length": 1075.0, |
|
"epoch": 0.00438468992248062, |
|
"grad_norm": 0.05853393969832367, |
|
"kl": 0.5302734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0007, |
|
"num_tokens": 23987107.0, |
|
"reward": 0.24436859786510468, |
|
"reward_std": 0.04949303716421127, |
|
"rewards/avg_thinking_length_func": 81.72256469726562, |
|
"rewards/confidence_score_reward_func": 0.45580577850341797, |
|
"rewards/correct_answer_reward_func": 0.453125, |
|
"rewards/efficient_thinking_reward_func": 0.573834842856046, |
|
"rewards/format_and_efficient_reward_func": 0.22879377007484436, |
|
"rewards/format_reward_func": 0.995830774307251, |
|
"rewards/num_xml_reward_func": 1.104771614074707, |
|
"rewards/tool_execution_reward_func": 1.9899488687515259, |
|
"rewards/visit_tool_reward_func": 0.8998211622238159, |
|
"step": 181 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004408914728682171, |
|
"grad_norm": 0.07516497327438276, |
|
"kl": 0.6845703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0009, |
|
"step": 182 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004433139534883721, |
|
"grad_norm": 0.05997132496622212, |
|
"kl": 0.626953125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0008, |
|
"step": 183 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004457364341085271, |
|
"grad_norm": 0.037671767248184135, |
|
"kl": 0.48681640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0007, |
|
"step": 184 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 8178.0, |
|
"completions/max_terminated_length": 8178.0, |
|
"completions/mean_length": 3659.640625, |
|
"completions/mean_terminated_length": 3659.640625, |
|
"completions/min_length": 896.0, |
|
"completions/min_terminated_length": 896.0, |
|
"epoch": 0.004481589147286822, |
|
"grad_norm": 0.016069114631093232, |
|
"kl": 0.34326171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0009, |
|
"num_tokens": 24496297.0, |
|
"reward": 0.2305455505847931, |
|
"reward_std": 0.06948232650756836, |
|
"rewards/avg_thinking_length_func": 111.37628936767578, |
|
"rewards/confidence_score_reward_func": 0.37327370047569275, |
|
"rewards/correct_answer_reward_func": 0.515625, |
|
"rewards/efficient_thinking_reward_func": 0.48277143466617184, |
|
"rewards/format_and_efficient_reward_func": 0.1522754281759262, |
|
"rewards/format_reward_func": 0.9647905230522156, |
|
"rewards/num_xml_reward_func": 0.8915370106697083, |
|
"rewards/tool_execution_reward_func": 1.9581143856048584, |
|
"rewards/visit_tool_reward_func": 0.5689894556999207, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0045058139534883725, |
|
"grad_norm": 0.014918137972398021, |
|
"kl": 0.31640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0008, |
|
"step": 186 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004530038759689922, |
|
"grad_norm": 0.014560290660972823, |
|
"kl": 0.2958984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0008, |
|
"step": 187 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004554263565891473, |
|
"grad_norm": 0.014191965162457063, |
|
"kl": 0.27880859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0008, |
|
"step": 188 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 11027.0, |
|
"completions/max_terminated_length": 11027.0, |
|
"completions/mean_length": 2904.71875, |
|
"completions/mean_terminated_length": 2904.71875, |
|
"completions/min_length": 912.0, |
|
"completions/min_terminated_length": 912.0, |
|
"epoch": 0.004578488372093023, |
|
"grad_norm": 0.02465674538761865, |
|
"kl": 0.27099609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"num_tokens": 24960803.0, |
|
"reward": 0.22261814773082733, |
|
"reward_std": 0.04196429252624512, |
|
"rewards/avg_thinking_length_func": 79.28602600097656, |
|
"rewards/confidence_score_reward_func": 0.40539172291755676, |
|
"rewards/correct_answer_reward_func": 0.46875, |
|
"rewards/efficient_thinking_reward_func": 0.4911669222941917, |
|
"rewards/format_and_efficient_reward_func": 0.14570605754852295, |
|
"rewards/format_reward_func": 0.9741340279579163, |
|
"rewards/num_xml_reward_func": 0.884125292301178, |
|
"rewards/tool_execution_reward_func": 1.9560561180114746, |
|
"rewards/visit_tool_reward_func": 0.7019689083099365, |
|
"step": 189 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004602713178294574, |
|
"grad_norm": 0.01002216998448175, |
|
"kl": 0.24951171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004626937984496124, |
|
"grad_norm": 0.009283017573166963, |
|
"kl": 0.234619140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0, |
|
"step": 191 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004651162790697674, |
|
"grad_norm": 0.00871351171533654, |
|
"kl": 0.221435546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0, |
|
"step": 192 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 6760.0, |
|
"completions/max_terminated_length": 6760.0, |
|
"completions/mean_length": 3175.625, |
|
"completions/mean_terminated_length": 3175.625, |
|
"completions/min_length": 1279.0, |
|
"completions/min_terminated_length": 1279.0, |
|
"epoch": 0.004675387596899225, |
|
"grad_norm": 0.01864801542206714, |
|
"kl": 0.200927734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0013, |
|
"num_tokens": 25421310.0, |
|
"reward": 0.3337632417678833, |
|
"reward_std": 0.1033831387758255, |
|
"rewards/avg_thinking_length_func": 144.4852752685547, |
|
"rewards/confidence_score_reward_func": 0.5157345533370972, |
|
"rewards/correct_answer_reward_func": 0.609375, |
|
"rewards/efficient_thinking_reward_func": 0.6954727584239813, |
|
"rewards/format_and_efficient_reward_func": 0.2803717255592346, |
|
"rewards/format_reward_func": 0.9838045835494995, |
|
"rewards/num_xml_reward_func": 1.244771957397461, |
|
"rewards/tool_execution_reward_func": 1.9927083253860474, |
|
"rewards/visit_tool_reward_func": 0.8324298858642578, |
|
"step": 193 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0046996124031007756, |
|
"grad_norm": 0.018411722840556213, |
|
"kl": 0.193603515625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0013, |
|
"step": 194 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004723837209302325, |
|
"grad_norm": 0.018380172856358755, |
|
"kl": 0.189208984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0013, |
|
"step": 195 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004748062015503876, |
|
"grad_norm": 0.018655645496485265, |
|
"kl": 0.1875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0013, |
|
"step": 196 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 5256.0, |
|
"completions/max_terminated_length": 5256.0, |
|
"completions/mean_length": 2635.984375, |
|
"completions/mean_terminated_length": 2635.984375, |
|
"completions/min_length": 1134.0, |
|
"completions/min_terminated_length": 1134.0, |
|
"epoch": 0.0047722868217054265, |
|
"grad_norm": 0.004219005229441154, |
|
"kl": 0.1275634765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"num_tokens": 25833510.0, |
|
"reward": 0.30341458320617676, |
|
"reward_std": 0.014322971925139427, |
|
"rewards/avg_thinking_length_func": 147.2517852783203, |
|
"rewards/confidence_score_reward_func": 0.5635701417922974, |
|
"rewards/correct_answer_reward_func": 0.5, |
|
"rewards/efficient_thinking_reward_func": 0.8586018615751865, |
|
"rewards/format_and_efficient_reward_func": 0.28311923146247864, |
|
"rewards/format_reward_func": 0.9866694808006287, |
|
"rewards/num_xml_reward_func": 1.2634769678115845, |
|
"rewards/tool_execution_reward_func": 1.9635450839996338, |
|
"rewards/visit_tool_reward_func": 0.80121248960495, |
|
"step": 197 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004796511627906977, |
|
"grad_norm": 0.004672728095639017, |
|
"kl": 0.130615234375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 198 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004820736434108527, |
|
"grad_norm": 0.004950768699918263, |
|
"kl": 0.1329345703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 199 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0048449612403100775, |
|
"grad_norm": 0.005160418640186133, |
|
"kl": 0.1343994140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 5779.0, |
|
"completions/max_terminated_length": 5779.0, |
|
"completions/mean_length": 2860.046875, |
|
"completions/mean_terminated_length": 2860.046875, |
|
"completions/min_length": 1125.0, |
|
"completions/min_terminated_length": 1125.0, |
|
"epoch": 0.004869186046511628, |
|
"grad_norm": 0.008829648064201757, |
|
"kl": 0.0411376953125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"num_tokens": 26237865.0, |
|
"reward": 0.45401930809020996, |
|
"reward_std": 0.09410357475280762, |
|
"rewards/avg_thinking_length_func": 182.534423828125, |
|
"rewards/confidence_score_reward_func": 0.5977352857589722, |
|
"rewards/correct_answer_reward_func": 0.734375, |
|
"rewards/efficient_thinking_reward_func": 0.784292215730239, |
|
"rewards/format_and_efficient_reward_func": 0.41676729917526245, |
|
"rewards/format_reward_func": 0.9937513470649719, |
|
"rewards/num_xml_reward_func": 1.5355236530303955, |
|
"rewards/tool_execution_reward_func": 1.9931985139846802, |
|
"rewards/visit_tool_reward_func": 0.8612196445465088, |
|
"step": 201 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004893410852713179, |
|
"grad_norm": 0.009196641903985264, |
|
"kl": 0.0400390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 202 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004917635658914728, |
|
"grad_norm": 0.009490032359266305, |
|
"kl": 0.038818359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 203 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.004941860465116279, |
|
"grad_norm": 0.009682454113754367, |
|
"kl": 0.03753662109375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 204 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 6331.0, |
|
"completions/max_terminated_length": 6331.0, |
|
"completions/mean_length": 2659.453125, |
|
"completions/mean_terminated_length": 2659.453125, |
|
"completions/min_length": 895.0, |
|
"completions/min_terminated_length": 895.0, |
|
"epoch": 0.00496608527131783, |
|
"grad_norm": 0.0037363827479903167, |
|
"kl": 0.03375244140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"num_tokens": 26646632.0, |
|
"reward": 0.3066054582595825, |
|
"reward_std": 0.03825566917657852, |
|
"rewards/avg_thinking_length_func": 136.1707763671875, |
|
"rewards/confidence_score_reward_func": 0.5777994990348816, |
|
"rewards/correct_answer_reward_func": 0.484375, |
|
"rewards/efficient_thinking_reward_func": 0.786608708417682, |
|
"rewards/format_and_efficient_reward_func": 0.3019195795059204, |
|
"rewards/format_reward_func": 0.9903415441513062, |
|
"rewards/num_xml_reward_func": 1.3805111646652222, |
|
"rewards/tool_execution_reward_func": 1.9650006294250488, |
|
"rewards/visit_tool_reward_func": 0.8477368354797363, |
|
"step": 205 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00499031007751938, |
|
"grad_norm": 0.0037822209816054495, |
|
"kl": 0.03350830078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 206 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00501453488372093, |
|
"grad_norm": 0.0038040246120938713, |
|
"kl": 0.033477783203125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 207 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0050387596899224806, |
|
"grad_norm": 0.0038540122892837783, |
|
"kl": 0.03350830078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 208 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 5045.0, |
|
"completions/max_terminated_length": 5045.0, |
|
"completions/mean_length": 2763.5, |
|
"completions/mean_terminated_length": 2763.5, |
|
"completions/min_length": 1119.0, |
|
"completions/min_terminated_length": 1119.0, |
|
"epoch": 0.005062984496124031, |
|
"grad_norm": 0.00548683325475162, |
|
"kl": 0.035003662109375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"num_tokens": 27041016.0, |
|
"reward": 0.45102459192276, |
|
"reward_std": 0.06410035490989685, |
|
"rewards/avg_thinking_length_func": 186.88746643066406, |
|
"rewards/confidence_score_reward_func": 0.6191459894180298, |
|
"rewards/correct_answer_reward_func": 0.6875, |
|
"rewards/efficient_thinking_reward_func": 0.8100582820862734, |
|
"rewards/format_and_efficient_reward_func": 0.44868165254592896, |
|
"rewards/format_reward_func": 0.9952791929244995, |
|
"rewards/num_xml_reward_func": 1.649810552597046, |
|
"rewards/tool_execution_reward_func": 1.9959295988082886, |
|
"rewards/visit_tool_reward_func": 0.8671329021453857, |
|
"step": 209 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.005087209302325582, |
|
"grad_norm": 0.005414160917662644, |
|
"kl": 0.03399658203125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 210 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0051114341085271315, |
|
"grad_norm": 0.005397000227956369, |
|
"kl": 0.033294677734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 211 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.005135658914728682, |
|
"grad_norm": 0.005329822482164869, |
|
"kl": 0.03271484375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 212 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 5622.0, |
|
"completions/max_terminated_length": 5622.0, |
|
"completions/mean_length": 2689.5, |
|
"completions/mean_terminated_length": 2689.5, |
|
"completions/min_length": 1080.0, |
|
"completions/min_terminated_length": 1080.0, |
|
"epoch": 0.005159883720930233, |
|
"grad_norm": 0.004583885118409577, |
|
"kl": 0.027679443359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"num_tokens": 27444785.0, |
|
"reward": 0.3377038240432739, |
|
"reward_std": 0.03283514827489853, |
|
"rewards/avg_thinking_length_func": 156.95558166503906, |
|
"rewards/confidence_score_reward_func": 0.6069622039794922, |
|
"rewards/correct_answer_reward_func": 0.515625, |
|
"rewards/efficient_thinking_reward_func": 0.8533607950008524, |
|
"rewards/format_and_efficient_reward_func": 0.3490750193595886, |
|
"rewards/format_reward_func": 0.9963964819908142, |
|
"rewards/num_xml_reward_func": 1.565781831741333, |
|
"rewards/tool_execution_reward_func": 1.9799107313156128, |
|
"rewards/visit_tool_reward_func": 0.886849582195282, |
|
"step": 213 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.005184108527131783, |
|
"grad_norm": 0.004553415372891503, |
|
"kl": 0.02728271484375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 214 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.005208333333333333, |
|
"grad_norm": 0.004416753047475649, |
|
"kl": 0.026763916015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 215 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.005232558139534884, |
|
"grad_norm": 0.004302097167180992, |
|
"kl": 0.02606201171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 216 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 6509.0, |
|
"completions/max_terminated_length": 6509.0, |
|
"completions/mean_length": 3170.359375, |
|
"completions/mean_terminated_length": 3170.359375, |
|
"completions/min_length": 1316.0, |
|
"completions/min_terminated_length": 1316.0, |
|
"epoch": 0.005256782945736434, |
|
"grad_norm": 0.00874079090702132, |
|
"kl": 0.03131103515625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"num_tokens": 27886438.0, |
|
"reward": 0.4546785354614258, |
|
"reward_std": 0.13061311841011047, |
|
"rewards/avg_thinking_length_func": 185.58987426757812, |
|
"rewards/confidence_score_reward_func": 0.6329280138015747, |
|
"rewards/correct_answer_reward_func": 0.671875, |
|
"rewards/efficient_thinking_reward_func": 0.7895888587130873, |
|
"rewards/format_and_efficient_reward_func": 0.43139761686325073, |
|
"rewards/format_reward_func": 0.9971143007278442, |
|
"rewards/num_xml_reward_func": 1.6065764427185059, |
|
"rewards/tool_execution_reward_func": 1.9975961446762085, |
|
"rewards/visit_tool_reward_func": 0.8967168927192688, |
|
"step": 217 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.005281007751937985, |
|
"grad_norm": 0.009254919184464793, |
|
"kl": 0.031158447265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 218 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.005305232558139535, |
|
"grad_norm": 0.008540278295280325, |
|
"kl": 0.03131103515625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 219 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.005329457364341085, |
|
"grad_norm": 0.009027249196409619, |
|
"kl": 0.031463623046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0002, |
|
"step": 220 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 6125.0, |
|
"completions/max_terminated_length": 6125.0, |
|
"completions/mean_length": 2700.765625, |
|
"completions/mean_terminated_length": 2700.765625, |
|
"completions/min_length": 966.0, |
|
"completions/min_terminated_length": 966.0, |
|
"epoch": 0.005353682170542636, |
|
"grad_norm": 0.001653975947803042, |
|
"kl": 0.0260009765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"num_tokens": 28309543.0, |
|
"reward": 0.3285777270793915, |
|
"reward_std": 0.013459177687764168, |
|
"rewards/avg_thinking_length_func": 149.52700805664062, |
|
"rewards/confidence_score_reward_func": 0.6043996214866638, |
|
"rewards/correct_answer_reward_func": 0.5, |
|
"rewards/efficient_thinking_reward_func": 0.8903335916310755, |
|
"rewards/format_and_efficient_reward_func": 0.3600352108478546, |
|
"rewards/format_reward_func": 0.996889591217041, |
|
"rewards/num_xml_reward_func": 1.5710426568984985, |
|
"rewards/tool_execution_reward_func": 1.9776184558868408, |
|
"rewards/visit_tool_reward_func": 0.9032177925109863, |
|
"step": 221 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.005377906976744186, |
|
"grad_norm": 0.001652035863632615, |
|
"kl": 0.0264892578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 222 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.005402131782945736, |
|
"grad_norm": 0.0016513159446787636, |
|
"kl": 0.0269775390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 223 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.005426356589147287, |
|
"grad_norm": 0.0020335905228311916, |
|
"kl": 0.027557373046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 224 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 4748.0, |
|
"completions/max_terminated_length": 4748.0, |
|
"completions/mean_length": 2978.90625, |
|
"completions/mean_terminated_length": 2978.90625, |
|
"completions/min_length": 1244.0, |
|
"completions/min_terminated_length": 1244.0, |
|
"epoch": 0.005450581395348837, |
|
"grad_norm": 0.006026935047182901, |
|
"kl": 0.03179931640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0003, |
|
"num_tokens": 28740848.0, |
|
"reward": 0.4945339560508728, |
|
"reward_std": 0.0744490772485733, |
|
"rewards/avg_thinking_length_func": 172.45849609375, |
|
"rewards/confidence_score_reward_func": 0.6167193651199341, |
|
"rewards/correct_answer_reward_func": 0.765625, |
|
"rewards/efficient_thinking_reward_func": 0.7966197226027097, |
|
"rewards/format_and_efficient_reward_func": 0.512791097164154, |
|
"rewards/format_reward_func": 0.9983228445053101, |
|
"rewards/num_xml_reward_func": 1.6630462408065796, |
|
"rewards/tool_execution_reward_func": 1.9971591234207153, |
|
"rewards/visit_tool_reward_func": 0.9000678062438965, |
|
"step": 225 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.005474806201550388, |
|
"grad_norm": 0.005801070538806677, |
|
"kl": 0.0323486328125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0003, |
|
"step": 226 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.005499031007751938, |
|
"grad_norm": 0.005789539677805553, |
|
"kl": 0.03302001953125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0003, |
|
"step": 227 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.005523255813953488, |
|
"grad_norm": 0.005731300295942885, |
|
"kl": 0.033935546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0003, |
|
"step": 228 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 5315.0, |
|
"completions/max_terminated_length": 5315.0, |
|
"completions/mean_length": 2718.109375, |
|
"completions/mean_terminated_length": 2718.109375, |
|
"completions/min_length": 1049.0, |
|
"completions/min_terminated_length": 1049.0, |
|
"epoch": 0.005547480620155039, |
|
"grad_norm": 0.0027604900450052977, |
|
"kl": 0.03369140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0, |
|
"num_tokens": 29177563.0, |
|
"reward": 0.33125773072242737, |
|
"reward_std": 0.012095385231077671, |
|
"rewards/avg_thinking_length_func": 138.28082275390625, |
|
"rewards/confidence_score_reward_func": 0.588701605796814, |
|
"rewards/correct_answer_reward_func": 0.5, |
|
"rewards/efficient_thinking_reward_func": 0.8968424695250805, |
|
"rewards/format_and_efficient_reward_func": 0.36526361107826233, |
|
"rewards/format_reward_func": 0.9942506551742554, |
|
"rewards/num_xml_reward_func": 1.484344720840454, |
|
"rewards/tool_execution_reward_func": 1.9658281803131104, |
|
"rewards/visit_tool_reward_func": 0.9050877094268799, |
|
"step": 229 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0055717054263565895, |
|
"grad_norm": 0.0028469369049688264, |
|
"kl": 0.03424072265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.005595930232558139, |
|
"grad_norm": 0.0029207200987881226, |
|
"kl": 0.03466796875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0, |
|
"step": 231 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00562015503875969, |
|
"grad_norm": 0.002891989345093088, |
|
"kl": 0.03436279296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0, |
|
"step": 232 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 5430.0, |
|
"completions/max_terminated_length": 5430.0, |
|
"completions/mean_length": 3147.21875, |
|
"completions/mean_terminated_length": 3147.21875, |
|
"completions/min_length": 1208.0, |
|
"completions/min_terminated_length": 1208.0, |
|
"epoch": 0.0056443798449612404, |
|
"grad_norm": 0.008009912903442006, |
|
"kl": 0.039306640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0004, |
|
"num_tokens": 29641355.0, |
|
"reward": 0.45486128330230713, |
|
"reward_std": 0.10010581463575363, |
|
"rewards/avg_thinking_length_func": 154.7548828125, |
|
"rewards/confidence_score_reward_func": 0.5910084247589111, |
|
"rewards/correct_answer_reward_func": 0.71875, |
|
"rewards/efficient_thinking_reward_func": 0.79141897353926, |
|
"rewards/format_and_efficient_reward_func": 0.4532102346420288, |
|
"rewards/format_reward_func": 0.9973268508911133, |
|
"rewards/num_xml_reward_func": 1.6137380599975586, |
|
"rewards/tool_execution_reward_func": 1.9840686321258545, |
|
"rewards/visit_tool_reward_func": 0.9216470718383789, |
|
"step": 233 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.005668604651162791, |
|
"grad_norm": 0.008010434434161435, |
|
"kl": 0.03924560546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0004, |
|
"step": 234 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.005692829457364341, |
|
"grad_norm": 0.008059617739522514, |
|
"kl": 0.03936767578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0004, |
|
"step": 235 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.005717054263565891, |
|
"grad_norm": 0.008321692756210844, |
|
"kl": 0.0400390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0004, |
|
"step": 236 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 5409.0, |
|
"completions/max_terminated_length": 5409.0, |
|
"completions/mean_length": 2747.0, |
|
"completions/mean_terminated_length": 2747.0, |
|
"completions/min_length": 1028.0, |
|
"completions/min_terminated_length": 1028.0, |
|
"epoch": 0.005741279069767442, |
|
"grad_norm": 0.00607005113841099, |
|
"kl": 0.0380859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"num_tokens": 30090756.0, |
|
"reward": 0.3245881199836731, |
|
"reward_std": 0.030338726937770844, |
|
"rewards/avg_thinking_length_func": 118.96601867675781, |
|
"rewards/confidence_score_reward_func": 0.5715887546539307, |
|
"rewards/correct_answer_reward_func": 0.515625, |
|
"rewards/efficient_thinking_reward_func": 0.7931376609790313, |
|
"rewards/format_and_efficient_reward_func": 0.3051683306694031, |
|
"rewards/format_reward_func": 0.9918498396873474, |
|
"rewards/num_xml_reward_func": 1.335392713546753, |
|
"rewards/tool_execution_reward_func": 1.956681728363037, |
|
"rewards/visit_tool_reward_func": 0.8923399448394775, |
|
"step": 237 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.005765503875968993, |
|
"grad_norm": 0.006076971580972504, |
|
"kl": 0.03839111328125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 238 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.005789728682170542, |
|
"grad_norm": 0.005795692009836339, |
|
"kl": 0.0380859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 239 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.005813953488372093, |
|
"grad_norm": 0.005478655391819232, |
|
"kl": 0.0377197265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 240 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 640, |
|
"num_input_tokens_seen": 30090756, |
|
"num_train_epochs": 1, |
|
"save_steps": 20, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|