Jan-v1-2509 / trainer_state.json
bachvudinh's picture
Upload folder using huggingface_hub
21bd389 verified
raw
history blame
147 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.005813953488372093,
"eval_steps": 500,
"global_step": 240,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5501.0,
"completions/max_terminated_length": 5501.0,
"completions/mean_length": 3558.09375,
"completions/mean_terminated_length": 3558.09375,
"completions/min_length": 2215.0,
"completions/min_terminated_length": 2215.0,
"epoch": 2.4224806201550387e-05,
"grad_norm": 0.00640977891147893,
"kl": 0.0007143020629882812,
"learning_rate": 0.0,
"loss": 0.0006,
"num_tokens": 568407.0,
"reward": 0.4926603138446808,
"reward_std": 0.08448069542646408,
"rewards/avg_thinking_length_func": 157.22222900390625,
"rewards/confidence_score_reward_func": 0.7339284420013428,
"rewards/correct_answer_reward_func": 0.640625,
"rewards/efficient_thinking_reward_func": 0.9699548628723149,
"rewards/format_and_efficient_reward_func": 0.5214560031890869,
"rewards/format_reward_func": 1.0,
"rewards/num_xml_reward_func": 1.318666696548462,
"rewards/tool_execution_reward_func": 1.983011245727539,
"rewards/visit_tool_reward_func": 0.9305298328399658,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 4.8449612403100775e-05,
"grad_norm": 0.0064083920341846115,
"kl": 0.0007143020629882812,
"learning_rate": 6.25e-08,
"loss": 0.0006,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 7.267441860465116e-05,
"grad_norm": 0.006447812260611595,
"kl": 0.0007295608520507812,
"learning_rate": 1.25e-07,
"loss": 0.0006,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 9.689922480620155e-05,
"grad_norm": 0.0066225031847143186,
"kl": 0.0007305145263671875,
"learning_rate": 1.875e-07,
"loss": 0.0006,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7891.0,
"completions/max_terminated_length": 7891.0,
"completions/mean_length": 3465.828125,
"completions/mean_terminated_length": 3465.828125,
"completions/min_length": 1264.0,
"completions/min_terminated_length": 1264.0,
"epoch": 0.00012112403100775194,
"grad_norm": 0.011221982806523546,
"kl": 0.0008029937744140625,
"learning_rate": 2.5e-07,
"loss": 0.0003,
"num_tokens": 1050218.0,
"reward": 0.35228461027145386,
"reward_std": 0.11903564631938934,
"rewards/avg_thinking_length_func": 172.3975830078125,
"rewards/confidence_score_reward_func": 0.7573737502098083,
"rewards/correct_answer_reward_func": 0.453125,
"rewards/efficient_thinking_reward_func": 0.8796035517984737,
"rewards/format_and_efficient_reward_func": 0.3536693751811981,
"rewards/format_reward_func": 1.0,
"rewards/num_xml_reward_func": 1.489912509918213,
"rewards/tool_execution_reward_func": 1.9884867668151855,
"rewards/visit_tool_reward_func": 0.9384097456932068,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.00014534883720930232,
"grad_norm": 0.011369566083514073,
"kl": 0.0008258819580078125,
"learning_rate": 3.1249999999999997e-07,
"loss": 0.0003,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0001695736434108527,
"grad_norm": 0.011325781329231437,
"kl": 0.000820159912109375,
"learning_rate": 3.75e-07,
"loss": 0.0003,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0001937984496124031,
"grad_norm": 0.011468177438620898,
"kl": 0.0008134841918945312,
"learning_rate": 4.375e-07,
"loss": 0.0003,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9790.0,
"completions/max_terminated_length": 9790.0,
"completions/mean_length": 4101.421875,
"completions/mean_terminated_length": 4101.421875,
"completions/min_length": 1141.0,
"completions/min_terminated_length": 1141.0,
"epoch": 0.00021802325581395349,
"grad_norm": 0.008533015854175789,
"kl": 0.00080108642578125,
"learning_rate": 5e-07,
"loss": 0.0,
"num_tokens": 1636681.0,
"reward": 0.4183655381202698,
"reward_std": 0.0931699275970459,
"rewards/avg_thinking_length_func": 176.92233276367188,
"rewards/confidence_score_reward_func": 0.7306747436523438,
"rewards/correct_answer_reward_func": 0.546875,
"rewards/efficient_thinking_reward_func": 0.8954936332818751,
"rewards/format_and_efficient_reward_func": 0.4208581745624542,
"rewards/format_reward_func": 1.0,
"rewards/num_xml_reward_func": 1.53083336353302,
"rewards/tool_execution_reward_func": 1.9508955478668213,
"rewards/visit_tool_reward_func": 0.8424738645553589,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.00024224806201550387,
"grad_norm": 0.009520985221391949,
"kl": 0.0007925033569335938,
"learning_rate": 5.625e-07,
"loss": 0.0,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.00026647286821705426,
"grad_norm": 0.010085270290120536,
"kl": 0.0011835098266601562,
"learning_rate": 6.249999999999999e-07,
"loss": 0.0,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.00029069767441860465,
"grad_norm": 0.008472445513601271,
"kl": 0.0008249282836914062,
"learning_rate": 6.875e-07,
"loss": 0.0,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7445.0,
"completions/max_terminated_length": 7445.0,
"completions/mean_length": 3379.234375,
"completions/mean_terminated_length": 3379.234375,
"completions/min_length": 1491.0,
"completions/min_terminated_length": 1491.0,
"epoch": 0.00031492248062015503,
"grad_norm": 0.01258823765843166,
"kl": 0.0009145736694335938,
"learning_rate": 7.5e-07,
"loss": -0.0001,
"num_tokens": 2110165.0,
"reward": 0.4067286550998688,
"reward_std": 0.18041250109672546,
"rewards/avg_thinking_length_func": 170.76950073242188,
"rewards/confidence_score_reward_func": 0.763248085975647,
"rewards/correct_answer_reward_func": 0.515625,
"rewards/efficient_thinking_reward_func": 0.8802126246942265,
"rewards/format_and_efficient_reward_func": 0.4241780936717987,
"rewards/format_reward_func": 0.99958336353302,
"rewards/num_xml_reward_func": 1.6099066734313965,
"rewards/tool_execution_reward_func": 1.9751970767974854,
"rewards/visit_tool_reward_func": 0.9391972422599792,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0003391472868217054,
"grad_norm": 0.012500551984662189,
"kl": 0.0009927749633789062,
"learning_rate": 8.125e-07,
"loss": -0.0001,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0003633720930232558,
"grad_norm": 0.012416715432446976,
"kl": 0.0010967254638671875,
"learning_rate": 8.75e-07,
"loss": -0.0001,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0003875968992248062,
"grad_norm": 0.01288145978177755,
"kl": 0.001140594482421875,
"learning_rate": 9.374999999999999e-07,
"loss": -0.0001,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10205.0,
"completions/max_terminated_length": 10205.0,
"completions/mean_length": 4119.96875,
"completions/mean_terminated_length": 4119.96875,
"completions/min_length": 1159.0,
"completions/min_terminated_length": 1159.0,
"epoch": 0.0004118217054263566,
"grad_norm": 0.009407611593031055,
"kl": 0.0011768341064453125,
"learning_rate": 1e-06,
"loss": 0.0004,
"num_tokens": 2691117.0,
"reward": 0.4201432466506958,
"reward_std": 0.0907188206911087,
"rewards/avg_thinking_length_func": 171.4025115966797,
"rewards/confidence_score_reward_func": 0.7308521270751953,
"rewards/correct_answer_reward_func": 0.546875,
"rewards/efficient_thinking_reward_func": 0.861229582956026,
"rewards/format_and_efficient_reward_func": 0.37111079692840576,
"rewards/format_reward_func": 1.0,
"rewards/num_xml_reward_func": 1.3203115463256836,
"rewards/tool_execution_reward_func": 1.9717044830322266,
"rewards/visit_tool_reward_func": 0.8859716653823853,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.00043604651162790697,
"grad_norm": 0.009347834781139657,
"kl": 0.00139617919921875,
"learning_rate": 1.0625e-06,
"loss": 0.0004,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.00046027131782945736,
"grad_norm": 0.00928664951165006,
"kl": 0.00167083740234375,
"learning_rate": 1.125e-06,
"loss": 0.0004,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.00048449612403100775,
"grad_norm": 0.009342230945576057,
"kl": 0.00212860107421875,
"learning_rate": 1.1874999999999999e-06,
"loss": 0.0004,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7792.0,
"completions/max_terminated_length": 7792.0,
"completions/mean_length": 3474.78125,
"completions/mean_terminated_length": 3474.78125,
"completions/min_length": 1307.0,
"completions/min_terminated_length": 1307.0,
"epoch": 0.0005087209302325581,
"grad_norm": 0.010737570621935045,
"kl": 0.002620697021484375,
"learning_rate": 1.2499999999999999e-06,
"loss": -0.0,
"num_tokens": 3182962.0,
"reward": 0.3430381715297699,
"reward_std": 0.15257038176059723,
"rewards/avg_thinking_length_func": 163.67486572265625,
"rewards/confidence_score_reward_func": 0.7590060234069824,
"rewards/correct_answer_reward_func": 0.4375,
"rewards/efficient_thinking_reward_func": 0.9094734969614356,
"rewards/format_and_efficient_reward_func": 0.354397177696228,
"rewards/format_reward_func": 1.0,
"rewards/num_xml_reward_func": 1.4648277759552002,
"rewards/tool_execution_reward_func": 1.9753289222717285,
"rewards/visit_tool_reward_func": 0.9633350968360901,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0005329457364341085,
"grad_norm": 0.010610611287841326,
"kl": 0.003204345703125,
"learning_rate": 1.3125e-06,
"loss": -0.0,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0005571705426356589,
"grad_norm": 0.010883725821996518,
"kl": 0.003814697265625,
"learning_rate": 1.375e-06,
"loss": -0.0,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0005813953488372093,
"grad_norm": 0.010728950563018041,
"kl": 0.00518798828125,
"learning_rate": 1.4375e-06,
"loss": -0.0,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9082.0,
"completions/max_terminated_length": 9082.0,
"completions/mean_length": 4205.453125,
"completions/mean_terminated_length": 4205.453125,
"completions/min_length": 1188.0,
"completions/min_terminated_length": 1188.0,
"epoch": 0.0006056201550387597,
"grad_norm": 0.011708703395331976,
"kl": 0.0054779052734375,
"learning_rate": 1.5e-06,
"loss": 0.0016,
"num_tokens": 3788330.0,
"reward": 0.4100201725959778,
"reward_std": 0.12962010502815247,
"rewards/avg_thinking_length_func": 167.64987182617188,
"rewards/confidence_score_reward_func": 0.7269817590713501,
"rewards/correct_answer_reward_func": 0.53125,
"rewards/efficient_thinking_reward_func": 0.894405090660734,
"rewards/format_and_efficient_reward_func": 0.4734077453613281,
"rewards/format_reward_func": 1.0,
"rewards/num_xml_reward_func": 1.2609543800354004,
"rewards/tool_execution_reward_func": 1.9624817371368408,
"rewards/visit_tool_reward_func": 0.8933978080749512,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0006298449612403101,
"grad_norm": 0.012029441405275343,
"kl": 0.00725555419921875,
"learning_rate": 1.5624999999999999e-06,
"loss": 0.0016,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0006540697674418605,
"grad_norm": 0.011965973272488425,
"kl": 0.010589599609375,
"learning_rate": 1.625e-06,
"loss": 0.0016,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0006782945736434108,
"grad_norm": 0.018054158629818226,
"kl": 0.017059326171875,
"learning_rate": 1.6875e-06,
"loss": 0.0016,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7466.0,
"completions/max_terminated_length": 7466.0,
"completions/mean_length": 3525.1875,
"completions/mean_terminated_length": 3525.1875,
"completions/min_length": 1458.0,
"completions/min_terminated_length": 1458.0,
"epoch": 0.0007025193798449612,
"grad_norm": 0.011398719184495674,
"kl": 0.013671875,
"learning_rate": 1.75e-06,
"loss": 0.0001,
"num_tokens": 4289196.0,
"reward": 0.3574071526527405,
"reward_std": 0.09749965369701385,
"rewards/avg_thinking_length_func": 163.65969848632812,
"rewards/confidence_score_reward_func": 0.7581030130386353,
"rewards/correct_answer_reward_func": 0.453125,
"rewards/efficient_thinking_reward_func": 0.9089163330381327,
"rewards/format_and_efficient_reward_func": 0.3653510808944702,
"rewards/format_reward_func": 1.0,
"rewards/num_xml_reward_func": 1.4366014003753662,
"rewards/tool_execution_reward_func": 1.9675538539886475,
"rewards/visit_tool_reward_func": 0.960380494594574,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0007267441860465116,
"grad_norm": 0.010833682609318612,
"kl": 0.015838623046875,
"learning_rate": 1.8125e-06,
"loss": 0.0001,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.000750968992248062,
"grad_norm": 0.0231097533212703,
"kl": 0.022918701171875,
"learning_rate": 1.8749999999999998e-06,
"loss": 0.0001,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0007751937984496124,
"grad_norm": 0.011257228008738334,
"kl": 0.021575927734375,
"learning_rate": 1.9375e-06,
"loss": 0.0001,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8888.0,
"completions/max_terminated_length": 8888.0,
"completions/mean_length": 3804.265625,
"completions/mean_terminated_length": 3804.265625,
"completions/min_length": 1067.0,
"completions/min_terminated_length": 1067.0,
"epoch": 0.0007994186046511628,
"grad_norm": 0.02927961829217277,
"kl": 0.030914306640625,
"learning_rate": 2e-06,
"loss": 0.0005,
"num_tokens": 4848644.0,
"reward": 0.46360254287719727,
"reward_std": 0.10140425711870193,
"rewards/avg_thinking_length_func": 168.85345458984375,
"rewards/confidence_score_reward_func": 0.7187485694885254,
"rewards/correct_answer_reward_func": 0.609375,
"rewards/efficient_thinking_reward_func": 0.8848315117904739,
"rewards/format_and_efficient_reward_func": 0.46383440494537354,
"rewards/format_reward_func": 1.0,
"rewards/num_xml_reward_func": 1.3748114109039307,
"rewards/tool_execution_reward_func": 1.9836355447769165,
"rewards/visit_tool_reward_func": 0.8981889486312866,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0008236434108527132,
"grad_norm": 0.00984263633299767,
"kl": 0.026763916015625,
"learning_rate": 2e-06,
"loss": 0.0005,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0008478682170542636,
"grad_norm": 0.022916321346866338,
"kl": 0.03643798828125,
"learning_rate": 2e-06,
"loss": 0.0005,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0008720930232558139,
"grad_norm": 0.010968578899761567,
"kl": 0.03680419921875,
"learning_rate": 2e-06,
"loss": 0.0005,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7119.0,
"completions/max_terminated_length": 7119.0,
"completions/mean_length": 3045.9375,
"completions/mean_terminated_length": 3045.9375,
"completions/min_length": 1306.0,
"completions/min_terminated_length": 1306.0,
"epoch": 0.0008963178294573643,
"grad_norm": 0.11556192957878203,
"kl": 0.066314697265625,
"learning_rate": 2e-06,
"loss": 0.0011,
"num_tokens": 5286042.0,
"reward": 0.38059696555137634,
"reward_std": 0.20472648739814758,
"rewards/avg_thinking_length_func": 171.9969024658203,
"rewards/confidence_score_reward_func": 0.7361885905265808,
"rewards/correct_answer_reward_func": 0.5,
"rewards/efficient_thinking_reward_func": 0.8792661921309781,
"rewards/format_and_efficient_reward_func": 0.4069961905479431,
"rewards/format_reward_func": 0.9985389709472656,
"rewards/num_xml_reward_func": 1.7584354877471924,
"rewards/tool_execution_reward_func": 1.9876766204833984,
"rewards/visit_tool_reward_func": 0.926859438419342,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0009205426356589147,
"grad_norm": 0.013991455687567742,
"kl": 0.034393310546875,
"learning_rate": 2e-06,
"loss": 0.001,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0009447674418604651,
"grad_norm": 0.01433251116157902,
"kl": 0.0352783203125,
"learning_rate": 2e-06,
"loss": 0.001,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0009689922480620155,
"grad_norm": 0.01682769595676241,
"kl": 0.0426025390625,
"learning_rate": 2e-06,
"loss": 0.001,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6696.0,
"completions/max_terminated_length": 6696.0,
"completions/mean_length": 3054.78125,
"completions/mean_terminated_length": 3054.78125,
"completions/min_length": 734.0,
"completions/min_terminated_length": 734.0,
"epoch": 0.0009932170542635659,
"grad_norm": 0.034243564194623544,
"kl": 0.0538330078125,
"learning_rate": 2e-06,
"loss": 0.0003,
"num_tokens": 5728827.0,
"reward": 0.5321023464202881,
"reward_std": 0.07992984354496002,
"rewards/avg_thinking_length_func": 185.18777465820312,
"rewards/confidence_score_reward_func": 0.699253261089325,
"rewards/correct_answer_reward_func": 0.734375,
"rewards/efficient_thinking_reward_func": 0.8423659179880447,
"rewards/format_and_efficient_reward_func": 0.5654621124267578,
"rewards/format_reward_func": 1.0,
"rewards/num_xml_reward_func": 1.836081624031067,
"rewards/tool_execution_reward_func": 1.9795209169387817,
"rewards/visit_tool_reward_func": 0.8331901431083679,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0010174418604651163,
"grad_norm": 0.008357434600682397,
"kl": 0.0467529296875,
"learning_rate": 2e-06,
"loss": 0.0003,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0010416666666666667,
"grad_norm": 0.009143109288946688,
"kl": 0.05499267578125,
"learning_rate": 2e-06,
"loss": 0.0003,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.001065891472868217,
"grad_norm": 0.018383062802239766,
"kl": 0.07135009765625,
"learning_rate": 2e-06,
"loss": 0.0003,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5912.0,
"completions/max_terminated_length": 5912.0,
"completions/mean_length": 2513.34375,
"completions/mean_terminated_length": 2513.34375,
"completions/min_length": 1085.0,
"completions/min_terminated_length": 1085.0,
"epoch": 0.0010901162790697674,
"grad_norm": 1.2449457515517797,
"kl": 0.5546875,
"learning_rate": 2e-06,
"loss": 0.0011,
"num_tokens": 6121552.0,
"reward": 0.41406646370887756,
"reward_std": 0.1448429971933365,
"rewards/avg_thinking_length_func": 159.43849182128906,
"rewards/confidence_score_reward_func": 0.7091017961502075,
"rewards/correct_answer_reward_func": 0.5625,
"rewards/efficient_thinking_reward_func": 0.9100999417514477,
"rewards/format_and_efficient_reward_func": 0.40307265520095825,
"rewards/format_reward_func": 1.0,
"rewards/num_xml_reward_func": 1.7179009914398193,
"rewards/tool_execution_reward_func": 1.9982638359069824,
"rewards/visit_tool_reward_func": 0.8534926772117615,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0011143410852713178,
"grad_norm": 0.04725193167363872,
"kl": 0.0830078125,
"learning_rate": 2e-06,
"loss": 0.0007,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0011385658914728682,
"grad_norm": 0.01076799271094143,
"kl": 0.0728759765625,
"learning_rate": 2e-06,
"loss": 0.0007,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0011627906976744186,
"grad_norm": 1.2844930338395364,
"kl": 0.594970703125,
"learning_rate": 2e-06,
"loss": 0.0012,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6441.0,
"completions/max_terminated_length": 6441.0,
"completions/mean_length": 2998.25,
"completions/mean_terminated_length": 2998.25,
"completions/min_length": 811.0,
"completions/min_terminated_length": 811.0,
"epoch": 0.001187015503875969,
"grad_norm": 0.07335407885154412,
"kl": 0.0946044921875,
"learning_rate": 2e-06,
"loss": 0.0003,
"num_tokens": 6632198.0,
"reward": 0.4027416408061981,
"reward_std": 0.18368688225746155,
"rewards/avg_thinking_length_func": 144.1616668701172,
"rewards/confidence_score_reward_func": 0.6523082852363586,
"rewards/correct_answer_reward_func": 0.578125,
"rewards/efficient_thinking_reward_func": 0.8715830269761213,
"rewards/format_and_efficient_reward_func": 0.30888205766677856,
"rewards/format_reward_func": 1.0,
"rewards/num_xml_reward_func": 1.2804265022277832,
"rewards/tool_execution_reward_func": 1.9967105388641357,
"rewards/visit_tool_reward_func": 0.777007520198822,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0012112403100775194,
"grad_norm": 3382.951158532336,
"kl": 386.0513916015625,
"learning_rate": 2e-06,
"loss": 0.1955,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0012354651162790698,
"grad_norm": 0.04763618115574692,
"kl": 0.111083984375,
"learning_rate": 2e-06,
"loss": 0.0003,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0012596899224806201,
"grad_norm": 0.011361146003702229,
"kl": 0.0693359375,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5699.0,
"completions/max_terminated_length": 5699.0,
"completions/mean_length": 2702.609375,
"completions/mean_terminated_length": 2702.609375,
"completions/min_length": 929.0,
"completions/min_terminated_length": 929.0,
"epoch": 0.0012839147286821705,
"grad_norm": 641.0279979496779,
"kl": 340.21875,
"learning_rate": 2e-06,
"loss": 0.3029,
"num_tokens": 7071302.0,
"reward": 0.38491296768188477,
"reward_std": 0.20615670084953308,
"rewards/avg_thinking_length_func": 144.03466796875,
"rewards/confidence_score_reward_func": 0.6775128841400146,
"rewards/correct_answer_reward_func": 0.546875,
"rewards/efficient_thinking_reward_func": 0.8956235775099276,
"rewards/format_and_efficient_reward_func": 0.298817902803421,
"rewards/format_reward_func": 1.0,
"rewards/num_xml_reward_func": 1.156145691871643,
"rewards/tool_execution_reward_func": 2.0,
"rewards/visit_tool_reward_func": 0.8991793990135193,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.001308139534883721,
"grad_norm": 10.07283016494114,
"kl": 6.52294921875,
"learning_rate": 2e-06,
"loss": 0.0054,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0013323643410852713,
"grad_norm": 0.024178305719161252,
"kl": 0.1031494140625,
"learning_rate": 2e-06,
"loss": -0.0001,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0013565891472868217,
"grad_norm": 0.010123659301215143,
"kl": 0.0853271484375,
"learning_rate": 2e-06,
"loss": -0.0001,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7974.0,
"completions/max_terminated_length": 7974.0,
"completions/mean_length": 3418.6875,
"completions/mean_terminated_length": 3418.6875,
"completions/min_length": 1001.0,
"completions/min_terminated_length": 1001.0,
"epoch": 0.001380813953488372,
"grad_norm": 0.011918704634373778,
"kl": 0.0814208984375,
"learning_rate": 2e-06,
"loss": 0.0006,
"num_tokens": 7618083.0,
"reward": 0.33670923113822937,
"reward_std": 0.2170744389295578,
"rewards/avg_thinking_length_func": 162.87310791015625,
"rewards/confidence_score_reward_func": 0.6380267143249512,
"rewards/correct_answer_reward_func": 0.484375,
"rewards/efficient_thinking_reward_func": 0.8769457565983968,
"rewards/format_and_efficient_reward_func": 0.15387150645256042,
"rewards/format_reward_func": 0.9937499761581421,
"rewards/num_xml_reward_func": 0.7425504326820374,
"rewards/tool_execution_reward_func": 1.984375,
"rewards/visit_tool_reward_func": 0.7900611162185669,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0014050387596899225,
"grad_norm": 0.012656826141930118,
"kl": 0.0870361328125,
"learning_rate": 2e-06,
"loss": 0.0006,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0014292635658914728,
"grad_norm": 0.01963879028272825,
"kl": 0.102783203125,
"learning_rate": 2e-06,
"loss": 0.0006,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0014534883720930232,
"grad_norm": 0.023803010795812877,
"kl": 0.111328125,
"learning_rate": 2e-06,
"loss": 0.0006,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6602.0,
"completions/max_terminated_length": 6602.0,
"completions/mean_length": 2937.375,
"completions/mean_terminated_length": 2937.375,
"completions/min_length": 1124.0,
"completions/min_terminated_length": 1124.0,
"epoch": 0.0014777131782945736,
"grad_norm": 0.6399010033168665,
"kl": 0.328857421875,
"learning_rate": 2e-06,
"loss": 0.0008,
"num_tokens": 8081395.0,
"reward": 0.41028502583503723,
"reward_std": 0.1911381632089615,
"rewards/avg_thinking_length_func": 154.1159210205078,
"rewards/confidence_score_reward_func": 0.6654285192489624,
"rewards/correct_answer_reward_func": 0.59375,
"rewards/efficient_thinking_reward_func": 0.8800399071963756,
"rewards/format_and_efficient_reward_func": 0.1847984343767166,
"rewards/format_reward_func": 1.0,
"rewards/num_xml_reward_func": 0.921923816204071,
"rewards/tool_execution_reward_func": 1.9983552694320679,
"rewards/visit_tool_reward_func": 0.883500337600708,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.001501937984496124,
"grad_norm": 0.011193139638749735,
"kl": 0.091552734375,
"learning_rate": 2e-06,
"loss": 0.0006,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0015261627906976744,
"grad_norm": 0.010209194017758182,
"kl": 0.086181640625,
"learning_rate": 2e-06,
"loss": 0.0006,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0015503875968992248,
"grad_norm": 0.14653936372168078,
"kl": 0.1170654296875,
"learning_rate": 2e-06,
"loss": 0.0006,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8850.0,
"completions/max_terminated_length": 8850.0,
"completions/mean_length": 3542.96875,
"completions/mean_terminated_length": 3542.96875,
"completions/min_length": 880.0,
"completions/min_terminated_length": 880.0,
"epoch": 0.0015746124031007752,
"grad_norm": 0.48126334443141955,
"kl": 0.248046875,
"learning_rate": 2e-06,
"loss": -0.0001,
"num_tokens": 8636201.0,
"reward": 0.39273786544799805,
"reward_std": 0.12296080589294434,
"rewards/avg_thinking_length_func": 150.4586639404297,
"rewards/confidence_score_reward_func": 0.6261853575706482,
"rewards/correct_answer_reward_func": 0.578125,
"rewards/efficient_thinking_reward_func": 0.8429494490638886,
"rewards/format_and_efficient_reward_func": 0.26941436529159546,
"rewards/format_reward_func": 1.0,
"rewards/num_xml_reward_func": 0.9826317429542542,
"rewards/tool_execution_reward_func": 1.9983552694320679,
"rewards/visit_tool_reward_func": 0.8202804327011108,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0015988372093023256,
"grad_norm": 0.011505230665385266,
"kl": 0.087646484375,
"learning_rate": 2e-06,
"loss": -0.0003,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.001623062015503876,
"grad_norm": 0.011219221768431348,
"kl": 0.0863037109375,
"learning_rate": 2e-06,
"loss": -0.0003,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0016472868217054263,
"grad_norm": 0.013493117517357446,
"kl": 0.0845947265625,
"learning_rate": 2e-06,
"loss": -0.0003,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6164.0,
"completions/max_terminated_length": 6164.0,
"completions/mean_length": 3045.984375,
"completions/mean_terminated_length": 3045.984375,
"completions/min_length": 853.0,
"completions/min_terminated_length": 853.0,
"epoch": 0.0016715116279069767,
"grad_norm": 0.015474671509878156,
"kl": 0.085205078125,
"learning_rate": 2e-06,
"loss": -0.0001,
"num_tokens": 9080324.0,
"reward": 0.3584170639514923,
"reward_std": 0.2464786320924759,
"rewards/avg_thinking_length_func": 171.05947875976562,
"rewards/confidence_score_reward_func": 0.6698201298713684,
"rewards/correct_answer_reward_func": 0.515625,
"rewards/efficient_thinking_reward_func": 0.9022617067768229,
"rewards/format_and_efficient_reward_func": 0.18420693278312683,
"rewards/format_reward_func": 0.999218761920929,
"rewards/num_xml_reward_func": 0.9476650953292847,
"rewards/tool_execution_reward_func": 1.9967105388641357,
"rewards/visit_tool_reward_func": 0.922633707523346,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.001695736434108527,
"grad_norm": 0.01314183789857302,
"kl": 0.082275390625,
"learning_rate": 2e-06,
"loss": -0.0001,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0017199612403100775,
"grad_norm": 0.012255008742171034,
"kl": 0.0802001953125,
"learning_rate": 2e-06,
"loss": -0.0001,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0017441860465116279,
"grad_norm": 0.016022338448163764,
"kl": 0.0791015625,
"learning_rate": 2e-06,
"loss": -0.0001,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9687.0,
"completions/max_terminated_length": 9687.0,
"completions/mean_length": 4153.765625,
"completions/mean_terminated_length": 4153.765625,
"completions/min_length": 1035.0,
"completions/min_terminated_length": 1035.0,
"epoch": 0.0017684108527131783,
"grad_norm": 0.009771623312563241,
"kl": 0.07470703125,
"learning_rate": 2e-06,
"loss": 0.0005,
"num_tokens": 9647713.0,
"reward": 0.39447835087776184,
"reward_std": 0.1022053211927414,
"rewards/avg_thinking_length_func": 180.9823455810547,
"rewards/confidence_score_reward_func": 0.6325613260269165,
"rewards/correct_answer_reward_func": 0.578125,
"rewards/efficient_thinking_reward_func": 0.8102246632773766,
"rewards/format_and_efficient_reward_func": 0.31101614236831665,
"rewards/format_reward_func": 0.9996874928474426,
"rewards/num_xml_reward_func": 1.1014292240142822,
"rewards/tool_execution_reward_func": 1.9983552694320679,
"rewards/visit_tool_reward_func": 0.9176727533340454,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0017926356589147287,
"grad_norm": 0.009518866209493148,
"kl": 0.0743408203125,
"learning_rate": 2e-06,
"loss": 0.0005,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.001816860465116279,
"grad_norm": 0.01107061263145856,
"kl": 0.074462890625,
"learning_rate": 2e-06,
"loss": 0.0005,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0018410852713178294,
"grad_norm": 0.010455700609646703,
"kl": 0.0758056640625,
"learning_rate": 2e-06,
"loss": 0.0005,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 6094.0,
"completions/max_terminated_length": 6094.0,
"completions/mean_length": 3365.65625,
"completions/mean_terminated_length": 3386.5714285714284,
"completions/min_length": 1457.0,
"completions/min_terminated_length": 1457.0,
"epoch": 0.0018653100775193798,
"grad_norm": 0.010697094262847633,
"kl": 0.07177734375,
"learning_rate": 2e-06,
"loss": 0.0002,
"num_tokens": 10096064.0,
"reward": 0.4402870833873749,
"reward_std": 0.17748701572418213,
"rewards/avg_thinking_length_func": 184.61854553222656,
"rewards/confidence_score_reward_func": 0.6924824714660645,
"rewards/correct_answer_reward_func": 0.625,
"rewards/efficient_thinking_reward_func": 0.8674089768653666,
"rewards/format_and_efficient_reward_func": 0.46700799465179443,
"rewards/format_reward_func": 0.9821969866752625,
"rewards/num_xml_reward_func": 1.4879558086395264,
"rewards/tool_execution_reward_func": 1.9514802694320679,
"rewards/visit_tool_reward_func": 0.9262524843215942,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0018895348837209302,
"grad_norm": 0.010757613261067228,
"kl": 0.0716552734375,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0019137596899224806,
"grad_norm": 0.010687573666984099,
"kl": 0.0711669921875,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 79
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.001937984496124031,
"grad_norm": 0.010774872814522038,
"kl": 0.07177734375,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9500.0,
"completions/max_terminated_length": 9500.0,
"completions/mean_length": 4230.6875,
"completions/mean_terminated_length": 4230.6875,
"completions/min_length": 1095.0,
"completions/min_terminated_length": 1095.0,
"epoch": 0.0019622093023255816,
"grad_norm": 46.52685366161902,
"kl": 28.5504150390625,
"learning_rate": 2e-06,
"loss": 0.0212,
"num_tokens": 10633304.0,
"reward": 0.4479905962944031,
"reward_std": 0.11886347830295563,
"rewards/avg_thinking_length_func": 196.62542724609375,
"rewards/confidence_score_reward_func": 0.6686310768127441,
"rewards/correct_answer_reward_func": 0.625,
"rewards/efficient_thinking_reward_func": 0.8074578120916676,
"rewards/format_and_efficient_reward_func": 0.4098377823829651,
"rewards/format_reward_func": 0.9993749856948853,
"rewards/num_xml_reward_func": 1.3076300621032715,
"rewards/tool_execution_reward_func": 1.9934210777282715,
"rewards/visit_tool_reward_func": 0.9281606674194336,
"step": 81
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0019864341085271318,
"grad_norm": 0.011024444662647613,
"kl": 0.0682373046875,
"learning_rate": 2e-06,
"loss": 0.0007,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0020106589147286824,
"grad_norm": 0.0110905273039609,
"kl": 0.0682373046875,
"learning_rate": 2e-06,
"loss": 0.0007,
"step": 83
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0020348837209302325,
"grad_norm": 0.011161056303561772,
"kl": 0.068359375,
"learning_rate": 2e-06,
"loss": 0.0007,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7332.0,
"completions/max_terminated_length": 7332.0,
"completions/mean_length": 3184.28125,
"completions/mean_terminated_length": 3184.28125,
"completions/min_length": 1380.0,
"completions/min_terminated_length": 1380.0,
"epoch": 0.002059108527131783,
"grad_norm": 0.007262566160814956,
"kl": 0.0670166015625,
"learning_rate": 2e-06,
"loss": -0.0,
"num_tokens": 11072119.0,
"reward": 0.48964226245880127,
"reward_std": 0.09526845812797546,
"rewards/avg_thinking_length_func": 183.27981567382812,
"rewards/confidence_score_reward_func": 0.7107405066490173,
"rewards/correct_answer_reward_func": 0.671875,
"rewards/efficient_thinking_reward_func": 0.8552614079949872,
"rewards/format_and_efficient_reward_func": 0.509292721748352,
"rewards/format_reward_func": 0.9996874928474426,
"rewards/num_xml_reward_func": 1.630164384841919,
"rewards/tool_execution_reward_func": 1.9862616062164307,
"rewards/visit_tool_reward_func": 0.9241018295288086,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0020833333333333333,
"grad_norm": 0.007239493682926299,
"kl": 0.0675048828125,
"learning_rate": 2e-06,
"loss": -0.0,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.002107558139534884,
"grad_norm": 0.007565680492649283,
"kl": 0.06787109375,
"learning_rate": 2e-06,
"loss": -0.0,
"step": 87
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.002131782945736434,
"grad_norm": 0.007407335837345995,
"kl": 0.0682373046875,
"learning_rate": 2e-06,
"loss": -0.0,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9640.0,
"completions/max_terminated_length": 9640.0,
"completions/mean_length": 3956.140625,
"completions/mean_terminated_length": 3956.140625,
"completions/min_length": 987.0,
"completions/min_terminated_length": 987.0,
"epoch": 0.0021560077519379847,
"grad_norm": 0.009630461090198177,
"kl": 0.06591796875,
"learning_rate": 2e-06,
"loss": 0.0004,
"num_tokens": 11589191.0,
"reward": 0.4685676693916321,
"reward_std": 0.08529931306838989,
"rewards/avg_thinking_length_func": 185.34999084472656,
"rewards/confidence_score_reward_func": 0.673518717288971,
"rewards/correct_answer_reward_func": 0.65625,
"rewards/efficient_thinking_reward_func": 0.8117772322905137,
"rewards/format_and_efficient_reward_func": 0.4981999397277832,
"rewards/format_reward_func": 1.0,
"rewards/num_xml_reward_func": 1.5498807430267334,
"rewards/tool_execution_reward_func": 1.9884867668151855,
"rewards/visit_tool_reward_func": 0.9419025182723999,
"step": 89
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.002180232558139535,
"grad_norm": 0.010035272389521673,
"kl": 0.0660400390625,
"learning_rate": 2e-06,
"loss": 0.0004,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0022044573643410855,
"grad_norm": 0.009886020878154878,
"kl": 0.0653076171875,
"learning_rate": 2e-06,
"loss": 0.0004,
"step": 91
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0022286821705426356,
"grad_norm": 0.010179048111382292,
"kl": 0.0648193359375,
"learning_rate": 2e-06,
"loss": 0.0004,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5889.0,
"completions/max_terminated_length": 5889.0,
"completions/mean_length": 3288.046875,
"completions/mean_terminated_length": 3288.046875,
"completions/min_length": 1106.0,
"completions/min_terminated_length": 1106.0,
"epoch": 0.0022529069767441862,
"grad_norm": 0.36462018525457834,
"kl": 0.1248779296875,
"learning_rate": 2e-06,
"loss": 0.001,
"num_tokens": 12047117.0,
"reward": 0.5035778284072876,
"reward_std": 0.09110674262046814,
"rewards/avg_thinking_length_func": 180.05084228515625,
"rewards/confidence_score_reward_func": 0.7095786333084106,
"rewards/correct_answer_reward_func": 0.6875,
"rewards/efficient_thinking_reward_func": 0.865053232533276,
"rewards/format_and_efficient_reward_func": 0.5739701986312866,
"rewards/format_reward_func": 1.0,
"rewards/num_xml_reward_func": 1.6645023822784424,
"rewards/tool_execution_reward_func": 1.9736841917037964,
"rewards/visit_tool_reward_func": 0.9475066065788269,
"step": 93
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0022771317829457364,
"grad_norm": 0.010049427947341465,
"kl": 0.0684814453125,
"learning_rate": 2e-06,
"loss": 0.001,
"step": 94
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.002301356589147287,
"grad_norm": 0.008406367137924373,
"kl": 0.067138671875,
"learning_rate": 2e-06,
"loss": 0.001,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.002325581395348837,
"grad_norm": 0.008646991679074768,
"kl": 0.0682373046875,
"learning_rate": 2e-06,
"loss": 0.001,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9717.0,
"completions/max_terminated_length": 9717.0,
"completions/mean_length": 4042.09375,
"completions/mean_terminated_length": 4042.09375,
"completions/min_length": 926.0,
"completions/min_terminated_length": 926.0,
"epoch": 0.0023498062015503878,
"grad_norm": 0.36433347439984676,
"kl": 0.266357421875,
"learning_rate": 2e-06,
"loss": 0.0004,
"num_tokens": 12594675.0,
"reward": 0.4354441165924072,
"reward_std": 0.10702547430992126,
"rewards/avg_thinking_length_func": 178.31576538085938,
"rewards/confidence_score_reward_func": 0.6778514385223389,
"rewards/correct_answer_reward_func": 0.59375,
"rewards/efficient_thinking_reward_func": 0.8262231594607177,
"rewards/format_and_efficient_reward_func": 0.4731639623641968,
"rewards/format_reward_func": 0.9996874928474426,
"rewards/num_xml_reward_func": 1.5230944156646729,
"rewards/tool_execution_reward_func": 1.977658987045288,
"rewards/visit_tool_reward_func": 0.90561443567276,
"step": 97
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.002374031007751938,
"grad_norm": 0.017653090743062046,
"kl": 0.07763671875,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 98
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0023982558139534886,
"grad_norm": 0.009650143183516308,
"kl": 0.066650390625,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 99
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0024224806201550387,
"grad_norm": 0.009666383934140476,
"kl": 0.066650390625,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7009.0,
"completions/max_terminated_length": 7009.0,
"completions/mean_length": 3569.1875,
"completions/mean_terminated_length": 3569.1875,
"completions/min_length": 1350.0,
"completions/min_terminated_length": 1350.0,
"epoch": 0.0024467054263565893,
"grad_norm": 0.012628187028225836,
"kl": 0.0160369873046875,
"learning_rate": 2e-06,
"loss": 0.0013,
"num_tokens": 13095521.0,
"reward": 0.4694232642650604,
"reward_std": 0.11920525133609772,
"rewards/avg_thinking_length_func": 166.68763732910156,
"rewards/confidence_score_reward_func": 0.693173885345459,
"rewards/correct_answer_reward_func": 0.640625,
"rewards/efficient_thinking_reward_func": 0.8890269113384983,
"rewards/format_and_efficient_reward_func": 0.52373868227005,
"rewards/format_reward_func": 1.0,
"rewards/num_xml_reward_func": 1.4931187629699707,
"rewards/tool_execution_reward_func": 1.9407894611358643,
"rewards/visit_tool_reward_func": 0.9543420076370239,
"step": 101
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0024709302325581395,
"grad_norm": 0.013764666926511201,
"kl": 0.016693115234375,
"learning_rate": 2e-06,
"loss": 0.0013,
"step": 102
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.00249515503875969,
"grad_norm": 0.015582325932853322,
"kl": 0.017730712890625,
"learning_rate": 2e-06,
"loss": 0.0013,
"step": 103
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0025193798449612403,
"grad_norm": 0.017864538067072777,
"kl": 0.01995849609375,
"learning_rate": 2e-06,
"loss": 0.0013,
"step": 104
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 11718.0,
"completions/max_terminated_length": 11718.0,
"completions/mean_length": 4337.8125,
"completions/mean_terminated_length": 4337.8125,
"completions/min_length": 1402.0,
"completions/min_terminated_length": 1402.0,
"epoch": 0.002543604651162791,
"grad_norm": 0.011715145428905095,
"kl": 0.023681640625,
"learning_rate": 2e-06,
"loss": 0.0003,
"num_tokens": 13691037.0,
"reward": 0.4581317901611328,
"reward_std": 0.07780471444129944,
"rewards/avg_thinking_length_func": 141.15011596679688,
"rewards/confidence_score_reward_func": 0.6525664925575256,
"rewards/correct_answer_reward_func": 0.65625,
"rewards/efficient_thinking_reward_func": 0.7593332235923487,
"rewards/format_and_efficient_reward_func": 0.45769202709198,
"rewards/format_reward_func": 0.9993749856948853,
"rewards/num_xml_reward_func": 1.3809731006622314,
"rewards/tool_execution_reward_func": 1.9640991687774658,
"rewards/visit_tool_reward_func": 0.9199192523956299,
"step": 105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.002567829457364341,
"grad_norm": 0.012478280222631418,
"kl": 0.03009033203125,
"learning_rate": 2e-06,
"loss": 0.0003,
"step": 106
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0025920542635658917,
"grad_norm": 0.013305867700430574,
"kl": 0.0390625,
"learning_rate": 2e-06,
"loss": 0.0004,
"step": 107
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.002616279069767442,
"grad_norm": 0.0183428412461533,
"kl": 0.0509033203125,
"learning_rate": 2e-06,
"loss": 0.0004,
"step": 108
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6386.0,
"completions/max_terminated_length": 6386.0,
"completions/mean_length": 3297.5625,
"completions/mean_terminated_length": 3297.5625,
"completions/min_length": 1296.0,
"completions/min_terminated_length": 1296.0,
"epoch": 0.0026405038759689924,
"grad_norm": 0.02337332236690807,
"kl": 0.0616455078125,
"learning_rate": 2e-06,
"loss": 0.001,
"num_tokens": 14184466.0,
"reward": 0.40722835063934326,
"reward_std": 0.14360609650611877,
"rewards/avg_thinking_length_func": 138.28097534179688,
"rewards/confidence_score_reward_func": 0.644202470779419,
"rewards/correct_answer_reward_func": 0.59375,
"rewards/efficient_thinking_reward_func": 0.7607926960767375,
"rewards/format_and_efficient_reward_func": 0.46497124433517456,
"rewards/format_reward_func": 1.0,
"rewards/num_xml_reward_func": 1.4057281017303467,
"rewards/tool_execution_reward_func": 1.9434621334075928,
"rewards/visit_tool_reward_func": 0.9184768199920654,
"step": 109
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0026647286821705426,
"grad_norm": 0.012698799447402773,
"kl": 0.06781005859375,
"learning_rate": 2e-06,
"loss": 0.001,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.002688953488372093,
"grad_norm": 0.012619226324675306,
"kl": 0.0758056640625,
"learning_rate": 2e-06,
"loss": 0.001,
"step": 111
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0027131782945736434,
"grad_norm": 0.013347372933753418,
"kl": 0.0892333984375,
"learning_rate": 2e-06,
"loss": 0.001,
"step": 112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10254.0,
"completions/max_terminated_length": 10254.0,
"completions/mean_length": 4017.296875,
"completions/mean_terminated_length": 4017.296875,
"completions/min_length": 1163.0,
"completions/min_terminated_length": 1163.0,
"epoch": 0.002737403100775194,
"grad_norm": 0.8482856229199331,
"kl": 0.163818359375,
"learning_rate": 2e-06,
"loss": 0.0003,
"num_tokens": 14783302.0,
"reward": 0.3793744742870331,
"reward_std": 0.08317889273166656,
"rewards/avg_thinking_length_func": 96.98873901367188,
"rewards/confidence_score_reward_func": 0.5890461206436157,
"rewards/correct_answer_reward_func": 0.578125,
"rewards/efficient_thinking_reward_func": 0.4956153760102844,
"rewards/format_and_efficient_reward_func": 0.3040567636489868,
"rewards/format_reward_func": 0.991857647895813,
"rewards/num_xml_reward_func": 0.9565892815589905,
"rewards/tool_execution_reward_func": 1.883992075920105,
"rewards/visit_tool_reward_func": 0.6309776306152344,
"step": 113
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.002761627906976744,
"grad_norm": 2.892668930951565,
"kl": 0.87744140625,
"learning_rate": 2e-06,
"loss": 0.002,
"step": 114
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0027858527131782948,
"grad_norm": 0.11540075032392746,
"kl": 0.258544921875,
"learning_rate": 2e-06,
"loss": 0.0005,
"step": 115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.002810077519379845,
"grad_norm": 0.03602102455529362,
"kl": 0.205078125,
"learning_rate": 2e-06,
"loss": 0.0003,
"step": 116
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7956.0,
"completions/max_terminated_length": 7956.0,
"completions/mean_length": 3060.15625,
"completions/mean_terminated_length": 3060.15625,
"completions/min_length": 1178.0,
"completions/min_terminated_length": 1178.0,
"epoch": 0.0028343023255813955,
"grad_norm": 0.011947818843430099,
"kl": 0.1031494140625,
"learning_rate": 2e-06,
"loss": 0.0013,
"num_tokens": 15239738.0,
"reward": 0.4257793724536896,
"reward_std": 0.15445315837860107,
"rewards/avg_thinking_length_func": 111.71697235107422,
"rewards/confidence_score_reward_func": 0.6188951730728149,
"rewards/correct_answer_reward_func": 0.671875,
"rewards/efficient_thinking_reward_func": 0.7151743089595498,
"rewards/format_and_efficient_reward_func": 0.3122476637363434,
"rewards/format_reward_func": 0.9918689727783203,
"rewards/num_xml_reward_func": 1.2823729515075684,
"rewards/tool_execution_reward_func": 1.9500064849853516,
"rewards/visit_tool_reward_func": 0.8597963452339172,
"step": 117
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0028585271317829457,
"grad_norm": 0.01184782503909529,
"kl": 0.0999755859375,
"learning_rate": 2e-06,
"loss": 0.0013,
"step": 118
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0028827519379844963,
"grad_norm": 0.01222442223239816,
"kl": 0.099365234375,
"learning_rate": 2e-06,
"loss": 0.0013,
"step": 119
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0029069767441860465,
"grad_norm": 0.01288408566646706,
"kl": 0.1002197265625,
"learning_rate": 2e-06,
"loss": 0.0013,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 11579.0,
"completions/max_terminated_length": 11579.0,
"completions/mean_length": 3778.484375,
"completions/mean_terminated_length": 3778.484375,
"completions/min_length": 857.0,
"completions/min_terminated_length": 857.0,
"epoch": 0.002931201550387597,
"grad_norm": 0.013127560986285324,
"kl": 0.170654296875,
"learning_rate": 2e-06,
"loss": 0.0006,
"num_tokens": 15802870.0,
"reward": 0.35960614681243896,
"reward_std": 0.09336411207914352,
"rewards/avg_thinking_length_func": 120.11376953125,
"rewards/confidence_score_reward_func": 0.5505574941635132,
"rewards/correct_answer_reward_func": 0.609375,
"rewards/efficient_thinking_reward_func": 0.5848998658707487,
"rewards/format_and_efficient_reward_func": 0.09069697558879852,
"rewards/format_reward_func": 0.9635053873062134,
"rewards/num_xml_reward_func": 0.6183948516845703,
"rewards/tool_execution_reward_func": 1.921267032623291,
"rewards/visit_tool_reward_func": 0.408791184425354,
"step": 121
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0029554263565891472,
"grad_norm": 0.011473925349363189,
"kl": 0.173828125,
"learning_rate": 2e-06,
"loss": 0.0006,
"step": 122
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.002979651162790698,
"grad_norm": 0.010667583254555548,
"kl": 0.1767578125,
"learning_rate": 2e-06,
"loss": 0.0006,
"step": 123
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.003003875968992248,
"grad_norm": 0.010839236682357098,
"kl": 0.1826171875,
"learning_rate": 2e-06,
"loss": 0.0006,
"step": 124
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7543.0,
"completions/max_terminated_length": 7543.0,
"completions/mean_length": 3495.625,
"completions/mean_terminated_length": 3495.625,
"completions/min_length": 1272.0,
"completions/min_terminated_length": 1272.0,
"epoch": 0.0030281007751937986,
"grad_norm": 0.014134486127382969,
"kl": 0.136474609375,
"learning_rate": 2e-06,
"loss": 0.0011,
"num_tokens": 16267847.0,
"reward": 0.40116244554519653,
"reward_std": 0.11558952927589417,
"rewards/avg_thinking_length_func": 171.4405975341797,
"rewards/confidence_score_reward_func": 0.592523455619812,
"rewards/correct_answer_reward_func": 0.65625,
"rewards/efficient_thinking_reward_func": 0.78887382548876,
"rewards/format_and_efficient_reward_func": -0.007415967993438244,
"rewards/format_reward_func": 0.9569429159164429,
"rewards/num_xml_reward_func": 0.533742368221283,
"rewards/tool_execution_reward_func": 1.984920620918274,
"rewards/visit_tool_reward_func": 0.8972762823104858,
"step": 125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.003052325581395349,
"grad_norm": 0.01438304535919498,
"kl": 0.140625,
"learning_rate": 2e-06,
"loss": 0.0011,
"step": 126
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0030765503875968994,
"grad_norm": 0.014656756114246808,
"kl": 0.14794921875,
"learning_rate": 2e-06,
"loss": 0.0011,
"step": 127
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0031007751937984496,
"grad_norm": 0.015042904271731165,
"kl": 0.15869140625,
"learning_rate": 2e-06,
"loss": 0.0012,
"step": 128
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12404.0,
"completions/max_terminated_length": 12404.0,
"completions/mean_length": 4594.015625,
"completions/mean_terminated_length": 4594.015625,
"completions/min_length": 1214.0,
"completions/min_terminated_length": 1214.0,
"epoch": 0.003125,
"grad_norm": 0.01590013423348445,
"kl": 0.26904296875,
"learning_rate": 2e-06,
"loss": 0.0009,
"num_tokens": 16831957.0,
"reward": 0.3612688183784485,
"reward_std": 0.08134222030639648,
"rewards/avg_thinking_length_func": 189.8800048828125,
"rewards/confidence_score_reward_func": 0.5268421173095703,
"rewards/correct_answer_reward_func": 0.625,
"rewards/efficient_thinking_reward_func": 0.6692969275756135,
"rewards/format_and_efficient_reward_func": -0.032691895961761475,
"rewards/format_reward_func": 0.9466335773468018,
"rewards/num_xml_reward_func": 0.4149753153324127,
"rewards/tool_execution_reward_func": 1.9272011518478394,
"rewards/visit_tool_reward_func": 0.7673778533935547,
"step": 129
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0031492248062015503,
"grad_norm": 0.01646874780720208,
"kl": 0.29443359375,
"learning_rate": 2e-06,
"loss": 0.0009,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.003173449612403101,
"grad_norm": 0.01694506623714648,
"kl": 0.314453125,
"learning_rate": 2e-06,
"loss": 0.0009,
"step": 131
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.003197674418604651,
"grad_norm": 0.016867539615718644,
"kl": 0.3271484375,
"learning_rate": 2e-06,
"loss": 0.001,
"step": 132
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6944.0,
"completions/max_terminated_length": 6944.0,
"completions/mean_length": 2774.265625,
"completions/mean_terminated_length": 2774.265625,
"completions/min_length": 1204.0,
"completions/min_terminated_length": 1204.0,
"epoch": 0.0032218992248062017,
"grad_norm": 0.022617229528507702,
"kl": 0.26953125,
"learning_rate": 2e-06,
"loss": 0.0002,
"num_tokens": 17249404.0,
"reward": 0.20413580536842346,
"reward_std": 0.05481432378292084,
"rewards/avg_thinking_length_func": 129.03866577148438,
"rewards/confidence_score_reward_func": 0.49319422245025635,
"rewards/correct_answer_reward_func": 0.34375,
"rewards/efficient_thinking_reward_func": 0.7432039407243382,
"rewards/format_and_efficient_reward_func": 0.17171993851661682,
"rewards/format_reward_func": 0.9746097326278687,
"rewards/num_xml_reward_func": 0.8615504503250122,
"rewards/tool_execution_reward_func": 1.9303656816482544,
"rewards/visit_tool_reward_func": 0.9013795852661133,
"step": 133
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.003246124031007752,
"grad_norm": 0.020143739711233816,
"kl": 0.252685546875,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 134
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0032703488372093025,
"grad_norm": 0.01785809415589292,
"kl": 0.227294921875,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0032945736434108527,
"grad_norm": 0.015380281270199666,
"kl": 0.199462890625,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 136
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7166.0,
"completions/max_terminated_length": 7166.0,
"completions/mean_length": 3239.453125,
"completions/mean_terminated_length": 3239.453125,
"completions/min_length": 1458.0,
"completions/min_terminated_length": 1458.0,
"epoch": 0.0033187984496124033,
"grad_norm": 0.012800365899215092,
"kl": 0.138427734375,
"learning_rate": 2e-06,
"loss": 0.0004,
"num_tokens": 17686794.0,
"reward": 0.3108579218387604,
"reward_std": 0.13888844847679138,
"rewards/avg_thinking_length_func": 171.369384765625,
"rewards/confidence_score_reward_func": 0.5435695648193359,
"rewards/correct_answer_reward_func": 0.515625,
"rewards/efficient_thinking_reward_func": 0.802592893497664,
"rewards/format_and_efficient_reward_func": 0.2916308343410492,
"rewards/format_reward_func": 0.9913173913955688,
"rewards/num_xml_reward_func": 1.4043910503387451,
"rewards/tool_execution_reward_func": 1.8357443809509277,
"rewards/visit_tool_reward_func": 0.8753163814544678,
"step": 137
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0033430232558139534,
"grad_norm": 0.014285839855776115,
"kl": 0.1318359375,
"learning_rate": 2e-06,
"loss": 0.0004,
"step": 138
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.003367248062015504,
"grad_norm": 0.015433812962718682,
"kl": 0.128173828125,
"learning_rate": 2e-06,
"loss": 0.0004,
"step": 139
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.003391472868217054,
"grad_norm": 0.015720560114809618,
"kl": 0.1229248046875,
"learning_rate": 2e-06,
"loss": 0.0004,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5424.0,
"completions/max_terminated_length": 5424.0,
"completions/mean_length": 3209.875,
"completions/mean_terminated_length": 3209.875,
"completions/min_length": 1301.0,
"completions/min_terminated_length": 1301.0,
"epoch": 0.003415697674418605,
"grad_norm": 0.009160832793565006,
"kl": 0.089599609375,
"learning_rate": 2e-06,
"loss": 0.0002,
"num_tokens": 18156998.0,
"reward": 0.2771710753440857,
"reward_std": 0.10209451615810394,
"rewards/avg_thinking_length_func": 144.3570556640625,
"rewards/confidence_score_reward_func": 0.5883906483650208,
"rewards/correct_answer_reward_func": 0.421875,
"rewards/efficient_thinking_reward_func": 0.9227171305298694,
"rewards/format_and_efficient_reward_func": 0.303905189037323,
"rewards/format_reward_func": 0.9965387582778931,
"rewards/num_xml_reward_func": 1.6496015787124634,
"rewards/tool_execution_reward_func": 1.9101753234863281,
"rewards/visit_tool_reward_func": 1.0097795724868774,
"step": 141
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.003439922480620155,
"grad_norm": 0.009348804877622782,
"kl": 0.0853271484375,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 142
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0034641472868217056,
"grad_norm": 0.009332442022472659,
"kl": 0.080322265625,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 143
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0034883720930232558,
"grad_norm": 0.009512893821144673,
"kl": 0.0767822265625,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 144
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7366.0,
"completions/max_terminated_length": 7366.0,
"completions/mean_length": 4435.890625,
"completions/mean_terminated_length": 4435.890625,
"completions/min_length": 1397.0,
"completions/min_terminated_length": 1397.0,
"epoch": 0.0035125968992248064,
"grad_norm": 0.013329760690267301,
"kl": 0.055419921875,
"learning_rate": 2e-06,
"loss": -0.0002,
"num_tokens": 18712707.0,
"reward": 0.4343380331993103,
"reward_std": 0.1319217085838318,
"rewards/avg_thinking_length_func": 213.60223388671875,
"rewards/confidence_score_reward_func": 0.6497268080711365,
"rewards/correct_answer_reward_func": 0.625,
"rewards/efficient_thinking_reward_func": 0.8139017177985812,
"rewards/format_and_efficient_reward_func": 0.4802235960960388,
"rewards/format_reward_func": 0.9989955425262451,
"rewards/num_xml_reward_func": 1.751387119293213,
"rewards/tool_execution_reward_func": 1.9038957357406616,
"rewards/visit_tool_reward_func": 0.9324563145637512,
"step": 145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0035368217054263565,
"grad_norm": 0.013975012703647748,
"kl": 0.0540771484375,
"learning_rate": 2e-06,
"loss": -0.0002,
"step": 146
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.003561046511627907,
"grad_norm": 0.014076489547319788,
"kl": 0.0531005859375,
"learning_rate": 2e-06,
"loss": -0.0002,
"step": 147
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0035852713178294573,
"grad_norm": 0.014165636449546886,
"kl": 0.0531005859375,
"learning_rate": 2e-06,
"loss": -0.0002,
"step": 148
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5721.0,
"completions/max_terminated_length": 5721.0,
"completions/mean_length": 3476.890625,
"completions/mean_terminated_length": 3476.890625,
"completions/min_length": 1375.0,
"completions/min_terminated_length": 1375.0,
"epoch": 0.003609496124031008,
"grad_norm": 0.007532410662647794,
"kl": 0.06439208984375,
"learning_rate": 2e-06,
"loss": 0.0001,
"num_tokens": 19224629.0,
"reward": 0.3097182512283325,
"reward_std": 0.06608685851097107,
"rewards/avg_thinking_length_func": 155.74346923828125,
"rewards/confidence_score_reward_func": 0.6070291996002197,
"rewards/correct_answer_reward_func": 0.453125,
"rewards/efficient_thinking_reward_func": 0.9227627606272979,
"rewards/format_and_efficient_reward_func": 0.3381012976169586,
"rewards/format_reward_func": 0.9996874928474426,
"rewards/num_xml_reward_func": 1.6836090087890625,
"rewards/tool_execution_reward_func": 1.8510758876800537,
"rewards/visit_tool_reward_func": 0.8944061994552612,
"step": 149
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.003633720930232558,
"grad_norm": 0.007379430347015788,
"kl": 0.0645751953125,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0036579457364341087,
"grad_norm": 0.008138518366845196,
"kl": 0.0657958984375,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 151
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.003682170542635659,
"grad_norm": 0.008284296957527382,
"kl": 0.0673828125,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 152
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9618.0,
"completions/max_terminated_length": 9618.0,
"completions/mean_length": 4875.078125,
"completions/mean_terminated_length": 4875.078125,
"completions/min_length": 1847.0,
"completions/min_terminated_length": 1847.0,
"epoch": 0.0037063953488372095,
"grad_norm": 0.014704297852595168,
"kl": 0.05523681640625,
"learning_rate": 2e-06,
"loss": 0.0015,
"num_tokens": 19820623.0,
"reward": 0.428906112909317,
"reward_std": 0.16942133009433746,
"rewards/avg_thinking_length_func": 210.6763916015625,
"rewards/confidence_score_reward_func": 0.6548709869384766,
"rewards/correct_answer_reward_func": 0.609375,
"rewards/efficient_thinking_reward_func": 0.7212743512877299,
"rewards/format_and_efficient_reward_func": 0.4301028251647949,
"rewards/format_reward_func": 0.9975892305374146,
"rewards/num_xml_reward_func": 1.4759665727615356,
"rewards/tool_execution_reward_func": 1.8980989456176758,
"rewards/visit_tool_reward_func": 0.9375091791152954,
"step": 153
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0037306201550387596,
"grad_norm": 0.015023473705486283,
"kl": 0.0567626953125,
"learning_rate": 2e-06,
"loss": 0.0015,
"step": 154
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0037548449612403102,
"grad_norm": 0.015217500076281755,
"kl": 0.05841064453125,
"learning_rate": 2e-06,
"loss": 0.0015,
"step": 155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0037790697674418604,
"grad_norm": 0.016114636489248848,
"kl": 0.0614013671875,
"learning_rate": 2e-06,
"loss": 0.0015,
"step": 156
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6823.0,
"completions/max_terminated_length": 6823.0,
"completions/mean_length": 4384.296875,
"completions/mean_terminated_length": 4384.296875,
"completions/min_length": 1697.0,
"completions/min_terminated_length": 1697.0,
"epoch": 0.003803294573643411,
"grad_norm": 0.006588369322686691,
"kl": 0.0643310546875,
"learning_rate": 2e-06,
"loss": 0.0005,
"num_tokens": 20425222.0,
"reward": 0.34698012471199036,
"reward_std": 0.03517330437898636,
"rewards/avg_thinking_length_func": 178.83392333984375,
"rewards/confidence_score_reward_func": 0.6313294172286987,
"rewards/correct_answer_reward_func": 0.484375,
"rewards/efficient_thinking_reward_func": 0.8650427095882729,
"rewards/format_and_efficient_reward_func": 0.37807154655456543,
"rewards/format_reward_func": 0.9995312690734863,
"rewards/num_xml_reward_func": 1.323744297027588,
"rewards/tool_execution_reward_func": 1.96144700050354,
"rewards/visit_tool_reward_func": 0.9631377458572388,
"step": 157
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.003827519379844961,
"grad_norm": 0.006972139333963718,
"kl": 0.0670166015625,
"learning_rate": 2e-06,
"loss": 0.0005,
"step": 158
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.003851744186046512,
"grad_norm": 0.0071318562836598836,
"kl": 0.06884765625,
"learning_rate": 2e-06,
"loss": 0.0005,
"step": 159
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.003875968992248062,
"grad_norm": 0.007113091376284595,
"kl": 0.06982421875,
"learning_rate": 2e-06,
"loss": 0.0005,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 11894.0,
"completions/max_terminated_length": 11894.0,
"completions/mean_length": 5685.0625,
"completions/mean_terminated_length": 5685.0625,
"completions/min_length": 1886.0,
"completions/min_terminated_length": 1886.0,
"epoch": 0.0039001937984496126,
"grad_norm": 0.01558937344658329,
"kl": 0.06414794921875,
"learning_rate": 2e-06,
"loss": 0.0018,
"num_tokens": 21086786.0,
"reward": 0.4025996923446655,
"reward_std": 0.13449470698833466,
"rewards/avg_thinking_length_func": 254.32508850097656,
"rewards/confidence_score_reward_func": 0.6495309472084045,
"rewards/correct_answer_reward_func": 0.578125,
"rewards/efficient_thinking_reward_func": 0.6637161596148502,
"rewards/format_and_efficient_reward_func": 0.458422988653183,
"rewards/format_reward_func": 0.9998437166213989,
"rewards/num_xml_reward_func": 1.5073208808898926,
"rewards/tool_execution_reward_func": 1.9572367668151855,
"rewards/visit_tool_reward_func": 0.9573923349380493,
"step": 161
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.003924418604651163,
"grad_norm": 0.016638056430155885,
"kl": 0.064453125,
"learning_rate": 2e-06,
"loss": 0.0018,
"step": 162
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.003948643410852713,
"grad_norm": 0.01813854752521658,
"kl": 0.06536865234375,
"learning_rate": 2e-06,
"loss": 0.0018,
"step": 163
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0039728682170542635,
"grad_norm": 0.01938490985845502,
"kl": 0.06988525390625,
"learning_rate": 2e-06,
"loss": 0.0019,
"step": 164
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9060.0,
"completions/max_terminated_length": 9060.0,
"completions/mean_length": 4403.21875,
"completions/mean_terminated_length": 4403.21875,
"completions/min_length": 1390.0,
"completions/min_terminated_length": 1390.0,
"epoch": 0.003997093023255814,
"grad_norm": 0.005449273513524992,
"kl": 0.0662841796875,
"learning_rate": 2e-06,
"loss": 0.0001,
"num_tokens": 21662894.0,
"reward": 0.35001087188720703,
"reward_std": 0.009927155449986458,
"rewards/avg_thinking_length_func": 188.15765380859375,
"rewards/confidence_score_reward_func": 0.6182008981704712,
"rewards/correct_answer_reward_func": 0.5,
"rewards/efficient_thinking_reward_func": 0.8001981107519069,
"rewards/format_and_efficient_reward_func": 0.36673688888549805,
"rewards/format_reward_func": 0.9998437166213989,
"rewards/num_xml_reward_func": 1.4394086599349976,
"rewards/tool_execution_reward_func": 1.993227481842041,
"rewards/visit_tool_reward_func": 0.936252236366272,
"step": 165
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004021317829457365,
"grad_norm": 0.00568312787735846,
"kl": 0.068115234375,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 166
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0040455426356589145,
"grad_norm": 0.005806971085578714,
"kl": 0.069580078125,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 167
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004069767441860465,
"grad_norm": 0.00592190722180043,
"kl": 0.070556640625,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 168
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9992.0,
"completions/max_terminated_length": 9992.0,
"completions/mean_length": 5377.4375,
"completions/mean_terminated_length": 5377.4375,
"completions/min_length": 1809.0,
"completions/min_terminated_length": 1809.0,
"epoch": 0.004093992248062016,
"grad_norm": 0.359099649617951,
"kl": 0.1207275390625,
"learning_rate": 2e-06,
"loss": 0.0029,
"num_tokens": 22286431.0,
"reward": 0.40037134289741516,
"reward_std": 0.12838459014892578,
"rewards/avg_thinking_length_func": 245.9459228515625,
"rewards/confidence_score_reward_func": 0.6141020059585571,
"rewards/correct_answer_reward_func": 0.609375,
"rewards/efficient_thinking_reward_func": 0.6361426555187852,
"rewards/format_and_efficient_reward_func": 0.45017051696777344,
"rewards/format_reward_func": 0.9981250166893005,
"rewards/num_xml_reward_func": 1.532149076461792,
"rewards/tool_execution_reward_func": 1.9983552694320679,
"rewards/visit_tool_reward_func": 0.9713033437728882,
"step": 169
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004118217054263566,
"grad_norm": 0.0312847460920415,
"kl": 0.08642578125,
"learning_rate": 2e-06,
"loss": 0.0028,
"step": 170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004142441860465116,
"grad_norm": 0.5587996108011728,
"kl": 0.2386474609375,
"learning_rate": 2e-06,
"loss": 0.003,
"step": 171
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004166666666666667,
"grad_norm": 0.03228792794627183,
"kl": 0.092529296875,
"learning_rate": 2e-06,
"loss": 0.0028,
"step": 172
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7094.0,
"completions/max_terminated_length": 7094.0,
"completions/mean_length": 4163.640625,
"completions/mean_terminated_length": 4163.640625,
"completions/min_length": 1385.0,
"completions/min_terminated_length": 1385.0,
"epoch": 0.004190891472868217,
"grad_norm": 0.008141555436627606,
"kl": 0.1014404296875,
"learning_rate": 2e-06,
"loss": 0.0007,
"num_tokens": 22851695.0,
"reward": 0.31291523575782776,
"reward_std": 0.0387241393327713,
"rewards/avg_thinking_length_func": 150.9978485107422,
"rewards/confidence_score_reward_func": 0.5685818195343018,
"rewards/correct_answer_reward_func": 0.46875,
"rewards/efficient_thinking_reward_func": 0.8065696148258371,
"rewards/format_and_efficient_reward_func": 0.30031993985176086,
"rewards/format_reward_func": 0.9996874928474426,
"rewards/num_xml_reward_func": 1.2274867296218872,
"rewards/tool_execution_reward_func": 1.9928336143493652,
"rewards/visit_tool_reward_func": 0.9787203073501587,
"step": 173
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004215116279069768,
"grad_norm": 0.008733677069632446,
"kl": 0.1131591796875,
"learning_rate": 2e-06,
"loss": 0.0007,
"step": 174
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0042393410852713176,
"grad_norm": 0.009638540295346257,
"kl": 0.12744140625,
"learning_rate": 2e-06,
"loss": 0.0007,
"step": 175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004263565891472868,
"grad_norm": 0.010992556993855552,
"kl": 0.142822265625,
"learning_rate": 2e-06,
"loss": 0.0007,
"step": 176
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8227.0,
"completions/max_terminated_length": 8227.0,
"completions/mean_length": 4541.953125,
"completions/mean_terminated_length": 4541.953125,
"completions/min_length": 1507.0,
"completions/min_terminated_length": 1507.0,
"epoch": 0.004287790697674419,
"grad_norm": 0.1409188461026278,
"kl": 0.265625,
"learning_rate": 2e-06,
"loss": 0.0037,
"num_tokens": 23436250.0,
"reward": 0.3243735730648041,
"reward_std": 0.15356436371803284,
"rewards/avg_thinking_length_func": 171.99826049804688,
"rewards/confidence_score_reward_func": 0.5453901290893555,
"rewards/correct_answer_reward_func": 0.53125,
"rewards/efficient_thinking_reward_func": 0.6924963364887087,
"rewards/format_and_efficient_reward_func": 0.3312879800796509,
"rewards/format_reward_func": 0.998577356338501,
"rewards/num_xml_reward_func": 1.3812510967254639,
"rewards/tool_execution_reward_func": 1.9967105388641357,
"rewards/visit_tool_reward_func": 0.9554424285888672,
"step": 177
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004312015503875969,
"grad_norm": 0.05228415250398885,
"kl": 0.201904296875,
"learning_rate": 2e-06,
"loss": 0.0037,
"step": 178
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004336240310077519,
"grad_norm": 0.060068767522700996,
"kl": 0.2451171875,
"learning_rate": 2e-06,
"loss": 0.0037,
"step": 179
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.00436046511627907,
"grad_norm": 0.2730620784971272,
"kl": 0.4716796875,
"learning_rate": 2e-06,
"loss": 0.0041,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8120.0,
"completions/max_terminated_length": 8120.0,
"completions/mean_length": 3361.40625,
"completions/mean_terminated_length": 3361.40625,
"completions/min_length": 1075.0,
"completions/min_terminated_length": 1075.0,
"epoch": 0.00438468992248062,
"grad_norm": 0.05853393969832367,
"kl": 0.5302734375,
"learning_rate": 2e-06,
"loss": 0.0007,
"num_tokens": 23987107.0,
"reward": 0.24436859786510468,
"reward_std": 0.04949303716421127,
"rewards/avg_thinking_length_func": 81.72256469726562,
"rewards/confidence_score_reward_func": 0.45580577850341797,
"rewards/correct_answer_reward_func": 0.453125,
"rewards/efficient_thinking_reward_func": 0.573834842856046,
"rewards/format_and_efficient_reward_func": 0.22879377007484436,
"rewards/format_reward_func": 0.995830774307251,
"rewards/num_xml_reward_func": 1.104771614074707,
"rewards/tool_execution_reward_func": 1.9899488687515259,
"rewards/visit_tool_reward_func": 0.8998211622238159,
"step": 181
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004408914728682171,
"grad_norm": 0.07516497327438276,
"kl": 0.6845703125,
"learning_rate": 2e-06,
"loss": 0.0009,
"step": 182
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004433139534883721,
"grad_norm": 0.05997132496622212,
"kl": 0.626953125,
"learning_rate": 2e-06,
"loss": 0.0008,
"step": 183
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004457364341085271,
"grad_norm": 0.037671767248184135,
"kl": 0.48681640625,
"learning_rate": 2e-06,
"loss": 0.0007,
"step": 184
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8178.0,
"completions/max_terminated_length": 8178.0,
"completions/mean_length": 3659.640625,
"completions/mean_terminated_length": 3659.640625,
"completions/min_length": 896.0,
"completions/min_terminated_length": 896.0,
"epoch": 0.004481589147286822,
"grad_norm": 0.016069114631093232,
"kl": 0.34326171875,
"learning_rate": 2e-06,
"loss": 0.0009,
"num_tokens": 24496297.0,
"reward": 0.2305455505847931,
"reward_std": 0.06948232650756836,
"rewards/avg_thinking_length_func": 111.37628936767578,
"rewards/confidence_score_reward_func": 0.37327370047569275,
"rewards/correct_answer_reward_func": 0.515625,
"rewards/efficient_thinking_reward_func": 0.48277143466617184,
"rewards/format_and_efficient_reward_func": 0.1522754281759262,
"rewards/format_reward_func": 0.9647905230522156,
"rewards/num_xml_reward_func": 0.8915370106697083,
"rewards/tool_execution_reward_func": 1.9581143856048584,
"rewards/visit_tool_reward_func": 0.5689894556999207,
"step": 185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0045058139534883725,
"grad_norm": 0.014918137972398021,
"kl": 0.31640625,
"learning_rate": 2e-06,
"loss": 0.0008,
"step": 186
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004530038759689922,
"grad_norm": 0.014560290660972823,
"kl": 0.2958984375,
"learning_rate": 2e-06,
"loss": 0.0008,
"step": 187
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004554263565891473,
"grad_norm": 0.014191965162457063,
"kl": 0.27880859375,
"learning_rate": 2e-06,
"loss": 0.0008,
"step": 188
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 11027.0,
"completions/max_terminated_length": 11027.0,
"completions/mean_length": 2904.71875,
"completions/mean_terminated_length": 2904.71875,
"completions/min_length": 912.0,
"completions/min_terminated_length": 912.0,
"epoch": 0.004578488372093023,
"grad_norm": 0.02465674538761865,
"kl": 0.27099609375,
"learning_rate": 2e-06,
"loss": 0.0001,
"num_tokens": 24960803.0,
"reward": 0.22261814773082733,
"reward_std": 0.04196429252624512,
"rewards/avg_thinking_length_func": 79.28602600097656,
"rewards/confidence_score_reward_func": 0.40539172291755676,
"rewards/correct_answer_reward_func": 0.46875,
"rewards/efficient_thinking_reward_func": 0.4911669222941917,
"rewards/format_and_efficient_reward_func": 0.14570605754852295,
"rewards/format_reward_func": 0.9741340279579163,
"rewards/num_xml_reward_func": 0.884125292301178,
"rewards/tool_execution_reward_func": 1.9560561180114746,
"rewards/visit_tool_reward_func": 0.7019689083099365,
"step": 189
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004602713178294574,
"grad_norm": 0.01002216998448175,
"kl": 0.24951171875,
"learning_rate": 2e-06,
"loss": 0.0,
"step": 190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004626937984496124,
"grad_norm": 0.009283017573166963,
"kl": 0.234619140625,
"learning_rate": 2e-06,
"loss": 0.0,
"step": 191
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004651162790697674,
"grad_norm": 0.00871351171533654,
"kl": 0.221435546875,
"learning_rate": 2e-06,
"loss": 0.0,
"step": 192
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6760.0,
"completions/max_terminated_length": 6760.0,
"completions/mean_length": 3175.625,
"completions/mean_terminated_length": 3175.625,
"completions/min_length": 1279.0,
"completions/min_terminated_length": 1279.0,
"epoch": 0.004675387596899225,
"grad_norm": 0.01864801542206714,
"kl": 0.200927734375,
"learning_rate": 2e-06,
"loss": 0.0013,
"num_tokens": 25421310.0,
"reward": 0.3337632417678833,
"reward_std": 0.1033831387758255,
"rewards/avg_thinking_length_func": 144.4852752685547,
"rewards/confidence_score_reward_func": 0.5157345533370972,
"rewards/correct_answer_reward_func": 0.609375,
"rewards/efficient_thinking_reward_func": 0.6954727584239813,
"rewards/format_and_efficient_reward_func": 0.2803717255592346,
"rewards/format_reward_func": 0.9838045835494995,
"rewards/num_xml_reward_func": 1.244771957397461,
"rewards/tool_execution_reward_func": 1.9927083253860474,
"rewards/visit_tool_reward_func": 0.8324298858642578,
"step": 193
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0046996124031007756,
"grad_norm": 0.018411722840556213,
"kl": 0.193603515625,
"learning_rate": 2e-06,
"loss": 0.0013,
"step": 194
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004723837209302325,
"grad_norm": 0.018380172856358755,
"kl": 0.189208984375,
"learning_rate": 2e-06,
"loss": 0.0013,
"step": 195
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004748062015503876,
"grad_norm": 0.018655645496485265,
"kl": 0.1875,
"learning_rate": 2e-06,
"loss": 0.0013,
"step": 196
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5256.0,
"completions/max_terminated_length": 5256.0,
"completions/mean_length": 2635.984375,
"completions/mean_terminated_length": 2635.984375,
"completions/min_length": 1134.0,
"completions/min_terminated_length": 1134.0,
"epoch": 0.0047722868217054265,
"grad_norm": 0.004219005229441154,
"kl": 0.1275634765625,
"learning_rate": 2e-06,
"loss": 0.0002,
"num_tokens": 25833510.0,
"reward": 0.30341458320617676,
"reward_std": 0.014322971925139427,
"rewards/avg_thinking_length_func": 147.2517852783203,
"rewards/confidence_score_reward_func": 0.5635701417922974,
"rewards/correct_answer_reward_func": 0.5,
"rewards/efficient_thinking_reward_func": 0.8586018615751865,
"rewards/format_and_efficient_reward_func": 0.28311923146247864,
"rewards/format_reward_func": 0.9866694808006287,
"rewards/num_xml_reward_func": 1.2634769678115845,
"rewards/tool_execution_reward_func": 1.9635450839996338,
"rewards/visit_tool_reward_func": 0.80121248960495,
"step": 197
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004796511627906977,
"grad_norm": 0.004672728095639017,
"kl": 0.130615234375,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 198
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004820736434108527,
"grad_norm": 0.004950768699918263,
"kl": 0.1329345703125,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 199
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0048449612403100775,
"grad_norm": 0.005160418640186133,
"kl": 0.1343994140625,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5779.0,
"completions/max_terminated_length": 5779.0,
"completions/mean_length": 2860.046875,
"completions/mean_terminated_length": 2860.046875,
"completions/min_length": 1125.0,
"completions/min_terminated_length": 1125.0,
"epoch": 0.004869186046511628,
"grad_norm": 0.008829648064201757,
"kl": 0.0411376953125,
"learning_rate": 2e-06,
"loss": 0.0002,
"num_tokens": 26237865.0,
"reward": 0.45401930809020996,
"reward_std": 0.09410357475280762,
"rewards/avg_thinking_length_func": 182.534423828125,
"rewards/confidence_score_reward_func": 0.5977352857589722,
"rewards/correct_answer_reward_func": 0.734375,
"rewards/efficient_thinking_reward_func": 0.784292215730239,
"rewards/format_and_efficient_reward_func": 0.41676729917526245,
"rewards/format_reward_func": 0.9937513470649719,
"rewards/num_xml_reward_func": 1.5355236530303955,
"rewards/tool_execution_reward_func": 1.9931985139846802,
"rewards/visit_tool_reward_func": 0.8612196445465088,
"step": 201
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004893410852713179,
"grad_norm": 0.009196641903985264,
"kl": 0.0400390625,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 202
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004917635658914728,
"grad_norm": 0.009490032359266305,
"kl": 0.038818359375,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 203
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.004941860465116279,
"grad_norm": 0.009682454113754367,
"kl": 0.03753662109375,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 204
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6331.0,
"completions/max_terminated_length": 6331.0,
"completions/mean_length": 2659.453125,
"completions/mean_terminated_length": 2659.453125,
"completions/min_length": 895.0,
"completions/min_terminated_length": 895.0,
"epoch": 0.00496608527131783,
"grad_norm": 0.0037363827479903167,
"kl": 0.03375244140625,
"learning_rate": 2e-06,
"loss": 0.0002,
"num_tokens": 26646632.0,
"reward": 0.3066054582595825,
"reward_std": 0.03825566917657852,
"rewards/avg_thinking_length_func": 136.1707763671875,
"rewards/confidence_score_reward_func": 0.5777994990348816,
"rewards/correct_answer_reward_func": 0.484375,
"rewards/efficient_thinking_reward_func": 0.786608708417682,
"rewards/format_and_efficient_reward_func": 0.3019195795059204,
"rewards/format_reward_func": 0.9903415441513062,
"rewards/num_xml_reward_func": 1.3805111646652222,
"rewards/tool_execution_reward_func": 1.9650006294250488,
"rewards/visit_tool_reward_func": 0.8477368354797363,
"step": 205
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.00499031007751938,
"grad_norm": 0.0037822209816054495,
"kl": 0.03350830078125,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 206
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.00501453488372093,
"grad_norm": 0.0038040246120938713,
"kl": 0.033477783203125,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 207
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0050387596899224806,
"grad_norm": 0.0038540122892837783,
"kl": 0.03350830078125,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 208
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5045.0,
"completions/max_terminated_length": 5045.0,
"completions/mean_length": 2763.5,
"completions/mean_terminated_length": 2763.5,
"completions/min_length": 1119.0,
"completions/min_terminated_length": 1119.0,
"epoch": 0.005062984496124031,
"grad_norm": 0.00548683325475162,
"kl": 0.035003662109375,
"learning_rate": 2e-06,
"loss": 0.0002,
"num_tokens": 27041016.0,
"reward": 0.45102459192276,
"reward_std": 0.06410035490989685,
"rewards/avg_thinking_length_func": 186.88746643066406,
"rewards/confidence_score_reward_func": 0.6191459894180298,
"rewards/correct_answer_reward_func": 0.6875,
"rewards/efficient_thinking_reward_func": 0.8100582820862734,
"rewards/format_and_efficient_reward_func": 0.44868165254592896,
"rewards/format_reward_func": 0.9952791929244995,
"rewards/num_xml_reward_func": 1.649810552597046,
"rewards/tool_execution_reward_func": 1.9959295988082886,
"rewards/visit_tool_reward_func": 0.8671329021453857,
"step": 209
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.005087209302325582,
"grad_norm": 0.005414160917662644,
"kl": 0.03399658203125,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 210
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0051114341085271315,
"grad_norm": 0.005397000227956369,
"kl": 0.033294677734375,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 211
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.005135658914728682,
"grad_norm": 0.005329822482164869,
"kl": 0.03271484375,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 212
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5622.0,
"completions/max_terminated_length": 5622.0,
"completions/mean_length": 2689.5,
"completions/mean_terminated_length": 2689.5,
"completions/min_length": 1080.0,
"completions/min_terminated_length": 1080.0,
"epoch": 0.005159883720930233,
"grad_norm": 0.004583885118409577,
"kl": 0.027679443359375,
"learning_rate": 2e-06,
"loss": 0.0001,
"num_tokens": 27444785.0,
"reward": 0.3377038240432739,
"reward_std": 0.03283514827489853,
"rewards/avg_thinking_length_func": 156.95558166503906,
"rewards/confidence_score_reward_func": 0.6069622039794922,
"rewards/correct_answer_reward_func": 0.515625,
"rewards/efficient_thinking_reward_func": 0.8533607950008524,
"rewards/format_and_efficient_reward_func": 0.3490750193595886,
"rewards/format_reward_func": 0.9963964819908142,
"rewards/num_xml_reward_func": 1.565781831741333,
"rewards/tool_execution_reward_func": 1.9799107313156128,
"rewards/visit_tool_reward_func": 0.886849582195282,
"step": 213
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.005184108527131783,
"grad_norm": 0.004553415372891503,
"kl": 0.02728271484375,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 214
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.005208333333333333,
"grad_norm": 0.004416753047475649,
"kl": 0.026763916015625,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 215
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.005232558139534884,
"grad_norm": 0.004302097167180992,
"kl": 0.02606201171875,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 216
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6509.0,
"completions/max_terminated_length": 6509.0,
"completions/mean_length": 3170.359375,
"completions/mean_terminated_length": 3170.359375,
"completions/min_length": 1316.0,
"completions/min_terminated_length": 1316.0,
"epoch": 0.005256782945736434,
"grad_norm": 0.00874079090702132,
"kl": 0.03131103515625,
"learning_rate": 2e-06,
"loss": 0.0002,
"num_tokens": 27886438.0,
"reward": 0.4546785354614258,
"reward_std": 0.13061311841011047,
"rewards/avg_thinking_length_func": 185.58987426757812,
"rewards/confidence_score_reward_func": 0.6329280138015747,
"rewards/correct_answer_reward_func": 0.671875,
"rewards/efficient_thinking_reward_func": 0.7895888587130873,
"rewards/format_and_efficient_reward_func": 0.43139761686325073,
"rewards/format_reward_func": 0.9971143007278442,
"rewards/num_xml_reward_func": 1.6065764427185059,
"rewards/tool_execution_reward_func": 1.9975961446762085,
"rewards/visit_tool_reward_func": 0.8967168927192688,
"step": 217
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.005281007751937985,
"grad_norm": 0.009254919184464793,
"kl": 0.031158447265625,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 218
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.005305232558139535,
"grad_norm": 0.008540278295280325,
"kl": 0.03131103515625,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 219
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.005329457364341085,
"grad_norm": 0.009027249196409619,
"kl": 0.031463623046875,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 220
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6125.0,
"completions/max_terminated_length": 6125.0,
"completions/mean_length": 2700.765625,
"completions/mean_terminated_length": 2700.765625,
"completions/min_length": 966.0,
"completions/min_terminated_length": 966.0,
"epoch": 0.005353682170542636,
"grad_norm": 0.001653975947803042,
"kl": 0.0260009765625,
"learning_rate": 2e-06,
"loss": 0.0001,
"num_tokens": 28309543.0,
"reward": 0.3285777270793915,
"reward_std": 0.013459177687764168,
"rewards/avg_thinking_length_func": 149.52700805664062,
"rewards/confidence_score_reward_func": 0.6043996214866638,
"rewards/correct_answer_reward_func": 0.5,
"rewards/efficient_thinking_reward_func": 0.8903335916310755,
"rewards/format_and_efficient_reward_func": 0.3600352108478546,
"rewards/format_reward_func": 0.996889591217041,
"rewards/num_xml_reward_func": 1.5710426568984985,
"rewards/tool_execution_reward_func": 1.9776184558868408,
"rewards/visit_tool_reward_func": 0.9032177925109863,
"step": 221
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.005377906976744186,
"grad_norm": 0.001652035863632615,
"kl": 0.0264892578125,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 222
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.005402131782945736,
"grad_norm": 0.0016513159446787636,
"kl": 0.0269775390625,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 223
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.005426356589147287,
"grad_norm": 0.0020335905228311916,
"kl": 0.027557373046875,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 224
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 4748.0,
"completions/max_terminated_length": 4748.0,
"completions/mean_length": 2978.90625,
"completions/mean_terminated_length": 2978.90625,
"completions/min_length": 1244.0,
"completions/min_terminated_length": 1244.0,
"epoch": 0.005450581395348837,
"grad_norm": 0.006026935047182901,
"kl": 0.03179931640625,
"learning_rate": 2e-06,
"loss": 0.0003,
"num_tokens": 28740848.0,
"reward": 0.4945339560508728,
"reward_std": 0.0744490772485733,
"rewards/avg_thinking_length_func": 172.45849609375,
"rewards/confidence_score_reward_func": 0.6167193651199341,
"rewards/correct_answer_reward_func": 0.765625,
"rewards/efficient_thinking_reward_func": 0.7966197226027097,
"rewards/format_and_efficient_reward_func": 0.512791097164154,
"rewards/format_reward_func": 0.9983228445053101,
"rewards/num_xml_reward_func": 1.6630462408065796,
"rewards/tool_execution_reward_func": 1.9971591234207153,
"rewards/visit_tool_reward_func": 0.9000678062438965,
"step": 225
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.005474806201550388,
"grad_norm": 0.005801070538806677,
"kl": 0.0323486328125,
"learning_rate": 2e-06,
"loss": 0.0003,
"step": 226
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.005499031007751938,
"grad_norm": 0.005789539677805553,
"kl": 0.03302001953125,
"learning_rate": 2e-06,
"loss": 0.0003,
"step": 227
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.005523255813953488,
"grad_norm": 0.005731300295942885,
"kl": 0.033935546875,
"learning_rate": 2e-06,
"loss": 0.0003,
"step": 228
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5315.0,
"completions/max_terminated_length": 5315.0,
"completions/mean_length": 2718.109375,
"completions/mean_terminated_length": 2718.109375,
"completions/min_length": 1049.0,
"completions/min_terminated_length": 1049.0,
"epoch": 0.005547480620155039,
"grad_norm": 0.0027604900450052977,
"kl": 0.03369140625,
"learning_rate": 2e-06,
"loss": 0.0,
"num_tokens": 29177563.0,
"reward": 0.33125773072242737,
"reward_std": 0.012095385231077671,
"rewards/avg_thinking_length_func": 138.28082275390625,
"rewards/confidence_score_reward_func": 0.588701605796814,
"rewards/correct_answer_reward_func": 0.5,
"rewards/efficient_thinking_reward_func": 0.8968424695250805,
"rewards/format_and_efficient_reward_func": 0.36526361107826233,
"rewards/format_reward_func": 0.9942506551742554,
"rewards/num_xml_reward_func": 1.484344720840454,
"rewards/tool_execution_reward_func": 1.9658281803131104,
"rewards/visit_tool_reward_func": 0.9050877094268799,
"step": 229
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.0055717054263565895,
"grad_norm": 0.0028469369049688264,
"kl": 0.03424072265625,
"learning_rate": 2e-06,
"loss": 0.0,
"step": 230
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.005595930232558139,
"grad_norm": 0.0029207200987881226,
"kl": 0.03466796875,
"learning_rate": 2e-06,
"loss": 0.0,
"step": 231
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.00562015503875969,
"grad_norm": 0.002891989345093088,
"kl": 0.03436279296875,
"learning_rate": 2e-06,
"loss": 0.0,
"step": 232
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5430.0,
"completions/max_terminated_length": 5430.0,
"completions/mean_length": 3147.21875,
"completions/mean_terminated_length": 3147.21875,
"completions/min_length": 1208.0,
"completions/min_terminated_length": 1208.0,
"epoch": 0.0056443798449612404,
"grad_norm": 0.008009912903442006,
"kl": 0.039306640625,
"learning_rate": 2e-06,
"loss": 0.0004,
"num_tokens": 29641355.0,
"reward": 0.45486128330230713,
"reward_std": 0.10010581463575363,
"rewards/avg_thinking_length_func": 154.7548828125,
"rewards/confidence_score_reward_func": 0.5910084247589111,
"rewards/correct_answer_reward_func": 0.71875,
"rewards/efficient_thinking_reward_func": 0.79141897353926,
"rewards/format_and_efficient_reward_func": 0.4532102346420288,
"rewards/format_reward_func": 0.9973268508911133,
"rewards/num_xml_reward_func": 1.6137380599975586,
"rewards/tool_execution_reward_func": 1.9840686321258545,
"rewards/visit_tool_reward_func": 0.9216470718383789,
"step": 233
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.005668604651162791,
"grad_norm": 0.008010434434161435,
"kl": 0.03924560546875,
"learning_rate": 2e-06,
"loss": 0.0004,
"step": 234
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.005692829457364341,
"grad_norm": 0.008059617739522514,
"kl": 0.03936767578125,
"learning_rate": 2e-06,
"loss": 0.0004,
"step": 235
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.005717054263565891,
"grad_norm": 0.008321692756210844,
"kl": 0.0400390625,
"learning_rate": 2e-06,
"loss": 0.0004,
"step": 236
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5409.0,
"completions/max_terminated_length": 5409.0,
"completions/mean_length": 2747.0,
"completions/mean_terminated_length": 2747.0,
"completions/min_length": 1028.0,
"completions/min_terminated_length": 1028.0,
"epoch": 0.005741279069767442,
"grad_norm": 0.00607005113841099,
"kl": 0.0380859375,
"learning_rate": 2e-06,
"loss": 0.0001,
"num_tokens": 30090756.0,
"reward": 0.3245881199836731,
"reward_std": 0.030338726937770844,
"rewards/avg_thinking_length_func": 118.96601867675781,
"rewards/confidence_score_reward_func": 0.5715887546539307,
"rewards/correct_answer_reward_func": 0.515625,
"rewards/efficient_thinking_reward_func": 0.7931376609790313,
"rewards/format_and_efficient_reward_func": 0.3051683306694031,
"rewards/format_reward_func": 0.9918498396873474,
"rewards/num_xml_reward_func": 1.335392713546753,
"rewards/tool_execution_reward_func": 1.956681728363037,
"rewards/visit_tool_reward_func": 0.8923399448394775,
"step": 237
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.005765503875968993,
"grad_norm": 0.006076971580972504,
"kl": 0.03839111328125,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 238
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.005789728682170542,
"grad_norm": 0.005795692009836339,
"kl": 0.0380859375,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 239
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.005813953488372093,
"grad_norm": 0.005478655391819232,
"kl": 0.0377197265625,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 240
}
],
"logging_steps": 1,
"max_steps": 640,
"num_input_tokens_seen": 30090756,
"num_train_epochs": 1,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}