diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9133 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5792703951451624, + "eval_steps": 500, + "global_step": 700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "ave_tool_num": 0.6145833333333334, + "completion_length": 65.67708333333333, + "epoch": 0.0008275291359216606, + "grad_norm": 3.687605619430542, + "kl": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "reward": 0.38072917331010103, + "reward_std": 0.36328611709177494, + "rewards/accuracy_reward": 0.38072917331010103, + "step": 1 + }, + { + "ave_tool_num": 0.6354166666666666, + "completion_length": 64.54166666666667, + "epoch": 0.0016550582718433211, + "grad_norm": 2.8233048915863037, + "kl": 0.0003399848937988281, + "learning_rate": 1e-06, + "loss": 0.0, + "reward": 0.3552083360652129, + "reward_std": 0.3107606054594119, + "rewards/accuracy_reward": 0.3552083360652129, + "step": 2 + }, + { + "ave_tool_num": 0.7083333333333334, + "completion_length": 68.69791666666667, + "epoch": 0.0024825874077649816, + "grad_norm": 4.700760841369629, + "kl": 0.00030930836995442707, + "learning_rate": 1e-06, + "loss": 0.0, + "reward": 0.33145833512147266, + "reward_std": 0.3468545638024807, + "rewards/accuracy_reward": 0.33145833512147266, + "step": 3 + }, + { + "ave_tool_num": 0.59375, + "completion_length": 71.1875, + "epoch": 0.0033101165436866422, + "grad_norm": 3.1818137168884277, + "kl": 0.00038623809814453125, + "learning_rate": 1e-06, + "loss": 0.0, + "reward": 0.3880208308498065, + "reward_std": 0.36459839468201, + "rewards/accuracy_reward": 0.3880208308498065, + "step": 4 + }, + { + "ave_tool_num": 0.6041666666666666, + "completion_length": 73.41666666666667, + "epoch": 0.0041376456796083025, + "grad_norm": 2.789335250854492, + "kl": 0.0005177656809488932, + "learning_rate": 1e-06, + "loss": 0.0, + "reward": 0.5219791680574417, + "reward_std": 0.41248805820941925, + "rewards/accuracy_reward": 0.5219791680574417, + "step": 5 + }, + { + "ave_tool_num": 0.59375, + "completion_length": 73.27083333333333, + "epoch": 0.004965174815529963, + "grad_norm": 2.524054527282715, + "kl": 0.0004936854044596354, + "learning_rate": 1e-06, + "loss": 0.0, + "reward": 0.34947916384165484, + "reward_std": 0.30716510241230327, + "rewards/accuracy_reward": 0.34947916384165484, + "step": 6 + }, + { + "ave_tool_num": 0.53125, + "completion_length": 77.28125, + "epoch": 0.005792703951451624, + "grad_norm": 2.392362117767334, + "kl": 0.0005311171213785807, + "learning_rate": 1e-06, + "loss": 0.0, + "reward": 0.3123958309491475, + "reward_std": 0.32299235214789707, + "rewards/accuracy_reward": 0.3123958309491475, + "step": 7 + }, + { + "ave_tool_num": 0.4479166666666667, + "completion_length": 74.58333333333333, + "epoch": 0.0066202330873732845, + "grad_norm": 2.302565097808838, + "kl": 0.0006212393442789713, + "learning_rate": 1e-06, + "loss": 0.0, + "reward": 0.4179166716833909, + "reward_std": 0.40247274314363796, + "rewards/accuracy_reward": 0.4179166716833909, + "step": 8 + }, + { + "ave_tool_num": 0.6458333333333334, + "completion_length": 71.0, + "epoch": 0.007447762223294945, + "grad_norm": 2.7281908988952637, + "kl": 0.0006186167399088541, + "learning_rate": 1e-06, + "loss": 0.0, + "reward": 0.3823958324889342, + "reward_std": 0.3120509621997674, + "rewards/accuracy_reward": 0.3823958324889342, + "step": 9 + }, + { + "ave_tool_num": 0.4791666666666667, + "completion_length": 66.10416666666667, + "epoch": 0.008275291359216605, + "grad_norm": 3.340848684310913, + "kl": 0.0007521311442057291, + "learning_rate": 1e-06, + "loss": 0.0, + "reward": 0.44187499831120175, + "reward_std": 0.3964681203166644, + "rewards/accuracy_reward": 0.44187499831120175, + "step": 10 + }, + { + "ave_tool_num": 0.4479166666666667, + "completion_length": 86.86458333333333, + "epoch": 0.009102820495138267, + "grad_norm": 3.190786838531494, + "kl": 0.000995318094889323, + "learning_rate": 1e-06, + "loss": 0.0, + "reward": 0.3650000036383669, + "reward_std": 0.3726381715387106, + "rewards/accuracy_reward": 0.3650000036383669, + "step": 11 + }, + { + "ave_tool_num": 0.4791666666666667, + "completion_length": 72.8125, + "epoch": 0.009930349631059926, + "grad_norm": 2.569927930831909, + "kl": 0.0014209747314453125, + "learning_rate": 1e-06, + "loss": 0.0001, + "reward": 0.5018749994536241, + "reward_std": 0.42092933381597203, + "rewards/accuracy_reward": 0.5018749994536241, + "step": 12 + }, + { + "ave_tool_num": 0.4583333333333333, + "completion_length": 77.75, + "epoch": 0.010757878766981588, + "grad_norm": 3.4380650520324707, + "kl": 0.0014429092407226562, + "learning_rate": 1e-06, + "loss": 0.0001, + "reward": 0.3452083344260852, + "reward_std": 0.3447565163175265, + "rewards/accuracy_reward": 0.3452083344260852, + "step": 13 + }, + { + "ave_tool_num": 0.4375, + "completion_length": 84.02083333333333, + "epoch": 0.011585407902903248, + "grad_norm": 2.0795938968658447, + "kl": 0.0015691121419270833, + "learning_rate": 1e-06, + "loss": 0.0001, + "reward": 0.293958330526948, + "reward_std": 0.3163194600492716, + "rewards/accuracy_reward": 0.293958330526948, + "step": 14 + }, + { + "ave_tool_num": 0.375, + "completion_length": 74.52083333333333, + "epoch": 0.01241293703882491, + "grad_norm": 2.4282796382904053, + "kl": 0.0019931793212890625, + "learning_rate": 1e-06, + "loss": 0.0001, + "reward": 0.5118749998509884, + "reward_std": 0.395802674194177, + "rewards/accuracy_reward": 0.5118749998509884, + "step": 15 + }, + { + "ave_tool_num": 0.4583333333333333, + "completion_length": 77.92708333333333, + "epoch": 0.013240466174746569, + "grad_norm": 2.45853328704834, + "kl": 0.002288182576497396, + "learning_rate": 1e-06, + "loss": 0.0001, + "reward": 0.4009375013411045, + "reward_std": 0.38005323459704715, + "rewards/accuracy_reward": 0.4009375013411045, + "step": 16 + }, + { + "ave_tool_num": 0.3645833333333333, + "completion_length": 73.5625, + "epoch": 0.01406799531066823, + "grad_norm": 15.817559242248535, + "kl": 0.0027027130126953125, + "learning_rate": 1e-06, + "loss": 0.0001, + "reward": 0.37656250471870106, + "reward_std": 0.31493017946680385, + "rewards/accuracy_reward": 0.37656250471870106, + "step": 17 + }, + { + "ave_tool_num": 0.40625, + "completion_length": 71.82291666666667, + "epoch": 0.01489552444658989, + "grad_norm": 2.51556658744812, + "kl": 0.002956390380859375, + "learning_rate": 1e-06, + "loss": 0.0001, + "reward": 0.4659374977151553, + "reward_std": 0.4030299248794715, + "rewards/accuracy_reward": 0.4659374977151553, + "step": 18 + }, + { + "ave_tool_num": 0.4791666666666667, + "completion_length": 72.85416666666667, + "epoch": 0.01572305358251155, + "grad_norm": 2.344104766845703, + "kl": 0.0029093424479166665, + "learning_rate": 1e-06, + "loss": 0.0001, + "reward": 0.40958333636323613, + "reward_std": 0.4137238909800847, + "rewards/accuracy_reward": 0.40958333636323613, + "step": 19 + }, + { + "ave_tool_num": 0.3958333333333333, + "completion_length": 65.88541666666667, + "epoch": 0.01655058271843321, + "grad_norm": 2.9661552906036377, + "kl": 0.00408935546875, + "learning_rate": 1e-06, + "loss": 0.0002, + "reward": 0.5032291638975342, + "reward_std": 0.3095587318142255, + "rewards/accuracy_reward": 0.5032291638975342, + "step": 20 + }, + { + "ave_tool_num": 0.3958333333333333, + "completion_length": 75.36458333333333, + "epoch": 0.017378111854354873, + "grad_norm": 3.7749435901641846, + "kl": 0.0040124257405598955, + "learning_rate": 1e-06, + "loss": 0.0002, + "reward": 0.43718750464419526, + "reward_std": 0.36893431345621747, + "rewards/accuracy_reward": 0.43718750464419526, + "step": 21 + }, + { + "ave_tool_num": 0.2916666666666667, + "completion_length": 70.92708333333333, + "epoch": 0.018205640990276533, + "grad_norm": 2.3006138801574707, + "kl": 0.004107157389322917, + "learning_rate": 1e-06, + "loss": 0.0002, + "reward": 0.49499999980131787, + "reward_std": 0.36158617710073787, + "rewards/accuracy_reward": 0.49499999980131787, + "step": 22 + }, + { + "ave_tool_num": 0.40625, + "completion_length": 81.92708333333333, + "epoch": 0.019033170126198193, + "grad_norm": 2.7853517532348633, + "kl": 0.004355112711588542, + "learning_rate": 1e-06, + "loss": 0.0002, + "reward": 0.42625000700354576, + "reward_std": 0.4067776973048846, + "rewards/accuracy_reward": 0.42625000700354576, + "step": 23 + }, + { + "ave_tool_num": 0.3125, + "completion_length": 91.17708333333333, + "epoch": 0.019860699262119853, + "grad_norm": 1.9532500505447388, + "kl": 0.004709879557291667, + "learning_rate": 1e-06, + "loss": 0.0002, + "reward": 0.4234375034769376, + "reward_std": 0.3190951943397522, + "rewards/accuracy_reward": 0.4234375034769376, + "step": 24 + }, + { + "ave_tool_num": 0.34375, + "completion_length": 75.29166666666667, + "epoch": 0.020688228398041516, + "grad_norm": 2.2100584506988525, + "kl": 0.00644683837890625, + "learning_rate": 1e-06, + "loss": 0.0003, + "reward": 0.4388541605633994, + "reward_std": 0.3791306624189019, + "rewards/accuracy_reward": 0.4388541605633994, + "step": 25 + }, + { + "ave_tool_num": 0.3125, + "completion_length": 77.22916666666667, + "epoch": 0.021515757533963176, + "grad_norm": 2.7899577617645264, + "kl": 0.006688435872395833, + "learning_rate": 1e-06, + "loss": 0.0003, + "reward": 0.4389583344260852, + "reward_std": 0.3954159927864869, + "rewards/accuracy_reward": 0.4389583344260852, + "step": 26 + }, + { + "ave_tool_num": 0.2604166666666667, + "completion_length": 82.71875, + "epoch": 0.022343286669884835, + "grad_norm": 1.710020661354065, + "kl": 0.007265726725260417, + "learning_rate": 1e-06, + "loss": 0.0003, + "reward": 0.5147916624943415, + "reward_std": 0.39067064225673676, + "rewards/accuracy_reward": 0.5147916624943415, + "step": 27 + }, + { + "ave_tool_num": 0.21875, + "completion_length": 71.55208333333333, + "epoch": 0.023170815805806495, + "grad_norm": 2.4213032722473145, + "kl": 0.007489522298177083, + "learning_rate": 1e-06, + "loss": 0.0003, + "reward": 0.5606249993046125, + "reward_std": 0.3689138777554035, + "rewards/accuracy_reward": 0.5606249993046125, + "step": 28 + }, + { + "ave_tool_num": 0.19791666666666666, + "completion_length": 76.76041666666667, + "epoch": 0.023998344941728155, + "grad_norm": 2.6013545989990234, + "kl": 0.009526570638020834, + "learning_rate": 1e-06, + "loss": 0.0004, + "reward": 0.4172916719689965, + "reward_std": 0.3677075362453858, + "rewards/accuracy_reward": 0.4172916719689965, + "step": 29 + }, + { + "ave_tool_num": 0.22916666666666666, + "completion_length": 72.90625, + "epoch": 0.02482587407764982, + "grad_norm": 2.1115834712982178, + "kl": 0.011133829752604166, + "learning_rate": 1e-06, + "loss": 0.0004, + "reward": 0.5239583353201548, + "reward_std": 0.37030617768565816, + "rewards/accuracy_reward": 0.5239583353201548, + "step": 30 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 81.0, + "epoch": 0.025653403213571478, + "grad_norm": 2.8508222103118896, + "kl": 0.007921854654947916, + "learning_rate": 1e-06, + "loss": 0.0003, + "reward": 0.43270833790302277, + "reward_std": 0.33646651978294057, + "rewards/accuracy_reward": 0.43270833790302277, + "step": 31 + }, + { + "ave_tool_num": 0.125, + "completion_length": 78.20833333333333, + "epoch": 0.026480932349493138, + "grad_norm": 2.1016619205474854, + "kl": 0.0103607177734375, + "learning_rate": 1e-06, + "loss": 0.0004, + "reward": 0.5217708374063174, + "reward_std": 0.3840571269392967, + "rewards/accuracy_reward": 0.5217708374063174, + "step": 32 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 78.80208333333333, + "epoch": 0.027308461485414798, + "grad_norm": 2.4991519451141357, + "kl": 0.009682973225911459, + "learning_rate": 1e-06, + "loss": 0.0004, + "reward": 0.5381250021358331, + "reward_std": 0.3248012171437343, + "rewards/accuracy_reward": 0.5381250021358331, + "step": 33 + }, + { + "ave_tool_num": 0.19791666666666666, + "completion_length": 69.47916666666667, + "epoch": 0.02813599062133646, + "grad_norm": 2.313563585281372, + "kl": 0.013397216796875, + "learning_rate": 1e-06, + "loss": 0.0005, + "reward": 0.5530208324392637, + "reward_std": 0.3375113798926274, + "rewards/accuracy_reward": 0.5530208324392637, + "step": 34 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 77.07291666666667, + "epoch": 0.02896351975725812, + "grad_norm": 2.8639955520629883, + "kl": 0.01644134521484375, + "learning_rate": 1e-06, + "loss": 0.0007, + "reward": 0.3894791665176551, + "reward_std": 0.39523400242129963, + "rewards/accuracy_reward": 0.3894791665176551, + "step": 35 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 72.92708333333333, + "epoch": 0.02979104889317978, + "grad_norm": 2.813983917236328, + "kl": 0.013661702473958334, + "learning_rate": 1e-06, + "loss": 0.0005, + "reward": 0.5276041639347872, + "reward_std": 0.3725420751919349, + "rewards/accuracy_reward": 0.5276041639347872, + "step": 36 + }, + { + "ave_tool_num": 0.17708333333333334, + "completion_length": 71.86458333333333, + "epoch": 0.03061857802910144, + "grad_norm": 2.5993218421936035, + "kl": 0.013849894205729166, + "learning_rate": 1e-06, + "loss": 0.0006, + "reward": 0.5138541633884112, + "reward_std": 0.3418373266855876, + "rewards/accuracy_reward": 0.5138541633884112, + "step": 37 + }, + { + "ave_tool_num": 0.16666666666666666, + "completion_length": 72.5625, + "epoch": 0.0314461071650231, + "grad_norm": 3.332808494567871, + "kl": 0.021245320638020832, + "learning_rate": 1e-06, + "loss": 0.0008, + "reward": 0.42874999934186536, + "reward_std": 0.32438094230989617, + "rewards/accuracy_reward": 0.42874999934186536, + "step": 38 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 77.11458333333333, + "epoch": 0.032273636300944764, + "grad_norm": 2.975217819213867, + "kl": 0.015080769856770834, + "learning_rate": 1e-06, + "loss": 0.0006, + "reward": 0.4168750022848447, + "reward_std": 0.3363594835003217, + "rewards/accuracy_reward": 0.4168750022848447, + "step": 39 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 65.22916666666667, + "epoch": 0.03310116543686642, + "grad_norm": 2.4516894817352295, + "kl": 0.014780680338541666, + "learning_rate": 1e-06, + "loss": 0.0006, + "reward": 0.6432291666666666, + "reward_std": 0.32696346659213305, + "rewards/accuracy_reward": 0.6432291666666666, + "step": 40 + }, + { + "ave_tool_num": 0.125, + "completion_length": 73.97916666666667, + "epoch": 0.03392869457278808, + "grad_norm": 2.3031206130981445, + "kl": 0.017913818359375, + "learning_rate": 1e-06, + "loss": 0.0007, + "reward": 0.5205208410819372, + "reward_std": 0.364706643546621, + "rewards/accuracy_reward": 0.5205208410819372, + "step": 41 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 71.0625, + "epoch": 0.034756223708709746, + "grad_norm": 3.1562440395355225, + "kl": 0.022003173828125, + "learning_rate": 1e-06, + "loss": 0.0009, + "reward": 0.3971875024338563, + "reward_std": 0.3577278293669224, + "rewards/accuracy_reward": 0.3971875024338563, + "step": 42 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 90.1875, + "epoch": 0.0355837528446314, + "grad_norm": 4.000753402709961, + "kl": 0.021642049153645832, + "learning_rate": 1e-06, + "loss": 0.0009, + "reward": 0.4338541701436043, + "reward_std": 0.3337518473466237, + "rewards/accuracy_reward": 0.4338541701436043, + "step": 43 + }, + { + "ave_tool_num": 0.125, + "completion_length": 85.53125, + "epoch": 0.036411281980553066, + "grad_norm": 1.83049738407135, + "kl": 0.018803914388020832, + "learning_rate": 1e-06, + "loss": 0.0008, + "reward": 0.3570833317935467, + "reward_std": 0.40495876719554263, + "rewards/accuracy_reward": 0.3570833317935467, + "step": 44 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 88.80208333333333, + "epoch": 0.03723881111647473, + "grad_norm": 2.0824763774871826, + "kl": 0.016888936360677082, + "learning_rate": 1e-06, + "loss": 0.0007, + "reward": 0.5151041696468989, + "reward_std": 0.41253070160746574, + "rewards/accuracy_reward": 0.5151041696468989, + "step": 45 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 71.8125, + "epoch": 0.038066340252396386, + "grad_norm": 2.011597156524658, + "kl": 0.018610636393229168, + "learning_rate": 1e-06, + "loss": 0.0007, + "reward": 0.5507291741669178, + "reward_std": 0.3596516201893489, + "rewards/accuracy_reward": 0.5507291741669178, + "step": 46 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 72.71875, + "epoch": 0.03889386938831805, + "grad_norm": 9.884456634521484, + "kl": 0.029095967610677082, + "learning_rate": 1e-06, + "loss": 0.0012, + "reward": 0.5670833302040895, + "reward_std": 0.35869242002566654, + "rewards/accuracy_reward": 0.5670833302040895, + "step": 47 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 73.22916666666667, + "epoch": 0.039721398524239705, + "grad_norm": 2.1355645656585693, + "kl": 0.022616068522135418, + "learning_rate": 1e-06, + "loss": 0.0009, + "reward": 0.47739583253860474, + "reward_std": 0.39569361756245297, + "rewards/accuracy_reward": 0.47739583253860474, + "step": 48 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 76.35416666666667, + "epoch": 0.04054892766016137, + "grad_norm": 2.0273003578186035, + "kl": 0.02371978759765625, + "learning_rate": 1e-06, + "loss": 0.0009, + "reward": 0.5472916687528292, + "reward_std": 0.3669271071751912, + "rewards/accuracy_reward": 0.5472916687528292, + "step": 49 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 80.59375, + "epoch": 0.04137645679608303, + "grad_norm": 3.362546682357788, + "kl": 0.019856770833333332, + "learning_rate": 1e-06, + "loss": 0.0008, + "reward": 0.5693750008940697, + "reward_std": 0.37384110627075035, + "rewards/accuracy_reward": 0.5693750008940697, + "step": 50 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 66.48958333333333, + "epoch": 0.04220398593200469, + "grad_norm": 2.030670404434204, + "kl": 0.024220784505208332, + "learning_rate": 1e-06, + "loss": 0.001, + "reward": 0.5455208371082941, + "reward_std": 0.33280155311028164, + "rewards/accuracy_reward": 0.5455208371082941, + "step": 51 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 81.32291666666667, + "epoch": 0.04303151506792635, + "grad_norm": 2.0850930213928223, + "kl": 0.0300140380859375, + "learning_rate": 1e-06, + "loss": 0.0012, + "reward": 0.5513541661202908, + "reward_std": 0.32255622930824757, + "rewards/accuracy_reward": 0.5513541661202908, + "step": 52 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 66.78125, + "epoch": 0.04385904420384801, + "grad_norm": 2.2691259384155273, + "kl": 0.026349385579427082, + "learning_rate": 1e-06, + "loss": 0.0011, + "reward": 0.5747916710873445, + "reward_std": 0.3266766245166461, + "rewards/accuracy_reward": 0.5747916710873445, + "step": 53 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 72.125, + "epoch": 0.04468657333976967, + "grad_norm": 2.1348917484283447, + "kl": 0.0281829833984375, + "learning_rate": 1e-06, + "loss": 0.0011, + "reward": 0.5356250032782555, + "reward_std": 0.35159418483575183, + "rewards/accuracy_reward": 0.5356250032782555, + "step": 54 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 92.92708333333333, + "epoch": 0.045514102475691334, + "grad_norm": 2.132793426513672, + "kl": 0.033152262369791664, + "learning_rate": 1e-06, + "loss": 0.0013, + "reward": 0.49937499935428303, + "reward_std": 0.3804554690917333, + "rewards/accuracy_reward": 0.49937499935428303, + "step": 55 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 81.54166666666667, + "epoch": 0.04634163161161299, + "grad_norm": 2.429373264312744, + "kl": 0.026219685872395832, + "learning_rate": 1e-06, + "loss": 0.0011, + "reward": 0.45479167252779007, + "reward_std": 0.3285732716321945, + "rewards/accuracy_reward": 0.45479167252779007, + "step": 56 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 69.15625, + "epoch": 0.047169160747534654, + "grad_norm": 3.324183225631714, + "kl": 0.042338053385416664, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.5339583319922289, + "reward_std": 0.37656143059333164, + "rewards/accuracy_reward": 0.5339583319922289, + "step": 57 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 92.625, + "epoch": 0.04799668988345631, + "grad_norm": 2.167320728302002, + "kl": 0.047826131184895836, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.45802083487312, + "reward_std": 0.3691972717642784, + "rewards/accuracy_reward": 0.45802083487312, + "step": 58 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 75.59375, + "epoch": 0.048824219019377973, + "grad_norm": 1.8926162719726562, + "kl": 0.032297770182291664, + "learning_rate": 1e-06, + "loss": 0.0013, + "reward": 0.5481250000496706, + "reward_std": 0.3484482280910015, + "rewards/accuracy_reward": 0.5481250000496706, + "step": 59 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 67.14583333333333, + "epoch": 0.04965174815529964, + "grad_norm": 2.1781420707702637, + "kl": 0.03143310546875, + "learning_rate": 1e-06, + "loss": 0.0013, + "reward": 0.5081250034272671, + "reward_std": 0.36099957426389057, + "rewards/accuracy_reward": 0.5081250034272671, + "step": 60 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 74.41666666666667, + "epoch": 0.05047927729122129, + "grad_norm": 2.071403741836548, + "kl": 0.051060994466145836, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.408958338201046, + "reward_std": 0.3547427176187436, + "rewards/accuracy_reward": 0.408958338201046, + "step": 61 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 75.5625, + "epoch": 0.051306806427142956, + "grad_norm": 2.9192886352539062, + "kl": 0.037353515625, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.4878125029305617, + "reward_std": 0.3637554782132308, + "rewards/accuracy_reward": 0.4878125029305617, + "step": 62 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 87.05208333333333, + "epoch": 0.05213433556306462, + "grad_norm": 3.907299041748047, + "kl": 0.032908121744791664, + "learning_rate": 1e-06, + "loss": 0.0013, + "reward": 0.4937499991307656, + "reward_std": 0.36045203854640323, + "rewards/accuracy_reward": 0.4937499991307656, + "step": 63 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 76.53125, + "epoch": 0.052961864698986276, + "grad_norm": 3.1855220794677734, + "kl": 0.0250396728515625, + "learning_rate": 1e-06, + "loss": 0.001, + "reward": 0.49562500417232513, + "reward_std": 0.41566236813863117, + "rewards/accuracy_reward": 0.49562500417232513, + "step": 64 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 67.9375, + "epoch": 0.05378939383490794, + "grad_norm": 3.258819341659546, + "kl": 0.036351521809895836, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.5430208320418993, + "reward_std": 0.385436254243056, + "rewards/accuracy_reward": 0.5430208320418993, + "step": 65 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 82.98958333333333, + "epoch": 0.054616922970829596, + "grad_norm": 1.9243744611740112, + "kl": 0.030497233072916668, + "learning_rate": 1e-06, + "loss": 0.0012, + "reward": 0.42406250288089115, + "reward_std": 0.3583800954123338, + "rewards/accuracy_reward": 0.42406250288089115, + "step": 66 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 72.1875, + "epoch": 0.05544445210675126, + "grad_norm": 2.385833501815796, + "kl": 0.047953287760416664, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.4985416680574417, + "reward_std": 0.36579764261841774, + "rewards/accuracy_reward": 0.4985416680574417, + "step": 67 + }, + { + "ave_tool_num": 0.16666666666666666, + "completion_length": 66.38541666666667, + "epoch": 0.05627198124267292, + "grad_norm": 3.092698097229004, + "kl": 0.052225748697916664, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.6913541741669178, + "reward_std": 0.3443136215209961, + "rewards/accuracy_reward": 0.6913541741669178, + "step": 68 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 76.55208333333333, + "epoch": 0.05709951037859458, + "grad_norm": 2.5485997200012207, + "kl": 0.0429534912109375, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.5273958289374908, + "reward_std": 0.35629118606448174, + "rewards/accuracy_reward": 0.5273958289374908, + "step": 69 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 65.86458333333333, + "epoch": 0.05792703951451624, + "grad_norm": 4.600788116455078, + "kl": 0.040191650390625, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.44093750168879825, + "reward_std": 0.3719057651857535, + "rewards/accuracy_reward": 0.44093750168879825, + "step": 70 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 71.125, + "epoch": 0.0587545686504379, + "grad_norm": 2.277543544769287, + "kl": 0.037495930989583336, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.6015625049670538, + "reward_std": 0.381665733953317, + "rewards/accuracy_reward": 0.6015625049670538, + "step": 71 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 73.72916666666667, + "epoch": 0.05958209778635956, + "grad_norm": 1.9522491693496704, + "kl": 0.044820149739583336, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.6833333323399226, + "reward_std": 0.34054601813356083, + "rewards/accuracy_reward": 0.6833333323399226, + "step": 72 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 61.979166666666664, + "epoch": 0.060409626922281225, + "grad_norm": 2.2071452140808105, + "kl": 0.061767578125, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.5990624961753687, + "reward_std": 0.361715796093146, + "rewards/accuracy_reward": 0.5990624961753687, + "step": 73 + }, + { + "ave_tool_num": 0.20833333333333334, + "completion_length": 76.375, + "epoch": 0.06123715605820288, + "grad_norm": 1.9740723371505737, + "kl": 0.0379486083984375, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.44447916994492215, + "reward_std": 0.3240007509787877, + "rewards/accuracy_reward": 0.44447916994492215, + "step": 74 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 66.40625, + "epoch": 0.062064685194124544, + "grad_norm": 2.02036714553833, + "kl": 0.041651407877604164, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.43041667093833286, + "reward_std": 0.3399979795018832, + "rewards/accuracy_reward": 0.43041667093833286, + "step": 75 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 72.3125, + "epoch": 0.0628922143300462, + "grad_norm": 3.04404616355896, + "kl": 0.044097900390625, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.4487499964113037, + "reward_std": 0.34006026821831864, + "rewards/accuracy_reward": 0.4487499964113037, + "step": 76 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 72.02083333333333, + "epoch": 0.06371974346596787, + "grad_norm": 2.000454902648926, + "kl": 0.052652994791666664, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.6081250011920929, + "reward_std": 0.3188822753727436, + "rewards/accuracy_reward": 0.6081250011920929, + "step": 77 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 76.625, + "epoch": 0.06454727260188953, + "grad_norm": 3.751617431640625, + "kl": 0.0370635986328125, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.6051041595637798, + "reward_std": 0.33085890486836433, + "rewards/accuracy_reward": 0.6051041595637798, + "step": 78 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 77.91666666666667, + "epoch": 0.06537480173781118, + "grad_norm": 1.797606348991394, + "kl": 0.035237630208333336, + "learning_rate": 1e-06, + "loss": 0.0014, + "reward": 0.6705208296577135, + "reward_std": 0.31878550599018735, + "rewards/accuracy_reward": 0.6705208296577135, + "step": 79 + }, + { + "ave_tool_num": 0.19791666666666666, + "completion_length": 68.375, + "epoch": 0.06620233087373284, + "grad_norm": 2.437802314758301, + "kl": 0.046788533528645836, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.6250000049670538, + "reward_std": 0.35698052495718, + "rewards/accuracy_reward": 0.6250000049670538, + "step": 80 + }, + { + "ave_tool_num": 0.1875, + "completion_length": 65.17708333333333, + "epoch": 0.06702986000965451, + "grad_norm": 2.6624574661254883, + "kl": 0.033429463704427086, + "learning_rate": 1e-06, + "loss": 0.0013, + "reward": 0.5828125017384688, + "reward_std": 0.27249022262791794, + "rewards/accuracy_reward": 0.5828125017384688, + "step": 81 + }, + { + "ave_tool_num": 0.19791666666666666, + "completion_length": 73.14583333333333, + "epoch": 0.06785738914557617, + "grad_norm": 1.9140758514404297, + "kl": 0.038645426432291664, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.5407291625936826, + "reward_std": 0.31591786816716194, + "rewards/accuracy_reward": 0.5407291625936826, + "step": 82 + }, + { + "ave_tool_num": 0.2708333333333333, + "completion_length": 77.23958333333333, + "epoch": 0.06868491828149782, + "grad_norm": 2.173022508621216, + "kl": 0.045450846354166664, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.5088541594644388, + "reward_std": 0.32267672816912335, + "rewards/accuracy_reward": 0.5088541594644388, + "step": 83 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 69.75, + "epoch": 0.06951244741741949, + "grad_norm": 3.4886348247528076, + "kl": 0.036865234375, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.450312502682209, + "reward_std": 0.36511879911025363, + "rewards/accuracy_reward": 0.450312502682209, + "step": 84 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 58.947916666666664, + "epoch": 0.07033997655334115, + "grad_norm": 2.7444026470184326, + "kl": 0.054026285807291664, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.6114583313465118, + "reward_std": 0.29948860593140125, + "rewards/accuracy_reward": 0.6114583313465118, + "step": 85 + }, + { + "ave_tool_num": 0.16666666666666666, + "completion_length": 68.36458333333333, + "epoch": 0.0711675056892628, + "grad_norm": 3.821908473968506, + "kl": 0.07298787434895833, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.5954166675607363, + "reward_std": 0.3094818902512391, + "rewards/accuracy_reward": 0.5954166675607363, + "step": 86 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 72.47916666666667, + "epoch": 0.07199503482518448, + "grad_norm": 3.217932939529419, + "kl": 0.039693196614583336, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.5587499979883432, + "reward_std": 0.3754260800778866, + "rewards/accuracy_reward": 0.5587499979883432, + "step": 87 + }, + { + "ave_tool_num": 0.17708333333333334, + "completion_length": 72.67708333333333, + "epoch": 0.07282256396110613, + "grad_norm": 2.2279629707336426, + "kl": 0.056427001953125, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.3909374997019768, + "reward_std": 0.35039687156677246, + "rewards/accuracy_reward": 0.3909374997019768, + "step": 88 + }, + { + "ave_tool_num": 0.20833333333333334, + "completion_length": 67.77083333333333, + "epoch": 0.07365009309702779, + "grad_norm": 2.8139841556549072, + "kl": 0.03839111328125, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.5513541661202908, + "reward_std": 0.30517816729843616, + "rewards/accuracy_reward": 0.5513541661202908, + "step": 89 + }, + { + "ave_tool_num": 0.16666666666666666, + "completion_length": 71.53125, + "epoch": 0.07447762223294946, + "grad_norm": 4.97050666809082, + "kl": 0.03057861328125, + "learning_rate": 1e-06, + "loss": 0.0012, + "reward": 0.5953124985098839, + "reward_std": 0.38752932846546173, + "rewards/accuracy_reward": 0.5953124985098839, + "step": 90 + }, + { + "ave_tool_num": 0.1875, + "completion_length": 70.0625, + "epoch": 0.07530515136887111, + "grad_norm": 4.379319667816162, + "kl": 0.030202229817708332, + "learning_rate": 1e-06, + "loss": 0.0012, + "reward": 0.5593749967714151, + "reward_std": 0.3371945700297753, + "rewards/accuracy_reward": 0.5593749967714151, + "step": 91 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 74.61458333333333, + "epoch": 0.07613268050479277, + "grad_norm": 2.2585816383361816, + "kl": 0.052846272786458336, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.6656250078231096, + "reward_std": 0.2909380483130614, + "rewards/accuracy_reward": 0.6656250078231096, + "step": 92 + }, + { + "ave_tool_num": 0.1875, + "completion_length": 75.29166666666667, + "epoch": 0.07696020964071443, + "grad_norm": 2.6250510215759277, + "kl": 0.028594970703125, + "learning_rate": 1e-06, + "loss": 0.0011, + "reward": 0.4998958383997281, + "reward_std": 0.37181190152963, + "rewards/accuracy_reward": 0.4998958383997281, + "step": 93 + }, + { + "ave_tool_num": 0.25, + "completion_length": 68.33333333333333, + "epoch": 0.0777877387766361, + "grad_norm": 2.723007917404175, + "kl": 0.033284505208333336, + "learning_rate": 1e-06, + "loss": 0.0013, + "reward": 0.5479166644314925, + "reward_std": 0.34167717583477497, + "rewards/accuracy_reward": 0.5479166644314925, + "step": 94 + }, + { + "ave_tool_num": 0.23958333333333334, + "completion_length": 68.11458333333333, + "epoch": 0.07861526791255775, + "grad_norm": 2.5346224308013916, + "kl": 0.037760416666666664, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.5022916669646899, + "reward_std": 0.4038414532939593, + "rewards/accuracy_reward": 0.5022916669646899, + "step": 95 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 70.13541666666667, + "epoch": 0.07944279704847941, + "grad_norm": 2.780632734298706, + "kl": 0.048309326171875, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.3776041669771075, + "reward_std": 0.34594897739589214, + "rewards/accuracy_reward": 0.3776041669771075, + "step": 96 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 72.94791666666667, + "epoch": 0.08027032618440108, + "grad_norm": 2.500133752822876, + "kl": 0.044443766276041664, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.4633333347737789, + "reward_std": 0.3085879795253277, + "rewards/accuracy_reward": 0.4633333347737789, + "step": 97 + }, + { + "ave_tool_num": 0.1875, + "completion_length": 82.125, + "epoch": 0.08109785532032274, + "grad_norm": 2.6109557151794434, + "kl": 0.036895751953125, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.596666673819224, + "reward_std": 0.40472638979554176, + "rewards/accuracy_reward": 0.596666673819224, + "step": 98 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 79.47916666666667, + "epoch": 0.0819253844562444, + "grad_norm": 1.938370704650879, + "kl": 0.032246907552083336, + "learning_rate": 1e-06, + "loss": 0.0013, + "reward": 0.6350000003973643, + "reward_std": 0.36127813284595806, + "rewards/accuracy_reward": 0.6350000003973643, + "step": 99 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 73.76041666666667, + "epoch": 0.08275291359216606, + "grad_norm": 2.8753280639648438, + "kl": 0.034917195638020836, + "learning_rate": 1e-06, + "loss": 0.0014, + "reward": 0.49947916343808174, + "reward_std": 0.3276378686229388, + "rewards/accuracy_reward": 0.49947916343808174, + "step": 100 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 77.38541666666667, + "epoch": 0.08358044272808772, + "grad_norm": 3.090855836868286, + "kl": 0.036265055338541664, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.5604166686534882, + "reward_std": 0.31745480994383496, + "rewards/accuracy_reward": 0.5604166686534882, + "step": 101 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 85.16666666666667, + "epoch": 0.08440797186400938, + "grad_norm": 4.0723958015441895, + "kl": 0.10227457682291667, + "learning_rate": 1e-06, + "loss": 0.0041, + "reward": 0.3477083370089531, + "reward_std": 0.42247989525397617, + "rewards/accuracy_reward": 0.3477083370089531, + "step": 102 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 76.34375, + "epoch": 0.08523550099993103, + "grad_norm": 7.84744930267334, + "kl": 0.045440673828125, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.46947916597127914, + "reward_std": 0.3615213173131148, + "rewards/accuracy_reward": 0.46947916597127914, + "step": 103 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 73.36458333333333, + "epoch": 0.0860630301358527, + "grad_norm": 1.7557107210159302, + "kl": 0.029632568359375, + "learning_rate": 1e-06, + "loss": 0.0012, + "reward": 0.7265625031044086, + "reward_std": 0.26481136803825694, + "rewards/accuracy_reward": 0.7265625031044086, + "step": 104 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 72.54166666666667, + "epoch": 0.08689055927177436, + "grad_norm": 2.596742630004883, + "kl": 0.040049235026041664, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.6534375001986822, + "reward_std": 0.3179449786742528, + "rewards/accuracy_reward": 0.6534375001986822, + "step": 105 + }, + { + "ave_tool_num": 0.1875, + "completion_length": 67.21875, + "epoch": 0.08771808840769602, + "grad_norm": 2.493940591812134, + "kl": 0.042287190755208336, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.631250003973643, + "reward_std": 0.3303266813357671, + "rewards/accuracy_reward": 0.631250003973643, + "step": 106 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 85.48958333333333, + "epoch": 0.08854561754361769, + "grad_norm": 2.3171470165252686, + "kl": 0.037221272786458336, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.5619791696468989, + "reward_std": 0.3135261485973994, + "rewards/accuracy_reward": 0.5619791696468989, + "step": 107 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 72.875, + "epoch": 0.08937314667953934, + "grad_norm": 62.56015396118164, + "kl": 0.055308024088541664, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.5119791676600774, + "reward_std": 0.42183414101600647, + "rewards/accuracy_reward": 0.5119791676600774, + "step": 108 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 69.38541666666667, + "epoch": 0.090200675815461, + "grad_norm": 2.7930006980895996, + "kl": 0.054473876953125, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.5686458374063174, + "reward_std": 0.31749122341473895, + "rewards/accuracy_reward": 0.5686458374063174, + "step": 109 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 75.0, + "epoch": 0.09102820495138267, + "grad_norm": 2.3008956909179688, + "kl": 0.05169677734375, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.5631250068545341, + "reward_std": 0.36644775172074634, + "rewards/accuracy_reward": 0.5631250068545341, + "step": 110 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 101.09375, + "epoch": 0.09185573408730432, + "grad_norm": 2.264420509338379, + "kl": 0.055328369140625, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.5740625038743019, + "reward_std": 0.3205209746956825, + "rewards/accuracy_reward": 0.5740625038743019, + "step": 111 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 90.98958333333333, + "epoch": 0.09268326322322598, + "grad_norm": 2.402318000793457, + "kl": 0.044972737630208336, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.4929166628668706, + "reward_std": 0.31770427773396176, + "rewards/accuracy_reward": 0.4929166628668706, + "step": 112 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 67.30208333333333, + "epoch": 0.09351079235914765, + "grad_norm": 3.5962202548980713, + "kl": 0.0460205078125, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.5494791654249033, + "reward_std": 0.38186609496672946, + "rewards/accuracy_reward": 0.5494791654249033, + "step": 113 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 68.46875, + "epoch": 0.09433832149506931, + "grad_norm": 2.0474822521209717, + "kl": 0.04351806640625, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.6542708327372869, + "reward_std": 0.2539586015045643, + "rewards/accuracy_reward": 0.6542708327372869, + "step": 114 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 70.0, + "epoch": 0.09516585063099096, + "grad_norm": 2.0896530151367188, + "kl": 0.047149658203125, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.5221875011920929, + "reward_std": 0.3038016601155202, + "rewards/accuracy_reward": 0.5221875011920929, + "step": 115 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 69.41666666666667, + "epoch": 0.09599337976691262, + "grad_norm": 2.559731960296631, + "kl": 0.041178385416666664, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.5104166716337204, + "reward_std": 0.3311898087461789, + "rewards/accuracy_reward": 0.5104166716337204, + "step": 116 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 69.0625, + "epoch": 0.09682090890283429, + "grad_norm": 2.0041069984436035, + "kl": 0.056376139322916664, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.6979166654249033, + "reward_std": 0.2651527989655733, + "rewards/accuracy_reward": 0.6979166654249033, + "step": 117 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 77.5625, + "epoch": 0.09764843803875595, + "grad_norm": 2.1682207584381104, + "kl": 0.0477447509765625, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.549895832935969, + "reward_std": 0.3593260881801446, + "rewards/accuracy_reward": 0.549895832935969, + "step": 118 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 84.28125, + "epoch": 0.0984759671746776, + "grad_norm": 2.387112855911255, + "kl": 0.033742268880208336, + "learning_rate": 1e-06, + "loss": 0.0013, + "reward": 0.5623958334326744, + "reward_std": 0.3781033381819725, + "rewards/accuracy_reward": 0.5623958334326744, + "step": 119 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 81.45833333333333, + "epoch": 0.09930349631059927, + "grad_norm": 2.0244181156158447, + "kl": 0.042154947916666664, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.6265625022351742, + "reward_std": 0.31554700434207916, + "rewards/accuracy_reward": 0.6265625022351742, + "step": 120 + }, + { + "ave_tool_num": 0.125, + "completion_length": 88.41666666666667, + "epoch": 0.10013102544652093, + "grad_norm": 14.879405975341797, + "kl": 0.1220703125, + "learning_rate": 1e-06, + "loss": 0.0049, + "reward": 0.4183333379526933, + "reward_std": 0.3507399931550026, + "rewards/accuracy_reward": 0.4183333379526933, + "step": 121 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 75.03125, + "epoch": 0.10095855458244259, + "grad_norm": 3.620107889175415, + "kl": 0.036265055338541664, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.4909375024338563, + "reward_std": 0.37202512100338936, + "rewards/accuracy_reward": 0.4909375024338563, + "step": 122 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 72.27083333333333, + "epoch": 0.10178608371836426, + "grad_norm": 7.597269535064697, + "kl": 0.22815958658854166, + "learning_rate": 1e-06, + "loss": 0.0091, + "reward": 0.5867708325386047, + "reward_std": 0.43105898797512054, + "rewards/accuracy_reward": 0.5867708325386047, + "step": 123 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 81.86458333333333, + "epoch": 0.10261361285428591, + "grad_norm": 2.3587875366210938, + "kl": 0.06360371907552083, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.4632291669646899, + "reward_std": 0.3146243579685688, + "rewards/accuracy_reward": 0.4632291669646899, + "step": 124 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 76.64583333333333, + "epoch": 0.10344114199020757, + "grad_norm": 7.050756931304932, + "kl": 0.062367757161458336, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.5158333343764147, + "reward_std": 0.41276457781593007, + "rewards/accuracy_reward": 0.5158333343764147, + "step": 125 + }, + { + "ave_tool_num": 0.125, + "completion_length": 77.72916666666667, + "epoch": 0.10426867112612924, + "grad_norm": 2.454303503036499, + "kl": 0.052332560221354164, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.6827083354194959, + "reward_std": 0.34928590804338455, + "rewards/accuracy_reward": 0.6827083354194959, + "step": 126 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 91.67708333333333, + "epoch": 0.1050962002620509, + "grad_norm": 2.895021677017212, + "kl": 0.03302001953125, + "learning_rate": 1e-06, + "loss": 0.0013, + "reward": 0.6443749964237213, + "reward_std": 0.33631719648838043, + "rewards/accuracy_reward": 0.6443749964237213, + "step": 127 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 80.63541666666667, + "epoch": 0.10592372939797255, + "grad_norm": 2.7560677528381348, + "kl": 0.037017822265625, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.49677083392937976, + "reward_std": 0.2790515224138896, + "rewards/accuracy_reward": 0.49677083392937976, + "step": 128 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 87.29166666666667, + "epoch": 0.10675125853389421, + "grad_norm": 2.8029603958129883, + "kl": 0.046427408854166664, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.5446874996026357, + "reward_std": 0.3540305780867736, + "rewards/accuracy_reward": 0.5446874996026357, + "step": 129 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 85.10416666666667, + "epoch": 0.10757878766981588, + "grad_norm": 2.1513683795928955, + "kl": 0.036829630533854164, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.43822916100422543, + "reward_std": 0.3599824532866478, + "rewards/accuracy_reward": 0.43822916100422543, + "step": 130 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 80.05208333333333, + "epoch": 0.10840631680573753, + "grad_norm": 4.786322593688965, + "kl": 0.060536702473958336, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.56447916974624, + "reward_std": 0.3248026476552089, + "rewards/accuracy_reward": 0.56447916974624, + "step": 131 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 86.26041666666667, + "epoch": 0.10923384594165919, + "grad_norm": 3.1199698448181152, + "kl": 0.08556111653645833, + "learning_rate": 1e-06, + "loss": 0.0034, + "reward": 0.4675000061591466, + "reward_std": 0.3205302090694507, + "rewards/accuracy_reward": 0.4675000061591466, + "step": 132 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 78.76041666666667, + "epoch": 0.11006137507758086, + "grad_norm": 2.27067232131958, + "kl": 0.037577311197916664, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.6420833369096121, + "reward_std": 0.36189357408632833, + "rewards/accuracy_reward": 0.6420833369096121, + "step": 133 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 80.48958333333333, + "epoch": 0.11088890421350252, + "grad_norm": 2.1309235095977783, + "kl": 0.039886474609375, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.5974999989072481, + "reward_std": 0.31943047419190407, + "rewards/accuracy_reward": 0.5974999989072481, + "step": 134 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 86.38541666666667, + "epoch": 0.11171643334942417, + "grad_norm": 2.0169618129730225, + "kl": 0.039642333984375, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.5110416735212008, + "reward_std": 0.3593553937971592, + "rewards/accuracy_reward": 0.5110416735212008, + "step": 135 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 82.26041666666667, + "epoch": 0.11254396248534584, + "grad_norm": 1.9898463487625122, + "kl": 0.057230631510416664, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.6373958388964335, + "reward_std": 0.3655017515023549, + "rewards/accuracy_reward": 0.6373958388964335, + "step": 136 + }, + { + "ave_tool_num": 0.125, + "completion_length": 80.58333333333333, + "epoch": 0.1133714916212675, + "grad_norm": 4.400066375732422, + "kl": 0.04052734375, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.6259374991059303, + "reward_std": 0.38145149623354274, + "rewards/accuracy_reward": 0.6259374991059303, + "step": 137 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 90.03125, + "epoch": 0.11419902075718916, + "grad_norm": 2.6714096069335938, + "kl": 0.033299763997395836, + "learning_rate": 1e-06, + "loss": 0.0013, + "reward": 0.6194791694482168, + "reward_std": 0.3607353779176871, + "rewards/accuracy_reward": 0.6194791694482168, + "step": 138 + }, + { + "ave_tool_num": 0.16666666666666666, + "completion_length": 72.26041666666667, + "epoch": 0.11502654989311081, + "grad_norm": 3.3189098834991455, + "kl": 0.041554768880208336, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.605208333581686, + "reward_std": 0.3869066039721171, + "rewards/accuracy_reward": 0.605208333581686, + "step": 139 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 67.27083333333333, + "epoch": 0.11585407902903248, + "grad_norm": 2.230771064758301, + "kl": 0.035563151041666664, + "learning_rate": 1e-06, + "loss": 0.0014, + "reward": 0.5090625025331974, + "reward_std": 0.2363946419209242, + "rewards/accuracy_reward": 0.5090625025331974, + "step": 140 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 80.47916666666667, + "epoch": 0.11668160816495414, + "grad_norm": 2.146206855773926, + "kl": 0.036402384440104164, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.5421874970197678, + "reward_std": 0.3737240843474865, + "rewards/accuracy_reward": 0.5421874970197678, + "step": 141 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 81.45833333333333, + "epoch": 0.1175091373008758, + "grad_norm": 3.28766131401062, + "kl": 0.033365885416666664, + "learning_rate": 1e-06, + "loss": 0.0013, + "reward": 0.49156250307957333, + "reward_std": 0.3900417760014534, + "rewards/accuracy_reward": 0.49156250307957333, + "step": 142 + }, + { + "ave_tool_num": 0.125, + "completion_length": 67.23958333333333, + "epoch": 0.11833666643679747, + "grad_norm": 2.543090343475342, + "kl": 0.04876708984375, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.5653125022848448, + "reward_std": 0.31714113739629585, + "rewards/accuracy_reward": 0.5653125022848448, + "step": 143 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 78.60416666666667, + "epoch": 0.11916419557271912, + "grad_norm": 3.7350409030914307, + "kl": 0.09153238932291667, + "learning_rate": 1e-06, + "loss": 0.0037, + "reward": 0.5474999981621901, + "reward_std": 0.3795171764989694, + "rewards/accuracy_reward": 0.5474999981621901, + "step": 144 + }, + { + "ave_tool_num": 0.1875, + "completion_length": 76.17708333333333, + "epoch": 0.11999172470864078, + "grad_norm": 3.535137414932251, + "kl": 0.046722412109375, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.6359374970197678, + "reward_std": 0.36204566558202106, + "rewards/accuracy_reward": 0.6359374970197678, + "step": 145 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 69.26041666666667, + "epoch": 0.12081925384456245, + "grad_norm": 2.7752747535705566, + "kl": 0.046234130859375, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.5817708410322666, + "reward_std": 0.2765699438750744, + "rewards/accuracy_reward": 0.5817708410322666, + "step": 146 + }, + { + "ave_tool_num": 0.125, + "completion_length": 86.07291666666667, + "epoch": 0.1216467829804841, + "grad_norm": 3.5031681060791016, + "kl": 0.059315999348958336, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.4665624958773454, + "reward_std": 0.32489378191530704, + "rewards/accuracy_reward": 0.4665624958773454, + "step": 147 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 78.52083333333333, + "epoch": 0.12247431211640576, + "grad_norm": 2.6766879558563232, + "kl": 0.06259663899739583, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.6430208372573057, + "reward_std": 0.3150616909066836, + "rewards/accuracy_reward": 0.6430208372573057, + "step": 148 + }, + { + "ave_tool_num": 0.125, + "completion_length": 105.35416666666667, + "epoch": 0.12330184125232743, + "grad_norm": 2.554305076599121, + "kl": 0.07182820638020833, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.4714583282669385, + "reward_std": 0.376865924646457, + "rewards/accuracy_reward": 0.4714583282669385, + "step": 149 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 74.45833333333333, + "epoch": 0.12412937038824909, + "grad_norm": 2.3507766723632812, + "kl": 0.038503011067708336, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.5942708353201548, + "reward_std": 0.35623766481876373, + "rewards/accuracy_reward": 0.5942708353201548, + "step": 150 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 99.55208333333333, + "epoch": 0.12495689952417074, + "grad_norm": 2.0572264194488525, + "kl": 0.031005859375, + "learning_rate": 1e-06, + "loss": 0.0012, + "reward": 0.4930208350221316, + "reward_std": 0.35398441428939503, + "rewards/accuracy_reward": 0.4930208350221316, + "step": 151 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 81.09375, + "epoch": 0.1257844286600924, + "grad_norm": 2.2769055366516113, + "kl": 0.049611409505208336, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.6014583359162012, + "reward_std": 0.3553168922662735, + "rewards/accuracy_reward": 0.6014583359162012, + "step": 152 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 78.875, + "epoch": 0.12661195779601406, + "grad_norm": 4.5564727783203125, + "kl": 0.049296061197916664, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.525937500099341, + "reward_std": 0.3018303445229928, + "rewards/accuracy_reward": 0.525937500099341, + "step": 153 + }, + { + "ave_tool_num": 0.17708333333333334, + "completion_length": 76.75, + "epoch": 0.12743948693193574, + "grad_norm": 2.38199782371521, + "kl": 0.051579793294270836, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.4639583354194959, + "reward_std": 0.3513490459881723, + "rewards/accuracy_reward": 0.4639583354194959, + "step": 154 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 78.88541666666667, + "epoch": 0.1282670160678574, + "grad_norm": 5.366733074188232, + "kl": 0.040283203125, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.6566666712363561, + "reward_std": 0.3079093669851621, + "rewards/accuracy_reward": 0.6566666712363561, + "step": 155 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 98.03125, + "epoch": 0.12909454520377905, + "grad_norm": 3.751271963119507, + "kl": 0.0460205078125, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.44302083055178326, + "reward_std": 0.38687663276990253, + "rewards/accuracy_reward": 0.44302083055178326, + "step": 156 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 101.21875, + "epoch": 0.1299220743397007, + "grad_norm": 2.745645046234131, + "kl": 0.036616007486979164, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.531041664381822, + "reward_std": 0.3185325749218464, + "rewards/accuracy_reward": 0.531041664381822, + "step": 157 + }, + { + "ave_tool_num": 0.125, + "completion_length": 79.03125, + "epoch": 0.13074960347562237, + "grad_norm": 2.7066538333892822, + "kl": 0.04107666015625, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.7153125082453092, + "reward_std": 0.3228346072137356, + "rewards/accuracy_reward": 0.7153125082453092, + "step": 158 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 89.29166666666667, + "epoch": 0.13157713261154402, + "grad_norm": 2.677125930786133, + "kl": 0.034372965494791664, + "learning_rate": 1e-06, + "loss": 0.0014, + "reward": 0.6368749986092249, + "reward_std": 0.26303643733263016, + "rewards/accuracy_reward": 0.6368749986092249, + "step": 159 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 115.92708333333333, + "epoch": 0.13240466174746568, + "grad_norm": 2.132375717163086, + "kl": 0.039886474609375, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.49822916835546494, + "reward_std": 0.3110681486626466, + "rewards/accuracy_reward": 0.49822916835546494, + "step": 160 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 80.46875, + "epoch": 0.13323219088338736, + "grad_norm": 2.669215440750122, + "kl": 0.0288848876953125, + "learning_rate": 1e-06, + "loss": 0.0012, + "reward": 0.6499999991307656, + "reward_std": 0.28740444034338, + "rewards/accuracy_reward": 0.6499999991307656, + "step": 161 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 74.47916666666667, + "epoch": 0.13405972001930902, + "grad_norm": 2.283672332763672, + "kl": 0.046803792317708336, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.46562499552965164, + "reward_std": 0.4138417864839236, + "rewards/accuracy_reward": 0.46562499552965164, + "step": 162 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 79.96875, + "epoch": 0.13488724915523068, + "grad_norm": 6.253664493560791, + "kl": 0.09523518880208333, + "learning_rate": 1e-06, + "loss": 0.0038, + "reward": 0.5635416607062022, + "reward_std": 0.29939627802620333, + "rewards/accuracy_reward": 0.5635416607062022, + "step": 163 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 82.69791666666667, + "epoch": 0.13571477829115233, + "grad_norm": 2.5587716102600098, + "kl": 0.047556559244791664, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.5342708354194959, + "reward_std": 0.35881395141283673, + "rewards/accuracy_reward": 0.5342708354194959, + "step": 164 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 68.03125, + "epoch": 0.136542307427074, + "grad_norm": 3.228886365890503, + "kl": 0.056437174479166664, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.5321874991059303, + "reward_std": 0.33468054855863255, + "rewards/accuracy_reward": 0.5321874991059303, + "step": 165 + }, + { + "ave_tool_num": 0.125, + "completion_length": 81.45833333333333, + "epoch": 0.13736983656299565, + "grad_norm": 2.658090591430664, + "kl": 0.04669189453125, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.44083333263794583, + "reward_std": 0.4031498183806737, + "rewards/accuracy_reward": 0.44083333263794583, + "step": 166 + }, + { + "ave_tool_num": 0.16666666666666666, + "completion_length": 72.61458333333333, + "epoch": 0.13819736569891733, + "grad_norm": 2.5563857555389404, + "kl": 0.040491739908854164, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.7337499956289927, + "reward_std": 0.30837671272456646, + "rewards/accuracy_reward": 0.7337499956289927, + "step": 167 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 79.26041666666667, + "epoch": 0.13902489483483899, + "grad_norm": 6.327179908752441, + "kl": 0.07746378580729167, + "learning_rate": 1e-06, + "loss": 0.0031, + "reward": 0.5373958324392637, + "reward_std": 0.32378827035427094, + "rewards/accuracy_reward": 0.5373958324392637, + "step": 168 + }, + { + "ave_tool_num": 0.125, + "completion_length": 75.14583333333333, + "epoch": 0.13985242397076064, + "grad_norm": 2.202244997024536, + "kl": 0.033854166666666664, + "learning_rate": 1e-06, + "loss": 0.0014, + "reward": 0.528437502682209, + "reward_std": 0.2910701738825689, + "rewards/accuracy_reward": 0.528437502682209, + "step": 169 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 76.75, + "epoch": 0.1406799531066823, + "grad_norm": 3.974524736404419, + "kl": 0.033213297526041664, + "learning_rate": 1e-06, + "loss": 0.0013, + "reward": 0.571041668454806, + "reward_std": 0.31571107109387714, + "rewards/accuracy_reward": 0.571041668454806, + "step": 170 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 84.11458333333333, + "epoch": 0.14150748224260395, + "grad_norm": 5.454798221588135, + "kl": 0.061187744140625, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.5422916722794374, + "reward_std": 0.3246851519991954, + "rewards/accuracy_reward": 0.5422916722794374, + "step": 171 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 70.35416666666667, + "epoch": 0.1423350113785256, + "grad_norm": 2.9077439308166504, + "kl": 0.037567138671875, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.6457291605571905, + "reward_std": 0.291256637002031, + "rewards/accuracy_reward": 0.6457291605571905, + "step": 172 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 80.05208333333333, + "epoch": 0.14316254051444727, + "grad_norm": 2.417736291885376, + "kl": 0.031911214192708336, + "learning_rate": 1e-06, + "loss": 0.0013, + "reward": 0.6132291679581007, + "reward_std": 0.35574293074508506, + "rewards/accuracy_reward": 0.6132291679581007, + "step": 173 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 87.67708333333333, + "epoch": 0.14399006965036895, + "grad_norm": 2.52188777923584, + "kl": 0.0479278564453125, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.5479166631897291, + "reward_std": 0.34336994712551433, + "rewards/accuracy_reward": 0.5479166631897291, + "step": 174 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 85.46875, + "epoch": 0.1448175987862906, + "grad_norm": 2.154428482055664, + "kl": 0.032374064127604164, + "learning_rate": 1e-06, + "loss": 0.0013, + "reward": 0.5118749986092249, + "reward_std": 0.3715562087794145, + "rewards/accuracy_reward": 0.5118749986092249, + "step": 175 + }, + { + "ave_tool_num": 0.125, + "completion_length": 82.67708333333333, + "epoch": 0.14564512792221226, + "grad_norm": 4.208851337432861, + "kl": 0.036214192708333336, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.46947916348775226, + "reward_std": 0.3929327515264352, + "rewards/accuracy_reward": 0.46947916348775226, + "step": 176 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 79.40625, + "epoch": 0.14647265705813392, + "grad_norm": 2.0872671604156494, + "kl": 0.047698974609375, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.5698958312471708, + "reward_std": 0.37948331609368324, + "rewards/accuracy_reward": 0.5698958312471708, + "step": 177 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 93.67708333333333, + "epoch": 0.14730018619405558, + "grad_norm": 3.1690151691436768, + "kl": 0.062398274739583336, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.40552083775401115, + "reward_std": 0.3808382774392764, + "rewards/accuracy_reward": 0.40552083775401115, + "step": 178 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 68.5, + "epoch": 0.14812771532997723, + "grad_norm": 2.3781681060791016, + "kl": 0.039927164713541664, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.6235416655739149, + "reward_std": 0.36955029144883156, + "rewards/accuracy_reward": 0.6235416655739149, + "step": 179 + }, + { + "ave_tool_num": 0.125, + "completion_length": 71.83333333333333, + "epoch": 0.14895524446589892, + "grad_norm": 2.4361417293548584, + "kl": 0.047632853190104164, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.6420833331843218, + "reward_std": 0.2987837915619214, + "rewards/accuracy_reward": 0.6420833331843218, + "step": 180 + }, + { + "ave_tool_num": 0.1875, + "completion_length": 64.78125, + "epoch": 0.14978277360182057, + "grad_norm": 3.9060847759246826, + "kl": 0.045827229817708336, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.572395833209157, + "reward_std": 0.2890772794683774, + "rewards/accuracy_reward": 0.572395833209157, + "step": 181 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 74.89583333333333, + "epoch": 0.15061030273774223, + "grad_norm": 2.1893551349639893, + "kl": 0.043853759765625, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.6427083325882753, + "reward_std": 0.328141164034605, + "rewards/accuracy_reward": 0.6427083325882753, + "step": 182 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 85.90625, + "epoch": 0.1514378318736639, + "grad_norm": 3.539602279663086, + "kl": 0.051177978515625, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.41229166959722835, + "reward_std": 0.39006341621279716, + "rewards/accuracy_reward": 0.41229166959722835, + "step": 183 + }, + { + "ave_tool_num": 0.1875, + "completion_length": 78.10416666666667, + "epoch": 0.15226536100958554, + "grad_norm": 2.6855051517486572, + "kl": 0.059743245442708336, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.5067708293596903, + "reward_std": 0.3716175432006518, + "rewards/accuracy_reward": 0.5067708293596903, + "step": 184 + }, + { + "ave_tool_num": 0.125, + "completion_length": 90.875, + "epoch": 0.1530928901455072, + "grad_norm": 2.4823334217071533, + "kl": 0.046051025390625, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.643854171037674, + "reward_std": 0.32150814558068913, + "rewards/accuracy_reward": 0.643854171037674, + "step": 185 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 75.125, + "epoch": 0.15392041928142886, + "grad_norm": 2.2918262481689453, + "kl": 0.059977213541666664, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.6064583361148834, + "reward_std": 0.33849912012616795, + "rewards/accuracy_reward": 0.6064583361148834, + "step": 186 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 111.65625, + "epoch": 0.15474794841735054, + "grad_norm": 2.4122800827026367, + "kl": 0.050435384114583336, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.4927083378036817, + "reward_std": 0.344767801463604, + "rewards/accuracy_reward": 0.4927083378036817, + "step": 187 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 91.60416666666667, + "epoch": 0.1555754775532722, + "grad_norm": 19.653553009033203, + "kl": 0.060129801432291664, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.5104166629413763, + "reward_std": 0.35996608932813007, + "rewards/accuracy_reward": 0.5104166629413763, + "step": 188 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 72.29166666666667, + "epoch": 0.15640300668919385, + "grad_norm": 4.432910442352295, + "kl": 0.08709716796875, + "learning_rate": 1e-06, + "loss": 0.0035, + "reward": 0.6364583397905031, + "reward_std": 0.2980127576738596, + "rewards/accuracy_reward": 0.6364583397905031, + "step": 189 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 73.59375, + "epoch": 0.1572305358251155, + "grad_norm": 2.094522476196289, + "kl": 0.03701019287109375, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.5459375015149514, + "reward_std": 0.3821908949563901, + "rewards/accuracy_reward": 0.5459375015149514, + "step": 190 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 77.15625, + "epoch": 0.15805806496103716, + "grad_norm": 2.1448163986206055, + "kl": 0.040008544921875, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.5455208346247673, + "reward_std": 0.21666466382642588, + "rewards/accuracy_reward": 0.5455208346247673, + "step": 191 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 80.84375, + "epoch": 0.15888559409695882, + "grad_norm": 2.6644067764282227, + "kl": 0.036865234375, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.5858333309491476, + "reward_std": 0.3136956915259361, + "rewards/accuracy_reward": 0.5858333309491476, + "step": 192 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 90.58333333333333, + "epoch": 0.15971312323288048, + "grad_norm": 3.072394609451294, + "kl": 0.0669097900390625, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.5412500128149986, + "reward_std": 0.34065889318784076, + "rewards/accuracy_reward": 0.5412500128149986, + "step": 193 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 74.75, + "epoch": 0.16054065236880216, + "grad_norm": 7.887403964996338, + "kl": 0.042744954427083336, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.4642708295335372, + "reward_std": 0.3487265519797802, + "rewards/accuracy_reward": 0.4642708295335372, + "step": 194 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 101.05208333333333, + "epoch": 0.16136818150472382, + "grad_norm": 2.930554151535034, + "kl": 0.06610107421875, + "learning_rate": 1e-06, + "loss": 0.0026, + "reward": 0.5577083341777325, + "reward_std": 0.26634228726228076, + "rewards/accuracy_reward": 0.5577083341777325, + "step": 195 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 72.13541666666667, + "epoch": 0.16219571064064547, + "grad_norm": 3.7259373664855957, + "kl": 0.034077962239583336, + "learning_rate": 1e-06, + "loss": 0.0014, + "reward": 0.6441666682561239, + "reward_std": 0.35053656219194335, + "rewards/accuracy_reward": 0.6441666682561239, + "step": 196 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 84.9375, + "epoch": 0.16302323977656713, + "grad_norm": 2.511021137237549, + "kl": 0.044169108072916664, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.543229175110658, + "reward_std": 0.33085233966509503, + "rewards/accuracy_reward": 0.543229175110658, + "step": 197 + }, + { + "ave_tool_num": 0.125, + "completion_length": 74.14583333333333, + "epoch": 0.1638507689124888, + "grad_norm": 3.7786381244659424, + "kl": 0.07590738932291667, + "learning_rate": 1e-06, + "loss": 0.003, + "reward": 0.6233333349227905, + "reward_std": 0.3173280097544193, + "rewards/accuracy_reward": 0.6233333349227905, + "step": 198 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 72.42708333333333, + "epoch": 0.16467829804841044, + "grad_norm": 2.8411505222320557, + "kl": 0.06720987955729167, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.5915625045696894, + "reward_std": 0.309618787219127, + "rewards/accuracy_reward": 0.5915625045696894, + "step": 199 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 77.45833333333333, + "epoch": 0.16550582718433213, + "grad_norm": 2.000492811203003, + "kl": 0.041361490885416664, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.6951041668653488, + "reward_std": 0.3060580330590407, + "rewards/accuracy_reward": 0.6951041668653488, + "step": 200 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 77.35416666666667, + "epoch": 0.16633335632025378, + "grad_norm": 2.0167338848114014, + "kl": 0.048238118489583336, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.7067708348234495, + "reward_std": 0.29401514182488125, + "rewards/accuracy_reward": 0.7067708348234495, + "step": 201 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 90.26041666666667, + "epoch": 0.16716088545617544, + "grad_norm": 3.840559959411621, + "kl": 0.037267049153645836, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.5955208341280619, + "reward_std": 0.32142388199766475, + "rewards/accuracy_reward": 0.5955208341280619, + "step": 202 + }, + { + "ave_tool_num": 0.020833333333333332, + "completion_length": 82.91666666666667, + "epoch": 0.1679884145920971, + "grad_norm": 4.373442649841309, + "kl": 0.107086181640625, + "learning_rate": 1e-06, + "loss": 0.0043, + "reward": 0.6162500008940697, + "reward_std": 0.26121783753236133, + "rewards/accuracy_reward": 0.6162500008940697, + "step": 203 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 90.96875, + "epoch": 0.16881594372801875, + "grad_norm": 2.6465377807617188, + "kl": 0.039408365885416664, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.49812499309579533, + "reward_std": 0.3550982524951299, + "rewards/accuracy_reward": 0.49812499309579533, + "step": 204 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 88.53125, + "epoch": 0.1696434728639404, + "grad_norm": 2.1646859645843506, + "kl": 0.044336954752604164, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.5235416690508524, + "reward_std": 0.41123304267724353, + "rewards/accuracy_reward": 0.5235416690508524, + "step": 205 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 82.67708333333333, + "epoch": 0.17047100199986207, + "grad_norm": 3.7032065391540527, + "kl": 0.11011759440104167, + "learning_rate": 1e-06, + "loss": 0.0044, + "reward": 0.7200000137090683, + "reward_std": 0.3081221828858058, + "rewards/accuracy_reward": 0.7200000137090683, + "step": 206 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 72.83333333333333, + "epoch": 0.17129853113578375, + "grad_norm": 3.1673998832702637, + "kl": 0.040669759114583336, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.4937499985098839, + "reward_std": 0.40340136488278705, + "rewards/accuracy_reward": 0.4937499985098839, + "step": 207 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 75.84375, + "epoch": 0.1721260602717054, + "grad_norm": 2.2036566734313965, + "kl": 0.051015218098958336, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.5648958335320154, + "reward_std": 0.34016695370276767, + "rewards/accuracy_reward": 0.5648958335320154, + "step": 208 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 86.44791666666667, + "epoch": 0.17295358940762706, + "grad_norm": 3.051480531692505, + "kl": 0.07497151692708333, + "learning_rate": 1e-06, + "loss": 0.003, + "reward": 0.5527083352208138, + "reward_std": 0.3241584400335948, + "rewards/accuracy_reward": 0.5527083352208138, + "step": 209 + }, + { + "ave_tool_num": 0.010416666666666666, + "completion_length": 79.44791666666667, + "epoch": 0.17378111854354872, + "grad_norm": 3.96998929977417, + "kl": 0.07347615559895833, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.6418750062584877, + "reward_std": 0.33057450378934544, + "rewards/accuracy_reward": 0.6418750062584877, + "step": 210 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 73.83333333333333, + "epoch": 0.17460864767947037, + "grad_norm": 4.410951614379883, + "kl": 0.08829243977864583, + "learning_rate": 1e-06, + "loss": 0.0035, + "reward": 0.5895833224058151, + "reward_std": 0.37829917172590893, + "rewards/accuracy_reward": 0.5895833224058151, + "step": 211 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 87.96875, + "epoch": 0.17543617681539203, + "grad_norm": 2.174173593521118, + "kl": 0.03497314453125, + "learning_rate": 1e-06, + "loss": 0.0014, + "reward": 0.6947916597127914, + "reward_std": 0.30671805577973527, + "rewards/accuracy_reward": 0.6947916597127914, + "step": 212 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 105.01041666666667, + "epoch": 0.17626370595131371, + "grad_norm": 4.537360191345215, + "kl": 0.16217041015625, + "learning_rate": 1e-06, + "loss": 0.0065, + "reward": 0.6230208277702332, + "reward_std": 0.3527686496575673, + "rewards/accuracy_reward": 0.6230208277702332, + "step": 213 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 90.08333333333333, + "epoch": 0.17709123508723537, + "grad_norm": 1.9273887872695923, + "kl": 0.0445556640625, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.6160416652758917, + "reward_std": 0.28171175097425777, + "rewards/accuracy_reward": 0.6160416652758917, + "step": 214 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 81.48958333333333, + "epoch": 0.17791876422315703, + "grad_norm": 2.9464356899261475, + "kl": 0.043370564778645836, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.6165624981125196, + "reward_std": 0.2966545708477497, + "rewards/accuracy_reward": 0.6165624981125196, + "step": 215 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 74.42708333333333, + "epoch": 0.17874629335907868, + "grad_norm": 2.3797082901000977, + "kl": 0.04547119140625, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.5392708331346512, + "reward_std": 0.3373684672017892, + "rewards/accuracy_reward": 0.5392708331346512, + "step": 216 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 73.9375, + "epoch": 0.17957382249500034, + "grad_norm": 2.383244514465332, + "kl": 0.041422526041666664, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.6300000051657358, + "reward_std": 0.34261837353308994, + "rewards/accuracy_reward": 0.6300000051657358, + "step": 217 + }, + { + "ave_tool_num": 0.020833333333333332, + "completion_length": 70.36458333333333, + "epoch": 0.180401351630922, + "grad_norm": 5.236860275268555, + "kl": 0.046539306640625, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.5902083367109299, + "reward_std": 0.38385709809760254, + "rewards/accuracy_reward": 0.5902083367109299, + "step": 218 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 82.14583333333333, + "epoch": 0.18122888076684365, + "grad_norm": 1.881351351737976, + "kl": 0.06689453125, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.49375000099341076, + "reward_std": 0.25820976744095486, + "rewards/accuracy_reward": 0.49375000099341076, + "step": 219 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 77.9375, + "epoch": 0.18205640990276534, + "grad_norm": 3.387083053588867, + "kl": 0.13338216145833334, + "learning_rate": 1e-06, + "loss": 0.0054, + "reward": 0.5128124977151552, + "reward_std": 0.3816277775913477, + "rewards/accuracy_reward": 0.5128124977151552, + "step": 220 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 89.9375, + "epoch": 0.182883939038687, + "grad_norm": 2.073045015335083, + "kl": 0.044891357421875, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.6433333307504654, + "reward_std": 0.3143393875410159, + "rewards/accuracy_reward": 0.6433333307504654, + "step": 221 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 73.91666666666667, + "epoch": 0.18371146817460865, + "grad_norm": 2.9066104888916016, + "kl": 0.052998860677083336, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.5274999986092249, + "reward_std": 0.35108063990871113, + "rewards/accuracy_reward": 0.5274999986092249, + "step": 222 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 75.89583333333333, + "epoch": 0.1845389973105303, + "grad_norm": 2.8635218143463135, + "kl": 0.05718994140625, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.6520833323399226, + "reward_std": 0.31667303914825123, + "rewards/accuracy_reward": 0.6520833323399226, + "step": 223 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 72.46875, + "epoch": 0.18536652644645196, + "grad_norm": 2.4180386066436768, + "kl": 0.09063720703125, + "learning_rate": 1e-06, + "loss": 0.0036, + "reward": 0.5039583345254263, + "reward_std": 0.37733621150255203, + "rewards/accuracy_reward": 0.5039583345254263, + "step": 224 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 73.55208333333333, + "epoch": 0.18619405558237362, + "grad_norm": 2.142974853515625, + "kl": 0.044972737630208336, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.7477083404858907, + "reward_std": 0.2510933925708135, + "rewards/accuracy_reward": 0.7477083404858907, + "step": 225 + }, + { + "ave_tool_num": 0.125, + "completion_length": 68.48958333333333, + "epoch": 0.1870215847182953, + "grad_norm": 2.296579360961914, + "kl": 0.035268147786458336, + "learning_rate": 1e-06, + "loss": 0.0014, + "reward": 0.6586458360155424, + "reward_std": 0.3830305269608895, + "rewards/accuracy_reward": 0.6586458360155424, + "step": 226 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 77.6875, + "epoch": 0.18784911385421696, + "grad_norm": 2.4417812824249268, + "kl": 0.059427897135416664, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.544166666145126, + "reward_std": 0.3258590375383695, + "rewards/accuracy_reward": 0.544166666145126, + "step": 227 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 81.40625, + "epoch": 0.18867664299013862, + "grad_norm": 2.3232662677764893, + "kl": 0.050201416015625, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.5872916678587595, + "reward_std": 0.3712866511195898, + "rewards/accuracy_reward": 0.5872916678587595, + "step": 228 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 69.5, + "epoch": 0.18950417212606027, + "grad_norm": 1.689404010772705, + "kl": 0.041208902994791664, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.6217708314458529, + "reward_std": 0.26979586978753406, + "rewards/accuracy_reward": 0.6217708314458529, + "step": 229 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 84.28125, + "epoch": 0.19033170126198193, + "grad_norm": 1.952042579650879, + "kl": 0.07904052734375, + "learning_rate": 1e-06, + "loss": 0.0032, + "reward": 0.6213541657974323, + "reward_std": 0.348984282463789, + "rewards/accuracy_reward": 0.6213541657974323, + "step": 230 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 73.35416666666667, + "epoch": 0.19115923039790358, + "grad_norm": 2.086299180984497, + "kl": 0.044514973958333336, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.6644791613022486, + "reward_std": 0.28711416323979694, + "rewards/accuracy_reward": 0.6644791613022486, + "step": 231 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 78.23958333333333, + "epoch": 0.19198675953382524, + "grad_norm": 2.2860920429229736, + "kl": 0.053263346354166664, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.5891666760047277, + "reward_std": 0.37468406682213146, + "rewards/accuracy_reward": 0.5891666760047277, + "step": 232 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 82.34375, + "epoch": 0.19281428866974692, + "grad_norm": 2.316683292388916, + "kl": 0.043629964192708336, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.4664583336561918, + "reward_std": 0.2703871273746093, + "rewards/accuracy_reward": 0.4664583336561918, + "step": 233 + }, + { + "ave_tool_num": 0.010416666666666666, + "completion_length": 69.61458333333333, + "epoch": 0.19364181780566858, + "grad_norm": 3.0993733406066895, + "kl": 0.051717122395833336, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.6253125059107939, + "reward_std": 0.33224081620574, + "rewards/accuracy_reward": 0.6253125059107939, + "step": 234 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 69.92708333333333, + "epoch": 0.19446934694159024, + "grad_norm": 2.995387077331543, + "kl": 0.06134033203125, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.5140625002483526, + "reward_std": 0.353696937362353, + "rewards/accuracy_reward": 0.5140625002483526, + "step": 235 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 75.98958333333333, + "epoch": 0.1952968760775119, + "grad_norm": 2.306981086730957, + "kl": 0.054931640625, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.656979168454806, + "reward_std": 0.3649794173737367, + "rewards/accuracy_reward": 0.656979168454806, + "step": 236 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 71.30208333333333, + "epoch": 0.19612440521343355, + "grad_norm": 2.5044620037078857, + "kl": 0.07228597005208333, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.6143750001986822, + "reward_std": 0.352769210934639, + "rewards/accuracy_reward": 0.6143750001986822, + "step": 237 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 70.88541666666667, + "epoch": 0.1969519343493552, + "grad_norm": 1.8697267770767212, + "kl": 0.04522705078125, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.7015625089406967, + "reward_std": 0.2808002432187398, + "rewards/accuracy_reward": 0.7015625089406967, + "step": 238 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 75.21875, + "epoch": 0.1977794634852769, + "grad_norm": 2.335069179534912, + "kl": 0.039815266927083336, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.6441666682561239, + "reward_std": 0.3148883873752008, + "rewards/accuracy_reward": 0.6441666682561239, + "step": 239 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 80.02083333333333, + "epoch": 0.19860699262119855, + "grad_norm": 4.325173377990723, + "kl": 0.07384236653645833, + "learning_rate": 1e-06, + "loss": 0.003, + "reward": 0.5832291642824808, + "reward_std": 0.3523593445618947, + "rewards/accuracy_reward": 0.5832291642824808, + "step": 240 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 85.34375, + "epoch": 0.1994345217571202, + "grad_norm": 2.6426918506622314, + "kl": 0.047800699869791664, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.4207291690011819, + "reward_std": 0.25509989634156227, + "rewards/accuracy_reward": 0.4207291690011819, + "step": 241 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 83.78125, + "epoch": 0.20026205089304186, + "grad_norm": 2.035111427307129, + "kl": 0.054412841796875, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.6014583321909109, + "reward_std": 0.306376280883948, + "rewards/accuracy_reward": 0.6014583321909109, + "step": 242 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 87.54166666666667, + "epoch": 0.20108958002896352, + "grad_norm": 2.409351110458374, + "kl": 0.046407063802083336, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.5661458385487398, + "reward_std": 0.2981305103749037, + "rewards/accuracy_reward": 0.5661458385487398, + "step": 243 + }, + { + "ave_tool_num": 0.020833333333333332, + "completion_length": 74.72916666666667, + "epoch": 0.20191710916488517, + "grad_norm": 1.8365428447723389, + "kl": 0.043833414713541664, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.6883333350221316, + "reward_std": 0.32166294008493423, + "rewards/accuracy_reward": 0.6883333350221316, + "step": 244 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 74.64583333333333, + "epoch": 0.20274463830080683, + "grad_norm": 2.3217105865478516, + "kl": 0.054331461588541664, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.4139583383997281, + "reward_std": 0.4058089666068554, + "rewards/accuracy_reward": 0.4139583383997281, + "step": 245 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 79.875, + "epoch": 0.2035721674367285, + "grad_norm": 2.2104644775390625, + "kl": 0.043528238932291664, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.4676041690011819, + "reward_std": 0.3349659740924835, + "rewards/accuracy_reward": 0.4676041690011819, + "step": 246 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 94.05208333333333, + "epoch": 0.20439969657265017, + "grad_norm": 4.1428070068359375, + "kl": 0.06785074869791667, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.6673958351214727, + "reward_std": 0.24209492653608322, + "rewards/accuracy_reward": 0.6673958351214727, + "step": 247 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 72.40625, + "epoch": 0.20522722570857183, + "grad_norm": 2.448861598968506, + "kl": 0.047993977864583336, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.4870833332339923, + "reward_std": 0.3301217705011368, + "rewards/accuracy_reward": 0.4870833332339923, + "step": 248 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 76.54166666666667, + "epoch": 0.20605475484449348, + "grad_norm": 2.193126916885376, + "kl": 0.057047526041666664, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.6219791664431492, + "reward_std": 0.3468327539352079, + "rewards/accuracy_reward": 0.6219791664431492, + "step": 249 + }, + { + "ave_tool_num": 0.22916666666666666, + "completion_length": 71.02083333333333, + "epoch": 0.20688228398041514, + "grad_norm": 5.847820281982422, + "kl": 0.044647216796875, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.6993749986092249, + "reward_std": 0.38842568298180896, + "rewards/accuracy_reward": 0.6993749986092249, + "step": 250 + }, + { + "ave_tool_num": 0.125, + "completion_length": 76.84375, + "epoch": 0.2077098131163368, + "grad_norm": 2.401087999343872, + "kl": 0.055882771809895836, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.6127083351214727, + "reward_std": 0.30380687986811, + "rewards/accuracy_reward": 0.6127083351214727, + "step": 251 + }, + { + "ave_tool_num": 0.16666666666666666, + "completion_length": 72.83333333333333, + "epoch": 0.20853734225225848, + "grad_norm": 3.3752310276031494, + "kl": 0.045033772786458336, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.667083332935969, + "reward_std": 0.38028897282977897, + "rewards/accuracy_reward": 0.667083332935969, + "step": 252 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 77.66666666666667, + "epoch": 0.20936487138818013, + "grad_norm": 274.5823974609375, + "kl": 0.4368387858072917, + "learning_rate": 1e-06, + "loss": 0.0175, + "reward": 0.5447916661699613, + "reward_std": 0.371360154201587, + "rewards/accuracy_reward": 0.5447916661699613, + "step": 253 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 69.03125, + "epoch": 0.2101924005241018, + "grad_norm": 3.470311164855957, + "kl": 0.06559244791666667, + "learning_rate": 1e-06, + "loss": 0.0026, + "reward": 0.5521874992797772, + "reward_std": 0.29611882008612156, + "rewards/accuracy_reward": 0.5521874992797772, + "step": 254 + }, + { + "ave_tool_num": 0.17708333333333334, + "completion_length": 90.3125, + "epoch": 0.21101992966002345, + "grad_norm": 2.56372332572937, + "kl": 0.042572021484375, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.5528125055134296, + "reward_std": 0.25509514783819515, + "rewards/accuracy_reward": 0.5528125055134296, + "step": 255 + }, + { + "ave_tool_num": 0.19791666666666666, + "completion_length": 76.86458333333333, + "epoch": 0.2118474587959451, + "grad_norm": 2.7387022972106934, + "kl": 0.05413818359375, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.5913541692619523, + "reward_std": 0.2679431786139806, + "rewards/accuracy_reward": 0.5913541692619523, + "step": 256 + }, + { + "ave_tool_num": 0.17708333333333334, + "completion_length": 83.78125, + "epoch": 0.21267498793186676, + "grad_norm": 2.0594706535339355, + "kl": 0.048929850260416664, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.5170833344260851, + "reward_std": 0.3559880598137776, + "rewards/accuracy_reward": 0.5170833344260851, + "step": 257 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 78.10416666666667, + "epoch": 0.21350251706778842, + "grad_norm": 2.5971689224243164, + "kl": 0.036336263020833336, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.6788541674613953, + "reward_std": 0.33328263213237125, + "rewards/accuracy_reward": 0.6788541674613953, + "step": 258 + }, + { + "ave_tool_num": 0.125, + "completion_length": 87.97916666666667, + "epoch": 0.2143300462037101, + "grad_norm": 2.3301939964294434, + "kl": 0.040608723958333336, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.44489583155761164, + "reward_std": 0.30126742273569107, + "rewards/accuracy_reward": 0.44489583155761164, + "step": 259 + }, + { + "ave_tool_num": 0.19791666666666666, + "completion_length": 70.65625, + "epoch": 0.21515757533963176, + "grad_norm": 2.68522572517395, + "kl": 0.037251790364583336, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.6517708376049995, + "reward_std": 0.19857345490405956, + "rewards/accuracy_reward": 0.6517708376049995, + "step": 260 + }, + { + "ave_tool_num": 0.16666666666666666, + "completion_length": 70.73958333333333, + "epoch": 0.2159851044755534, + "grad_norm": 2.128300428390503, + "kl": 0.039815266927083336, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.6052083373069763, + "reward_std": 0.38846677790085477, + "rewards/accuracy_reward": 0.6052083373069763, + "step": 261 + }, + { + "ave_tool_num": 0.17708333333333334, + "completion_length": 74.94791666666667, + "epoch": 0.21681263361147507, + "grad_norm": 2.6488935947418213, + "kl": 0.035512288411458336, + "learning_rate": 1e-06, + "loss": 0.0014, + "reward": 0.7713541686534882, + "reward_std": 0.3013971733550231, + "rewards/accuracy_reward": 0.7713541686534882, + "step": 262 + }, + { + "ave_tool_num": 0.20833333333333334, + "completion_length": 77.40625, + "epoch": 0.21764016274739673, + "grad_norm": 2.222418785095215, + "kl": 0.039540608723958336, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.43447917327284813, + "reward_std": 0.3160654592017333, + "rewards/accuracy_reward": 0.43447917327284813, + "step": 263 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 93.5625, + "epoch": 0.21846769188331838, + "grad_norm": 3.0685811042785645, + "kl": 0.039433797200520836, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.5899999961256981, + "reward_std": 0.33966201916337013, + "rewards/accuracy_reward": 0.5899999961256981, + "step": 264 + }, + { + "ave_tool_num": 0.1875, + "completion_length": 73.0625, + "epoch": 0.21929522101924004, + "grad_norm": 2.5679595470428467, + "kl": 0.042958577473958336, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.6344791650772095, + "reward_std": 0.3161227082212766, + "rewards/accuracy_reward": 0.6344791650772095, + "step": 265 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 82.76041666666667, + "epoch": 0.22012275015516172, + "grad_norm": 12.171127319335938, + "kl": 0.16628519694010416, + "learning_rate": 1e-06, + "loss": 0.0066, + "reward": 0.6093750037252903, + "reward_std": 0.35201121866703033, + "rewards/accuracy_reward": 0.6093750037252903, + "step": 266 + }, + { + "ave_tool_num": 0.16666666666666666, + "completion_length": 85.84375, + "epoch": 0.22095027929108338, + "grad_norm": 5.305222034454346, + "kl": 0.03875732421875, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.6580208341280619, + "reward_std": 0.3094043905536334, + "rewards/accuracy_reward": 0.6580208341280619, + "step": 267 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 82.60416666666667, + "epoch": 0.22177780842700504, + "grad_norm": 3.0172300338745117, + "kl": 0.051361083984375, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.4187500001862645, + "reward_std": 0.3434275531520446, + "rewards/accuracy_reward": 0.4187500001862645, + "step": 268 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 76.30208333333333, + "epoch": 0.2226053375629267, + "grad_norm": 2.907139539718628, + "kl": 0.076629638671875, + "learning_rate": 1e-06, + "loss": 0.0031, + "reward": 0.5614583392937978, + "reward_std": 0.3178571183234453, + "rewards/accuracy_reward": 0.5614583392937978, + "step": 269 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 76.94791666666667, + "epoch": 0.22343286669884835, + "grad_norm": 3.8187243938446045, + "kl": 0.110626220703125, + "learning_rate": 1e-06, + "loss": 0.0044, + "reward": 0.6413541659712791, + "reward_std": 0.31137993310888606, + "rewards/accuracy_reward": 0.6413541659712791, + "step": 270 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 85.59375, + "epoch": 0.22426039583477, + "grad_norm": 81.12844848632812, + "kl": 0.4144287109375, + "learning_rate": 1e-06, + "loss": 0.0166, + "reward": 0.5730208332339922, + "reward_std": 0.36230522270003956, + "rewards/accuracy_reward": 0.5730208332339922, + "step": 271 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 87.125, + "epoch": 0.2250879249706917, + "grad_norm": 2.99643611907959, + "kl": 0.062296549479166664, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.5261458319922289, + "reward_std": 0.30647357925772667, + "rewards/accuracy_reward": 0.5261458319922289, + "step": 272 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 86.875, + "epoch": 0.22591545410661334, + "grad_norm": 2.1190407276153564, + "kl": 0.051035563151041664, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.5962499976158142, + "reward_std": 0.3205944448709488, + "rewards/accuracy_reward": 0.5962499976158142, + "step": 273 + }, + { + "ave_tool_num": 0.020833333333333332, + "completion_length": 89.04166666666667, + "epoch": 0.226742983242535, + "grad_norm": 2.475698232650757, + "kl": 0.07123819986979167, + "learning_rate": 1e-06, + "loss": 0.0028, + "reward": 0.5142708364874125, + "reward_std": 0.34650829372306663, + "rewards/accuracy_reward": 0.5142708364874125, + "step": 274 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 81.32291666666667, + "epoch": 0.22757051237845666, + "grad_norm": 3.2320003509521484, + "kl": 0.08024088541666667, + "learning_rate": 1e-06, + "loss": 0.0032, + "reward": 0.48604166011015576, + "reward_std": 0.3260461861888568, + "rewards/accuracy_reward": 0.48604166011015576, + "step": 275 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 75.5, + "epoch": 0.2283980415143783, + "grad_norm": 4.958624362945557, + "kl": 0.08075968424479167, + "learning_rate": 1e-06, + "loss": 0.0032, + "reward": 0.5086458312968413, + "reward_std": 0.38901104032993317, + "rewards/accuracy_reward": 0.5086458312968413, + "step": 276 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 81.52083333333333, + "epoch": 0.22922557065029997, + "grad_norm": 10.366461753845215, + "kl": 0.056650797526041664, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.5885416691501936, + "reward_std": 0.3513532504439354, + "rewards/accuracy_reward": 0.5885416691501936, + "step": 277 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 149.25, + "epoch": 0.23005309978622163, + "grad_norm": 2.3423874378204346, + "kl": 0.041361490885416664, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.6387500030299028, + "reward_std": 0.34371013815204304, + "rewards/accuracy_reward": 0.6387500030299028, + "step": 278 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 99.80208333333333, + "epoch": 0.2308806289221433, + "grad_norm": 2.300457715988159, + "kl": 0.051371256510416664, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.4531250124176343, + "reward_std": 0.32881950462857884, + "rewards/accuracy_reward": 0.4531250124176343, + "step": 279 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 82.69791666666667, + "epoch": 0.23170815805806497, + "grad_norm": 2.3513426780700684, + "kl": 0.0503082275390625, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.49500000445793074, + "reward_std": 0.2863917065163453, + "rewards/accuracy_reward": 0.49500000445793074, + "step": 280 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 131.23958333333334, + "epoch": 0.23253568719398662, + "grad_norm": 2.249485731124878, + "kl": 0.0552978515625, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.5008333325386047, + "reward_std": 0.3668517305826147, + "rewards/accuracy_reward": 0.5008333325386047, + "step": 281 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 95.375, + "epoch": 0.23336321632990828, + "grad_norm": 1.8864588737487793, + "kl": 0.07094319661458333, + "learning_rate": 1e-06, + "loss": 0.0028, + "reward": 0.659270832935969, + "reward_std": 0.2948411280910174, + "rewards/accuracy_reward": 0.659270832935969, + "step": 282 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 79.875, + "epoch": 0.23419074546582994, + "grad_norm": 2.3514208793640137, + "kl": 0.058563232421875, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.5444791714350382, + "reward_std": 0.3508310193816821, + "rewards/accuracy_reward": 0.5444791714350382, + "step": 283 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 81.73958333333333, + "epoch": 0.2350182746017516, + "grad_norm": 2.192047119140625, + "kl": 0.036921183268229164, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.5688541730244955, + "reward_std": 0.35211565842231113, + "rewards/accuracy_reward": 0.5688541730244955, + "step": 284 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 110.95833333333333, + "epoch": 0.23584580373767328, + "grad_norm": 2.3644192218780518, + "kl": 0.046579996744791664, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.5048958361148834, + "reward_std": 0.34671246881286305, + "rewards/accuracy_reward": 0.5048958361148834, + "step": 285 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 87.58333333333333, + "epoch": 0.23667333287359493, + "grad_norm": 1.4868648052215576, + "kl": 0.048192342122395836, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.6281249976406494, + "reward_std": 0.24108945081631342, + "rewards/accuracy_reward": 0.6281249976406494, + "step": 286 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 82.32291666666667, + "epoch": 0.2375008620095166, + "grad_norm": 3.471268653869629, + "kl": 0.053253173828125, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.6368750010927519, + "reward_std": 0.35827722152074176, + "rewards/accuracy_reward": 0.6368750010927519, + "step": 287 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 88.96875, + "epoch": 0.23832839114543825, + "grad_norm": 3.9259915351867676, + "kl": 0.04095458984375, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.5901041701436043, + "reward_std": 0.35183051228523254, + "rewards/accuracy_reward": 0.5901041701436043, + "step": 288 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 73.15625, + "epoch": 0.2391559202813599, + "grad_norm": 26.748281478881836, + "kl": 0.17400614420572916, + "learning_rate": 1e-06, + "loss": 0.0069, + "reward": 0.688229168454806, + "reward_std": 0.282804557432731, + "rewards/accuracy_reward": 0.688229168454806, + "step": 289 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 71.20833333333333, + "epoch": 0.23998344941728156, + "grad_norm": 2.5124173164367676, + "kl": 0.06502278645833333, + "learning_rate": 1e-06, + "loss": 0.0026, + "reward": 0.45666666453083354, + "reward_std": 0.32377713918685913, + "rewards/accuracy_reward": 0.45666666453083354, + "step": 290 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 111.03125, + "epoch": 0.24081097855320321, + "grad_norm": 4.117656230926514, + "kl": 0.09077962239583333, + "learning_rate": 1e-06, + "loss": 0.0036, + "reward": 0.46635416398445767, + "reward_std": 0.31834621851642925, + "rewards/accuracy_reward": 0.46635416398445767, + "step": 291 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 77.375, + "epoch": 0.2416385076891249, + "grad_norm": 3.1420164108276367, + "kl": 0.055867513020833336, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.5080208331346512, + "reward_std": 0.313231300873061, + "rewards/accuracy_reward": 0.5080208331346512, + "step": 292 + }, + { + "ave_tool_num": 0.125, + "completion_length": 83.55208333333333, + "epoch": 0.24246603682504655, + "grad_norm": 2.1593399047851562, + "kl": 0.041859944661458336, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.6591666688521703, + "reward_std": 0.3542352405687173, + "rewards/accuracy_reward": 0.6591666688521703, + "step": 293 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 93.15625, + "epoch": 0.2432935659609682, + "grad_norm": 11.789937019348145, + "kl": 0.266754150390625, + "learning_rate": 1e-06, + "loss": 0.0107, + "reward": 0.5934374990562598, + "reward_std": 0.36113183448712033, + "rewards/accuracy_reward": 0.5934374990562598, + "step": 294 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 75.65625, + "epoch": 0.24412109509688987, + "grad_norm": 3.668234348297119, + "kl": 0.053924560546875, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.4417708283290267, + "reward_std": 0.36382726052155095, + "rewards/accuracy_reward": 0.4417708283290267, + "step": 295 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 76.96875, + "epoch": 0.24494862423281152, + "grad_norm": 2.2839136123657227, + "kl": 0.10179646809895833, + "learning_rate": 1e-06, + "loss": 0.0041, + "reward": 0.5988541667660078, + "reward_std": 0.40687859803438187, + "rewards/accuracy_reward": 0.5988541667660078, + "step": 296 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 87.89583333333333, + "epoch": 0.24577615336873318, + "grad_norm": 11.067449569702148, + "kl": 0.13292439778645834, + "learning_rate": 1e-06, + "loss": 0.0053, + "reward": 0.5699999978144964, + "reward_std": 0.35919085517525673, + "rewards/accuracy_reward": 0.5699999978144964, + "step": 297 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 75.4375, + "epoch": 0.24660368250465486, + "grad_norm": 4.210636138916016, + "kl": 0.04754638671875, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.5611458371082941, + "reward_std": 0.395455705622832, + "rewards/accuracy_reward": 0.5611458371082941, + "step": 298 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 107.57291666666667, + "epoch": 0.24743121164057652, + "grad_norm": 2.7290737628936768, + "kl": 0.097503662109375, + "learning_rate": 1e-06, + "loss": 0.0039, + "reward": 0.5191666632890701, + "reward_std": 0.31757373300691444, + "rewards/accuracy_reward": 0.5191666632890701, + "step": 299 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 77.625, + "epoch": 0.24825874077649818, + "grad_norm": 2.3226654529571533, + "kl": 0.04425048828125, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.6567708340783914, + "reward_std": 0.2577727437019348, + "rewards/accuracy_reward": 0.6567708340783914, + "step": 300 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 83.63541666666667, + "epoch": 0.24908626991241983, + "grad_norm": 2.556267023086548, + "kl": 0.049855550130208336, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.5331250006953875, + "reward_std": 0.40565891315539676, + "rewards/accuracy_reward": 0.5331250006953875, + "step": 301 + }, + { + "ave_tool_num": 0.16666666666666666, + "completion_length": 63.645833333333336, + "epoch": 0.2499137990483415, + "grad_norm": 3.1652591228485107, + "kl": 0.08949788411458333, + "learning_rate": 1e-06, + "loss": 0.0036, + "reward": 0.7025000030795733, + "reward_std": 0.27332080403963727, + "rewards/accuracy_reward": 0.7025000030795733, + "step": 302 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 69.98958333333333, + "epoch": 0.25074132818426315, + "grad_norm": 2.479010581970215, + "kl": 0.047627766927083336, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.5183333357175192, + "reward_std": 0.3445269465446472, + "rewards/accuracy_reward": 0.5183333357175192, + "step": 303 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 79.90625, + "epoch": 0.2515688573201848, + "grad_norm": 5.685761451721191, + "kl": 0.08355204264322917, + "learning_rate": 1e-06, + "loss": 0.0033, + "reward": 0.5089583322405815, + "reward_std": 0.33765202139814693, + "rewards/accuracy_reward": 0.5089583322405815, + "step": 304 + }, + { + "ave_tool_num": 0.19791666666666666, + "completion_length": 79.83333333333333, + "epoch": 0.25239638645610646, + "grad_norm": 2.7872517108917236, + "kl": 0.040761311848958336, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.5769791727264723, + "reward_std": 0.37356114263335866, + "rewards/accuracy_reward": 0.5769791727264723, + "step": 305 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 80.72916666666667, + "epoch": 0.2532239155920281, + "grad_norm": 2.6739895343780518, + "kl": 0.05242919921875, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.5167708359658718, + "reward_std": 0.2655997183173895, + "rewards/accuracy_reward": 0.5167708359658718, + "step": 306 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 78.95833333333333, + "epoch": 0.25405144472794977, + "grad_norm": 2.9677326679229736, + "kl": 0.057851155598958336, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.48364583775401115, + "reward_std": 0.2832975958784421, + "rewards/accuracy_reward": 0.48364583775401115, + "step": 307 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 72.10416666666667, + "epoch": 0.2548789738638715, + "grad_norm": 2.068842887878418, + "kl": 0.040486653645833336, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.6137499958276749, + "reward_std": 0.2607078083480398, + "rewards/accuracy_reward": 0.6137499958276749, + "step": 308 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 77.44791666666667, + "epoch": 0.25570650299979314, + "grad_norm": 1.9453202486038208, + "kl": 0.040384928385416664, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.5736458307364956, + "reward_std": 0.24043827752272287, + "rewards/accuracy_reward": 0.5736458307364956, + "step": 309 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 73.6875, + "epoch": 0.2565340321357148, + "grad_norm": 1.977246880531311, + "kl": 0.044703165690104164, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.5622916705906391, + "reward_std": 0.3516452970604102, + "rewards/accuracy_reward": 0.5622916705906391, + "step": 310 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 79.33333333333333, + "epoch": 0.25736156127163645, + "grad_norm": 2.4129979610443115, + "kl": 0.036982218424479164, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.583333338300387, + "reward_std": 0.32531017158180475, + "rewards/accuracy_reward": 0.583333338300387, + "step": 311 + }, + { + "ave_tool_num": 0.17708333333333334, + "completion_length": 66.48958333333333, + "epoch": 0.2581890904075581, + "grad_norm": 2.4929771423339844, + "kl": 0.043924967447916664, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.6045833316942056, + "reward_std": 0.2705496462682883, + "rewards/accuracy_reward": 0.6045833316942056, + "step": 312 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 80.25, + "epoch": 0.25901661954347976, + "grad_norm": 2.569655656814575, + "kl": 0.06475830078125, + "learning_rate": 1e-06, + "loss": 0.0026, + "reward": 0.4963541701436043, + "reward_std": 0.3014462509502967, + "rewards/accuracy_reward": 0.4963541701436043, + "step": 313 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 74.78125, + "epoch": 0.2598441486794014, + "grad_norm": 2.5648655891418457, + "kl": 0.0445556640625, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.42093750337759656, + "reward_std": 0.3545615002512932, + "rewards/accuracy_reward": 0.42093750337759656, + "step": 314 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 76.85416666666667, + "epoch": 0.2606716778153231, + "grad_norm": 131.25721740722656, + "kl": 2.24713134765625, + "learning_rate": 1e-06, + "loss": 0.0897, + "reward": 0.49291667093833286, + "reward_std": 0.33037736142675084, + "rewards/accuracy_reward": 0.49291667093833286, + "step": 315 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 66.0, + "epoch": 0.26149920695124473, + "grad_norm": 3.599480152130127, + "kl": 0.043996175130208336, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.5713541656732559, + "reward_std": 0.353777954975764, + "rewards/accuracy_reward": 0.5713541656732559, + "step": 316 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 77.1875, + "epoch": 0.2623267360871664, + "grad_norm": 2.4715471267700195, + "kl": 0.040537516276041664, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.5651041728754839, + "reward_std": 0.33095305661360425, + "rewards/accuracy_reward": 0.5651041728754839, + "step": 317 + }, + { + "ave_tool_num": 0.1875, + "completion_length": 71.16666666666667, + "epoch": 0.26315426522308805, + "grad_norm": 2.2171261310577393, + "kl": 0.044779459635416664, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.6293749983112017, + "reward_std": 0.3248366055389245, + "rewards/accuracy_reward": 0.6293749983112017, + "step": 318 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 89.29166666666667, + "epoch": 0.2639817943590097, + "grad_norm": 4.429813861846924, + "kl": 0.03704833984375, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.5686458349227905, + "reward_std": 0.3590480697651704, + "rewards/accuracy_reward": 0.5686458349227905, + "step": 319 + }, + { + "ave_tool_num": 0.21875, + "completion_length": 79.33333333333333, + "epoch": 0.26480932349493136, + "grad_norm": 2.6368188858032227, + "kl": 0.036346435546875, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.6056250085433325, + "reward_std": 0.30750772667427856, + "rewards/accuracy_reward": 0.6056250085433325, + "step": 320 + }, + { + "ave_tool_num": 0.16666666666666666, + "completion_length": 84.85416666666667, + "epoch": 0.26563685263085307, + "grad_norm": 2.6968822479248047, + "kl": 0.048624674479166664, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.6642708331346512, + "reward_std": 0.3289969066778819, + "rewards/accuracy_reward": 0.6642708331346512, + "step": 321 + }, + { + "ave_tool_num": 0.17708333333333334, + "completion_length": 68.65625, + "epoch": 0.2664643817667747, + "grad_norm": 2.49312424659729, + "kl": 0.048909505208333336, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.6835416704416275, + "reward_std": 0.32504016160964966, + "rewards/accuracy_reward": 0.6835416704416275, + "step": 322 + }, + { + "ave_tool_num": 0.21875, + "completion_length": 90.78125, + "epoch": 0.2672919109026964, + "grad_norm": 2.153907299041748, + "kl": 0.042307535807291664, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.44468749562899273, + "reward_std": 0.370198130607605, + "rewards/accuracy_reward": 0.44468749562899273, + "step": 323 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 106.11458333333333, + "epoch": 0.26811944003861804, + "grad_norm": 2.3154096603393555, + "kl": 0.042083740234375, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.6559375040233135, + "reward_std": 0.35408538455764454, + "rewards/accuracy_reward": 0.6559375040233135, + "step": 324 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 82.02083333333333, + "epoch": 0.2689469691745397, + "grad_norm": 2.202115297317505, + "kl": 0.042856852213541664, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.5371875030299028, + "reward_std": 0.27588166296482086, + "rewards/accuracy_reward": 0.5371875030299028, + "step": 325 + }, + { + "ave_tool_num": 0.3333333333333333, + "completion_length": 62.739583333333336, + "epoch": 0.26977449831046135, + "grad_norm": 13.349494934082031, + "kl": 0.0408935546875, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.5683333352208138, + "reward_std": 0.3136815552910169, + "rewards/accuracy_reward": 0.5683333352208138, + "step": 326 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 84.11458333333333, + "epoch": 0.270602027446383, + "grad_norm": 3.95904541015625, + "kl": 0.032307942708333336, + "learning_rate": 1e-06, + "loss": 0.0013, + "reward": 0.6163541649778684, + "reward_std": 0.2193842108050982, + "rewards/accuracy_reward": 0.6163541649778684, + "step": 327 + }, + { + "ave_tool_num": 0.2604166666666667, + "completion_length": 70.96875, + "epoch": 0.27142955658230467, + "grad_norm": 2.774343252182007, + "kl": 0.0390625, + "learning_rate": 1e-06, + "loss": 0.0016, + "reward": 0.5950000000496706, + "reward_std": 0.2960427391032378, + "rewards/accuracy_reward": 0.5950000000496706, + "step": 328 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 72.88541666666667, + "epoch": 0.2722570857182263, + "grad_norm": 3.3405139446258545, + "kl": 0.06072998046875, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.541770838201046, + "reward_std": 0.3441335732738177, + "rewards/accuracy_reward": 0.541770838201046, + "step": 329 + }, + { + "ave_tool_num": 0.22916666666666666, + "completion_length": 80.97916666666667, + "epoch": 0.273084614854148, + "grad_norm": 2.3913626670837402, + "kl": 0.04278564453125, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.5970833313961824, + "reward_std": 0.3916328400373459, + "rewards/accuracy_reward": 0.5970833313961824, + "step": 330 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 89.53125, + "epoch": 0.27391214399006963, + "grad_norm": 2.539032459259033, + "kl": 0.055074055989583336, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.5176041672627131, + "reward_std": 0.3393767885863781, + "rewards/accuracy_reward": 0.5176041672627131, + "step": 331 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 76.96875, + "epoch": 0.2747396731259913, + "grad_norm": 3.0450282096862793, + "kl": 0.045806884765625, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.5391666640837988, + "reward_std": 0.30918868631124496, + "rewards/accuracy_reward": 0.5391666640837988, + "step": 332 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 75.30208333333333, + "epoch": 0.27556720226191295, + "grad_norm": 2.040248394012451, + "kl": 0.048126220703125, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.6504166672627131, + "reward_std": 0.31944992517431575, + "rewards/accuracy_reward": 0.6504166672627131, + "step": 333 + }, + { + "ave_tool_num": 0.19791666666666666, + "completion_length": 82.64583333333333, + "epoch": 0.27639473139783466, + "grad_norm": 2.4626095294952393, + "kl": 0.035797119140625, + "learning_rate": 1e-06, + "loss": 0.0014, + "reward": 0.5366666639844576, + "reward_std": 0.3083285105725129, + "rewards/accuracy_reward": 0.5366666639844576, + "step": 334 + }, + { + "ave_tool_num": 0.16666666666666666, + "completion_length": 78.23958333333333, + "epoch": 0.2772222605337563, + "grad_norm": 2.4936327934265137, + "kl": 0.036295572916666664, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.5798958341280619, + "reward_std": 0.3234400873382886, + "rewards/accuracy_reward": 0.5798958341280619, + "step": 335 + }, + { + "ave_tool_num": 0.1875, + "completion_length": 83.65625, + "epoch": 0.27804978966967797, + "grad_norm": 4.151541233062744, + "kl": 0.051615397135416664, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.4958333373069763, + "reward_std": 0.32823999722798664, + "rewards/accuracy_reward": 0.4958333373069763, + "step": 336 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 72.21875, + "epoch": 0.2788773188055996, + "grad_norm": 3.2838284969329834, + "kl": 0.046468098958333336, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.3779166688521703, + "reward_std": 0.3233847878873348, + "rewards/accuracy_reward": 0.3779166688521703, + "step": 337 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 84.58333333333333, + "epoch": 0.2797048479415213, + "grad_norm": 4.054309844970703, + "kl": 0.06589253743489583, + "learning_rate": 1e-06, + "loss": 0.0026, + "reward": 0.4784374957283338, + "reward_std": 0.3280762092520793, + "rewards/accuracy_reward": 0.4784374957283338, + "step": 338 + }, + { + "ave_tool_num": 0.16666666666666666, + "completion_length": 83.95833333333333, + "epoch": 0.28053237707744294, + "grad_norm": 2.890490770339966, + "kl": 0.08111572265625, + "learning_rate": 1e-06, + "loss": 0.0032, + "reward": 0.5989583333333334, + "reward_std": 0.3287709429860115, + "rewards/accuracy_reward": 0.5989583333333334, + "step": 339 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 86.625, + "epoch": 0.2813599062133646, + "grad_norm": 2.013335943222046, + "kl": 0.04888916015625, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.6312499990065893, + "reward_std": 0.2919432347019513, + "rewards/accuracy_reward": 0.6312499990065893, + "step": 340 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 75.0625, + "epoch": 0.28218743534928625, + "grad_norm": 4.444045066833496, + "kl": 0.048492431640625, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.6130208298563957, + "reward_std": 0.3376052553455035, + "rewards/accuracy_reward": 0.6130208298563957, + "step": 341 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 91.59375, + "epoch": 0.2830149644852079, + "grad_norm": 2.8637893199920654, + "kl": 0.053415934244791664, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.4784374963492155, + "reward_std": 0.27168870344758034, + "rewards/accuracy_reward": 0.4784374963492155, + "step": 342 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 82.69791666666667, + "epoch": 0.28384249362112957, + "grad_norm": 2.5117480754852295, + "kl": 0.050862630208333336, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.45375000196509063, + "reward_std": 0.3140154629945755, + "rewards/accuracy_reward": 0.45375000196509063, + "step": 343 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 77.94791666666667, + "epoch": 0.2846700227570512, + "grad_norm": 3.585646152496338, + "kl": 0.16484578450520834, + "learning_rate": 1e-06, + "loss": 0.0066, + "reward": 0.5335416744152705, + "reward_std": 0.3245095058033864, + "rewards/accuracy_reward": 0.5335416744152705, + "step": 344 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 73.01041666666667, + "epoch": 0.2854975518929729, + "grad_norm": 3.070460081100464, + "kl": 0.049123128255208336, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.667083332935969, + "reward_std": 0.3452583321680625, + "rewards/accuracy_reward": 0.667083332935969, + "step": 345 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 83.54166666666667, + "epoch": 0.28632508102889453, + "grad_norm": 2.571122407913208, + "kl": 0.042851765950520836, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.6337500065565109, + "reward_std": 0.40783099333445233, + "rewards/accuracy_reward": 0.6337500065565109, + "step": 346 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 80.79166666666667, + "epoch": 0.28715261016481625, + "grad_norm": 2.388455867767334, + "kl": 0.061167399088541664, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.6154166658719381, + "reward_std": 0.3963600310186545, + "rewards/accuracy_reward": 0.6154166658719381, + "step": 347 + }, + { + "ave_tool_num": 0.16666666666666666, + "completion_length": 75.25, + "epoch": 0.2879801393007379, + "grad_norm": 2.293395519256592, + "kl": 0.041829427083333336, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.701354157179594, + "reward_std": 0.2383863776922226, + "rewards/accuracy_reward": 0.701354157179594, + "step": 348 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 89.54166666666667, + "epoch": 0.28880766843665956, + "grad_norm": 2.24953031539917, + "kl": 0.044123331705729164, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.5228125005960464, + "reward_std": 0.28243951002756756, + "rewards/accuracy_reward": 0.5228125005960464, + "step": 349 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 86.8125, + "epoch": 0.2896351975725812, + "grad_norm": 3.628622531890869, + "kl": 0.047220865885416664, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.47166666326423484, + "reward_std": 0.3456512838602066, + "rewards/accuracy_reward": 0.47166666326423484, + "step": 350 + }, + { + "ave_tool_num": 0.125, + "completion_length": 83.66666666666667, + "epoch": 0.29046272670850287, + "grad_norm": 3.140397310256958, + "kl": 0.056493123372395836, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.5176041647791862, + "reward_std": 0.36610142265756923, + "rewards/accuracy_reward": 0.5176041647791862, + "step": 351 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 72.70833333333333, + "epoch": 0.29129025584442453, + "grad_norm": 1.934268832206726, + "kl": 0.04473876953125, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.666770838201046, + "reward_std": 0.2588828004275759, + "rewards/accuracy_reward": 0.666770838201046, + "step": 352 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 91.38541666666667, + "epoch": 0.2921177849803462, + "grad_norm": 2.0483269691467285, + "kl": 0.057912190755208336, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.542291671037674, + "reward_std": 0.33911585435271263, + "rewards/accuracy_reward": 0.542291671037674, + "step": 353 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 75.94791666666667, + "epoch": 0.29294531411626784, + "grad_norm": 3.2072317600250244, + "kl": 0.060109456380208336, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.5436458364129066, + "reward_std": 0.384388508896033, + "rewards/accuracy_reward": 0.5436458364129066, + "step": 354 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 75.20833333333333, + "epoch": 0.2937728432521895, + "grad_norm": 1.708344578742981, + "kl": 0.0361328125, + "learning_rate": 1e-06, + "loss": 0.0014, + "reward": 0.5590625032782555, + "reward_std": 0.3136417691906293, + "rewards/accuracy_reward": 0.5590625032782555, + "step": 355 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 96.6875, + "epoch": 0.29460037238811115, + "grad_norm": 2.557690143585205, + "kl": 0.042683919270833336, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.6753124992052714, + "reward_std": 0.3042913191020489, + "rewards/accuracy_reward": 0.6753124992052714, + "step": 356 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 79.875, + "epoch": 0.2954279015240328, + "grad_norm": 2.1327064037323, + "kl": 0.046549479166666664, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.6782291680574417, + "reward_std": 0.30116037962337333, + "rewards/accuracy_reward": 0.6782291680574417, + "step": 357 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 81.5625, + "epoch": 0.29625543065995447, + "grad_norm": 3.1827077865600586, + "kl": 0.049652099609375, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.5344791685541471, + "reward_std": 0.33653195947408676, + "rewards/accuracy_reward": 0.5344791685541471, + "step": 358 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 79.05208333333333, + "epoch": 0.2970829597958761, + "grad_norm": 2.7919037342071533, + "kl": 0.037221272786458336, + "learning_rate": 1e-06, + "loss": 0.0015, + "reward": 0.5785416721676787, + "reward_std": 0.2961090859025717, + "rewards/accuracy_reward": 0.5785416721676787, + "step": 359 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 80.0625, + "epoch": 0.29791048893179783, + "grad_norm": 3.140406608581543, + "kl": 0.068603515625, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.5008333375056585, + "reward_std": 0.3610582904269298, + "rewards/accuracy_reward": 0.5008333375056585, + "step": 360 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 99.11458333333333, + "epoch": 0.2987380180677195, + "grad_norm": 3.069945812225342, + "kl": 0.073822021484375, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.6672916660706202, + "reward_std": 0.32811462755004567, + "rewards/accuracy_reward": 0.6672916660706202, + "step": 361 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 76.11458333333333, + "epoch": 0.29956554720364115, + "grad_norm": 2.957824468612671, + "kl": 0.058451334635416664, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.5541666696468989, + "reward_std": 0.38005131607254344, + "rewards/accuracy_reward": 0.5541666696468989, + "step": 362 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 93.30208333333333, + "epoch": 0.3003930763395628, + "grad_norm": 2.429201602935791, + "kl": 0.04669189453125, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.6573958359658718, + "reward_std": 0.2959101901700099, + "rewards/accuracy_reward": 0.6573958359658718, + "step": 363 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 78.10416666666667, + "epoch": 0.30122060547548446, + "grad_norm": 2.814821720123291, + "kl": 0.053395589192708336, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.5106250047683716, + "reward_std": 0.38161856681108475, + "rewards/accuracy_reward": 0.5106250047683716, + "step": 364 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 80.15625, + "epoch": 0.3020481346114061, + "grad_norm": 2.4849698543548584, + "kl": 0.052490234375, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.7207291672627131, + "reward_std": 0.3172510117292404, + "rewards/accuracy_reward": 0.7207291672627131, + "step": 365 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 82.44791666666667, + "epoch": 0.3028756637473278, + "grad_norm": 2.407177209854126, + "kl": 0.055043538411458336, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.5339583282669386, + "reward_std": 0.3990442318220933, + "rewards/accuracy_reward": 0.5339583282669386, + "step": 366 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 78.33333333333333, + "epoch": 0.30370319288324943, + "grad_norm": 3.4935824871063232, + "kl": 0.048095703125, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.6175000021855036, + "reward_std": 0.30566784739494324, + "rewards/accuracy_reward": 0.6175000021855036, + "step": 367 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 75.28125, + "epoch": 0.3045307220191711, + "grad_norm": 2.2393903732299805, + "kl": 0.04541015625, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.5709374969204267, + "reward_std": 0.36021552483240765, + "rewards/accuracy_reward": 0.5709374969204267, + "step": 368 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 92.09375, + "epoch": 0.30535825115509274, + "grad_norm": 2.0404980182647705, + "kl": 0.051656087239583336, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.6331250071525574, + "reward_std": 0.3795782191058, + "rewards/accuracy_reward": 0.6331250071525574, + "step": 369 + }, + { + "ave_tool_num": 0.125, + "completion_length": 83.17708333333333, + "epoch": 0.3061857802910144, + "grad_norm": 2.6197705268859863, + "kl": 0.057078043619791664, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.5644791722297668, + "reward_std": 0.37148235117395717, + "rewards/accuracy_reward": 0.5644791722297668, + "step": 370 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 74.42708333333333, + "epoch": 0.30701330942693605, + "grad_norm": 3.0796995162963867, + "kl": 0.07232666015625, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.5257291669646899, + "reward_std": 0.34552216281493503, + "rewards/accuracy_reward": 0.5257291669646899, + "step": 371 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 88.04166666666667, + "epoch": 0.3078408385628577, + "grad_norm": 2.3560240268707275, + "kl": 0.04815673828125, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.5207291655242443, + "reward_std": 0.31180041407545406, + "rewards/accuracy_reward": 0.5207291655242443, + "step": 372 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 83.60416666666667, + "epoch": 0.3086683676987794, + "grad_norm": 2.692615509033203, + "kl": 0.066375732421875, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.49968749781449634, + "reward_std": 0.3758527437845866, + "rewards/accuracy_reward": 0.49968749781449634, + "step": 373 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 75.17708333333333, + "epoch": 0.3094958968347011, + "grad_norm": 2.5978450775146484, + "kl": 0.067626953125, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.5818750038743019, + "reward_std": 0.3650831754008929, + "rewards/accuracy_reward": 0.5818750038743019, + "step": 374 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 63.895833333333336, + "epoch": 0.31032342597062273, + "grad_norm": 2.6648340225219727, + "kl": 0.049835205078125, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.5888541663686434, + "reward_std": 0.25041733992596465, + "rewards/accuracy_reward": 0.5888541663686434, + "step": 375 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 85.98958333333333, + "epoch": 0.3111509551065444, + "grad_norm": 2.222111940383911, + "kl": 0.047322591145833336, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.5400000015894572, + "reward_std": 0.29390808877845603, + "rewards/accuracy_reward": 0.5400000015894572, + "step": 376 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 82.79166666666667, + "epoch": 0.31197848424246605, + "grad_norm": 2.3736939430236816, + "kl": 0.046010335286458336, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.7265625024835268, + "reward_std": 0.30952448087433976, + "rewards/accuracy_reward": 0.7265625024835268, + "step": 377 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 72.80208333333333, + "epoch": 0.3128060133783877, + "grad_norm": 2.2500717639923096, + "kl": 0.054982503255208336, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.6996875007947286, + "reward_std": 0.33806512628992397, + "rewards/accuracy_reward": 0.6996875007947286, + "step": 378 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 80.35416666666667, + "epoch": 0.31363354251430936, + "grad_norm": 2.3708856105804443, + "kl": 0.0616455078125, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.6807291656732559, + "reward_std": 0.3853408719102542, + "rewards/accuracy_reward": 0.6807291656732559, + "step": 379 + }, + { + "ave_tool_num": 0.20833333333333334, + "completion_length": 95.22916666666667, + "epoch": 0.314461071650231, + "grad_norm": 5.381865978240967, + "kl": 0.10931396484375, + "learning_rate": 1e-06, + "loss": 0.0044, + "reward": 0.6569791634877523, + "reward_std": 0.3523280620574951, + "rewards/accuracy_reward": 0.6569791634877523, + "step": 380 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 94.375, + "epoch": 0.3152886007861527, + "grad_norm": 2.216244697570801, + "kl": 0.07970682779947917, + "learning_rate": 1e-06, + "loss": 0.0032, + "reward": 0.6346875006953875, + "reward_std": 0.37422069162130356, + "rewards/accuracy_reward": 0.6346875006953875, + "step": 381 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 75.98958333333333, + "epoch": 0.31611612992207433, + "grad_norm": 2.787137269973755, + "kl": 0.06302897135416667, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.5308333362142245, + "reward_std": 0.30324298578004044, + "rewards/accuracy_reward": 0.5308333362142245, + "step": 382 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 73.60416666666667, + "epoch": 0.316943659057996, + "grad_norm": 3.0902276039123535, + "kl": 0.08127848307291667, + "learning_rate": 1e-06, + "loss": 0.0033, + "reward": 0.5653125022848448, + "reward_std": 0.3349315399924914, + "rewards/accuracy_reward": 0.5653125022848448, + "step": 383 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 87.89583333333333, + "epoch": 0.31777118819391764, + "grad_norm": 2.054920196533203, + "kl": 0.0623779296875, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.5540625030795733, + "reward_std": 0.2757724275191625, + "rewards/accuracy_reward": 0.5540625030795733, + "step": 384 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 88.88541666666667, + "epoch": 0.3185987173298393, + "grad_norm": 3.115396499633789, + "kl": 0.043975830078125, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.6314583346247673, + "reward_std": 0.33379599700371426, + "rewards/accuracy_reward": 0.6314583346247673, + "step": 385 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 87.6875, + "epoch": 0.31942624646576095, + "grad_norm": 2.5537235736846924, + "kl": 0.06865437825520833, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.71385416512688, + "reward_std": 0.2965495400130749, + "rewards/accuracy_reward": 0.71385416512688, + "step": 386 + }, + { + "ave_tool_num": 0.125, + "completion_length": 87.39583333333333, + "epoch": 0.32025377560168267, + "grad_norm": 2.6104378700256348, + "kl": 0.050745646158854164, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.5898958295583725, + "reward_std": 0.30003349234660465, + "rewards/accuracy_reward": 0.5898958295583725, + "step": 387 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 75.46875, + "epoch": 0.3210813047376043, + "grad_norm": 2.842622995376587, + "kl": 0.055277506510416664, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.5724999920154611, + "reward_std": 0.3286873834828536, + "rewards/accuracy_reward": 0.5724999920154611, + "step": 388 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 82.90625, + "epoch": 0.321908833873526, + "grad_norm": 3.323568344116211, + "kl": 0.10235595703125, + "learning_rate": 1e-06, + "loss": 0.0041, + "reward": 0.5685416621466478, + "reward_std": 0.323000172773997, + "rewards/accuracy_reward": 0.5685416621466478, + "step": 389 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 82.72916666666667, + "epoch": 0.32273636300944764, + "grad_norm": 1.9498776197433472, + "kl": 0.049214680989583336, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.6091666743159294, + "reward_std": 0.35227737079064053, + "rewards/accuracy_reward": 0.6091666743159294, + "step": 390 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 87.41666666666667, + "epoch": 0.3235638921453693, + "grad_norm": 2.844447612762451, + "kl": 0.045847574869791664, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.5798958316445351, + "reward_std": 0.32965763627241057, + "rewards/accuracy_reward": 0.5798958316445351, + "step": 391 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 81.80208333333333, + "epoch": 0.32439142128129095, + "grad_norm": 2.4592700004577637, + "kl": 0.053680419921875, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.558645830800136, + "reward_std": 0.30168161913752556, + "rewards/accuracy_reward": 0.558645830800136, + "step": 392 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 82.94791666666667, + "epoch": 0.3252189504172126, + "grad_norm": 2.7374062538146973, + "kl": 0.07200113932291667, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.6010416659216086, + "reward_std": 0.32768502210577327, + "rewards/accuracy_reward": 0.6010416659216086, + "step": 393 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 85.70833333333333, + "epoch": 0.32604647955313426, + "grad_norm": 2.000427007675171, + "kl": 0.052164713541666664, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.56739583487312, + "reward_std": 0.3163845290740331, + "rewards/accuracy_reward": 0.56739583487312, + "step": 394 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 101.94791666666667, + "epoch": 0.3268740086890559, + "grad_norm": 2.3888015747070312, + "kl": 0.055775960286458336, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.56218750278155, + "reward_std": 0.3636915075282256, + "rewards/accuracy_reward": 0.56218750278155, + "step": 395 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 89.02083333333333, + "epoch": 0.3277015378249776, + "grad_norm": 2.007768154144287, + "kl": 0.046681722005208336, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.6849999924500784, + "reward_std": 0.2640085257589817, + "rewards/accuracy_reward": 0.6849999924500784, + "step": 396 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 115.375, + "epoch": 0.32852906696089923, + "grad_norm": 44.4231071472168, + "kl": 0.5502115885416666, + "learning_rate": 1e-06, + "loss": 0.0219, + "reward": 0.5770833427086473, + "reward_std": 0.2945540513222416, + "rewards/accuracy_reward": 0.5770833427086473, + "step": 397 + }, + { + "ave_tool_num": 0.125, + "completion_length": 74.77083333333333, + "epoch": 0.3293565960968209, + "grad_norm": 2.092595100402832, + "kl": 0.043284098307291664, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.6468749990065893, + "reward_std": 0.2570246023436387, + "rewards/accuracy_reward": 0.6468749990065893, + "step": 398 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 92.45833333333333, + "epoch": 0.33018412523274254, + "grad_norm": 2.1372499465942383, + "kl": 0.044881184895833336, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.6542708352208138, + "reward_std": 0.3234165670971076, + "rewards/accuracy_reward": 0.6542708352208138, + "step": 399 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 70.52083333333333, + "epoch": 0.33101165436866425, + "grad_norm": 1.9710873365402222, + "kl": 0.055562337239583336, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.6724999994039536, + "reward_std": 0.2961947483321031, + "rewards/accuracy_reward": 0.6724999994039536, + "step": 400 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 94.38541666666667, + "epoch": 0.3318391835045859, + "grad_norm": 42.94346237182617, + "kl": 0.8001302083333334, + "learning_rate": 1e-06, + "loss": 0.0323, + "reward": 0.42249999816219014, + "reward_std": 0.3208470543225606, + "rewards/accuracy_reward": 0.42249999816219014, + "step": 401 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 93.61458333333333, + "epoch": 0.33266671264050757, + "grad_norm": 5.367246150970459, + "kl": 0.13096110026041666, + "learning_rate": 1e-06, + "loss": 0.0053, + "reward": 0.5082291675110658, + "reward_std": 0.40094277014334995, + "rewards/accuracy_reward": 0.5082291675110658, + "step": 402 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 82.46875, + "epoch": 0.3334942417764292, + "grad_norm": 2.4224953651428223, + "kl": 0.05096435546875, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.4878125029305617, + "reward_std": 0.34266817073027295, + "rewards/accuracy_reward": 0.4878125029305617, + "step": 403 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 75.95833333333333, + "epoch": 0.3343217709123509, + "grad_norm": 2.7762441635131836, + "kl": 0.053558349609375, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.6645833365619183, + "reward_std": 0.26916606972614926, + "rewards/accuracy_reward": 0.6645833365619183, + "step": 404 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 88.60416666666667, + "epoch": 0.33514930004827254, + "grad_norm": 2.655616283416748, + "kl": 0.051320393880208336, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.573854165772597, + "reward_std": 0.301048976679643, + "rewards/accuracy_reward": 0.573854165772597, + "step": 405 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 83.23958333333333, + "epoch": 0.3359768291841942, + "grad_norm": 2.7072184085845947, + "kl": 0.059193929036458336, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.5317708396663269, + "reward_std": 0.30466883319119614, + "rewards/accuracy_reward": 0.5317708396663269, + "step": 406 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 93.5, + "epoch": 0.33680435832011585, + "grad_norm": 1.9978324174880981, + "kl": 0.041788736979166664, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.5172916650772095, + "reward_std": 0.2882639169692993, + "rewards/accuracy_reward": 0.5172916650772095, + "step": 407 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 93.48958333333333, + "epoch": 0.3376318874560375, + "grad_norm": 2.465507984161377, + "kl": 0.0577392578125, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.5401041681567827, + "reward_std": 0.37685637796918553, + "rewards/accuracy_reward": 0.5401041681567827, + "step": 408 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 86.30208333333333, + "epoch": 0.33845941659195916, + "grad_norm": 2.7758982181549072, + "kl": 0.0753173828125, + "learning_rate": 1e-06, + "loss": 0.003, + "reward": 0.45625000322858494, + "reward_std": 0.35297298648705083, + "rewards/accuracy_reward": 0.45625000322858494, + "step": 409 + }, + { + "ave_tool_num": 0.125, + "completion_length": 89.5625, + "epoch": 0.3392869457278808, + "grad_norm": 2.3110055923461914, + "kl": 0.046305338541666664, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.693020835518837, + "reward_std": 0.2935180465380351, + "rewards/accuracy_reward": 0.693020835518837, + "step": 410 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 93.09375, + "epoch": 0.3401144748638025, + "grad_norm": 2.1172337532043457, + "kl": 0.049570719401041664, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.6183333341032267, + "reward_std": 0.2291252457847198, + "rewards/accuracy_reward": 0.6183333341032267, + "step": 411 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 94.78125, + "epoch": 0.34094200399972413, + "grad_norm": 2.207946300506592, + "kl": 0.057515462239583336, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.42906250432133675, + "reward_std": 0.36027201513449353, + "rewards/accuracy_reward": 0.42906250432133675, + "step": 412 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 88.01041666666667, + "epoch": 0.34176953313564584, + "grad_norm": 3.490591287612915, + "kl": 0.10303751627604167, + "learning_rate": 1e-06, + "loss": 0.0041, + "reward": 0.51760416974624, + "reward_std": 0.32783228034774464, + "rewards/accuracy_reward": 0.51760416974624, + "step": 413 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 84.88541666666667, + "epoch": 0.3425970622715675, + "grad_norm": 3.8745181560516357, + "kl": 0.07098388671875, + "learning_rate": 1e-06, + "loss": 0.0028, + "reward": 0.40406250208616257, + "reward_std": 0.3898789460460345, + "rewards/accuracy_reward": 0.40406250208616257, + "step": 414 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 76.03125, + "epoch": 0.34342459140748915, + "grad_norm": 2.525129795074463, + "kl": 0.07353719075520833, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.6208333323399226, + "reward_std": 0.31014274060726166, + "rewards/accuracy_reward": 0.6208333323399226, + "step": 415 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 111.07291666666667, + "epoch": 0.3442521205434108, + "grad_norm": 2.0778844356536865, + "kl": 0.086669921875, + "learning_rate": 1e-06, + "loss": 0.0035, + "reward": 0.6489583402872086, + "reward_std": 0.2811567212144534, + "rewards/accuracy_reward": 0.6489583402872086, + "step": 416 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 86.57291666666667, + "epoch": 0.34507964967933247, + "grad_norm": 2.7181901931762695, + "kl": 0.09407552083333333, + "learning_rate": 1e-06, + "loss": 0.0038, + "reward": 0.5901041651765505, + "reward_std": 0.3079790820678075, + "rewards/accuracy_reward": 0.5901041651765505, + "step": 417 + }, + { + "ave_tool_num": 0.125, + "completion_length": 98.4375, + "epoch": 0.3459071788152541, + "grad_norm": 1.8927408456802368, + "kl": 0.05908203125, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.6313541680574417, + "reward_std": 0.38911137729883194, + "rewards/accuracy_reward": 0.6313541680574417, + "step": 418 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 86.20833333333333, + "epoch": 0.3467347079511758, + "grad_norm": 5.112675666809082, + "kl": 0.14792887369791666, + "learning_rate": 1e-06, + "loss": 0.0059, + "reward": 0.5769791699325045, + "reward_std": 0.33497043140232563, + "rewards/accuracy_reward": 0.5769791699325045, + "step": 419 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 85.875, + "epoch": 0.34756223708709744, + "grad_norm": 2.791884183883667, + "kl": 0.08371988932291667, + "learning_rate": 1e-06, + "loss": 0.0033, + "reward": 0.6232291708389918, + "reward_std": 0.29703788831830025, + "rewards/accuracy_reward": 0.6232291708389918, + "step": 420 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 147.58333333333334, + "epoch": 0.3483897662230191, + "grad_norm": 2.1637933254241943, + "kl": 0.052998860677083336, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.6478124981125196, + "reward_std": 0.3512174151837826, + "rewards/accuracy_reward": 0.6478124981125196, + "step": 421 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 77.0, + "epoch": 0.34921729535894075, + "grad_norm": 2.4625589847564697, + "kl": 0.054901123046875, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.617916668454806, + "reward_std": 0.37398961869378883, + "rewards/accuracy_reward": 0.617916668454806, + "step": 422 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 96.80208333333333, + "epoch": 0.3500448244948624, + "grad_norm": 1.757582426071167, + "kl": 0.052632649739583336, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.546875, + "reward_std": 0.3719704809288184, + "rewards/accuracy_reward": 0.546875, + "step": 423 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 86.54166666666667, + "epoch": 0.35087235363078406, + "grad_norm": 1.8485451936721802, + "kl": 0.050333658854166664, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.5908333336313566, + "reward_std": 0.2937912940979004, + "rewards/accuracy_reward": 0.5908333336313566, + "step": 424 + }, + { + "ave_tool_num": 0.125, + "completion_length": 90.29166666666667, + "epoch": 0.3516998827667057, + "grad_norm": 2.108473539352417, + "kl": 0.060323079427083336, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.45885416865348816, + "reward_std": 0.33889754737416905, + "rewards/accuracy_reward": 0.45885416865348816, + "step": 425 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 92.03125, + "epoch": 0.35252741190262743, + "grad_norm": 2.2411317825317383, + "kl": 0.063446044921875, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.4711458335320155, + "reward_std": 0.3216232992708683, + "rewards/accuracy_reward": 0.4711458335320155, + "step": 426 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 104.9375, + "epoch": 0.3533549410385491, + "grad_norm": 2.7765555381774902, + "kl": 0.052510579427083336, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.7120833297570547, + "reward_std": 0.33565126607815426, + "rewards/accuracy_reward": 0.7120833297570547, + "step": 427 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 89.10416666666667, + "epoch": 0.35418247017447074, + "grad_norm": 2.6107354164123535, + "kl": 0.0665283203125, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.6196875013411045, + "reward_std": 0.35930094743768376, + "rewards/accuracy_reward": 0.6196875013411045, + "step": 428 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 95.23958333333333, + "epoch": 0.3550099993103924, + "grad_norm": 1.8843382596969604, + "kl": 0.044036865234375, + "learning_rate": 1e-06, + "loss": 0.0018, + "reward": 0.4670833299557368, + "reward_std": 0.32446541761358577, + "rewards/accuracy_reward": 0.4670833299557368, + "step": 429 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 79.83333333333333, + "epoch": 0.35583752844631406, + "grad_norm": 2.7637746334075928, + "kl": 0.059183756510416664, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.6216666623950005, + "reward_std": 0.31048790117104846, + "rewards/accuracy_reward": 0.6216666623950005, + "step": 430 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 83.10416666666667, + "epoch": 0.3566650575822357, + "grad_norm": 6.829006671905518, + "kl": 0.20063273111979166, + "learning_rate": 1e-06, + "loss": 0.008, + "reward": 0.515104167163372, + "reward_std": 0.34989264731605846, + "rewards/accuracy_reward": 0.515104167163372, + "step": 431 + }, + { + "ave_tool_num": 0.17708333333333334, + "completion_length": 76.85416666666667, + "epoch": 0.35749258671815737, + "grad_norm": 2.570462226867676, + "kl": 0.08049519856770833, + "learning_rate": 1e-06, + "loss": 0.0032, + "reward": 0.5502083345005909, + "reward_std": 0.24982727132737637, + "rewards/accuracy_reward": 0.5502083345005909, + "step": 432 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 78.4375, + "epoch": 0.358320115854079, + "grad_norm": 2.7669501304626465, + "kl": 0.054911295572916664, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.6336458350221316, + "reward_std": 0.3598809652030468, + "rewards/accuracy_reward": 0.6336458350221316, + "step": 433 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 75.89583333333333, + "epoch": 0.3591476449900007, + "grad_norm": 4.176877021789551, + "kl": 0.061208089192708336, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.7018750011920929, + "reward_std": 0.3096493873745203, + "rewards/accuracy_reward": 0.7018750011920929, + "step": 434 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 74.53125, + "epoch": 0.35997517412592234, + "grad_norm": 2.5555152893066406, + "kl": 0.06871541341145833, + "learning_rate": 1e-06, + "loss": 0.0028, + "reward": 0.5611458383500576, + "reward_std": 0.3996543275813262, + "rewards/accuracy_reward": 0.5611458383500576, + "step": 435 + }, + { + "ave_tool_num": 0.19791666666666666, + "completion_length": 92.96875, + "epoch": 0.360802703261844, + "grad_norm": 2.342385768890381, + "kl": 0.051513671875, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.610520833482345, + "reward_std": 0.26787882049878436, + "rewards/accuracy_reward": 0.610520833482345, + "step": 436 + }, + { + "ave_tool_num": 0.16666666666666666, + "completion_length": 105.46875, + "epoch": 0.36163023239776565, + "grad_norm": 2.1473445892333984, + "kl": 0.06656901041666667, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.49781249836087227, + "reward_std": 0.3226981349289417, + "rewards/accuracy_reward": 0.49781249836087227, + "step": 437 + }, + { + "ave_tool_num": 0.125, + "completion_length": 74.38541666666667, + "epoch": 0.3624577615336873, + "grad_norm": 25.128156661987305, + "kl": 0.15907796223958334, + "learning_rate": 1e-06, + "loss": 0.0063, + "reward": 0.6843749961505333, + "reward_std": 0.2978026668230693, + "rewards/accuracy_reward": 0.6843749961505333, + "step": 438 + }, + { + "ave_tool_num": 0.19791666666666666, + "completion_length": 72.98958333333333, + "epoch": 0.363285290669609, + "grad_norm": 4.106093406677246, + "kl": 0.08565266927083333, + "learning_rate": 1e-06, + "loss": 0.0034, + "reward": 0.5768750036756197, + "reward_std": 0.3291751767198245, + "rewards/accuracy_reward": 0.5768750036756197, + "step": 439 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 80.75, + "epoch": 0.3641128198055307, + "grad_norm": 2.601335048675537, + "kl": 0.06438191731770833, + "learning_rate": 1e-06, + "loss": 0.0026, + "reward": 0.49000000456968945, + "reward_std": 0.3869963126877944, + "rewards/accuracy_reward": 0.49000000456968945, + "step": 440 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 70.09375, + "epoch": 0.36494034894145233, + "grad_norm": 2.334836721420288, + "kl": 0.07216389973958333, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.5633333387474219, + "reward_std": 0.3391263944407304, + "rewards/accuracy_reward": 0.5633333387474219, + "step": 441 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 129.89583333333334, + "epoch": 0.365767878077374, + "grad_norm": 3.4158713817596436, + "kl": 0.07147216796875, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.5060416708389918, + "reward_std": 0.32649823402365047, + "rewards/accuracy_reward": 0.5060416708389918, + "step": 442 + }, + { + "ave_tool_num": 0.1875, + "completion_length": 91.22916666666667, + "epoch": 0.36659540721329564, + "grad_norm": 4.669519901275635, + "kl": 0.058725992838541664, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.6676041608055433, + "reward_std": 0.342531135926644, + "rewards/accuracy_reward": 0.6676041608055433, + "step": 443 + }, + { + "ave_tool_num": 0.16666666666666666, + "completion_length": 70.59375, + "epoch": 0.3674229363492173, + "grad_norm": 2.391209125518799, + "kl": 0.08852132161458333, + "learning_rate": 1e-06, + "loss": 0.0035, + "reward": 0.5436458351711432, + "reward_std": 0.34518032396833104, + "rewards/accuracy_reward": 0.5436458351711432, + "step": 444 + }, + { + "ave_tool_num": 0.17708333333333334, + "completion_length": 77.13541666666667, + "epoch": 0.36825046548513896, + "grad_norm": 2.6492819786071777, + "kl": 0.049418131510416664, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.5192708323399226, + "reward_std": 0.32641540840268135, + "rewards/accuracy_reward": 0.5192708323399226, + "step": 445 + }, + { + "ave_tool_num": 0.22916666666666666, + "completion_length": 107.97916666666667, + "epoch": 0.3690779946210606, + "grad_norm": 3.7569103240966797, + "kl": 0.060139973958333336, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.6653124988079071, + "reward_std": 0.2291006309290727, + "rewards/accuracy_reward": 0.6653124988079071, + "step": 446 + }, + { + "ave_tool_num": 0.17708333333333334, + "completion_length": 88.625, + "epoch": 0.36990552375698227, + "grad_norm": 2.377312421798706, + "kl": 0.046620686848958336, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.5210416770229737, + "reward_std": 0.35212751664221287, + "rewards/accuracy_reward": 0.5210416770229737, + "step": 447 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 82.79166666666667, + "epoch": 0.3707330528929039, + "grad_norm": 3.4724843502044678, + "kl": 0.06285603841145833, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.6935416708389918, + "reward_std": 0.31975941856702167, + "rewards/accuracy_reward": 0.6935416708389918, + "step": 448 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 87.16666666666667, + "epoch": 0.3715605820288256, + "grad_norm": 3.9665839672088623, + "kl": 0.043162027994791664, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.5372916633884112, + "reward_std": 0.4008233758310477, + "rewards/accuracy_reward": 0.5372916633884112, + "step": 449 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 76.27083333333333, + "epoch": 0.37238811116474724, + "grad_norm": 2.6960608959198, + "kl": 0.056365966796875, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.6124999957780043, + "reward_std": 0.31522744335234165, + "rewards/accuracy_reward": 0.6124999957780043, + "step": 450 + }, + { + "ave_tool_num": 0.20833333333333334, + "completion_length": 70.53125, + "epoch": 0.3732156403006689, + "grad_norm": 3.1424853801727295, + "kl": 0.06748453776041667, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.615625003973643, + "reward_std": 0.3474488090723753, + "rewards/accuracy_reward": 0.615625003973643, + "step": 451 + }, + { + "ave_tool_num": 0.125, + "completion_length": 74.45833333333333, + "epoch": 0.3740431694365906, + "grad_norm": 2.3932924270629883, + "kl": 0.04766845703125, + "learning_rate": 1e-06, + "loss": 0.0019, + "reward": 0.5932291653007269, + "reward_std": 0.2891123853623867, + "rewards/accuracy_reward": 0.5932291653007269, + "step": 452 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 84.09375, + "epoch": 0.37487069857251226, + "grad_norm": 5.080103874206543, + "kl": 0.068084716796875, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.47239583854873973, + "reward_std": 0.3381155828634898, + "rewards/accuracy_reward": 0.47239583854873973, + "step": 453 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 97.15625, + "epoch": 0.3756982277084339, + "grad_norm": 2.295944929122925, + "kl": 0.06620279947916667, + "learning_rate": 1e-06, + "loss": 0.0026, + "reward": 0.6311458299557368, + "reward_std": 0.3349374532699585, + "rewards/accuracy_reward": 0.6311458299557368, + "step": 454 + }, + { + "ave_tool_num": 0.1875, + "completion_length": 101.89583333333333, + "epoch": 0.3765257568443556, + "grad_norm": 2.6252005100250244, + "kl": 0.059611002604166664, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.6502083291610082, + "reward_std": 0.2721450769652923, + "rewards/accuracy_reward": 0.6502083291610082, + "step": 455 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 79.32291666666667, + "epoch": 0.37735328598027723, + "grad_norm": 2.7697794437408447, + "kl": 0.06897989908854167, + "learning_rate": 1e-06, + "loss": 0.0028, + "reward": 0.5136458327372869, + "reward_std": 0.30437763780355453, + "rewards/accuracy_reward": 0.5136458327372869, + "step": 456 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 86.1875, + "epoch": 0.3781808151161989, + "grad_norm": 3.2622265815734863, + "kl": 0.06398518880208333, + "learning_rate": 1e-06, + "loss": 0.0026, + "reward": 0.7243749996026357, + "reward_std": 0.34105027467012405, + "rewards/accuracy_reward": 0.7243749996026357, + "step": 457 + }, + { + "ave_tool_num": 0.125, + "completion_length": 86.05208333333333, + "epoch": 0.37900834425212054, + "grad_norm": 3.1431055068969727, + "kl": 0.08906046549479167, + "learning_rate": 1e-06, + "loss": 0.0036, + "reward": 0.6372916648785273, + "reward_std": 0.3762894583245118, + "rewards/accuracy_reward": 0.6372916648785273, + "step": 458 + }, + { + "ave_tool_num": 0.125, + "completion_length": 80.98958333333333, + "epoch": 0.3798358733880422, + "grad_norm": 3.494805335998535, + "kl": 0.07602945963541667, + "learning_rate": 1e-06, + "loss": 0.003, + "reward": 0.49718749585251015, + "reward_std": 0.27898920451601344, + "rewards/accuracy_reward": 0.49718749585251015, + "step": 459 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 86.55208333333333, + "epoch": 0.38066340252396386, + "grad_norm": 2.5072741508483887, + "kl": 0.049601236979166664, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.6293750057617823, + "reward_std": 0.26824164514740306, + "rewards/accuracy_reward": 0.6293750057617823, + "step": 460 + }, + { + "ave_tool_num": 0.125, + "completion_length": 79.97916666666667, + "epoch": 0.3814909316598855, + "grad_norm": 3.0097672939300537, + "kl": 0.051661173502604164, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.5765624940395355, + "reward_std": 0.3909371035794417, + "rewards/accuracy_reward": 0.5765624940395355, + "step": 461 + }, + { + "ave_tool_num": 0.19791666666666666, + "completion_length": 75.3125, + "epoch": 0.38231846079580717, + "grad_norm": 2.3569204807281494, + "kl": 0.0556640625, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.6626041742662588, + "reward_std": 0.3010958408315976, + "rewards/accuracy_reward": 0.6626041742662588, + "step": 462 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 75.78125, + "epoch": 0.3831459899317288, + "grad_norm": 39.71074676513672, + "kl": 0.07155354817708333, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.5836458255847295, + "reward_std": 0.337051456173261, + "rewards/accuracy_reward": 0.5836458255847295, + "step": 463 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 81.09375, + "epoch": 0.3839735190676505, + "grad_norm": 3.936245918273926, + "kl": 0.11431884765625, + "learning_rate": 1e-06, + "loss": 0.0046, + "reward": 0.5261458357175192, + "reward_std": 0.3242458614210288, + "rewards/accuracy_reward": 0.5261458357175192, + "step": 464 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 84.53125, + "epoch": 0.3848010482035722, + "grad_norm": 2.2571544647216797, + "kl": 0.049336751302083336, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.6725000018874804, + "reward_std": 0.3531714801987012, + "rewards/accuracy_reward": 0.6725000018874804, + "step": 465 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 80.64583333333333, + "epoch": 0.38562857733949385, + "grad_norm": 2.570573329925537, + "kl": 0.06853230794270833, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.7590625012914339, + "reward_std": 0.3326664827764034, + "rewards/accuracy_reward": 0.7590625012914339, + "step": 466 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 89.875, + "epoch": 0.3864561064754155, + "grad_norm": 2.1712069511413574, + "kl": 0.09224446614583333, + "learning_rate": 1e-06, + "loss": 0.0037, + "reward": 0.5016666675607363, + "reward_std": 0.30503516271710396, + "rewards/accuracy_reward": 0.5016666675607363, + "step": 467 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 78.69791666666667, + "epoch": 0.38728363561133716, + "grad_norm": 1.8506814241409302, + "kl": 0.052388509114583336, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.5352083332836628, + "reward_std": 0.35910292466481525, + "rewards/accuracy_reward": 0.5352083332836628, + "step": 468 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 82.42708333333333, + "epoch": 0.3881111647472588, + "grad_norm": 2.672823190689087, + "kl": 0.06060791015625, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.6492708325386047, + "reward_std": 0.3448348641395569, + "rewards/accuracy_reward": 0.6492708325386047, + "step": 469 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 88.02083333333333, + "epoch": 0.3889386938831805, + "grad_norm": 3.4910147190093994, + "kl": 0.07222493489583333, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.4882291654745738, + "reward_std": 0.3268120586872101, + "rewards/accuracy_reward": 0.4882291654745738, + "step": 470 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 84.40625, + "epoch": 0.38976622301910213, + "grad_norm": 3.0309712886810303, + "kl": 0.055165608723958336, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.6457291717330614, + "reward_std": 0.33750447630882263, + "rewards/accuracy_reward": 0.6457291717330614, + "step": 471 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 67.32291666666667, + "epoch": 0.3905937521550238, + "grad_norm": 4.398570537567139, + "kl": 0.06951904296875, + "learning_rate": 1e-06, + "loss": 0.0028, + "reward": 0.6874999900658926, + "reward_std": 0.34041650717457134, + "rewards/accuracy_reward": 0.6874999900658926, + "step": 472 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 89.23958333333333, + "epoch": 0.39142128129094544, + "grad_norm": 2.4567558765411377, + "kl": 0.056355794270833336, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.627500002582868, + "reward_std": 0.31325888199110824, + "rewards/accuracy_reward": 0.627500002582868, + "step": 473 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 80.05208333333333, + "epoch": 0.3922488104268671, + "grad_norm": 8.877187728881836, + "kl": 0.1605224609375, + "learning_rate": 1e-06, + "loss": 0.0064, + "reward": 0.6366666654745737, + "reward_std": 0.30868744974335033, + "rewards/accuracy_reward": 0.6366666654745737, + "step": 474 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 90.23958333333333, + "epoch": 0.39307633956278876, + "grad_norm": 2.290982723236084, + "kl": 0.06373087565104167, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.5997916658719381, + "reward_std": 0.24585207551717758, + "rewards/accuracy_reward": 0.5997916658719381, + "step": 475 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 79.39583333333333, + "epoch": 0.3939038686987104, + "grad_norm": 6.408491134643555, + "kl": 0.050893147786458336, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.5540625030795733, + "reward_std": 0.3156542517244816, + "rewards/accuracy_reward": 0.5540625030795733, + "step": 476 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 88.17708333333333, + "epoch": 0.39473139783463207, + "grad_norm": 2.441671848297119, + "kl": 0.05145263671875, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.584791666517655, + "reward_std": 0.36274126668771106, + "rewards/accuracy_reward": 0.584791666517655, + "step": 477 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 83.04166666666667, + "epoch": 0.3955589269705538, + "grad_norm": 2.3616368770599365, + "kl": 0.11506144205729167, + "learning_rate": 1e-06, + "loss": 0.0046, + "reward": 0.5564583341280619, + "reward_std": 0.27746492872635525, + "rewards/accuracy_reward": 0.5564583341280619, + "step": 478 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 79.61458333333333, + "epoch": 0.39638645610647544, + "grad_norm": 2.5547938346862793, + "kl": 0.053782145182291664, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.6201041638851166, + "reward_std": 0.29325684408346814, + "rewards/accuracy_reward": 0.6201041638851166, + "step": 479 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 77.79166666666667, + "epoch": 0.3972139852423971, + "grad_norm": 4.969768524169922, + "kl": 0.08894856770833333, + "learning_rate": 1e-06, + "loss": 0.0035, + "reward": 0.4898958330353101, + "reward_std": 0.35761539824306965, + "rewards/accuracy_reward": 0.4898958330353101, + "step": 480 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 90.25, + "epoch": 0.39804151437831875, + "grad_norm": 2.3286705017089844, + "kl": 0.061625162760416664, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.6042708350966374, + "reward_std": 0.2940767475714286, + "rewards/accuracy_reward": 0.6042708350966374, + "step": 481 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 76.55208333333333, + "epoch": 0.3988690435142404, + "grad_norm": 2.22641658782959, + "kl": 0.063262939453125, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.5836458305517832, + "reward_std": 0.266540564596653, + "rewards/accuracy_reward": 0.5836458305517832, + "step": 482 + }, + { + "ave_tool_num": 0.17708333333333334, + "completion_length": 66.11458333333333, + "epoch": 0.39969657265016206, + "grad_norm": 2.534559965133667, + "kl": 0.055216471354166664, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.6314583395918211, + "reward_std": 0.2755290362983942, + "rewards/accuracy_reward": 0.6314583395918211, + "step": 483 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 82.91666666666667, + "epoch": 0.4005241017860837, + "grad_norm": 2.5155751705169678, + "kl": 0.13877360026041666, + "learning_rate": 1e-06, + "loss": 0.0055, + "reward": 0.6543750030299028, + "reward_std": 0.3011659272015095, + "rewards/accuracy_reward": 0.6543750030299028, + "step": 484 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 74.15625, + "epoch": 0.4013516309220054, + "grad_norm": 2.5521512031555176, + "kl": 0.06597900390625, + "learning_rate": 1e-06, + "loss": 0.0026, + "reward": 0.4929166721800963, + "reward_std": 0.31452468410134315, + "rewards/accuracy_reward": 0.4929166721800963, + "step": 485 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 79.84375, + "epoch": 0.40217916005792703, + "grad_norm": 2.2335448265075684, + "kl": 0.061147054036458336, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.6681249986092249, + "reward_std": 0.29376762608687085, + "rewards/accuracy_reward": 0.6681249986092249, + "step": 486 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 88.8125, + "epoch": 0.4030066891938487, + "grad_norm": 2.660547971725464, + "kl": 0.06675211588541667, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.3727083299309015, + "reward_std": 0.3821374736726284, + "rewards/accuracy_reward": 0.3727083299309015, + "step": 487 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 80.15625, + "epoch": 0.40383421832977034, + "grad_norm": 2.703193187713623, + "kl": 0.07903035481770833, + "learning_rate": 1e-06, + "loss": 0.0032, + "reward": 0.6215625007947286, + "reward_std": 0.3091547430182497, + "rewards/accuracy_reward": 0.6215625007947286, + "step": 488 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 76.98958333333333, + "epoch": 0.404661747465692, + "grad_norm": 1.941788673400879, + "kl": 0.061909993489583336, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.697604164481163, + "reward_std": 0.31756891931096715, + "rewards/accuracy_reward": 0.697604164481163, + "step": 489 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 100.94791666666667, + "epoch": 0.40548927660161366, + "grad_norm": 3.279859781265259, + "kl": 0.10888671875, + "learning_rate": 1e-06, + "loss": 0.0044, + "reward": 0.4911458318432172, + "reward_std": 0.38452348733941716, + "rewards/accuracy_reward": 0.4911458318432172, + "step": 490 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 74.35416666666667, + "epoch": 0.40631680573753537, + "grad_norm": 2.855191707611084, + "kl": 0.09745279947916667, + "learning_rate": 1e-06, + "loss": 0.0039, + "reward": 0.4348958333333333, + "reward_std": 0.32758783486982185, + "rewards/accuracy_reward": 0.4348958333333333, + "step": 491 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 77.79166666666667, + "epoch": 0.407144334873457, + "grad_norm": 2.3664543628692627, + "kl": 0.055079142252604164, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.6382291664679846, + "reward_std": 0.295665663977464, + "rewards/accuracy_reward": 0.6382291664679846, + "step": 492 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 79.33333333333333, + "epoch": 0.4079718640093787, + "grad_norm": 1.7798676490783691, + "kl": 0.054723103841145836, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.6714583312471708, + "reward_std": 0.2517998901506265, + "rewards/accuracy_reward": 0.6714583312471708, + "step": 493 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 97.32291666666667, + "epoch": 0.40879939314530034, + "grad_norm": 2.655663013458252, + "kl": 0.063140869140625, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.5479166706403097, + "reward_std": 0.2903564839313428, + "rewards/accuracy_reward": 0.5479166706403097, + "step": 494 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 74.82291666666667, + "epoch": 0.409626922281222, + "grad_norm": 2.7197587490081787, + "kl": 0.060017903645833336, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.6937499940395355, + "reward_std": 0.371356134613355, + "rewards/accuracy_reward": 0.6937499940395355, + "step": 495 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 81.33333333333333, + "epoch": 0.41045445141714365, + "grad_norm": 5.2610249519348145, + "kl": 0.14138285319010416, + "learning_rate": 1e-06, + "loss": 0.0057, + "reward": 0.4920833359162013, + "reward_std": 0.3418480691810449, + "rewards/accuracy_reward": 0.4920833359162013, + "step": 496 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 112.85416666666667, + "epoch": 0.4112819805530653, + "grad_norm": 20.029863357543945, + "kl": 0.2694295247395833, + "learning_rate": 1e-06, + "loss": 0.0108, + "reward": 0.5473958303531011, + "reward_std": 0.39735861495137215, + "rewards/accuracy_reward": 0.5473958303531011, + "step": 497 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 78.35416666666667, + "epoch": 0.41210950968898696, + "grad_norm": 2.389545440673828, + "kl": 0.08465576171875, + "learning_rate": 1e-06, + "loss": 0.0034, + "reward": 0.5679166664679846, + "reward_std": 0.34177835409839946, + "rewards/accuracy_reward": 0.5679166664679846, + "step": 498 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 81.44791666666667, + "epoch": 0.4129370388249086, + "grad_norm": 1.9008443355560303, + "kl": 0.055246988932291664, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.5254166709880034, + "reward_std": 0.25208471684406203, + "rewards/accuracy_reward": 0.5254166709880034, + "step": 499 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 89.02083333333333, + "epoch": 0.4137645679608303, + "grad_norm": 2.5143511295318604, + "kl": 0.055714925130208336, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.5808333332339922, + "reward_std": 0.36091291283567745, + "rewards/accuracy_reward": 0.5808333332339922, + "step": 500 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 78.95833333333333, + "epoch": 0.41459209709675193, + "grad_norm": 3.301988363265991, + "kl": 0.06633504231770833, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.6759375035762787, + "reward_std": 0.362764115134875, + "rewards/accuracy_reward": 0.6759375035762787, + "step": 501 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 79.02083333333333, + "epoch": 0.4154196262326736, + "grad_norm": 3.4665110111236572, + "kl": 0.061787923177083336, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.5959375003973643, + "reward_std": 0.33722777167956036, + "rewards/accuracy_reward": 0.5959375003973643, + "step": 502 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 71.73958333333333, + "epoch": 0.41624715536859525, + "grad_norm": 2.2843403816223145, + "kl": 0.07674153645833333, + "learning_rate": 1e-06, + "loss": 0.0031, + "reward": 0.49020833397905034, + "reward_std": 0.3350026396413644, + "rewards/accuracy_reward": 0.49020833397905034, + "step": 503 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 87.59375, + "epoch": 0.41707468450451696, + "grad_norm": 2.4463114738464355, + "kl": 0.0804443359375, + "learning_rate": 1e-06, + "loss": 0.0032, + "reward": 0.5013541653752327, + "reward_std": 0.36176376913984615, + "rewards/accuracy_reward": 0.5013541653752327, + "step": 504 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 92.42708333333333, + "epoch": 0.4179022136404386, + "grad_norm": 3.340136766433716, + "kl": 0.0684814453125, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.5408333316445351, + "reward_std": 0.3662014032403628, + "rewards/accuracy_reward": 0.5408333316445351, + "step": 505 + }, + { + "ave_tool_num": 0.125, + "completion_length": 85.40625, + "epoch": 0.41872974277636027, + "grad_norm": 12.52741527557373, + "kl": 0.273162841796875, + "learning_rate": 1e-06, + "loss": 0.0109, + "reward": 0.613645834227403, + "reward_std": 0.30271811162432033, + "rewards/accuracy_reward": 0.613645834227403, + "step": 506 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 68.63541666666667, + "epoch": 0.4195572719122819, + "grad_norm": 2.359797954559326, + "kl": 0.06315104166666667, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.5941666687528292, + "reward_std": 0.2944101244211197, + "rewards/accuracy_reward": 0.5941666687528292, + "step": 507 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 70.21875, + "epoch": 0.4203848010482036, + "grad_norm": 3.3555843830108643, + "kl": 0.07889811197916667, + "learning_rate": 1e-06, + "loss": 0.0032, + "reward": 0.559895838300387, + "reward_std": 0.3046487619479497, + "rewards/accuracy_reward": 0.559895838300387, + "step": 508 + }, + { + "ave_tool_num": 0.020833333333333332, + "completion_length": 81.15625, + "epoch": 0.42121233018412524, + "grad_norm": 2.5118165016174316, + "kl": 0.065216064453125, + "learning_rate": 1e-06, + "loss": 0.0026, + "reward": 0.7283333353698254, + "reward_std": 0.2423290442675352, + "rewards/accuracy_reward": 0.7283333353698254, + "step": 509 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 77.625, + "epoch": 0.4220398593200469, + "grad_norm": 2.601414442062378, + "kl": 0.09843953450520833, + "learning_rate": 1e-06, + "loss": 0.0039, + "reward": 0.6467708374063174, + "reward_std": 0.2615513900915782, + "rewards/accuracy_reward": 0.6467708374063174, + "step": 510 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 89.59375, + "epoch": 0.42286738845596855, + "grad_norm": 2.3883564472198486, + "kl": 0.059468587239583336, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.5592708388964335, + "reward_std": 0.3428346514701843, + "rewards/accuracy_reward": 0.5592708388964335, + "step": 511 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 69.07291666666667, + "epoch": 0.4236949175918902, + "grad_norm": 6.7317938804626465, + "kl": 0.072906494140625, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.6904166638851166, + "reward_std": 0.35321945572892827, + "rewards/accuracy_reward": 0.6904166638851166, + "step": 512 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 84.15625, + "epoch": 0.42452244672781186, + "grad_norm": 3.510201930999756, + "kl": 0.09830729166666667, + "learning_rate": 1e-06, + "loss": 0.0039, + "reward": 0.5460416668405136, + "reward_std": 0.29387207080920535, + "rewards/accuracy_reward": 0.5460416668405136, + "step": 513 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 79.10416666666667, + "epoch": 0.4253499758637335, + "grad_norm": 2.52903413772583, + "kl": 0.08607991536458333, + "learning_rate": 1e-06, + "loss": 0.0034, + "reward": 0.5127083373566469, + "reward_std": 0.3080407294134299, + "rewards/accuracy_reward": 0.5127083373566469, + "step": 514 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 81.32291666666667, + "epoch": 0.4261775049996552, + "grad_norm": 2.5130763053894043, + "kl": 0.061503092447916664, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.5221875105053186, + "reward_std": 0.3523649958272775, + "rewards/accuracy_reward": 0.5221875105053186, + "step": 515 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 90.84375, + "epoch": 0.42700503413557683, + "grad_norm": 2.425412893295288, + "kl": 0.07014973958333333, + "learning_rate": 1e-06, + "loss": 0.0028, + "reward": 0.5071875009064873, + "reward_std": 0.34423115601142246, + "rewards/accuracy_reward": 0.5071875009064873, + "step": 516 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 90.69791666666667, + "epoch": 0.42783256327149854, + "grad_norm": 2.987445831298828, + "kl": 0.056396484375, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.7265624950329462, + "reward_std": 0.29423307068645954, + "rewards/accuracy_reward": 0.7265624950329462, + "step": 517 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 85.88541666666667, + "epoch": 0.4286600924074202, + "grad_norm": 2.8483691215515137, + "kl": 0.09425862630208333, + "learning_rate": 1e-06, + "loss": 0.0038, + "reward": 0.5781250086923441, + "reward_std": 0.3119236460576455, + "rewards/accuracy_reward": 0.5781250086923441, + "step": 518 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 84.88541666666667, + "epoch": 0.42948762154334186, + "grad_norm": 1.7578259706497192, + "kl": 0.057413736979166664, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.7205208390951157, + "reward_std": 0.27651917561888695, + "rewards/accuracy_reward": 0.7205208390951157, + "step": 519 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 69.55208333333333, + "epoch": 0.4303151506792635, + "grad_norm": 2.472729206085205, + "kl": 0.068603515625, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.5325000037749609, + "reward_std": 0.3141136057674885, + "rewards/accuracy_reward": 0.5325000037749609, + "step": 520 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 88.79166666666667, + "epoch": 0.43114267981518517, + "grad_norm": 1.7543365955352783, + "kl": 0.061279296875, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.5565625025580326, + "reward_std": 0.2625620576242606, + "rewards/accuracy_reward": 0.5565625025580326, + "step": 521 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 119.55208333333333, + "epoch": 0.4319702089511068, + "grad_norm": 3.2144923210144043, + "kl": 0.08314005533854167, + "learning_rate": 1e-06, + "loss": 0.0033, + "reward": 0.5077083321909109, + "reward_std": 0.3109541026254495, + "rewards/accuracy_reward": 0.5077083321909109, + "step": 522 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 84.66666666666667, + "epoch": 0.4327977380870285, + "grad_norm": 9.808760643005371, + "kl": 0.060160319010416664, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.5968749982615312, + "reward_std": 0.29418232167760533, + "rewards/accuracy_reward": 0.5968749982615312, + "step": 523 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 88.11458333333333, + "epoch": 0.43362526722295014, + "grad_norm": 2.398836851119995, + "kl": 0.081939697265625, + "learning_rate": 1e-06, + "loss": 0.0033, + "reward": 0.5339583313713471, + "reward_std": 0.29705464156965417, + "rewards/accuracy_reward": 0.5339583313713471, + "step": 524 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 76.55208333333333, + "epoch": 0.4344527963588718, + "grad_norm": 2.6958365440368652, + "kl": 0.0948486328125, + "learning_rate": 1e-06, + "loss": 0.0038, + "reward": 0.644895834227403, + "reward_std": 0.310048695653677, + "rewards/accuracy_reward": 0.644895834227403, + "step": 525 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 83.70833333333333, + "epoch": 0.43528032549479345, + "grad_norm": 2.6507952213287354, + "kl": 0.07966105143229167, + "learning_rate": 1e-06, + "loss": 0.0032, + "reward": 0.6429166719317436, + "reward_std": 0.3418510730067889, + "rewards/accuracy_reward": 0.6429166719317436, + "step": 526 + }, + { + "ave_tool_num": 0.125, + "completion_length": 86.96875, + "epoch": 0.4361078546307151, + "grad_norm": 4.914710521697998, + "kl": 0.08599853515625, + "learning_rate": 1e-06, + "loss": 0.0034, + "reward": 0.630208338300387, + "reward_std": 0.320287103842323, + "rewards/accuracy_reward": 0.630208338300387, + "step": 527 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 83.25, + "epoch": 0.43693538376663676, + "grad_norm": 2.200441598892212, + "kl": 0.07137044270833333, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.5129166667660078, + "reward_std": 0.33183320487538975, + "rewards/accuracy_reward": 0.5129166667660078, + "step": 528 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 76.14583333333333, + "epoch": 0.4377629129025584, + "grad_norm": 2.2616074085235596, + "kl": 0.07598876953125, + "learning_rate": 1e-06, + "loss": 0.003, + "reward": 0.42208333127200603, + "reward_std": 0.3151808424542348, + "rewards/accuracy_reward": 0.42208333127200603, + "step": 529 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 81.90625, + "epoch": 0.4385904420384801, + "grad_norm": 3.10115647315979, + "kl": 0.06587727864583333, + "learning_rate": 1e-06, + "loss": 0.0026, + "reward": 0.5521875023841858, + "reward_std": 0.3814128910501798, + "rewards/accuracy_reward": 0.5521875023841858, + "step": 530 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 82.69791666666667, + "epoch": 0.4394179711744018, + "grad_norm": 4.260361671447754, + "kl": 0.06322224934895833, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.6747916688521703, + "reward_std": 0.29260610168178874, + "rewards/accuracy_reward": 0.6747916688521703, + "step": 531 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 79.35416666666667, + "epoch": 0.44024550031032345, + "grad_norm": 2.2120413780212402, + "kl": 0.06287638346354167, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.7212499976158142, + "reward_std": 0.34772756323218346, + "rewards/accuracy_reward": 0.7212499976158142, + "step": 532 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 92.21875, + "epoch": 0.4410730294462451, + "grad_norm": 2.1801977157592773, + "kl": 0.055908203125, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.5826041648785273, + "reward_std": 0.37083063771327335, + "rewards/accuracy_reward": 0.5826041648785273, + "step": 533 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 90.61458333333333, + "epoch": 0.44190055858216676, + "grad_norm": 3.9577343463897705, + "kl": 0.12738037109375, + "learning_rate": 1e-06, + "loss": 0.0051, + "reward": 0.5044791645680865, + "reward_std": 0.29050329575936, + "rewards/accuracy_reward": 0.5044791645680865, + "step": 534 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 79.77083333333333, + "epoch": 0.4427280877180884, + "grad_norm": 1.9693899154663086, + "kl": 0.09455362955729167, + "learning_rate": 1e-06, + "loss": 0.0038, + "reward": 0.621458334227403, + "reward_std": 0.3199639481802781, + "rewards/accuracy_reward": 0.621458334227403, + "step": 535 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 82.375, + "epoch": 0.44355561685401007, + "grad_norm": 2.2961039543151855, + "kl": 0.057657877604166664, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.6564583331346512, + "reward_std": 0.3004794629911582, + "rewards/accuracy_reward": 0.6564583331346512, + "step": 536 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 85.95833333333333, + "epoch": 0.4443831459899317, + "grad_norm": 3.846783399581909, + "kl": 0.13338216145833334, + "learning_rate": 1e-06, + "loss": 0.0053, + "reward": 0.6118749976158142, + "reward_std": 0.3688078410923481, + "rewards/accuracy_reward": 0.6118749976158142, + "step": 537 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 78.20833333333333, + "epoch": 0.4452106751258534, + "grad_norm": 6.9856414794921875, + "kl": 0.19624837239583334, + "learning_rate": 1e-06, + "loss": 0.0079, + "reward": 0.5202083339293798, + "reward_std": 0.3239744318028291, + "rewards/accuracy_reward": 0.5202083339293798, + "step": 538 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 76.14583333333333, + "epoch": 0.44603820426177504, + "grad_norm": 2.2047231197357178, + "kl": 0.055684407552083336, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.5912500023841858, + "reward_std": 0.3739611729979515, + "rewards/accuracy_reward": 0.5912500023841858, + "step": 539 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 89.04166666666667, + "epoch": 0.4468657333976967, + "grad_norm": 2.2939956188201904, + "kl": 0.060089111328125, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.6232291683554649, + "reward_std": 0.3336292468011379, + "rewards/accuracy_reward": 0.6232291683554649, + "step": 540 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 79.97916666666667, + "epoch": 0.44769326253361835, + "grad_norm": 2.3232414722442627, + "kl": 0.07425944010416667, + "learning_rate": 1e-06, + "loss": 0.003, + "reward": 0.6006250033775965, + "reward_std": 0.31665030742685, + "rewards/accuracy_reward": 0.6006250033775965, + "step": 541 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 69.80208333333333, + "epoch": 0.44852079166954, + "grad_norm": 2.428826093673706, + "kl": 0.0718994140625, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.7462500035762787, + "reward_std": 0.29222796981533367, + "rewards/accuracy_reward": 0.7462500035762787, + "step": 542 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 79.40625, + "epoch": 0.44934832080546167, + "grad_norm": 1.774810791015625, + "kl": 0.055918375651041664, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.6652083396911621, + "reward_std": 0.28938665613532066, + "rewards/accuracy_reward": 0.6652083396911621, + "step": 543 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 77.16666666666667, + "epoch": 0.4501758499413834, + "grad_norm": 2.912398099899292, + "kl": 0.06297810872395833, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.6106250059480468, + "reward_std": 0.29190604326625663, + "rewards/accuracy_reward": 0.6106250059480468, + "step": 544 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 77.88541666666667, + "epoch": 0.45100337907730503, + "grad_norm": 2.415285348892212, + "kl": 0.0511627197265625, + "learning_rate": 1e-06, + "loss": 0.002, + "reward": 0.45666666825612384, + "reward_std": 0.35057665531833965, + "rewards/accuracy_reward": 0.45666666825612384, + "step": 545 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 75.47916666666667, + "epoch": 0.4518309082132267, + "grad_norm": 22.66848373413086, + "kl": 0.5979817708333334, + "learning_rate": 1e-06, + "loss": 0.0239, + "reward": 0.46937499443689984, + "reward_std": 0.3258407811323802, + "rewards/accuracy_reward": 0.46937499443689984, + "step": 546 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 80.15625, + "epoch": 0.45265843734914835, + "grad_norm": 2.2880702018737793, + "kl": 0.08616129557291667, + "learning_rate": 1e-06, + "loss": 0.0034, + "reward": 0.5184375022848448, + "reward_std": 0.36886994043986004, + "rewards/accuracy_reward": 0.5184375022848448, + "step": 547 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 70.76041666666667, + "epoch": 0.45348596648507, + "grad_norm": 2.9146740436553955, + "kl": 0.06337483723958333, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.5218749977648258, + "reward_std": 0.3987623167534669, + "rewards/accuracy_reward": 0.5218749977648258, + "step": 548 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 77.47916666666667, + "epoch": 0.45431349562099166, + "grad_norm": 2.447324752807617, + "kl": 0.08259073893229167, + "learning_rate": 1e-06, + "loss": 0.0033, + "reward": 0.4831250036756198, + "reward_std": 0.3009151400377353, + "rewards/accuracy_reward": 0.4831250036756198, + "step": 549 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 90.36458333333333, + "epoch": 0.4551410247569133, + "grad_norm": 2.1301467418670654, + "kl": 0.060475667317708336, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.5156250049670538, + "reward_std": 0.3178652413189411, + "rewards/accuracy_reward": 0.5156250049670538, + "step": 550 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 76.14583333333333, + "epoch": 0.45596855389283497, + "grad_norm": 2.116184711456299, + "kl": 0.052500406901041664, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.5848958330849806, + "reward_std": 0.3174169932802518, + "rewards/accuracy_reward": 0.5848958330849806, + "step": 551 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 72.67708333333333, + "epoch": 0.4567960830287566, + "grad_norm": 2.18562912940979, + "kl": 0.0787353515625, + "learning_rate": 1e-06, + "loss": 0.0032, + "reward": 0.5554166585206985, + "reward_std": 0.3661230293413003, + "rewards/accuracy_reward": 0.5554166585206985, + "step": 552 + }, + { + "ave_tool_num": 0.020833333333333332, + "completion_length": 95.42708333333333, + "epoch": 0.4576236121646783, + "grad_norm": 1.6227275133132935, + "kl": 0.052734375, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.4968749980131785, + "reward_std": 0.33039665843049687, + "rewards/accuracy_reward": 0.4968749980131785, + "step": 553 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 79.57291666666667, + "epoch": 0.45845114130059994, + "grad_norm": 4.555810928344727, + "kl": 0.06182861328125, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.6433333357175192, + "reward_std": 0.32328893306354684, + "rewards/accuracy_reward": 0.6433333357175192, + "step": 554 + }, + { + "ave_tool_num": 0.020833333333333332, + "completion_length": 67.375, + "epoch": 0.4592786704365216, + "grad_norm": 1.7391284704208374, + "kl": 0.041514078776041664, + "learning_rate": 1e-06, + "loss": 0.0017, + "reward": 0.7261458362142245, + "reward_std": 0.3125275665273269, + "rewards/accuracy_reward": 0.7261458362142245, + "step": 555 + }, + { + "ave_tool_num": 0.010416666666666666, + "completion_length": 88.54166666666667, + "epoch": 0.46010619957244325, + "grad_norm": 2.364255905151367, + "kl": 0.06388346354166667, + "learning_rate": 1e-06, + "loss": 0.0026, + "reward": 0.5259375032037497, + "reward_std": 0.31210918352007866, + "rewards/accuracy_reward": 0.5259375032037497, + "step": 556 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 89.27083333333333, + "epoch": 0.46093372870836496, + "grad_norm": 2.1205263137817383, + "kl": 0.07616170247395833, + "learning_rate": 1e-06, + "loss": 0.003, + "reward": 0.5086458399891853, + "reward_std": 0.3179463321963946, + "rewards/accuracy_reward": 0.5086458399891853, + "step": 557 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 68.23958333333333, + "epoch": 0.4617612578442866, + "grad_norm": 1.9089363813400269, + "kl": 0.09134928385416667, + "learning_rate": 1e-06, + "loss": 0.0037, + "reward": 0.6478125005960464, + "reward_std": 0.29955217738946277, + "rewards/accuracy_reward": 0.6478125005960464, + "step": 558 + }, + { + "ave_tool_num": 0.010416666666666666, + "completion_length": 75.375, + "epoch": 0.4625887869802083, + "grad_norm": 2.512138605117798, + "kl": 0.07062276204427083, + "learning_rate": 1e-06, + "loss": 0.0028, + "reward": 0.5665624986092249, + "reward_std": 0.3460591062903404, + "rewards/accuracy_reward": 0.5665624986092249, + "step": 559 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 70.0625, + "epoch": 0.46341631611612993, + "grad_norm": 7.94404935836792, + "kl": 0.13767496744791666, + "learning_rate": 1e-06, + "loss": 0.0055, + "reward": 0.558958334227403, + "reward_std": 0.3682125322520733, + "rewards/accuracy_reward": 0.558958334227403, + "step": 560 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 86.77083333333333, + "epoch": 0.4642438452520516, + "grad_norm": 2.656304359436035, + "kl": 0.08031209309895833, + "learning_rate": 1e-06, + "loss": 0.0032, + "reward": 0.6535416692495346, + "reward_std": 0.3014206712444623, + "rewards/accuracy_reward": 0.6535416692495346, + "step": 561 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 71.67708333333333, + "epoch": 0.46507137438797325, + "grad_norm": 2.172314167022705, + "kl": 0.08550008138020833, + "learning_rate": 1e-06, + "loss": 0.0034, + "reward": 0.5890625019868215, + "reward_std": 0.3199536092579365, + "rewards/accuracy_reward": 0.5890625019868215, + "step": 562 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 80.40625, + "epoch": 0.4658989035238949, + "grad_norm": 3.965059518814087, + "kl": 0.12660725911458334, + "learning_rate": 1e-06, + "loss": 0.0051, + "reward": 0.5603124996026357, + "reward_std": 0.29361430493493873, + "rewards/accuracy_reward": 0.5603124996026357, + "step": 563 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 97.97916666666667, + "epoch": 0.46672643265981656, + "grad_norm": 2.303295373916626, + "kl": 0.054804484049479164, + "learning_rate": 1e-06, + "loss": 0.0022, + "reward": 0.6004166702429453, + "reward_std": 0.39157138764858246, + "rewards/accuracy_reward": 0.6004166702429453, + "step": 564 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 70.1875, + "epoch": 0.4675539617957382, + "grad_norm": 3.9469234943389893, + "kl": 0.08154296875, + "learning_rate": 1e-06, + "loss": 0.0033, + "reward": 0.6820833285649618, + "reward_std": 0.3344019142289956, + "rewards/accuracy_reward": 0.6820833285649618, + "step": 565 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 73.48958333333333, + "epoch": 0.46838149093165987, + "grad_norm": 2.2909750938415527, + "kl": 0.06005859375, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.6901041666666666, + "reward_std": 0.339948703845342, + "rewards/accuracy_reward": 0.6901041666666666, + "step": 566 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 83.90625, + "epoch": 0.46920902006758153, + "grad_norm": 2.1296281814575195, + "kl": 0.08475748697916667, + "learning_rate": 1e-06, + "loss": 0.0034, + "reward": 0.5630208278695742, + "reward_std": 0.35333140070239705, + "rewards/accuracy_reward": 0.5630208278695742, + "step": 567 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 83.0625, + "epoch": 0.4700365492035032, + "grad_norm": 2.0478339195251465, + "kl": 0.07987467447916667, + "learning_rate": 1e-06, + "loss": 0.0032, + "reward": 0.5618750005960464, + "reward_std": 0.31137128671010333, + "rewards/accuracy_reward": 0.5618750005960464, + "step": 568 + }, + { + "ave_tool_num": 0.020833333333333332, + "completion_length": 96.54166666666667, + "epoch": 0.47086407833942484, + "grad_norm": 1.8684468269348145, + "kl": 0.061482747395833336, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.5445833342770735, + "reward_std": 0.4250177244345347, + "rewards/accuracy_reward": 0.5445833342770735, + "step": 569 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 67.39583333333333, + "epoch": 0.47169160747534655, + "grad_norm": 2.6493306159973145, + "kl": 0.10428873697916667, + "learning_rate": 1e-06, + "loss": 0.0042, + "reward": 0.5719791632145643, + "reward_std": 0.2530343836794297, + "rewards/accuracy_reward": 0.5719791632145643, + "step": 570 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 82.88541666666667, + "epoch": 0.4725191366112682, + "grad_norm": 2.295318365097046, + "kl": 0.07517496744791667, + "learning_rate": 1e-06, + "loss": 0.003, + "reward": 0.5371875092387199, + "reward_std": 0.31798192982872325, + "rewards/accuracy_reward": 0.5371875092387199, + "step": 571 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 78.375, + "epoch": 0.47334666574718987, + "grad_norm": 1.9621201753616333, + "kl": 0.06315104166666667, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.6102083325386047, + "reward_std": 0.318935605386893, + "rewards/accuracy_reward": 0.6102083325386047, + "step": 572 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 65.3125, + "epoch": 0.4741741948831115, + "grad_norm": 3.0958058834075928, + "kl": 0.0733642578125, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.7042708347241083, + "reward_std": 0.2561557930894196, + "rewards/accuracy_reward": 0.7042708347241083, + "step": 573 + }, + { + "ave_tool_num": 0.125, + "completion_length": 68.73958333333333, + "epoch": 0.4750017240190332, + "grad_norm": 2.280121088027954, + "kl": 0.06379191080729167, + "learning_rate": 1e-06, + "loss": 0.0026, + "reward": 0.6471875011920929, + "reward_std": 0.2399348213026921, + "rewards/accuracy_reward": 0.6471875011920929, + "step": 574 + }, + { + "ave_tool_num": 0.125, + "completion_length": 77.5625, + "epoch": 0.47582925315495483, + "grad_norm": 2.7139246463775635, + "kl": 0.06941731770833333, + "learning_rate": 1e-06, + "loss": 0.0028, + "reward": 0.674062505364418, + "reward_std": 0.34673816959063214, + "rewards/accuracy_reward": 0.674062505364418, + "step": 575 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 68.54166666666667, + "epoch": 0.4766567822908765, + "grad_norm": 3.628288507461548, + "kl": 0.1802978515625, + "learning_rate": 1e-06, + "loss": 0.0072, + "reward": 0.6553125071028868, + "reward_std": 0.2737979417045911, + "rewards/accuracy_reward": 0.6553125071028868, + "step": 576 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 74.35416666666667, + "epoch": 0.47748431142679815, + "grad_norm": 2.9806854724884033, + "kl": 0.08087158203125, + "learning_rate": 1e-06, + "loss": 0.0032, + "reward": 0.6517708351214727, + "reward_std": 0.4173324480652809, + "rewards/accuracy_reward": 0.6517708351214727, + "step": 577 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 79.89583333333333, + "epoch": 0.4783118405627198, + "grad_norm": 2.353421449661255, + "kl": 0.06398518880208333, + "learning_rate": 1e-06, + "loss": 0.0026, + "reward": 0.47156249980131787, + "reward_std": 0.35989080369472504, + "rewards/accuracy_reward": 0.47156249980131787, + "step": 578 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 75.32291666666667, + "epoch": 0.47913936969864146, + "grad_norm": 2.2766690254211426, + "kl": 0.06629435221354167, + "learning_rate": 1e-06, + "loss": 0.0026, + "reward": 0.6161458324640989, + "reward_std": 0.3195975013077259, + "rewards/accuracy_reward": 0.6161458324640989, + "step": 579 + }, + { + "ave_tool_num": 0.125, + "completion_length": 72.64583333333333, + "epoch": 0.4799668988345631, + "grad_norm": 2.5053882598876953, + "kl": 0.08115641276041667, + "learning_rate": 1e-06, + "loss": 0.0033, + "reward": 0.6671875019868215, + "reward_std": 0.2682109648982684, + "rewards/accuracy_reward": 0.6671875019868215, + "step": 580 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 84.05208333333333, + "epoch": 0.4807944279704848, + "grad_norm": 2.2217953205108643, + "kl": 0.06195068359375, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.5558333247900009, + "reward_std": 0.3164493354658286, + "rewards/accuracy_reward": 0.5558333247900009, + "step": 581 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 79.77083333333333, + "epoch": 0.48162195710640643, + "grad_norm": 2.263160467147827, + "kl": 0.0745849609375, + "learning_rate": 1e-06, + "loss": 0.003, + "reward": 0.4933333372076352, + "reward_std": 0.293539447709918, + "rewards/accuracy_reward": 0.4933333372076352, + "step": 582 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 93.39583333333333, + "epoch": 0.48244948624232814, + "grad_norm": 2.4654808044433594, + "kl": 0.06450398763020833, + "learning_rate": 1e-06, + "loss": 0.0026, + "reward": 0.5992708255847295, + "reward_std": 0.3815460739036401, + "rewards/accuracy_reward": 0.5992708255847295, + "step": 583 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 90.23958333333333, + "epoch": 0.4832770153782498, + "grad_norm": 1.903236746788025, + "kl": 0.053466796875, + "learning_rate": 1e-06, + "loss": 0.0021, + "reward": 0.6982291638851166, + "reward_std": 0.3260043685634931, + "rewards/accuracy_reward": 0.6982291638851166, + "step": 584 + }, + { + "ave_tool_num": 0.125, + "completion_length": 72.45833333333333, + "epoch": 0.48410454451417145, + "grad_norm": 3.6287171840667725, + "kl": 0.06938680013020833, + "learning_rate": 1e-06, + "loss": 0.0028, + "reward": 0.6228125070532163, + "reward_std": 0.2937763420244058, + "rewards/accuracy_reward": 0.6228125070532163, + "step": 585 + }, + { + "ave_tool_num": 0.16666666666666666, + "completion_length": 103.03125, + "epoch": 0.4849320736500931, + "grad_norm": 2.684760808944702, + "kl": 0.0762939453125, + "learning_rate": 1e-06, + "loss": 0.0031, + "reward": 0.4622916678587596, + "reward_std": 0.3239370783170064, + "rewards/accuracy_reward": 0.4622916678587596, + "step": 586 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 74.10416666666667, + "epoch": 0.48575960278601477, + "grad_norm": 16.33587646484375, + "kl": 0.16324869791666666, + "learning_rate": 1e-06, + "loss": 0.0065, + "reward": 0.6232291609048843, + "reward_std": 0.40722757826248807, + "rewards/accuracy_reward": 0.6232291609048843, + "step": 587 + }, + { + "ave_tool_num": 0.16666666666666666, + "completion_length": 83.36458333333333, + "epoch": 0.4865871319219364, + "grad_norm": 4.333184242248535, + "kl": 0.07413736979166667, + "learning_rate": 1e-06, + "loss": 0.003, + "reward": 0.5892708326379458, + "reward_std": 0.2966805125276248, + "rewards/accuracy_reward": 0.5892708326379458, + "step": 588 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 59.354166666666664, + "epoch": 0.4874146610578581, + "grad_norm": 2.384908676147461, + "kl": 0.08083089192708333, + "learning_rate": 1e-06, + "loss": 0.0032, + "reward": 0.6576041678587595, + "reward_std": 0.3459964270393054, + "rewards/accuracy_reward": 0.6576041678587595, + "step": 589 + }, + { + "ave_tool_num": 0.1875, + "completion_length": 67.07291666666667, + "epoch": 0.48824219019377973, + "grad_norm": 2.9705660343170166, + "kl": 0.09808349609375, + "learning_rate": 1e-06, + "loss": 0.0039, + "reward": 0.6146874949336052, + "reward_std": 0.2763717845082283, + "rewards/accuracy_reward": 0.6146874949336052, + "step": 590 + }, + { + "ave_tool_num": 0.1875, + "completion_length": 69.33333333333333, + "epoch": 0.4890697193297014, + "grad_norm": 3.002455711364746, + "kl": 0.09084065755208333, + "learning_rate": 1e-06, + "loss": 0.0036, + "reward": 0.5316666634753346, + "reward_std": 0.2919038248558839, + "rewards/accuracy_reward": 0.5316666634753346, + "step": 591 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 72.57291666666667, + "epoch": 0.48989724846562305, + "grad_norm": 2.0532848834991455, + "kl": 0.08001708984375, + "learning_rate": 1e-06, + "loss": 0.0032, + "reward": 0.6184375037749609, + "reward_std": 0.3272281587123871, + "rewards/accuracy_reward": 0.6184375037749609, + "step": 592 + }, + { + "ave_tool_num": 0.16666666666666666, + "completion_length": 73.92708333333333, + "epoch": 0.4907247776015447, + "grad_norm": 2.031703472137451, + "kl": 0.06361897786458333, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.6696875020861626, + "reward_std": 0.2547011325756709, + "rewards/accuracy_reward": 0.6696875020861626, + "step": 593 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 81.0, + "epoch": 0.49155230673746636, + "grad_norm": 2.309112548828125, + "kl": 0.07450358072916667, + "learning_rate": 1e-06, + "loss": 0.003, + "reward": 0.5971875041723251, + "reward_std": 0.3916532521446546, + "rewards/accuracy_reward": 0.5971875041723251, + "step": 594 + }, + { + "ave_tool_num": 0.19791666666666666, + "completion_length": 69.64583333333333, + "epoch": 0.492379835873388, + "grad_norm": 2.391300916671753, + "kl": 0.06663004557291667, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.5627083331346512, + "reward_std": 0.3693668854733308, + "rewards/accuracy_reward": 0.5627083331346512, + "step": 595 + }, + { + "ave_tool_num": 0.1875, + "completion_length": 69.22916666666667, + "epoch": 0.49320736500930973, + "grad_norm": 2.450965642929077, + "kl": 0.06870524088541667, + "learning_rate": 1e-06, + "loss": 0.0028, + "reward": 0.5704166640837988, + "reward_std": 0.3495526909828186, + "rewards/accuracy_reward": 0.5704166640837988, + "step": 596 + }, + { + "ave_tool_num": 0.1875, + "completion_length": 72.35416666666667, + "epoch": 0.4940348941452314, + "grad_norm": 2.2386152744293213, + "kl": 0.07645670572916667, + "learning_rate": 1e-06, + "loss": 0.0031, + "reward": 0.6613541667660078, + "reward_std": 0.29742420588930446, + "rewards/accuracy_reward": 0.6613541667660078, + "step": 597 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 86.13541666666667, + "epoch": 0.49486242328115304, + "grad_norm": 2.763503074645996, + "kl": 0.07145182291666667, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.6004166702429453, + "reward_std": 0.32757997636993724, + "rewards/accuracy_reward": 0.6004166702429453, + "step": 598 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 73.97916666666667, + "epoch": 0.4956899524170747, + "grad_norm": 2.6623826026916504, + "kl": 0.07855224609375, + "learning_rate": 1e-06, + "loss": 0.0031, + "reward": 0.5298958346247673, + "reward_std": 0.3401822332913677, + "rewards/accuracy_reward": 0.5298958346247673, + "step": 599 + }, + { + "ave_tool_num": 0.125, + "completion_length": 96.86458333333333, + "epoch": 0.49651748155299635, + "grad_norm": 3.196239709854126, + "kl": 0.06695556640625, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.47645833032826584, + "reward_std": 0.33053406824668247, + "rewards/accuracy_reward": 0.47645833032826584, + "step": 600 + }, + { + "ave_tool_num": 0.125, + "completion_length": 59.65625, + "epoch": 0.497345010688918, + "grad_norm": 40.007713317871094, + "kl": 0.5743611653645834, + "learning_rate": 1e-06, + "loss": 0.0229, + "reward": 0.7223958323399226, + "reward_std": 0.297960601747036, + "rewards/accuracy_reward": 0.7223958323399226, + "step": 601 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 78.64583333333333, + "epoch": 0.49817253982483967, + "grad_norm": 2.3864357471466064, + "kl": 0.0662841796875, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.5000000031044086, + "reward_std": 0.32323666165272397, + "rewards/accuracy_reward": 0.5000000031044086, + "step": 602 + }, + { + "ave_tool_num": 0.125, + "completion_length": 78.77083333333333, + "epoch": 0.4990000689607613, + "grad_norm": 2.8447437286376953, + "kl": 0.06852213541666667, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.6334374969204267, + "reward_std": 0.3631387948989868, + "rewards/accuracy_reward": 0.6334374969204267, + "step": 603 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 76.05208333333333, + "epoch": 0.499827598096683, + "grad_norm": 2.1374473571777344, + "kl": 0.0745849609375, + "learning_rate": 1e-06, + "loss": 0.003, + "reward": 0.6723958303531011, + "reward_std": 0.34496060634652775, + "rewards/accuracy_reward": 0.6723958303531011, + "step": 604 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 84.33333333333333, + "epoch": 0.5006551272326046, + "grad_norm": 2.482074499130249, + "kl": 0.07253011067708333, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.5107291614015897, + "reward_std": 0.3619339354336262, + "rewards/accuracy_reward": 0.5107291614015897, + "step": 605 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 74.33333333333333, + "epoch": 0.5014826563685263, + "grad_norm": 2.533046245574951, + "kl": 0.08249918619791667, + "learning_rate": 1e-06, + "loss": 0.0033, + "reward": 0.6520833348234495, + "reward_std": 0.3187204884986083, + "rewards/accuracy_reward": 0.6520833348234495, + "step": 606 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 74.30208333333333, + "epoch": 0.502310185504448, + "grad_norm": 1.9126131534576416, + "kl": 0.06946818033854167, + "learning_rate": 1e-06, + "loss": 0.0028, + "reward": 0.612916665772597, + "reward_std": 0.3208845580617587, + "rewards/accuracy_reward": 0.612916665772597, + "step": 607 + }, + { + "ave_tool_num": 0.020833333333333332, + "completion_length": 81.66666666666667, + "epoch": 0.5031377146403696, + "grad_norm": 2.936410665512085, + "kl": 0.07270304361979167, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.6088541696468989, + "reward_std": 0.3386625833809376, + "rewards/accuracy_reward": 0.6088541696468989, + "step": 608 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 87.64583333333333, + "epoch": 0.5039652437762913, + "grad_norm": 3.339529037475586, + "kl": 0.1151123046875, + "learning_rate": 1e-06, + "loss": 0.0046, + "reward": 0.5141666699200869, + "reward_std": 0.39185433089733124, + "rewards/accuracy_reward": 0.5141666699200869, + "step": 609 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 75.94791666666667, + "epoch": 0.5047927729122129, + "grad_norm": 2.2883472442626953, + "kl": 0.09501139322916667, + "learning_rate": 1e-06, + "loss": 0.0038, + "reward": 0.4413541679581006, + "reward_std": 0.4429648170868556, + "rewards/accuracy_reward": 0.4413541679581006, + "step": 610 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 66.5625, + "epoch": 0.5056203020481346, + "grad_norm": 3.4013302326202393, + "kl": 0.09423828125, + "learning_rate": 1e-06, + "loss": 0.0038, + "reward": 0.6945833340287209, + "reward_std": 0.3113906302799781, + "rewards/accuracy_reward": 0.6945833340287209, + "step": 611 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 73.0625, + "epoch": 0.5064478311840562, + "grad_norm": 2.2812039852142334, + "kl": 0.08489990234375, + "learning_rate": 1e-06, + "loss": 0.0034, + "reward": 0.6136458376422524, + "reward_std": 0.32398570391039055, + "rewards/accuracy_reward": 0.6136458376422524, + "step": 612 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 82.25, + "epoch": 0.5072753603199779, + "grad_norm": 2.58058762550354, + "kl": 0.08876546223958333, + "learning_rate": 1e-06, + "loss": 0.0035, + "reward": 0.5606250042716662, + "reward_std": 0.3654996765156587, + "rewards/accuracy_reward": 0.5606250042716662, + "step": 613 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 74.01041666666667, + "epoch": 0.5081028894558995, + "grad_norm": 2.371154546737671, + "kl": 0.12404378255208333, + "learning_rate": 1e-06, + "loss": 0.005, + "reward": 0.614791676402092, + "reward_std": 0.29678966601689655, + "rewards/accuracy_reward": 0.614791676402092, + "step": 614 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 70.84375, + "epoch": 0.5089304185918212, + "grad_norm": 2.3881278038024902, + "kl": 0.07334391276041667, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.5736458351214727, + "reward_std": 0.21461191152532896, + "rewards/accuracy_reward": 0.5736458351214727, + "step": 615 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 76.21875, + "epoch": 0.509757947727743, + "grad_norm": 1.7989890575408936, + "kl": 0.071441650390625, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.7251041680574417, + "reward_std": 0.22688862619300684, + "rewards/accuracy_reward": 0.7251041680574417, + "step": 616 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 76.70833333333333, + "epoch": 0.5105854768636646, + "grad_norm": 2.8867318630218506, + "kl": 0.08899943033854167, + "learning_rate": 1e-06, + "loss": 0.0036, + "reward": 0.44718749945362407, + "reward_std": 0.2501852884888649, + "rewards/accuracy_reward": 0.44718749945362407, + "step": 617 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 67.14583333333333, + "epoch": 0.5114130059995863, + "grad_norm": 8.214902877807617, + "kl": 0.08231608072916667, + "learning_rate": 1e-06, + "loss": 0.0033, + "reward": 0.6971875031789144, + "reward_std": 0.2748718510071437, + "rewards/accuracy_reward": 0.6971875031789144, + "step": 618 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 75.04166666666667, + "epoch": 0.5122405351355079, + "grad_norm": 3.043588638305664, + "kl": 0.06734212239583333, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.5673958299060663, + "reward_std": 0.30968912566701573, + "rewards/accuracy_reward": 0.5673958299060663, + "step": 619 + }, + { + "ave_tool_num": 0.125, + "completion_length": 102.60416666666667, + "epoch": 0.5130680642714296, + "grad_norm": 2.456730365753174, + "kl": 0.07908121744791667, + "learning_rate": 1e-06, + "loss": 0.0032, + "reward": 0.5151041659216086, + "reward_std": 0.3318365228672822, + "rewards/accuracy_reward": 0.5151041659216086, + "step": 620 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 76.9375, + "epoch": 0.5138955934073512, + "grad_norm": 1.7609509229660034, + "kl": 0.058797200520833336, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.7563541680574417, + "reward_std": 0.29066048562526703, + "rewards/accuracy_reward": 0.7563541680574417, + "step": 621 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 83.6875, + "epoch": 0.5147231225432729, + "grad_norm": 2.3008711338043213, + "kl": 0.07979329427083333, + "learning_rate": 1e-06, + "loss": 0.0032, + "reward": 0.5047916645805041, + "reward_std": 0.3370451939602693, + "rewards/accuracy_reward": 0.5047916645805041, + "step": 622 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 78.47916666666667, + "epoch": 0.5155506516791946, + "grad_norm": 2.620901107788086, + "kl": 0.06632486979166667, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.6009375005960464, + "reward_std": 0.3395130994419257, + "rewards/accuracy_reward": 0.6009375005960464, + "step": 623 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 71.21875, + "epoch": 0.5163781808151162, + "grad_norm": 2.8326680660247803, + "kl": 0.07912699381510417, + "learning_rate": 1e-06, + "loss": 0.0032, + "reward": 0.567500002682209, + "reward_std": 0.342291496694088, + "rewards/accuracy_reward": 0.567500002682209, + "step": 624 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 74.53125, + "epoch": 0.5172057099510379, + "grad_norm": 2.6321613788604736, + "kl": 0.07143147786458333, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.5169791777928671, + "reward_std": 0.3350421264767647, + "rewards/accuracy_reward": 0.5169791777928671, + "step": 625 + }, + { + "ave_tool_num": 0.125, + "completion_length": 70.35416666666667, + "epoch": 0.5180332390869595, + "grad_norm": 2.650554895401001, + "kl": 0.08530680338541667, + "learning_rate": 1e-06, + "loss": 0.0034, + "reward": 0.6357291676104069, + "reward_std": 0.348691167930762, + "rewards/accuracy_reward": 0.6357291676104069, + "step": 626 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 64.82291666666667, + "epoch": 0.5188607682228812, + "grad_norm": 2.744054079055786, + "kl": 0.09499104817708333, + "learning_rate": 1e-06, + "loss": 0.0038, + "reward": 0.3957291630407174, + "reward_std": 0.295221875111262, + "rewards/accuracy_reward": 0.3957291630407174, + "step": 627 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 83.67708333333333, + "epoch": 0.5196882973588028, + "grad_norm": 2.0574443340301514, + "kl": 0.06911214192708333, + "learning_rate": 1e-06, + "loss": 0.0028, + "reward": 0.6594791722794374, + "reward_std": 0.3129894162217776, + "rewards/accuracy_reward": 0.6594791722794374, + "step": 628 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 71.97916666666667, + "epoch": 0.5205158264947245, + "grad_norm": 2.556722402572632, + "kl": 0.066925048828125, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.5892708351214727, + "reward_std": 0.300207690646251, + "rewards/accuracy_reward": 0.5892708351214727, + "step": 629 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 78.95833333333333, + "epoch": 0.5213433556306462, + "grad_norm": 2.4000015258789062, + "kl": 0.081146240234375, + "learning_rate": 1e-06, + "loss": 0.0032, + "reward": 0.6342708319425583, + "reward_std": 0.2909121575454871, + "rewards/accuracy_reward": 0.6342708319425583, + "step": 630 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 75.8125, + "epoch": 0.5221708847665678, + "grad_norm": 2.994248867034912, + "kl": 0.07332356770833333, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.7002083336313566, + "reward_std": 0.24951399117708206, + "rewards/accuracy_reward": 0.7002083336313566, + "step": 631 + }, + { + "ave_tool_num": 0.041666666666666664, + "completion_length": 82.3125, + "epoch": 0.5229984139024895, + "grad_norm": 2.8175840377807617, + "kl": 0.06483968098958333, + "learning_rate": 1e-06, + "loss": 0.0026, + "reward": 0.6097916687528292, + "reward_std": 0.32033295929431915, + "rewards/accuracy_reward": 0.6097916687528292, + "step": 632 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 83.28125, + "epoch": 0.5238259430384111, + "grad_norm": 3.1699378490448, + "kl": 0.060943603515625, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.5892708351214727, + "reward_std": 0.3692458023627599, + "rewards/accuracy_reward": 0.5892708351214727, + "step": 633 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 83.82291666666667, + "epoch": 0.5246534721743328, + "grad_norm": 2.3907463550567627, + "kl": 0.06709798177083333, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.6808333446582159, + "reward_std": 0.2898837185154359, + "rewards/accuracy_reward": 0.6808333446582159, + "step": 634 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 77.01041666666667, + "epoch": 0.5254810013102544, + "grad_norm": 2.3644447326660156, + "kl": 0.0926513671875, + "learning_rate": 1e-06, + "loss": 0.0037, + "reward": 0.6914583345254263, + "reward_std": 0.30552857865889865, + "rewards/accuracy_reward": 0.6914583345254263, + "step": 635 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 79.15625, + "epoch": 0.5263085304461761, + "grad_norm": 2.862058401107788, + "kl": 0.06257120768229167, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.5603124996026357, + "reward_std": 0.32204293956359226, + "rewards/accuracy_reward": 0.5603124996026357, + "step": 636 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 71.65625, + "epoch": 0.5271360595820977, + "grad_norm": 2.279616355895996, + "kl": 0.07303873697916667, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.5606250005463759, + "reward_std": 0.34751687571406364, + "rewards/accuracy_reward": 0.5606250005463759, + "step": 637 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 89.125, + "epoch": 0.5279635887180194, + "grad_norm": 3.252340078353882, + "kl": 0.058451334635416664, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.5677083358168602, + "reward_std": 0.3212751584748427, + "rewards/accuracy_reward": 0.5677083358168602, + "step": 638 + }, + { + "ave_tool_num": 0.125, + "completion_length": 78.34375, + "epoch": 0.5287911178539411, + "grad_norm": 8.419788360595703, + "kl": 0.22237141927083334, + "learning_rate": 1e-06, + "loss": 0.0089, + "reward": 0.4869791691501935, + "reward_std": 0.3665263392031193, + "rewards/accuracy_reward": 0.4869791691501935, + "step": 639 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 78.55208333333333, + "epoch": 0.5296186469898627, + "grad_norm": 2.3190619945526123, + "kl": 0.0972900390625, + "learning_rate": 1e-06, + "loss": 0.0039, + "reward": 0.5926041652758917, + "reward_std": 0.3472303220381339, + "rewards/accuracy_reward": 0.5926041652758917, + "step": 640 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 75.72916666666667, + "epoch": 0.5304461761257844, + "grad_norm": 2.7022063732147217, + "kl": 0.09755452473958333, + "learning_rate": 1e-06, + "loss": 0.0039, + "reward": 0.6454166732728481, + "reward_std": 0.3569144730766614, + "rewards/accuracy_reward": 0.6454166732728481, + "step": 641 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 76.72916666666667, + "epoch": 0.5312737052617061, + "grad_norm": 3.114621639251709, + "kl": 0.0834808349609375, + "learning_rate": 1e-06, + "loss": 0.0033, + "reward": 0.6743750075499216, + "reward_std": 0.2849394505222638, + "rewards/accuracy_reward": 0.6743750075499216, + "step": 642 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 69.66666666666667, + "epoch": 0.5321012343976278, + "grad_norm": 2.4324159622192383, + "kl": 0.09116617838541667, + "learning_rate": 1e-06, + "loss": 0.0036, + "reward": 0.6140625054637591, + "reward_std": 0.31500863408048946, + "rewards/accuracy_reward": 0.6140625054637591, + "step": 643 + }, + { + "ave_tool_num": 0.0625, + "completion_length": 73.82291666666667, + "epoch": 0.5329287635335495, + "grad_norm": 2.705008029937744, + "kl": 0.06812540690104167, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.5815625041723251, + "reward_std": 0.3310500606894493, + "rewards/accuracy_reward": 0.5815625041723251, + "step": 644 + }, + { + "ave_tool_num": 0.03125, + "completion_length": 71.16666666666667, + "epoch": 0.5337562926694711, + "grad_norm": 3.5381240844726562, + "kl": 0.10262044270833333, + "learning_rate": 1e-06, + "loss": 0.0041, + "reward": 0.3985416628420353, + "reward_std": 0.34929608926177025, + "rewards/accuracy_reward": 0.3985416628420353, + "step": 645 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 79.13541666666667, + "epoch": 0.5345838218053928, + "grad_norm": 2.82149600982666, + "kl": 0.11311848958333333, + "learning_rate": 1e-06, + "loss": 0.0045, + "reward": 0.44625000407298404, + "reward_std": 0.3992507755756378, + "rewards/accuracy_reward": 0.44625000407298404, + "step": 646 + }, + { + "ave_tool_num": 0.08333333333333333, + "completion_length": 74.89583333333333, + "epoch": 0.5354113509413144, + "grad_norm": 2.008429527282715, + "kl": 0.09760538736979167, + "learning_rate": 1e-06, + "loss": 0.0039, + "reward": 0.6498958344260851, + "reward_std": 0.33340540652473766, + "rewards/accuracy_reward": 0.6498958344260851, + "step": 647 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 84.73958333333333, + "epoch": 0.5362388800772361, + "grad_norm": 2.899233102798462, + "kl": 0.058186848958333336, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.5470833331346512, + "reward_std": 0.39085408486425877, + "rewards/accuracy_reward": 0.5470833331346512, + "step": 648 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 87.35416666666667, + "epoch": 0.5370664092131577, + "grad_norm": 4.416645526885986, + "kl": 0.10508219401041667, + "learning_rate": 1e-06, + "loss": 0.0042, + "reward": 0.5976041654745737, + "reward_std": 0.31877059986193973, + "rewards/accuracy_reward": 0.5976041654745737, + "step": 649 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 66.61458333333333, + "epoch": 0.5378939383490794, + "grad_norm": 3.007969379425049, + "kl": 0.07744344075520833, + "learning_rate": 1e-06, + "loss": 0.0031, + "reward": 0.635833332935969, + "reward_std": 0.2936238826562961, + "rewards/accuracy_reward": 0.635833332935969, + "step": 650 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 74.54166666666667, + "epoch": 0.538721467485001, + "grad_norm": 2.9346702098846436, + "kl": 0.08951822916666667, + "learning_rate": 1e-06, + "loss": 0.0036, + "reward": 0.5431249936421713, + "reward_std": 0.3391691669821739, + "rewards/accuracy_reward": 0.5431249936421713, + "step": 651 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 79.27083333333333, + "epoch": 0.5395489966209227, + "grad_norm": 3.56426739692688, + "kl": 0.0948486328125, + "learning_rate": 1e-06, + "loss": 0.0038, + "reward": 0.6902083357175192, + "reward_std": 0.34439563006162643, + "rewards/accuracy_reward": 0.6902083357175192, + "step": 652 + }, + { + "ave_tool_num": 0.052083333333333336, + "completion_length": 88.625, + "epoch": 0.5403765257568444, + "grad_norm": 3.4688327312469482, + "kl": 0.082366943359375, + "learning_rate": 1e-06, + "loss": 0.0033, + "reward": 0.5840625017881393, + "reward_std": 0.33906380583842594, + "rewards/accuracy_reward": 0.5840625017881393, + "step": 653 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 72.79166666666667, + "epoch": 0.541204054892766, + "grad_norm": 7.402387619018555, + "kl": 0.18939208984375, + "learning_rate": 1e-06, + "loss": 0.0076, + "reward": 0.6512499948342642, + "reward_std": 0.2657398035128911, + "rewards/accuracy_reward": 0.6512499948342642, + "step": 654 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 74.26041666666667, + "epoch": 0.5420315840286877, + "grad_norm": 3.6100809574127197, + "kl": 0.073486328125, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.5723958412806193, + "reward_std": 0.36208177730441093, + "rewards/accuracy_reward": 0.5723958412806193, + "step": 655 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 63.78125, + "epoch": 0.5428591131646093, + "grad_norm": 4.97236967086792, + "kl": 0.12862141927083334, + "learning_rate": 1e-06, + "loss": 0.0051, + "reward": 0.5204166720310847, + "reward_std": 0.35665113540987176, + "rewards/accuracy_reward": 0.5204166720310847, + "step": 656 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 77.8125, + "epoch": 0.543686642300531, + "grad_norm": 2.410679578781128, + "kl": 0.0833740234375, + "learning_rate": 1e-06, + "loss": 0.0033, + "reward": 0.6110416650772095, + "reward_std": 0.3251986261457205, + "rewards/accuracy_reward": 0.6110416650772095, + "step": 657 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 92.23958333333333, + "epoch": 0.5445141714364526, + "grad_norm": 2.4158823490142822, + "kl": 0.06886800130208333, + "learning_rate": 1e-06, + "loss": 0.0028, + "reward": 0.43447916954755783, + "reward_std": 0.35421760752797127, + "rewards/accuracy_reward": 0.43447916954755783, + "step": 658 + }, + { + "ave_tool_num": 0.125, + "completion_length": 82.72916666666667, + "epoch": 0.5453417005723743, + "grad_norm": 3.0567808151245117, + "kl": 0.09810384114583333, + "learning_rate": 1e-06, + "loss": 0.0039, + "reward": 0.6188541681816181, + "reward_std": 0.31367556502421695, + "rewards/accuracy_reward": 0.6188541681816181, + "step": 659 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 72.61458333333333, + "epoch": 0.546169229708296, + "grad_norm": 2.5052108764648438, + "kl": 0.09311930338541667, + "learning_rate": 1e-06, + "loss": 0.0037, + "reward": 0.6094791640837988, + "reward_std": 0.31602198009689647, + "rewards/accuracy_reward": 0.6094791640837988, + "step": 660 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 82.90625, + "epoch": 0.5469967588442176, + "grad_norm": 5.859192371368408, + "kl": 0.19136555989583334, + "learning_rate": 1e-06, + "loss": 0.0077, + "reward": 0.6331249997019768, + "reward_std": 0.31223108246922493, + "rewards/accuracy_reward": 0.6331249997019768, + "step": 661 + }, + { + "ave_tool_num": 0.09375, + "completion_length": 81.59375, + "epoch": 0.5478242879801393, + "grad_norm": 2.179460048675537, + "kl": 0.14284261067708334, + "learning_rate": 1e-06, + "loss": 0.0057, + "reward": 0.6228125008443991, + "reward_std": 0.27214765797058743, + "rewards/accuracy_reward": 0.6228125008443991, + "step": 662 + }, + { + "ave_tool_num": 0.11458333333333333, + "completion_length": 65.41666666666667, + "epoch": 0.5486518171160609, + "grad_norm": 2.3499932289123535, + "kl": 0.0911865234375, + "learning_rate": 1e-06, + "loss": 0.0036, + "reward": 0.6195833335320154, + "reward_std": 0.3504956215620041, + "rewards/accuracy_reward": 0.6195833335320154, + "step": 663 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 75.86458333333333, + "epoch": 0.5494793462519826, + "grad_norm": 2.667130470275879, + "kl": 0.11176554361979167, + "learning_rate": 1e-06, + "loss": 0.0045, + "reward": 0.5555208350221316, + "reward_std": 0.36525258297721547, + "rewards/accuracy_reward": 0.5555208350221316, + "step": 664 + }, + { + "ave_tool_num": 0.13541666666666666, + "completion_length": 86.52083333333333, + "epoch": 0.5503068753879042, + "grad_norm": 2.445265054702759, + "kl": 0.08732096354166667, + "learning_rate": 1e-06, + "loss": 0.0035, + "reward": 0.5732291663686434, + "reward_std": 0.30809296543399495, + "rewards/accuracy_reward": 0.5732291663686434, + "step": 665 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 83.60416666666667, + "epoch": 0.5511344045238259, + "grad_norm": 3.6088433265686035, + "kl": 0.08013916015625, + "learning_rate": 1e-06, + "loss": 0.0032, + "reward": 0.40864583229025203, + "reward_std": 0.28707069158554077, + "rewards/accuracy_reward": 0.40864583229025203, + "step": 666 + }, + { + "ave_tool_num": 0.23958333333333334, + "completion_length": 62.375, + "epoch": 0.5519619336597476, + "grad_norm": 2.3718793392181396, + "kl": 0.09916178385416667, + "learning_rate": 1e-06, + "loss": 0.004, + "reward": 0.7322916686534882, + "reward_std": 0.24349368208398423, + "rewards/accuracy_reward": 0.7322916686534882, + "step": 667 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 88.59375, + "epoch": 0.5527894627956693, + "grad_norm": 2.788560390472412, + "kl": 0.11623128255208333, + "learning_rate": 1e-06, + "loss": 0.0046, + "reward": 0.5014583369096121, + "reward_std": 0.2889605996509393, + "rewards/accuracy_reward": 0.5014583369096121, + "step": 668 + }, + { + "ave_tool_num": 0.125, + "completion_length": 83.38541666666667, + "epoch": 0.553616991931591, + "grad_norm": 1.8903924226760864, + "kl": 0.06499226888020833, + "learning_rate": 1e-06, + "loss": 0.0026, + "reward": 0.5789583300550779, + "reward_std": 0.29482940553377074, + "rewards/accuracy_reward": 0.5789583300550779, + "step": 669 + }, + { + "ave_tool_num": 0.10416666666666667, + "completion_length": 87.69791666666667, + "epoch": 0.5544445210675126, + "grad_norm": 2.871703863143921, + "kl": 0.07249959309895833, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.5370833352208138, + "reward_std": 0.3353977104028066, + "rewards/accuracy_reward": 0.5370833352208138, + "step": 670 + }, + { + "ave_tool_num": 0.19791666666666666, + "completion_length": 79.98958333333333, + "epoch": 0.5552720502034343, + "grad_norm": 2.5376415252685547, + "kl": 0.09142049153645833, + "learning_rate": 1e-06, + "loss": 0.0037, + "reward": 0.5506250013907751, + "reward_std": 0.28855451196432114, + "rewards/accuracy_reward": 0.5506250013907751, + "step": 671 + }, + { + "ave_tool_num": 0.1875, + "completion_length": 75.17708333333333, + "epoch": 0.5560995793393559, + "grad_norm": 2.438952922821045, + "kl": 0.06734212239583333, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.509166669100523, + "reward_std": 0.3384817975262801, + "rewards/accuracy_reward": 0.509166669100523, + "step": 672 + }, + { + "ave_tool_num": 0.25, + "completion_length": 73.85416666666667, + "epoch": 0.5569271084752776, + "grad_norm": 2.403315782546997, + "kl": 0.07916259765625, + "learning_rate": 1e-06, + "loss": 0.0032, + "reward": 0.6928125023841858, + "reward_std": 0.34873295327027637, + "rewards/accuracy_reward": 0.6928125023841858, + "step": 673 + }, + { + "ave_tool_num": 0.21875, + "completion_length": 82.16666666666667, + "epoch": 0.5577546376111993, + "grad_norm": 2.236635208129883, + "kl": 0.0738525390625, + "learning_rate": 1e-06, + "loss": 0.003, + "reward": 0.7220833351214727, + "reward_std": 0.3280428697665532, + "rewards/accuracy_reward": 0.7220833351214727, + "step": 674 + }, + { + "ave_tool_num": 0.22916666666666666, + "completion_length": 81.07291666666667, + "epoch": 0.5585821667471209, + "grad_norm": 2.274181365966797, + "kl": 0.08571370442708333, + "learning_rate": 1e-06, + "loss": 0.0034, + "reward": 0.6530208376546701, + "reward_std": 0.23265751202901205, + "rewards/accuracy_reward": 0.6530208376546701, + "step": 675 + }, + { + "ave_tool_num": 0.2604166666666667, + "completion_length": 92.47916666666667, + "epoch": 0.5594096958830426, + "grad_norm": 1.8823641538619995, + "kl": 0.09393310546875, + "learning_rate": 1e-06, + "loss": 0.0038, + "reward": 0.6586458335320154, + "reward_std": 0.29354630162318546, + "rewards/accuracy_reward": 0.6586458335320154, + "step": 676 + }, + { + "ave_tool_num": 0.1875, + "completion_length": 88.05208333333333, + "epoch": 0.5602372250189642, + "grad_norm": 2.0049359798431396, + "kl": 0.06312052408854167, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.49697916706403095, + "reward_std": 0.25419448254009086, + "rewards/accuracy_reward": 0.49697916706403095, + "step": 677 + }, + { + "ave_tool_num": 0.23958333333333334, + "completion_length": 81.375, + "epoch": 0.5610647541548859, + "grad_norm": 3.682424545288086, + "kl": 0.08587646484375, + "learning_rate": 1e-06, + "loss": 0.0034, + "reward": 0.651458332935969, + "reward_std": 0.292859748005867, + "rewards/accuracy_reward": 0.651458332935969, + "step": 678 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 86.01041666666667, + "epoch": 0.5618922832908075, + "grad_norm": 18.924272537231445, + "kl": 0.087066650390625, + "learning_rate": 1e-06, + "loss": 0.0035, + "reward": 0.6046874995032946, + "reward_std": 0.38599306096633273, + "rewards/accuracy_reward": 0.6046874995032946, + "step": 679 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 82.65625, + "epoch": 0.5627198124267292, + "grad_norm": 2.2716808319091797, + "kl": 0.101104736328125, + "learning_rate": 1e-06, + "loss": 0.004, + "reward": 0.5280208339293798, + "reward_std": 0.35621793381869793, + "rewards/accuracy_reward": 0.5280208339293798, + "step": 680 + }, + { + "ave_tool_num": 0.22916666666666666, + "completion_length": 83.875, + "epoch": 0.5635473415626508, + "grad_norm": 2.5465664863586426, + "kl": 0.0948486328125, + "learning_rate": 1e-06, + "loss": 0.0038, + "reward": 0.5998958311975002, + "reward_std": 0.2616371810436249, + "rewards/accuracy_reward": 0.5998958311975002, + "step": 681 + }, + { + "ave_tool_num": 0.17708333333333334, + "completion_length": 73.1875, + "epoch": 0.5643748706985725, + "grad_norm": 2.9087963104248047, + "kl": 0.057688395182291664, + "learning_rate": 1e-06, + "loss": 0.0023, + "reward": 0.6638541718324026, + "reward_std": 0.29098161185781163, + "rewards/accuracy_reward": 0.6638541718324026, + "step": 682 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 87.02083333333333, + "epoch": 0.5652023998344942, + "grad_norm": 3.309297800064087, + "kl": 0.14200846354166666, + "learning_rate": 1e-06, + "loss": 0.0057, + "reward": 0.594479168454806, + "reward_std": 0.2603524612883727, + "rewards/accuracy_reward": 0.594479168454806, + "step": 683 + }, + { + "ave_tool_num": 0.07291666666666667, + "completion_length": 85.22916666666667, + "epoch": 0.5660299289704158, + "grad_norm": 2.575688362121582, + "kl": 0.059824625651041664, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.5803125003973643, + "reward_std": 0.3057475872337818, + "rewards/accuracy_reward": 0.5803125003973643, + "step": 684 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 83.59375, + "epoch": 0.5668574581063375, + "grad_norm": 3.584404230117798, + "kl": 0.09722900390625, + "learning_rate": 1e-06, + "loss": 0.0039, + "reward": 0.5985416695475578, + "reward_std": 0.32548602670431137, + "rewards/accuracy_reward": 0.5985416695475578, + "step": 685 + }, + { + "ave_tool_num": 0.16666666666666666, + "completion_length": 78.95833333333333, + "epoch": 0.5676849872422591, + "grad_norm": 6.369125843048096, + "kl": 0.06815592447916667, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.6531250029802322, + "reward_std": 0.3123789181311925, + "rewards/accuracy_reward": 0.6531250029802322, + "step": 686 + }, + { + "ave_tool_num": 0.2604166666666667, + "completion_length": 79.76041666666667, + "epoch": 0.5685125163781808, + "grad_norm": 2.2395241260528564, + "kl": 0.07798258463541667, + "learning_rate": 1e-06, + "loss": 0.0031, + "reward": 0.7136458357175192, + "reward_std": 0.33338900841772556, + "rewards/accuracy_reward": 0.7136458357175192, + "step": 687 + }, + { + "ave_tool_num": 0.16666666666666666, + "completion_length": 90.82291666666667, + "epoch": 0.5693400455141024, + "grad_norm": 2.2504518032073975, + "kl": 0.06418863932291667, + "learning_rate": 1e-06, + "loss": 0.0026, + "reward": 0.6262500012914339, + "reward_std": 0.36573715570072335, + "rewards/accuracy_reward": 0.6262500012914339, + "step": 688 + }, + { + "ave_tool_num": 0.20833333333333334, + "completion_length": 81.16666666666667, + "epoch": 0.5701675746500241, + "grad_norm": 2.8439090251922607, + "kl": 0.07639567057291667, + "learning_rate": 1e-06, + "loss": 0.0031, + "reward": 0.44437500089406967, + "reward_std": 0.34816691527764004, + "rewards/accuracy_reward": 0.44437500089406967, + "step": 689 + }, + { + "ave_tool_num": 0.22916666666666666, + "completion_length": 86.17708333333333, + "epoch": 0.5709951037859458, + "grad_norm": 14.50684928894043, + "kl": 0.15484619140625, + "learning_rate": 1e-06, + "loss": 0.0062, + "reward": 0.6604166689018408, + "reward_std": 0.23048480258633694, + "rewards/accuracy_reward": 0.6604166689018408, + "step": 690 + }, + { + "ave_tool_num": 0.28125, + "completion_length": 77.14583333333333, + "epoch": 0.5718226329218674, + "grad_norm": 2.7099475860595703, + "kl": 0.07822672526041667, + "learning_rate": 1e-06, + "loss": 0.0031, + "reward": 0.6500000034769376, + "reward_std": 0.3350578298171361, + "rewards/accuracy_reward": 0.6500000034769376, + "step": 691 + }, + { + "ave_tool_num": 0.125, + "completion_length": 95.9375, + "epoch": 0.5726501620577891, + "grad_norm": 4.6230692863464355, + "kl": 0.1356201171875, + "learning_rate": 1e-06, + "loss": 0.0054, + "reward": 0.523229164381822, + "reward_std": 0.4099666588008404, + "rewards/accuracy_reward": 0.523229164381822, + "step": 692 + }, + { + "ave_tool_num": 0.125, + "completion_length": 76.25, + "epoch": 0.5734776911937107, + "grad_norm": 5.018466472625732, + "kl": 0.180206298828125, + "learning_rate": 1e-06, + "loss": 0.0072, + "reward": 0.6047916697959105, + "reward_std": 0.3419861781100432, + "rewards/accuracy_reward": 0.6047916697959105, + "step": 693 + }, + { + "ave_tool_num": 0.15625, + "completion_length": 85.05208333333333, + "epoch": 0.5743052203296325, + "grad_norm": 2.3767220973968506, + "kl": 0.06578572591145833, + "learning_rate": 1e-06, + "loss": 0.0026, + "reward": 0.6551041677594185, + "reward_std": 0.32017891729871434, + "rewards/accuracy_reward": 0.6551041677594185, + "step": 694 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 100.9375, + "epoch": 0.5751327494655541, + "grad_norm": 2.0284504890441895, + "kl": 0.061777750651041664, + "learning_rate": 1e-06, + "loss": 0.0025, + "reward": 0.6058333329856396, + "reward_std": 0.3859557844698429, + "rewards/accuracy_reward": 0.6058333329856396, + "step": 695 + }, + { + "ave_tool_num": 0.3020833333333333, + "completion_length": 80.92708333333333, + "epoch": 0.5759602786014758, + "grad_norm": 3.367058515548706, + "kl": 0.08984375, + "learning_rate": 1e-06, + "loss": 0.0036, + "reward": 0.5001041665673256, + "reward_std": 0.379233181476593, + "rewards/accuracy_reward": 0.5001041665673256, + "step": 696 + }, + { + "ave_tool_num": 0.28125, + "completion_length": 74.27083333333333, + "epoch": 0.5767878077373975, + "grad_norm": 2.107775926589966, + "kl": 0.06622314453125, + "learning_rate": 1e-06, + "loss": 0.0027, + "reward": 0.8052083303531011, + "reward_std": 0.2040052649875482, + "rewards/accuracy_reward": 0.8052083303531011, + "step": 697 + }, + { + "ave_tool_num": 0.2604166666666667, + "completion_length": 77.08333333333333, + "epoch": 0.5776153368733191, + "grad_norm": 2.3181984424591064, + "kl": 0.07297770182291667, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.5331249982118607, + "reward_std": 0.33617171707252663, + "rewards/accuracy_reward": 0.5331249982118607, + "step": 698 + }, + { + "ave_tool_num": 0.2604166666666667, + "completion_length": 92.375, + "epoch": 0.5784428660092408, + "grad_norm": 1.9895869493484497, + "kl": 0.059224446614583336, + "learning_rate": 1e-06, + "loss": 0.0024, + "reward": 0.5880208387970924, + "reward_std": 0.22961691891153654, + "rewards/accuracy_reward": 0.5880208387970924, + "step": 699 + }, + { + "ave_tool_num": 0.14583333333333334, + "completion_length": 90.77083333333333, + "epoch": 0.5792703951451624, + "grad_norm": 2.4381203651428223, + "kl": 0.07320149739583333, + "learning_rate": 1e-06, + "loss": 0.0029, + "reward": 0.5132291627426943, + "reward_std": 0.35270869731903076, + "rewards/accuracy_reward": 0.5132291627426943, + "step": 700 + } + ], + "logging_steps": 1.0, + "max_steps": 1208, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}