{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.06764374295377677, "eval_steps": 300, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 129.59375, "epoch": 0.00045095828635851183, "grad_norm": 8.434212673465728, "kl": 0.0, "learning_rate": 9.997744700045105e-07, "log_metrics/accuracy": 0.007260729558765888, "log_metrics/iou_log": 0.0078125, "loss": 0.0, "max_completion_length": 444.5, "min_completion_length": 45.0, "reward": 0.29296875, "reward_std": 0.42108407616615295, "rewards/format_reward": 0.28515625, "rewards/iou_reward": 0.0078125, "rewards/log_reward": 0.0, "step": 1, "temperature": 1.0 }, { "completion_length": 142.15234375, "epoch": 0.0009019165727170237, "grad_norm": 4.356998223352768, "kl": 0.0017852783203125, "learning_rate": 9.995489400090211e-07, "log_metrics/accuracy": 0.02364518865942955, "log_metrics/iou_log": 0.0234375, "loss": 0.0001, "max_completion_length": 512.0, "min_completion_length": 38.5, "reward": 0.39453125, "reward_std": 0.4661460518836975, "rewards/format_reward": 0.37109375, "rewards/iou_reward": 0.0234375, "rewards/log_reward": 0.0, "step": 2, "temperature": 1.0 }, { "completion_length": 140.8671875, "epoch": 0.0013528748590755355, "grad_norm": 2.5874222658859782, "kl": 0.00408172607421875, "learning_rate": 9.993234100135317e-07, "log_metrics/accuracy": 0.04108293540775776, "log_metrics/iou_log": 0.046875, "loss": 0.0002, "max_completion_length": 512.0, "min_completion_length": 46.5, "reward": 0.7578125, "reward_std": 0.38541457056999207, "rewards/format_reward": 0.7109375, "rewards/iou_reward": 0.046875, "rewards/log_reward": 0.0, "step": 3, "temperature": 1.0 }, { "completion_length": 138.29296875, "epoch": 0.0018038331454340473, "grad_norm": 2.444955671283294, "kl": 0.006561279296875, "learning_rate": 9.990978800180425e-07, "log_metrics/accuracy": 0.019979181233793497, "log_metrics/iou_log": 0.0234375, "loss": 0.0003, "max_completion_length": 373.0, "min_completion_length": 53.0, "reward": 0.7578125, "reward_std": 0.3646235316991806, "rewards/format_reward": 0.734375, "rewards/iou_reward": 0.0234375, "rewards/log_reward": 0.0, "step": 4, "temperature": 1.0 }, { "completion_length": 139.26953125, "epoch": 0.002254791431792559, "grad_norm": 2.4127784594267783, "kl": 0.009033203125, "learning_rate": 9.98872350022553e-07, "log_metrics/accuracy": 0.019243303686380386, "log_metrics/iou_log": 0.0234375, "loss": 0.0004, "max_completion_length": 376.0, "min_completion_length": 50.5, "reward": 0.921875, "reward_std": 0.20379295945167542, "rewards/format_reward": 0.8984375, "rewards/iou_reward": 0.0234375, "rewards/log_reward": 0.0, "step": 5, "temperature": 1.0 }, { "completion_length": 141.515625, "epoch": 0.002705749718151071, "grad_norm": 3.4995669528512625, "kl": 0.0137939453125, "learning_rate": 9.986468200270636e-07, "log_metrics/accuracy": 0.011071678251028061, "log_metrics/iou_log": 0.01171875, "loss": 0.0006, "max_completion_length": 278.5, "min_completion_length": 76.5, "reward": 0.92578125, "reward_std": 0.18992366641759872, "rewards/format_reward": 0.9140625, "rewards/iou_reward": 0.01171875, "rewards/log_reward": 0.0, "step": 6, "temperature": 1.0 }, { "completion_length": 143.9609375, "epoch": 0.003156708004509583, "grad_norm": 1.3423944744269296, "kl": 0.015625, "learning_rate": 9.984212900315742e-07, "log_metrics/accuracy": 0.00020430245785973966, "log_metrics/iou_log": 0.0, "loss": 0.0006, "max_completion_length": 296.0, "min_completion_length": 84.5, "reward": 0.9296875, "reward_std": 0.1642879694700241, "rewards/format_reward": 0.9296875, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 7, "temperature": 1.0 }, { "completion_length": 136.3125, "epoch": 0.0036076662908680946, "grad_norm": 1.0911367434212607, "kl": 0.011993408203125, "learning_rate": 9.981957600360848e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0005, "max_completion_length": 279.0, "min_completion_length": 84.5, "reward": 0.96484375, "reward_std": 0.12082062661647797, "rewards/format_reward": 0.96484375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 8, "temperature": 1.0 }, { "completion_length": 137.1953125, "epoch": 0.004058624577226606, "grad_norm": 1.0840026316503646, "kl": 0.014739990234375, "learning_rate": 9.979702300405953e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0006, "max_completion_length": 262.0, "min_completion_length": 88.5, "reward": 0.93359375, "reward_std": 0.13721734285354614, "rewards/format_reward": 0.93359375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 9, "temperature": 1.0 }, { "completion_length": 141.21484375, "epoch": 0.004509582863585118, "grad_norm": 0.5693248065798014, "kl": 0.016387939453125, "learning_rate": 9.97744700045106e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0007, "max_completion_length": 290.5, "min_completion_length": 92.5, "reward": 0.98828125, "reward_std": 0.03697281330823898, "rewards/format_reward": 0.98828125, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 10, "temperature": 1.0 }, { "completion_length": 139.66015625, "epoch": 0.00496054114994363, "grad_norm": 0.7385235986125652, "kl": 0.0325927734375, "learning_rate": 9.975191700496165e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0013, "max_completion_length": 236.0, "min_completion_length": 94.5, "reward": 0.98828125, "reward_std": 0.03697281330823898, "rewards/format_reward": 0.98828125, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 11, "temperature": 1.0 }, { "completion_length": 142.58203125, "epoch": 0.005411499436302142, "grad_norm": 0.2699152956849861, "kl": 0.0230712890625, "learning_rate": 9.972936400541273e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0009, "max_completion_length": 243.5, "min_completion_length": 95.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 12, "temperature": 1.0 }, { "completion_length": 137.46484375, "epoch": 0.005862457722660654, "grad_norm": 4.498549971158498, "kl": 0.0223388671875, "learning_rate": 9.970681100586379e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0009, "max_completion_length": 224.5, "min_completion_length": 90.0, "reward": 0.98828125, "reward_std": 0.03697281330823898, "rewards/format_reward": 0.98828125, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 13, "temperature": 1.0 }, { "completion_length": 137.58203125, "epoch": 0.006313416009019166, "grad_norm": 1.316359142547647, "kl": 0.020263671875, "learning_rate": 9.968425800631484e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0008, "max_completion_length": 203.5, "min_completion_length": 100.5, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 14, "temperature": 1.0 }, { "completion_length": 146.5625, "epoch": 0.006764374295377677, "grad_norm": 0.5411629015808064, "kl": 0.01934814453125, "learning_rate": 9.96617050067659e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0008, "max_completion_length": 244.0, "min_completion_length": 99.0, "reward": 0.9921875, "reward_std": 0.03125, "rewards/format_reward": 0.9921875, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 15, "temperature": 1.0 }, { "completion_length": 145.5625, "epoch": 0.007215332581736189, "grad_norm": 0.5837134769756639, "kl": 0.01678466796875, "learning_rate": 9.963915200721696e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0007, "max_completion_length": 265.0, "min_completion_length": 89.0, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 16, "temperature": 1.0 }, { "completion_length": 135.44921875, "epoch": 0.007666290868094701, "grad_norm": 0.5126143243632768, "kl": 0.02001953125, "learning_rate": 9.961659900766802e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0008, "max_completion_length": 247.5, "min_completion_length": 99.0, "reward": 0.9921875, "reward_std": 0.03125, "rewards/format_reward": 0.9921875, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 17, "temperature": 1.0 }, { "completion_length": 139.48046875, "epoch": 0.008117249154453212, "grad_norm": 0.8984743564170407, "kl": 0.015838623046875, "learning_rate": 9.959404600811907e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0006, "max_completion_length": 233.0, "min_completion_length": 84.0, "reward": 0.98828125, "reward_std": 0.046875, "rewards/format_reward": 0.98828125, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 18, "temperature": 1.0 }, { "completion_length": 138.43359375, "epoch": 0.008568207440811725, "grad_norm": 0.42188832602144527, "kl": 0.01397705078125, "learning_rate": 9.957149300857013e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0006, "max_completion_length": 231.5, "min_completion_length": 95.0, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 19, "temperature": 1.0 }, { "completion_length": 131.62890625, "epoch": 0.009019165727170236, "grad_norm": 0.32128454476676716, "kl": 0.01458740234375, "learning_rate": 9.954894000902119e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0006, "max_completion_length": 216.5, "min_completion_length": 88.0, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 20, "temperature": 1.0 }, { "completion_length": 126.828125, "epoch": 0.00947012401352875, "grad_norm": 0.48546223545241646, "kl": 0.0172119140625, "learning_rate": 9.952638700947225e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0007, "max_completion_length": 194.5, "min_completion_length": 87.5, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 21, "temperature": 1.0 }, { "completion_length": 129.39453125, "epoch": 0.00992108229988726, "grad_norm": 0.4258953137179797, "kl": 0.015869140625, "learning_rate": 9.950383400992333e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0006, "max_completion_length": 247.5, "min_completion_length": 85.5, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 22, "temperature": 1.0 }, { "completion_length": 129.10546875, "epoch": 0.010372040586245771, "grad_norm": 0.2833217695971168, "kl": 0.013458251953125, "learning_rate": 9.948128101037438e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0005, "max_completion_length": 192.5, "min_completion_length": 86.0, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 23, "temperature": 1.0 }, { "completion_length": 135.796875, "epoch": 0.010822998872604284, "grad_norm": 0.48148300062802873, "kl": 0.0255126953125, "learning_rate": 9.945872801082544e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.001, "max_completion_length": 259.5, "min_completion_length": 89.0, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 24, "temperature": 1.0 }, { "completion_length": 123.671875, "epoch": 0.011273957158962795, "grad_norm": 0.15786664076306023, "kl": 0.015167236328125, "learning_rate": 9.94361750112765e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0006, "max_completion_length": 207.5, "min_completion_length": 83.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 25, "temperature": 1.0 }, { "completion_length": 122.7109375, "epoch": 0.011724915445321308, "grad_norm": 0.13036537143577448, "kl": 0.0208740234375, "learning_rate": 9.941362201172756e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0008, "max_completion_length": 187.5, "min_completion_length": 83.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 26, "temperature": 1.0 }, { "completion_length": 126.6171875, "epoch": 0.01217587373167982, "grad_norm": 1.025009042819294, "kl": 0.01922607421875, "learning_rate": 9.939106901217861e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0008, "max_completion_length": 222.5, "min_completion_length": 87.5, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 27, "temperature": 1.0 }, { "completion_length": 126.890625, "epoch": 0.012626832018038332, "grad_norm": 0.16079440644093526, "kl": 0.0172119140625, "learning_rate": 9.936851601262967e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0007, "max_completion_length": 204.0, "min_completion_length": 84.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 28, "temperature": 1.0 }, { "completion_length": 121.3671875, "epoch": 0.013077790304396843, "grad_norm": 0.19002133711259844, "kl": 0.02008056640625, "learning_rate": 9.934596301308073e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0008, "max_completion_length": 192.5, "min_completion_length": 85.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 29, "temperature": 1.0 }, { "completion_length": 120.9296875, "epoch": 0.013528748590755355, "grad_norm": 0.3356237867953282, "kl": 0.02044677734375, "learning_rate": 9.932341001353179e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0008, "max_completion_length": 201.0, "min_completion_length": 80.0, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 30, "temperature": 1.0 }, { "completion_length": 121.890625, "epoch": 0.013979706877113867, "grad_norm": 0.0972578202221844, "kl": 0.0167236328125, "learning_rate": 9.930085701398284e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0007, "max_completion_length": 225.0, "min_completion_length": 88.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 31, "temperature": 1.0 }, { "completion_length": 125.453125, "epoch": 0.014430665163472379, "grad_norm": 0.16574400218238333, "kl": 0.01788330078125, "learning_rate": 9.92783040144339e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0007, "max_completion_length": 237.5, "min_completion_length": 84.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 32, "temperature": 1.0 }, { "completion_length": 121.51953125, "epoch": 0.014881623449830891, "grad_norm": 0.11795457231634025, "kl": 0.02032470703125, "learning_rate": 9.925575101488498e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0008, "max_completion_length": 235.0, "min_completion_length": 86.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 33, "temperature": 1.0 }, { "completion_length": 124.75390625, "epoch": 0.015332581736189402, "grad_norm": 0.126113812729237, "kl": 0.01666259765625, "learning_rate": 9.923319801533604e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0007, "max_completion_length": 183.0, "min_completion_length": 87.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 34, "temperature": 1.0 }, { "completion_length": 124.16015625, "epoch": 0.015783540022547914, "grad_norm": 0.07662853380658595, "kl": 0.01458740234375, "learning_rate": 9.92106450157871e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0006, "max_completion_length": 180.0, "min_completion_length": 87.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 35, "temperature": 1.0 }, { "completion_length": 128.6484375, "epoch": 0.016234498308906425, "grad_norm": 0.4478082644592694, "kl": 0.014801025390625, "learning_rate": 9.918809201623815e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0006, "max_completion_length": 208.0, "min_completion_length": 84.5, "reward": 0.9921875, "reward_std": 0.03125, "rewards/format_reward": 0.9921875, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 36, "temperature": 1.0 }, { "completion_length": 130.171875, "epoch": 0.01668545659526494, "grad_norm": 0.5289725828976242, "kl": 0.01373291015625, "learning_rate": 9.91655390166892e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0006, "max_completion_length": 223.0, "min_completion_length": 85.5, "reward": 0.9921875, "reward_std": 0.03125, "rewards/format_reward": 0.9921875, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 37, "temperature": 1.0 }, { "completion_length": 132.4609375, "epoch": 0.01713641488162345, "grad_norm": 0.05806697595641871, "kl": 0.013671875, "learning_rate": 9.914298601714027e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0005, "max_completion_length": 257.0, "min_completion_length": 89.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 38, "temperature": 1.0 }, { "completion_length": 131.6796875, "epoch": 0.01758737316798196, "grad_norm": 0.07390387951612225, "kl": 0.015869140625, "learning_rate": 9.912043301759133e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0006, "max_completion_length": 241.0, "min_completion_length": 94.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 39, "temperature": 1.0 }, { "completion_length": 133.99609375, "epoch": 0.018038331454340473, "grad_norm": 0.5552696074810277, "kl": 0.01605224609375, "learning_rate": 9.909788001804238e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0006, "max_completion_length": 380.0, "min_completion_length": 91.5, "reward": 0.98828125, "reward_std": 0.03697281330823898, "rewards/format_reward": 0.98828125, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 40, "temperature": 1.0 }, { "completion_length": 134.671875, "epoch": 0.018489289740698984, "grad_norm": 0.09608326888981661, "kl": 0.0177001953125, "learning_rate": 9.907532701849346e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0007, "max_completion_length": 262.0, "min_completion_length": 90.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 41, "temperature": 1.0 }, { "completion_length": 128.27734375, "epoch": 0.0189402480270575, "grad_norm": 0.22125892499138428, "kl": 0.013702392578125, "learning_rate": 9.905277401894452e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0005, "max_completion_length": 251.0, "min_completion_length": 88.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 42, "temperature": 1.0 }, { "completion_length": 135.359375, "epoch": 0.01939120631341601, "grad_norm": 0.08775045275198128, "kl": 0.015533447265625, "learning_rate": 9.903022101939558e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0006, "max_completion_length": 291.5, "min_completion_length": 83.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 43, "temperature": 1.0 }, { "completion_length": 135.53125, "epoch": 0.01984216459977452, "grad_norm": 0.12136918886632062, "kl": 0.015167236328125, "learning_rate": 9.900766801984663e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0006, "max_completion_length": 288.5, "min_completion_length": 88.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 44, "temperature": 1.0 }, { "completion_length": 128.52734375, "epoch": 0.020293122886133032, "grad_norm": 0.09774392937230035, "kl": 0.010894775390625, "learning_rate": 9.89851150202977e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 222.0, "min_completion_length": 86.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 45, "temperature": 1.0 }, { "completion_length": 131.52734375, "epoch": 0.020744081172491543, "grad_norm": 0.07216886654321743, "kl": 0.012908935546875, "learning_rate": 9.896256202074875e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0005, "max_completion_length": 246.0, "min_completion_length": 89.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 46, "temperature": 1.0 }, { "completion_length": 134.6796875, "epoch": 0.021195039458850057, "grad_norm": 0.3652700320877076, "kl": 0.011932373046875, "learning_rate": 9.89400090211998e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0005, "max_completion_length": 250.0, "min_completion_length": 91.0, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 47, "temperature": 1.0 }, { "completion_length": 134.94140625, "epoch": 0.02164599774520857, "grad_norm": 0.0712237478501456, "kl": 0.0118408203125, "learning_rate": 9.891745602165089e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0005, "max_completion_length": 262.0, "min_completion_length": 89.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 48, "temperature": 1.0 }, { "completion_length": 131.27734375, "epoch": 0.02209695603156708, "grad_norm": 0.07461422124475418, "kl": 0.010101318359375, "learning_rate": 9.889490302210194e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 216.0, "min_completion_length": 89.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 49, "temperature": 1.0 }, { "completion_length": 132.33984375, "epoch": 0.02254791431792559, "grad_norm": 2.3229008992621183, "kl": 0.011199951171875, "learning_rate": 9.8872350022553e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 225.0, "min_completion_length": 89.5, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 50, "temperature": 1.0 }, { "completion_length": 129.7734375, "epoch": 0.022998872604284102, "grad_norm": 0.0875125604597023, "kl": 0.015960693359375, "learning_rate": 9.884979702300406e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0006, "max_completion_length": 210.5, "min_completion_length": 90.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 51, "temperature": 1.0 }, { "completion_length": 129.12890625, "epoch": 0.023449830890642617, "grad_norm": 0.16094378550497987, "kl": 0.01641845703125, "learning_rate": 9.882724402345512e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0007, "max_completion_length": 203.5, "min_completion_length": 86.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 52, "temperature": 1.0 }, { "completion_length": 130.06640625, "epoch": 0.023900789177001128, "grad_norm": 0.07783652453734344, "kl": 0.013031005859375, "learning_rate": 9.880469102390617e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0005, "max_completion_length": 195.5, "min_completion_length": 80.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 53, "temperature": 1.0 }, { "completion_length": 129.9453125, "epoch": 0.02435174746335964, "grad_norm": 0.6621735516896475, "kl": 0.013824462890625, "learning_rate": 9.878213802435723e-07, "log_metrics/accuracy": 0.0036423311103135347, "log_metrics/iou_log": 0.00390625, "loss": 0.0006, "max_completion_length": 208.0, "min_completion_length": 92.5, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.9921875, "rewards/iou_reward": 0.00390625, "rewards/log_reward": 0.0, "step": 54, "temperature": 1.0 }, { "completion_length": 135.72265625, "epoch": 0.02480270574971815, "grad_norm": 0.056449991094393824, "kl": 0.00994873046875, "learning_rate": 9.875958502480829e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 328.0, "min_completion_length": 89.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 55, "temperature": 1.0 }, { "completion_length": 134.05078125, "epoch": 0.025253664036076665, "grad_norm": 0.6059033859841335, "kl": 0.00775146484375, "learning_rate": 9.873703202525937e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0003, "max_completion_length": 228.5, "min_completion_length": 89.5, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 56, "temperature": 1.0 }, { "completion_length": 132.85546875, "epoch": 0.025704622322435176, "grad_norm": 0.037734113426563874, "kl": 0.009033203125, "learning_rate": 9.871447902571042e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 240.5, "min_completion_length": 92.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 57, "temperature": 1.0 }, { "completion_length": 132.78125, "epoch": 0.026155580608793687, "grad_norm": 0.09813082705622107, "kl": 0.010498046875, "learning_rate": 9.869192602616148e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 225.0, "min_completion_length": 92.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 58, "temperature": 1.0 }, { "completion_length": 129.6796875, "epoch": 0.026606538895152198, "grad_norm": 0.272077654342032, "kl": 0.011505126953125, "learning_rate": 9.866937302661254e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0005, "max_completion_length": 218.5, "min_completion_length": 84.0, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 59, "temperature": 1.0 }, { "completion_length": 131.65234375, "epoch": 0.02705749718151071, "grad_norm": 0.0822946603987711, "kl": 0.01092529296875, "learning_rate": 9.86468200270636e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 209.0, "min_completion_length": 84.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 60, "temperature": 1.0 }, { "completion_length": 130.19140625, "epoch": 0.027508455467869224, "grad_norm": 0.055871650503174664, "kl": 0.01324462890625, "learning_rate": 9.862426702751465e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0005, "max_completion_length": 206.5, "min_completion_length": 86.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 61, "temperature": 1.0 }, { "completion_length": 136.4375, "epoch": 0.027959413754227735, "grad_norm": 0.3732397114410677, "kl": 0.010528564453125, "learning_rate": 9.860171402796571e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 250.5, "min_completion_length": 86.0, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 62, "temperature": 1.0 }, { "completion_length": 132.015625, "epoch": 0.028410372040586246, "grad_norm": 1.215038113924891, "kl": 0.0142822265625, "learning_rate": 9.857916102841677e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0006, "max_completion_length": 285.5, "min_completion_length": 82.0, "reward": 0.9921875, "reward_std": 0.03125, "rewards/format_reward": 0.9921875, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 63, "temperature": 1.0 }, { "completion_length": 132.875, "epoch": 0.028861330326944757, "grad_norm": 0.3792773946237974, "kl": 0.011932373046875, "learning_rate": 9.855660802886783e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0005, "max_completion_length": 221.5, "min_completion_length": 88.5, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 64, "temperature": 1.0 }, { "completion_length": 129.75390625, "epoch": 0.029312288613303268, "grad_norm": 0.06882533947287736, "kl": 0.010345458984375, "learning_rate": 9.85340550293189e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 197.0, "min_completion_length": 90.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 65, "temperature": 1.0 }, { "completion_length": 132.9609375, "epoch": 0.029763246899661783, "grad_norm": 0.7469897504399653, "kl": 0.0126953125, "learning_rate": 9.851150202976996e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0005, "max_completion_length": 243.0, "min_completion_length": 87.5, "reward": 0.98828125, "reward_std": 0.046875, "rewards/format_reward": 0.98828125, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 66, "temperature": 1.0 }, { "completion_length": 130.66015625, "epoch": 0.030214205186020294, "grad_norm": 0.47470539847306675, "kl": 0.01165771484375, "learning_rate": 9.848894903022102e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0005, "max_completion_length": 269.0, "min_completion_length": 91.0, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 67, "temperature": 1.0 }, { "completion_length": 131.70703125, "epoch": 0.030665163472378805, "grad_norm": 0.38448758695832475, "kl": 0.016815185546875, "learning_rate": 9.846639603067208e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0007, "max_completion_length": 259.5, "min_completion_length": 91.0, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 68, "temperature": 1.0 }, { "completion_length": 138.6484375, "epoch": 0.031116121758737316, "grad_norm": 0.15390242756935207, "kl": 0.01177978515625, "learning_rate": 9.844384303112314e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0005, "max_completion_length": 258.0, "min_completion_length": 88.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 69, "temperature": 1.0 }, { "completion_length": 130.78515625, "epoch": 0.03156708004509583, "grad_norm": 0.6291008512876635, "kl": 0.014617919921875, "learning_rate": 9.84212900315742e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0006, "max_completion_length": 232.5, "min_completion_length": 91.5, "reward": 0.9921875, "reward_std": 0.03125, "rewards/format_reward": 0.9921875, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 70, "temperature": 1.0 }, { "completion_length": 132.33984375, "epoch": 0.03201803833145434, "grad_norm": 0.05684071522205286, "kl": 0.010711669921875, "learning_rate": 9.839873703202525e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 269.0, "min_completion_length": 90.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 71, "temperature": 1.0 }, { "completion_length": 127.94140625, "epoch": 0.03246899661781285, "grad_norm": 0.05157290002711523, "kl": 0.009552001953125, "learning_rate": 9.83761840324763e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 234.0, "min_completion_length": 88.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 72, "temperature": 1.0 }, { "completion_length": 130.69140625, "epoch": 0.032919954904171364, "grad_norm": 0.36195683796616246, "kl": 0.0101318359375, "learning_rate": 9.835363103292737e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 274.5, "min_completion_length": 90.5, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 73, "temperature": 1.0 }, { "completion_length": 133.31640625, "epoch": 0.03337091319052988, "grad_norm": 0.05629834594638958, "kl": 0.014434814453125, "learning_rate": 9.833107803337842e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0006, "max_completion_length": 249.5, "min_completion_length": 88.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 74, "temperature": 1.0 }, { "completion_length": 131.1171875, "epoch": 0.033821871476888386, "grad_norm": 0.06013498747067093, "kl": 0.0140380859375, "learning_rate": 9.830852503382948e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0006, "max_completion_length": 264.0, "min_completion_length": 87.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 75, "temperature": 1.0 }, { "completion_length": 129.15234375, "epoch": 0.0342728297632469, "grad_norm": 0.06570934118415175, "kl": 0.009246826171875, "learning_rate": 9.828597203428056e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 223.0, "min_completion_length": 86.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 76, "temperature": 1.0 }, { "completion_length": 133.3125, "epoch": 0.03472378804960541, "grad_norm": 0.05957393968454419, "kl": 0.0108642578125, "learning_rate": 9.826341903473162e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 287.5, "min_completion_length": 96.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 77, "temperature": 1.0 }, { "completion_length": 134.0, "epoch": 0.03517474633596392, "grad_norm": 0.05504430029751346, "kl": 0.008758544921875, "learning_rate": 9.824086603518268e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 221.0, "min_completion_length": 92.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 78, "temperature": 1.0 }, { "completion_length": 130.1875, "epoch": 0.03562570462232244, "grad_norm": 0.06155937354569755, "kl": 0.011016845703125, "learning_rate": 9.821831303563373e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 217.0, "min_completion_length": 88.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 79, "temperature": 1.0 }, { "completion_length": 130.0703125, "epoch": 0.036076662908680945, "grad_norm": 0.09185105404429818, "kl": 0.00909423828125, "learning_rate": 9.81957600360848e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 218.0, "min_completion_length": 87.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 80, "temperature": 1.0 }, { "completion_length": 132.06640625, "epoch": 0.03652762119503946, "grad_norm": 0.04961982123085122, "kl": 0.011138916015625, "learning_rate": 9.817320703653585e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 231.0, "min_completion_length": 84.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 81, "temperature": 1.0 }, { "completion_length": 131.515625, "epoch": 0.03697857948139797, "grad_norm": 0.0410521754154648, "kl": 0.01123046875, "learning_rate": 9.81506540369869e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 252.5, "min_completion_length": 90.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 82, "temperature": 1.0 }, { "completion_length": 131.09765625, "epoch": 0.03742953776775648, "grad_norm": 0.029161244357827847, "kl": 0.0074005126953125, "learning_rate": 9.812810103743796e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0003, "max_completion_length": 219.5, "min_completion_length": 86.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 83, "temperature": 1.0 }, { "completion_length": 133.95703125, "epoch": 0.037880496054115, "grad_norm": 0.0550794318873668, "kl": 0.011444091796875, "learning_rate": 9.810554803788902e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0005, "max_completion_length": 253.0, "min_completion_length": 93.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 84, "temperature": 1.0 }, { "completion_length": 133.12890625, "epoch": 0.038331454340473504, "grad_norm": 0.3794360227093856, "kl": 0.0089111328125, "learning_rate": 9.80829950383401e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 235.0, "min_completion_length": 85.5, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 85, "temperature": 1.0 }, { "completion_length": 130.875, "epoch": 0.03878241262683202, "grad_norm": 0.05260711761694229, "kl": 0.008270263671875, "learning_rate": 9.806044203879116e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0003, "max_completion_length": 236.5, "min_completion_length": 85.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 86, "temperature": 1.0 }, { "completion_length": 136.50390625, "epoch": 0.03923337091319053, "grad_norm": 0.03637909476903296, "kl": 0.0069580078125, "learning_rate": 9.803788903924222e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0003, "max_completion_length": 215.0, "min_completion_length": 91.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 87, "temperature": 1.0 }, { "completion_length": 131.37109375, "epoch": 0.03968432919954904, "grad_norm": 0.031058014010275966, "kl": 0.013153076171875, "learning_rate": 9.801533603969327e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0005, "max_completion_length": 226.5, "min_completion_length": 90.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 88, "temperature": 1.0 }, { "completion_length": 141.54296875, "epoch": 0.040135287485907556, "grad_norm": 0.04156967471425809, "kl": 0.01055908203125, "learning_rate": 9.799278304014433e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 287.5, "min_completion_length": 92.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 89, "temperature": 1.0 }, { "completion_length": 133.45703125, "epoch": 0.040586245772266064, "grad_norm": 0.04432572627688822, "kl": 0.0111083984375, "learning_rate": 9.797023004059539e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 210.0, "min_completion_length": 95.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 90, "temperature": 1.0 }, { "completion_length": 134.28125, "epoch": 0.04103720405862458, "grad_norm": 0.7407667374975673, "kl": 0.022186279296875, "learning_rate": 9.794767704104645e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0009, "max_completion_length": 218.0, "min_completion_length": 93.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 91, "temperature": 1.0 }, { "completion_length": 137.15625, "epoch": 0.041488162344983086, "grad_norm": 0.41097995708484697, "kl": 0.011474609375, "learning_rate": 9.792512404149752e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0005, "max_completion_length": 248.0, "min_completion_length": 85.0, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 92, "temperature": 1.0 }, { "completion_length": 137.140625, "epoch": 0.0419391206313416, "grad_norm": 0.04758776396448601, "kl": 0.009674072265625, "learning_rate": 9.790257104194858e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 262.0, "min_completion_length": 85.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 93, "temperature": 1.0 }, { "completion_length": 141.03125, "epoch": 0.042390078917700115, "grad_norm": 0.2485793639149779, "kl": 0.0203857421875, "learning_rate": 9.788001804239964e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0008, "max_completion_length": 269.5, "min_completion_length": 98.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 94, "temperature": 1.0 }, { "completion_length": 134.24609375, "epoch": 0.04284103720405862, "grad_norm": 0.2540785535950062, "kl": 0.0247802734375, "learning_rate": 9.78574650428507e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.001, "max_completion_length": 230.0, "min_completion_length": 87.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 95, "temperature": 1.0 }, { "completion_length": 134.51953125, "epoch": 0.04329199549041714, "grad_norm": 0.16330960689627924, "kl": 0.0205078125, "learning_rate": 9.783491204330175e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0008, "max_completion_length": 217.5, "min_completion_length": 91.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 96, "temperature": 1.0 }, { "completion_length": 136.6953125, "epoch": 0.043742953776775645, "grad_norm": 0.14972836319565624, "kl": 0.015106201171875, "learning_rate": 9.781235904375281e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0006, "max_completion_length": 232.0, "min_completion_length": 88.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 97, "temperature": 1.0 }, { "completion_length": 135.2109375, "epoch": 0.04419391206313416, "grad_norm": 0.06666610741611737, "kl": 0.010040283203125, "learning_rate": 9.778980604420387e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 230.5, "min_completion_length": 95.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 98, "temperature": 1.0 }, { "completion_length": 136.07421875, "epoch": 0.044644870349492674, "grad_norm": 0.03692527598375854, "kl": 0.01007080078125, "learning_rate": 9.776725304465493e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 216.0, "min_completion_length": 87.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 99, "temperature": 1.0 }, { "completion_length": 138.29296875, "epoch": 0.04509582863585118, "grad_norm": 0.0697985947271456, "kl": 0.01080322265625, "learning_rate": 9.7744700045106e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 308.0, "min_completion_length": 91.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 100, "temperature": 1.0 }, { "completion_length": 133.515625, "epoch": 0.045546786922209696, "grad_norm": 0.07309140477924224, "kl": 0.010772705078125, "learning_rate": 9.772214704555706e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 224.0, "min_completion_length": 88.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 101, "temperature": 1.0 }, { "completion_length": 130.36328125, "epoch": 0.045997745208568204, "grad_norm": 0.05574240124141491, "kl": 0.01068115234375, "learning_rate": 9.769959404600812e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 208.0, "min_completion_length": 89.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 102, "temperature": 1.0 }, { "completion_length": 139.234375, "epoch": 0.04644870349492672, "grad_norm": 0.03519854121109168, "kl": 0.013427734375, "learning_rate": 9.767704104645918e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0005, "max_completion_length": 327.0, "min_completion_length": 91.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 103, "temperature": 1.0 }, { "completion_length": 131.1015625, "epoch": 0.04689966178128523, "grad_norm": 0.03554996767560797, "kl": 0.010955810546875, "learning_rate": 9.765448804691024e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 238.5, "min_completion_length": 73.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 104, "temperature": 1.0 }, { "completion_length": 136.60546875, "epoch": 0.04735062006764374, "grad_norm": 0.042584011523593555, "kl": 0.01165771484375, "learning_rate": 9.76319350473613e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0005, "max_completion_length": 257.5, "min_completion_length": 89.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 105, "temperature": 1.0 }, { "completion_length": 135.66796875, "epoch": 0.047801578354002255, "grad_norm": 0.04615459145131747, "kl": 0.010589599609375, "learning_rate": 9.760938204781235e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 310.5, "min_completion_length": 87.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 106, "temperature": 1.0 }, { "completion_length": 135.6328125, "epoch": 0.04825253664036077, "grad_norm": 0.045923587392494976, "kl": 0.010223388671875, "learning_rate": 9.758682904826343e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 225.0, "min_completion_length": 79.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 107, "temperature": 1.0 }, { "completion_length": 133.73046875, "epoch": 0.04870349492671928, "grad_norm": 0.044568286212471234, "kl": 0.0089111328125, "learning_rate": 9.756427604871449e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 217.5, "min_completion_length": 92.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 108, "temperature": 1.0 }, { "completion_length": 131.06640625, "epoch": 0.04915445321307779, "grad_norm": 0.2549839899096623, "kl": 0.0092620849609375, "learning_rate": 9.754172304916554e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 229.0, "min_completion_length": 83.5, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 109, "temperature": 1.0 }, { "completion_length": 133.9375, "epoch": 0.0496054114994363, "grad_norm": 0.28421440889201194, "kl": 0.01019287109375, "learning_rate": 9.75191700496166e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 246.0, "min_completion_length": 93.0, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 110, "temperature": 1.0 }, { "completion_length": 135.9296875, "epoch": 0.050056369785794814, "grad_norm": 0.7286024428170503, "kl": 0.01153564453125, "learning_rate": 9.749661705006766e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0005, "max_completion_length": 240.0, "min_completion_length": 90.5, "reward": 0.99609375, "reward_std": 0.015625, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 111, "temperature": 1.0 }, { "completion_length": 135.11328125, "epoch": 0.05050732807215333, "grad_norm": 0.03670784769416828, "kl": 0.010528564453125, "learning_rate": 9.747406405051872e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 231.5, "min_completion_length": 92.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 112, "temperature": 1.0 }, { "completion_length": 127.98828125, "epoch": 0.05095828635851184, "grad_norm": 0.042164123293671474, "kl": 0.0087890625, "learning_rate": 9.745151105096978e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 239.0, "min_completion_length": 81.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 113, "temperature": 1.0 }, { "completion_length": 130.52734375, "epoch": 0.05140924464487035, "grad_norm": 0.04359303746661243, "kl": 0.01031494140625, "learning_rate": 9.742895805142083e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 265.5, "min_completion_length": 82.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 114, "temperature": 1.0 }, { "completion_length": 133.34765625, "epoch": 0.05186020293122886, "grad_norm": 0.05312958242052849, "kl": 0.009002685546875, "learning_rate": 9.74064050518719e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 217.5, "min_completion_length": 89.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 115, "temperature": 1.0 }, { "completion_length": 135.421875, "epoch": 0.052311161217587374, "grad_norm": 0.04383689920001928, "kl": 0.0076446533203125, "learning_rate": 9.738385205232295e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0003, "max_completion_length": 257.5, "min_completion_length": 89.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 116, "temperature": 1.0 }, { "completion_length": 129.4375, "epoch": 0.05276211950394589, "grad_norm": 0.046719015497805653, "kl": 0.008270263671875, "learning_rate": 9.7361299052774e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0003, "max_completion_length": 225.0, "min_completion_length": 87.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 117, "temperature": 1.0 }, { "completion_length": 134.2734375, "epoch": 0.053213077790304396, "grad_norm": 0.04862725334362555, "kl": 0.00933837890625, "learning_rate": 9.733874605322508e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 237.5, "min_completion_length": 82.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 118, "temperature": 1.0 }, { "completion_length": 133.13671875, "epoch": 0.05366403607666291, "grad_norm": 0.10031986851741669, "kl": 0.0108642578125, "learning_rate": 9.731619305367614e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 243.5, "min_completion_length": 89.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 119, "temperature": 1.0 }, { "completion_length": 132.49609375, "epoch": 0.05411499436302142, "grad_norm": 0.031112150572192358, "kl": 0.0073394775390625, "learning_rate": 9.72936400541272e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0003, "max_completion_length": 288.0, "min_completion_length": 86.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 120, "temperature": 1.0 }, { "completion_length": 134.01953125, "epoch": 0.05456595264937993, "grad_norm": 0.5940559108446518, "kl": 0.013671875, "learning_rate": 9.727108705457826e-07, "log_metrics/accuracy": 0.003257421776652336, "log_metrics/iou_log": 0.00390625, "loss": 0.0005, "max_completion_length": 258.0, "min_completion_length": 85.5, "reward": 1.0, "reward_std": 0.03125, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.00390625, "rewards/log_reward": 0.0, "step": 121, "temperature": 1.0 }, { "completion_length": 139.04296875, "epoch": 0.05501691093573845, "grad_norm": 0.03639097352803687, "kl": 0.007720947265625, "learning_rate": 9.724853405502931e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0003, "max_completion_length": 262.0, "min_completion_length": 94.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 122, "temperature": 1.0 }, { "completion_length": 131.33984375, "epoch": 0.055467869222096955, "grad_norm": 0.029926586040708514, "kl": 0.008544921875, "learning_rate": 9.722598105548037e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0003, "max_completion_length": 185.5, "min_completion_length": 92.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 123, "temperature": 1.0 }, { "completion_length": 135.640625, "epoch": 0.05591882750845547, "grad_norm": 0.07007980580824669, "kl": 0.008758544921875, "learning_rate": 9.720342805593143e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 263.5, "min_completion_length": 88.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 124, "temperature": 1.0 }, { "completion_length": 134.91015625, "epoch": 0.05636978579481398, "grad_norm": 0.038029665047696073, "kl": 0.010833740234375, "learning_rate": 9.718087505638249e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 301.5, "min_completion_length": 93.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 125, "temperature": 1.0 }, { "completion_length": 136.80078125, "epoch": 0.05682074408117249, "grad_norm": 0.03399658389217789, "kl": 0.010498046875, "learning_rate": 9.715832205683354e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 253.5, "min_completion_length": 90.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 126, "temperature": 1.0 }, { "completion_length": 132.7421875, "epoch": 0.057271702367531006, "grad_norm": 0.0288651577135693, "kl": 0.0081787109375, "learning_rate": 9.71357690572846e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0003, "max_completion_length": 233.5, "min_completion_length": 87.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 127, "temperature": 1.0 }, { "completion_length": 138.23828125, "epoch": 0.057722660653889514, "grad_norm": 0.029894092306988484, "kl": 0.00921630859375, "learning_rate": 9.711321605773566e-07, "log_metrics/accuracy": 0.0, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 239.0, "min_completion_length": 91.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 128, "temperature": 1.0 }, { "completion_length": 136.296875, "epoch": 0.05817361894024803, "grad_norm": 0.02817290776298427, "kl": 0.009246826171875, "learning_rate": 9.709066305818674e-07, "log_metrics/accuracy": 0.001520317979156971, "log_metrics/iou_log": 0.0, "loss": 0.0004, "max_completion_length": 316.5, "min_completion_length": 83.5, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.0, "rewards/log_reward": 0.0, "step": 129, "temperature": 1.0 }, { "completion_length": 130.125, "epoch": 0.058624577226606536, "grad_norm": 0.6165285520370335, "kl": 0.01171875, "learning_rate": 9.70681100586378e-07, "log_metrics/accuracy": 0.0029867857228964567, "log_metrics/iou_log": 0.00390625, "loss": 0.0005, "max_completion_length": 216.0, "min_completion_length": 82.0, "reward": 1.00390625, "reward_std": 0.015625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.00390625, "rewards/log_reward": 0.0, "step": 130, "temperature": 1.0 }, { "completion_length": 133.953125, "epoch": 0.05907553551296505, "grad_norm": 0.4803658881359604, "kl": 0.00921630859375, "learning_rate": 9.704555705908885e-07, "log_metrics/accuracy": 0.0028719999827444553, "log_metrics/iou_log": 0.00390625, "loss": 0.0004, "max_completion_length": 246.0, "min_completion_length": 79.0, "reward": 1.00390625, "reward_std": 0.015625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.00390625, "rewards/log_reward": 0.0, "step": 131, "temperature": 1.0 }, { "completion_length": 129.59765625, "epoch": 0.059526493799323565, "grad_norm": 1.7435921168622184, "kl": 0.01397705078125, "learning_rate": 9.702300405953991e-07, "log_metrics/accuracy": 0.03585699386894703, "log_metrics/iou_log": 0.03515625, "loss": 0.0006, "max_completion_length": 238.0, "min_completion_length": 69.5, "reward": 1.0234375, "reward_std": 0.125, "rewards/format_reward": 0.98828125, "rewards/iou_reward": 0.03515625, "rewards/log_reward": 0.0, "step": 132, "temperature": 1.0 }, { "completion_length": 117.4921875, "epoch": 0.05997745208568207, "grad_norm": 6.152409630252781, "kl": 0.05615234375, "learning_rate": 9.700045105999097e-07, "log_metrics/accuracy": 0.3532668203115463, "log_metrics/iou_log": 0.3984375, "loss": 0.0022, "max_completion_length": 229.0, "min_completion_length": 62.0, "reward": 1.3828125, "reward_std": 0.4916256368160248, "rewards/format_reward": 0.984375, "rewards/iou_reward": 0.3984375, "rewards/log_reward": 0.0, "step": 133, "temperature": 1.0 }, { "completion_length": 99.53125, "epoch": 0.06042841037204059, "grad_norm": 24.520553591942157, "kl": 0.1240234375, "learning_rate": 9.697789806044203e-07, "log_metrics/accuracy": 0.6385847628116608, "log_metrics/iou_log": 0.6953125, "loss": 0.0049, "max_completion_length": 230.0, "min_completion_length": 55.5, "reward": 1.67578125, "reward_std": 0.31701020896434784, "rewards/format_reward": 0.98046875, "rewards/iou_reward": 0.6953125, "rewards/log_reward": 0.0, "step": 134, "temperature": 1.0 }, { "completion_length": 102.87890625, "epoch": 0.060879368658399095, "grad_norm": 2.4974832118570727, "kl": 0.113525390625, "learning_rate": 9.695534506089308e-07, "log_metrics/accuracy": 0.6731529831886292, "log_metrics/iou_log": 0.76171875, "loss": 0.0045, "max_completion_length": 186.0, "min_completion_length": 58.5, "reward": 1.7578125, "reward_std": 0.3095604404807091, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.76171875, "rewards/log_reward": 0.0, "step": 135, "temperature": 1.0 }, { "completion_length": 103.109375, "epoch": 0.06133032694475761, "grad_norm": 3.516339202499408, "kl": 0.118408203125, "learning_rate": 9.693279206134416e-07, "log_metrics/accuracy": 0.6554303467273712, "log_metrics/iou_log": 0.6796875, "loss": 0.0047, "max_completion_length": 226.5, "min_completion_length": 54.0, "reward": 1.6640625, "reward_std": 0.28480498492717743, "rewards/format_reward": 0.984375, "rewards/iou_reward": 0.6796875, "rewards/log_reward": 0.0, "step": 136, "temperature": 1.0 }, { "completion_length": 104.65625, "epoch": 0.061781285231116125, "grad_norm": 4.120853025373372, "kl": 0.180908203125, "learning_rate": 9.691023906179522e-07, "log_metrics/accuracy": 0.7559227645397186, "log_metrics/iou_log": 0.8984375, "loss": 0.0072, "max_completion_length": 204.0, "min_completion_length": 60.5, "reward": 1.8828125, "reward_std": 0.12466736882925034, "rewards/format_reward": 0.984375, "rewards/iou_reward": 0.8984375, "rewards/log_reward": 0.0, "step": 137, "temperature": 1.0 }, { "completion_length": 104.96484375, "epoch": 0.06223224351747463, "grad_norm": 1.9791163456647438, "kl": 0.11181640625, "learning_rate": 9.688768606224628e-07, "log_metrics/accuracy": 0.6887724995613098, "log_metrics/iou_log": 0.7109375, "loss": 0.0045, "max_completion_length": 246.0, "min_completion_length": 60.0, "reward": 1.70703125, "reward_std": 0.21135114878416061, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.7109375, "rewards/log_reward": 0.0, "step": 138, "temperature": 1.0 }, { "completion_length": 99.19140625, "epoch": 0.06268320180383315, "grad_norm": 5.5467544556496335, "kl": 0.114990234375, "learning_rate": 9.686513306269734e-07, "log_metrics/accuracy": 0.7241671979427338, "log_metrics/iou_log": 0.82421875, "loss": 0.0046, "max_completion_length": 194.0, "min_completion_length": 60.0, "reward": 1.82421875, "reward_std": 0.15309549123048782, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.82421875, "rewards/log_reward": 0.0, "step": 139, "temperature": 1.0 }, { "completion_length": 101.36328125, "epoch": 0.06313416009019165, "grad_norm": 12.142762617479395, "kl": 0.11865234375, "learning_rate": 9.68425800631484e-07, "log_metrics/accuracy": 0.712556004524231, "log_metrics/iou_log": 0.79296875, "loss": 0.0047, "max_completion_length": 190.0, "min_completion_length": 55.5, "reward": 1.78515625, "reward_std": 0.2706931382417679, "rewards/format_reward": 0.9921875, "rewards/iou_reward": 0.79296875, "rewards/log_reward": 0.0, "step": 140, "temperature": 1.0 }, { "completion_length": 101.61328125, "epoch": 0.06358511837655018, "grad_norm": 3.640232102574435, "kl": 0.1201171875, "learning_rate": 9.682002706359945e-07, "log_metrics/accuracy": 0.7055022418498993, "log_metrics/iou_log": 0.765625, "loss": 0.0048, "max_completion_length": 211.5, "min_completion_length": 55.5, "reward": 1.765625, "reward_std": 0.2804790586233139, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.765625, "rewards/log_reward": 0.0, "step": 141, "temperature": 1.0 }, { "completion_length": 97.63671875, "epoch": 0.06403607666290868, "grad_norm": 2.667593861521047, "kl": 0.122314453125, "learning_rate": 9.67974740640505e-07, "log_metrics/accuracy": 0.7026576399803162, "log_metrics/iou_log": 0.7890625, "loss": 0.0049, "max_completion_length": 207.0, "min_completion_length": 58.5, "reward": 1.78515625, "reward_std": 0.24758073687553406, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.7890625, "rewards/log_reward": 0.0, "step": 142, "temperature": 1.0 }, { "completion_length": 100.74609375, "epoch": 0.06448703494926719, "grad_norm": 3.1074124076238916, "kl": 0.123779296875, "learning_rate": 9.677492106450157e-07, "log_metrics/accuracy": 0.6754811108112335, "log_metrics/iou_log": 0.75, "loss": 0.005, "max_completion_length": 209.5, "min_completion_length": 55.5, "reward": 1.74609375, "reward_std": 0.268774151802063, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.75, "rewards/log_reward": 0.0, "step": 143, "temperature": 1.0 }, { "completion_length": 101.8125, "epoch": 0.0649379932356257, "grad_norm": 1.7944760480191515, "kl": 0.1220703125, "learning_rate": 9.675236806495264e-07, "log_metrics/accuracy": 0.71114382147789, "log_metrics/iou_log": 0.7890625, "loss": 0.0049, "max_completion_length": 217.0, "min_completion_length": 56.5, "reward": 1.78515625, "reward_std": 0.27572914958000183, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.7890625, "rewards/log_reward": 0.0, "step": 144, "temperature": 1.0 }, { "completion_length": 99.64453125, "epoch": 0.06538895152198422, "grad_norm": 2.086624174741133, "kl": 0.124267578125, "learning_rate": 9.67298150654037e-07, "log_metrics/accuracy": 0.748405933380127, "log_metrics/iou_log": 0.83984375, "loss": 0.005, "max_completion_length": 213.0, "min_completion_length": 63.5, "reward": 1.8359375, "reward_std": 0.23154567182064056, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.83984375, "rewards/log_reward": 0.0, "step": 145, "temperature": 1.0 }, { "completion_length": 96.7890625, "epoch": 0.06583990980834273, "grad_norm": 3.3643857707503435, "kl": 0.12841796875, "learning_rate": 9.670726206585476e-07, "log_metrics/accuracy": 0.7452348172664642, "log_metrics/iou_log": 0.8515625, "loss": 0.0052, "max_completion_length": 176.0, "min_completion_length": 59.0, "reward": 1.8515625, "reward_std": 0.23864974081516266, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8515625, "rewards/log_reward": 0.0, "step": 146, "temperature": 1.0 }, { "completion_length": 97.89453125, "epoch": 0.06629086809470124, "grad_norm": 2.0566073759257706, "kl": 0.1337890625, "learning_rate": 9.668470906630582e-07, "log_metrics/accuracy": 0.7216435968875885, "log_metrics/iou_log": 0.79296875, "loss": 0.0054, "max_completion_length": 206.0, "min_completion_length": 58.5, "reward": 1.79296875, "reward_std": 0.13226625323295593, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.79296875, "rewards/log_reward": 0.0, "step": 147, "temperature": 1.0 }, { "completion_length": 97.98828125, "epoch": 0.06674182638105976, "grad_norm": 1.379192128046406, "kl": 0.1318359375, "learning_rate": 9.666215606675687e-07, "log_metrics/accuracy": 0.622128963470459, "log_metrics/iou_log": 0.65625, "loss": 0.0053, "max_completion_length": 198.0, "min_completion_length": 56.5, "reward": 1.65625, "reward_std": 0.21148452162742615, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.65625, "rewards/log_reward": 0.0, "step": 148, "temperature": 1.0 }, { "completion_length": 97.74609375, "epoch": 0.06719278466741826, "grad_norm": 1.740571009912497, "kl": 0.118896484375, "learning_rate": 9.663960306720793e-07, "log_metrics/accuracy": 0.7791113555431366, "log_metrics/iou_log": 0.8515625, "loss": 0.0048, "max_completion_length": 203.0, "min_completion_length": 54.0, "reward": 1.84765625, "reward_std": 0.15834103524684906, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.8515625, "rewards/log_reward": 0.0, "step": 149, "temperature": 1.0 }, { "completion_length": 97.7734375, "epoch": 0.06764374295377677, "grad_norm": 1.832460748446639, "kl": 0.1318359375, "learning_rate": 9.6617050067659e-07, "log_metrics/accuracy": 0.7626213431358337, "log_metrics/iou_log": 0.83203125, "loss": 0.0053, "max_completion_length": 151.5, "min_completion_length": 63.0, "reward": 1.828125, "reward_std": 0.22953035682439804, "rewards/format_reward": 0.99609375, "rewards/iou_reward": 0.83203125, "rewards/log_reward": 0.0, "step": 150, "temperature": 1.0 } ], "logging_steps": 1.0, "max_steps": 4434, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }