dddraxxx's picture
Training in progress, step 150, checkpoint
7138c43 verified
raw
history blame contribute delete
87.3 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.06764374295377677,
"eval_steps": 300,
"global_step": 150,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 129.59375,
"epoch": 0.00045095828635851183,
"grad_norm": 8.434212673465728,
"kl": 0.0,
"learning_rate": 9.997744700045105e-07,
"log_metrics/accuracy": 0.007260729558765888,
"log_metrics/iou_log": 0.0078125,
"loss": 0.0,
"max_completion_length": 444.5,
"min_completion_length": 45.0,
"reward": 0.29296875,
"reward_std": 0.42108407616615295,
"rewards/format_reward": 0.28515625,
"rewards/iou_reward": 0.0078125,
"rewards/log_reward": 0.0,
"step": 1,
"temperature": 1.0
},
{
"completion_length": 142.15234375,
"epoch": 0.0009019165727170237,
"grad_norm": 4.356998223352768,
"kl": 0.0017852783203125,
"learning_rate": 9.995489400090211e-07,
"log_metrics/accuracy": 0.02364518865942955,
"log_metrics/iou_log": 0.0234375,
"loss": 0.0001,
"max_completion_length": 512.0,
"min_completion_length": 38.5,
"reward": 0.39453125,
"reward_std": 0.4661460518836975,
"rewards/format_reward": 0.37109375,
"rewards/iou_reward": 0.0234375,
"rewards/log_reward": 0.0,
"step": 2,
"temperature": 1.0
},
{
"completion_length": 140.8671875,
"epoch": 0.0013528748590755355,
"grad_norm": 2.5874222658859782,
"kl": 0.00408172607421875,
"learning_rate": 9.993234100135317e-07,
"log_metrics/accuracy": 0.04108293540775776,
"log_metrics/iou_log": 0.046875,
"loss": 0.0002,
"max_completion_length": 512.0,
"min_completion_length": 46.5,
"reward": 0.7578125,
"reward_std": 0.38541457056999207,
"rewards/format_reward": 0.7109375,
"rewards/iou_reward": 0.046875,
"rewards/log_reward": 0.0,
"step": 3,
"temperature": 1.0
},
{
"completion_length": 138.29296875,
"epoch": 0.0018038331454340473,
"grad_norm": 2.444955671283294,
"kl": 0.006561279296875,
"learning_rate": 9.990978800180425e-07,
"log_metrics/accuracy": 0.019979181233793497,
"log_metrics/iou_log": 0.0234375,
"loss": 0.0003,
"max_completion_length": 373.0,
"min_completion_length": 53.0,
"reward": 0.7578125,
"reward_std": 0.3646235316991806,
"rewards/format_reward": 0.734375,
"rewards/iou_reward": 0.0234375,
"rewards/log_reward": 0.0,
"step": 4,
"temperature": 1.0
},
{
"completion_length": 139.26953125,
"epoch": 0.002254791431792559,
"grad_norm": 2.4127784594267783,
"kl": 0.009033203125,
"learning_rate": 9.98872350022553e-07,
"log_metrics/accuracy": 0.019243303686380386,
"log_metrics/iou_log": 0.0234375,
"loss": 0.0004,
"max_completion_length": 376.0,
"min_completion_length": 50.5,
"reward": 0.921875,
"reward_std": 0.20379295945167542,
"rewards/format_reward": 0.8984375,
"rewards/iou_reward": 0.0234375,
"rewards/log_reward": 0.0,
"step": 5,
"temperature": 1.0
},
{
"completion_length": 141.515625,
"epoch": 0.002705749718151071,
"grad_norm": 3.4995669528512625,
"kl": 0.0137939453125,
"learning_rate": 9.986468200270636e-07,
"log_metrics/accuracy": 0.011071678251028061,
"log_metrics/iou_log": 0.01171875,
"loss": 0.0006,
"max_completion_length": 278.5,
"min_completion_length": 76.5,
"reward": 0.92578125,
"reward_std": 0.18992366641759872,
"rewards/format_reward": 0.9140625,
"rewards/iou_reward": 0.01171875,
"rewards/log_reward": 0.0,
"step": 6,
"temperature": 1.0
},
{
"completion_length": 143.9609375,
"epoch": 0.003156708004509583,
"grad_norm": 1.3423944744269296,
"kl": 0.015625,
"learning_rate": 9.984212900315742e-07,
"log_metrics/accuracy": 0.00020430245785973966,
"log_metrics/iou_log": 0.0,
"loss": 0.0006,
"max_completion_length": 296.0,
"min_completion_length": 84.5,
"reward": 0.9296875,
"reward_std": 0.1642879694700241,
"rewards/format_reward": 0.9296875,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 7,
"temperature": 1.0
},
{
"completion_length": 136.3125,
"epoch": 0.0036076662908680946,
"grad_norm": 1.0911367434212607,
"kl": 0.011993408203125,
"learning_rate": 9.981957600360848e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0005,
"max_completion_length": 279.0,
"min_completion_length": 84.5,
"reward": 0.96484375,
"reward_std": 0.12082062661647797,
"rewards/format_reward": 0.96484375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 8,
"temperature": 1.0
},
{
"completion_length": 137.1953125,
"epoch": 0.004058624577226606,
"grad_norm": 1.0840026316503646,
"kl": 0.014739990234375,
"learning_rate": 9.979702300405953e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0006,
"max_completion_length": 262.0,
"min_completion_length": 88.5,
"reward": 0.93359375,
"reward_std": 0.13721734285354614,
"rewards/format_reward": 0.93359375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 9,
"temperature": 1.0
},
{
"completion_length": 141.21484375,
"epoch": 0.004509582863585118,
"grad_norm": 0.5693248065798014,
"kl": 0.016387939453125,
"learning_rate": 9.97744700045106e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0007,
"max_completion_length": 290.5,
"min_completion_length": 92.5,
"reward": 0.98828125,
"reward_std": 0.03697281330823898,
"rewards/format_reward": 0.98828125,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 10,
"temperature": 1.0
},
{
"completion_length": 139.66015625,
"epoch": 0.00496054114994363,
"grad_norm": 0.7385235986125652,
"kl": 0.0325927734375,
"learning_rate": 9.975191700496165e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0013,
"max_completion_length": 236.0,
"min_completion_length": 94.5,
"reward": 0.98828125,
"reward_std": 0.03697281330823898,
"rewards/format_reward": 0.98828125,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 11,
"temperature": 1.0
},
{
"completion_length": 142.58203125,
"epoch": 0.005411499436302142,
"grad_norm": 0.2699152956849861,
"kl": 0.0230712890625,
"learning_rate": 9.972936400541273e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0009,
"max_completion_length": 243.5,
"min_completion_length": 95.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 12,
"temperature": 1.0
},
{
"completion_length": 137.46484375,
"epoch": 0.005862457722660654,
"grad_norm": 4.498549971158498,
"kl": 0.0223388671875,
"learning_rate": 9.970681100586379e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0009,
"max_completion_length": 224.5,
"min_completion_length": 90.0,
"reward": 0.98828125,
"reward_std": 0.03697281330823898,
"rewards/format_reward": 0.98828125,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 13,
"temperature": 1.0
},
{
"completion_length": 137.58203125,
"epoch": 0.006313416009019166,
"grad_norm": 1.316359142547647,
"kl": 0.020263671875,
"learning_rate": 9.968425800631484e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0008,
"max_completion_length": 203.5,
"min_completion_length": 100.5,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 14,
"temperature": 1.0
},
{
"completion_length": 146.5625,
"epoch": 0.006764374295377677,
"grad_norm": 0.5411629015808064,
"kl": 0.01934814453125,
"learning_rate": 9.96617050067659e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0008,
"max_completion_length": 244.0,
"min_completion_length": 99.0,
"reward": 0.9921875,
"reward_std": 0.03125,
"rewards/format_reward": 0.9921875,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 15,
"temperature": 1.0
},
{
"completion_length": 145.5625,
"epoch": 0.007215332581736189,
"grad_norm": 0.5837134769756639,
"kl": 0.01678466796875,
"learning_rate": 9.963915200721696e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0007,
"max_completion_length": 265.0,
"min_completion_length": 89.0,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 16,
"temperature": 1.0
},
{
"completion_length": 135.44921875,
"epoch": 0.007666290868094701,
"grad_norm": 0.5126143243632768,
"kl": 0.02001953125,
"learning_rate": 9.961659900766802e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0008,
"max_completion_length": 247.5,
"min_completion_length": 99.0,
"reward": 0.9921875,
"reward_std": 0.03125,
"rewards/format_reward": 0.9921875,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 17,
"temperature": 1.0
},
{
"completion_length": 139.48046875,
"epoch": 0.008117249154453212,
"grad_norm": 0.8984743564170407,
"kl": 0.015838623046875,
"learning_rate": 9.959404600811907e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0006,
"max_completion_length": 233.0,
"min_completion_length": 84.0,
"reward": 0.98828125,
"reward_std": 0.046875,
"rewards/format_reward": 0.98828125,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 18,
"temperature": 1.0
},
{
"completion_length": 138.43359375,
"epoch": 0.008568207440811725,
"grad_norm": 0.42188832602144527,
"kl": 0.01397705078125,
"learning_rate": 9.957149300857013e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0006,
"max_completion_length": 231.5,
"min_completion_length": 95.0,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 19,
"temperature": 1.0
},
{
"completion_length": 131.62890625,
"epoch": 0.009019165727170236,
"grad_norm": 0.32128454476676716,
"kl": 0.01458740234375,
"learning_rate": 9.954894000902119e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0006,
"max_completion_length": 216.5,
"min_completion_length": 88.0,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 20,
"temperature": 1.0
},
{
"completion_length": 126.828125,
"epoch": 0.00947012401352875,
"grad_norm": 0.48546223545241646,
"kl": 0.0172119140625,
"learning_rate": 9.952638700947225e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0007,
"max_completion_length": 194.5,
"min_completion_length": 87.5,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 21,
"temperature": 1.0
},
{
"completion_length": 129.39453125,
"epoch": 0.00992108229988726,
"grad_norm": 0.4258953137179797,
"kl": 0.015869140625,
"learning_rate": 9.950383400992333e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0006,
"max_completion_length": 247.5,
"min_completion_length": 85.5,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 22,
"temperature": 1.0
},
{
"completion_length": 129.10546875,
"epoch": 0.010372040586245771,
"grad_norm": 0.2833217695971168,
"kl": 0.013458251953125,
"learning_rate": 9.948128101037438e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0005,
"max_completion_length": 192.5,
"min_completion_length": 86.0,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 23,
"temperature": 1.0
},
{
"completion_length": 135.796875,
"epoch": 0.010822998872604284,
"grad_norm": 0.48148300062802873,
"kl": 0.0255126953125,
"learning_rate": 9.945872801082544e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.001,
"max_completion_length": 259.5,
"min_completion_length": 89.0,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 24,
"temperature": 1.0
},
{
"completion_length": 123.671875,
"epoch": 0.011273957158962795,
"grad_norm": 0.15786664076306023,
"kl": 0.015167236328125,
"learning_rate": 9.94361750112765e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0006,
"max_completion_length": 207.5,
"min_completion_length": 83.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 25,
"temperature": 1.0
},
{
"completion_length": 122.7109375,
"epoch": 0.011724915445321308,
"grad_norm": 0.13036537143577448,
"kl": 0.0208740234375,
"learning_rate": 9.941362201172756e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0008,
"max_completion_length": 187.5,
"min_completion_length": 83.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 26,
"temperature": 1.0
},
{
"completion_length": 126.6171875,
"epoch": 0.01217587373167982,
"grad_norm": 1.025009042819294,
"kl": 0.01922607421875,
"learning_rate": 9.939106901217861e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0008,
"max_completion_length": 222.5,
"min_completion_length": 87.5,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 27,
"temperature": 1.0
},
{
"completion_length": 126.890625,
"epoch": 0.012626832018038332,
"grad_norm": 0.16079440644093526,
"kl": 0.0172119140625,
"learning_rate": 9.936851601262967e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0007,
"max_completion_length": 204.0,
"min_completion_length": 84.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 28,
"temperature": 1.0
},
{
"completion_length": 121.3671875,
"epoch": 0.013077790304396843,
"grad_norm": 0.19002133711259844,
"kl": 0.02008056640625,
"learning_rate": 9.934596301308073e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0008,
"max_completion_length": 192.5,
"min_completion_length": 85.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 29,
"temperature": 1.0
},
{
"completion_length": 120.9296875,
"epoch": 0.013528748590755355,
"grad_norm": 0.3356237867953282,
"kl": 0.02044677734375,
"learning_rate": 9.932341001353179e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0008,
"max_completion_length": 201.0,
"min_completion_length": 80.0,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 30,
"temperature": 1.0
},
{
"completion_length": 121.890625,
"epoch": 0.013979706877113867,
"grad_norm": 0.0972578202221844,
"kl": 0.0167236328125,
"learning_rate": 9.930085701398284e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0007,
"max_completion_length": 225.0,
"min_completion_length": 88.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 31,
"temperature": 1.0
},
{
"completion_length": 125.453125,
"epoch": 0.014430665163472379,
"grad_norm": 0.16574400218238333,
"kl": 0.01788330078125,
"learning_rate": 9.92783040144339e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0007,
"max_completion_length": 237.5,
"min_completion_length": 84.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 32,
"temperature": 1.0
},
{
"completion_length": 121.51953125,
"epoch": 0.014881623449830891,
"grad_norm": 0.11795457231634025,
"kl": 0.02032470703125,
"learning_rate": 9.925575101488498e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0008,
"max_completion_length": 235.0,
"min_completion_length": 86.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 33,
"temperature": 1.0
},
{
"completion_length": 124.75390625,
"epoch": 0.015332581736189402,
"grad_norm": 0.126113812729237,
"kl": 0.01666259765625,
"learning_rate": 9.923319801533604e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0007,
"max_completion_length": 183.0,
"min_completion_length": 87.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 34,
"temperature": 1.0
},
{
"completion_length": 124.16015625,
"epoch": 0.015783540022547914,
"grad_norm": 0.07662853380658595,
"kl": 0.01458740234375,
"learning_rate": 9.92106450157871e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0006,
"max_completion_length": 180.0,
"min_completion_length": 87.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 35,
"temperature": 1.0
},
{
"completion_length": 128.6484375,
"epoch": 0.016234498308906425,
"grad_norm": 0.4478082644592694,
"kl": 0.014801025390625,
"learning_rate": 9.918809201623815e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0006,
"max_completion_length": 208.0,
"min_completion_length": 84.5,
"reward": 0.9921875,
"reward_std": 0.03125,
"rewards/format_reward": 0.9921875,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 36,
"temperature": 1.0
},
{
"completion_length": 130.171875,
"epoch": 0.01668545659526494,
"grad_norm": 0.5289725828976242,
"kl": 0.01373291015625,
"learning_rate": 9.91655390166892e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0006,
"max_completion_length": 223.0,
"min_completion_length": 85.5,
"reward": 0.9921875,
"reward_std": 0.03125,
"rewards/format_reward": 0.9921875,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 37,
"temperature": 1.0
},
{
"completion_length": 132.4609375,
"epoch": 0.01713641488162345,
"grad_norm": 0.05806697595641871,
"kl": 0.013671875,
"learning_rate": 9.914298601714027e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0005,
"max_completion_length": 257.0,
"min_completion_length": 89.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 38,
"temperature": 1.0
},
{
"completion_length": 131.6796875,
"epoch": 0.01758737316798196,
"grad_norm": 0.07390387951612225,
"kl": 0.015869140625,
"learning_rate": 9.912043301759133e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0006,
"max_completion_length": 241.0,
"min_completion_length": 94.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 39,
"temperature": 1.0
},
{
"completion_length": 133.99609375,
"epoch": 0.018038331454340473,
"grad_norm": 0.5552696074810277,
"kl": 0.01605224609375,
"learning_rate": 9.909788001804238e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0006,
"max_completion_length": 380.0,
"min_completion_length": 91.5,
"reward": 0.98828125,
"reward_std": 0.03697281330823898,
"rewards/format_reward": 0.98828125,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 40,
"temperature": 1.0
},
{
"completion_length": 134.671875,
"epoch": 0.018489289740698984,
"grad_norm": 0.09608326888981661,
"kl": 0.0177001953125,
"learning_rate": 9.907532701849346e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0007,
"max_completion_length": 262.0,
"min_completion_length": 90.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 41,
"temperature": 1.0
},
{
"completion_length": 128.27734375,
"epoch": 0.0189402480270575,
"grad_norm": 0.22125892499138428,
"kl": 0.013702392578125,
"learning_rate": 9.905277401894452e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0005,
"max_completion_length": 251.0,
"min_completion_length": 88.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 42,
"temperature": 1.0
},
{
"completion_length": 135.359375,
"epoch": 0.01939120631341601,
"grad_norm": 0.08775045275198128,
"kl": 0.015533447265625,
"learning_rate": 9.903022101939558e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0006,
"max_completion_length": 291.5,
"min_completion_length": 83.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 43,
"temperature": 1.0
},
{
"completion_length": 135.53125,
"epoch": 0.01984216459977452,
"grad_norm": 0.12136918886632062,
"kl": 0.015167236328125,
"learning_rate": 9.900766801984663e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0006,
"max_completion_length": 288.5,
"min_completion_length": 88.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 44,
"temperature": 1.0
},
{
"completion_length": 128.52734375,
"epoch": 0.020293122886133032,
"grad_norm": 0.09774392937230035,
"kl": 0.010894775390625,
"learning_rate": 9.89851150202977e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 222.0,
"min_completion_length": 86.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 45,
"temperature": 1.0
},
{
"completion_length": 131.52734375,
"epoch": 0.020744081172491543,
"grad_norm": 0.07216886654321743,
"kl": 0.012908935546875,
"learning_rate": 9.896256202074875e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0005,
"max_completion_length": 246.0,
"min_completion_length": 89.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 46,
"temperature": 1.0
},
{
"completion_length": 134.6796875,
"epoch": 0.021195039458850057,
"grad_norm": 0.3652700320877076,
"kl": 0.011932373046875,
"learning_rate": 9.89400090211998e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0005,
"max_completion_length": 250.0,
"min_completion_length": 91.0,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 47,
"temperature": 1.0
},
{
"completion_length": 134.94140625,
"epoch": 0.02164599774520857,
"grad_norm": 0.0712237478501456,
"kl": 0.0118408203125,
"learning_rate": 9.891745602165089e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0005,
"max_completion_length": 262.0,
"min_completion_length": 89.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 48,
"temperature": 1.0
},
{
"completion_length": 131.27734375,
"epoch": 0.02209695603156708,
"grad_norm": 0.07461422124475418,
"kl": 0.010101318359375,
"learning_rate": 9.889490302210194e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 216.0,
"min_completion_length": 89.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 49,
"temperature": 1.0
},
{
"completion_length": 132.33984375,
"epoch": 0.02254791431792559,
"grad_norm": 2.3229008992621183,
"kl": 0.011199951171875,
"learning_rate": 9.8872350022553e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 225.0,
"min_completion_length": 89.5,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 50,
"temperature": 1.0
},
{
"completion_length": 129.7734375,
"epoch": 0.022998872604284102,
"grad_norm": 0.0875125604597023,
"kl": 0.015960693359375,
"learning_rate": 9.884979702300406e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0006,
"max_completion_length": 210.5,
"min_completion_length": 90.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 51,
"temperature": 1.0
},
{
"completion_length": 129.12890625,
"epoch": 0.023449830890642617,
"grad_norm": 0.16094378550497987,
"kl": 0.01641845703125,
"learning_rate": 9.882724402345512e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0007,
"max_completion_length": 203.5,
"min_completion_length": 86.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 52,
"temperature": 1.0
},
{
"completion_length": 130.06640625,
"epoch": 0.023900789177001128,
"grad_norm": 0.07783652453734344,
"kl": 0.013031005859375,
"learning_rate": 9.880469102390617e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0005,
"max_completion_length": 195.5,
"min_completion_length": 80.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 53,
"temperature": 1.0
},
{
"completion_length": 129.9453125,
"epoch": 0.02435174746335964,
"grad_norm": 0.6621735516896475,
"kl": 0.013824462890625,
"learning_rate": 9.878213802435723e-07,
"log_metrics/accuracy": 0.0036423311103135347,
"log_metrics/iou_log": 0.00390625,
"loss": 0.0006,
"max_completion_length": 208.0,
"min_completion_length": 92.5,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.9921875,
"rewards/iou_reward": 0.00390625,
"rewards/log_reward": 0.0,
"step": 54,
"temperature": 1.0
},
{
"completion_length": 135.72265625,
"epoch": 0.02480270574971815,
"grad_norm": 0.056449991094393824,
"kl": 0.00994873046875,
"learning_rate": 9.875958502480829e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 328.0,
"min_completion_length": 89.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 55,
"temperature": 1.0
},
{
"completion_length": 134.05078125,
"epoch": 0.025253664036076665,
"grad_norm": 0.6059033859841335,
"kl": 0.00775146484375,
"learning_rate": 9.873703202525937e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0003,
"max_completion_length": 228.5,
"min_completion_length": 89.5,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 56,
"temperature": 1.0
},
{
"completion_length": 132.85546875,
"epoch": 0.025704622322435176,
"grad_norm": 0.037734113426563874,
"kl": 0.009033203125,
"learning_rate": 9.871447902571042e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 240.5,
"min_completion_length": 92.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 57,
"temperature": 1.0
},
{
"completion_length": 132.78125,
"epoch": 0.026155580608793687,
"grad_norm": 0.09813082705622107,
"kl": 0.010498046875,
"learning_rate": 9.869192602616148e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 225.0,
"min_completion_length": 92.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 58,
"temperature": 1.0
},
{
"completion_length": 129.6796875,
"epoch": 0.026606538895152198,
"grad_norm": 0.272077654342032,
"kl": 0.011505126953125,
"learning_rate": 9.866937302661254e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0005,
"max_completion_length": 218.5,
"min_completion_length": 84.0,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 59,
"temperature": 1.0
},
{
"completion_length": 131.65234375,
"epoch": 0.02705749718151071,
"grad_norm": 0.0822946603987711,
"kl": 0.01092529296875,
"learning_rate": 9.86468200270636e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 209.0,
"min_completion_length": 84.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 60,
"temperature": 1.0
},
{
"completion_length": 130.19140625,
"epoch": 0.027508455467869224,
"grad_norm": 0.055871650503174664,
"kl": 0.01324462890625,
"learning_rate": 9.862426702751465e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0005,
"max_completion_length": 206.5,
"min_completion_length": 86.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 61,
"temperature": 1.0
},
{
"completion_length": 136.4375,
"epoch": 0.027959413754227735,
"grad_norm": 0.3732397114410677,
"kl": 0.010528564453125,
"learning_rate": 9.860171402796571e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 250.5,
"min_completion_length": 86.0,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 62,
"temperature": 1.0
},
{
"completion_length": 132.015625,
"epoch": 0.028410372040586246,
"grad_norm": 1.215038113924891,
"kl": 0.0142822265625,
"learning_rate": 9.857916102841677e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0006,
"max_completion_length": 285.5,
"min_completion_length": 82.0,
"reward": 0.9921875,
"reward_std": 0.03125,
"rewards/format_reward": 0.9921875,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 63,
"temperature": 1.0
},
{
"completion_length": 132.875,
"epoch": 0.028861330326944757,
"grad_norm": 0.3792773946237974,
"kl": 0.011932373046875,
"learning_rate": 9.855660802886783e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0005,
"max_completion_length": 221.5,
"min_completion_length": 88.5,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 64,
"temperature": 1.0
},
{
"completion_length": 129.75390625,
"epoch": 0.029312288613303268,
"grad_norm": 0.06882533947287736,
"kl": 0.010345458984375,
"learning_rate": 9.85340550293189e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 197.0,
"min_completion_length": 90.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 65,
"temperature": 1.0
},
{
"completion_length": 132.9609375,
"epoch": 0.029763246899661783,
"grad_norm": 0.7469897504399653,
"kl": 0.0126953125,
"learning_rate": 9.851150202976996e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0005,
"max_completion_length": 243.0,
"min_completion_length": 87.5,
"reward": 0.98828125,
"reward_std": 0.046875,
"rewards/format_reward": 0.98828125,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 66,
"temperature": 1.0
},
{
"completion_length": 130.66015625,
"epoch": 0.030214205186020294,
"grad_norm": 0.47470539847306675,
"kl": 0.01165771484375,
"learning_rate": 9.848894903022102e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0005,
"max_completion_length": 269.0,
"min_completion_length": 91.0,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 67,
"temperature": 1.0
},
{
"completion_length": 131.70703125,
"epoch": 0.030665163472378805,
"grad_norm": 0.38448758695832475,
"kl": 0.016815185546875,
"learning_rate": 9.846639603067208e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0007,
"max_completion_length": 259.5,
"min_completion_length": 91.0,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 68,
"temperature": 1.0
},
{
"completion_length": 138.6484375,
"epoch": 0.031116121758737316,
"grad_norm": 0.15390242756935207,
"kl": 0.01177978515625,
"learning_rate": 9.844384303112314e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0005,
"max_completion_length": 258.0,
"min_completion_length": 88.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 69,
"temperature": 1.0
},
{
"completion_length": 130.78515625,
"epoch": 0.03156708004509583,
"grad_norm": 0.6291008512876635,
"kl": 0.014617919921875,
"learning_rate": 9.84212900315742e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0006,
"max_completion_length": 232.5,
"min_completion_length": 91.5,
"reward": 0.9921875,
"reward_std": 0.03125,
"rewards/format_reward": 0.9921875,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 70,
"temperature": 1.0
},
{
"completion_length": 132.33984375,
"epoch": 0.03201803833145434,
"grad_norm": 0.05684071522205286,
"kl": 0.010711669921875,
"learning_rate": 9.839873703202525e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 269.0,
"min_completion_length": 90.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 71,
"temperature": 1.0
},
{
"completion_length": 127.94140625,
"epoch": 0.03246899661781285,
"grad_norm": 0.05157290002711523,
"kl": 0.009552001953125,
"learning_rate": 9.83761840324763e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 234.0,
"min_completion_length": 88.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 72,
"temperature": 1.0
},
{
"completion_length": 130.69140625,
"epoch": 0.032919954904171364,
"grad_norm": 0.36195683796616246,
"kl": 0.0101318359375,
"learning_rate": 9.835363103292737e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 274.5,
"min_completion_length": 90.5,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 73,
"temperature": 1.0
},
{
"completion_length": 133.31640625,
"epoch": 0.03337091319052988,
"grad_norm": 0.05629834594638958,
"kl": 0.014434814453125,
"learning_rate": 9.833107803337842e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0006,
"max_completion_length": 249.5,
"min_completion_length": 88.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 74,
"temperature": 1.0
},
{
"completion_length": 131.1171875,
"epoch": 0.033821871476888386,
"grad_norm": 0.06013498747067093,
"kl": 0.0140380859375,
"learning_rate": 9.830852503382948e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0006,
"max_completion_length": 264.0,
"min_completion_length": 87.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 75,
"temperature": 1.0
},
{
"completion_length": 129.15234375,
"epoch": 0.0342728297632469,
"grad_norm": 0.06570934118415175,
"kl": 0.009246826171875,
"learning_rate": 9.828597203428056e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 223.0,
"min_completion_length": 86.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 76,
"temperature": 1.0
},
{
"completion_length": 133.3125,
"epoch": 0.03472378804960541,
"grad_norm": 0.05957393968454419,
"kl": 0.0108642578125,
"learning_rate": 9.826341903473162e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 287.5,
"min_completion_length": 96.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 77,
"temperature": 1.0
},
{
"completion_length": 134.0,
"epoch": 0.03517474633596392,
"grad_norm": 0.05504430029751346,
"kl": 0.008758544921875,
"learning_rate": 9.824086603518268e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 221.0,
"min_completion_length": 92.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 78,
"temperature": 1.0
},
{
"completion_length": 130.1875,
"epoch": 0.03562570462232244,
"grad_norm": 0.06155937354569755,
"kl": 0.011016845703125,
"learning_rate": 9.821831303563373e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 217.0,
"min_completion_length": 88.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 79,
"temperature": 1.0
},
{
"completion_length": 130.0703125,
"epoch": 0.036076662908680945,
"grad_norm": 0.09185105404429818,
"kl": 0.00909423828125,
"learning_rate": 9.81957600360848e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 218.0,
"min_completion_length": 87.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 80,
"temperature": 1.0
},
{
"completion_length": 132.06640625,
"epoch": 0.03652762119503946,
"grad_norm": 0.04961982123085122,
"kl": 0.011138916015625,
"learning_rate": 9.817320703653585e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 231.0,
"min_completion_length": 84.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 81,
"temperature": 1.0
},
{
"completion_length": 131.515625,
"epoch": 0.03697857948139797,
"grad_norm": 0.0410521754154648,
"kl": 0.01123046875,
"learning_rate": 9.81506540369869e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 252.5,
"min_completion_length": 90.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 82,
"temperature": 1.0
},
{
"completion_length": 131.09765625,
"epoch": 0.03742953776775648,
"grad_norm": 0.029161244357827847,
"kl": 0.0074005126953125,
"learning_rate": 9.812810103743796e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0003,
"max_completion_length": 219.5,
"min_completion_length": 86.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 83,
"temperature": 1.0
},
{
"completion_length": 133.95703125,
"epoch": 0.037880496054115,
"grad_norm": 0.0550794318873668,
"kl": 0.011444091796875,
"learning_rate": 9.810554803788902e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0005,
"max_completion_length": 253.0,
"min_completion_length": 93.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 84,
"temperature": 1.0
},
{
"completion_length": 133.12890625,
"epoch": 0.038331454340473504,
"grad_norm": 0.3794360227093856,
"kl": 0.0089111328125,
"learning_rate": 9.80829950383401e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 235.0,
"min_completion_length": 85.5,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 85,
"temperature": 1.0
},
{
"completion_length": 130.875,
"epoch": 0.03878241262683202,
"grad_norm": 0.05260711761694229,
"kl": 0.008270263671875,
"learning_rate": 9.806044203879116e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0003,
"max_completion_length": 236.5,
"min_completion_length": 85.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 86,
"temperature": 1.0
},
{
"completion_length": 136.50390625,
"epoch": 0.03923337091319053,
"grad_norm": 0.03637909476903296,
"kl": 0.0069580078125,
"learning_rate": 9.803788903924222e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0003,
"max_completion_length": 215.0,
"min_completion_length": 91.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 87,
"temperature": 1.0
},
{
"completion_length": 131.37109375,
"epoch": 0.03968432919954904,
"grad_norm": 0.031058014010275966,
"kl": 0.013153076171875,
"learning_rate": 9.801533603969327e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0005,
"max_completion_length": 226.5,
"min_completion_length": 90.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 88,
"temperature": 1.0
},
{
"completion_length": 141.54296875,
"epoch": 0.040135287485907556,
"grad_norm": 0.04156967471425809,
"kl": 0.01055908203125,
"learning_rate": 9.799278304014433e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 287.5,
"min_completion_length": 92.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 89,
"temperature": 1.0
},
{
"completion_length": 133.45703125,
"epoch": 0.040586245772266064,
"grad_norm": 0.04432572627688822,
"kl": 0.0111083984375,
"learning_rate": 9.797023004059539e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 210.0,
"min_completion_length": 95.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 90,
"temperature": 1.0
},
{
"completion_length": 134.28125,
"epoch": 0.04103720405862458,
"grad_norm": 0.7407667374975673,
"kl": 0.022186279296875,
"learning_rate": 9.794767704104645e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0009,
"max_completion_length": 218.0,
"min_completion_length": 93.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 91,
"temperature": 1.0
},
{
"completion_length": 137.15625,
"epoch": 0.041488162344983086,
"grad_norm": 0.41097995708484697,
"kl": 0.011474609375,
"learning_rate": 9.792512404149752e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0005,
"max_completion_length": 248.0,
"min_completion_length": 85.0,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 92,
"temperature": 1.0
},
{
"completion_length": 137.140625,
"epoch": 0.0419391206313416,
"grad_norm": 0.04758776396448601,
"kl": 0.009674072265625,
"learning_rate": 9.790257104194858e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 262.0,
"min_completion_length": 85.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 93,
"temperature": 1.0
},
{
"completion_length": 141.03125,
"epoch": 0.042390078917700115,
"grad_norm": 0.2485793639149779,
"kl": 0.0203857421875,
"learning_rate": 9.788001804239964e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0008,
"max_completion_length": 269.5,
"min_completion_length": 98.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 94,
"temperature": 1.0
},
{
"completion_length": 134.24609375,
"epoch": 0.04284103720405862,
"grad_norm": 0.2540785535950062,
"kl": 0.0247802734375,
"learning_rate": 9.78574650428507e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.001,
"max_completion_length": 230.0,
"min_completion_length": 87.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 95,
"temperature": 1.0
},
{
"completion_length": 134.51953125,
"epoch": 0.04329199549041714,
"grad_norm": 0.16330960689627924,
"kl": 0.0205078125,
"learning_rate": 9.783491204330175e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0008,
"max_completion_length": 217.5,
"min_completion_length": 91.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 96,
"temperature": 1.0
},
{
"completion_length": 136.6953125,
"epoch": 0.043742953776775645,
"grad_norm": 0.14972836319565624,
"kl": 0.015106201171875,
"learning_rate": 9.781235904375281e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0006,
"max_completion_length": 232.0,
"min_completion_length": 88.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 97,
"temperature": 1.0
},
{
"completion_length": 135.2109375,
"epoch": 0.04419391206313416,
"grad_norm": 0.06666610741611737,
"kl": 0.010040283203125,
"learning_rate": 9.778980604420387e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 230.5,
"min_completion_length": 95.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 98,
"temperature": 1.0
},
{
"completion_length": 136.07421875,
"epoch": 0.044644870349492674,
"grad_norm": 0.03692527598375854,
"kl": 0.01007080078125,
"learning_rate": 9.776725304465493e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 216.0,
"min_completion_length": 87.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 99,
"temperature": 1.0
},
{
"completion_length": 138.29296875,
"epoch": 0.04509582863585118,
"grad_norm": 0.0697985947271456,
"kl": 0.01080322265625,
"learning_rate": 9.7744700045106e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 308.0,
"min_completion_length": 91.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 100,
"temperature": 1.0
},
{
"completion_length": 133.515625,
"epoch": 0.045546786922209696,
"grad_norm": 0.07309140477924224,
"kl": 0.010772705078125,
"learning_rate": 9.772214704555706e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 224.0,
"min_completion_length": 88.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 101,
"temperature": 1.0
},
{
"completion_length": 130.36328125,
"epoch": 0.045997745208568204,
"grad_norm": 0.05574240124141491,
"kl": 0.01068115234375,
"learning_rate": 9.769959404600812e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 208.0,
"min_completion_length": 89.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 102,
"temperature": 1.0
},
{
"completion_length": 139.234375,
"epoch": 0.04644870349492672,
"grad_norm": 0.03519854121109168,
"kl": 0.013427734375,
"learning_rate": 9.767704104645918e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0005,
"max_completion_length": 327.0,
"min_completion_length": 91.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 103,
"temperature": 1.0
},
{
"completion_length": 131.1015625,
"epoch": 0.04689966178128523,
"grad_norm": 0.03554996767560797,
"kl": 0.010955810546875,
"learning_rate": 9.765448804691024e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 238.5,
"min_completion_length": 73.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 104,
"temperature": 1.0
},
{
"completion_length": 136.60546875,
"epoch": 0.04735062006764374,
"grad_norm": 0.042584011523593555,
"kl": 0.01165771484375,
"learning_rate": 9.76319350473613e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0005,
"max_completion_length": 257.5,
"min_completion_length": 89.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 105,
"temperature": 1.0
},
{
"completion_length": 135.66796875,
"epoch": 0.047801578354002255,
"grad_norm": 0.04615459145131747,
"kl": 0.010589599609375,
"learning_rate": 9.760938204781235e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 310.5,
"min_completion_length": 87.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 106,
"temperature": 1.0
},
{
"completion_length": 135.6328125,
"epoch": 0.04825253664036077,
"grad_norm": 0.045923587392494976,
"kl": 0.010223388671875,
"learning_rate": 9.758682904826343e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 225.0,
"min_completion_length": 79.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 107,
"temperature": 1.0
},
{
"completion_length": 133.73046875,
"epoch": 0.04870349492671928,
"grad_norm": 0.044568286212471234,
"kl": 0.0089111328125,
"learning_rate": 9.756427604871449e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 217.5,
"min_completion_length": 92.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 108,
"temperature": 1.0
},
{
"completion_length": 131.06640625,
"epoch": 0.04915445321307779,
"grad_norm": 0.2549839899096623,
"kl": 0.0092620849609375,
"learning_rate": 9.754172304916554e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 229.0,
"min_completion_length": 83.5,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 109,
"temperature": 1.0
},
{
"completion_length": 133.9375,
"epoch": 0.0496054114994363,
"grad_norm": 0.28421440889201194,
"kl": 0.01019287109375,
"learning_rate": 9.75191700496166e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 246.0,
"min_completion_length": 93.0,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 110,
"temperature": 1.0
},
{
"completion_length": 135.9296875,
"epoch": 0.050056369785794814,
"grad_norm": 0.7286024428170503,
"kl": 0.01153564453125,
"learning_rate": 9.749661705006766e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0005,
"max_completion_length": 240.0,
"min_completion_length": 90.5,
"reward": 0.99609375,
"reward_std": 0.015625,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 111,
"temperature": 1.0
},
{
"completion_length": 135.11328125,
"epoch": 0.05050732807215333,
"grad_norm": 0.03670784769416828,
"kl": 0.010528564453125,
"learning_rate": 9.747406405051872e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 231.5,
"min_completion_length": 92.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 112,
"temperature": 1.0
},
{
"completion_length": 127.98828125,
"epoch": 0.05095828635851184,
"grad_norm": 0.042164123293671474,
"kl": 0.0087890625,
"learning_rate": 9.745151105096978e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 239.0,
"min_completion_length": 81.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 113,
"temperature": 1.0
},
{
"completion_length": 130.52734375,
"epoch": 0.05140924464487035,
"grad_norm": 0.04359303746661243,
"kl": 0.01031494140625,
"learning_rate": 9.742895805142083e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 265.5,
"min_completion_length": 82.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 114,
"temperature": 1.0
},
{
"completion_length": 133.34765625,
"epoch": 0.05186020293122886,
"grad_norm": 0.05312958242052849,
"kl": 0.009002685546875,
"learning_rate": 9.74064050518719e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 217.5,
"min_completion_length": 89.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 115,
"temperature": 1.0
},
{
"completion_length": 135.421875,
"epoch": 0.052311161217587374,
"grad_norm": 0.04383689920001928,
"kl": 0.0076446533203125,
"learning_rate": 9.738385205232295e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0003,
"max_completion_length": 257.5,
"min_completion_length": 89.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 116,
"temperature": 1.0
},
{
"completion_length": 129.4375,
"epoch": 0.05276211950394589,
"grad_norm": 0.046719015497805653,
"kl": 0.008270263671875,
"learning_rate": 9.7361299052774e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0003,
"max_completion_length": 225.0,
"min_completion_length": 87.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 117,
"temperature": 1.0
},
{
"completion_length": 134.2734375,
"epoch": 0.053213077790304396,
"grad_norm": 0.04862725334362555,
"kl": 0.00933837890625,
"learning_rate": 9.733874605322508e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 237.5,
"min_completion_length": 82.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 118,
"temperature": 1.0
},
{
"completion_length": 133.13671875,
"epoch": 0.05366403607666291,
"grad_norm": 0.10031986851741669,
"kl": 0.0108642578125,
"learning_rate": 9.731619305367614e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 243.5,
"min_completion_length": 89.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 119,
"temperature": 1.0
},
{
"completion_length": 132.49609375,
"epoch": 0.05411499436302142,
"grad_norm": 0.031112150572192358,
"kl": 0.0073394775390625,
"learning_rate": 9.72936400541272e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0003,
"max_completion_length": 288.0,
"min_completion_length": 86.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 120,
"temperature": 1.0
},
{
"completion_length": 134.01953125,
"epoch": 0.05456595264937993,
"grad_norm": 0.5940559108446518,
"kl": 0.013671875,
"learning_rate": 9.727108705457826e-07,
"log_metrics/accuracy": 0.003257421776652336,
"log_metrics/iou_log": 0.00390625,
"loss": 0.0005,
"max_completion_length": 258.0,
"min_completion_length": 85.5,
"reward": 1.0,
"reward_std": 0.03125,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.00390625,
"rewards/log_reward": 0.0,
"step": 121,
"temperature": 1.0
},
{
"completion_length": 139.04296875,
"epoch": 0.05501691093573845,
"grad_norm": 0.03639097352803687,
"kl": 0.007720947265625,
"learning_rate": 9.724853405502931e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0003,
"max_completion_length": 262.0,
"min_completion_length": 94.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 122,
"temperature": 1.0
},
{
"completion_length": 131.33984375,
"epoch": 0.055467869222096955,
"grad_norm": 0.029926586040708514,
"kl": 0.008544921875,
"learning_rate": 9.722598105548037e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0003,
"max_completion_length": 185.5,
"min_completion_length": 92.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 123,
"temperature": 1.0
},
{
"completion_length": 135.640625,
"epoch": 0.05591882750845547,
"grad_norm": 0.07007980580824669,
"kl": 0.008758544921875,
"learning_rate": 9.720342805593143e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 263.5,
"min_completion_length": 88.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 124,
"temperature": 1.0
},
{
"completion_length": 134.91015625,
"epoch": 0.05636978579481398,
"grad_norm": 0.038029665047696073,
"kl": 0.010833740234375,
"learning_rate": 9.718087505638249e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 301.5,
"min_completion_length": 93.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 125,
"temperature": 1.0
},
{
"completion_length": 136.80078125,
"epoch": 0.05682074408117249,
"grad_norm": 0.03399658389217789,
"kl": 0.010498046875,
"learning_rate": 9.715832205683354e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 253.5,
"min_completion_length": 90.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 126,
"temperature": 1.0
},
{
"completion_length": 132.7421875,
"epoch": 0.057271702367531006,
"grad_norm": 0.0288651577135693,
"kl": 0.0081787109375,
"learning_rate": 9.71357690572846e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0003,
"max_completion_length": 233.5,
"min_completion_length": 87.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 127,
"temperature": 1.0
},
{
"completion_length": 138.23828125,
"epoch": 0.057722660653889514,
"grad_norm": 0.029894092306988484,
"kl": 0.00921630859375,
"learning_rate": 9.711321605773566e-07,
"log_metrics/accuracy": 0.0,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 239.0,
"min_completion_length": 91.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 128,
"temperature": 1.0
},
{
"completion_length": 136.296875,
"epoch": 0.05817361894024803,
"grad_norm": 0.02817290776298427,
"kl": 0.009246826171875,
"learning_rate": 9.709066305818674e-07,
"log_metrics/accuracy": 0.001520317979156971,
"log_metrics/iou_log": 0.0,
"loss": 0.0004,
"max_completion_length": 316.5,
"min_completion_length": 83.5,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.0,
"rewards/log_reward": 0.0,
"step": 129,
"temperature": 1.0
},
{
"completion_length": 130.125,
"epoch": 0.058624577226606536,
"grad_norm": 0.6165285520370335,
"kl": 0.01171875,
"learning_rate": 9.70681100586378e-07,
"log_metrics/accuracy": 0.0029867857228964567,
"log_metrics/iou_log": 0.00390625,
"loss": 0.0005,
"max_completion_length": 216.0,
"min_completion_length": 82.0,
"reward": 1.00390625,
"reward_std": 0.015625,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.00390625,
"rewards/log_reward": 0.0,
"step": 130,
"temperature": 1.0
},
{
"completion_length": 133.953125,
"epoch": 0.05907553551296505,
"grad_norm": 0.4803658881359604,
"kl": 0.00921630859375,
"learning_rate": 9.704555705908885e-07,
"log_metrics/accuracy": 0.0028719999827444553,
"log_metrics/iou_log": 0.00390625,
"loss": 0.0004,
"max_completion_length": 246.0,
"min_completion_length": 79.0,
"reward": 1.00390625,
"reward_std": 0.015625,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.00390625,
"rewards/log_reward": 0.0,
"step": 131,
"temperature": 1.0
},
{
"completion_length": 129.59765625,
"epoch": 0.059526493799323565,
"grad_norm": 1.7435921168622184,
"kl": 0.01397705078125,
"learning_rate": 9.702300405953991e-07,
"log_metrics/accuracy": 0.03585699386894703,
"log_metrics/iou_log": 0.03515625,
"loss": 0.0006,
"max_completion_length": 238.0,
"min_completion_length": 69.5,
"reward": 1.0234375,
"reward_std": 0.125,
"rewards/format_reward": 0.98828125,
"rewards/iou_reward": 0.03515625,
"rewards/log_reward": 0.0,
"step": 132,
"temperature": 1.0
},
{
"completion_length": 117.4921875,
"epoch": 0.05997745208568207,
"grad_norm": 6.152409630252781,
"kl": 0.05615234375,
"learning_rate": 9.700045105999097e-07,
"log_metrics/accuracy": 0.3532668203115463,
"log_metrics/iou_log": 0.3984375,
"loss": 0.0022,
"max_completion_length": 229.0,
"min_completion_length": 62.0,
"reward": 1.3828125,
"reward_std": 0.4916256368160248,
"rewards/format_reward": 0.984375,
"rewards/iou_reward": 0.3984375,
"rewards/log_reward": 0.0,
"step": 133,
"temperature": 1.0
},
{
"completion_length": 99.53125,
"epoch": 0.06042841037204059,
"grad_norm": 24.520553591942157,
"kl": 0.1240234375,
"learning_rate": 9.697789806044203e-07,
"log_metrics/accuracy": 0.6385847628116608,
"log_metrics/iou_log": 0.6953125,
"loss": 0.0049,
"max_completion_length": 230.0,
"min_completion_length": 55.5,
"reward": 1.67578125,
"reward_std": 0.31701020896434784,
"rewards/format_reward": 0.98046875,
"rewards/iou_reward": 0.6953125,
"rewards/log_reward": 0.0,
"step": 134,
"temperature": 1.0
},
{
"completion_length": 102.87890625,
"epoch": 0.060879368658399095,
"grad_norm": 2.4974832118570727,
"kl": 0.113525390625,
"learning_rate": 9.695534506089308e-07,
"log_metrics/accuracy": 0.6731529831886292,
"log_metrics/iou_log": 0.76171875,
"loss": 0.0045,
"max_completion_length": 186.0,
"min_completion_length": 58.5,
"reward": 1.7578125,
"reward_std": 0.3095604404807091,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.76171875,
"rewards/log_reward": 0.0,
"step": 135,
"temperature": 1.0
},
{
"completion_length": 103.109375,
"epoch": 0.06133032694475761,
"grad_norm": 3.516339202499408,
"kl": 0.118408203125,
"learning_rate": 9.693279206134416e-07,
"log_metrics/accuracy": 0.6554303467273712,
"log_metrics/iou_log": 0.6796875,
"loss": 0.0047,
"max_completion_length": 226.5,
"min_completion_length": 54.0,
"reward": 1.6640625,
"reward_std": 0.28480498492717743,
"rewards/format_reward": 0.984375,
"rewards/iou_reward": 0.6796875,
"rewards/log_reward": 0.0,
"step": 136,
"temperature": 1.0
},
{
"completion_length": 104.65625,
"epoch": 0.061781285231116125,
"grad_norm": 4.120853025373372,
"kl": 0.180908203125,
"learning_rate": 9.691023906179522e-07,
"log_metrics/accuracy": 0.7559227645397186,
"log_metrics/iou_log": 0.8984375,
"loss": 0.0072,
"max_completion_length": 204.0,
"min_completion_length": 60.5,
"reward": 1.8828125,
"reward_std": 0.12466736882925034,
"rewards/format_reward": 0.984375,
"rewards/iou_reward": 0.8984375,
"rewards/log_reward": 0.0,
"step": 137,
"temperature": 1.0
},
{
"completion_length": 104.96484375,
"epoch": 0.06223224351747463,
"grad_norm": 1.9791163456647438,
"kl": 0.11181640625,
"learning_rate": 9.688768606224628e-07,
"log_metrics/accuracy": 0.6887724995613098,
"log_metrics/iou_log": 0.7109375,
"loss": 0.0045,
"max_completion_length": 246.0,
"min_completion_length": 60.0,
"reward": 1.70703125,
"reward_std": 0.21135114878416061,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.7109375,
"rewards/log_reward": 0.0,
"step": 138,
"temperature": 1.0
},
{
"completion_length": 99.19140625,
"epoch": 0.06268320180383315,
"grad_norm": 5.5467544556496335,
"kl": 0.114990234375,
"learning_rate": 9.686513306269734e-07,
"log_metrics/accuracy": 0.7241671979427338,
"log_metrics/iou_log": 0.82421875,
"loss": 0.0046,
"max_completion_length": 194.0,
"min_completion_length": 60.0,
"reward": 1.82421875,
"reward_std": 0.15309549123048782,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.82421875,
"rewards/log_reward": 0.0,
"step": 139,
"temperature": 1.0
},
{
"completion_length": 101.36328125,
"epoch": 0.06313416009019165,
"grad_norm": 12.142762617479395,
"kl": 0.11865234375,
"learning_rate": 9.68425800631484e-07,
"log_metrics/accuracy": 0.712556004524231,
"log_metrics/iou_log": 0.79296875,
"loss": 0.0047,
"max_completion_length": 190.0,
"min_completion_length": 55.5,
"reward": 1.78515625,
"reward_std": 0.2706931382417679,
"rewards/format_reward": 0.9921875,
"rewards/iou_reward": 0.79296875,
"rewards/log_reward": 0.0,
"step": 140,
"temperature": 1.0
},
{
"completion_length": 101.61328125,
"epoch": 0.06358511837655018,
"grad_norm": 3.640232102574435,
"kl": 0.1201171875,
"learning_rate": 9.682002706359945e-07,
"log_metrics/accuracy": 0.7055022418498993,
"log_metrics/iou_log": 0.765625,
"loss": 0.0048,
"max_completion_length": 211.5,
"min_completion_length": 55.5,
"reward": 1.765625,
"reward_std": 0.2804790586233139,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.765625,
"rewards/log_reward": 0.0,
"step": 141,
"temperature": 1.0
},
{
"completion_length": 97.63671875,
"epoch": 0.06403607666290868,
"grad_norm": 2.667593861521047,
"kl": 0.122314453125,
"learning_rate": 9.67974740640505e-07,
"log_metrics/accuracy": 0.7026576399803162,
"log_metrics/iou_log": 0.7890625,
"loss": 0.0049,
"max_completion_length": 207.0,
"min_completion_length": 58.5,
"reward": 1.78515625,
"reward_std": 0.24758073687553406,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.7890625,
"rewards/log_reward": 0.0,
"step": 142,
"temperature": 1.0
},
{
"completion_length": 100.74609375,
"epoch": 0.06448703494926719,
"grad_norm": 3.1074124076238916,
"kl": 0.123779296875,
"learning_rate": 9.677492106450157e-07,
"log_metrics/accuracy": 0.6754811108112335,
"log_metrics/iou_log": 0.75,
"loss": 0.005,
"max_completion_length": 209.5,
"min_completion_length": 55.5,
"reward": 1.74609375,
"reward_std": 0.268774151802063,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.75,
"rewards/log_reward": 0.0,
"step": 143,
"temperature": 1.0
},
{
"completion_length": 101.8125,
"epoch": 0.0649379932356257,
"grad_norm": 1.7944760480191515,
"kl": 0.1220703125,
"learning_rate": 9.675236806495264e-07,
"log_metrics/accuracy": 0.71114382147789,
"log_metrics/iou_log": 0.7890625,
"loss": 0.0049,
"max_completion_length": 217.0,
"min_completion_length": 56.5,
"reward": 1.78515625,
"reward_std": 0.27572914958000183,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.7890625,
"rewards/log_reward": 0.0,
"step": 144,
"temperature": 1.0
},
{
"completion_length": 99.64453125,
"epoch": 0.06538895152198422,
"grad_norm": 2.086624174741133,
"kl": 0.124267578125,
"learning_rate": 9.67298150654037e-07,
"log_metrics/accuracy": 0.748405933380127,
"log_metrics/iou_log": 0.83984375,
"loss": 0.005,
"max_completion_length": 213.0,
"min_completion_length": 63.5,
"reward": 1.8359375,
"reward_std": 0.23154567182064056,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.83984375,
"rewards/log_reward": 0.0,
"step": 145,
"temperature": 1.0
},
{
"completion_length": 96.7890625,
"epoch": 0.06583990980834273,
"grad_norm": 3.3643857707503435,
"kl": 0.12841796875,
"learning_rate": 9.670726206585476e-07,
"log_metrics/accuracy": 0.7452348172664642,
"log_metrics/iou_log": 0.8515625,
"loss": 0.0052,
"max_completion_length": 176.0,
"min_completion_length": 59.0,
"reward": 1.8515625,
"reward_std": 0.23864974081516266,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.8515625,
"rewards/log_reward": 0.0,
"step": 146,
"temperature": 1.0
},
{
"completion_length": 97.89453125,
"epoch": 0.06629086809470124,
"grad_norm": 2.0566073759257706,
"kl": 0.1337890625,
"learning_rate": 9.668470906630582e-07,
"log_metrics/accuracy": 0.7216435968875885,
"log_metrics/iou_log": 0.79296875,
"loss": 0.0054,
"max_completion_length": 206.0,
"min_completion_length": 58.5,
"reward": 1.79296875,
"reward_std": 0.13226625323295593,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.79296875,
"rewards/log_reward": 0.0,
"step": 147,
"temperature": 1.0
},
{
"completion_length": 97.98828125,
"epoch": 0.06674182638105976,
"grad_norm": 1.379192128046406,
"kl": 0.1318359375,
"learning_rate": 9.666215606675687e-07,
"log_metrics/accuracy": 0.622128963470459,
"log_metrics/iou_log": 0.65625,
"loss": 0.0053,
"max_completion_length": 198.0,
"min_completion_length": 56.5,
"reward": 1.65625,
"reward_std": 0.21148452162742615,
"rewards/format_reward": 1.0,
"rewards/iou_reward": 0.65625,
"rewards/log_reward": 0.0,
"step": 148,
"temperature": 1.0
},
{
"completion_length": 97.74609375,
"epoch": 0.06719278466741826,
"grad_norm": 1.740571009912497,
"kl": 0.118896484375,
"learning_rate": 9.663960306720793e-07,
"log_metrics/accuracy": 0.7791113555431366,
"log_metrics/iou_log": 0.8515625,
"loss": 0.0048,
"max_completion_length": 203.0,
"min_completion_length": 54.0,
"reward": 1.84765625,
"reward_std": 0.15834103524684906,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.8515625,
"rewards/log_reward": 0.0,
"step": 149,
"temperature": 1.0
},
{
"completion_length": 97.7734375,
"epoch": 0.06764374295377677,
"grad_norm": 1.832460748446639,
"kl": 0.1318359375,
"learning_rate": 9.6617050067659e-07,
"log_metrics/accuracy": 0.7626213431358337,
"log_metrics/iou_log": 0.83203125,
"loss": 0.0053,
"max_completion_length": 151.5,
"min_completion_length": 63.0,
"reward": 1.828125,
"reward_std": 0.22953035682439804,
"rewards/format_reward": 0.99609375,
"rewards/iou_reward": 0.83203125,
"rewards/log_reward": 0.0,
"step": 150,
"temperature": 1.0
}
],
"logging_steps": 1.0,
"max_steps": 4434,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}