{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 389.875, "epoch": 0.005, "grad_norm": 0.0, "kl_div": 0.0024652908323332667, "learning_rate": 1e-07, "loss": 2.689900156838121e-05, "reinforce_loss": -2.081711530685425, "reward": 0.0, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 1, "warning": 0 }, { "completion_length": 438.0, "epoch": 0.01, "grad_norm": 0.0, "kl_div": 0.002304812252987176, "learning_rate": 2e-07, "loss": 3.394197756279027e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 2, "warning": 0 }, { "completion_length": 343.625, "epoch": 0.015, "grad_norm": 0.0, "kl_div": 0.004121052101254463, "learning_rate": 3e-07, "loss": 5.875274655409157e-05, "reinforce_loss": -2.357510566711426, "reward": 7.450580596923828e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 3, "warning": 0 }, { "completion_length": 424.25, "epoch": 0.02, "grad_norm": 0.0, "kl_div": 0.002568428695667535, "learning_rate": 4e-07, "loss": 2.8150654543424025e-05, "reinforce_loss": -1.6755762100219727, "reward": 1.4901161193847656e-08, "reward_max": 1.4992505311965942, "reward_mean": 1.4901161193847656e-08, "reward_min": -0.49975016713142395, "reward_std": 0.5891520529985428, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.15625, "step": 4, "warning": 0 }, { "completion_length": 425.625, "epoch": 0.025, "grad_norm": 0.0, "kl_div": 0.0027664770605042577, "learning_rate": 5e-07, "loss": 4.087784509465564e-05, "reinforce_loss": -0.6637083292007446, "reward": 0.0, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 5, "warning": 0 }, { "completion_length": 207.125, "epoch": 0.03, "grad_norm": 0.0, "kl_div": 0.0042053768411278725, "learning_rate": 6e-07, "loss": 4.389536115922965e-05, "reinforce_loss": -1.0390745401382446, "reward": 0.0, "reward_max": 0.5455992817878723, "reward_mean": 2.9507249421634185e-10, "reward_min": -0.42435500025749207, "reward_std": 0.34537841379642487, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.21875, "step": 6, "warning": 0 }, { "completion_length": 200.5, "epoch": 0.035, "grad_norm": 0.0, "kl_div": 0.005972708575427532, "learning_rate": 7e-07, "loss": 4.1292132664239034e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 7, "warning": 0 }, { "completion_length": 296.25, "epoch": 0.04, "grad_norm": 0.0, "kl_div": 0.00496573734562844, "learning_rate": 8e-07, "loss": 2.4183737878047395e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 8, "warning": 0 }, { "completion_length": 310.125, "epoch": 0.045, "grad_norm": 0.0, "kl_div": 0.005726104602217674, "learning_rate": 9e-07, "loss": 2.4668616788403597e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 9, "warning": 0 }, { "completion_length": 282.5, "epoch": 0.05, "grad_norm": 0.0, "kl_div": 0.0038934459444135427, "learning_rate": 1e-06, "loss": 3.763325912586879e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 10, "warning": 0 }, { "completion_length": 392.625, "epoch": 0.055, "grad_norm": 0.0, "kl_div": 0.004181380150839686, "learning_rate": 1.1e-06, "loss": 4.593009361997247e-05, "reinforce_loss": 0.9842181205749512, "reward": 1.1175870895385742e-08, "reward_max": 1.4025049209594727, "reward_mean": 7.450580596923828e-09, "reward_min": -0.6415429264307022, "reward_std": 0.6777083277702332, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3125, "step": 11, "warning": 0 }, { "completion_length": 317.875, "epoch": 0.06, "grad_norm": 0.0, "kl_div": 0.0026295960415154696, "learning_rate": 1.2e-06, "loss": 3.523451141518308e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 12, "warning": 0 }, { "completion_length": 399.75, "epoch": 0.065, "grad_norm": 0.0, "kl_div": 0.0022935228189453483, "learning_rate": 1.3e-06, "loss": 4.2767487684614025e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 13, "warning": 0 }, { "completion_length": 319.375, "epoch": 0.07, "grad_norm": 0.0, "kl_div": 0.002985095838084817, "learning_rate": 1.4e-06, "loss": 2.2724162136000814e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 14, "warning": 0 }, { "completion_length": 384.0, "epoch": 0.075, "grad_norm": 0.0, "kl_div": 0.003995223436504602, "learning_rate": 1.5e-06, "loss": 2.8646978535107337e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 15, "warning": 0 }, { "completion_length": 336.625, "epoch": 0.08, "grad_norm": 0.0, "kl_div": 0.006195983849465847, "learning_rate": 1.6e-06, "loss": 1.8612197891343385e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 16, "warning": 0 }, { "completion_length": 364.75, "epoch": 0.085, "grad_norm": 0.0, "kl_div": 0.0036015671212226152, "learning_rate": 1.6999999999999998e-06, "loss": 5.09898363816319e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 17, "warning": 0 }, { "completion_length": 400.0, "epoch": 0.09, "grad_norm": 0.0, "kl_div": 0.002769404381979257, "learning_rate": 1.8e-06, "loss": 3.3756236007320695e-05, "reinforce_loss": 1.4402513243258, "reward": 7.450580596923828e-09, "reward_max": 1.4567568898200989, "reward_mean": 7.450580596923828e-09, "reward_min": -0.6034034341573715, "reward_std": 0.6464022770524025, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 18, "warning": 0 }, { "completion_length": 244.125, "epoch": 0.095, "grad_norm": 0.0, "kl_div": 0.010267159435898066, "learning_rate": 1.8999999999999998e-06, "loss": 2.0851229237450752e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 19, "warning": 0 }, { "completion_length": 360.75, "epoch": 0.1, "grad_norm": 0.0, "kl_div": 0.004437663941644132, "learning_rate": 2e-06, "loss": 2.949235113192117e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 20, "warning": 0 }, { "completion_length": 320.75, "epoch": 0.105, "grad_norm": 0.0, "kl_div": 0.0030930734938010573, "learning_rate": 1.9998476951563913e-06, "loss": 3.9009461033856496e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 21, "warning": 0 }, { "completion_length": 246.375, "epoch": 0.11, "grad_norm": 0.0, "kl_div": 0.004803084302693605, "learning_rate": 1.9993908270190957e-06, "loss": 4.433061076269951e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 22, "warning": 0 }, { "completion_length": 377.25, "epoch": 0.115, "grad_norm": 0.0, "kl_div": 0.003878451883792877, "learning_rate": 1.998629534754574e-06, "loss": 2.3614016754436307e-05, "reinforce_loss": 1.9879957437515259, "reward": 0.0, "reward_max": 0.43293771147727966, "reward_mean": 0.0, "reward_min": -0.43293771147727966, "reward_std": 0.36085928976535797, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 23, "warning": 0 }, { "completion_length": 267.75, "epoch": 0.12, "grad_norm": 0.0, "kl_div": 0.00566307152621448, "learning_rate": 1.997564050259824e-06, "loss": 3.9344242395600304e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 24, "warning": 0 }, { "completion_length": 222.375, "epoch": 0.125, "grad_norm": 0.0, "kl_div": 0.005958934547379613, "learning_rate": 1.9961946980917456e-06, "loss": 3.059400296478998e-05, "reinforce_loss": -2.8684964179992676, "reward": 0.0, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 25, "warning": 0 }, { "completion_length": 330.75, "epoch": 0.13, "grad_norm": 0.0, "kl_div": 0.00351740384940058, "learning_rate": 1.994521895368273e-06, "loss": 3.0958593924879096e-05, "reinforce_loss": -2.1411049365997314, "reward": 0.0, "reward_max": 0.43293771147727966, "reward_mean": 0.0, "reward_min": -0.43293771147727966, "reward_std": 0.36085928976535797, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 26, "warning": 0 }, { "completion_length": 330.625, "epoch": 0.135, "grad_norm": 0.0, "kl_div": 0.0047903163358569145, "learning_rate": 1.992546151641322e-06, "loss": 4.207479469187092e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 27, "warning": 0 }, { "completion_length": 220.75, "epoch": 0.14, "grad_norm": 0.0, "kl_div": 0.005703125614672899, "learning_rate": 1.99026806874157e-06, "loss": 8.662217078381218e-05, "reinforce_loss": -4.32262122631073, "reward": 3.412944815650576e-09, "reward_max": 1.4996501207351685, "reward_mean": 6.521162276840187e-09, "reward_min": -0.49988336861133575, "reward_std": 0.651803269982338, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.21875, "step": 28, "warning": 0 }, { "completion_length": 433.125, "epoch": 0.145, "grad_norm": 0.0, "kl_div": 0.002054017852060497, "learning_rate": 1.9876883405951377e-06, "loss": 4.1989251258200966e-05, "reinforce_loss": 0.0058427536860108376, "reward": 7.450580596923828e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 29, "warning": 0 }, { "completion_length": 286.25, "epoch": 0.15, "grad_norm": 0.0, "kl_div": 0.00513452710583806, "learning_rate": 1.984807753012208e-06, "loss": 3.149910207866924e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 30, "warning": 0 }, { "completion_length": 376.5, "epoch": 0.155, "grad_norm": 0.0, "kl_div": 0.004596561484504491, "learning_rate": 1.981627183447664e-06, "loss": 4.240875023242552e-05, "reinforce_loss": -2.439726948738098, "reward": 3.725290298461914e-09, "reward_max": 1.4997000694274902, "reward_mean": 1.4901161193847656e-08, "reward_min": -0.4999000132083893, "reward_std": 0.6830318570137024, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 31, "warning": 0 }, { "completion_length": 369.625, "epoch": 0.16, "grad_norm": 0.0, "kl_div": 0.0038089824374765158, "learning_rate": 1.9781476007338054e-06, "loss": 3.030582684004912e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 32, "warning": 0 }, { "completion_length": 341.25, "epoch": 0.165, "grad_norm": 0.0, "kl_div": 0.004064984852448106, "learning_rate": 1.9743700647852355e-06, "loss": 2.5097384423133917e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 33, "warning": 0 }, { "completion_length": 419.875, "epoch": 0.17, "grad_norm": 0.0, "kl_div": 0.004246899508871138, "learning_rate": 1.9702957262759963e-06, "loss": 3.744237073988188e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 34, "warning": 0 }, { "completion_length": 286.0, "epoch": 0.175, "grad_norm": 0.0, "kl_div": 0.003998161060735583, "learning_rate": 1.965925826289068e-06, "loss": 3.475222365523223e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 35, "warning": 0 }, { "completion_length": 310.125, "epoch": 0.18, "grad_norm": 0.0, "kl_div": 0.0067025176249444485, "learning_rate": 1.9612616959383188e-06, "loss": 5.352716470952146e-05, "reinforce_loss": 1.3420970290899277, "reward": 1.1175870895385742e-08, "reward_max": 1.4997000694274902, "reward_mean": 1.4901161193847656e-08, "reward_min": -0.4999000132083893, "reward_std": 0.6830318570137024, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 36, "warning": 0 }, { "completion_length": 367.75, "epoch": 0.185, "grad_norm": 0.0, "kl_div": 0.004387187073007226, "learning_rate": 1.9563047559630356e-06, "loss": 2.930309892690275e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 37, "warning": 0 }, { "completion_length": 329.625, "epoch": 0.19, "grad_norm": 0.0, "kl_div": 0.003985945135354996, "learning_rate": 1.9510565162951534e-06, "loss": 3.388912318769144e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 38, "warning": 0 }, { "completion_length": 226.875, "epoch": 0.195, "grad_norm": 0.0, "kl_div": 0.008749458938837051, "learning_rate": 1.945518575599317e-06, "loss": 4.3733822167268954e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 39, "warning": 0 }, { "completion_length": 299.875, "epoch": 0.2, "grad_norm": 0.0, "kl_div": 0.004355705808848143, "learning_rate": 1.9396926207859082e-06, "loss": 2.14438346120005e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 40, "warning": 0 }, { "completion_length": 386.5, "epoch": 0.205, "grad_norm": 0.0, "kl_div": 0.0073004127480089664, "learning_rate": 1.9335804264972015e-06, "loss": 0.00018038249618257396, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 41, "warning": 0 }, { "completion_length": 332.125, "epoch": 0.21, "grad_norm": 0.0, "kl_div": 0.007190827513113618, "learning_rate": 1.9271838545667875e-06, "loss": 4.538575012702495e-05, "reinforce_loss": -1.2670938968658447, "reward": 3.725290298461914e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 42, "warning": 0 }, { "completion_length": 234.625, "epoch": 0.215, "grad_norm": 0.0, "kl_div": 0.0050254217348992825, "learning_rate": 1.9205048534524403e-06, "loss": 4.025547423225362e-05, "reinforce_loss": 1.1434005498886108, "reward": 0.0, "reward_max": 0.39159291982650757, "reward_mean": 0.0, "reward_min": -0.6526548862457275, "reward_std": 0.33619239926338196, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3125, "step": 43, "warning": 0 }, { "completion_length": 399.625, "epoch": 0.22, "grad_norm": 0.0, "kl_div": 0.004784395627211779, "learning_rate": 1.9135454576426007e-06, "loss": 2.125667924701702e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 44, "warning": 0 }, { "completion_length": 403.375, "epoch": 0.225, "grad_norm": 0.0, "kl_div": 0.0041511922609061, "learning_rate": 1.9063077870366499e-06, "loss": 3.388222467037849e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 45, "warning": 0 }, { "completion_length": 243.75, "epoch": 0.23, "grad_norm": 0.0, "kl_div": 0.005325152073055506, "learning_rate": 1.8987940462991669e-06, "loss": 5.25302421010565e-05, "reinforce_loss": 0.044938743114471436, "reward": 0.0, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 46, "warning": 0 }, { "completion_length": 318.5, "epoch": 0.235, "grad_norm": 0.0, "kl_div": 0.005990996723994613, "learning_rate": 1.8910065241883678e-06, "loss": 4.268721750122495e-05, "reinforce_loss": -2.43072172626853, "reward": 1.4901161193847656e-08, "reward_max": 1.4997000694274902, "reward_mean": 1.4901161193847656e-08, "reward_min": -0.4999000132083893, "reward_std": 0.6830318570137024, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 47, "warning": 0 }, { "completion_length": 332.625, "epoch": 0.24, "grad_norm": 0.0, "kl_div": 0.003994525643065572, "learning_rate": 1.8829475928589268e-06, "loss": 2.510198737581959e-05, "reinforce_loss": -1.0816690921783447, "reward": 7.450580596923828e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 48, "warning": 0 }, { "completion_length": 233.875, "epoch": 0.245, "grad_norm": 0.0, "kl_div": 0.006486562779173255, "learning_rate": 1.8746197071393956e-06, "loss": 4.779673145094421e-05, "reinforce_loss": -0.2872549891471863, "reward": 1.3526161524168856e-08, "reward_max": 1.4997000694274902, "reward_mean": 1.3590114811279363e-08, "reward_min": -0.4999000132083893, "reward_std": 0.6830952018499374, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 49, "warning": 0 }, { "completion_length": 336.625, "epoch": 0.25, "grad_norm": 0.0, "kl_div": 0.004392045782878995, "learning_rate": 1.8660254037844386e-06, "loss": 3.21057441396988e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 50, "warning": 0 }, { "completion_length": 304.5, "epoch": 0.255, "grad_norm": 0.0, "kl_div": 0.003484749235212803, "learning_rate": 1.8571673007021123e-06, "loss": 2.3217376110551413e-05, "reinforce_loss": -1.3902740478515625, "reward": 1.4024621464159281e-09, "reward_max": 0.7497001886367798, "reward_mean": 7.853787842293514e-09, "reward_min": -0.24990005791187286, "reward_std": 0.2789834886789322, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 51, "warning": 0 }, { "completion_length": 353.75, "epoch": 0.26, "grad_norm": 0.0, "kl_div": 0.003450308693572879, "learning_rate": 1.8480480961564257e-06, "loss": 3.571236629795749e-05, "reinforce_loss": 2.141695976257324, "reward": 3.725290298461914e-09, "reward_max": 0.7494004964828491, "reward_mean": 7.450580596923828e-09, "reward_min": -0.2498001605272293, "reward_std": 0.2476361244916916, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.03125, "step": 52, "warning": 0 }, { "completion_length": 277.5, "epoch": 0.265, "grad_norm": 0.0, "kl_div": 0.006728983484208584, "learning_rate": 1.838670567945424e-06, "loss": 3.710938290168997e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 53, "warning": 0 }, { "completion_length": 250.5, "epoch": 0.27, "grad_norm": 0.0, "kl_div": 0.006101694190874696, "learning_rate": 1.8290375725550415e-06, "loss": 4.1359570786880795e-05, "reinforce_loss": -0.41635100543498993, "reward": -1.1175870895385742e-08, "reward_max": 1.4760686159133911, "reward_mean": 0.0, "reward_min": -0.5800493806600571, "reward_std": 0.6763399988412857, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.28125, "step": 54, "warning": 0 }, { "completion_length": 291.25, "epoch": 0.275, "grad_norm": 0.0, "kl_div": 0.006650205934420228, "learning_rate": 1.8191520442889917e-06, "loss": 4.112614624318667e-05, "reinforce_loss": -0.05228543281555176, "reward": 0.0, "reward_max": 0.7498000860214233, "reward_mean": 0.0, "reward_min": -0.2499333620071411, "reward_std": 0.31025150418281555, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.09375, "step": 55, "warning": 0 }, { "completion_length": 252.625, "epoch": 0.28, "grad_norm": 0.0, "kl_div": 0.005602831020951271, "learning_rate": 1.8090169943749474e-06, "loss": 5.373989915824495e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 56, "warning": 0 }, { "completion_length": 385.25, "epoch": 0.285, "grad_norm": 0.0, "kl_div": 0.0038474855246022344, "learning_rate": 1.7986355100472927e-06, "loss": 2.705613678699592e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 57, "warning": 0 }, { "completion_length": 380.25, "epoch": 0.29, "grad_norm": 0.0, "kl_div": 0.0029824297525919974, "learning_rate": 1.7880107536067217e-06, "loss": 4.4811640691477805e-05, "reinforce_loss": -2.3075480461120605, "reward": 3.725290298461914e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 58, "warning": 0 }, { "completion_length": 269.125, "epoch": 0.295, "grad_norm": 0.0, "kl_div": 0.004293393110856414, "learning_rate": 1.7771459614569707e-06, "loss": 4.70823360956274e-05, "reinforce_loss": -0.0201259832829237, "reward": 0.0, "reward_max": 0.6526548862457275, "reward_mean": 0.0, "reward_min": -0.39159291982650757, "reward_std": 0.3362482786178589, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 59, "warning": 0 }, { "completion_length": 270.0, "epoch": 0.3, "grad_norm": 0.0, "kl_div": 0.0054718246683478355, "learning_rate": 1.766044443118978e-06, "loss": 3.922827909264015e-05, "reinforce_loss": -0.8225432634353638, "reward": 1.046844211316511e-08, "reward_max": 1.499550223350525, "reward_mean": 1.4585076257844776e-08, "reward_min": -0.4998500645160675, "reward_std": 0.6204904019832611, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 60, "warning": 0 }, { "completion_length": 344.875, "epoch": 0.305, "grad_norm": 0.0, "kl_div": 0.004222210263833404, "learning_rate": 1.7547095802227721e-06, "loss": 3.131217454210855e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 61, "warning": 0 }, { "completion_length": 314.5, "epoch": 0.31, "grad_norm": 0.0, "kl_div": 0.004381764214485884, "learning_rate": 1.743144825477394e-06, "loss": 3.6830904718954116e-05, "reinforce_loss": 1.131558895111084, "reward": 7.450580596923828e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 62, "warning": 0 }, { "completion_length": 309.0, "epoch": 0.315, "grad_norm": 0.0, "kl_div": 0.0033557538990862668, "learning_rate": 1.7313537016191704e-06, "loss": 2.9077909857733175e-05, "reinforce_loss": -0.632893979549408, "reward": 2.5478668064238263e-09, "reward_max": 0.7498500347137451, "reward_mean": 0.0, "reward_min": -0.24995000660419464, "reward_std": 0.3416033834218979, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 63, "warning": 0 }, { "completion_length": 344.75, "epoch": 0.32, "grad_norm": 0.0, "kl_div": 0.005219147773459554, "learning_rate": 1.719339800338651e-06, "loss": 2.4095863409456797e-05, "reinforce_loss": -1.6702743768692017, "reward": 0.0, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 64, "warning": 0 }, { "completion_length": 295.0, "epoch": 0.325, "grad_norm": 0.0, "kl_div": 0.008704626467078924, "learning_rate": 1.7071067811865474e-06, "loss": 3.565949373296462e-05, "reinforce_loss": 0.009210370481014252, "reward": 0.0, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 65, "warning": 0 }, { "completion_length": 280.5, "epoch": 0.33, "grad_norm": 0.0, "kl_div": 0.0059855489525943995, "learning_rate": 1.6946583704589972e-06, "loss": 4.989975241187494e-05, "reinforce_loss": -4.200153112411499, "reward": 6.969897548003701e-09, "reward_max": 1.4996501207351685, "reward_mean": 6.969897548003701e-09, "reward_min": -0.49988336861133575, "reward_std": 0.651769146323204, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.21875, "step": 66, "warning": 0 }, { "completion_length": 370.375, "epoch": 0.335, "grad_norm": 0.0, "kl_div": 0.0033422550186514854, "learning_rate": 1.6819983600624985e-06, "loss": 4.219137008476537e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 67, "warning": 0 }, { "completion_length": 303.5, "epoch": 0.34, "grad_norm": 0.0, "kl_div": 0.005509930197149515, "learning_rate": 1.669130606358858e-06, "loss": 1.9536934360075975e-05, "reinforce_loss": -2.1764976978302, "reward": 7.450580596923828e-09, "reward_max": 0.7497001886367798, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24990005791187286, "reward_std": 0.27897267043590546, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 68, "warning": 0 }, { "completion_length": 311.0, "epoch": 0.345, "grad_norm": 0.0, "kl_div": 0.004351093899458647, "learning_rate": 1.6560590289905071e-06, "loss": 2.8206985462020384e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 69, "warning": 0 }, { "completion_length": 248.5, "epoch": 0.35, "grad_norm": 0.0, "kl_div": 0.005079559748992324, "learning_rate": 1.6427876096865393e-06, "loss": 5.014676571590826e-05, "reinforce_loss": -2.2033958435058594, "reward": 0.0, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 70, "warning": 0 }, { "completion_length": 343.5, "epoch": 0.355, "grad_norm": 0.0, "kl_div": 0.005345726967789233, "learning_rate": 1.6293203910498375e-06, "loss": 3.858805575873703e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 71, "warning": 0 }, { "completion_length": 456.125, "epoch": 0.36, "grad_norm": 0.0, "kl_div": 0.002764557837508619, "learning_rate": 1.615661475325658e-06, "loss": 3.0939889256842434e-05, "reinforce_loss": -0.40432941913604736, "reward": 7.450580596923828e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 72, "warning": 0 }, { "completion_length": 378.25, "epoch": 0.365, "grad_norm": 0.0, "kl_div": 0.004045464680530131, "learning_rate": 1.6018150231520484e-06, "loss": 5.938168214925099e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 73, "warning": 0 }, { "completion_length": 280.375, "epoch": 0.37, "grad_norm": 0.0, "kl_div": 0.005999768851324916, "learning_rate": 1.587785252292473e-06, "loss": 3.983633996540448e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 74, "warning": 0 }, { "completion_length": 405.375, "epoch": 0.375, "grad_norm": 0.0, "kl_div": 0.0027218845207244158, "learning_rate": 1.573576436351046e-06, "loss": 2.5808294594753534e-05, "reinforce_loss": -0.5276992321014404, "reward": 3.725290298461914e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 75, "warning": 0 }, { "completion_length": 305.875, "epoch": 0.38, "grad_norm": 0.0, "kl_div": 0.006257050670683384, "learning_rate": 1.5591929034707466e-06, "loss": 3.807945449807448e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 76, "warning": 0 }, { "completion_length": 224.5, "epoch": 0.385, "grad_norm": 0.0, "kl_div": 0.006410181755200028, "learning_rate": 1.544639035015027e-06, "loss": 4.560468551062513e-05, "reinforce_loss": 0.8713245987892151, "reward": 1.3926317876666872e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.7987385438405e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415263146162033, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 77, "warning": 0 }, { "completion_length": 349.875, "epoch": 0.39, "grad_norm": 0.0, "kl_div": 0.0022797980345785618, "learning_rate": 1.5299192642332049e-06, "loss": 2.5755748538358603e-05, "reinforce_loss": -0.7357044517993927, "reward": 6.108550554273506e-09, "reward_max": 1.4997000694274902, "reward_mean": 1.0412302131612705e-08, "reward_min": -0.4999000132083893, "reward_std": 0.6830630600452423, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 78, "warning": 0 }, { "completion_length": 240.75, "epoch": 0.395, "grad_norm": 0.0, "kl_div": 0.00676532369107008, "learning_rate": 1.5150380749100543e-06, "loss": 2.8328733606031165e-05, "reinforce_loss": 0.922637939453125, "reward": 7.450580596923828e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 79, "warning": 0 }, { "completion_length": 249.75, "epoch": 0.4, "grad_norm": 0.0, "kl_div": 0.0062239880207926035, "learning_rate": 1.5e-06, "loss": 3.268045838922262e-05, "reinforce_loss": -2.222928047180176, "reward": 3.725290298461914e-09, "reward_max": 0.7497001886367798, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24990005791187286, "reward_std": 0.27897265553474426, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 80, "warning": 0 }, { "completion_length": 229.5, "epoch": 0.405, "grad_norm": 0.0, "kl_div": 0.006288101198151708, "learning_rate": 1.4848096202463372e-06, "loss": 6.7749786467175e-05, "reinforce_loss": -2.0730273723602295, "reward": 7.450580596923828e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 81, "warning": 0 }, { "completion_length": 232.375, "epoch": 0.41, "grad_norm": 0.0, "kl_div": 0.005008954321965575, "learning_rate": 1.4694715627858908e-06, "loss": 4.1546782085788436e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 82, "warning": 0 }, { "completion_length": 273.875, "epoch": 0.415, "grad_norm": 0.0, "kl_div": 0.004290303448215127, "learning_rate": 1.4539904997395467e-06, "loss": 3.414790171518689e-05, "reinforce_loss": -1.623497486114502, "reward": 7.450580596923828e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 83, "warning": 0 }, { "completion_length": 248.375, "epoch": 0.42, "grad_norm": 0.0, "kl_div": 0.007052561501041055, "learning_rate": 1.4383711467890773e-06, "loss": 3.932178333343472e-05, "reinforce_loss": -2.885895252227783, "reward": 3.725290298461914e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 84, "warning": 0 }, { "completion_length": 267.125, "epoch": 0.425, "grad_norm": 0.0, "kl_div": 0.006008623633533716, "learning_rate": 1.4226182617406994e-06, "loss": 4.0231695493275765e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 85, "warning": 0 }, { "completion_length": 315.25, "epoch": 0.43, "grad_norm": 0.0, "kl_div": 0.006115781143307686, "learning_rate": 1.4067366430758004e-06, "loss": 4.45719160779845e-05, "reinforce_loss": -3.5447645783424377, "reward": 1.1175870895385742e-08, "reward_max": 1.4992505311965942, "reward_mean": 1.4901161193847656e-08, "reward_min": -0.49975016713142395, "reward_std": 0.5891520529985428, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.15625, "step": 86, "warning": 0 }, { "completion_length": 350.625, "epoch": 0.435, "grad_norm": 0.0, "kl_div": 0.004203971242532134, "learning_rate": 1.3907311284892735e-06, "loss": 3.865012240567012e-05, "reinforce_loss": -1.091771125793457, "reward": 0.0, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 87, "warning": 0 }, { "completion_length": 246.875, "epoch": 0.44, "grad_norm": 0.0, "kl_div": 0.004629536415450275, "learning_rate": 1.374606593415912e-06, "loss": 5.260626130620949e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 88, "warning": 0 }, { "completion_length": 436.25, "epoch": 0.445, "grad_norm": 0.0, "kl_div": 0.0024904462043195963, "learning_rate": 1.3583679495453e-06, "loss": 3.403911614441313e-05, "reinforce_loss": 0.007955007255077362, "reward": 0.0, "reward_max": 0.43293771147727966, "reward_mean": 0.0, "reward_min": -0.43293771147727966, "reward_std": 0.36085928976535797, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 89, "warning": 0 }, { "completion_length": 403.75, "epoch": 0.45, "grad_norm": 0.0, "kl_div": 0.0029668795177713037, "learning_rate": 1.3420201433256689e-06, "loss": 3.4081568628607783e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 90, "warning": 0 }, { "completion_length": 370.625, "epoch": 0.455, "grad_norm": 0.0, "kl_div": 0.00392012984957546, "learning_rate": 1.3255681544571566e-06, "loss": 3.156075490551302e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 91, "warning": 0 }, { "completion_length": 474.0, "epoch": 0.46, "grad_norm": 0.0, "kl_div": 0.0012439819984138012, "learning_rate": 1.3090169943749473e-06, "loss": 2.3863924070610665e-05, "reinforce_loss": 1.0544613599777222, "reward": 0.0, "reward_max": 0.4329127371311188, "reward_mean": 0.0, "reward_min": -0.721521258354187, "reward_std": 0.32476241141557693, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3125, "step": 92, "warning": 0 }, { "completion_length": 284.625, "epoch": 0.465, "grad_norm": 0.0, "kl_div": 0.005175206810235977, "learning_rate": 1.2923717047227368e-06, "loss": 5.378553396440111e-05, "reinforce_loss": -0.07951122522354126, "reward": 0.0, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 93, "warning": 0 }, { "completion_length": 368.75, "epoch": 0.47, "grad_norm": 0.0, "kl_div": 0.004898180486634374, "learning_rate": 1.275637355816999e-06, "loss": 3.207201461918885e-05, "reinforce_loss": -3.131664752960205, "reward": 3.725290298461914e-09, "reward_max": 1.4567568898200989, "reward_mean": 7.450580596923828e-09, "reward_min": -0.6034034341573715, "reward_std": 0.6464022770524025, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 94, "warning": 0 }, { "completion_length": 341.625, "epoch": 0.475, "grad_norm": 0.0, "kl_div": 0.004983491729944944, "learning_rate": 1.2588190451025207e-06, "loss": 3.347035817569122e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 95, "warning": 0 }, { "completion_length": 314.875, "epoch": 0.48, "grad_norm": 0.0, "kl_div": 0.003061264520511031, "learning_rate": 1.2419218955996676e-06, "loss": 2.4862270947778597e-05, "reinforce_loss": -1.7192867994308472, "reward": 3.725290298461914e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 96, "warning": 0 }, { "completion_length": 321.625, "epoch": 0.485, "grad_norm": 0.0, "kl_div": 0.0032563700806349516, "learning_rate": 1.2249510543438651e-06, "loss": 2.07839752874861e-05, "reinforce_loss": -0.5695414543151855, "reward": 1.1175870895385742e-08, "reward_max": 1.4997000694274902, "reward_mean": 1.4901161193847656e-08, "reward_min": -0.4999000132083893, "reward_std": 0.6830318570137024, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 97, "warning": 0 }, { "completion_length": 254.5, "epoch": 0.49, "grad_norm": 0.0, "kl_div": 0.00792686827480793, "learning_rate": 1.207911690817759e-06, "loss": 2.4943179596448317e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 98, "warning": 0 }, { "completion_length": 276.875, "epoch": 0.495, "grad_norm": 0.0, "kl_div": 0.002792665036395192, "learning_rate": 1.1908089953765447e-06, "loss": 2.5957137950172182e-05, "reinforce_loss": 1.4168927669525146, "reward": 7.450580596923828e-09, "reward_max": 0.7494004964828491, "reward_mean": 7.450580596923828e-09, "reward_min": -0.2498001605272293, "reward_std": 0.2476361244916916, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.03125, "step": 99, "warning": 0 }, { "completion_length": 412.375, "epoch": 0.5, "grad_norm": 0.0, "kl_div": 0.003308691084384918, "learning_rate": 1.1736481776669305e-06, "loss": 2.979737519126502e-05, "reinforce_loss": 0.24106240272521973, "reward": 3.5025411460054556e-09, "reward_max": 1.4997000694274902, "reward_mean": 1.408697425375749e-08, "reward_min": -0.4999000132083893, "reward_std": 0.6830348074436188, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 100, "warning": 0 }, { "completion_length": 234.875, "epoch": 0.505, "grad_norm": 0.0, "kl_div": 0.005344756878912449, "learning_rate": 1.156434465040231e-06, "loss": 2.4595957256678957e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 101, "warning": 0 }, { "completion_length": 277.625, "epoch": 0.51, "grad_norm": 0.0, "kl_div": 0.006992830196395516, "learning_rate": 1.1391731009600653e-06, "loss": 3.892023960361257e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 102, "warning": 0 }, { "completion_length": 345.125, "epoch": 0.515, "grad_norm": 0.0, "kl_div": 0.0050448826514184475, "learning_rate": 1.1218693434051474e-06, "loss": 3.941721479350235e-05, "reinforce_loss": -0.1830889880657196, "reward": 3.6588991836339346e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.258783796260104e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415166586637497, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 103, "warning": 0 }, { "completion_length": 388.25, "epoch": 0.52, "grad_norm": 0.0, "kl_div": 0.004101503058336675, "learning_rate": 1.1045284632676535e-06, "loss": 2.8712290259136353e-05, "reinforce_loss": -0.0005872459150850773, "reward": 0.0, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 104, "warning": 0 }, { "completion_length": 256.0, "epoch": 0.525, "grad_norm": 0.0, "kl_div": 0.005036562215536833, "learning_rate": 1.0871557427476583e-06, "loss": 9.094200686377008e-05, "reinforce_loss": -1.6851162910461426, "reward": 0.0, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 105, "warning": 0 }, { "completion_length": 353.875, "epoch": 0.53, "grad_norm": 0.0, "kl_div": 0.002854132413631305, "learning_rate": 1.069756473744125e-06, "loss": 2.4509951799700502e-05, "reinforce_loss": -0.03017403557896614, "reward": 0.0, "reward_max": 0.7498000860214233, "reward_mean": 0.0, "reward_min": -0.2499333620071411, "reward_std": 0.31025150418281555, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.09375, "step": 106, "warning": 0 }, { "completion_length": 291.0, "epoch": 0.535, "grad_norm": 0.0, "kl_div": 0.005613622954115272, "learning_rate": 1.052335956242944e-06, "loss": 2.761695304798195e-05, "reinforce_loss": -0.6514058113098145, "reward": 3.725290298461914e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 107, "warning": 0 }, { "completion_length": 356.75, "epoch": 0.54, "grad_norm": 0.0, "kl_div": 0.0045632352121174335, "learning_rate": 1.034899496702501e-06, "loss": 4.324555447965395e-05, "reinforce_loss": 1.488743543624878, "reward": 0.0, "reward_max": 1.1827877461910248, "reward_mean": 7.450580596923828e-09, "reward_min": -0.6828877180814743, "reward_std": 0.7023752182722092, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 108, "warning": 0 }, { "completion_length": 358.625, "epoch": 0.545, "grad_norm": 0.0, "kl_div": 0.0044156176736578345, "learning_rate": 1.0174524064372837e-06, "loss": 2.0340425635367865e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 109, "warning": 0 }, { "completion_length": 417.625, "epoch": 0.55, "grad_norm": 0.0, "kl_div": 0.002261101733893156, "learning_rate": 1e-06, "loss": 3.4580974897835404e-05, "reinforce_loss": -0.1422603875398636, "reward": 7.450580596923828e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 110, "warning": 0 }, { "completion_length": 252.25, "epoch": 0.555, "grad_norm": 0.0, "kl_div": 0.007964152144268155, "learning_rate": 9.825475935627165e-07, "loss": 4.0119478398992214e-05, "reinforce_loss": -0.03738582134246826, "reward": 0.0, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 111, "warning": 0 }, { "completion_length": 292.75, "epoch": 0.56, "grad_norm": 0.0, "kl_div": 0.004941635997965932, "learning_rate": 9.651005032974993e-07, "loss": 3.508669396978803e-05, "reinforce_loss": -0.6642346978187561, "reward": 7.450580596923828e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 112, "warning": 0 }, { "completion_length": 238.375, "epoch": 0.565, "grad_norm": 0.0, "kl_div": 0.006107961526140571, "learning_rate": 9.476640437570561e-07, "loss": 4.175661888439208e-05, "reinforce_loss": -0.7468752861022949, "reward": 0.0, "reward_max": 0.5831778645515442, "reward_mean": 0.0, "reward_min": -0.41655561327934265, "reward_std": 0.3102668225765228, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.15625, "step": 113, "warning": 0 }, { "completion_length": 297.25, "epoch": 0.57, "grad_norm": 0.0, "kl_div": 0.0029309506062418222, "learning_rate": 9.302435262558747e-07, "loss": 3.703889979078667e-05, "reinforce_loss": -1.1365559101104736, "reward": 7.450580596923828e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 114, "warning": 0 }, { "completion_length": 401.25, "epoch": 0.575, "grad_norm": 0.0, "kl_div": 0.003738588420674205, "learning_rate": 9.128442572523417e-07, "loss": 3.471513082331512e-05, "reinforce_loss": -0.9452813863754272, "reward": 3.725290298461914e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 115, "warning": 0 }, { "completion_length": 355.75, "epoch": 0.58, "grad_norm": 0.0, "kl_div": 0.004489046172238886, "learning_rate": 8.954715367323466e-07, "loss": 2.3953835807333235e-05, "reinforce_loss": -1.1807630062103271, "reward": 7.450580596923828e-09, "reward_max": 0.7497001886367798, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24990005791187286, "reward_std": 0.27897267043590546, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 116, "warning": 0 }, { "completion_length": 294.5, "epoch": 0.585, "grad_norm": 0.0, "kl_div": 0.0062689975602552295, "learning_rate": 8.781306565948526e-07, "loss": 4.562715366773773e-05, "reinforce_loss": -0.8640267848968506, "reward": 0.0, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 117, "warning": 0 }, { "completion_length": 279.5, "epoch": 0.59, "grad_norm": 0.0, "kl_div": 0.005104512441903353, "learning_rate": 8.608268990399348e-07, "loss": 4.153374538873322e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 118, "warning": 0 }, { "completion_length": 320.25, "epoch": 0.595, "grad_norm": 0.0, "kl_div": 0.003830036846920848, "learning_rate": 8.435655349597689e-07, "loss": 4.7817045015108306e-05, "reinforce_loss": -0.07280240952968597, "reward": 3.725290298461914e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 119, "warning": 0 }, { "completion_length": 338.875, "epoch": 0.6, "grad_norm": 0.0, "kl_div": 0.01220963301602751, "learning_rate": 8.263518223330696e-07, "loss": 3.313684828754049e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 120, "warning": 0 }, { "completion_length": 237.5, "epoch": 0.605, "grad_norm": 0.0, "kl_div": 0.006152846151962876, "learning_rate": 8.091910046234551e-07, "loss": 5.825521657243371e-05, "reinforce_loss": -2.523540496826172, "reward": 0.0, "reward_max": 0.7498000860214233, "reward_mean": 0.0, "reward_min": -0.2499333620071411, "reward_std": 0.31025150418281555, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.09375, "step": 121, "warning": 0 }, { "completion_length": 386.75, "epoch": 0.61, "grad_norm": 0.0, "kl_div": 0.0034146409016102552, "learning_rate": 7.920883091822408e-07, "loss": 2.2596315375267295e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 122, "warning": 0 }, { "completion_length": 303.875, "epoch": 0.615, "grad_norm": 0.0, "kl_div": 0.005648862104862928, "learning_rate": 7.750489456561351e-07, "loss": 4.065406301378971e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 123, "warning": 0 }, { "completion_length": 224.375, "epoch": 0.62, "grad_norm": 0.0, "kl_div": 0.004628933500498533, "learning_rate": 7.580781044003324e-07, "loss": 3.449977793934522e-05, "reinforce_loss": 0.6699845790863037, "reward": 0.0, "reward_max": 0.43293771147727966, "reward_mean": 0.0, "reward_min": -0.43293771147727966, "reward_std": 0.36085928976535797, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 124, "warning": 0 }, { "completion_length": 297.375, "epoch": 0.625, "grad_norm": 0.0, "kl_div": 0.004623484332114458, "learning_rate": 7.411809548974791e-07, "loss": 3.470513274805853e-05, "reinforce_loss": -1.5379300117492676, "reward": 4.538424747124736e-09, "reward_max": 0.7498500347137451, "reward_mean": 2.723054670639158e-09, "reward_min": -0.24995000660419464, "reward_std": 0.34153175354003906, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 125, "warning": 0 }, { "completion_length": 398.375, "epoch": 0.63, "grad_norm": 0.0, "kl_div": 0.003666030883323401, "learning_rate": 7.243626441830009e-07, "loss": 4.213204010738991e-05, "reinforce_loss": 0.45349860191345215, "reward": 3.725290298461914e-09, "reward_max": 1.4996501207351685, "reward_mean": 7.450580596923828e-09, "reward_min": -0.49988336861133575, "reward_std": 0.6517674326896667, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.21875, "step": 126, "warning": 0 }, { "completion_length": 246.5, "epoch": 0.635, "grad_norm": 0.0, "kl_div": 0.0037888718070462346, "learning_rate": 7.076282952772633e-07, "loss": 4.236815948388539e-05, "reinforce_loss": 1.8752317428588867, "reward": 7.450580596923828e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 127, "warning": 0 }, { "completion_length": 334.25, "epoch": 0.64, "grad_norm": 0.0, "kl_div": 0.004845935618504882, "learning_rate": 6.909830056250526e-07, "loss": 8.860153320711106e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 128, "warning": 0 }, { "completion_length": 361.625, "epoch": 0.645, "grad_norm": 0.0, "kl_div": 0.0034735145163722336, "learning_rate": 6.744318455428435e-07, "loss": 3.458486207819078e-05, "reinforce_loss": -2.2845919132232666, "reward": 7.450580596923828e-09, "reward_max": 0.7494004964828491, "reward_mean": 7.450580596923828e-09, "reward_min": -0.2498001605272293, "reward_std": 0.2476361244916916, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.03125, "step": 129, "warning": 0 }, { "completion_length": 328.25, "epoch": 0.65, "grad_norm": 0.0, "kl_div": 0.004264246206730604, "learning_rate": 6.579798566743313e-07, "loss": 2.5235848625015933e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 130, "warning": 0 }, { "completion_length": 359.875, "epoch": 0.655, "grad_norm": 0.0, "kl_div": 0.0058629007544368505, "learning_rate": 6.416320504546997e-07, "loss": 2.944394600490341e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 131, "warning": 0 }, { "completion_length": 341.0, "epoch": 0.66, "grad_norm": 0.0, "kl_div": 0.004888089140877128, "learning_rate": 6.253934065840879e-07, "loss": 3.964222742069978e-05, "reinforce_loss": -1.8639789819717407, "reward": 7.450580596923828e-09, "reward_max": 1.4996501207351685, "reward_mean": 7.450580596923828e-09, "reward_min": -0.49988336861133575, "reward_std": 0.6517674326896667, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.21875, "step": 132, "warning": 0 }, { "completion_length": 403.125, "epoch": 0.665, "grad_norm": 0.0, "kl_div": 0.0030030603520572186, "learning_rate": 6.092688715107263e-07, "loss": 2.807083637890173e-05, "reinforce_loss": -1.9768284559249878, "reward": 7.450580596923828e-09, "reward_max": 1.4997000694274902, "reward_mean": 1.4901161193847656e-08, "reward_min": -0.4999000132083893, "reward_std": 0.6830318570137024, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 133, "warning": 0 }, { "completion_length": 283.5, "epoch": 0.67, "grad_norm": 0.0, "kl_div": 0.005025205900892615, "learning_rate": 5.932633569241999e-07, "loss": 1.9844385860778857e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 134, "warning": 0 }, { "completion_length": 269.125, "epoch": 0.675, "grad_norm": 0.0, "kl_div": 0.004547963617369533, "learning_rate": 5.773817382593007e-07, "loss": 3.15488414344145e-05, "reinforce_loss": -0.0686352476477623, "reward": 0.0, "reward_max": 0.5455992817878723, "reward_mean": -2.921796485311212e-11, "reward_min": -0.42435500025749207, "reward_std": 0.3454175740480423, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.21875, "step": 135, "warning": 0 }, { "completion_length": 306.5, "epoch": 0.68, "grad_norm": 0.0, "kl_div": 0.00781878549605608, "learning_rate": 5.616288532109224e-07, "loss": 2.9904594157414977e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 136, "warning": 0 }, { "completion_length": 296.75, "epoch": 0.685, "grad_norm": 0.0, "kl_div": 0.003896760055795312, "learning_rate": 5.460095002604532e-07, "loss": 2.8330107852525543e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 137, "warning": 0 }, { "completion_length": 387.625, "epoch": 0.69, "grad_norm": 0.0, "kl_div": 0.003190419520251453, "learning_rate": 5.305284372141095e-07, "loss": 3.8214377127587795e-05, "reinforce_loss": -0.7974895238876343, "reward": 7.450580596923828e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 138, "warning": 0 }, { "completion_length": 407.5, "epoch": 0.695, "grad_norm": 0.0, "kl_div": 0.003127658274024725, "learning_rate": 5.15190379753663e-07, "loss": 3.129008018731838e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 139, "warning": 0 }, { "completion_length": 320.75, "epoch": 0.7, "grad_norm": 0.0, "kl_div": 0.004971324000507593, "learning_rate": 5.000000000000002e-07, "loss": 4.383660780149512e-05, "reinforce_loss": -1.2445292472839355, "reward": 7.450580596923828e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 140, "warning": 0 }, { "completion_length": 301.75, "epoch": 0.705, "grad_norm": 0.0, "kl_div": 0.00471379142254591, "learning_rate": 4.849619250899458e-07, "loss": 3.928630212612916e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 141, "warning": 0 }, { "completion_length": 326.625, "epoch": 0.71, "grad_norm": 0.0, "kl_div": 0.005247897235676646, "learning_rate": 4.700807357667952e-07, "loss": 4.4394892029231414e-05, "reinforce_loss": -1.4265222549438477, "reward": 0.0, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 142, "warning": 0 }, { "completion_length": 325.75, "epoch": 0.715, "grad_norm": 0.0, "kl_div": 0.005402783048339188, "learning_rate": 4.5536096498497287e-07, "loss": 2.0742975721077528e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 143, "warning": 0 }, { "completion_length": 381.25, "epoch": 0.72, "grad_norm": 0.0, "kl_div": 0.0037028805818408728, "learning_rate": 4.408070965292533e-07, "loss": 2.682470130821457e-05, "reinforce_loss": -0.7678046226501465, "reward": 0.0, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 144, "warning": 0 }, { "completion_length": 367.625, "epoch": 0.725, "grad_norm": 0.0, "kl_div": 0.0031455521238967776, "learning_rate": 4.2642356364895417e-07, "loss": 4.086682474735426e-05, "reinforce_loss": -0.289047509431839, "reward": 0.0, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 145, "warning": 0 }, { "completion_length": 303.75, "epoch": 0.73, "grad_norm": 0.0, "kl_div": 0.004748902982100844, "learning_rate": 4.1221474770752696e-07, "loss": 3.2237231607723515e-05, "reinforce_loss": -2.3467034101486206, "reward": 0.0, "reward_max": 1.4996501207351685, "reward_mean": 7.450580596923828e-09, "reward_min": -0.49988336861133575, "reward_std": 0.6518016159534454, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.21875, "step": 146, "warning": 0 }, { "completion_length": 290.75, "epoch": 0.735, "grad_norm": 0.0, "kl_div": 0.005546088912524283, "learning_rate": 3.981849768479516e-07, "loss": 3.252903934480855e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 147, "warning": 0 }, { "completion_length": 349.875, "epoch": 0.74, "grad_norm": 0.0, "kl_div": 0.0044729511719197035, "learning_rate": 3.843385246743417e-07, "loss": 4.0787699617794715e-05, "reinforce_loss": -1.5376715660095215, "reward": 7.450580596923828e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 148, "warning": 0 }, { "completion_length": 297.875, "epoch": 0.745, "grad_norm": 0.0, "kl_div": 0.00683275586925447, "learning_rate": 3.706796089501627e-07, "loss": 3.786283650697442e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 149, "warning": 0 }, { "completion_length": 282.125, "epoch": 0.75, "grad_norm": 0.0, "kl_div": 0.0042821625247597694, "learning_rate": 3.5721239031346063e-07, "loss": 1.946631891769357e-05, "reinforce_loss": -1.383352518081665, "reward": 0.0, "reward_max": 0.7498000860214233, "reward_mean": 0.0, "reward_min": -0.2499333620071411, "reward_std": 0.31025150418281555, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.09375, "step": 150, "warning": 0 }, { "completion_length": 402.0, "epoch": 0.755, "grad_norm": 0.0, "kl_div": 0.00205736025236547, "learning_rate": 3.4394097100949283e-07, "loss": 2.9023183742538095e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 151, "warning": 0 }, { "completion_length": 200.375, "epoch": 0.76, "grad_norm": 0.0, "kl_div": 0.0047654545633122325, "learning_rate": 3.308693936411421e-07, "loss": 4.075010110682342e-05, "reinforce_loss": 0.44592535495758057, "reward": 0.0, "reward_max": 0.43293771147727966, "reward_mean": 0.0, "reward_min": -0.43293771147727966, "reward_std": 0.36090312898159027, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 152, "warning": 0 }, { "completion_length": 400.5, "epoch": 0.765, "grad_norm": 0.0, "kl_div": 0.0038307145005092025, "learning_rate": 3.180016399375016e-07, "loss": 3.341036426718347e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 153, "warning": 0 }, { "completion_length": 439.375, "epoch": 0.77, "grad_norm": 0.0, "kl_div": 0.0033623495255596936, "learning_rate": 3.0534162954100263e-07, "loss": 5.321637763699982e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 154, "warning": 0 }, { "completion_length": 297.875, "epoch": 0.775, "grad_norm": 0.0, "kl_div": 0.004120369907468557, "learning_rate": 2.9289321881345254e-07, "loss": 3.715060120157432e-05, "reinforce_loss": -0.7116526663303375, "reward": 7.273185831380147e-09, "reward_max": 1.499100685119629, "reward_mean": 1.525595116902423e-08, "reward_min": -0.4997002184391022, "reward_std": 0.5266364961862564, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.09375, "step": 155, "warning": 0 }, { "completion_length": 328.5, "epoch": 0.78, "grad_norm": 0.0, "kl_div": 0.004674249328672886, "learning_rate": 2.8066019966134904e-07, "loss": 4.945438195136376e-05, "reinforce_loss": -0.590359091758728, "reward": 0.0, "reward_max": 0.7498000860214233, "reward_mean": 0.0, "reward_min": -0.2499333620071411, "reward_std": 0.31025150418281555, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.09375, "step": 156, "warning": 0 }, { "completion_length": 465.0, "epoch": 0.785, "grad_norm": 0.0, "kl_div": 0.002253161510452628, "learning_rate": 2.6864629838082954e-07, "loss": 3.636195833678357e-05, "reinforce_loss": -0.21075540781021118, "reward": 7.450580596923828e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 157, "warning": 0 }, { "completion_length": 266.375, "epoch": 0.79, "grad_norm": 0.0, "kl_div": 0.005323680583387613, "learning_rate": 2.568551745226056e-07, "loss": 3.559106153261382e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 158, "warning": 0 }, { "completion_length": 335.375, "epoch": 0.795, "grad_norm": 0.0, "kl_div": 0.005256355740129948, "learning_rate": 2.45290419777228e-07, "loss": 3.1689378374721855e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 159, "warning": 0 }, { "completion_length": 369.75, "epoch": 0.8, "grad_norm": 0.0, "kl_div": 0.003511201008222997, "learning_rate": 2.339555568810221e-07, "loss": 2.4199387553380802e-05, "reinforce_loss": -0.2926635146141052, "reward": 0.0, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 160, "warning": 0 }, { "completion_length": 298.875, "epoch": 0.805, "grad_norm": 0.0, "kl_div": 0.003870577202178538, "learning_rate": 2.228540385430291e-07, "loss": 2.1880807253182866e-05, "reinforce_loss": -2.5981526374816895, "reward": 0.0, "reward_max": 0.5455992817878723, "reward_mean": 0.0, "reward_min": -0.42435500025749207, "reward_std": 0.3453642576932907, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.21875, "step": 161, "warning": 0 }, { "completion_length": 289.875, "epoch": 0.81, "grad_norm": 0.0, "kl_div": 0.0046180798672139645, "learning_rate": 2.1198924639327808e-07, "loss": 3.398945591470692e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 162, "warning": 0 }, { "completion_length": 327.0, "epoch": 0.815, "grad_norm": 0.0, "kl_div": 0.005427177296951413, "learning_rate": 2.0136448995270738e-07, "loss": 4.30914278695127e-05, "reinforce_loss": -0.5667372345924377, "reward": 0.0, "reward_max": 0.7498000860214233, "reward_mean": 0.0, "reward_min": -0.2499333620071411, "reward_std": 0.31025321781635284, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.09375, "step": 163, "warning": 0 }, { "completion_length": 359.75, "epoch": 0.82, "grad_norm": 0.0, "kl_div": 0.005541174556128681, "learning_rate": 1.9098300562505264e-07, "loss": 4.954127507517114e-05, "reinforce_loss": -2.7894344329833984, "reward": 1.1175870895385742e-08, "reward_max": 1.4997000694274902, "reward_mean": 1.4901161193847656e-08, "reward_min": -0.4999000132083893, "reward_std": 0.6830318570137024, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 164, "warning": 0 }, { "completion_length": 349.0, "epoch": 0.825, "grad_norm": 0.0, "kl_div": 0.005921049974858761, "learning_rate": 1.8084795571100809e-07, "loss": 2.717350753300707e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 165, "warning": 0 }, { "completion_length": 263.375, "epoch": 0.83, "grad_norm": 0.0, "kl_div": 0.005009780637919903, "learning_rate": 1.7096242744495838e-07, "loss": 3.339421255077468e-05, "reinforce_loss": -0.12249474972486496, "reward": -7.411876001839346e-09, "reward_max": 0.726218581199646, "reward_mean": -9.90830439917545e-09, "reward_min": -0.3300993740558624, "reward_std": 0.3348415195941925, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.15625, "step": 166, "warning": 0 }, { "completion_length": 321.0, "epoch": 0.835, "grad_norm": 0.0, "kl_div": 0.005321947275660932, "learning_rate": 1.6132943205457606e-07, "loss": 3.31825604007463e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 167, "warning": 0 }, { "completion_length": 316.25, "epoch": 0.84, "grad_norm": 0.0, "kl_div": 0.00566250248812139, "learning_rate": 1.5195190384357404e-07, "loss": 2.9384233130258508e-05, "reinforce_loss": -1.0765857696533203, "reward": 7.450580596923828e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 168, "warning": 0 }, { "completion_length": 320.25, "epoch": 0.845, "grad_norm": 0.0, "kl_div": 0.0044976952485740185, "learning_rate": 1.4283269929788776e-07, "loss": 1.920255908771651e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 169, "warning": 0 }, { "completion_length": 309.25, "epoch": 0.85, "grad_norm": 0.0, "kl_div": 0.005127235548570752, "learning_rate": 1.3397459621556128e-07, "loss": 3.580936481739627e-05, "reinforce_loss": 1.3514456748962402, "reward": 7.450580596923828e-09, "reward_max": 0.7494004964828491, "reward_mean": 7.450580596923828e-09, "reward_min": -0.2498001605272293, "reward_std": 0.2476361244916916, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.03125, "step": 170, "warning": 0 }, { "completion_length": 360.875, "epoch": 0.855, "grad_norm": 0.0, "kl_div": 0.0036428944440558553, "learning_rate": 1.2538029286060424e-07, "loss": 3.965238465752918e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 171, "warning": 0 }, { "completion_length": 328.75, "epoch": 0.86, "grad_norm": 0.0, "kl_div": 0.008559396490454674, "learning_rate": 1.1705240714107301e-07, "loss": 3.577783900254872e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 172, "warning": 0 }, { "completion_length": 248.75, "epoch": 0.865, "grad_norm": 0.0, "kl_div": 0.004873627098277211, "learning_rate": 1.089934758116322e-07, "loss": 3.500456023175502e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 173, "warning": 0 }, { "completion_length": 244.0, "epoch": 0.87, "grad_norm": 0.0, "kl_div": 0.0051840199157595634, "learning_rate": 1.0120595370083318e-07, "loss": 3.954645217163488e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 174, "warning": 0 }, { "completion_length": 255.5, "epoch": 0.875, "grad_norm": 0.0, "kl_div": 0.003001044853590429, "learning_rate": 9.369221296335006e-08, "loss": 2.7848267563967966e-05, "reinforce_loss": -1.3166033029556274, "reward": 7.450580596923828e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 175, "warning": 0 }, { "completion_length": 354.125, "epoch": 0.88, "grad_norm": 0.0, "kl_div": 0.0033484817249700427, "learning_rate": 8.645454235739902e-08, "loss": 3.623686825449113e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 176, "warning": 0 }, { "completion_length": 356.25, "epoch": 0.885, "grad_norm": 0.0, "kl_div": 0.004082909319549799, "learning_rate": 7.949514654755962e-08, "loss": 2.8525084417196922e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 177, "warning": 0 }, { "completion_length": 286.25, "epoch": 0.89, "grad_norm": 0.0, "kl_div": 0.0052701825043186545, "learning_rate": 7.281614543321269e-08, "loss": 3.640830982476473e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 178, "warning": 0 }, { "completion_length": 279.125, "epoch": 0.895, "grad_norm": 0.0, "kl_div": 0.0064017921686172485, "learning_rate": 6.641957350279837e-08, "loss": 2.44384909819928e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 179, "warning": 0 }, { "completion_length": 431.875, "epoch": 0.9, "grad_norm": 0.0, "kl_div": 0.0021319069783203304, "learning_rate": 6.030737921409168e-08, "loss": 4.011308283224935e-05, "reinforce_loss": -1.3017764687538147, "reward": -7.450580596923828e-09, "reward_max": 1.4760686159133911, "reward_mean": 0.0, "reward_min": -0.5800493806600571, "reward_std": 0.6763399988412857, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.28125, "step": 180, "warning": 0 }, { "completion_length": 367.25, "epoch": 0.905, "grad_norm": 0.0, "kl_div": 0.003989931603427976, "learning_rate": 5.448142440068315e-08, "loss": 3.2466932680108584e-05, "reinforce_loss": -0.34161150455474854, "reward": 7.450580596923828e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 181, "warning": 0 }, { "completion_length": 323.625, "epoch": 0.91, "grad_norm": 0.0, "kl_div": 0.003815294476225972, "learning_rate": 4.8943483704846465e-08, "loss": 3.2748805097071454e-05, "reinforce_loss": 1.9752720594406128, "reward": 0.0, "reward_max": 0.7498000860214233, "reward_mean": 0.0, "reward_min": -0.2499333620071411, "reward_std": 0.31025150418281555, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.09375, "step": 182, "warning": 0 }, { "completion_length": 326.375, "epoch": 0.915, "grad_norm": 0.0, "kl_div": 0.006371562834829092, "learning_rate": 4.3695244036964564e-08, "loss": 4.290088691050187e-05, "reinforce_loss": -2.2492411136627197, "reward": 0.0, "reward_max": 0.8658754229545593, "reward_mean": 0.0, "reward_min": -0.8658754229545593, "reward_std": 0.7217185795307159, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5, "step": 183, "warning": 0 }, { "completion_length": 288.125, "epoch": 0.92, "grad_norm": 0.0, "kl_div": 0.008472530171275139, "learning_rate": 3.87383040616811e-08, "loss": 0.0003003362926392583, "reinforce_loss": -1.9210008084774017, "reward": 7.450580596923828e-09, "reward_max": 1.4996501207351685, "reward_mean": 7.450580596923828e-09, "reward_min": -0.49988336861133575, "reward_std": 0.651801347732544, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.21875, "step": 184, "warning": 0 }, { "completion_length": 349.25, "epoch": 0.925, "grad_norm": 0.0, "kl_div": 0.004282757407054305, "learning_rate": 3.4074173710931796e-08, "loss": 2.6969591999659315e-05, "reinforce_loss": 0.8394008874893188, "reward": 0.0, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 185, "warning": 0 }, { "completion_length": 373.75, "epoch": 0.93, "grad_norm": 0.0, "kl_div": 0.0022731663193553686, "learning_rate": 2.9704273724003526e-08, "loss": 4.327474562160205e-05, "reinforce_loss": -1.8910201787948608, "reward": 1.4864001141035033e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.580641003812616e-09, "reward_min": -0.24995000660419464, "reward_std": 0.34153057634830475, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 186, "warning": 0 }, { "completion_length": 301.375, "epoch": 0.935, "grad_norm": 0.0, "kl_div": 0.006663185544312, "learning_rate": 2.5629935214764864e-08, "loss": 2.792886061797617e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 187, "warning": 0 }, { "completion_length": 416.0, "epoch": 0.94, "grad_norm": 0.0, "kl_div": 0.0032071031164377928, "learning_rate": 2.185239926619431e-08, "loss": 3.325228408357361e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 188, "warning": 0 }, { "completion_length": 348.375, "epoch": 0.945, "grad_norm": 0.0, "kl_div": 0.0033624732168391347, "learning_rate": 1.8372816552336023e-08, "loss": 4.7012106733745895e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 189, "warning": 0 }, { "completion_length": 236.5, "epoch": 0.95, "grad_norm": 0.0, "kl_div": 0.003992203623056412, "learning_rate": 1.519224698779198e-08, "loss": 3.758694583666511e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 190, "warning": 0 }, { "completion_length": 355.125, "epoch": 0.955, "grad_norm": 0.0, "kl_div": 0.0034447858342900872, "learning_rate": 1.231165940486234e-08, "loss": 4.340646137279691e-05, "reinforce_loss": 0.22765234112739563, "reward": 0.0, "reward_max": 0.7498000860214233, "reward_mean": 0.0, "reward_min": -0.2499333620071411, "reward_std": 0.31025150418281555, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.09375, "step": 191, "warning": 0 }, { "completion_length": 302.625, "epoch": 0.96, "grad_norm": 0.0, "kl_div": 0.006092249182984233, "learning_rate": 9.731931258429638e-09, "loss": 0.0001205687367473729, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 192, "warning": 0 }, { "completion_length": 348.0, "epoch": 0.965, "grad_norm": 0.0, "kl_div": 0.0061495862901210785, "learning_rate": 7.453848358678017e-09, "loss": 0.00011797392380685778, "reinforce_loss": -0.5728763341903687, "reward": 3.725290298461914e-09, "reward_max": 0.7498500347137451, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24995000660419464, "reward_std": 0.3415159285068512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 193, "warning": 0 }, { "completion_length": 313.75, "epoch": 0.97, "grad_norm": 0.0, "kl_div": 0.002809734083712101, "learning_rate": 5.47810463172671e-09, "loss": 3.324962381157093e-05, "reinforce_loss": 0.4938305765390396, "reward": 9.634371522793117e-09, "reward_max": 1.4025049209594727, "reward_mean": 6.165997490370501e-09, "reward_min": -0.6415429264307022, "reward_std": 0.6777721792459488, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3125, "step": 194, "warning": 0 }, { "completion_length": 286.875, "epoch": 0.975, "grad_norm": 0.0, "kl_div": 0.004895730991847813, "learning_rate": 3.805301908254455e-09, "loss": 3.1812018278287724e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 195, "warning": 0 }, { "completion_length": 255.125, "epoch": 0.98, "grad_norm": 0.0, "kl_div": 0.005137150175869465, "learning_rate": 2.435949740175802e-09, "loss": 2.6147503376705572e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 196, "warning": 0 }, { "completion_length": 344.125, "epoch": 0.985, "grad_norm": 0.0, "kl_div": 0.0036462006391957402, "learning_rate": 1.3704652454261667e-09, "loss": 2.504587700968841e-05, "reinforce_loss": -1.8295396566390991, "reward": 3.725290298461914e-09, "reward_max": 0.7497001886367798, "reward_mean": 7.450580596923828e-09, "reward_min": -0.24990005791187286, "reward_std": 0.27897265553474426, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 197, "warning": 0 }, { "completion_length": 332.375, "epoch": 0.99, "grad_norm": 0.0, "kl_div": 0.004262488102540374, "learning_rate": 6.091729809042379e-10, "loss": 3.882066630467307e-05, "reinforce_loss": 0.5422813296318054, "reward": 7.332316975805497e-09, "reward_max": 0.7498500347137451, "reward_mean": 2.3652635405824185e-09, "reward_min": -0.24995000660419464, "reward_std": 0.34153467416763306, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 198, "warning": 0 }, { "completion_length": 388.125, "epoch": 0.995, "grad_norm": 0.0, "kl_div": 0.003484022803604603, "learning_rate": 1.5230484360873042e-10, "loss": 3.713557271112222e-05, "reinforce_loss": -0.11149996519088745, "reward": 7.450580596923828e-09, "reward_max": 1.4997000694274902, "reward_mean": 1.4901161193847656e-08, "reward_min": -0.4999000132083893, "reward_std": 0.6830318570137024, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 199, "warning": 0 }, { "completion_length": 246.25, "epoch": 1.0, "grad_norm": 0.0, "kl_div": 0.006254963111132383, "learning_rate": 0.0, "loss": 3.669069155876059e-05, "reinforce_loss": 0.0, "reward": 0.0, "reward_max": 0.0, "reward_mean": 0.0, "reward_min": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 200, "warning": 0 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }